LCOV - code coverage report
Current view: top level - mm - page_alloc.c
Test:         coverage.info
Date:         2023-03-27 20:00:47

                   Hit     Total    Coverage
Lines:             1045    2080     50.2 %
Functions:         92      170      54.1 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  linux/mm/page_alloc.c
       4             :  *
       5             :  *  Manages the free list; the system allocates free pages here.
       6             :  *  Note that kmalloc() lives in slab.c
       7             :  *
       8             :  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
       9             :  *  Swap reorganised 29.12.95, Stephen Tweedie
      10             :  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
      11             :  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
      12             :  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
      13             :  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
      14             :  *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
      15             :  *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
      16             :  */
      17             : 
      18             : #include <linux/stddef.h>
      19             : #include <linux/mm.h>
      20             : #include <linux/highmem.h>
      21             : #include <linux/swap.h>
      22             : #include <linux/swapops.h>
      23             : #include <linux/interrupt.h>
      24             : #include <linux/pagemap.h>
      25             : #include <linux/jiffies.h>
      26             : #include <linux/memblock.h>
      27             : #include <linux/compiler.h>
      28             : #include <linux/kernel.h>
      29             : #include <linux/kasan.h>
      30             : #include <linux/kmsan.h>
      31             : #include <linux/module.h>
      32             : #include <linux/suspend.h>
      33             : #include <linux/pagevec.h>
      34             : #include <linux/blkdev.h>
      35             : #include <linux/slab.h>
      36             : #include <linux/ratelimit.h>
      37             : #include <linux/oom.h>
      38             : #include <linux/topology.h>
      39             : #include <linux/sysctl.h>
      40             : #include <linux/cpu.h>
      41             : #include <linux/cpuset.h>
      42             : #include <linux/memory_hotplug.h>
      43             : #include <linux/nodemask.h>
      44             : #include <linux/vmalloc.h>
      45             : #include <linux/vmstat.h>
      46             : #include <linux/mempolicy.h>
      47             : #include <linux/memremap.h>
      48             : #include <linux/stop_machine.h>
      49             : #include <linux/random.h>
      50             : #include <linux/sort.h>
      51             : #include <linux/pfn.h>
      52             : #include <linux/backing-dev.h>
      53             : #include <linux/fault-inject.h>
      54             : #include <linux/page-isolation.h>
      55             : #include <linux/debugobjects.h>
      56             : #include <linux/kmemleak.h>
      57             : #include <linux/compaction.h>
      58             : #include <trace/events/kmem.h>
      59             : #include <trace/events/oom.h>
      60             : #include <linux/prefetch.h>
      61             : #include <linux/mm_inline.h>
      62             : #include <linux/mmu_notifier.h>
      63             : #include <linux/migrate.h>
      64             : #include <linux/hugetlb.h>
      65             : #include <linux/sched/rt.h>
      66             : #include <linux/sched/mm.h>
      67             : #include <linux/page_owner.h>
      68             : #include <linux/page_table_check.h>
      69             : #include <linux/kthread.h>
      70             : #include <linux/memcontrol.h>
      71             : #include <linux/ftrace.h>
      72             : #include <linux/lockdep.h>
      73             : #include <linux/nmi.h>
      74             : #include <linux/psi.h>
      75             : #include <linux/padata.h>
      76             : #include <linux/khugepaged.h>
      77             : #include <linux/buffer_head.h>
      78             : #include <linux/delayacct.h>
      79             : #include <asm/sections.h>
      80             : #include <asm/tlbflush.h>
      81             : #include <asm/div64.h>
      82             : #include "internal.h"
      83             : #include "shuffle.h"
      84             : #include "page_reporting.h"
      85             : #include "swap.h"
      86             : 
      87             : /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
      88             : typedef int __bitwise fpi_t;
      89             : 
      90             : /* No special request */
      91             : #define FPI_NONE                ((__force fpi_t)0)
      92             : 
      93             : /*
      94             :  * Skip free page reporting notification for the (possibly merged) page.
      95             :  * This does not hinder free page reporting from grabbing the page,
      96             :  * reporting it and marking it "reported" -  it only skips notifying
      97             :  * the free page reporting infrastructure about a newly freed page. For
      98             :  * example, used when temporarily pulling a page from a freelist and
      99             :  * putting it back unmodified.
     100             :  */
     101             : #define FPI_SKIP_REPORT_NOTIFY  ((__force fpi_t)BIT(0))
     102             : 
     103             : /*
     104             :  * Place the (possibly merged) page to the tail of the freelist. Will ignore
     105             :  * page shuffling (relevant code - e.g., memory onlining - is expected to
     106             :  * shuffle the whole zone).
     107             :  *
     108             :  * Note: No code should rely on this flag for correctness - it's purely
     109             :  *       to allow for optimizations when handing back either fresh pages
     110             :  *       (memory onlining) or untouched pages (page isolation, free page
     111             :  *       reporting).
     112             :  */
     113             : #define FPI_TO_TAIL             ((__force fpi_t)BIT(1))
     114             : 
     115             : /*
     116             :  * Don't poison memory with KASAN (only for the tag-based modes).
     117             :  * During boot, all non-reserved memblock memory is exposed to page_alloc.
     118             :  * Poisoning all that memory lengthens boot time, especially on systems with
      119             :  * large amounts of RAM. This flag is used to skip that poisoning.
     120             :  * This is only done for the tag-based KASAN modes, as those are able to
     121             :  * detect memory corruptions with the memory tags assigned by default.
     122             :  * All memory allocated normally after boot gets poisoned as usual.
     123             :  */
     124             : #define FPI_SKIP_KASAN_POISON   ((__force fpi_t)BIT(2))
     125             : 
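A hedged illustration of how these flags combine on the internal free path. The wrapper below is hypothetical; only __free_pages_ok() (declared further down in this file) and the FPI_* flags are real:

        /*
         * Hypothetical helper: return a page that was briefly pulled off a
         * freelist and not modified, so free page reporting need not be
         * re-notified and the page can go back to the freelist tail.
         */
        static void put_back_untouched_page(struct page *page, unsigned int order)
        {
                __free_pages_ok(page, order, FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
        }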
     126             : /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
     127             : static DEFINE_MUTEX(pcp_batch_high_lock);
     128             : #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
     129             : 
     130             : #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
     131             : /*
     132             :  * On SMP, spin_trylock is sufficient protection.
     133             :  * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
     134             :  */
     135             : #define pcp_trylock_prepare(flags)      do { } while (0)
     136             : #define pcp_trylock_finish(flag)        do { } while (0)
     137             : #else
     138             : 
     139             : /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
     140             : #define pcp_trylock_prepare(flags)      local_irq_save(flags)
     141             : #define pcp_trylock_finish(flags)       local_irq_restore(flags)
     142             : #endif
     143             : 
     144             : /*
     145             :  * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
     146             :  * a migration causing the wrong PCP to be locked and remote memory being
     147             :  * potentially allocated, pin the task to the CPU for the lookup+lock.
     148             :  * preempt_disable is used on !RT because it is faster than migrate_disable.
     149             :  * migrate_disable is used on RT because otherwise RT spinlock usage is
     150             :  * interfered with and a high priority task cannot preempt the allocator.
     151             :  */
     152             : #ifndef CONFIG_PREEMPT_RT
     153             : #define pcpu_task_pin()         preempt_disable()
     154             : #define pcpu_task_unpin()       preempt_enable()
     155             : #else
     156             : #define pcpu_task_pin()         migrate_disable()
     157             : #define pcpu_task_unpin()       migrate_enable()
     158             : #endif
     159             : 
     160             : /*
      161             :  * Generic helper to look up and lock a per-cpu variable with an embedded spinlock.
     162             :  * Return value should be used with equivalent unlock helper.
     163             :  */
     164             : #define pcpu_spin_lock(type, member, ptr)                               \
     165             : ({                                                                      \
     166             :         type *_ret;                                                     \
     167             :         pcpu_task_pin();                                                \
     168             :         _ret = this_cpu_ptr(ptr);                                       \
     169             :         spin_lock(&_ret->member);                                        \
     170             :         _ret;                                                           \
     171             : })
     172             : 
     173             : #define pcpu_spin_trylock(type, member, ptr)                            \
     174             : ({                                                                      \
     175             :         type *_ret;                                                     \
     176             :         pcpu_task_pin();                                                \
     177             :         _ret = this_cpu_ptr(ptr);                                       \
     178             :         if (!spin_trylock(&_ret->member)) {                              \
     179             :                 pcpu_task_unpin();                                      \
     180             :                 _ret = NULL;                                            \
     181             :         }                                                               \
     182             :         _ret;                                                           \
     183             : })
     184             : 
     185             : #define pcpu_spin_unlock(member, ptr)                                   \
     186             : ({                                                                      \
     187             :         spin_unlock(&ptr->member);                                       \
     188             :         pcpu_task_unpin();                                              \
     189             : })
     190             : 
     191             : /* struct per_cpu_pages specific helpers. */
     192             : #define pcp_spin_lock(ptr)                                              \
     193             :         pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
     194             : 
     195             : #define pcp_spin_trylock(ptr)                                           \
     196             :         pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
     197             : 
     198             : #define pcp_spin_unlock(ptr)                                            \
     199             :         pcpu_spin_unlock(lock, ptr)
     200             : 
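A hedged sketch of the intended lock/unlock pairing for these helpers. The function is illustrative rather than the real fast path; it assumes the zone's per-CPU pageset pointer is named per_cpu_pageset:

        static void example_pcp_critical_section(struct zone *zone)
        {
                struct per_cpu_pages *pcp;
                unsigned long __maybe_unused UP_flags;

                /* No-op on SMP/PREEMPT_RT; disables IRQs on UP (see above). */
                pcp_trylock_prepare(UP_flags);

                /* Pins the task, then trylocks this CPU's per_cpu_pages. */
                pcp = pcp_spin_trylock(zone->per_cpu_pageset);
                if (!pcp) {
                        pcp_trylock_finish(UP_flags);
                        return; /* contended: callers fall back to the zone lock path */
                }

                /* ... operate on the pcp lists here ... */

                pcp_spin_unlock(pcp);   /* releases the lock and unpins the task */
                pcp_trylock_finish(UP_flags);
        }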
     201             : #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
     202             : DEFINE_PER_CPU(int, numa_node);
     203             : EXPORT_PER_CPU_SYMBOL(numa_node);
     204             : #endif
     205             : 
     206             : DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
     207             : 
     208             : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
     209             : /*
     210             :  * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
     211             :  * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
     212             :  * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
     213             :  * defined in <linux/topology.h>.
     214             :  */
     215             : DEFINE_PER_CPU(int, _numa_mem_);                /* Kernel "local memory" node */
     216             : EXPORT_PER_CPU_SYMBOL(_numa_mem_);
     217             : #endif
     218             : 
     219             : static DEFINE_MUTEX(pcpu_drain_mutex);
     220             : 
     221             : #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
     222             : volatile unsigned long latent_entropy __latent_entropy;
     223             : EXPORT_SYMBOL(latent_entropy);
     224             : #endif
     225             : 
     226             : /*
     227             :  * Array of node states.
     228             :  */
     229             : nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
     230             :         [N_POSSIBLE] = NODE_MASK_ALL,
     231             :         [N_ONLINE] = { { [0] = 1UL } },
     232             : #ifndef CONFIG_NUMA
     233             :         [N_NORMAL_MEMORY] = { { [0] = 1UL } },
     234             : #ifdef CONFIG_HIGHMEM
     235             :         [N_HIGH_MEMORY] = { { [0] = 1UL } },
     236             : #endif
     237             :         [N_MEMORY] = { { [0] = 1UL } },
     238             :         [N_CPU] = { { [0] = 1UL } },
     239             : #endif  /* NUMA */
     240             : };
     241             : EXPORT_SYMBOL(node_states);
     242             : 
     243             : atomic_long_t _totalram_pages __read_mostly;
     244             : EXPORT_SYMBOL(_totalram_pages);
     245             : unsigned long totalreserve_pages __read_mostly;
     246             : unsigned long totalcma_pages __read_mostly;
     247             : 
     248             : int percpu_pagelist_high_fraction;
     249             : gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
     250             : DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
     251             : EXPORT_SYMBOL(init_on_alloc);
     252             : 
     253             : DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
     254             : EXPORT_SYMBOL(init_on_free);
     255             : 
     256             : static bool _init_on_alloc_enabled_early __read_mostly
     257             :                                 = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
     258           0 : static int __init early_init_on_alloc(char *buf)
     259             : {
     260             : 
     261           0 :         return kstrtobool(buf, &_init_on_alloc_enabled_early);
     262             : }
     263             : early_param("init_on_alloc", early_init_on_alloc);
     264             : 
     265             : static bool _init_on_free_enabled_early __read_mostly
     266             :                                 = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
     267           0 : static int __init early_init_on_free(char *buf)
     268             : {
     269           0 :         return kstrtobool(buf, &_init_on_free_enabled_early);
     270             : }
     271             : early_param("init_on_free", early_init_on_free);
     272             : 
     273             : /*
     274             :  * A cached value of the page's pageblock's migratetype, used when the page is
     275             :  * put on a pcplist. Used to avoid the pageblock migratetype lookup when
     276             :  * freeing from pcplists in most cases, at the cost of possibly becoming stale.
     277             :  * Also the migratetype set in the page does not necessarily match the pcplist
     278             :  * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
     279             :  * other index - this ensures that it will be put on the correct CMA freelist.
     280             :  */
     281             : static inline int get_pcppage_migratetype(struct page *page)
     282             : {
     283       45484 :         return page->index;
     284             : }
     285             : 
     286             : static inline void set_pcppage_migratetype(struct page *page, int migratetype)
     287             : {
     288       46860 :         page->index = migratetype;
     289             : }
     290             : 
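How the two accessors above are meant to pair up, as a hedged sketch simplified from the pcplist free path later in this file:

        /*
         * Simplified pairing:
         *
         *   - when a page heads for a pcplist, the pageblock lookup happens
         *     once and the result is cached in page->index:
         *
         *         set_pcppage_migratetype(page, get_pfnblock_migratetype(page, pfn));
         *
         *   - when the page is later flushed from the pcplist to the buddy
         *     freelists, the cached value is reused instead of re-reading the
         *     pageblock bitmap:
         *
         *         int mt = get_pcppage_migratetype(page);
         *
         * As the comment above notes, the cached value may be stale by then,
         * which is an accepted trade-off.
         */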
     291             : #ifdef CONFIG_PM_SLEEP
     292             : /*
     293             :  * The following functions are used by the suspend/hibernate code to temporarily
     294             :  * change gfp_allowed_mask in order to avoid using I/O during memory allocations
     295             :  * while devices are suspended.  To avoid races with the suspend/hibernate code,
     296             :  * they should always be called with system_transition_mutex held
     297             :  * (gfp_allowed_mask also should only be modified with system_transition_mutex
     298             :  * held, unless the suspend/hibernate code is guaranteed not to run in parallel
     299             :  * with that modification).
     300             :  */
     301             : 
     302             : static gfp_t saved_gfp_mask;
     303             : 
     304           0 : void pm_restore_gfp_mask(void)
     305             : {
     306           0 :         WARN_ON(!mutex_is_locked(&system_transition_mutex));
     307           0 :         if (saved_gfp_mask) {
     308           0 :                 gfp_allowed_mask = saved_gfp_mask;
     309           0 :                 saved_gfp_mask = 0;
     310             :         }
     311           0 : }
     312             : 
     313           0 : void pm_restrict_gfp_mask(void)
     314             : {
     315           0 :         WARN_ON(!mutex_is_locked(&system_transition_mutex));
     316           0 :         WARN_ON(saved_gfp_mask);
     317           0 :         saved_gfp_mask = gfp_allowed_mask;
     318           0 :         gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
     319           0 : }
     320             : 
     321           0 : bool pm_suspended_storage(void)
     322             : {
     323           0 :         if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
     324             :                 return false;
     325           0 :         return true;
     326             : }
     327             : #endif /* CONFIG_PM_SLEEP */
     328             : 
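A hedged sketch of the calling pattern these helpers expect. The surrounding function and example_do_suspend() are invented for illustration; the real callers are in the suspend/hibernate code, as the comment above says:

        /* Illustrative only - not the actual suspend/hibernate code path. */
        static int example_enter_suspend(void)
        {
                int error;

                mutex_lock(&system_transition_mutex);
                pm_restrict_gfp_mask();          /* mask out __GFP_IO / __GFP_FS */
                error = example_do_suspend();    /* hypothetical device suspend work */
                pm_restore_gfp_mask();           /* put back the saved gfp_allowed_mask */
                mutex_unlock(&system_transition_mutex);

                return error;
        }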
     329             : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
     330             : unsigned int pageblock_order __read_mostly;
     331             : #endif
     332             : 
     333             : static void __free_pages_ok(struct page *page, unsigned int order,
     334             :                             fpi_t fpi_flags);
     335             : 
     336             : /*
     337             :  * results with 256, 32 in the lowmem_reserve sysctl:
     338             :  *      1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
     339             :  *      1G machine -> (16M dma, 784M normal, 224M high)
     340             :  *      NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
     341             :  *      HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
     342             :  *      HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
     343             :  *
     344             :  * TBD: should special case ZONE_DMA32 machines here - in those we normally
     345             :  * don't need any ZONE_NORMAL reservation
     346             :  */
     347             : int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
     348             : #ifdef CONFIG_ZONE_DMA
     349             :         [ZONE_DMA] = 256,
     350             : #endif
     351             : #ifdef CONFIG_ZONE_DMA32
     352             :         [ZONE_DMA32] = 256,
     353             : #endif
     354             :         [ZONE_NORMAL] = 32,
     355             : #ifdef CONFIG_HIGHMEM
     356             :         [ZONE_HIGHMEM] = 0,
     357             : #endif
     358             :         [ZONE_MOVABLE] = 0,
     359             : };
     360             : 
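Reading the comment above together with these ratios, a worked example for the 1G HIGHMEM configuration it describes (the per-zone values are computed at runtime by setup_per_zone_lowmem_reserve(), not shown in this section):

        784M / 256          ≈ 3.1M of ZONE_DMA held back from a NORMAL allocation
        (224M + 784M) / 256 ≈ 3.9M of ZONE_DMA held back from a HIGHMEM allocation
        224M / 32           =  7M  of ZONE_NORMAL held back from a HIGHMEM allocation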
     361             : static char * const zone_names[MAX_NR_ZONES] = {
     362             : #ifdef CONFIG_ZONE_DMA
     363             :          "DMA",
     364             : #endif
     365             : #ifdef CONFIG_ZONE_DMA32
     366             :          "DMA32",
     367             : #endif
     368             :          "Normal",
     369             : #ifdef CONFIG_HIGHMEM
     370             :          "HighMem",
     371             : #endif
     372             :          "Movable",
     373             : #ifdef CONFIG_ZONE_DEVICE
     374             :          "Device",
     375             : #endif
     376             : };
     377             : 
     378             : const char * const migratetype_names[MIGRATE_TYPES] = {
     379             :         "Unmovable",
     380             :         "Movable",
     381             :         "Reclaimable",
     382             :         "HighAtomic",
     383             : #ifdef CONFIG_CMA
     384             :         "CMA",
     385             : #endif
     386             : #ifdef CONFIG_MEMORY_ISOLATION
     387             :         "Isolate",
     388             : #endif
     389             : };
     390             : 
     391             : compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
     392             :         [NULL_COMPOUND_DTOR] = NULL,
     393             :         [COMPOUND_PAGE_DTOR] = free_compound_page,
     394             : #ifdef CONFIG_HUGETLB_PAGE
     395             :         [HUGETLB_PAGE_DTOR] = free_huge_page,
     396             : #endif
     397             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     398             :         [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
     399             : #endif
     400             : };
     401             : 
     402             : int min_free_kbytes = 1024;
     403             : int user_min_free_kbytes = -1;
     404             : int watermark_boost_factor __read_mostly = 15000;
     405             : int watermark_scale_factor = 10;
     406             : 
     407             : static unsigned long nr_kernel_pages __initdata;
     408             : static unsigned long nr_all_pages __initdata;
     409             : static unsigned long dma_reserve __initdata;
     410             : 
     411             : static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
     412             : static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
     413             : static unsigned long required_kernelcore __initdata;
     414             : static unsigned long required_kernelcore_percent __initdata;
     415             : static unsigned long required_movablecore __initdata;
     416             : static unsigned long required_movablecore_percent __initdata;
     417             : static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
     418             : bool mirrored_kernelcore __initdata_memblock;
     419             : 
     420             : /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
     421             : int movable_zone;
     422             : EXPORT_SYMBOL(movable_zone);
     423             : 
     424             : #if MAX_NUMNODES > 1
     425             : unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
     426             : unsigned int nr_online_nodes __read_mostly = 1;
     427             : EXPORT_SYMBOL(nr_node_ids);
     428             : EXPORT_SYMBOL(nr_online_nodes);
     429             : #endif
     430             : 
     431             : int page_group_by_mobility_disabled __read_mostly;
     432             : 
     433             : bool deferred_struct_pages __meminitdata;
     434             : 
     435             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
     436             : /*
     437             :  * During boot we initialize deferred pages on-demand, as needed, but once
     438             :  * page_alloc_init_late() has finished, the deferred pages are all initialized,
     439             :  * and we can permanently disable that path.
     440             :  */
     441             : static DEFINE_STATIC_KEY_TRUE(deferred_pages);
     442             : 
     443             : static inline bool deferred_pages_enabled(void)
     444             : {
     445             :         return static_branch_unlikely(&deferred_pages);
     446             : }
     447             : 
     448             : /* Returns true if the struct page for the pfn is initialised */
     449             : static inline bool __meminit early_page_initialised(unsigned long pfn)
     450             : {
     451             :         int nid = early_pfn_to_nid(pfn);
     452             : 
     453             :         if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
     454             :                 return false;
     455             : 
     456             :         return true;
     457             : }
     458             : 
     459             : /*
     460             :  * Returns true when the remaining initialisation should be deferred until
     461             :  * later in the boot cycle when it can be parallelised.
     462             :  */
     463             : static bool __meminit
     464             : defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
     465             : {
     466             :         static unsigned long prev_end_pfn, nr_initialised;
     467             : 
     468             :         if (early_page_ext_enabled())
     469             :                 return false;
     470             :         /*
      471             :          * prev_end_pfn is a static that holds the end of the previous zone.
      472             :          * No locking is needed because this is called very early in boot, before smp_init().
     473             :          */
     474             :         if (prev_end_pfn != end_pfn) {
     475             :                 prev_end_pfn = end_pfn;
     476             :                 nr_initialised = 0;
     477             :         }
     478             : 
     479             :         /* Always populate low zones for address-constrained allocations */
     480             :         if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
     481             :                 return false;
     482             : 
     483             :         if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
     484             :                 return true;
     485             :         /*
     486             :          * We start only with one section of pages, more pages are added as
     487             :          * needed until the rest of deferred pages are initialized.
     488             :          */
     489             :         nr_initialised++;
     490             :         if ((nr_initialised > PAGES_PER_SECTION) &&
     491             :             (pfn & (PAGES_PER_SECTION - 1)) == 0) {
     492             :                 NODE_DATA(nid)->first_deferred_pfn = pfn;
     493             :                 return true;
     494             :         }
     495             :         return false;
     496             : }
     497             : #else
     498             : static inline bool deferred_pages_enabled(void)
     499             : {
     500             :         return false;
     501             : }
     502             : 
     503             : static inline bool early_page_initialised(unsigned long pfn)
     504             : {
     505             :         return true;
     506             : }
     507             : 
     508             : static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
     509             : {
     510             :         return false;
     511             : }
     512             : #endif
     513             : 
     514             : /* Return a pointer to the bitmap storing bits affecting a block of pages */
     515             : static inline unsigned long *get_pageblock_bitmap(const struct page *page,
     516             :                                                         unsigned long pfn)
     517             : {
     518             : #ifdef CONFIG_SPARSEMEM
     519             :         return section_to_usemap(__pfn_to_section(pfn));
     520             : #else
     521       45070 :         return page_zone(page)->pageblock_flags;
     522             : #endif /* CONFIG_SPARSEMEM */
     523             : }
     524             : 
     525             : static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
     526             : {
     527             : #ifdef CONFIG_SPARSEMEM
     528             :         pfn &= (PAGES_PER_SECTION-1);
     529             : #else
     530       45070 :         pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
     531             : #endif /* CONFIG_SPARSEMEM */
     532       45070 :         return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
     533             : }
     534             : 
     535             : static __always_inline
     536             : unsigned long __get_pfnblock_flags_mask(const struct page *page,
     537             :                                         unsigned long pfn,
     538             :                                         unsigned long mask)
     539             : {
     540             :         unsigned long *bitmap;
     541             :         unsigned long bitidx, word_bitidx;
     542             :         unsigned long word;
     543             : 
     544       89604 :         bitmap = get_pageblock_bitmap(page, pfn);
     545       44802 :         bitidx = pfn_to_bitidx(page, pfn);
     546       44802 :         word_bitidx = bitidx / BITS_PER_LONG;
     547       44802 :         bitidx &= (BITS_PER_LONG-1);
     548             :         /*
     549             :          * This races, without locks, with set_pfnblock_flags_mask(). Ensure
     550             :          * a consistent read of the memory array, so that results, even though
     551             :          * racy, are not corrupted.
     552             :          */
     553       44802 :         word = READ_ONCE(bitmap[word_bitidx]);
     554       44802 :         return (word >> bitidx) & mask;
     555             : }
     556             : 
     557             : /**
     558             :  * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
     559             :  * @page: The page within the block of interest
     560             :  * @pfn: The target page frame number
     561             :  * @mask: mask of bits that the caller is interested in
     562             :  *
     563             :  * Return: pageblock_bits flags
     564             :  */
     565           0 : unsigned long get_pfnblock_flags_mask(const struct page *page,
     566             :                                         unsigned long pfn, unsigned long mask)
     567             : {
     568           4 :         return __get_pfnblock_flags_mask(page, pfn, mask);
     569             : }
     570             : 
     571             : static __always_inline int get_pfnblock_migratetype(const struct page *page,
     572             :                                         unsigned long pfn)
     573             : {
     574       44798 :         return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
     575             : }
     576             : 
     577             : /**
     578             :  * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
     579             :  * @page: The page within the block of interest
     580             :  * @flags: The flags to set
     581             :  * @pfn: The target page frame number
     582             :  * @mask: mask of bits that the caller is interested in
     583             :  */
     584         268 : void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
     585             :                                         unsigned long pfn,
     586             :                                         unsigned long mask)
     587             : {
     588             :         unsigned long *bitmap;
     589             :         unsigned long bitidx, word_bitidx;
     590             :         unsigned long word;
     591             : 
     592             :         BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
     593             :         BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
     594             : 
     595         536 :         bitmap = get_pageblock_bitmap(page, pfn);
     596         268 :         bitidx = pfn_to_bitidx(page, pfn);
     597         268 :         word_bitidx = bitidx / BITS_PER_LONG;
     598         268 :         bitidx &= (BITS_PER_LONG-1);
     599             : 
     600             :         VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
     601             : 
     602         268 :         mask <<= bitidx;
     603         268 :         flags <<= bitidx;
     604             : 
     605         268 :         word = READ_ONCE(bitmap[word_bitidx]);
     606             :         do {
     607         804 :         } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
     608         268 : }
     609             : 
     610         268 : void set_pageblock_migratetype(struct page *page, int migratetype)
     611             : {
     612         268 :         if (unlikely(page_group_by_mobility_disabled &&
     613             :                      migratetype < MIGRATE_PCPTYPES))
     614           0 :                 migratetype = MIGRATE_UNMOVABLE;
     615             : 
     616         268 :         set_pfnblock_flags_mask(page, (unsigned long)migratetype,
     617         268 :                                 page_to_pfn(page), MIGRATETYPE_MASK);
     618         268 : }
     619             : 
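A worked example of the bit arithmetic shared by the accessors above. The concrete values assume pageblock_order = 9, 64-bit longs, and the flatmem branch of pfn_to_bitidx(); NR_PAGEBLOCK_BITS = 4 is guaranteed by the BUILD_BUG_ON above:

        /*
         * pfn = 0x12345, zone_start_pfn = 0x10000 (already pageblock-aligned)
         *
         *   pfn - pageblock_start_pfn(zone_start_pfn) = 0x2345
         *   bitidx      = (0x2345 >> 9) * 4 = 17 * 4 = 68
         *   word_bitidx = 68 / 64           = 1
         *   bitidx     &= 63                -> 4
         *
         * so this pageblock's four flag bits sit at bits 4..7 of bitmap[1],
         * and a single READ_ONCE()/try_cmpxchg() on that word reads or
         * updates them.
         */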
     620             : #ifdef CONFIG_DEBUG_VM
     621             : static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
     622             : {
     623             :         int ret = 0;
     624             :         unsigned seq;
     625             :         unsigned long pfn = page_to_pfn(page);
     626             :         unsigned long sp, start_pfn;
     627             : 
     628             :         do {
     629             :                 seq = zone_span_seqbegin(zone);
     630             :                 start_pfn = zone->zone_start_pfn;
     631             :                 sp = zone->spanned_pages;
     632             :                 if (!zone_spans_pfn(zone, pfn))
     633             :                         ret = 1;
     634             :         } while (zone_span_seqretry(zone, seq));
     635             : 
     636             :         if (ret)
     637             :                 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
     638             :                         pfn, zone_to_nid(zone), zone->name,
     639             :                         start_pfn, start_pfn + sp);
     640             : 
     641             :         return ret;
     642             : }
     643             : 
     644             : static int page_is_consistent(struct zone *zone, struct page *page)
     645             : {
     646             :         if (zone != page_zone(page))
     647             :                 return 0;
     648             : 
     649             :         return 1;
     650             : }
     651             : /*
     652             :  * Temporary debugging check for pages not lying within a given zone.
     653             :  */
     654             : static int __maybe_unused bad_range(struct zone *zone, struct page *page)
     655             : {
     656             :         if (page_outside_zone_boundaries(zone, page))
     657             :                 return 1;
     658             :         if (!page_is_consistent(zone, page))
     659             :                 return 1;
     660             : 
     661             :         return 0;
     662             : }
     663             : #else
     664             : static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
     665             : {
     666             :         return 0;
     667             : }
     668             : #endif
     669             : 
     670           0 : static void bad_page(struct page *page, const char *reason)
     671             : {
     672             :         static unsigned long resume;
     673             :         static unsigned long nr_shown;
     674             :         static unsigned long nr_unshown;
     675             : 
     676             :         /*
     677             :          * Allow a burst of 60 reports, then keep quiet for that minute;
     678             :          * or allow a steady drip of one report per second.
     679             :          */
     680           0 :         if (nr_shown == 60) {
     681           0 :                 if (time_before(jiffies, resume)) {
     682           0 :                         nr_unshown++;
     683           0 :                         goto out;
     684             :                 }
     685           0 :                 if (nr_unshown) {
     686           0 :                         pr_alert(
     687             :                               "BUG: Bad page state: %lu messages suppressed\n",
     688             :                                 nr_unshown);
     689           0 :                         nr_unshown = 0;
     690             :                 }
     691           0 :                 nr_shown = 0;
     692             :         }
     693           0 :         if (nr_shown++ == 0)
     694           0 :                 resume = jiffies + 60 * HZ;
     695             : 
     696           0 :         pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
     697             :                 current->comm, page_to_pfn(page));
     698           0 :         dump_page(page, reason);
     699             : 
     700             :         print_modules();
     701           0 :         dump_stack();
     702             : out:
     703             :         /* Leave bad fields for debug, except PageBuddy could make trouble */
     704           0 :         page_mapcount_reset(page); /* remove PageBuddy */
     705           0 :         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
     706           0 : }
     707             : 
     708             : static inline unsigned int order_to_pindex(int migratetype, int order)
     709             : {
     710       47659 :         int base = order;
     711             : 
     712             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     713             :         if (order > PAGE_ALLOC_COSTLY_ORDER) {
     714             :                 VM_BUG_ON(order != pageblock_order);
     715             :                 return NR_LOWORDER_PCP_LISTS;
     716             :         }
     717             : #else
     718             :         VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
     719             : #endif
     720             : 
     721       47659 :         return (MIGRATE_PCPTYPES * base) + migratetype;
     722             : }
     723             : 
     724             : static inline int pindex_to_order(unsigned int pindex)
     725             : {
     726           4 :         int order = pindex / MIGRATE_PCPTYPES;
     727             : 
     728             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     729             :         if (pindex == NR_LOWORDER_PCP_LISTS)
     730             :                 order = pageblock_order;
     731             : #else
     732             :         VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
     733             : #endif
     734             : 
     735             :         return order;
     736             : }
     737             : 
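A worked example of the order/index mapping implemented by order_to_pindex() and pindex_to_order() above, assuming the usual values MIGRATE_PCPTYPES = 3, MIGRATE_MOVABLE = 1 and PAGE_ALLOC_COSTLY_ORDER = 3, and ignoring the THP special case:

        /*
         *   order_to_pindex(MIGRATE_MOVABLE, 2) = 3 * 2 + 1 = 7
         *   pindex_to_order(7)                  = 7 / 3     = 2
         *
         * Each order owns MIGRATE_PCPTYPES consecutive pcp lists, one per pcp
         * migratetype; the division recovers the order and discards the
         * migratetype, which pindex_to_order() does not need.
         */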
     738             : static inline bool pcp_allowed_order(unsigned int order)
     739             : {
     740       47063 :         if (order <= PAGE_ALLOC_COSTLY_ORDER)
     741             :                 return true;
     742             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     743             :         if (order == pageblock_order)
     744             :                 return true;
     745             : #endif
     746             :         return false;
     747             : }
     748             : 
     749       44539 : static inline void free_the_page(struct page *page, unsigned int order)
     750             : {
     751       44539 :         if (pcp_allowed_order(order))           /* Via pcp? */
     752       44539 :                 free_unref_page(page, order);
     753             :         else
     754           0 :                 __free_pages_ok(page, order, FPI_NONE);
     755       44539 : }
     756             : 
     757             : /*
     758             :  * Higher-order pages are called "compound pages".  They are structured thusly:
     759             :  *
      760             :  * The first PAGE_SIZE page is called the "head page" and has PG_head set.
     761             :  *
     762             :  * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
      763             :  * in bit 0 of page->compound_head. The rest of the bits point to the head page.
     764             :  *
     765             :  * The first tail page's ->compound_dtor holds the offset in array of compound
     766             :  * page destructors. See compound_page_dtors.
     767             :  *
     768             :  * The first tail page's ->compound_order holds the order of allocation.
     769             :  * This usage means that zero-order pages may not be compound.
     770             :  */
     771             : 
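A hedged illustration of the layout described above, for an order-2 compound page (four struct pages):

        /*
         *   page[0]: PG_head set                              ("head page")
         *   page[1]: compound_head = &page[0] with bit 0 set,
         *            ->compound_dtor and ->compound_order (= 2) live here
         *   page[2]: compound_head = &page[0] with bit 0 set
         *   page[3]: compound_head = &page[0] with bit 0 set
         *
         * Bit 0 of ->compound_head is what PageTail() tests; the remaining
         * bits point back to the head page.
         */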
     772           0 : void free_compound_page(struct page *page)
     773             : {
     774           0 :         mem_cgroup_uncharge(page_folio(page));
     775           0 :         free_the_page(page, compound_order(page));
     776           0 : }
     777             : 
     778             : static void prep_compound_head(struct page *page, unsigned int order)
     779             : {
     780         102 :         struct folio *folio = (struct folio *)page;
     781             : 
     782         102 :         set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
     783         102 :         set_compound_order(page, order);
     784         204 :         atomic_set(&folio->_entire_mapcount, -1);
     785         204 :         atomic_set(&folio->_nr_pages_mapped, 0);
     786         204 :         atomic_set(&folio->_pincount, 0);
     787             : }
     788             : 
     789             : static void prep_compound_tail(struct page *head, int tail_idx)
     790             : {
     791         184 :         struct page *p = head + tail_idx;
     792             : 
     793         184 :         p->mapping = TAIL_MAPPING;
     794         184 :         set_compound_head(p, head);
     795         368 :         set_page_private(p, 0);
     796             : }
     797             : 
     798           0 : void prep_compound_page(struct page *page, unsigned int order)
     799             : {
     800             :         int i;
     801         102 :         int nr_pages = 1 << order;
     802             : 
     803         102 :         __SetPageHead(page);
     804         286 :         for (i = 1; i < nr_pages; i++)
     805         184 :                 prep_compound_tail(page, i);
     806             : 
     807         102 :         prep_compound_head(page, order);
     808           0 : }
     809             : 
     810           0 : void destroy_large_folio(struct folio *folio)
     811             : {
     812           0 :         enum compound_dtor_id dtor = folio->_folio_dtor;
     813             : 
     814             :         VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
     815           0 :         compound_page_dtors[dtor](&folio->page);
     816           0 : }
     817             : 
     818             : #ifdef CONFIG_DEBUG_PAGEALLOC
     819             : unsigned int _debug_guardpage_minorder;
     820             : 
     821             : bool _debug_pagealloc_enabled_early __read_mostly
     822             :                         = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
     823             : EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
     824             : DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
     825             : EXPORT_SYMBOL(_debug_pagealloc_enabled);
     826             : 
     827             : DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
     828             : 
     829             : static int __init early_debug_pagealloc(char *buf)
     830             : {
     831             :         return kstrtobool(buf, &_debug_pagealloc_enabled_early);
     832             : }
     833             : early_param("debug_pagealloc", early_debug_pagealloc);
     834             : 
     835             : static int __init debug_guardpage_minorder_setup(char *buf)
     836             : {
     837             :         unsigned long res;
     838             : 
     839             :         if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
     840             :                 pr_err("Bad debug_guardpage_minorder value\n");
     841             :                 return 0;
     842             :         }
     843             :         _debug_guardpage_minorder = res;
     844             :         pr_info("Setting debug_guardpage_minorder to %lu\n", res);
     845             :         return 0;
     846             : }
     847             : early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
     848             : 
     849             : static inline bool set_page_guard(struct zone *zone, struct page *page,
     850             :                                 unsigned int order, int migratetype)
     851             : {
     852             :         if (!debug_guardpage_enabled())
     853             :                 return false;
     854             : 
     855             :         if (order >= debug_guardpage_minorder())
     856             :                 return false;
     857             : 
     858             :         __SetPageGuard(page);
     859             :         INIT_LIST_HEAD(&page->buddy_list);
     860             :         set_page_private(page, order);
     861             :         /* Guard pages are not available for any usage */
     862             :         if (!is_migrate_isolate(migratetype))
     863             :                 __mod_zone_freepage_state(zone, -(1 << order), migratetype);
     864             : 
     865             :         return true;
     866             : }
     867             : 
     868             : static inline void clear_page_guard(struct zone *zone, struct page *page,
     869             :                                 unsigned int order, int migratetype)
     870             : {
     871             :         if (!debug_guardpage_enabled())
     872             :                 return;
     873             : 
     874             :         __ClearPageGuard(page);
     875             : 
     876             :         set_page_private(page, 0);
     877             :         if (!is_migrate_isolate(migratetype))
     878             :                 __mod_zone_freepage_state(zone, (1 << order), migratetype);
     879             : }
     880             : #else
     881             : static inline bool set_page_guard(struct zone *zone, struct page *page,
     882             :                         unsigned int order, int migratetype) { return false; }
     883             : static inline void clear_page_guard(struct zone *zone, struct page *page,
     884             :                                 unsigned int order, int migratetype) {}
     885             : #endif
     886             : 
     887             : /*
     888             :  * Enable static keys related to various memory debugging and hardening options.
     889             :  * Some override others, and depend on early params that are evaluated in the
     890             :  * order of appearance. So we need to first gather the full picture of what was
     891             :  * enabled, and then make decisions.
     892             :  */
     893           1 : void __init init_mem_debugging_and_hardening(void)
     894             : {
     895           1 :         bool page_poisoning_requested = false;
     896             : 
     897             : #ifdef CONFIG_PAGE_POISONING
     898             :         /*
     899             :          * Page poisoning is debug page alloc for some arches. If
     900             :          * either of those options are enabled, enable poisoning.
     901             :          */
     902             :         if (page_poisoning_enabled() ||
     903             :              (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
     904             :               debug_pagealloc_enabled())) {
     905             :                 static_branch_enable(&_page_poisoning_enabled);
     906             :                 page_poisoning_requested = true;
     907             :         }
     908             : #endif
     909             : 
     910           1 :         if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
     911             :             page_poisoning_requested) {
     912             :                 pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
     913             :                         "will take precedence over init_on_alloc and init_on_free\n");
     914             :                 _init_on_alloc_enabled_early = false;
     915             :                 _init_on_free_enabled_early = false;
     916             :         }
     917             : 
     918           1 :         if (_init_on_alloc_enabled_early)
     919           0 :                 static_branch_enable(&init_on_alloc);
     920             :         else
     921           1 :                 static_branch_disable(&init_on_alloc);
     922             : 
     923           1 :         if (_init_on_free_enabled_early)
     924           0 :                 static_branch_enable(&init_on_free);
     925             :         else
     926           1 :                 static_branch_disable(&init_on_free);
     927             : 
     928             :         if (IS_ENABLED(CONFIG_KMSAN) &&
     929             :             (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
     930             :                 pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
     931             : 
     932             : #ifdef CONFIG_DEBUG_PAGEALLOC
     933             :         if (!debug_pagealloc_enabled())
     934             :                 return;
     935             : 
     936             :         static_branch_enable(&_debug_pagealloc_enabled);
     937             : 
     938             :         if (!debug_guardpage_minorder())
     939             :                 return;
     940             : 
     941             :         static_branch_enable(&_debug_guardpage_enabled);
     942             : #endif
     943           1 : }
     944             : 
     945             : static inline void set_buddy_order(struct page *page, unsigned int order)
     946             : {
     947        7060 :         set_page_private(page, order);
     948        3530 :         __SetPageBuddy(page);
     949             : }
     950             : 
     951             : #ifdef CONFIG_COMPACTION
     952        1204 : static inline struct capture_control *task_capc(struct zone *zone)
     953             : {
     954        1204 :         struct capture_control *capc = current->capture_control;
     955             : 
     956        1204 :         return unlikely(capc) &&
     957           0 :                 !(current->flags & PF_KTHREAD) &&
     958           0 :                 !capc->page &&
     959        2408 :                 capc->cc->zone == zone ? capc : NULL;
     960             : }
     961             : 
     962             : static inline bool
     963             : compaction_capture(struct capture_control *capc, struct page *page,
     964             :                    int order, int migratetype)
     965             : {
     966        1860 :         if (!capc || order != capc->cc->order)
     967             :                 return false;
     968             : 
      969             :         /* Do not accidentally pollute CMA or isolated regions */
     970             :         if (is_migrate_cma(migratetype) ||
     971           0 :             is_migrate_isolate(migratetype))
     972             :                 return false;
     973             : 
     974             :         /*
     975             :          * Do not let lower order allocations pollute a movable pageblock.
     976             :          * This might let an unmovable request use a reclaimable pageblock
     977             :          * and vice-versa but no more than normal fallback logic which can
     978             :          * have trouble finding a high-order free page.
     979             :          */
     980           0 :         if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
     981             :                 return false;
     982             : 
     983           0 :         capc->page = page;
     984             :         return true;
     985             : }
     986             : 
     987             : #else
     988             : static inline struct capture_control *task_capc(struct zone *zone)
     989             : {
     990             :         return NULL;
     991             : }
     992             : 
     993             : static inline bool
     994             : compaction_capture(struct capture_control *capc, struct page *page,
     995             :                    int order, int migratetype)
     996             : {
     997             :         return false;
     998             : }
     999             : #endif /* CONFIG_COMPACTION */
    1000             : 
    1001             : /* Used for pages not on another list */
    1002             : static inline void add_to_free_list(struct page *page, struct zone *zone,
    1003             :                                     unsigned int order, int migratetype)
    1004             : {
    1005        2858 :         struct free_area *area = &zone->free_area[order];
    1006             : 
    1007        5716 :         list_add(&page->buddy_list, &area->free_list[migratetype]);
    1008        2858 :         area->nr_free++;
    1009             : }
    1010             : 
    1011             : /* Used for pages not on another list */
    1012             : static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
    1013             :                                          unsigned int order, int migratetype)
    1014             : {
    1015         672 :         struct free_area *area = &zone->free_area[order];
    1016             : 
    1017        1344 :         list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
    1018         672 :         area->nr_free++;
    1019             : }
    1020             : 
    1021             : /*
    1022             :  * Used for pages which are on another list. Move the pages to the tail
    1023             :  * of the list - so the moved pages won't immediately be considered for
    1024             :  * allocation again (e.g., optimization for memory onlining).
    1025             :  */
    1026             : static inline void move_to_free_list(struct page *page, struct zone *zone,
    1027             :                                      unsigned int order, int migratetype)
    1028             : {
    1029           4 :         struct free_area *area = &zone->free_area[order];
    1030             : 
    1031           8 :         list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
    1032             : }
    1033             : 
    1034             : static inline void del_page_from_free_list(struct page *page, struct zone *zone,
    1035             :                                            unsigned int order)
    1036             : {
    1037             :         /* clear reported state and update reported page count */
    1038             :         if (page_reported(page))
    1039             :                 __ClearPageReported(page);
    1040             : 
    1041        6448 :         list_del(&page->buddy_list);
    1042        3224 :         __ClearPageBuddy(page);
    1043        6448 :         set_page_private(page, 0);
    1044        3224 :         zone->free_area[order].nr_free--;
    1045             : }
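
The free-list helpers above only differ in where the block lands on the per-order list and how nr_free is adjusted. Below is a self-contained userspace model of one free_area bucket using the same circular doubly-linked-list idiom; struct fake_page, struct fake_free_area and the list_* helpers are illustrative, not the kernel's implementations.

#include <stdio.h>

struct list_head { struct list_head *prev, *next; };
struct fake_page { struct list_head buddy_list; unsigned long pfn; };
struct fake_free_area { struct list_head free_list; unsigned long nr_free; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_head(struct list_head *n, struct list_head *h)
{
        n->next = h->next;
        n->prev = h;
        h->next->prev = n;
        h->next = n;
}

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

static void list_del_entry(struct list_head *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

int main(void)
{
        struct fake_free_area area;
        struct fake_page a = { .pfn = 0x100 }, b = { .pfn = 0x200 };

        list_init(&area.free_list);
        area.nr_free = 0;

        /* add_to_free_list(): head of the list, likely to be reused soon */
        list_add_head(&a.buddy_list, &area.free_list);
        area.nr_free++;

        /* add_to_free_list_tail(): tail of the list, kept cold */
        list_add_tail(&b.buddy_list, &area.free_list);
        area.nr_free++;

        /* del_page_from_free_list(): unlink and drop the counter */
        list_del_entry(&a.buddy_list);
        area.nr_free--;

        printf("nr_free = %lu\n", area.nr_free);        /* prints 1 */
        return 0;
}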
    1046             : 
    1047             : /*
    1048             :  * If this is not the largest possible page, check if the buddy
    1049             :  * of the next-highest order is free. If it is, it's possible
    1050             :  * that pages are being freed that will coalesce soon. If that is
    1051             :  * happening, add the free page to the tail of the list so it's
    1052             :  * less likely to be used soon and more likely to be merged as a
    1053             :  * higher-order page.
    1054             :  */
    1055             : static inline bool
    1056         945 : buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
    1057             :                    struct page *page, unsigned int order)
    1058             : {
    1059             :         unsigned long higher_page_pfn;
    1060             :         struct page *higher_page;
    1061             : 
    1062         945 :         if (order >= MAX_ORDER - 2)
    1063             :                 return false;
    1064             : 
    1065         945 :         higher_page_pfn = buddy_pfn & pfn;
    1066         945 :         higher_page = page + (higher_page_pfn - pfn);
    1067             : 
    1068        1890 :         return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
    1069         945 :                         NULL) != NULL;
    1070             : }
    1071             : 
    1072             : /*
    1073             :  * Freeing function for a buddy system allocator.
    1074             :  *
    1075             :  * The concept of a buddy system is to maintain a direct-mapped table
    1076             :  * (containing bit values) for memory blocks of various "orders".
    1077             :  * The bottom level table contains the map for the smallest allocatable
    1078             :  * units of memory (here, pages), and each level above it describes
    1079             :  * pairs of units from the levels below, hence, "buddies".
    1080             :  * At a high level, all that happens here is marking the table entry
    1081             :  * at the bottom level available, and propagating the changes upward
    1082             :  * as necessary, plus some accounting needed to play nicely with other
    1083             :  * parts of the VM system.
    1084             :  * At each level, we keep a list of pages, which are heads of contiguous
    1085             :  * runs of free pages of length (1 << order) and marked with PageBuddy.
    1086             :  * A page's order is recorded in the page_private(page) field.
    1087             :  * So when we are allocating or freeing one, we can derive the state of the
    1088             :  * other.  That is, if we allocate a small block and both buddies were
    1089             :  * free, the remainder of the region must be split into blocks.
    1090             :  * If a block is freed, and its buddy is also free, then this
    1091             :  * triggers coalescing into a block of larger size.
    1092             :  *
    1093             :  * -- nyc
    1094             :  */
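
A concrete way to see the arithmetic behind this: the buddy of a 2^order block starting at pfn is obtained by flipping bit 'order' of the pfn (the relation used by find_buddy_page_pfn()), and a merged pair starts at pfn & buddy_pfn. The standalone sketch below walks two merge steps with made-up pfns; it is an illustration, not kernel code.

#include <stdio.h>

/* The buddy of a 2^order block at pfn differs from it only in bit 'order'. */
static unsigned long buddy_of(unsigned long pfn, unsigned int order)
{
        return pfn ^ (1UL << order);
}

int main(void)
{
        unsigned long pfn = 0x1234;     /* start of a free order-2 block */
        unsigned int order;

        /* Pretend every buddy we look at is also free and keep merging
         * upward, like the while () loop in __free_one_page() below. */
        for (order = 2; order < 4; order++) {
                unsigned long buddy = buddy_of(pfn, order);
                unsigned long combined = pfn & buddy;

                printf("order %u: pfn 0x%lx + buddy 0x%lx -> order %u block at 0x%lx\n",
                       order, pfn, buddy, order + 1, combined);
                pfn = combined;
        }
        return 0;
}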
    1095             : 
    1096        1204 : static inline void __free_one_page(struct page *page,
    1097             :                 unsigned long pfn,
    1098             :                 struct zone *zone, unsigned int order,
    1099             :                 int migratetype, fpi_t fpi_flags)
    1100             : {
    1101        1204 :         struct capture_control *capc = task_capc(zone);
    1102        1204 :         unsigned long buddy_pfn = 0;
    1103             :         unsigned long combined_pfn;
    1104             :         struct page *buddy;
    1105             :         bool to_tail;
    1106             : 
    1107             :         VM_BUG_ON(!zone_is_initialized(zone));
    1108             :         VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
    1109             : 
    1110             :         VM_BUG_ON(migratetype == -1);
    1111        1204 :         if (likely(!is_migrate_isolate(migratetype)))
    1112        1204 :                 __mod_zone_freepage_state(zone, 1 << order, migratetype);
    1113             : 
    1114             :         VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
    1115             :         VM_BUG_ON_PAGE(bad_range(zone, page), page);
    1116             : 
    1117        2107 :         while (order < MAX_ORDER - 1) {
    1118        3720 :                 if (compaction_capture(capc, page, order, migratetype)) {
    1119           0 :                         __mod_zone_freepage_state(zone, -(1 << order),
    1120             :                                                                 migratetype);
    1121           0 :                         return;
    1122             :                 }
    1123             : 
    1124        1860 :                 buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
    1125        1860 :                 if (!buddy)
    1126             :                         goto done_merging;
    1127             : 
    1128             :                 if (unlikely(order >= pageblock_order)) {
    1129             :                         /*
    1130             :                          * We want to prevent merging between free pages on a
    1131             :                          * pageblock without fallbacks and a normal pageblock.
    1132             :                          * Without this, pageblock isolation could cause incorrect
    1133             :                          * freepage or CMA accounting or HIGHATOMIC accounting.
    1134             :                          */
    1135             :                         int buddy_mt = get_pageblock_migratetype(buddy);
    1136             : 
    1137             :                         if (migratetype != buddy_mt
    1138             :                                         && (!migratetype_is_mergeable(migratetype) ||
    1139             :                                                 !migratetype_is_mergeable(buddy_mt)))
    1140             :                                 goto done_merging;
    1141             :                 }
    1142             : 
    1143             :                 /*
    1144             :                  * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard page,
    1145             :                  * merge with it and move up one order.
    1146             :                  */
    1147             :                 if (page_is_guard(buddy))
    1148             :                         clear_page_guard(zone, buddy, order, migratetype);
    1149             :                 else
    1150             :                         del_page_from_free_list(buddy, zone, order);
    1151         903 :                 combined_pfn = buddy_pfn & pfn;
    1152         903 :                 page = page + (combined_pfn - pfn);
    1153         903 :                 pfn = combined_pfn;
    1154         903 :                 order++;
    1155             :         }
    1156             : 
    1157             : done_merging:
    1158        1204 :         set_buddy_order(page, order);
    1159             : 
    1160        1204 :         if (fpi_flags & FPI_TO_TAIL)
    1161             :                 to_tail = true;
    1162         945 :         else if (is_shuffle_order(order))
    1163             :                 to_tail = shuffle_pick_tail();
    1164             :         else
    1165         945 :                 to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
    1166             : 
    1167        1204 :         if (to_tail)
    1168             :                 add_to_free_list_tail(page, zone, order, migratetype);
    1169             :         else
    1170             :                 add_to_free_list(page, zone, order, migratetype);
    1171             : 
    1172             :         /* Notify page reporting subsystem of freed page */
    1173             :         if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
    1174             :                 page_reporting_notify_free(order);
    1175             : }
    1176             : 
    1177             : /**
    1178             :  * split_free_page() -- split a free page at split_pfn_offset
    1179             :  * @free_page:          the original free page
    1180             :  * @order:              the order of the page
    1181             :  * @split_pfn_offset:   split offset within the page
    1182             :  *
    1183             :  * Return -ENOENT if the free page has changed, otherwise 0.
    1184             :  *
    1185             :  * It is used when the free page crosses two pageblocks with different migratetypes
    1186             :  * at split_pfn_offset within the page. The split free page will be put into
    1187             :  * separate migratetype lists afterwards. Otherwise, the function achieves
    1188             :  * nothing.
    1189             :  */
    1190           0 : int split_free_page(struct page *free_page,
    1191             :                         unsigned int order, unsigned long split_pfn_offset)
    1192             : {
    1193           0 :         struct zone *zone = page_zone(free_page);
    1194           0 :         unsigned long free_page_pfn = page_to_pfn(free_page);
    1195             :         unsigned long pfn;
    1196             :         unsigned long flags;
    1197             :         int free_page_order;
    1198             :         int mt;
    1199           0 :         int ret = 0;
    1200             : 
    1201           0 :         if (split_pfn_offset == 0)
    1202             :                 return ret;
    1203             : 
    1204           0 :         spin_lock_irqsave(&zone->lock, flags);
    1205             : 
    1206           0 :         if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
    1207             :                 ret = -ENOENT;
    1208             :                 goto out;
    1209             :         }
    1210             : 
    1211           0 :         mt = get_pageblock_migratetype(free_page);
    1212           0 :         if (likely(!is_migrate_isolate(mt)))
    1213           0 :                 __mod_zone_freepage_state(zone, -(1UL << order), mt);
    1214             : 
    1215           0 :         del_page_from_free_list(free_page, zone, order);
    1216           0 :         for (pfn = free_page_pfn;
    1217           0 :              pfn < free_page_pfn + (1UL << order);) {
    1218           0 :                 int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
    1219             : 
    1220           0 :                 free_page_order = min_t(unsigned int,
    1221             :                                         pfn ? __ffs(pfn) : order,
    1222             :                                         __fls(split_pfn_offset));
    1223           0 :                 __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
    1224             :                                 mt, FPI_NONE);
    1225           0 :                 pfn += 1UL << free_page_order;
    1226           0 :                 split_pfn_offset -= (1UL << free_page_order);
    1227             :                 /* we have done the first part, now switch to second part */
    1228           0 :                 if (split_pfn_offset == 0)
    1229           0 :                         split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
    1230             :         }
    1231             : out:
    1232           0 :         spin_unlock_irqrestore(&zone->lock, flags);
    1233           0 :         return ret;
    1234             : }
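
The loop above carves the original high-order page into the largest chunks that are both naturally aligned (__ffs(pfn)) and do not cross the split point (__fls(split_pfn_offset)). The simulation below reproduces that chunking in userspace for an order-9 page split at offset 48; the base pfn and offsets are made up, and my_ffs()/my_fls() are local stand-ins for the kernel helpers, built on GCC/Clang builtins.

#include <stdio.h>

static unsigned int my_ffs(unsigned long x)     /* index of lowest set bit */
{
        return (unsigned int)__builtin_ctzl(x);
}

static unsigned int my_fls(unsigned long x)     /* index of highest set bit */
{
        return (unsigned int)(8 * sizeof(long) - 1 - __builtin_clzl(x));
}

int main(void)
{
        unsigned long base = 0x10000;           /* pfn of an order-9 free page */
        unsigned int order = 9;
        unsigned long split = 48;               /* split_pfn_offset */
        unsigned long pfn;

        for (pfn = base; pfn < base + (1UL << order); ) {
                unsigned int align = pfn ? my_ffs(pfn) : order;
                unsigned int chunk = my_fls(split);

                if (align < chunk)
                        chunk = align;
                printf("free order-%u chunk at pfn 0x%lx\n", chunk, pfn);

                pfn += 1UL << chunk;
                split -= 1UL << chunk;
                if (split == 0)                 /* switch to the second part */
                        split = (1UL << order) - (pfn - base);
        }
        return 0;
}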
    1235             : /*
    1236             :  * A bad page could be due to any of a number of fields. Instead of multiple
    1237             :  * branches, try to check multiple fields with one check. The caller must do
    1238             :  * a detailed check if necessary.
    1239             :  */
    1240             : static inline bool page_expected_state(struct page *page,
    1241             :                                         unsigned long check_flags)
    1242             : {
    1243      516096 :         if (unlikely(atomic_read(&page->_mapcount) != -1))
    1244             :                 return false;
    1245             : 
    1246      516096 :         if (unlikely((unsigned long)page->mapping |
    1247             :                         page_ref_count(page) |
    1248             : #ifdef CONFIG_MEMCG
    1249             :                         page->memcg_data |
    1250             : #endif
    1251             :                         (page->flags & check_flags)))
    1252             :                 return false;
    1253             : 
    1254             :         return true;
    1255             : }
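
The trick above is to OR every field that must be zero into one value so the common "page is fine" path costs a single test. A minimal userspace version of the same idea, with a hypothetical struct fake_page standing in for struct page:

#include <stdbool.h>
#include <stdio.h>

struct fake_page {
        unsigned long mapping;          /* must be 0 (NULL) */
        unsigned long refcount;         /* must be 0        */
        unsigned long memcg_data;       /* must be 0        */
        unsigned long flags;            /* checked via mask */
};

static bool looks_free(const struct fake_page *p, unsigned long check_flags)
{
        /* One branch instead of four: any nonzero field makes the OR nonzero. */
        return (p->mapping | p->refcount | p->memcg_data |
                (p->flags & check_flags)) == 0;
}

int main(void)
{
        struct fake_page ok = { 0 };
        struct fake_page bad = { .refcount = 1 };

        printf("%d %d\n", looks_free(&ok, ~0UL), looks_free(&bad, ~0UL)); /* 1 0 */
        return 0;
}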
    1256             : 
    1257             : static const char *page_bad_reason(struct page *page, unsigned long flags)
    1258             : {
    1259           0 :         const char *bad_reason = NULL;
    1260             : 
    1261           0 :         if (unlikely(atomic_read(&page->_mapcount) != -1))
    1262           0 :                 bad_reason = "nonzero mapcount";
    1263           0 :         if (unlikely(page->mapping != NULL))
    1264           0 :                 bad_reason = "non-NULL mapping";
    1265           0 :         if (unlikely(page_ref_count(page) != 0))
    1266           0 :                 bad_reason = "nonzero _refcount";
    1267           0 :         if (unlikely(page->flags & flags)) {
    1268             :                 if (flags == PAGE_FLAGS_CHECK_AT_PREP)
    1269             :                         bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
    1270             :                 else
    1271           0 :                         bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
    1272             :         }
    1273             : #ifdef CONFIG_MEMCG
    1274             :         if (unlikely(page->memcg_data))
    1275             :                 bad_reason = "page still charged to cgroup";
    1276             : #endif
    1277             :         return bad_reason;
    1278             : }
    1279             : 
    1280           0 : static void free_page_is_bad_report(struct page *page)
    1281             : {
    1282           0 :         bad_page(page,
    1283             :                  page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
    1284           0 : }
    1285             : 
    1286      255413 : static inline bool free_page_is_bad(struct page *page)
    1287             : {
    1288      255413 :         if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
    1289             :                 return false;
    1290             : 
    1291             :         /* Something has gone sideways, find it */
    1292           0 :         free_page_is_bad_report(page);
    1293           0 :         return true;
    1294             : }
    1295             : 
    1296             : static int free_tail_pages_check(struct page *head_page, struct page *page)
    1297             : {
    1298          42 :         struct folio *folio = (struct folio *)head_page;
    1299          42 :         int ret = 1;
    1300             : 
    1301             :         /*
    1302             :          * We rely on page->lru.next never having bit 0 set, unless the page
    1303             :          * is PageTail(). Let's make sure that's true even for poisoned ->lru.
    1304             :          */
    1305             :         BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
    1306             : 
    1307             :         if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
    1308          42 :                 ret = 0;
    1309             :                 goto out;
    1310             :         }
    1311             :         switch (page - head_page) {
    1312             :         case 1:
    1313             :                 /* the first tail page: these may be in place of ->mapping */
    1314             :                 if (unlikely(folio_entire_mapcount(folio))) {
    1315             :                         bad_page(page, "nonzero entire_mapcount");
    1316             :                         goto out;
    1317             :                 }
    1318             :                 if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
    1319             :                         bad_page(page, "nonzero nr_pages_mapped");
    1320             :                         goto out;
    1321             :                 }
    1322             :                 if (unlikely(atomic_read(&folio->_pincount))) {
    1323             :                         bad_page(page, "nonzero pincount");
    1324             :                         goto out;
    1325             :                 }
    1326             :                 break;
    1327             :         case 2:
    1328             :                 /*
    1329             :                  * the second tail page: ->mapping is
    1330             :                  * deferred_list.next -- ignore value.
    1331             :                  */
    1332             :                 break;
    1333             :         default:
    1334             :                 if (page->mapping != TAIL_MAPPING) {
    1335             :                         bad_page(page, "corrupted mapping in tail page");
    1336             :                         goto out;
    1337             :                 }
    1338             :                 break;
    1339             :         }
    1340             :         if (unlikely(!PageTail(page))) {
    1341             :                 bad_page(page, "PageTail not set");
    1342             :                 goto out;
    1343             :         }
    1344             :         if (unlikely(compound_head(page) != head_page)) {
    1345             :                 bad_page(page, "compound_head not consistent");
    1346             :                 goto out;
    1347             :         }
    1348             :         ret = 0;
    1349             : out:
    1350          42 :         page->mapping = NULL;
    1351          42 :         clear_compound_head(page);
    1352             :         return ret;
    1353             : }
    1354             : 
    1355             : /*
    1356             :  * Skip KASAN memory poisoning when either:
    1357             :  *
    1358             :  * 1. Deferred memory initialization has not yet completed,
    1359             :  *    see the explanation below.
    1360             :  * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON,
    1361             :  *    see the comment next to it.
    1362             :  * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
    1363             :  *    see the comment next to it.
    1364             :  * 4. The allocation is excluded from being checked due to sampling,
    1365             :  *    see the call to kasan_unpoison_pages.
    1366             :  *
    1367             :  * Poisoning pages during deferred memory init will greatly lengthen the
    1368             :  * process and cause problems in large-memory systems, as deferred page
    1369             :  * initialization is done with interrupts disabled.
    1370             :  *
    1371             :  * Assuming that there will be no reference to those newly initialized
    1372             :  * pages before they are ever allocated, this should have no effect on
    1373             :  * KASAN memory tracking as the poison will be properly inserted at page
    1374             :  * allocation time. The only corner case is when pages are allocated by
    1375             :  * on-demand allocation and then freed again before the deferred pages
    1376             :  * initialization is done, but this is not likely to happen.
    1377             :  */
    1378             : static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
    1379             : {
    1380             :         return deferred_pages_enabled() ||
    1381             :                (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
    1382             :                 (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
    1383             :                PageSkipKASanPoison(page);
    1384             : }
    1385             : 
    1386           0 : static void kernel_init_pages(struct page *page, int numpages)
    1387             : {
    1388             :         int i;
    1389             : 
    1390             :         /* s390's use of memset() could override KASAN redzones. */
    1391             :         kasan_disable_current();
    1392       38291 :         for (i = 0; i < numpages; i++)
    1393       76582 :                 clear_highpage_kasan_tagged(page + i);
    1394             :         kasan_enable_current();
    1395           0 : }
    1396             : 
    1397             : static __always_inline bool free_pages_prepare(struct page *page,
    1398             :                         unsigned int order, bool check_free, fpi_t fpi_flags)
    1399             : {
    1400       44798 :         int bad = 0;
    1401       44798 :         bool init = want_init_on_free();
    1402             : 
    1403             :         VM_BUG_ON_PAGE(PageTail(page), page);
    1404             : 
    1405       44798 :         trace_mm_page_free(page, order);
    1406       44798 :         kmsan_free_page(page, order);
    1407             : 
    1408       44798 :         if (unlikely(PageHWPoison(page)) && !order) {
    1409             :                 /*
    1410             :                  * Do not let hwpoison pages hit pcplists/buddy
    1411             :                  * Untie memcg state and reset page's owner
    1412             :                  */
    1413             :                 if (memcg_kmem_online() && PageMemcgKmem(page))
    1414             :                         __memcg_kmem_uncharge_page(page, order);
    1415             :                 reset_page_owner(page, order);
    1416             :                 page_table_check_free(page, order);
    1417             :                 return false;
    1418             :         }
    1419             : 
    1420             :         /*
    1421             :          * Check tail pages before head page information is cleared to
    1422             :          * avoid checking PageCompound for order-0 pages.
    1423             :          */
    1424       44798 :         if (unlikely(order)) {
    1425         263 :                 bool compound = PageCompound(page);
    1426             :                 int i;
    1427             : 
    1428             :                 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
    1429             : 
    1430             :                 if (compound)
    1431             :                         ClearPageHasHWPoisoned(page);
    1432      254209 :                 for (i = 1; i < (1 << order); i++) {
    1433      254209 :                         if (compound)
    1434          84 :                                 bad += free_tail_pages_check(page, page + i);
    1435      254209 :                         if (unlikely(free_page_is_bad(page + i))) {
    1436           0 :                                 bad++;
    1437           0 :                                 continue;
    1438             :                         }
    1439      254209 :                         (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
    1440             :                 }
    1441             :         }
    1442       44798 :         if (PageMappingFlags(page))
    1443           0 :                 page->mapping = NULL;
    1444             :         if (memcg_kmem_online() && PageMemcgKmem(page))
    1445             :                 __memcg_kmem_uncharge_page(page, order);
    1446         259 :         if (check_free && free_page_is_bad(page))
    1447           0 :                 bad++;
    1448       44798 :         if (bad)
    1449             :                 return false;
    1450             : 
    1451       44798 :         page_cpupid_reset_last(page);
    1452       44798 :         page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
    1453             :         reset_page_owner(page, order);
    1454       44798 :         page_table_check_free(page, order);
    1455             : 
    1456       44798 :         if (!PageHighMem(page)) {
    1457             :                 debug_check_no_locks_freed(page_address(page),
    1458             :                                            PAGE_SIZE << order);
    1459             :                 debug_check_no_obj_freed(page_address(page),
    1460             :                                            PAGE_SIZE << order);
    1461             :         }
    1462             : 
    1463       44798 :         kernel_poison_pages(page, 1 << order);
    1464             : 
    1465             :         /*
    1466             :          * As memory initialization might be integrated into KASAN,
    1467             :          * KASAN poisoning and memory initialization code must be
    1468             :          * kept together to avoid discrepancies in behavior.
    1469             :          *
    1470             :          * With hardware tag-based KASAN, memory tags must be set before the
    1471             :          * page becomes unavailable via debug_pagealloc or arch_free_page.
    1472             :          */
    1473       44798 :         if (!should_skip_kasan_poison(page, fpi_flags)) {
    1474             :                 kasan_poison_pages(page, order, init);
    1475             : 
    1476             :                 /* Memory is already initialized if KASAN did it internally. */
    1477             :                 if (kasan_has_integrated_init())
    1478             :                         init = false;
    1479             :         }
    1480       44798 :         if (init)
    1481           0 :                 kernel_init_pages(page, 1 << order);
    1482             : 
    1483             :         /*
    1484             :          * arch_free_page() can make the page's contents inaccessible.  s390
    1485             :          * does this.  So nothing which can access the page's contents should
    1486             :          * happen after this.
    1487             :          */
    1488             :         arch_free_page(page, order);
    1489             : 
    1490             :         debug_pagealloc_unmap_pages(page, 1 << order);
    1491             : 
    1492             :         return true;
    1493             : }
    1494             : 
    1495             : #ifdef CONFIG_DEBUG_VM
    1496             : /*
    1497             :  * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
    1498             :  * to pcp lists. With debug_pagealloc also enabled, they are rechecked when
    1499             :  * moved from pcp lists to free lists.
    1500             :  */
    1501             : static bool free_pcp_prepare(struct page *page, unsigned int order)
    1502             : {
    1503             :         return free_pages_prepare(page, order, true, FPI_NONE);
    1504             : }
    1505             : 
    1506             : /* return true if this page has an inappropriate state */
    1507             : static bool bulkfree_pcp_prepare(struct page *page)
    1508             : {
    1509             :         if (debug_pagealloc_enabled_static())
    1510             :                 return free_page_is_bad(page);
    1511             :         else
    1512             :                 return false;
    1513             : }
    1514             : #else
    1515             : /*
    1516             :  * With DEBUG_VM disabled, order-0 pages being freed are checked only when
    1517             :  * moving from pcp lists to the free lists, in order to reduce overhead. With
    1518             :  * debug_pagealloc enabled, they are also checked immediately when being freed
    1519             :  * to the pcp lists.
    1520             :  */
    1521       44539 : static bool free_pcp_prepare(struct page *page, unsigned int order)
    1522             : {
    1523             :         if (debug_pagealloc_enabled_static())
    1524             :                 return free_pages_prepare(page, order, true, FPI_NONE);
    1525             :         else
    1526       44539 :                 return free_pages_prepare(page, order, false, FPI_NONE);
    1527             : }
    1528             : 
    1529             : static bool bulkfree_pcp_prepare(struct page *page)
    1530             : {
    1531         945 :         return free_page_is_bad(page);
    1532             : }
    1533             : #endif /* CONFIG_DEBUG_VM */
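
To summarize the two configurations above, the sketch below encodes, as a plain userspace table, when an order-0 page gets sanity-checked: on free to the pcp list and/or when drained from the pcp list to the buddy free lists. The struct and function names here are invented for the illustration; this is a summary sketch, not kernel code.

#include <stdbool.h>
#include <stdio.h>

struct check_points {
        bool on_free_to_pcp;            /* free_pcp_prepare() does a full check */
        bool on_drain_to_buddy;         /* bulkfree_pcp_prepare() checks again  */
};

static struct check_points when_checked(bool debug_vm, bool debug_pagealloc)
{
        struct check_points c;

        if (debug_vm) {
                c.on_free_to_pcp = true;                /* always checked up front */
                c.on_drain_to_buddy = debug_pagealloc;  /* re-checked only then    */
        } else {
                c.on_free_to_pcp = debug_pagealloc;     /* early check costs extra */
                c.on_drain_to_buddy = true;             /* default: check on drain */
        }
        return c;
}

int main(void)
{
        struct check_points c = when_checked(false, false);

        printf("check on pcp free: %d, check on drain: %d\n",
               c.on_free_to_pcp, c.on_drain_to_buddy);          /* 0 1 */
        return 0;
}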
    1534             : 
    1535             : /*
    1536             :  * Frees a number of pages from the PCP lists.
    1537             :  * Assumes all pages on the list are in the same zone.
    1538             :  * count is the number of pages to free.
    1539             :  */
    1540           4 : static void free_pcppages_bulk(struct zone *zone, int count,
    1541             :                                         struct per_cpu_pages *pcp,
    1542             :                                         int pindex)
    1543             : {
    1544             :         unsigned long flags;
    1545           4 :         int min_pindex = 0;
    1546           4 :         int max_pindex = NR_PCP_LISTS - 1;
    1547             :         unsigned int order;
    1548             :         bool isolated_pageblocks;
    1549             :         struct page *page;
    1550             : 
    1551             :         /*
    1552             :          * Ensure a proper count is passed; otherwise we would get stuck in
    1553             :          * the list_empty(list) loop below.
    1554             :          */
    1555           4 :         count = min(pcp->count, count);
    1556             : 
    1557             :         /* Ensure requested pindex is drained first. */
    1558           4 :         pindex = pindex - 1;
    1559             : 
    1560           4 :         spin_lock_irqsave(&zone->lock, flags);
    1561           4 :         isolated_pageblocks = has_isolate_pageblock(zone);
    1562             : 
    1563          12 :         while (count > 0) {
    1564             :                 struct list_head *list;
    1565             :                 int nr_pages;
    1566             : 
    1567             :                 /* Remove pages from lists in a round-robin fashion. */
    1568             :                 do {
    1569           4 :                         if (++pindex > max_pindex)
    1570           0 :                                 pindex = min_pindex;
    1571           4 :                         list = &pcp->lists[pindex];
    1572           4 :                         if (!list_empty(list))
    1573             :                                 break;
    1574             : 
    1575           0 :                         if (pindex == max_pindex)
    1576           0 :                                 max_pindex--;
    1577           0 :                         if (pindex == min_pindex)
    1578           0 :                                 min_pindex++;
    1579             :                 } while (1);
    1580             : 
    1581           8 :                 order = pindex_to_order(pindex);
    1582           4 :                 nr_pages = 1 << order;
    1583             :                 do {
    1584             :                         int mt;
    1585             : 
    1586         945 :                         page = list_last_entry(list, struct page, pcp_list);
    1587        1890 :                         mt = get_pcppage_migratetype(page);
    1588             : 
    1589             :                         /* must delete to avoid corrupting pcp list */
    1590        1890 :                         list_del(&page->pcp_list);
    1591         945 :                         count -= nr_pages;
    1592         945 :                         pcp->count -= nr_pages;
    1593             : 
    1594         945 :                         if (bulkfree_pcp_prepare(page))
    1595           0 :                                 continue;
    1596             : 
    1597             :                         /* MIGRATE_ISOLATE page should not go to pcplists */
    1598             :                         VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
    1599             :                         /* Pageblock could have been isolated meanwhile */
    1600             :                         if (unlikely(isolated_pageblocks))
    1601             :                                 mt = get_pageblock_migratetype(page);
    1602             : 
    1603         945 :                         __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
    1604         945 :                         trace_mm_page_pcpu_drain(page, order, mt);
    1605        1886 :                 } while (count > 0 && !list_empty(list));
    1606             :         }
    1607             : 
    1608           8 :         spin_unlock_irqrestore(&zone->lock, flags);
    1609           4 : }
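
The interesting part of free_pcppages_bulk() is how it walks the pcp lists: pindex advances round-robin through [min_pindex, max_pindex], wraps around, and the window shrinks whenever a list at either edge turns out to be empty, so exhausted lists stop being revisited. The standalone loop below simulates just that selection logic with made-up per-list counts.

#include <stdio.h>

#define NR_LISTS 6

int main(void)
{
        int counts[NR_LISTS] = { 0, 3, 0, 2, 0, 1 };    /* pages per pcp list */
        int min_pindex = 0, max_pindex = NR_LISTS - 1;
        int pindex = 3 - 1;             /* requested pindex is drained first */
        int to_free = 6;

        while (to_free > 0) {
                /* Pick the next non-empty list, shrinking the window at the edges. */
                for (;;) {
                        if (++pindex > max_pindex)
                                pindex = min_pindex;
                        if (counts[pindex] > 0)
                                break;
                        if (pindex == max_pindex)
                                max_pindex--;
                        if (pindex == min_pindex)
                                min_pindex++;
                }
                counts[pindex]--;
                to_free--;
                printf("freed one page from pcp list %d\n", pindex);
        }
        return 0;
}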
    1610             : 
    1611           0 : static void free_one_page(struct zone *zone,
    1612             :                                 struct page *page, unsigned long pfn,
    1613             :                                 unsigned int order,
    1614             :                                 int migratetype, fpi_t fpi_flags)
    1615             : {
    1616             :         unsigned long flags;
    1617             : 
    1618           0 :         spin_lock_irqsave(&zone->lock, flags);
    1619           0 :         if (unlikely(has_isolate_pageblock(zone) ||
    1620             :                 is_migrate_isolate(migratetype))) {
    1621             :                 migratetype = get_pfnblock_migratetype(page, pfn);
    1622             :         }
    1623           0 :         __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
    1624           0 :         spin_unlock_irqrestore(&zone->lock, flags);
    1625           0 : }
    1626             : 
    1627      270239 : static void __meminit __init_single_page(struct page *page, unsigned long pfn,
    1628             :                                 unsigned long zone, int nid)
    1629             : {
    1630      270239 :         mm_zero_struct_page(page);
    1631      540478 :         set_page_links(page, zone, nid, pfn);
    1632      270239 :         init_page_count(page);
    1633      270239 :         page_mapcount_reset(page);
    1634      270239 :         page_cpupid_reset_last(page);
    1635      270239 :         page_kasan_tag_reset(page);
    1636             : 
    1637      540478 :         INIT_LIST_HEAD(&page->lru);
    1638             : #ifdef WANT_PAGE_VIRTUAL
    1639             :         /* The shift won't overflow because ZONE_NORMAL is below 4G. */
    1640             :         if (!is_highmem_idx(zone))
    1641             :                 set_page_address(page, __va(pfn << PAGE_SHIFT));
    1642             : #endif
    1643      270239 : }
    1644             : 
    1645             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    1646             : static void __meminit init_reserved_page(unsigned long pfn)
    1647             : {
    1648             :         pg_data_t *pgdat;
    1649             :         int nid, zid;
    1650             : 
    1651             :         if (early_page_initialised(pfn))
    1652             :                 return;
    1653             : 
    1654             :         nid = early_pfn_to_nid(pfn);
    1655             :         pgdat = NODE_DATA(nid);
    1656             : 
    1657             :         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
    1658             :                 struct zone *zone = &pgdat->node_zones[zid];
    1659             : 
    1660             :                 if (zone_spans_pfn(zone, pfn))
    1661             :                         break;
    1662             :         }
    1663             :         __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
    1664             : }
    1665             : #else
    1666             : static inline void init_reserved_page(unsigned long pfn)
    1667             : {
    1668             : }
    1669             : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
    1670             : 
    1671             : /*
    1672             :  * Initialised pages do not have PageReserved set. This function is
    1673             :  * called for each range allocated by the bootmem allocator and
    1674             :  * marks the pages PageReserved. The remaining valid pages are later
    1675             :  * sent to the buddy page allocator.
    1676             :  */
    1677          11 : void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
    1678             : {
    1679          11 :         unsigned long start_pfn = PFN_DOWN(start);
    1680          11 :         unsigned long end_pfn = PFN_UP(end);
    1681             : 
    1682       15830 :         for (; start_pfn < end_pfn; start_pfn++) {
    1683       15819 :                 if (pfn_valid(start_pfn)) {
    1684       15819 :                         struct page *page = pfn_to_page(start_pfn);
    1685             : 
    1686       15819 :                         init_reserved_page(start_pfn);
    1687             : 
    1688             :                         /* Avoid false-positive PageTail() */
    1689       31638 :                         INIT_LIST_HEAD(&page->lru);
    1690             : 
    1691             :                         /*
    1692             :                          * no need for atomic set_bit because the struct
    1693             :                          * page is not visible yet so nobody should
    1694             :                          * access it yet.
    1695             :                          */
    1696             :                         __SetPageReserved(page);
    1697             :                 }
    1698             :         }
    1699          11 : }
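
PFN_DOWN()/PFN_UP() round the byte range outward so that every page overlapping [start, end) is reserved. A quick userspace check of that rounding, assuming 4 KiB pages (PAGE_SHIFT of 12) and made-up addresses; the macros are redefined locally for the demo:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1ULL << PAGE_SHIFT)
#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)
#define PFN_UP(x)       (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
        unsigned long long start = 0x1000800;   /* starts mid-page */
        unsigned long long end   = 0x1003100;   /* ends mid-page   */

        /* Rounding the start down and the end up covers partial pages. */
        printf("reserve pfns [%llu, %llu)\n", PFN_DOWN(start), PFN_UP(end));
        /* prints: reserve pfns [4096, 4100) */
        return 0;
}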
    1700             : 
    1701         259 : static void __free_pages_ok(struct page *page, unsigned int order,
    1702             :                             fpi_t fpi_flags)
    1703             : {
    1704             :         unsigned long flags;
    1705             :         int migratetype;
    1706         259 :         unsigned long pfn = page_to_pfn(page);
    1707         259 :         struct zone *zone = page_zone(page);
    1708             : 
    1709         259 :         if (!free_pages_prepare(page, order, true, fpi_flags))
    1710             :                 return;
    1711             : 
    1712             :         /*
    1713             :          * get_pfnblock_migratetype() is called without spin_lock_irqsave() here
    1714             :          * so that the lookup is not done under the zone lock.
    1715             :          * This reduces the lock holding time.
    1716             :          */
    1717         259 :         migratetype = get_pfnblock_migratetype(page, pfn);
    1718             : 
    1719         259 :         spin_lock_irqsave(&zone->lock, flags);
    1720             :         if (unlikely(has_isolate_pageblock(zone) ||
    1721             :                 is_migrate_isolate(migratetype))) {
    1722             :                 migratetype = get_pfnblock_migratetype(page, pfn);
    1723             :         }
    1724         259 :         __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
    1725         518 :         spin_unlock_irqrestore(&zone->lock, flags);
    1726             : 
    1727         259 :         __count_vm_events(PGFREE, 1 << order);
    1728             : }
    1729             : 
    1730         259 : void __free_pages_core(struct page *page, unsigned int order)
    1731             : {
    1732         259 :         unsigned int nr_pages = 1 << order;
    1733         259 :         struct page *p = page;
    1734             :         unsigned int loop;
    1735             : 
    1736             :         /*
    1737             :          * When initializing the memmap, __init_single_page() sets the refcount
    1738             :          * of all pages to 1 ("allocated"/"not free"). We have to set the
    1739             :          * refcount of all involved pages to 0.
    1740             :          */
    1741         259 :         prefetchw(p);
    1742      254426 :         for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
    1743      254167 :                 prefetchw(p + 1);
    1744      254167 :                 __ClearPageReserved(p);
    1745      254167 :                 set_page_count(p, 0);
    1746             :         }
    1747         259 :         __ClearPageReserved(p);
    1748         259 :         set_page_count(p, 0);
    1749             : 
    1750         518 :         atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
    1751             : 
    1752             :         /*
    1753             :          * Bypass PCP and place fresh pages right to the tail, primarily
    1754             :          * relevant for memory onlining.
    1755             :          */
    1756         259 :         __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
    1757         259 : }
    1758             : 
    1759             : #ifdef CONFIG_NUMA
    1760             : 
    1761             : /*
    1762             :  * During memory init, memblock maps pfns to nids. The search is expensive,
    1763             :  * so this caches recent lookups. The implementation of __early_pfn_to_nid
    1764             :  * treats start/end as pfns.
    1765             :  */
    1766             : struct mminit_pfnnid_cache {
    1767             :         unsigned long last_start;
    1768             :         unsigned long last_end;
    1769             :         int last_nid;
    1770             : };
    1771             : 
    1772             : static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
    1773             : 
    1774             : /*
    1775             :  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
    1776             :  */
    1777             : static int __meminit __early_pfn_to_nid(unsigned long pfn,
    1778             :                                         struct mminit_pfnnid_cache *state)
    1779             : {
    1780             :         unsigned long start_pfn, end_pfn;
    1781             :         int nid;
    1782             : 
    1783             :         if (state->last_start <= pfn && pfn < state->last_end)
    1784             :                 return state->last_nid;
    1785             : 
    1786             :         nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
    1787             :         if (nid != NUMA_NO_NODE) {
    1788             :                 state->last_start = start_pfn;
    1789             :                 state->last_end = end_pfn;
    1790             :                 state->last_nid = nid;
    1791             :         }
    1792             : 
    1793             :         return nid;
    1794             : }
    1795             : 
    1796             : int __meminit early_pfn_to_nid(unsigned long pfn)
    1797             : {
    1798             :         static DEFINE_SPINLOCK(early_pfn_lock);
    1799             :         int nid;
    1800             : 
    1801             :         spin_lock(&early_pfn_lock);
    1802             :         nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
    1803             :         if (nid < 0)
    1804             :                 nid = first_online_node;
    1805             :         spin_unlock(&early_pfn_lock);
    1806             : 
    1807             :         return nid;
    1808             : }
    1809             : #endif /* CONFIG_NUMA */
    1810             : 
    1811         259 : void __init memblock_free_pages(struct page *page, unsigned long pfn,
    1812             :                                                         unsigned int order)
    1813             : {
    1814         259 :         if (!early_page_initialised(pfn))
    1815             :                 return;
    1816         259 :         if (!kmsan_memblock_free_pages(page, order)) {
    1817             :                 /* KMSAN will take care of these pages. */
    1818             :                 return;
    1819             :         }
    1820         259 :         __free_pages_core(page, order);
    1821             : }
    1822             : 
    1823             : /*
    1824             :  * Check that the whole pageblock (or the subset of it) given by the interval
    1825             :  * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
    1826             :  * with the migration or free compaction scanner.
    1827             :  *
    1828             :  * Return struct page pointer of start_pfn, or NULL if checks were not passed.
    1829             :  *
    1830             :  * It's possible on some configurations to have a setup like node0 node1 node0,
    1831             :  * i.e. it's possible that not all pages within a zone's range of pages
    1832             :  * belong to a single zone. We assume that a border between node0 and node1
    1833             :  * can occur within a single pageblock, but not a node0 node1 node0
    1834             :  * interleaving within a single pageblock. It is therefore sufficient to check
    1835             :  * the first and last page of a pageblock and avoid checking each individual
    1836             :  * page in a pageblock.
    1837             :  */
    1838         264 : struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
    1839             :                                      unsigned long end_pfn, struct zone *zone)
    1840             : {
    1841             :         struct page *start_page;
    1842             :         struct page *end_page;
    1843             : 
    1844             :         /* end_pfn is one past the range we are checking */
    1845         264 :         end_pfn--;
    1846             : 
    1847         528 :         if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
    1848             :                 return NULL;
    1849             : 
    1850         528 :         start_page = pfn_to_online_page(start_pfn);
    1851         264 :         if (!start_page)
    1852             :                 return NULL;
    1853             : 
    1854         264 :         if (page_zone(start_page) != zone)
    1855             :                 return NULL;
    1856             : 
    1857         264 :         end_page = pfn_to_page(end_pfn);
    1858             : 
    1859             :         /* This gives shorter code than deriving page_zone(end_page) */
    1860         792 :         if (page_zone_id(start_page) != page_zone_id(end_page))
    1861             :                 return NULL;
    1862             : 
    1863         264 :         return start_page;
    1864             : }
    1865             : 
    1866           1 : void set_zone_contiguous(struct zone *zone)
    1867             : {
    1868           1 :         unsigned long block_start_pfn = zone->zone_start_pfn;
    1869             :         unsigned long block_end_pfn;
    1870             : 
    1871           1 :         block_end_pfn = pageblock_end_pfn(block_start_pfn);
    1872         531 :         for (; block_start_pfn < zone_end_pfn(zone);
    1873         264 :                         block_start_pfn = block_end_pfn,
    1874         264 :                          block_end_pfn += pageblock_nr_pages) {
    1875             : 
    1876         264 :                 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
    1877             : 
    1878         264 :                 if (!__pageblock_pfn_to_page(block_start_pfn,
    1879             :                                              block_end_pfn, zone))
    1880             :                         return;
    1881         264 :                 cond_resched();
    1882             :         }
    1883             : 
    1884             :         /* We confirm that there is no hole */
    1885           1 :         zone->contiguous = true;
    1886             : }
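
The walk above steps through the zone one pageblock at a time, clamps the final block to the zone end, and gives up as soon as one block fails __pageblock_pfn_to_page(). The sketch below reproduces just the iteration pattern with an invented pageblock size and zone range.

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL        /* invented for the demo */

/* Stand-in for "__pageblock_pfn_to_page() != NULL": always passes here. */
static bool block_ok(unsigned long start, unsigned long end)
{
        (void)start;
        (void)end;
        return true;
}

int main(void)
{
        unsigned long zone_start = 0x1000, zone_end = 0x1b40;   /* not block aligned */
        unsigned long start, end;

        for (start = zone_start, end = start + PAGEBLOCK_NR_PAGES;
             start < zone_end;
             start = end, end += PAGEBLOCK_NR_PAGES) {
                if (end > zone_end)
                        end = zone_end;                 /* clamp the last block */
                if (!block_ok(start, end))
                        return 0;                       /* a hole: not contiguous */
                printf("checked pfns [0x%lx, 0x%lx)\n", start, end);
        }
        printf("zone is contiguous\n");
        return 0;
}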
    1887             : 
    1888           0 : void clear_zone_contiguous(struct zone *zone)
    1889             : {
    1890           0 :         zone->contiguous = false;
    1891           0 : }
    1892             : 
    1893             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    1894             : static void __init deferred_free_range(unsigned long pfn,
    1895             :                                        unsigned long nr_pages)
    1896             : {
    1897             :         struct page *page;
    1898             :         unsigned long i;
    1899             : 
    1900             :         if (!nr_pages)
    1901             :                 return;
    1902             : 
    1903             :         page = pfn_to_page(pfn);
    1904             : 
    1905             :         /* Free a large naturally-aligned chunk if possible */
    1906             :         if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) {
    1907             :                 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
    1908             :                 __free_pages_core(page, pageblock_order);
    1909             :                 return;
    1910             :         }
    1911             : 
    1912             :         for (i = 0; i < nr_pages; i++, page++, pfn++) {
    1913             :                 if (pageblock_aligned(pfn))
    1914             :                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
    1915             :                 __free_pages_core(page, 0);
    1916             :         }
    1917             : }
    1918             : 
    1919             : /* Completion tracking for deferred_init_memmap() threads */
    1920             : static atomic_t pgdat_init_n_undone __initdata;
    1921             : static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
    1922             : 
    1923             : static inline void __init pgdat_init_report_one_done(void)
    1924             : {
    1925             :         if (atomic_dec_and_test(&pgdat_init_n_undone))
    1926             :                 complete(&pgdat_init_all_done_comp);
    1927             : }
    1928             : 
    1929             : /*
    1930             :  * Returns true if the page needs to be initialized or freed to the buddy allocator.
    1931             :  *
    1932             :  * We check if the current large page is valid by checking only the validity
    1933             :  * of the head pfn.
    1934             :  */
    1935             : static inline bool __init deferred_pfn_valid(unsigned long pfn)
    1936             : {
    1937             :         if (pageblock_aligned(pfn) && !pfn_valid(pfn))
    1938             :                 return false;
    1939             :         return true;
    1940             : }
    1941             : 
    1942             : /*
    1943             :  * Free pages to the buddy allocator. Try to free aligned pages in
    1944             :  * pageblock_nr_pages-sized chunks.
    1945             :  */
    1946             : static void __init deferred_free_pages(unsigned long pfn,
    1947             :                                        unsigned long end_pfn)
    1948             : {
    1949             :         unsigned long nr_free = 0;
    1950             : 
    1951             :         for (; pfn < end_pfn; pfn++) {
    1952             :                 if (!deferred_pfn_valid(pfn)) {
    1953             :                         deferred_free_range(pfn - nr_free, nr_free);
    1954             :                         nr_free = 0;
    1955             :                 } else if (pageblock_aligned(pfn)) {
    1956             :                         deferred_free_range(pfn - nr_free, nr_free);
    1957             :                         nr_free = 1;
    1958             :                 } else {
    1959             :                         nr_free++;
    1960             :                 }
    1961             :         }
    1962             :         /* Free the last block of pages to allocator */
    1963             :         deferred_free_range(pfn - nr_free, nr_free);
    1964             : }
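
deferred_free_pages() batches work by counting a run of valid pfns in nr_free and flushing the run whenever it hits a hole or a pageblock boundary, so whole aligned pageblocks can be handed over in a single high-order free. The userspace sketch below shows the same accumulate-and-flush pattern with a tiny invented pageblock size and one fake invalid pfn.

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 8UL          /* tiny, to keep the demo short */

static bool pfn_ok(unsigned long pfn)
{
        return pfn != 21;               /* pretend pfn 21 is a hole */
}

static void flush(unsigned long start_pfn, unsigned long nr_free)
{
        if (nr_free)
                printf("free %lu page(s) starting at pfn %lu\n", nr_free, start_pfn);
}

int main(void)
{
        unsigned long pfn, nr_free = 0;

        for (pfn = 18; pfn < 34; pfn++) {
                if (!pfn_ok(pfn)) {
                        flush(pfn - nr_free, nr_free);  /* hole: flush the run */
                        nr_free = 0;
                } else if (pfn % PAGEBLOCK_NR_PAGES == 0) {
                        flush(pfn - nr_free, nr_free);  /* block boundary: new run */
                        nr_free = 1;
                } else {
                        nr_free++;
                }
        }
        flush(pfn - nr_free, nr_free);                  /* trailing run */
        return 0;
}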
    1965             : 
    1966             : /*
    1967             :  * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
    1968             :  * by performing them only once every pageblock_nr_pages.
    1969             :  * Return the number of pages initialized.
    1970             :  */
    1971             : static unsigned long  __init deferred_init_pages(struct zone *zone,
    1972             :                                                  unsigned long pfn,
    1973             :                                                  unsigned long end_pfn)
    1974             : {
    1975             :         int nid = zone_to_nid(zone);
    1976             :         unsigned long nr_pages = 0;
    1977             :         int zid = zone_idx(zone);
    1978             :         struct page *page = NULL;
    1979             : 
    1980             :         for (; pfn < end_pfn; pfn++) {
    1981             :                 if (!deferred_pfn_valid(pfn)) {
    1982             :                         page = NULL;
    1983             :                         continue;
    1984             :                 } else if (!page || pageblock_aligned(pfn)) {
    1985             :                         page = pfn_to_page(pfn);
    1986             :                 } else {
    1987             :                         page++;
    1988             :                 }
    1989             :                 __init_single_page(page, pfn, zid, nid);
    1990             :                 nr_pages++;
    1991             :         }
    1992             :         return nr_pages;
    1993             : }
    1994             : 
    1995             : /*
    1996             :  * This function is meant to pre-load the iterator for the zone init.
    1997             :  * Specifically, it walks through the ranges until we are caught up to the
    1998             :  * first_init_pfn value and exits there. If we never encounter that value,
    1999             :  * we return false, indicating there are no valid ranges left.
    2000             :  */
    2001             : static bool __init
    2002             : deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
    2003             :                                     unsigned long *spfn, unsigned long *epfn,
    2004             :                                     unsigned long first_init_pfn)
    2005             : {
    2006             :         u64 j;
    2007             : 
    2008             :         /*
    2009             :          * Start out by walking through the ranges in this zone that have
    2010             :          * already been initialized. We don't need to do anything with them
    2011             :          * so we just need to flush them out of the system.
    2012             :          */
    2013             :         for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
    2014             :                 if (*epfn <= first_init_pfn)
    2015             :                         continue;
    2016             :                 if (*spfn < first_init_pfn)
    2017             :                         *spfn = first_init_pfn;
    2018             :                 *i = j;
    2019             :                 return true;
    2020             :         }
    2021             : 
    2022             :         return false;
    2023             : }
    2024             : 
    2025             : /*
    2026             :  * Initialize and free pages. We do it in two loops: first we initialize
    2027             :  * struct pages, then free them to the buddy allocator, because while we
    2028             :  * are freeing a page we can access pages that lie ahead of it (when
    2029             :  * computing the buddy page in __free_one_page()).
    2030             :  *
    2031             :  * In order to try to keep some memory in the cache, the loop is broken
    2032             :  * along max page order boundaries. This way we will not cause any issues
    2033             :  * with the buddy page computation.
    2034             :  */
    2035             : static unsigned long __init
    2036             : deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
    2037             :                        unsigned long *end_pfn)
    2038             : {
    2039             :         unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
    2040             :         unsigned long spfn = *start_pfn, epfn = *end_pfn;
    2041             :         unsigned long nr_pages = 0;
    2042             :         u64 j = *i;
    2043             : 
    2044             :         /* First we loop through and initialize the page values */
    2045             :         for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
    2046             :                 unsigned long t;
    2047             : 
    2048             :                 if (mo_pfn <= *start_pfn)
    2049             :                         break;
    2050             : 
    2051             :                 t = min(mo_pfn, *end_pfn);
    2052             :                 nr_pages += deferred_init_pages(zone, *start_pfn, t);
    2053             : 
    2054             :                 if (mo_pfn < *end_pfn) {
    2055             :                         *start_pfn = mo_pfn;
    2056             :                         break;
    2057             :                 }
    2058             :         }
    2059             : 
    2060             :         /* Reset values and now loop through freeing pages as needed */
    2061             :         swap(j, *i);
    2062             : 
    2063             :         for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
    2064             :                 unsigned long t;
    2065             : 
    2066             :                 if (mo_pfn <= spfn)
    2067             :                         break;
    2068             : 
    2069             :                 t = min(mo_pfn, epfn);
    2070             :                 deferred_free_pages(spfn, t);
    2071             : 
    2072             :                 if (mo_pfn <= epfn)
    2073             :                         break;
    2074             :         }
    2075             : 
    2076             :         return nr_pages;
    2077             : }
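
The slicing above hinges on ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES), which caps each pass at the next max-order boundary so initialization and freeing never straddle one. A standalone sketch of that arithmetic, with an assumed block size and a hand-rolled align_up() standing in for the kernel's ALIGN() macro:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES_SKETCH 1024UL        /* stand-in: pages per max-order block */

/* Round x up to the next multiple of a (a must be a power of two). */
static unsigned long align_up(unsigned long x, unsigned long a)
{
        return (x + a - 1) & ~(a - 1);
}

int main(void)
{
        unsigned long spfn = 1500, epfn = 5000;

        /* Process the range in slices that never cross a max-order boundary. */
        while (spfn < epfn) {
                unsigned long mo_pfn = align_up(spfn + 1, MAX_ORDER_NR_PAGES_SKETCH);
                unsigned long t = mo_pfn < epfn ? mo_pfn : epfn;

                printf("init+free slice [%lu, %lu)\n", spfn, t);
                spfn = t;
        }
        return 0;
}
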
    2078             : 
    2079             : static void __init
    2080             : deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
    2081             :                            void *arg)
    2082             : {
    2083             :         unsigned long spfn, epfn;
    2084             :         struct zone *zone = arg;
    2085             :         u64 i;
    2086             : 
    2087             :         deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
    2088             : 
    2089             :         /*
    2090             :          * Initialize and free pages in MAX_ORDER sized increments so that we
    2091             :          * can avoid introducing any issues with the buddy allocator.
    2092             :          */
    2093             :         while (spfn < end_pfn) {
    2094             :                 deferred_init_maxorder(&i, zone, &spfn, &epfn);
    2095             :                 cond_resched();
    2096             :         }
    2097             : }
    2098             : 
    2099             : /* An arch may override for more concurrency. */
    2100             : __weak int __init
    2101             : deferred_page_init_max_threads(const struct cpumask *node_cpumask)
    2102             : {
    2103             :         return 1;
    2104             : }
    2105             : 
    2106             : /* Initialise remaining memory on a node */
    2107             : static int __init deferred_init_memmap(void *data)
    2108             : {
    2109             :         pg_data_t *pgdat = data;
    2110             :         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
    2111             :         unsigned long spfn = 0, epfn = 0;
    2112             :         unsigned long first_init_pfn, flags;
    2113             :         unsigned long start = jiffies;
    2114             :         struct zone *zone;
    2115             :         int zid, max_threads;
    2116             :         u64 i;
    2117             : 
    2118             :         /* Bind memory initialisation thread to a local node if possible */
    2119             :         if (!cpumask_empty(cpumask))
    2120             :                 set_cpus_allowed_ptr(current, cpumask);
    2121             : 
    2122             :         pgdat_resize_lock(pgdat, &flags);
    2123             :         first_init_pfn = pgdat->first_deferred_pfn;
    2124             :         if (first_init_pfn == ULONG_MAX) {
    2125             :                 pgdat_resize_unlock(pgdat, &flags);
    2126             :                 pgdat_init_report_one_done();
    2127             :                 return 0;
    2128             :         }
    2129             : 
    2130             :         /* Sanity check boundaries */
    2131             :         BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
    2132             :         BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
    2133             :         pgdat->first_deferred_pfn = ULONG_MAX;
    2134             : 
    2135             :         /*
    2136             :  * Once we unlock here, the zone cannot be grown any more. Thus, if an
    2137             :  * interrupt thread must allocate this early in boot, the zone must be
    2138             :  * pre-grown before the start of deferred page initialization.
    2139             :          */
    2140             :         pgdat_resize_unlock(pgdat, &flags);
    2141             : 
    2142             :         /* Only the highest zone is deferred so find it */
    2143             :         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
    2144             :                 zone = pgdat->node_zones + zid;
    2145             :                 if (first_init_pfn < zone_end_pfn(zone))
    2146             :                         break;
    2147             :         }
    2148             : 
    2149             :         /* If the zone is empty, somebody else may have cleared it out already */
    2150             :         if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
    2151             :                                                  first_init_pfn))
    2152             :                 goto zone_empty;
    2153             : 
    2154             :         max_threads = deferred_page_init_max_threads(cpumask);
    2155             : 
    2156             :         while (spfn < epfn) {
    2157             :                 unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
    2158             :                 struct padata_mt_job job = {
    2159             :                         .thread_fn   = deferred_init_memmap_chunk,
    2160             :                         .fn_arg      = zone,
    2161             :                         .start       = spfn,
    2162             :                         .size        = epfn_align - spfn,
    2163             :                         .align       = PAGES_PER_SECTION,
    2164             :                         .min_chunk   = PAGES_PER_SECTION,
    2165             :                         .max_threads = max_threads,
    2166             :                 };
    2167             : 
    2168             :                 padata_do_multithreaded(&job);
    2169             :                 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
    2170             :                                                     epfn_align);
    2171             :         }
    2172             : zone_empty:
    2173             :         /* Sanity check that the next zone really is unpopulated */
    2174             :         WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
    2175             : 
    2176             :         pr_info("node %d deferred pages initialised in %ums\n",
    2177             :                 pgdat->node_id, jiffies_to_msecs(jiffies - start));
    2178             : 
    2179             :         pgdat_init_report_one_done();
    2180             :         return 0;
    2181             : }
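
The multithreaded loop above describes each padata job with a size rounded up to a section boundary and a minimum chunk of one section. The sketch below shows only that chunking arithmetic, not the padata API; PAGES_PER_SECTION_SKETCH and split_job() are invented names for illustration:

#include <stdio.h>

#define PAGES_PER_SECTION_SKETCH 32768UL        /* stand-in section size in pages */

static unsigned long align_up(unsigned long x, unsigned long a)
{
        return (x + a - 1) & ~(a - 1);
}

/* Hand out section-aligned chunks of [start, end) to 'threads' workers. */
static void split_job(unsigned long start, unsigned long end, int threads)
{
        unsigned long total = end - start;
        unsigned long chunk = align_up(total / threads ? total / threads : 1,
                                       PAGES_PER_SECTION_SKETCH);

        for (int t = 0; start < end; t++) {
                unsigned long stop = start + chunk < end ? start + chunk : end;

                printf("worker %d: [%lu, %lu)\n", t, start, stop);
                start = stop;
        }
}

int main(void)
{
        split_job(0, 262144, 3);        /* 262144 pages, 3 workers */
        return 0;
}
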
    2182             : 
    2183             : /*
    2184             :  * If this zone has deferred pages, try to grow it by initializing enough
    2185             :  * deferred pages to satisfy the allocation specified by order, rounded up to
    2186             :  * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
    2187             :  * of SECTION_SIZE bytes by initializing struct pages in increments of
    2188             :  * PAGES_PER_SECTION * sizeof(struct page) bytes.
    2189             :  *
    2190             :  * Return true when zone was grown, otherwise return false. We return true even
    2191             :  * when we grow less than requested, to let the caller decide if there are
    2192             :  * enough pages to satisfy the allocation.
    2193             :  *
    2194             :  * Note: we use noinline because this function is needed only during boot, and
    2195             :  * it is called from the __ref function _deferred_grow_zone(). This way we make
    2196             :  * sure that it is not inlined into the permanent text section.
    2197             :  */
    2198             : static noinline bool __init
    2199             : deferred_grow_zone(struct zone *zone, unsigned int order)
    2200             : {
    2201             :         unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
    2202             :         pg_data_t *pgdat = zone->zone_pgdat;
    2203             :         unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
    2204             :         unsigned long spfn, epfn, flags;
    2205             :         unsigned long nr_pages = 0;
    2206             :         u64 i;
    2207             : 
    2208             :         /* Only the last zone may have deferred pages */
    2209             :         if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
    2210             :                 return false;
    2211             : 
    2212             :         pgdat_resize_lock(pgdat, &flags);
    2213             : 
    2214             :         /*
    2215             :          * If someone grew this zone while we were waiting for the spinlock,
    2216             :          * return true, as there might be enough pages already.
    2217             :          */
    2218             :         if (first_deferred_pfn != pgdat->first_deferred_pfn) {
    2219             :                 pgdat_resize_unlock(pgdat, &flags);
    2220             :                 return true;
    2221             :         }
    2222             : 
    2223             :         /* If the zone is empty, somebody else may have cleared it out already */
    2224             :         if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
    2225             :                                                  first_deferred_pfn)) {
    2226             :                 pgdat->first_deferred_pfn = ULONG_MAX;
    2227             :                 pgdat_resize_unlock(pgdat, &flags);
    2228             :                 /* Retry only once. */
    2229             :                 return first_deferred_pfn != ULONG_MAX;
    2230             :         }
    2231             : 
    2232             :         /*
    2233             :          * Initialize and free pages in MAX_ORDER sized increments so
    2234             :          * that we can avoid introducing any issues with the buddy
    2235             :          * allocator.
    2236             :          */
    2237             :         while (spfn < epfn) {
    2238             :                 /* update our first deferred PFN for this section */
    2239             :                 first_deferred_pfn = spfn;
    2240             : 
    2241             :                 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
    2242             :                 touch_nmi_watchdog();
    2243             : 
    2244             :                 /* We should only stop along section boundaries */
    2245             :                 if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
    2246             :                         continue;
    2247             : 
    2248             :                 /* If our quota has been met we can stop here */
    2249             :                 if (nr_pages >= nr_pages_needed)
    2250             :                         break;
    2251             :         }
    2252             : 
    2253             :         pgdat->first_deferred_pfn = spfn;
    2254             :         pgdat_resize_unlock(pgdat, &flags);
    2255             : 
    2256             :         return nr_pages > 0;
    2257             : }
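
The loop above uses (first_deferred_pfn ^ spfn) < PAGES_PER_SECTION to test whether both pfns still fall in the same section: when they agree in every bit above the section offset, their XOR is smaller than the section size, and conversely. A small compile-and-run sketch of that property, using an assumed power-of-two section size:

#include <stdio.h>
#include <stdbool.h>

#define PAGES_PER_SECTION_SKETCH 32768UL        /* stand-in, must be a power of two */

/*
 * Two pfns lie in the same section exactly when they agree in every bit
 * above the section offset, i.e. their XOR fits below the section size.
 */
static bool same_section(unsigned long a, unsigned long b)
{
        return (a ^ b) < PAGES_PER_SECTION_SKETCH;
}

int main(void)
{
        printf("%d\n", same_section(32768, 65535));     /* 1: both in section 1 */
        printf("%d\n", same_section(32767, 32768));     /* 0: adjacent sections */
        return 0;
}
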
    2258             : 
    2259             : /*
    2260             :  * deferred_grow_zone() is __init, but it is called from
    2261             :  * get_page_from_freelist() during early boot until deferred_pages permanently
    2262             :  * disables this call. This is why we have the __ref wrapper: to avoid a
    2263             :  * section-mismatch warning and to ensure that the function body gets freed.
    2264             :  */
    2265             : static bool __ref
    2266             : _deferred_grow_zone(struct zone *zone, unsigned int order)
    2267             : {
    2268             :         return deferred_grow_zone(zone, order);
    2269             : }
    2270             : 
    2271             : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
    2272             : 
    2273           1 : void __init page_alloc_init_late(void)
    2274             : {
    2275             :         struct zone *zone;
    2276             :         int nid;
    2277             : 
    2278             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    2279             : 
    2280             :         /* There will be num_node_state(N_MEMORY) threads */
    2281             :         atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
    2282             :         for_each_node_state(nid, N_MEMORY) {
    2283             :                 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
    2284             :         }
    2285             : 
    2286             :         /* Block until all are initialised */
    2287             :         wait_for_completion(&pgdat_init_all_done_comp);
    2288             : 
    2289             :         /*
    2290             :          * We initialized the rest of the deferred pages.  Permanently disable
    2291             :          * on-demand struct page initialization.
    2292             :          */
    2293             :         static_branch_disable(&deferred_pages);
    2294             : 
    2295             :         /* Reinit limits that are based on free pages after the kernel is up */
    2296             :         files_maxfiles_init();
    2297             : #endif
    2298             : 
    2299           1 :         buffer_init();
    2300             : 
    2301             :         /* Discard memblock private memory */
    2302           1 :         memblock_discard();
    2303             : 
    2304           1 :         for_each_node_state(nid, N_MEMORY)
    2305             :                 shuffle_free_memory(NODE_DATA(nid));
    2306             : 
    2307           3 :         for_each_populated_zone(zone)
    2308           1 :                 set_zone_contiguous(zone);
    2309           1 : }
    2310             : 
    2311             : #ifdef CONFIG_CMA
    2312             : /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
    2313             : void __init init_cma_reserved_pageblock(struct page *page)
    2314             : {
    2315             :         unsigned i = pageblock_nr_pages;
    2316             :         struct page *p = page;
    2317             : 
    2318             :         do {
    2319             :                 __ClearPageReserved(p);
    2320             :                 set_page_count(p, 0);
    2321             :         } while (++p, --i);
    2322             : 
    2323             :         set_pageblock_migratetype(page, MIGRATE_CMA);
    2324             :         set_page_refcounted(page);
    2325             :         __free_pages(page, pageblock_order);
    2326             : 
    2327             :         adjust_managed_page_count(page, pageblock_nr_pages);
    2328             :         page_zone(page)->cma_pages += pageblock_nr_pages;
    2329             : }
    2330             : #endif
    2331             : 
    2332             : /*
    2333             :  * The order of subdivision here is critical for the IO subsystem.
    2334             :  * Please do not alter this order without good reasons and regression
    2335             :  * testing. Specifically, as large blocks of memory are subdivided,
    2336             :  * the order in which smaller blocks are delivered depends on the order
    2337             :  * they're subdivided in this function. This is the primary factor
    2338             :  * influencing the order in which pages are delivered to the IO
    2339             :  * subsystem according to empirical testing, and this is also justified
    2340             :  * by considering the behavior of a buddy system containing a single
    2341             :  * large block of memory acted on by a series of small allocations.
    2342             :  * This behavior is a critical factor in sglist merging's success.
    2343             :  *
    2344             :  * -- nyc
    2345             :  */
    2346             : static inline void expand(struct zone *zone, struct page *page,
    2347             :         int low, int high, int migratetype)
    2348             : {
    2349        2321 :         unsigned long size = 1 << high;
    2350             : 
    2351        4647 :         while (high > low) {
    2352        2326 :                 high--;
    2353        2326 :                 size >>= 1;
    2354             :                 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
    2355             : 
    2356             :                 /*
    2357             :                  * Mark as guard pages (or a page); this allows them to be
    2358             :                  * merged back into the allocator when the buddy is freed.
    2359             :                  * The corresponding page table entries will not be touched;
    2360             :                  * the pages stay not present in the virtual address space.
    2361             :                  */
    2362        2326 :                 if (set_page_guard(zone, &page[size], high, migratetype))
    2363             :                         continue;
    2364             : 
    2365        4652 :                 add_to_free_list(&page[size], zone, high, migratetype);
    2366        2326 :                 set_buddy_order(&page[size], high);
    2367             :         }
    2368             : }
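
The splitting performed by expand() can be traced in isolation: starting from an order-'high' block, each iteration halves 'size' and returns the upper half to the free list at the newly lowered order, leaving the lower half for the allocation. The sketch below just prints those steps; base_pfn and the printf calls stand in for the real free-list bookkeeping:

#include <stdio.h>

/*
 * Split an order-'high' block down to order-'low': at each step the upper
 * half (starting 'size' pages in) goes back to the free list of the new
 * order, mirroring expand() above.
 */
static void sketch_expand(unsigned long base_pfn, int low, int high)
{
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                printf("free buddy at pfn %lu, order %d\n",
                       base_pfn + size, high);
        }
        printf("allocated block at pfn %lu, order %d\n", base_pfn, low);
}

int main(void)
{
        sketch_expand(1024, 0, 3);      /* split an order-3 block for an order-0 request */
        return 0;
}
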
    2369             : 
    2370           0 : static void check_new_page_bad(struct page *page)
    2371             : {
    2372             :         if (unlikely(page->flags & __PG_HWPOISON)) {
    2373             :                 /* Don't complain about hwpoisoned pages */
    2374             :                 page_mapcount_reset(page); /* remove PageBuddy */
    2375             :                 return;
    2376             :         }
    2377             : 
    2378           0 :         bad_page(page,
    2379             :                  page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
    2380             : }
    2381             : 
    2382             : /*
    2383             :  * This page is about to be returned from the page allocator
    2384             :  */
    2385        2635 : static inline int check_new_page(struct page *page)
    2386             : {
    2387        2635 :         if (likely(page_expected_state(page,
    2388             :                                 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
    2389             :                 return 0;
    2390             : 
    2391           0 :         check_new_page_bad(page);
    2392           0 :         return 1;
    2393             : }
    2394             : 
    2395             : static bool check_new_pages(struct page *page, unsigned int order)
    2396             : {
    2397             :         int i;
    2398        2635 :         for (i = 0; i < (1 << order); i++) {
    2399        2635 :                 struct page *p = page + i;
    2400             : 
    2401        2635 :                 if (unlikely(check_new_page(p)))
    2402             :                         return true;
    2403             :         }
    2404             : 
    2405             :         return false;
    2406             : }
    2407             : 
    2408             : #ifdef CONFIG_DEBUG_VM
    2409             : /*
    2410             :  * With DEBUG_VM enabled, order-0 pages are checked for expected state when
    2411             :  * being allocated from pcp lists. With debug_pagealloc also enabled, they are
    2412             :  * also checked when pcp lists are refilled from the free lists.
    2413             :  */
    2414             : static inline bool check_pcp_refill(struct page *page, unsigned int order)
    2415             : {
    2416             :         if (debug_pagealloc_enabled_static())
    2417             :                 return check_new_pages(page, order);
    2418             :         else
    2419             :                 return false;
    2420             : }
    2421             : 
    2422             : static inline bool check_new_pcp(struct page *page, unsigned int order)
    2423             : {
    2424             :         return check_new_pages(page, order);
    2425             : }
    2426             : #else
    2427             : /*
    2428             :  * With DEBUG_VM disabled, free order-0 pages are checked for expected state
    2429             :  * when pcp lists are being refilled from the free lists. With debug_pagealloc
    2430             :  * enabled, they are also checked when being allocated from the pcp lists.
    2431             :  */
    2432             : static inline bool check_pcp_refill(struct page *page, unsigned int order)
    2433             : {
    2434        2321 :         return check_new_pages(page, order);
    2435             : }
    2436             : static inline bool check_new_pcp(struct page *page, unsigned int order)
    2437             : {
    2438             :         if (debug_pagealloc_enabled_static())
    2439             :                 return check_new_pages(page, order);
    2440             :         else
    2441             :                 return false;
    2442             : }
    2443             : #endif /* CONFIG_DEBUG_VM */
    2444             : 
    2445             : static inline bool should_skip_kasan_unpoison(gfp_t flags)
    2446             : {
    2447             :         /* Don't skip if a software KASAN mode is enabled. */
    2448             :         if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
    2449             :             IS_ENABLED(CONFIG_KASAN_SW_TAGS))
    2450             :                 return false;
    2451             : 
    2452             :         /* Skip, if hardware tag-based KASAN is not enabled. */
    2453             :         if (!kasan_hw_tags_enabled())
    2454             :                 return true;
    2455             : 
    2456             :         /*
    2457             :          * With hardware tag-based KASAN enabled, skip if this has been
    2458             :          * requested via __GFP_SKIP_KASAN_UNPOISON.
    2459             :          */
    2460             :         return flags & __GFP_SKIP_KASAN_UNPOISON;
    2461             : }
    2462             : 
    2463             : static inline bool should_skip_init(gfp_t flags)
    2464             : {
    2465             :         /* Don't skip, if hardware tag-based KASAN is not enabled. */
    2466             :         if (!kasan_hw_tags_enabled())
    2467             :                 return false;
    2468             : 
    2469             :         /* For hardware tag-based KASAN, skip if requested. */
    2470             :         return (flags & __GFP_SKIP_ZERO);
    2471             : }
    2472             : 
    2473       45065 : inline void post_alloc_hook(struct page *page, unsigned int order,
    2474             :                                 gfp_t gfp_flags)
    2475             : {
    2476       90130 :         bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
    2477             :                         !should_skip_init(gfp_flags);
    2478       45065 :         bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
    2479       45065 :         bool reset_tags = true;
    2480             :         int i;
    2481             : 
    2482       90130 :         set_page_private(page, 0);
    2483       45065 :         set_page_refcounted(page);
    2484             : 
    2485       45065 :         arch_alloc_page(page, order);
    2486       45065 :         debug_pagealloc_map_pages(page, 1 << order);
    2487             : 
    2488             :         /*
    2489             :          * Page unpoisoning must happen before memory initialization.
    2490             :          * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
    2491             :          * allocations and the page unpoisoning code will complain.
    2492             :          */
    2493       45065 :         kernel_unpoison_pages(page, 1 << order);
    2494             : 
    2495             :         /*
    2496             :          * As memory initialization might be integrated into KASAN,
    2497             :          * KASAN unpoisoning and memory initialization code must be
    2498             :          * kept together to avoid discrepancies in behavior.
    2499             :          */
    2500             : 
    2501             :         /*
    2502             :          * Zero the memory tags if requested
    2503             :          * (this happens only when memory should be initialized as well).
    2504             :          */
    2505       45065 :         if (zero_tags) {
    2506             :                 /* Initialize both memory and memory tags. */
    2507             :                 for (i = 0; i != 1 << order; ++i)
    2508             :                         tag_clear_highpage(page + i);
    2509             : 
    2510             :                 /* Take note that memory was initialized by the loop above. */
    2511             :                 init = false;
    2512             :         }
    2513       45065 :         if (!should_skip_kasan_unpoison(gfp_flags)) {
    2514             :                 /* Try unpoisoning (or setting tags) and initializing memory. */
    2515             :                 if (kasan_unpoison_pages(page, order, init)) {
    2516             :                         /* Take note that memory was initialized by KASAN. */
    2517             :                         if (kasan_has_integrated_init())
    2518             :                                 init = false;
    2519             :                         /* Take note that memory tags were set by KASAN. */
    2520             :                         reset_tags = false;
    2521             :                 } else {
    2522             :                         /*
    2523             :                          * KASAN decided to exclude this allocation from being
    2524             :                          * (un)poisoned due to sampling. Make KASAN skip
    2525             :                          * poisoning when the allocation is freed.
    2526             :                          */
    2527             :                         SetPageSkipKASanPoison(page);
    2528             :                 }
    2529             :         }
    2530             :         /*
    2531             :          * If memory tags have not been set by KASAN, reset the page tags to
    2532             :          * ensure page_address() dereferencing does not fault.
    2533             :          */
    2534             :         if (reset_tags) {
    2535             :                 for (i = 0; i != 1 << order; ++i)
    2536             :                         page_kasan_tag_reset(page + i);
    2537             :         }
    2538             :         /* If memory is still not initialized, initialize it now. */
    2539       45065 :         if (init)
    2540             :                 kernel_init_pages(page, 1 << order);
    2541             :         /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
    2542             :         if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
    2543             :                 SetPageSkipKASanPoison(page);
    2544             : 
    2545       45065 :         set_page_owner(page, order, gfp_flags);
    2546       45065 :         page_table_check_alloc(page, order);
    2547       45065 : }
    2548             : 
    2549        2524 : static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
    2550             :                                                         unsigned int alloc_flags)
    2551             : {
    2552       45065 :         post_alloc_hook(page, order, gfp_flags);
    2553             : 
    2554        2524 :         if (order && (gfp_flags & __GFP_COMP))
    2555             :                 prep_compound_page(page, order);
    2556             : 
    2557             :         /*
    2558             :          * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
    2559             :          * allocate the page. The expectation is that the caller is taking
    2560             :          * steps that will free more memory. The caller should avoid the page
    2561             :          * being used for !PFMEMALLOC purposes.
    2562             :          */
    2563        2524 :         if (alloc_flags & ALLOC_NO_WATERMARKS)
    2564           0 :                 set_page_pfmemalloc(page);
    2565             :         else
    2566       45065 :                 clear_page_pfmemalloc(page);
    2567        2524 : }
    2568             : 
    2569             : /*
    2570             :  * Go through the free lists for the given migratetype and remove
    2571             :  * the smallest available page from the freelists
    2572             :  */
    2573             : static __always_inline
    2574             : struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
    2575             :                                                 int migratetype)
    2576             : {
    2577             :         unsigned int current_order;
    2578             :         struct free_area *area;
    2579             :         struct page *page;
    2580             : 
    2581             :         /* Find a page of the appropriate size in the preferred list */
    2582        9388 :         for (current_order = order; current_order < MAX_ORDER; ++current_order) {
    2583        4690 :                 area = &(zone->free_area[current_order]);
    2584        4690 :                 page = get_page_from_free_area(area, migratetype);
    2585        4690 :                 if (!page)
    2586        2369 :                         continue;
    2587        2321 :                 del_page_from_free_list(page, zone, current_order);
    2588        4642 :                 expand(zone, page, order, current_order, migratetype);
    2589        2321 :                 set_pcppage_migratetype(page, migratetype);
    2590             :                 trace_mm_page_alloc_zone_locked(page, order, migratetype,
    2591             :                                 pcp_allowed_order(order) &&
    2592             :                                 migratetype < MIGRATE_PCPTYPES);
    2593             :                 return page;
    2594             :         }
    2595             : 
    2596             :         return NULL;
    2597             : }
    2598             : 
    2599             : 
    2600             : /*
    2601             :  * This array describes the order in which free lists are fallen back to
    2602             :  * when the free lists for the desired migratetype are depleted.
    2603             :  *
    2604             :  * The other migratetypes do not have fallbacks.
    2605             :  */
    2606             : static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
    2607             :         [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE   },
    2608             :         [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
    2609             :         [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE   },
    2610             : };
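
As a quick illustration of how this table is consumed (see find_suitable_fallback() below), the following sketch mirrors it with stand-in enum values and prints the raid order for each of the three pcp migratetypes; the names are invented for the example and are not the kernel's enums:

#include <stdio.h>

/* Mirror of the table above, with stand-in enum values. */
enum { MT_UNMOVABLE, MT_MOVABLE, MT_RECLAIMABLE, MT_PCPTYPES };

static const int sketch_fallbacks[MT_PCPTYPES][MT_PCPTYPES - 1] = {
        [MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE    },
        [MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE  },
        [MT_RECLAIMABLE] = { MT_UNMOVABLE,   MT_MOVABLE    },
};

static const char *name(int mt)
{
        static const char *names[] = { "unmovable", "movable", "reclaimable" };
        return names[mt];
}

int main(void)
{
        /* Print the order in which each migratetype raids the others. */
        for (int mt = 0; mt < MT_PCPTYPES; mt++)
                printf("%s falls back to: %s, then %s\n", name(mt),
                       name(sketch_fallbacks[mt][0]),
                       name(sketch_fallbacks[mt][1]));
        return 0;
}
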
    2611             : 
    2612             : #ifdef CONFIG_CMA
    2613             : static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
    2614             :                                         unsigned int order)
    2615             : {
    2616             :         return __rmqueue_smallest(zone, order, MIGRATE_CMA);
    2617             : }
    2618             : #else
    2619             : static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
    2620             :                                         unsigned int order) { return NULL; }
    2621             : #endif
    2622             : 
    2623             : /*
    2624             :  * Move the free pages in a range to the freelist tail of the requested type.
    2625             :  * Note that start_pfn and end_pfn are not necessarily aligned on a pageblock
    2626             :  * boundary. If alignment is required, use move_freepages_block().
    2627             :  */
    2628           0 : static int move_freepages(struct zone *zone,
    2629             :                           unsigned long start_pfn, unsigned long end_pfn,
    2630             :                           int migratetype, int *num_movable)
    2631             : {
    2632             :         struct page *page;
    2633             :         unsigned long pfn;
    2634             :         unsigned int order;
    2635           0 :         int pages_moved = 0;
    2636             : 
    2637           0 :         for (pfn = start_pfn; pfn <= end_pfn;) {
    2638           0 :                 page = pfn_to_page(pfn);
    2639           0 :                 if (!PageBuddy(page)) {
    2640             :                         /*
    2641             :                          * We assume that pages that could be isolated for
    2642             :                          * migration are movable. But we don't actually try
    2643             :                          * isolating, as that would be expensive.
    2644             :                          */
    2645           0 :                         if (num_movable &&
    2646           0 :                                         (PageLRU(page) || __PageMovable(page)))
    2647           0 :                                 (*num_movable)++;
    2648           0 :                         pfn++;
    2649           0 :                         continue;
    2650             :                 }
    2651             : 
    2652             :                 /* Make sure we are not inadvertently changing nodes */
    2653             :                 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
    2654             :                 VM_BUG_ON_PAGE(page_zone(page) != zone, page);
    2655             : 
    2656           0 :                 order = buddy_order(page);
    2657           0 :                 move_to_free_list(page, zone, order, migratetype);
    2658           0 :                 pfn += 1 << order;
    2659           0 :                 pages_moved += 1 << order;
    2660             :         }
    2661             : 
    2662           0 :         return pages_moved;
    2663             : }
    2664             : 
    2665           0 : int move_freepages_block(struct zone *zone, struct page *page,
    2666             :                                 int migratetype, int *num_movable)
    2667             : {
    2668             :         unsigned long start_pfn, end_pfn, pfn;
    2669             : 
    2670           0 :         if (num_movable)
    2671           0 :                 *num_movable = 0;
    2672             : 
    2673           0 :         pfn = page_to_pfn(page);
    2674           0 :         start_pfn = pageblock_start_pfn(pfn);
    2675           0 :         end_pfn = pageblock_end_pfn(pfn) - 1;
    2676             : 
    2677             :         /* Do not cross zone boundaries */
    2678           0 :         if (!zone_spans_pfn(zone, start_pfn))
    2679           0 :                 start_pfn = pfn;
    2680           0 :         if (!zone_spans_pfn(zone, end_pfn))
    2681             :                 return 0;
    2682             : 
    2683           0 :         return move_freepages(zone, start_pfn, end_pfn, migratetype,
    2684             :                                                                 num_movable);
    2685             : }
    2686             : 
    2687             : static void change_pageblock_range(struct page *pageblock_page,
    2688             :                                         int start_order, int migratetype)
    2689             : {
    2690           4 :         int nr_pageblocks = 1 << (start_order - pageblock_order);
    2691             : 
    2692           8 :         while (nr_pageblocks--) {
    2693           4 :                 set_pageblock_migratetype(pageblock_page, migratetype);
    2694           4 :                 pageblock_page += pageblock_nr_pages;
    2695             :         }
    2696             : }
    2697             : 
    2698             : /*
    2699             :  * When we are falling back to another migratetype during allocation, try to
    2700             :  * steal extra free pages from the same pageblocks to satisfy further
    2701             :  * allocations, instead of polluting multiple pageblocks.
    2702             :  *
    2703             :  * If we are stealing a relatively large buddy page, it is likely there will
    2704             :  * be more free pages in the pageblock, so try to steal them all. For
    2705             :  * reclaimable and unmovable allocations, we steal regardless of page size,
    2706             :  * as fragmentation caused by those allocations polluting movable pageblocks
    2707             :  * is worse than movable allocations stealing from unmovable and reclaimable
    2708             :  * pageblocks.
    2709             :  */
    2710             : static bool can_steal_fallback(unsigned int order, int start_mt)
    2711             : {
    2712             :         /*
    2713             :          * This order check is kept intentionally, even though a more
    2714             :          * relaxed order check follows below. The reason is that we can
    2715             :          * actually steal the whole pageblock if this condition is met,
    2716             :          * but the check below does not guarantee it and is just a
    2717             :          * heuristic, so it could be changed at any time.
    2718             :          */
    2719           4 :         if (order >= pageblock_order)
    2720             :                 return true;
    2721             : 
    2722           0 :         if (order >= pageblock_order / 2 ||
    2723           0 :                 start_mt == MIGRATE_RECLAIMABLE ||
    2724           0 :                 start_mt == MIGRATE_UNMOVABLE ||
    2725             :                 page_group_by_mobility_disabled)
    2726             :                 return true;
    2727             : 
    2728             :         return false;
    2729             : }
    2730             : 
    2731           0 : static inline bool boost_watermark(struct zone *zone)
    2732             : {
    2733             :         unsigned long max_boost;
    2734             : 
    2735           0 :         if (!watermark_boost_factor)
    2736             :                 return false;
    2737             :         /*
    2738             :          * Don't bother in zones that are unlikely to produce results.
    2739             :          * On small machines, including kdump capture kernels running
    2740             :          * in a small area, boosting the watermark can cause an out of
    2741             :          * memory situation immediately.
    2742             :          */
    2743           0 :         if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
    2744             :                 return false;
    2745             : 
    2746           0 :         max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
    2747             :                         watermark_boost_factor, 10000);
    2748             : 
    2749             :         /*
    2750             :          * The high watermark may be uninitialised if fragmentation occurs
    2751             :          * very early in boot, so do not boost. We do not fall through
    2752             :          * and boost by pageblock_nr_pages because allocations failing
    2753             :          * that early mean that reclaim is not going to help, and it may
    2754             :          * even be impossible to reclaim up to the boosted watermark,
    2755             :          * resulting in a hang.
    2756             :          */
    2757           0 :         if (!max_boost)
    2758             :                 return false;
    2759             : 
    2760           0 :         max_boost = max(pageblock_nr_pages, max_boost);
    2761             : 
    2762           0 :         zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
    2763             :                 max_boost);
    2764             : 
    2765           0 :         return true;
    2766             : }
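
The ceiling above is a fraction of the high watermark expressed in basis points, computed with mult_frac() to avoid intermediate overflow. A rough userspace sketch of that computation and of a single boost step follows; the constants are purely illustrative and mult_frac_sketch() is a stand-in written in the same spirit as the kernel macro:

#include <stdio.h>

/*
 * Overflow-avoiding x * numer / denom, in the spirit of the kernel's
 * mult_frac(): split x into quotient and remainder of denom first.
 */
static unsigned long mult_frac_sketch(unsigned long x, unsigned long numer,
                                      unsigned long denom)
{
        unsigned long q = x / denom;
        unsigned long r = x % denom;

        return q * numer + r * numer / denom;
}

int main(void)
{
        unsigned long high_wmark = 12800;       /* pages, illustrative */
        unsigned long boost_factor = 15000;     /* 150% in basis points */
        unsigned long pageblock = 512;
        unsigned long max_boost, boost = 0;

        max_boost = mult_frac_sketch(high_wmark, boost_factor, 10000);
        if (max_boost < pageblock)
                max_boost = pageblock;

        /* One boost event: bump by a pageblock, clamped to max_boost. */
        boost = boost + pageblock < max_boost ? boost + pageblock : max_boost;
        printf("max_boost=%lu boost=%lu\n", max_boost, boost);
        return 0;
}
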
    2767             : 
    2768             : /*
    2769             :  * This function implements actual steal behaviour. If order is large enough,
    2770             :  * This function implements the actual steal behaviour. If the order is large
    2771             :  * enough, we can steal the whole pageblock. If not, we first move the free
    2772             :  * pages in this pageblock to our migratetype and determine how many of the
    2773             :  * already-allocated pages have a compatible migratetype. If at least half of
    2774             :  * the pages are free or compatible, we can change the migratetype of the
    2775             :  * pageblock itself, so pages freed in the future will go on the correct free list.
    2776           4 : static void steal_suitable_fallback(struct zone *zone, struct page *page,
    2777             :                 unsigned int alloc_flags, int start_type, bool whole_block)
    2778             : {
    2779           8 :         unsigned int current_order = buddy_order(page);
    2780             :         int free_pages, movable_pages, alike_pages;
    2781             :         int old_block_type;
    2782             : 
    2783           8 :         old_block_type = get_pageblock_migratetype(page);
    2784             : 
    2785             :         /*
    2786             :          * This can happen due to races and we want to prevent broken
    2787             :          * highatomic accounting.
    2788             :          */
    2789           4 :         if (is_migrate_highatomic(old_block_type))
    2790             :                 goto single_page;
    2791             : 
    2792             :         /* Take ownership for orders >= pageblock_order */
    2793           4 :         if (current_order >= pageblock_order) {
    2794           4 :                 change_pageblock_range(page, current_order, start_type);
    2795             :                 goto single_page;
    2796             :         }
    2797             : 
    2798             :         /*
    2799             :          * Boost watermarks to increase reclaim pressure to reduce the
    2800             :          * likelihood of future fallbacks. Wake kswapd now as the node
    2801             :          * may be balanced overall and kswapd will not wake naturally.
    2802             :          */
    2803           0 :         if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
    2804           0 :                 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
    2805             : 
    2806             :         /* We are not allowed to try stealing from the whole block */
    2807           0 :         if (!whole_block)
    2808             :                 goto single_page;
    2809             : 
    2810           0 :         free_pages = move_freepages_block(zone, page, start_type,
    2811             :                                                 &movable_pages);
    2812             :         /*
    2813             :          * Determine how many pages are compatible with our allocation.
    2814             :          * For movable allocation, it's the number of movable pages which
    2815             :          * we just obtained. For other types it's a bit more tricky.
    2816             :          */
    2817           0 :         if (start_type == MIGRATE_MOVABLE) {
    2818           0 :                 alike_pages = movable_pages;
    2819             :         } else {
    2820             :                 /*
    2821             :                  * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
    2822             :                  * to MOVABLE pageblock, consider all non-movable pages as
    2823             :                  * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
    2824             :                  * vice versa, be conservative since we can't distinguish the
    2825             :                  * exact migratetype of non-movable pages.
    2826             :                  */
    2827           0 :                 if (old_block_type == MIGRATE_MOVABLE)
    2828           0 :                         alike_pages = pageblock_nr_pages
    2829           0 :                                                 - (free_pages + movable_pages);
    2830             :                 else
    2831             :                         alike_pages = 0;
    2832             :         }
    2833             : 
    2834             :         /* moving whole block can fail due to zone boundary conditions */
    2835           0 :         if (!free_pages)
    2836             :                 goto single_page;
    2837             : 
    2838             :         /*
    2839             :          * If a sufficient number of pages in the block are either free or of
    2840             :          * comparable migratability as our allocation, claim the whole block.
    2841             :          */
    2842           0 :         if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
    2843             :                         page_group_by_mobility_disabled)
    2844           0 :                 set_pageblock_migratetype(page, start_type);
    2845             : 
    2846           0 :         return;
    2847             : 
    2848             : single_page:
    2849           4 :         move_to_free_list(page, zone, current_order, start_type);
    2850             : }
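
The decision above boils down to a threshold: claim the whole pageblock when the free plus compatible pages cover at least half of it, i.e. 1 << (pageblock_order - 1) pages. A tiny sketch with an assumed pageblock_order of 9 (512-page pageblocks):

#include <stdio.h>
#include <stdbool.h>

#define PAGEBLOCK_ORDER_SKETCH 9        /* stand-in: 512-page pageblocks */

/*
 * The "claim the whole pageblock" test: at least half of the block must
 * be free or of a compatible type.
 */
static bool claim_whole_block(int free_pages, int alike_pages)
{
        return free_pages + alike_pages >= (1 << (PAGEBLOCK_ORDER_SKETCH - 1));
}

int main(void)
{
        printf("%d\n", claim_whole_block(200, 60));     /* 260 >= 256 -> 1 */
        printf("%d\n", claim_whole_block(100, 50));     /* 150 <  256 -> 0 */
        return 0;
}
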
    2851             : 
    2852             : /*
    2853             :  * Check whether there is a suitable fallback freepage with requested order.
    2854             :  * Check whether there is a suitable fallback freepage with the requested order.
    2855             :  * If only_stealable is true, this function returns fallback_mt only if
    2856             :  * we can steal all the other freepages together. This helps to reduce
    2857             :  */
    2858           4 : int find_suitable_fallback(struct free_area *area, unsigned int order,
    2859             :                         int migratetype, bool only_stealable, bool *can_steal)
    2860             : {
    2861             :         int i;
    2862             :         int fallback_mt;
    2863             : 
    2864           4 :         if (area->nr_free == 0)
    2865             :                 return -1;
    2866             : 
    2867           4 :         *can_steal = false;
    2868           8 :         for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
    2869           8 :                 fallback_mt = fallbacks[migratetype][i];
    2870           8 :                 if (free_area_empty(area, fallback_mt))
    2871           4 :                         continue;
    2872             : 
    2873           4 :                 if (can_steal_fallback(order, migratetype))
    2874           4 :                         *can_steal = true;
    2875             : 
    2876           4 :                 if (!only_stealable)
    2877             :                         return fallback_mt;
    2878             : 
    2879           0 :                 if (*can_steal)
    2880             :                         return fallback_mt;
    2881             :         }
    2882             : 
    2883             :         return -1;
    2884             : }
    2885             : 
    2886             : /*
    2887             :  * Reserve a pageblock for exclusive use of high-order atomic allocations if
    2888             :  * there are no empty page blocks that contain a page with a suitable order
    2889             :  */
    2890           0 : static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
    2891             :                                 unsigned int alloc_order)
    2892             : {
    2893             :         int mt;
    2894             :         unsigned long max_managed, flags;
    2895             : 
    2896             :         /*
    2897             :          * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
    2898             :          * Check is race-prone but harmless.
    2899             :          */
    2900           0 :         max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
    2901           0 :         if (zone->nr_reserved_highatomic >= max_managed)
    2902             :                 return;
    2903             : 
    2904           0 :         spin_lock_irqsave(&zone->lock, flags);
    2905             : 
    2906             :         /* Recheck the nr_reserved_highatomic limit under the lock */
    2907           0 :         if (zone->nr_reserved_highatomic >= max_managed)
    2908             :                 goto out_unlock;
    2909             : 
    2910             :         /* Yoink! */
    2911           0 :         mt = get_pageblock_migratetype(page);
    2912             :         /* Only reserve normal pageblocks (i.e., they can merge with others) */
    2913           0 :         if (migratetype_is_mergeable(mt)) {
    2914           0 :                 zone->nr_reserved_highatomic += pageblock_nr_pages;
    2915           0 :                 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
    2916           0 :                 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
    2917             :         }
    2918             : 
    2919             : out_unlock:
    2920           0 :         spin_unlock_irqrestore(&zone->lock, flags);
    2921             : }
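
The cap applied above works out to roughly 1% of the zone's managed pages plus one pageblock, and it is rechecked under the zone lock to close the race mentioned in the comment. A minimal sketch of just the cap arithmetic, with an assumed pageblock size:

#include <stdio.h>
#include <stdbool.h>

#define PAGEBLOCK_NR_PAGES_SKETCH 512UL

/*
 * The reservation cap used above: roughly 1% of managed pages, rounded
 * up by one pageblock so small zones can still reserve something.
 */
static bool may_reserve(unsigned long managed_pages,
                        unsigned long nr_reserved_highatomic)
{
        unsigned long max_managed = managed_pages / 100 +
                                    PAGEBLOCK_NR_PAGES_SKETCH;

        return nr_reserved_highatomic < max_managed;
}

int main(void)
{
        /* A 1M-page zone allows about 10485 + 512 reserved pages. */
        printf("%d\n", may_reserve(1UL << 20, 10000));  /* 1: under the cap */
        printf("%d\n", may_reserve(1UL << 20, 11000));  /* 0: over the cap */
        return 0;
}
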
    2922             : 
    2923             : /*
    2924             :  * Used when an allocation is about to fail under memory pressure. This
    2925             :  * potentially hurts the reliability of high-order allocations when under
    2926             :  * intense memory pressure but failed atomic allocations should be easier
    2927             :  * to recover from than an OOM.
    2928             :  *
    2929             :  * If @force is true, try to unreserve a pageblock even though highatomic
    2930             :  * pageblock is exhausted.
    2931             :  */
    2932           0 : static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
    2933             :                                                 bool force)
    2934             : {
    2935           0 :         struct zonelist *zonelist = ac->zonelist;
    2936             :         unsigned long flags;
    2937             :         struct zoneref *z;
    2938             :         struct zone *zone;
    2939             :         struct page *page;
    2940             :         int order;
    2941             :         bool ret;
    2942             : 
    2943           0 :         for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
    2944             :                                                                 ac->nodemask) {
    2945             :                 /*
    2946             :                  * Preserve at least one pageblock unless memory pressure
    2947             :                  * is really high.
    2948             :                  */
    2949           0 :                 if (!force && zone->nr_reserved_highatomic <=
    2950             :                                         pageblock_nr_pages)
    2951           0 :                         continue;
    2952             : 
    2953           0 :                 spin_lock_irqsave(&zone->lock, flags);
    2954           0 :                 for (order = 0; order < MAX_ORDER; order++) {
    2955           0 :                         struct free_area *area = &(zone->free_area[order]);
    2956             : 
    2957           0 :                         page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
    2958           0 :                         if (!page)
    2959           0 :                                 continue;
    2960             : 
    2961             :                         /*
    2962             :                          * In the page freeing path, the migratetype change is
    2963             :                          * racy, so we can encounter several free pages in a
    2964             :                          * pageblock in this loop even though we changed the
    2965             :                          * pageblock type from highatomic to ac->migratetype.
    2966             :                          * So the count should only be adjusted once.
    2967             :                          */
    2968           0 :                         if (is_migrate_highatomic_page(page)) {
    2969             :                                 /*
    2970             :                                  * It should never happen, but changes to
    2971             :                                  * locking could inadvertently allow a per-cpu
    2972             :                                  * drain to add pages to MIGRATE_HIGHATOMIC
    2973             :                                  * while unreserving, so be safe and watch for
    2974             :                                  * underflows.
    2975             :                                  */
    2976           0 :                                 zone->nr_reserved_highatomic -= min(
    2977             :                                                 pageblock_nr_pages,
    2978             :                                                 zone->nr_reserved_highatomic);
    2979             :                         }
    2980             : 
    2981             :                         /*
    2982             :                          * Convert to ac->migratetype and avoid the normal
    2983             :                          * pageblock stealing heuristics. Minimally, the caller
    2984             :                          * is doing the work and needs the pages. More
    2985             :                          * importantly, if the block were always converted to
    2986             :                          * MIGRATE_UNMOVABLE or another type, then the number
    2987             :                          * of pageblocks that cannot be completely freed
    2988             :                          * may increase.
    2989             :                          */
    2990           0 :                         set_pageblock_migratetype(page, ac->migratetype);
    2991           0 :                         ret = move_freepages_block(zone, page, ac->migratetype,
    2992             :                                                                         NULL);
    2993           0 :                         if (ret) {
    2994           0 :                                 spin_unlock_irqrestore(&zone->lock, flags);
    2995           0 :                                 return ret;
    2996             :                         }
    2997             :                 }
    2998           0 :                 spin_unlock_irqrestore(&zone->lock, flags);
    2999             :         }
    3000             : 
    3001             :         return false;
    3002             : }
    3003             : 
    3004             : /*
    3005             :  * Try finding a free buddy page on the fallback list and put it on the free
    3006             :  * list of requested migratetype, possibly along with other pages from the same
    3007             :  * block, depending on fragmentation avoidance heuristics. Returns true if
    3008             :  * fallback was found so that __rmqueue_smallest() can grab it.
    3009             :  *
    3010             :  * The use of signed ints for order and current_order is a deliberate
    3011             :  * deviation from the rest of this file, to make the for loop
    3012             :  * condition simpler.
    3013             :  */
    3014             : static __always_inline bool
    3015             : __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
    3016             :                                                 unsigned int alloc_flags)
    3017             : {
    3018             :         struct free_area *area;
    3019             :         int current_order;
    3020           4 :         int min_order = order;
    3021             :         struct page *page;
    3022             :         int fallback_mt;
    3023             :         bool can_steal;
    3024             : 
    3025             :         /*
    3026             :          * Do not steal pages from freelists belonging to other pageblocks
    3027             :          * i.e., orders < pageblock_order. If no local zone has free pages,
    3028             :          * the zonelists will be re-iterated without ALLOC_NOFRAGMENT.
    3029             :          */
    3030             :         if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
    3031             :                 min_order = pageblock_order;
    3032             : 
    3033             :         /*
    3034             :          * Find the largest available free page in the other list. This roughly
    3035             :          * approximates finding the pageblock with the most free pages, which
    3036             :          * would be too costly to do exactly.
    3037             :          */
    3038           8 :         for (current_order = MAX_ORDER - 1; current_order >= min_order;
    3039           0 :                                 --current_order) {
    3040           4 :                 area = &(zone->free_area[current_order]);
    3041           4 :                 fallback_mt = find_suitable_fallback(area, current_order,
    3042             :                                 start_migratetype, false, &can_steal);
    3043           4 :                 if (fallback_mt == -1)
    3044           0 :                         continue;
    3045             : 
    3046             :                 /*
    3047             :                  * If we cannot steal all free pages from the pageblock and the
    3048             :                  * requested migratetype is movable, it is better to steal and
    3049             :                  * split the smallest available page instead of the largest
    3050             :                  * one, because even if the next movable allocation falls back
    3051             :                  * into a different pageblock than this one, it won't cause
    3052             :                  * permanent fragmentation.
    3053             :                  */
    3054           4 :                 if (!can_steal && start_migratetype == MIGRATE_MOVABLE
    3055           0 :                                         && current_order > order)
    3056             :                         goto find_smallest;
    3057             : 
    3058             :                 goto do_steal;
    3059             :         }
    3060             : 
    3061             :         return false;
    3062             : 
    3063             : find_smallest:
    3064           0 :         for (current_order = order; current_order < MAX_ORDER;
    3065           0 :                                                         current_order++) {
    3066           0 :                 area = &(zone->free_area[current_order]);
    3067           0 :                 fallback_mt = find_suitable_fallback(area, current_order,
    3068             :                                 start_migratetype, false, &can_steal);
    3069           0 :                 if (fallback_mt != -1)
    3070             :                         break;
    3071             :         }
    3072             : 
    3073             :         /*
    3074             :          * This should not happen - we already found a suitable fallback
    3075             :          * when looking for the largest page.
    3076             :          */
    3077             :         VM_BUG_ON(current_order == MAX_ORDER);
    3078             : 
    3079             : do_steal:
    3080           4 :         page = get_page_from_free_area(area, fallback_mt);
    3081             : 
    3082           4 :         steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
    3083             :                                                                 can_steal);
    3084             : 
    3085           4 :         trace_mm_page_alloc_extfrag(page, order, current_order,
    3086             :                 start_migratetype, fallback_mt);
    3087             : 
    3088             :         return true;
    3089             : 
    3090             : }
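The search above prefers the largest free block on the fallback lists, but a
movable request that cannot take the whole pageblock drops back to the smallest
suitable order instead. A minimal sketch of that selection logic, assuming
invented has_free/can_steal arrays in place of the real free_area and
find_suitable_fallback() bookkeeping:

/* Illustrative sketch (not kernel code) of the fallback order selection. */
#include <stdbool.h>

#define MAX_ORDER 11    /* assumed example value */

/* Returns the order to steal from, or -1 if no fallback exists. */
static int pick_fallback_order(int order, int min_order, bool movable,
                               const bool has_free[MAX_ORDER],
                               const bool can_steal[MAX_ORDER])
{
        int cur;

        /* Prefer the largest available block: best for fragmentation. */
        for (cur = MAX_ORDER - 1; cur >= min_order; cur--) {
                if (!has_free[cur])
                        continue;
                /*
                 * A movable request that cannot take the whole pageblock
                 * is better served by splitting the smallest suitable block.
                 */
                if (!can_steal[cur] && movable && cur > order)
                        goto find_smallest;
                return cur;
        }
        return -1;

find_smallest:
        for (cur = order; cur < MAX_ORDER; cur++)
                if (has_free[cur])
                        return cur;
        return -1;
}

int main(void)
{
        bool has_free[MAX_ORDER]  = { [3] = true, [9] = true };
        bool can_steal[MAX_ORDER] = { [3] = true };

        /* Movable order-0 request: order 9 can't be stolen whole, use order 3. */
        return pick_fallback_order(0, 0, true, has_free, can_steal) == 3 ? 0 : 1;
}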
    3091             : 
    3092             : /*
    3093             :  * Do the hard work of removing an element from the buddy allocator.
    3094             :  * Call me with the zone->lock already held.
    3095             :  */
    3096             : static __always_inline struct page *
    3097             : __rmqueue(struct zone *zone, unsigned int order, int migratetype,
    3098             :                                                 unsigned int alloc_flags)
    3099             : {
    3100             :         struct page *page;
    3101             : 
    3102             :         if (IS_ENABLED(CONFIG_CMA)) {
    3103             :                 /*
    3104             :                  * Balance movable allocations between regular and CMA areas by
    3105             :                  * allocating from CMA when over half of the zone's free memory
    3106             :                  * is in the CMA area.
    3107             :                  */
    3108             :                 if (alloc_flags & ALLOC_CMA &&
    3109             :                     zone_page_state(zone, NR_FREE_CMA_PAGES) >
    3110             :                     zone_page_state(zone, NR_FREE_PAGES) / 2) {
    3111             :                         page = __rmqueue_cma_fallback(zone, order);
    3112             :                         if (page)
    3113             :                                 return page;
    3114             :                 }
    3115             :         }
    3116             : retry:
    3117        2325 :         page = __rmqueue_smallest(zone, order, migratetype);
    3118        2325 :         if (unlikely(!page)) {
    3119           4 :                 if (alloc_flags & ALLOC_CMA)
    3120           0 :                         page = __rmqueue_cma_fallback(zone, order);
    3121             : 
    3122           8 :                 if (!page && __rmqueue_fallback(zone, order, migratetype,
    3123             :                                                                 alloc_flags))
    3124             :                         goto retry;
    3125             :         }
    3126             :         return page;
    3127             : }
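The CMA balancing in __rmqueue() reduces to one comparison: prefer the CMA free
lists once they hold more than half of the zone's free pages. A minimal sketch
of that predicate (prefer_cma() is an invented name, not a kernel symbol):

/* Illustrative sketch (not kernel code): when to dip into CMA first. */
#include <assert.h>
#include <stdbool.h>

static bool prefer_cma(unsigned long free_cma_pages, unsigned long free_pages)
{
        /* Use CMA once it holds more than half of the zone's free memory. */
        return free_cma_pages > free_pages / 2;
}

int main(void)
{
        assert(!prefer_cma(100, 1000)); /* CMA is a small share: leave it alone */
        assert(prefer_cma(600, 1000));  /* CMA dominates: drain it first */
        return 0;
}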
    3128             : 
    3129             : /*
    3130             :  * Obtain a specified number of elements from the buddy allocator, all under
    3131             :  * a single hold of the lock, for efficiency.  Add them to the supplied list.
    3132             :  * Returns the number of new pages which were placed at *list.
    3133             :  */
    3134          53 : static int rmqueue_bulk(struct zone *zone, unsigned int order,
    3135             :                         unsigned long count, struct list_head *list,
    3136             :                         int migratetype, unsigned int alloc_flags)
    3137             : {
    3138             :         unsigned long flags;
    3139          53 :         int i, allocated = 0;
    3140             : 
    3141          53 :         spin_lock_irqsave(&zone->lock, flags);
    3142        2374 :         for (i = 0; i < count; ++i) {
    3143        2321 :                 struct page *page = __rmqueue(zone, order, migratetype,
    3144             :                                                                 alloc_flags);
    3145        2321 :                 if (unlikely(page == NULL))
    3146             :                         break;
    3147             : 
    3148        2321 :                 if (unlikely(check_pcp_refill(page, order)))
    3149           0 :                         continue;
    3150             : 
    3151             :                 /*
    3152             :                  * Split buddy pages returned by expand() are received here in
    3153             :                  * physical page order. Each page is added to the tail of the
    3154             :                  * caller's list, so from the caller's perspective the linked
    3155             :                  * list is, under some conditions, ordered by page number.
    3156             :                  * This is useful for IO devices that walk the list from the
    3157             :                  * head, and therefore in physical page order, and for IO
    3158             :                  * devices that can merge IO requests when the physical pages
    3159             :                  * are ordered properly.
    3160             :                  */
    3161        4642 :                 list_add_tail(&page->pcp_list, list);
    3162        2321 :                 allocated++;
    3163             :                 if (is_migrate_cma(get_pcppage_migratetype(page)))
    3164             :                         __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
    3165             :                                               -(1 << order));
    3166             :         }
    3167             : 
    3168             :         /*
    3169             :          * i pages were removed from the buddy list even if some leak due
    3170             :          * to check_pcp_refill failing so adjust NR_FREE_PAGES based
    3171             :          * on i. Do not confuse with 'allocated' which is the number of
    3172             :          * pages added to the pcp list.
    3173             :          */
    3174         106 :         __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
    3175         106 :         spin_unlock_irqrestore(&zone->lock, flags);
    3176          53 :         return allocated;
    3177             : }
    3178             : 
    3179             : #ifdef CONFIG_NUMA
    3180             : /*
    3181             :  * Called from the vmstat counter updater to drain pagesets of this
    3182             :  * currently executing processor on remote nodes after they have
    3183             :  * expired.
    3184             :  */
    3185             : void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
    3186             : {
    3187             :         int to_drain, batch;
    3188             : 
    3189             :         batch = READ_ONCE(pcp->batch);
    3190             :         to_drain = min(pcp->count, batch);
    3191             :         if (to_drain > 0) {
    3192             :                 spin_lock(&pcp->lock);
    3193             :                 free_pcppages_bulk(zone, to_drain, pcp, 0);
    3194             :                 spin_unlock(&pcp->lock);
    3195             :         }
    3196             : }
    3197             : #endif
    3198             : 
    3199             : /*
    3200             :  * Drain pcplists of the indicated processor and zone.
    3201             :  */
    3202           0 : static void drain_pages_zone(unsigned int cpu, struct zone *zone)
    3203             : {
    3204             :         struct per_cpu_pages *pcp;
    3205             : 
    3206           0 :         pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
    3207           0 :         if (pcp->count) {
    3208           0 :                 spin_lock(&pcp->lock);
    3209           0 :                 free_pcppages_bulk(zone, pcp->count, pcp, 0);
    3210           0 :                 spin_unlock(&pcp->lock);
    3211             :         }
    3212           0 : }
    3213             : 
    3214             : /*
    3215             :  * Drain pcplists of all zones on the indicated processor.
    3216             :  */
    3217           0 : static void drain_pages(unsigned int cpu)
    3218             : {
    3219             :         struct zone *zone;
    3220             : 
    3221           0 :         for_each_populated_zone(zone) {
    3222           0 :                 drain_pages_zone(cpu, zone);
    3223             :         }
    3224           0 : }
    3225             : 
    3226             : /*
    3227             :  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
    3228             :  */
    3229           0 : void drain_local_pages(struct zone *zone)
    3230             : {
    3231           0 :         int cpu = smp_processor_id();
    3232             : 
    3233           0 :         if (zone)
    3234           0 :                 drain_pages_zone(cpu, zone);
    3235             :         else
    3236           0 :                 drain_pages(cpu);
    3237           0 : }
    3238             : 
    3239             : /*
    3240             :  * The implementation of drain_all_pages(), exposing an extra parameter to
    3241             :  * drain on all cpus.
    3242             :  *
    3243             :  * drain_all_pages() is optimized to only execute on cpus where pcplists are
    3244             :  * not empty. The check for non-emptiness can, however, race with a free to a
    3245             :  * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
    3246             :  * that need the guarantee that every CPU has drained can disable the
    3247             :  * optimizing racy check.
    3248             :  */
    3249           0 : static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
    3250             : {
    3251             :         int cpu;
    3252             : 
    3253             :         /*
    3254             :          * Allocate in the BSS so we won't require allocation in
    3255             :          * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
    3256             :          */
    3257             :         static cpumask_t cpus_with_pcps;
    3258             : 
    3259             :         /*
    3260             :          * Do not drain if one is already in progress unless it's specific to
    3261             :          * a zone. Such callers are primarily CMA and memory hotplug and need
    3262             :          * the drain to be complete when the call returns.
    3263             :          */
    3264           0 :         if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
    3265           0 :                 if (!zone)
    3266             :                         return;
    3267           0 :                 mutex_lock(&pcpu_drain_mutex);
    3268             :         }
    3269             : 
    3270             :         /*
    3271             :          * We don't care about racing with CPU hotplug events,
    3272             :          * as the offline notification will cause the notified
    3273             :          * cpu to drain that CPU's pcps, and on_each_cpu_mask
    3274             :          * disables preemption as part of its processing.
    3275             :          */
    3276           0 :         for_each_online_cpu(cpu) {
    3277             :                 struct per_cpu_pages *pcp;
    3278             :                 struct zone *z;
    3279           0 :                 bool has_pcps = false;
    3280             : 
    3281           0 :                 if (force_all_cpus) {
    3282             :                         /*
    3283             :                          * The pcp.count check is racy; some callers need a
    3284             :                          * guarantee that no cpu is missed.
    3285             :                          */
    3286             :                         has_pcps = true;
    3287           0 :                 } else if (zone) {
    3288           0 :                         pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
    3289           0 :                         if (pcp->count)
    3290           0 :                                 has_pcps = true;
    3291             :                 } else {
    3292           0 :                         for_each_populated_zone(z) {
    3293           0 :                                 pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
    3294           0 :                                 if (pcp->count) {
    3295             :                                         has_pcps = true;
    3296             :                                         break;
    3297             :                                 }
    3298             :                         }
    3299             :                 }
    3300             : 
    3301           0 :                 if (has_pcps)
    3302           0 :                         cpumask_set_cpu(cpu, &cpus_with_pcps);
    3303             :                 else
    3304             :                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
    3305             :         }
    3306             : 
    3307           0 :         for_each_cpu(cpu, &cpus_with_pcps) {
    3308           0 :                 if (zone)
    3309           0 :                         drain_pages_zone(cpu, zone);
    3310             :                 else
    3311           0 :                         drain_pages(cpu);
    3312             :         }
    3313             : 
    3314           0 :         mutex_unlock(&pcpu_drain_mutex);
    3315             : }
    3316             : 
    3317             : /*
    3318             :  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
    3319             :  *
    3320             :  * When zone parameter is non-NULL, spill just the single zone's pages.
    3321             :  */
    3322           0 : void drain_all_pages(struct zone *zone)
    3323             : {
    3324           0 :         __drain_all_pages(zone, false);
    3325           0 : }
    3326             : 
    3327             : #ifdef CONFIG_HIBERNATION
    3328             : 
    3329             : /*
    3330             :  * Touch the watchdog for every WD_PAGE_COUNT pages.
    3331             :  */
    3332             : #define WD_PAGE_COUNT   (128*1024)
    3333             : 
    3334             : void mark_free_pages(struct zone *zone)
    3335             : {
    3336             :         unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
    3337             :         unsigned long flags;
    3338             :         unsigned int order, t;
    3339             :         struct page *page;
    3340             : 
    3341             :         if (zone_is_empty(zone))
    3342             :                 return;
    3343             : 
    3344             :         spin_lock_irqsave(&zone->lock, flags);
    3345             : 
    3346             :         max_zone_pfn = zone_end_pfn(zone);
    3347             :         for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
    3348             :                 if (pfn_valid(pfn)) {
    3349             :                         page = pfn_to_page(pfn);
    3350             : 
    3351             :                         if (!--page_count) {
    3352             :                                 touch_nmi_watchdog();
    3353             :                                 page_count = WD_PAGE_COUNT;
    3354             :                         }
    3355             : 
    3356             :                         if (page_zone(page) != zone)
    3357             :                                 continue;
    3358             : 
    3359             :                         if (!swsusp_page_is_forbidden(page))
    3360             :                                 swsusp_unset_page_free(page);
    3361             :                 }
    3362             : 
    3363             :         for_each_migratetype_order(order, t) {
    3364             :                 list_for_each_entry(page,
    3365             :                                 &zone->free_area[order].free_list[t], buddy_list) {
    3366             :                         unsigned long i;
    3367             : 
    3368             :                         pfn = page_to_pfn(page);
    3369             :                         for (i = 0; i < (1UL << order); i++) {
    3370             :                                 if (!--page_count) {
    3371             :                                         touch_nmi_watchdog();
    3372             :                                         page_count = WD_PAGE_COUNT;
    3373             :                                 }
    3374             :                                 swsusp_set_page_free(pfn_to_page(pfn + i));
    3375             :                         }
    3376             :                 }
    3377             :         }
    3378             :         spin_unlock_irqrestore(&zone->lock, flags);
    3379             : }
    3380             : #endif /* CONFIG_PM */
    3381             : 
    3382       44539 : static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
    3383             :                                                         unsigned int order)
    3384             : {
    3385             :         int migratetype;
    3386             : 
    3387       44539 :         if (!free_pcp_prepare(page, order))
    3388             :                 return false;
    3389             : 
    3390       44539 :         migratetype = get_pfnblock_migratetype(page, pfn);
    3391       89078 :         set_pcppage_migratetype(page, migratetype);
    3392       44539 :         return true;
    3393             : }
    3394             : 
    3395             : static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
    3396             :                        bool free_high)
    3397             : {
    3398             :         int min_nr_free, max_nr_free;
    3399             : 
    3400             :         /* Free everything if batch freeing high-order pages. */
    3401           4 :         if (unlikely(free_high))
    3402             :                 return pcp->count;
    3403             : 
    3404             :         /* Check for PCP disabled or boot pageset */
    3405           4 :         if (unlikely(high < batch))
    3406             :                 return 1;
    3407             : 
    3408             :         /* Leave at least pcp->batch pages on the list */
    3409           4 :         min_nr_free = batch;
    3410           4 :         max_nr_free = high - batch;
    3411             : 
    3412             :         /*
    3413             :          * Double the number of pages freed each time there is subsequent
    3414             :          * freeing of pages without any allocation.
    3415             :          */
    3416           4 :         batch <<= pcp->free_factor;
    3417           4 :         if (batch < max_nr_free)
    3418           4 :                 pcp->free_factor++;
    3419           4 :         batch = clamp(batch, min_nr_free, max_nr_free);
    3420             : 
    3421             :         return batch;
    3422             : }
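nr_pcp_free() is a small exponential back-off: each flush without an intervening
allocation doubles the effective batch via free_factor, clamped so that at least
one batch stays cached and at most high - batch pages are flushed at once. Below
is a stand-alone sketch of just that arithmetic; fake_pcp, nr_to_free() and
clampi() are invented, and the free_high and high < batch special cases are
omitted.

/* Illustrative sketch (not kernel code) of the pcp batch back-off. */
#include <stdio.h>

struct fake_pcp { int free_factor; };

static int clampi(int v, int lo, int hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

static int nr_to_free(struct fake_pcp *pcp, int high, int batch)
{
        int min_nr_free = batch;
        int max_nr_free = high - batch;

        batch <<= pcp->free_factor;     /* doubles per uninterrupted flush */
        if (batch < max_nr_free)
                pcp->free_factor++;
        return clampi(batch, min_nr_free, max_nr_free);
}

int main(void)
{
        struct fake_pcp pcp = { .free_factor = 0 };
        int i;

        /* Prints 63, 126, 252, 449: growing flushes, capped at high - batch. */
        for (i = 0; i < 4; i++)
                printf("%d\n", nr_to_free(&pcp, 512, 63));
        return 0;
}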
    3423             : 
    3424       44539 : static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
    3425             :                        bool free_high)
    3426             : {
    3427       44539 :         int high = READ_ONCE(pcp->high);
    3428             : 
    3429       44539 :         if (unlikely(!high || free_high))
    3430             :                 return 0;
    3431             : 
    3432       89078 :         if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
    3433             :                 return high;
    3434             : 
    3435             :         /*
    3436             :          * If reclaim is active, limit the number of pages that can be
    3437             :          * stored on pcp lists
    3438             :          */
    3439           0 :         return min(READ_ONCE(pcp->batch) << 2, high);
    3440             : }
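nr_pcp_high() keeps the configured high limit normally, but while the zone is
being reclaimed it caps the pcp list at four batches so cached pages return to
the buddy allocator sooner. A minimal sketch with invented names (the
!high/free_high early return is left out):

/* Illustrative sketch (not kernel code): pcp "high" shrinks under reclaim. */
static int pcp_high_limit(int high, int batch, int reclaim_active)
{
        int capped = batch << 2;        /* four batches' worth */

        if (!reclaim_active)
                return high;
        return capped < high ? capped : high;
}

int main(void)
{
        /* high = 512, batch = 63: the limit drops to 252 during reclaim. */
        return pcp_high_limit(512, 63, 1) == 252 ? 0 : 1;
}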
    3441             : 
    3442       44539 : static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
    3443             :                                    struct page *page, int migratetype,
    3444             :                                    unsigned int order)
    3445             : {
    3446             :         int high;
    3447             :         int pindex;
    3448             :         bool free_high;
    3449             : 
    3450       89078 :         __count_vm_events(PGFREE, 1 << order);
    3451       89078 :         pindex = order_to_pindex(migratetype, order);
    3452       89078 :         list_add(&page->pcp_list, &pcp->lists[pindex]);
    3453       44539 :         pcp->count += 1 << order;
    3454             : 
    3455             :         /*
    3456             :          * As high-order pages other than THPs stored on the PCP can contribute
    3457             :          * to fragmentation, limit the number stored when PCP is heavily
    3458             :          * freeing without allocation. The remainder after bulk freeing
    3459             :          * stops will be drained from vmstat refresh context.
    3460             :          */
    3461       44539 :         free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
    3462             : 
    3463       44539 :         high = nr_pcp_high(pcp, zone, free_high);
    3464       44539 :         if (pcp->count >= high) {
    3465           4 :                 int batch = READ_ONCE(pcp->batch);
    3466             : 
    3467           8 :                 free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
    3468             :         }
    3469       44539 : }
    3470             : 
    3471             : /*
    3472             :  * Free a pcp page
    3473             :  */
    3474       44539 : void free_unref_page(struct page *page, unsigned int order)
    3475             : {
    3476             :         unsigned long __maybe_unused UP_flags;
    3477             :         struct per_cpu_pages *pcp;
    3478             :         struct zone *zone;
    3479       44539 :         unsigned long pfn = page_to_pfn(page);
    3480             :         int migratetype;
    3481             : 
    3482       44539 :         if (!free_unref_page_prepare(page, pfn, order))
    3483             :                 return;
    3484             : 
    3485             :         /*
    3486             :          * We only track unmovable, reclaimable and movable on pcp lists.
    3487             :          * Place ISOLATE pages on the isolated list because they are being
    3488             :          * offlined, but treat HIGHATOMIC as movable pages so we can get those
    3489             :          * areas back if necessary. Otherwise, we may have to free
    3490             :          * excessively into the page allocator.
    3491             :          */
    3492       89078 :         migratetype = get_pcppage_migratetype(page);
    3493       44539 :         if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
    3494             :                 if (unlikely(is_migrate_isolate(migratetype))) {
    3495             :                         free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
    3496             :                         return;
    3497             :                 }
    3498           0 :                 migratetype = MIGRATE_MOVABLE;
    3499             :         }
    3500             : 
    3501       44539 :         zone = page_zone(page);
    3502       44539 :         pcp_trylock_prepare(UP_flags);
    3503       89078 :         pcp = pcp_spin_trylock(zone->per_cpu_pageset);
    3504       44539 :         if (pcp) {
    3505       44539 :                 free_unref_page_commit(zone, pcp, page, migratetype, order);
    3506       89078 :                 pcp_spin_unlock(pcp);
    3507             :         } else {
    3508           0 :                 free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
    3509             :         }
    3510       44539 :         pcp_trylock_finish(UP_flags);
    3511             : }
    3512             : 
    3513             : /*
    3514             :  * Free a list of 0-order pages
    3515             :  */
    3516           0 : void free_unref_page_list(struct list_head *list)
    3517             : {
    3518             :         unsigned long __maybe_unused UP_flags;
    3519             :         struct page *page, *next;
    3520           0 :         struct per_cpu_pages *pcp = NULL;
    3521           0 :         struct zone *locked_zone = NULL;
    3522           0 :         int batch_count = 0;
    3523             :         int migratetype;
    3524             : 
    3525             :         /* Prepare pages for freeing */
    3526           0 :         list_for_each_entry_safe(page, next, list, lru) {
    3527           0 :                 unsigned long pfn = page_to_pfn(page);
    3528           0 :                 if (!free_unref_page_prepare(page, pfn, 0)) {
    3529           0 :                         list_del(&page->lru);
    3530           0 :                         continue;
    3531             :                 }
    3532             : 
    3533             :                 /*
    3534             :                  * Free isolated pages directly to the allocator, see
    3535             :                  * comment in free_unref_page.
    3536             :                  */
    3537             :                 migratetype = get_pcppage_migratetype(page);
    3538             :                 if (unlikely(is_migrate_isolate(migratetype))) {
    3539             :                         list_del(&page->lru);
    3540             :                         free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
    3541             :                         continue;
    3542             :                 }
    3543             :         }
    3544             : 
    3545           0 :         list_for_each_entry_safe(page, next, list, lru) {
    3546           0 :                 struct zone *zone = page_zone(page);
    3547             : 
    3548           0 :                 list_del(&page->lru);
    3549           0 :                 migratetype = get_pcppage_migratetype(page);
    3550             : 
    3551             :                 /*
    3552             :                  * Either different zone requiring a different pcp lock or
    3553             :                  * excessive lock hold times when freeing a large list of
    3554             :                  * pages.
    3555             :                  */
    3556           0 :                 if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
    3557           0 :                         if (pcp) {
    3558           0 :                                 pcp_spin_unlock(pcp);
    3559           0 :                                 pcp_trylock_finish(UP_flags);
    3560             :                         }
    3561             : 
    3562           0 :                         batch_count = 0;
    3563             : 
    3564             :                         /*
    3565             :                          * trylock is necessary as pages may be getting freed
    3566             :                          * from IRQ or SoftIRQ context after an IO completion.
    3567             :                          */
    3568           0 :                         pcp_trylock_prepare(UP_flags);
    3569           0 :                         pcp = pcp_spin_trylock(zone->per_cpu_pageset);
    3570           0 :                         if (unlikely(!pcp)) {
    3571           0 :                                 pcp_trylock_finish(UP_flags);
    3572           0 :                                 free_one_page(zone, page, page_to_pfn(page),
    3573             :                                               0, migratetype, FPI_NONE);
    3574           0 :                                 locked_zone = NULL;
    3575           0 :                                 continue;
    3576             :                         }
    3577             :                         locked_zone = zone;
    3578             :                 }
    3579             : 
    3580             :                 /*
    3581             :                  * Non-isolated types over MIGRATE_PCPTYPES get added
    3582             :                  * to the MIGRATE_MOVABLE pcp list.
    3583             :                  */
    3584           0 :                 if (unlikely(migratetype >= MIGRATE_PCPTYPES))
    3585           0 :                         migratetype = MIGRATE_MOVABLE;
    3586             : 
    3587           0 :                 trace_mm_page_free_batched(page);
    3588           0 :                 free_unref_page_commit(zone, pcp, page, migratetype, 0);
    3589           0 :                 batch_count++;
    3590             :         }
    3591             : 
    3592           0 :         if (pcp) {
    3593           0 :                 pcp_spin_unlock(pcp);
    3594           0 :                 pcp_trylock_finish(UP_flags);
    3595             :         }
    3596           0 : }
    3597             : 
    3598             : /*
    3599             :  * split_page takes a non-compound higher-order page, and splits it into
    3600             :  * n (1 << order) sub-pages: page[0..n-1].
    3601             :  * Each sub-page must be freed individually.
    3602             :  *
    3603             :  * Note: this is probably too low level an operation for use in drivers.
    3604             :  * Please consult with lkml before using this in your driver.
    3605             :  */
    3606           0 : void split_page(struct page *page, unsigned int order)
    3607             : {
    3608             :         int i;
    3609             : 
    3610             :         VM_BUG_ON_PAGE(PageCompound(page), page);
    3611             :         VM_BUG_ON_PAGE(!page_count(page), page);
    3612             : 
    3613           0 :         for (i = 1; i < (1 << order); i++)
    3614           0 :                 set_page_refcounted(page + i);
    3615           0 :         split_page_owner(page, 1 << order);
    3616           0 :         split_page_memcg(page, 1 << order);
    3617           0 : }
    3618             : EXPORT_SYMBOL_GPL(split_page);
    3619             : 
    3620           0 : int __isolate_free_page(struct page *page, unsigned int order)
    3621             : {
    3622           0 :         struct zone *zone = page_zone(page);
    3623           0 :         int mt = get_pageblock_migratetype(page);
    3624             : 
    3625           0 :         if (!is_migrate_isolate(mt)) {
    3626             :                 unsigned long watermark;
    3627             :                 /*
    3628             :                  * Obey watermarks as if the page was being allocated. We can
    3629             :                  * emulate a high-order watermark check with a raised order-0
    3630             :                  * watermark, because we already know our high-order page
    3631             :                  * exists.
    3632             :                  */
    3633           0 :                 watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
    3634           0 :                 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
    3635             :                         return 0;
    3636             : 
    3637           0 :                 __mod_zone_freepage_state(zone, -(1UL << order), mt);
    3638             :         }
    3639             : 
    3640           0 :         del_page_from_free_list(page, zone, order);
    3641             : 
    3642             :         /*
    3643             :          * Set the pageblock if the isolated page is at least half of a
    3644             :          * pageblock
    3645             :          */
    3646           0 :         if (order >= pageblock_order - 1) {
    3647           0 :                 struct page *endpage = page + (1 << order) - 1;
    3648           0 :                 for (; page < endpage; page += pageblock_nr_pages) {
    3649           0 :                         int mt = get_pageblock_migratetype(page);
    3650             :                         /*
    3651             :                          * Only change normal pageblocks (i.e., they can merge
    3652             :                          * with others)
    3653             :                          */
    3654           0 :                         if (migratetype_is_mergeable(mt))
    3655           0 :                                 set_pageblock_migratetype(page,
    3656             :                                                           MIGRATE_MOVABLE);
    3657             :                 }
    3658             :         }
    3659             : 
    3660           0 :         return 1UL << order;
    3661             : }
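The watermark handling in __isolate_free_page() emulates a high-order check with
an order-0 one: the block is already known to exist, so it is enough to raise
the minimum watermark by 1UL << order and ask whether the zone stays above it
after the removal. A numeric sketch of that idea, assuming an invented
can_isolate() helper in place of zone_watermark_ok():

/* Illustrative sketch (not kernel code): high-order check via order-0 mark. */
#include <assert.h>
#include <stdbool.h>

static bool can_isolate(unsigned long free_pages, unsigned long wmark_min,
                        unsigned int order)
{
        /*
         * The high-order page is known to exist; just make sure removing
         * it keeps the zone above the (raised) order-0 minimum watermark.
         */
        unsigned long watermark = wmark_min + (1UL << order);

        return free_pages >= watermark;
}

int main(void)
{
        assert(can_isolate(4096, 1024, 4));     /* plenty of headroom */
        assert(!can_isolate(1030, 1024, 4));    /* would dip below the floor */
        return 0;
}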
    3662             : 
    3663             : /**
    3664             :  * __putback_isolated_page - Return a now-isolated page back where we got it
    3665             :  * @page: Page that was isolated
    3666             :  * @order: Order of the isolated page
    3667             :  * @mt: The page's pageblock's migratetype
    3668             :  *
    3669             :  * This function is meant to return a page pulled from the free lists via
    3670             :  * __isolate_free_page back to the free list it was pulled from.
    3671             :  */
    3672           0 : void __putback_isolated_page(struct page *page, unsigned int order, int mt)
    3673             : {
    3674           0 :         struct zone *zone = page_zone(page);
    3675             : 
    3676             :         /* zone lock should be held when this function is called */
    3677             :         lockdep_assert_held(&zone->lock);
    3678             : 
    3679             :         /* Return isolated page to tail of freelist. */
    3680           0 :         __free_one_page(page, page_to_pfn(page), zone, order, mt,
    3681             :                         FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
    3682           0 : }
    3683             : 
    3684             : /*
    3685             :  * Update NUMA hit/miss statistics
    3686             :  */
    3687             : static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
    3688             :                                    long nr_account)
    3689             : {
    3690             : #ifdef CONFIG_NUMA
    3691             :         enum numa_stat_item local_stat = NUMA_LOCAL;
    3692             : 
    3693             :         /* skip numa counters update if numa stats is disabled */
    3694             :         if (!static_branch_likely(&vm_numa_stat_key))
    3695             :                 return;
    3696             : 
    3697             :         if (zone_to_nid(z) != numa_node_id())
    3698             :                 local_stat = NUMA_OTHER;
    3699             : 
    3700             :         if (zone_to_nid(z) == zone_to_nid(preferred_zone))
    3701             :                 __count_numa_events(z, NUMA_HIT, nr_account);
    3702             :         else {
    3703             :                 __count_numa_events(z, NUMA_MISS, nr_account);
    3704             :                 __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
    3705             :         }
    3706             :         __count_numa_events(z, local_stat, nr_account);
    3707             : #endif
    3708             : }
    3709             : 
    3710             : static __always_inline
    3711             : struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
    3712             :                            unsigned int order, unsigned int alloc_flags,
    3713             :                            int migratetype)
    3714             : {
    3715             :         struct page *page;
    3716             :         unsigned long flags;
    3717             : 
    3718             :         do {
    3719           0 :                 page = NULL;
    3720           0 :                 spin_lock_irqsave(&zone->lock, flags);
    3721             :                 /*
    3722             :                  * An order-0 request can reach here when the pcplist is
    3723             :                  * skipped due to a non-CMA allocation context. The HIGHATOMIC
    3724             :                  * area is reserved for high-order atomic allocations, so an
    3725             :                  * order-0 request should skip it.
    3726             :                  */
    3727           0 :                 if (alloc_flags & ALLOC_HIGHATOMIC)
    3728             :                         page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
    3729           0 :                 if (!page) {
    3730           0 :                         page = __rmqueue(zone, order, migratetype, alloc_flags);
    3731             : 
    3732             :                         /*
    3733             :                          * If the allocation fails, allow OOM handling access
    3734             :                          * to HIGHATOMIC reserves as failing now is worse than
    3735             :                          * failing a high-order atomic allocation in the
    3736             :                          * future.
    3737             :                          */
    3738           0 :                         if (!page && (alloc_flags & ALLOC_OOM))
    3739             :                                 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
    3740             : 
    3741           0 :                         if (!page) {
    3742           0 :                                 spin_unlock_irqrestore(&zone->lock, flags);
    3743             :                                 return NULL;
    3744             :                         }
    3745             :                 }
    3746           0 :                 __mod_zone_freepage_state(zone, -(1 << order),
    3747             :                                           get_pcppage_migratetype(page));
    3748           0 :                 spin_unlock_irqrestore(&zone->lock, flags);
    3749           0 :         } while (check_new_pages(page, order));
    3750             : 
    3751           0 :         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
    3752             :         zone_statistics(preferred_zone, zone, 1);
    3753             : 
    3754             :         return page;
    3755             : }
    3756             : 
    3757             : /* Remove page from the per-cpu list, caller must protect the list */
    3758             : static inline
    3759       45065 : struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
    3760             :                         int migratetype,
    3761             :                         unsigned int alloc_flags,
    3762             :                         struct per_cpu_pages *pcp,
    3763             :                         struct list_head *list)
    3764             : {
    3765             :         struct page *page;
    3766             : 
    3767             :         do {
    3768       45065 :                 if (list_empty(list)) {
    3769          53 :                         int batch = READ_ONCE(pcp->batch);
    3770             :                         int alloced;
    3771             : 
    3772             :                         /*
    3773             :                          * Scale batch relative to order if batch implies
    3774             :                          * free pages can be stored on the PCP. Batch can
    3775             :                          * be 1 for small zones or for boot pagesets which
    3776             :                          * should never store free pages as the pages may
    3777             :                          * belong to arbitrary zones.
    3778             :                          */
    3779          53 :                         if (batch > 1)
    3780          42 :                                 batch = max(batch >> order, 2);
    3781          53 :                         alloced = rmqueue_bulk(zone, order,
    3782             :                                         batch, list,
    3783             :                                         migratetype, alloc_flags);
    3784             : 
    3785          53 :                         pcp->count += alloced << order;
    3786          53 :                         if (unlikely(list_empty(list)))
    3787             :                                 return NULL;
    3788             :                 }
    3789             : 
    3790       45065 :                 page = list_first_entry(list, struct page, pcp_list);
    3791       90130 :                 list_del(&page->pcp_list);
    3792       45065 :                 pcp->count -= 1 << order;
    3793       45065 :         } while (check_new_pcp(page, order));
    3794             : 
    3795       45065 :         return page;
    3796             : }
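When the pcp list is empty, __rmqueue_pcplist() scales the refill batch down by
the allocation order, with a floor of two, so a high-order refill does not pull
in far more memory than an order-0 one, while boot pagesets with batch == 1 are
left alone. A sketch of that scaling rule with an invented scaled_batch()
helper:

/* Illustrative sketch (not kernel code): order-relative pcp refill batch. */
#include <stdio.h>

static int scaled_batch(int batch, unsigned int order)
{
        if (batch <= 1)
                return batch;           /* boot pagesets / tiny zones */
        batch >>= order;                /* halve per order step */
        return batch < 2 ? 2 : batch;
}

int main(void)
{
        /* batch 63 at orders 0, 3 and 9 -> 63, 7 and 2. */
        printf("%d %d %d\n", scaled_batch(63, 0), scaled_batch(63, 3),
               scaled_batch(63, 9));
        return 0;
}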
    3797             : 
    3798             : /* Lock and remove page from the per-cpu list */
    3799        2524 : static struct page *rmqueue_pcplist(struct zone *preferred_zone,
    3800             :                         struct zone *zone, unsigned int order,
    3801             :                         int migratetype, unsigned int alloc_flags)
    3802             : {
    3803             :         struct per_cpu_pages *pcp;
    3804             :         struct list_head *list;
    3805             :         struct page *page;
    3806             :         unsigned long __maybe_unused UP_flags;
    3807             : 
    3808             :         /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
    3809        2524 :         pcp_trylock_prepare(UP_flags);
    3810        5048 :         pcp = pcp_spin_trylock(zone->per_cpu_pageset);
    3811        2524 :         if (!pcp) {
    3812           0 :                 pcp_trylock_finish(UP_flags);
    3813             :                 return NULL;
    3814             :         }
    3815             : 
    3816             :         /*
    3817             :          * On allocation, reduce the number of pages that are batch freed.
    3818             :          * See nr_pcp_free() where free_factor is increased for subsequent
    3819             :          * frees.
    3820             :          */
    3821        2524 :         pcp->free_factor >>= 1;
    3822        5048 :         list = &pcp->lists[order_to_pindex(migratetype, order)];
    3823        2524 :         page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
    3824        5048 :         pcp_spin_unlock(pcp);
    3825        5048 :         pcp_trylock_finish(UP_flags);
    3826        2524 :         if (page) {
    3827        5048 :                 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
    3828             :                 zone_statistics(preferred_zone, zone, 1);
    3829             :         }
    3830             :         return page;
    3831             : }
    3832             : 
    3833             : /*
    3834             :  * Allocate a page from the given zone.
    3835             :  * Use pcplists for THP or "cheap" high-order allocations.
    3836             :  */
    3837             : 
    3838             : /*
    3839             :  * Do not instrument rmqueue() with KMSAN. This function may call
    3840             :  * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
    3841             :  * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
    3842             :  * may call rmqueue() again, which will result in a deadlock.
    3843             :  */
    3844             : __no_sanitize_memory
    3845             : static inline
    3846        2524 : struct page *rmqueue(struct zone *preferred_zone,
    3847             :                         struct zone *zone, unsigned int order,
    3848             :                         gfp_t gfp_flags, unsigned int alloc_flags,
    3849             :                         int migratetype)
    3850             : {
    3851             :         struct page *page;
    3852             : 
    3853             :         /*
    3854             :          * We most definitely don't want callers attempting to
    3855             :          * allocate greater than order-1 page units with __GFP_NOFAIL.
    3856             :          */
    3857        2524 :         WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
    3858             : 
    3859        2524 :         if (likely(pcp_allowed_order(order))) {
    3860             :                 /*
    3861             :                  * The MIGRATE_MOVABLE pcplist could have pages from the CMA
    3862             :                  * area, and we need to skip it when CMA allocations aren't allowed.
    3863             :                  */
    3864             :                 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
    3865             :                                 migratetype != MIGRATE_MOVABLE) {
    3866        2524 :                         page = rmqueue_pcplist(preferred_zone, zone, order,
    3867             :                                         migratetype, alloc_flags);
    3868        2524 :                         if (likely(page))
    3869             :                                 goto out;
    3870             :                 }
    3871             :         }
    3872             : 
    3873             :         page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
    3874             :                                                         migratetype);
    3875             : 
    3876             : out:
    3877             :         /* Separate test+clear to avoid unnecessary atomics */
    3878        5048 :         if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
    3879           0 :                 clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
    3880           0 :                 wakeup_kswapd(zone, 0, 0, zone_idx(zone));
    3881             :         }
    3882             : 
    3883             :         VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
    3884        2524 :         return page;
    3885             : }
    3886             : 
    3887             : #ifdef CONFIG_FAIL_PAGE_ALLOC
    3888             : 
    3889             : static struct {
    3890             :         struct fault_attr attr;
    3891             : 
    3892             :         bool ignore_gfp_highmem;
    3893             :         bool ignore_gfp_reclaim;
    3894             :         u32 min_order;
    3895             : } fail_page_alloc = {
    3896             :         .attr = FAULT_ATTR_INITIALIZER,
    3897             :         .ignore_gfp_reclaim = true,
    3898             :         .ignore_gfp_highmem = true,
    3899             :         .min_order = 1,
    3900             : };
    3901             : 
    3902             : static int __init setup_fail_page_alloc(char *str)
    3903             : {
    3904             :         return setup_fault_attr(&fail_page_alloc.attr, str);
    3905             : }
    3906             : __setup("fail_page_alloc=", setup_fail_page_alloc);
    3907             : 
    3908             : static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
    3909             : {
    3910             :         int flags = 0;
    3911             : 
    3912             :         if (order < fail_page_alloc.min_order)
    3913             :                 return false;
    3914             :         if (gfp_mask & __GFP_NOFAIL)
    3915             :                 return false;
    3916             :         if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
    3917             :                 return false;
    3918             :         if (fail_page_alloc.ignore_gfp_reclaim &&
    3919             :                         (gfp_mask & __GFP_DIRECT_RECLAIM))
    3920             :                 return false;
    3921             : 
    3922             :         /* See comment in __should_failslab() */
    3923             :         if (gfp_mask & __GFP_NOWARN)
    3924             :                 flags |= FAULT_NOWARN;
    3925             : 
    3926             :         return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
    3927             : }
    3928             : 
    3929             : #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
    3930             : 
    3931             : static int __init fail_page_alloc_debugfs(void)
    3932             : {
    3933             :         umode_t mode = S_IFREG | 0600;
    3934             :         struct dentry *dir;
    3935             : 
    3936             :         dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
    3937             :                                         &fail_page_alloc.attr);
    3938             : 
    3939             :         debugfs_create_bool("ignore-gfp-wait", mode, dir,
    3940             :                             &fail_page_alloc.ignore_gfp_reclaim);
    3941             :         debugfs_create_bool("ignore-gfp-highmem", mode, dir,
    3942             :                             &fail_page_alloc.ignore_gfp_highmem);
    3943             :         debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
    3944             : 
    3945             :         return 0;
    3946             : }
    3947             : 
    3948             : late_initcall(fail_page_alloc_debugfs);
    3949             : 
    3950             : #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
    3951             : 
    3952             : #else /* CONFIG_FAIL_PAGE_ALLOC */
    3953             : 
    3954             : static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
    3955             : {
    3956             :         return false;
    3957             : }
    3958             : 
    3959             : #endif /* CONFIG_FAIL_PAGE_ALLOC */
    3960             : 
    3961        3120 : noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
    3962             : {
    3963        3120 :         return __should_fail_alloc_page(gfp_mask, order);
    3964             : }
    3965             : ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
    3966             : 
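For reference, the fail_page_alloc machinery above is normally driven from outside the kernel rather than from C callers. A sketch, assuming the generic fault-injection attribute format "<interval>,<probability>,<space>,<times>" and the usual debugfs mount point (both are conventions of the fault-injection framework rather than anything guaranteed by this file). Note that with the default min_order of 1, order-0 allocations are never failed until min-order is lowered:

# boot time: fail roughly 10% of eligible page allocations, with no count limit
fail_page_alloc=1,10,0,-1

# run time (CONFIG_FAULT_INJECTION_DEBUG_FS): the same knobs via debugfs
echo 10 > /sys/kernel/debug/fail_page_alloc/probability
echo -1 > /sys/kernel/debug/fail_page_alloc/times
echo 0  > /sys/kernel/debug/fail_page_alloc/min-order
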
    3967             : static inline long __zone_watermark_unusable_free(struct zone *z,
    3968             :                                 unsigned int order, unsigned int alloc_flags)
    3969             : {
    3970        3122 :         long unusable_free = (1 << order) - 1;
    3971             : 
    3972             :         /*
    3973             :          * If the caller does not have rights to reserves below the min
    3974             :          * watermark then subtract the high-atomic reserves. This will
    3975             :          * over-estimate the size of the atomic reserve but it avoids a search.
    3976             :          */
    3977        3122 :         if (likely(!(alloc_flags & ALLOC_RESERVES)))
    3978        3122 :                 unusable_free += z->nr_reserved_highatomic;
    3979             : 
    3980             : #ifdef CONFIG_CMA
    3981             :         /* If allocation can't use CMA areas don't use free CMA pages */
    3982             :         if (!(alloc_flags & ALLOC_CMA))
    3983             :                 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
    3984             : #endif
    3985             : 
    3986             :         return unusable_free;
    3987             : }
    3988             : 
    3989             : /*
    3990             :  * Return true if free base pages are above 'mark'. For high-order checks it
    3991             :  * will return true if the order-0 watermark is reached and there is at least
    3992             :  * one free page of a suitable size. Checking now avoids taking the zone lock
    3993             :  * to check in the allocation paths if no pages are free.
    3994             :  */
    3995         107 : bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
    3996             :                          int highest_zoneidx, unsigned int alloc_flags,
    3997             :                          long free_pages)
    3998             : {
    3999         107 :         long min = mark;
    4000             :         int o;
    4001             : 
    4002             :         /* free_pages may go negative - that's OK */
    4003         214 :         free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
    4004             : 
    4005         107 :         if (unlikely(alloc_flags & ALLOC_RESERVES)) {
    4006             :                 /*
    4007             :                  * __GFP_HIGH allows access to 50% of the min reserve as well
    4008             :                  * as OOM.
    4009             :                  */
    4010           0 :                 if (alloc_flags & ALLOC_MIN_RESERVE) {
    4011           0 :                         min -= min / 2;
    4012             : 
    4013             :                         /*
    4014             :                          * Non-blocking allocations (e.g. GFP_ATOMIC) can
    4015             :                          * access more reserves than just __GFP_HIGH. Other
    4016             :                          * non-blocking allocation requests such as GFP_NOWAIT
    4017             :                          * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
    4018             :                          * access to the min reserve.
    4019             :                          */
    4020           0 :                         if (alloc_flags & ALLOC_NON_BLOCK)
    4021           0 :                                 min -= min / 4;
    4022             :                 }
    4023             : 
    4024             :                 /*
    4025             :                  * OOM victims can try even harder than the normal reserve
    4026             :                  * users on the grounds that they are definitely going to be in
    4027             :                  * the exit path shortly and will free memory. Any allocation they
    4028             :                  * make during the free path will be small and short-lived.
    4029             :                  */
    4030           0 :                 if (alloc_flags & ALLOC_OOM)
    4031           0 :                         min -= min / 2;
    4032             :         }
    4033             : 
    4034             :         /*
    4035             :          * Check watermarks for an order-0 allocation request. If these
    4036             :          * are not met, then a high-order request also cannot go ahead
    4037             :          * even if a suitable page happened to be free.
    4038             :          */
    4039         107 :         if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
    4040             :                 return false;
    4041             : 
    4042             :         /* If this is an order-0 request then the watermark is fine */
    4043         107 :         if (!order)
    4044             :                 return true;
    4045             : 
    4046             :         /* For a high-order request, check at least one suitable page is free */
    4047         110 :         for (o = order; o < MAX_ORDER; o++) {
    4048         110 :                 struct free_area *area = &z->free_area[o];
    4049             :                 int mt;
    4050             : 
    4051         110 :                 if (!area->nr_free)
    4052           5 :                         continue;
    4053             : 
    4054          63 :                 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
    4055         168 :                         if (!free_area_empty(area, mt))
    4056             :                                 return true;
    4057             :                 }
    4058             : 
    4059             : #ifdef CONFIG_CMA
    4060             :                 if ((alloc_flags & ALLOC_CMA) &&
    4061             :                     !free_area_empty(area, MIGRATE_CMA)) {
    4062             :                         return true;
    4063             :                 }
    4064             : #endif
    4065           0 :                 if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
    4066           0 :                     !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
    4067             :                         return true;
    4068             :                 }
    4069             :         }
    4070             :         return false;
    4071             : }
    4072             : 
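The reserve scaling above is easiest to see with concrete numbers. Below is a small stand-alone userspace model (not kernel code; the flag values and the example watermark are invented for illustration) of how min shrinks for the different reserve classes checked in __zone_watermark_ok():

/* Stand-alone model of the watermark scaling in __zone_watermark_ok().
 * The flag bits and the example watermark are illustrative only. */
#include <stdio.h>

#define ALLOC_MIN_RESERVE 0x1   /* stands in for __GFP_HIGH callers */
#define ALLOC_NON_BLOCK   0x2   /* stands in for non-blocking callers */
#define ALLOC_OOM         0x4   /* stands in for OOM victims */

static long scaled_min(long min, unsigned int alloc_flags)
{
        if (alloc_flags & ALLOC_MIN_RESERVE) {
                min -= min / 2;         /* may use 50% of the min reserve */
                if (alloc_flags & ALLOC_NON_BLOCK)
                        min -= min / 4; /* and a further slice of what is left */
        }
        if (alloc_flags & ALLOC_OOM)
                min -= min / 2;         /* OOM victims dig deeper still */
        return min;
}

int main(void)
{
        long min = 1024;        /* pretend min watermark, in pages */

        printf("plain           : %ld\n", scaled_min(min, 0));
        printf("__GFP_HIGH      : %ld\n", scaled_min(min, ALLOC_MIN_RESERVE));
        printf("GFP_ATOMIC-like : %ld\n",
               scaled_min(min, ALLOC_MIN_RESERVE | ALLOC_NON_BLOCK));
        printf("OOM victim      : %ld\n",
               scaled_min(min, ALLOC_MIN_RESERVE | ALLOC_NON_BLOCK | ALLOC_OOM));
        return 0;
}

With the pretend watermark of 1024 pages this prints 1024, 512, 384 and 192: each reserve class is allowed to let free memory fall further below the nominal mark before the zone is rejected.
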
    4073           0 : bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
    4074             :                       int highest_zoneidx, unsigned int alloc_flags)
    4075             : {
    4076           0 :         return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
    4077           0 :                                         zone_page_state(z, NR_FREE_PAGES));
    4078             : }
    4079             : 
    4080        3120 : static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
    4081             :                                 unsigned long mark, int highest_zoneidx,
    4082             :                                 unsigned int alloc_flags, gfp_t gfp_mask)
    4083             : {
    4084             :         long free_pages;
    4085             : 
    4086        3120 :         free_pages = zone_page_state(z, NR_FREE_PAGES);
    4087             : 
    4088             :         /*
    4089             :          * Fast check for order-0 only. If this fails then the reserves
    4090             :          * need to be calculated.
    4091             :          */
    4092        3120 :         if (!order) {
    4093             :                 long usable_free;
    4094             :                 long reserved;
    4095             : 
    4096        3015 :                 usable_free = free_pages;
    4097        6030 :                 reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
    4098             : 
    4099             :                 /* reserved may over-estimate high-atomic reserves. */
    4100        3015 :                 usable_free -= min(usable_free, reserved);
    4101        3015 :                 if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
    4102             :                         return true;
    4103             :         }
    4104             : 
    4105         105 :         if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
    4106             :                                         free_pages))
    4107             :                 return true;
    4108             : 
    4109             :         /*
    4110             :          * Ignore watermark boosting for __GFP_HIGH order-0 allocations
    4111             :          * when checking the min watermark. The min watermark is the
    4112             :          * point where boosting is ignored so that kswapd is woken up
    4113             :          * when below the low watermark.
    4114             :          */
    4115           0 :         if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
    4116             :                 && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
    4117           0 :                 mark = z->_watermark[WMARK_MIN];
    4118           0 :                 return __zone_watermark_ok(z, order, mark, highest_zoneidx,
    4119             :                                         alloc_flags, free_pages);
    4120             :         }
    4121             : 
    4122             :         return false;
    4123             : }
    4124             : 
    4125           2 : bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
    4126             :                         unsigned long mark, int highest_zoneidx)
    4127             : {
    4128           2 :         long free_pages = zone_page_state(z, NR_FREE_PAGES);
    4129             : 
    4130           2 :         if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
    4131           0 :                 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
    4132             : 
    4133           2 :         return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
    4134             :                                                                 free_pages);
    4135             : }
    4136             : 
    4137             : #ifdef CONFIG_NUMA
    4138             : int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
    4139             : 
    4140             : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
    4141             : {
    4142             :         return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
    4143             :                                 node_reclaim_distance;
    4144             : }
    4145             : #else   /* CONFIG_NUMA */
    4146             : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
    4147             : {
    4148             :         return true;
    4149             : }
    4150             : #endif  /* CONFIG_NUMA */
    4151             : 
    4152             : /*
    4153             :  * The restriction on treating ZONE_DMA32 as a suitable zone to use to avoid
    4154             :  * fragmentation is subtle. If the preferred zone was HIGHMEM then
    4155             :  * premature use of a lower zone may cause lowmem pressure problems that
    4156             :  * are worse than fragmentation. If the next zone is ZONE_DMA then it is
    4157             :  * probably too small. It only makes sense to spread allocations to avoid
    4158             :  * fragmentation between the Normal and DMA32 zones.
    4159             :  */
    4160             : static inline unsigned int
    4161             : alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
    4162             : {
    4163             :         unsigned int alloc_flags;
    4164             : 
    4165             :         /*
    4166             :          * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
    4167             :          * to save a branch.
    4168             :          */
    4169        2524 :         alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
    4170             : 
    4171             : #ifdef CONFIG_ZONE_DMA32
    4172             :         if (!zone)
    4173             :                 return alloc_flags;
    4174             : 
    4175             :         if (zone_idx(zone) != ZONE_NORMAL)
    4176             :                 return alloc_flags;
    4177             : 
    4178             :         /*
    4179             :          * If ZONE_DMA32 exists, assume it immediately precedes ZONE_NORMAL and
    4180             :          * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
    4181             :          * on UMA that if Normal is populated then so is DMA32.
    4182             :          */
    4183             :         BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
    4184             :         if (nr_online_nodes > 1 && !populated_zone(--zone))
    4185             :                 return alloc_flags;
    4186             : 
    4187             :         alloc_flags |= ALLOC_NOFRAGMENT;
    4188             : #endif /* CONFIG_ZONE_DMA32 */
    4189             :         return alloc_flags;
    4190             : }
    4191             : 
    4192             : /* Must be called after current_gfp_context() which can change gfp_mask */
    4193             : static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
    4194             :                                                   unsigned int alloc_flags)
    4195             : {
    4196             : #ifdef CONFIG_CMA
    4197             :         if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
    4198             :                 alloc_flags |= ALLOC_CMA;
    4199             : #endif
    4200             :         return alloc_flags;
    4201             : }
    4202             : 
    4203             : /*
    4204             :  * get_page_from_freelist goes through the zonelist trying to allocate
    4205             :  * a page.
    4206             :  */
    4207             : static struct page *
    4208        2524 : get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
    4209             :                                                 const struct alloc_context *ac)
    4210             : {
    4211             :         struct zoneref *z;
    4212             :         struct zone *zone;
    4213        2524 :         struct pglist_data *last_pgdat = NULL;
    4214        2524 :         bool last_pgdat_dirty_ok = false;
    4215             :         bool no_fallback;
    4216             : 
    4217             : retry:
    4218             :         /*
    4219             :          * Scan zonelist, looking for a zone with enough free.
    4220             :          * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
    4221             :          */
    4222        2524 :         no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
    4223        2524 :         z = ac->preferred_zoneref;
    4224        2524 :         for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
    4225             :                                         ac->nodemask) {
    4226             :                 struct page *page;
    4227             :                 unsigned long mark;
    4228             : 
    4229             :                 if (cpusets_enabled() &&
    4230             :                         (alloc_flags & ALLOC_CPUSET) &&
    4231             :                         !__cpuset_zone_allowed(zone, gfp_mask))
    4232             :                                 continue;
    4233             :                 /*
    4234             :                  * When allocating a page cache page for writing, we
    4235             :                  * want to get it from a node that is within its dirty
    4236             :                  * limit, such that no single node holds more than its
    4237             :                  * proportional share of globally allowed dirty pages.
    4238             :                  * The dirty limits take into account the node's
    4239             :                  * lowmem reserves and high watermark so that kswapd
    4240             :                  * should be able to balance it without having to
    4241             :                  * write pages from its LRU list.
    4242             :                  *
    4243             :                  * XXX: For now, allow allocations to potentially
    4244             :                  * exceed the per-node dirty limit in the slowpath
    4245             :                  * (spread_dirty_pages unset) before going into reclaim,
    4246             :                  * which is important when, on a NUMA setup, the allowed
    4247             :                  * nodes together are not big enough to reach the
    4248             :                  * global limit.  The proper fix for these situations
    4249             :                  * will require node awareness in the
    4250             :                  * dirty-throttling and the flusher threads.
    4251             :                  */
    4252        2524 :                 if (ac->spread_dirty_pages) {
    4253           0 :                         if (last_pgdat != zone->zone_pgdat) {
    4254           0 :                                 last_pgdat = zone->zone_pgdat;
    4255           0 :                                 last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
    4256             :                         }
    4257             : 
    4258           0 :                         if (!last_pgdat_dirty_ok)
    4259           0 :                                 continue;
    4260             :                 }
    4261             : 
    4262             :                 if (no_fallback && nr_online_nodes > 1 &&
    4263             :                     zone != ac->preferred_zoneref->zone) {
    4264             :                         int local_nid;
    4265             : 
    4266             :                         /*
    4267             :                          * If moving to a remote node, retry but allow
    4268             :                          * fragmenting fallbacks. Locality is more important
    4269             :                          * than fragmentation avoidance.
    4270             :                          */
    4271             :                         local_nid = zone_to_nid(ac->preferred_zoneref->zone);
    4272             :                         if (zone_to_nid(zone) != local_nid) {
    4273             :                                 alloc_flags &= ~ALLOC_NOFRAGMENT;
    4274             :                                 goto retry;
    4275             :                         }
    4276             :                 }
    4277             : 
    4278        2524 :                 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
    4279        5048 :                 if (!zone_watermark_fast(zone, order, mark,
    4280        2524 :                                        ac->highest_zoneidx, alloc_flags,
    4281             :                                        gfp_mask)) {
    4282             :                         int ret;
    4283             : 
    4284             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    4285             :                         /*
    4286             :                          * Watermark failed for this zone, but see if we can
    4287             :                          * grow this zone if it contains deferred pages.
    4288             :                          */
    4289             :                         if (deferred_pages_enabled()) {
    4290             :                                 if (_deferred_grow_zone(zone, order))
    4291             :                                         goto try_this_zone;
    4292             :                         }
    4293             : #endif
    4294             :                         /* Checked here to keep the fast path fast */
    4295             :                         BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
    4296           0 :                         if (alloc_flags & ALLOC_NO_WATERMARKS)
    4297             :                                 goto try_this_zone;
    4298             : 
    4299             :                         if (!node_reclaim_enabled() ||
    4300             :                             !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
    4301           0 :                                 continue;
    4302             : 
    4303             :                         ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
    4304             :                         switch (ret) {
    4305             :                         case NODE_RECLAIM_NOSCAN:
    4306             :                                 /* did not scan */
    4307             :                                 continue;
    4308             :                         case NODE_RECLAIM_FULL:
    4309             :                                 /* scanned but unreclaimable */
    4310             :                                 continue;
    4311             :                         default:
    4312             :                                 /* did we reclaim enough */
    4313             :                                 if (zone_watermark_ok(zone, order, mark,
    4314             :                                         ac->highest_zoneidx, alloc_flags))
    4315             :                                         goto try_this_zone;
    4316             : 
    4317             :                                 continue;
    4318             :                         }
    4319             :                 }
    4320             : 
    4321             : try_this_zone:
    4322        2524 :                 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
    4323             :                                 gfp_mask, alloc_flags, ac->migratetype);
    4324        2524 :                 if (page) {
    4325        2524 :                         prep_new_page(page, order, gfp_mask, alloc_flags);
    4326             : 
    4327             :                         /*
    4328             :                          * If this is a high-order atomic allocation then check
    4329             :                          * if the pageblock should be reserved for the future
    4330             :                          */
    4331        2524 :                         if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
    4332           0 :                                 reserve_highatomic_pageblock(page, zone, order);
    4333             : 
    4334             :                         return page;
    4335             :                 } else {
    4336             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    4337             :                         /* Try again if zone has deferred pages */
    4338             :                         if (deferred_pages_enabled()) {
    4339             :                                 if (_deferred_grow_zone(zone, order))
    4340             :                                         goto try_this_zone;
    4341             :                         }
    4342             : #endif
    4343             :                 }
    4344             :         }
    4345             : 
    4346             :         /*
    4347             :          * It's possible on a UMA machine to get through all zones that are
    4348             :          * fragmented. If avoiding fragmentation, reset and try again.
    4349             :          */
    4350             :         if (no_fallback) {
    4351             :                 alloc_flags &= ~ALLOC_NOFRAGMENT;
    4352             :                 goto retry;
    4353             :         }
    4354             : 
    4355             :         return NULL;
    4356             : }
    4357             : 
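The ALLOC_NOFRAGMENT handling above boils down to a two-pass scan: first walk the zonelist while refusing allocations that would fragment a foreign pageblock, then, if that pass found nothing, clear the flag and walk it again. A toy stand-alone model of just that control flow (the zone array, the flag bit and the predicate are invented for illustration):

/* Toy model of the ALLOC_NOFRAGMENT two-pass scan; not kernel code. */
#include <stdio.h>
#include <stdbool.h>

#define ALLOC_NOFRAGMENT 0x1

struct toy_zone {
        const char *name;
        bool has_unfragmented_pages;    /* usable without fragmenting */
        bool has_any_pages;             /* usable if fragmenting is allowed */
};

static const char *scan(const struct toy_zone *zones, int nr, unsigned int flags)
{
        int i;

retry:
        for (i = 0; i < nr; i++) {
                bool ok = (flags & ALLOC_NOFRAGMENT) ?
                          zones[i].has_unfragmented_pages : zones[i].has_any_pages;

                if (ok)
                        return zones[i].name;
        }

        if (flags & ALLOC_NOFRAGMENT) {
                /* Nothing clean anywhere: allow fragmenting fallbacks. */
                flags &= ~ALLOC_NOFRAGMENT;
                goto retry;
        }
        return NULL;
}

int main(void)
{
        const struct toy_zone zones[] = {
                { "Normal", false, true },
                { "DMA32",  false, true },
        };
        const char *from = scan(zones, 2, ALLOC_NOFRAGMENT);

        printf("allocated from: %s\n", from ? from : "nowhere");
        return 0;
}
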
    4358           0 : static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
    4359             : {
    4360           0 :         unsigned int filter = SHOW_MEM_FILTER_NODES;
    4361             : 
    4362             :         /*
    4363             :          * This documents exceptions given to allocations in certain
    4364             :          * contexts that are allowed to allocate outside current's set
    4365             :          * of allowed nodes.
    4366             :          */
    4367           0 :         if (!(gfp_mask & __GFP_NOMEMALLOC))
    4368           0 :                 if (tsk_is_oom_victim(current) ||
    4369           0 :                     (current->flags & (PF_MEMALLOC | PF_EXITING)))
    4370             :                         filter &= ~SHOW_MEM_FILTER_NODES;
    4371           0 :         if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
    4372           0 :                 filter &= ~SHOW_MEM_FILTER_NODES;
    4373             : 
    4374           0 :         __show_mem(filter, nodemask, gfp_zone(gfp_mask));
    4375           0 : }
    4376             : 
    4377           0 : void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
    4378             : {
    4379             :         struct va_format vaf;
    4380             :         va_list args;
    4381             :         static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
    4382             : 
    4383           0 :         if ((gfp_mask & __GFP_NOWARN) ||
    4384           0 :              !__ratelimit(&nopage_rs) ||
    4385           0 :              ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
    4386           0 :                 return;
    4387             : 
    4388           0 :         va_start(args, fmt);
    4389           0 :         vaf.fmt = fmt;
    4390           0 :         vaf.va = &args;
    4391           0 :         pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
    4392             :                         current->comm, &vaf, gfp_mask, &gfp_mask,
    4393             :                         nodemask_pr_args(nodemask));
    4394           0 :         va_end(args);
    4395             : 
    4396             :         cpuset_print_current_mems_allowed();
    4397           0 :         pr_cont("\n");
    4398           0 :         dump_stack();
    4399           0 :         warn_alloc_show_mem(gfp_mask, nodemask);
    4400             : }
    4401             : 
    4402             : static inline struct page *
    4403           0 : __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
    4404             :                               unsigned int alloc_flags,
    4405             :                               const struct alloc_context *ac)
    4406             : {
    4407             :         struct page *page;
    4408             : 
    4409           0 :         page = get_page_from_freelist(gfp_mask, order,
    4410           0 :                         alloc_flags|ALLOC_CPUSET, ac);
    4411             :         /*
    4412             :          * fallback to ignore cpuset restriction if our nodes
    4413             :          * are depleted
    4414             :          */
    4415           0 :         if (!page)
    4416           0 :                 page = get_page_from_freelist(gfp_mask, order,
    4417             :                                 alloc_flags, ac);
    4418             : 
    4419           0 :         return page;
    4420             : }
    4421             : 
    4422             : static inline struct page *
    4423           0 : __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
    4424             :         const struct alloc_context *ac, unsigned long *did_some_progress)
    4425             : {
    4426           0 :         struct oom_control oc = {
    4427           0 :                 .zonelist = ac->zonelist,
    4428           0 :                 .nodemask = ac->nodemask,
    4429             :                 .memcg = NULL,
    4430             :                 .gfp_mask = gfp_mask,
    4431             :                 .order = order,
    4432             :         };
    4433             :         struct page *page;
    4434             : 
    4435           0 :         *did_some_progress = 0;
    4436             : 
    4437             :         /*
    4438             :          * Acquire the oom lock.  If that fails, somebody else is
    4439             :          * making progress for us.
    4440             :          */
    4441           0 :         if (!mutex_trylock(&oom_lock)) {
    4442           0 :                 *did_some_progress = 1;
    4443           0 :                 schedule_timeout_uninterruptible(1);
    4444           0 :                 return NULL;
    4445             :         }
    4446             : 
    4447             :         /*
    4448             :          * Go through the zonelist yet one more time, keeping a very high watermark
    4449             :          * here; this is only to catch a parallel oom killing, and we must fail if
    4450             :          * we're still under heavy pressure. But make sure that this reclaim attempt
    4451             :          * does not rely on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY (never-failing)
    4452             :          * semantics while oom_lock is already held.
    4453             :          */
    4454           0 :         page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
    4455             :                                       ~__GFP_DIRECT_RECLAIM, order,
    4456             :                                       ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
    4457           0 :         if (page)
    4458             :                 goto out;
    4459             : 
    4460             :         /* Coredumps can quickly deplete all memory reserves */
    4461           0 :         if (current->flags & PF_DUMPCORE)
    4462             :                 goto out;
    4463             :         /* The OOM killer will not help higher order allocs */
    4464           0 :         if (order > PAGE_ALLOC_COSTLY_ORDER)
    4465             :                 goto out;
    4466             :         /*
    4467             :          * We have already exhausted all our reclaim opportunities without any
    4468             :          * success so it is time to admit defeat. We will skip the OOM killer
    4469             :          * because it is very likely that the caller has a more reasonable
    4470             :          * fallback than shooting a random task.
    4471             :          *
    4472             :          * The OOM killer may not free memory on a specific node.
    4473             :          */
    4474           0 :         if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
    4475             :                 goto out;
    4476             :         /* The OOM killer does not needlessly kill tasks for lowmem */
    4477             :         if (ac->highest_zoneidx < ZONE_NORMAL)
    4478             :                 goto out;
    4479           0 :         if (pm_suspended_storage())
    4480             :                 goto out;
    4481             :         /*
    4482             :          * XXX: GFP_NOFS allocations should rather fail than rely on
    4483             :          * other requests to make forward progress.
    4484             :          * We are in an unfortunate situation where out_of_memory cannot
    4485             :          * do much for this context but let's try it to at least get
    4486             :          * access to memory reserved if the current task is killed (see
    4487             :          * out_of_memory). Once filesystems are ready to handle allocation
    4488             :          * failures more gracefully we should just bail out here.
    4489             :          */
    4490             : 
    4491             :         /* Exhausted what can be done so it's blame time */
    4492           0 :         if (out_of_memory(&oc) ||
    4493           0 :             WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
    4494           0 :                 *did_some_progress = 1;
    4495             : 
    4496             :                 /*
    4497             :                  * Help non-failing allocations by giving them access to memory
    4498             :                  * reserves
    4499             :                  */
    4500           0 :                 if (gfp_mask & __GFP_NOFAIL)
    4501           0 :                         page = __alloc_pages_cpuset_fallback(gfp_mask, order,
    4502             :                                         ALLOC_NO_WATERMARKS, ac);
    4503             :         }
    4504             : out:
    4505           0 :         mutex_unlock(&oom_lock);
    4506           0 :         return page;
    4507             : }
    4508             : 
    4509             : /*
    4510             :  * Maximum number of compaction retries with progress before the OOM
    4511             :  * killer is considered the only way to move forward.
    4512             :  */
    4513             : #define MAX_COMPACT_RETRIES 16
    4514             : 
    4515             : #ifdef CONFIG_COMPACTION
    4516             : /* Try memory compaction for high-order allocations before reclaim */
    4517             : static struct page *
    4518           0 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
    4519             :                 unsigned int alloc_flags, const struct alloc_context *ac,
    4520             :                 enum compact_priority prio, enum compact_result *compact_result)
    4521             : {
    4522           0 :         struct page *page = NULL;
    4523             :         unsigned long pflags;
    4524             :         unsigned int noreclaim_flag;
    4525             : 
    4526           0 :         if (!order)
    4527             :                 return NULL;
    4528             : 
    4529           0 :         psi_memstall_enter(&pflags);
    4530             :         delayacct_compact_start();
    4531           0 :         noreclaim_flag = memalloc_noreclaim_save();
    4532             : 
    4533           0 :         *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
    4534             :                                                                 prio, &page);
    4535             : 
    4536           0 :         memalloc_noreclaim_restore(noreclaim_flag);
    4537           0 :         psi_memstall_leave(&pflags);
    4538             :         delayacct_compact_end();
    4539             : 
    4540           0 :         if (*compact_result == COMPACT_SKIPPED)
    4541             :                 return NULL;
    4542             :         /*
    4543             :          * At least in one zone compaction wasn't deferred or skipped, so let's
    4544             :          * count a compaction stall
    4545             :          */
    4546           0 :         count_vm_event(COMPACTSTALL);
    4547             : 
    4548             :         /* Prep a captured page if available */
    4549           0 :         if (page)
    4550           0 :                 prep_new_page(page, order, gfp_mask, alloc_flags);
    4551             : 
    4552             :         /* Try get a page from the freelist if available */
    4553           0 :         if (!page)
    4554           0 :                 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    4555             : 
    4556           0 :         if (page) {
    4557           0 :                 struct zone *zone = page_zone(page);
    4558             : 
    4559           0 :                 zone->compact_blockskip_flush = false;
    4560           0 :                 compaction_defer_reset(zone, order, true);
    4561           0 :                 count_vm_event(COMPACTSUCCESS);
    4562           0 :                 return page;
    4563             :         }
    4564             : 
    4565             :         /*
    4566             :          * It's bad if compaction run occurs and fails. The most likely reason
    4567             :          * is that pages exist, but not enough to satisfy watermarks.
    4568             :          */
    4569           0 :         count_vm_event(COMPACTFAIL);
    4570             : 
    4571           0 :         cond_resched();
    4572             : 
    4573           0 :         return NULL;
    4574             : }
    4575             : 
    4576             : static inline bool
    4577           0 : should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
    4578             :                      enum compact_result compact_result,
    4579             :                      enum compact_priority *compact_priority,
    4580             :                      int *compaction_retries)
    4581             : {
    4582           0 :         int max_retries = MAX_COMPACT_RETRIES;
    4583             :         int min_priority;
    4584           0 :         bool ret = false;
    4585           0 :         int retries = *compaction_retries;
    4586           0 :         enum compact_priority priority = *compact_priority;
    4587             : 
    4588           0 :         if (!order)
    4589             :                 return false;
    4590             : 
    4591           0 :         if (fatal_signal_pending(current))
    4592             :                 return false;
    4593             : 
    4594           0 :         if (compaction_made_progress(compact_result))
    4595           0 :                 (*compaction_retries)++;
    4596             : 
    4597             :         /*
    4598             :          * compaction considers all the zones as desperately out of memory
    4599             :          * so it doesn't really make much sense to retry except when the
    4600             :          * failure could be caused by insufficient priority
    4601             :          */
    4602           0 :         if (compaction_failed(compact_result))
    4603             :                 goto check_priority;
    4604             : 
    4605             :         /*
    4606             :          * compaction was skipped because there are not enough order-0 pages
    4607             :          * to work with, so we retry only if it looks like reclaim can help.
    4608             :          */
    4609           0 :         if (compaction_needs_reclaim(compact_result)) {
    4610           0 :                 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
    4611           0 :                 goto out;
    4612             :         }
    4613             : 
    4614             :         /*
    4615             :          * Make sure the compaction wasn't deferred or didn't bail out early
    4616             :          * due to lock contention before we declare that we should give up.
    4617             :          * But the next retry should use a higher priority if allowed, so
    4618             :          * we don't just keep bailing out endlessly.
    4619             :          */
    4620           0 :         if (compaction_withdrawn(compact_result)) {
    4621             :                 goto check_priority;
    4622             :         }
    4623             : 
    4624             :         /*
    4625             :          * !costly requests are much more important than __GFP_RETRY_MAYFAIL
    4626             :          * costly ones because they are de facto nofail and invoke the OOM
    4627             :          * killer to move on, while costly requests can fail and their users
    4628             :          * are ready to cope with that. 1/4 of the retries is rather arbitrary but we
    4629             :          * would need much more detailed feedback from compaction to
    4630             :          * make a better decision.
    4631             :          */
    4632           0 :         if (order > PAGE_ALLOC_COSTLY_ORDER)
    4633           0 :                 max_retries /= 4;
    4634           0 :         if (*compaction_retries <= max_retries) {
    4635             :                 ret = true;
    4636             :                 goto out;
    4637             :         }
    4638             : 
    4639             :         /*
    4640             :          * Make sure there are attempts at the highest priority if we exhausted
    4641             :          * all retries or failed at the lower priorities.
    4642             :          */
    4643             : check_priority:
    4644           0 :         min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
    4645           0 :                         MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
    4646             : 
    4647           0 :         if (*compact_priority > min_priority) {
    4648           0 :                 (*compact_priority)--;
    4649           0 :                 *compaction_retries = 0;
    4650           0 :                 ret = true;
    4651             :         }
    4652             : out:
    4653           0 :         trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
    4654           0 :         return ret;
    4655             : }
    4656             : #else
    4657             : static inline struct page *
    4658             : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
    4659             :                 unsigned int alloc_flags, const struct alloc_context *ac,
    4660             :                 enum compact_priority prio, enum compact_result *compact_result)
    4661             : {
    4662             :         *compact_result = COMPACT_SKIPPED;
    4663             :         return NULL;
    4664             : }
    4665             : 
    4666             : static inline bool
    4667             : should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
    4668             :                      enum compact_result compact_result,
    4669             :                      enum compact_priority *compact_priority,
    4670             :                      int *compaction_retries)
    4671             : {
    4672             :         struct zone *zone;
    4673             :         struct zoneref *z;
    4674             : 
    4675             :         if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
    4676             :                 return false;
    4677             : 
    4678             :         /*
    4679             :          * There are setups with compaction disabled which would prefer to loop
    4680             :          * inside the allocator rather than hit the oom killer prematurely.
    4681             :          * Let's give them some hope and keep retrying while the order-0
    4682             :          * watermarks are OK.
    4683             :          */
    4684             :         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
    4685             :                                 ac->highest_zoneidx, ac->nodemask) {
    4686             :                 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
    4687             :                                         ac->highest_zoneidx, alloc_flags))
    4688             :                         return true;
    4689             :         }
    4690             :         return false;
    4691             : }
    4692             : #endif /* CONFIG_COMPACTION */
    4693             : 
    4694             : #ifdef CONFIG_LOCKDEP
    4695             : static struct lockdep_map __fs_reclaim_map =
    4696             :         STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
    4697             : 
    4698             : static bool __need_reclaim(gfp_t gfp_mask)
    4699             : {
    4700             :         /* no reclaim without waiting on it */
    4701             :         if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
    4702             :                 return false;
    4703             : 
    4704             :         /* this guy won't enter reclaim */
    4705             :         if (current->flags & PF_MEMALLOC)
    4706             :                 return false;
    4707             : 
    4708             :         if (gfp_mask & __GFP_NOLOCKDEP)
    4709             :                 return false;
    4710             : 
    4711             :         return true;
    4712             : }
    4713             : 
    4714             : void __fs_reclaim_acquire(unsigned long ip)
    4715             : {
    4716             :         lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
    4717             : }
    4718             : 
    4719             : void __fs_reclaim_release(unsigned long ip)
    4720             : {
    4721             :         lock_release(&__fs_reclaim_map, ip);
    4722             : }
    4723             : 
    4724             : void fs_reclaim_acquire(gfp_t gfp_mask)
    4725             : {
    4726             :         gfp_mask = current_gfp_context(gfp_mask);
    4727             : 
    4728             :         if (__need_reclaim(gfp_mask)) {
    4729             :                 if (gfp_mask & __GFP_FS)
    4730             :                         __fs_reclaim_acquire(_RET_IP_);
    4731             : 
    4732             : #ifdef CONFIG_MMU_NOTIFIER
    4733             :                 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
    4734             :                 lock_map_release(&__mmu_notifier_invalidate_range_start_map);
    4735             : #endif
    4736             : 
    4737             :         }
    4738             : }
    4739             : EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
    4740             : 
    4741             : void fs_reclaim_release(gfp_t gfp_mask)
    4742             : {
    4743             :         gfp_mask = current_gfp_context(gfp_mask);
    4744             : 
    4745             :         if (__need_reclaim(gfp_mask)) {
    4746             :                 if (gfp_mask & __GFP_FS)
    4747             :                         __fs_reclaim_release(_RET_IP_);
    4748             :         }
    4749             : }
    4750             : EXPORT_SYMBOL_GPL(fs_reclaim_release);
    4751             : #endif
    4752             : 
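Outside the allocator itself, fs_reclaim_acquire()/fs_reclaim_release() are handy for annotating code paths that only sometimes allocate, so lockdep records the potential reclaim dependency on every call rather than only when the slow path happens to run. A sketch of that pattern under stated assumptions (the helper and its caching scheme are hypothetical):

/*
 * Hypothetical driver helper: it only allocates on a miss, but we want
 * lockdep to model the GFP_KERNEL reclaim dependency on every call.
 */
#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *demo_get_buffer(void **cached, size_t size)
{
        /* Annotate: this path may enter FS reclaim, even when *cached hits. */
        fs_reclaim_acquire(GFP_KERNEL);
        fs_reclaim_release(GFP_KERNEL);

        if (*cached)
                return *cached;

        *cached = kmalloc(size, GFP_KERNEL);
        return *cached;
}
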
    4753             : /*
    4754             :  * Zonelists may change due to hotplug during allocation. Detect when zonelists
    4755             :  * have been rebuilt so the allocation can be retried. The reader side does not
    4756             :  * lock and retries the allocation if the zonelist changes. The writer side is
    4757             :  * protected by the embedded spin_lock.
    4758             :  */
    4759             : static DEFINE_SEQLOCK(zonelist_update_seq);
    4760             : 
    4761             : static unsigned int zonelist_iter_begin(void)
    4762             : {
    4763             :         if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
    4764             :                 return read_seqbegin(&zonelist_update_seq);
    4765             : 
    4766             :         return 0;
    4767             : }
    4768             : 
    4769             : static unsigned int check_retry_zonelist(unsigned int seq)
    4770             : {
    4771             :         if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
    4772             :                 return read_seqretry(&zonelist_update_seq, seq);
    4773             : 
    4774             :         return seq;
    4775             : }
    4776             : 
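zonelist_iter_begin()/check_retry_zonelist() are thin wrappers around the standard seqlock read-side pattern: the slow path samples the sequence before walking the zonelist and, when an attempt comes back empty-handed, retries if the writer (memory hot-remove rebuilding the zonelists) has bumped the count in the meantime. The generic shape of that pattern on a stand-alone seqlock (the names and data here are illustrative, not the allocator's):

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(demo_seq);        /* illustrative seqlock */
static int demo_value;                  /* data the writer may rebuild */

static int demo_read(void)
{
        unsigned int seq;
        int val;

        do {
                seq = read_seqbegin(&demo_seq);   /* like zonelist_iter_begin() */
                val = demo_value;                 /* ... walk the data ... */
        } while (read_seqretry(&demo_seq, seq));  /* like check_retry_zonelist() */

        return val;
}

static void demo_write(int v)
{
        write_seqlock(&demo_seq);       /* writer side, like the hotplug path */
        demo_value = v;
        write_sequnlock(&demo_seq);
}
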
    4777             : /* Perform direct synchronous page reclaim */
    4778             : static unsigned long
    4779           0 : __perform_reclaim(gfp_t gfp_mask, unsigned int order,
    4780             :                                         const struct alloc_context *ac)
    4781             : {
    4782             :         unsigned int noreclaim_flag;
    4783             :         unsigned long progress;
    4784             : 
    4785           0 :         cond_resched();
    4786             : 
    4787             :         /* We now go into synchronous reclaim */
    4788             :         cpuset_memory_pressure_bump();
    4789           0 :         fs_reclaim_acquire(gfp_mask);
    4790           0 :         noreclaim_flag = memalloc_noreclaim_save();
    4791             : 
    4792           0 :         progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
    4793             :                                                                 ac->nodemask);
    4794             : 
    4795           0 :         memalloc_noreclaim_restore(noreclaim_flag);
    4796           0 :         fs_reclaim_release(gfp_mask);
    4797             : 
    4798           0 :         cond_resched();
    4799             : 
    4800           0 :         return progress;
    4801             : }
    4802             : 
    4803             : /* The really slow allocator path where we enter direct reclaim */
    4804             : static inline struct page *
    4805           0 : __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
    4806             :                 unsigned int alloc_flags, const struct alloc_context *ac,
    4807             :                 unsigned long *did_some_progress)
    4808             : {
    4809           0 :         struct page *page = NULL;
    4810             :         unsigned long pflags;
    4811           0 :         bool drained = false;
    4812             : 
    4813           0 :         psi_memstall_enter(&pflags);
    4814           0 :         *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
    4815           0 :         if (unlikely(!(*did_some_progress)))
    4816             :                 goto out;
    4817             : 
    4818             : retry:
    4819           0 :         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    4820             : 
    4821             :         /*
    4822             :          * If an allocation failed after direct reclaim, it could be because
    4823             :          * pages are pinned on the per-cpu lists or in high alloc reserves.
    4824             :          * Shrink them and try again
    4825             :          */
    4826           0 :         if (!page && !drained) {
    4827           0 :                 unreserve_highatomic_pageblock(ac, false);
    4828           0 :                 drain_all_pages(NULL);
    4829           0 :                 drained = true;
    4830           0 :                 goto retry;
    4831             :         }
    4832             : out:
    4833           0 :         psi_memstall_leave(&pflags);
    4834             : 
    4835           0 :         return page;
    4836             : }
    4837             : 
    4838           0 : static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
    4839             :                              const struct alloc_context *ac)
    4840             : {
    4841             :         struct zoneref *z;
    4842             :         struct zone *zone;
    4843           0 :         pg_data_t *last_pgdat = NULL;
    4844           0 :         enum zone_type highest_zoneidx = ac->highest_zoneidx;
    4845             : 
    4846           0 :         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
    4847             :                                         ac->nodemask) {
    4848           0 :                 if (!managed_zone(zone))
    4849           0 :                         continue;
    4850           0 :                 if (last_pgdat != zone->zone_pgdat) {
    4851           0 :                         wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
    4852           0 :                         last_pgdat = zone->zone_pgdat;
    4853             :                 }
    4854             :         }
    4855           0 : }
    4856             : 
    4857             : static inline unsigned int
    4858           0 : gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
    4859             : {
    4860           0 :         unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
    4861             : 
    4862             :         /*
    4863             :          * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
    4864             :          * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
    4865             :          * to save two branches.
    4866             :          */
    4867             :         BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
    4868             :         BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
    4869             : 
    4870             :         /*
    4871             :          * The caller may dip into page reserves a bit more if it cannot run
    4872             :          * direct reclaim, has a realtime scheduling policy, or is asking for
    4873             :          * __GFP_HIGH memory.  GFP_ATOMIC requests will set both
    4874             :          * ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE (__GFP_HIGH).
    4875             :          */
    4876           0 :         alloc_flags |= (__force int)
    4877             :                 (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
    4878             : 
    4879           0 :         if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
    4880             :                 /*
    4881             :                  * Not worth trying to allocate harder for __GFP_NOMEMALLOC
    4882             :                  * requests, even when the caller cannot schedule.
    4883             :                  */
    4884           0 :                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
    4885           0 :                         alloc_flags |= ALLOC_NON_BLOCK;
    4886             : 
    4887           0 :                         if (order > 0)
    4888           0 :                                 alloc_flags |= ALLOC_HIGHATOMIC;
    4889             :                 }
    4890             : 
    4891             :                 /*
    4892             :                  * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
    4893             :                  * GFP_ATOMIC) rather than fail, see the comment for
    4894             :                  * __cpuset_node_allowed().
    4895             :                  */
    4896           0 :                 if (alloc_flags & ALLOC_MIN_RESERVE)
    4897           0 :                         alloc_flags &= ~ALLOC_CPUSET;
    4898           0 :         } else if (unlikely(rt_task(current)) && in_task())
    4899           0 :                 alloc_flags |= ALLOC_MIN_RESERVE;
    4900             : 
    4901           0 :         alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
    4902             : 
    4903           0 :         return alloc_flags;
    4904             : }
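/*
 * Editor's illustrative note (not part of page_alloc.c): a worked example of
 * the mapping implemented by gfp_to_alloc_flags() above, assuming the current
 * ALLOC_* definitions and ignoring the CMA adjustment. A GFP_ATOMIC request
 * (__GFP_HIGH | __GFP_KSWAPD_RECLAIM, no direct reclaim) ends up with roughly
 *
 *      ALLOC_WMARK_MIN | ALLOC_MIN_RESERVE | ALLOC_KSWAPD | ALLOC_NON_BLOCK
 *      (plus ALLOC_HIGHATOMIC for order > 0; ALLOC_CPUSET is cleared),
 *
 * while a GFP_KERNEL request, which may direct reclaim, keeps the conservative
 *
 *      ALLOC_WMARK_MIN | ALLOC_CPUSET | ALLOC_KSWAPD.
 */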
    4905             : 
    4906             : static bool oom_reserves_allowed(struct task_struct *tsk)
    4907             : {
    4908           0 :         if (!tsk_is_oom_victim(tsk))
    4909             :                 return false;
    4910             : 
    4911             :         /*
    4912             :          * !MMU configurations have no OOM reaper, so give access to memory
    4913             :          * reserves only to the thread with TIF_MEMDIE set.
    4914             :          */
    4915             :         if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
    4916             :                 return false;
    4917             : 
    4918             :         return true;
    4919             : }
    4920             : 
    4921             : /*
    4922             :  * Distinguish requests that really need access to the full memory
    4923             :  * reserves from OOM victims that can live with a portion of them.
    4924             :  */
    4925           0 : static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
    4926             : {
    4927           0 :         if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
    4928             :                 return 0;
    4929           0 :         if (gfp_mask & __GFP_MEMALLOC)
    4930             :                 return ALLOC_NO_WATERMARKS;
    4931           0 :         if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
    4932             :                 return ALLOC_NO_WATERMARKS;
    4933           0 :         if (!in_interrupt()) {
    4934           0 :                 if (current->flags & PF_MEMALLOC)
    4935             :                         return ALLOC_NO_WATERMARKS;
    4936           0 :                 else if (oom_reserves_allowed(current))
    4937             :                         return ALLOC_OOM;
    4938             :         }
    4939             : 
    4940             :         return 0;
    4941             : }
    4942             : 
    4943           0 : bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
    4944             : {
    4945           0 :         return !!__gfp_pfmemalloc_flags(gfp_mask);
    4946             : }
    4947             : 
    4948             : /*
    4949             :  * Checks whether it makes sense to retry the reclaim to make forward progress
    4950             :  * for the given allocation request.
    4951             :  *
    4952             :  * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
    4953             :  * without success, or when we couldn't even meet the watermark if we
    4954             :  * reclaimed all remaining pages on the LRU lists.
    4955             :  *
    4956             :  * Returns true if a retry is viable or false to enter the oom path.
    4957             :  */
    4958             : static inline bool
    4959           0 : should_reclaim_retry(gfp_t gfp_mask, unsigned order,
    4960             :                      struct alloc_context *ac, int alloc_flags,
    4961             :                      bool did_some_progress, int *no_progress_loops)
    4962             : {
    4963             :         struct zone *zone;
    4964             :         struct zoneref *z;
    4965           0 :         bool ret = false;
    4966             : 
    4967             :         /*
    4968             :          * Costly allocations might have made some progress, but due to high
    4969             :          * fragmentation that doesn't mean their order will become available,
    4970             :          * so always increment the no-progress counter for them.
    4971             :          */
    4972           0 :         if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
    4973           0 :                 *no_progress_loops = 0;
    4974             :         else
    4975           0 :                 (*no_progress_loops)++;
    4976             : 
    4977             :         /*
    4978             :          * Make sure we converge to OOM if we cannot make any progress
    4979             :          * several times in a row.
    4980             :          */
    4981           0 :         if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
    4982             :                 /* Before OOM, exhaust highatomic_reserve */
    4983           0 :                 return unreserve_highatomic_pageblock(ac, true);
    4984             :         }
    4985             : 
    4986             :         /*
    4987             :          * Keep reclaiming pages while there is a chance this will lead
    4988             :          * somewhere.  If none of the target zones can satisfy our allocation
    4989             :          * request even if all reclaimable pages are considered then we are
    4990             :          * screwed and have to go OOM.
    4991             :          */
    4992           0 :         for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
    4993             :                                 ac->highest_zoneidx, ac->nodemask) {
    4994             :                 unsigned long available;
    4995             :                 unsigned long reclaimable;
    4996           0 :                 unsigned long min_wmark = min_wmark_pages(zone);
    4997             :                 bool wmark;
    4998             : 
    4999           0 :                 available = reclaimable = zone_reclaimable_pages(zone);
    5000           0 :                 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
    5001             : 
    5002             :                 /*
    5003             :                  * Would the allocation succeed if we reclaimed all
    5004             :                  * reclaimable pages?
    5005             :                  */
    5006           0 :                 wmark = __zone_watermark_ok(zone, order, min_wmark,
    5007           0 :                                 ac->highest_zoneidx, alloc_flags, available);
    5008           0 :                 trace_reclaim_retry_zone(z, order, reclaimable,
    5009             :                                 available, min_wmark, *no_progress_loops, wmark);
    5010           0 :                 if (wmark) {
    5011             :                         ret = true;
    5012             :                         break;
    5013             :                 }
    5014             :         }
    5015             : 
    5016             :         /*
    5017             :          * Memory allocation/reclaim might be called from a WQ context and the
    5018             :          * current implementation of the WQ concurrency control doesn't
    5019             :          * recognize that a particular WQ is congested if the worker thread is
    5020             :          * looping without ever sleeping. Therefore we have to do a short sleep
    5021             :          * here rather than calling cond_resched().
    5022             :          */
    5023           0 :         if (current->flags & PF_WQ_WORKER)
    5024           0 :                 schedule_timeout_uninterruptible(1);
    5025             :         else
    5026           0 :                 cond_resched();
    5027             :         return ret;
    5028             : }
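/*
 * Editor's sketch (not part of page_alloc.c): a greatly simplified model of
 * the per-zone feasibility test in should_reclaim_retry() above. It ignores
 * lowmem reserves, the highatomic reserve, per-cpu counter drift and the
 * exact semantics of __zone_watermark_ok(); it only illustrates the core
 * idea that retrying is worthwhile while "free pages plus everything that
 * could still be reclaimed" would clear the min watermark for the request.
 */
static inline bool reclaim_retry_is_plausible(unsigned long free_pages,
					      unsigned long reclaimable_pages,
					      unsigned long min_wmark,
					      unsigned int order)
{
	/* Pages the request would consume if it eventually succeeded. */
	unsigned long request = 1UL << order;

	return free_pages + reclaimable_pages >= min_wmark + request;
}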
    5029             : 
    5030             : static inline bool
    5031             : check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
    5032             : {
    5033             :         /*
    5034             :          * It's possible that cpuset's mems_allowed and the nodemask from
    5035             :          * mempolicy don't intersect.  This should normally be dealt with by
    5036             :          * policy_nodemask(), but it's possible to race with a cpuset update in
    5037             :          * such a way that the check there was true, and then became false
    5038             :          * before we got our cpuset_mems_cookie here.
    5039             :          * This assumes that for all allocations, ac->nodemask can come only
    5040             :          * from an MPOL_BIND mempolicy (whose documented semantics are to be ignored
    5041             :          * when it does not intersect with the cpuset restrictions) or the
    5042             :          * caller can deal with a violated nodemask.
    5043             :          */
    5044             :         if (cpusets_enabled() && ac->nodemask &&
    5045             :                         !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
    5046             :                 ac->nodemask = NULL;
    5047             :                 return true;
    5048             :         }
    5049             : 
    5050             :         /*
    5051             :          * When updating a task's mems_allowed or mempolicy nodemask, it is
    5052             :          * possible to race with parallel threads in such a way that our
    5053             :          * allocation can fail while the mask is being updated. If we are about
    5054             :          * to fail, check if the cpuset changed during allocation and if so,
    5055             :          * retry.
    5056             :          */
    5057           0 :         if (read_mems_allowed_retry(cpuset_mems_cookie))
    5058             :                 return true;
    5059             : 
    5060             :         return false;
    5061             : }
    5062             : 
    5063             : static inline struct page *
    5064           0 : __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
    5065             :                                                 struct alloc_context *ac)
    5066             : {
    5067           0 :         bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
    5068           0 :         const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
    5069           0 :         struct page *page = NULL;
    5070             :         unsigned int alloc_flags;
    5071             :         unsigned long did_some_progress;
    5072             :         enum compact_priority compact_priority;
    5073             :         enum compact_result compact_result;
    5074             :         int compaction_retries;
    5075             :         int no_progress_loops;
    5076             :         unsigned int cpuset_mems_cookie;
    5077             :         unsigned int zonelist_iter_cookie;
    5078             :         int reserve_flags;
    5079             : 
    5080             : restart:
    5081           0 :         compaction_retries = 0;
    5082           0 :         no_progress_loops = 0;
    5083           0 :         compact_priority = DEF_COMPACT_PRIORITY;
    5084           0 :         cpuset_mems_cookie = read_mems_allowed_begin();
    5085           0 :         zonelist_iter_cookie = zonelist_iter_begin();
    5086             : 
    5087             :         /*
    5088             :          * The fast path uses conservative alloc_flags to succeed only until
    5089             :          * kswapd needs to be woken up, and to avoid the cost of setting up
    5090             :          * alloc_flags precisely. So we do that now.
    5091             :          */
    5092           0 :         alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
    5093             : 
    5094             :         /*
    5095             :          * We need to recalculate the starting point for the zonelist iterator
    5096             :          * because we might have used a different nodemask in the fast path, or
    5097             :          * there was a cpuset modification and we are retrying - otherwise we
    5098             :          * could end up iterating over non-eligible zones endlessly.
    5099             :          */
    5100           0 :         ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
    5101             :                                         ac->highest_zoneidx, ac->nodemask);
    5102           0 :         if (!ac->preferred_zoneref->zone)
    5103             :                 goto nopage;
    5104             : 
    5105             :         /*
    5106             :          * Check for insane configurations where the cpuset doesn't contain
    5107             :          * any suitable zone to satisfy the request - e.g. non-movable
    5108             :          * GFP_HIGHUSER allocations from MOVABLE nodes only.
    5109             :          */
    5110             :         if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
    5111             :                 struct zoneref *z = first_zones_zonelist(ac->zonelist,
    5112             :                                         ac->highest_zoneidx,
    5113             :                                         &cpuset_current_mems_allowed);
    5114             :                 if (!z->zone)
    5115             :                         goto nopage;
    5116             :         }
    5117             : 
    5118           0 :         if (alloc_flags & ALLOC_KSWAPD)
    5119           0 :                 wake_all_kswapds(order, gfp_mask, ac);
    5120             : 
    5121             :         /*
    5122             :          * The adjusted alloc_flags might result in immediate success, so try
    5123             :          * that first
    5124             :          */
    5125           0 :         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    5126           0 :         if (page)
    5127             :                 goto got_pg;
    5128             : 
    5129             :         /*
    5130             :          * For costly allocations, try direct compaction first, as it's likely
    5131             :          * that we have enough base pages and don't need to reclaim. For non-
    5132             :          * movable high-order allocations, do that as well, as compaction will
    5133             :          * try to prevent permanent fragmentation by migrating from blocks of the
    5134             :          * same migratetype.
    5135             :          * Don't try this for allocations that are allowed to ignore
    5136             :          * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
    5137             :          */
    5138           0 :         if (can_direct_reclaim &&
    5139           0 :                         (costly_order ||
    5140           0 :                            (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
    5141           0 :                         && !gfp_pfmemalloc_allowed(gfp_mask)) {
    5142           0 :                 page = __alloc_pages_direct_compact(gfp_mask, order,
    5143             :                                                 alloc_flags, ac,
    5144             :                                                 INIT_COMPACT_PRIORITY,
    5145             :                                                 &compact_result);
    5146           0 :                 if (page)
    5147             :                         goto got_pg;
    5148             : 
    5149             :                 /*
    5150             :                  * Checks for costly allocations with __GFP_NORETRY, which
    5151             :                  * includes some THP page fault allocations
    5152             :                  */
    5153           0 :                 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
    5154             :                         /*
    5155             :                          * If allocating entire pageblock(s) and compaction
    5156             :                          * failed because all zones are below their low watermarks,
    5157             :                          * or was prohibited because it recently failed at this
    5158             :                          * order, fail immediately unless the allocator has
    5159             :                          * requested compaction and reclaim retry.
    5160             :                          *
    5161             :                          * Reclaim is
    5162             :                          *  - potentially very expensive because zones are far
    5163             :                          *    below their low watermarks or this is part of very
    5164             :                          *    bursty high order allocations,
    5165             :                          *  - not guaranteed to help because isolate_freepages()
    5166             :                          *    may not iterate over freed pages as part of its
    5167             :                          *    linear scan, and
    5168             :                          *  - unlikely to make entire pageblocks free on its
    5169             :                          *    own.
    5170             :                          */
    5171           0 :                         if (compact_result == COMPACT_SKIPPED ||
    5172             :                             compact_result == COMPACT_DEFERRED)
    5173             :                                 goto nopage;
    5174             : 
    5175             :                         /*
    5176             :                          * Looks like reclaim/compaction is worth trying, but
    5177             :                          * sync compaction could be very expensive, so keep
    5178             :                          * using async compaction.
    5179             :                          */
    5180           0 :                         compact_priority = INIT_COMPACT_PRIORITY;
    5181             :                 }
    5182             :         }
    5183             : 
    5184             : retry:
    5185             :         /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
    5186           0 :         if (alloc_flags & ALLOC_KSWAPD)
    5187           0 :                 wake_all_kswapds(order, gfp_mask, ac);
    5188             : 
    5189           0 :         reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
    5190           0 :         if (reserve_flags)
    5191           0 :                 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
    5192             :                                           (alloc_flags & ALLOC_KSWAPD);
    5193             : 
    5194             :         /*
    5195             :          * Reset the nodemask and zonelist iterators if memory policies can be
    5196             :          * ignored. These allocations are high priority and system rather than
    5197             :          * user oriented.
    5198             :          */
    5199           0 :         if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
    5200           0 :                 ac->nodemask = NULL;
    5201           0 :                 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
    5202             :                                         ac->highest_zoneidx, ac->nodemask);
    5203             :         }
    5204             : 
    5205             :         /* Attempt with potentially adjusted zonelist and alloc_flags */
    5206           0 :         page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    5207           0 :         if (page)
    5208             :                 goto got_pg;
    5209             : 
    5210             :         /* Caller is not willing to reclaim, we can't balance anything */
    5211           0 :         if (!can_direct_reclaim)
    5212             :                 goto nopage;
    5213             : 
    5214             :         /* Avoid recursion of direct reclaim */
    5215           0 :         if (current->flags & PF_MEMALLOC)
    5216             :                 goto nopage;
    5217             : 
    5218             :         /* Try direct reclaim and then allocating */
    5219           0 :         page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
    5220             :                                                         &did_some_progress);
    5221           0 :         if (page)
    5222             :                 goto got_pg;
    5223             : 
    5224             :         /* Try direct compaction and then allocating */
    5225           0 :         page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
    5226             :                                         compact_priority, &compact_result);
    5227           0 :         if (page)
    5228             :                 goto got_pg;
    5229             : 
    5230             :         /* Do not loop if specifically requested */
    5231           0 :         if (gfp_mask & __GFP_NORETRY)
    5232             :                 goto nopage;
    5233             : 
    5234             :         /*
    5235             :          * Do not retry costly high order allocations unless they are
    5236             :          * __GFP_RETRY_MAYFAIL
    5237             :          */
    5238           0 :         if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
    5239             :                 goto nopage;
    5240             : 
    5241           0 :         if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
    5242             :                                  did_some_progress > 0, &no_progress_loops))
    5243             :                 goto retry;
    5244             : 
    5245             :         /*
    5246             :          * It doesn't make any sense to retry for the compaction if the order-0
    5247             :          * reclaim is not able to make any progress because the current
    5248             :          * implementation of the compaction depends on the sufficient amount
    5249             :          * of free memory (see __compaction_suitable)
    5250             :          */
    5251           0 :         if (did_some_progress > 0 &&
    5252           0 :                         should_compact_retry(ac, order, alloc_flags,
    5253             :                                 compact_result, &compact_priority,
    5254             :                                 &compaction_retries))
    5255             :                 goto retry;
    5256             : 
    5257             : 
    5258             :         /*
    5259             :          * Deal with possible cpuset update races or zonelist updates to avoid
    5260             :          * an unnecessary OOM kill.
    5261             :          */
    5262           0 :         if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
    5263           0 :             check_retry_zonelist(zonelist_iter_cookie))
    5264             :                 goto restart;
    5265             : 
    5266             :         /* Reclaim has failed us, start killing things */
    5267           0 :         page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
    5268           0 :         if (page)
    5269             :                 goto got_pg;
    5270             : 
    5271             :         /* Avoid allocations with no watermarks from looping endlessly */
    5272           0 :         if (tsk_is_oom_victim(current) &&
    5273           0 :             (alloc_flags & ALLOC_OOM ||
    5274           0 :              (gfp_mask & __GFP_NOMEMALLOC)))
    5275             :                 goto nopage;
    5276             : 
    5277             :         /* Retry as long as the OOM killer is making progress */
    5278           0 :         if (did_some_progress) {
    5279           0 :                 no_progress_loops = 0;
    5280           0 :                 goto retry;
    5281             :         }
    5282             : 
    5283             : nopage:
    5284             :         /*
    5285             :          * Deal with possible cpuset update races or zonelist updates to avoid
    5286             :          * an unnecessary OOM kill.
    5287             :          */
    5288           0 :         if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
    5289           0 :             check_retry_zonelist(zonelist_iter_cookie))
    5290             :                 goto restart;
    5291             : 
    5292             :         /*
    5293             :          * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
    5294             :          * we always retry
    5295             :          */
    5296           0 :         if (gfp_mask & __GFP_NOFAIL) {
    5297             :                 /*
    5298             :                  * All existing users of __GFP_NOFAIL are blockable, so warn
    5299             :                  * about any new users that actually require GFP_NOWAIT.
    5300             :                  */
    5301           0 :                 if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
    5302             :                         goto fail;
    5303             : 
    5304             :                 /*
    5305             :                  * A PF_MEMALLOC request from this context is rather bizarre
    5306             :                  * because we cannot reclaim anything and can only loop waiting
    5307             :                  * for somebody to do the work for us.
    5308             :                  */
    5309           0 :                 WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
    5310             : 
    5311             :                 /*
    5312             :                  * Non-failing costly orders are a hard requirement which we
    5313             :                  * are not well prepared for, so warn about these users
    5314             :                  * so that we can identify them and convert them to something
    5315             :                  * else.
    5316             :                  */
    5317           0 :                 WARN_ON_ONCE_GFP(costly_order, gfp_mask);
    5318             : 
    5319             :                 /*
    5320             :                  * Help non-failing allocations by giving some access to memory
    5321             :                  * reserves normally used for high priority non-blocking
    5322             :                  * allocations but do not use ALLOC_NO_WATERMARKS because this
    5323             :                  * could deplete whole memory reserves which would just make
    5324             :                  * the situation worse.
    5325             :                  */
    5326           0 :                 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
    5327           0 :                 if (page)
    5328             :                         goto got_pg;
    5329             : 
    5330           0 :                 cond_resched();
    5331           0 :                 goto retry;
    5332             :         }
    5333             : fail:
    5334           0 :         warn_alloc(gfp_mask, ac->nodemask,
    5335             :                         "page allocation failure: order:%u", order);
    5336             : got_pg:
    5337           0 :         return page;
    5338             : }
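/*
 * Editor's illustrative example (not part of page_alloc.c): how a caller's
 * GFP flags steer the slowpath above. The function name is hypothetical;
 * the flags and alloc_pages() are the real interfaces.
 */
static struct page *example_alloc_costly(unsigned int order)
{
	/*
	 * For a costly order (> PAGE_ALLOC_COSTLY_ORDER) a plain GFP_KERNEL
	 * request gives up without retrying; __GFP_RETRY_MAYFAIL asks the
	 * slowpath to keep retrying reclaim/compaction while progress is
	 * being made, but to fail rather than invoke the OOM killer.
	 * __GFP_NOWARN suppresses the warn_alloc() message on failure.
	 */
	return alloc_pages(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN,
			   order);
}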
    5339             : 
    5340        3120 : static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
    5341             :                 int preferred_nid, nodemask_t *nodemask,
    5342             :                 struct alloc_context *ac, gfp_t *alloc_gfp,
    5343             :                 unsigned int *alloc_flags)
    5344             : {
    5345        3120 :         ac->highest_zoneidx = gfp_zone(gfp_mask);
    5346        6240 :         ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    5347        3120 :         ac->nodemask = nodemask;
    5348        3120 :         ac->migratetype = gfp_migratetype(gfp_mask);
    5349             : 
    5350             :         if (cpusets_enabled()) {
    5351             :                 *alloc_gfp |= __GFP_HARDWALL;
    5352             :                 /*
    5353             :                  * When we are in interrupt context, the allocation is not
    5354             :                  * related to the current task's cpuset, so any node is OK.
    5355             :                  */
    5356             :                 if (in_task() && !ac->nodemask)
    5357             :                         ac->nodemask = &cpuset_current_mems_allowed;
    5358             :                 else
    5359             :                         *alloc_flags |= ALLOC_CPUSET;
    5360             :         }
    5361             : 
    5362        3120 :         might_alloc(gfp_mask);
    5363             : 
    5364        3120 :         if (should_fail_alloc_page(gfp_mask, order))
    5365             :                 return false;
    5366             : 
    5367        3120 :         *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
    5368             : 
    5369             :         /* Dirty zone balancing only done in the fast path */
    5370        3120 :         ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
    5371             : 
    5372             :         /*
    5373             :          * The preferred zone is used for statistics but crucially it is
    5374             :          * also used as the starting point for the zonelist iterator. It
    5375             :          * may get reset for allocations that ignore memory policies.
    5376             :          */
    5377        6240 :         ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
    5378             :                                         ac->highest_zoneidx, ac->nodemask);
    5379             : 
    5380             :         return true;
    5381             : }
    5382             : 
    5383             : /*
    5384             :  * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
    5385             :  * @gfp: GFP flags for the allocation
    5386             :  * @preferred_nid: The preferred NUMA node ID to allocate from
    5387             :  * @nodemask: Set of nodes to allocate from, may be NULL
    5388             :  * @nr_pages: The number of pages desired on the list or array
    5389             :  * @page_list: Optional list to store the allocated pages
    5390             :  * @page_array: Optional array to store the pages
    5391             :  *
    5392             :  * This is a batched version of the page allocator that attempts to
    5393             :  * allocate nr_pages quickly. Pages are added to page_list if page_list
    5394             :  * is not NULL, otherwise it is assumed that the page_array is valid.
    5395             :  *
    5396             :  * For lists, nr_pages is the number of pages that should be allocated.
    5397             :  *
    5398             :  * For arrays, only NULL elements are populated with pages and nr_pages
    5399             :  * is the maximum number of pages that will be stored in the array.
    5400             :  *
    5401             :  * Returns the number of pages on the list or array.
    5402             :  */
    5403         596 : unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
    5404             :                         nodemask_t *nodemask, int nr_pages,
    5405             :                         struct list_head *page_list,
    5406             :                         struct page **page_array)
    5407             : {
    5408             :         struct page *page;
    5409             :         unsigned long __maybe_unused UP_flags;
    5410             :         struct zone *zone;
    5411             :         struct zoneref *z;
    5412             :         struct per_cpu_pages *pcp;
    5413             :         struct list_head *pcp_list;
    5414             :         struct alloc_context ac;
    5415             :         gfp_t alloc_gfp;
    5416         596 :         unsigned int alloc_flags = ALLOC_WMARK_LOW;
    5417         596 :         int nr_populated = 0, nr_account = 0;
    5418             : 
    5419             :         /*
    5420             :          * Skip populated array elements to determine if any pages need
    5421             :          * to be allocated before disabling IRQs.
    5422             :          */
    5423        1192 :         while (page_array && nr_populated < nr_pages && page_array[nr_populated])
    5424           0 :                 nr_populated++;
    5425             : 
    5426             :         /* No pages requested? */
    5427         596 :         if (unlikely(nr_pages <= 0))
    5428             :                 goto out;
    5429             : 
    5430             :         /* Already populated array? */
    5431         596 :         if (unlikely(page_array && nr_pages - nr_populated == 0))
    5432             :                 goto out;
    5433             : 
    5434             :         /* Bulk allocator does not support memcg accounting. */
    5435             :         if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
    5436             :                 goto failed;
    5437             : 
    5438             :         /* Use the single page allocator for one page. */
    5439         596 :         if (nr_pages - nr_populated == 1)
    5440             :                 goto failed;
    5441             : 
    5442             : #ifdef CONFIG_PAGE_OWNER
    5443             :         /*
    5444             :          * PAGE_OWNER may recurse into the allocator to allocate space to
    5445             :          * save the stack with pagesets.lock held. Releasing/reacquiring
    5446             :          * removes much of the performance benefit of bulk allocation, so
    5447             :          * force the caller to allocate one page at a time; that performs
    5448             :          * similarly without adding complexity to the bulk allocator.
    5449             :          */
    5450             :         if (static_branch_unlikely(&page_owner_inited))
    5451             :                 goto failed;
    5452             : #endif
    5453             : 
    5454             :         /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
    5455         596 :         gfp &= gfp_allowed_mask;
    5456         596 :         alloc_gfp = gfp;
    5457         596 :         if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
    5458             :                 goto out;
    5459         596 :         gfp = alloc_gfp;
    5460             : 
    5461             :         /* Find an allowed local zone that meets the low watermark. */
    5462        1192 :         for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
    5463             :                 unsigned long mark;
    5464             : 
    5465             :                 if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
    5466             :                     !__cpuset_zone_allowed(zone, gfp)) {
    5467             :                         continue;
    5468             :                 }
    5469             : 
    5470             :                 if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
    5471             :                     zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
    5472             :                         goto failed;
    5473             :                 }
    5474             : 
    5475         596 :                 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
    5476         596 :                 if (zone_watermark_fast(zone, 0,  mark,
    5477             :                                 zonelist_zone_idx(ac.preferred_zoneref),
    5478             :                                 alloc_flags, gfp)) {
    5479             :                         break;
    5480             :                 }
    5481             :         }
    5482             : 
    5483             :         /*
    5484             :          * If there are no allowed local zones that meet the watermarks, then
    5485             :          * try to allocate a single page and reclaim if necessary.
    5486             :          */
    5487         596 :         if (unlikely(!zone))
    5488             :                 goto failed;
    5489             : 
    5490             :         /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
    5491         596 :         pcp_trylock_prepare(UP_flags);
    5492        1192 :         pcp = pcp_spin_trylock(zone->per_cpu_pageset);
    5493         596 :         if (!pcp)
    5494             :                 goto failed_irq;
    5495             : 
    5496             :         /* Attempt the batch allocation */
    5497        1192 :         pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
    5498       43733 :         while (nr_populated < nr_pages) {
    5499             : 
    5500             :                 /* Skip existing pages */
    5501       42541 :                 if (page_array && page_array[nr_populated]) {
    5502           0 :                         nr_populated++;
    5503           0 :                         continue;
    5504             :                 }
    5505             : 
    5506       42541 :                 page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
    5507             :                                                                 pcp, pcp_list);
    5508       42541 :                 if (unlikely(!page)) {
    5509             :                         /* Try and allocate at least one page */
    5510           0 :                         if (!nr_account) {
    5511           0 :                                 pcp_spin_unlock(pcp);
    5512           0 :                                 goto failed_irq;
    5513             :                         }
    5514             :                         break;
    5515             :                 }
    5516       42541 :                 nr_account++;
    5517             : 
    5518       42541 :                 prep_new_page(page, 0, gfp, 0);
    5519       42541 :                 if (page_list)
    5520           0 :                         list_add(&page->lru, page_list);
    5521             :                 else
    5522       42541 :                         page_array[nr_populated] = page;
    5523       42541 :                 nr_populated++;
    5524             :         }
    5525             : 
    5526        1192 :         pcp_spin_unlock(pcp);
    5527        1192 :         pcp_trylock_finish(UP_flags);
    5528             : 
    5529        1192 :         __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
    5530         596 :         zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
    5531             : 
    5532             : out:
    5533         596 :         return nr_populated;
    5534             : 
    5535             : failed_irq:
    5536           0 :         pcp_trylock_finish(UP_flags);
    5537             : 
    5538             : failed:
    5539           0 :         page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
    5540           0 :         if (page) {
    5541           0 :                 if (page_list)
    5542           0 :                         list_add(&page->lru, page_list);
    5543             :                 else
    5544           0 :                         page_array[nr_populated] = page;
    5545           0 :                 nr_populated++;
    5546             :         }
    5547             : 
    5548             :         goto out;
    5549             : }
    5550             : EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
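/*
 * Editor's illustrative example (not part of page_alloc.c): a minimal sketch
 * of the array variant of the bulk API, via the alloc_pages_bulk_array()
 * wrapper from <linux/gfp.h>. The function name and error handling are
 * hypothetical; the key point is that the bulk allocator may return fewer
 * pages than requested, so callers must cope with a partial result.
 */
static int example_fill_page_array(struct page **pages, unsigned int want)
{
	/* Only NULL slots are filled; the return value counts populated ones. */
	unsigned long got = alloc_pages_bulk_array(GFP_KERNEL, want, pages);

	/* Fall back to single-page allocations for any remainder. */
	while (got < want) {
		pages[got] = alloc_page(GFP_KERNEL);
		if (!pages[got])
			return -ENOMEM;
		got++;
	}
	return 0;
}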
    5551             : 
    5552             : /*
    5553             :  * This is the 'heart' of the zoned buddy allocator.
    5554             :  */
    5555        2524 : struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
    5556             :                                                         nodemask_t *nodemask)
    5557             : {
    5558             :         struct page *page;
    5559        2524 :         unsigned int alloc_flags = ALLOC_WMARK_LOW;
    5560             :         gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
    5561        2524 :         struct alloc_context ac = { };
    5562             : 
    5563             :         /*
    5564             :          * There are several places where we assume that the order value is sane,
    5565             :          * so bail out early if the request is out of bounds.
    5566             :          */
    5567        2524 :         if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp))
    5568             :                 return NULL;
    5569             : 
    5570        2524 :         gfp &= gfp_allowed_mask;
    5571             :         /*
    5572             :          * Apply scoped allocation constraints.  This is mainly about GFP_NOFS
    5573             :          * and GFP_NOIO, which have to be inherited by all allocation requests
    5574             :          * from a particular context that has been marked by
    5575             :          * memalloc_no{fs,io}_{save,restore}, and about PF_MEMALLOC_PIN, which ensures
    5576             :          * movable zones are not used during allocation.
    5577             :          */
    5578        2524 :         gfp = current_gfp_context(gfp);
    5579        2524 :         alloc_gfp = gfp;
    5580        2524 :         if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
    5581             :                         &alloc_gfp, &alloc_flags))
    5582             :                 return NULL;
    5583             : 
    5584             :         /*
    5585             :          * Forbid the first pass from falling back to types that fragment
    5586             :          * memory until all local zones are considered.
    5587             :          */
    5588        5048 :         alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
    5589             : 
    5590             :         /* First allocation attempt */
    5591        2524 :         page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
    5592        2524 :         if (likely(page))
    5593             :                 goto out;
    5594             : 
    5595           0 :         alloc_gfp = gfp;
    5596           0 :         ac.spread_dirty_pages = false;
    5597             : 
    5598             :         /*
    5599             :          * Restore the original nodemask if it was potentially replaced with
    5600             :          * &cpuset_current_mems_allowed to optimize the fast-path attempt.
    5601             :          */
    5602           0 :         ac.nodemask = nodemask;
    5603             : 
    5604           0 :         page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
    5605             : 
    5606             : out:
    5607             :         if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
    5608             :             unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
    5609             :                 __free_pages(page, order);
    5610             :                 page = NULL;
    5611             :         }
    5612             : 
    5613        2524 :         trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
    5614        2524 :         kmsan_alloc_page(page, order, alloc_gfp);
    5615             : 
    5616        2524 :         return page;
    5617             : }
    5618             : EXPORT_SYMBOL(__alloc_pages);
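/*
 * Editor's illustrative example (not part of page_alloc.c): typical use of
 * the allocator through the alloc_pages() wrapper. The function name is
 * hypothetical; alloc_pages(), page_address() and __free_pages() are the
 * real interfaces.
 */
static struct page *example_get_contig_pages(unsigned int order)
{
	/* GFP_KERNEL may sleep; the order must stay below MAX_ORDER. */
	struct page *page = alloc_pages(GFP_KERNEL, order);

	/*
	 * For lowmem pages, page_address(page) yields the linear mapping of
	 * the whole 2^order range; release it later with
	 * __free_pages(page, order).
	 */
	return page;
}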
    5619             : 
    5620           0 : struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
    5621             :                 nodemask_t *nodemask)
    5622             : {
    5623           0 :         struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
    5624             :                         preferred_nid, nodemask);
    5625             : 
    5626             :         if (page && order > 1)
    5627             :                 prep_transhuge_page(page);
    5628           0 :         return (struct folio *)page;
    5629             : }
    5630             : EXPORT_SYMBOL(__folio_alloc);
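/*
 * Editor's illustrative example (not part of page_alloc.c): the folio
 * interface is normally reached through the folio_alloc() wrapper from
 * <linux/gfp.h>; as shown above, __GFP_COMP is added internally, and the
 * result is released with folio_put(). The function name is hypothetical.
 */
static struct folio *example_get_folio(unsigned int order)
{
	return folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
}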
    5631             : 
    5632             : /*
    5633             :  * Common helper functions. Never use with __GFP_HIGHMEM because the returned
    5634             :  * address cannot represent highmem pages. Use alloc_pages and then kmap if
    5635             :  * you need to access high mem.
    5636             :  */
    5637          20 : unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
    5638             : {
    5639             :         struct page *page;
    5640             : 
    5641          40 :         page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
    5642          20 :         if (!page)
    5643             :                 return 0;
    5644          20 :         return (unsigned long) page_address(page);
    5645             : }
    5646             : EXPORT_SYMBOL(__get_free_pages);
    5647             : 
    5648           0 : unsigned long get_zeroed_page(gfp_t gfp_mask)
    5649             : {
    5650           0 :         return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
    5651             : }
    5652             : EXPORT_SYMBOL(get_zeroed_page);
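/*
 * Editor's illustrative example (not part of page_alloc.c): the address-based
 * helpers pair __get_free_pages()/get_zeroed_page() with free_pages()/
 * free_page(). The function name is hypothetical.
 */
static unsigned long example_get_scratch_page(void)
{
	/* One zeroed lowmem page, usable directly as a kernel pointer. */
	unsigned long addr = get_zeroed_page(GFP_KERNEL);

	/* ... use (void *)addr ..., then release it with free_page(addr). */
	return addr;
}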
    5653             : 
    5654             : /**
    5655             :  * __free_pages - Free pages allocated with alloc_pages().
    5656             :  * @page: The page pointer returned from alloc_pages().
    5657             :  * @order: The order of the allocation.
    5658             :  *
    5659             :  * This function can free multi-page allocations that are not compound
    5660             :  * pages.  It does not check that the @order passed in matches that of
    5661             :  * the allocation, so it is easy to leak memory.  Freeing more memory
    5662             :  * than was allocated will probably emit a warning.
    5663             :  *
    5664             :  * If the last reference to this page is speculative, it will be released
    5665             :  * by put_page() which only frees the first page of a non-compound
    5666             :  * allocation.  To prevent the remaining pages from being leaked, we free
    5667             :  * the subsequent pages here.  If you want to use the page's reference
    5668             :  * count to decide when to free the allocation, you should allocate a
    5669             :  * compound page, and use put_page() instead of __free_pages().
    5670             :  *
    5671             :  * Context: May be called in interrupt context or while holding a normal
    5672             :  * spinlock, but not in NMI context or while holding a raw spinlock.
    5673             :  */
    5674       44539 : void __free_pages(struct page *page, unsigned int order)
    5675             : {
    5676             :         /* get PageHead before we drop reference */
    5677       44539 :         int head = PageHead(page);
    5678             : 
    5679       44539 :         if (put_page_testzero(page))
    5680       44539 :                 free_the_page(page, order);
    5681           0 :         else if (!head)
    5682           0 :                 while (order-- > 0)
    5683           0 :                         free_the_page(page + (1 << order), order);
    5684       44539 : }
    5685             : EXPORT_SYMBOL(__free_pages);
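/*
 * Editor's illustrative example (not part of page_alloc.c), expanding on the
 * kernel-doc above: when the page's reference count decides the lifetime,
 * allocate a compound page so put_page() releases the whole block. The
 * function name is hypothetical.
 */
static struct page *example_alloc_refcounted(unsigned int order)
{
	/* __GFP_COMP makes the 2^order block a single compound page ... */
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_COMP, order);

	/*
	 * ... so dropping the last reference with put_page() frees all of it,
	 * with no need to remember the order for __free_pages().
	 */
	return page;
}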
    5686             : 
    5687           0 : void free_pages(unsigned long addr, unsigned int order)
    5688             : {
    5689           0 :         if (addr != 0) {
    5690             :                 VM_BUG_ON(!virt_addr_valid((void *)addr));
    5691           0 :                 __free_pages(virt_to_page((void *)addr), order);
    5692             :         }
    5693           0 : }
    5694             : 
    5695             : EXPORT_SYMBOL(free_pages);
    5696             : 
    5697             : /*
    5698             :  * Page Fragment:
    5699             :  *  An arbitrary-length, arbitrary-offset area of memory which resides
    5700             :  *  within a 0 or higher order page.  Multiple fragments within that page
    5701             :  *  are individually refcounted, in the page's reference counter.
    5702             :  *
    5703             :  * The page_frag functions below provide a simple allocation framework for
    5704             :  * page fragments.  This is used by the network stack and network device
    5705             :  * drivers to provide a backing region of memory for use as either an
    5706             :  * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
    5707             :  */
    5708           0 : static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
    5709             :                                              gfp_t gfp_mask)
    5710             : {
    5711           0 :         struct page *page = NULL;
    5712           0 :         gfp_t gfp = gfp_mask;
    5713             : 
    5714             : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
    5715           0 :         gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
    5716             :                     __GFP_NOMEMALLOC;
    5717           0 :         page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
    5718           0 :                                 PAGE_FRAG_CACHE_MAX_ORDER);
    5719           0 :         nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
    5720             : #endif
    5721           0 :         if (unlikely(!page))
    5722           0 :                 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
    5723             : 
    5724           0 :         nc->va = page ? page_address(page) : NULL;
    5725             : 
    5726           0 :         return page;
    5727             : }
    5728             : 
    5729           0 : void __page_frag_cache_drain(struct page *page, unsigned int count)
    5730             : {
    5731             :         VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
    5732             : 
    5733           0 :         if (page_ref_sub_and_test(page, count))
    5734           0 :                 free_the_page(page, compound_order(page));
    5735           0 : }
    5736             : EXPORT_SYMBOL(__page_frag_cache_drain);
    5737             : 
    5738           0 : void *page_frag_alloc_align(struct page_frag_cache *nc,
    5739             :                       unsigned int fragsz, gfp_t gfp_mask,
    5740             :                       unsigned int align_mask)
    5741             : {
    5742           0 :         unsigned int size = PAGE_SIZE;
    5743             :         struct page *page;
    5744             :         int offset;
    5745             : 
    5746           0 :         if (unlikely(!nc->va)) {
    5747             : refill:
    5748           0 :                 page = __page_frag_cache_refill(nc, gfp_mask);
    5749           0 :                 if (!page)
    5750             :                         return NULL;
    5751             : 
    5752             : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
    5753             :                 /* if size can vary use size else just use PAGE_SIZE */
    5754           0 :                 size = nc->size;
    5755             : #endif
    5756             :                 /* Even if we own the page, we do not use atomic_set().
    5757             :                  * This would break get_page_unless_zero() users.
    5758             :                  */
    5759           0 :                 page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
    5760             : 
    5761             :                 /* reset page count bias and offset to start of new frag */
    5762           0 :                 nc->pfmemalloc = page_is_pfmemalloc(page);
    5763           0 :                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
    5764           0 :                 nc->offset = size;
    5765             :         }
    5766             : 
    5767           0 :         offset = nc->offset - fragsz;
    5768           0 :         if (unlikely(offset < 0)) {
    5769           0 :                 page = virt_to_page(nc->va);
    5770             : 
    5771           0 :                 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
    5772             :                         goto refill;
    5773             : 
    5774           0 :                 if (unlikely(nc->pfmemalloc)) {
    5775           0 :                         free_the_page(page, compound_order(page));
    5776           0 :                         goto refill;
    5777             :                 }
    5778             : 
    5779             : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
    5780             :                 /* if size can vary use size else just use PAGE_SIZE */
    5781           0 :                 size = nc->size;
    5782             : #endif
    5783             :                 /* OK, page count is 0, we can safely set it */
    5784           0 :                 set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
    5785             : 
    5786             :                 /* reset page count bias and offset to start of new frag */
    5787           0 :                 nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
    5788           0 :                 offset = size - fragsz;
    5789           0 :                 if (unlikely(offset < 0)) {
    5790             :                         /*
    5791             :                          * The caller is trying to allocate a fragment
    5792             :                          * with fragsz > PAGE_SIZE, but the cache isn't big
    5793             :                          * enough to satisfy the request; this may
    5794             :                          * happen in low-memory conditions.
    5795             :                          * We don't release the cache page because
    5796             :                          * it could make memory pressure worse
    5797             :                          * so we simply return NULL here.
    5798             :                          */
    5799             :                         return NULL;
    5800             :                 }
    5801             :         }
    5802             : 
    5803           0 :         nc->pagecnt_bias--;
    5804           0 :         offset &= align_mask;
    5805           0 :         nc->offset = offset;
    5806             : 
    5807           0 :         return nc->va + offset;
    5808             : }
    5809             : EXPORT_SYMBOL(page_frag_alloc_align);
    5810             : 
    5811             : /*
    5812             :  * Frees a page fragment allocated out of either a compound or order 0 page.
    5813             :  */
    5814           0 : void page_frag_free(void *addr)
    5815             : {
    5816           0 :         struct page *page = virt_to_head_page(addr);
    5817             : 
    5818           0 :         if (unlikely(put_page_testzero(page)))
    5819           0 :                 free_the_page(page, compound_order(page));
    5820           0 : }
    5821             : EXPORT_SYMBOL(page_frag_free);
    5822             : 
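/*
 * Usage sketch (illustrative only, not part of page_alloc.c): a typical
 * consumer keeps a per-CPU or per-device struct page_frag_cache and carves
 * small buffers out of it.  The cache name, fragment size and GFP flags
 * below are assumptions for demonstration; the file's usual headers are
 * assumed to be available.
 */
static struct page_frag_cache example_frag_cache;

static void *example_alloc_buffer(unsigned int len)
{
        /* ~(SMP_CACHE_BYTES - 1) keeps each fragment cache-line aligned */
        return page_frag_alloc_align(&example_frag_cache, len, GFP_ATOMIC,
                                     ~(SMP_CACHE_BYTES - 1));
}

static void example_free_buffer(void *buf)
{
        /* drops one reference on the backing (possibly compound) page */
        page_frag_free(buf);
}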
    5823           3 : static void *make_alloc_exact(unsigned long addr, unsigned int order,
    5824             :                 size_t size)
    5825             : {
    5826           3 :         if (addr) {
    5827           3 :                 unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
    5828           6 :                 struct page *page = virt_to_page((void *)addr);
    5829           3 :                 struct page *last = page + nr;
    5830             : 
    5831           3 :                 split_page_owner(page, 1 << order);
    5832           3 :                 split_page_memcg(page, 1 << order);
    5833          18 :                 while (page < --last)
    5834             :                         set_page_refcounted(last);
    5835             : 
    5836           3 :                 last = page + (1UL << order);
    5837           3 :                 for (page += nr; page < last; page++)
    5838           0 :                         __free_pages_ok(page, 0, FPI_TO_TAIL);
    5839             :         }
    5840           3 :         return (void *)addr;
    5841             : }
    5842             : 
    5843             : /**
    5844             :  * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
    5845             :  * @size: the number of bytes to allocate
    5846             :  * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
    5847             :  *
    5848             :  * This function is similar to alloc_pages(), except that it allocates the
    5849             :  * minimum number of pages to satisfy the request.  alloc_pages() can only
    5850             :  * allocate memory in power-of-two pages.
    5851             :  *
    5852             :  * This function is also limited by MAX_ORDER.
    5853             :  *
    5854             :  * Memory allocated by this function must be released by free_pages_exact().
    5855             :  *
    5856             :  * Return: pointer to the allocated area or %NULL in case of error.
    5857             :  */
    5858           3 : void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
    5859             : {
    5860           3 :         unsigned int order = get_order(size);
    5861             :         unsigned long addr;
    5862             : 
    5863           3 :         if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
    5864           0 :                 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
    5865             : 
    5866           3 :         addr = __get_free_pages(gfp_mask, order);
    5867           3 :         return make_alloc_exact(addr, order, size);
    5868             : }
    5869             : EXPORT_SYMBOL(alloc_pages_exact);
    5870             : 
    5871             : /**
    5872             :  * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
    5873             :  *                         pages on a node.
    5874             :  * @nid: the preferred node ID where memory should be allocated
    5875             :  * @size: the number of bytes to allocate
    5876             :  * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
    5877             :  *
    5878             :  * Like alloc_pages_exact(), but tries to allocate on node nid first before falling
    5879             :  * back.
    5880             :  *
    5881             :  * Return: pointer to the allocated area or %NULL in case of error.
    5882             :  */
    5883           0 : void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
    5884             : {
    5885           0 :         unsigned int order = get_order(size);
    5886             :         struct page *p;
    5887             : 
    5888           0 :         if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
    5889           0 :                 gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
    5890             : 
    5891           0 :         p = alloc_pages_node(nid, gfp_mask, order);
    5892           0 :         if (!p)
    5893             :                 return NULL;
    5894           0 :         return make_alloc_exact((unsigned long)page_address(p), order, size);
    5895             : }
    5896             : 
    5897             : /**
    5898             :  * free_pages_exact - release memory allocated via alloc_pages_exact()
    5899             :  * @virt: the value returned by alloc_pages_exact().
    5900             :  * @size: size of allocation, same value as passed to alloc_pages_exact().
    5901             :  *
    5902             :  * Release the memory allocated by a previous call to alloc_pages_exact().
    5903             :  */
    5904           0 : void free_pages_exact(void *virt, size_t size)
    5905             : {
    5906           0 :         unsigned long addr = (unsigned long)virt;
    5907           0 :         unsigned long end = addr + PAGE_ALIGN(size);
    5908             : 
    5909           0 :         while (addr < end) {
    5910           0 :                 free_page(addr);
    5911           0 :                 addr += PAGE_SIZE;
    5912             :         }
    5913           0 : }
    5914             : EXPORT_SYMBOL(free_pages_exact);
    5915             : 
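/*
 * Usage sketch (illustrative only, not part of page_alloc.c): a 20 KiB
 * physically contiguous, zeroed buffer.  Assuming 4 KiB pages, this keeps
 * 5 pages pinned rather than the 8 an order-3 alloc_pages() call would,
 * because make_alloc_exact() hands the tail of the order-3 block back to
 * the buddy allocator.  NUMA-aware callers can use alloc_pages_exact_nid()
 * the same way with a preferred node id.  Names and sizes here are
 * assumptions for demonstration.
 */
#define EXAMPLE_BUF_SIZE        (20 * 1024)

static void *example_buf;

static int example_init(void)
{
        example_buf = alloc_pages_exact(EXAMPLE_BUF_SIZE,
                                        GFP_KERNEL | __GFP_ZERO);
        if (!example_buf)
                return -ENOMEM;
        return 0;
}

static void example_exit(void)
{
        free_pages_exact(example_buf, EXAMPLE_BUF_SIZE);
}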
    5916             : /**
    5917             :  * nr_free_zone_pages - count number of pages beyond high watermark
    5918             :  * @offset: The zone index of the highest zone
    5919             :  *
    5920             :  * nr_free_zone_pages() counts the number of pages which are beyond the
    5921             :  * high watermark within all zones at or below a given zone index.  For each
    5922             :  * zone, the number of pages is calculated as:
    5923             :  *
    5924             :  *     nr_free_zone_pages = managed_pages - high_pages
    5925             :  *
    5926             :  * Return: number of pages beyond high watermark.
    5927             :  */
    5928           3 : static unsigned long nr_free_zone_pages(int offset)
    5929             : {
    5930             :         struct zoneref *z;
    5931             :         struct zone *zone;
    5932             : 
    5933             :         /* Just pick one node, since fallback list is circular */
    5934           3 :         unsigned long sum = 0;
    5935             : 
    5936           6 :         struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
    5937             : 
    5938          12 :         for_each_zone_zonelist(zone, z, zonelist, offset) {
    5939           3 :                 unsigned long size = zone_managed_pages(zone);
    5940           3 :                 unsigned long high = high_wmark_pages(zone);
    5941           3 :                 if (size > high)
    5942           3 :                         sum += size - high;
    5943             :         }
    5944             : 
    5945           3 :         return sum;
    5946             : }
    5947             : 
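/*
 * Worked example (hypothetical numbers): a zone with 1,000,000 managed
 * pages and a high watermark of 12,000 pages contributes
 * 1,000,000 - 12,000 = 988,000 pages to the sum; a zone whose high
 * watermark meets or exceeds its managed pages contributes nothing.
 */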
    5948             : /**
    5949             :  * nr_free_buffer_pages - count number of pages beyond high watermark
    5950             :  *
    5951             :  * nr_free_buffer_pages() counts the number of pages which are beyond the high
    5952             :  * watermark within ZONE_DMA and ZONE_NORMAL.
    5953             :  *
    5954             :  * Return: number of pages beyond high watermark within ZONE_DMA and
    5955             :  * ZONE_NORMAL.
    5956             :  */
    5957           1 : unsigned long nr_free_buffer_pages(void)
    5958             : {
    5959           2 :         return nr_free_zone_pages(gfp_zone(GFP_USER));
    5960             : }
    5961             : EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
    5962             : 
    5963             : static inline void show_node(struct zone *zone)
    5964             : {
    5965             :         if (IS_ENABLED(CONFIG_NUMA))
    5966             :                 printk("Node %d ", zone_to_nid(zone));
    5967             : }
    5968             : 
    5969           0 : long si_mem_available(void)
    5970             : {
    5971             :         long available;
    5972             :         unsigned long pagecache;
    5973           0 :         unsigned long wmark_low = 0;
    5974             :         unsigned long pages[NR_LRU_LISTS];
    5975             :         unsigned long reclaimable;
    5976             :         struct zone *zone;
    5977             :         int lru;
    5978             : 
    5979           0 :         for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
    5980           0 :                 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
    5981             : 
    5982           0 :         for_each_zone(zone)
    5983           0 :                 wmark_low += low_wmark_pages(zone);
    5984             : 
    5985             :         /*
    5986             :          * Estimate the amount of memory available for userspace allocations,
    5987             :          * without causing swapping or OOM.
    5988             :          */
    5989           0 :         available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
    5990             : 
    5991             :         /*
    5992             :          * Not all the page cache can be freed, otherwise the system will
    5993             :          * start swapping or thrashing. Assume at least half of the page
    5994             :          * cache, or the low watermark worth of cache, needs to stay.
    5995             :          */
    5996           0 :         pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
    5997           0 :         pagecache -= min(pagecache / 2, wmark_low);
    5998           0 :         available += pagecache;
    5999             : 
    6000             :         /*
    6001             :          * Part of the reclaimable slab and other kernel memory consists of
    6002             :          * items that are in use, and cannot be freed. Cap this estimate at the
    6003             :          * low watermark.
    6004             :          */
    6005           0 :         reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
    6006           0 :                 global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
    6007           0 :         available += reclaimable - min(reclaimable / 2, wmark_low);
    6008             : 
    6009           0 :         if (available < 0)
    6010           0 :                 available = 0;
    6011           0 :         return available;
    6012             : }
    6013             : EXPORT_SYMBOL_GPL(si_mem_available);
    6014             : 
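/*
 * Worked example (hypothetical numbers, 4 KiB pages): with 100,000 free
 * pages, totalreserve_pages = 20,000, 300,000 file-LRU pages, 50,000
 * reclaimable kernel pages and a summed low watermark of 40,000 pages:
 *
 *      pagecache  = 300,000 - min(150,000, 40,000) = 260,000
 *      available  = 100,000 - 20,000 + 260,000     = 340,000
 *      available += 50,000 - min(25,000, 40,000)   = 365,000
 *
 * i.e. roughly 1.4 GiB is reported; this estimate is what backs
 * MemAvailable in /proc/meminfo.
 */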
    6015           2 : void si_meminfo(struct sysinfo *val)
    6016             : {
    6017           2 :         val->totalram = totalram_pages();
    6018           2 :         val->sharedram = global_node_page_state(NR_SHMEM);
    6019           2 :         val->freeram = global_zone_page_state(NR_FREE_PAGES);
    6020           2 :         val->bufferram = nr_blockdev_pages();
    6021           2 :         val->totalhigh = totalhigh_pages();
    6022           2 :         val->freehigh = nr_free_highpages();
    6023           2 :         val->mem_unit = PAGE_SIZE;
    6024           2 : }
    6025             : 
    6026             : EXPORT_SYMBOL(si_meminfo);
    6027             : 
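/*
 * Usage sketch (illustrative only, not part of page_alloc.c): a caller
 * fills a struct sysinfo and converts the page counts to kilobytes using
 * mem_unit (PAGE_SIZE here).  The function name is an assumption.
 */
static void example_report_memory(void)
{
        struct sysinfo si;

        si_meminfo(&si);
        pr_info("totalram: %lu kB, freeram: %lu kB\n",
                si.totalram * (si.mem_unit >> 10),
                si.freeram * (si.mem_unit >> 10));
}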
    6028             : #ifdef CONFIG_NUMA
    6029             : void si_meminfo_node(struct sysinfo *val, int nid)
    6030             : {
    6031             :         int zone_type;          /* needs to be signed */
    6032             :         unsigned long managed_pages = 0;
    6033             :         unsigned long managed_highpages = 0;
    6034             :         unsigned long free_highpages = 0;
    6035             :         pg_data_t *pgdat = NODE_DATA(nid);
    6036             : 
    6037             :         for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
    6038             :                 managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
    6039             :         val->totalram = managed_pages;
    6040             :         val->sharedram = node_page_state(pgdat, NR_SHMEM);
    6041             :         val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
    6042             : #ifdef CONFIG_HIGHMEM
    6043             :         for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
    6044             :                 struct zone *zone = &pgdat->node_zones[zone_type];
    6045             : 
    6046             :                 if (is_highmem(zone)) {
    6047             :                         managed_highpages += zone_managed_pages(zone);
    6048             :                         free_highpages += zone_page_state(zone, NR_FREE_PAGES);
    6049             :                 }
    6050             :         }
    6051             :         val->totalhigh = managed_highpages;
    6052             :         val->freehigh = free_highpages;
    6053             : #else
    6054             :         val->totalhigh = managed_highpages;
    6055             :         val->freehigh = free_highpages;
    6056             : #endif
    6057             :         val->mem_unit = PAGE_SIZE;
    6058             : }
    6059             : #endif
    6060             : 
    6061             : /*
    6062             :  * Determine whether the node should be displayed or not, depending on whether
    6063             :  * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
    6064             :  */
    6065           0 : static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
    6066             : {
    6067           0 :         if (!(flags & SHOW_MEM_FILTER_NODES))
    6068             :                 return false;
    6069             : 
    6070             :         /*
    6071             :          * no node mask - aka implicit memory numa policy. Do not bother with
    6072             :          * the synchronization - read_mems_allowed_begin - because we do not
    6073             :          * have to be precise here.
    6074             :          */
    6075           0 :         if (!nodemask)
    6076           0 :                 nodemask = &cpuset_current_mems_allowed;
    6077             : 
    6078           0 :         return !node_isset(nid, *nodemask);
    6079             : }
    6080             : 
    6081             : #define K(x) ((x) << (PAGE_SHIFT-10))
    6082             : 
    6083           0 : static void show_migration_types(unsigned char type)
    6084             : {
    6085             :         static const char types[MIGRATE_TYPES] = {
    6086             :                 [MIGRATE_UNMOVABLE]     = 'U',
    6087             :                 [MIGRATE_MOVABLE]       = 'M',
    6088             :                 [MIGRATE_RECLAIMABLE]   = 'E',
    6089             :                 [MIGRATE_HIGHATOMIC]    = 'H',
    6090             : #ifdef CONFIG_CMA
    6091             :                 [MIGRATE_CMA]           = 'C',
    6092             : #endif
    6093             : #ifdef CONFIG_MEMORY_ISOLATION
    6094             :                 [MIGRATE_ISOLATE]       = 'I',
    6095             : #endif
    6096             :         };
    6097             :         char tmp[MIGRATE_TYPES + 1];
    6098           0 :         char *p = tmp;
    6099             :         int i;
    6100             : 
    6101           0 :         for (i = 0; i < MIGRATE_TYPES; i++) {
    6102           0 :                 if (type & (1 << i))
    6103           0 :                         *p++ = types[i];
    6104             :         }
    6105             : 
    6106           0 :         *p = '\0';
    6107           0 :         printk(KERN_CONT "(%s) ", tmp);
    6108           0 : }
    6109             : 
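/*
 * Decoding example (illustrative): if an order's free blocks span
 * MIGRATE_MOVABLE and MIGRATE_RECLAIMABLE pageblocks, the caller passes
 * (1 << MIGRATE_MOVABLE) | (1 << MIGRATE_RECLAIMABLE) and the legend
 * printed after the count is "(ME) ": M for movable, E for reclaimable.
 */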
    6110             : static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
    6111             : {
    6112             :         int zone_idx;
    6113           0 :         for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
    6114           0 :                 if (zone_managed_pages(pgdat->node_zones + zone_idx))
    6115             :                         return true;
    6116             :         return false;
    6117             : }
    6118             : 
    6119             : /*
    6120             :  * Show the free area list (historically hooked up to the Shift-ScrollLock
    6121             :  * show-memory key).  For each zone we also dump the per-order free page
    6122             :  * counts and which migration types still have free blocks at each order.
    6123             :  *
    6124             :  * Bits in @filter:
    6125             :  * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
    6126             :  *   cpuset.
    6127             :  */
    6128           0 : void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
    6129             : {
    6130           0 :         unsigned long free_pcp = 0;
    6131             :         int cpu, nid;
    6132             :         struct zone *zone;
    6133             :         pg_data_t *pgdat;
    6134             : 
    6135           0 :         for_each_populated_zone(zone) {
    6136           0 :                 if (zone_idx(zone) > max_zone_idx)
    6137           0 :                         continue;
    6138           0 :                 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
    6139           0 :                         continue;
    6140             : 
    6141           0 :                 for_each_online_cpu(cpu)
    6142           0 :                         free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
    6143             :         }
    6144             : 
    6145           0 :         printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
    6146             :                 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
    6147             :                 " unevictable:%lu dirty:%lu writeback:%lu\n"
    6148             :                 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
    6149             :                 " mapped:%lu shmem:%lu pagetables:%lu\n"
    6150             :                 " sec_pagetables:%lu bounce:%lu\n"
    6151             :                 " kernel_misc_reclaimable:%lu\n"
    6152             :                 " free:%lu free_pcp:%lu free_cma:%lu\n",
    6153             :                 global_node_page_state(NR_ACTIVE_ANON),
    6154             :                 global_node_page_state(NR_INACTIVE_ANON),
    6155             :                 global_node_page_state(NR_ISOLATED_ANON),
    6156             :                 global_node_page_state(NR_ACTIVE_FILE),
    6157             :                 global_node_page_state(NR_INACTIVE_FILE),
    6158             :                 global_node_page_state(NR_ISOLATED_FILE),
    6159             :                 global_node_page_state(NR_UNEVICTABLE),
    6160             :                 global_node_page_state(NR_FILE_DIRTY),
    6161             :                 global_node_page_state(NR_WRITEBACK),
    6162             :                 global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
    6163             :                 global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
    6164             :                 global_node_page_state(NR_FILE_MAPPED),
    6165             :                 global_node_page_state(NR_SHMEM),
    6166             :                 global_node_page_state(NR_PAGETABLE),
    6167             :                 global_node_page_state(NR_SECONDARY_PAGETABLE),
    6168             :                 global_zone_page_state(NR_BOUNCE),
    6169             :                 global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
    6170             :                 global_zone_page_state(NR_FREE_PAGES),
    6171             :                 free_pcp,
    6172             :                 global_zone_page_state(NR_FREE_CMA_PAGES));
    6173             : 
    6174           0 :         for_each_online_pgdat(pgdat) {
    6175           0 :                 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
    6176           0 :                         continue;
    6177           0 :                 if (!node_has_managed_zones(pgdat, max_zone_idx))
    6178           0 :                         continue;
    6179             : 
    6180           0 :                 printk("Node %d"
    6181             :                         " active_anon:%lukB"
    6182             :                         " inactive_anon:%lukB"
    6183             :                         " active_file:%lukB"
    6184             :                         " inactive_file:%lukB"
    6185             :                         " unevictable:%lukB"
    6186             :                         " isolated(anon):%lukB"
    6187             :                         " isolated(file):%lukB"
    6188             :                         " mapped:%lukB"
    6189             :                         " dirty:%lukB"
    6190             :                         " writeback:%lukB"
    6191             :                         " shmem:%lukB"
    6192             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    6193             :                         " shmem_thp: %lukB"
    6194             :                         " shmem_pmdmapped: %lukB"
    6195             :                         " anon_thp: %lukB"
    6196             : #endif
    6197             :                         " writeback_tmp:%lukB"
    6198             :                         " kernel_stack:%lukB"
    6199             : #ifdef CONFIG_SHADOW_CALL_STACK
    6200             :                         " shadow_call_stack:%lukB"
    6201             : #endif
    6202             :                         " pagetables:%lukB"
    6203             :                         " sec_pagetables:%lukB"
    6204             :                         " all_unreclaimable? %s"
    6205             :                         "\n",
    6206             :                         pgdat->node_id,
    6207             :                         K(node_page_state(pgdat, NR_ACTIVE_ANON)),
    6208             :                         K(node_page_state(pgdat, NR_INACTIVE_ANON)),
    6209             :                         K(node_page_state(pgdat, NR_ACTIVE_FILE)),
    6210             :                         K(node_page_state(pgdat, NR_INACTIVE_FILE)),
    6211             :                         K(node_page_state(pgdat, NR_UNEVICTABLE)),
    6212             :                         K(node_page_state(pgdat, NR_ISOLATED_ANON)),
    6213             :                         K(node_page_state(pgdat, NR_ISOLATED_FILE)),
    6214             :                         K(node_page_state(pgdat, NR_FILE_MAPPED)),
    6215             :                         K(node_page_state(pgdat, NR_FILE_DIRTY)),
    6216             :                         K(node_page_state(pgdat, NR_WRITEBACK)),
    6217             :                         K(node_page_state(pgdat, NR_SHMEM)),
    6218             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    6219             :                         K(node_page_state(pgdat, NR_SHMEM_THPS)),
    6220             :                         K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
    6221             :                         K(node_page_state(pgdat, NR_ANON_THPS)),
    6222             : #endif
    6223             :                         K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
    6224             :                         node_page_state(pgdat, NR_KERNEL_STACK_KB),
    6225             : #ifdef CONFIG_SHADOW_CALL_STACK
    6226             :                         node_page_state(pgdat, NR_KERNEL_SCS_KB),
    6227             : #endif
    6228             :                         K(node_page_state(pgdat, NR_PAGETABLE)),
    6229             :                         K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
    6230             :                         pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
    6231             :                                 "yes" : "no");
    6232             :         }
    6233             : 
    6234           0 :         for_each_populated_zone(zone) {
    6235             :                 int i;
    6236             : 
    6237           0 :                 if (zone_idx(zone) > max_zone_idx)
    6238           0 :                         continue;
    6239           0 :                 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
    6240           0 :                         continue;
    6241             : 
    6242             :                 free_pcp = 0;
    6243           0 :                 for_each_online_cpu(cpu)
    6244           0 :                         free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
    6245             : 
    6246           0 :                 show_node(zone);
    6247           0 :                 printk(KERN_CONT
    6248             :                         "%s"
    6249             :                         " free:%lukB"
    6250             :                         " boost:%lukB"
    6251             :                         " min:%lukB"
    6252             :                         " low:%lukB"
    6253             :                         " high:%lukB"
    6254             :                         " reserved_highatomic:%luKB"
    6255             :                         " active_anon:%lukB"
    6256             :                         " inactive_anon:%lukB"
    6257             :                         " active_file:%lukB"
    6258             :                         " inactive_file:%lukB"
    6259             :                         " unevictable:%lukB"
    6260             :                         " writepending:%lukB"
    6261             :                         " present:%lukB"
    6262             :                         " managed:%lukB"
    6263             :                         " mlocked:%lukB"
    6264             :                         " bounce:%lukB"
    6265             :                         " free_pcp:%lukB"
    6266             :                         " local_pcp:%ukB"
    6267             :                         " free_cma:%lukB"
    6268             :                         "\n",
    6269             :                         zone->name,
    6270             :                         K(zone_page_state(zone, NR_FREE_PAGES)),
    6271             :                         K(zone->watermark_boost),
    6272             :                         K(min_wmark_pages(zone)),
    6273             :                         K(low_wmark_pages(zone)),
    6274             :                         K(high_wmark_pages(zone)),
    6275             :                         K(zone->nr_reserved_highatomic),
    6276             :                         K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
    6277             :                         K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
    6278             :                         K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
    6279             :                         K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
    6280             :                         K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
    6281             :                         K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
    6282             :                         K(zone->present_pages),
    6283             :                         K(zone_managed_pages(zone)),
    6284             :                         K(zone_page_state(zone, NR_MLOCK)),
    6285             :                         K(zone_page_state(zone, NR_BOUNCE)),
    6286             :                         K(free_pcp),
    6287             :                         K(this_cpu_read(zone->per_cpu_pageset->count)),
    6288             :                         K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
    6289           0 :                 printk("lowmem_reserve[]:");
    6290           0 :                 for (i = 0; i < MAX_NR_ZONES; i++)
    6291           0 :                         printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
    6292           0 :                 printk(KERN_CONT "\n");
    6293             :         }
    6294             : 
    6295           0 :         for_each_populated_zone(zone) {
    6296             :                 unsigned int order;
    6297           0 :                 unsigned long nr[MAX_ORDER], flags, total = 0;
    6298             :                 unsigned char types[MAX_ORDER];
    6299             : 
    6300           0 :                 if (zone_idx(zone) > max_zone_idx)
    6301           0 :                         continue;
    6302           0 :                 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
    6303           0 :                         continue;
    6304           0 :                 show_node(zone);
    6305           0 :                 printk(KERN_CONT "%s: ", zone->name);
    6306             : 
    6307           0 :                 spin_lock_irqsave(&zone->lock, flags);
    6308           0 :                 for (order = 0; order < MAX_ORDER; order++) {
    6309           0 :                         struct free_area *area = &zone->free_area[order];
    6310             :                         int type;
    6311             : 
    6312           0 :                         nr[order] = area->nr_free;
    6313           0 :                         total += nr[order] << order;
    6314             : 
    6315           0 :                         types[order] = 0;
    6316           0 :                         for (type = 0; type < MIGRATE_TYPES; type++) {
    6317           0 :                                 if (!free_area_empty(area, type))
    6318           0 :                                         types[order] |= 1 << type;
    6319             :                         }
    6320             :                 }
    6321           0 :                 spin_unlock_irqrestore(&zone->lock, flags);
    6322           0 :                 for (order = 0; order < MAX_ORDER; order++) {
    6323           0 :                         printk(KERN_CONT "%lu*%lukB ",
    6324             :                                nr[order], K(1UL) << order);
    6325           0 :                         if (nr[order])
    6326           0 :                                 show_migration_types(types[order]);
    6327             :                 }
    6328           0 :                 printk(KERN_CONT "= %lukB\n", K(total));
    6329             :         }
    6330             : 
    6331           0 :         for_each_online_node(nid) {
    6332           0 :                 if (show_mem_node_skip(filter, nid, nodemask))
    6333             :                         continue;
    6334             :                 hugetlb_show_meminfo_node(nid);
    6335             :         }
    6336             : 
    6337           0 :         printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
    6338             : 
    6339           0 :         show_swap_cache_info();
    6340           0 : }
    6341             : 
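/*
 * Usage sketch (illustrative only, not part of page_alloc.c): dump the
 * free-area state for all zones up to and including ZONE_NORMAL, hiding
 * nodes outside the current cpuset, e.g. from an allocation-failure or
 * OOM report path.  The function name is an assumption.
 */
static void example_dump_free_areas(void)
{
        __show_free_areas(SHOW_MEM_FILTER_NODES, NULL, ZONE_NORMAL);
}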
    6342             : static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
    6343             : {
    6344           1 :         zoneref->zone = zone;
    6345           1 :         zoneref->zone_idx = zone_idx(zone);
    6346             : }
    6347             : 
    6348             : /*
    6349             :  * Builds allocation fallback zone lists.
    6350             :  *
    6351             :  * Add all populated zones of a node to the zonelist.
    6352             :  */
    6353             : static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
    6354             : {
    6355             :         struct zone *zone;
    6356           1 :         enum zone_type zone_type = MAX_NR_ZONES;
    6357           1 :         int nr_zones = 0;
    6358             : 
    6359             :         do {
    6360           2 :                 zone_type--;
    6361           2 :                 zone = pgdat->node_zones + zone_type;
    6362           2 :                 if (populated_zone(zone)) {
    6363           2 :                         zoneref_set_zone(zone, &zonerefs[nr_zones++]);
    6364           1 :                         check_highest_zone(zone_type);
    6365             :                 }
    6366           2 :         } while (zone_type);
    6367             : 
    6368             :         return nr_zones;
    6369             : }
    6370             : 
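/*
 * Ordering example (assumption: a node where only ZONE_DMA32 and
 * ZONE_NORMAL are populated).  Because build_zonerefs_node() walks zone
 * indices from highest to lowest, the zonerefs come out preferred-first:
 *
 *      zonerefs[0] -> ZONE_NORMAL
 *      zonerefs[1] -> ZONE_DMA32
 *
 * and the function returns 2.
 */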
    6371             : #ifdef CONFIG_NUMA
    6372             : 
    6373             : static int __parse_numa_zonelist_order(char *s)
    6374             : {
    6375             :         /*
    6376             :          * We used to support different zonelist modes but they turned
    6377             :          * out not to be useful. Keep the warning in place in case
    6378             :          * somebody still uses the command line parameter, so that we
    6379             :          * do not fail it silently.
    6380             :          */
    6381             :         if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
    6382             :                 pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
    6383             :                 return -EINVAL;
    6384             :         }
    6385             :         return 0;
    6386             : }
    6387             : 
    6388             : char numa_zonelist_order[] = "Node";
    6389             : 
    6390             : /*
    6391             :  * sysctl handler for numa_zonelist_order
    6392             :  */
    6393             : int numa_zonelist_order_handler(struct ctl_table *table, int write,
    6394             :                 void *buffer, size_t *length, loff_t *ppos)
    6395             : {
    6396             :         if (write)
    6397             :                 return __parse_numa_zonelist_order(buffer);
    6398             :         return proc_dostring(table, write, buffer, length, ppos);
    6399             : }
    6400             : 
    6401             : 
    6402             : static int node_load[MAX_NUMNODES];
    6403             : 
    6404             : /**
    6405             :  * find_next_best_node - find the next node that should appear in a given node's fallback list
    6406             :  * @node: node whose fallback list we're appending
    6407             :  * @used_node_mask: nodemask_t of already used nodes
    6408             :  *
    6409             :  * We use a number of factors to determine which is the next node that should
    6410             :  * appear on a given node's fallback list.  The node should not have appeared
    6411             :  * already in @node's fallback list, and it should be the next closest node
    6412             :  * according to the distance array (which contains arbitrary distance values
    6413             :  * from each node to each node in the system), and should also prefer nodes
    6414             :  * with no CPUs, since presumably they'll have very little allocation pressure
    6415             :  * on them otherwise.
    6416             :  *
    6417             :  * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
    6418             :  */
    6419             : int find_next_best_node(int node, nodemask_t *used_node_mask)
    6420             : {
    6421             :         int n, val;
    6422             :         int min_val = INT_MAX;
    6423             :         int best_node = NUMA_NO_NODE;
    6424             : 
    6425             :         /* Use the local node if we haven't already */
    6426             :         if (!node_isset(node, *used_node_mask)) {
    6427             :                 node_set(node, *used_node_mask);
    6428             :                 return node;
    6429             :         }
    6430             : 
    6431             :         for_each_node_state(n, N_MEMORY) {
    6432             : 
    6433             :                 /* Don't want a node to appear more than once */
    6434             :                 if (node_isset(n, *used_node_mask))
    6435             :                         continue;
    6436             : 
    6437             :                 /* Use the distance array to find the distance */
    6438             :                 val = node_distance(node, n);
    6439             : 
    6440             :                 /* Penalize nodes under us ("prefer the next node") */
    6441             :                 val += (n < node);
    6442             : 
    6443             :                 /* Give preference to headless and unused nodes */
    6444             :                 if (!cpumask_empty(cpumask_of_node(n)))
    6445             :                         val += PENALTY_FOR_NODE_WITH_CPUS;
    6446             : 
    6447             :                 /* Slight preference for less loaded node */
    6448             :                 val *= MAX_NUMNODES;
    6449             :                 val += node_load[n];
    6450             : 
    6451             :                 if (val < min_val) {
    6452             :                         min_val = val;
    6453             :                         best_node = n;
    6454             :                 }
    6455             :         }
    6456             : 
    6457             :         if (best_node >= 0)
    6458             :                 node_set(best_node, *used_node_mask);
    6459             : 
    6460             :         return best_node;
    6461             : }
    6462             : 
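/*
 * Worked example (hypothetical: MAX_NUMNODES = 64, node_load[] all zero,
 * PENALTY_FOR_NODE_WITH_CPUS assumed to be 1), building node 1's fallback
 * list with two remaining candidates, both at distance 20:
 *
 *      node 0 (has CPUs, below us): (20 + 1 + 1) * 64 + 0 = 1408
 *      node 2 (memory-only):        (20 + 0 + 0) * 64 + 0 = 1280
 *
 * so the headless node 2 is chosen before node 0 even though both are
 * equally distant.
 */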
    6463             : 
    6464             : /*
    6465             :  * Build zonelists ordered by node and zones within node.
    6466             :  * This results in maximum locality--normal zone overflows into local
    6467             :  * DMA zone, if any--but risks exhausting DMA zone.
    6468             :  */
    6469             : static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
    6470             :                 unsigned nr_nodes)
    6471             : {
    6472             :         struct zoneref *zonerefs;
    6473             :         int i;
    6474             : 
    6475             :         zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
    6476             : 
    6477             :         for (i = 0; i < nr_nodes; i++) {
    6478             :                 int nr_zones;
    6479             : 
    6480             :                 pg_data_t *node = NODE_DATA(node_order[i]);
    6481             : 
    6482             :                 nr_zones = build_zonerefs_node(node, zonerefs);
    6483             :                 zonerefs += nr_zones;
    6484             :         }
    6485             :         zonerefs->zone = NULL;
    6486             :         zonerefs->zone_idx = 0;
    6487             : }
    6488             : 
    6489             : /*
    6490             :  * Build gfp_thisnode zonelists
    6491             :  */
    6492             : static void build_thisnode_zonelists(pg_data_t *pgdat)
    6493             : {
    6494             :         struct zoneref *zonerefs;
    6495             :         int nr_zones;
    6496             : 
    6497             :         zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
    6498             :         nr_zones = build_zonerefs_node(pgdat, zonerefs);
    6499             :         zonerefs += nr_zones;
    6500             :         zonerefs->zone = NULL;
    6501             :         zonerefs->zone_idx = 0;
    6502             : }
    6503             : 
    6504             : /*
    6505             :  * Build this node's fallback and thisnode zonelists.  Nodes are ordered
    6506             :  * by find_next_best_node() (closest first, preferring memory-only nodes);
    6507             :  * within each node, zones are added from highest to lowest, so Normal
    6508             :  * allocations can still overflow into a local DMA zone before going remote.
    6509             :  */
    6510             : 
    6511             : static void build_zonelists(pg_data_t *pgdat)
    6512             : {
    6513             :         static int node_order[MAX_NUMNODES];
    6514             :         int node, nr_nodes = 0;
    6515             :         nodemask_t used_mask = NODE_MASK_NONE;
    6516             :         int local_node, prev_node;
    6517             : 
    6518             :         /* NUMA-aware ordering of nodes */
    6519             :         local_node = pgdat->node_id;
    6520             :         prev_node = local_node;
    6521             : 
    6522             :         memset(node_order, 0, sizeof(node_order));
    6523             :         while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
    6524             :                 /*
    6525             :                  * We don't want to pressure a particular node.
    6526             :                  * So adding penalty to the first node in same
    6527             :                  * distance group to make it round-robin.
    6528             :                  */
    6529             :                 if (node_distance(local_node, node) !=
    6530             :                     node_distance(local_node, prev_node))
    6531             :                         node_load[node] += 1;
    6532             : 
    6533             :                 node_order[nr_nodes++] = node;
    6534             :                 prev_node = node;
    6535             :         }
    6536             : 
    6537             :         build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
    6538             :         build_thisnode_zonelists(pgdat);
    6539             :         pr_info("Fallback order for Node %d: ", local_node);
    6540             :         for (node = 0; node < nr_nodes; node++)
    6541             :                 pr_cont("%d ", node_order[node]);
    6542             :         pr_cont("\n");
    6543             : }
    6544             : 
    6545             : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
    6546             : /*
    6547             :  * Return node id of node used for "local" allocations.
    6548             :  * I.e., first node id of first zone in arg node's generic zonelist.
    6549             :  * Used for initializing percpu 'numa_mem', which is used primarily
    6550             :  * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
    6551             :  */
    6552             : int local_memory_node(int node)
    6553             : {
    6554             :         struct zoneref *z;
    6555             : 
    6556             :         z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
    6557             :                                    gfp_zone(GFP_KERNEL),
    6558             :                                    NULL);
    6559             :         return zone_to_nid(z->zone);
    6560             : }
    6561             : #endif
    6562             : 
    6563             : static void setup_min_unmapped_ratio(void);
    6564             : static void setup_min_slab_ratio(void);
    6565             : #else   /* CONFIG_NUMA */
    6566             : 
    6567           1 : static void build_zonelists(pg_data_t *pgdat)
    6568             : {
    6569             :         int node, local_node;
    6570             :         struct zoneref *zonerefs;
    6571             :         int nr_zones;
    6572             : 
    6573           1 :         local_node = pgdat->node_id;
    6574             : 
    6575           1 :         zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
    6576           1 :         nr_zones = build_zonerefs_node(pgdat, zonerefs);
    6577           1 :         zonerefs += nr_zones;
    6578             : 
    6579             :         /*
    6580             :          * Now we build the zonelist so that it contains the zones
    6581             :          * of all the other nodes.
    6582             :          * We don't want to pressure a particular node, so when
    6583             :          * building the zones for node N, we make sure that the
    6584             :          * zones coming right after the local ones are those from
    6585             :          * node N+1, wrapping around to node 0 after the last node.
    6586             :          */
    6587           1 :         for (node = local_node + 1; node < MAX_NUMNODES; node++) {
    6588           0 :                 if (!node_online(node))
    6589           0 :                         continue;
    6590           0 :                 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
    6591           0 :                 zonerefs += nr_zones;
    6592             :         }
    6593           0 :         for (node = 0; node < local_node; node++) {
    6594           0 :                 if (!node_online(node))
    6595           0 :                         continue;
    6596           0 :                 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
    6597           0 :                 zonerefs += nr_zones;
    6598             :         }
    6599             : 
    6600           1 :         zonerefs->zone = NULL;
    6601           1 :         zonerefs->zone_idx = 0;
    6602           1 : }
    6603             : 
    6604             : #endif  /* CONFIG_NUMA */
    6605             : 
    6606             : /*
    6607             :  * Boot pageset table. One per cpu which is going to be used for all
    6608             :  * zones and all nodes. The parameters will be set in such a way
    6609             :  * that an item put on a list will immediately be handed over to
    6610             :  * the buddy list. This is safe since pageset manipulation is done
    6611             :  * with interrupts disabled.
    6612             :  *
    6613             :  * The boot_pagesets must be kept even after bootup is complete for
    6614             :  * unused processors and/or zones. They do play a role for bootstrapping
    6615             :  * hotplugged processors.
    6616             :  *
    6617             :  * zoneinfo_show() and maybe other functions do
    6618             :  * not check if the processor is online before following the pageset pointer.
    6619             :  * Other parts of the kernel may not check if the zone is available.
    6620             :  */
    6621             : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
    6622             : /* These effectively disable the pcplists in the boot pageset completely */
    6623             : #define BOOT_PAGESET_HIGH       0
    6624             : #define BOOT_PAGESET_BATCH      1
    6625             : static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
    6626             : static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
    6627             : static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
    6628             : 
    6629           1 : static void __build_all_zonelists(void *data)
    6630             : {
    6631             :         int nid;
    6632             :         int __maybe_unused cpu;
    6633           1 :         pg_data_t *self = data;
    6634             : 
    6635           1 :         write_seqlock(&zonelist_update_seq);
    6636             : 
    6637             : #ifdef CONFIG_NUMA
    6638             :         memset(node_load, 0, sizeof(node_load));
    6639             : #endif
    6640             : 
    6641             :         /*
    6642             :          * This node is hotadded and no memory is yet present.   So just
    6643             :          * building zonelists is fine - no need to touch other nodes.
    6644             :          */
    6645           1 :         if (self && !node_online(self->node_id)) {
    6646           0 :                 build_zonelists(self);
    6647             :         } else {
    6648             :                 /*
    6649             :                  * All possible nodes have pgdat preallocated
    6650             :                  * in free_area_init
    6651             :                  */
    6652           1 :                 for_each_node(nid) {
    6653           1 :                         pg_data_t *pgdat = NODE_DATA(nid);
    6654             : 
    6655           1 :                         build_zonelists(pgdat);
    6656             :                 }
    6657             : 
    6658             : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
    6659             :                 /*
    6660             :                  * We now know the "local memory node" for each node--
    6661             :                  * i.e., the node of the first zone in the generic zonelist.
    6662             :                  * Set up numa_mem percpu variable for on-line cpus.  During
    6663             :                  * boot, only the boot cpu should be on-line;  we'll init the
    6664             :                  * secondary cpus' numa_mem as they come on-line.  During
    6665             :                  * node/memory hotplug, we'll fixup all on-line cpus.
    6666             :                  */
    6667             :                 for_each_online_cpu(cpu)
    6668             :                         set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
    6669             : #endif
    6670             :         }
    6671             : 
    6672           1 :         write_sequnlock(&zonelist_update_seq);
    6673           1 : }
    6674             : 
    6675             : static noinline void __init
    6676           1 : build_all_zonelists_init(void)
    6677             : {
    6678             :         int cpu;
    6679             : 
    6680           1 :         __build_all_zonelists(NULL);
    6681             : 
    6682             :         /*
    6683             :          * Initialize the boot_pagesets that are going to be used
    6684             :          * for bootstrapping processors. The real pagesets for
    6685             :          * each zone will be allocated later when the per cpu
    6686             :          * allocator is available.
    6687             :          *
    6688             :          * boot_pagesets are used also for bootstrapping offline
    6689             :          * cpus if the system is already booted because the pagesets
    6690             :          * are needed to initialize allocators on a specific cpu too.
    6691             :          * F.e. the percpu allocator needs the page allocator which
    6692             :          * needs the percpu allocator in order to allocate its pagesets
    6693             :          * (a chicken-egg dilemma).
    6694             :          */
    6695           2 :         for_each_possible_cpu(cpu)
    6696           1 :                 per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
    6697             : 
    6698           1 :         mminit_verify_zonelist();
    6699             :         cpuset_init_current_mems_allowed();
    6700           1 : }
    6701             : 
    6702             : /*
    6703             :  * unless system_state == SYSTEM_BOOTING.
    6704             :  *
    6705             :  * __ref due to call of __init annotated helper build_all_zonelists_init
    6706             :  * [protected by SYSTEM_BOOTING].
    6707             :  */
    6708           1 : void __ref build_all_zonelists(pg_data_t *pgdat)
    6709             : {
    6710             :         unsigned long vm_total_pages;
    6711             : 
    6712           1 :         if (system_state == SYSTEM_BOOTING) {
    6713           1 :                 build_all_zonelists_init();
    6714             :         } else {
    6715           0 :                 __build_all_zonelists(pgdat);
    6716             :                 /* cpuset refresh routine should be here */
    6717             :         }
    6718             :         /* Get the number of free pages beyond high watermark in all zones. */
    6719           1 :         vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
    6720             :         /*
    6721             :          * Disable grouping by mobility if the number of pages in the
    6722             :          * system is too low to allow the mechanism to work. It would be
    6723             :          * more accurate, but expensive to check per-zone. This check is
    6724             :          * made on memory-hotadd so a system can start with mobility
    6725             :          * disabled and enable it later
    6726             :          */
    6727           1 :         if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
    6728           0 :                 page_group_by_mobility_disabled = 1;
    6729             :         else
    6730           1 :                 page_group_by_mobility_disabled = 0;
    6731             : 
    6732           1 :         pr_info("Built %u zonelists, mobility grouping %s.  Total pages: %ld\n",
    6733             :                 nr_online_nodes,
    6734             :                 page_group_by_mobility_disabled ? "off" : "on",
    6735             :                 vm_total_pages);
    6736             : #ifdef CONFIG_NUMA
    6737             :         pr_info("Policy zone: %s\n", zone_names[policy_zone]);
    6738             : #endif
    6739           1 : }
    6740             : 
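/*
 * Worked example (assumptions: 4 KiB pages, 2 MiB pageblocks so
 * pageblock_nr_pages = 512, and MIGRATE_TYPES = 6): mobility grouping is
 * disabled only when fewer than 512 * 6 = 3072 pages (12 MiB) sit above
 * the high watermarks, i.e. only on very small systems.
 */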
    6741             : /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
    6742             : static bool __meminit
    6743      270239 : overlap_memmap_init(unsigned long zone, unsigned long *pfn)
    6744             : {
    6745             :         static struct memblock_region *r;
    6746             : 
    6747      270239 :         if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
    6748           0 :                 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
    6749           0 :                         for_each_mem_region(r) {
    6750           0 :                                 if (*pfn < memblock_region_memory_end_pfn(r))
    6751             :                                         break;
    6752             :                         }
    6753             :                 }
    6754           0 :                 if (*pfn >= memblock_region_memory_base_pfn(r) &&
    6755           0 :                     memblock_is_mirror(r)) {
    6756           0 :                         *pfn = memblock_region_memory_end_pfn(r);
    6757           0 :                         return true;
    6758             :                 }
    6759             :         }
    6760             :         return false;
    6761             : }
    6762             : 
    6763             : /*
    6764             :  * Initially all pages are reserved - free ones are freed
    6765             :  * up by memblock_free_all() once the early boot process is
    6766             :  * done. Non-atomic initialization, single-pass.
    6767             :  *
    6768             :  * All aligned pageblocks are initialized to the specified migratetype
    6769             :  * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
    6770             :  * zone stats (e.g., nr_isolate_pageblock) are touched.
    6771             :  */
    6772           1 : void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
    6773             :                 unsigned long start_pfn, unsigned long zone_end_pfn,
    6774             :                 enum meminit_context context,
    6775             :                 struct vmem_altmap *altmap, int migratetype)
    6776             : {
    6777           1 :         unsigned long pfn, end_pfn = start_pfn + size;
    6778             :         struct page *page;
    6779             : 
    6780           1 :         if (highest_memmap_pfn < end_pfn - 1)
    6781           1 :                 highest_memmap_pfn = end_pfn - 1;
    6782             : 
    6783             : #ifdef CONFIG_ZONE_DEVICE
    6784             :         /*
    6785             :          * Honor reservation requested by the driver for this ZONE_DEVICE
    6786             :          * memory. We limit the total number of pages to initialize to just
    6787             :          * those that might contain the memory mapping. We will defer the
    6788             :          * ZONE_DEVICE page initialization until after we have released
    6789             :          * the hotplug lock.
    6790             :          */
    6791             :         if (zone == ZONE_DEVICE) {
    6792             :                 if (!altmap)
    6793             :                         return;
    6794             : 
    6795             :                 if (start_pfn == altmap->base_pfn)
    6796             :                         start_pfn += altmap->reserve;
    6797             :                 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
    6798             :         }
    6799             : #endif
    6800             : 
    6801      270241 :         for (pfn = start_pfn; pfn < end_pfn; ) {
    6802             :                 /*
    6803             :                  * There can be holes in boot-time mem_map[]s handed to this
    6804             :                  * function.  They do not exist on hotplugged memory.
    6805             :                  */
    6806      270239 :                 if (context == MEMINIT_EARLY) {
    6807      270239 :                         if (overlap_memmap_init(zone, &pfn))
    6808           0 :                                 continue;
    6809             :                         if (defer_init(nid, pfn, zone_end_pfn)) {
    6810             :                                 deferred_struct_pages = true;
    6811             :                                 break;
    6812             :                         }
    6813             :                 }
    6814             : 
    6815      270239 :                 page = pfn_to_page(pfn);
    6816      270239 :                 __init_single_page(page, pfn, zone, nid);
    6817      270239 :                 if (context == MEMINIT_HOTPLUG)
    6818             :                         __SetPageReserved(page);
    6819             : 
    6820             :                 /*
    6821             :                  * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
    6822             :                  * such that unmovable allocations won't be scattered all
    6823             :                  * over the place during system boot.
    6824             :                  */
    6825      270239 :                 if (pageblock_aligned(pfn)) {
    6826         264 :                         set_pageblock_migratetype(page, migratetype);
    6827         264 :                         cond_resched();
    6828             :                 }
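                     :                 /*
                     :                  * Editor's note, not part of the kernel source:
                     :                  * pageblock_aligned() is true once every pageblock_nr_pages
                     :                  * pfns, so the migratetype is set once per pageblock rather
                     :                  * than once per page. The hit counts above (264 aligned pfns
                     :                  * out of 270239 initialised) are consistent with
                     :                  * pageblock_nr_pages == 1024 on this test configuration.
                     :                  */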
    6829      270239 :                 pfn++;
    6830             :         }
    6831           1 : }
    6832             : 
    6833             : #ifdef CONFIG_ZONE_DEVICE
    6834             : static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
    6835             :                                           unsigned long zone_idx, int nid,
    6836             :                                           struct dev_pagemap *pgmap)
    6837             : {
    6838             : 
    6839             :         __init_single_page(page, pfn, zone_idx, nid);
    6840             : 
    6841             :         /*
    6842             :          * Mark the page reserved, as it needs to wait for the onlining
    6843             :          * phase before being fully associated with a zone.
    6844             :          *
    6845             :          * We can use the non-atomic __set_bit operation for setting
    6846             :          * the flag as we are still initializing the pages.
    6847             :          */
    6848             :         __SetPageReserved(page);
    6849             : 
    6850             :         /*
    6851             :          * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
    6852             :          * and zone_device_data.  It is a bug if a ZONE_DEVICE page is
    6853             :          * ever freed or placed on a driver-private list.
    6854             :          */
    6855             :         page->pgmap = pgmap;
    6856             :         page->zone_device_data = NULL;
    6857             : 
    6858             :         /*
    6859             :          * Mark the block movable so that blocks are reserved for
    6860             :          * movable at startup. This will force kernel allocations
    6861             :          * to reserve their blocks rather than leaking throughout
    6862             :          * the address space during boot when many long-lived
    6863             :          * kernel allocations are made.
    6864             :          *
    6865             :          * Please note that the MEMINIT_HOTPLUG path doesn't clear the
    6866             :          * memmap because this is done early in section_activate().
    6867             :          */
    6868             :         if (pageblock_aligned(pfn)) {
    6869             :                 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
    6870             :                 cond_resched();
    6871             :         }
    6872             : 
    6873             :         /*
    6874             :          * ZONE_DEVICE pages are released directly to the driver page allocator
    6875             :          * which will set the page count to 1 when allocating the page.
    6876             :          */
    6877             :         if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
    6878             :             pgmap->type == MEMORY_DEVICE_COHERENT)
    6879             :                 set_page_count(page, 0);
    6880             : }
    6881             : 
    6882             : /*
    6883             :  * With compound page geometry and when struct pages are stored in RAM,
    6884             :  * most tail pages are reused. Consequently, the number of unique struct
    6885             :  * pages to initialize is a lot smaller than the total number of struct
    6886             :  * pages being mapped. This is a paired / mild layering violation with
    6887             :  * explicit knowledge of how the sparse_vmemmap internals handle compound
    6888             :  * pages in the absence of an altmap. See vmemmap_populate_compound_pages().
    6889             :  */
    6890             : static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
    6891             :                                               unsigned long nr_pages)
    6892             : {
    6893             :         return is_power_of_2(sizeof(struct page)) &&
    6894             :                 !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
    6895             : }
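                     : /*
                     :  * Editor's note, not part of the kernel source: with 4 KiB pages and a
                     :  * 64-byte struct page, the expression above evaluates to
                     :  * 2 * (4096 / 64) == 128 when no altmap is used, i.e. only the first 128
                     :  * struct pages of each device compound page are unique and need
                     :  * initialisation; the remaining tail pages share vmemmap pages.
                     :  */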
    6896             : 
    6897             : static void __ref memmap_init_compound(struct page *head,
    6898             :                                        unsigned long head_pfn,
    6899             :                                        unsigned long zone_idx, int nid,
    6900             :                                        struct dev_pagemap *pgmap,
    6901             :                                        unsigned long nr_pages)
    6902             : {
    6903             :         unsigned long pfn, end_pfn = head_pfn + nr_pages;
    6904             :         unsigned int order = pgmap->vmemmap_shift;
    6905             : 
    6906             :         __SetPageHead(head);
    6907             :         for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
    6908             :                 struct page *page = pfn_to_page(pfn);
    6909             : 
    6910             :                 __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
    6911             :                 prep_compound_tail(head, pfn - head_pfn);
    6912             :                 set_page_count(page, 0);
    6913             : 
    6914             :                 /*
    6915             :                  * The first tail page stores important compound page info.
    6916             :                  * Call prep_compound_head() after the first tail page has
    6917             :                  * been initialized, to not have the data overwritten.
    6918             :                  */
    6919             :                 if (pfn == head_pfn + 1)
    6920             :                         prep_compound_head(head, order);
    6921             :         }
    6922             : }
    6923             : 
    6924             : void __ref memmap_init_zone_device(struct zone *zone,
    6925             :                                    unsigned long start_pfn,
    6926             :                                    unsigned long nr_pages,
    6927             :                                    struct dev_pagemap *pgmap)
    6928             : {
    6929             :         unsigned long pfn, end_pfn = start_pfn + nr_pages;
    6930             :         struct pglist_data *pgdat = zone->zone_pgdat;
    6931             :         struct vmem_altmap *altmap = pgmap_altmap(pgmap);
    6932             :         unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
    6933             :         unsigned long zone_idx = zone_idx(zone);
    6934             :         unsigned long start = jiffies;
    6935             :         int nid = pgdat->node_id;
    6936             : 
    6937             :         if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
    6938             :                 return;
    6939             : 
    6940             :         /*
    6941             :          * The call to memmap_init should have already taken care
    6942             :          * of the pages reserved for the memmap, so we can just jump to
    6943             :          * the end of that region and start processing the device pages.
    6944             :          */
    6945             :         if (altmap) {
    6946             :                 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
    6947             :                 nr_pages = end_pfn - start_pfn;
    6948             :         }
    6949             : 
    6950             :         for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
    6951             :                 struct page *page = pfn_to_page(pfn);
    6952             : 
    6953             :                 __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
    6954             : 
    6955             :                 if (pfns_per_compound == 1)
    6956             :                         continue;
    6957             : 
    6958             :                 memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
    6959             :                                      compound_nr_pages(altmap, pfns_per_compound));
    6960             :         }
    6961             : 
    6962             :         pr_info("%s initialised %lu pages in %ums\n", __func__,
    6963             :                 nr_pages, jiffies_to_msecs(jiffies - start));
    6964             : }
    6965             : 
    6966             : #endif
    6967           1 : static void __meminit zone_init_free_lists(struct zone *zone)
    6968             : {
    6969             :         unsigned int order, t;
    6970          45 :         for_each_migratetype_order(order, t) {
    6971          88 :                 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
    6972          44 :                 zone->free_area[order].nr_free = 0;
    6973             :         }
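                     :         /*
                     :          * Editor's note, not part of the kernel source: the loop above
                     :          * touches MAX_ORDER * MIGRATE_TYPES free lists. The 44 lists
                     :          * initialised here would correspond to 11 orders * 4 migratetypes
                     :          * on a minimal configuration without CMA or memory isolation.
                     :          */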
    6974           1 : }
    6975             : 
    6976             : /*
    6977             :  * Only struct pages that correspond to ranges defined by memblock.memory
    6978             :  * are zeroed and initialized by going through __init_single_page() during
    6979             :  * memmap_init_zone_range().
    6980             :  *
    6981             :  * But, there could be struct pages that correspond to holes in
    6982             :  * memblock.memory. This can happen because of the following reasons:
    6983             :  * - physical memory bank size is not necessarily an exact multiple of the
    6984             :  *   arbitrary section size
    6985             :  * - early reserved memory may not be listed in memblock.memory
    6986             :  * - memory layouts defined with memmap= kernel parameter may not align
    6987             :  *   nicely with memmap sections
    6988             :  *
    6989             :  * Explicitly initialize those struct pages so that:
    6990             :  * - PG_reserved is set
    6991             :  * - zone and node links point to zone and node that span the page if the
    6992             :  *   hole is in the middle of a zone
    6993             :  * - zone and node links point to adjacent zone/node if the hole falls on
    6994             :  *   the zone boundary; the pages in such holes will be prepended to the
    6995             :  *   zone/node above the hole except for the trailing pages in the last
    6996             :  *   section that will be appended to the zone/node below.
    6997             :  */
    6998           1 : static void __init init_unavailable_range(unsigned long spfn,
    6999             :                                           unsigned long epfn,
    7000             :                                           int zone, int node)
    7001             : {
    7002             :         unsigned long pfn;
    7003           1 :         u64 pgcnt = 0;
    7004             : 
    7005           1 :         for (pfn = spfn; pfn < epfn; pfn++) {
    7006           0 :                 if (!pfn_valid(pageblock_start_pfn(pfn))) {
    7007           0 :                         pfn = pageblock_end_pfn(pfn) - 1;
    7008           0 :                         continue;
    7009             :                 }
    7010           0 :                 __init_single_page(pfn_to_page(pfn), pfn, zone, node);
    7011           0 :                 __SetPageReserved(pfn_to_page(pfn));
    7012           0 :                 pgcnt++;
    7013             :         }
    7014             : 
    7015           1 :         if (pgcnt)
    7016           0 :                 pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
    7017             :                         node, zone_names[zone], pgcnt);
    7018           1 : }
    7019             : 
    7020           1 : static void __init memmap_init_zone_range(struct zone *zone,
    7021             :                                           unsigned long start_pfn,
    7022             :                                           unsigned long end_pfn,
    7023             :                                           unsigned long *hole_pfn)
    7024             : {
    7025           1 :         unsigned long zone_start_pfn = zone->zone_start_pfn;
    7026           1 :         unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
    7027           1 :         int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
    7028             : 
    7029           1 :         start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
    7030           1 :         end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
    7031             : 
    7032           1 :         if (start_pfn >= end_pfn)
    7033             :                 return;
    7034             : 
    7035           1 :         memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
    7036             :                           zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
    7037             : 
    7038           1 :         if (*hole_pfn < start_pfn)
    7039           0 :                 init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
    7040             : 
    7041           1 :         *hole_pfn = end_pfn;
    7042             : }
    7043             : 
    7044           1 : static void __init memmap_init(void)
    7045             : {
    7046             :         unsigned long start_pfn, end_pfn;
    7047           1 :         unsigned long hole_pfn = 0;
    7048           1 :         int i, j, zone_id = 0, nid;
    7049             : 
    7050           2 :         for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
    7051             :                 struct pglist_data *node = NODE_DATA(nid);
    7052             : 
    7053           2 :                 for (j = 0; j < MAX_NR_ZONES; j++) {
    7054           2 :                         struct zone *zone = node->node_zones + j;
    7055             : 
    7056           2 :                         if (!populated_zone(zone))
    7057           1 :                                 continue;
    7058             : 
    7059           1 :                         memmap_init_zone_range(zone, start_pfn, end_pfn,
    7060             :                                                &hole_pfn);
    7061           1 :                         zone_id = j;
    7062             :                 }
    7063             :         }
    7064             : 
    7065             : #ifdef CONFIG_SPARSEMEM
    7066             :         /*
    7067             :          * Initialize the memory map for the hole in the range [memory_end,
    7068             :          * section_end].
    7069             :          * Append the pages in this hole to the highest zone in the last
    7070             :          * node.
    7071             :          * The call to init_unavailable_range() is outside the ifdef to
    7072             :          * silence the compiler warning about zone_id set but not used;
    7073             :          * for FLATMEM it is a nop anyway.
    7074             :          */
    7075             :         end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
    7076             :         if (hole_pfn < end_pfn)
    7077             : #endif
    7078           1 :                 init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
    7079           1 : }
    7080             : 
    7081           1 : void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
    7082             :                           phys_addr_t min_addr, int nid, bool exact_nid)
    7083             : {
    7084             :         void *ptr;
    7085             : 
    7086           1 :         if (exact_nid)
    7087           0 :                 ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
    7088             :                                                    MEMBLOCK_ALLOC_ACCESSIBLE,
    7089             :                                                    nid);
    7090             :         else
    7091           1 :                 ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
    7092             :                                                  MEMBLOCK_ALLOC_ACCESSIBLE,
    7093             :                                                  nid);
    7094             : 
    7095             :         if (ptr && size > 0)
    7096             :                 page_init_poison(ptr, size);
    7097             : 
    7098           1 :         return ptr;
    7099             : }
    7100             : 
    7101           3 : static int zone_batchsize(struct zone *zone)
    7102             : {
    7103             : #ifdef CONFIG_MMU
    7104             :         int batch;
    7105             : 
    7106             :         /*
    7107             :          * The number of pages to batch allocate is either ~0.1%
    7108             :          * of the zone or 1MB, whichever is smaller. The batch
    7109             :          * size strikes a balance between allocation latency
    7110             :          * and zone lock contention.
    7111             :          */
    7112           3 :         batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
    7113           3 :         batch /= 4;             /* We effectively *= 4 below */
    7114           3 :         if (batch < 1)
    7115           1 :                 batch = 1;
    7116             : 
    7117             :         /*
    7118             :          * Clamp the batch to a 2^n - 1 value. Having a power
    7119             :          * of 2 value was found to be more likely to have
    7120             :          * suboptimal cache aliasing properties in some cases.
    7121             :          *
    7122             :          * For example if 2 tasks are alternately allocating
    7123             :          * batches of pages, one task can end up with a lot
    7124             :          * of pages of one half of the possible page colors
    7125             :          * and the other with pages of the other colors.
    7126             :          */
    7127           5 :         batch = rounddown_pow_of_two(batch + batch/2) - 1;
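                     :         /*
                     :          * Editor's note, not part of the kernel source: worked example,
                     :          * assuming 4 KiB pages and a zone with 1 GiB of managed memory
                     :          * (262144 pages):
                     :          *
                     :          *   min(262144 >> 10, SZ_1M / 4096)   = min(256, 256) = 256
                     :          *   256 / 4                           = 64
                     :          *   rounddown_pow_of_two(64 + 32) - 1 = 64 - 1        = 63
                     :          *
                     :          * giving a per-cpu batch of 63 pages.
                     :          */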
    7128             : 
    7129           3 :         return batch;
    7130             : 
    7131             : #else
    7132             :         /* The deferral and batching of frees should be suppressed under NOMMU
    7133             :          * conditions.
    7134             :          *
    7135             :          * The problem is that NOMMU needs to be able to allocate large chunks
    7136             :          * of contiguous memory as there's no hardware page translation to
    7137             :          * assemble apparent contiguous memory from discontiguous pages.
    7138             :          *
    7139             :          * Queueing large contiguous runs of pages for batching, however,
    7140             :          * causes the pages to actually be freed in smaller chunks.  As there
    7141             :          * can be a significant delay between the individual batches being
    7142             :          * recycled, this leads to the once large chunks of space being
    7143             :          * fragmented and becoming unavailable for high-order allocations.
    7144             :          */
    7145             :         return 0;
    7146             : #endif
    7147             : }
    7148             : 
    7149           3 : static int zone_highsize(struct zone *zone, int batch, int cpu_online)
    7150             : {
    7151             : #ifdef CONFIG_MMU
    7152             :         int high;
    7153             :         int nr_split_cpus;
    7154             :         unsigned long total_pages;
    7155             : 
    7156           3 :         if (!percpu_pagelist_high_fraction) {
    7157             :                 /*
    7158             :                  * By default, the high value of the pcp is based on the zone
    7159             :                  * low watermark so that if they are full then background
    7160             :                  * reclaim will not be started prematurely.
    7161             :                  */
    7162           3 :                 total_pages = low_wmark_pages(zone);
    7163             :         } else {
    7164             :                 /*
    7165             :                  * If percpu_pagelist_high_fraction is configured, the high
    7166             :                  * value is based on a fraction of the managed pages in the
    7167             :                  * zone.
    7168             :                  */
    7169           0 :                 total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
    7170             :         }
    7171             : 
    7172             :         /*
    7173             :          * Split the high value across all online CPUs local to the zone. Note
    7174             :          * that early in boot CPUs may not be online yet and that during
    7175             :          * CPU hotplug the cpumask is not yet updated when a CPU is being
    7176             :          * onlined. For memory nodes that have no CPUs, split pcp->high across
    7177             :          * all online CPUs to mitigate the risk that reclaim is triggered
    7178             :          * prematurely due to pages stored on pcp lists.
    7179             :          */
    7180           6 :         nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
    7181           3 :         if (!nr_split_cpus)
    7182           0 :                 nr_split_cpus = num_online_cpus();
    7183           3 :         high = total_pages / nr_split_cpus;
    7184             : 
    7185             :         /*
    7186             :          * Ensure high is at least batch*4. The multiple is based on the
    7187             :          * historical relationship between high and batch.
    7188             :          */
    7189           3 :         high = max(high, batch << 2);
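                     :         /*
                     :          * Editor's note, not part of the kernel source: illustrative
                     :          * numbers, assuming percpu_pagelist_high_fraction == 0, a zone
                     :          * whose low watermark is 1024 pages, 4 local CPUs and a batch
                     :          * of 63:
                     :          *
                     :          *   high = max(1024 / 4, 63 << 2) = max(256, 252) = 256
                     :          */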
    7190             : 
    7191           3 :         return high;
    7192             : #else
    7193             :         return 0;
    7194             : #endif
    7195             : }
    7196             : 
    7197             : /*
    7198             :  * pcp->high and pcp->batch values are related and generally batch is lower
    7199             :  * than high. They are also related to pcp->count such that count is lower
    7200             :  * than high, and as soon as it reaches high, the pcplist is flushed.
    7201             :  *
    7202             :  * However, guaranteeing these relations at all times would require e.g. write
    7203             :  * barriers here but also careful usage of read barriers at the read side, and
    7204             :  * thus be prone to error and bad for performance. Thus the update only prevents
    7205             :  * store tearing. Any new users of pcp->batch and pcp->high should ensure they
    7206             :  * can cope with those fields changing asynchronously, and fully trust only the
    7207             :  * pcp->count field on the local CPU with interrupts disabled.
    7208             :  *
    7209             :  * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
    7210             :  * outside of boot time (or some other assurance that no concurrent updaters
    7211             :  * exist).
    7212             :  */
    7213             : static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
    7214             :                 unsigned long batch)
    7215             : {
    7216           3 :         WRITE_ONCE(pcp->batch, batch);
    7217           3 :         WRITE_ONCE(pcp->high, high);
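                     :         /*
                     :          * Editor's note, not part of the kernel source: a reader of these
                     :          * fields is expected to pair with the WRITE_ONCE() stores above,
                     :          * e.g. (sketch)
                     :          *
                     :          *   int batch = READ_ONCE(pcp->batch);
                     :          *
                     :          * and must tolerate high/batch changing between reads, as the
                     :          * comment preceding pageset_update() explains.
                     :          */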
    7218             : }
    7219             : 
    7220           2 : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
    7221             : {
    7222             :         int pindex;
    7223             : 
    7224           2 :         memset(pcp, 0, sizeof(*pcp));
    7225           2 :         memset(pzstats, 0, sizeof(*pzstats));
    7226             : 
    7227           2 :         spin_lock_init(&pcp->lock);
    7228          26 :         for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
    7229          48 :                 INIT_LIST_HEAD(&pcp->lists[pindex]);
    7230             : 
    7231             :         /*
    7232             :          * Set batch and high values safe for a boot pageset. A true percpu
    7233             :          * pageset's initialization will update them subsequently. Here we don't
    7234             :          * need to be as careful as pageset_update() as nobody can access the
    7235             :          * pageset yet.
    7236             :          */
    7237           2 :         pcp->high = BOOT_PAGESET_HIGH;
    7238           2 :         pcp->batch = BOOT_PAGESET_BATCH;
    7239           2 :         pcp->free_factor = 0;
    7240           2 : }
    7241             : 
    7242             : static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
    7243             :                 unsigned long batch)
    7244             : {
    7245             :         struct per_cpu_pages *pcp;
    7246             :         int cpu;
    7247             : 
    7248           3 :         for_each_possible_cpu(cpu) {
    7249           3 :                 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
    7250           3 :                 pageset_update(pcp, high, batch);
    7251             :         }
    7252             : }
    7253             : 
    7254             : /*
    7255             :  * Calculate and set new high and batch values for all per-cpu pagesets of a
    7256             :  * zone based on the zone's size.
    7257             :  */
    7258           3 : static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
    7259             : {
    7260             :         int new_high, new_batch;
    7261             : 
    7262           3 :         new_batch = max(1, zone_batchsize(zone));
    7263           3 :         new_high = zone_highsize(zone, new_batch, cpu_online);
    7264             : 
    7265           3 :         if (zone->pageset_high == new_high &&
    7266           0 :             zone->pageset_batch == new_batch)
    7267             :                 return;
    7268             : 
    7269           3 :         zone->pageset_high = new_high;
    7270           3 :         zone->pageset_batch = new_batch;
    7271             : 
    7272           3 :         __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
    7273             : }
    7274             : 
    7275           1 : void __meminit setup_zone_pageset(struct zone *zone)
    7276             : {
    7277             :         int cpu;
    7278             : 
    7279             :         /* Size may be 0 on !SMP && !NUMA */
    7280             :         if (sizeof(struct per_cpu_zonestat) > 0)
    7281             :                 zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
    7282             : 
    7283           1 :         zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
    7284           2 :         for_each_possible_cpu(cpu) {
    7285             :                 struct per_cpu_pages *pcp;
    7286             :                 struct per_cpu_zonestat *pzstats;
    7287             : 
    7288           1 :                 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
    7289           1 :                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
    7290           1 :                 per_cpu_pages_init(pcp, pzstats);
    7291             :         }
    7292             : 
    7293           1 :         zone_set_pageset_high_and_batch(zone, 0);
    7294           1 : }
    7295             : 
    7296             : /*
    7297             :  * The zone indicated has a new number of managed_pages; batch sizes and percpu
    7298             :  * page high values need to be recalculated.
    7299             :  */
    7300           2 : static void zone_pcp_update(struct zone *zone, int cpu_online)
    7301             : {
    7302           2 :         mutex_lock(&pcp_batch_high_lock);
    7303           2 :         zone_set_pageset_high_and_batch(zone, cpu_online);
    7304           2 :         mutex_unlock(&pcp_batch_high_lock);
    7305           2 : }
    7306             : 
    7307             : /*
    7308             :  * Allocate per cpu pagesets and initialize them.
    7309             :  * Before this call only boot pagesets were available.
    7310             :  */
    7311           1 : void __init setup_per_cpu_pageset(void)
    7312             : {
    7313             :         struct pglist_data *pgdat;
    7314             :         struct zone *zone;
    7315             :         int __maybe_unused cpu;
    7316             : 
    7317           3 :         for_each_populated_zone(zone)
    7318           1 :                 setup_zone_pageset(zone);
    7319             : 
    7320             : #ifdef CONFIG_NUMA
    7321             :         /*
    7322             :          * Unpopulated zones continue using the boot pagesets.
    7323             :          * The numa stats for these pagesets need to be reset.
    7324             :          * Otherwise, they will end up skewing the stats of
    7325             :          * the nodes these zones are associated with.
    7326             :          */
    7327             :         for_each_possible_cpu(cpu) {
    7328             :                 struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
    7329             :                 memset(pzstats->vm_numa_event, 0,
    7330             :                        sizeof(pzstats->vm_numa_event));
    7331             :         }
    7332             : #endif
    7333             : 
    7334           2 :         for_each_online_pgdat(pgdat)
    7335           1 :                 pgdat->per_cpu_nodestats =
    7336           1 :                         alloc_percpu(struct per_cpu_nodestat);
    7337           1 : }
    7338             : 
    7339             : static __meminit void zone_pcp_init(struct zone *zone)
    7340             : {
    7341             :         /*
    7342             :          * per cpu subsystem is not up at this point. The following code
    7343             :          * relies on the ability of the linker to provide the
    7344             :          * offset of a (static) per cpu variable into the per cpu area.
    7345             :          */
    7346           2 :         zone->per_cpu_pageset = &boot_pageset;
    7347           2 :         zone->per_cpu_zonestats = &boot_zonestats;
    7348           2 :         zone->pageset_high = BOOT_PAGESET_HIGH;
    7349           2 :         zone->pageset_batch = BOOT_PAGESET_BATCH;
    7350             : 
    7351           2 :         if (populated_zone(zone))
    7352             :                 pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
    7353             :                          zone->present_pages, zone_batchsize(zone));
    7354             : }
    7355             : 
    7356           1 : void __meminit init_currently_empty_zone(struct zone *zone,
    7357             :                                         unsigned long zone_start_pfn,
    7358             :                                         unsigned long size)
    7359             : {
    7360           1 :         struct pglist_data *pgdat = zone->zone_pgdat;
    7361           1 :         int zone_idx = zone_idx(zone) + 1;
    7362             : 
    7363           1 :         if (zone_idx > pgdat->nr_zones)
    7364           1 :                 pgdat->nr_zones = zone_idx;
    7365             : 
    7366           1 :         zone->zone_start_pfn = zone_start_pfn;
    7367             : 
    7368           1 :         mminit_dprintk(MMINIT_TRACE, "memmap_init",
    7369             :                         "Initialising map node %d zone %lu pfns %lu -> %lu\n",
    7370             :                         pgdat->node_id,
    7371             :                         (unsigned long)zone_idx(zone),
    7372             :                         zone_start_pfn, (zone_start_pfn + size));
    7373             : 
    7374           1 :         zone_init_free_lists(zone);
    7375           1 :         zone->initialized = 1;
    7376           1 : }
    7377             : 
    7378             : /**
    7379             :  * get_pfn_range_for_nid - Return the start and end page frames for a node
    7380             :  * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
    7381             :  * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
    7382             :  * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
    7383             :  *
    7384             :  * It returns the start and end page frame of a node based on information
    7385             :  * provided by memblock_set_node(). If called for a node
    7386             :  * with no available memory, the start and end
    7387             :  * PFNs will be 0.
    7388             :  */
    7389           1 : void __init get_pfn_range_for_nid(unsigned int nid,
    7390             :                         unsigned long *start_pfn, unsigned long *end_pfn)
    7391             : {
    7392             :         unsigned long this_start_pfn, this_end_pfn;
    7393             :         int i;
    7394             : 
    7395           1 :         *start_pfn = -1UL;
    7396           1 :         *end_pfn = 0;
    7397             : 
    7398           2 :         for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
    7399           1 :                 *start_pfn = min(*start_pfn, this_start_pfn);
    7400           1 :                 *end_pfn = max(*end_pfn, this_end_pfn);
    7401             :         }
    7402             : 
    7403           1 :         if (*start_pfn == -1UL)
    7404           0 :                 *start_pfn = 0;
    7405           1 : }
    7406             : 
    7407             : /*
    7408             :  * This finds a zone that can be used for ZONE_MOVABLE pages. The
    7409             :  * assumption is that zones within a node are ordered by monotonically
    7410             :  * increasing memory addresses, so that the "highest" populated zone is used.
    7411             :  */
    7412           1 : static void __init find_usable_zone_for_movable(void)
    7413             : {
    7414             :         int zone_index;
    7415           2 :         for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
    7416           2 :                 if (zone_index == ZONE_MOVABLE)
    7417           1 :                         continue;
    7418             : 
    7419           2 :                 if (arch_zone_highest_possible_pfn[zone_index] >
    7420           1 :                                 arch_zone_lowest_possible_pfn[zone_index])
    7421             :                         break;
    7422             :         }
    7423             : 
    7424             :         VM_BUG_ON(zone_index == -1);
    7425           1 :         movable_zone = zone_index;
    7426           1 : }
    7427             : 
    7428             : /*
    7429             :  * The zone ranges provided by the architecture do not include ZONE_MOVABLE
    7430             :  * because it is sized independently of the architecture. Unlike the other zones,
    7431             :  * the starting point for ZONE_MOVABLE is not fixed. It may be different
    7432             :  * in each node depending on the size of each node and how evenly kernelcore
    7433             :  * is distributed. This helper function adjusts the zone ranges
    7434             :  * provided by the architecture for a given node by using the end of the
    7435             :  * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
    7436             :  * zones within a node are in order of monotonically increasing memory addresses.
    7437             :  */
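                     : /*
                     :  * Editor's note, not part of the kernel source: a hypothetical layout.
                     :  * Assume kernelcore= placed zone_movable_pfn[nid] at pfn 0xc0000 on a
                     :  * node spanning pfns [0x1000, 0x100000):
                     :  *
                     :  *   ZONE_NORMAL  [0x40000, 0x100000) is trimmed to [0x40000, 0xc0000)
                     :  *   ZONE_MOVABLE                     becomes      [0xc0000, 0x100000)
                     :  *
                     :  * and any zone starting at or above zone_movable_pfn[nid] ends up with
                     :  * zero span.
                     :  */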
    7438           4 : static void __init adjust_zone_range_for_zone_movable(int nid,
    7439             :                                         unsigned long zone_type,
    7440             :                                         unsigned long node_start_pfn,
    7441             :                                         unsigned long node_end_pfn,
    7442             :                                         unsigned long *zone_start_pfn,
    7443             :                                         unsigned long *zone_end_pfn)
    7444             : {
    7445             :         /* Only adjust if ZONE_MOVABLE is on this node */
    7446           4 :         if (zone_movable_pfn[nid]) {
    7447             :                 /* Size ZONE_MOVABLE */
    7448           0 :                 if (zone_type == ZONE_MOVABLE) {
    7449           0 :                         *zone_start_pfn = zone_movable_pfn[nid];
    7450           0 :                         *zone_end_pfn = min(node_end_pfn,
    7451             :                                 arch_zone_highest_possible_pfn[movable_zone]);
    7452             : 
    7453             :                 /* Adjust for ZONE_MOVABLE starting within this range */
    7454           0 :                 } else if (!mirrored_kernelcore &&
    7455           0 :                         *zone_start_pfn < zone_movable_pfn[nid] &&
    7456           0 :                         *zone_end_pfn > zone_movable_pfn[nid]) {
    7457           0 :                         *zone_end_pfn = zone_movable_pfn[nid];
    7458             : 
    7459             :                 /* Check if this whole range is within ZONE_MOVABLE */
    7460           0 :                 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
    7461           0 :                         *zone_start_pfn = *zone_end_pfn;
    7462             :         }
    7463           4 : }
    7464             : 
    7465             : /*
    7466             :  * Return the number of pages a zone spans in a node, including holes.
    7467             :  * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
    7468             :  */
    7469           2 : static unsigned long __init zone_spanned_pages_in_node(int nid,
    7470             :                                         unsigned long zone_type,
    7471             :                                         unsigned long node_start_pfn,
    7472             :                                         unsigned long node_end_pfn,
    7473             :                                         unsigned long *zone_start_pfn,
    7474             :                                         unsigned long *zone_end_pfn)
    7475             : {
    7476           2 :         unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
    7477           2 :         unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
    7478             :         /* When hot-adding a new node from cpu_up(), the node should be empty */
    7479           2 :         if (!node_start_pfn && !node_end_pfn)
    7480             :                 return 0;
    7481             : 
    7482             :         /* Get the start and end of the zone */
    7483           2 :         *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
    7484           2 :         *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
    7485           2 :         adjust_zone_range_for_zone_movable(nid, zone_type,
    7486             :                                 node_start_pfn, node_end_pfn,
    7487             :                                 zone_start_pfn, zone_end_pfn);
    7488             : 
    7489             :         /* Check that this node has pages within the zone's required range */
    7490           2 :         if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
    7491             :                 return 0;
    7492             : 
    7493             :         /* Move the zone boundaries inside the node if necessary */
    7494           2 :         *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
    7495           2 :         *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
    7496             : 
    7497             :         /* Return the spanned pages */
    7498           2 :         return *zone_end_pfn - *zone_start_pfn;
    7499             : }
    7500             : 
    7501             : /*
    7502             :  * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
    7503             :  * then all holes in the requested range will be accounted for.
    7504             :  */
    7505           2 : unsigned long __init __absent_pages_in_range(int nid,
    7506             :                                 unsigned long range_start_pfn,
    7507             :                                 unsigned long range_end_pfn)
    7508             : {
    7509           2 :         unsigned long nr_absent = range_end_pfn - range_start_pfn;
    7510             :         unsigned long start_pfn, end_pfn;
    7511             :         int i;
    7512             : 
    7513           4 :         for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
    7514           2 :                 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
    7515           2 :                 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
    7516           2 :                 nr_absent -= end_pfn - start_pfn;
    7517             :         }
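                     :         /*
                     :          * Editor's note, not part of the kernel source: e.g. for the
                     :          * request [0, 1000) with memory regions [0, 600) and [700, 1000),
                     :          * nr_absent = 1000 - 600 - 300 = 100 pages of holes.
                     :          */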
    7518           2 :         return nr_absent;
    7519             : }
    7520             : 
    7521             : /**
    7522             :  * absent_pages_in_range - Return number of page frames in holes within a range
    7523             :  * @start_pfn: The start PFN to start searching for holes
    7524             :  * @end_pfn: The end PFN to stop searching for holes
    7525             :  *
    7526             :  * Return: the number of page frames in memory holes within a range.
    7527             :  */
    7528           0 : unsigned long __init absent_pages_in_range(unsigned long start_pfn,
    7529             :                                                         unsigned long end_pfn)
    7530             : {
    7531           0 :         return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
    7532             : }
    7533             : 
    7534             : /* Return the number of page frames in holes in a zone on a node */
    7535           2 : static unsigned long __init zone_absent_pages_in_node(int nid,
    7536             :                                         unsigned long zone_type,
    7537             :                                         unsigned long node_start_pfn,
    7538             :                                         unsigned long node_end_pfn)
    7539             : {
    7540           2 :         unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
    7541           2 :         unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
    7542             :         unsigned long zone_start_pfn, zone_end_pfn;
    7543             :         unsigned long nr_absent;
    7544             : 
    7545             :         /* When hot-adding a new node from cpu_up(), the node should be empty */
    7546           2 :         if (!node_start_pfn && !node_end_pfn)
    7547             :                 return 0;
    7548             : 
    7549           2 :         zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
    7550           2 :         zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
    7551             : 
    7552           2 :         adjust_zone_range_for_zone_movable(nid, zone_type,
    7553             :                         node_start_pfn, node_end_pfn,
    7554             :                         &zone_start_pfn, &zone_end_pfn);
    7555           2 :         nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
    7556             : 
    7557             :         /*
    7558             :          * ZONE_MOVABLE handling.
    7559             :          * Treat pages that will end up in ZONE_MOVABLE but lie in
    7560             :          * ZONE_NORMAL as absent pages, and vice versa.
    7561             :          */
    7562           2 :         if (mirrored_kernelcore && zone_movable_pfn[nid]) {
    7563             :                 unsigned long start_pfn, end_pfn;
    7564             :                 struct memblock_region *r;
    7565             : 
    7566           0 :                 for_each_mem_region(r) {
    7567           0 :                         start_pfn = clamp(memblock_region_memory_base_pfn(r),
    7568             :                                           zone_start_pfn, zone_end_pfn);
    7569           0 :                         end_pfn = clamp(memblock_region_memory_end_pfn(r),
    7570             :                                         zone_start_pfn, zone_end_pfn);
    7571             : 
    7572           0 :                         if (zone_type == ZONE_MOVABLE &&
    7573           0 :                             memblock_is_mirror(r))
    7574           0 :                                 nr_absent += end_pfn - start_pfn;
    7575             : 
    7576           0 :                         if (zone_type == ZONE_NORMAL &&
    7577           0 :                             !memblock_is_mirror(r))
    7578           0 :                                 nr_absent += end_pfn - start_pfn;
    7579             :                 }
    7580             :         }
    7581             : 
    7582             :         return nr_absent;
    7583             : }
    7584             : 
    7585           1 : static void __init calculate_node_totalpages(struct pglist_data *pgdat,
    7586             :                                                 unsigned long node_start_pfn,
    7587             :                                                 unsigned long node_end_pfn)
    7588             : {
    7589           1 :         unsigned long realtotalpages = 0, totalpages = 0;
    7590             :         enum zone_type i;
    7591             : 
    7592           3 :         for (i = 0; i < MAX_NR_ZONES; i++) {
    7593           2 :                 struct zone *zone = pgdat->node_zones + i;
    7594             :                 unsigned long zone_start_pfn, zone_end_pfn;
    7595             :                 unsigned long spanned, absent;
    7596             :                 unsigned long size, real_size;
    7597             : 
    7598           2 :                 spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
    7599             :                                                      node_start_pfn,
    7600             :                                                      node_end_pfn,
    7601             :                                                      &zone_start_pfn,
    7602             :                                                      &zone_end_pfn);
    7603           2 :                 absent = zone_absent_pages_in_node(pgdat->node_id, i,
    7604             :                                                    node_start_pfn,
    7605             :                                                    node_end_pfn);
    7606             : 
    7607           2 :                 size = spanned;
    7608           2 :                 real_size = size - absent;
    7609             : 
    7610           2 :                 if (size)
    7611           1 :                         zone->zone_start_pfn = zone_start_pfn;
    7612             :                 else
    7613           1 :                         zone->zone_start_pfn = 0;
    7614           2 :                 zone->spanned_pages = size;
    7615           2 :                 zone->present_pages = real_size;
    7616             : #if defined(CONFIG_MEMORY_HOTPLUG)
    7617             :                 zone->present_early_pages = real_size;
    7618             : #endif
    7619             : 
    7620           2 :                 totalpages += size;
    7621           2 :                 realtotalpages += real_size;
    7622             :         }
    7623             : 
    7624           1 :         pgdat->node_spanned_pages = totalpages;
    7625           1 :         pgdat->node_present_pages = realtotalpages;
    7626             :         pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
    7627           1 : }
    7628             : 
    7629             : #ifndef CONFIG_SPARSEMEM
    7630             : /*
    7631             :  * Calculate the size of the zone->pageblock_flags bitmap, rounded up to a
    7632             :  * multiple of sizeof(unsigned long). Start by making sure zonesize is a
    7633             :  * multiple of pageblock_nr_pages by rounding up. Then use NR_PAGEBLOCK_BITS
    7634             :  * worth of bits per pageblock, round what is now in bits up to the nearest
    7635             :  * long, and return the result in bytes.
    7636             :  */
    7637           1 : static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
    7638             : {
    7639             :         unsigned long usemapsize;
    7640             : 
    7641           1 :         zonesize += zone_start_pfn & (pageblock_nr_pages-1);
    7642           1 :         usemapsize = roundup(zonesize, pageblock_nr_pages);
    7643           1 :         usemapsize = usemapsize >> pageblock_order;
    7644           1 :         usemapsize *= NR_PAGEBLOCK_BITS;
    7645           1 :         usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
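                     :         /*
                     :          * Editor's note, not part of the kernel source: worked example,
                     :          * assuming a pageblock-aligned zone of 262144 pages,
                     :          * pageblock_nr_pages == 1024 (pageblock_order 10) and
                     :          * NR_PAGEBLOCK_BITS == 4:
                     :          *
                     :          *   262144 / 1024 = 256 pageblocks
                     :          *   256 * 4       = 1024 bits, already a multiple of 64
                     :          *
                     :          * so the function returns 1024 / 8 = 128 bytes of pageblock flags.
                     :          */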
    7646             : 
    7647           1 :         return usemapsize / 8;
    7648             : }
    7649             : 
    7650           1 : static void __ref setup_usemap(struct zone *zone)
    7651             : {
    7652           1 :         unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
    7653             :                                                zone->spanned_pages);
    7654           1 :         zone->pageblock_flags = NULL;
    7655           1 :         if (usemapsize) {
    7656           1 :                 zone->pageblock_flags =
    7657           2 :                         memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
    7658             :                                             zone_to_nid(zone));
    7659           1 :                 if (!zone->pageblock_flags)
    7660           0 :                         panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
    7661             :                               usemapsize, zone->name, zone_to_nid(zone));
    7662             :         }
    7663           1 : }
    7664             : #else
    7665             : static inline void setup_usemap(struct zone *zone) {}
    7666             : #endif /* CONFIG_SPARSEMEM */
    7667             : 
    7668             : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
    7669             : 
    7670             : /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
    7671             : void __init set_pageblock_order(void)
    7672             : {
    7673             :         unsigned int order = MAX_ORDER - 1;
    7674             : 
    7675             :         /* Check that pageblock_nr_pages has not already been setup */
    7676             :         /* Check that pageblock_nr_pages has not already been set up */
    7677             :                 return;
    7678             : 
    7679             :         /* Don't let pageblocks exceed the maximum allocation granularity. */
    7680             :         if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
    7681             :                 order = HUGETLB_PAGE_ORDER;
    7682             : 
    7683             :         /*
    7684             :          * Assume the largest contiguous order of interest is a huge page.
    7685             :          * This value may be variable depending on boot parameters on IA64 and
    7686             :          * powerpc.
    7687             :          */
    7688             :         pageblock_order = order;
    7689             : }
    7690             : #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
    7691             : 
    7692             : /*
    7693             :  * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
    7694             :  * is unused as pageblock_order is set at compile-time. See
    7695             :  * include/linux/pageblock-flags.h for the values of pageblock_order based on
    7696             :  * the kernel config
    7697             :  */
    7698           0 : void __init set_pageblock_order(void)
    7699             : {
    7700           0 : }
    7701             : 
    7702             : #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
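A small illustration of the clamp in the CONFIG_HUGETLB_PAGE_SIZE_VARIABLE branch above, with purely hypothetical values not tied to any particular architecture:

/*
 * If MAX_ORDER - 1 == 10 and HUGETLB_PAGE_ORDER == 4, order is clamped to 4,
 * so pageblock_order == 4 (16 pages per pageblock, i.e. 64 KiB with 4 KiB
 * pages) and anti-fragmentation grouping happens at the huge page size.
 * If the huge page order is MAX_ORDER - 1 or larger, pageblock_order stays
 * at MAX_ORDER - 1, the buddy allocator's maximum granularity.
 */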
    7703             : 
    7704             : static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
    7705             :                                                 unsigned long present_pages)
    7706             : {
    7707           2 :         unsigned long pages = spanned_pages;
    7708             : 
    7709             :         /*
    7710             :          * Provide a more accurate estimation if there are holes within
    7711             :          * the zone and SPARSEMEM is in use. If there are holes within the
    7712             :          * zone, each populated memory region may cost us one or two extra
    7713             :  * memmap pages due to alignment because memmap pages for each
    7714             :  * populated region may not be naturally aligned on a page boundary.
    7715             :          * So the (present_pages >> 4) heuristic is a tradeoff for that.
    7716             :          */
    7717             :         if (spanned_pages > present_pages + (present_pages >> 4) &&
    7718             :             IS_ENABLED(CONFIG_SPARSEMEM))
    7719             :                 pages = present_pages;
    7720             : 
    7721           2 :         return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
    7722             : }
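A quick worked figure, assuming sizeof(struct page) == 64 (common on 64-bit configurations) and a hole-free zone where spanned_pages == present_pages == 262144 (1 GiB of 4 KiB pages):

/*
 *   262144 * 64 bytes                 = 16 MiB of memmap
 *   PAGE_ALIGN(16 MiB) >> PAGE_SHIFT  = 4096 pages
 *
 * so the memmap costs about 1.6% of the zone. With SPARSEMEM, if
 * spanned_pages exceeds present_pages by more than ~6.25% (present >> 4),
 * present_pages is used instead so that holes are not charged for memmap.
 */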
    7723             : 
    7724             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    7725             : static void pgdat_init_split_queue(struct pglist_data *pgdat)
    7726             : {
    7727             :         struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
    7728             : 
    7729             :         spin_lock_init(&ds_queue->split_queue_lock);
    7730             :         INIT_LIST_HEAD(&ds_queue->split_queue);
    7731             :         ds_queue->split_queue_len = 0;
    7732             : }
    7733             : #else
    7734             : static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
    7735             : #endif
    7736             : 
    7737             : #ifdef CONFIG_COMPACTION
    7738             : static void pgdat_init_kcompactd(struct pglist_data *pgdat)
    7739             : {
    7740           1 :         init_waitqueue_head(&pgdat->kcompactd_wait);
    7741             : }
    7742             : #else
    7743             : static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
    7744             : #endif
    7745             : 
    7746           1 : static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
    7747             : {
    7748             :         int i;
    7749             : 
    7750           1 :         pgdat_resize_init(pgdat);
    7751           1 :         pgdat_kswapd_lock_init(pgdat);
    7752             : 
    7753           1 :         pgdat_init_split_queue(pgdat);
    7754           1 :         pgdat_init_kcompactd(pgdat);
    7755             : 
    7756           1 :         init_waitqueue_head(&pgdat->kswapd_wait);
    7757           1 :         init_waitqueue_head(&pgdat->pfmemalloc_wait);
    7758             : 
    7759           5 :         for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
    7760           4 :                 init_waitqueue_head(&pgdat->reclaim_wait[i]);
    7761             : 
    7762           1 :         pgdat_page_ext_init(pgdat);
    7763           1 :         lruvec_init(&pgdat->__lruvec);
    7764           1 : }
    7765             : 
    7766           2 : static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
    7767             :                                                         unsigned long remaining_pages)
    7768             : {
    7769           4 :         atomic_long_set(&zone->managed_pages, remaining_pages);
    7770           2 :         zone_set_nid(zone, nid);
    7771           2 :         zone->name = zone_names[idx];
    7772           2 :         zone->zone_pgdat = NODE_DATA(nid);
    7773           2 :         spin_lock_init(&zone->lock);
    7774           2 :         zone_seqlock_init(zone);
    7775           2 :         zone_pcp_init(zone);
    7776           2 : }
    7777             : 
    7778             : /*
    7779             :  * Set up the zone data structures
    7780             :  * - init pgdat internals
    7781             :  * - init all zones belonging to this node
    7782             :  *
    7783             :  * NOTE: this function is only called during memory hotplug
    7784             :  */
    7785             : #ifdef CONFIG_MEMORY_HOTPLUG
    7786             : void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
    7787             : {
    7788             :         int nid = pgdat->node_id;
    7789             :         enum zone_type z;
    7790             :         int cpu;
    7791             : 
    7792             :         pgdat_init_internals(pgdat);
    7793             : 
    7794             :         if (pgdat->per_cpu_nodestats == &boot_nodestats)
    7795             :                 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
    7796             : 
    7797             :         /*
    7798             :          * Reset the nr_zones, order and highest_zoneidx before reuse.
    7799             :          * Note that kswapd will init kswapd_highest_zoneidx properly
    7800             :          * when it starts in the near future.
    7801             :          */
    7802             :         pgdat->nr_zones = 0;
    7803             :         pgdat->kswapd_order = 0;
    7804             :         pgdat->kswapd_highest_zoneidx = 0;
    7805             :         pgdat->node_start_pfn = 0;
    7806             :         for_each_online_cpu(cpu) {
    7807             :                 struct per_cpu_nodestat *p;
    7808             : 
    7809             :                 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
    7810             :                 memset(p, 0, sizeof(*p));
    7811             :         }
    7812             : 
    7813             :         for (z = 0; z < MAX_NR_ZONES; z++)
    7814             :                 zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
    7815             : }
    7816             : #endif
    7817             : 
    7818             : /*
    7819             :  * Set up the zone data structures:
    7820             :  *   - mark all pages reserved
    7821             :  *   - mark all memory queues empty
    7822             :  *   - clear the memory bitmaps
    7823             :  *
    7824             :  * NOTE: pgdat should get zeroed by caller.
    7825             :  * NOTE: this function is only called during early init.
    7826             :  */
    7827           1 : static void __init free_area_init_core(struct pglist_data *pgdat)
    7828             : {
    7829             :         enum zone_type j;
    7830           1 :         int nid = pgdat->node_id;
    7831             : 
    7832           1 :         pgdat_init_internals(pgdat);
    7833           1 :         pgdat->per_cpu_nodestats = &boot_nodestats;
    7834             : 
    7835           3 :         for (j = 0; j < MAX_NR_ZONES; j++) {
    7836           2 :                 struct zone *zone = pgdat->node_zones + j;
    7837             :                 unsigned long size, freesize, memmap_pages;
    7838             : 
    7839           2 :                 size = zone->spanned_pages;
    7840           2 :                 freesize = zone->present_pages;
    7841             : 
    7842             :                 /*
    7843             :                  * Adjust freesize so that it accounts for how much memory
    7844             :                  * is used by this zone for memmap. This affects the watermark
    7845             :                  * and per-cpu initialisations
    7846             :                  */
    7847           4 :                 memmap_pages = calc_memmap_size(size, freesize);
    7848           2 :                 if (!is_highmem_idx(j)) {
    7849           2 :                         if (freesize >= memmap_pages) {
    7850           2 :                                 freesize -= memmap_pages;
    7851             :                                 if (memmap_pages)
    7852             :                                         pr_debug("  %s zone: %lu pages used for memmap\n",
    7853             :                                                  zone_names[j], memmap_pages);
    7854             :                         } else
    7855           0 :                                 pr_warn("  %s zone: %lu memmap pages exceeds freesize %lu\n",
    7856             :                                         zone_names[j], memmap_pages, freesize);
    7857             :                 }
    7858             : 
    7859             :                 /* Account for reserved pages */
    7860           2 :                 if (j == 0 && freesize > dma_reserve) {
    7861           1 :                         freesize -= dma_reserve;
    7862             :                         pr_debug("  %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
    7863             :                 }
    7864             : 
    7865           2 :                 if (!is_highmem_idx(j))
    7866           2 :                         nr_kernel_pages += freesize;
    7867             :                 /* Charge for highmem memmap if there are enough kernel pages */
    7868             :                 else if (nr_kernel_pages > memmap_pages * 2)
    7869             :                         nr_kernel_pages -= memmap_pages;
    7870           2 :                 nr_all_pages += freesize;
    7871             : 
    7872             :                 /*
    7873             :                  * Set an approximate value for lowmem here; it will be adjusted
    7874             :                  * when the bootmem allocator frees pages into the buddy system.
    7875             :                  * And all highmem pages will be managed by the buddy system.
    7876             :                  */
    7877           2 :                 zone_init_internals(zone, j, nid, freesize);
    7878             : 
    7879           2 :                 if (!size)
    7880           1 :                         continue;
    7881             : 
    7882             :                 set_pageblock_order();
    7883           1 :                 setup_usemap(zone);
    7884           1 :                 init_currently_empty_zone(zone, zone->zone_start_pfn, size);
    7885             :         }
    7886           1 : }
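Tying the steps above together for a hypothetical lowmem zone with 262144 present pages, 4096 memmap pages and no dma_reserve:

/*
 *   freesize = 262144 - 4096 = 258048 pages
 *
 * That value seeds managed_pages via zone_init_internals() and is added to
 * nr_kernel_pages and nr_all_pages; it is only an estimate and is corrected
 * later when memblock releases its memory into the buddy allocator.
 */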
    7887             : 
    7888             : #ifdef CONFIG_FLATMEM
    7889           1 : static void __init alloc_node_mem_map(struct pglist_data *pgdat)
    7890             : {
    7891           1 :         unsigned long __maybe_unused start = 0;
    7892           1 :         unsigned long __maybe_unused offset = 0;
    7893             : 
    7894             :         /* Skip empty nodes */
    7895           1 :         if (!pgdat->node_spanned_pages)
    7896             :                 return;
    7897             : 
    7898           1 :         start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
    7899           1 :         offset = pgdat->node_start_pfn - start;
    7900             :         /* ia64 gets its own node_mem_map, before this, without bootmem */
    7901           1 :         if (!pgdat->node_mem_map) {
    7902             :                 unsigned long size, end;
    7903             :                 struct page *map;
    7904             : 
    7905             :                 /*
    7906             :                  * The zone's endpoints aren't required to be MAX_ORDER
    7907             :                  * aligned, but the node_mem_map endpoints must be MAX_ORDER
    7908             :                  * aligned for the buddy allocator to function correctly.
    7909             :                  */
    7910           2 :                 end = pgdat_end_pfn(pgdat);
    7911           1 :                 end = ALIGN(end, MAX_ORDER_NR_PAGES);
    7912           1 :                 size =  (end - start) * sizeof(struct page);
    7913           1 :                 map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
    7914             :                                    pgdat->node_id, false);
    7915           1 :                 if (!map)
    7916           0 :                         panic("Failed to allocate %ld bytes for node %d memory map\n",
    7917             :                               size, pgdat->node_id);
    7918           1 :                 pgdat->node_mem_map = map + offset;
    7919             :         }
    7920             :         pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
    7921             :                                 __func__, pgdat->node_id, (unsigned long)pgdat,
    7922             :                                 (unsigned long)pgdat->node_mem_map);
    7923             : #ifndef CONFIG_NUMA
    7924             :         /*
    7925             :          * Without NUMA, the global mem_map is just set as node 0's.
    7926             :          */
    7927           1 :         if (pgdat == NODE_DATA(0)) {
    7928           1 :                 mem_map = NODE_DATA(0)->node_mem_map;
    7929           1 :                 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
    7930           0 :                         mem_map -= offset;
    7931             :         }
    7932             : #endif
    7933             : }
    7934             : #else
    7935             : static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
    7936             : #endif /* CONFIG_FLATMEM */
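A short example of the alignment above, assuming MAX_ORDER_NR_PAGES == 1024 and a node that starts at pfn 0x1234 and ends at pfn 0x41234 (these numbers are purely illustrative):

/*
 *   start  = 0x1234 & ~(1024 - 1)  = 0x1000
 *   offset = 0x1234 - 0x1000       = 0x234
 *   end    = ALIGN(0x41234, 1024)  = 0x41400
 *   size   = (0x41400 - 0x1000) * sizeof(struct page)
 *
 * node_mem_map is then set to map + offset so that node_mem_map[0]
 * corresponds to node_start_pfn rather than to the rounded-down start.
 */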
    7937             : 
    7938             : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    7939             : static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
    7940             : {
    7941             :         pgdat->first_deferred_pfn = ULONG_MAX;
    7942             : }
    7943             : #else
    7944             : static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
    7945             : #endif
    7946             : 
    7947           1 : static void __init free_area_init_node(int nid)
    7948             : {
    7949           1 :         pg_data_t *pgdat = NODE_DATA(nid);
    7950           1 :         unsigned long start_pfn = 0;
    7951           1 :         unsigned long end_pfn = 0;
    7952             : 
    7953             :         /* pg_data_t should be reset to zero when it's allocated */
    7954           1 :         WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
    7955             : 
    7956           1 :         get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
    7957             : 
    7958           1 :         pgdat->node_id = nid;
    7959           1 :         pgdat->node_start_pfn = start_pfn;
    7960           1 :         pgdat->per_cpu_nodestats = NULL;
    7961             : 
    7962           1 :         if (start_pfn != end_pfn) {
    7963           1 :                 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
    7964             :                         (u64)start_pfn << PAGE_SHIFT,
    7965             :                         end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
    7966             :         } else {
    7967           0 :                 pr_info("Initmem setup node %d as memoryless\n", nid);
    7968             :         }
    7969             : 
    7970           1 :         calculate_node_totalpages(pgdat, start_pfn, end_pfn);
    7971             : 
    7972           1 :         alloc_node_mem_map(pgdat);
    7973             :         pgdat_set_deferred_range(pgdat);
    7974             : 
    7975           1 :         free_area_init_core(pgdat);
    7976             :         lru_gen_init_pgdat(pgdat);
    7977           1 : }
    7978             : 
    7979             : static void __init free_area_init_memoryless_node(int nid)
    7980             : {
    7981             :         free_area_init_node(nid);
    7982             : }
    7983             : 
    7984             : #if MAX_NUMNODES > 1
    7985             : /*
    7986             :  * Figure out the number of possible node ids.
    7987             :  */
    7988             : void __init setup_nr_node_ids(void)
    7989             : {
    7990             :         unsigned int highest;
    7991             : 
    7992             :         highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
    7993             :         nr_node_ids = highest + 1;
    7994             : }
    7995             : #endif
    7996             : 
    7997             : /**
    7998             :  * node_map_pfn_alignment - determine the maximum internode alignment
    7999             :  *
    8000             :  * This function should be called after node map is populated and sorted.
    8001             :  * It calculates the maximum power of two alignment which can distinguish
    8002             :  * all the nodes.
    8003             :  *
    8004             :  * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
    8005             :  * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
    8006             :  * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
    8007             :  * shifted, 1GiB is enough and this function will indicate so.
    8008             :  *
    8009             :  * This is used to test whether pfn -> nid mapping of the chosen memory
    8010             :  * model has fine enough granularity to avoid incorrect mapping for the
    8011             :  * populated node map.
    8012             :  *
    8013             :  * Return: the determined alignment in pfn's.  0 if there is no alignment
    8014             :  * requirement (single node).
    8015             :  */
    8016           0 : unsigned long __init node_map_pfn_alignment(void)
    8017             : {
    8018           0 :         unsigned long accl_mask = 0, last_end = 0;
    8019             :         unsigned long start, end, mask;
    8020           0 :         int last_nid = NUMA_NO_NODE;
    8021             :         int i, nid;
    8022             : 
    8023           0 :         for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
    8024           0 :                 if (!start || last_nid < 0 || last_nid == nid) {
    8025           0 :                         last_nid = nid;
    8026           0 :                         last_end = end;
    8027           0 :                         continue;
    8028             :                 }
    8029             : 
    8030             :                 /*
    8031             :                  * Start with a mask granular enough to pin-point to the
    8032             :                  * start pfn and tick off bits one-by-one until it becomes
    8033             :                  * too coarse to separate the current node from the last.
    8034             :                  */
    8035           0 :                 mask = ~((1 << __ffs(start)) - 1);
    8036           0 :                 while (mask && last_end <= (start & (mask << 1)))
    8037             :                         mask <<= 1;
    8038             : 
    8039             :                 /* accumulate all internode masks */
    8040           0 :                 accl_mask |= mask;
    8041             :         }
    8042             : 
    8043             :         /* convert mask to number of pages */
    8044           0 :         return ~accl_mask + 1;
    8045             : }
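A worked instance of the mask accumulation, assuming 4 KiB pages and two 1 GiB nodes that are both shifted up by 256 MiB, i.e. node 0 spans pfns [0x10000, 0x50000) and node 1 spans [0x50000, 0x90000):

/*
 *   - the first range only records last_nid = 0, last_end = 0x50000
 *   - for node 1: start = 0x50000, __ffs(0x50000) = 16, so mask = ~0xffff
 *   - widening once would give start & (mask << 1) = 0x40000, but
 *     last_end (0x50000) > 0x40000, so the loop stops immediately
 *   - accl_mask = ~0xffff, and ~accl_mask + 1 = 0x10000 pages = 256 MiB,
 *     matching the 256 MiB figure quoted in the comment above.
 */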
    8046             : 
    8047             : /*
    8048             :  * early_calculate_totalpages()
    8049             :  * Sum pages in active regions for movable zone.
    8050             :  * Populate N_MEMORY for calculating usable_nodes.
    8051             :  */
    8052           1 : static unsigned long __init early_calculate_totalpages(void)
    8053             : {
    8054           1 :         unsigned long totalpages = 0;
    8055             :         unsigned long start_pfn, end_pfn;
    8056             :         int i, nid;
    8057             : 
    8058           2 :         for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
    8059           1 :                 unsigned long pages = end_pfn - start_pfn;
    8060             : 
    8061           1 :                 totalpages += pages;
    8062             :                 if (pages)
    8063             :                         node_set_state(nid, N_MEMORY);
    8064             :         }
    8065           1 :         return totalpages;
    8066             : }
    8067             : 
    8068             : /*
    8069             :  * Find the PFN at which the Movable zone begins in each node. Kernel memory
    8070             :  * is spread evenly between nodes as long as the nodes have enough
    8071             :  * memory. When they don't, some nodes will have more kernelcore than
    8072             :  * others.
    8073             :  */
    8074           1 : static void __init find_zone_movable_pfns_for_nodes(void)
    8075             : {
    8076             :         int i, nid;
    8077             :         unsigned long usable_startpfn;
    8078             :         unsigned long kernelcore_node, kernelcore_remaining;
    8079             :         /* save the state before borrow the nodemask */
    8080           1 :         nodemask_t saved_node_state = node_states[N_MEMORY];
    8081           1 :         unsigned long totalpages = early_calculate_totalpages();
    8082           1 :         int usable_nodes = nodes_weight(node_states[N_MEMORY]);
    8083             :         struct memblock_region *r;
    8084             : 
    8085             :         /* Need to find movable_zone earlier when movable_node is specified. */
    8086           1 :         find_usable_zone_for_movable();
    8087             : 
    8088             :         /*
    8089             :          * If movable_node is specified, ignore kernelcore and movablecore
    8090             :          * options.
    8091             :          */
    8092             :         if (movable_node_is_enabled()) {
    8093             :                 for_each_mem_region(r) {
    8094             :                         if (!memblock_is_hotpluggable(r))
    8095             :                                 continue;
    8096             : 
    8097             :                         nid = memblock_get_region_node(r);
    8098             : 
    8099             :                         usable_startpfn = PFN_DOWN(r->base);
    8100             :                         zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
    8101             :                                 min(usable_startpfn, zone_movable_pfn[nid]) :
    8102             :                                 usable_startpfn;
    8103             :                 }
    8104             : 
    8105             :                 goto out2;
    8106             :         }
    8107             : 
    8108             :         /*
    8109             :          * If kernelcore=mirror is specified, ignore movablecore option
    8110             :          */
    8111           1 :         if (mirrored_kernelcore) {
    8112           0 :                 bool mem_below_4gb_not_mirrored = false;
    8113             : 
    8114           0 :                 for_each_mem_region(r) {
    8115           0 :                         if (memblock_is_mirror(r))
    8116           0 :                                 continue;
    8117             : 
    8118           0 :                         nid = memblock_get_region_node(r);
    8119             : 
    8120           0 :                         usable_startpfn = memblock_region_memory_base_pfn(r);
    8121             : 
    8122           0 :                         if (usable_startpfn < PHYS_PFN(SZ_4G)) {
    8123           0 :                                 mem_below_4gb_not_mirrored = true;
    8124           0 :                                 continue;
    8125             :                         }
    8126             : 
    8127           0 :                         zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
    8128           0 :                                 min(usable_startpfn, zone_movable_pfn[nid]) :
    8129             :                                 usable_startpfn;
    8130             :                 }
    8131             : 
    8132           0 :                 if (mem_below_4gb_not_mirrored)
    8133           0 :                         pr_warn("This configuration results in unmirrored kernel memory.\n");
    8134             : 
    8135             :                 goto out2;
    8136             :         }
    8137             : 
    8138             :         /*
    8139             :          * If kernelcore=nn% or movablecore=nn% was specified, calculate the
    8140             :          * amount of necessary memory.
    8141             :          */
    8142           1 :         if (required_kernelcore_percent)
    8143           0 :                 required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
    8144             :                                        10000UL;
    8145           1 :         if (required_movablecore_percent)
    8146           0 :                 required_movablecore = (totalpages * 100 * required_movablecore_percent) /
    8147             :                                         10000UL;
    8148             : 
    8149             :         /*
    8150             :          * If movablecore= was specified, calculate the size of
    8151             :          * kernelcore it corresponds to so that memory usable for
    8152             :          * any allocation type is evenly spread. If both kernelcore
    8153             :          * and movablecore are specified, then the value of kernelcore
    8154             :          * will be used for required_kernelcore if it's greater than
    8155             :          * what movablecore would have allowed.
    8156             :          */
    8157           1 :         if (required_movablecore) {
    8158             :                 unsigned long corepages;
    8159             : 
    8160             :                 /*
    8161             :                  * Round-up so that ZONE_MOVABLE is at least as large as what
    8162             :                  * was requested by the user
    8163             :                  */
    8164             :                 required_movablecore =
    8165           0 :                         roundup(required_movablecore, MAX_ORDER_NR_PAGES);
    8166           0 :                 required_movablecore = min(totalpages, required_movablecore);
    8167           0 :                 corepages = totalpages - required_movablecore;
    8168             : 
    8169           0 :                 required_kernelcore = max(required_kernelcore, corepages);
    8170             :         }
    8171             : 
    8172             :         /*
    8173             :          * If kernelcore was not specified or kernelcore size is larger
    8174             :          * than totalpages, there is no ZONE_MOVABLE.
    8175             :          */
    8176           1 :         if (!required_kernelcore || required_kernelcore >= totalpages)
    8177             :                 goto out;
    8178             : 
    8179             :         /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
    8180           0 :         usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
    8181             : 
    8182             : restart:
    8183             :         /* Spread kernelcore memory as evenly as possible throughout nodes */
    8184           0 :         kernelcore_node = required_kernelcore / usable_nodes;
    8185           0 :         for_each_node_state(nid, N_MEMORY) {
    8186             :                 unsigned long start_pfn, end_pfn;
    8187             : 
    8188             :                 /*
    8189             :                  * Recalculate kernelcore_node if the division per node
    8190             :                  * now exceeds what is necessary to satisfy the requested
    8191             :                  * amount of memory for the kernel
    8192             :                  */
    8193           0 :                 if (required_kernelcore < kernelcore_node)
    8194           0 :                         kernelcore_node = required_kernelcore / usable_nodes;
    8195             : 
    8196             :                 /*
    8197             :                  * As the map is walked, we track how much memory is usable
    8198             :                  * by the kernel using kernelcore_remaining. When it is
    8199             :                  * 0, the rest of the node is usable by ZONE_MOVABLE
    8200             :                  */
    8201           0 :                 kernelcore_remaining = kernelcore_node;
    8202             : 
    8203             :                 /* Go through each range of PFNs within this node */
    8204           0 :                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
    8205             :                         unsigned long size_pages;
    8206             : 
    8207           0 :                         start_pfn = max(start_pfn, zone_movable_pfn[nid]);
    8208           0 :                         if (start_pfn >= end_pfn)
    8209           0 :                                 continue;
    8210             : 
    8211             :                         /* Account for what is only usable for kernelcore */
    8212           0 :                         if (start_pfn < usable_startpfn) {
    8213             :                                 unsigned long kernel_pages;
    8214           0 :                                 kernel_pages = min(end_pfn, usable_startpfn)
    8215             :                                                                 - start_pfn;
    8216             : 
    8217           0 :                                 kernelcore_remaining -= min(kernel_pages,
    8218             :                                                         kernelcore_remaining);
    8219           0 :                                 required_kernelcore -= min(kernel_pages,
    8220             :                                                         required_kernelcore);
    8221             : 
    8222             :                                 /* Continue if range is now fully accounted */
    8223           0 :                                 if (end_pfn <= usable_startpfn) {
    8224             : 
    8225             :                                         /*
    8226             :                                          * Push zone_movable_pfn to the end so
    8227             :                                          * that if we have to rebalance
    8228             :                                          * kernelcore across nodes, we will
    8229             :                                          * not double account here
    8230             :                                          */
    8231           0 :                                         zone_movable_pfn[nid] = end_pfn;
    8232           0 :                                         continue;
    8233             :                                 }
    8234           0 :                                 start_pfn = usable_startpfn;
    8235             :                         }
    8236             : 
    8237             :                         /*
    8238             :                          * The usable PFN range for ZONE_MOVABLE is from
    8239             :                          * start_pfn->end_pfn. Calculate size_pages as the
    8240             :                          * number of pages used as kernelcore
    8241             :                          */
    8242           0 :                         size_pages = end_pfn - start_pfn;
    8243           0 :                         if (size_pages > kernelcore_remaining)
    8244           0 :                                 size_pages = kernelcore_remaining;
    8245           0 :                         zone_movable_pfn[nid] = start_pfn + size_pages;
    8246             : 
    8247             :                         /*
    8248             :                          * Some kernelcore has been met, update counts and
    8249             :                          * break if the kernelcore for this node has been
    8250             :                          * satisfied
    8251             :                          */
    8252           0 :                         required_kernelcore -= min(required_kernelcore,
    8253             :                                                                 size_pages);
    8254           0 :                         kernelcore_remaining -= size_pages;
    8255           0 :                         if (!kernelcore_remaining)
    8256             :                                 break;
    8257             :                 }
    8258             :         }
    8259             : 
    8260             :         /*
    8261             :          * If there is still required_kernelcore, we do another pass with one
    8262             :          * less node in the count. This will push zone_movable_pfn[nid] further
    8263             :          * along on the nodes that still have memory until kernelcore is
    8264             :          * satisfied
    8265             :          */
    8266           0 :         usable_nodes--;
    8267           0 :         if (usable_nodes && required_kernelcore > usable_nodes)
    8268             :                 goto restart;
    8269             : 
    8270             : out2:
    8271             :         /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
    8272           0 :         for (nid = 0; nid < MAX_NUMNODES; nid++) {
    8273             :                 unsigned long start_pfn, end_pfn;
    8274             : 
    8275           0 :                 zone_movable_pfn[nid] =
    8276           0 :                         roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
    8277             : 
    8278           0 :                 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
    8279           0 :                 if (zone_movable_pfn[nid] >= end_pfn)
    8280           0 :                         zone_movable_pfn[nid] = 0;
    8281             :         }
    8282             : 
    8283             : out:
    8284             :         /* restore the node_state */
    8285           1 :         node_states[N_MEMORY] = saved_node_state;
    8286           1 : }
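For the percentage path, a worked example with hypothetical numbers: kernelcore=25% on a machine with totalpages == 4194304 (16 GiB of 4 KiB pages) and two nodes with memory gives:

/*
 *   required_kernelcore = (4194304 * 100 * 25) / 10000 = 1048576 pages (4 GiB)
 *   kernelcore_node     = 1048576 / 2                  =  524288 pages per node
 *
 * so, if each node is a single contiguous range above usable_startpfn,
 * zone_movable_pfn[nid] lands 2 GiB into each node and everything above it
 * becomes ZONE_MOVABLE.
 */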
    8287             : 
    8288             : /* Any regular or high memory on that node? */
    8289             : static void check_for_memory(pg_data_t *pgdat, int nid)
    8290             : {
    8291             :         enum zone_type zone_type;
    8292             : 
    8293           0 :         for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
    8294           1 :                 struct zone *zone = &pgdat->node_zones[zone_type];
    8295           1 :                 if (populated_zone(zone)) {
    8296             :                         if (IS_ENABLED(CONFIG_HIGHMEM))
    8297             :                                 node_set_state(nid, N_HIGH_MEMORY);
    8298             :                         if (zone_type <= ZONE_NORMAL)
    8299             :                                 node_set_state(nid, N_NORMAL_MEMORY);
    8300             :                         break;
    8301             :                 }
    8302             :         }
    8303             : }
    8304             : 
    8305             : /*
    8306             :  * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
    8307             :  * such cases we allow max_zone_pfn to be sorted in descending order.
    8308             :  */
    8309           1 : bool __weak arch_has_descending_max_zone_pfns(void)
    8310             : {
    8311           1 :         return false;
    8312             : }
    8313             : 
    8314             : /**
    8315             :  * free_area_init - Initialise all pg_data_t and zone data
    8316             :  * @max_zone_pfn: an array of max PFNs for each zone
    8317             :  *
    8318             :  * This will call free_area_init_node() for each active node in the system.
    8319             :  * Using the page ranges provided by memblock_set_node(), the size of each
    8320             :  * zone in each node and their holes is calculated. If the maximum PFNs
    8321             :  * of two adjacent zones match, it is assumed that the higher zone is empty.
    8322             :  * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
    8323             :  * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
    8324             :  * starts where the previous one ended. For example, ZONE_DMA32 starts
    8325             :  * at arch_max_dma_pfn.
    8326             :  */
    8327           1 : void __init free_area_init(unsigned long *max_zone_pfn)
    8328             : {
    8329             :         unsigned long start_pfn, end_pfn;
    8330             :         int i, nid, zone;
    8331             :         bool descending;
    8332             : 
    8333             :         /* Record where the zone boundaries are */
    8334           1 :         memset(arch_zone_lowest_possible_pfn, 0,
    8335             :                                 sizeof(arch_zone_lowest_possible_pfn));
    8336           1 :         memset(arch_zone_highest_possible_pfn, 0,
    8337             :                                 sizeof(arch_zone_highest_possible_pfn));
    8338             : 
    8339           1 :         start_pfn = PHYS_PFN(memblock_start_of_DRAM());
    8340           1 :         descending = arch_has_descending_max_zone_pfns();
    8341             : 
    8342           3 :         for (i = 0; i < MAX_NR_ZONES; i++) {
    8343           2 :                 if (descending)
    8344           0 :                         zone = MAX_NR_ZONES - i - 1;
    8345             :                 else
    8346             :                         zone = i;
    8347             : 
    8348           2 :                 if (zone == ZONE_MOVABLE)
    8349           1 :                         continue;
    8350             : 
    8351           1 :                 end_pfn = max(max_zone_pfn[zone], start_pfn);
    8352           1 :                 arch_zone_lowest_possible_pfn[zone] = start_pfn;
    8353           1 :                 arch_zone_highest_possible_pfn[zone] = end_pfn;
    8354             : 
    8355           1 :                 start_pfn = end_pfn;
    8356             :         }
    8357             : 
    8358             :         /* Find the PFNs that ZONE_MOVABLE begins at in each node */
    8359           1 :         memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
    8360           1 :         find_zone_movable_pfns_for_nodes();
    8361             : 
    8362             :         /* Print out the zone ranges */
    8363           1 :         pr_info("Zone ranges:\n");
    8364           3 :         for (i = 0; i < MAX_NR_ZONES; i++) {
    8365           2 :                 if (i == ZONE_MOVABLE)
    8366           1 :                         continue;
    8367           1 :                 pr_info("  %-8s ", zone_names[i]);
    8368           2 :                 if (arch_zone_lowest_possible_pfn[i] ==
    8369           1 :                                 arch_zone_highest_possible_pfn[i])
    8370           0 :                         pr_cont("empty\n");
    8371             :                 else
    8372           1 :                         pr_cont("[mem %#018Lx-%#018Lx]\n",
    8373             :                                 (u64)arch_zone_lowest_possible_pfn[i]
    8374             :                                         << PAGE_SHIFT,
    8375             :                                 ((u64)arch_zone_highest_possible_pfn[i]
    8376             :                                         << PAGE_SHIFT) - 1);
    8377             :         }
    8378             : 
    8379             :         /* Print out the PFNs ZONE_MOVABLE begins at in each node */
    8380           1 :         pr_info("Movable zone start for each node\n");
    8381           2 :         for (i = 0; i < MAX_NUMNODES; i++) {
    8382           1 :                 if (zone_movable_pfn[i])
    8383           0 :                         pr_info("  Node %d: %#018Lx\n", i,
    8384             :                                (u64)zone_movable_pfn[i] << PAGE_SHIFT);
    8385             :         }
    8386             : 
    8387             :         /*
    8388             :          * Print out the early node map, and initialize the
    8389             :          * subsection-map relative to active online memory ranges to
    8390             :          * enable future "sub-section" extensions of the memory map.
    8391             :          */
    8392           1 :         pr_info("Early memory node ranges\n");
    8393           2 :         for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
    8394           1 :                 pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
    8395             :                         (u64)start_pfn << PAGE_SHIFT,
    8396             :                         ((u64)end_pfn << PAGE_SHIFT) - 1);
    8397             :                 subsection_map_init(start_pfn, end_pfn - start_pfn);
    8398             :         }
    8399             : 
    8400             :         /* Initialise every node */
    8401           1 :         mminit_verify_pageflags_layout();
    8402             :         setup_nr_node_ids();
    8403           2 :         for_each_node(nid) {
    8404             :                 pg_data_t *pgdat;
    8405             : 
    8406           1 :                 if (!node_online(nid)) {
    8407             :                         pr_info("Initializing node %d as memoryless\n", nid);
    8408             : 
    8409             :                         /* Allocator not initialized yet */
    8410             :                         pgdat = arch_alloc_nodedata(nid);
    8411             :                         if (!pgdat)
    8412             :                                 panic("Cannot allocate %zuB for node %d.\n",
    8413             :                                        sizeof(*pgdat), nid);
    8414             :                         arch_refresh_nodedata(nid, pgdat);
    8415             :                         free_area_init_memoryless_node(nid);
    8416             : 
    8417             :                         /*
    8418             :                          * We do not want to confuse userspace by sysfs
    8419             :                          * files/directories for node without any memory
    8420             :                          * attached to it, so this node is not marked as
    8421             :                          * N_MEMORY and not marked online so that no sysfs
    8422             :                          * hierarchy will be created via register_one_node for
    8423             :                          * it. The pgdat will get fully initialized by
    8424             :                          * hotadd_init_pgdat() when memory is hotplugged into
    8425             :                          * this node.
    8426             :                          */
    8427             :                         continue;
    8428             :                 }
    8429             : 
    8430           1 :                 pgdat = NODE_DATA(nid);
    8431           1 :                 free_area_init_node(nid);
    8432             : 
    8433             :                 /* Any memory on that node */
    8434             :                 if (pgdat->node_present_pages)
    8435             :                         node_set_state(nid, N_MEMORY);
    8436           2 :                 check_for_memory(pgdat, nid);
    8437             :         }
    8438             : 
    8439           1 :         memmap_init();
    8440           1 : }
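free_area_init() is invoked once from architecture setup code with the per-zone limits it has computed. A minimal sketch of such a caller follows; names like max_low_pfn and MAX_DMA32_PFN are arch-specific and shown only for illustration:

static void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

#ifdef CONFIG_ZONE_DMA32
	/* Cap the DMA32 zone at 4 GiB or at the end of low memory. */
	max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
#endif
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;

	free_area_init(max_zone_pfns);
}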
    8441             : 
    8442           0 : static int __init cmdline_parse_core(char *p, unsigned long *core,
    8443             :                                      unsigned long *percent)
    8444             : {
    8445             :         unsigned long long coremem;
    8446             :         char *endptr;
    8447             : 
    8448           0 :         if (!p)
    8449             :                 return -EINVAL;
    8450             : 
    8451             :         /* Value may be a percentage of total memory, otherwise bytes */
    8452           0 :         coremem = simple_strtoull(p, &endptr, 0);
    8453           0 :         if (*endptr == '%') {
    8454             :                 /* Paranoid check for percent values greater than 100 */
    8455           0 :                 WARN_ON(coremem > 100);
    8456             : 
    8457           0 :                 *percent = coremem;
    8458             :         } else {
    8459           0 :                 coremem = memparse(p, &p);
    8460             :                 /* Paranoid check that UL is enough for the coremem value */
    8461           0 :                 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
    8462             : 
    8463           0 :                 *core = coremem >> PAGE_SHIFT;
    8464           0 :                 *percent = 0UL;
    8465             :         }
    8466             :         return 0;
    8467             : }
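Two worked parses of the helper above (the command-line strings are illustrative):

/*
 *   "kernelcore=512M" -> memparse() yields 536870912,
 *                        *core = 536870912 >> 12 = 131072 pages, *percent = 0
 *   "kernelcore=30%"  -> *percent = 30, *core is left untouched
 *
 * The shift assumes PAGE_SHIFT == 12 (4 KiB pages).
 */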
    8468             : 
    8469             : /*
    8470             :  * kernelcore=size sets the amount of memory set aside for allocations that
    8471             :  * cannot be reclaimed or migrated.
    8472             :  */
    8473           0 : static int __init cmdline_parse_kernelcore(char *p)
    8474             : {
    8475             :         /* parse kernelcore=mirror */
    8476           0 :         if (parse_option_str(p, "mirror")) {
    8477           0 :                 mirrored_kernelcore = true;
    8478           0 :                 return 0;
    8479             :         }
    8480             : 
    8481           0 :         return cmdline_parse_core(p, &required_kernelcore,
    8482             :                                   &required_kernelcore_percent);
    8483             : }
    8484             : 
    8485             : /*
    8486             :  * movablecore=size sets the amount of memory set aside for allocations that
    8487             :  * can be reclaimed or migrated.
    8488             :  */
    8489           0 : static int __init cmdline_parse_movablecore(char *p)
    8490             : {
    8491           0 :         return cmdline_parse_core(p, &required_movablecore,
    8492             :                                   &required_movablecore_percent);
    8493             : }
    8494             : 
    8495             : early_param("kernelcore", cmdline_parse_kernelcore);
    8496             : early_param("movablecore", cmdline_parse_movablecore);
    8497             : 
    8498           0 : void adjust_managed_page_count(struct page *page, long count)
    8499             : {
    8500           0 :         atomic_long_add(count, &page_zone(page)->managed_pages);
    8501           0 :         totalram_pages_add(count);
    8502             : #ifdef CONFIG_HIGHMEM
    8503             :         if (PageHighMem(page))
    8504             :                 totalhigh_pages_add(count);
    8505             : #endif
    8506           0 : }
    8507             : EXPORT_SYMBOL(adjust_managed_page_count);
    8508             : 
    8509           0 : unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
    8510             : {
    8511             :         void *pos;
    8512           0 :         unsigned long pages = 0;
    8513             : 
    8514           0 :         start = (void *)PAGE_ALIGN((unsigned long)start);
    8515           0 :         end = (void *)((unsigned long)end & PAGE_MASK);
    8516           0 :         for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
    8517           0 :                 struct page *page = virt_to_page(pos);
    8518             :                 void *direct_map_addr;
    8519             : 
    8520             :                 /*
    8521             :                  * 'direct_map_addr' might be different from 'pos'
    8522             :                  * because some architectures' virt_to_page()
    8523             :                  * work with aliases.  Getting the direct map
    8524             :                  * address ensures that we get a _writeable_
    8525             :                  * alias for the memset().
    8526             :                  */
    8527           0 :                 direct_map_addr = page_address(page);
    8528             :                 /*
    8529             :                  * Perform a kasan-unchecked memset() since this memory
    8530             :                  * has not been initialized.
    8531             :                  */
    8532           0 :                 direct_map_addr = kasan_reset_tag(direct_map_addr);
    8533           0 :                 if ((unsigned int)poison <= 0xFF)
    8534           0 :                         memset(direct_map_addr, poison, PAGE_SIZE);
    8535             : 
    8536           0 :                 free_reserved_page(page);
    8537             :         }
    8538             : 
    8539           0 :         if (pages && s)
    8540           0 :                 pr_info("Freeing %s memory: %ldK\n", s, K(pages));
    8541             : 
    8542           0 :         return pages;
    8543             : }
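One typical use of this helper is releasing the kernel's .init sections once boot has finished; a hedged sketch of such a call (the kernel's own wrapper for this is free_initmem_default() in include/linux/mm.h):

	/*
	 * Free everything between __init_begin and __init_end, poisoning it
	 * first; a negative poison value would skip the memset() because of
	 * the (unsigned int)poison <= 0xFF check above.
	 */
	free_reserved_area(&__init_begin, &__init_end, POISON_FREE_INITMEM,
			   "unused kernel image");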
    8544             : 
    8545           1 : void __init mem_init_print_info(void)
    8546             : {
    8547             :         unsigned long physpages, codesize, datasize, rosize, bss_size;
    8548             :         unsigned long init_code_size, init_data_size;
    8549             : 
    8550           1 :         physpages = get_num_physpages();
    8551           1 :         codesize = _etext - _stext;
    8552           1 :         datasize = _edata - _sdata;
    8553           1 :         rosize = __end_rodata - __start_rodata;
    8554           1 :         bss_size = __bss_stop - __bss_start;
    8555           1 :         init_data_size = __init_end - __init_begin;
    8556           1 :         init_code_size = _einittext - _sinittext;
    8557             : 
    8558             :         /*
    8559             :          * Detect special cases and adjust section sizes accordingly:
    8560             :          * 1) .init.* may be embedded into .data sections
    8561             :          * 2) .init.text.* may be out of [__init_begin, __init_end],
    8562             :          *    please refer to arch/tile/kernel/vmlinux.lds.S.
    8563             :          * 3) .rodata.* may be embedded into .text or .data sections.
    8564             :          */
    8565             : #define adj_init_size(start, end, size, pos, adj) \
    8566             :         do { \
    8567             :                 if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
    8568             :                         size -= adj; \
    8569             :         } while (0)
    8570             : 
    8571           1 :         adj_init_size(__init_begin, __init_end, init_data_size,
    8572             :                      _sinittext, init_code_size);
    8573           1 :         adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
    8574           1 :         adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
    8575           1 :         adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
    8576           1 :         adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
    8577             : 
    8578             : #undef  adj_init_size
    8579             : 
    8580           3 :         pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
    8581             : #ifdef  CONFIG_HIGHMEM
    8582             :                 ", %luK highmem"
    8583             : #endif
    8584             :                 ")\n",
    8585             :                 K(nr_free_pages()), K(physpages),
    8586             :                 codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
    8587             :                 (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
    8588             :                 K(physpages - totalram_pages() - totalcma_pages),
    8589             :                 K(totalcma_pages)
    8590             : #ifdef  CONFIG_HIGHMEM
    8591             :                 , K(totalhigh_pages())
    8592             : #endif
    8593             :                 );
    8594           1 : }
    8595             : 
    8596             : /**
    8597             :  * set_dma_reserve - set the specified number of pages reserved in the first zone
    8598             :  * @new_dma_reserve: The number of pages to mark reserved
    8599             :  *
    8600             :  * The per-cpu batchsize and zone watermarks are determined by managed_pages.
    8601             :  * In the DMA zone, a significant percentage may be consumed by kernel image
    8602             :  * In the DMA zone, a significant percentage may be consumed by the kernel image
    8603             :  * function may optionally be used to account for unfreeable pages in the
    8604             :  * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
    8605             :  * smaller per-cpu batchsize.
    8606             :  */
    8607           0 : void __init set_dma_reserve(unsigned long new_dma_reserve)
    8608             : {
    8609           0 :         dma_reserve = new_dma_reserve;
    8610           0 : }
    8611             : 
    8612           0 : static int page_alloc_cpu_dead(unsigned int cpu)
    8613             : {
    8614             :         struct zone *zone;
    8615             : 
    8616           0 :         lru_add_drain_cpu(cpu);
    8617           0 :         mlock_drain_remote(cpu);
    8618           0 :         drain_pages(cpu);
    8619             : 
    8620             :         /*
    8621             :          * Spill the event counters of the dead processor
    8622             :          * into the current processor's event counters.
    8623             :          * This artificially elevates the count of the current
    8624             :          * processor.
    8625             :          */
    8626           0 :         vm_events_fold_cpu(cpu);
    8627             : 
    8628             :         /*
    8629             :          * Zero the differential counters of the dead processor
    8630             :          * so that the vm statistics are consistent.
    8631             :          *
    8632             :          * This is only okay since the processor is dead and cannot
    8633             :          * race with what we are doing.
    8634             :          */
    8635           0 :         cpu_vm_stats_fold(cpu);
    8636             : 
    8637           0 :         for_each_populated_zone(zone)
    8638           0 :                 zone_pcp_update(zone, 0);
    8639             : 
    8640           0 :         return 0;
    8641             : }
    8642             : 
    8643           0 : static int page_alloc_cpu_online(unsigned int cpu)
    8644             : {
    8645             :         struct zone *zone;
    8646             : 
    8647           0 :         for_each_populated_zone(zone)
    8648           0 :                 zone_pcp_update(zone, 1);
    8649           0 :         return 0;
    8650             : }
    8651             : 
    8652             : #ifdef CONFIG_NUMA
    8653             : int hashdist = HASHDIST_DEFAULT;
    8654             : 
    8655             : static int __init set_hashdist(char *str)
    8656             : {
    8657             :         if (!str)
    8658             :                 return 0;
    8659             :         hashdist = simple_strtoul(str, &str, 0);
    8660             :         return 1;
    8661             : }
    8662             : __setup("hashdist=", set_hashdist);
    8663             : #endif
    8664             : 
    8665           1 : void __init page_alloc_init(void)
    8666             : {
    8667             :         int ret;
    8668             : 
    8669             : #ifdef CONFIG_NUMA
    8670             :         if (num_node_state(N_MEMORY) == 1)
    8671             :                 hashdist = 0;
    8672             : #endif
    8673             : 
    8674           1 :         ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
    8675             :                                         "mm/page_alloc:pcp",
    8676             :                                         page_alloc_cpu_online,
    8677             :                                         page_alloc_cpu_dead);
    8678           1 :         WARN_ON(ret < 0);
    8679           1 : }
    8680             : 
    8681             : /*
    8682             :  * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
    8683             :  *      or min_free_kbytes changes.
    8684             :  */
    8685           2 : static void calculate_totalreserve_pages(void)
    8686             : {
    8687             :         struct pglist_data *pgdat;
    8688           2 :         unsigned long reserve_pages = 0;
    8689             :         enum zone_type i, j;
    8690             : 
    8691           4 :         for_each_online_pgdat(pgdat) {
    8692             : 
    8693           2 :                 pgdat->totalreserve_pages = 0;
    8694             : 
    8695           6 :                 for (i = 0; i < MAX_NR_ZONES; i++) {
    8696           4 :                         struct zone *zone = pgdat->node_zones + i;
    8697           4 :                         long max = 0;
    8698           4 :                         unsigned long managed_pages = zone_managed_pages(zone);
    8699             : 
    8700             :                         /* Find valid and maximum lowmem_reserve in the zone */
    8701          10 :                         for (j = i; j < MAX_NR_ZONES; j++) {
    8702           6 :                                 if (zone->lowmem_reserve[j] > max)
    8703           0 :                                         max = zone->lowmem_reserve[j];
    8704             :                         }
    8705             : 
    8706             :                         /* we treat the high watermark as reserved pages. */
    8707           4 :                         max += high_wmark_pages(zone);
    8708             : 
    8709           4 :                         if (max > managed_pages)
    8710           0 :                                 max = managed_pages;
    8711             : 
    8712           4 :                         pgdat->totalreserve_pages += max;
    8713             : 
    8714           4 :                         reserve_pages += max;
    8715             :                 }
    8716             :         }
    8717           2 :         totalreserve_pages = reserve_pages;
    8718           2 : }
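
To make the loop above concrete with purely illustrative numbers: on a single
node with a DMA zone (3976 managed pages, largest lowmem_reserve[] entry 1984,
high watermark 50) and a Normal zone (500000 managed pages, no lowmem_reserve
entry above it, high watermark 6000), the per-zone contributions are
min(1984 + 50, 3976) = 2034 and min(0 + 6000, 500000) = 6000, so
totalreserve_pages ends up at 2034 + 6000 = 8034 pages.
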
    8719             : 
    8720             : /*
    8721             :  * setup_per_zone_lowmem_reserve - called whenever
    8722             :  *      sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
    8723             :  *      has a correct reserved-pages value, so an adequate number of
    8724             :  *      pages are left in the zone after a successful __alloc_pages().
    8725             :  */
    8726           1 : static void setup_per_zone_lowmem_reserve(void)
    8727             : {
    8728             :         struct pglist_data *pgdat;
    8729             :         enum zone_type i, j;
    8730             : 
    8731           2 :         for_each_online_pgdat(pgdat) {
    8732           2 :                 for (i = 0; i < MAX_NR_ZONES - 1; i++) {
    8733           1 :                         struct zone *zone = &pgdat->node_zones[i];
    8734           1 :                         int ratio = sysctl_lowmem_reserve_ratio[i];
    8735           2 :                         bool clear = !ratio || !zone_managed_pages(zone);
    8736           1 :                         unsigned long managed_pages = 0;
    8737             : 
    8738           2 :                         for (j = i + 1; j < MAX_NR_ZONES; j++) {
    8739           1 :                                 struct zone *upper_zone = &pgdat->node_zones[j];
    8740             : 
    8741           1 :                                 managed_pages += zone_managed_pages(upper_zone);
    8742             : 
    8743           1 :                                 if (clear)
    8744           0 :                                         zone->lowmem_reserve[j] = 0;
    8745             :                                 else
    8746           1 :                                         zone->lowmem_reserve[j] = managed_pages / ratio;
    8747             :                         }
    8748             :                 }
    8749             :         }
    8750             : 
    8751             :         /* update totalreserve_pages */
    8752           1 :         calculate_totalreserve_pages();
    8753           1 : }
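
The calculation above can be reproduced in a standalone model. The zone names,
managed-page counts and ratios below are made-up examples (the ratios mirror
the defaults used for sysctl_lowmem_reserve_ratio); only the
cumulative-sum-divided-by-ratio rule is taken from the code:

    /* Standalone model of the lowmem_reserve[] calculation. */
    #include <stdio.h>

    #define MAX_NR_ZONES 3

    int main(void)
    {
            const char *name[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal" };
            unsigned long managed[MAX_NR_ZONES] = { 3976, 483776, 12077115 }; /* pages, illustrative */
            int ratio[MAX_NR_ZONES] = { 256, 256, 32 };
            unsigned long reserve[MAX_NR_ZONES][MAX_NR_ZONES] = {{ 0 }};

            for (int i = 0; i < MAX_NR_ZONES - 1; i++) {
                    unsigned long upper = 0;

                    for (int j = i + 1; j < MAX_NR_ZONES; j++) {
                            upper += managed[j];    /* pages in all higher zones so far */
                            reserve[i][j] = ratio[i] ? upper / ratio[i] : 0;
                    }
            }

            for (int i = 0; i < MAX_NR_ZONES; i++) {
                    printf("%-6s protection:", name[i]);
                    for (int j = 0; j < MAX_NR_ZONES; j++)
                            printf(" %lu", reserve[i][j]);
                    printf("\n");
            }
            return 0;
    }

Each reserve[i][j] is the number of pages that must stay free in zone i before
an allocation that could have been satisfied from zone j is allowed to fall
back into zone i.
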
    8754             : 
    8755           1 : static void __setup_per_zone_wmarks(void)
    8756             : {
    8757           1 :         unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    8758           1 :         unsigned long lowmem_pages = 0;
    8759             :         struct zone *zone;
    8760             :         unsigned long flags;
    8761             : 
    8762             :         /* Calculate total number of !ZONE_HIGHMEM pages */
    8763           3 :         for_each_zone(zone) {
    8764           2 :                 if (!is_highmem(zone))
    8765           2 :                         lowmem_pages += zone_managed_pages(zone);
    8766             :         }
    8767             : 
    8768           3 :         for_each_zone(zone) {
    8769             :                 u64 tmp;
    8770             : 
    8771           2 :                 spin_lock_irqsave(&zone->lock, flags);
    8772           2 :                 tmp = (u64)pages_min * zone_managed_pages(zone);
    8773           2 :                 do_div(tmp, lowmem_pages);
    8774           2 :                 if (is_highmem(zone)) {
    8775             :                         /*
    8776             :                          * __GFP_HIGH and PF_MEMALLOC allocations usually don't
    8777             :                          * need highmem pages, so cap pages_min to a small
    8778             :                          * value here.
    8779             :                          *
    8780             :                          * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
    8781             :                          * deltas control async page reclaim, and so should
    8782             :                          * not be capped for highmem.
    8783             :                          */
    8784             :                         unsigned long min_pages;
    8785             : 
    8786             :                         min_pages = zone_managed_pages(zone) / 1024;
    8787             :                         min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
    8788             :                         zone->_watermark[WMARK_MIN] = min_pages;
    8789             :                 } else {
    8790             :                         /*
    8791             :                          * If it's a lowmem zone, reserve a number of pages
    8792             :                          * proportionate to the zone's size.
    8793             :                          */
    8794           2 :                         zone->_watermark[WMARK_MIN] = tmp;
    8795             :                 }
    8796             : 
    8797             :                 /*
    8798             :                  * Set the kswapd watermarks distance according to the
    8799             :                  * scale factor in proportion to available memory, but
    8800             :                  * ensure a minimum size on small systems.
    8801             :                  */
    8802           6 :                 tmp = max_t(u64, tmp >> 2,
    8803             :                             mult_frac(zone_managed_pages(zone),
    8804             :                                       watermark_scale_factor, 10000));
    8805             : 
    8806           2 :                 zone->watermark_boost = 0;
    8807           2 :                 zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
    8808           2 :                 zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
    8809           2 :                 zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
    8810             : 
    8811           4 :                 spin_unlock_irqrestore(&zone->lock, flags);
    8812             :         }
    8813             : 
    8814             :         /* update totalreserve_pages */
    8815           1 :         calculate_totalreserve_pages();
    8816           1 : }
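
A standalone sketch of the lowmem branch of the arithmetic above. The zone
sizes, min_free_kbytes and watermark_scale_factor values are assumptions used
only to show how WMARK_MIN, WMARK_LOW and WMARK_HIGH relate:

    /* Userspace model of the watermark arithmetic for one !highmem zone. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long page_shift = 12;              /* 4K pages */
            unsigned long min_free_kbytes = 67584;      /* example value */
            unsigned long watermark_scale_factor = 10;  /* default */
            unsigned long lowmem_pages = 16000000;      /* all !highmem managed pages */
            unsigned long zone_managed = 12000000;      /* this zone's managed pages */

            unsigned long pages_min = min_free_kbytes >> (page_shift - 10);
            unsigned long wmark_min =
                    (unsigned long long)pages_min * zone_managed / lowmem_pages;

            /* kswapd distance: max(min / 4, managed * scale_factor / 10000) */
            unsigned long long tmp = wmark_min >> 2;
            unsigned long long scaled =
                    (unsigned long long)zone_managed * watermark_scale_factor / 10000;
            if (scaled > tmp)
                    tmp = scaled;

            unsigned long wmark_low  = wmark_min + tmp;
            unsigned long wmark_high = wmark_low + tmp;

            printf("min %lu low %lu high %lu (pages)\n",
                   wmark_min, wmark_low, wmark_high);
            return 0;
    }

With these numbers the zone gets min = 12672, low = 24672 and high = 36672
pages, i.e. the low-to-min and high-to-low gaps are both the scale-factor term.
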
    8817             : 
    8818             : /**
    8819             :  * setup_per_zone_wmarks - called when min_free_kbytes changes
    8820             :  * or when memory is hot-{added|removed}
    8821             :  *
    8822             :  * Ensures that the watermark[min,low,high] values for each zone are set
    8823             :  * correctly with respect to min_free_kbytes.
    8824             :  */
    8825           1 : void setup_per_zone_wmarks(void)
    8826             : {
    8827             :         struct zone *zone;
    8828             :         static DEFINE_SPINLOCK(lock);
    8829             : 
    8830           1 :         spin_lock(&lock);
    8831           1 :         __setup_per_zone_wmarks();
    8832           1 :         spin_unlock(&lock);
    8833             : 
    8834             :         /*
    8835             :          * The watermark size has changed, so update the pcpu batch
    8836             :          * and high limits or the limits may be inappropriate.
    8837             :          */
    8838           3 :         for_each_zone(zone)
    8839           2 :                 zone_pcp_update(zone, 0);
    8840           1 : }
    8841             : 
    8842             : /*
    8843             :  * Initialise min_free_kbytes.
    8844             :  *
    8845             :  * For small machines we want it small (128k min).  For large machines
    8846             :  * we want it large (256MB max).  But it is not linear, because network
    8847             :  * bandwidth does not increase linearly with machine size.  We use
    8848             :  *
    8849             :  *      min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
    8850             :  *      min_free_kbytes = sqrt(lowmem_kbytes * 16)
    8851             :  *
    8852             :  * which yields
    8853             :  *
    8854             :  * 16MB:        512k
    8855             :  * 32MB:        724k
    8856             :  * 64MB:        1024k
    8857             :  * 128MB:       1448k
    8858             :  * 256MB:       2048k
    8859             :  * 512MB:       2896k
    8860             :  * 1024MB:      4096k
    8861             :  * 2048MB:      5792k
    8862             :  * 4096MB:      8192k
    8863             :  * 8192MB:      11584k
    8864             :  * 16384MB:     16384k
    8865             :  */
    8866           1 : void calculate_min_free_kbytes(void)
    8867             : {
    8868             :         unsigned long lowmem_kbytes;
    8869             :         int new_min_free_kbytes;
    8870             : 
    8871           1 :         lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
    8872           1 :         new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
    8873             : 
    8874           1 :         if (new_min_free_kbytes > user_min_free_kbytes)
    8875           1 :                 min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
    8876             :         else
    8877           0 :                 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
    8878             :                                 new_min_free_kbytes, user_min_free_kbytes);
    8879             : 
    8880           1 : }
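
The table in the comment above can be checked with the same
sqrt(lowmem_kbytes * 16) rule; the snippet below is a userspace sketch that
assumes lowmem equals total RAM and uses a simple integer square root in place
of the kernel's int_sqrt(). It reproduces the table to within the comment's
rounding (8192MB comes out as 11585k rather than 11584k):

    /* Userspace check of the min_free_kbytes rule of thumb. */
    #include <stdio.h>

    static unsigned long isqrt(unsigned long long x)
    {
            unsigned long long r = 0;

            while ((r + 1) * (r + 1) <= x)
                    r++;
            return (unsigned long)r;
    }

    int main(void)
    {
            unsigned long mb[] = { 16, 32, 64, 128, 256, 512, 1024,
                                   2048, 4096, 8192, 16384 };

            for (unsigned int i = 0; i < sizeof(mb) / sizeof(mb[0]); i++) {
                    unsigned long lowmem_kbytes = mb[i] * 1024;
                    unsigned long min_free =
                            isqrt((unsigned long long)lowmem_kbytes * 16);

                    if (min_free < 128)
                            min_free = 128;
                    if (min_free > 262144)
                            min_free = 262144;
                    printf("%6luMB: %6luk\n", mb[i], min_free);
            }
            return 0;
    }
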
    8881             : 
    8882           1 : int __meminit init_per_zone_wmark_min(void)
    8883             : {
    8884           1 :         calculate_min_free_kbytes();
    8885           1 :         setup_per_zone_wmarks();
    8886             :         refresh_zone_stat_thresholds();
    8887           1 :         setup_per_zone_lowmem_reserve();
    8888             : 
    8889             : #ifdef CONFIG_NUMA
    8890             :         setup_min_unmapped_ratio();
    8891             :         setup_min_slab_ratio();
    8892             : #endif
    8893             : 
    8894             :         khugepaged_min_free_kbytes_update();
    8895             : 
    8896           1 :         return 0;
    8897             : }
    8898             : postcore_initcall(init_per_zone_wmark_min)
    8899             : 
    8900             : /*
    8901             :  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
    8902             :  *      that we can call two helper functions whenever min_free_kbytes
    8903             :  *      changes.
    8904             :  */
    8905           0 : int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
    8906             :                 void *buffer, size_t *length, loff_t *ppos)
    8907             : {
    8908             :         int rc;
    8909             : 
    8910           0 :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    8911           0 :         if (rc)
    8912             :                 return rc;
    8913             : 
    8914           0 :         if (write) {
    8915           0 :                 user_min_free_kbytes = min_free_kbytes;
    8916           0 :                 setup_per_zone_wmarks();
    8917             :         }
    8918             :         return 0;
    8919             : }
    8920             : 
    8921           0 : int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
    8922             :                 void *buffer, size_t *length, loff_t *ppos)
    8923             : {
    8924             :         int rc;
    8925             : 
    8926           0 :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    8927           0 :         if (rc)
    8928             :                 return rc;
    8929             : 
    8930           0 :         if (write)
    8931           0 :                 setup_per_zone_wmarks();
    8932             : 
    8933             :         return 0;
    8934             : }
    8935             : 
    8936             : #ifdef CONFIG_NUMA
    8937             : static void setup_min_unmapped_ratio(void)
    8938             : {
    8939             :         pg_data_t *pgdat;
    8940             :         struct zone *zone;
    8941             : 
    8942             :         for_each_online_pgdat(pgdat)
    8943             :                 pgdat->min_unmapped_pages = 0;
    8944             : 
    8945             :         for_each_zone(zone)
    8946             :                 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
    8947             :                                                          sysctl_min_unmapped_ratio) / 100;
    8948             : }
    8949             : 
    8950             : 
    8951             : int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
    8952             :                 void *buffer, size_t *length, loff_t *ppos)
    8953             : {
    8954             :         int rc;
    8955             : 
    8956             :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    8957             :         if (rc)
    8958             :                 return rc;
    8959             : 
    8960             :         setup_min_unmapped_ratio();
    8961             : 
    8962             :         return 0;
    8963             : }
    8964             : 
    8965             : static void setup_min_slab_ratio(void)
    8966             : {
    8967             :         pg_data_t *pgdat;
    8968             :         struct zone *zone;
    8969             : 
    8970             :         for_each_online_pgdat(pgdat)
    8971             :                 pgdat->min_slab_pages = 0;
    8972             : 
    8973             :         for_each_zone(zone)
    8974             :                 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
    8975             :                                                      sysctl_min_slab_ratio) / 100;
    8976             : }
    8977             : 
    8978             : int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
    8979             :                 void *buffer, size_t *length, loff_t *ppos)
    8980             : {
    8981             :         int rc;
    8982             : 
    8983             :         rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
    8984             :         if (rc)
    8985             :                 return rc;
    8986             : 
    8987             :         setup_min_slab_ratio();
    8988             : 
    8989             :         return 0;
    8990             : }
    8991             : #endif
    8992             : 
    8993             : /*
    8994             :  * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
    8995             :  *      proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
    8996             :  *      whenever sysctl_lowmem_reserve_ratio changes.
    8997             :  *
    8998             :  * The reserve ratio has no relation to the minimum watermarks. The
    8999             :  * lowmem reserve ratio only makes sense as a function of the boot-time
    9000             :  * zone sizes.
    9001             :  */
    9002           0 : int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
    9003             :                 void *buffer, size_t *length, loff_t *ppos)
    9004             : {
    9005             :         int i;
    9006             : 
    9007           0 :         proc_dointvec_minmax(table, write, buffer, length, ppos);
    9008             : 
    9009           0 :         for (i = 0; i < MAX_NR_ZONES; i++) {
    9010           0 :                 if (sysctl_lowmem_reserve_ratio[i] < 1)
    9011           0 :                         sysctl_lowmem_reserve_ratio[i] = 0;
    9012             :         }
    9013             : 
    9014           0 :         setup_per_zone_lowmem_reserve();
    9015           0 :         return 0;
    9016             : }
    9017             : 
    9018             : /*
    9019             :  * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
    9020             :  * cpu. It is the fraction of total pages in each zone that a hot per cpu
    9021             :  * pagelist can have before it gets flushed back to the buddy allocator.
    9022             :  */
    9023           0 : int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
    9024             :                 int write, void *buffer, size_t *length, loff_t *ppos)
    9025             : {
    9026             :         struct zone *zone;
    9027             :         int old_percpu_pagelist_high_fraction;
    9028             :         int ret;
    9029             : 
    9030           0 :         mutex_lock(&pcp_batch_high_lock);
    9031           0 :         old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
    9032             : 
    9033           0 :         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
    9034           0 :         if (!write || ret < 0)
    9035             :                 goto out;
    9036             : 
    9037             :         /* Sanity checking to avoid pcp imbalance */
    9038           0 :         if (percpu_pagelist_high_fraction &&
    9039             :             percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
    9040           0 :                 percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
    9041           0 :                 ret = -EINVAL;
    9042           0 :                 goto out;
    9043             :         }
    9044             : 
    9045             :         /* No change? */
    9046           0 :         if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
    9047             :                 goto out;
    9048             : 
    9049           0 :         for_each_populated_zone(zone)
    9050           0 :                 zone_set_pageset_high_and_batch(zone, 0);
    9051             : out:
    9052           0 :         mutex_unlock(&pcp_batch_high_lock);
    9053           0 :         return ret;
    9054             : }
    9055             : 
    9056             : #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
    9057             : /*
    9058             :  * Returns the number of pages that the arch has reserved but
    9059             :  * that is not known to alloc_large_system_hash().
    9060             :  */
    9061             : static unsigned long __init arch_reserved_kernel_pages(void)
    9062             : {
    9063             :         return 0;
    9064             : }
    9065             : #endif
    9066             : 
    9067             : /*
    9068             :  * Adaptive scale is meant to reduce sizes of hash tables on large memory
    9069             :  * machines. As memory size is increased the scale is also increased, but at a
    9070             :  * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
    9071             :  * quadruples the scale is increased by one, which means the size of the hash
    9072             :  * table only doubles, instead of quadrupling as well.
    9073             :  * Because 32-bit systems cannot have the large physical memory where this
    9074             :  * scaling makes sense, it is disabled on such platforms.
    9075             :  */
    9076             : #if __BITS_PER_LONG > 32
    9077             : #define ADAPT_SCALE_BASE        (64ul << 30)
    9078             : #define ADAPT_SCALE_SHIFT       2
    9079             : #define ADAPT_SCALE_NPAGES      (ADAPT_SCALE_BASE >> PAGE_SHIFT)
    9080             : #endif
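
A small standalone illustration of that rule. The base scale (one bucket per
128K of memory) and the memory sizes are arbitrary; the loop that bumps the
scale is the same as in alloc_large_system_hash() below, and the output shows
the entry count merely doubling each time memory quadruples:

    /* Userspace model of the ADAPT_SCALE adjustment. */
    #include <stdio.h>

    #define PAGE_SHIFT          12
    #define ADAPT_SCALE_BASE    (64ull << 30)
    #define ADAPT_SCALE_SHIFT   2
    #define ADAPT_SCALE_NPAGES  (ADAPT_SCALE_BASE >> PAGE_SHIFT)

    int main(void)
    {
            unsigned long long gib[] = { 64, 256, 1024, 4096 };
            int base_scale = 17;    /* 1 bucket per 128K of memory, for example */

            for (unsigned int i = 0; i < sizeof(gib) / sizeof(gib[0]); i++) {
                    unsigned long long npages = (gib[i] << 30) >> PAGE_SHIFT;
                    int scale = base_scale;

                    for (unsigned long long adapt = ADAPT_SCALE_NPAGES;
                         adapt < npages; adapt <<= ADAPT_SCALE_SHIFT)
                            scale++;

                    /* entries as below: pages >> (scale - PAGE_SHIFT) */
                    printf("%5lluG: scale %d, ~%llu entries\n", gib[i], scale,
                           npages >> (scale - PAGE_SHIFT));
            }
            return 0;
    }
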
    9081             : 
    9082             : /*
    9083             :  * allocate a large system hash table from bootmem
    9084             :  * - it is assumed that the hash table must contain an exact power-of-2
    9085             :  *   quantity of entries
    9086             :  * - limit is the number of hash buckets, not the total allocation size
    9087             :  */
    9088           5 : void *__init alloc_large_system_hash(const char *tablename,
    9089             :                                      unsigned long bucketsize,
    9090             :                                      unsigned long numentries,
    9091             :                                      int scale,
    9092             :                                      int flags,
    9093             :                                      unsigned int *_hash_shift,
    9094             :                                      unsigned int *_hash_mask,
    9095             :                                      unsigned long low_limit,
    9096             :                                      unsigned long high_limit)
    9097             : {
    9098           5 :         unsigned long long max = high_limit;
    9099             :         unsigned long log2qty, size;
    9100             :         void *table;
    9101             :         gfp_t gfp_flags;
    9102             :         bool virt;
    9103             :         bool huge;
    9104             : 
    9105             :         /* allow the kernel cmdline to have a say */
    9106           5 :         if (!numentries) {
    9107             :                 /* round applicable memory size up to nearest megabyte */
    9108           4 :                 numentries = nr_kernel_pages;
    9109           4 :                 numentries -= arch_reserved_kernel_pages();
    9110             : 
    9111             :                 /* It isn't necessary when PAGE_SIZE >= 1MB */
    9112             :                 if (PAGE_SIZE < SZ_1M)
    9113           4 :                         numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
    9114             : 
    9115             : #if __BITS_PER_LONG > 32
    9116           4 :                 if (!high_limit) {
    9117             :                         unsigned long adapt;
    9118             : 
    9119           4 :                         for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
    9120           0 :                              adapt <<= ADAPT_SCALE_SHIFT)
    9121           0 :                                 scale++;
    9122             :                 }
    9123             : #endif
    9124             : 
    9125             :                 /* limit to 1 bucket per 2^scale bytes of low memory */
    9126           4 :                 if (scale > PAGE_SHIFT)
    9127           4 :                         numentries >>= (scale - PAGE_SHIFT);
    9128             :                 else
    9129           0 :                         numentries <<= (PAGE_SHIFT - scale);
    9130             : 
    9131             :                 /* Make sure we've got at least a 0-order allocation.. */
    9132           4 :                 if (unlikely(flags & HASH_SMALL)) {
    9133             :                         /* Makes no sense without HASH_EARLY */
    9134           0 :                         WARN_ON(!(flags & HASH_EARLY));
    9135           0 :                         if (!(numentries >> *_hash_shift)) {
    9136           0 :                                 numentries = 1UL << *_hash_shift;
    9137           0 :                                 BUG_ON(!numentries);
    9138             :                         }
    9139           4 :                 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
    9140           0 :                         numentries = PAGE_SIZE / bucketsize;
    9141             :         }
    9142          10 :         numentries = roundup_pow_of_two(numentries);
    9143             : 
    9144             :         /* limit allocation size to 1/16 total memory by default */
    9145           5 :         if (max == 0) {
    9146           4 :                 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
    9147           4 :                 do_div(max, bucketsize);
    9148             :         }
    9149           5 :         max = min(max, 0x80000000ULL);
    9150             : 
    9151           5 :         if (numentries < low_limit)
    9152           0 :                 numentries = low_limit;
    9153           5 :         if (numentries > max)
    9154           0 :                 numentries = max;
    9155             : 
    9156          10 :         log2qty = ilog2(numentries);
    9157             : 
    9158           5 :         gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
    9159             :         do {
    9160           5 :                 virt = false;
    9161           5 :                 size = bucketsize << log2qty;
    9162           5 :                 if (flags & HASH_EARLY) {
    9163           2 :                         if (flags & HASH_ZERO)
    9164           2 :                                 table = memblock_alloc(size, SMP_CACHE_BYTES);
    9165             :                         else
    9166           0 :                                 table = memblock_alloc_raw(size,
    9167             :                                                            SMP_CACHE_BYTES);
    9168           3 :                 } else if (get_order(size) >= MAX_ORDER || hashdist) {
    9169           0 :                         table = vmalloc_huge(size, gfp_flags);
    9170           0 :                         virt = true;
    9171             :                         if (table)
    9172             :                                 huge = is_vm_area_hugepages(table);
    9173             :                 } else {
    9174             :                         /*
    9175             :                          * If bucketsize is not a power of two, we may free
    9176             :                          * some pages at the end of the hash table, which
    9177             :                          * alloc_pages_exact() does automatically.
    9178             :                          */
    9179           3 :                         table = alloc_pages_exact(size, gfp_flags);
    9180           3 :                         kmemleak_alloc(table, size, 1, gfp_flags);
    9181             :                 }
    9182           5 :         } while (!table && size > PAGE_SIZE && --log2qty);
    9183             : 
    9184           5 :         if (!table)
    9185           0 :                 panic("Failed to allocate %s hash table\n", tablename);
    9186             : 
    9187          10 :         pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
    9188             :                 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
    9189             :                 virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
    9190             : 
    9191           5 :         if (_hash_shift)
    9192           5 :                 *_hash_shift = log2qty;
    9193           5 :         if (_hash_mask)
    9194           3 :                 *_hash_mask = (1 << log2qty) - 1;
    9195             : 
    9196           5 :         return table;
    9197             : }
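
A hedged sketch of a typical boot-time caller. The table name, bucket type,
scale and the example_* identifiers are illustrative assumptions; only the
calling convention matches the prototype above (numentries = 0 lets the helper
size the table from available memory, and HASH_ZERO asks for zeroed buckets):

    /* Illustrative boot-time hash table setup. */
    #include <linux/init.h>
    #include <linux/list.h>
    #include <linux/memblock.h>

    static struct hlist_head *example_hashtable;
    static unsigned int example_hash_shift;
    static unsigned int example_hash_mask;

    static void __init example_hash_init(void)
    {
            example_hashtable =
                    alloc_large_system_hash("Example-cache",
                                            sizeof(struct hlist_head),
                                            0,          /* size from memory */
                                            14,         /* 1 bucket per 16KB */
                                            HASH_ZERO,
                                            &example_hash_shift,
                                            &example_hash_mask,
                                            0, 0);      /* no explicit limits */
    }
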
    9198             : 
    9199             : #ifdef CONFIG_CONTIG_ALLOC
    9200             : #if defined(CONFIG_DYNAMIC_DEBUG) || \
    9201             :         (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
    9202             : /* Usage: See admin-guide/dynamic-debug-howto.rst */
    9203             : static void alloc_contig_dump_pages(struct list_head *page_list)
    9204             : {
    9205             :         DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
    9206             : 
    9207             :         if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
    9208             :                 struct page *page;
    9209             : 
    9210             :                 dump_stack();
    9211             :                 list_for_each_entry(page, page_list, lru)
    9212             :                         dump_page(page, "migration failure");
    9213             :         }
    9214             : }
    9215             : #else
    9216             : static inline void alloc_contig_dump_pages(struct list_head *page_list)
    9217             : {
    9218             : }
    9219             : #endif
    9220             : 
    9221             : /* [start, end) must belong to a single zone. */
    9222             : int __alloc_contig_migrate_range(struct compact_control *cc,
    9223             :                                         unsigned long start, unsigned long end)
    9224             : {
    9225             :         /* This function is based on compact_zone() from compaction.c. */
    9226             :         unsigned int nr_reclaimed;
    9227             :         unsigned long pfn = start;
    9228             :         unsigned int tries = 0;
    9229             :         int ret = 0;
    9230             :         struct migration_target_control mtc = {
    9231             :                 .nid = zone_to_nid(cc->zone),
    9232             :                 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
    9233             :         };
    9234             : 
    9235             :         lru_cache_disable();
    9236             : 
    9237             :         while (pfn < end || !list_empty(&cc->migratepages)) {
    9238             :                 if (fatal_signal_pending(current)) {
    9239             :                         ret = -EINTR;
    9240             :                         break;
    9241             :                 }
    9242             : 
    9243             :                 if (list_empty(&cc->migratepages)) {
    9244             :                         cc->nr_migratepages = 0;
    9245             :                         ret = isolate_migratepages_range(cc, pfn, end);
    9246             :                         if (ret && ret != -EAGAIN)
    9247             :                                 break;
    9248             :                         pfn = cc->migrate_pfn;
    9249             :                         tries = 0;
    9250             :                 } else if (++tries == 5) {
    9251             :                         ret = -EBUSY;
    9252             :                         break;
    9253             :                 }
    9254             : 
    9255             :                 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
    9256             :                                                         &cc->migratepages);
    9257             :                 cc->nr_migratepages -= nr_reclaimed;
    9258             : 
    9259             :                 ret = migrate_pages(&cc->migratepages, alloc_migration_target,
    9260             :                         NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
    9261             : 
    9262             :                 /*
    9263             :                  * On -ENOMEM, migrate_pages() bails out right away. It is pointless
    9264             :                  * to retry again over this error, so do the same here.
    9265             :                  */
    9266             :                 if (ret == -ENOMEM)
    9267             :                         break;
    9268             :         }
    9269             : 
    9270             :         lru_cache_enable();
    9271             :         if (ret < 0) {
    9272             :                 if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
    9273             :                         alloc_contig_dump_pages(&cc->migratepages);
    9274             :                 putback_movable_pages(&cc->migratepages);
    9275             :                 return ret;
    9276             :         }
    9277             :         return 0;
    9278             : }
    9279             : 
    9280             : /**
    9281             :  * alloc_contig_range() -- tries to allocate given range of pages
    9282             :  * @start:      start PFN to allocate
    9283             :  * @end:        one-past-the-last PFN to allocate
    9284             :  * @migratetype:        migratetype of the underlying pageblocks (either
    9285             :  *                      #MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
    9286             :  *                      in range must have the same migratetype and it must
    9287             :  *                      be either of the two.
    9288             :  * @gfp_mask:   GFP mask to use during compaction
    9289             :  *
    9290             :  * The PFN range does not have to be pageblock aligned. The PFN range must
    9291             :  * belong to a single zone.
    9292             :  *
    9293             :  * The first thing this routine does is attempt to MIGRATE_ISOLATE all
    9294             :  * pageblocks in the range.  Once isolated, the pageblocks should not
    9295             :  * be modified by others.
    9296             :  *
    9297             :  * Return: zero on success or negative error code.  On success all
    9298             :  * pages which PFN is in [start, end) are allocated for the caller and
    9299             :  * need to be freed with free_contig_range().
    9300             :  */
    9301             : int alloc_contig_range(unsigned long start, unsigned long end,
    9302             :                        unsigned migratetype, gfp_t gfp_mask)
    9303             : {
    9304             :         unsigned long outer_start, outer_end;
    9305             :         int order;
    9306             :         int ret = 0;
    9307             : 
    9308             :         struct compact_control cc = {
    9309             :                 .nr_migratepages = 0,
    9310             :                 .order = -1,
    9311             :                 .zone = page_zone(pfn_to_page(start)),
    9312             :                 .mode = MIGRATE_SYNC,
    9313             :                 .ignore_skip_hint = true,
    9314             :                 .no_set_skip_hint = true,
    9315             :                 .gfp_mask = current_gfp_context(gfp_mask),
    9316             :                 .alloc_contig = true,
    9317             :         };
    9318             :         INIT_LIST_HEAD(&cc.migratepages);
    9319             : 
    9320             :         /*
    9321             :          * What we do here is we mark all pageblocks in range as
    9322             :          * MIGRATE_ISOLATE.  Because pageblock and max order pages may
    9323             :          * have different sizes, and due to the way the page allocator
    9324             :          * works, start_isolate_page_range() has special handling for this.
    9325             :          *
    9326             :          * Once the pageblocks are marked as MIGRATE_ISOLATE, we
    9327             :          * migrate the pages from an unaligned range (ie. pages that
    9328             :          * we are interested in). This will put all the pages in
    9329             :          * range back to page allocator as MIGRATE_ISOLATE.
    9330             :          *
    9331             :          * When this is done, we take the pages in range from page
    9332             :          * allocator removing them from the buddy system.  This way
    9333             :          * page allocator will never consider using them.
    9334             :          *
    9335             :          * This lets us mark the pageblocks back as
    9336             :          * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
    9337             :          * aligned range but not in the unaligned, original range are
    9338             :          * put back to page allocator so that buddy can use them.
    9339             :          */
    9340             : 
    9341             :         ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
    9342             :         if (ret)
    9343             :                 goto done;
    9344             : 
    9345             :         drain_all_pages(cc.zone);
    9346             : 
    9347             :         /*
    9348             :          * In case of -EBUSY, we'd like to know which page causes the problem.
    9349             :          * So, just fall through. test_pages_isolated() has a tracepoint
    9350             :          * which will report the busy page.
    9351             :          *
    9352             :          * It is possible that busy pages could become available before
    9353             :          * the call to test_pages_isolated, and the range will actually be
    9354             :          * allocated.  So, if we fall through be sure to clear ret so that
    9355             :          * -EBUSY is not accidentally used or returned to caller.
    9356             :          */
    9357             :         ret = __alloc_contig_migrate_range(&cc, start, end);
    9358             :         if (ret && ret != -EBUSY)
    9359             :                 goto done;
    9360             :         ret = 0;
    9361             : 
    9362             :         /*
    9363             :          * Pages from [start, end) are within a pageblock_nr_pages
    9364             :          * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
    9365             :          * more, all pages in [start, end) are free in page allocator.
    9366             :          * What we are going to do is to allocate all pages from
    9367             :          * [start, end) (that is remove them from page allocator).
    9368             :          *
    9369             :          * The only problem is that pages at the beginning and at the
    9370             :          * end of the interesting range may not be aligned with pages that
    9371             :          * page allocator holds, ie. they can be part of higher order
    9372             :          * pages.  Because of this, we reserve the bigger range and
    9373             :          * once this is done free the pages we are not interested in.
    9374             :          *
    9375             :          * We don't have to hold zone->lock here because the pages are
    9376             :          * isolated thus they won't get removed from buddy.
    9377             :          */
    9378             : 
    9379             :         order = 0;
    9380             :         outer_start = start;
    9381             :         while (!PageBuddy(pfn_to_page(outer_start))) {
    9382             :                 if (++order >= MAX_ORDER) {
    9383             :                         outer_start = start;
    9384             :                         break;
    9385             :                 }
    9386             :                 outer_start &= ~0UL << order;
    9387             :         }
    9388             : 
    9389             :         if (outer_start != start) {
    9390             :                 order = buddy_order(pfn_to_page(outer_start));
    9391             : 
    9392             :                 /*
    9393             :                  * outer_start page could be small order buddy page and
    9394             :                  * it doesn't include start page. Adjust outer_start
    9395             :                  * in this case to report failed page properly
    9396             :                  * on tracepoint in test_pages_isolated()
    9397             :                  */
    9398             :                 if (outer_start + (1UL << order) <= start)
    9399             :                         outer_start = start;
    9400             :         }
    9401             : 
    9402             :         /* Make sure the range is really isolated. */
    9403             :         if (test_pages_isolated(outer_start, end, 0)) {
    9404             :                 ret = -EBUSY;
    9405             :                 goto done;
    9406             :         }
    9407             : 
    9408             :         /* Grab isolated pages from freelists. */
    9409             :         outer_end = isolate_freepages_range(&cc, outer_start, end);
    9410             :         if (!outer_end) {
    9411             :                 ret = -EBUSY;
    9412             :                 goto done;
    9413             :         }
    9414             : 
    9415             :         /* Free head and tail (if any) */
    9416             :         if (start != outer_start)
    9417             :                 free_contig_range(outer_start, start - outer_start);
    9418             :         if (end != outer_end)
    9419             :                 free_contig_range(end, outer_end - end);
    9420             : 
    9421             : done:
    9422             :         undo_isolate_page_range(start, end, migratetype);
    9423             :         return ret;
    9424             : }
    9425             : EXPORT_SYMBOL(alloc_contig_range);
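
A minimal sketch of a caller, assuming it already knows a PFN range that lies
in one zone and in MIGRATE_MOVABLE (or MIGRATE_CMA) pageblocks; the example_*
helpers and the GFP policy are illustrative, only the API calls come from the
code above:

    #include <linux/gfp.h>
    #include <linux/mm.h>

    static struct page *example_grab_range(unsigned long base_pfn,
                                           unsigned long nr_pages)
    {
            int ret;

            ret = alloc_contig_range(base_pfn, base_pfn + nr_pages,
                                     MIGRATE_MOVABLE,
                                     GFP_KERNEL | __GFP_NOWARN);
            if (ret)
                    return NULL;

            return pfn_to_page(base_pfn);
    }

    static void example_release_range(unsigned long base_pfn,
                                      unsigned long nr_pages)
    {
            free_contig_range(base_pfn, nr_pages);
    }
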
    9426             : 
    9427             : static int __alloc_contig_pages(unsigned long start_pfn,
    9428             :                                 unsigned long nr_pages, gfp_t gfp_mask)
    9429             : {
    9430             :         unsigned long end_pfn = start_pfn + nr_pages;
    9431             : 
    9432             :         return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
    9433             :                                   gfp_mask);
    9434             : }
    9435             : 
    9436             : static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
    9437             :                                    unsigned long nr_pages)
    9438             : {
    9439             :         unsigned long i, end_pfn = start_pfn + nr_pages;
    9440             :         struct page *page;
    9441             : 
    9442             :         for (i = start_pfn; i < end_pfn; i++) {
    9443             :                 page = pfn_to_online_page(i);
    9444             :                 if (!page)
    9445             :                         return false;
    9446             : 
    9447             :                 if (page_zone(page) != z)
    9448             :                         return false;
    9449             : 
    9450             :                 if (PageReserved(page))
    9451             :                         return false;
    9452             :         }
    9453             :         return true;
    9454             : }
    9455             : 
    9456             : static bool zone_spans_last_pfn(const struct zone *zone,
    9457             :                                 unsigned long start_pfn, unsigned long nr_pages)
    9458             : {
    9459             :         unsigned long last_pfn = start_pfn + nr_pages - 1;
    9460             : 
    9461             :         return zone_spans_pfn(zone, last_pfn);
    9462             : }
    9463             : 
    9464             : /**
    9465             :  * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
    9466             :  * @nr_pages:   Number of contiguous pages to allocate
    9467             :  * @gfp_mask:   GFP mask to limit search and used during compaction
    9468             :  * @nid:        Target node
    9469             :  * @nodemask:   Mask for other possible nodes
    9470             :  *
    9471             :  * This routine is a wrapper around alloc_contig_range(). It scans over zones
    9472             :  * on an applicable zonelist to find a contiguous pfn range which can then be
    9473             :  * tried for allocation with alloc_contig_range(). This routine is intended
    9474             :  * for allocation requests which cannot be fulfilled with the buddy allocator.
    9475             :  *
    9476             :  * The allocated memory is always aligned to a page boundary. If nr_pages is a
    9477             :  * power of two, then the allocated range is also guaranteed to be aligned to
    9478             :  * the same nr_pages (e.g. a 1GB request would be aligned to 1GB).
    9479             :  *
    9480             :  * Allocated pages can be freed with free_contig_range() or by manually calling
    9481             :  * __free_page() on each allocated page.
    9482             :  *
    9483             :  * Return: pointer to contiguous pages on success, or NULL if not successful.
    9484             :  */
    9485             : struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
    9486             :                                 int nid, nodemask_t *nodemask)
    9487             : {
    9488             :         unsigned long ret, pfn, flags;
    9489             :         struct zonelist *zonelist;
    9490             :         struct zone *zone;
    9491             :         struct zoneref *z;
    9492             : 
    9493             :         zonelist = node_zonelist(nid, gfp_mask);
    9494             :         for_each_zone_zonelist_nodemask(zone, z, zonelist,
    9495             :                                         gfp_zone(gfp_mask), nodemask) {
    9496             :                 spin_lock_irqsave(&zone->lock, flags);
    9497             : 
    9498             :                 pfn = ALIGN(zone->zone_start_pfn, nr_pages);
    9499             :                 while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
    9500             :                         if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
    9501             :                                 /*
    9502             :                                  * We release the zone lock here because
    9503             :                                  * alloc_contig_range() will also lock the zone
    9504             :                                  * at some point. If there's an allocation
    9505             :                                  * spinning on this lock, it may win the race
    9506             :                                  * and cause alloc_contig_range() to fail...
    9507             :                                  */
    9508             :                                 spin_unlock_irqrestore(&zone->lock, flags);
    9509             :                                 ret = __alloc_contig_pages(pfn, nr_pages,
    9510             :                                                         gfp_mask);
    9511             :                                 if (!ret)
    9512             :                                         return pfn_to_page(pfn);
    9513             :                                 spin_lock_irqsave(&zone->lock, flags);
    9514             :                         }
    9515             :                         pfn += nr_pages;
    9516             :                 }
    9517             :                 spin_unlock_irqrestore(&zone->lock, flags);
    9518             :         }
    9519             :         return NULL;
    9520             : }
    9521             : #endif /* CONFIG_CONTIG_ALLOC */
    9522             : 
    9523           0 : void free_contig_range(unsigned long pfn, unsigned long nr_pages)
    9524             : {
    9525           0 :         unsigned long count = 0;
    9526             : 
    9527           0 :         for (; nr_pages--; pfn++) {
    9528           0 :                 struct page *page = pfn_to_page(pfn);
    9529             : 
    9530           0 :                 count += page_count(page) != 1;
    9531           0 :                 __free_page(page);
    9532             :         }
    9533           0 :         WARN(count != 0, "%lu pages are still in use!\n", count);
    9534           0 : }
    9535             : EXPORT_SYMBOL(free_contig_range);
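
For completeness, a sketch of using the higher-level wrapper above, which
searches a node for a suitable range instead of taking explicit PFNs and is
only available under CONFIG_CONTIG_ALLOC. The 1GB size, the node id and the
example_* name are illustrative assumptions:

    #include <linux/errno.h>
    #include <linux/gfp.h>
    #include <linux/mm.h>
    #include <linux/sizes.h>

    static int example_use_1g(void)
    {
            unsigned long nr_pages = SZ_1G >> PAGE_SHIFT;
            struct page *page;

            page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_NOWARN,
                                      0 /* nid */, NULL /* no nodemask */);
            if (!page)
                    return -ENOMEM;

            /* ... use pages [page, page + nr_pages) ... */

            free_contig_range(page_to_pfn(page), nr_pages);
            return 0;
    }
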
    9536             : 
    9537             : /*
    9538             :  * Effectively disable pcplists for the zone by setting the high limit to 0
    9539             :  * and draining all cpus. A concurrent page freeing on another CPU that's about
    9540             :  * to put the page on pcplist will either finish before the drain and the page
    9541             :  * will be drained, or observe the new high limit and skip the pcplist.
    9542             :  *
    9543             :  * Must be paired with a call to zone_pcp_enable().
    9544             :  */
    9545           0 : void zone_pcp_disable(struct zone *zone)
    9546             : {
    9547           0 :         mutex_lock(&pcp_batch_high_lock);
    9548           0 :         __zone_set_pageset_high_and_batch(zone, 0, 1);
    9549           0 :         __drain_all_pages(zone, true);
    9550           0 : }
    9551             : 
    9552           0 : void zone_pcp_enable(struct zone *zone)
    9553             : {
    9554           0 :         __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
    9555           0 :         mutex_unlock(&pcp_batch_high_lock);
    9556           0 : }
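
The pairing these two functions require looks like this in a caller (a sketch;
zone_pcp_disable()/zone_pcp_enable() are mm-internal, so the include and the
example_* wrapper are assumptions):

    #include <linux/mmzone.h>
    #include "internal.h"   /* mm-internal declarations, assumed */

    static void example_quiesce_zone(struct zone *zone)
    {
            zone_pcp_disable(zone);
            /* pcplists are now drained and capped at high = 0 ... */
            zone_pcp_enable(zone);
    }
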
    9557             : 
    9558           0 : void zone_pcp_reset(struct zone *zone)
    9559             : {
    9560             :         int cpu;
    9561             :         struct per_cpu_zonestat *pzstats;
    9562             : 
    9563           0 :         if (zone->per_cpu_pageset != &boot_pageset) {
    9564             :                 for_each_online_cpu(cpu) {
    9565             :                         pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
    9566             :                         drain_zonestat(zone, pzstats);
    9567             :                 }
    9568           0 :                 free_percpu(zone->per_cpu_pageset);
    9569           0 :                 zone->per_cpu_pageset = &boot_pageset;
    9570           0 :                 if (zone->per_cpu_zonestats != &boot_zonestats) {
    9571           0 :                         free_percpu(zone->per_cpu_zonestats);
    9572           0 :                         zone->per_cpu_zonestats = &boot_zonestats;
    9573             :                 }
    9574             :         }
    9575           0 : }
    9576             : 
    9577             : #ifdef CONFIG_MEMORY_HOTREMOVE
    9578             : /*
    9579             :  * All pages in the range must be in a single zone, must not contain holes,
    9580             :  * must span full sections, and must be isolated before calling this function.
    9581             :  */
    9582             : void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
    9583             : {
    9584             :         unsigned long pfn = start_pfn;
    9585             :         struct page *page;
    9586             :         struct zone *zone;
    9587             :         unsigned int order;
    9588             :         unsigned long flags;
    9589             : 
    9590             :         offline_mem_sections(pfn, end_pfn);
    9591             :         zone = page_zone(pfn_to_page(pfn));
    9592             :         spin_lock_irqsave(&zone->lock, flags);
    9593             :         while (pfn < end_pfn) {
    9594             :                 page = pfn_to_page(pfn);
    9595             :                 /*
    9596             :                  * The HWPoisoned page may not be in the buddy system, and
    9597             :                  * page_count() is not 0.
    9598             :                  */
    9599             :                 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
    9600             :                         pfn++;
    9601             :                         continue;
    9602             :                 }
    9603             :                 /*
    9604             :                  * At this point all remaining PageOffline() pages have a
    9605             :                  * reference count of 0 and can simply be skipped.
    9606             :                  */
    9607             :                 if (PageOffline(page)) {
    9608             :                         BUG_ON(page_count(page));
    9609             :                         BUG_ON(PageBuddy(page));
    9610             :                         pfn++;
    9611             :                         continue;
    9612             :                 }
    9613             : 
    9614             :                 BUG_ON(page_count(page));
    9615             :                 BUG_ON(!PageBuddy(page));
    9616             :                 order = buddy_order(page);
    9617             :                 del_page_from_free_list(page, zone, order);
    9618             :                 pfn += (1 << order);
    9619             :         }
    9620             :         spin_unlock_irqrestore(&zone->lock, flags);
    9621             : }
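/*
 * Illustrative sketch, not from the kernel: the comment above lists the
 * preconditions of __offline_isolated_pages().  A hypothetical hot-remove
 * caller would roughly order its steps as below; isolate_range() and
 * remove_sections() are made-up placeholders for the real page-isolation
 * and memory-hotplug machinery.
 */
static int example_offline_range(unsigned long start_pfn, unsigned long nr_pages)
{
        unsigned long end_pfn = start_pfn + nr_pages;

        /* Range sits in one zone, spans full sections, has no holes. */
        if (isolate_range(start_pfn, end_pfn))          /* mark MIGRATE_ISOLATE */
                return -EBUSY;

        /* Only once every pageblock is isolated may the free pages go. */
        __offline_isolated_pages(start_pfn, end_pfn);

        /* Finally the sections themselves can be removed. */
        return remove_sections(start_pfn, nr_pages);
}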
    9622             : #endif
    9623             : 
    9624             : /*
    9625             :  * This function returns a stable result only if called under zone lock.
    9626             :  */
    9627           0 : bool is_free_buddy_page(struct page *page)
    9628             : {
    9629           0 :         unsigned long pfn = page_to_pfn(page);
    9630             :         unsigned int order;
    9631             : 
    9632           0 :         for (order = 0; order < MAX_ORDER; order++) {
    9633           0 :                 struct page *page_head = page - (pfn & ((1 << order) - 1));
    9634             : 
    9635           0 :                 if (PageBuddy(page_head) &&
    9636           0 :                     buddy_order_unsafe(page_head) >= order)
    9637             :                         break;
    9638             :         }
    9639             : 
    9640           0 :         return order < MAX_ORDER;
    9641             : }
    9642             : EXPORT_SYMBOL(is_free_buddy_page);
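/*
 * Illustrative sketch, not part of page_alloc.c: as the comment above notes,
 * the result of is_free_buddy_page() is only stable under the zone lock, so
 * a hypothetical caller that must act on the answer takes zone->lock around
 * both the check and the action; handle_free_page() is a placeholder.
 */
static bool example_act_on_free_page(struct page *page)
{
        struct zone *zone = page_zone(page);
        unsigned long flags;
        bool is_free;

        spin_lock_irqsave(&zone->lock, flags);
        is_free = is_free_buddy_page(page);
        if (is_free)
                handle_free_page(page);         /* hypothetical action */
        spin_unlock_irqrestore(&zone->lock, flags);

        return is_free;
}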
    9643             : 
    9644             : #ifdef CONFIG_MEMORY_FAILURE
    9645             : /*
     9646             :  * Break down a higher-order page into sub-pages, and keep our target page
     9647             :  * out of the buddy allocator.
    9648             :  */
    9649             : static void break_down_buddy_pages(struct zone *zone, struct page *page,
    9650             :                                    struct page *target, int low, int high,
    9651             :                                    int migratetype)
    9652             : {
    9653             :         unsigned long size = 1 << high;
    9654             :         struct page *current_buddy, *next_page;
    9655             : 
    9656             :         while (high > low) {
    9657             :                 high--;
    9658             :                 size >>= 1;
    9659             : 
    9660             :                 if (target >= &page[size]) {
    9661             :                         next_page = page + size;
    9662             :                         current_buddy = page;
    9663             :                 } else {
    9664             :                         next_page = page;
    9665             :                         current_buddy = page + size;
    9666             :                 }
    9667             : 
    9668             :                 if (set_page_guard(zone, current_buddy, high, migratetype))
    9669             :                         continue;
    9670             : 
    9671             :                 if (current_buddy != target) {
    9672             :                         add_to_free_list(current_buddy, zone, high, migratetype);
    9673             :                         set_buddy_order(current_buddy, high);
    9674             :                         page = next_page;
    9675             :                 }
    9676             :         }
    9677             : }
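/*
 * Illustrative sketch, standalone userspace simulation (compile separately;
 * not kernel code): the same index arithmetic as break_down_buddy_pages()
 * with low fixed at 0, as take_page_off_buddy() uses it, and with the debug
 * guard-page case left out.  At each step the half of the block that does
 * not contain the target is "freed" at the current order and the walk
 * descends into the half that does.
 */
#include <stdio.h>

static void demo_break_down(unsigned int target, unsigned int high)
{
        unsigned int base = 0;          /* first page index of current block */
        unsigned int size = 1u << high; /* pages in the current block        */

        while (high > 0) {
                high--;
                size >>= 1;
                if (target >= base + size) {
                        /* target in upper half: free the lower half */
                        printf("free [%u..%u] at order %u\n",
                               base, base + size - 1, high);
                        base += size;
                } else {
                        /* target in lower half: free the upper half */
                        printf("free [%u..%u] at order %u\n",
                               base + size, base + 2 * size - 1, high);
                }
        }
        printf("page %u is left out of the free lists\n", target);
}

int main(void)
{
        demo_break_down(1, 2);  /* target page 1 inside an order-2 block */
        return 0;
}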
    9678             : 
    9679             : /*
    9680             :  * Take a page that will be marked as poisoned off the buddy allocator.
    9681             :  */
    9682             : bool take_page_off_buddy(struct page *page)
    9683             : {
    9684             :         struct zone *zone = page_zone(page);
    9685             :         unsigned long pfn = page_to_pfn(page);
    9686             :         unsigned long flags;
    9687             :         unsigned int order;
    9688             :         bool ret = false;
    9689             : 
    9690             :         spin_lock_irqsave(&zone->lock, flags);
    9691             :         for (order = 0; order < MAX_ORDER; order++) {
    9692             :                 struct page *page_head = page - (pfn & ((1 << order) - 1));
    9693             :                 int page_order = buddy_order(page_head);
    9694             : 
    9695             :                 if (PageBuddy(page_head) && page_order >= order) {
    9696             :                         unsigned long pfn_head = page_to_pfn(page_head);
    9697             :                         int migratetype = get_pfnblock_migratetype(page_head,
    9698             :                                                                    pfn_head);
    9699             : 
    9700             :                         del_page_from_free_list(page_head, zone, page_order);
    9701             :                         break_down_buddy_pages(zone, page_head, page, 0,
    9702             :                                                 page_order, migratetype);
    9703             :                         SetPageHWPoisonTakenOff(page);
    9704             :                         if (!is_migrate_isolate(migratetype))
    9705             :                                 __mod_zone_freepage_state(zone, -1, migratetype);
    9706             :                         ret = true;
    9707             :                         break;
    9708             :                 }
    9709             :                 if (page_count(page_head) > 0)
    9710             :                         break;
    9711             :         }
    9712             :         spin_unlock_irqrestore(&zone->lock, flags);
    9713             :         return ret;
    9714             : }
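/*
 * Illustrative sketch, not from memory-failure.c: a hypothetical hwpoison
 * caller.  take_page_off_buddy() succeeds only while the page still sits in
 * the free lists; otherwise the page was allocated in the meantime and has
 * to go through the normal in-use handling, represented here by the made-up
 * helper poison_in_use_page().
 */
static int example_poison_free_page(struct page *page)
{
        if (take_page_off_buddy(page)) {
                /* Page is out of the allocator and flagged TakenOff. */
                SetPageHWPoison(page);
                return 0;
        }

        /* Raced with an allocation: fall back to in-use handling. */
        return poison_in_use_page(page);
}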
    9715             : 
    9716             : /*
     9717             :  * Cancel the takeoff done by take_page_off_buddy().
    9718             :  */
    9719             : bool put_page_back_buddy(struct page *page)
    9720             : {
    9721             :         struct zone *zone = page_zone(page);
    9722             :         unsigned long pfn = page_to_pfn(page);
    9723             :         unsigned long flags;
    9724             :         int migratetype = get_pfnblock_migratetype(page, pfn);
    9725             :         bool ret = false;
    9726             : 
    9727             :         spin_lock_irqsave(&zone->lock, flags);
    9728             :         if (put_page_testzero(page)) {
    9729             :                 ClearPageHWPoisonTakenOff(page);
    9730             :                 __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
    9731             :                 if (TestClearPageHWPoison(page)) {
    9732             :                         ret = true;
    9733             :                 }
    9734             :         }
    9735             :         spin_unlock_irqrestore(&zone->lock, flags);
    9736             : 
    9737             :         return ret;
    9738             : }
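/*
 * Illustrative sketch, not from the kernel: the undo of the example above,
 * as a hypothetical unpoison path would use it.  put_page_back_buddy()
 * drops the reference that kept the page off the allocator and, if that
 * was the last reference and the page was indeed poisoned, frees it back
 * as an order-0 page.
 */
static void example_unpoison_free_page(struct page *page)
{
        if (!put_page_back_buddy(page))
                pr_warn("pfn %#lx not returned to buddy (still referenced or not poisoned)\n",
                        page_to_pfn(page));
}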
    9739             : #endif
    9740             : 
    9741             : #ifdef CONFIG_ZONE_DMA
    9742             : bool has_managed_dma(void)
    9743             : {
    9744             :         struct pglist_data *pgdat;
    9745             : 
    9746             :         for_each_online_pgdat(pgdat) {
    9747             :                 struct zone *zone = &pgdat->node_zones[ZONE_DMA];
    9748             : 
    9749             :                 if (managed_zone(zone))
    9750             :                         return true;
    9751             :         }
    9752             :         return false;
    9753             : }
    9754             : #endif /* CONFIG_ZONE_DMA */
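/*
 * Illustrative sketch, hypothetical caller rather than kernel code: a user
 * that would otherwise fall back to GFP_DMA can consult has_managed_dma()
 * and skip the fallback when ZONE_DMA exists but holds no managed pages
 * (the !CONFIG_ZONE_DMA stub in include/linux/mm.h simply returns false).
 * setup_low_mem_pool() is a made-up placeholder.
 */
static void example_init_dma_fallback(void)
{
        if (!has_managed_dma())
                return;         /* no usable ZONE_DMA memory: skip the pool */

        setup_low_mem_pool(GFP_DMA);    /* hypothetical helper */
}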

Generated by: LCOV version 1.14