Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/mm/page_alloc.c
4 : *
5 : * Manages the free list; the system allocates free pages from here.
6 : * Note that kmalloc() lives in slab.c
7 : *
8 : * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
9 : * Swap reorganised 29.12.95, Stephen Tweedie
10 : * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 : * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
12 : * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
13 : * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
14 : * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
15 : * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
16 : */
17 :
18 : #include <linux/stddef.h>
19 : #include <linux/mm.h>
20 : #include <linux/highmem.h>
21 : #include <linux/swap.h>
22 : #include <linux/swapops.h>
23 : #include <linux/interrupt.h>
24 : #include <linux/pagemap.h>
25 : #include <linux/jiffies.h>
26 : #include <linux/memblock.h>
27 : #include <linux/compiler.h>
28 : #include <linux/kernel.h>
29 : #include <linux/kasan.h>
30 : #include <linux/kmsan.h>
31 : #include <linux/module.h>
32 : #include <linux/suspend.h>
33 : #include <linux/pagevec.h>
34 : #include <linux/blkdev.h>
35 : #include <linux/slab.h>
36 : #include <linux/ratelimit.h>
37 : #include <linux/oom.h>
38 : #include <linux/topology.h>
39 : #include <linux/sysctl.h>
40 : #include <linux/cpu.h>
41 : #include <linux/cpuset.h>
42 : #include <linux/memory_hotplug.h>
43 : #include <linux/nodemask.h>
44 : #include <linux/vmalloc.h>
45 : #include <linux/vmstat.h>
46 : #include <linux/mempolicy.h>
47 : #include <linux/memremap.h>
48 : #include <linux/stop_machine.h>
49 : #include <linux/random.h>
50 : #include <linux/sort.h>
51 : #include <linux/pfn.h>
52 : #include <linux/backing-dev.h>
53 : #include <linux/fault-inject.h>
54 : #include <linux/page-isolation.h>
55 : #include <linux/debugobjects.h>
56 : #include <linux/kmemleak.h>
57 : #include <linux/compaction.h>
58 : #include <trace/events/kmem.h>
59 : #include <trace/events/oom.h>
60 : #include <linux/prefetch.h>
61 : #include <linux/mm_inline.h>
62 : #include <linux/mmu_notifier.h>
63 : #include <linux/migrate.h>
64 : #include <linux/hugetlb.h>
65 : #include <linux/sched/rt.h>
66 : #include <linux/sched/mm.h>
67 : #include <linux/page_owner.h>
68 : #include <linux/page_table_check.h>
69 : #include <linux/kthread.h>
70 : #include <linux/memcontrol.h>
71 : #include <linux/ftrace.h>
72 : #include <linux/lockdep.h>
73 : #include <linux/nmi.h>
74 : #include <linux/psi.h>
75 : #include <linux/khugepaged.h>
76 : #include <linux/delayacct.h>
77 : #include <asm/sections.h>
78 : #include <asm/tlbflush.h>
79 : #include <asm/div64.h>
80 : #include "internal.h"
81 : #include "shuffle.h"
82 : #include "page_reporting.h"
83 : #include "swap.h"
84 :
85 : /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
86 : typedef int __bitwise fpi_t;
87 :
88 : /* No special request */
89 : #define FPI_NONE ((__force fpi_t)0)
90 :
91 : /*
92 : * Skip free page reporting notification for the (possibly merged) page.
93 : * This does not hinder free page reporting from grabbing the page,
94 : * reporting it and marking it "reported" - it only skips notifying
95 : * the free page reporting infrastructure about a newly freed page. For
96 : * example, used when temporarily pulling a page from a freelist and
97 : * putting it back unmodified.
98 : */
99 : #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
100 :
101 : /*
102 : * Place the (possibly merged) page at the tail of the freelist. Will ignore
103 : * page shuffling (relevant code - e.g., memory onlining - is expected to
104 : * shuffle the whole zone).
105 : *
106 : * Note: No code should rely on this flag for correctness - it's purely
107 : * to allow for optimizations when handing back either fresh pages
108 : * (memory onlining) or untouched pages (page isolation, free page
109 : * reporting).
110 : */
111 : #define FPI_TO_TAIL ((__force fpi_t)BIT(1))
112 :
113 : /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
114 : static DEFINE_MUTEX(pcp_batch_high_lock);
115 : #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
116 :
117 : #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
118 : /*
119 : * On SMP, spin_trylock is sufficient protection.
120 : * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
121 : */
122 : #define pcp_trylock_prepare(flags) do { } while (0)
123 : #define pcp_trylock_finish(flag) do { } while (0)
124 : #else
125 :
126 : /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
127 : #define pcp_trylock_prepare(flags) local_irq_save(flags)
128 : #define pcp_trylock_finish(flags) local_irq_restore(flags)
129 : #endif
130 :
131 : /*
132 : * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
133 : * a migration causing the wrong PCP to be locked and remote memory being
134 : * potentially allocated, pin the task to the CPU for the lookup+lock.
135 : * preempt_disable is used on !RT because it is faster than migrate_disable.
136 : * migrate_disable is used on RT because otherwise RT spinlock usage is
137 : * interfered with and a high priority task cannot preempt the allocator.
138 : */
139 : #ifndef CONFIG_PREEMPT_RT
140 : #define pcpu_task_pin() preempt_disable()
141 : #define pcpu_task_unpin() preempt_enable()
142 : #else
143 : #define pcpu_task_pin() migrate_disable()
144 : #define pcpu_task_unpin() migrate_enable()
145 : #endif
146 :
147 : /*
148 : * Generic helper to look up and lock a per-cpu variable with an embedded spinlock.
149 : * The return value should be used with the equivalent unlock helper.
150 : */
151 : #define pcpu_spin_lock(type, member, ptr) \
152 : ({ \
153 : type *_ret; \
154 : pcpu_task_pin(); \
155 : _ret = this_cpu_ptr(ptr); \
156 : spin_lock(&_ret->member); \
157 : _ret; \
158 : })
159 :
160 : #define pcpu_spin_trylock(type, member, ptr) \
161 : ({ \
162 : type *_ret; \
163 : pcpu_task_pin(); \
164 : _ret = this_cpu_ptr(ptr); \
165 : if (!spin_trylock(&_ret->member)) { \
166 : pcpu_task_unpin(); \
167 : _ret = NULL; \
168 : } \
169 : _ret; \
170 : })
171 :
172 : #define pcpu_spin_unlock(member, ptr) \
173 : ({ \
174 : spin_unlock(&ptr->member); \
175 : pcpu_task_unpin(); \
176 : })
177 :
178 : /* struct per_cpu_pages specific helpers. */
179 : #define pcp_spin_lock(ptr) \
180 : pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
181 :
182 : #define pcp_spin_trylock(ptr) \
183 : pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
184 :
185 : #define pcp_spin_unlock(ptr) \
186 : pcpu_spin_unlock(lock, ptr)
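/*
 * A minimal usage sketch of the lookup + trylock pattern above. It assumes a
 * per-CPU 'struct per_cpu_pages' pointer such as zone->per_cpu_pageset (shown
 * for illustration only); the trylock variant returns NULL on contention, in
 * which case callers fall back to the buddy freelists:
 *
 *	struct per_cpu_pages *pcp;
 *
 *	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 *	if (!pcp)
 *		return NULL;		// fall back to the zone freelist
 *	// ... operate on pcp->lists[] / pcp->count ...
 *	pcp_spin_unlock(pcp);
 */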
187 :
188 : #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
189 : DEFINE_PER_CPU(int, numa_node);
190 : EXPORT_PER_CPU_SYMBOL(numa_node);
191 : #endif
192 :
193 : DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
194 :
195 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
196 : /*
197 : * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
198 : * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
199 : * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
200 : * defined in <linux/topology.h>.
201 : */
202 : DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
203 : EXPORT_PER_CPU_SYMBOL(_numa_mem_);
204 : #endif
205 :
206 : static DEFINE_MUTEX(pcpu_drain_mutex);
207 :
208 : #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
209 : volatile unsigned long latent_entropy __latent_entropy;
210 : EXPORT_SYMBOL(latent_entropy);
211 : #endif
212 :
213 : /*
214 : * Array of node states.
215 : */
216 : nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
217 : [N_POSSIBLE] = NODE_MASK_ALL,
218 : [N_ONLINE] = { { [0] = 1UL } },
219 : #ifndef CONFIG_NUMA
220 : [N_NORMAL_MEMORY] = { { [0] = 1UL } },
221 : #ifdef CONFIG_HIGHMEM
222 : [N_HIGH_MEMORY] = { { [0] = 1UL } },
223 : #endif
224 : [N_MEMORY] = { { [0] = 1UL } },
225 : [N_CPU] = { { [0] = 1UL } },
226 : #endif /* NUMA */
227 : };
228 : EXPORT_SYMBOL(node_states);
229 :
230 : atomic_long_t _totalram_pages __read_mostly;
231 : EXPORT_SYMBOL(_totalram_pages);
232 : unsigned long totalreserve_pages __read_mostly;
233 : unsigned long totalcma_pages __read_mostly;
234 :
235 : int percpu_pagelist_high_fraction;
236 : gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
237 : DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
238 : EXPORT_SYMBOL(init_on_alloc);
239 :
240 : DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
241 : EXPORT_SYMBOL(init_on_free);
242 :
243 : /*
244 : * A cached value of the page's pageblock's migratetype, used when the page is
245 : * put on a pcplist. Used to avoid the pageblock migratetype lookup when
246 : * freeing from pcplists in most cases, at the cost of possibly becoming stale.
247 : * Also the migratetype set in the page does not necessarily match the pcplist
248 : * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
249 : * other index - this ensures that it will be put on the correct CMA freelist.
250 : */
251 : static inline int get_pcppage_migratetype(struct page *page)
252 : {
253 45181 : return page->index;
254 : }
255 :
256 : static inline void set_pcppage_migratetype(struct page *page, int migratetype)
257 : {
258 46557 : page->index = migratetype;
259 : }
260 :
261 : #ifdef CONFIG_PM_SLEEP
262 : /*
263 : * The following functions are used by the suspend/hibernate code to temporarily
264 : * change gfp_allowed_mask in order to avoid using I/O during memory allocations
265 : * while devices are suspended. To avoid races with the suspend/hibernate code,
266 : * they should always be called with system_transition_mutex held
267 : * (gfp_allowed_mask also should only be modified with system_transition_mutex
268 : * held, unless the suspend/hibernate code is guaranteed not to run in parallel
269 : * with that modification).
270 : */
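/*
 * A rough sketch, for illustration only, of the intended calling pattern
 * (the real call sites live in the suspend/hibernate code under kernel/power/
 * and are merely paraphrased here):
 *
 *	mutex_lock(&system_transition_mutex);
 *	pm_restrict_gfp_mask();		// mask out __GFP_IO | __GFP_FS
 *	... suspend devices / write or read the hibernation image ...
 *	pm_restore_gfp_mask();		// put the saved mask back
 *	mutex_unlock(&system_transition_mutex);
 */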
271 :
272 : static gfp_t saved_gfp_mask;
273 :
274 0 : void pm_restore_gfp_mask(void)
275 : {
276 0 : WARN_ON(!mutex_is_locked(&system_transition_mutex));
277 0 : if (saved_gfp_mask) {
278 0 : gfp_allowed_mask = saved_gfp_mask;
279 0 : saved_gfp_mask = 0;
280 : }
281 0 : }
282 :
283 0 : void pm_restrict_gfp_mask(void)
284 : {
285 0 : WARN_ON(!mutex_is_locked(&system_transition_mutex));
286 0 : WARN_ON(saved_gfp_mask);
287 0 : saved_gfp_mask = gfp_allowed_mask;
288 0 : gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
289 0 : }
290 :
291 0 : bool pm_suspended_storage(void)
292 : {
293 0 : if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
294 : return false;
295 0 : return true;
296 : }
297 : #endif /* CONFIG_PM_SLEEP */
298 :
299 : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
300 : unsigned int pageblock_order __read_mostly;
301 : #endif
302 :
303 : static void __free_pages_ok(struct page *page, unsigned int order,
304 : fpi_t fpi_flags);
305 :
306 : /*
307 : * results with 256, 32 in the lowmem_reserve sysctl:
308 : * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
309 : * 1G machine -> (16M dma, 784M normal, 224M high)
310 : * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
311 : * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
312 : * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
313 : *
314 : * TBD: should special case ZONE_DMA32 machines here - in those we normally
315 : * don't need any ZONE_NORMAL reservation
316 : */
317 : int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
318 : #ifdef CONFIG_ZONE_DMA
319 : [ZONE_DMA] = 256,
320 : #endif
321 : #ifdef CONFIG_ZONE_DMA32
322 : [ZONE_DMA32] = 256,
323 : #endif
324 : [ZONE_NORMAL] = 32,
325 : #ifdef CONFIG_HIGHMEM
326 : [ZONE_HIGHMEM] = 0,
327 : #endif
328 : [ZONE_MOVABLE] = 0,
329 : };
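/*
 * A worked example of how the ratios above translate into protection, using
 * the 1G machine from the comment block above (16M DMA, 784M Normal; numbers
 * are approximate and shown for illustration). The reserve kept in a lower
 * zone against allocations that could have used a higher zone is roughly the
 * higher zone's managed pages divided by the lower zone's ratio, and it is
 * added on top of the watermark when deciding whether the lower zone is
 * usable:
 *
 *	ZONE_DMA reserve against ZONE_NORMAL allocations
 *		~= 784M / 256 ~= 3M (784 pages with 4K pages)
 *
 *	// sketch of the watermark check that consumes lowmem_reserve[]:
 *	if (free_pages <= watermark + zone->lowmem_reserve[highest_zoneidx])
 *		;	// zone not eligible for this allocation
 */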
330 :
331 : char * const zone_names[MAX_NR_ZONES] = {
332 : #ifdef CONFIG_ZONE_DMA
333 : "DMA",
334 : #endif
335 : #ifdef CONFIG_ZONE_DMA32
336 : "DMA32",
337 : #endif
338 : "Normal",
339 : #ifdef CONFIG_HIGHMEM
340 : "HighMem",
341 : #endif
342 : "Movable",
343 : #ifdef CONFIG_ZONE_DEVICE
344 : "Device",
345 : #endif
346 : };
347 :
348 : const char * const migratetype_names[MIGRATE_TYPES] = {
349 : "Unmovable",
350 : "Movable",
351 : "Reclaimable",
352 : "HighAtomic",
353 : #ifdef CONFIG_CMA
354 : "CMA",
355 : #endif
356 : #ifdef CONFIG_MEMORY_ISOLATION
357 : "Isolate",
358 : #endif
359 : };
360 :
361 : compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
362 : [NULL_COMPOUND_DTOR] = NULL,
363 : [COMPOUND_PAGE_DTOR] = free_compound_page,
364 : #ifdef CONFIG_HUGETLB_PAGE
365 : [HUGETLB_PAGE_DTOR] = free_huge_page,
366 : #endif
367 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
368 : [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
369 : #endif
370 : };
371 :
372 : int min_free_kbytes = 1024;
373 : int user_min_free_kbytes = -1;
374 : int watermark_boost_factor __read_mostly = 15000;
375 : int watermark_scale_factor = 10;
376 :
377 : bool mirrored_kernelcore __initdata_memblock;
378 :
379 : /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
380 : int movable_zone;
381 : EXPORT_SYMBOL(movable_zone);
382 :
383 : #if MAX_NUMNODES > 1
384 : unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
385 : unsigned int nr_online_nodes __read_mostly = 1;
386 : EXPORT_SYMBOL(nr_node_ids);
387 : EXPORT_SYMBOL(nr_online_nodes);
388 : #endif
389 :
390 : int page_group_by_mobility_disabled __read_mostly;
391 :
392 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
393 : /*
394 : * During boot we initialize deferred pages on demand, but once
395 : * page_alloc_init_late() has finished, the deferred pages are all initialized,
396 : * and we can permanently disable that path.
397 : */
398 : DEFINE_STATIC_KEY_TRUE(deferred_pages);
399 :
400 : static inline bool deferred_pages_enabled(void)
401 : {
402 : return static_branch_unlikely(&deferred_pages);
403 : }
404 :
405 : /*
406 : * deferred_grow_zone() is __init, but it is called from
407 : * get_page_from_freelist() during early boot until deferred_pages permanently
408 : * disables this call. This is why we have the __ref wrapper: it avoids the
409 : * section-mismatch warning and ensures that the function body gets unloaded.
410 : */
411 : static bool __ref
412 : _deferred_grow_zone(struct zone *zone, unsigned int order)
413 : {
414 : return deferred_grow_zone(zone, order);
415 : }
416 : #else
417 : static inline bool deferred_pages_enabled(void)
418 : {
419 : return false;
420 : }
421 : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
422 :
423 : /* Return a pointer to the bitmap storing bits affecting a block of pages */
424 : static inline unsigned long *get_pageblock_bitmap(const struct page *page,
425 : unsigned long pfn)
426 : {
427 : #ifdef CONFIG_SPARSEMEM
428 : return section_to_usemap(__pfn_to_section(pfn));
429 : #else
430 44764 : return page_zone(page)->pageblock_flags;
431 : #endif /* CONFIG_SPARSEMEM */
432 : }
433 :
434 : static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
435 : {
436 : #ifdef CONFIG_SPARSEMEM
437 : pfn &= (PAGES_PER_SECTION-1);
438 : #else
439 44764 : pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
440 : #endif /* CONFIG_SPARSEMEM */
441 44764 : return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
442 : }
443 :
444 : static __always_inline
445 : unsigned long __get_pfnblock_flags_mask(const struct page *page,
446 : unsigned long pfn,
447 : unsigned long mask)
448 : {
449 : unsigned long *bitmap;
450 : unsigned long bitidx, word_bitidx;
451 : unsigned long word;
452 :
453 89000 : bitmap = get_pageblock_bitmap(page, pfn);
454 44500 : bitidx = pfn_to_bitidx(page, pfn);
455 44500 : word_bitidx = bitidx / BITS_PER_LONG;
456 44500 : bitidx &= (BITS_PER_LONG-1);
457 : /*
458 : * This races, without locks, with set_pfnblock_flags_mask(). Ensure
459 : * a consistent read of the memory array, so that results, even though
460 : * racy, are not corrupted.
461 : */
462 44500 : word = READ_ONCE(bitmap[word_bitidx]);
463 44500 : return (word >> bitidx) & mask;
464 : }
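/*
 * A worked example of the bitidx arithmetic above, assuming a 64-bit kernel
 * with pageblock_order == 9 (2M pageblocks with 4K pages) and
 * NR_PAGEBLOCK_BITS == 4:
 *
 *	pfn = 4660 (0x1234), relative to the zone or section start
 *	bitidx      = (4660 >> 9) * 4 = 9 * 4 = 36
 *	word_bitidx = 36 / BITS_PER_LONG = 0
 *	bitidx     &= BITS_PER_LONG - 1   -> 36
 *
 * so the four flag/migratetype bits for this pageblock occupy bits 36..39 of
 * word 0 of the pageblock bitmap.
 */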
465 :
466 : /**
467 : * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
468 : * @page: The page within the block of interest
469 : * @pfn: The target page frame number
470 : * @mask: mask of bits that the caller is interested in
471 : *
472 : * Return: pageblock_bits flags
473 : */
474 0 : unsigned long get_pfnblock_flags_mask(const struct page *page,
475 : unsigned long pfn, unsigned long mask)
476 : {
477 4 : return __get_pfnblock_flags_mask(page, pfn, mask);
478 : }
479 :
480 : static __always_inline int get_pfnblock_migratetype(const struct page *page,
481 : unsigned long pfn)
482 : {
483 44496 : return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
484 : }
485 :
486 : /**
487 : * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
488 : * @page: The page within the block of interest
489 : * @flags: The flags to set
490 : * @pfn: The target page frame number
491 : * @mask: mask of bits that the caller is interested in
492 : */
493 264 : void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
494 : unsigned long pfn,
495 : unsigned long mask)
496 : {
497 : unsigned long *bitmap;
498 : unsigned long bitidx, word_bitidx;
499 : unsigned long word;
500 :
501 : BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
502 : BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
503 :
504 528 : bitmap = get_pageblock_bitmap(page, pfn);
505 264 : bitidx = pfn_to_bitidx(page, pfn);
506 264 : word_bitidx = bitidx / BITS_PER_LONG;
507 264 : bitidx &= (BITS_PER_LONG-1);
508 :
509 : VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
510 :
511 264 : mask <<= bitidx;
512 264 : flags <<= bitidx;
513 :
514 264 : word = READ_ONCE(bitmap[word_bitidx]);
515 : do {
516 792 : } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
517 264 : }
518 :
519 264 : void set_pageblock_migratetype(struct page *page, int migratetype)
520 : {
521 264 : if (unlikely(page_group_by_mobility_disabled &&
522 : migratetype < MIGRATE_PCPTYPES))
523 0 : migratetype = MIGRATE_UNMOVABLE;
524 :
525 264 : set_pfnblock_flags_mask(page, (unsigned long)migratetype,
526 264 : page_to_pfn(page), MIGRATETYPE_MASK);
527 264 : }
528 :
529 : #ifdef CONFIG_DEBUG_VM
530 : static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
531 : {
532 : int ret = 0;
533 : unsigned seq;
534 : unsigned long pfn = page_to_pfn(page);
535 : unsigned long sp, start_pfn;
536 :
537 : do {
538 : seq = zone_span_seqbegin(zone);
539 : start_pfn = zone->zone_start_pfn;
540 : sp = zone->spanned_pages;
541 : if (!zone_spans_pfn(zone, pfn))
542 : ret = 1;
543 : } while (zone_span_seqretry(zone, seq));
544 :
545 : if (ret)
546 : pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
547 : pfn, zone_to_nid(zone), zone->name,
548 : start_pfn, start_pfn + sp);
549 :
550 : return ret;
551 : }
552 :
553 : static int page_is_consistent(struct zone *zone, struct page *page)
554 : {
555 : if (zone != page_zone(page))
556 : return 0;
557 :
558 : return 1;
559 : }
560 : /*
561 : * Temporary debugging check for pages not lying within a given zone.
562 : */
563 : static int __maybe_unused bad_range(struct zone *zone, struct page *page)
564 : {
565 : if (page_outside_zone_boundaries(zone, page))
566 : return 1;
567 : if (!page_is_consistent(zone, page))
568 : return 1;
569 :
570 : return 0;
571 : }
572 : #else
573 : static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
574 : {
575 : return 0;
576 : }
577 : #endif
578 :
579 0 : static void bad_page(struct page *page, const char *reason)
580 : {
581 : static unsigned long resume;
582 : static unsigned long nr_shown;
583 : static unsigned long nr_unshown;
584 :
585 : /*
586 : * Allow a burst of 60 reports, then keep quiet for that minute;
587 : * or allow a steady drip of one report per second.
588 : */
589 0 : if (nr_shown == 60) {
590 0 : if (time_before(jiffies, resume)) {
591 0 : nr_unshown++;
592 0 : goto out;
593 : }
594 0 : if (nr_unshown) {
595 0 : pr_alert(
596 : "BUG: Bad page state: %lu messages suppressed\n",
597 : nr_unshown);
598 0 : nr_unshown = 0;
599 : }
600 0 : nr_shown = 0;
601 : }
602 0 : if (nr_shown++ == 0)
603 0 : resume = jiffies + 60 * HZ;
604 :
605 0 : pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
606 : current->comm, page_to_pfn(page));
607 0 : dump_page(page, reason);
608 :
609 : print_modules();
610 0 : dump_stack();
611 : out:
612 : /* Leave bad fields for debug, except PageBuddy could make trouble */
613 0 : page_mapcount_reset(page); /* remove PageBuddy */
614 0 : add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
615 0 : }
616 :
617 : static inline unsigned int order_to_pindex(int migratetype, int order)
618 : {
619 47054 : int base = order;
620 :
621 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
622 : if (order > PAGE_ALLOC_COSTLY_ORDER) {
623 : VM_BUG_ON(order != pageblock_order);
624 : return NR_LOWORDER_PCP_LISTS;
625 : }
626 : #else
627 : VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
628 : #endif
629 :
630 47054 : return (MIGRATE_PCPTYPES * base) + migratetype;
631 : }
632 :
633 : static inline int pindex_to_order(unsigned int pindex)
634 : {
635 4 : int order = pindex / MIGRATE_PCPTYPES;
636 :
637 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
638 : if (pindex == NR_LOWORDER_PCP_LISTS)
639 : order = pageblock_order;
640 : #else
641 : VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
642 : #endif
643 :
644 : return order;
645 : }
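/*
 * A worked example of the mapping implemented by the two helpers above,
 * assuming MIGRATE_PCPTYPES == 3 (unmovable, movable, reclaimable):
 *
 *	order_to_pindex(MIGRATE_MOVABLE, 2) = 3 * 2 + 1 = 7   (MIGRATE_MOVABLE == 1)
 *	pindex_to_order(7)                  = 7 / 3     = 2
 *
 * With CONFIG_TRANSPARENT_HUGEPAGE, the single order allowed above
 * PAGE_ALLOC_COSTLY_ORDER (pageblock_order) maps to the dedicated
 * NR_LOWORDER_PCP_LISTS slot instead.
 */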
646 :
647 : static inline bool pcp_allowed_order(unsigned int order)
648 : {
649 46458 : if (order <= PAGE_ALLOC_COSTLY_ORDER)
650 : return true;
651 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
652 : if (order == pageblock_order)
653 : return true;
654 : #endif
655 : return false;
656 : }
657 :
658 44236 : static inline void free_the_page(struct page *page, unsigned int order)
659 : {
660 44236 : if (pcp_allowed_order(order)) /* Via pcp? */
661 44236 : free_unref_page(page, order);
662 : else
663 0 : __free_pages_ok(page, order, FPI_NONE);
664 44236 : }
665 :
666 : /*
667 : * Higher-order pages are called "compound pages". They are structured thusly:
668 : *
669 : * The first PAGE_SIZE page is called the "head page" and has PG_head set.
670 : *
671 : * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
672 : * in bit 0 of page->compound_head. The rest of the bits point to the head page.
673 : *
674 : * The first tail page's ->compound_dtor holds the offset into the array of compound
675 : * page destructors. See compound_page_dtors.
676 : *
677 : * The first tail page's ->compound_order holds the order of allocation.
678 : * This usage means that zero-order pages may not be compound.
679 : */
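/*
 * Illustrative layout of an order-2 compound page as described above (field
 * names follow the current struct folio overlay and are shown as an
 * approximation):
 *
 *	page[0]:    PG_head set; holds the refcount for the whole compound page
 *	page[1]:    compound_head = (unsigned long)&page[0] | 1;
 *	            also carries _folio_dtor and _folio_order
 *	page[2..3]: compound_head = (unsigned long)&page[0] | 1
 */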
680 :
681 0 : void free_compound_page(struct page *page)
682 : {
683 0 : mem_cgroup_uncharge(page_folio(page));
684 0 : free_the_page(page, compound_order(page));
685 0 : }
686 :
687 0 : void prep_compound_page(struct page *page, unsigned int order)
688 : {
689 : int i;
690 102 : int nr_pages = 1 << order;
691 :
692 102 : __SetPageHead(page);
693 286 : for (i = 1; i < nr_pages; i++)
694 184 : prep_compound_tail(page, i);
695 :
696 102 : prep_compound_head(page, order);
697 0 : }
698 :
699 0 : void destroy_large_folio(struct folio *folio)
700 : {
701 0 : enum compound_dtor_id dtor = folio->_folio_dtor;
702 :
703 : VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
704 0 : compound_page_dtors[dtor](&folio->page);
705 0 : }
706 :
707 : #ifdef CONFIG_DEBUG_PAGEALLOC
708 : unsigned int _debug_guardpage_minorder;
709 :
710 : bool _debug_pagealloc_enabled_early __read_mostly
711 : = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
712 : EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
713 : DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
714 : EXPORT_SYMBOL(_debug_pagealloc_enabled);
715 :
716 : DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
717 :
718 : static int __init early_debug_pagealloc(char *buf)
719 : {
720 : return kstrtobool(buf, &_debug_pagealloc_enabled_early);
721 : }
722 : early_param("debug_pagealloc", early_debug_pagealloc);
723 :
724 : static int __init debug_guardpage_minorder_setup(char *buf)
725 : {
726 : unsigned long res;
727 :
728 : if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
729 : pr_err("Bad debug_guardpage_minorder value\n");
730 : return 0;
731 : }
732 : _debug_guardpage_minorder = res;
733 : pr_info("Setting debug_guardpage_minorder to %lu\n", res);
734 : return 0;
735 : }
736 : early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
737 :
738 : static inline bool set_page_guard(struct zone *zone, struct page *page,
739 : unsigned int order, int migratetype)
740 : {
741 : if (!debug_guardpage_enabled())
742 : return false;
743 :
744 : if (order >= debug_guardpage_minorder())
745 : return false;
746 :
747 : __SetPageGuard(page);
748 : INIT_LIST_HEAD(&page->buddy_list);
749 : set_page_private(page, order);
750 : /* Guard pages are not available for any usage */
751 : if (!is_migrate_isolate(migratetype))
752 : __mod_zone_freepage_state(zone, -(1 << order), migratetype);
753 :
754 : return true;
755 : }
756 :
757 : static inline void clear_page_guard(struct zone *zone, struct page *page,
758 : unsigned int order, int migratetype)
759 : {
760 : if (!debug_guardpage_enabled())
761 : return;
762 :
763 : __ClearPageGuard(page);
764 :
765 : set_page_private(page, 0);
766 : if (!is_migrate_isolate(migratetype))
767 : __mod_zone_freepage_state(zone, (1 << order), migratetype);
768 : }
769 : #else
770 : static inline bool set_page_guard(struct zone *zone, struct page *page,
771 : unsigned int order, int migratetype) { return false; }
772 : static inline void clear_page_guard(struct zone *zone, struct page *page,
773 : unsigned int order, int migratetype) {}
774 : #endif
775 :
776 : static inline void set_buddy_order(struct page *page, unsigned int order)
777 : {
778 7062 : set_page_private(page, order);
779 3531 : __SetPageBuddy(page);
780 : }
781 :
782 : #ifdef CONFIG_COMPACTION
783 1205 : static inline struct capture_control *task_capc(struct zone *zone)
784 : {
785 1205 : struct capture_control *capc = current->capture_control;
786 :
787 1205 : return unlikely(capc) &&
788 0 : !(current->flags & PF_KTHREAD) &&
789 0 : !capc->page &&
790 2410 : capc->cc->zone == zone ? capc : NULL;
791 : }
792 :
793 : static inline bool
794 : compaction_capture(struct capture_control *capc, struct page *page,
795 : int order, int migratetype)
796 : {
797 1874 : if (!capc || order != capc->cc->order)
798 : return false;
799 :
800 : /* Do not accidentally pollute CMA or isolated regions */
801 : if (is_migrate_cma(migratetype) ||
802 0 : is_migrate_isolate(migratetype))
803 : return false;
804 :
805 : /*
806 : * Do not let lower order allocations pollute a movable pageblock.
807 : * This might let an unmovable request use a reclaimable pageblock
808 : * and vice-versa but no more than normal fallback logic which can
809 : * have trouble finding a high-order free page.
810 : */
811 0 : if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
812 : return false;
813 :
814 0 : capc->page = page;
815 : return true;
816 : }
817 :
818 : #else
819 : static inline struct capture_control *task_capc(struct zone *zone)
820 : {
821 : return NULL;
822 : }
823 :
824 : static inline bool
825 : compaction_capture(struct capture_control *capc, struct page *page,
826 : int order, int migratetype)
827 : {
828 : return false;
829 : }
830 : #endif /* CONFIG_COMPACTION */
831 :
832 : /* Used for pages not on another list */
833 : static inline void add_to_free_list(struct page *page, struct zone *zone,
834 : unsigned int order, int migratetype)
835 : {
836 2817 : struct free_area *area = &zone->free_area[order];
837 :
838 5634 : list_add(&page->buddy_list, &area->free_list[migratetype]);
839 2817 : area->nr_free++;
840 : }
841 :
842 : /* Used for pages not on another list */
843 : static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
844 : unsigned int order, int migratetype)
845 : {
846 714 : struct free_area *area = &zone->free_area[order];
847 :
848 1428 : list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
849 714 : area->nr_free++;
850 : }
851 :
852 : /*
853 : * Used for pages which are on another list. Move the pages to the tail
854 : * of the list - so the moved pages won't immediately be considered for
855 : * allocation again (e.g., optimization for memory onlining).
856 : */
857 : static inline void move_to_free_list(struct page *page, struct zone *zone,
858 : unsigned int order, int migratetype)
859 : {
860 4 : struct free_area *area = &zone->free_area[order];
861 :
862 8 : list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
863 : }
864 :
865 : static inline void del_page_from_free_list(struct page *page, struct zone *zone,
866 : unsigned int order)
867 : {
868 : /* clear reported state and update reported page count */
869 : if (page_reported(page))
870 : __ClearPageReported(page);
871 :
872 6476 : list_del(&page->buddy_list);
873 3238 : __ClearPageBuddy(page);
874 6476 : set_page_private(page, 0);
875 3238 : zone->free_area[order].nr_free--;
876 : }
877 :
878 : static inline struct page *get_page_from_free_area(struct free_area *area,
879 : int migratetype)
880 : {
881 4694 : return list_first_entry_or_null(&area->free_list[migratetype],
882 : struct page, lru);
883 : }
884 :
885 : /*
886 : * If this is not the largest possible page, check if the buddy
887 : * of the next-highest order is free. If it is, it's possible
888 : * that pages are being freed that will coalesce soon. If that is
889 : * happening, add the free page to the tail of the list
890 : * so it's less likely to be used soon and more likely to be merged
891 : * as a higher-order page.
892 : */
893 : static inline bool
894 945 : buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
895 : struct page *page, unsigned int order)
896 : {
897 : unsigned long higher_page_pfn;
898 : struct page *higher_page;
899 :
900 945 : if (order >= MAX_ORDER - 1)
901 : return false;
902 :
903 945 : higher_page_pfn = buddy_pfn & pfn;
904 945 : higher_page = page + (higher_page_pfn - pfn);
905 :
906 1890 : return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
907 945 : NULL) != NULL;
908 : }
909 :
910 : /*
911 : * Freeing function for a buddy system allocator.
912 : *
913 : * The concept of a buddy system is to maintain a direct-mapped table
914 : * (containing bit values) for memory blocks of various "orders".
915 : * The bottom level table contains the map for the smallest allocatable
916 : * units of memory (here, pages), and each level above it describes
917 : * pairs of units from the levels below, hence, "buddies".
918 : * At a high level, all that happens here is marking the table entry
919 : * at the bottom level available, and propagating the changes upward
920 : * as necessary, plus some accounting needed to play nicely with other
921 : * parts of the VM system.
922 : * At each level, we keep a list of pages, which are heads of contiguous
923 : * free runs of length (1 << order) and marked with PageBuddy.
924 : * A page's order is recorded in the page_private(page) field.
925 : * So when we are allocating or freeing one, we can derive the state of the
926 : * other. That is, if we allocate a small block and both buddies were
927 : * free, the remainder of the region must be split into blocks.
928 : * If a block is freed, and its buddy is also free, then this
929 : * triggers coalescing into a block of larger size.
930 : *
931 : * -- nyc
932 : */
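/*
 * A small worked example of the buddy arithmetic used below (the order-n
 * buddy of a page is found by flipping bit n of its pfn, see
 * __find_buddy_pfn()):
 *
 *	buddy_pfn    = pfn ^ (1 << order);
 *	combined_pfn = buddy_pfn & pfn;	// pfn of the merged, higher-order page
 *
 * E.g. freeing pfn 0x10 at order 0: its buddy is 0x11; if 0x11 is free the
 * pair merges into an order-1 page at 0x10, whose order-1 buddy is 0x12, and
 * so on until a buddy is not free or MAX_ORDER is reached.
 */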
933 :
934 1205 : static inline void __free_one_page(struct page *page,
935 : unsigned long pfn,
936 : struct zone *zone, unsigned int order,
937 : int migratetype, fpi_t fpi_flags)
938 : {
939 1205 : struct capture_control *capc = task_capc(zone);
940 1205 : unsigned long buddy_pfn = 0;
941 : unsigned long combined_pfn;
942 : struct page *buddy;
943 : bool to_tail;
944 :
945 : VM_BUG_ON(!zone_is_initialized(zone));
946 : VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
947 :
948 : VM_BUG_ON(migratetype == -1);
949 1205 : if (likely(!is_migrate_isolate(migratetype)))
950 1205 : __mod_zone_freepage_state(zone, 1 << order, migratetype);
951 :
952 : VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
953 : VM_BUG_ON_PAGE(bad_range(zone, page), page);
954 :
955 2122 : while (order < MAX_ORDER) {
956 3748 : if (compaction_capture(capc, page, order, migratetype)) {
957 0 : __mod_zone_freepage_state(zone, -(1 << order),
958 : migratetype);
959 0 : return;
960 : }
961 :
962 1874 : buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
963 1874 : if (!buddy)
964 : goto done_merging;
965 :
966 : if (unlikely(order >= pageblock_order)) {
967 : /*
968 : * We want to prevent merge between freepages on pageblock
969 : * without fallbacks and normal pageblock. Without this,
970 : * pageblock isolation could cause incorrect freepage or CMA
971 : * accounting or HIGHATOMIC accounting.
972 : */
973 : int buddy_mt = get_pageblock_migratetype(buddy);
974 :
975 : if (migratetype != buddy_mt
976 : && (!migratetype_is_mergeable(migratetype) ||
977 : !migratetype_is_mergeable(buddy_mt)))
978 : goto done_merging;
979 : }
980 :
981 : /*
982 : * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
983 : * merge with it and move up one order.
984 : */
985 : if (page_is_guard(buddy))
986 : clear_page_guard(zone, buddy, order, migratetype);
987 : else
988 : del_page_from_free_list(buddy, zone, order);
989 917 : combined_pfn = buddy_pfn & pfn;
990 917 : page = page + (combined_pfn - pfn);
991 917 : pfn = combined_pfn;
992 917 : order++;
993 : }
994 :
995 : done_merging:
996 1205 : set_buddy_order(page, order);
997 :
998 1205 : if (fpi_flags & FPI_TO_TAIL)
999 : to_tail = true;
1000 945 : else if (is_shuffle_order(order))
1001 : to_tail = shuffle_pick_tail();
1002 : else
1003 945 : to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
1004 :
1005 1205 : if (to_tail)
1006 : add_to_free_list_tail(page, zone, order, migratetype);
1007 : else
1008 : add_to_free_list(page, zone, order, migratetype);
1009 :
1010 : /* Notify page reporting subsystem of freed page */
1011 : if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
1012 : page_reporting_notify_free(order);
1013 : }
1014 :
1015 : /**
1016 : * split_free_page() -- split a free page at split_pfn_offset
1017 : * @free_page: the original free page
1018 : * @order: the order of the page
1019 : * @split_pfn_offset: split offset within the page
1020 : *
1021 : * Return -ENOENT if the free page is changed, otherwise 0
1022 : *
1023 : * It is used when the free page crosses two pageblocks with different migratetypes
1024 : * at split_pfn_offset within the page. The split free page will be put into
1025 : * separate migratetype lists afterwards. Otherwise, the function achieves
1026 : * nothing.
1027 : */
1028 0 : int split_free_page(struct page *free_page,
1029 : unsigned int order, unsigned long split_pfn_offset)
1030 : {
1031 0 : struct zone *zone = page_zone(free_page);
1032 0 : unsigned long free_page_pfn = page_to_pfn(free_page);
1033 : unsigned long pfn;
1034 : unsigned long flags;
1035 : int free_page_order;
1036 : int mt;
1037 0 : int ret = 0;
1038 :
1039 0 : if (split_pfn_offset == 0)
1040 : return ret;
1041 :
1042 0 : spin_lock_irqsave(&zone->lock, flags);
1043 :
1044 0 : if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
1045 : ret = -ENOENT;
1046 : goto out;
1047 : }
1048 :
1049 0 : mt = get_pageblock_migratetype(free_page);
1050 0 : if (likely(!is_migrate_isolate(mt)))
1051 0 : __mod_zone_freepage_state(zone, -(1UL << order), mt);
1052 :
1053 0 : del_page_from_free_list(free_page, zone, order);
1054 0 : for (pfn = free_page_pfn;
1055 0 : pfn < free_page_pfn + (1UL << order);) {
1056 0 : int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
1057 :
1058 0 : free_page_order = min_t(unsigned int,
1059 : pfn ? __ffs(pfn) : order,
1060 : __fls(split_pfn_offset));
1061 0 : __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
1062 : mt, FPI_NONE);
1063 0 : pfn += 1UL << free_page_order;
1064 0 : split_pfn_offset -= (1UL << free_page_order);
1065 : /* we have done the first part, now switch to second part */
1066 0 : if (split_pfn_offset == 0)
1067 0 : split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
1068 : }
1069 : out:
1070 0 : spin_unlock_irqrestore(&zone->lock, flags);
1071 0 : return ret;
1072 : }
1073 : /*
1074 : * A bad page could be due to a number of fields. Instead of multiple branches,
1075 : * try and check multiple fields with one check. The caller must do a detailed
1076 : * check if necessary.
1077 : */
1078 : static inline bool page_expected_state(struct page *page,
1079 : unsigned long check_flags)
1080 : {
1081 0 : if (unlikely(atomic_read(&page->_mapcount) != -1))
1082 : return false;
1083 :
1084 0 : if (unlikely((unsigned long)page->mapping |
1085 : page_ref_count(page) |
1086 : #ifdef CONFIG_MEMCG
1087 : page->memcg_data |
1088 : #endif
1089 : (page->flags & check_flags)))
1090 : return false;
1091 :
1092 : return true;
1093 : }
1094 :
1095 : static const char *page_bad_reason(struct page *page, unsigned long flags)
1096 : {
1097 0 : const char *bad_reason = NULL;
1098 :
1099 0 : if (unlikely(atomic_read(&page->_mapcount) != -1))
1100 0 : bad_reason = "nonzero mapcount";
1101 0 : if (unlikely(page->mapping != NULL))
1102 0 : bad_reason = "non-NULL mapping";
1103 0 : if (unlikely(page_ref_count(page) != 0))
1104 0 : bad_reason = "nonzero _refcount";
1105 0 : if (unlikely(page->flags & flags)) {
1106 : if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1107 : bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1108 : else
1109 0 : bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1110 : }
1111 : #ifdef CONFIG_MEMCG
1112 : if (unlikely(page->memcg_data))
1113 : bad_reason = "page still charged to cgroup";
1114 : #endif
1115 : return bad_reason;
1116 : }
1117 :
1118 0 : static void free_page_is_bad_report(struct page *page)
1119 : {
1120 0 : bad_page(page,
1121 : page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
1122 0 : }
1123 :
1124 0 : static inline bool free_page_is_bad(struct page *page)
1125 : {
1126 0 : if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
1127 : return false;
1128 :
1129 : /* Something has gone sideways, find it */
1130 0 : free_page_is_bad_report(page);
1131 0 : return true;
1132 : }
1133 :
1134 42 : static int free_tail_page_prepare(struct page *head_page, struct page *page)
1135 : {
1136 42 : struct folio *folio = (struct folio *)head_page;
1137 42 : int ret = 1;
1138 :
1139 : /*
1140 : * We rely on page->lru.next never having bit 0 set, unless the page
1141 : * is PageTail(). Let's make sure that's true even for poisoned ->lru.
1142 : */
1143 : BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
1144 :
1145 42 : if (!static_branch_unlikely(&check_pages_enabled)) {
1146 : ret = 0;
1147 : goto out;
1148 : }
1149 0 : switch (page - head_page) {
1150 : case 1:
1151 : /* the first tail page: these may be in place of ->mapping */
1152 0 : if (unlikely(folio_entire_mapcount(folio))) {
1153 0 : bad_page(page, "nonzero entire_mapcount");
1154 0 : goto out;
1155 : }
1156 0 : if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
1157 0 : bad_page(page, "nonzero nr_pages_mapped");
1158 0 : goto out;
1159 : }
1160 0 : if (unlikely(atomic_read(&folio->_pincount))) {
1161 0 : bad_page(page, "nonzero pincount");
1162 0 : goto out;
1163 : }
1164 : break;
1165 : case 2:
1166 : /*
1167 : * the second tail page: ->mapping is
1168 : * deferred_list.next -- ignore value.
1169 : */
1170 : break;
1171 : default:
1172 0 : if (page->mapping != TAIL_MAPPING) {
1173 0 : bad_page(page, "corrupted mapping in tail page");
1174 0 : goto out;
1175 : }
1176 : break;
1177 : }
1178 0 : if (unlikely(!PageTail(page))) {
1179 0 : bad_page(page, "PageTail not set");
1180 0 : goto out;
1181 : }
1182 0 : if (unlikely(compound_head(page) != head_page)) {
1183 0 : bad_page(page, "compound_head not consistent");
1184 0 : goto out;
1185 : }
1186 : ret = 0;
1187 : out:
1188 42 : page->mapping = NULL;
1189 42 : clear_compound_head(page);
1190 42 : return ret;
1191 : }
1192 :
1193 : /*
1194 : * Skip KASAN memory poisoning when either:
1195 : *
1196 : * 1. For generic KASAN: deferred memory initialization has not yet completed.
1197 : * Tag-based KASAN modes skip pages freed via deferred memory initialization
1198 : * using page tags instead (see below).
1199 : * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
1200 : * that error detection is disabled for accesses via the page address.
1201 : *
1202 : * Pages will have match-all tags in the following circumstances:
1203 : *
1204 : * 1. Pages are being initialized for the first time, including during deferred
1205 : * memory init; see the call to page_kasan_tag_reset in __init_single_page.
1206 : * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
1207 : * exception of pages unpoisoned by kasan_unpoison_vmalloc.
1208 : * 3. The allocation was excluded from being checked due to sampling,
1209 : * see the call to kasan_unpoison_pages.
1210 : *
1211 : * Poisoning pages during deferred memory init will greatly lengthen the
1212 : * process and cause problems in large memory systems, as deferred page
1213 : * initialization is done with interrupts disabled.
1214 : *
1215 : * Assuming that there will be no reference to those newly initialized
1216 : * pages before they are ever allocated, this should have no effect on
1217 : * KASAN memory tracking as the poison will be properly inserted at page
1218 : * allocation time. The only corner case is when pages are allocated by
1219 : * on-demand allocation and then freed again before the deferred pages
1220 : * initialization is done, but this is not likely to happen.
1221 : */
1222 : static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
1223 : {
1224 : if (IS_ENABLED(CONFIG_KASAN_GENERIC))
1225 : return deferred_pages_enabled();
1226 :
1227 44496 : return page_kasan_tag(page) == 0xff;
1228 : }
1229 :
1230 0 : static void kernel_init_pages(struct page *page, int numpages)
1231 : {
1232 : int i;
1233 :
1234 : /* s390's use of memset() could override KASAN redzones. */
1235 : kasan_disable_current();
1236 38291 : for (i = 0; i < numpages; i++)
1237 38291 : clear_highpage_kasan_tagged(page + i);
1238 : kasan_enable_current();
1239 0 : }
1240 :
1241 : static __always_inline bool free_pages_prepare(struct page *page,
1242 : unsigned int order, fpi_t fpi_flags)
1243 : {
1244 44496 : int bad = 0;
1245 88992 : bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
1246 44496 : bool init = want_init_on_free();
1247 :
1248 : VM_BUG_ON_PAGE(PageTail(page), page);
1249 :
1250 44496 : trace_mm_page_free(page, order);
1251 44496 : kmsan_free_page(page, order);
1252 :
1253 44496 : if (unlikely(PageHWPoison(page)) && !order) {
1254 : /*
1255 : * Do not let hwpoison pages hit pcplists/buddy
1256 : * Untie memcg state and reset page's owner
1257 : */
1258 : if (memcg_kmem_online() && PageMemcgKmem(page))
1259 : __memcg_kmem_uncharge_page(page, order);
1260 : reset_page_owner(page, order);
1261 : page_table_check_free(page, order);
1262 : return false;
1263 : }
1264 :
1265 : /*
1266 : * Check tail pages before head page information is cleared to
1267 : * avoid checking PageCompound for order-0 pages.
1268 : */
1269 44496 : if (unlikely(order)) {
1270 263 : bool compound = PageCompound(page);
1271 : int i;
1272 :
1273 : VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1274 :
1275 : if (compound)
1276 : ClearPageHasHWPoisoned(page);
1277 254251 : for (i = 1; i < (1 << order); i++) {
1278 254251 : if (compound)
1279 42 : bad += free_tail_page_prepare(page, page + i);
1280 254251 : if (is_check_pages_enabled()) {
1281 0 : if (free_page_is_bad(page + i)) {
1282 0 : bad++;
1283 0 : continue;
1284 : }
1285 : }
1286 254251 : (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1287 : }
1288 : }
1289 44496 : if (PageMappingFlags(page))
1290 0 : page->mapping = NULL;
1291 : if (memcg_kmem_online() && PageMemcgKmem(page))
1292 : __memcg_kmem_uncharge_page(page, order);
1293 44496 : if (is_check_pages_enabled()) {
1294 0 : if (free_page_is_bad(page))
1295 0 : bad++;
1296 0 : if (bad)
1297 : return false;
1298 : }
1299 :
1300 44496 : page_cpupid_reset_last(page);
1301 44496 : page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1302 : reset_page_owner(page, order);
1303 44496 : page_table_check_free(page, order);
1304 :
1305 44496 : if (!PageHighMem(page)) {
1306 : debug_check_no_locks_freed(page_address(page),
1307 : PAGE_SIZE << order);
1308 : debug_check_no_obj_freed(page_address(page),
1309 : PAGE_SIZE << order);
1310 : }
1311 :
1312 44496 : kernel_poison_pages(page, 1 << order);
1313 :
1314 : /*
1315 : * As memory initialization might be integrated into KASAN,
1316 : * KASAN poisoning and memory initialization code must be
1317 : * kept together to avoid discrepancies in behavior.
1318 : *
1319 : * With hardware tag-based KASAN, memory tags must be set before the
1320 : * page becomes unavailable via debug_pagealloc or arch_free_page.
1321 : */
1322 : if (!skip_kasan_poison) {
1323 : kasan_poison_pages(page, order, init);
1324 :
1325 : /* Memory is already initialized if KASAN did it internally. */
1326 : if (kasan_has_integrated_init())
1327 : init = false;
1328 : }
1329 44496 : if (init)
1330 0 : kernel_init_pages(page, 1 << order);
1331 :
1332 : /*
1333 : * arch_free_page() can make the page's contents inaccessible. s390
1334 : * does this. So nothing which can access the page's contents should
1335 : * happen after this.
1336 : */
1337 : arch_free_page(page, order);
1338 :
1339 : debug_pagealloc_unmap_pages(page, 1 << order);
1340 :
1341 : return true;
1342 : }
1343 :
1344 : /*
1345 : * Frees a number of pages from the PCP lists
1346 : * Assumes all pages on list are in same zone.
1347 : * count is the number of pages to free.
1348 : */
1349 4 : static void free_pcppages_bulk(struct zone *zone, int count,
1350 : struct per_cpu_pages *pcp,
1351 : int pindex)
1352 : {
1353 : unsigned long flags;
1354 4 : int min_pindex = 0;
1355 4 : int max_pindex = NR_PCP_LISTS - 1;
1356 : unsigned int order;
1357 : bool isolated_pageblocks;
1358 : struct page *page;
1359 :
1360 : /*
1361 : * Ensure a proper count is passed; otherwise we would get stuck in the
1362 : * while (list_empty(list)) loop below.
1363 : */
1364 4 : count = min(pcp->count, count);
1365 :
1366 : /* Ensure requested pindex is drained first. */
1367 4 : pindex = pindex - 1;
1368 :
1369 4 : spin_lock_irqsave(&zone->lock, flags);
1370 4 : isolated_pageblocks = has_isolate_pageblock(zone);
1371 :
1372 12 : while (count > 0) {
1373 : struct list_head *list;
1374 : int nr_pages;
1375 :
1376 : /* Remove pages from lists in a round-robin fashion. */
1377 : do {
1378 4 : if (++pindex > max_pindex)
1379 0 : pindex = min_pindex;
1380 4 : list = &pcp->lists[pindex];
1381 4 : if (!list_empty(list))
1382 : break;
1383 :
1384 0 : if (pindex == max_pindex)
1385 0 : max_pindex--;
1386 0 : if (pindex == min_pindex)
1387 0 : min_pindex++;
1388 : } while (1);
1389 :
1390 8 : order = pindex_to_order(pindex);
1391 4 : nr_pages = 1 << order;
1392 : do {
1393 : int mt;
1394 :
1395 945 : page = list_last_entry(list, struct page, pcp_list);
1396 1890 : mt = get_pcppage_migratetype(page);
1397 :
1398 : /* must delete to avoid corrupting pcp list */
1399 1890 : list_del(&page->pcp_list);
1400 945 : count -= nr_pages;
1401 945 : pcp->count -= nr_pages;
1402 :
1403 : /* MIGRATE_ISOLATE page should not go to pcplists */
1404 : VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1405 : /* Pageblock could have been isolated meanwhile */
1406 : if (unlikely(isolated_pageblocks))
1407 : mt = get_pageblock_migratetype(page);
1408 :
1409 945 : __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
1410 945 : trace_mm_page_pcpu_drain(page, order, mt);
1411 1886 : } while (count > 0 && !list_empty(list));
1412 : }
1413 :
1414 8 : spin_unlock_irqrestore(&zone->lock, flags);
1415 4 : }
1416 :
1417 0 : static void free_one_page(struct zone *zone,
1418 : struct page *page, unsigned long pfn,
1419 : unsigned int order,
1420 : int migratetype, fpi_t fpi_flags)
1421 : {
1422 : unsigned long flags;
1423 :
1424 0 : spin_lock_irqsave(&zone->lock, flags);
1425 0 : if (unlikely(has_isolate_pageblock(zone) ||
1426 : is_migrate_isolate(migratetype))) {
1427 : migratetype = get_pfnblock_migratetype(page, pfn);
1428 : }
1429 0 : __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
1430 0 : spin_unlock_irqrestore(&zone->lock, flags);
1431 0 : }
1432 :
1433 260 : static void __free_pages_ok(struct page *page, unsigned int order,
1434 : fpi_t fpi_flags)
1435 : {
1436 : unsigned long flags;
1437 : int migratetype;
1438 260 : unsigned long pfn = page_to_pfn(page);
1439 260 : struct zone *zone = page_zone(page);
1440 :
1441 260 : if (!free_pages_prepare(page, order, fpi_flags))
1442 : return;
1443 :
1444 : /*
1445 : * get_pfnblock_migratetype() is called without spin_lock_irqsave() here
1446 : * so that the lookup does not have to be done under the zone lock.
1447 : * This reduces the lock holding time.
1448 : */
1449 260 : migratetype = get_pfnblock_migratetype(page, pfn);
1450 :
1451 260 : spin_lock_irqsave(&zone->lock, flags);
1452 : if (unlikely(has_isolate_pageblock(zone) ||
1453 : is_migrate_isolate(migratetype))) {
1454 : migratetype = get_pfnblock_migratetype(page, pfn);
1455 : }
1456 260 : __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
1457 520 : spin_unlock_irqrestore(&zone->lock, flags);
1458 :
1459 260 : __count_vm_events(PGFREE, 1 << order);
1460 : }
1461 :
1462 260 : void __free_pages_core(struct page *page, unsigned int order)
1463 : {
1464 260 : unsigned int nr_pages = 1 << order;
1465 260 : struct page *p = page;
1466 : unsigned int loop;
1467 :
1468 : /*
1469 : * When initializing the memmap, __init_single_page() sets the refcount
1470 : * of all pages to 1 ("allocated"/"not free"). We have to set the
1471 : * refcount of all involved pages to 0.
1472 : */
1473 260 : prefetchw(p);
1474 254469 : for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1475 254209 : prefetchw(p + 1);
1476 254209 : __ClearPageReserved(p);
1477 254209 : set_page_count(p, 0);
1478 : }
1479 260 : __ClearPageReserved(p);
1480 260 : set_page_count(p, 0);
1481 :
1482 520 : atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1483 :
1484 : /*
1485 : * Bypass PCP and place fresh pages right to the tail, primarily
1486 : * relevant for memory onlining.
1487 : */
1488 260 : __free_pages_ok(page, order, FPI_TO_TAIL);
1489 260 : }
1490 :
1491 : /*
1492 : * Check that the whole (or a subset of a) pageblock given by the interval of
1493 : * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1494 : * with the migration or free compaction scanner.
1495 : *
1496 : * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1497 : *
1498 : * It's possible on some configurations to have a setup like node0 node1 node0
1499 : * i.e. it's possible that all pages within a zone's range of pages do not
1500 : * belong to a single zone. We assume that a border between node0 and node1
1501 : * can occur within a single pageblock, but not a node0 node1 node0
1502 : * interleaving within a single pageblock. It is therefore sufficient to check
1503 : * the first and last page of a pageblock and avoid checking each individual
1504 : * page in a pageblock.
1505 : *
1506 : * Note: the function may return non-NULL struct page even for a page block
1507 : * which contains a memory hole (i.e. there is no physical memory for a subset
1508 : * of the pfn range). For example, if the pageblock order is MAX_ORDER, which
1509 : * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
1510 : * even though the start pfn is online and valid. This should be safe most of
1511 : * the time because struct pages are still initialized via init_unavailable_range()
1512 : * and pfn walkers shouldn't touch any physical memory range for which they do
1513 : * not recognize any specific metadata in struct pages.
1514 : */
1515 260 : struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1516 : unsigned long end_pfn, struct zone *zone)
1517 : {
1518 : struct page *start_page;
1519 : struct page *end_page;
1520 :
1521 : /* end_pfn is one past the range we are checking */
1522 260 : end_pfn--;
1523 :
1524 520 : if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1525 : return NULL;
1526 :
1527 520 : start_page = pfn_to_online_page(start_pfn);
1528 260 : if (!start_page)
1529 : return NULL;
1530 :
1531 260 : if (page_zone(start_page) != zone)
1532 : return NULL;
1533 :
1534 260 : end_page = pfn_to_page(end_pfn);
1535 :
1536 : /* This gives a shorter code than deriving page_zone(end_page) */
1537 780 : if (page_zone_id(start_page) != page_zone_id(end_page))
1538 : return NULL;
1539 :
1540 260 : return start_page;
1541 : }
1542 :
1543 1 : void set_zone_contiguous(struct zone *zone)
1544 : {
1545 1 : unsigned long block_start_pfn = zone->zone_start_pfn;
1546 : unsigned long block_end_pfn;
1547 :
1548 1 : block_end_pfn = pageblock_end_pfn(block_start_pfn);
1549 523 : for (; block_start_pfn < zone_end_pfn(zone);
1550 260 : block_start_pfn = block_end_pfn,
1551 260 : block_end_pfn += pageblock_nr_pages) {
1552 :
1553 260 : block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1554 :
1555 260 : if (!__pageblock_pfn_to_page(block_start_pfn,
1556 : block_end_pfn, zone))
1557 : return;
1558 260 : cond_resched();
1559 : }
1560 :
1561 : /* We confirm that there is no hole */
1562 1 : zone->contiguous = true;
1563 : }
1564 :
1565 0 : void clear_zone_contiguous(struct zone *zone)
1566 : {
1567 0 : zone->contiguous = false;
1568 0 : }
1569 :
1570 : /*
1571 : * The order of subdivision here is critical for the IO subsystem.
1572 : * Please do not alter this order without good reasons and regression
1573 : * testing. Specifically, as large blocks of memory are subdivided,
1574 : * the order in which smaller blocks are delivered depends on the order
1575 : * they're subdivided in this function. This is the primary factor
1576 : * influencing the order in which pages are delivered to the IO
1577 : * subsystem according to empirical testing, and this is also justified
1578 : * by considering the behavior of a buddy system containing a single
1579 : * large block of memory acted on by a series of small allocations.
1580 : * This behavior is a critical factor in sglist merging's success.
1581 : *
1582 : * -- nyc
1583 : */
1584 : static inline void expand(struct zone *zone, struct page *page,
1585 : int low, int high, int migratetype)
1586 : {
1587 2321 : unsigned long size = 1 << high;
1588 :
1589 4647 : while (high > low) {
1590 2326 : high--;
1591 2326 : size >>= 1;
1592 : VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1593 :
1594 : /*
1595 : * Mark as guard page(s); this allows merging back into the
1596 : * allocator when the buddy is freed.
1597 : * The corresponding page table entries are not touched;
1598 : * the pages stay not present in the virtual address space.
1599 : */
1600 2326 : if (set_page_guard(zone, &page[size], high, migratetype))
1601 : continue;
1602 :
1603 4652 : add_to_free_list(&page[size], zone, high, migratetype);
1604 2326 : set_buddy_order(&page[size], high);
1605 : }
1606 : }
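/*
 * A worked example of expand(): serving an order-0 request (low == 0) from an
 * order-3 free block (high == 3) peels off and re-frees the upper half one
 * order at a time:
 *
 *	order-3 block at pfn P
 *	-> keep P..P+3, free the order-2 buddy at P+4
 *	-> keep P..P+1, free the order-1 buddy at P+2
 *	-> keep P,      free the order-0 buddy at P+1
 *
 * leaving page P for the caller and three smaller blocks on the free lists.
 */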
1607 :
1608 0 : static void check_new_page_bad(struct page *page)
1609 : {
1610 : if (unlikely(page->flags & __PG_HWPOISON)) {
1611 : /* Don't complain about hwpoisoned pages */
1612 : page_mapcount_reset(page); /* remove PageBuddy */
1613 : return;
1614 : }
1615 :
1616 0 : bad_page(page,
1617 : page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
1618 : }
1619 :
1620 : /*
1621 : * This page is about to be returned from the page allocator
1622 : */
1623 0 : static int check_new_page(struct page *page)
1624 : {
1625 0 : if (likely(page_expected_state(page,
1626 : PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
1627 : return 0;
1628 :
1629 0 : check_new_page_bad(page);
1630 0 : return 1;
1631 : }
1632 :
1633 44763 : static inline bool check_new_pages(struct page *page, unsigned int order)
1634 : {
1635 44763 : if (is_check_pages_enabled()) {
1636 0 : for (int i = 0; i < (1 << order); i++) {
1637 0 : struct page *p = page + i;
1638 :
1639 0 : if (check_new_page(p))
1640 : return true;
1641 : }
1642 : }
1643 :
1644 : return false;
1645 : }
1646 :
1647 : static inline bool should_skip_kasan_unpoison(gfp_t flags)
1648 : {
1649 : /* Don't skip if a software KASAN mode is enabled. */
1650 : if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
1651 : IS_ENABLED(CONFIG_KASAN_SW_TAGS))
1652 : return false;
1653 :
1654 : /* Skip, if hardware tag-based KASAN is not enabled. */
1655 : if (!kasan_hw_tags_enabled())
1656 : return true;
1657 :
1658 : /*
1659 : * With hardware tag-based KASAN enabled, skip if this has been
1660 : * requested via __GFP_SKIP_KASAN.
1661 : */
1662 : return flags & __GFP_SKIP_KASAN;
1663 : }
1664 :
1665 : static inline bool should_skip_init(gfp_t flags)
1666 : {
1667 : /* Don't skip, if hardware tag-based KASAN is not enabled. */
1668 : if (!kasan_hw_tags_enabled())
1669 : return false;
1670 :
1671 : /* For hardware tag-based KASAN, skip if requested. */
1672 : return (flags & __GFP_SKIP_ZERO);
1673 : }
1674 :
1675 44763 : inline void post_alloc_hook(struct page *page, unsigned int order,
1676 : gfp_t gfp_flags)
1677 : {
1678 89526 : bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
1679 : !should_skip_init(gfp_flags);
1680 44763 : bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
1681 : int i;
1682 :
1683 89526 : set_page_private(page, 0);
1684 44763 : set_page_refcounted(page);
1685 :
1686 44763 : arch_alloc_page(page, order);
1687 44763 : debug_pagealloc_map_pages(page, 1 << order);
1688 :
1689 : /*
1690 : * Page unpoisoning must happen before memory initialization.
1691 : * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
1692 : * allocations and the page unpoisoning code will complain.
1693 : */
1694 44763 : kernel_unpoison_pages(page, 1 << order);
1695 :
1696 : /*
1697 : * As memory initialization might be integrated into KASAN,
1698 : * KASAN unpoisoning and memory initialization code must be
1699 : * kept together to avoid discrepancies in behavior.
1700 : */
1701 :
1702 : /*
1703 : * Zero the memory tags if requested (this happens only when the
1704 : * memory itself should be initialized as well).
1705 : */
1706 44763 : if (zero_tags) {
1707 : /* Initialize both memory and memory tags. */
1708 : for (i = 0; i != 1 << order; ++i)
1709 : tag_clear_highpage(page + i);
1710 :
1711 : /* Take note that memory was initialized by the loop above. */
1712 : init = false;
1713 : }
1714 44763 : if (!should_skip_kasan_unpoison(gfp_flags) &&
1715 : kasan_unpoison_pages(page, order, init)) {
1716 : /* Take note that memory was initialized by KASAN. */
1717 : if (kasan_has_integrated_init())
1718 : init = false;
1719 : } else {
1720 : /*
1721 : * If memory tags have not been set by KASAN, reset the page
1722 : * tags to ensure page_address() dereferencing does not fault.
1723 : */
1724 44763 : for (i = 0; i != 1 << order; ++i)
1725 : page_kasan_tag_reset(page + i);
1726 : }
1727 : /* If memory is still not initialized, initialize it now. */
1728 44763 : if (init)
1729 : kernel_init_pages(page, 1 << order);
1730 :
1731 44763 : set_page_owner(page, order, gfp_flags);
1732 44763 : page_table_check_alloc(page, order);
1733 44763 : }
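
The interplay of want_init_on_alloc(), want_init_on_free() and the hardware tag-based KASAN opt-out above boils down to a single boolean decision. A minimal userspace sketch of that decision follows; init_now() and its arguments are illustrative stand-ins for the kernel helpers, not real APIs.

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the "zero this allocation here?" decision in post_alloc_hook():
 * initialize at allocation time only if init-on-alloc is in effect, the
 * pages are not already zeroed on free, and the caller has not opted out
 * (e.g. hardware tag-based KASAN with __GFP_SKIP_ZERO). */
static bool init_now(bool init_on_alloc, bool init_on_free, bool skip_requested)
{
	return !init_on_free && init_on_alloc && !skip_requested;
}

int main(void)
{
	printf("%d\n", init_now(true, false, false));	/* 1: zero at alloc time */
	printf("%d\n", init_now(true, true, false));	/* 0: already zeroed on free */
	printf("%d\n", init_now(true, false, true));	/* 0: caller opted out */
	return 0;
}
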
1734 :
1735 2222 : static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1736 : unsigned int alloc_flags)
1737 : {
1738 44763 : post_alloc_hook(page, order, gfp_flags);
1739 :
1740 2222 : if (order && (gfp_flags & __GFP_COMP))
1741 : prep_compound_page(page, order);
1742 :
1743 : /*
1744 : * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1745 : * allocate the page. The expectation is that the caller is taking
1746 : * steps that will free more memory. The caller should avoid the page
1747 : * being used for !PFMEMALLOC purposes.
1748 : */
1749 2222 : if (alloc_flags & ALLOC_NO_WATERMARKS)
1750 0 : set_page_pfmemalloc(page);
1751 : else
1752 44763 : clear_page_pfmemalloc(page);
1753 2222 : }
1754 :
1755 : /*
1756 : * Go through the free lists for the given migratetype and remove
1757 : * the smallest available page from the freelists
1758 : */
1759 : static __always_inline
1760 : struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1761 : int migratetype)
1762 : {
1763 : unsigned int current_order;
1764 : struct free_area *area;
1765 : struct page *page;
1766 :
1767 : /* Find a page of the appropriate size in the preferred list */
1768 9388 : for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
1769 4690 : area = &(zone->free_area[current_order]);
1770 4690 : page = get_page_from_free_area(area, migratetype);
1771 4690 : if (!page)
1772 2369 : continue;
1773 2321 : del_page_from_free_list(page, zone, current_order);
1774 4642 : expand(zone, page, order, current_order, migratetype);
1775 2321 : set_pcppage_migratetype(page, migratetype);
1776 : trace_mm_page_alloc_zone_locked(page, order, migratetype,
1777 : pcp_allowed_order(order) &&
1778 : migratetype < MIGRATE_PCPTYPES);
1779 : return page;
1780 : }
1781 :
1782 : return NULL;
1783 : }
1784 :
1785 :
1786 : /*
1787 : * This array describes the order in which free lists are fallen back to
1788 : * when the free lists for the desired migratetype are depleted.
1789 : *
1790 : * The other migratetypes do not have fallbacks.
1791 : */
1792 : static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
1793 : [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE },
1794 : [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
1795 : [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE },
1796 : };
1797 :
1798 : #ifdef CONFIG_CMA
1799 : static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1800 : unsigned int order)
1801 : {
1802 : return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1803 : }
1804 : #else
1805 : static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1806 : unsigned int order) { return NULL; }
1807 : #endif
1808 :
1809 : /*
1810 : * Move the free pages in a range to the freelist tail of the requested type.
1811 : * Note that start_pfn and end_pfn are not necessarily aligned on a pageblock
1812 : * boundary. If alignment is required, use move_freepages_block().
1813 : */
1814 0 : static int move_freepages(struct zone *zone,
1815 : unsigned long start_pfn, unsigned long end_pfn,
1816 : int migratetype, int *num_movable)
1817 : {
1818 : struct page *page;
1819 : unsigned long pfn;
1820 : unsigned int order;
1821 0 : int pages_moved = 0;
1822 :
1823 0 : for (pfn = start_pfn; pfn <= end_pfn;) {
1824 0 : page = pfn_to_page(pfn);
1825 0 : if (!PageBuddy(page)) {
1826 : /*
1827 : * We assume that pages that could be isolated for
1828 : * migration are movable. But we don't actually try
1829 : * isolating, as that would be expensive.
1830 : */
1831 0 : if (num_movable &&
1832 0 : (PageLRU(page) || __PageMovable(page)))
1833 0 : (*num_movable)++;
1834 0 : pfn++;
1835 0 : continue;
1836 : }
1837 :
1838 : /* Make sure we are not inadvertently changing nodes */
1839 : VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
1840 : VM_BUG_ON_PAGE(page_zone(page) != zone, page);
1841 :
1842 0 : order = buddy_order(page);
1843 0 : move_to_free_list(page, zone, order, migratetype);
1844 0 : pfn += 1 << order;
1845 0 : pages_moved += 1 << order;
1846 : }
1847 :
1848 0 : return pages_moved;
1849 : }
1850 :
1851 0 : int move_freepages_block(struct zone *zone, struct page *page,
1852 : int migratetype, int *num_movable)
1853 : {
1854 : unsigned long start_pfn, end_pfn, pfn;
1855 :
1856 0 : if (num_movable)
1857 0 : *num_movable = 0;
1858 :
1859 0 : pfn = page_to_pfn(page);
1860 0 : start_pfn = pageblock_start_pfn(pfn);
1861 0 : end_pfn = pageblock_end_pfn(pfn) - 1;
1862 :
1863 : /* Do not cross zone boundaries */
1864 0 : if (!zone_spans_pfn(zone, start_pfn))
1865 0 : start_pfn = pfn;
1866 0 : if (!zone_spans_pfn(zone, end_pfn))
1867 : return 0;
1868 :
1869 0 : return move_freepages(zone, start_pfn, end_pfn, migratetype,
1870 : num_movable);
1871 : }
1872 :
1873 : static void change_pageblock_range(struct page *pageblock_page,
1874 : int start_order, int migratetype)
1875 : {
1876 4 : int nr_pageblocks = 1 << (start_order - pageblock_order);
1877 :
1878 8 : while (nr_pageblocks--) {
1879 4 : set_pageblock_migratetype(pageblock_page, migratetype);
1880 4 : pageblock_page += pageblock_nr_pages;
1881 : }
1882 : }
1883 :
1884 : /*
1885 : * When we are falling back to another migratetype during allocation, try to
1886 : * steal extra free pages from the same pageblocks to satisfy further
1887 : * allocations, instead of polluting multiple pageblocks.
1888 : *
1889 : * If we are stealing a relatively large buddy page, it is likely there will
1890 : * be more free pages in the pageblock, so try to steal them all. For
1891 : * reclaimable and unmovable allocations, we steal regardless of page size,
1892 : * as fragmentation caused by those allocations polluting movable pageblocks
1893 : * is worse than movable allocations stealing from unmovable and reclaimable
1894 : * pageblocks.
1895 : */
1896 : static bool can_steal_fallback(unsigned int order, int start_mt)
1897 : {
1898 : /*
1899 : * Leaving this order check in place is intentional, even though the
1900 : * next check applies a more relaxed order test. The reason is that we
1901 : * can steal a whole pageblock only if this condition is met; the check
1902 : * below does not guarantee that and is merely a heuristic, so it may
1903 : * be changed at any time.
1904 : */
1905 4 : if (order >= pageblock_order)
1906 : return true;
1907 :
1908 0 : if (order >= pageblock_order / 2 ||
1909 0 : start_mt == MIGRATE_RECLAIMABLE ||
1910 0 : start_mt == MIGRATE_UNMOVABLE ||
1911 : page_group_by_mobility_disabled)
1912 : return true;
1913 :
1914 : return false;
1915 : }
1916 :
1917 0 : static inline bool boost_watermark(struct zone *zone)
1918 : {
1919 : unsigned long max_boost;
1920 :
1921 0 : if (!watermark_boost_factor)
1922 : return false;
1923 : /*
1924 : * Don't bother in zones that are unlikely to produce results.
1925 : * On small machines, including kdump capture kernels running
1926 : * in a small area, boosting the watermark can cause an out of
1927 : * memory situation immediately.
1928 : */
1929 0 : if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
1930 : return false;
1931 :
1932 0 : max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
1933 : watermark_boost_factor, 10000);
1934 :
1935 : /*
1936 : * The high watermark may be uninitialised if fragmentation occurs
1937 : * very early in boot, so do not boost. We do not fall
1938 : * through and boost by pageblock_nr_pages because failing
1939 : * allocations that early means that reclaim is not going
1940 : * to help, and it may even be impossible to reclaim the
1941 : * boosted watermark, resulting in a hang.
1942 : */
1943 0 : if (!max_boost)
1944 : return false;
1945 :
1946 0 : max_boost = max(pageblock_nr_pages, max_boost);
1947 :
1948 0 : zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
1949 : max_boost);
1950 :
1951 0 : return true;
1952 : }
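
To make the boost arithmetic concrete, here is a standalone sketch with made-up zone numbers: a 12800-page high watermark, 512-page pageblocks and a watermark_boost_factor of 15000 (i.e. 150% of the high watermark) are illustrative values only. Each fallback event adds one pageblock to the boost until the cap computed by mult_frac() is reached.

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	unsigned long high_wmark = 12800;		/* illustrative */
	unsigned long pageblock_nr_pages = 512;		/* illustrative */
	unsigned long watermark_boost_factor = 15000;	/* 150%, illustrative */
	unsigned long boost = 0;			/* zone->watermark_boost */

	/* mult_frac(high, factor, 10000) */
	unsigned long max_boost = high_wmark * watermark_boost_factor / 10000;

	max_boost = MAX(pageblock_nr_pages, max_boost);

	/* Each fallback event bumps the boost by one pageblock, capped. */
	for (int i = 0; i < 40; i++)
		boost = MIN(boost + pageblock_nr_pages, max_boost);

	printf("max_boost=%lu final boost=%lu\n", max_boost, boost);
	return 0;
}
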
1953 :
1954 : /*
1955 : * This function implements the actual steal behaviour. If the order is
1956 : * large enough, we can steal the whole pageblock. If not, we first move the
1957 : * free pages in this pageblock to our migratetype and determine how many
1958 : * already-allocated pages in the pageblock have a compatible migratetype.
1959 : * If at least half of the pages are free or compatible, we can change the
1960 : * migratetype of the pageblock itself, so future frees land on the correct free list.
1961 : */
1962 4 : static void steal_suitable_fallback(struct zone *zone, struct page *page,
1963 : unsigned int alloc_flags, int start_type, bool whole_block)
1964 : {
1965 8 : unsigned int current_order = buddy_order(page);
1966 : int free_pages, movable_pages, alike_pages;
1967 : int old_block_type;
1968 :
1969 8 : old_block_type = get_pageblock_migratetype(page);
1970 :
1971 : /*
1972 : * This can happen due to races and we want to prevent broken
1973 : * highatomic accounting.
1974 : */
1975 4 : if (is_migrate_highatomic(old_block_type))
1976 : goto single_page;
1977 :
1978 : /* Take ownership for orders >= pageblock_order */
1979 4 : if (current_order >= pageblock_order) {
1980 4 : change_pageblock_range(page, current_order, start_type);
1981 : goto single_page;
1982 : }
1983 :
1984 : /*
1985 : * Boost watermarks to increase reclaim pressure to reduce the
1986 : * likelihood of future fallbacks. Wake kswapd now as the node
1987 : * may be balanced overall and kswapd will not wake naturally.
1988 : */
1989 0 : if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
1990 0 : set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
1991 :
1992 : /* We are not allowed to try stealing from the whole block */
1993 0 : if (!whole_block)
1994 : goto single_page;
1995 :
1996 0 : free_pages = move_freepages_block(zone, page, start_type,
1997 : &movable_pages);
1998 : /*
1999 : * Determine how many pages are compatible with our allocation.
2000 : * For movable allocation, it's the number of movable pages which
2001 : * we just obtained. For other types it's a bit more tricky.
2002 : */
2003 0 : if (start_type == MIGRATE_MOVABLE) {
2004 0 : alike_pages = movable_pages;
2005 : } else {
2006 : /*
2007 : * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2008 : * to MOVABLE pageblock, consider all non-movable pages as
2009 : * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2010 : * vice versa, be conservative since we can't distinguish the
2011 : * exact migratetype of non-movable pages.
2012 : */
2013 0 : if (old_block_type == MIGRATE_MOVABLE)
2014 0 : alike_pages = pageblock_nr_pages
2015 0 : - (free_pages + movable_pages);
2016 : else
2017 : alike_pages = 0;
2018 : }
2019 :
2020 : /* moving whole block can fail due to zone boundary conditions */
2021 0 : if (!free_pages)
2022 : goto single_page;
2023 :
2024 : /*
2025 : * If a sufficient number of pages in the block are either free or of
2026 : * comparable migratability to our allocation, claim the whole block.
2027 : */
2028 0 : if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2029 : page_group_by_mobility_disabled)
2030 0 : set_pageblock_migratetype(page, start_type);
2031 :
2032 0 : return;
2033 :
2034 : single_page:
2035 4 : move_to_free_list(page, zone, current_order, start_type);
2036 : }
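
The final "claim the whole block" test is just a half-pageblock threshold. A small userspace sketch with an assumed pageblock_order of 9 (512-page pageblocks) and made-up page counts:

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the claim decision: the block's migratetype is changed only if
 * at least half of it is free or already of a compatible type. */
static bool should_claim_block(int free_pages, int alike_pages)
{
	const int pageblock_order = 9;	/* illustrative: 512-page blocks */

	return free_pages + alike_pages >= (1 << (pageblock_order - 1));
}

int main(void)
{
	printf("%d\n", should_claim_block(200, 40));	/* 240 < 256 -> 0 */
	printf("%d\n", should_claim_block(200, 80));	/* 280 >= 256 -> 1 */
	return 0;
}
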
2037 :
2038 : /*
2039 : * Check whether there is a suitable fallback freepage with requested order.
2040 : * If only_stealable is true, this function returns fallback_mt only if
2041 : * we can steal other freepages all together. This would help to reduce
2042 : * fragmentation due to mixed migratetype pages in one pageblock.
2043 : */
2044 4 : int find_suitable_fallback(struct free_area *area, unsigned int order,
2045 : int migratetype, bool only_stealable, bool *can_steal)
2046 : {
2047 : int i;
2048 : int fallback_mt;
2049 :
2050 4 : if (area->nr_free == 0)
2051 : return -1;
2052 :
2053 4 : *can_steal = false;
2054 8 : for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
2055 8 : fallback_mt = fallbacks[migratetype][i];
2056 8 : if (free_area_empty(area, fallback_mt))
2057 4 : continue;
2058 :
2059 4 : if (can_steal_fallback(order, migratetype))
2060 4 : *can_steal = true;
2061 :
2062 4 : if (!only_stealable)
2063 : return fallback_mt;
2064 :
2065 0 : if (*can_steal)
2066 : return fallback_mt;
2067 : }
2068 :
2069 : return -1;
2070 : }
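
A minimal userspace sketch of the table walk performed here (ignoring the can_steal/only_stealable handling): for the depleted migratetype, try each entry of its row in the fallbacks array and report the first one whose free list is non-empty. The enum names and nr_free[] values are illustrative.

#include <stdio.h>

enum { UNMOVABLE, MOVABLE, RECLAIMABLE, NR_TYPES };

/* Same shape as the fallbacks[][] table above, with illustrative indices. */
static const int fallbacks[NR_TYPES][2] = {
	[UNMOVABLE]   = { RECLAIMABLE, MOVABLE },
	[MOVABLE]     = { RECLAIMABLE, UNMOVABLE },
	[RECLAIMABLE] = { UNMOVABLE, MOVABLE },
};

int main(void)
{
	int nr_free[NR_TYPES] = { [UNMOVABLE] = 0, [MOVABLE] = 37, [RECLAIMABLE] = 0 };
	int start = UNMOVABLE;		/* the depleted migratetype */

	for (int i = 0; i < 2; i++) {
		int mt = fallbacks[start][i];
		if (nr_free[mt]) {
			printf("fall back from %d to %d\n", start, mt);
			return 0;
		}
	}
	printf("no fallback available\n");
	return 0;
}
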
2071 :
2072 : /*
2073 : * Reserve a pageblock for exclusive use of high-order atomic allocations if
2074 : * there are no empty page blocks that contain a page with a suitable order
2075 : */
2076 0 : static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2077 : unsigned int alloc_order)
2078 : {
2079 : int mt;
2080 : unsigned long max_managed, flags;
2081 :
2082 : /*
2083 : * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2084 : * Check is race-prone but harmless.
2085 : */
2086 0 : max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
2087 0 : if (zone->nr_reserved_highatomic >= max_managed)
2088 : return;
2089 :
2090 0 : spin_lock_irqsave(&zone->lock, flags);
2091 :
2092 : /* Recheck the nr_reserved_highatomic limit under the lock */
2093 0 : if (zone->nr_reserved_highatomic >= max_managed)
2094 : goto out_unlock;
2095 :
2096 : /* Yoink! */
2097 0 : mt = get_pageblock_migratetype(page);
2098 : /* Only reserve normal pageblocks (i.e., they can merge with others) */
2099 0 : if (migratetype_is_mergeable(mt)) {
2100 0 : zone->nr_reserved_highatomic += pageblock_nr_pages;
2101 0 : set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2102 0 : move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
2103 : }
2104 :
2105 : out_unlock:
2106 0 : spin_unlock_irqrestore(&zone->lock, flags);
2107 : }
2108 :
2109 : /*
2110 : * Used when an allocation is about to fail under memory pressure. This
2111 : * potentially hurts the reliability of high-order allocations when under
2112 : * intense memory pressure but failed atomic allocations should be easier
2113 : * to recover from than an OOM.
2114 : *
2115 : * If @force is true, try to unreserve a pageblock even though highatomic
2116 : * pageblock is exhausted.
2117 : */
2118 0 : static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2119 : bool force)
2120 : {
2121 0 : struct zonelist *zonelist = ac->zonelist;
2122 : unsigned long flags;
2123 : struct zoneref *z;
2124 : struct zone *zone;
2125 : struct page *page;
2126 : int order;
2127 : bool ret;
2128 :
2129 0 : for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
2130 : ac->nodemask) {
2131 : /*
2132 : * Preserve at least one pageblock unless memory pressure
2133 : * is really high.
2134 : */
2135 0 : if (!force && zone->nr_reserved_highatomic <=
2136 : pageblock_nr_pages)
2137 0 : continue;
2138 :
2139 0 : spin_lock_irqsave(&zone->lock, flags);
2140 0 : for (order = 0; order <= MAX_ORDER; order++) {
2141 0 : struct free_area *area = &(zone->free_area[order]);
2142 :
2143 0 : page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
2144 0 : if (!page)
2145 0 : continue;
2146 :
2147 : /*
2148 : * In the page freeing path, migratetype changes are racy, so
2149 : * we can encounter several free pages of a pageblock
2150 : * in this loop even though we changed the pageblock type
2151 : * from highatomic to ac->migratetype. So we should
2152 : * adjust the count only once.
2153 : */
2154 0 : if (is_migrate_highatomic_page(page)) {
2155 : /*
2156 : * It should never happen but changes to
2157 : * locking could inadvertently allow a per-cpu
2158 : * drain to add pages to MIGRATE_HIGHATOMIC
2159 : * while unreserving so be safe and watch for
2160 : * underflows.
2161 : */
2162 0 : zone->nr_reserved_highatomic -= min(
2163 : pageblock_nr_pages,
2164 : zone->nr_reserved_highatomic);
2165 : }
2166 :
2167 : /*
2168 : * Convert to ac->migratetype and avoid the normal
2169 : * pageblock stealing heuristics. Minimally, the caller
2170 : * is doing the work and needs the pages. More
2171 : * importantly, if the block was always converted to
2172 : * MIGRATE_UNMOVABLE or another type then the number
2173 : * of pageblocks that cannot be completely freed
2174 : * may increase.
2175 : */
2176 0 : set_pageblock_migratetype(page, ac->migratetype);
2177 0 : ret = move_freepages_block(zone, page, ac->migratetype,
2178 : NULL);
2179 0 : if (ret) {
2180 0 : spin_unlock_irqrestore(&zone->lock, flags);
2181 0 : return ret;
2182 : }
2183 : }
2184 0 : spin_unlock_irqrestore(&zone->lock, flags);
2185 : }
2186 :
2187 : return false;
2188 : }
2189 :
2190 : /*
2191 : * Try finding a free buddy page on the fallback list and put it on the free
2192 : * list of requested migratetype, possibly along with other pages from the same
2193 : * block, depending on fragmentation avoidance heuristics. Returns true if
2194 : * fallback was found so that __rmqueue_smallest() can grab it.
2195 : *
2196 : * The use of signed ints for order and current_order is a deliberate
2197 : * deviation from the rest of this file, to make the for loop
2198 : * condition simpler.
2199 : */
2200 : static __always_inline bool
2201 : __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2202 : unsigned int alloc_flags)
2203 : {
2204 : struct free_area *area;
2205 : int current_order;
2206 4 : int min_order = order;
2207 : struct page *page;
2208 : int fallback_mt;
2209 : bool can_steal;
2210 :
2211 : /*
2212 : * Do not steal pages from freelists belonging to other pageblocks
2213 : * i.e. orders < pageblock_order. If there are no local zones free,
2214 : * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2215 : */
2216 : if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
2217 : min_order = pageblock_order;
2218 :
2219 : /*
2220 : * Find the largest available free page in the other list. This roughly
2221 : * approximates finding the pageblock with the most free pages, which
2222 : * would be too costly to do exactly.
2223 : */
2224 8 : for (current_order = MAX_ORDER; current_order >= min_order;
2225 0 : --current_order) {
2226 4 : area = &(zone->free_area[current_order]);
2227 4 : fallback_mt = find_suitable_fallback(area, current_order,
2228 : start_migratetype, false, &can_steal);
2229 4 : if (fallback_mt == -1)
2230 0 : continue;
2231 :
2232 : /*
2233 : * We cannot steal all free pages from the pageblock and the
2234 : * requested migratetype is movable. In that case it's better to
2235 : * steal and split the smallest available page instead of the
2236 : * largest available page, because even if the next movable
2237 : * allocation falls back into a different pageblock than this
2238 : * one, it won't cause permanent fragmentation.
2239 : */
2240 4 : if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2241 0 : && current_order > order)
2242 : goto find_smallest;
2243 :
2244 : goto do_steal;
2245 : }
2246 :
2247 : return false;
2248 :
2249 : find_smallest:
2250 0 : for (current_order = order; current_order <= MAX_ORDER;
2251 0 : current_order++) {
2252 0 : area = &(zone->free_area[current_order]);
2253 0 : fallback_mt = find_suitable_fallback(area, current_order,
2254 : start_migratetype, false, &can_steal);
2255 0 : if (fallback_mt != -1)
2256 : break;
2257 : }
2258 :
2259 : /*
2260 : * This should not happen - we already found a suitable fallback
2261 : * when looking for the largest page.
2262 : */
2263 : VM_BUG_ON(current_order > MAX_ORDER);
2264 :
2265 : do_steal:
2266 4 : page = get_page_from_free_area(area, fallback_mt);
2267 :
2268 4 : steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2269 : can_steal);
2270 :
2271 4 : trace_mm_page_alloc_extfrag(page, order, current_order,
2272 : start_migratetype, fallback_mt);
2273 :
2274 : return true;
2275 :
2276 : }
2277 :
2278 : /*
2279 : * Do the hard work of removing an element from the buddy allocator.
2280 : * Call me with the zone->lock already held.
2281 : */
2282 : static __always_inline struct page *
2283 : __rmqueue(struct zone *zone, unsigned int order, int migratetype,
2284 : unsigned int alloc_flags)
2285 : {
2286 : struct page *page;
2287 :
2288 : if (IS_ENABLED(CONFIG_CMA)) {
2289 : /*
2290 : * Balance movable allocations between regular and CMA areas by
2291 : * allocating from CMA when over half of the zone's free memory
2292 : * is in the CMA area.
2293 : */
2294 : if (alloc_flags & ALLOC_CMA &&
2295 : zone_page_state(zone, NR_FREE_CMA_PAGES) >
2296 : zone_page_state(zone, NR_FREE_PAGES) / 2) {
2297 : page = __rmqueue_cma_fallback(zone, order);
2298 : if (page)
2299 : return page;
2300 : }
2301 : }
2302 : retry:
2303 2325 : page = __rmqueue_smallest(zone, order, migratetype);
2304 2325 : if (unlikely(!page)) {
2305 4 : if (alloc_flags & ALLOC_CMA)
2306 0 : page = __rmqueue_cma_fallback(zone, order);
2307 :
2308 8 : if (!page && __rmqueue_fallback(zone, order, migratetype,
2309 : alloc_flags))
2310 : goto retry;
2311 : }
2312 : return page;
2313 : }
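
The CMA balancing condition at the top of __rmqueue() is a simple ratio test. A tiny sketch with made-up counters:

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the heuristic for movable allocations: prefer CMA once more
 * than half of a zone's free pages sit in the CMA region. */
static bool prefer_cma(long free_cma_pages, long free_pages)
{
	return free_cma_pages > free_pages / 2;
}

int main(void)
{
	printf("%d\n", prefer_cma(3000, 10000));	/* 0: CMA not yet dominant */
	printf("%d\n", prefer_cma(6000, 10000));	/* 1: over half is CMA */
	return 0;
}
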
2314 :
2315 : /*
2316 : * Obtain a specified number of elements from the buddy allocator, all under
2317 : * a single hold of the lock, for efficiency. Add them to the supplied list.
2318 : * Returns the number of new pages which were placed at *list.
2319 : */
2320 53 : static int rmqueue_bulk(struct zone *zone, unsigned int order,
2321 : unsigned long count, struct list_head *list,
2322 : int migratetype, unsigned int alloc_flags)
2323 : {
2324 : unsigned long flags;
2325 : int i;
2326 :
2327 53 : spin_lock_irqsave(&zone->lock, flags);
2328 2374 : for (i = 0; i < count; ++i) {
2329 2321 : struct page *page = __rmqueue(zone, order, migratetype,
2330 : alloc_flags);
2331 2321 : if (unlikely(page == NULL))
2332 : break;
2333 :
2334 : /*
2335 : * Split buddy pages returned by expand() are received here in
2336 : * physical page order. The page is added to the tail of the
2337 : * caller's list. From the caller's perspective, the linked list
2338 : * is ordered by page number under some conditions. This is
2339 : * useful for IO devices that can traverse the list in forward
2340 : * direction from the head, and thus also in physical page order,
2341 : * and for IO devices that can merge IO requests when the physical
2342 : * pages are ordered properly.
2343 : */
2344 4642 : list_add_tail(&page->pcp_list, list);
2345 : if (is_migrate_cma(get_pcppage_migratetype(page)))
2346 : __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2347 : -(1 << order));
2348 : }
2349 :
2350 106 : __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2351 106 : spin_unlock_irqrestore(&zone->lock, flags);
2352 :
2353 53 : return i;
2354 : }
2355 :
2356 : #ifdef CONFIG_NUMA
2357 : /*
2358 : * Called from the vmstat counter updater to drain pagesets of this
2359 : * currently executing processor on remote nodes after they have
2360 : * expired.
2361 : */
2362 : void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2363 : {
2364 : int to_drain, batch;
2365 :
2366 : batch = READ_ONCE(pcp->batch);
2367 : to_drain = min(pcp->count, batch);
2368 : if (to_drain > 0) {
2369 : spin_lock(&pcp->lock);
2370 : free_pcppages_bulk(zone, to_drain, pcp, 0);
2371 : spin_unlock(&pcp->lock);
2372 : }
2373 : }
2374 : #endif
2375 :
2376 : /*
2377 : * Drain pcplists of the indicated processor and zone.
2378 : */
2379 0 : static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2380 : {
2381 : struct per_cpu_pages *pcp;
2382 :
2383 0 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2384 0 : if (pcp->count) {
2385 0 : spin_lock(&pcp->lock);
2386 0 : free_pcppages_bulk(zone, pcp->count, pcp, 0);
2387 0 : spin_unlock(&pcp->lock);
2388 : }
2389 0 : }
2390 :
2391 : /*
2392 : * Drain pcplists of all zones on the indicated processor.
2393 : */
2394 0 : static void drain_pages(unsigned int cpu)
2395 : {
2396 : struct zone *zone;
2397 :
2398 0 : for_each_populated_zone(zone) {
2399 0 : drain_pages_zone(cpu, zone);
2400 : }
2401 0 : }
2402 :
2403 : /*
2404 : * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2405 : */
2406 0 : void drain_local_pages(struct zone *zone)
2407 : {
2408 0 : int cpu = smp_processor_id();
2409 :
2410 0 : if (zone)
2411 0 : drain_pages_zone(cpu, zone);
2412 : else
2413 0 : drain_pages(cpu);
2414 0 : }
2415 :
2416 : /*
2417 : * The implementation of drain_all_pages(), exposing an extra parameter to
2418 : * drain on all cpus.
2419 : *
2420 : * drain_all_pages() is optimized to only execute on cpus where pcplists are
2421 : * not empty. The check for non-emptiness can however race with a free to
2422 : * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
2423 : * that need the guarantee that every CPU has drained can disable the
2424 : * optimizing racy check.
2425 : */
2426 0 : static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
2427 : {
2428 : int cpu;
2429 :
2430 : /*
2431 : * Allocate in the BSS so we won't require allocation in
2432 : * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2433 : */
2434 : static cpumask_t cpus_with_pcps;
2435 :
2436 : /*
2437 : * Do not drain if one is already in progress unless it's specific to
2438 : * a zone. Such callers are primarily CMA and memory hotplug and need
2439 : * the drain to be complete when the call returns.
2440 : */
2441 0 : if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2442 0 : if (!zone)
2443 : return;
2444 0 : mutex_lock(&pcpu_drain_mutex);
2445 : }
2446 :
2447 : /*
2448 : * We don't care about racing with CPU hotplug event
2449 : * as offline notification will cause the notified
2450 : * cpu to drain that CPU pcps and on_each_cpu_mask
2451 : * disables preemption as part of its processing
2452 : */
2453 0 : for_each_online_cpu(cpu) {
2454 : struct per_cpu_pages *pcp;
2455 : struct zone *z;
2456 0 : bool has_pcps = false;
2457 :
2458 0 : if (force_all_cpus) {
2459 : /*
2460 : * The pcp.count check is racy, some callers need a
2461 : * guarantee that no cpu is missed.
2462 : */
2463 : has_pcps = true;
2464 0 : } else if (zone) {
2465 0 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2466 0 : if (pcp->count)
2467 0 : has_pcps = true;
2468 : } else {
2469 0 : for_each_populated_zone(z) {
2470 0 : pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
2471 0 : if (pcp->count) {
2472 : has_pcps = true;
2473 : break;
2474 : }
2475 : }
2476 : }
2477 :
2478 0 : if (has_pcps)
2479 0 : cpumask_set_cpu(cpu, &cpus_with_pcps);
2480 : else
2481 : cpumask_clear_cpu(cpu, &cpus_with_pcps);
2482 : }
2483 :
2484 0 : for_each_cpu(cpu, &cpus_with_pcps) {
2485 0 : if (zone)
2486 0 : drain_pages_zone(cpu, zone);
2487 : else
2488 0 : drain_pages(cpu);
2489 : }
2490 :
2491 0 : mutex_unlock(&pcpu_drain_mutex);
2492 : }
2493 :
2494 : /*
2495 : * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2496 : *
2497 : * When zone parameter is non-NULL, spill just the single zone's pages.
2498 : */
2499 0 : void drain_all_pages(struct zone *zone)
2500 : {
2501 0 : __drain_all_pages(zone, false);
2502 0 : }
2503 :
2504 : #ifdef CONFIG_HIBERNATION
2505 :
2506 : /*
2507 : * Touch the watchdog for every WD_PAGE_COUNT pages.
2508 : */
2509 : #define WD_PAGE_COUNT (128*1024)
2510 :
2511 : void mark_free_pages(struct zone *zone)
2512 : {
2513 : unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
2514 : unsigned long flags;
2515 : unsigned int order, t;
2516 : struct page *page;
2517 :
2518 : if (zone_is_empty(zone))
2519 : return;
2520 :
2521 : spin_lock_irqsave(&zone->lock, flags);
2522 :
2523 : max_zone_pfn = zone_end_pfn(zone);
2524 : for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2525 : if (pfn_valid(pfn)) {
2526 : page = pfn_to_page(pfn);
2527 :
2528 : if (!--page_count) {
2529 : touch_nmi_watchdog();
2530 : page_count = WD_PAGE_COUNT;
2531 : }
2532 :
2533 : if (page_zone(page) != zone)
2534 : continue;
2535 :
2536 : if (!swsusp_page_is_forbidden(page))
2537 : swsusp_unset_page_free(page);
2538 : }
2539 :
2540 : for_each_migratetype_order(order, t) {
2541 : list_for_each_entry(page,
2542 : &zone->free_area[order].free_list[t], buddy_list) {
2543 : unsigned long i;
2544 :
2545 : pfn = page_to_pfn(page);
2546 : for (i = 0; i < (1UL << order); i++) {
2547 : if (!--page_count) {
2548 : touch_nmi_watchdog();
2549 : page_count = WD_PAGE_COUNT;
2550 : }
2551 : swsusp_set_page_free(pfn_to_page(pfn + i));
2552 : }
2553 : }
2554 : }
2555 : spin_unlock_irqrestore(&zone->lock, flags);
2556 : }
2557 : #endif /* CONFIG_HIBERNATION */
2558 :
2559 44236 : static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
2560 : unsigned int order)
2561 : {
2562 : int migratetype;
2563 :
2564 44236 : if (!free_pages_prepare(page, order, FPI_NONE))
2565 : return false;
2566 :
2567 44236 : migratetype = get_pfnblock_migratetype(page, pfn);
2568 88472 : set_pcppage_migratetype(page, migratetype);
2569 44236 : return true;
2570 : }
2571 :
2572 : static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
2573 : bool free_high)
2574 : {
2575 : int min_nr_free, max_nr_free;
2576 :
2577 : /* Free everything if batch freeing high-order pages. */
2578 4 : if (unlikely(free_high))
2579 : return pcp->count;
2580 :
2581 : /* Check for PCP disabled or boot pageset */
2582 4 : if (unlikely(high < batch))
2583 : return 1;
2584 :
2585 : /* Leave at least pcp->batch pages on the list */
2586 4 : min_nr_free = batch;
2587 4 : max_nr_free = high - batch;
2588 :
2589 : /*
2590 : * Double the number of pages freed each time there is subsequent
2591 : * freeing of pages without any allocation.
2592 : */
2593 4 : batch <<= pcp->free_factor;
2594 4 : if (batch < max_nr_free)
2595 4 : pcp->free_factor++;
2596 4 : batch = clamp(batch, min_nr_free, max_nr_free);
2597 :
2598 : return batch;
2599 : }
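
The scaling in nr_pcp_free() can be followed with sample pcp values (high=512, batch=63, both illustrative): successive frees without an intervening allocation double the amount freed, clamped between one batch at minimum and high - batch at maximum so that roughly one batch of pages stays on the list.

#include <stdio.h>

#define CLAMP(v, lo, hi) ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

int main(void)
{
	int high = 512, batch = 63, free_factor = 0;	/* illustrative pcp values */

	for (int round = 0; round < 5; round++) {
		int min_nr_free = batch;		/* free at least one batch */
		int max_nr_free = high - batch;		/* leave about one batch behind */
		int nr = batch << free_factor;

		if (nr < max_nr_free)
			free_factor++;			/* double next time */
		nr = CLAMP(nr, min_nr_free, max_nr_free);
		printf("round %d frees %d pages\n", round, nr);
	}
	return 0;
}
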
2600 :
2601 44236 : static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
2602 : bool free_high)
2603 : {
2604 44236 : int high = READ_ONCE(pcp->high);
2605 :
2606 44236 : if (unlikely(!high || free_high))
2607 : return 0;
2608 :
2609 88472 : if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
2610 : return high;
2611 :
2612 : /*
2613 : * If reclaim is active, limit the number of pages that can be
2614 : * stored on pcp lists
2615 : */
2616 0 : return min(READ_ONCE(pcp->batch) << 2, high);
2617 : }
2618 :
2619 44236 : static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
2620 : struct page *page, int migratetype,
2621 : unsigned int order)
2622 : {
2623 : int high;
2624 : int pindex;
2625 : bool free_high;
2626 :
2627 88472 : __count_vm_events(PGFREE, 1 << order);
2628 88472 : pindex = order_to_pindex(migratetype, order);
2629 88472 : list_add(&page->pcp_list, &pcp->lists[pindex]);
2630 44236 : pcp->count += 1 << order;
2631 :
2632 : /*
2633 : * As high-order pages other than THP's stored on PCP can contribute
2634 : * to fragmentation, limit the number stored when PCP is heavily
2635 : * freeing without allocation. The remainder after bulk freeing
2636 : * stops will be drained from vmstat refresh context.
2637 : */
2638 44236 : free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
2639 :
2640 44236 : high = nr_pcp_high(pcp, zone, free_high);
2641 44236 : if (pcp->count >= high) {
2642 4 : int batch = READ_ONCE(pcp->batch);
2643 :
2644 8 : free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
2645 : }
2646 44236 : }
2647 :
2648 : /*
2649 : * Free a pcp page
2650 : */
2651 44236 : void free_unref_page(struct page *page, unsigned int order)
2652 : {
2653 : unsigned long __maybe_unused UP_flags;
2654 : struct per_cpu_pages *pcp;
2655 : struct zone *zone;
2656 44236 : unsigned long pfn = page_to_pfn(page);
2657 : int migratetype;
2658 :
2659 44236 : if (!free_unref_page_prepare(page, pfn, order))
2660 : return;
2661 :
2662 : /*
2663 : * We only track unmovable, reclaimable and movable on pcp lists.
2664 : * Place ISOLATE pages on the isolated list because they are being
2665 : * offlined, but treat HIGHATOMIC as movable pages so we can get those
2666 : * areas back if necessary. Otherwise, we may have to free
2667 : * excessively into the page allocator.
2668 : */
2669 88472 : migratetype = get_pcppage_migratetype(page);
2670 44236 : if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
2671 : if (unlikely(is_migrate_isolate(migratetype))) {
2672 : free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
2673 : return;
2674 : }
2675 0 : migratetype = MIGRATE_MOVABLE;
2676 : }
2677 :
2678 44236 : zone = page_zone(page);
2679 44236 : pcp_trylock_prepare(UP_flags);
2680 88472 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2681 44236 : if (pcp) {
2682 44236 : free_unref_page_commit(zone, pcp, page, migratetype, order);
2683 88472 : pcp_spin_unlock(pcp);
2684 : } else {
2685 0 : free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
2686 : }
2687 44236 : pcp_trylock_finish(UP_flags);
2688 : }
2689 :
2690 : /*
2691 : * Free a list of 0-order pages
2692 : */
2693 0 : void free_unref_page_list(struct list_head *list)
2694 : {
2695 : unsigned long __maybe_unused UP_flags;
2696 : struct page *page, *next;
2697 0 : struct per_cpu_pages *pcp = NULL;
2698 0 : struct zone *locked_zone = NULL;
2699 0 : int batch_count = 0;
2700 : int migratetype;
2701 :
2702 : /* Prepare pages for freeing */
2703 0 : list_for_each_entry_safe(page, next, list, lru) {
2704 0 : unsigned long pfn = page_to_pfn(page);
2705 0 : if (!free_unref_page_prepare(page, pfn, 0)) {
2706 0 : list_del(&page->lru);
2707 0 : continue;
2708 : }
2709 :
2710 : /*
2711 : * Free isolated pages directly to the allocator, see
2712 : * comment in free_unref_page.
2713 : */
2714 : migratetype = get_pcppage_migratetype(page);
2715 : if (unlikely(is_migrate_isolate(migratetype))) {
2716 : list_del(&page->lru);
2717 : free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
2718 : continue;
2719 : }
2720 : }
2721 :
2722 0 : list_for_each_entry_safe(page, next, list, lru) {
2723 0 : struct zone *zone = page_zone(page);
2724 :
2725 0 : list_del(&page->lru);
2726 0 : migratetype = get_pcppage_migratetype(page);
2727 :
2728 : /*
2729 : * Either different zone requiring a different pcp lock or
2730 : * excessive lock hold times when freeing a large list of
2731 : * pages.
2732 : */
2733 0 : if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
2734 0 : if (pcp) {
2735 0 : pcp_spin_unlock(pcp);
2736 0 : pcp_trylock_finish(UP_flags);
2737 : }
2738 :
2739 0 : batch_count = 0;
2740 :
2741 : /*
2742 : * trylock is necessary as pages may be getting freed
2743 : * from IRQ or SoftIRQ context after an IO completion.
2744 : */
2745 0 : pcp_trylock_prepare(UP_flags);
2746 0 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2747 0 : if (unlikely(!pcp)) {
2748 0 : pcp_trylock_finish(UP_flags);
2749 0 : free_one_page(zone, page, page_to_pfn(page),
2750 : 0, migratetype, FPI_NONE);
2751 0 : locked_zone = NULL;
2752 0 : continue;
2753 : }
2754 : locked_zone = zone;
2755 : }
2756 :
2757 : /*
2758 : * Non-isolated types over MIGRATE_PCPTYPES get added
2759 : * to the MIGRATE_MOVABLE pcp list.
2760 : */
2761 0 : if (unlikely(migratetype >= MIGRATE_PCPTYPES))
2762 0 : migratetype = MIGRATE_MOVABLE;
2763 :
2764 0 : trace_mm_page_free_batched(page);
2765 0 : free_unref_page_commit(zone, pcp, page, migratetype, 0);
2766 0 : batch_count++;
2767 : }
2768 :
2769 0 : if (pcp) {
2770 0 : pcp_spin_unlock(pcp);
2771 0 : pcp_trylock_finish(UP_flags);
2772 : }
2773 0 : }
2774 :
2775 : /*
2776 : * split_page takes a non-compound higher-order page, and splits it into
2777 : * n (1<<order) sub-pages: page[0..n-1].
2778 : * Each sub-page must be freed individually.
2779 : *
2780 : * Note: this is probably too low level an operation for use in drivers.
2781 : * Please consult with lkml before using this in your driver.
2782 : */
2783 0 : void split_page(struct page *page, unsigned int order)
2784 : {
2785 : int i;
2786 :
2787 : VM_BUG_ON_PAGE(PageCompound(page), page);
2788 : VM_BUG_ON_PAGE(!page_count(page), page);
2789 :
2790 0 : for (i = 1; i < (1 << order); i++)
2791 0 : set_page_refcounted(page + i);
2792 0 : split_page_owner(page, 1 << order);
2793 0 : split_page_memcg(page, 1 << order);
2794 0 : }
2795 : EXPORT_SYMBOL_GPL(split_page);
2796 :
2797 0 : int __isolate_free_page(struct page *page, unsigned int order)
2798 : {
2799 0 : struct zone *zone = page_zone(page);
2800 0 : int mt = get_pageblock_migratetype(page);
2801 :
2802 0 : if (!is_migrate_isolate(mt)) {
2803 : unsigned long watermark;
2804 : /*
2805 : * Obey watermarks as if the page was being allocated. We can
2806 : * emulate a high-order watermark check with a raised order-0
2807 : * watermark, because we already know our high-order page
2808 : * exists.
2809 : */
2810 0 : watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
2811 0 : if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2812 : return 0;
2813 :
2814 0 : __mod_zone_freepage_state(zone, -(1UL << order), mt);
2815 : }
2816 :
2817 0 : del_page_from_free_list(page, zone, order);
2818 :
2819 : /*
2820 : * Set the pageblock's migratetype if the isolated page covers at least
2821 : * half of a pageblock.
2822 : */
2823 0 : if (order >= pageblock_order - 1) {
2824 0 : struct page *endpage = page + (1 << order) - 1;
2825 0 : for (; page < endpage; page += pageblock_nr_pages) {
2826 0 : int mt = get_pageblock_migratetype(page);
2827 : /*
2828 : * Only change normal pageblocks (i.e., they can merge
2829 : * with others)
2830 : */
2831 0 : if (migratetype_is_mergeable(mt))
2832 0 : set_pageblock_migratetype(page,
2833 : MIGRATE_MOVABLE);
2834 : }
2835 : }
2836 :
2837 0 : return 1UL << order;
2838 : }
2839 :
2840 : /**
2841 : * __putback_isolated_page - Return a now-isolated page back where we got it
2842 : * @page: Page that was isolated
2843 : * @order: Order of the isolated page
2844 : * @mt: The page's pageblock's migratetype
2845 : *
2846 : * This function is meant to return a page pulled from the free lists via
2847 : * __isolate_free_page back to the free lists they were pulled from.
2848 : */
2849 0 : void __putback_isolated_page(struct page *page, unsigned int order, int mt)
2850 : {
2851 0 : struct zone *zone = page_zone(page);
2852 :
2853 : /* zone lock should be held when this function is called */
2854 : lockdep_assert_held(&zone->lock);
2855 :
2856 : /* Return isolated page to tail of freelist. */
2857 0 : __free_one_page(page, page_to_pfn(page), zone, order, mt,
2858 : FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
2859 0 : }
2860 :
2861 : /*
2862 : * Update NUMA hit/miss statistics
2863 : */
2864 : static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
2865 : long nr_account)
2866 : {
2867 : #ifdef CONFIG_NUMA
2868 : enum numa_stat_item local_stat = NUMA_LOCAL;
2869 :
2870 : /* skip numa counters update if numa stats is disabled */
2871 : if (!static_branch_likely(&vm_numa_stat_key))
2872 : return;
2873 :
2874 : if (zone_to_nid(z) != numa_node_id())
2875 : local_stat = NUMA_OTHER;
2876 :
2877 : if (zone_to_nid(z) == zone_to_nid(preferred_zone))
2878 : __count_numa_events(z, NUMA_HIT, nr_account);
2879 : else {
2880 : __count_numa_events(z, NUMA_MISS, nr_account);
2881 : __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
2882 : }
2883 : __count_numa_events(z, local_stat, nr_account);
2884 : #endif
2885 : }
2886 :
2887 : static __always_inline
2888 : struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
2889 : unsigned int order, unsigned int alloc_flags,
2890 : int migratetype)
2891 : {
2892 : struct page *page;
2893 : unsigned long flags;
2894 :
2895 : do {
2896 0 : page = NULL;
2897 0 : spin_lock_irqsave(&zone->lock, flags);
2898 : /*
2899 : * order-0 request can reach here when the pcplist is skipped
2900 : * due to non-CMA allocation context. HIGHATOMIC area is
2901 : * reserved for high-order atomic allocation, so order-0
2902 : * request should skip it.
2903 : */
2904 0 : if (alloc_flags & ALLOC_HIGHATOMIC)
2905 : page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2906 0 : if (!page) {
2907 0 : page = __rmqueue(zone, order, migratetype, alloc_flags);
2908 :
2909 : /*
2910 : * If the allocation fails, allow OOM handling access
2911 : * to HIGHATOMIC reserves as failing now is worse than
2912 : * failing a high-order atomic allocation in the
2913 : * future.
2914 : */
2915 0 : if (!page && (alloc_flags & ALLOC_OOM))
2916 : page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2917 :
2918 0 : if (!page) {
2919 0 : spin_unlock_irqrestore(&zone->lock, flags);
2920 : return NULL;
2921 : }
2922 : }
2923 0 : __mod_zone_freepage_state(zone, -(1 << order),
2924 : get_pcppage_migratetype(page));
2925 0 : spin_unlock_irqrestore(&zone->lock, flags);
2926 0 : } while (check_new_pages(page, order));
2927 :
2928 0 : __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2929 : zone_statistics(preferred_zone, zone, 1);
2930 :
2931 : return page;
2932 : }
2933 :
2934 : /* Remove page from the per-cpu list, caller must protect the list */
2935 : static inline
2936 44763 : struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
2937 : int migratetype,
2938 : unsigned int alloc_flags,
2939 : struct per_cpu_pages *pcp,
2940 : struct list_head *list)
2941 : {
2942 : struct page *page;
2943 :
2944 : do {
2945 44763 : if (list_empty(list)) {
2946 53 : int batch = READ_ONCE(pcp->batch);
2947 : int alloced;
2948 :
2949 : /*
2950 : * Scale batch relative to order if batch implies
2951 : * free pages can be stored on the PCP. Batch can
2952 : * be 1 for small zones or for boot pagesets which
2953 : * should never store free pages as the pages may
2954 : * belong to arbitrary zones.
2955 : */
2956 53 : if (batch > 1)
2957 42 : batch = max(batch >> order, 2);
2958 53 : alloced = rmqueue_bulk(zone, order,
2959 : batch, list,
2960 : migratetype, alloc_flags);
2961 :
2962 53 : pcp->count += alloced << order;
2963 53 : if (unlikely(list_empty(list)))
2964 : return NULL;
2965 : }
2966 :
2967 44763 : page = list_first_entry(list, struct page, pcp_list);
2968 89526 : list_del(&page->pcp_list);
2969 44763 : pcp->count -= 1 << order;
2970 44763 : } while (check_new_pages(page, order));
2971 :
2972 : return page;
2973 : }
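
The `batch = max(batch >> order, 2)` scaling above keeps the amount of work done under the zone lock roughly constant across orders. A small sketch with an illustrative pcp->batch of 63:

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	int batch = 63;		/* illustrative pcp->batch */

	/* The refill pulls `batch` blocks of (1 << order) pages each. */
	for (unsigned int order = 0; order <= 4; order++)
		printf("order %u: refill %d blocks (%d pages)\n",
		       order, MAX(batch >> order, 2),
		       MAX(batch >> order, 2) << order);
	return 0;
}
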
2974 :
2975 : /* Lock and remove page from the per-cpu list */
2976 2222 : static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2977 : struct zone *zone, unsigned int order,
2978 : int migratetype, unsigned int alloc_flags)
2979 : {
2980 : struct per_cpu_pages *pcp;
2981 : struct list_head *list;
2982 : struct page *page;
2983 : unsigned long __maybe_unused UP_flags;
2984 :
2985 : /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
2986 2222 : pcp_trylock_prepare(UP_flags);
2987 4444 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2988 2222 : if (!pcp) {
2989 0 : pcp_trylock_finish(UP_flags);
2990 : return NULL;
2991 : }
2992 :
2993 : /*
2994 : * On allocation, reduce the number of pages that are batch freed.
2995 : * See nr_pcp_free() where free_factor is increased for subsequent
2996 : * frees.
2997 : */
2998 2222 : pcp->free_factor >>= 1;
2999 4444 : list = &pcp->lists[order_to_pindex(migratetype, order)];
3000 2222 : page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
3001 4444 : pcp_spin_unlock(pcp);
3002 4444 : pcp_trylock_finish(UP_flags);
3003 2222 : if (page) {
3004 4444 : __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3005 : zone_statistics(preferred_zone, zone, 1);
3006 : }
3007 : return page;
3008 : }
3009 :
3010 : /*
3011 : * Allocate a page from the given zone.
3012 : * Use pcplists for THP or "cheap" high-order allocations.
3013 : */
3014 :
3015 : /*
3016 : * Do not instrument rmqueue() with KMSAN. This function may call
3017 : * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
3018 : * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
3019 : * may call rmqueue() again, which will result in a deadlock.
3020 : */
3021 : __no_sanitize_memory
3022 : static inline
3023 2222 : struct page *rmqueue(struct zone *preferred_zone,
3024 : struct zone *zone, unsigned int order,
3025 : gfp_t gfp_flags, unsigned int alloc_flags,
3026 : int migratetype)
3027 : {
3028 : struct page *page;
3029 :
3030 : /*
3031 : * We most definitely don't want callers attempting to
3032 : * allocate greater than order-1 page units with __GFP_NOFAIL.
3033 : */
3034 2222 : WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
3035 :
3036 2222 : if (likely(pcp_allowed_order(order))) {
3037 : /*
3038 : * The MIGRATE_MOVABLE pcplist could have pages from the CMA area, and
3039 : * we need to skip it when CMA allocations aren't allowed.
3040 : */
3041 : if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
3042 : migratetype != MIGRATE_MOVABLE) {
3043 2222 : page = rmqueue_pcplist(preferred_zone, zone, order,
3044 : migratetype, alloc_flags);
3045 2222 : if (likely(page))
3046 : goto out;
3047 : }
3048 : }
3049 :
3050 : page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
3051 : migratetype);
3052 :
3053 : out:
3054 : /* Separate test+clear to avoid unnecessary atomics */
3055 4444 : if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
3056 0 : clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3057 0 : wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3058 : }
3059 :
3060 : VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
3061 2222 : return page;
3062 : }
3063 :
3064 : #ifdef CONFIG_FAIL_PAGE_ALLOC
3065 :
3066 : static struct {
3067 : struct fault_attr attr;
3068 :
3069 : bool ignore_gfp_highmem;
3070 : bool ignore_gfp_reclaim;
3071 : u32 min_order;
3072 : } fail_page_alloc = {
3073 : .attr = FAULT_ATTR_INITIALIZER,
3074 : .ignore_gfp_reclaim = true,
3075 : .ignore_gfp_highmem = true,
3076 : .min_order = 1,
3077 : };
3078 :
3079 : static int __init setup_fail_page_alloc(char *str)
3080 : {
3081 : return setup_fault_attr(&fail_page_alloc.attr, str);
3082 : }
3083 : __setup("fail_page_alloc=", setup_fail_page_alloc);
3084 :
3085 : static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3086 : {
3087 : int flags = 0;
3088 :
3089 : if (order < fail_page_alloc.min_order)
3090 : return false;
3091 : if (gfp_mask & __GFP_NOFAIL)
3092 : return false;
3093 : if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
3094 : return false;
3095 : if (fail_page_alloc.ignore_gfp_reclaim &&
3096 : (gfp_mask & __GFP_DIRECT_RECLAIM))
3097 : return false;
3098 :
3099 : /* See comment in __should_failslab() */
3100 : if (gfp_mask & __GFP_NOWARN)
3101 : flags |= FAULT_NOWARN;
3102 :
3103 : return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
3104 : }
3105 :
3106 : #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3107 :
3108 : static int __init fail_page_alloc_debugfs(void)
3109 : {
3110 : umode_t mode = S_IFREG | 0600;
3111 : struct dentry *dir;
3112 :
3113 : dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
3114 : &fail_page_alloc.attr);
3115 :
3116 : debugfs_create_bool("ignore-gfp-wait", mode, dir,
3117 : &fail_page_alloc.ignore_gfp_reclaim);
3118 : debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3119 : &fail_page_alloc.ignore_gfp_highmem);
3120 : debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
3121 :
3122 : return 0;
3123 : }
3124 :
3125 : late_initcall(fail_page_alloc_debugfs);
3126 :
3127 : #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3128 :
3129 : #else /* CONFIG_FAIL_PAGE_ALLOC */
3130 :
3131 : static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3132 : {
3133 : return false;
3134 : }
3135 :
3136 : #endif /* CONFIG_FAIL_PAGE_ALLOC */
3137 :
3138 2818 : noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3139 : {
3140 2818 : return __should_fail_alloc_page(gfp_mask, order);
3141 : }
3142 : ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3143 :
3144 : static inline long __zone_watermark_unusable_free(struct zone *z,
3145 : unsigned int order, unsigned int alloc_flags)
3146 : {
3147 2820 : long unusable_free = (1 << order) - 1;
3148 :
3149 : /*
3150 : * If the caller does not have rights to reserves below the min
3151 : * watermark then subtract the high-atomic reserves. This will
3152 : * over-estimate the size of the atomic reserve but it avoids a search.
3153 : */
3154 2820 : if (likely(!(alloc_flags & ALLOC_RESERVES)))
3155 2820 : unusable_free += z->nr_reserved_highatomic;
3156 :
3157 : #ifdef CONFIG_CMA
3158 : /* If allocation can't use CMA areas don't use free CMA pages */
3159 : if (!(alloc_flags & ALLOC_CMA))
3160 : unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3161 : #endif
3162 :
3163 : return unusable_free;
3164 : }
3165 :
3166 : /*
3167 : * Return true if free base pages are above 'mark'. For high-order checks it
3168 : * will return true if the order-0 watermark is reached and there is at least
3169 : * one free page of a suitable size. Checking now avoids taking the zone lock
3170 : * to check in the allocation paths if no pages are free.
3171 : */
3172 107 : bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3173 : int highest_zoneidx, unsigned int alloc_flags,
3174 : long free_pages)
3175 : {
3176 107 : long min = mark;
3177 : int o;
3178 :
3179 : /* free_pages may go negative - that's OK */
3180 214 : free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
3181 :
3182 107 : if (unlikely(alloc_flags & ALLOC_RESERVES)) {
3183 : /*
3184 : * __GFP_HIGH allows access to 50% of the min reserve as well
3185 : * as OOM.
3186 : */
3187 0 : if (alloc_flags & ALLOC_MIN_RESERVE) {
3188 0 : min -= min / 2;
3189 :
3190 : /*
3191 : * Non-blocking allocations (e.g. GFP_ATOMIC) can
3192 : * access more reserves than just __GFP_HIGH. Other
3193 : * non-blocking allocations requests such as GFP_NOWAIT
3194 : * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
3195 : * access to the min reserve.
3196 : */
3197 0 : if (alloc_flags & ALLOC_NON_BLOCK)
3198 0 : min -= min / 4;
3199 : }
3200 :
3201 : /*
3202 : * OOM victims can try even harder than the normal reserve
3203 : * users on the grounds that it's definitely going to be in
3204 : * the exit path shortly and free memory. Any allocation it
3205 : * makes during the free path will be small and short-lived.
3206 : */
3207 0 : if (alloc_flags & ALLOC_OOM)
3208 0 : min -= min / 2;
3209 : }
3210 :
3211 : /*
3212 : * Check watermarks for an order-0 allocation request. If these
3213 : * are not met, then a high-order request also cannot go ahead
3214 : * even if a suitable page happened to be free.
3215 : */
3216 107 : if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
3217 : return false;
3218 :
3219 : /* If this is an order-0 request then the watermark is fine */
3220 107 : if (!order)
3221 : return true;
3222 :
3223 : /* For a high-order request, check at least one suitable page is free */
3224 105 : for (o = order; o <= MAX_ORDER; o++) {
3225 105 : struct free_area *area = &z->free_area[o];
3226 : int mt;
3227 :
3228 105 : if (!area->nr_free)
3229 0 : continue;
3230 :
3231 61 : for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3232 166 : if (!free_area_empty(area, mt))
3233 : return true;
3234 : }
3235 :
3236 : #ifdef CONFIG_CMA
3237 : if ((alloc_flags & ALLOC_CMA) &&
3238 : !free_area_empty(area, MIGRATE_CMA)) {
3239 : return true;
3240 : }
3241 : #endif
3242 0 : if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
3243 0 : !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
3244 : return true;
3245 : }
3246 : }
3247 : return false;
3248 : }
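
The reserve arithmetic for ALLOC_MIN_RESERVE and ALLOC_NON_BLOCK is easiest to see with numbers. The following standalone sketch mirrors the order-0 part of the check above; the watermark, free-page count and lowmem reserve are made up.

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the order-0 watermark test with the reserve reductions applied
 * for __GFP_HIGH (ALLOC_MIN_RESERVE) and non-blocking (ALLOC_NON_BLOCK)
 * requests. */
static bool wmark_ok(long free, long mark, long lowmem_reserve,
		     bool min_reserve, bool non_block)
{
	long min = mark;

	if (min_reserve) {
		min -= min / 2;		/* may dip 50% into the min reserve */
		if (non_block)
			min -= min / 4;	/* a further 25% for atomic-style requests */
	}
	return free > min + lowmem_reserve;
}

int main(void)
{
	long free = 450, mark = 1024, lowmem = 0;	/* illustrative */

	printf("normal:     %d\n", wmark_ok(free, mark, lowmem, false, false)); /* 0: 450 <= 1024 */
	printf("__GFP_HIGH: %d\n", wmark_ok(free, mark, lowmem, true, false));  /* 0: 450 <= 512 */
	printf("GFP_ATOMIC: %d\n", wmark_ok(free, mark, lowmem, true, true));   /* 1: 450 > 384 */
	return 0;
}
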
3249 :
3250 0 : bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3251 : int highest_zoneidx, unsigned int alloc_flags)
3252 : {
3253 0 : return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3254 0 : zone_page_state(z, NR_FREE_PAGES));
3255 : }
3256 :
3257 2818 : static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3258 : unsigned long mark, int highest_zoneidx,
3259 : unsigned int alloc_flags, gfp_t gfp_mask)
3260 : {
3261 : long free_pages;
3262 :
3263 2818 : free_pages = zone_page_state(z, NR_FREE_PAGES);
3264 :
3265 : /*
3266 : * Fast check for order-0 only. If this fails then the reserves
3267 : * need to be calculated.
3268 : */
3269 2818 : if (!order) {
3270 : long usable_free;
3271 : long reserved;
3272 :
3273 2713 : usable_free = free_pages;
3274 5426 : reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
3275 :
3276 : /* reserved may over estimate high-atomic reserves. */
3277 2713 : usable_free -= min(usable_free, reserved);
3278 2713 : if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
3279 : return true;
3280 : }
3281 :
3282 105 : if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
3283 : free_pages))
3284 : return true;
3285 :
3286 : /*
3287 : * Ignore watermark boosting for __GFP_HIGH order-0 allocations
3288 : * when checking the min watermark. The min watermark is the
3289 : * point where boosting is ignored so that kswapd is woken up
3290 : * when below the low watermark.
3291 : */
3292 0 : if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
3293 : && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3294 0 : mark = z->_watermark[WMARK_MIN];
3295 0 : return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3296 : alloc_flags, free_pages);
3297 : }
3298 :
3299 : return false;
3300 : }
3301 :
3302 2 : bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3303 : unsigned long mark, int highest_zoneidx)
3304 : {
3305 2 : long free_pages = zone_page_state(z, NR_FREE_PAGES);
3306 :
3307 2 : if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3308 0 : free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3309 :
3310 2 : return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
3311 : free_pages);
3312 : }
3313 :
3314 : #ifdef CONFIG_NUMA
3315 : int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
3316 :
3317 : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3318 : {
3319 : return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3320 : node_reclaim_distance;
3321 : }
3322 : #else /* CONFIG_NUMA */
3323 : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3324 : {
3325 : return true;
3326 : }
3327 : #endif /* CONFIG_NUMA */
3328 :
3329 : /*
3330 : * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3331 : * fragmentation is subtle. If the preferred zone was HIGHMEM then
3332 : * premature use of a lower zone may cause lowmem pressure problems that
3333 : * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3334 : * probably too small. It only makes sense to spread allocations to avoid
3335 : * fragmentation between the Normal and DMA32 zones.
3336 : */
3337 : static inline unsigned int
3338 : alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3339 : {
3340 : unsigned int alloc_flags;
3341 :
3342 : /*
3343 : * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3344 : * to save a branch.
3345 : */
3346 2222 : alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
3347 :
3348 : #ifdef CONFIG_ZONE_DMA32
3349 : if (!zone)
3350 : return alloc_flags;
3351 :
3352 : if (zone_idx(zone) != ZONE_NORMAL)
3353 : return alloc_flags;
3354 :
3355 : /*
3356 : * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3357 : * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3358 : * on UMA that if Normal is populated then so is DMA32.
3359 : */
3360 : BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3361 : if (nr_online_nodes > 1 && !populated_zone(--zone))
3362 : return alloc_flags;
3363 :
3364 : alloc_flags |= ALLOC_NOFRAGMENT;
3365 : #endif /* CONFIG_ZONE_DMA32 */
3366 : return alloc_flags;
3367 : }
3368 :
3369 : /* Must be called after current_gfp_context() which can change gfp_mask */
3370 : static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
3371 : unsigned int alloc_flags)
3372 : {
3373 : #ifdef CONFIG_CMA
3374 : if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3375 : alloc_flags |= ALLOC_CMA;
3376 : #endif
3377 : return alloc_flags;
3378 : }
3379 :
3380 : /*
3381 : * get_page_from_freelist goes through the zonelist trying to allocate
3382 : * a page.
3383 : */
3384 : static struct page *
3385 2222 : get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3386 : const struct alloc_context *ac)
3387 : {
3388 : struct zoneref *z;
3389 : struct zone *zone;
3390 2222 : struct pglist_data *last_pgdat = NULL;
3391 2222 : bool last_pgdat_dirty_ok = false;
3392 : bool no_fallback;
3393 :
3394 : retry:
3395 : /*
3396 : * Scan zonelist, looking for a zone with enough free.
3397 : * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
3398 : */
3399 2222 : no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3400 2222 : z = ac->preferred_zoneref;
3401 2222 : for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
3402 : ac->nodemask) {
3403 : struct page *page;
3404 : unsigned long mark;
3405 :
3406 : if (cpusets_enabled() &&
3407 : (alloc_flags & ALLOC_CPUSET) &&
3408 : !__cpuset_zone_allowed(zone, gfp_mask))
3409 : continue;
3410 : /*
3411 : * When allocating a page cache page for writing, we
3412 : * want to get it from a node that is within its dirty
3413 : * limit, such that no single node holds more than its
3414 : * proportional share of globally allowed dirty pages.
3415 : * The dirty limits take into account the node's
3416 : * lowmem reserves and high watermark so that kswapd
3417 : * should be able to balance it without having to
3418 : * write pages from its LRU list.
3419 : *
3420 : * XXX: For now, allow allocations to potentially
3421 : * exceed the per-node dirty limit in the slowpath
3422 : * (spread_dirty_pages unset) before going into reclaim,
3423 : * which is important when on a NUMA setup the allowed
3424 : * nodes are together not big enough to reach the
3425 : * global limit. The proper fix for these situations
3426 : * will require awareness of nodes in the
3427 : * dirty-throttling and the flusher threads.
3428 : */
3429 2222 : if (ac->spread_dirty_pages) {
3430 0 : if (last_pgdat != zone->zone_pgdat) {
3431 0 : last_pgdat = zone->zone_pgdat;
3432 0 : last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
3433 : }
3434 :
3435 0 : if (!last_pgdat_dirty_ok)
3436 0 : continue;
3437 : }
3438 :
3439 : if (no_fallback && nr_online_nodes > 1 &&
3440 : zone != ac->preferred_zoneref->zone) {
3441 : int local_nid;
3442 :
3443 : /*
3444 : * If moving to a remote node, retry but allow
3445 : * fragmenting fallbacks. Locality is more important
3446 : * than fragmentation avoidance.
3447 : */
3448 : local_nid = zone_to_nid(ac->preferred_zoneref->zone);
3449 : if (zone_to_nid(zone) != local_nid) {
3450 : alloc_flags &= ~ALLOC_NOFRAGMENT;
3451 : goto retry;
3452 : }
3453 : }
3454 :
3455 2222 : mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
3456 4444 : if (!zone_watermark_fast(zone, order, mark,
3457 2222 : ac->highest_zoneidx, alloc_flags,
3458 : gfp_mask)) {
3459 : int ret;
3460 :
3461 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3462 : /*
3463 : * Watermark failed for this zone, but see if we can
3464 : * grow this zone if it contains deferred pages.
3465 : */
3466 : if (deferred_pages_enabled()) {
3467 : if (_deferred_grow_zone(zone, order))
3468 : goto try_this_zone;
3469 : }
3470 : #endif
3471 : /* Checked here to keep the fast path fast */
3472 : BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3473 0 : if (alloc_flags & ALLOC_NO_WATERMARKS)
3474 : goto try_this_zone;
3475 :
3476 : if (!node_reclaim_enabled() ||
3477 : !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
3478 0 : continue;
3479 :
3480 : ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3481 : switch (ret) {
3482 : case NODE_RECLAIM_NOSCAN:
3483 : /* did not scan */
3484 : continue;
3485 : case NODE_RECLAIM_FULL:
3486 : /* scanned but unreclaimable */
3487 : continue;
3488 : default:
3489 : /* did we reclaim enough */
3490 : if (zone_watermark_ok(zone, order, mark,
3491 : ac->highest_zoneidx, alloc_flags))
3492 : goto try_this_zone;
3493 :
3494 : continue;
3495 : }
3496 : }
3497 :
3498 : try_this_zone:
3499 2222 : page = rmqueue(ac->preferred_zoneref->zone, zone, order,
3500 : gfp_mask, alloc_flags, ac->migratetype);
3501 2222 : if (page) {
3502 2222 : prep_new_page(page, order, gfp_mask, alloc_flags);
3503 :
3504 : /*
3505 : * If this is a high-order atomic allocation then check
3506 : * if the pageblock should be reserved for the future
3507 : */
3508 2222 : if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
3509 0 : reserve_highatomic_pageblock(page, zone, order);
3510 :
3511 : return page;
3512 : } else {
3513 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3514 : /* Try again if zone has deferred pages */
3515 : if (deferred_pages_enabled()) {
3516 : if (_deferred_grow_zone(zone, order))
3517 : goto try_this_zone;
3518 : }
3519 : #endif
3520 : }
3521 : }
3522 :
3523 : /*
3524 : * It's possible on a UMA machine to get through all zones that are
3525 : * fragmented. If avoiding fragmentation, reset and try again.
3526 : */
3527 : if (no_fallback) {
3528 : alloc_flags &= ~ALLOC_NOFRAGMENT;
3529 : goto retry;
3530 : }
3531 :
3532 : return NULL;
3533 : }
3534 :
3535 0 : static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3536 : {
3537 0 : unsigned int filter = SHOW_MEM_FILTER_NODES;
3538 :
3539 : /*
3540 : * This documents exceptions given to allocations in certain
3541 : * contexts that are allowed to allocate outside current's set
3542 : * of allowed nodes.
3543 : */
3544 0 : if (!(gfp_mask & __GFP_NOMEMALLOC))
3545 0 : if (tsk_is_oom_victim(current) ||
3546 0 : (current->flags & (PF_MEMALLOC | PF_EXITING)))
3547 : filter &= ~SHOW_MEM_FILTER_NODES;
3548 0 : if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3549 0 : filter &= ~SHOW_MEM_FILTER_NODES;
3550 :
3551 0 : __show_mem(filter, nodemask, gfp_zone(gfp_mask));
3552 0 : }
3553 :
3554 0 : void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3555 : {
3556 : struct va_format vaf;
3557 : va_list args;
3558 : static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
3559 :
3560 0 : if ((gfp_mask & __GFP_NOWARN) ||
3561 0 : !__ratelimit(&nopage_rs) ||
3562 0 : ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
3563 0 : return;
3564 :
3565 0 : va_start(args, fmt);
3566 0 : vaf.fmt = fmt;
3567 0 : vaf.va = &args;
3568 0 : pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
3569 : current->comm, &vaf, gfp_mask, &gfp_mask,
3570 : nodemask_pr_args(nodemask));
3571 0 : va_end(args);
3572 :
3573 : cpuset_print_current_mems_allowed();
3574 0 : pr_cont("\n");
3575 0 : dump_stack();
3576 0 : warn_alloc_show_mem(gfp_mask, nodemask);
3577 : }
3578 :
3579 : static inline struct page *
3580 0 : __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3581 : unsigned int alloc_flags,
3582 : const struct alloc_context *ac)
3583 : {
3584 : struct page *page;
3585 :
3586 0 : page = get_page_from_freelist(gfp_mask, order,
3587 0 : alloc_flags|ALLOC_CPUSET, ac);
3588 : /*
3589 : * fallback to ignore cpuset restriction if our nodes
3590 : * are depleted
3591 : */
3592 0 : if (!page)
3593 0 : page = get_page_from_freelist(gfp_mask, order,
3594 : alloc_flags, ac);
3595 :
3596 0 : return page;
3597 : }
3598 :
3599 : static inline struct page *
3600 0 : __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3601 : const struct alloc_context *ac, unsigned long *did_some_progress)
3602 : {
3603 0 : struct oom_control oc = {
3604 0 : .zonelist = ac->zonelist,
3605 0 : .nodemask = ac->nodemask,
3606 : .memcg = NULL,
3607 : .gfp_mask = gfp_mask,
3608 : .order = order,
3609 : };
3610 : struct page *page;
3611 :
3612 0 : *did_some_progress = 0;
3613 :
3614 : /*
3615 : * Acquire the oom lock. If that fails, somebody else is
3616 : * making progress for us.
3617 : */
3618 0 : if (!mutex_trylock(&oom_lock)) {
3619 0 : *did_some_progress = 1;
3620 0 : schedule_timeout_uninterruptible(1);
3621 0 : return NULL;
3622 : }
3623 :
3624 : /*
3625 : * Go through the zonelist yet one more time, keep very high watermark
3626 : * here, this is only to catch a parallel oom killing, we must fail if
3627 : * we're still under heavy pressure. But make sure that this reclaim
3628 : * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3629 : * allocation which will never fail due to oom_lock already held.
3630 : */
3631 0 : page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
3632 : ~__GFP_DIRECT_RECLAIM, order,
3633 : ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
3634 0 : if (page)
3635 : goto out;
3636 :
3637 : /* Coredumps can quickly deplete all memory reserves */
3638 0 : if (current->flags & PF_DUMPCORE)
3639 : goto out;
3640 : /* The OOM killer will not help higher order allocs */
3641 0 : if (order > PAGE_ALLOC_COSTLY_ORDER)
3642 : goto out;
3643 : /*
3644 : * We have already exhausted all our reclaim opportunities without any
3645 : * success so it is time to admit defeat. We will skip the OOM killer
3646 : * because it is very likely that the caller has a more reasonable
3647 : * fallback than shooting a random task.
3648 : *
3649 : * The OOM killer may not free memory on a specific node.
3650 : */
3651 0 : if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
3652 : goto out;
3653 : /* The OOM killer does not needlessly kill tasks for lowmem */
3654 : if (ac->highest_zoneidx < ZONE_NORMAL)
3655 : goto out;
3656 0 : if (pm_suspended_storage())
3657 : goto out;
3658 : /*
3659 : * XXX: GFP_NOFS allocations should rather fail than rely on
3660 : * other request to make a forward progress.
3661 : * We are in an unfortunate situation where out_of_memory cannot
3662 : * do much for this context but let's try it to at least get
3663 : * access to memory reserved if the current task is killed (see
3664 : * out_of_memory). Once filesystems are ready to handle allocation
3665 : * failures more gracefully we should just bail out here.
3666 : */
3667 :
3668 : /* Exhausted what can be done so it's blame time */
3669 0 : if (out_of_memory(&oc) ||
3670 0 : WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
3671 0 : *did_some_progress = 1;
3672 :
3673 : /*
3674 : * Help non-failing allocations by giving them access to memory
3675 : * reserves
3676 : */
3677 0 : if (gfp_mask & __GFP_NOFAIL)
3678 0 : page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3679 : ALLOC_NO_WATERMARKS, ac);
3680 : }
3681 : out:
3682 0 : mutex_unlock(&oom_lock);
3683 0 : return page;
3684 : }
3685 :
3686 : /*
3687 : * Maximum number of compaction retries with progress before the OOM
3688 : * killer is considered the only way to move forward.
3689 : */
3690 : #define MAX_COMPACT_RETRIES 16
3691 :
3692 : #ifdef CONFIG_COMPACTION
3693 : /* Try memory compaction for high-order allocations before reclaim */
3694 : static struct page *
3695 0 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3696 : unsigned int alloc_flags, const struct alloc_context *ac,
3697 : enum compact_priority prio, enum compact_result *compact_result)
3698 : {
3699 0 : struct page *page = NULL;
3700 : unsigned long pflags;
3701 : unsigned int noreclaim_flag;
3702 :
3703 0 : if (!order)
3704 : return NULL;
3705 :
3706 0 : psi_memstall_enter(&pflags);
3707 : delayacct_compact_start();
3708 0 : noreclaim_flag = memalloc_noreclaim_save();
3709 :
3710 0 : *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3711 : prio, &page);
3712 :
3713 0 : memalloc_noreclaim_restore(noreclaim_flag);
3714 0 : psi_memstall_leave(&pflags);
3715 : delayacct_compact_end();
3716 :
3717 0 : if (*compact_result == COMPACT_SKIPPED)
3718 : return NULL;
3719 : /*
3720 : * At least in one zone compaction wasn't deferred or skipped, so let's
3721 : * count a compaction stall
3722 : */
3723 0 : count_vm_event(COMPACTSTALL);
3724 :
3725 : /* Prep a captured page if available */
3726 0 : if (page)
3727 0 : prep_new_page(page, order, gfp_mask, alloc_flags);
3728 :
3729 : /* Try to get a page from the freelist if available */
3730 0 : if (!page)
3731 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3732 :
3733 0 : if (page) {
3734 0 : struct zone *zone = page_zone(page);
3735 :
3736 0 : zone->compact_blockskip_flush = false;
3737 0 : compaction_defer_reset(zone, order, true);
3738 0 : count_vm_event(COMPACTSUCCESS);
3739 0 : return page;
3740 : }
3741 :
3742 : /*
3743 : * It's bad if compaction run occurs and fails. The most likely reason
3744 : * is that pages exist, but not enough to satisfy watermarks.
3745 : */
3746 0 : count_vm_event(COMPACTFAIL);
3747 :
3748 0 : cond_resched();
3749 :
3750 0 : return NULL;
3751 : }
3752 :
3753 : static inline bool
3754 0 : should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3755 : enum compact_result compact_result,
3756 : enum compact_priority *compact_priority,
3757 : int *compaction_retries)
3758 : {
3759 0 : int max_retries = MAX_COMPACT_RETRIES;
3760 : int min_priority;
3761 0 : bool ret = false;
3762 0 : int retries = *compaction_retries;
3763 0 : enum compact_priority priority = *compact_priority;
3764 :
3765 0 : if (!order)
3766 : return false;
3767 :
3768 0 : if (fatal_signal_pending(current))
3769 : return false;
3770 :
3771 0 : if (compaction_made_progress(compact_result))
3772 0 : (*compaction_retries)++;
3773 :
3774 : /*
3775 : * compaction considers all the zones as desperately out of memory
3776 : * so it doesn't really make much sense to retry except when the
3777 : * failure could be caused by insufficient priority
3778 : */
3779 0 : if (compaction_failed(compact_result))
3780 : goto check_priority;
3781 :
3782 : /*
3783 : * compaction was skipped because there are not enough order-0 pages
3784 : * to work with, so we retry only if it looks like reclaim can help.
3785 : */
3786 0 : if (compaction_needs_reclaim(compact_result)) {
3787 0 : ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3788 0 : goto out;
3789 : }
3790 :
3791 : /*
3792 : * make sure the compaction wasn't deferred or didn't bail out early
3793 : * due to lock contention before we declare that we should give up.
3794 : * But the next retry should use a higher priority if allowed, so
3795 : * we don't just keep bailing out endlessly.
3796 : */
3797 0 : if (compaction_withdrawn(compact_result)) {
3798 : goto check_priority;
3799 : }
3800 :
3801 : /*
3802 : * !costly requests are much more important than __GFP_RETRY_MAYFAIL
3803 : * costly ones because they are de facto nofail and invoke OOM
3804 : * killer to move on while costly can fail and users are ready
3805 : * to cope with that. 1/4 retries is rather arbitrary but we
3806 : * would need much more detailed feedback from compaction to
3807 : * make a better decision.
3808 : */
3809 0 : if (order > PAGE_ALLOC_COSTLY_ORDER)
3810 0 : max_retries /= 4;
3811 0 : if (*compaction_retries <= max_retries) {
3812 : ret = true;
3813 : goto out;
3814 : }
3815 :
3816 : /*
3817 : * Make sure there are attempts at the highest priority if we exhausted
3818 : * all retries or failed at the lower priorities.
3819 : */
3820 : check_priority:
3821 0 : min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3822 0 : MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3823 :
3824 0 : if (*compact_priority > min_priority) {
3825 0 : (*compact_priority)--;
3826 0 : *compaction_retries = 0;
3827 0 : ret = true;
3828 : }
3829 : out:
3830 0 : trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3831 0 : return ret;
3832 : }
3833 : #else
3834 : static inline struct page *
3835 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3836 : unsigned int alloc_flags, const struct alloc_context *ac,
3837 : enum compact_priority prio, enum compact_result *compact_result)
3838 : {
3839 : *compact_result = COMPACT_SKIPPED;
3840 : return NULL;
3841 : }
3842 :
3843 : static inline bool
3844 : should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3845 : enum compact_result compact_result,
3846 : enum compact_priority *compact_priority,
3847 : int *compaction_retries)
3848 : {
3849 : struct zone *zone;
3850 : struct zoneref *z;
3851 :
3852 : if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
3853 : return false;
3854 :
3855 : /*
3856 : * There are setups with compaction disabled which would prefer to loop
3857 : * inside the allocator rather than hit the oom killer prematurely.
3858 : * Let's give them a good hope and keep retrying while the order-0
3859 : * watermarks are OK.
3860 : */
3861 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3862 : ac->highest_zoneidx, ac->nodemask) {
3863 : if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3864 : ac->highest_zoneidx, alloc_flags))
3865 : return true;
3866 : }
3867 : return false;
3868 : }
3869 : #endif /* CONFIG_COMPACTION */
3870 :
3871 : #ifdef CONFIG_LOCKDEP
3872 : static struct lockdep_map __fs_reclaim_map =
3873 : STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3874 :
3875 : static bool __need_reclaim(gfp_t gfp_mask)
3876 : {
3877 : /* no reclaim without waiting on it */
3878 : if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3879 : return false;
3880 :
3881 : /* this guy won't enter reclaim */
3882 : if (current->flags & PF_MEMALLOC)
3883 : return false;
3884 :
3885 : if (gfp_mask & __GFP_NOLOCKDEP)
3886 : return false;
3887 :
3888 : return true;
3889 : }
3890 :
3891 : void __fs_reclaim_acquire(unsigned long ip)
3892 : {
3893 : lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
3894 : }
3895 :
3896 : void __fs_reclaim_release(unsigned long ip)
3897 : {
3898 : lock_release(&__fs_reclaim_map, ip);
3899 : }
3900 :
3901 : void fs_reclaim_acquire(gfp_t gfp_mask)
3902 : {
3903 : gfp_mask = current_gfp_context(gfp_mask);
3904 :
3905 : if (__need_reclaim(gfp_mask)) {
3906 : if (gfp_mask & __GFP_FS)
3907 : __fs_reclaim_acquire(_RET_IP_);
3908 :
3909 : #ifdef CONFIG_MMU_NOTIFIER
3910 : lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
3911 : lock_map_release(&__mmu_notifier_invalidate_range_start_map);
3912 : #endif
3913 :
3914 : }
3915 : }
3916 : EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
3917 :
3918 : void fs_reclaim_release(gfp_t gfp_mask)
3919 : {
3920 : gfp_mask = current_gfp_context(gfp_mask);
3921 :
3922 : if (__need_reclaim(gfp_mask)) {
3923 : if (gfp_mask & __GFP_FS)
3924 : __fs_reclaim_release(_RET_IP_);
3925 : }
3926 : }
3927 : EXPORT_SYMBOL_GPL(fs_reclaim_release);
3928 : #endif
3929 :
3930 : /*
3931 : * Zonelists may change due to hotplug during allocation. Detect when zonelists
3932 : * have been rebuilt so the allocation can be retried. The reader side does not lock
3933 : * and retries the allocation if the zonelist changes. The writer side is protected by the
3934 : * embedded spin_lock.
3935 : */
3936 : static DEFINE_SEQLOCK(zonelist_update_seq);
3937 :
3938 : static unsigned int zonelist_iter_begin(void)
3939 : {
3940 : if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
3941 : return read_seqbegin(&zonelist_update_seq);
3942 :
3943 : return 0;
3944 : }
3945 :
3946 : static unsigned int check_retry_zonelist(unsigned int seq)
3947 : {
3948 : if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
3949 : return read_seqretry(&zonelist_update_seq, seq);
3950 :
3951 : return seq;
3952 : }
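     :
     : /*
     :  * Editor's sketch (not part of page_alloc.c): the reader side pairs the
     :  * two helpers above around a zonelist walk, roughly as
     :  * __alloc_pages_slowpath() does further down:
     :  *
     :  *	restart:
     :  *		cookie = zonelist_iter_begin();
     :  *		... walk the zonelist / attempt the allocation ...
     :  *		if (allocation failed && check_retry_zonelist(cookie))
     :  *			goto restart;
     :  */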
3953 :
3954 : /* Perform direct synchronous page reclaim */
3955 : static unsigned long
3956 0 : __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3957 : const struct alloc_context *ac)
3958 : {
3959 : unsigned int noreclaim_flag;
3960 : unsigned long progress;
3961 :
3962 0 : cond_resched();
3963 :
3964 : /* We now go into synchronous reclaim */
3965 : cpuset_memory_pressure_bump();
3966 0 : fs_reclaim_acquire(gfp_mask);
3967 0 : noreclaim_flag = memalloc_noreclaim_save();
3968 :
3969 0 : progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
3970 : ac->nodemask);
3971 :
3972 0 : memalloc_noreclaim_restore(noreclaim_flag);
3973 0 : fs_reclaim_release(gfp_mask);
3974 :
3975 0 : cond_resched();
3976 :
3977 0 : return progress;
3978 : }
3979 :
3980 : /* The really slow allocator path where we enter direct reclaim */
3981 : static inline struct page *
3982 0 : __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
3983 : unsigned int alloc_flags, const struct alloc_context *ac,
3984 : unsigned long *did_some_progress)
3985 : {
3986 0 : struct page *page = NULL;
3987 : unsigned long pflags;
3988 0 : bool drained = false;
3989 :
3990 0 : psi_memstall_enter(&pflags);
3991 0 : *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
3992 0 : if (unlikely(!(*did_some_progress)))
3993 : goto out;
3994 :
3995 : retry:
3996 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3997 :
3998 : /*
3999 : * If an allocation failed after direct reclaim, it could be because
4000 : * pages are pinned on the per-cpu lists or in high alloc reserves.
4001 : * Shrink them and try again
4002 : */
4003 0 : if (!page && !drained) {
4004 0 : unreserve_highatomic_pageblock(ac, false);
4005 0 : drain_all_pages(NULL);
4006 0 : drained = true;
4007 0 : goto retry;
4008 : }
4009 : out:
4010 0 : psi_memstall_leave(&pflags);
4011 :
4012 0 : return page;
4013 : }
4014 :
4015 0 : static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
4016 : const struct alloc_context *ac)
4017 : {
4018 : struct zoneref *z;
4019 : struct zone *zone;
4020 0 : pg_data_t *last_pgdat = NULL;
4021 0 : enum zone_type highest_zoneidx = ac->highest_zoneidx;
4022 :
4023 0 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
4024 : ac->nodemask) {
4025 0 : if (!managed_zone(zone))
4026 0 : continue;
4027 0 : if (last_pgdat != zone->zone_pgdat) {
4028 0 : wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
4029 0 : last_pgdat = zone->zone_pgdat;
4030 : }
4031 : }
4032 0 : }
4033 :
4034 : static inline unsigned int
4035 0 : gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
4036 : {
4037 0 : unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
4038 :
4039 : /*
4040 : * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
4041 : * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4042 : * to save two branches.
4043 : */
4044 : BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
4045 : BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
4046 :
4047 : /*
4048 : * The caller may dip into page reserves a bit more if the caller
4049 : * cannot run direct reclaim, or if the caller has realtime scheduling
4050 : * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
4051 : * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
4052 : */
4053 0 : alloc_flags |= (__force int)
4054 : (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
4055 :
4056 0 : if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
4057 : /*
4058 : * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
4059 : * if it can't schedule.
4060 : */
4061 0 : if (!(gfp_mask & __GFP_NOMEMALLOC)) {
4062 0 : alloc_flags |= ALLOC_NON_BLOCK;
4063 :
4064 0 : if (order > 0)
4065 0 : alloc_flags |= ALLOC_HIGHATOMIC;
4066 : }
4067 :
4068 : /*
4069 : * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
4070 : * GFP_ATOMIC) rather than fail, see the comment for
4071 : * cpuset_node_allowed().
4072 : */
4073 0 : if (alloc_flags & ALLOC_MIN_RESERVE)
4074 0 : alloc_flags &= ~ALLOC_CPUSET;
4075 0 : } else if (unlikely(rt_task(current)) && in_task())
4076 0 : alloc_flags |= ALLOC_MIN_RESERVE;
4077 :
4078 0 : alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
4079 :
4080 0 : return alloc_flags;
4081 : }
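     :
     : /*
     :  * Illustrative walk-through (not part of page_alloc.c), assuming an
     :  * order-0 GFP_ATOMIC request, i.e. __GFP_HIGH | __GFP_KSWAPD_RECLAIM
     :  * with no __GFP_DIRECT_RECLAIM: gfp_to_alloc_flags() starts from
     :  * ALLOC_WMARK_MIN | ALLOC_CPUSET, ORs in ALLOC_MIN_RESERVE and
     :  * ALLOC_KSWAPD from the gfp bits, adds ALLOC_NON_BLOCK because the
     :  * caller cannot enter direct reclaim, and finally drops ALLOC_CPUSET
     :  * because ALLOC_MIN_RESERVE is set.
     :  */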
4082 :
4083 : static bool oom_reserves_allowed(struct task_struct *tsk)
4084 : {
4085 0 : if (!tsk_is_oom_victim(tsk))
4086 : return false;
4087 :
4088 : /*
4089 : * !MMU doesn't have oom reaper so give access to memory reserves
4090 : * only to the thread with TIF_MEMDIE set
4091 : */
4092 : if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
4093 : return false;
4094 :
4095 : return true;
4096 : }
4097 :
4098 : /*
4099 : * Distinguish requests which really need access to full memory
4100 : * reserves from oom victims which can live with a portion of it
4101 : */
4102 0 : static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
4103 : {
4104 0 : if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
4105 : return 0;
4106 0 : if (gfp_mask & __GFP_MEMALLOC)
4107 : return ALLOC_NO_WATERMARKS;
4108 0 : if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
4109 : return ALLOC_NO_WATERMARKS;
4110 0 : if (!in_interrupt()) {
4111 0 : if (current->flags & PF_MEMALLOC)
4112 : return ALLOC_NO_WATERMARKS;
4113 0 : else if (oom_reserves_allowed(current))
4114 : return ALLOC_OOM;
4115 : }
4116 :
4117 : return 0;
4118 : }
4119 :
4120 0 : bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
4121 : {
4122 0 : return !!__gfp_pfmemalloc_flags(gfp_mask);
4123 : }
4124 :
4125 : /*
4126 : * Checks whether it makes sense to retry the reclaim to make a forward progress
4127 : * for the given allocation request.
4128 : *
4129 : * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
4130 : * without success, or when we couldn't even meet the watermark if we
4131 : * reclaimed all remaining pages on the LRU lists.
4132 : *
4133 : * Returns true if a retry is viable or false to enter the oom path.
4134 : */
4135 : static inline bool
4136 0 : should_reclaim_retry(gfp_t gfp_mask, unsigned order,
4137 : struct alloc_context *ac, int alloc_flags,
4138 : bool did_some_progress, int *no_progress_loops)
4139 : {
4140 : struct zone *zone;
4141 : struct zoneref *z;
4142 0 : bool ret = false;
4143 :
4144 : /*
4145 : * Costly allocations might have made progress but this doesn't mean
4146 : * their order will become available due to high fragmentation, so
4147 : * always increment the no-progress counter for them.
4148 : */
4149 0 : if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
4150 0 : *no_progress_loops = 0;
4151 : else
4152 0 : (*no_progress_loops)++;
4153 :
4154 : /*
4155 : * Make sure we converge to OOM if we cannot make any progress
4156 : * several times in a row.
4157 : */
4158 0 : if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
4159 : /* Before OOM, exhaust highatomic_reserve */
4160 0 : return unreserve_highatomic_pageblock(ac, true);
4161 : }
4162 :
4163 : /*
4164 : * Keep reclaiming pages while there is a chance this will lead
4165 : * somewhere. If none of the target zones can satisfy our allocation
4166 : * request even if all reclaimable pages are considered then we are
4167 : * screwed and have to go OOM.
4168 : */
4169 0 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4170 : ac->highest_zoneidx, ac->nodemask) {
4171 : unsigned long available;
4172 : unsigned long reclaimable;
4173 0 : unsigned long min_wmark = min_wmark_pages(zone);
4174 : bool wmark;
4175 :
4176 0 : available = reclaimable = zone_reclaimable_pages(zone);
4177 0 : available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
4178 :
4179 : /*
4180 : * Would the allocation succeed if we reclaimed all
4181 : * reclaimable pages?
4182 : */
4183 0 : wmark = __zone_watermark_ok(zone, order, min_wmark,
4184 0 : ac->highest_zoneidx, alloc_flags, available);
4185 0 : trace_reclaim_retry_zone(z, order, reclaimable,
4186 : available, min_wmark, *no_progress_loops, wmark);
4187 0 : if (wmark) {
4188 : ret = true;
4189 : break;
4190 : }
4191 : }
4192 :
4193 : /*
4194 : * Memory allocation/reclaim might be called from a WQ context and the
4195 : * current implementation of the WQ concurrency control doesn't
4196 : * recognize that a particular WQ is congested if the worker thread is
4197 : * looping without ever sleeping. Therefore we have to do a short sleep
4198 : * here rather than calling cond_resched().
4199 : */
4200 0 : if (current->flags & PF_WQ_WORKER)
4201 0 : schedule_timeout_uninterruptible(1);
4202 : else
4203 0 : cond_resched();
4204 : return ret;
4205 : }
4206 :
4207 : static inline bool
4208 : check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
4209 : {
4210 : /*
4211 : * It's possible that cpuset's mems_allowed and the nodemask from
4212 : * mempolicy don't intersect. This should be normally dealt with by
4213 : * policy_nodemask(), but it's possible to race with cpuset update in
4214 : * such a way the check therein was true, and then it became false
4215 : * before we got our cpuset_mems_cookie here.
4216 : * This assumes that for all allocations, ac->nodemask can come only
4217 : * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
4218 : * when it does not intersect with the cpuset restrictions) or the
4219 : * caller can deal with a violated nodemask.
4220 : */
4221 : if (cpusets_enabled() && ac->nodemask &&
4222 : !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
4223 : ac->nodemask = NULL;
4224 : return true;
4225 : }
4226 :
4227 : /*
4228 : * When updating a task's mems_allowed or mempolicy nodemask, it is
4229 : * possible to race with parallel threads in such a way that our
4230 : * allocation can fail while the mask is being updated. If we are about
4231 : * to fail, check if the cpuset changed during allocation and if so,
4232 : * retry.
4233 : */
4234 0 : if (read_mems_allowed_retry(cpuset_mems_cookie))
4235 : return true;
4236 :
4237 : return false;
4238 : }
4239 :
4240 : static inline struct page *
4241 0 : __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
4242 : struct alloc_context *ac)
4243 : {
4244 0 : bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
4245 0 : const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
4246 0 : struct page *page = NULL;
4247 : unsigned int alloc_flags;
4248 : unsigned long did_some_progress;
4249 : enum compact_priority compact_priority;
4250 : enum compact_result compact_result;
4251 : int compaction_retries;
4252 : int no_progress_loops;
4253 : unsigned int cpuset_mems_cookie;
4254 : unsigned int zonelist_iter_cookie;
4255 : int reserve_flags;
4256 :
4257 : restart:
4258 0 : compaction_retries = 0;
4259 0 : no_progress_loops = 0;
4260 0 : compact_priority = DEF_COMPACT_PRIORITY;
4261 0 : cpuset_mems_cookie = read_mems_allowed_begin();
4262 0 : zonelist_iter_cookie = zonelist_iter_begin();
4263 :
4264 : /*
4265 : * The fast path uses conservative alloc_flags to succeed only until
4266 : * kswapd needs to be woken up, and to avoid the cost of setting up
4267 : * alloc_flags precisely. So we do that now.
4268 : */
4269 0 : alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
4270 :
4271 : /*
4272 : * We need to recalculate the starting point for the zonelist iterator
4273 : * because we might have used different nodemask in the fast path, or
4274 : * there was a cpuset modification and we are retrying - otherwise we
4275 : * could end up iterating over non-eligible zones endlessly.
4276 : */
4277 0 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4278 : ac->highest_zoneidx, ac->nodemask);
4279 0 : if (!ac->preferred_zoneref->zone)
4280 : goto nopage;
4281 :
4282 : /*
4283 : * Check for insane configurations where the cpuset doesn't contain
4284 : * any suitable zone to satisfy the request - e.g. non-movable
4285 : * GFP_HIGHUSER allocations from MOVABLE nodes only.
4286 : */
4287 : if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
4288 : struct zoneref *z = first_zones_zonelist(ac->zonelist,
4289 : ac->highest_zoneidx,
4290 : &cpuset_current_mems_allowed);
4291 : if (!z->zone)
4292 : goto nopage;
4293 : }
4294 :
4295 0 : if (alloc_flags & ALLOC_KSWAPD)
4296 0 : wake_all_kswapds(order, gfp_mask, ac);
4297 :
4298 : /*
4299 : * The adjusted alloc_flags might result in immediate success, so try
4300 : * that first
4301 : */
4302 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4303 0 : if (page)
4304 : goto got_pg;
4305 :
4306 : /*
4307 : * For costly allocations, try direct compaction first, as it's likely
4308 : * that we have enough base pages and don't need to reclaim. For non-
4309 : * movable high-order allocations, do that as well, as compaction will
4310 : * try to prevent permanent fragmentation by migrating from blocks of the
4311 : * same migratetype.
4312 : * Don't try this for allocations that are allowed to ignore
4313 : * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
4314 : */
4315 0 : if (can_direct_reclaim &&
4316 0 : (costly_order ||
4317 0 : (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
4318 0 : && !gfp_pfmemalloc_allowed(gfp_mask)) {
4319 0 : page = __alloc_pages_direct_compact(gfp_mask, order,
4320 : alloc_flags, ac,
4321 : INIT_COMPACT_PRIORITY,
4322 : &compact_result);
4323 0 : if (page)
4324 : goto got_pg;
4325 :
4326 : /*
4327 : * Checks for costly allocations with __GFP_NORETRY, which
4328 : * includes some THP page fault allocations
4329 : */
4330 0 : if (costly_order && (gfp_mask & __GFP_NORETRY)) {
4331 : /*
4332 : * If allocating entire pageblock(s) and compaction
4333 : * failed because all zones are below low watermarks
4334 : * or is prohibited because it recently failed at this
4335 : * order, fail immediately unless the allocator has
4336 : * requested compaction and reclaim retry.
4337 : *
4338 : * Reclaim is
4339 : * - potentially very expensive because zones are far
4340 : * below their low watermarks or this is part of very
4341 : * bursty high order allocations,
4342 : * - not guaranteed to help because isolate_freepages()
4343 : * may not iterate over freed pages as part of its
4344 : * linear scan, and
4345 : * - unlikely to make entire pageblocks free on its
4346 : * own.
4347 : */
4348 0 : if (compact_result == COMPACT_SKIPPED ||
4349 : compact_result == COMPACT_DEFERRED)
4350 : goto nopage;
4351 :
4352 : /*
4353 : * Looks like reclaim/compaction is worth trying, but
4354 : * sync compaction could be very expensive, so keep
4355 : * using async compaction.
4356 : */
4357 0 : compact_priority = INIT_COMPACT_PRIORITY;
4358 : }
4359 : }
4360 :
4361 : retry:
4362 : /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4363 0 : if (alloc_flags & ALLOC_KSWAPD)
4364 0 : wake_all_kswapds(order, gfp_mask, ac);
4365 :
4366 0 : reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
4367 0 : if (reserve_flags)
4368 0 : alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
4369 : (alloc_flags & ALLOC_KSWAPD);
4370 :
4371 : /*
4372 : * Reset the nodemask and zonelist iterators if memory policies can be
4373 : * ignored. These allocations are high priority and system rather than
4374 : * user oriented.
4375 : */
4376 0 : if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
4377 0 : ac->nodemask = NULL;
4378 0 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4379 : ac->highest_zoneidx, ac->nodemask);
4380 : }
4381 :
4382 : /* Attempt with potentially adjusted zonelist and alloc_flags */
4383 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4384 0 : if (page)
4385 : goto got_pg;
4386 :
4387 : /* Caller is not willing to reclaim, we can't balance anything */
4388 0 : if (!can_direct_reclaim)
4389 : goto nopage;
4390 :
4391 : /* Avoid recursion of direct reclaim */
4392 0 : if (current->flags & PF_MEMALLOC)
4393 : goto nopage;
4394 :
4395 : /* Try direct reclaim and then allocating */
4396 0 : page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4397 : &did_some_progress);
4398 0 : if (page)
4399 : goto got_pg;
4400 :
4401 : /* Try direct compaction and then allocating */
4402 0 : page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4403 : compact_priority, &compact_result);
4404 0 : if (page)
4405 : goto got_pg;
4406 :
4407 : /* Do not loop if specifically requested */
4408 0 : if (gfp_mask & __GFP_NORETRY)
4409 : goto nopage;
4410 :
4411 : /*
4412 : * Do not retry costly high order allocations unless they are
4413 : * __GFP_RETRY_MAYFAIL
4414 : */
4415 0 : if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
4416 : goto nopage;
4417 :
4418 0 : if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4419 : did_some_progress > 0, &no_progress_loops))
4420 : goto retry;
4421 :
4422 : /*
4423 : * It doesn't make any sense to retry for the compaction if the order-0
4424 : * reclaim is not able to make any progress because the current
4425 : * implementation of the compaction depends on the sufficient amount
4426 : * of free memory (see __compaction_suitable)
4427 : */
4428 0 : if (did_some_progress > 0 &&
4429 0 : should_compact_retry(ac, order, alloc_flags,
4430 : compact_result, &compact_priority,
4431 : &compaction_retries))
4432 : goto retry;
4433 :
4434 :
4435 : /*
4436 : * Deal with possible cpuset update races or zonelist updates to avoid
4437 : * an unnecessary OOM kill.
4438 : */
4439 0 : if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
4440 0 : check_retry_zonelist(zonelist_iter_cookie))
4441 : goto restart;
4442 :
4443 : /* Reclaim has failed us, start killing things */
4444 0 : page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4445 0 : if (page)
4446 : goto got_pg;
4447 :
4448 : /* Avoid allocations with no watermarks from looping endlessly */
4449 0 : if (tsk_is_oom_victim(current) &&
4450 0 : (alloc_flags & ALLOC_OOM ||
4451 0 : (gfp_mask & __GFP_NOMEMALLOC)))
4452 : goto nopage;
4453 :
4454 : /* Retry as long as the OOM killer is making progress */
4455 0 : if (did_some_progress) {
4456 0 : no_progress_loops = 0;
4457 0 : goto retry;
4458 : }
4459 :
4460 : nopage:
4461 : /*
4462 : * Deal with possible cpuset update races or zonelist updates to avoid
4463 : * an unnecessary OOM kill.
4464 : */
4465 0 : if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
4466 0 : check_retry_zonelist(zonelist_iter_cookie))
4467 : goto restart;
4468 :
4469 : /*
4470 : * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4471 : * we always retry
4472 : */
4473 0 : if (gfp_mask & __GFP_NOFAIL) {
4474 : /*
4475 : * All existing users of __GFP_NOFAIL are blockable, so warn
4476 : * of any new users that actually require GFP_NOWAIT
4477 : */
4478 0 : if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
4479 : goto fail;
4480 :
4481 : /*
4482 : * PF_MEMALLOC request from this context is rather bizarre
4483 : * because we cannot reclaim anything and only can loop waiting
4484 : * for somebody to do a work for us
4485 : */
4486 0 : WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
4487 :
4488 : /*
4489 : * Non-failing costly orders are a hard requirement which we
4490 : * are not well prepared for, so let's warn about these users
4491 : * so that we can identify them and convert them to something
4492 : * else.
4493 : */
4494 0 : WARN_ON_ONCE_GFP(costly_order, gfp_mask);
4495 :
4496 : /*
4497 : * Help non-failing allocations by giving some access to memory
4498 : * reserves normally used for high priority non-blocking
4499 : * allocations but do not use ALLOC_NO_WATERMARKS because this
4500 : * could deplete whole memory reserves which would just make
4501 : * the situation worse.
4502 : */
4503 0 : page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
4504 0 : if (page)
4505 : goto got_pg;
4506 :
4507 0 : cond_resched();
4508 0 : goto retry;
4509 : }
4510 : fail:
4511 0 : warn_alloc(gfp_mask, ac->nodemask,
4512 : "page allocation failure: order:%u", order);
4513 : got_pg:
4514 0 : return page;
4515 : }
4516 :
4517 2818 : static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4518 : int preferred_nid, nodemask_t *nodemask,
4519 : struct alloc_context *ac, gfp_t *alloc_gfp,
4520 : unsigned int *alloc_flags)
4521 : {
4522 2818 : ac->highest_zoneidx = gfp_zone(gfp_mask);
4523 5636 : ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
4524 2818 : ac->nodemask = nodemask;
4525 2818 : ac->migratetype = gfp_migratetype(gfp_mask);
4526 :
4527 : if (cpusets_enabled()) {
4528 : *alloc_gfp |= __GFP_HARDWALL;
4529 : /*
4530 : * When we are in interrupt context, the current task's
4531 : * context is irrelevant, which means that any node is ok.
4532 : */
4533 : if (in_task() && !ac->nodemask)
4534 : ac->nodemask = &cpuset_current_mems_allowed;
4535 : else
4536 : *alloc_flags |= ALLOC_CPUSET;
4537 : }
4538 :
4539 2818 : might_alloc(gfp_mask);
4540 :
4541 2818 : if (should_fail_alloc_page(gfp_mask, order))
4542 : return false;
4543 :
4544 2818 : *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
4545 :
4546 : /* Dirty zone balancing only done in the fast path */
4547 2818 : ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
4548 :
4549 : /*
4550 : * The preferred zone is used for statistics but crucially it is
4551 : * also used as the starting point for the zonelist iterator. It
4552 : * may get reset for allocations that ignore memory policies.
4553 : */
4554 5636 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4555 : ac->highest_zoneidx, ac->nodemask);
4556 :
4557 : return true;
4558 : }
4559 :
4560 : /*
4561 : * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
4562 : * @gfp: GFP flags for the allocation
4563 : * @preferred_nid: The preferred NUMA node ID to allocate from
4564 : * @nodemask: Set of nodes to allocate from, may be NULL
4565 : * @nr_pages: The number of pages desired on the list or array
4566 : * @page_list: Optional list to store the allocated pages
4567 : * @page_array: Optional array to store the pages
4568 : *
4569 : * This is a batched version of the page allocator that attempts to
4570 : * allocate nr_pages quickly. Pages are added to page_list if page_list
4571 : * is not NULL, otherwise it is assumed that the page_array is valid.
4572 : *
4573 : * For lists, nr_pages is the number of pages that should be allocated.
4574 : *
4575 : * For arrays, only NULL elements are populated with pages and nr_pages
4576 : * is the maximum number of pages that will be stored in the array.
4577 : *
4578 : * Returns the number of pages on the list or array.
4579 : */
4580 596 : unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
4581 : nodemask_t *nodemask, int nr_pages,
4582 : struct list_head *page_list,
4583 : struct page **page_array)
4584 : {
4585 : struct page *page;
4586 : unsigned long __maybe_unused UP_flags;
4587 : struct zone *zone;
4588 : struct zoneref *z;
4589 : struct per_cpu_pages *pcp;
4590 : struct list_head *pcp_list;
4591 : struct alloc_context ac;
4592 : gfp_t alloc_gfp;
4593 596 : unsigned int alloc_flags = ALLOC_WMARK_LOW;
4594 596 : int nr_populated = 0, nr_account = 0;
4595 :
4596 : /*
4597 : * Skip populated array elements to determine if any pages need
4598 : * to be allocated before disabling IRQs.
4599 : */
4600 1192 : while (page_array && nr_populated < nr_pages && page_array[nr_populated])
4601 0 : nr_populated++;
4602 :
4603 : /* No pages requested? */
4604 596 : if (unlikely(nr_pages <= 0))
4605 : goto out;
4606 :
4607 : /* Already populated array? */
4608 596 : if (unlikely(page_array && nr_pages - nr_populated == 0))
4609 : goto out;
4610 :
4611 : /* Bulk allocator does not support memcg accounting. */
4612 : if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
4613 : goto failed;
4614 :
4615 : /* Use the single page allocator for one page. */
4616 596 : if (nr_pages - nr_populated == 1)
4617 : goto failed;
4618 :
4619 : #ifdef CONFIG_PAGE_OWNER
4620 : /*
4621 : * PAGE_OWNER may recurse into the allocator to allocate space to
4622 : * save the stack with pagesets.lock held. Releasing/reacquiring
4623 : * removes much of the performance benefit of bulk allocation so
4624 : * force the caller to allocate one page at a time, as that gives
4625 : * performance similar to adding the complexity to the bulk allocator.
4626 : */
4627 : if (static_branch_unlikely(&page_owner_inited))
4628 : goto failed;
4629 : #endif
4630 :
4631 : /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
4632 596 : gfp &= gfp_allowed_mask;
4633 596 : alloc_gfp = gfp;
4634 596 : if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
4635 : goto out;
4636 596 : gfp = alloc_gfp;
4637 :
4638 : /* Find an allowed local zone that meets the low watermark. */
4639 1192 : for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
4640 : unsigned long mark;
4641 :
4642 : if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
4643 : !__cpuset_zone_allowed(zone, gfp)) {
4644 : continue;
4645 : }
4646 :
4647 : if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
4648 : zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
4649 : goto failed;
4650 : }
4651 :
4652 596 : mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
4653 596 : if (zone_watermark_fast(zone, 0, mark,
4654 : zonelist_zone_idx(ac.preferred_zoneref),
4655 : alloc_flags, gfp)) {
4656 : break;
4657 : }
4658 : }
4659 :
4660 : /*
4661 : * If there are no allowed local zones that meet the watermarks then
4662 : * try to allocate a single page and reclaim if necessary.
4663 : */
4664 596 : if (unlikely(!zone))
4665 : goto failed;
4666 :
4667 : /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
4668 596 : pcp_trylock_prepare(UP_flags);
4669 1192 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
4670 596 : if (!pcp)
4671 : goto failed_irq;
4672 :
4673 : /* Attempt the batch allocation */
4674 1192 : pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
4675 43733 : while (nr_populated < nr_pages) {
4676 :
4677 : /* Skip existing pages */
4678 42541 : if (page_array && page_array[nr_populated]) {
4679 0 : nr_populated++;
4680 0 : continue;
4681 : }
4682 :
4683 42541 : page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
4684 : pcp, pcp_list);
4685 42541 : if (unlikely(!page)) {
4686 : /* Try and allocate at least one page */
4687 0 : if (!nr_account) {
4688 0 : pcp_spin_unlock(pcp);
4689 0 : goto failed_irq;
4690 : }
4691 : break;
4692 : }
4693 42541 : nr_account++;
4694 :
4695 42541 : prep_new_page(page, 0, gfp, 0);
4696 42541 : if (page_list)
4697 0 : list_add(&page->lru, page_list);
4698 : else
4699 42541 : page_array[nr_populated] = page;
4700 42541 : nr_populated++;
4701 : }
4702 :
4703 1192 : pcp_spin_unlock(pcp);
4704 1192 : pcp_trylock_finish(UP_flags);
4705 :
4706 1192 : __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
4707 596 : zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
4708 :
4709 : out:
4710 596 : return nr_populated;
4711 :
4712 : failed_irq:
4713 0 : pcp_trylock_finish(UP_flags);
4714 :
4715 : failed:
4716 0 : page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
4717 0 : if (page) {
4718 0 : if (page_list)
4719 0 : list_add(&page->lru, page_list);
4720 : else
4721 0 : page_array[nr_populated] = page;
4722 0 : nr_populated++;
4723 : }
4724 :
4725 : goto out;
4726 : }
4727 : EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
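     :
     : /*
     :  * Hypothetical usage sketch (not part of page_alloc.c) for the array
     :  * variant described in the kernel-doc above; the array size, GFP mask
     :  * and node choice are assumptions for illustration:
     :  *
     :  *	struct page *pages[16] = { NULL };
     :  *	unsigned long filled;
     :  *
     :  *	filled = __alloc_pages_bulk(GFP_KERNEL, numa_mem_id(), NULL,
     :  *				    ARRAY_SIZE(pages), NULL, pages);
     :  *
     :  * 'filled' slots of pages[] are now populated; it may be fewer than
     :  * requested, so callers typically retry for the remainder or fall back
     :  * to the single-page allocator.
     :  */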
4728 :
4729 : /*
4730 : * This is the 'heart' of the zoned buddy allocator.
4731 : */
4732 2222 : struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
4733 : nodemask_t *nodemask)
4734 : {
4735 : struct page *page;
4736 2222 : unsigned int alloc_flags = ALLOC_WMARK_LOW;
4737 : gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
4738 2222 : struct alloc_context ac = { };
4739 :
4740 : /*
4741 : * There are several places where we assume that the order value is sane
4742 : * so bail out early if the request is out of bound.
4743 : * so bail out early if the request is out of bounds.
4744 2222 : if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
4745 : return NULL;
4746 :
4747 2222 : gfp &= gfp_allowed_mask;
4748 : /*
4749 : * Apply scoped allocation constraints. This is mainly about GFP_NOFS
4750 : * resp. GFP_NOIO which has to be inherited for all allocation requests
4751 : * from a particular context which has been marked by
4752 : * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
4753 : * movable zones are not used during allocation.
4754 : */
4755 2222 : gfp = current_gfp_context(gfp);
4756 2222 : alloc_gfp = gfp;
4757 2222 : if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
4758 : &alloc_gfp, &alloc_flags))
4759 : return NULL;
4760 :
4761 : /*
4762 : * Forbid the first pass from falling back to types that fragment
4763 : * memory until all local zones are considered.
4764 : */
4765 4444 : alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
4766 :
4767 : /* First allocation attempt */
4768 2222 : page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
4769 2222 : if (likely(page))
4770 : goto out;
4771 :
4772 0 : alloc_gfp = gfp;
4773 0 : ac.spread_dirty_pages = false;
4774 :
4775 : /*
4776 : * Restore the original nodemask if it was potentially replaced with
4777 : * &cpuset_current_mems_allowed to optimize the fast-path attempt.
4778 : */
4779 0 : ac.nodemask = nodemask;
4780 :
4781 0 : page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
4782 :
4783 : out:
4784 : if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
4785 : unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
4786 : __free_pages(page, order);
4787 : page = NULL;
4788 : }
4789 :
4790 2222 : trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
4791 2222 : kmsan_alloc_page(page, order, alloc_gfp);
4792 :
4793 2222 : return page;
4794 : }
4795 : EXPORT_SYMBOL(__alloc_pages);
4796 :
4797 0 : struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
4798 : nodemask_t *nodemask)
4799 : {
4800 0 : struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
4801 : preferred_nid, nodemask);
4802 :
4803 : if (page && order > 1)
4804 : prep_transhuge_page(page);
4805 0 : return (struct folio *)page;
4806 : }
4807 : EXPORT_SYMBOL(__folio_alloc);
4808 :
4809 : /*
4810 : * Common helper functions. Never use with __GFP_HIGHMEM because the returned
4811 : * address cannot represent highmem pages. Use alloc_pages and then kmap if
4812 : * you need to access high mem.
4813 : */
4814 20 : unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
4815 : {
4816 : struct page *page;
4817 :
4818 40 : page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
4819 20 : if (!page)
4820 : return 0;
4821 20 : return (unsigned long) page_address(page);
4822 : }
4823 : EXPORT_SYMBOL(__get_free_pages);
4824 :
4825 0 : unsigned long get_zeroed_page(gfp_t gfp_mask)
4826 : {
4827 0 : return __get_free_page(gfp_mask | __GFP_ZERO);
4828 : }
4829 : EXPORT_SYMBOL(get_zeroed_page);
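     :
     : /*
     :  * Hypothetical usage sketch (not part of page_alloc.c); the order and
     :  * GFP mask are assumptions for illustration:
     :  *
     :  *	unsigned long addr = __get_free_pages(GFP_KERNEL, 1);
     :  *
     :  *	if (addr) {
     :  *		... use the two linearly mapped pages at addr ...
     :  *		free_pages(addr, 1);
     :  *	}
     :  */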
4830 :
4831 : /**
4832 : * __free_pages - Free pages allocated with alloc_pages().
4833 : * @page: The page pointer returned from alloc_pages().
4834 : * @order: The order of the allocation.
4835 : *
4836 : * This function can free multi-page allocations that are not compound
4837 : * pages. It does not check that the @order passed in matches that of
4838 : * the allocation, so it is easy to leak memory. Freeing more memory
4839 : * than was allocated will probably emit a warning.
4840 : *
4841 : * If the last reference to this page is speculative, it will be released
4842 : * by put_page() which only frees the first page of a non-compound
4843 : * allocation. To prevent the remaining pages from being leaked, we free
4844 : * the subsequent pages here. If you want to use the page's reference
4845 : * count to decide when to free the allocation, you should allocate a
4846 : * compound page, and use put_page() instead of __free_pages().
4847 : *
4848 : * Context: May be called in interrupt context or while holding a normal
4849 : * spinlock, but not in NMI context or while holding a raw spinlock.
4850 : */
4851 44236 : void __free_pages(struct page *page, unsigned int order)
4852 : {
4853 : /* get PageHead before we drop reference */
4854 44236 : int head = PageHead(page);
4855 :
4856 44236 : if (put_page_testzero(page))
4857 44236 : free_the_page(page, order);
4858 0 : else if (!head)
4859 0 : while (order-- > 0)
4860 0 : free_the_page(page + (1 << order), order);
4861 44236 : }
4862 : EXPORT_SYMBOL(__free_pages);
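     :
     : /*
     :  * Illustrative contrast (not part of page_alloc.c), following the advice
     :  * in the kernel-doc above: when the page's reference count should decide
     :  * its lifetime, allocate a compound page and drop it with put_page()
     :  * rather than __free_pages(); the order of 2 is just an example:
     :  *
     :  *	page = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);
     :  *	...
     :  *	put_page(page);
     :  */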
4863 :
4864 0 : void free_pages(unsigned long addr, unsigned int order)
4865 : {
4866 0 : if (addr != 0) {
4867 : VM_BUG_ON(!virt_addr_valid((void *)addr));
4868 0 : __free_pages(virt_to_page((void *)addr), order);
4869 : }
4870 0 : }
4871 :
4872 : EXPORT_SYMBOL(free_pages);
4873 :
4874 : /*
4875 : * Page Fragment:
4876 : * An arbitrary-length arbitrary-offset area of memory which resides
4877 : * within a 0 or higher order page. Multiple fragments within that page
4878 : * are individually refcounted, in the page's reference counter.
4879 : *
4880 : * The page_frag functions below provide a simple allocation framework for
4881 : * page fragments. This is used by the network stack and network device
4882 : * drivers to provide a backing region of memory for use as either an
4883 : * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
4884 : */
4885 0 : static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
4886 : gfp_t gfp_mask)
4887 : {
4888 0 : struct page *page = NULL;
4889 0 : gfp_t gfp = gfp_mask;
4890 :
4891 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4892 0 : gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
4893 : __GFP_NOMEMALLOC;
4894 0 : page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
4895 0 : PAGE_FRAG_CACHE_MAX_ORDER);
4896 0 : nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
4897 : #endif
4898 0 : if (unlikely(!page))
4899 0 : page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
4900 :
4901 0 : nc->va = page ? page_address(page) : NULL;
4902 :
4903 0 : return page;
4904 : }
4905 :
4906 0 : void __page_frag_cache_drain(struct page *page, unsigned int count)
4907 : {
4908 : VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4909 :
4910 0 : if (page_ref_sub_and_test(page, count))
4911 0 : free_the_page(page, compound_order(page));
4912 0 : }
4913 : EXPORT_SYMBOL(__page_frag_cache_drain);
4914 :
4915 0 : void *page_frag_alloc_align(struct page_frag_cache *nc,
4916 : unsigned int fragsz, gfp_t gfp_mask,
4917 : unsigned int align_mask)
4918 : {
4919 0 : unsigned int size = PAGE_SIZE;
4920 : struct page *page;
4921 : int offset;
4922 :
4923 0 : if (unlikely(!nc->va)) {
4924 : refill:
4925 0 : page = __page_frag_cache_refill(nc, gfp_mask);
4926 0 : if (!page)
4927 : return NULL;
4928 :
4929 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4930 : /* if size can vary use size else just use PAGE_SIZE */
4931 0 : size = nc->size;
4932 : #endif
4933 : /* Even if we own the page, we do not use atomic_set().
4934 : * This would break get_page_unless_zero() users.
4935 : */
4936 0 : page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
4937 :
4938 : /* reset page count bias and offset to start of new frag */
4939 0 : nc->pfmemalloc = page_is_pfmemalloc(page);
4940 0 : nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
4941 0 : nc->offset = size;
4942 : }
4943 :
4944 0 : offset = nc->offset - fragsz;
4945 0 : if (unlikely(offset < 0)) {
4946 0 : page = virt_to_page(nc->va);
4947 :
4948 0 : if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
4949 : goto refill;
4950 :
4951 0 : if (unlikely(nc->pfmemalloc)) {
4952 0 : free_the_page(page, compound_order(page));
4953 0 : goto refill;
4954 : }
4955 :
4956 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4957 : /* if size can vary use size else just use PAGE_SIZE */
4958 0 : size = nc->size;
4959 : #endif
4960 : /* OK, page count is 0, we can safely set it */
4961 0 : set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
4962 :
4963 : /* reset page count bias and offset to start of new frag */
4964 0 : nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
4965 0 : offset = size - fragsz;
4966 0 : if (unlikely(offset < 0)) {
4967 : /*
4968 : * The caller is trying to allocate a fragment
4969 : * with fragsz > PAGE_SIZE but the cache isn't big
4970 : * enough to satisfy the request; this may
4971 : * happen in low memory conditions.
4972 : * We don't release the cache page because
4973 : * it could make memory pressure worse,
4974 : * so we simply return NULL here.
4975 : */
4976 : return NULL;
4977 : }
4978 : }
4979 :
4980 0 : nc->pagecnt_bias--;
4981 0 : offset &= align_mask;
4982 0 : nc->offset = offset;
4983 :
4984 0 : return nc->va + offset;
4985 : }
4986 : EXPORT_SYMBOL(page_frag_alloc_align);
4987 :
4988 : /*
4989 : * Frees a page fragment allocated out of either a compound or order 0 page.
4990 : */
4991 0 : void page_frag_free(void *addr)
4992 : {
4993 0 : struct page *page = virt_to_head_page(addr);
4994 :
4995 0 : if (unlikely(put_page_testzero(page)))
4996 0 : free_the_page(page, compound_order(page));
4997 0 : }
4998 : EXPORT_SYMBOL(page_frag_free);
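/*
 * Usage sketch (editor's illustration, not part of page_alloc.c): a consumer
 * keeps a long-lived page_frag_cache and carves fragments out of it with
 * page_frag_alloc() (the wrapper around page_frag_alloc_align() with no
 * alignment constraint), releasing each fragment with page_frag_free() once
 * its last user is done. The cache name below is hypothetical.
 *
 *	static struct page_frag_cache my_frag_cache;
 *
 *	void *frag = page_frag_alloc(&my_frag_cache, 256, GFP_ATOMIC);
 *	if (frag) {
 *		... use the 256-byte fragment, e.g. as an skb data area ...
 *		page_frag_free(frag);
 *	}
 */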
4999 :
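/*
 * Helper for alloc_pages_exact*(): split the order-sized allocation at @addr
 * into individually refcounted order-0 pages and give the pages beyond @size
 * back to the allocator.
 *
 * Worked example: for size = 3 * PAGE_SIZE the callers below allocate
 * order 2 (four pages); the first three pages stay with the caller and the
 * fourth is freed again via __free_pages_ok().
 */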
5000 3 : static void *make_alloc_exact(unsigned long addr, unsigned int order,
5001 : size_t size)
5002 : {
5003 3 : if (addr) {
5004 3 : unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
5005 6 : struct page *page = virt_to_page((void *)addr);
5006 3 : struct page *last = page + nr;
5007 :
5008 3 : split_page_owner(page, 1 << order);
5009 3 : split_page_memcg(page, 1 << order);
5010 18 : while (page < --last)
5011 : set_page_refcounted(last);
5012 :
5013 3 : last = page + (1UL << order);
5014 3 : for (page += nr; page < last; page++)
5015 0 : __free_pages_ok(page, 0, FPI_TO_TAIL);
5016 : }
5017 3 : return (void *)addr;
5018 : }
5019 :
5020 : /**
5021 : * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
5022 : * @size: the number of bytes to allocate
5023 : * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
5024 : *
5025 : * This function is similar to alloc_pages(), except that it allocates the
5026 : * minimum number of pages to satisfy the request. alloc_pages() can only
5027 : * allocate memory in power-of-two pages.
5028 : *
5029 : * This function is also limited by MAX_ORDER.
5030 : *
5031 : * Memory allocated by this function must be released by free_pages_exact().
5032 : *
5033 : * Return: pointer to the allocated area or %NULL in case of error.
5034 : */
5035 3 : void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
5036 : {
5037 3 : unsigned int order = get_order(size);
5038 : unsigned long addr;
5039 :
5040 3 : if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
5041 0 : gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
5042 :
5043 3 : addr = __get_free_pages(gfp_mask, order);
5044 3 : return make_alloc_exact(addr, order, size);
5045 : }
5046 : EXPORT_SYMBOL(alloc_pages_exact);
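/*
 * Example (editor's sketch): a 40 KiB physically contiguous buffer needs ten
 * 4 KiB pages; alloc_pages() would have to use order 4 (64 KiB), whereas the
 * pair below hands the unused 24 KiB back to the page allocator.
 *
 *	void *buf = alloc_pages_exact(40 * 1024, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	free_pages_exact(buf, 40 * 1024);
 */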
5047 :
5048 : /**
5049 : * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
5050 : * pages on a node.
5051 : * @nid: the preferred node ID where memory should be allocated
5052 : * @size: the number of bytes to allocate
5053 : * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
5054 : *
5055 : * Like alloc_pages_exact(), but tries to allocate on node nid first before falling
5056 : * back.
5057 : *
5058 : * Return: pointer to the allocated area or %NULL in case of error.
5059 : */
5060 0 : void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
5061 : {
5062 0 : unsigned int order = get_order(size);
5063 : struct page *p;
5064 :
5065 0 : if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
5066 0 : gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
5067 :
5068 0 : p = alloc_pages_node(nid, gfp_mask, order);
5069 0 : if (!p)
5070 : return NULL;
5071 0 : return make_alloc_exact((unsigned long)page_address(p), order, size);
5072 : }
5073 :
5074 : /**
5075 : * free_pages_exact - release memory allocated via alloc_pages_exact()
5076 : * @virt: the value returned by alloc_pages_exact.
5077 : * @size: size of allocation, same value as passed to alloc_pages_exact().
5078 : *
5079 : * Release the memory allocated by a previous call to alloc_pages_exact.
5080 : */
5081 0 : void free_pages_exact(void *virt, size_t size)
5082 : {
5083 0 : unsigned long addr = (unsigned long)virt;
5084 0 : unsigned long end = addr + PAGE_ALIGN(size);
5085 :
5086 0 : while (addr < end) {
5087 0 : free_page(addr);
5088 0 : addr += PAGE_SIZE;
5089 : }
5090 0 : }
5091 : EXPORT_SYMBOL(free_pages_exact);
5092 :
5093 : /**
5094 : * nr_free_zone_pages - count number of pages beyond high watermark
5095 : * @offset: The zone index of the highest zone
5096 : *
5097 : * nr_free_zone_pages() counts the number of pages which are beyond the
5098 : * high watermark within all zones at or below a given zone index. For each
5099 : * zone, the number of pages is calculated as:
5100 : *
5101 : * nr_free_zone_pages = managed_pages - high_pages
5102 : *
5103 : * Return: number of pages beyond high watermark.
5104 : */
5105 3 : static unsigned long nr_free_zone_pages(int offset)
5106 : {
5107 : struct zoneref *z;
5108 : struct zone *zone;
5109 :
5110 : /* Just pick one node, since fallback list is circular */
5111 3 : unsigned long sum = 0;
5112 :
5113 6 : struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
5114 :
5115 12 : for_each_zone_zonelist(zone, z, zonelist, offset) {
5116 3 : unsigned long size = zone_managed_pages(zone);
5117 3 : unsigned long high = high_wmark_pages(zone);
5118 3 : if (size > high)
5119 3 : sum += size - high;
5120 : }
5121 :
5122 3 : return sum;
5123 : }
5124 :
5125 : /**
5126 : * nr_free_buffer_pages - count number of pages beyond high watermark
5127 : *
5128 : * nr_free_buffer_pages() counts the number of pages which are beyond the high
5129 : * watermark within ZONE_DMA and ZONE_NORMAL.
5130 : *
5131 : * Return: number of pages beyond high watermark within ZONE_DMA and
5132 : * ZONE_NORMAL.
5133 : */
5134 1 : unsigned long nr_free_buffer_pages(void)
5135 : {
5136 2 : return nr_free_zone_pages(gfp_zone(GFP_USER));
5137 : }
5138 : EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
5139 :
5140 : static inline void show_node(struct zone *zone)
5141 : {
5142 : if (IS_ENABLED(CONFIG_NUMA))
5143 : printk("Node %d ", zone_to_nid(zone));
5144 : }
5145 :
5146 0 : long si_mem_available(void)
5147 : {
5148 : long available;
5149 : unsigned long pagecache;
5150 0 : unsigned long wmark_low = 0;
5151 : unsigned long pages[NR_LRU_LISTS];
5152 : unsigned long reclaimable;
5153 : struct zone *zone;
5154 : int lru;
5155 :
5156 0 : for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
5157 0 : pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
5158 :
5159 0 : for_each_zone(zone)
5160 0 : wmark_low += low_wmark_pages(zone);
5161 :
5162 : /*
5163 : * Estimate the amount of memory available for userspace allocations,
5164 : * without causing swapping or OOM.
5165 : */
5166 0 : available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
5167 :
5168 : /*
5169 : * Not all the page cache can be freed, otherwise the system will
5170 : * start swapping or thrashing. Assume at least half of the page
5171 : * cache, or the low watermark worth of cache, needs to stay.
5172 : */
5173 0 : pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
5174 0 : pagecache -= min(pagecache / 2, wmark_low);
5175 0 : available += pagecache;
5176 :
5177 : /*
5178 : * Part of the reclaimable slab and other kernel memory consists of
5179 : * items that are in use, and cannot be freed. Cap this estimate at the
5180 : * low watermark.
5181 : */
5182 0 : reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
5183 0 : global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
5184 0 : available += reclaimable - min(reclaimable / 2, wmark_low);
5185 :
5186 0 : if (available < 0)
5187 0 : available = 0;
5188 0 : return available;
5189 : }
5190 : EXPORT_SYMBOL_GPL(si_mem_available);
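/*
 * In equation form, the estimate above is (all values in pages):
 *
 *	pagecache   = ACTIVE_FILE + INACTIVE_FILE
 *	reclaimable = SLAB_RECLAIMABLE + KERNEL_MISC_RECLAIMABLE
 *	wmark_low   = sum of the low watermarks over all zones
 *
 *	available = FREE - totalreserve_pages
 *		  + pagecache   - min(pagecache / 2,   wmark_low)
 *		  + reclaimable - min(reclaimable / 2, wmark_low)
 *
 * clamped at zero. This is the value reported as MemAvailable in
 * /proc/meminfo.
 */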
5191 :
5192 2 : void si_meminfo(struct sysinfo *val)
5193 : {
5194 2 : val->totalram = totalram_pages();
5195 2 : val->sharedram = global_node_page_state(NR_SHMEM);
5196 2 : val->freeram = global_zone_page_state(NR_FREE_PAGES);
5197 2 : val->bufferram = nr_blockdev_pages();
5198 2 : val->totalhigh = totalhigh_pages();
5199 2 : val->freehigh = nr_free_highpages();
5200 2 : val->mem_unit = PAGE_SIZE;
5201 2 : }
5202 :
5203 : EXPORT_SYMBOL(si_meminfo);
5204 :
5205 : #ifdef CONFIG_NUMA
5206 : void si_meminfo_node(struct sysinfo *val, int nid)
5207 : {
5208 : int zone_type; /* needs to be signed */
5209 : unsigned long managed_pages = 0;
5210 : unsigned long managed_highpages = 0;
5211 : unsigned long free_highpages = 0;
5212 : pg_data_t *pgdat = NODE_DATA(nid);
5213 :
5214 : for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
5215 : managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
5216 : val->totalram = managed_pages;
5217 : val->sharedram = node_page_state(pgdat, NR_SHMEM);
5218 : val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
5219 : #ifdef CONFIG_HIGHMEM
5220 : for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
5221 : struct zone *zone = &pgdat->node_zones[zone_type];
5222 :
5223 : if (is_highmem(zone)) {
5224 : managed_highpages += zone_managed_pages(zone);
5225 : free_highpages += zone_page_state(zone, NR_FREE_PAGES);
5226 : }
5227 : }
5228 : val->totalhigh = managed_highpages;
5229 : val->freehigh = free_highpages;
5230 : #else
5231 : val->totalhigh = managed_highpages;
5232 : val->freehigh = free_highpages;
5233 : #endif
5234 : val->mem_unit = PAGE_SIZE;
5235 : }
5236 : #endif
5237 :
5238 : /*
5239 : * Determine whether the node should be displayed or not, depending on whether
5240 : * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
5241 : */
5242 0 : static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
5243 : {
5244 0 : if (!(flags & SHOW_MEM_FILTER_NODES))
5245 : return false;
5246 :
5247 : /*
5248 : * no node mask - aka implicit memory numa policy. Do not bother with
5249 : * the synchronization - read_mems_allowed_begin - because we do not
5250 : * have to be precise here.
5251 : */
5252 0 : if (!nodemask)
5253 0 : nodemask = &cpuset_current_mems_allowed;
5254 :
5255 0 : return !node_isset(nid, *nodemask);
5256 : }
5257 :
5258 0 : static void show_migration_types(unsigned char type)
5259 : {
5260 : static const char types[MIGRATE_TYPES] = {
5261 : [MIGRATE_UNMOVABLE] = 'U',
5262 : [MIGRATE_MOVABLE] = 'M',
5263 : [MIGRATE_RECLAIMABLE] = 'E',
5264 : [MIGRATE_HIGHATOMIC] = 'H',
5265 : #ifdef CONFIG_CMA
5266 : [MIGRATE_CMA] = 'C',
5267 : #endif
5268 : #ifdef CONFIG_MEMORY_ISOLATION
5269 : [MIGRATE_ISOLATE] = 'I',
5270 : #endif
5271 : };
5272 : char tmp[MIGRATE_TYPES + 1];
5273 0 : char *p = tmp;
5274 : int i;
5275 :
5276 0 : for (i = 0; i < MIGRATE_TYPES; i++) {
5277 0 : if (type & (1 << i))
5278 0 : *p++ = types[i];
5279 : }
5280 :
5281 0 : *p = '\0';
5282 0 : printk(KERN_CONT "(%s) ", tmp);
5283 0 : }
5284 :
5285 : static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
5286 : {
5287 : int zone_idx;
5288 0 : for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
5289 0 : if (zone_managed_pages(pgdat->node_zones + zone_idx))
5290 : return true;
5291 : return false;
5292 : }
5293 :
5294 : /*
5295 : * Show the free area list (used e.g. by the SysRq show-memory handler and
5295 : * OOM / allocation-failure reports).
5296 : * We also calculate the percentage fragmentation. We do this by counting the
5297 : * memory on each free list with the exception of the first item on the list.
5298 : *
5299 : * Bits in @filter:
5300 : * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
5301 : * cpuset.
5302 : */
5303 0 : void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
5304 : {
5305 0 : unsigned long free_pcp = 0;
5306 : int cpu, nid;
5307 : struct zone *zone;
5308 : pg_data_t *pgdat;
5309 :
5310 0 : for_each_populated_zone(zone) {
5311 0 : if (zone_idx(zone) > max_zone_idx)
5312 0 : continue;
5313 0 : if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
5314 0 : continue;
5315 :
5316 0 : for_each_online_cpu(cpu)
5317 0 : free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
5318 : }
5319 :
5320 0 : printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
5321 : " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
5322 : " unevictable:%lu dirty:%lu writeback:%lu\n"
5323 : " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
5324 : " mapped:%lu shmem:%lu pagetables:%lu\n"
5325 : " sec_pagetables:%lu bounce:%lu\n"
5326 : " kernel_misc_reclaimable:%lu\n"
5327 : " free:%lu free_pcp:%lu free_cma:%lu\n",
5328 : global_node_page_state(NR_ACTIVE_ANON),
5329 : global_node_page_state(NR_INACTIVE_ANON),
5330 : global_node_page_state(NR_ISOLATED_ANON),
5331 : global_node_page_state(NR_ACTIVE_FILE),
5332 : global_node_page_state(NR_INACTIVE_FILE),
5333 : global_node_page_state(NR_ISOLATED_FILE),
5334 : global_node_page_state(NR_UNEVICTABLE),
5335 : global_node_page_state(NR_FILE_DIRTY),
5336 : global_node_page_state(NR_WRITEBACK),
5337 : global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
5338 : global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
5339 : global_node_page_state(NR_FILE_MAPPED),
5340 : global_node_page_state(NR_SHMEM),
5341 : global_node_page_state(NR_PAGETABLE),
5342 : global_node_page_state(NR_SECONDARY_PAGETABLE),
5343 : global_zone_page_state(NR_BOUNCE),
5344 : global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
5345 : global_zone_page_state(NR_FREE_PAGES),
5346 : free_pcp,
5347 : global_zone_page_state(NR_FREE_CMA_PAGES));
5348 :
5349 0 : for_each_online_pgdat(pgdat) {
5350 0 : if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
5351 0 : continue;
5352 0 : if (!node_has_managed_zones(pgdat, max_zone_idx))
5353 0 : continue;
5354 :
5355 0 : printk("Node %d"
5356 : " active_anon:%lukB"
5357 : " inactive_anon:%lukB"
5358 : " active_file:%lukB"
5359 : " inactive_file:%lukB"
5360 : " unevictable:%lukB"
5361 : " isolated(anon):%lukB"
5362 : " isolated(file):%lukB"
5363 : " mapped:%lukB"
5364 : " dirty:%lukB"
5365 : " writeback:%lukB"
5366 : " shmem:%lukB"
5367 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5368 : " shmem_thp: %lukB"
5369 : " shmem_pmdmapped: %lukB"
5370 : " anon_thp: %lukB"
5371 : #endif
5372 : " writeback_tmp:%lukB"
5373 : " kernel_stack:%lukB"
5374 : #ifdef CONFIG_SHADOW_CALL_STACK
5375 : " shadow_call_stack:%lukB"
5376 : #endif
5377 : " pagetables:%lukB"
5378 : " sec_pagetables:%lukB"
5379 : " all_unreclaimable? %s"
5380 : "\n",
5381 : pgdat->node_id,
5382 : K(node_page_state(pgdat, NR_ACTIVE_ANON)),
5383 : K(node_page_state(pgdat, NR_INACTIVE_ANON)),
5384 : K(node_page_state(pgdat, NR_ACTIVE_FILE)),
5385 : K(node_page_state(pgdat, NR_INACTIVE_FILE)),
5386 : K(node_page_state(pgdat, NR_UNEVICTABLE)),
5387 : K(node_page_state(pgdat, NR_ISOLATED_ANON)),
5388 : K(node_page_state(pgdat, NR_ISOLATED_FILE)),
5389 : K(node_page_state(pgdat, NR_FILE_MAPPED)),
5390 : K(node_page_state(pgdat, NR_FILE_DIRTY)),
5391 : K(node_page_state(pgdat, NR_WRITEBACK)),
5392 : K(node_page_state(pgdat, NR_SHMEM)),
5393 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5394 : K(node_page_state(pgdat, NR_SHMEM_THPS)),
5395 : K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
5396 : K(node_page_state(pgdat, NR_ANON_THPS)),
5397 : #endif
5398 : K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
5399 : node_page_state(pgdat, NR_KERNEL_STACK_KB),
5400 : #ifdef CONFIG_SHADOW_CALL_STACK
5401 : node_page_state(pgdat, NR_KERNEL_SCS_KB),
5402 : #endif
5403 : K(node_page_state(pgdat, NR_PAGETABLE)),
5404 : K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
5405 : pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
5406 : "yes" : "no");
5407 : }
5408 :
5409 0 : for_each_populated_zone(zone) {
5410 : int i;
5411 :
5412 0 : if (zone_idx(zone) > max_zone_idx)
5413 0 : continue;
5414 0 : if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
5415 0 : continue;
5416 :
5417 : free_pcp = 0;
5418 0 : for_each_online_cpu(cpu)
5419 0 : free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
5420 :
5421 0 : show_node(zone);
5422 0 : printk(KERN_CONT
5423 : "%s"
5424 : " free:%lukB"
5425 : " boost:%lukB"
5426 : " min:%lukB"
5427 : " low:%lukB"
5428 : " high:%lukB"
5429 : " reserved_highatomic:%lukB"
5430 : " active_anon:%lukB"
5431 : " inactive_anon:%lukB"
5432 : " active_file:%lukB"
5433 : " inactive_file:%lukB"
5434 : " unevictable:%lukB"
5435 : " writepending:%lukB"
5436 : " present:%lukB"
5437 : " managed:%lukB"
5438 : " mlocked:%lukB"
5439 : " bounce:%lukB"
5440 : " free_pcp:%lukB"
5441 : " local_pcp:%ukB"
5442 : " free_cma:%lukB"
5443 : "\n",
5444 : zone->name,
5445 : K(zone_page_state(zone, NR_FREE_PAGES)),
5446 : K(zone->watermark_boost),
5447 : K(min_wmark_pages(zone)),
5448 : K(low_wmark_pages(zone)),
5449 : K(high_wmark_pages(zone)),
5450 : K(zone->nr_reserved_highatomic),
5451 : K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
5452 : K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
5453 : K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
5454 : K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
5455 : K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
5456 : K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
5457 : K(zone->present_pages),
5458 : K(zone_managed_pages(zone)),
5459 : K(zone_page_state(zone, NR_MLOCK)),
5460 : K(zone_page_state(zone, NR_BOUNCE)),
5461 : K(free_pcp),
5462 : K(this_cpu_read(zone->per_cpu_pageset->count)),
5463 : K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
5464 0 : printk("lowmem_reserve[]:");
5465 0 : for (i = 0; i < MAX_NR_ZONES; i++)
5466 0 : printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
5467 0 : printk(KERN_CONT "\n");
5468 : }
5469 :
5470 0 : for_each_populated_zone(zone) {
5471 : unsigned int order;
5472 0 : unsigned long nr[MAX_ORDER + 1], flags, total = 0;
5473 : unsigned char types[MAX_ORDER + 1];
5474 :
5475 0 : if (zone_idx(zone) > max_zone_idx)
5476 0 : continue;
5477 0 : if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
5478 0 : continue;
5479 0 : show_node(zone);
5480 0 : printk(KERN_CONT "%s: ", zone->name);
5481 :
5482 0 : spin_lock_irqsave(&zone->lock, flags);
5483 0 : for (order = 0; order <= MAX_ORDER; order++) {
5484 0 : struct free_area *area = &zone->free_area[order];
5485 : int type;
5486 :
5487 0 : nr[order] = area->nr_free;
5488 0 : total += nr[order] << order;
5489 :
5490 0 : types[order] = 0;
5491 0 : for (type = 0; type < MIGRATE_TYPES; type++) {
5492 0 : if (!free_area_empty(area, type))
5493 0 : types[order] |= 1 << type;
5494 : }
5495 : }
5496 0 : spin_unlock_irqrestore(&zone->lock, flags);
5497 0 : for (order = 0; order <= MAX_ORDER; order++) {
5498 0 : printk(KERN_CONT "%lu*%lukB ",
5499 : nr[order], K(1UL) << order);
5500 0 : if (nr[order])
5501 0 : show_migration_types(types[order]);
5502 : }
5503 0 : printk(KERN_CONT "= %lukB\n", K(total));
5504 : }
5505 :
5506 0 : for_each_online_node(nid) {
5507 0 : if (show_mem_node_skip(filter, nid, nodemask))
5508 : continue;
5509 : hugetlb_show_meminfo_node(nid);
5510 : }
5511 :
5512 0 : printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
5513 :
5514 0 : show_swap_cache_info();
5515 0 : }
5516 :
5517 : static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
5518 : {
5519 1 : zoneref->zone = zone;
5520 1 : zoneref->zone_idx = zone_idx(zone);
5521 : }
5522 :
5523 : /*
5524 : * Builds allocation fallback zone lists.
5525 : *
5526 : * Add all populated zones of a node to the zonelist.
5527 : */
5528 : static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
5529 : {
5530 : struct zone *zone;
5531 1 : enum zone_type zone_type = MAX_NR_ZONES;
5532 1 : int nr_zones = 0;
5533 :
5534 : do {
5535 2 : zone_type--;
5536 2 : zone = pgdat->node_zones + zone_type;
5537 2 : if (populated_zone(zone)) {
5538 2 : zoneref_set_zone(zone, &zonerefs[nr_zones++]);
5539 1 : check_highest_zone(zone_type);
5540 : }
5541 2 : } while (zone_type);
5542 :
5543 : return nr_zones;
5544 : }
5545 :
5546 : #ifdef CONFIG_NUMA
5547 :
5548 : static int __parse_numa_zonelist_order(char *s)
5549 : {
5550 : /*
5551 : * We used to support different zonelists modes but they turned
5552 : * out to be just not useful. Let's keep the warning in place
5553 : * if somebody still use the cmd line parameter so that we do
5554 : * if somebody still uses the command line parameter so that we do
5555 : * not fail it silently.
5556 : if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
5557 : pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
5558 : return -EINVAL;
5559 : }
5560 : return 0;
5561 : }
5562 :
5563 : char numa_zonelist_order[] = "Node";
5564 :
5565 : /*
5566 : * sysctl handler for numa_zonelist_order
5567 : */
5568 : int numa_zonelist_order_handler(struct ctl_table *table, int write,
5569 : void *buffer, size_t *length, loff_t *ppos)
5570 : {
5571 : if (write)
5572 : return __parse_numa_zonelist_order(buffer);
5573 : return proc_dostring(table, write, buffer, length, ppos);
5574 : }
5575 :
5576 :
5577 : static int node_load[MAX_NUMNODES];
5578 :
5579 : /**
5580 : * find_next_best_node - find the next node that should appear in a given node's fallback list
5581 : * @node: node whose fallback list we're appending
5582 : * @used_node_mask: nodemask_t of already used nodes
5583 : *
5584 : * We use a number of factors to determine which is the next node that should
5585 : * appear on a given node's fallback list. The node should not have appeared
5586 : * already in @node's fallback list, and it should be the next closest node
5587 : * according to the distance array (which contains arbitrary distance values
5588 : * from each node to each node in the system), and should also prefer nodes
5589 : * with no CPUs, since presumably they'll have very little allocation pressure
5590 : * on them otherwise.
5591 : *
5592 : * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
5593 : */
5594 : int find_next_best_node(int node, nodemask_t *used_node_mask)
5595 : {
5596 : int n, val;
5597 : int min_val = INT_MAX;
5598 : int best_node = NUMA_NO_NODE;
5599 :
5600 : /* Use the local node if we haven't already */
5601 : if (!node_isset(node, *used_node_mask)) {
5602 : node_set(node, *used_node_mask);
5603 : return node;
5604 : }
5605 :
5606 : for_each_node_state(n, N_MEMORY) {
5607 :
5608 : /* Don't want a node to appear more than once */
5609 : if (node_isset(n, *used_node_mask))
5610 : continue;
5611 :
5612 : /* Use the distance array to find the distance */
5613 : val = node_distance(node, n);
5614 :
5615 : /* Penalize nodes under us ("prefer the next node") */
5616 : val += (n < node);
5617 :
5618 : /* Give preference to headless and unused nodes */
5619 : if (!cpumask_empty(cpumask_of_node(n)))
5620 : val += PENALTY_FOR_NODE_WITH_CPUS;
5621 :
5622 : /* Slight preference for less loaded node */
5623 : val *= MAX_NUMNODES;
5624 : val += node_load[n];
5625 :
5626 : if (val < min_val) {
5627 : min_val = val;
5628 : best_node = n;
5629 : }
5630 : }
5631 :
5632 : if (best_node >= 0)
5633 : node_set(best_node, *used_node_mask);
5634 :
5635 : return best_node;
5636 : }
5637 :
5638 :
5639 : /*
5640 : * Build zonelists ordered by node and zones within node.
5641 : * This results in maximum locality--normal zone overflows into local
5642 : * DMA zone, if any--but risks exhausting DMA zone.
5643 : */
5644 : static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
5645 : unsigned nr_nodes)
5646 : {
5647 : struct zoneref *zonerefs;
5648 : int i;
5649 :
5650 : zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5651 :
5652 : for (i = 0; i < nr_nodes; i++) {
5653 : int nr_zones;
5654 :
5655 : pg_data_t *node = NODE_DATA(node_order[i]);
5656 :
5657 : nr_zones = build_zonerefs_node(node, zonerefs);
5658 : zonerefs += nr_zones;
5659 : }
5660 : zonerefs->zone = NULL;
5661 : zonerefs->zone_idx = 0;
5662 : }
5663 :
5664 : /*
5665 : * Build gfp_thisnode zonelists
5666 : */
5667 : static void build_thisnode_zonelists(pg_data_t *pgdat)
5668 : {
5669 : struct zoneref *zonerefs;
5670 : int nr_zones;
5671 :
5672 : zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5673 : nr_zones = build_zonerefs_node(pgdat, zonerefs);
5674 : zonerefs += nr_zones;
5675 : zonerefs->zone = NULL;
5676 : zonerefs->zone_idx = 0;
5677 : }
5678 :
5679 : /*
5680 : * Build zonelists ordered by zone and nodes within zones.
5681 : * This results in conserving DMA zone[s] until all Normal memory is
5682 : * exhausted, but results in overflowing to remote node while memory
5683 : * may still exist in local DMA zone.
5684 : */
5685 :
5686 : static void build_zonelists(pg_data_t *pgdat)
5687 : {
5688 : static int node_order[MAX_NUMNODES];
5689 : int node, nr_nodes = 0;
5690 : nodemask_t used_mask = NODE_MASK_NONE;
5691 : int local_node, prev_node;
5692 :
5693 : /* NUMA-aware ordering of nodes */
5694 : local_node = pgdat->node_id;
5695 : prev_node = local_node;
5696 :
5697 : memset(node_order, 0, sizeof(node_order));
5698 : while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5699 : /*
5700 : * We don't want to pressure a particular node.
5701 : * So adding penalty to the first node in same
5702 : * distance group to make it round-robin.
5703 : */
5704 : if (node_distance(local_node, node) !=
5705 : node_distance(local_node, prev_node))
5706 : node_load[node] += 1;
5707 :
5708 : node_order[nr_nodes++] = node;
5709 : prev_node = node;
5710 : }
5711 :
5712 : build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5713 : build_thisnode_zonelists(pgdat);
5714 : pr_info("Fallback order for Node %d: ", local_node);
5715 : for (node = 0; node < nr_nodes; node++)
5716 : pr_cont("%d ", node_order[node]);
5717 : pr_cont("\n");
5718 : }
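/*
 * Illustrative result (editor's note): on a simple two-node machine the loop
 * above typically boots with
 *
 *	Fallback order for Node 0: 0 1
 *	Fallback order for Node 1: 1 0
 *
 * i.e. each node places its own zones first and then the remaining nodes in
 * increasing distance order as chosen by find_next_best_node().
 */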
5719 :
5720 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5721 : /*
5722 : * Return node id of node used for "local" allocations.
5723 : * I.e., first node id of first zone in arg node's generic zonelist.
5724 : * Used for initializing percpu 'numa_mem', which is used primarily
5725 : * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
5726 : */
5727 : int local_memory_node(int node)
5728 : {
5729 : struct zoneref *z;
5730 :
5731 : z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
5732 : gfp_zone(GFP_KERNEL),
5733 : NULL);
5734 : return zone_to_nid(z->zone);
5735 : }
5736 : #endif
5737 :
5738 : static void setup_min_unmapped_ratio(void);
5739 : static void setup_min_slab_ratio(void);
5740 : #else /* CONFIG_NUMA */
5741 :
5742 1 : static void build_zonelists(pg_data_t *pgdat)
5743 : {
5744 : int node, local_node;
5745 : struct zoneref *zonerefs;
5746 : int nr_zones;
5747 :
5748 1 : local_node = pgdat->node_id;
5749 :
5750 1 : zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5751 1 : nr_zones = build_zonerefs_node(pgdat, zonerefs);
5752 1 : zonerefs += nr_zones;
5753 :
5754 : /*
5755 : * Now we build the zonelist so that it contains the zones
5756 : * of all the other nodes.
5757 : * We don't want to pressure a particular node, so when
5758 : * building the zones for node N, we make sure that the
5759 : * zones coming right after the local ones are those from
5760 : * node N+1 (modulo N)
5761 : */
5762 1 : for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5763 0 : if (!node_online(node))
5764 0 : continue;
5765 0 : nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5766 0 : zonerefs += nr_zones;
5767 : }
5768 0 : for (node = 0; node < local_node; node++) {
5769 0 : if (!node_online(node))
5770 0 : continue;
5771 0 : nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5772 0 : zonerefs += nr_zones;
5773 : }
5774 :
5775 1 : zonerefs->zone = NULL;
5776 1 : zonerefs->zone_idx = 0;
5777 1 : }
5778 :
5779 : #endif /* CONFIG_NUMA */
5780 :
5781 : /*
5782 : * Boot pageset table. One per cpu which is going to be used for all
5783 : * zones and all nodes. The parameters will be set in such a way
5784 : * that an item put on a list will immediately be handed over to
5785 : * the buddy list. This is safe since pageset manipulation is done
5786 : * with interrupts disabled.
5787 : *
5788 : * The boot_pagesets must be kept even after bootup is complete for
5789 : * unused processors and/or zones. They do play a role for bootstrapping
5790 : * hotplugged processors.
5791 : *
5792 : * zoneinfo_show() and maybe other functions do
5793 : * not check if the processor is online before following the pageset pointer.
5794 : * Other parts of the kernel may not check if the zone is available.
5795 : */
5796 : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
5797 : /* These effectively disable the pcplists in the boot pageset completely */
5798 : #define BOOT_PAGESET_HIGH 0
5799 : #define BOOT_PAGESET_BATCH 1
5800 : static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
5801 : static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
5802 :
5803 1 : static void __build_all_zonelists(void *data)
5804 : {
5805 : int nid;
5806 : int __maybe_unused cpu;
5807 1 : pg_data_t *self = data;
5808 : unsigned long flags;
5809 :
5810 : /*
5811 : * Explicitly disable this CPU's interrupts before taking seqlock
5812 : * to prevent any IRQ handler from calling into the page allocator
5813 : * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
5814 : */
5815 1 : local_irq_save(flags);
5816 : /*
5817 : * Explicitly disable this CPU's synchronous printk() before taking
5818 : * seqlock to prevent any printk() from trying to hold port->lock, for
5819 : * tty_insert_flip_string_and_push_buffer() on other CPU might be
5820 : * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
5821 : */
5822 1 : printk_deferred_enter();
5823 1 : write_seqlock(&zonelist_update_seq);
5824 :
5825 : #ifdef CONFIG_NUMA
5826 : memset(node_load, 0, sizeof(node_load));
5827 : #endif
5828 :
5829 : /*
5830 : * This node is hotadded and no memory is yet present. So just
5831 : * building zonelists is fine - no need to touch other nodes.
5832 : */
5833 1 : if (self && !node_online(self->node_id)) {
5834 0 : build_zonelists(self);
5835 : } else {
5836 : /*
5837 : * All possible nodes have pgdat preallocated
5838 : * in free_area_init
5839 : */
5840 1 : for_each_node(nid) {
5841 1 : pg_data_t *pgdat = NODE_DATA(nid);
5842 :
5843 1 : build_zonelists(pgdat);
5844 : }
5845 :
5846 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5847 : /*
5848 : * We now know the "local memory node" for each node--
5849 : * i.e., the node of the first zone in the generic zonelist.
5850 : * Set up numa_mem percpu variable for on-line cpus. During
5851 : * boot, only the boot cpu should be on-line; we'll init the
5852 : * secondary cpus' numa_mem as they come on-line. During
5853 : * node/memory hotplug, we'll fixup all on-line cpus.
5854 : */
5855 : for_each_online_cpu(cpu)
5856 : set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5857 : #endif
5858 : }
5859 :
5860 1 : write_sequnlock(&zonelist_update_seq);
5861 1 : printk_deferred_exit();
5862 2 : local_irq_restore(flags);
5863 1 : }
5864 :
5865 : static noinline void __init
5866 1 : build_all_zonelists_init(void)
5867 : {
5868 : int cpu;
5869 :
5870 1 : __build_all_zonelists(NULL);
5871 :
5872 : /*
5873 : * Initialize the boot_pagesets that are going to be used
5874 : * for bootstrapping processors. The real pagesets for
5875 : * each zone will be allocated later when the per cpu
5876 : * allocator is available.
5877 : *
5878 : * boot_pagesets are used also for bootstrapping offline
5879 : * cpus if the system is already booted because the pagesets
5880 : * are needed to initialize allocators on a specific cpu too.
5881 : * F.e. the percpu allocator needs the page allocator which
5882 : * needs the percpu allocator in order to allocate its pagesets
5883 : * (a chicken-egg dilemma).
5884 : */
5885 2 : for_each_possible_cpu(cpu)
5886 1 : per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
5887 :
5888 1 : mminit_verify_zonelist();
5889 : cpuset_init_current_mems_allowed();
5890 1 : }
5891 :
5892 : /*
5893 : * Rebuild all zonelists. Called during early boot and on memory/node hotplug.
5894 : *
5895 : * __ref due to call of __init annotated helper build_all_zonelists_init
5896 : * [protected by SYSTEM_BOOTING].
5897 : */
5898 1 : void __ref build_all_zonelists(pg_data_t *pgdat)
5899 : {
5900 : unsigned long vm_total_pages;
5901 :
5902 1 : if (system_state == SYSTEM_BOOTING) {
5903 1 : build_all_zonelists_init();
5904 : } else {
5905 0 : __build_all_zonelists(pgdat);
5906 : /* cpuset refresh routine should be here */
5907 : }
5908 : /* Get the number of free pages beyond high watermark in all zones. */
5909 1 : vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
5910 : /*
5911 : * Disable grouping by mobility if the number of pages in the
5912 : * system is too low to allow the mechanism to work. It would be
5913 : * more accurate, but expensive to check per-zone. This check is
5914 : * made on memory-hotadd so a system can start with mobility
5915 : * disabled and enable it later
5916 : */
5917 1 : if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
5918 0 : page_group_by_mobility_disabled = 1;
5919 : else
5920 1 : page_group_by_mobility_disabled = 0;
5921 :
5922 1 : pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
5923 : nr_online_nodes,
5924 : page_group_by_mobility_disabled ? "off" : "on",
5925 : vm_total_pages);
5926 : #ifdef CONFIG_NUMA
5927 : pr_info("Policy zone: %s\n", zone_names[policy_zone]);
5928 : #endif
5929 1 : }
5930 :
5931 3 : static int zone_batchsize(struct zone *zone)
5932 : {
5933 : #ifdef CONFIG_MMU
5934 : int batch;
5935 :
5936 : /*
5937 : * The number of pages to batch allocate is either ~0.1%
5938 : * of the zone or 1MB, whichever is smaller. The batch
5939 : * size is striking a balance between allocation latency
5940 : * and zone lock contention.
5941 : */
5942 3 : batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
5943 3 : batch /= 4; /* We effectively *= 4 below */
5944 3 : if (batch < 1)
5945 1 : batch = 1;
5946 :
5947 : /*
5948 : * Clamp the batch to a 2^n - 1 value. Having a power
5949 : * of 2 value was found to be more likely to have
5950 : * suboptimal cache aliasing properties in some cases.
5951 : *
5952 : * For example if 2 tasks are alternately allocating
5953 : * batches of pages, one task can end up with a lot
5954 : * of pages of one half of the possible page colors
5955 : * and the other with pages of the other colors.
5956 : */
5957 5 : batch = rounddown_pow_of_two(batch + batch/2) - 1;
5958 :
5959 3 : return batch;
5960 :
5961 : #else
5962 : /* The deferral and batching of frees should be suppressed under NOMMU
5963 : * conditions.
5964 : *
5965 : * The problem is that NOMMU needs to be able to allocate large chunks
5966 : * of contiguous memory as there's no hardware page translation to
5967 : * assemble apparent contiguous memory from discontiguous pages.
5968 : *
5969 : * Queueing large contiguous runs of pages for batching, however,
5970 : * causes the pages to actually be freed in smaller chunks. As there
5971 : * can be a significant delay between the individual batches being
5972 : * recycled, this leads to the once large chunks of space being
5973 : * fragmented and becoming unavailable for high-order allocations.
5974 : */
5975 : return 0;
5976 : #endif
5977 : }
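/*
 * Worked example (assuming 4 KiB pages): a zone with 1 GiB managed has
 * 262144 pages, so min(262144 >> 10, SZ_1M / PAGE_SIZE) = min(256, 256) = 256;
 * divided by 4 that is batch = 64, and rounddown_pow_of_two(64 + 32) - 1 = 63,
 * so pcp->batch ends up at 63 pages.
 */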
5978 :
5979 3 : static int zone_highsize(struct zone *zone, int batch, int cpu_online)
5980 : {
5981 : #ifdef CONFIG_MMU
5982 : int high;
5983 : int nr_split_cpus;
5984 : unsigned long total_pages;
5985 :
5986 3 : if (!percpu_pagelist_high_fraction) {
5987 : /*
5988 : * By default, the high value of the pcp is based on the zone
5989 : * low watermark so that if they are full then background
5990 : * reclaim will not be started prematurely.
5991 : */
5992 3 : total_pages = low_wmark_pages(zone);
5993 : } else {
5994 : /*
5995 : * If percpu_pagelist_high_fraction is configured, the high
5996 : * value is based on a fraction of the managed pages in the
5997 : * zone.
5998 : */
5999 0 : total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
6000 : }
6001 :
6002 : /*
6003 : * Split the high value across all online CPUs local to the zone. Note
6004 : * that early in boot that CPUs may not be online yet and that during
6005 : * CPU hotplug that the cpumask is not yet updated when a CPU is being
6006 : * onlined. For memory nodes that have no CPUs, split pcp->high across
6007 : * all online CPUs to mitigate the risk that reclaim is triggered
6008 : * prematurely due to pages stored on pcp lists.
6009 : */
6010 6 : nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
6011 3 : if (!nr_split_cpus)
6012 0 : nr_split_cpus = num_online_cpus();
6013 3 : high = total_pages / nr_split_cpus;
6014 :
6015 : /*
6016 : * Ensure high is at least batch*4. The multiple is based on the
6017 : * historical relationship between high and batch.
6018 : */
6019 3 : high = max(high, batch << 2);
6020 :
6021 3 : return high;
6022 : #else
6023 : return 0;
6024 : #endif
6025 : }
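/*
 * Worked example (illustrative numbers): with percpu_pagelist_high_fraction
 * left at 0, a zone whose low watermark is roughly 1300 pages and which has
 * 4 online CPUs gets high = 1300 / 4 = 325 pages per CPU, comfortably above
 * the batch << 2 floor (252 for the batch of 63 computed above).
 */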
6026 :
6027 : /*
6028 : * pcp->high and pcp->batch values are related and generally batch is lower
6029 : * than high. They are also related to pcp->count such that count is lower
6030 : * than high, and as soon as it reaches high, the pcplist is flushed.
6031 : *
6032 : * However, guaranteeing these relations at all times would require e.g. write
6033 : * barriers here but also careful usage of read barriers at the read side, and
6034 : * thus be prone to error and bad for performance. Thus the update only prevents
6035 : * thus be prone to error and bad for performance. So the update only prevents
6036 : * can cope with those fields changing asynchronously, and fully trust only the
6037 : * pcp->count field on the local CPU with interrupts disabled.
6038 : *
6039 : * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
6040 : * outside of boot time (or some other assurance that no concurrent updaters
6041 : * exist).
6042 : */
6043 : static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
6044 : unsigned long batch)
6045 : {
6046 3 : WRITE_ONCE(pcp->batch, batch);
6047 3 : WRITE_ONCE(pcp->high, high);
6048 : }
6049 :
6050 2 : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
6051 : {
6052 : int pindex;
6053 :
6054 2 : memset(pcp, 0, sizeof(*pcp));
6055 2 : memset(pzstats, 0, sizeof(*pzstats));
6056 :
6057 2 : spin_lock_init(&pcp->lock);
6058 26 : for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
6059 48 : INIT_LIST_HEAD(&pcp->lists[pindex]);
6060 :
6061 : /*
6062 : * Set batch and high values safe for a boot pageset. A true percpu
6063 : * pageset's initialization will update them subsequently. Here we don't
6064 : * need to be as careful as pageset_update() as nobody can access the
6065 : * pageset yet.
6066 : */
6067 2 : pcp->high = BOOT_PAGESET_HIGH;
6068 2 : pcp->batch = BOOT_PAGESET_BATCH;
6069 2 : pcp->free_factor = 0;
6070 2 : }
6071 :
6072 : static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
6073 : unsigned long batch)
6074 : {
6075 : struct per_cpu_pages *pcp;
6076 : int cpu;
6077 :
6078 3 : for_each_possible_cpu(cpu) {
6079 3 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
6080 3 : pageset_update(pcp, high, batch);
6081 : }
6082 : }
6083 :
6084 : /*
6085 : * Calculate and set new high and batch values for all per-cpu pagesets of a
6086 : * zone based on the zone's size.
6087 : */
6088 3 : static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
6089 : {
6090 : int new_high, new_batch;
6091 :
6092 3 : new_batch = max(1, zone_batchsize(zone));
6093 3 : new_high = zone_highsize(zone, new_batch, cpu_online);
6094 :
6095 3 : if (zone->pageset_high == new_high &&
6096 0 : zone->pageset_batch == new_batch)
6097 : return;
6098 :
6099 3 : zone->pageset_high = new_high;
6100 3 : zone->pageset_batch = new_batch;
6101 :
6102 3 : __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
6103 : }
6104 :
6105 1 : void __meminit setup_zone_pageset(struct zone *zone)
6106 : {
6107 : int cpu;
6108 :
6109 : /* Size may be 0 on !SMP && !NUMA */
6110 : if (sizeof(struct per_cpu_zonestat) > 0)
6111 : zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
6112 :
6113 1 : zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
6114 2 : for_each_possible_cpu(cpu) {
6115 : struct per_cpu_pages *pcp;
6116 : struct per_cpu_zonestat *pzstats;
6117 :
6118 1 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
6119 1 : pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
6120 1 : per_cpu_pages_init(pcp, pzstats);
6121 : }
6122 :
6123 1 : zone_set_pageset_high_and_batch(zone, 0);
6124 1 : }
6125 :
6126 : /*
6127 : * The zone indicated has a new number of managed_pages; batch sizes and percpu
6128 : * page high values need to be recalculated.
6129 : */
6130 2 : static void zone_pcp_update(struct zone *zone, int cpu_online)
6131 : {
6132 2 : mutex_lock(&pcp_batch_high_lock);
6133 2 : zone_set_pageset_high_and_batch(zone, cpu_online);
6134 2 : mutex_unlock(&pcp_batch_high_lock);
6135 2 : }
6136 :
6137 : /*
6138 : * Allocate per cpu pagesets and initialize them.
6139 : * Before this call only boot pagesets were available.
6140 : */
6141 1 : void __init setup_per_cpu_pageset(void)
6142 : {
6143 : struct pglist_data *pgdat;
6144 : struct zone *zone;
6145 : int __maybe_unused cpu;
6146 :
6147 3 : for_each_populated_zone(zone)
6148 1 : setup_zone_pageset(zone);
6149 :
6150 : #ifdef CONFIG_NUMA
6151 : /*
6152 : * Unpopulated zones continue using the boot pagesets.
6153 : * The numa stats for these pagesets need to be reset.
6154 : * Otherwise, they will end up skewing the stats of
6155 : * the nodes these zones are associated with.
6156 : */
6157 : for_each_possible_cpu(cpu) {
6158 : struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
6159 : memset(pzstats->vm_numa_event, 0,
6160 : sizeof(pzstats->vm_numa_event));
6161 : }
6162 : #endif
6163 :
6164 2 : for_each_online_pgdat(pgdat)
6165 1 : pgdat->per_cpu_nodestats =
6166 1 : alloc_percpu(struct per_cpu_nodestat);
6167 1 : }
6168 :
6169 2 : __meminit void zone_pcp_init(struct zone *zone)
6170 : {
6171 : /*
6172 : * per cpu subsystem is not up at this point. The following code
6173 : * relies on the ability of the linker to provide the
6174 : * offset of a (static) per cpu variable into the per cpu area.
6175 : */
6176 2 : zone->per_cpu_pageset = &boot_pageset;
6177 2 : zone->per_cpu_zonestats = &boot_zonestats;
6178 2 : zone->pageset_high = BOOT_PAGESET_HIGH;
6179 2 : zone->pageset_batch = BOOT_PAGESET_BATCH;
6180 :
6181 2 : if (populated_zone(zone))
6182 : pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
6183 : zone->present_pages, zone_batchsize(zone));
6184 2 : }
6185 :
6186 0 : void adjust_managed_page_count(struct page *page, long count)
6187 : {
6188 0 : atomic_long_add(count, &page_zone(page)->managed_pages);
6189 0 : totalram_pages_add(count);
6190 : #ifdef CONFIG_HIGHMEM
6191 : if (PageHighMem(page))
6192 : totalhigh_pages_add(count);
6193 : #endif
6194 0 : }
6195 : EXPORT_SYMBOL(adjust_managed_page_count);
6196 :
6197 0 : unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
6198 : {
6199 : void *pos;
6200 0 : unsigned long pages = 0;
6201 :
6202 0 : start = (void *)PAGE_ALIGN((unsigned long)start);
6203 0 : end = (void *)((unsigned long)end & PAGE_MASK);
6204 0 : for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
6205 0 : struct page *page = virt_to_page(pos);
6206 : void *direct_map_addr;
6207 :
6208 : /*
6209 : * 'direct_map_addr' might be different from 'pos'
6210 : * because some architectures' virt_to_page()
6211 : * work with aliases. Getting the direct map
6212 : * address ensures that we get a _writeable_
6213 : * alias for the memset().
6214 : */
6215 0 : direct_map_addr = page_address(page);
6216 : /*
6217 : * Perform a kasan-unchecked memset() since this memory
6218 : * has not been initialized.
6219 : */
6220 0 : direct_map_addr = kasan_reset_tag(direct_map_addr);
6221 0 : if ((unsigned int)poison <= 0xFF)
6222 0 : memset(direct_map_addr, poison, PAGE_SIZE);
6223 :
6224 0 : free_reserved_page(page);
6225 : }
6226 :
6227 0 : if (pages && s)
6228 0 : pr_info("Freeing %s memory: %ldK\n", s, K(pages));
6229 :
6230 0 : return pages;
6231 : }
6232 :
6233 0 : static int page_alloc_cpu_dead(unsigned int cpu)
6234 : {
6235 : struct zone *zone;
6236 :
6237 0 : lru_add_drain_cpu(cpu);
6238 0 : mlock_drain_remote(cpu);
6239 0 : drain_pages(cpu);
6240 :
6241 : /*
6242 : * Spill the event counters of the dead processor
6243 : * into the current processors event counters.
6244 : * This artificially elevates the count of the current
6245 : * processor.
6246 : */
6247 0 : vm_events_fold_cpu(cpu);
6248 :
6249 : /*
6250 : * Zero the differential counters of the dead processor
6251 : * so that the vm statistics are consistent.
6252 : *
6253 : * This is only okay since the processor is dead and cannot
6254 : * race with what we are doing.
6255 : */
6256 0 : cpu_vm_stats_fold(cpu);
6257 :
6258 0 : for_each_populated_zone(zone)
6259 0 : zone_pcp_update(zone, 0);
6260 :
6261 0 : return 0;
6262 : }
6263 :
6264 0 : static int page_alloc_cpu_online(unsigned int cpu)
6265 : {
6266 : struct zone *zone;
6267 :
6268 0 : for_each_populated_zone(zone)
6269 0 : zone_pcp_update(zone, 1);
6270 0 : return 0;
6271 : }
6272 :
6273 1 : void __init page_alloc_init_cpuhp(void)
6274 : {
6275 : int ret;
6276 :
6277 1 : ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
6278 : "mm/page_alloc:pcp",
6279 : page_alloc_cpu_online,
6280 : page_alloc_cpu_dead);
6281 1 : WARN_ON(ret < 0);
6282 1 : }
6283 :
6284 : /*
6285 : * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
6286 : * or min_free_kbytes changes.
6287 : */
6288 2 : static void calculate_totalreserve_pages(void)
6289 : {
6290 : struct pglist_data *pgdat;
6291 2 : unsigned long reserve_pages = 0;
6292 : enum zone_type i, j;
6293 :
6294 4 : for_each_online_pgdat(pgdat) {
6295 :
6296 2 : pgdat->totalreserve_pages = 0;
6297 :
6298 6 : for (i = 0; i < MAX_NR_ZONES; i++) {
6299 4 : struct zone *zone = pgdat->node_zones + i;
6300 4 : long max = 0;
6301 4 : unsigned long managed_pages = zone_managed_pages(zone);
6302 :
6303 : /* Find valid and maximum lowmem_reserve in the zone */
6304 10 : for (j = i; j < MAX_NR_ZONES; j++) {
6305 6 : if (zone->lowmem_reserve[j] > max)
6306 0 : max = zone->lowmem_reserve[j];
6307 : }
6308 :
6309 : /* we treat the high watermark as reserved pages. */
6310 4 : max += high_wmark_pages(zone);
6311 :
6312 4 : if (max > managed_pages)
6313 0 : max = managed_pages;
6314 :
6315 4 : pgdat->totalreserve_pages += max;
6316 :
6317 4 : reserve_pages += max;
6318 : }
6319 : }
6320 2 : totalreserve_pages = reserve_pages;
6321 2 : }
6322 :
6323 : /*
6324 : * setup_per_zone_lowmem_reserve - called whenever
6325 : * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
6326 : * has a correct pages reserved value, so an adequate number of
6327 : * pages are left in the zone after a successful __alloc_pages().
6328 : */
6329 1 : static void setup_per_zone_lowmem_reserve(void)
6330 : {
6331 : struct pglist_data *pgdat;
6332 : enum zone_type i, j;
6333 :
6334 2 : for_each_online_pgdat(pgdat) {
6335 2 : for (i = 0; i < MAX_NR_ZONES - 1; i++) {
6336 1 : struct zone *zone = &pgdat->node_zones[i];
6337 1 : int ratio = sysctl_lowmem_reserve_ratio[i];
6338 2 : bool clear = !ratio || !zone_managed_pages(zone);
6339 1 : unsigned long managed_pages = 0;
6340 :
6341 2 : for (j = i + 1; j < MAX_NR_ZONES; j++) {
6342 1 : struct zone *upper_zone = &pgdat->node_zones[j];
6343 :
6344 1 : managed_pages += zone_managed_pages(upper_zone);
6345 :
6346 1 : if (clear)
6347 0 : zone->lowmem_reserve[j] = 0;
6348 : else
6349 1 : zone->lowmem_reserve[j] = managed_pages / ratio;
6350 : }
6351 : }
6352 : }
6353 :
6354 : /* update totalreserve_pages */
6355 1 : calculate_totalreserve_pages();
6356 1 : }
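/*
 * Worked example (assuming the default DMA reserve ratio of 256): if 4 GiB
 * of ZONE_NORMAL (1048576 pages) sits above a small ZONE_DMA, then
 * DMA->lowmem_reserve[ZONE_NORMAL] = 1048576 / 256 = 4096 pages, i.e. a
 * GFP_KERNEL allocation may only spill into ZONE_DMA while that zone has
 * at least 4096 pages free on top of its own watermark.
 */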
6357 :
6358 1 : static void __setup_per_zone_wmarks(void)
6359 : {
6360 1 : unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
6361 1 : unsigned long lowmem_pages = 0;
6362 : struct zone *zone;
6363 : unsigned long flags;
6364 :
6365 : /* Calculate total number of !ZONE_HIGHMEM pages */
6366 3 : for_each_zone(zone) {
6367 2 : if (!is_highmem(zone))
6368 2 : lowmem_pages += zone_managed_pages(zone);
6369 : }
6370 :
6371 3 : for_each_zone(zone) {
6372 : u64 tmp;
6373 :
6374 2 : spin_lock_irqsave(&zone->lock, flags);
6375 2 : tmp = (u64)pages_min * zone_managed_pages(zone);
6376 2 : do_div(tmp, lowmem_pages);
6377 2 : if (is_highmem(zone)) {
6378 : /*
6379 : * __GFP_HIGH and PF_MEMALLOC allocations usually don't
6380 : * need highmem pages, so cap pages_min to a small
6381 : * value here.
6382 : *
6383 : * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
6384 : * deltas control async page reclaim, and so should
6385 : * not be capped for highmem.
6386 : */
6387 : unsigned long min_pages;
6388 :
6389 : min_pages = zone_managed_pages(zone) / 1024;
6390 : min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
6391 : zone->_watermark[WMARK_MIN] = min_pages;
6392 : } else {
6393 : /*
6394 : * If it's a lowmem zone, reserve a number of pages
6395 : * proportionate to the zone's size.
6396 : */
6397 2 : zone->_watermark[WMARK_MIN] = tmp;
6398 : }
6399 :
6400 : /*
6401 : * Set the kswapd watermarks distance according to the
6402 : * scale factor in proportion to available memory, but
6403 : * ensure a minimum size on small systems.
6404 : */
6405 6 : tmp = max_t(u64, tmp >> 2,
6406 : mult_frac(zone_managed_pages(zone),
6407 : watermark_scale_factor, 10000));
6408 :
6409 2 : zone->watermark_boost = 0;
6410 2 : zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
6411 2 : zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
6412 2 : zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
6413 :
6414 4 : spin_unlock_irqrestore(&zone->lock, flags);
6415 : }
6416 :
6417 : /* update totalreserve_pages */
6418 1 : calculate_totalreserve_pages();
6419 1 : }
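/*
 * Worked example (illustrative, 4 KiB pages, default watermark_scale_factor
 * of 10): on a machine with 1 GiB of lowmem, min_free_kbytes comes out at
 * 4096 (see the table below), so pages_min = 1024. A single lowmem zone
 * holding all of that memory gets
 *
 *	WMARK_MIN  = 1024
 *	tmp        = max(1024 >> 2, 262144 * 10 / 10000) = max(256, 262) = 262
 *	WMARK_LOW  = 1024 + 262 = 1286
 *	WMARK_HIGH = 1286 + 262 = 1548
 */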
6420 :
6421 : /**
6422 : * setup_per_zone_wmarks - called when min_free_kbytes changes
6423 : * or when memory is hot-{added|removed}
6424 : *
6425 : * Ensures that the watermark[min,low,high] values for each zone are set
6426 : * correctly with respect to min_free_kbytes.
6427 : */
6428 1 : void setup_per_zone_wmarks(void)
6429 : {
6430 : struct zone *zone;
6431 : static DEFINE_SPINLOCK(lock);
6432 :
6433 1 : spin_lock(&lock);
6434 1 : __setup_per_zone_wmarks();
6435 1 : spin_unlock(&lock);
6436 :
6437 : /*
6438 : * The watermark sizes have changed, so update the per-cpu batch
6439 : * and high limits, otherwise the limits may be inappropriate.
6440 : */
6441 3 : for_each_zone(zone)
6442 2 : zone_pcp_update(zone, 0);
6443 1 : }
6444 :
6445 : /*
6446 : * Initialise min_free_kbytes.
6447 : *
6448 : * For small machines we want it small (128k min). For large machines
6449 : * we want it large (256MB max). But it is not linear, because network
6450 : * bandwidth does not increase linearly with machine size. We use
6451 : *
6452 : * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
6453 : * min_free_kbytes = sqrt(lowmem_kbytes * 16)
6454 : *
6455 : * which yields
6456 : *
6457 : * 16MB: 512k
6458 : * 32MB: 724k
6459 : * 64MB: 1024k
6460 : * 128MB: 1448k
6461 : * 256MB: 2048k
6462 : * 512MB: 2896k
6463 : * 1024MB: 4096k
6464 : * 2048MB: 5792k
6465 : * 4096MB: 8192k
6466 : * 8192MB: 11584k
6467 : * 16384MB: 16384k
6468 : */
6469 1 : void calculate_min_free_kbytes(void)
6470 : {
6471 : unsigned long lowmem_kbytes;
6472 : int new_min_free_kbytes;
6473 :
6474 1 : lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
6475 1 : new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
6476 :
6477 1 : if (new_min_free_kbytes > user_min_free_kbytes)
6478 1 : min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
6479 : else
6480 0 : pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
6481 : new_min_free_kbytes, user_min_free_kbytes);
6482 :
6483 1 : }
6484 :
6485 1 : int __meminit init_per_zone_wmark_min(void)
6486 : {
6487 1 : calculate_min_free_kbytes();
6488 1 : setup_per_zone_wmarks();
6489 : refresh_zone_stat_thresholds();
6490 1 : setup_per_zone_lowmem_reserve();
6491 :
6492 : #ifdef CONFIG_NUMA
6493 : setup_min_unmapped_ratio();
6494 : setup_min_slab_ratio();
6495 : #endif
6496 :
6497 : khugepaged_min_free_kbytes_update();
6498 :
6499 1 : return 0;
6500 : }
6501 : postcore_initcall(init_per_zone_wmark_min)
6502 :
6503 : /*
6504 : * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
6505 : * that we can update the per-zone watermarks whenever min_free_kbytes
6506 : * changes.
6507 : */
6508 0 : int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
6509 : void *buffer, size_t *length, loff_t *ppos)
6510 : {
6511 : int rc;
6512 :
6513 0 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6514 0 : if (rc)
6515 : return rc;
6516 :
6517 0 : if (write) {
6518 0 : user_min_free_kbytes = min_free_kbytes;
6519 0 : setup_per_zone_wmarks();
6520 : }
6521 : return 0;
6522 : }
6523 :
6524 0 : int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
6525 : void *buffer, size_t *length, loff_t *ppos)
6526 : {
6527 : int rc;
6528 :
6529 0 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6530 0 : if (rc)
6531 : return rc;
6532 :
6533 0 : if (write)
6534 0 : setup_per_zone_wmarks();
6535 :
6536 : return 0;
6537 : }
6538 :
6539 : #ifdef CONFIG_NUMA
6540 : static void setup_min_unmapped_ratio(void)
6541 : {
6542 : pg_data_t *pgdat;
6543 : struct zone *zone;
6544 :
6545 : for_each_online_pgdat(pgdat)
6546 : pgdat->min_unmapped_pages = 0;
6547 :
6548 : for_each_zone(zone)
6549 : zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
6550 : sysctl_min_unmapped_ratio) / 100;
6551 : }
6552 :
6553 :
6554 : int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
6555 : void *buffer, size_t *length, loff_t *ppos)
6556 : {
6557 : int rc;
6558 :
6559 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6560 : if (rc)
6561 : return rc;
6562 :
6563 : setup_min_unmapped_ratio();
6564 :
6565 : return 0;
6566 : }
6567 :
6568 : static void setup_min_slab_ratio(void)
6569 : {
6570 : pg_data_t *pgdat;
6571 : struct zone *zone;
6572 :
6573 : for_each_online_pgdat(pgdat)
6574 : pgdat->min_slab_pages = 0;
6575 :
6576 : for_each_zone(zone)
6577 : zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
6578 : sysctl_min_slab_ratio) / 100;
6579 : }
6580 :
6581 : int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
6582 : void *buffer, size_t *length, loff_t *ppos)
6583 : {
6584 : int rc;
6585 :
6586 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6587 : if (rc)
6588 : return rc;
6589 :
6590 : setup_min_slab_ratio();
6591 :
6592 : return 0;
6593 : }
6594 : #endif
6595 :
6596 : /*
6597 : * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
6598 : * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
6599 : * whenever sysctl_lowmem_reserve_ratio changes.
6600 : *
6601 : * The reserve ratio has no relation to the minimum watermarks. The
6602 : * lowmem reserve ratio only makes sense as a function of the boot-time
6603 : * zone sizes.
6604 : */
6605 0 : int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
6606 : void *buffer, size_t *length, loff_t *ppos)
6607 : {
6608 : int i;
6609 :
6610 0 : proc_dointvec_minmax(table, write, buffer, length, ppos);
6611 :
6612 0 : for (i = 0; i < MAX_NR_ZONES; i++) {
6613 0 : if (sysctl_lowmem_reserve_ratio[i] < 1)
6614 0 : sysctl_lowmem_reserve_ratio[i] = 0;
6615 : }
6616 :
6617 0 : setup_per_zone_lowmem_reserve();
6618 0 : return 0;
6619 : }
6620 :
6621 : /*
6622 : * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
6623 : * cpu. It is the fraction of total pages in each zone that a hot per cpu
6624 : * pagelist can have before it gets flushed back to the buddy allocator.
6625 : */
6626 0 : int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
6627 : int write, void *buffer, size_t *length, loff_t *ppos)
6628 : {
6629 : struct zone *zone;
6630 : int old_percpu_pagelist_high_fraction;
6631 : int ret;
6632 :
6633 0 : mutex_lock(&pcp_batch_high_lock);
6634 0 : old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
6635 :
6636 0 : ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
6637 0 : if (!write || ret < 0)
6638 : goto out;
6639 :
6640 : /* Sanity checking to avoid pcp imbalance */
6641 0 : if (percpu_pagelist_high_fraction &&
6642 : percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
6643 0 : percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
6644 0 : ret = -EINVAL;
6645 0 : goto out;
6646 : }
6647 :
6648 : /* No change? */
6649 0 : if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
6650 : goto out;
6651 :
6652 0 : for_each_populated_zone(zone)
6653 0 : zone_set_pageset_high_and_batch(zone, 0);
6654 : out:
6655 0 : mutex_unlock(&pcp_batch_high_lock);
6656 0 : return ret;
6657 : }
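
/*
 * Rough sizing example (illustrative; the actual recomputation happens via
 * zone_set_pageset_high_and_batch() above): with
 * percpu_pagelist_high_fraction = 8 and a zone managing 1,048,576 pages, the
 * zone-wide pcp "high" budget is about 1048576 / 8 = 131072 pages, which is
 * then split between the CPUs using the zone. A value of 0 (the default)
 * sizes pcp->high from the zone's low watermark instead.
 */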
6658 :
6659 : #ifdef CONFIG_CONTIG_ALLOC
6660 : #if defined(CONFIG_DYNAMIC_DEBUG) || \
6661 : (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
6662 : /* Usage: See admin-guide/dynamic-debug-howto.rst */
6663 : static void alloc_contig_dump_pages(struct list_head *page_list)
6664 : {
6665 : DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
6666 :
6667 : if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
6668 : struct page *page;
6669 :
6670 : dump_stack();
6671 : list_for_each_entry(page, page_list, lru)
6672 : dump_page(page, "migration failure");
6673 : }
6674 : }
6675 : #else
6676 : static inline void alloc_contig_dump_pages(struct list_head *page_list)
6677 : {
6678 : }
6679 : #endif
6680 :
6681 : /* [start, end) must belong to a single zone. */
6682 : int __alloc_contig_migrate_range(struct compact_control *cc,
6683 : unsigned long start, unsigned long end)
6684 : {
6685 : /* This function is based on compact_zone() from compaction.c. */
6686 : unsigned int nr_reclaimed;
6687 : unsigned long pfn = start;
6688 : unsigned int tries = 0;
6689 : int ret = 0;
6690 : struct migration_target_control mtc = {
6691 : .nid = zone_to_nid(cc->zone),
6692 : .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
6693 : };
6694 :
6695 : lru_cache_disable();
6696 :
6697 : while (pfn < end || !list_empty(&cc->migratepages)) {
6698 : if (fatal_signal_pending(current)) {
6699 : ret = -EINTR;
6700 : break;
6701 : }
6702 :
6703 : if (list_empty(&cc->migratepages)) {
6704 : cc->nr_migratepages = 0;
6705 : ret = isolate_migratepages_range(cc, pfn, end);
6706 : if (ret && ret != -EAGAIN)
6707 : break;
6708 : pfn = cc->migrate_pfn;
6709 : tries = 0;
6710 : } else if (++tries == 5) {
6711 : ret = -EBUSY;
6712 : break;
6713 : }
6714 :
6715 : nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6716 : &cc->migratepages);
6717 : cc->nr_migratepages -= nr_reclaimed;
6718 :
6719 : ret = migrate_pages(&cc->migratepages, alloc_migration_target,
6720 : NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
6721 :
6722 : /*
6723 : * On -ENOMEM, migrate_pages() bails out right away. Retrying after
6724 : * this error is pointless, so bail out here as well.
6725 : */
6726 : if (ret == -ENOMEM)
6727 : break;
6728 : }
6729 :
6730 : lru_cache_enable();
6731 : if (ret < 0) {
6732 : if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
6733 : alloc_contig_dump_pages(&cc->migratepages);
6734 : putback_movable_pages(&cc->migratepages);
6735 : return ret;
6736 : }
6737 : return 0;
6738 : }
6739 :
6740 : /**
6741 : * alloc_contig_range() -- tries to allocate given range of pages
6742 : * @start: start PFN to allocate
6743 : * @end: one-past-the-last PFN to allocate
6744 : * @migratetype: migratetype of the underlying pageblocks (either
6745 : * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6746 : * in range must have the same migratetype and it must
6747 : * be either of the two.
6748 : * @gfp_mask: GFP mask to use during compaction
6749 : *
6750 : * The PFN range does not have to be pageblock aligned. The PFN range must
6751 : * belong to a single zone.
6752 : *
6753 : * The first thing this routine does is attempt to MIGRATE_ISOLATE all
6754 : * pageblocks in the range. Once isolated, the pageblocks should not
6755 : * be modified by others.
6756 : *
6757 : * Return: zero on success or negative error code. On success all
6758 : * pages whose PFN is in [start, end) are allocated for the caller and
6759 : * need to be freed with free_contig_range().
6760 : */
6761 : int alloc_contig_range(unsigned long start, unsigned long end,
6762 : unsigned migratetype, gfp_t gfp_mask)
6763 : {
6764 : unsigned long outer_start, outer_end;
6765 : int order;
6766 : int ret = 0;
6767 :
6768 : struct compact_control cc = {
6769 : .nr_migratepages = 0,
6770 : .order = -1,
6771 : .zone = page_zone(pfn_to_page(start)),
6772 : .mode = MIGRATE_SYNC,
6773 : .ignore_skip_hint = true,
6774 : .no_set_skip_hint = true,
6775 : .gfp_mask = current_gfp_context(gfp_mask),
6776 : .alloc_contig = true,
6777 : };
6778 : INIT_LIST_HEAD(&cc.migratepages);
6779 :
6780 : /*
6781 : * What we do here is we mark all pageblocks in range as
6782 : * MIGRATE_ISOLATE. Because pageblock and max order pages may
6783 : * have different sizes, and due to the way page allocator
6784 : * work, start_isolate_page_range() has special handlings for this.
6785 : *
6786 : * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6787 : * migrate the pages from an unaligned range (ie. pages that
6788 : * we are interested in). This will put all the pages in
6789 : * range back to page allocator as MIGRATE_ISOLATE.
6790 : *
6791 : * When this is done, we take the pages in range from page
6792 : * allocator removing them from the buddy system. This way
6793 : * page allocator will never consider using them.
6794 : *
6795 : * This lets us mark the pageblocks back as
6796 : * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6797 : * aligned range but not in the unaligned, original range are
6798 : * put back to page allocator so that buddy can use them.
6799 : */
6800 :
6801 : ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
6802 : if (ret)
6803 : goto done;
6804 :
6805 : drain_all_pages(cc.zone);
6806 :
6807 : /*
6808 : * In case of -EBUSY, we'd like to know which page causes the problem.
6809 : * So, just fall through. test_pages_isolated() has a tracepoint
6810 : * which will report the busy page.
6811 : *
6812 : * It is possible that busy pages could become available before
6813 : * the call to test_pages_isolated, and the range will actually be
6814 : * allocated. So, if we fall through, be sure to clear ret so that
6815 : * -EBUSY is not accidentally used or returned to the caller.
6816 : */
6817 : ret = __alloc_contig_migrate_range(&cc, start, end);
6818 : if (ret && ret != -EBUSY)
6819 : goto done;
6820 : ret = 0;
6821 :
6822 : /*
6823 : * Pages from [start, end) are within pageblock_nr_pages
6824 : * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6825 : * more, all pages in [start, end) are free in the page allocator.
6826 : * What we are going to do is allocate all pages from
6827 : * [start, end) (that is, remove them from the page allocator).
6828 : *
6829 : * The only problem is that pages at the beginning and at the
6830 : * end of the interesting range may not be aligned with the pages
6831 : * the page allocator holds, i.e. they can be part of higher-order
6832 : * pages. Because of this, we reserve the bigger range and,
6833 : * once this is done, free the pages we are not interested in.
6834 : *
6835 : * We don't have to hold zone->lock here because the pages are
6836 : * isolated and thus won't get removed from the buddy allocator.
6837 : */
6838 :
6839 : order = 0;
6840 : outer_start = start;
6841 : while (!PageBuddy(pfn_to_page(outer_start))) {
6842 : if (++order > MAX_ORDER) {
6843 : outer_start = start;
6844 : break;
6845 : }
6846 : outer_start &= ~0UL << order;
6847 : }
6848 :
6849 : if (outer_start != start) {
6850 : order = buddy_order(pfn_to_page(outer_start));
6851 :
6852 : /*
6853 : * outer_start page could be small order buddy page and
6854 : * it doesn't include start page. Adjust outer_start
6855 : * in this case to report failed page properly
6856 : * on tracepoint in test_pages_isolated()
6857 : */
6858 : if (outer_start + (1UL << order) <= start)
6859 : outer_start = start;
6860 : }
6861 :
6862 : /* Make sure the range is really isolated. */
6863 : if (test_pages_isolated(outer_start, end, 0)) {
6864 : ret = -EBUSY;
6865 : goto done;
6866 : }
6867 :
6868 : /* Grab isolated pages from freelists. */
6869 : outer_end = isolate_freepages_range(&cc, outer_start, end);
6870 : if (!outer_end) {
6871 : ret = -EBUSY;
6872 : goto done;
6873 : }
6874 :
6875 : /* Free head and tail (if any) */
6876 : if (start != outer_start)
6877 : free_contig_range(outer_start, start - outer_start);
6878 : if (end != outer_end)
6879 : free_contig_range(end, outer_end - end);
6880 :
6881 : done:
6882 : undo_isolate_page_range(start, end, migratetype);
6883 : return ret;
6884 : }
6885 : EXPORT_SYMBOL(alloc_contig_range);
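
/*
 * Minimal caller sketch (illustrative only, not part of this file): grab
 * 16MiB of physically contiguous movable memory by PFN and release it again.
 * demo_alloc_16m() and demo_start_pfn are made-up names; a real caller such
 * as CMA picks the PFN range from its reserved area instead.
 */
static int demo_alloc_16m(unsigned long demo_start_pfn)
{
	const unsigned long nr = (16UL << 20) >> PAGE_SHIFT;
	int ret;

	ret = alloc_contig_range(demo_start_pfn, demo_start_pfn + nr,
				 MIGRATE_MOVABLE, GFP_KERNEL);
	if (ret)
		return ret;	/* e.g. -EBUSY, -EINTR or -ENOMEM */

	/* ... use pfn_to_page(demo_start_pfn) .. pfn_to_page(demo_start_pfn + nr - 1) ... */

	free_contig_range(demo_start_pfn, nr);
	return 0;
}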
6886 :
6887 : static int __alloc_contig_pages(unsigned long start_pfn,
6888 : unsigned long nr_pages, gfp_t gfp_mask)
6889 : {
6890 : unsigned long end_pfn = start_pfn + nr_pages;
6891 :
6892 : return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
6893 : gfp_mask);
6894 : }
6895 :
6896 : static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
6897 : unsigned long nr_pages)
6898 : {
6899 : unsigned long i, end_pfn = start_pfn + nr_pages;
6900 : struct page *page;
6901 :
6902 : for (i = start_pfn; i < end_pfn; i++) {
6903 : page = pfn_to_online_page(i);
6904 : if (!page)
6905 : return false;
6906 :
6907 : if (page_zone(page) != z)
6908 : return false;
6909 :
6910 : if (PageReserved(page))
6911 : return false;
6912 :
6913 : if (PageHuge(page))
6914 : return false;
6915 : }
6916 : return true;
6917 : }
6918 :
6919 : static bool zone_spans_last_pfn(const struct zone *zone,
6920 : unsigned long start_pfn, unsigned long nr_pages)
6921 : {
6922 : unsigned long last_pfn = start_pfn + nr_pages - 1;
6923 :
6924 : return zone_spans_pfn(zone, last_pfn);
6925 : }
6926 :
6927 : /**
6928 : * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
6929 : * @nr_pages: Number of contiguous pages to allocate
6930 : * @gfp_mask: GFP mask to limit search and used during compaction
6931 : * @nid: Target node
6932 : * @nodemask: Mask for other possible nodes
6933 : *
6934 : * This routine is a wrapper around alloc_contig_range(). It scans over zones
6935 : * on an applicable zonelist to find a contiguous pfn range which can then be
6936 : * tried for allocation with alloc_contig_range(). This routine is intended
6937 : * for allocation requests which cannot be fulfilled with the buddy allocator.
6938 : *
6939 : * The allocated memory is always aligned to a page boundary. If nr_pages is a
6940 : * power of two, then the allocated range is also guaranteed to be aligned to
6941 : * nr_pages (e.g. a 1GB request would be aligned to 1GB).
6942 : *
6943 : * Allocated pages can be freed with free_contig_range() or by manually calling
6944 : * __free_page() on each allocated page.
6945 : *
6946 : * Return: pointer to contiguous pages on success, or NULL if not successful.
6947 : */
6948 : struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
6949 : int nid, nodemask_t *nodemask)
6950 : {
6951 : unsigned long ret, pfn, flags;
6952 : struct zonelist *zonelist;
6953 : struct zone *zone;
6954 : struct zoneref *z;
6955 :
6956 : zonelist = node_zonelist(nid, gfp_mask);
6957 : for_each_zone_zonelist_nodemask(zone, z, zonelist,
6958 : gfp_zone(gfp_mask), nodemask) {
6959 : spin_lock_irqsave(&zone->lock, flags);
6960 :
6961 : pfn = ALIGN(zone->zone_start_pfn, nr_pages);
6962 : while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
6963 : if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
6964 : /*
6965 : * We release the zone lock here because
6966 : * alloc_contig_range() will also lock the zone
6967 : * at some point. If there's an allocation
6968 : * spinning on this lock, it may win the race
6969 : * and cause alloc_contig_range() to fail...
6970 : */
6971 : spin_unlock_irqrestore(&zone->lock, flags);
6972 : ret = __alloc_contig_pages(pfn, nr_pages,
6973 : gfp_mask);
6974 : if (!ret)
6975 : return pfn_to_page(pfn);
6976 : spin_lock_irqsave(&zone->lock, flags);
6977 : }
6978 : pfn += nr_pages;
6979 : }
6980 : spin_unlock_irqrestore(&zone->lock, flags);
6981 : }
6982 : return NULL;
6983 : }
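
/*
 * Minimal caller sketch (illustrative only): let the allocator search the
 * local node for 1024 physically contiguous pages and free them again.
 * demo_contig_pages() is a made-up name.
 */
static void demo_contig_pages(void)
{
	struct page *page;

	page = alloc_contig_pages(1024, GFP_KERNEL | __GFP_NOWARN,
				  numa_node_id(), NULL);
	if (!page)
		return;

	/* ... use the 1024 pages starting at page_to_pfn(page) ... */

	free_contig_range(page_to_pfn(page), 1024);
}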
6984 : #endif /* CONFIG_CONTIG_ALLOC */
6985 :
6986 0 : void free_contig_range(unsigned long pfn, unsigned long nr_pages)
6987 : {
6988 0 : unsigned long count = 0;
6989 :
6990 0 : for (; nr_pages--; pfn++) {
6991 0 : struct page *page = pfn_to_page(pfn);
6992 :
6993 0 : count += page_count(page) != 1;
6994 0 : __free_page(page);
6995 : }
6996 0 : WARN(count != 0, "%lu pages are still in use!\n", count);
6997 0 : }
6998 : EXPORT_SYMBOL(free_contig_range);
6999 :
7000 : /*
7001 : * Effectively disable pcplists for the zone by setting the high limit to 0
7002 : * and draining all cpus. A concurrent page freeing on another CPU that's about
7003 : * to put the page on pcplist will either finish before the drain and the page
7004 : * will be drained, or observe the new high limit and skip the pcplist.
7005 : *
7006 : * Must be paired with a call to zone_pcp_enable().
7007 : */
7008 0 : void zone_pcp_disable(struct zone *zone)
7009 : {
7010 0 : mutex_lock(&pcp_batch_high_lock);
7011 0 : __zone_set_pageset_high_and_batch(zone, 0, 1);
7012 0 : __drain_all_pages(zone, true);
7013 0 : }
7014 :
7015 0 : void zone_pcp_enable(struct zone *zone)
7016 : {
7017 0 : __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
7018 0 : mutex_unlock(&pcp_batch_high_lock);
7019 0 : }
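
/*
 * Pairing sketch (illustrative only): a caller such as memory offlining
 * brackets work that must not race with pcplists like this;
 * demo_with_pcp_disabled() is a made-up name.
 */
static void demo_with_pcp_disabled(struct zone *zone)
{
	zone_pcp_disable(zone);		/* high/batch -> 0/1, pcplists drained */

	/* ... isolate or offline pages of the zone ... */

	zone_pcp_enable(zone);		/* restore pcp->high/batch, drop the lock */
}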
7020 :
7021 0 : void zone_pcp_reset(struct zone *zone)
7022 : {
7023 : int cpu;
7024 : struct per_cpu_zonestat *pzstats;
7025 :
7026 0 : if (zone->per_cpu_pageset != &boot_pageset) {
7027 : for_each_online_cpu(cpu) {
7028 : pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
7029 : drain_zonestat(zone, pzstats);
7030 : }
7031 0 : free_percpu(zone->per_cpu_pageset);
7032 0 : zone->per_cpu_pageset = &boot_pageset;
7033 0 : if (zone->per_cpu_zonestats != &boot_zonestats) {
7034 0 : free_percpu(zone->per_cpu_zonestats);
7035 0 : zone->per_cpu_zonestats = &boot_zonestats;
7036 : }
7037 : }
7038 0 : }
7039 :
7040 : #ifdef CONFIG_MEMORY_HOTREMOVE
7041 : /*
7042 : * The range must be within a single zone, must not contain holes, must span
7043 : * full sections, and all its pages must be isolated before calling this function.
7044 : */
7045 : void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
7046 : {
7047 : unsigned long pfn = start_pfn;
7048 : struct page *page;
7049 : struct zone *zone;
7050 : unsigned int order;
7051 : unsigned long flags;
7052 :
7053 : offline_mem_sections(pfn, end_pfn);
7054 : zone = page_zone(pfn_to_page(pfn));
7055 : spin_lock_irqsave(&zone->lock, flags);
7056 : while (pfn < end_pfn) {
7057 : page = pfn_to_page(pfn);
7058 : /*
7059 : * The HWPoisoned page may not be in the buddy system, and
7060 : * its page_count() is not 0.
7061 : */
7062 : if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
7063 : pfn++;
7064 : continue;
7065 : }
7066 : /*
7067 : * At this point all remaining PageOffline() pages have a
7068 : * reference count of 0 and can simply be skipped.
7069 : */
7070 : if (PageOffline(page)) {
7071 : BUG_ON(page_count(page));
7072 : BUG_ON(PageBuddy(page));
7073 : pfn++;
7074 : continue;
7075 : }
7076 :
7077 : BUG_ON(page_count(page));
7078 : BUG_ON(!PageBuddy(page));
7079 : order = buddy_order(page);
7080 : del_page_from_free_list(page, zone, order);
7081 : pfn += (1 << order);
7082 : }
7083 : spin_unlock_irqrestore(&zone->lock, flags);
7084 : }
7085 : #endif
7086 :
7087 : /*
7088 : * This function returns a stable result only if called under zone lock.
7089 : */
7090 0 : bool is_free_buddy_page(struct page *page)
7091 : {
7092 0 : unsigned long pfn = page_to_pfn(page);
7093 : unsigned int order;
7094 :
7095 0 : for (order = 0; order <= MAX_ORDER; order++) {
7096 0 : struct page *page_head = page - (pfn & ((1 << order) - 1));
7097 :
7098 0 : if (PageBuddy(page_head) &&
7099 0 : buddy_order_unsafe(page_head) >= order)
7100 : break;
7101 : }
7102 :
7103 0 : return order <= MAX_ORDER;
7104 : }
7105 : EXPORT_SYMBOL(is_free_buddy_page);
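
/*
 * Usage sketch (illustrative only): per the comment above, hold zone->lock
 * if a stable answer is required; demo_page_is_free() is a made-up name.
 */
static bool demo_page_is_free(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;
	bool free;

	spin_lock_irqsave(&zone->lock, flags);
	free = is_free_buddy_page(page);
	spin_unlock_irqrestore(&zone->lock, flags);

	return free;
}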
7106 :
7107 : #ifdef CONFIG_MEMORY_FAILURE
7108 : /*
7109 : * Break down a higher-order page into sub-pages, and keep our target out of
7110 : * the buddy allocator.
7111 : */
7112 : static void break_down_buddy_pages(struct zone *zone, struct page *page,
7113 : struct page *target, int low, int high,
7114 : int migratetype)
7115 : {
7116 : unsigned long size = 1 << high;
7117 : struct page *current_buddy, *next_page;
7118 :
7119 : while (high > low) {
7120 : high--;
7121 : size >>= 1;
7122 :
7123 : if (target >= &page[size]) {
7124 : next_page = page + size;
7125 : current_buddy = page;
7126 : } else {
7127 : next_page = page;
7128 : current_buddy = page + size;
7129 : }
7130 :
7131 : if (set_page_guard(zone, current_buddy, high, migratetype))
7132 : continue;
7133 :
7134 : if (current_buddy != target) {
7135 : add_to_free_list(current_buddy, zone, high, migratetype);
7136 : set_buddy_order(current_buddy, high);
7137 : page = next_page;
7138 : }
7139 : }
7140 : }
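
/*
 * Worked example (illustrative): high = 2, low = 0, target = third page of
 * the order-2 block. The first pass puts the lower order-1 buddy (pages 0-1)
 * back on the free list and descends into the upper half; the second pass
 * puts page 3 back as an order-0 page. The target (page 2) is never added
 * to a free list.
 */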
7141 :
7142 : /*
7143 : * Take a page that will be marked as poisoned off the buddy allocator.
7144 : */
7145 : bool take_page_off_buddy(struct page *page)
7146 : {
7147 : struct zone *zone = page_zone(page);
7148 : unsigned long pfn = page_to_pfn(page);
7149 : unsigned long flags;
7150 : unsigned int order;
7151 : bool ret = false;
7152 :
7153 : spin_lock_irqsave(&zone->lock, flags);
7154 : for (order = 0; order <= MAX_ORDER; order++) {
7155 : struct page *page_head = page - (pfn & ((1 << order) - 1));
7156 : int page_order = buddy_order(page_head);
7157 :
7158 : if (PageBuddy(page_head) && page_order >= order) {
7159 : unsigned long pfn_head = page_to_pfn(page_head);
7160 : int migratetype = get_pfnblock_migratetype(page_head,
7161 : pfn_head);
7162 :
7163 : del_page_from_free_list(page_head, zone, page_order);
7164 : break_down_buddy_pages(zone, page_head, page, 0,
7165 : page_order, migratetype);
7166 : SetPageHWPoisonTakenOff(page);
7167 : if (!is_migrate_isolate(migratetype))
7168 : __mod_zone_freepage_state(zone, -1, migratetype);
7169 : ret = true;
7170 : break;
7171 : }
7172 : if (page_count(page_head) > 0)
7173 : break;
7174 : }
7175 : spin_unlock_irqrestore(&zone->lock, flags);
7176 : return ret;
7177 : }
7178 :
7179 : /*
7180 : * Cancel takeoff done by take_page_off_buddy().
7181 : */
7182 : bool put_page_back_buddy(struct page *page)
7183 : {
7184 : struct zone *zone = page_zone(page);
7185 : unsigned long pfn = page_to_pfn(page);
7186 : unsigned long flags;
7187 : int migratetype = get_pfnblock_migratetype(page, pfn);
7188 : bool ret = false;
7189 :
7190 : spin_lock_irqsave(&zone->lock, flags);
7191 : if (put_page_testzero(page)) {
7192 : ClearPageHWPoisonTakenOff(page);
7193 : __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
7194 : if (TestClearPageHWPoison(page)) {
7195 : ret = true;
7196 : }
7197 : }
7198 : spin_unlock_irqrestore(&zone->lock, flags);
7199 :
7200 : return ret;
7201 : }
7202 : #endif
7203 :
7204 : #ifdef CONFIG_ZONE_DMA
7205 : bool has_managed_dma(void)
7206 : {
7207 : struct pglist_data *pgdat;
7208 :
7209 : for_each_online_pgdat(pgdat) {
7210 : struct zone *zone = &pgdat->node_zones[ZONE_DMA];
7211 :
7212 : if (managed_zone(zone))
7213 : return true;
7214 : }
7215 : return false;
7216 : }
7217 : #endif /* CONFIG_ZONE_DMA */