Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/mm/page_alloc.c
4 : *
5 : * Manages the free list; the system allocates free pages here.
6 : * Note that kmalloc() lives in slab.c
7 : *
8 : * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
9 : * Swap reorganised 29.12.95, Stephen Tweedie
10 : * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 : * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
12 : * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
13 : * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
14 : * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
15 : * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
16 : */
17 :
18 : #include <linux/stddef.h>
19 : #include <linux/mm.h>
20 : #include <linux/highmem.h>
21 : #include <linux/swap.h>
22 : #include <linux/swapops.h>
23 : #include <linux/interrupt.h>
24 : #include <linux/pagemap.h>
25 : #include <linux/jiffies.h>
26 : #include <linux/memblock.h>
27 : #include <linux/compiler.h>
28 : #include <linux/kernel.h>
29 : #include <linux/kasan.h>
30 : #include <linux/kmsan.h>
31 : #include <linux/module.h>
32 : #include <linux/suspend.h>
33 : #include <linux/pagevec.h>
34 : #include <linux/blkdev.h>
35 : #include <linux/slab.h>
36 : #include <linux/ratelimit.h>
37 : #include <linux/oom.h>
38 : #include <linux/topology.h>
39 : #include <linux/sysctl.h>
40 : #include <linux/cpu.h>
41 : #include <linux/cpuset.h>
42 : #include <linux/memory_hotplug.h>
43 : #include <linux/nodemask.h>
44 : #include <linux/vmalloc.h>
45 : #include <linux/vmstat.h>
46 : #include <linux/mempolicy.h>
47 : #include <linux/memremap.h>
48 : #include <linux/stop_machine.h>
49 : #include <linux/random.h>
50 : #include <linux/sort.h>
51 : #include <linux/pfn.h>
52 : #include <linux/backing-dev.h>
53 : #include <linux/fault-inject.h>
54 : #include <linux/page-isolation.h>
55 : #include <linux/debugobjects.h>
56 : #include <linux/kmemleak.h>
57 : #include <linux/compaction.h>
58 : #include <trace/events/kmem.h>
59 : #include <trace/events/oom.h>
60 : #include <linux/prefetch.h>
61 : #include <linux/mm_inline.h>
62 : #include <linux/mmu_notifier.h>
63 : #include <linux/migrate.h>
64 : #include <linux/hugetlb.h>
65 : #include <linux/sched/rt.h>
66 : #include <linux/sched/mm.h>
67 : #include <linux/page_owner.h>
68 : #include <linux/page_table_check.h>
69 : #include <linux/kthread.h>
70 : #include <linux/memcontrol.h>
71 : #include <linux/ftrace.h>
72 : #include <linux/lockdep.h>
73 : #include <linux/nmi.h>
74 : #include <linux/psi.h>
75 : #include <linux/padata.h>
76 : #include <linux/khugepaged.h>
77 : #include <linux/buffer_head.h>
78 : #include <linux/delayacct.h>
79 : #include <asm/sections.h>
80 : #include <asm/tlbflush.h>
81 : #include <asm/div64.h>
82 : #include "internal.h"
83 : #include "shuffle.h"
84 : #include "page_reporting.h"
85 : #include "swap.h"
86 :
87 : /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
88 : typedef int __bitwise fpi_t;
89 :
90 : /* No special request */
91 : #define FPI_NONE ((__force fpi_t)0)
92 :
93 : /*
94 : * Skip free page reporting notification for the (possibly merged) page.
95 : * This does not hinder free page reporting from grabbing the page,
96 : * reporting it and marking it "reported" - it only skips notifying
97 : * the free page reporting infrastructure about a newly freed page. For
98 : * example, used when temporarily pulling a page from a freelist and
99 : * putting it back unmodified.
100 : */
101 : #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
102 :
103 : /*
104 : * Place the (possibly merged) page at the tail of the freelist. Will ignore
105 : * page shuffling (relevant code - e.g., memory onlining - is expected to
106 : * shuffle the whole zone).
107 : *
108 : * Note: No code should rely on this flag for correctness - it's purely
109 : * to allow for optimizations when handing back either fresh pages
110 : * (memory onlining) or untouched pages (page isolation, free page
111 : * reporting).
112 : */
113 : #define FPI_TO_TAIL ((__force fpi_t)BIT(1))
114 :
115 : /*
116 : * Don't poison memory with KASAN (only for the tag-based modes).
117 : * During boot, all non-reserved memblock memory is exposed to page_alloc.
118 : * Poisoning all that memory lengthens boot time, especially on systems with
119 : * a large amount of RAM. This flag is used to skip that poisoning.
120 : * This is only done for the tag-based KASAN modes, as those are able to
121 : * detect memory corruptions with the memory tags assigned by default.
122 : * All memory allocated normally after boot gets poisoned as usual.
123 : */
124 : #define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
125 :
126 : /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
127 : static DEFINE_MUTEX(pcp_batch_high_lock);
128 : #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
129 :
130 : #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
131 : /*
132 : * On SMP, spin_trylock is sufficient protection.
133 : * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
134 : */
135 : #define pcp_trylock_prepare(flags) do { } while (0)
136 : #define pcp_trylock_finish(flag) do { } while (0)
137 : #else
138 :
139 : /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
140 : #define pcp_trylock_prepare(flags) local_irq_save(flags)
141 : #define pcp_trylock_finish(flags) local_irq_restore(flags)
142 : #endif
143 :
144 : /*
145 : * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
146 : * a migration causing the wrong PCP to be locked and remote memory being
147 : * potentially allocated, pin the task to the CPU for the lookup+lock.
148 : * preempt_disable is used on !RT because it is faster than migrate_disable.
149 : * migrate_disable is used on RT because otherwise RT spinlock usage is
150 : * interfered with and a high priority task cannot preempt the allocator.
151 : */
152 : #ifndef CONFIG_PREEMPT_RT
153 : #define pcpu_task_pin() preempt_disable()
154 : #define pcpu_task_unpin() preempt_enable()
155 : #else
156 : #define pcpu_task_pin() migrate_disable()
157 : #define pcpu_task_unpin() migrate_enable()
158 : #endif
159 :
160 : /*
161 : * Generic helper to look up and lock a per-cpu variable with an embedded spinlock.
162 : * Return value should be used with equivalent unlock helper.
163 : */
164 : #define pcpu_spin_lock(type, member, ptr) \
165 : ({ \
166 : type *_ret; \
167 : pcpu_task_pin(); \
168 : _ret = this_cpu_ptr(ptr); \
169 : spin_lock(&_ret->member); \
170 : _ret; \
171 : })
172 :
173 : #define pcpu_spin_trylock(type, member, ptr) \
174 : ({ \
175 : type *_ret; \
176 : pcpu_task_pin(); \
177 : _ret = this_cpu_ptr(ptr); \
178 : if (!spin_trylock(&_ret->member)) { \
179 : pcpu_task_unpin(); \
180 : _ret = NULL; \
181 : } \
182 : _ret; \
183 : })
184 :
185 : #define pcpu_spin_unlock(member, ptr) \
186 : ({ \
187 : spin_unlock(&ptr->member); \
188 : pcpu_task_unpin(); \
189 : })
190 :
191 : /* struct per_cpu_pages specific helpers. */
192 : #define pcp_spin_lock(ptr) \
193 : pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
194 :
195 : #define pcp_spin_trylock(ptr) \
196 : pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
197 :
198 : #define pcp_spin_unlock(ptr) \
199 : pcpu_spin_unlock(lock, ptr)
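As a rough userspace analogue (not kernel code; pcp_demo and its helpers are made-up names), the trylock variant above returns the locked structure or NULL so callers can fall back when the lock is contended. A POSIX-spinlock sketch of that calling convention:

	#include <pthread.h>
	#include <stdio.h>

	struct pcp_demo {			/* stand-in for struct per_cpu_pages */
		pthread_spinlock_t lock;
		int count;
	};

	/* Mirrors pcpu_spin_trylock(): return the locked structure, or NULL on contention. */
	static struct pcp_demo *pcp_demo_trylock(struct pcp_demo *pcp)
	{
		if (pthread_spin_trylock(&pcp->lock))
			return NULL;		/* caller falls back to a slower path */
		return pcp;
	}

	static void pcp_demo_unlock(struct pcp_demo *pcp)
	{
		pthread_spin_unlock(&pcp->lock);
	}

	int main(void)
	{
		struct pcp_demo demo = { .count = 0 };
		struct pcp_demo *locked;

		pthread_spin_init(&demo.lock, PTHREAD_PROCESS_PRIVATE);
		locked = pcp_demo_trylock(&demo);
		if (locked) {
			locked->count++;	/* critical section */
			pcp_demo_unlock(locked);
		}
		printf("count=%d\n", demo.count);
		pthread_spin_destroy(&demo.lock);
		return 0;
	}

The task pinning and per-CPU lookup done by pcpu_task_pin()/this_cpu_ptr() have no userspace equivalent here; only the lock/unlock pairing and NULL-on-failure convention carry over.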
200 :
201 : #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
202 : DEFINE_PER_CPU(int, numa_node);
203 : EXPORT_PER_CPU_SYMBOL(numa_node);
204 : #endif
205 :
206 : DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
207 :
208 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
209 : /*
210 : * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
211 : * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
212 : * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
213 : * defined in <linux/topology.h>.
214 : */
215 : DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
216 : EXPORT_PER_CPU_SYMBOL(_numa_mem_);
217 : #endif
218 :
219 : static DEFINE_MUTEX(pcpu_drain_mutex);
220 :
221 : #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
222 : volatile unsigned long latent_entropy __latent_entropy;
223 : EXPORT_SYMBOL(latent_entropy);
224 : #endif
225 :
226 : /*
227 : * Array of node states.
228 : */
229 : nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
230 : [N_POSSIBLE] = NODE_MASK_ALL,
231 : [N_ONLINE] = { { [0] = 1UL } },
232 : #ifndef CONFIG_NUMA
233 : [N_NORMAL_MEMORY] = { { [0] = 1UL } },
234 : #ifdef CONFIG_HIGHMEM
235 : [N_HIGH_MEMORY] = { { [0] = 1UL } },
236 : #endif
237 : [N_MEMORY] = { { [0] = 1UL } },
238 : [N_CPU] = { { [0] = 1UL } },
239 : #endif /* NUMA */
240 : };
241 : EXPORT_SYMBOL(node_states);
242 :
243 : atomic_long_t _totalram_pages __read_mostly;
244 : EXPORT_SYMBOL(_totalram_pages);
245 : unsigned long totalreserve_pages __read_mostly;
246 : unsigned long totalcma_pages __read_mostly;
247 :
248 : int percpu_pagelist_high_fraction;
249 : gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
250 : DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
251 : EXPORT_SYMBOL(init_on_alloc);
252 :
253 : DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
254 : EXPORT_SYMBOL(init_on_free);
255 :
256 : static bool _init_on_alloc_enabled_early __read_mostly
257 : = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
258 0 : static int __init early_init_on_alloc(char *buf)
259 : {
260 :
261 0 : return kstrtobool(buf, &_init_on_alloc_enabled_early);
262 : }
263 : early_param("init_on_alloc", early_init_on_alloc);
264 :
265 : static bool _init_on_free_enabled_early __read_mostly
266 : = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
267 0 : static int __init early_init_on_free(char *buf)
268 : {
269 0 : return kstrtobool(buf, &_init_on_free_enabled_early);
270 : }
271 : early_param("init_on_free", early_init_on_free);
272 :
273 : /*
274 : * A cached value of the page's pageblock's migratetype, used when the page is
275 : * put on a pcplist. Used to avoid the pageblock migratetype lookup when
276 : * freeing from pcplists in most cases, at the cost of possibly becoming stale.
277 : * Also the migratetype set in the page does not necessarily match the pcplist
278 : * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
279 : * other index - this ensures that it will be put on the correct CMA freelist.
280 : */
281 : static inline int get_pcppage_migratetype(struct page *page)
282 : {
283 55618 : return page->index;
284 : }
285 :
286 : static inline void set_pcppage_migratetype(struct page *page, int migratetype)
287 : {
288 56894 : page->index = migratetype;
289 : }
290 :
291 : #ifdef CONFIG_PM_SLEEP
292 : /*
293 : * The following functions are used by the suspend/hibernate code to temporarily
294 : * change gfp_allowed_mask in order to avoid using I/O during memory allocations
295 : * while devices are suspended. To avoid races with the suspend/hibernate code,
296 : * they should always be called with system_transition_mutex held
297 : * (gfp_allowed_mask also should only be modified with system_transition_mutex
298 : * held, unless the suspend/hibernate code is guaranteed not to run in parallel
299 : * with that modification).
300 : */
301 :
302 : static gfp_t saved_gfp_mask;
303 :
304 0 : void pm_restore_gfp_mask(void)
305 : {
306 0 : WARN_ON(!mutex_is_locked(&system_transition_mutex));
307 0 : if (saved_gfp_mask) {
308 0 : gfp_allowed_mask = saved_gfp_mask;
309 0 : saved_gfp_mask = 0;
310 : }
311 0 : }
312 :
313 0 : void pm_restrict_gfp_mask(void)
314 : {
315 0 : WARN_ON(!mutex_is_locked(&system_transition_mutex));
316 0 : WARN_ON(saved_gfp_mask);
317 0 : saved_gfp_mask = gfp_allowed_mask;
318 0 : gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
319 0 : }
320 :
321 0 : bool pm_suspended_storage(void)
322 : {
323 0 : if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
324 : return false;
325 0 : return true;
326 : }
327 : #endif /* CONFIG_PM_SLEEP */
328 :
329 : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
330 : unsigned int pageblock_order __read_mostly;
331 : #endif
332 :
333 : static void __free_pages_ok(struct page *page, unsigned int order,
334 : fpi_t fpi_flags);
335 :
336 : /*
337 : * results with 256, 32 in the lowmem_reserve sysctl:
338 : * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
339 : * 1G machine -> (16M dma, 784M normal, 224M high)
340 : * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
341 : * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
342 : * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
343 : *
344 : * TBD: should special case ZONE_DMA32 machines here - in those we normally
345 : * don't need any ZONE_NORMAL reservation
346 : */
347 : int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
348 : #ifdef CONFIG_ZONE_DMA
349 : [ZONE_DMA] = 256,
350 : #endif
351 : #ifdef CONFIG_ZONE_DMA32
352 : [ZONE_DMA32] = 256,
353 : #endif
354 : [ZONE_NORMAL] = 32,
355 : #ifdef CONFIG_HIGHMEM
356 : [ZONE_HIGHMEM] = 0,
357 : #endif
358 : [ZONE_MOVABLE] = 0,
359 : };
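To make the ratios above concrete, here is a back-of-the-envelope restatement of the comment's arithmetic (the kernel derives the actual per-zone reserves from these ratios at boot, in setup_per_zone_lowmem_reserve(); this sketch only reproduces the example figures):

	#include <stdio.h>

	int main(void)
	{
		/* Illustrative figures from the comment above, in MiB. */
		unsigned long normal_mib = 784, highmem_mib = 224;
		unsigned long dma_ratio = 256, normal_ratio = 32;

		/* A NORMAL allocation leaves 784M/256 reserved in ZONE_DMA. */
		printf("DMA reserve vs NORMAL:     %lu MiB\n", normal_mib / dma_ratio);
		/* A HIGHMEM allocation leaves 224M/32 reserved in ZONE_NORMAL. */
		printf("NORMAL reserve vs HIGHMEM: %lu MiB\n", highmem_mib / normal_ratio);
		/* A HIGHMEM allocation leaves (224M+784M)/256 reserved in ZONE_DMA. */
		printf("DMA reserve vs HIGHMEM:    %lu MiB\n",
		       (highmem_mib + normal_mib) / dma_ratio);
		return 0;
	}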
360 :
361 : static char * const zone_names[MAX_NR_ZONES] = {
362 : #ifdef CONFIG_ZONE_DMA
363 : "DMA",
364 : #endif
365 : #ifdef CONFIG_ZONE_DMA32
366 : "DMA32",
367 : #endif
368 : "Normal",
369 : #ifdef CONFIG_HIGHMEM
370 : "HighMem",
371 : #endif
372 : "Movable",
373 : #ifdef CONFIG_ZONE_DEVICE
374 : "Device",
375 : #endif
376 : };
377 :
378 : const char * const migratetype_names[MIGRATE_TYPES] = {
379 : "Unmovable",
380 : "Movable",
381 : "Reclaimable",
382 : "HighAtomic",
383 : #ifdef CONFIG_CMA
384 : "CMA",
385 : #endif
386 : #ifdef CONFIG_MEMORY_ISOLATION
387 : "Isolate",
388 : #endif
389 : };
390 :
391 : compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
392 : [NULL_COMPOUND_DTOR] = NULL,
393 : [COMPOUND_PAGE_DTOR] = free_compound_page,
394 : #ifdef CONFIG_HUGETLB_PAGE
395 : [HUGETLB_PAGE_DTOR] = free_huge_page,
396 : #endif
397 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
398 : [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
399 : #endif
400 : };
401 :
402 : int min_free_kbytes = 1024;
403 : int user_min_free_kbytes = -1;
404 : int watermark_boost_factor __read_mostly = 15000;
405 : int watermark_scale_factor = 10;
406 :
407 : static unsigned long nr_kernel_pages __initdata;
408 : static unsigned long nr_all_pages __initdata;
409 : static unsigned long dma_reserve __initdata;
410 :
411 : static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
412 : static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
413 : static unsigned long required_kernelcore __initdata;
414 : static unsigned long required_kernelcore_percent __initdata;
415 : static unsigned long required_movablecore __initdata;
416 : static unsigned long required_movablecore_percent __initdata;
417 : static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
418 : bool mirrored_kernelcore __initdata_memblock;
419 :
420 : /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
421 : int movable_zone;
422 : EXPORT_SYMBOL(movable_zone);
423 :
424 : #if MAX_NUMNODES > 1
425 : unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
426 : unsigned int nr_online_nodes __read_mostly = 1;
427 : EXPORT_SYMBOL(nr_node_ids);
428 : EXPORT_SYMBOL(nr_online_nodes);
429 : #endif
430 :
431 : int page_group_by_mobility_disabled __read_mostly;
432 :
433 : bool deferred_struct_pages __meminitdata;
434 :
435 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
436 : /*
437 : * During boot we initialize deferred pages on-demand, as needed, but once
438 : * page_alloc_init_late() has finished, the deferred pages are all initialized,
439 : * and we can permanently disable that path.
440 : */
441 : static DEFINE_STATIC_KEY_TRUE(deferred_pages);
442 :
443 : static inline bool deferred_pages_enabled(void)
444 : {
445 : return static_branch_unlikely(&deferred_pages);
446 : }
447 :
448 : /* Returns true if the struct page for the pfn is initialised */
449 : static inline bool __meminit early_page_initialised(unsigned long pfn)
450 : {
451 : int nid = early_pfn_to_nid(pfn);
452 :
453 : if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
454 : return false;
455 :
456 : return true;
457 : }
458 :
459 : /*
460 : * Returns true when the remaining initialisation should be deferred until
461 : * later in the boot cycle when it can be parallelised.
462 : */
463 : static bool __meminit
464 : defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
465 : {
466 : static unsigned long prev_end_pfn, nr_initialised;
467 :
468 : if (early_page_ext_enabled())
469 : return false;
470 : /*
471 : * The prev_end_pfn static contains the end of the previous zone.
472 : * No need to protect it because this is called very early in boot, before smp_init.
473 : */
474 : if (prev_end_pfn != end_pfn) {
475 : prev_end_pfn = end_pfn;
476 : nr_initialised = 0;
477 : }
478 :
479 : /* Always populate low zones for address-constrained allocations */
480 : if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
481 : return false;
482 :
483 : if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
484 : return true;
485 : /*
486 : * We start with only one section of pages; more pages are added as
487 : * needed until the rest of the deferred pages are initialized.
488 : */
489 : nr_initialised++;
490 : if ((nr_initialised > PAGES_PER_SECTION) &&
491 : (pfn & (PAGES_PER_SECTION - 1)) == 0) {
492 : NODE_DATA(nid)->first_deferred_pfn = pfn;
493 : return true;
494 : }
495 : return false;
496 : }
497 : #else
498 : static inline bool deferred_pages_enabled(void)
499 : {
500 : return false;
501 : }
502 :
503 : static inline bool early_page_initialised(unsigned long pfn)
504 : {
505 : return true;
506 : }
507 :
508 : static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
509 : {
510 : return false;
511 : }
512 : #endif
513 :
514 : /* Return a pointer to the bitmap storing bits affecting a block of pages */
515 : static inline unsigned long *get_pageblock_bitmap(const struct page *page,
516 : unsigned long pfn)
517 : {
518 : #ifdef CONFIG_SPARSEMEM
519 : return section_to_usemap(__pfn_to_section(pfn));
520 : #else
521 51168 : return page_zone(page)->pageblock_flags;
522 : #endif /* CONFIG_SPARSEMEM */
523 : }
524 :
525 : static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
526 : {
527 : #ifdef CONFIG_SPARSEMEM
528 : pfn &= (PAGES_PER_SECTION-1);
529 : #else
530 51168 : pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
531 : #endif /* CONFIG_SPARSEMEM */
532 51168 : return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
533 : }
534 :
535 : static __always_inline
536 : unsigned long __get_pfnblock_flags_mask(const struct page *page,
537 : unsigned long pfn,
538 : unsigned long mask)
539 : {
540 : unsigned long *bitmap;
541 : unsigned long bitidx, word_bitidx;
542 : unsigned long word;
543 :
544 101812 : bitmap = get_pageblock_bitmap(page, pfn);
545 50906 : bitidx = pfn_to_bitidx(page, pfn);
546 50906 : word_bitidx = bitidx / BITS_PER_LONG;
547 50906 : bitidx &= (BITS_PER_LONG-1);
548 : /*
549 : * This races, without locks, with set_pfnblock_flags_mask(). Ensure
550 : * a consistent read of the memory array, so that results, even though
551 : * racy, are not corrupted.
552 : */
553 50906 : word = READ_ONCE(bitmap[word_bitidx]);
554 50906 : return (word >> bitidx) & mask;
555 : }
556 :
557 : /**
558 : * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
559 : * @page: The page within the block of interest
560 : * @pfn: The target page frame number
561 : * @mask: mask of bits that the caller is interested in
562 : *
563 : * Return: pageblock_bits flags
564 : */
565 0 : unsigned long get_pfnblock_flags_mask(const struct page *page,
566 : unsigned long pfn, unsigned long mask)
567 : {
568 5 : return __get_pfnblock_flags_mask(page, pfn, mask);
569 : }
570 :
571 : static __always_inline int get_pfnblock_migratetype(const struct page *page,
572 : unsigned long pfn)
573 : {
574 50901 : return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
575 : }
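A self-contained sketch of the indexing arithmetic used by __get_pfnblock_flags_mask() above: each pageblock owns NR_PAGEBLOCK_BITS (4) consecutive bits in a flat bitmap, and the pfn is reduced to a word index plus a bit offset inside that word. The pageblock order, bitmap size and migratetype value below are made-up demo values:

	#include <stdio.h>

	#define DEMO_PAGEBLOCK_ORDER	9	/* assumed: 512 pages per pageblock */
	#define NR_PAGEBLOCK_BITS	4
	#define BITS_PER_LONG		(8 * sizeof(unsigned long))

	int main(void)
	{
		unsigned long bitmap[4] = { 0 };	/* flat per-zone pageblock bitmap */
		unsigned long pfn = 5 * 512 + 17;	/* any pfn inside pageblock 5 */
		unsigned long mask = 0x7;		/* low bits hold the migratetype */
		unsigned long bitidx, word_bitidx;

		/* Pageblock 5 owns bits 20..23 of word 0; store migratetype 2 there. */
		bitmap[0] = 0x2UL << 20;

		bitidx = (pfn >> DEMO_PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
		word_bitidx = bitidx / BITS_PER_LONG;
		bitidx &= BITS_PER_LONG - 1;

		printf("migratetype of pageblock = %lu\n",
		       (bitmap[word_bitidx] >> bitidx) & mask);	/* prints 2 */
		return 0;
	}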
576 :
577 : /**
578 : * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
579 : * @page: The page within the block of interest
580 : * @flags: The flags to set
581 : * @pfn: The target page frame number
582 : * @mask: mask of bits that the caller is interested in
583 : */
584 262 : void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
585 : unsigned long pfn,
586 : unsigned long mask)
587 : {
588 : unsigned long *bitmap;
589 : unsigned long bitidx, word_bitidx;
590 : unsigned long word;
591 :
592 : BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
593 : BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
594 :
595 524 : bitmap = get_pageblock_bitmap(page, pfn);
596 262 : bitidx = pfn_to_bitidx(page, pfn);
597 262 : word_bitidx = bitidx / BITS_PER_LONG;
598 262 : bitidx &= (BITS_PER_LONG-1);
599 :
600 : VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
601 :
602 262 : mask <<= bitidx;
603 262 : flags <<= bitidx;
604 :
605 262 : word = READ_ONCE(bitmap[word_bitidx]);
606 : do {
607 786 : } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
608 262 : }
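The try_cmpxchg() loop above is a classic lock-free read-modify-write of one bitmap word; a userspace C11 analogue (field position and values are illustrative) in which the compare-exchange refreshes the stale old value on failure and the loop retries:

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic unsigned long shared_word;	/* stands in for bitmap[word_bitidx] */

	/* Replace the bits selected by 'mask' (shifted to 'bitidx') with 'flags'. */
	static void set_field(unsigned int bitidx, unsigned long flags, unsigned long mask)
	{
		unsigned long old = atomic_load(&shared_word);

		mask <<= bitidx;
		flags <<= bitidx;
		/* On failure, 'old' is reloaded with the current value and we retry. */
		while (!atomic_compare_exchange_weak(&shared_word, &old,
						     (old & ~mask) | flags))
			;
	}

	int main(void)
	{
		set_field(20, 0x2, 0x7);
		printf("word = 0x%lx\n", (unsigned long)atomic_load(&shared_word));
		return 0;
	}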
609 :
610 262 : void set_pageblock_migratetype(struct page *page, int migratetype)
611 : {
612 262 : if (unlikely(page_group_by_mobility_disabled &&
613 : migratetype < MIGRATE_PCPTYPES))
614 0 : migratetype = MIGRATE_UNMOVABLE;
615 :
616 262 : set_pfnblock_flags_mask(page, (unsigned long)migratetype,
617 262 : page_to_pfn(page), MIGRATETYPE_MASK);
618 262 : }
619 :
620 : #ifdef CONFIG_DEBUG_VM
621 : static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
622 : {
623 : int ret = 0;
624 : unsigned seq;
625 : unsigned long pfn = page_to_pfn(page);
626 : unsigned long sp, start_pfn;
627 :
628 : do {
629 : seq = zone_span_seqbegin(zone);
630 : start_pfn = zone->zone_start_pfn;
631 : sp = zone->spanned_pages;
632 : if (!zone_spans_pfn(zone, pfn))
633 : ret = 1;
634 : } while (zone_span_seqretry(zone, seq));
635 :
636 : if (ret)
637 : pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
638 : pfn, zone_to_nid(zone), zone->name,
639 : start_pfn, start_pfn + sp);
640 :
641 : return ret;
642 : }
643 :
644 : static int page_is_consistent(struct zone *zone, struct page *page)
645 : {
646 : if (zone != page_zone(page))
647 : return 0;
648 :
649 : return 1;
650 : }
651 : /*
652 : * Temporary debugging check for pages not lying within a given zone.
653 : */
654 : static int __maybe_unused bad_range(struct zone *zone, struct page *page)
655 : {
656 : if (page_outside_zone_boundaries(zone, page))
657 : return 1;
658 : if (!page_is_consistent(zone, page))
659 : return 1;
660 :
661 : return 0;
662 : }
663 : #else
664 : static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
665 : {
666 : return 0;
667 : }
668 : #endif
669 :
670 0 : static void bad_page(struct page *page, const char *reason)
671 : {
672 : static unsigned long resume;
673 : static unsigned long nr_shown;
674 : static unsigned long nr_unshown;
675 :
676 : /*
677 : * Allow a burst of 60 reports, then keep quiet for that minute;
678 : * or allow a steady drip of one report per second.
679 : */
680 0 : if (nr_shown == 60) {
681 0 : if (time_before(jiffies, resume)) {
682 0 : nr_unshown++;
683 0 : goto out;
684 : }
685 0 : if (nr_unshown) {
686 0 : pr_alert(
687 : "BUG: Bad page state: %lu messages suppressed\n",
688 : nr_unshown);
689 0 : nr_unshown = 0;
690 : }
691 0 : nr_shown = 0;
692 : }
693 0 : if (nr_shown++ == 0)
694 0 : resume = jiffies + 60 * HZ;
695 :
696 0 : pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
697 : current->comm, page_to_pfn(page));
698 0 : dump_page(page, reason);
699 :
700 : print_modules();
701 0 : dump_stack();
702 : out:
703 : /* Leave bad fields for debug, except PageBuddy could make trouble */
704 0 : page_mapcount_reset(page); /* remove PageBuddy */
705 0 : add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
706 0 : }
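The burst-then-quiet throttling used in bad_page() is a generic pattern; a minimal userspace rendition (wall-clock seconds instead of jiffies, and throttled_report() is a made-up stand-in for the pr_alert() path):

	#include <stdio.h>
	#include <time.h>

	#define BURST	60	/* reports allowed per minute */

	static void throttled_report(const char *msg)
	{
		static time_t resume;
		static unsigned long nr_shown, nr_unshown;

		if (nr_shown == BURST) {
			if (time(NULL) < resume) {	/* still inside the quiet minute */
				nr_unshown++;
				return;
			}
			if (nr_unshown) {
				printf("%lu messages suppressed\n", nr_unshown);
				nr_unshown = 0;
			}
			nr_shown = 0;
		}
		if (nr_shown++ == 0)
			resume = time(NULL) + 60;	/* minute measured from the first report */

		printf("%s\n", msg);
	}

	int main(void)
	{
		for (int i = 0; i < 130; i++)
			throttled_report("bad page encountered");
		return 0;
	}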
707 :
708 : static inline unsigned int order_to_pindex(int migratetype, int order)
709 : {
710 59867 : int base = order;
711 :
712 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
713 : if (order > PAGE_ALLOC_COSTLY_ORDER) {
714 : VM_BUG_ON(order != pageblock_order);
715 : return NR_LOWORDER_PCP_LISTS;
716 : }
717 : #else
718 : VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
719 : #endif
720 :
721 59867 : return (MIGRATE_PCPTYPES * base) + migratetype;
722 : }
723 :
724 : static inline int pindex_to_order(unsigned int pindex)
725 : {
726 18 : int order = pindex / MIGRATE_PCPTYPES;
727 :
728 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
729 : if (pindex == NR_LOWORDER_PCP_LISTS)
730 : order = pageblock_order;
731 : #else
732 : VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
733 : #endif
734 :
735 : return order;
736 : }
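Setting the THP special case aside, the two helpers above map (migratetype, order) pairs to per-cpu list indices and back; a quick standalone check of that arithmetic (MIGRATE_PCPTYPES is assumed to be 3 here: unmovable, movable, reclaimable):

	#include <stdio.h>

	#define MIGRATE_PCPTYPES 3	/* assumed: unmovable, movable, reclaimable */

	static int order_to_pindex(int migratetype, int order)
	{
		return MIGRATE_PCPTYPES * order + migratetype;
	}

	static int pindex_to_order(int pindex)
	{
		return pindex / MIGRATE_PCPTYPES;
	}

	int main(void)
	{
		int order, mt;

		for (order = 0; order <= 3; order++)
			for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
				int pindex = order_to_pindex(mt, order);

				/* the migratetype comes back as pindex % MIGRATE_PCPTYPES */
				printf("order=%d mt=%d -> pindex=%d -> order=%d mt=%d\n",
				       order, mt, pindex, pindex_to_order(pindex),
				       pindex % MIGRATE_PCPTYPES);
			}
		return 0;
	}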
737 :
738 : static inline bool pcp_allowed_order(unsigned int order)
739 : {
740 59271 : if (order <= PAGE_ALLOC_COSTLY_ORDER)
741 : return true;
742 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
743 : if (order == pageblock_order)
744 : return true;
745 : #endif
746 : return false;
747 : }
748 :
749 50643 : static inline void free_the_page(struct page *page, unsigned int order)
750 : {
751 50643 : if (pcp_allowed_order(order)) /* Via pcp? */
752 50643 : free_unref_page(page, order);
753 : else
754 0 : __free_pages_ok(page, order, FPI_NONE);
755 50643 : }
756 :
757 : /*
758 : * Higher-order pages are called "compound pages". They are structured thusly:
759 : *
760 : * The first PAGE_SIZE page is called the "head page" and has PG_head set.
761 : *
762 : * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
763 : * in bit 0 of page->compound_head. The rest of the bits form a pointer to the head page.
764 : *
765 : * The first tail page's ->compound_dtor holds the offset into the array of
766 : * compound page destructors. See compound_page_dtors.
767 : *
768 : * The first tail page's ->compound_order holds the order of allocation.
769 : * This usage means that zero-order pages may not be compound.
770 : */
771 :
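The bit-0 encoding described in the comment above can be sketched without struct page; demo_page and these helper names are invented for illustration, only the pointer-tagging trick mirrors the kernel's scheme:

	#include <stdio.h>

	struct demo_page {
		unsigned long compound_head;	/* bit 0 set => this is a tail page */
	};

	static void demo_set_compound_head(struct demo_page *tail, struct demo_page *head)
	{
		tail->compound_head = (unsigned long)head | 1UL;
	}

	static int demo_page_tail(struct demo_page *p)
	{
		return p->compound_head & 1UL;
	}

	static struct demo_page *demo_compound_head(struct demo_page *p)
	{
		if (demo_page_tail(p))
			return (struct demo_page *)(p->compound_head - 1);
		return p;
	}

	int main(void)
	{
		struct demo_page pages[4] = { { 0 } };	/* an order-2 "compound page" */
		int i;

		for (i = 1; i < 4; i++)
			demo_set_compound_head(&pages[i], &pages[0]);

		printf("pages[2] tail? %d  head ok? %d\n",
		       demo_page_tail(&pages[2]),
		       demo_compound_head(&pages[2]) == &pages[0]);
		return 0;
	}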
772 0 : void free_compound_page(struct page *page)
773 : {
774 0 : mem_cgroup_uncharge(page_folio(page));
775 0 : free_the_page(page, compound_order(page));
776 0 : }
777 :
778 : static void prep_compound_head(struct page *page, unsigned int order)
779 : {
780 102 : struct folio *folio = (struct folio *)page;
781 :
782 102 : set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
783 102 : set_compound_order(page, order);
784 204 : atomic_set(&folio->_entire_mapcount, -1);
785 204 : atomic_set(&folio->_nr_pages_mapped, 0);
786 204 : atomic_set(&folio->_pincount, 0);
787 : }
788 :
789 : static void prep_compound_tail(struct page *head, int tail_idx)
790 : {
791 184 : struct page *p = head + tail_idx;
792 :
793 184 : p->mapping = TAIL_MAPPING;
794 184 : set_compound_head(p, head);
795 368 : set_page_private(p, 0);
796 : }
797 :
798 0 : void prep_compound_page(struct page *page, unsigned int order)
799 : {
800 : int i;
801 102 : int nr_pages = 1 << order;
802 :
803 102 : __SetPageHead(page);
804 286 : for (i = 1; i < nr_pages; i++)
805 184 : prep_compound_tail(page, i);
806 :
807 102 : prep_compound_head(page, order);
808 0 : }
809 :
810 0 : void destroy_large_folio(struct folio *folio)
811 : {
812 0 : enum compound_dtor_id dtor = folio->_folio_dtor;
813 :
814 : VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
815 0 : compound_page_dtors[dtor](&folio->page);
816 0 : }
817 :
818 : #ifdef CONFIG_DEBUG_PAGEALLOC
819 : unsigned int _debug_guardpage_minorder;
820 :
821 : bool _debug_pagealloc_enabled_early __read_mostly
822 : = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
823 : EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
824 : DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
825 : EXPORT_SYMBOL(_debug_pagealloc_enabled);
826 :
827 : DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
828 :
829 : static int __init early_debug_pagealloc(char *buf)
830 : {
831 : return kstrtobool(buf, &_debug_pagealloc_enabled_early);
832 : }
833 : early_param("debug_pagealloc", early_debug_pagealloc);
834 :
835 : static int __init debug_guardpage_minorder_setup(char *buf)
836 : {
837 : unsigned long res;
838 :
839 : if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
840 : pr_err("Bad debug_guardpage_minorder value\n");
841 : return 0;
842 : }
843 : _debug_guardpage_minorder = res;
844 : pr_info("Setting debug_guardpage_minorder to %lu\n", res);
845 : return 0;
846 : }
847 : early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
848 :
849 : static inline bool set_page_guard(struct zone *zone, struct page *page,
850 : unsigned int order, int migratetype)
851 : {
852 : if (!debug_guardpage_enabled())
853 : return false;
854 :
855 : if (order >= debug_guardpage_minorder())
856 : return false;
857 :
858 : __SetPageGuard(page);
859 : INIT_LIST_HEAD(&page->buddy_list);
860 : set_page_private(page, order);
861 : /* Guard pages are not available for any usage */
862 : if (!is_migrate_isolate(migratetype))
863 : __mod_zone_freepage_state(zone, -(1 << order), migratetype);
864 :
865 : return true;
866 : }
867 :
868 : static inline void clear_page_guard(struct zone *zone, struct page *page,
869 : unsigned int order, int migratetype)
870 : {
871 : if (!debug_guardpage_enabled())
872 : return;
873 :
874 : __ClearPageGuard(page);
875 :
876 : set_page_private(page, 0);
877 : if (!is_migrate_isolate(migratetype))
878 : __mod_zone_freepage_state(zone, (1 << order), migratetype);
879 : }
880 : #else
881 : static inline bool set_page_guard(struct zone *zone, struct page *page,
882 : unsigned int order, int migratetype) { return false; }
883 : static inline void clear_page_guard(struct zone *zone, struct page *page,
884 : unsigned int order, int migratetype) {}
885 : #endif
886 :
887 : /*
888 : * Enable static keys related to various memory debugging and hardening options.
889 : * Some override others, and depend on early params that are evaluated in the
890 : * order of appearance. So we need to first gather the full picture of what was
891 : * enabled, and then make decisions.
892 : */
893 1 : void __init init_mem_debugging_and_hardening(void)
894 : {
895 1 : bool page_poisoning_requested = false;
896 :
897 : #ifdef CONFIG_PAGE_POISONING
898 : /*
899 : * Page poisoning is debug page alloc for some arches. If
900 : * either of those options is enabled, enable poisoning.
901 : */
902 : if (page_poisoning_enabled() ||
903 : (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
904 : debug_pagealloc_enabled())) {
905 : static_branch_enable(&_page_poisoning_enabled);
906 : page_poisoning_requested = true;
907 : }
908 : #endif
909 :
910 1 : if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
911 : page_poisoning_requested) {
912 : pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
913 : "will take precedence over init_on_alloc and init_on_free\n");
914 : _init_on_alloc_enabled_early = false;
915 : _init_on_free_enabled_early = false;
916 : }
917 :
918 1 : if (_init_on_alloc_enabled_early)
919 0 : static_branch_enable(&init_on_alloc);
920 : else
921 1 : static_branch_disable(&init_on_alloc);
922 :
923 1 : if (_init_on_free_enabled_early)
924 0 : static_branch_enable(&init_on_free);
925 : else
926 1 : static_branch_disable(&init_on_free);
927 :
928 : if (IS_ENABLED(CONFIG_KMSAN) &&
929 : (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
930 : pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
931 :
932 : #ifdef CONFIG_DEBUG_PAGEALLOC
933 : if (!debug_pagealloc_enabled())
934 : return;
935 :
936 : static_branch_enable(&_debug_pagealloc_enabled);
937 :
938 : if (!debug_guardpage_minorder())
939 : return;
940 :
941 : static_branch_enable(&_debug_guardpage_enabled);
942 : #endif
943 1 : }
944 :
945 : static inline void set_buddy_order(struct page *page, unsigned int order)
946 : {
947 22748 : set_page_private(page, order);
948 11374 : __SetPageBuddy(page);
949 : }
950 :
951 : #ifdef CONFIG_COMPACTION
952 5233 : static inline struct capture_control *task_capc(struct zone *zone)
953 : {
954 5233 : struct capture_control *capc = current->capture_control;
955 :
956 5233 : return unlikely(capc) &&
957 0 : !(current->flags & PF_KTHREAD) &&
958 0 : !capc->page &&
959 10466 : capc->cc->zone == zone ? capc : NULL;
960 : }
961 :
962 : static inline bool
963 : compaction_capture(struct capture_control *capc, struct page *page,
964 : int order, int migratetype)
965 : {
966 9787 : if (!capc || order != capc->cc->order)
967 : return false;
968 :
969 : /* Do not accidentally pollute CMA or isolated regions*/
970 : if (is_migrate_cma(migratetype) ||
971 0 : is_migrate_isolate(migratetype))
972 : return false;
973 :
974 : /*
975 : * Do not let lower order allocations pollute a movable pageblock.
976 : * This might let an unmovable request use a reclaimable pageblock
977 : * and vice-versa but no more than normal fallback logic which can
978 : * have trouble finding a high-order free page.
979 : */
980 0 : if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
981 : return false;
982 :
983 0 : capc->page = page;
984 : return true;
985 : }
986 :
987 : #else
988 : static inline struct capture_control *task_capc(struct zone *zone)
989 : {
990 : return NULL;
991 : }
992 :
993 : static inline bool
994 : compaction_capture(struct capture_control *capc, struct page *page,
995 : int order, int migratetype)
996 : {
997 : return false;
998 : }
999 : #endif /* CONFIG_COMPACTION */
1000 :
1001 : /* Used for pages not on another list */
1002 : static inline void add_to_free_list(struct page *page, struct zone *zone,
1003 : unsigned int order, int migratetype)
1004 : {
1005 9966 : struct free_area *area = &zone->free_area[order];
1006 :
1007 19932 : list_add(&page->buddy_list, &area->free_list[migratetype]);
1008 9966 : area->nr_free++;
1009 : }
1010 :
1011 : /* Used for pages not on another list */
1012 : static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
1013 : unsigned int order, int migratetype)
1014 : {
1015 1408 : struct free_area *area = &zone->free_area[order];
1016 :
1017 2816 : list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
1018 1408 : area->nr_free++;
1019 : }
1020 :
1021 : /*
1022 : * Used for pages which are on another list. Move the pages to the tail
1023 : * of the list - so the moved pages won't immediately be considered for
1024 : * allocation again (e.g., optimization for memory onlining).
1025 : */
1026 : static inline void move_to_free_list(struct page *page, struct zone *zone,
1027 : unsigned int order, int migratetype)
1028 : {
1029 5 : struct free_area *area = &zone->free_area[order];
1030 :
1031 10 : list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
1032 : }
1033 :
1034 : static inline void del_page_from_free_list(struct page *page, struct zone *zone,
1035 : unsigned int order)
1036 : {
1037 : /* clear reported state and update reported page count */
1038 : if (page_reported(page))
1039 : __ClearPageReported(page);
1040 :
1041 22106 : list_del(&page->buddy_list);
1042 11053 : __ClearPageBuddy(page);
1043 22106 : set_page_private(page, 0);
1044 11053 : zone->free_area[order].nr_free--;
1045 : }
1046 :
1047 : /*
1048 : * If this is not the largest possible page, check if the buddy
1049 : * of the next-highest order is free. If it is, it's possible
1050 : * that pages are being freed that will coalesce soon. In case that
1051 : * is happening, add the free page to the tail of the list
1052 : * so it's less likely to be used soon and more likely to be merged
1053 : * as a higher-order page.
1054 : */
1055 : static inline bool
1056 4975 : buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
1057 : struct page *page, unsigned int order)
1058 : {
1059 : unsigned long higher_page_pfn;
1060 : struct page *higher_page;
1061 :
1062 4975 : if (order >= MAX_ORDER - 2)
1063 : return false;
1064 :
1065 4974 : higher_page_pfn = buddy_pfn & pfn;
1066 4974 : higher_page = page + (higher_page_pfn - pfn);
1067 :
1068 9948 : return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
1069 4974 : NULL) != NULL;
1070 : }
1071 :
1072 : /*
1073 : * Freeing function for a buddy system allocator.
1074 : *
1075 : * The concept of a buddy system is to maintain direct-mapped table
1076 : * (containing bit values) for memory blocks of various "orders".
1077 : * The bottom level table contains the map for the smallest allocatable
1078 : * units of memory (here, pages), and each level above it describes
1079 : * pairs of units from the levels below, hence, "buddies".
1080 : * At a high level, all that happens here is marking the table entry
1081 : * at the bottom level available, and propagating the changes upward
1082 : * as necessary, plus some accounting needed to play nicely with other
1083 : * parts of the VM system.
1084 : * At each level, we keep a list of pages, which are heads of contiguous
1085 : * runs of free pages of length (1 << order) and marked with PageBuddy.
1086 : * A page's order is recorded in the page_private(page) field.
1087 : * So when we are allocating or freeing one, we can derive the state of the
1088 : * other. That is, if we allocate a small block, and both were
1089 : * free, the remainder of the region must be split into blocks.
1090 : * If a block is freed, and its buddy is also free, then this
1091 : * triggers coalescing into a block of larger size.
1092 : *
1093 : * -- nyc
1094 : */
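In practice the buddy lookup below boils down to flipping bit 'order' of the block's pfn, and a successful merge clears that bit; a standalone sketch of that arithmetic (find_buddy_page_pfn() additionally checks that the candidate really is a free buddy of the same order and zone, which is omitted here):

	#include <stdio.h>

	/* Simplified buddy arithmetic: the buddy of the block starting at 'pfn'
	 * with size 2^order is the block whose pfn differs only in bit 'order';
	 * the merged (combined) block starts at the lower of the two pfns. */
	static unsigned long buddy_pfn(unsigned long pfn, unsigned int order)
	{
		return pfn ^ (1UL << order);
	}

	int main(void)
	{
		unsigned long pfn = 0x134;	/* start of an order-2 block (4 pages) */
		unsigned int order = 2;

		unsigned long buddy = buddy_pfn(pfn, order);
		unsigned long combined = pfn & buddy;

		printf("pfn=0x%lx buddy=0x%lx combined=0x%lx new order=%u\n",
		       pfn, buddy, combined, order + 1);
		return 0;
	}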
1095 :
1096 5233 : static inline void __free_one_page(struct page *page,
1097 : unsigned long pfn,
1098 : struct zone *zone, unsigned int order,
1099 : int migratetype, fpi_t fpi_flags)
1100 : {
1101 5233 : struct capture_control *capc = task_capc(zone);
1102 5233 : unsigned long buddy_pfn = 0;
1103 : unsigned long combined_pfn;
1104 : struct page *buddy;
1105 : bool to_tail;
1106 :
1107 : VM_BUG_ON(!zone_is_initialized(zone));
1108 : VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
1109 :
1110 : VM_BUG_ON(migratetype == -1);
1111 5233 : if (likely(!is_migrate_isolate(migratetype)))
1112 5233 : __mod_zone_freepage_state(zone, 1 << order, migratetype);
1113 :
1114 : VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
1115 : VM_BUG_ON_PAGE(bad_range(zone, page), page);
1116 :
1117 10035 : while (order < MAX_ORDER - 1) {
1118 19574 : if (compaction_capture(capc, page, order, migratetype)) {
1119 0 : __mod_zone_freepage_state(zone, -(1 << order),
1120 : migratetype);
1121 0 : return;
1122 : }
1123 :
1124 9787 : buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
1125 9787 : if (!buddy)
1126 : goto done_merging;
1127 :
1128 : if (unlikely(order >= pageblock_order)) {
1129 : /*
1130 : * We want to prevent merge between freepages on pageblock
1131 : * without fallbacks and normal pageblock. Without this,
1132 : * pageblock isolation could cause incorrect freepage or CMA
1133 : * accounting or HIGHATOMIC accounting.
1134 : */
1135 : int buddy_mt = get_pageblock_migratetype(buddy);
1136 :
1137 : if (migratetype != buddy_mt
1138 : && (!migratetype_is_mergeable(migratetype) ||
1139 : !migratetype_is_mergeable(buddy_mt)))
1140 : goto done_merging;
1141 : }
1142 :
1143 : /*
1144 : * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
1145 : * merge with it and move up one order.
1146 : */
1147 : if (page_is_guard(buddy))
1148 : clear_page_guard(zone, buddy, order, migratetype);
1149 : else
1150 : del_page_from_free_list(buddy, zone, order);
1151 4802 : combined_pfn = buddy_pfn & pfn;
1152 4802 : page = page + (combined_pfn - pfn);
1153 4802 : pfn = combined_pfn;
1154 4802 : order++;
1155 : }
1156 :
1157 : done_merging:
1158 5233 : set_buddy_order(page, order);
1159 :
1160 5233 : if (fpi_flags & FPI_TO_TAIL)
1161 : to_tail = true;
1162 4975 : else if (is_shuffle_order(order))
1163 : to_tail = shuffle_pick_tail();
1164 : else
1165 4975 : to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
1166 :
1167 5233 : if (to_tail)
1168 : add_to_free_list_tail(page, zone, order, migratetype);
1169 : else
1170 : add_to_free_list(page, zone, order, migratetype);
1171 :
1172 : /* Notify page reporting subsystem of freed page */
1173 : if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
1174 : page_reporting_notify_free(order);
1175 : }
1176 :
1177 : /**
1178 : * split_free_page() -- split a free page at split_pfn_offset
1179 : * @free_page: the original free page
1180 : * @order: the order of the page
1181 : * @split_pfn_offset: split offset within the page
1182 : *
1183 : * Return -ENOENT if the free page is changed, otherwise 0
1184 : *
1185 : * It is used when the free page crosses two pageblocks with different migratetypes
1186 : * at split_pfn_offset within the page. The split free page will be put into
1187 : * separate migratetype lists afterwards. Otherwise, the function achieves
1188 : * nothing.
1189 : */
1190 0 : int split_free_page(struct page *free_page,
1191 : unsigned int order, unsigned long split_pfn_offset)
1192 : {
1193 0 : struct zone *zone = page_zone(free_page);
1194 0 : unsigned long free_page_pfn = page_to_pfn(free_page);
1195 : unsigned long pfn;
1196 : unsigned long flags;
1197 : int free_page_order;
1198 : int mt;
1199 0 : int ret = 0;
1200 :
1201 0 : if (split_pfn_offset == 0)
1202 : return ret;
1203 :
1204 0 : spin_lock_irqsave(&zone->lock, flags);
1205 :
1206 0 : if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
1207 : ret = -ENOENT;
1208 : goto out;
1209 : }
1210 :
1211 0 : mt = get_pageblock_migratetype(free_page);
1212 0 : if (likely(!is_migrate_isolate(mt)))
1213 0 : __mod_zone_freepage_state(zone, -(1UL << order), mt);
1214 :
1215 0 : del_page_from_free_list(free_page, zone, order);
1216 0 : for (pfn = free_page_pfn;
1217 0 : pfn < free_page_pfn + (1UL << order);) {
1218 0 : int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
1219 :
1220 0 : free_page_order = min_t(unsigned int,
1221 : pfn ? __ffs(pfn) : order,
1222 : __fls(split_pfn_offset));
1223 0 : __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
1224 : mt, FPI_NONE);
1225 0 : pfn += 1UL << free_page_order;
1226 0 : split_pfn_offset -= (1UL << free_page_order);
1227 : /* we have done the first part, now switch to second part */
1228 0 : if (split_pfn_offset == 0)
1229 0 : split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
1230 : }
1231 : out:
1232 0 : spin_unlock_irqrestore(&zone->lock, flags);
1233 0 : return ret;
1234 : }
1235 : /*
1236 : * A bad page could be flagged due to any of several fields. Instead of multiple
1237 : * branches, try to check multiple fields with one check. The caller must do a detailed
1238 : * check if necessary.
1239 : */
1240 : static inline bool page_expected_state(struct page *page,
1241 : unsigned long check_flags)
1242 : {
1243 532944 : if (unlikely(atomic_read(&page->_mapcount) != -1))
1244 : return false;
1245 :
1246 532944 : if (unlikely((unsigned long)page->mapping |
1247 : page_ref_count(page) |
1248 : #ifdef CONFIG_MEMCG
1249 : page->memcg_data |
1250 : #endif
1251 : (page->flags & check_flags)))
1252 : return false;
1253 :
1254 : return true;
1255 : }
1256 :
1257 : static const char *page_bad_reason(struct page *page, unsigned long flags)
1258 : {
1259 0 : const char *bad_reason = NULL;
1260 :
1261 0 : if (unlikely(atomic_read(&page->_mapcount) != -1))
1262 0 : bad_reason = "nonzero mapcount";
1263 0 : if (unlikely(page->mapping != NULL))
1264 0 : bad_reason = "non-NULL mapping";
1265 0 : if (unlikely(page_ref_count(page) != 0))
1266 0 : bad_reason = "nonzero _refcount";
1267 0 : if (unlikely(page->flags & flags)) {
1268 : if (flags == PAGE_FLAGS_CHECK_AT_PREP)
1269 : bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
1270 : else
1271 0 : bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
1272 : }
1273 : #ifdef CONFIG_MEMCG
1274 : if (unlikely(page->memcg_data))
1275 : bad_reason = "page still charged to cgroup";
1276 : #endif
1277 : return bad_reason;
1278 : }
1279 :
1280 0 : static void free_page_is_bad_report(struct page *page)
1281 : {
1282 0 : bad_page(page,
1283 : page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
1284 0 : }
1285 :
1286 259925 : static inline bool free_page_is_bad(struct page *page)
1287 : {
1288 259925 : if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
1289 : return false;
1290 :
1291 : /* Something has gone sideways, find it */
1292 0 : free_page_is_bad_report(page);
1293 0 : return true;
1294 : }
1295 :
1296 : static int free_tail_pages_check(struct page *head_page, struct page *page)
1297 : {
1298 42 : struct folio *folio = (struct folio *)head_page;
1299 42 : int ret = 1;
1300 :
1301 : /*
1302 : * We rely on page->lru.next never having bit 0 set, unless the page
1303 : * is PageTail(). Let's make sure that's true even for poisoned ->lru.
1304 : */
1305 : BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
1306 :
1307 : if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
1308 42 : ret = 0;
1309 : goto out;
1310 : }
1311 : switch (page - head_page) {
1312 : case 1:
1313 : /* the first tail page: these may be in place of ->mapping */
1314 : if (unlikely(folio_entire_mapcount(folio))) {
1315 : bad_page(page, "nonzero entire_mapcount");
1316 : goto out;
1317 : }
1318 : if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
1319 : bad_page(page, "nonzero nr_pages_mapped");
1320 : goto out;
1321 : }
1322 : if (unlikely(atomic_read(&folio->_pincount))) {
1323 : bad_page(page, "nonzero pincount");
1324 : goto out;
1325 : }
1326 : break;
1327 : case 2:
1328 : /*
1329 : * the second tail page: ->mapping is
1330 : * deferred_list.next -- ignore value.
1331 : */
1332 : break;
1333 : default:
1334 : if (page->mapping != TAIL_MAPPING) {
1335 : bad_page(page, "corrupted mapping in tail page");
1336 : goto out;
1337 : }
1338 : break;
1339 : }
1340 : if (unlikely(!PageTail(page))) {
1341 : bad_page(page, "PageTail not set");
1342 : goto out;
1343 : }
1344 : if (unlikely(compound_head(page) != head_page)) {
1345 : bad_page(page, "compound_head not consistent");
1346 : goto out;
1347 : }
1348 : ret = 0;
1349 : out:
1350 42 : page->mapping = NULL;
1351 42 : clear_compound_head(page);
1352 : return ret;
1353 : }
1354 :
1355 : /*
1356 : * Skip KASAN memory poisoning when either:
1357 : *
1358 : * 1. Deferred memory initialization has not yet completed,
1359 : * see the explanation below.
1360 : * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON,
1361 : * see the comment next to it.
1362 : * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
1363 : * see the comment next to it.
1364 : * 4. The allocation is excluded from being checked due to sampling,
1365 : * see the call to kasan_unpoison_pages.
1366 : *
1367 : * Poisoning pages during deferred memory init will greatly lengthen the
1368 : * process and cause problems on large-memory systems, as the deferred page
1369 : * initialization is done with interrupts disabled.
1370 : *
1371 : * Assuming that there will be no reference to those newly initialized
1372 : * pages before they are ever allocated, this should have no effect on
1373 : * KASAN memory tracking as the poison will be properly inserted at page
1374 : * allocation time. The only corner case is when pages are allocated by
1375 : * on-demand allocation and then freed again before the deferred pages
1376 : * initialization is done, but this is not likely to happen.
1377 : */
1378 : static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
1379 : {
1380 : return deferred_pages_enabled() ||
1381 : (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
1382 : (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
1383 : PageSkipKASanPoison(page);
1384 : }
1385 :
1386 0 : static void kernel_init_pages(struct page *page, int numpages)
1387 : {
1388 : int i;
1389 :
1390 : /* s390's use of memset() could override KASAN redzones. */
1391 : kasan_disable_current();
1392 38283 : for (i = 0; i < numpages; i++)
1393 76566 : clear_highpage_kasan_tagged(page + i);
1394 : kasan_enable_current();
1395 0 : }
1396 :
1397 : static __always_inline bool free_pages_prepare(struct page *page,
1398 : unsigned int order, bool check_free, fpi_t fpi_flags)
1399 : {
1400 50901 : int bad = 0;
1401 50901 : bool init = want_init_on_free();
1402 :
1403 : VM_BUG_ON_PAGE(PageTail(page), page);
1404 :
1405 50901 : trace_mm_page_free(page, order);
1406 50901 : kmsan_free_page(page, order);
1407 :
1408 50901 : if (unlikely(PageHWPoison(page)) && !order) {
1409 : /*
1410 : * Do not let hwpoison pages hit pcplists/buddy
1411 : * Untie memcg state and reset page's owner
1412 : */
1413 : if (memcg_kmem_online() && PageMemcgKmem(page))
1414 : __memcg_kmem_uncharge_page(page, order);
1415 : reset_page_owner(page, order);
1416 : page_table_check_free(page, order);
1417 : return false;
1418 : }
1419 :
1420 : /*
1421 : * Check tail pages before head page information is cleared to
1422 : * avoid checking PageCompound for order-0 pages.
1423 : */
1424 50901 : if (unlikely(order)) {
1425 262 : bool compound = PageCompound(page);
1426 : int i;
1427 :
1428 : VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1429 :
1430 : if (compound)
1431 : ClearPageHasHWPoisoned(page);
1432 254692 : for (i = 1; i < (1 << order); i++) {
1433 254692 : if (compound)
1434 84 : bad += free_tail_pages_check(page, page + i);
1435 254692 : if (unlikely(free_page_is_bad(page + i))) {
1436 0 : bad++;
1437 0 : continue;
1438 : }
1439 254692 : (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1440 : }
1441 : }
1442 50901 : if (PageMappingFlags(page))
1443 0 : page->mapping = NULL;
1444 : if (memcg_kmem_online() && PageMemcgKmem(page))
1445 : __memcg_kmem_uncharge_page(page, order);
1446 258 : if (check_free && free_page_is_bad(page))
1447 0 : bad++;
1448 50901 : if (bad)
1449 : return false;
1450 :
1451 50901 : page_cpupid_reset_last(page);
1452 50901 : page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1453 : reset_page_owner(page, order);
1454 50901 : page_table_check_free(page, order);
1455 :
1456 50901 : if (!PageHighMem(page)) {
1457 : debug_check_no_locks_freed(page_address(page),
1458 : PAGE_SIZE << order);
1459 : debug_check_no_obj_freed(page_address(page),
1460 : PAGE_SIZE << order);
1461 : }
1462 :
1463 50901 : kernel_poison_pages(page, 1 << order);
1464 :
1465 : /*
1466 : * As memory initialization might be integrated into KASAN,
1467 : * KASAN poisoning and memory initialization code must be
1468 : * kept together to avoid discrepancies in behavior.
1469 : *
1470 : * With hardware tag-based KASAN, memory tags must be set before the
1471 : * page becomes unavailable via debug_pagealloc or arch_free_page.
1472 : */
1473 50901 : if (!should_skip_kasan_poison(page, fpi_flags)) {
1474 : kasan_poison_pages(page, order, init);
1475 :
1476 : /* Memory is already initialized if KASAN did it internally. */
1477 : if (kasan_has_integrated_init())
1478 : init = false;
1479 : }
1480 50901 : if (init)
1481 0 : kernel_init_pages(page, 1 << order);
1482 :
1483 : /*
1484 : * arch_free_page() can make the page's contents inaccessible. s390
1485 : * does this. So nothing which can access the page's contents should
1486 : * happen after this.
1487 : */
1488 : arch_free_page(page, order);
1489 :
1490 : debug_pagealloc_unmap_pages(page, 1 << order);
1491 :
1492 : return true;
1493 : }
1494 :
1495 : #ifdef CONFIG_DEBUG_VM
1496 : /*
1497 : * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
1498 : * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
1499 : * moved from pcp lists to free lists.
1500 : */
1501 : static bool free_pcp_prepare(struct page *page, unsigned int order)
1502 : {
1503 : return free_pages_prepare(page, order, true, FPI_NONE);
1504 : }
1505 :
1506 : /* return true if this page has an inappropriate state */
1507 : static bool bulkfree_pcp_prepare(struct page *page)
1508 : {
1509 : if (debug_pagealloc_enabled_static())
1510 : return free_page_is_bad(page);
1511 : else
1512 : return false;
1513 : }
1514 : #else
1515 : /*
1516 : * With DEBUG_VM disabled, order-0 pages being freed are checked only when
1517 : * moving from pcp lists to free list in order to reduce overhead. With
1518 : * debug_pagealloc enabled, they are checked also immediately when being freed
1519 : * to the pcp lists.
1520 : */
1521 50643 : static bool free_pcp_prepare(struct page *page, unsigned int order)
1522 : {
1523 : if (debug_pagealloc_enabled_static())
1524 : return free_pages_prepare(page, order, true, FPI_NONE);
1525 : else
1526 50643 : return free_pages_prepare(page, order, false, FPI_NONE);
1527 : }
1528 :
1529 : static bool bulkfree_pcp_prepare(struct page *page)
1530 : {
1531 4975 : return free_page_is_bad(page);
1532 : }
1533 : #endif /* CONFIG_DEBUG_VM */
1534 :
1535 : /*
1536 : * Frees a number of pages from the PCP lists
1537 : * Assumes all pages on list are in same zone.
1538 : * count is the number of pages to free.
1539 : */
1540 14 : static void free_pcppages_bulk(struct zone *zone, int count,
1541 : struct per_cpu_pages *pcp,
1542 : int pindex)
1543 : {
1544 : unsigned long flags;
1545 14 : int min_pindex = 0;
1546 14 : int max_pindex = NR_PCP_LISTS - 1;
1547 : unsigned int order;
1548 : bool isolated_pageblocks;
1549 : struct page *page;
1550 :
1551 : /*
1552 : * Ensure a proper count is passed; otherwise we would get stuck in the
1553 : * while (list_empty(list)) loop below.
1554 : */
1555 14 : count = min(pcp->count, count);
1556 :
1557 : /* Ensure requested pindex is drained first. */
1558 14 : pindex = pindex - 1;
1559 :
1560 14 : spin_lock_irqsave(&zone->lock, flags);
1561 14 : isolated_pageblocks = has_isolate_pageblock(zone);
1562 :
1563 46 : while (count > 0) {
1564 : struct list_head *list;
1565 : int nr_pages;
1566 :
1567 : /* Remove pages from lists in a round-robin fashion. */
1568 : do {
1569 20 : if (++pindex > max_pindex)
1570 0 : pindex = min_pindex;
1571 20 : list = &pcp->lists[pindex];
1572 20 : if (!list_empty(list))
1573 : break;
1574 :
1575 2 : if (pindex == max_pindex)
1576 0 : max_pindex--;
1577 2 : if (pindex == min_pindex)
1578 0 : min_pindex++;
1579 : } while (1);
1580 :
1581 36 : order = pindex_to_order(pindex);
1582 18 : nr_pages = 1 << order;
1583 : do {
1584 : int mt;
1585 :
1586 4975 : page = list_last_entry(list, struct page, pcp_list);
1587 9950 : mt = get_pcppage_migratetype(page);
1588 :
1589 : /* must delete to avoid corrupting pcp list */
1590 9950 : list_del(&page->pcp_list);
1591 4975 : count -= nr_pages;
1592 4975 : pcp->count -= nr_pages;
1593 :
1594 4975 : if (bulkfree_pcp_prepare(page))
1595 0 : continue;
1596 :
1597 : /* MIGRATE_ISOLATE page should not go to pcplists */
1598 : VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1599 : /* Pageblock could have been isolated meanwhile */
1600 : if (unlikely(isolated_pageblocks))
1601 : mt = get_pageblock_migratetype(page);
1602 :
1603 4975 : __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
1604 4975 : trace_mm_page_pcpu_drain(page, order, mt);
1605 9936 : } while (count > 0 && !list_empty(list));
1606 : }
1607 :
1608 28 : spin_unlock_irqrestore(&zone->lock, flags);
1609 14 : }
1610 :
1611 0 : static void free_one_page(struct zone *zone,
1612 : struct page *page, unsigned long pfn,
1613 : unsigned int order,
1614 : int migratetype, fpi_t fpi_flags)
1615 : {
1616 : unsigned long flags;
1617 :
1618 0 : spin_lock_irqsave(&zone->lock, flags);
1619 0 : if (unlikely(has_isolate_pageblock(zone) ||
1620 : is_migrate_isolate(migratetype))) {
1621 : migratetype = get_pfnblock_migratetype(page, pfn);
1622 : }
1623 0 : __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
1624 0 : spin_unlock_irqrestore(&zone->lock, flags);
1625 0 : }
1626 :
1627 262670 : static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1628 : unsigned long zone, int nid)
1629 : {
1630 262670 : mm_zero_struct_page(page);
1631 525340 : set_page_links(page, zone, nid, pfn);
1632 262670 : init_page_count(page);
1633 262670 : page_mapcount_reset(page);
1634 262670 : page_cpupid_reset_last(page);
1635 262670 : page_kasan_tag_reset(page);
1636 :
1637 525340 : INIT_LIST_HEAD(&page->lru);
1638 : #ifdef WANT_PAGE_VIRTUAL
1639 : /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1640 : if (!is_highmem_idx(zone))
1641 : set_page_address(page, __va(pfn << PAGE_SHIFT));
1642 : #endif
1643 262670 : }
1644 :
1645 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1646 : static void __meminit init_reserved_page(unsigned long pfn)
1647 : {
1648 : pg_data_t *pgdat;
1649 : int nid, zid;
1650 :
1651 : if (early_page_initialised(pfn))
1652 : return;
1653 :
1654 : nid = early_pfn_to_nid(pfn);
1655 : pgdat = NODE_DATA(nid);
1656 :
1657 : for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1658 : struct zone *zone = &pgdat->node_zones[zid];
1659 :
1660 : if (zone_spans_pfn(zone, pfn))
1661 : break;
1662 : }
1663 : __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
1664 : }
1665 : #else
1666 : static inline void init_reserved_page(unsigned long pfn)
1667 : {
1668 : }
1669 : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1670 :
1671 : /*
1672 : * Initialised pages do not have PageReserved set. This function is
1673 : * called for each range allocated by the bootmem allocator and
1674 : * marks the pages PageReserved. The remaining valid pages are later
1675 : * sent to the buddy page allocator.
1676 : */
1677 11 : void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1678 : {
1679 11 : unsigned long start_pfn = PFN_DOWN(start);
1680 11 : unsigned long end_pfn = PFN_UP(end);
1681 :
1682 7779 : for (; start_pfn < end_pfn; start_pfn++) {
1683 7768 : if (pfn_valid(start_pfn)) {
1684 7768 : struct page *page = pfn_to_page(start_pfn);
1685 :
1686 7768 : init_reserved_page(start_pfn);
1687 :
1688 : /* Avoid false-positive PageTail() */
1689 15536 : INIT_LIST_HEAD(&page->lru);
1690 :
1691 : /*
1692 : * no need for atomic set_bit because the struct
1693 : * page is not visible yet so nobody should
1694 : * access it yet.
1695 : */
1696 : __SetPageReserved(page);
1697 : }
1698 : }
1699 11 : }
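
The PFN_DOWN()/PFN_UP() pair above rounds the byte range outwards so that every byte of the memblock reservation is covered by whole page frames. A tiny worked example, assuming a 4 KiB page size (the macros below are simplified stand-ins for the kernel definitions):

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
	#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

	int main(void)
	{
		unsigned long start = 0x1800;	/* reservation starts mid-page */
		unsigned long end = 0x4100;	/* and ends mid-page */

		/* Frames [1, 5) cover 0x1000..0x4fff, i.e. the whole byte range. */
		printf("reserve pfns [%lu, %lu)\n", PFN_DOWN(start), PFN_UP(end));
		return 0;
	}
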
1700 :
1701 258 : static void __free_pages_ok(struct page *page, unsigned int order,
1702 : fpi_t fpi_flags)
1703 : {
1704 : unsigned long flags;
1705 : int migratetype;
1706 258 : unsigned long pfn = page_to_pfn(page);
1707 258 : struct zone *zone = page_zone(page);
1708 :
1709 258 : if (!free_pages_prepare(page, order, true, fpi_flags))
1710 : return;
1711 :
1712 : /*
1713 : * get_pfnblock_migratetype() is called here without zone->lock held so
1714 : * that the lookup does not have to be done under the lock. This
1715 : * reduces the lock holding time.
1716 : */
1717 258 : migratetype = get_pfnblock_migratetype(page, pfn);
1718 :
1719 258 : spin_lock_irqsave(&zone->lock, flags);
1720 : if (unlikely(has_isolate_pageblock(zone) ||
1721 : is_migrate_isolate(migratetype))) {
1722 : migratetype = get_pfnblock_migratetype(page, pfn);
1723 : }
1724 258 : __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
1725 516 : spin_unlock_irqrestore(&zone->lock, flags);
1726 :
1727 258 : __count_vm_events(PGFREE, 1 << order);
1728 : }
1729 :
1730 258 : void __free_pages_core(struct page *page, unsigned int order)
1731 : {
1732 258 : unsigned int nr_pages = 1 << order;
1733 258 : struct page *p = page;
1734 : unsigned int loop;
1735 :
1736 : /*
1737 : * When initializing the memmap, __init_single_page() sets the refcount
1738 : * of all pages to 1 ("allocated"/"not free"). We have to set the
1739 : * refcount of all involved pages to 0.
1740 : */
1741 258 : prefetchw(p);
1742 254908 : for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1743 254650 : prefetchw(p + 1);
1744 254650 : __ClearPageReserved(p);
1745 254650 : set_page_count(p, 0);
1746 : }
1747 258 : __ClearPageReserved(p);
1748 258 : set_page_count(p, 0);
1749 :
1750 516 : atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1751 :
1752 : /*
1753 : * Bypass PCP and place fresh pages right to the tail, primarily
1754 : * relevant for memory onlining.
1755 : */
1756 258 : __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
1757 258 : }
1758 :
1759 : #ifdef CONFIG_NUMA
1760 :
1761 : /*
1762 : * During memory init memblocks map pfns to nids. The search is expensive and
1763 : * this caches recent lookups. The implementation of __early_pfn_to_nid
1764 : * treats start/end as pfns.
1765 : */
1766 : struct mminit_pfnnid_cache {
1767 : unsigned long last_start;
1768 : unsigned long last_end;
1769 : int last_nid;
1770 : };
1771 :
1772 : static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1773 :
1774 : /*
1775 : * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
1776 : */
1777 : static int __meminit __early_pfn_to_nid(unsigned long pfn,
1778 : struct mminit_pfnnid_cache *state)
1779 : {
1780 : unsigned long start_pfn, end_pfn;
1781 : int nid;
1782 :
1783 : if (state->last_start <= pfn && pfn < state->last_end)
1784 : return state->last_nid;
1785 :
1786 : nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
1787 : if (nid != NUMA_NO_NODE) {
1788 : state->last_start = start_pfn;
1789 : state->last_end = end_pfn;
1790 : state->last_nid = nid;
1791 : }
1792 :
1793 : return nid;
1794 : }
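
The cache above remembers the last memblock range that was looked up, so consecutive pfns from the same range take the fast path. A rough userspace sketch, where slow_lookup() and its node layout are invented stand-ins for the memblock search:

	#include <stdio.h>

	struct pfnnid_cache {
		unsigned long last_start, last_end;
		int last_nid;
	};

	/* Stand-in for the expensive range search (hypothetical layout:
	 * pfns [0, 4096) on node 0, [4096, 8192) on node 1). */
	static int slow_lookup(unsigned long pfn, unsigned long *start, unsigned long *end)
	{
		*start = pfn < 4096 ? 0 : 4096;
		*end = *start + 4096;
		return pfn < 4096 ? 0 : 1;
	}

	static int cached_pfn_to_nid(unsigned long pfn, struct pfnnid_cache *c)
	{
		unsigned long start, end;
		int nid;

		/* Fast path: the pfn falls inside the last range we looked up. */
		if (c->last_start <= pfn && pfn < c->last_end)
			return c->last_nid;

		nid = slow_lookup(pfn, &start, &end);
		if (nid >= 0) {			/* cache the whole matching range */
			c->last_start = start;
			c->last_end = end;
			c->last_nid = nid;
		}
		return nid;
	}

	int main(void)
	{
		struct pfnnid_cache cache = { 0, 0, -1 };

		printf("pfn 100  -> node %d\n", cached_pfn_to_nid(100, &cache));
		printf("pfn 200  -> node %d (cache hit)\n", cached_pfn_to_nid(200, &cache));
		printf("pfn 5000 -> node %d\n", cached_pfn_to_nid(5000, &cache));
		return 0;
	}
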
1795 :
1796 : int __meminit early_pfn_to_nid(unsigned long pfn)
1797 : {
1798 : static DEFINE_SPINLOCK(early_pfn_lock);
1799 : int nid;
1800 :
1801 : spin_lock(&early_pfn_lock);
1802 : nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1803 : if (nid < 0)
1804 : nid = first_online_node;
1805 : spin_unlock(&early_pfn_lock);
1806 :
1807 : return nid;
1808 : }
1809 : #endif /* CONFIG_NUMA */
1810 :
1811 258 : void __init memblock_free_pages(struct page *page, unsigned long pfn,
1812 : unsigned int order)
1813 : {
1814 258 : if (!early_page_initialised(pfn))
1815 : return;
1816 258 : if (!kmsan_memblock_free_pages(page, order)) {
1817 : /* KMSAN will take care of these pages. */
1818 : return;
1819 : }
1820 258 : __free_pages_core(page, order);
1821 : }
1822 :
1823 : /*
1824 : * Check that the whole (or subset of) a pageblock given by the interval of
1825 : * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1826 : * with compaction's migrate or free page scanners.
1827 : *
1828 : * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1829 : *
1830 : * It's possible on some configurations to have a setup like node0 node1 node0
1831 : * i.e. it's possible that all pages within a zone's range of pages do not
1832 : * belong to a single zone. We assume that a border between node0 and node1
1833 : * can occur within a single pageblock, but not a node0 node1 node0
1834 : * interleaving within a single pageblock. It is therefore sufficient to check
1835 : * the first and last page of a pageblock and avoid checking each individual
1836 : * page in a pageblock.
1837 : */
1838 257 : struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1839 : unsigned long end_pfn, struct zone *zone)
1840 : {
1841 : struct page *start_page;
1842 : struct page *end_page;
1843 :
1844 : /* end_pfn is one past the range we are checking */
1845 257 : end_pfn--;
1846 :
1847 514 : if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1848 : return NULL;
1849 :
1850 514 : start_page = pfn_to_online_page(start_pfn);
1851 257 : if (!start_page)
1852 : return NULL;
1853 :
1854 257 : if (page_zone(start_page) != zone)
1855 : return NULL;
1856 :
1857 257 : end_page = pfn_to_page(end_pfn);
1858 :
1859 : /* This gives shorter code than deriving page_zone(end_page) */
1860 771 : if (page_zone_id(start_page) != page_zone_id(end_page))
1861 : return NULL;
1862 :
1863 257 : return start_page;
1864 : }
1865 :
1866 1 : void set_zone_contiguous(struct zone *zone)
1867 : {
1868 1 : unsigned long block_start_pfn = zone->zone_start_pfn;
1869 : unsigned long block_end_pfn;
1870 :
1871 1 : block_end_pfn = pageblock_end_pfn(block_start_pfn);
1872 517 : for (; block_start_pfn < zone_end_pfn(zone);
1873 257 : block_start_pfn = block_end_pfn,
1874 257 : block_end_pfn += pageblock_nr_pages) {
1875 :
1876 257 : block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1877 :
1878 257 : if (!__pageblock_pfn_to_page(block_start_pfn,
1879 : block_end_pfn, zone))
1880 : return;
1881 257 : cond_resched();
1882 : }
1883 :
1884 : /* We confirm that there is no hole */
1885 1 : zone->contiguous = true;
1886 : }
1887 :
1888 0 : void clear_zone_contiguous(struct zone *zone)
1889 : {
1890 0 : zone->contiguous = false;
1891 0 : }
1892 :
1893 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1894 : static void __init deferred_free_range(unsigned long pfn,
1895 : unsigned long nr_pages)
1896 : {
1897 : struct page *page;
1898 : unsigned long i;
1899 :
1900 : if (!nr_pages)
1901 : return;
1902 :
1903 : page = pfn_to_page(pfn);
1904 :
1905 : /* Free a large naturally-aligned chunk if possible */
1906 : if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) {
1907 : set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1908 : __free_pages_core(page, pageblock_order);
1909 : return;
1910 : }
1911 :
1912 : for (i = 0; i < nr_pages; i++, page++, pfn++) {
1913 : if (pageblock_aligned(pfn))
1914 : set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1915 : __free_pages_core(page, 0);
1916 : }
1917 : }
1918 :
1919 : /* Completion tracking for deferred_init_memmap() threads */
1920 : static atomic_t pgdat_init_n_undone __initdata;
1921 : static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1922 :
1923 : static inline void __init pgdat_init_report_one_done(void)
1924 : {
1925 : if (atomic_dec_and_test(&pgdat_init_n_undone))
1926 : complete(&pgdat_init_all_done_comp);
1927 : }
1928 :
1929 : /*
1930 : * Returns true if the page needs to be initialized or freed to the buddy
1931 : * allocator.
1932 : * We check whether a large page is valid by checking only the validity
1933 : * of its head pfn.
1934 : */
1935 : static inline bool __init deferred_pfn_valid(unsigned long pfn)
1936 : {
1937 : if (pageblock_aligned(pfn) && !pfn_valid(pfn))
1938 : return false;
1939 : return true;
1940 : }
1941 :
1942 : /*
1943 : * Free pages to the buddy allocator. Try to free aligned pages in
1944 : * pageblock_nr_pages-sized chunks.
1945 : */
1946 : static void __init deferred_free_pages(unsigned long pfn,
1947 : unsigned long end_pfn)
1948 : {
1949 : unsigned long nr_free = 0;
1950 :
1951 : for (; pfn < end_pfn; pfn++) {
1952 : if (!deferred_pfn_valid(pfn)) {
1953 : deferred_free_range(pfn - nr_free, nr_free);
1954 : nr_free = 0;
1955 : } else if (pageblock_aligned(pfn)) {
1956 : deferred_free_range(pfn - nr_free, nr_free);
1957 : nr_free = 1;
1958 : } else {
1959 : nr_free++;
1960 : }
1961 : }
1962 : /* Free the last block of pages to allocator */
1963 : deferred_free_range(pfn - nr_free, nr_free);
1964 : }
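
The loop above accumulates runs of valid pfns and flushes a run whenever it hits an invalid pfn or a pageblock boundary, so that naturally aligned pageblock-sized chunks can be freed in one go. A small userspace sketch of the same run-splitting, with valid() and flush() as hypothetical stand-ins for deferred_pfn_valid() and deferred_free_range(), and an 8-page pageblock for brevity:

	#include <stdio.h>
	#include <stdbool.h>

	#define PAGEBLOCK_NR 8

	static bool valid(unsigned long pfn)
	{
		return pfn != 11;		/* pretend pfn 11 has no memmap */
	}

	static void flush(unsigned long pfn, unsigned long nr)
	{
		if (nr)
			printf("free [%lu, %lu)\n", pfn, pfn + nr);
	}

	int main(void)
	{
		unsigned long pfn, nr_free = 0;

		for (pfn = 4; pfn < 20; pfn++) {
			if (!valid(pfn)) {
				flush(pfn - nr_free, nr_free);
				nr_free = 0;
			} else if (pfn % PAGEBLOCK_NR == 0) {	/* pageblock aligned */
				flush(pfn - nr_free, nr_free);
				nr_free = 1;
			} else {
				nr_free++;
			}
		}
		flush(pfn - nr_free, nr_free);	/* flush the final run */
		return 0;
	}
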
1965 :
1966 : /*
1967 : * Initialize struct pages. We minimize pfn page lookups and scheduler checks
1968 : * by performing them only once per pageblock_nr_pages.
1969 : * Returns the number of pages initialized.
1970 : */
1971 : static unsigned long __init deferred_init_pages(struct zone *zone,
1972 : unsigned long pfn,
1973 : unsigned long end_pfn)
1974 : {
1975 : int nid = zone_to_nid(zone);
1976 : unsigned long nr_pages = 0;
1977 : int zid = zone_idx(zone);
1978 : struct page *page = NULL;
1979 :
1980 : for (; pfn < end_pfn; pfn++) {
1981 : if (!deferred_pfn_valid(pfn)) {
1982 : page = NULL;
1983 : continue;
1984 : } else if (!page || pageblock_aligned(pfn)) {
1985 : page = pfn_to_page(pfn);
1986 : } else {
1987 : page++;
1988 : }
1989 : __init_single_page(page, pfn, zid, nid);
1990 : nr_pages++;
1991 : }
1992 : return (nr_pages);
1993 : }
1994 :
1995 : /*
1996 : * This function is meant to pre-load the iterator for the zone init.
1997 : * Specifically, it walks through the ranges until we are caught up to the
1998 : * first_init_pfn value and exits there. If we never encounter the value, we
1999 : * return false, indicating there are no valid ranges left.
2000 : */
2001 : static bool __init
2002 : deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
2003 : unsigned long *spfn, unsigned long *epfn,
2004 : unsigned long first_init_pfn)
2005 : {
2006 : u64 j;
2007 :
2008 : /*
2009 : * Start out by walking through the ranges in this zone that have
2010 : * already been initialized. We don't need to do anything with them
2011 : * so we just need to flush them out of the system.
2012 : */
2013 : for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
2014 : if (*epfn <= first_init_pfn)
2015 : continue;
2016 : if (*spfn < first_init_pfn)
2017 : *spfn = first_init_pfn;
2018 : *i = j;
2019 : return true;
2020 : }
2021 :
2022 : return false;
2023 : }
2024 :
2025 : /*
2026 : * Initialize and free pages. We do it in two loops: first we initialize
2027 : * struct page, then free to buddy allocator, because while we are
2028 : * freeing pages we can access pages that are ahead (computing buddy
2029 : * page in __free_one_page()).
2030 : *
2031 : * In order to try and keep some memory in the cache we have the loop
2032 : * broken along max page order boundaries. This way we will not cause
2033 : * any issues with the buddy page computation.
2034 : */
2035 : static unsigned long __init
2036 : deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
2037 : unsigned long *end_pfn)
2038 : {
2039 : unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
2040 : unsigned long spfn = *start_pfn, epfn = *end_pfn;
2041 : unsigned long nr_pages = 0;
2042 : u64 j = *i;
2043 :
2044 : /* First we loop through and initialize the page values */
2045 : for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
2046 : unsigned long t;
2047 :
2048 : if (mo_pfn <= *start_pfn)
2049 : break;
2050 :
2051 : t = min(mo_pfn, *end_pfn);
2052 : nr_pages += deferred_init_pages(zone, *start_pfn, t);
2053 :
2054 : if (mo_pfn < *end_pfn) {
2055 : *start_pfn = mo_pfn;
2056 : break;
2057 : }
2058 : }
2059 :
2060 : /* Reset values and now loop through freeing pages as needed */
2061 : swap(j, *i);
2062 :
2063 : for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
2064 : unsigned long t;
2065 :
2066 : if (mo_pfn <= spfn)
2067 : break;
2068 :
2069 : t = min(mo_pfn, epfn);
2070 : deferred_free_pages(spfn, t);
2071 :
2072 : if (mo_pfn <= epfn)
2073 : break;
2074 : }
2075 :
2076 : return nr_pages;
2077 : }
2078 :
2079 : static void __init
2080 : deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
2081 : void *arg)
2082 : {
2083 : unsigned long spfn, epfn;
2084 : struct zone *zone = arg;
2085 : u64 i;
2086 :
2087 : deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
2088 :
2089 : /*
2090 : * Initialize and free pages in MAX_ORDER sized increments so that we
2091 : * can avoid introducing any issues with the buddy allocator.
2092 : */
2093 : while (spfn < end_pfn) {
2094 : deferred_init_maxorder(&i, zone, &spfn, &epfn);
2095 : cond_resched();
2096 : }
2097 : }
2098 :
2099 : /* An arch may override for more concurrency. */
2100 : __weak int __init
2101 : deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2102 : {
2103 : return 1;
2104 : }
2105 :
2106 : /* Initialise remaining memory on a node */
2107 : static int __init deferred_init_memmap(void *data)
2108 : {
2109 : pg_data_t *pgdat = data;
2110 : const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2111 : unsigned long spfn = 0, epfn = 0;
2112 : unsigned long first_init_pfn, flags;
2113 : unsigned long start = jiffies;
2114 : struct zone *zone;
2115 : int zid, max_threads;
2116 : u64 i;
2117 :
2118 : /* Bind memory initialisation thread to a local node if possible */
2119 : if (!cpumask_empty(cpumask))
2120 : set_cpus_allowed_ptr(current, cpumask);
2121 :
2122 : pgdat_resize_lock(pgdat, &flags);
2123 : first_init_pfn = pgdat->first_deferred_pfn;
2124 : if (first_init_pfn == ULONG_MAX) {
2125 : pgdat_resize_unlock(pgdat, &flags);
2126 : pgdat_init_report_one_done();
2127 : return 0;
2128 : }
2129 :
2130 : /* Sanity check boundaries */
2131 : BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
2132 : BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
2133 : pgdat->first_deferred_pfn = ULONG_MAX;
2134 :
2135 : /*
2136 : * Once we unlock here, the zone cannot be grown anymore, thus if an
2137 : * interrupt thread must allocate this early in boot, the zone must be
2138 : * pre-grown prior to the start of deferred page initialization.
2139 : */
2140 : pgdat_resize_unlock(pgdat, &flags);
2141 :
2142 : /* Only the highest zone is deferred so find it */
2143 : for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2144 : zone = pgdat->node_zones + zid;
2145 : if (first_init_pfn < zone_end_pfn(zone))
2146 : break;
2147 : }
2148 :
2149 : /* If the zone is empty somebody else may have cleared out the zone */
2150 : if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2151 : first_init_pfn))
2152 : goto zone_empty;
2153 :
2154 : max_threads = deferred_page_init_max_threads(cpumask);
2155 :
2156 : while (spfn < epfn) {
2157 : unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
2158 : struct padata_mt_job job = {
2159 : .thread_fn = deferred_init_memmap_chunk,
2160 : .fn_arg = zone,
2161 : .start = spfn,
2162 : .size = epfn_align - spfn,
2163 : .align = PAGES_PER_SECTION,
2164 : .min_chunk = PAGES_PER_SECTION,
2165 : .max_threads = max_threads,
2166 : };
2167 :
2168 : padata_do_multithreaded(&job);
2169 : deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2170 : epfn_align);
2171 : }
2172 : zone_empty:
2173 : /* Sanity check that the next zone really is unpopulated */
2174 : WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
2175 :
2176 : pr_info("node %d deferred pages initialised in %ums\n",
2177 : pgdat->node_id, jiffies_to_msecs(jiffies - start));
2178 :
2179 : pgdat_init_report_one_done();
2180 : return 0;
2181 : }
2182 :
2183 : /*
2184 : * If this zone has deferred pages, try to grow it by initializing enough
2185 : * deferred pages to satisfy the allocation specified by order, rounded up to
2186 : * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
2187 : * of SECTION_SIZE bytes by initializing struct pages in increments of
2188 : * PAGES_PER_SECTION * sizeof(struct page) bytes.
2189 : *
2190 : * Return true when zone was grown, otherwise return false. We return true even
2191 : * when we grow less than requested, to let the caller decide if there are
2192 : * enough pages to satisfy the allocation.
2193 : *
2194 : * Note: We use noinline because this function is needed only during boot, and
2195 : * it is called from a __ref function _deferred_grow_zone. This way we are
2196 : * making sure that it is not inlined into permanent text section.
2197 : */
2198 : static noinline bool __init
2199 : deferred_grow_zone(struct zone *zone, unsigned int order)
2200 : {
2201 : unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
2202 : pg_data_t *pgdat = zone->zone_pgdat;
2203 : unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
2204 : unsigned long spfn, epfn, flags;
2205 : unsigned long nr_pages = 0;
2206 : u64 i;
2207 :
2208 : /* Only the last zone may have deferred pages */
2209 : if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
2210 : return false;
2211 :
2212 : pgdat_resize_lock(pgdat, &flags);
2213 :
2214 : /*
2215 : * If someone grew this zone while we were waiting for the spinlock, return
2216 : * true, as there might be enough pages already.
2217 : */
2218 : if (first_deferred_pfn != pgdat->first_deferred_pfn) {
2219 : pgdat_resize_unlock(pgdat, &flags);
2220 : return true;
2221 : }
2222 :
2223 : /* If the zone is empty somebody else may have cleared out the zone */
2224 : if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2225 : first_deferred_pfn)) {
2226 : pgdat->first_deferred_pfn = ULONG_MAX;
2227 : pgdat_resize_unlock(pgdat, &flags);
2228 : /* Retry only once. */
2229 : return first_deferred_pfn != ULONG_MAX;
2230 : }
2231 :
2232 : /*
2233 : * Initialize and free pages in MAX_ORDER sized increments so
2234 : * that we can avoid introducing any issues with the buddy
2235 : * allocator.
2236 : */
2237 : while (spfn < epfn) {
2238 : /* update our first deferred PFN for this section */
2239 : first_deferred_pfn = spfn;
2240 :
2241 : nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
2242 : touch_nmi_watchdog();
2243 :
2244 : /* We should only stop along section boundaries */
2245 : if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2246 : continue;
2247 :
2248 : /* If our quota has been met we can stop here */
2249 : if (nr_pages >= nr_pages_needed)
2250 : break;
2251 : }
2252 :
2253 : pgdat->first_deferred_pfn = spfn;
2254 : pgdat_resize_unlock(pgdat, &flags);
2255 :
2256 : return nr_pages > 0;
2257 : }
2258 :
2259 : /*
2260 : * deferred_grow_zone() is __init, but it is called from
2261 : * get_page_from_freelist() during early boot until deferred_pages permanently
2262 : * disables this call. This is why we have the __ref wrapper, to avoid the
2263 : * warning and to ensure that the function body gets unloaded.
2264 : */
2265 : static bool __ref
2266 : _deferred_grow_zone(struct zone *zone, unsigned int order)
2267 : {
2268 : return deferred_grow_zone(zone, order);
2269 : }
2270 :
2271 : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
2272 :
2273 1 : void __init page_alloc_init_late(void)
2274 : {
2275 : struct zone *zone;
2276 : int nid;
2277 :
2278 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
2279 :
2280 : /* There will be num_node_state(N_MEMORY) threads */
2281 : atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
2282 : for_each_node_state(nid, N_MEMORY) {
2283 : kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
2284 : }
2285 :
2286 : /* Block until all are initialised */
2287 : wait_for_completion(&pgdat_init_all_done_comp);
2288 :
2289 : /*
2290 : * We initialized the rest of the deferred pages. Permanently disable
2291 : * on-demand struct page initialization.
2292 : */
2293 : static_branch_disable(&deferred_pages);
2294 :
2295 : /* Reinit limits that are based on free pages after the kernel is up */
2296 : files_maxfiles_init();
2297 : #endif
2298 :
2299 1 : buffer_init();
2300 :
2301 : /* Discard memblock private memory */
2302 1 : memblock_discard();
2303 :
2304 1 : for_each_node_state(nid, N_MEMORY)
2305 : shuffle_free_memory(NODE_DATA(nid));
2306 :
2307 3 : for_each_populated_zone(zone)
2308 1 : set_zone_contiguous(zone);
2309 1 : }
2310 :
2311 : #ifdef CONFIG_CMA
2312 : /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
2313 : void __init init_cma_reserved_pageblock(struct page *page)
2314 : {
2315 : unsigned i = pageblock_nr_pages;
2316 : struct page *p = page;
2317 :
2318 : do {
2319 : __ClearPageReserved(p);
2320 : set_page_count(p, 0);
2321 : } while (++p, --i);
2322 :
2323 : set_pageblock_migratetype(page, MIGRATE_CMA);
2324 : set_page_refcounted(page);
2325 : __free_pages(page, pageblock_order);
2326 :
2327 : adjust_managed_page_count(page, pageblock_nr_pages);
2328 : page_zone(page)->cma_pages += pageblock_nr_pages;
2329 : }
2330 : #endif
2331 :
2332 : /*
2333 : * The order of subdivision here is critical for the IO subsystem.
2334 : * Please do not alter this order without good reasons and regression
2335 : * testing. Specifically, as large blocks of memory are subdivided,
2336 : * the order in which smaller blocks are delivered depends on the order
2337 : * they're subdivided in this function. This is the primary factor
2338 : * influencing the order in which pages are delivered to the IO
2339 : * subsystem according to empirical testing, and this is also justified
2340 : * by considering the behavior of a buddy system containing a single
2341 : * large block of memory acted on by a series of small allocations.
2342 : * This behavior is a critical factor in sglist merging's success.
2343 : *
2344 : * -- nyc
2345 : */
2346 : static inline void expand(struct zone *zone, struct page *page,
2347 : int low, int high, int migratetype)
2348 : {
2349 6251 : unsigned long size = 1 << high;
2350 :
2351 12392 : while (high > low) {
2352 6141 : high--;
2353 6141 : size >>= 1;
2354 : VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
2355 :
2356 : /*
2357 : * Mark as guard pages (or a single page), which allows them to be
2358 : * merged back into the allocator when the buddy is freed.
2359 : * The corresponding page table entries are not touched; the pages
2360 : * stay not-present in the virtual address space.
2361 : */
2362 6141 : if (set_page_guard(zone, &page[size], high, migratetype))
2363 : continue;
2364 :
2365 12282 : add_to_free_list(&page[size], zone, high, migratetype);
2366 6141 : set_buddy_order(&page[size], high);
2367 : }
2368 : }
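
expand() hands out the low portion of the block and returns the upper halves to the free lists of successively smaller orders. A rough userspace sketch of that split, printing what would be freed for an order-1 request carved out of an order-4 block (the pfn values are made up, and the guard-page path is omitted):

	#include <stdio.h>

	static void expand_block(unsigned long base_pfn, int low, int high)
	{
		unsigned long size = 1UL << high;

		while (high > low) {
			high--;
			size >>= 1;
			/* the upper half becomes a free buddy of order `high` */
			printf("free buddy at pfn %lu, order %d\n",
			       base_pfn + size, high);
		}
		printf("allocated pfn %lu, order %d\n", base_pfn, low);
	}

	int main(void)
	{
		expand_block(1024, 1, 4);	/* order-1 request from an order-4 block */
		return 0;
	}
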
2369 :
2370 0 : static void check_new_page_bad(struct page *page)
2371 : {
2372 : if (unlikely(page->flags & __PG_HWPOISON)) {
2373 : /* Don't complain about hwpoisoned pages */
2374 : page_mapcount_reset(page); /* remove PageBuddy */
2375 : return;
2376 : }
2377 :
2378 0 : bad_page(page,
2379 : page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
2380 : }
2381 :
2382 : /*
2383 : * This page is about to be returned from the page allocator
2384 : */
2385 6547 : static inline int check_new_page(struct page *page)
2386 : {
2387 6547 : if (likely(page_expected_state(page,
2388 : PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
2389 : return 0;
2390 :
2391 0 : check_new_page_bad(page);
2392 0 : return 1;
2393 : }
2394 :
2395 : static bool check_new_pages(struct page *page, unsigned int order)
2396 : {
2397 : int i;
2398 6547 : for (i = 0; i < (1 << order); i++) {
2399 6547 : struct page *p = page + i;
2400 :
2401 6547 : if (unlikely(check_new_page(p)))
2402 : return true;
2403 : }
2404 :
2405 : return false;
2406 : }
2407 :
2408 : #ifdef CONFIG_DEBUG_VM
2409 : /*
2410 : * With DEBUG_VM enabled, order-0 pages are checked for expected state when
2411 : * being allocated from pcp lists. With debug_pagealloc also enabled, they are
2412 : * also checked when pcp lists are refilled from the free lists.
2413 : */
2414 : static inline bool check_pcp_refill(struct page *page, unsigned int order)
2415 : {
2416 : if (debug_pagealloc_enabled_static())
2417 : return check_new_pages(page, order);
2418 : else
2419 : return false;
2420 : }
2421 :
2422 : static inline bool check_new_pcp(struct page *page, unsigned int order)
2423 : {
2424 : return check_new_pages(page, order);
2425 : }
2426 : #else
2427 : /*
2428 : * With DEBUG_VM disabled, free order-0 pages are checked for expected state
2429 : * when pcp lists are being refilled from the free lists. With debug_pagealloc
2430 : * enabled, they are also checked when being allocated from the pcp lists.
2431 : */
2432 : static inline bool check_pcp_refill(struct page *page, unsigned int order)
2433 : {
2434 6251 : return check_new_pages(page, order);
2435 : }
2436 : static inline bool check_new_pcp(struct page *page, unsigned int order)
2437 : {
2438 : if (debug_pagealloc_enabled_static())
2439 : return check_new_pages(page, order);
2440 : else
2441 : return false;
2442 : }
2443 : #endif /* CONFIG_DEBUG_VM */
2444 :
2445 : static inline bool should_skip_kasan_unpoison(gfp_t flags)
2446 : {
2447 : /* Don't skip if a software KASAN mode is enabled. */
2448 : if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
2449 : IS_ENABLED(CONFIG_KASAN_SW_TAGS))
2450 : return false;
2451 :
2452 : /* Skip if hardware tag-based KASAN is not enabled. */
2453 : if (!kasan_hw_tags_enabled())
2454 : return true;
2455 :
2456 : /*
2457 : * With hardware tag-based KASAN enabled, skip if this has been
2458 : * requested via __GFP_SKIP_KASAN_UNPOISON.
2459 : */
2460 : return flags & __GFP_SKIP_KASAN_UNPOISON;
2461 : }
2462 :
2463 : static inline bool should_skip_init(gfp_t flags)
2464 : {
2465 : /* Don't skip, if hardware tag-based KASAN is not enabled. */
2466 : if (!kasan_hw_tags_enabled())
2467 : return false;
2468 :
2469 : /* For hardware tag-based KASAN, skip if requested. */
2470 : return (flags & __GFP_SKIP_ZERO);
2471 : }
2472 :
2473 51169 : inline void post_alloc_hook(struct page *page, unsigned int order,
2474 : gfp_t gfp_flags)
2475 : {
2476 102338 : bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
2477 : !should_skip_init(gfp_flags);
2478 51169 : bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
2479 51169 : bool reset_tags = true;
2480 : int i;
2481 :
2482 102338 : set_page_private(page, 0);
2483 51169 : set_page_refcounted(page);
2484 :
2485 51169 : arch_alloc_page(page, order);
2486 51169 : debug_pagealloc_map_pages(page, 1 << order);
2487 :
2488 : /*
2489 : * Page unpoisoning must happen before memory initialization.
2490 : * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
2491 : * allocations and the page unpoisoning code will complain.
2492 : */
2493 51169 : kernel_unpoison_pages(page, 1 << order);
2494 :
2495 : /*
2496 : * As memory initialization might be integrated into KASAN,
2497 : * KASAN unpoisoning and memory initialization code must be
2498 : * kept together to avoid discrepancies in behavior.
2499 : */
2500 :
2501 : /*
2502 : * If memory tags should be zeroed
2503 : * (which happens only when memory should be initialized as well).
2504 : */
2505 51169 : if (zero_tags) {
2506 : /* Initialize both memory and memory tags. */
2507 : for (i = 0; i != 1 << order; ++i)
2508 : tag_clear_highpage(page + i);
2509 :
2510 : /* Take note that memory was initialized by the loop above. */
2511 : init = false;
2512 : }
2513 51169 : if (!should_skip_kasan_unpoison(gfp_flags)) {
2514 : /* Try unpoisoning (or setting tags) and initializing memory. */
2515 : if (kasan_unpoison_pages(page, order, init)) {
2516 : /* Take note that memory was initialized by KASAN. */
2517 : if (kasan_has_integrated_init())
2518 : init = false;
2519 : /* Take note that memory tags were set by KASAN. */
2520 : reset_tags = false;
2521 : } else {
2522 : /*
2523 : * KASAN decided to exclude this allocation from being
2524 : * (un)poisoned due to sampling. Make KASAN skip
2525 : * poisoning when the allocation is freed.
2526 : */
2527 : SetPageSkipKASanPoison(page);
2528 : }
2529 : }
2530 : /*
2531 : * If memory tags have not been set by KASAN, reset the page tags to
2532 : * ensure page_address() dereferencing does not fault.
2533 : */
2534 : if (reset_tags) {
2535 : for (i = 0; i != 1 << order; ++i)
2536 : page_kasan_tag_reset(page + i);
2537 : }
2538 : /* If memory is still not initialized, initialize it now. */
2539 51169 : if (init)
2540 : kernel_init_pages(page, 1 << order);
2541 : /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
2542 : if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
2543 : SetPageSkipKASanPoison(page);
2544 :
2545 51169 : set_page_owner(page, order, gfp_flags);
2546 51169 : page_table_check_alloc(page, order);
2547 51169 : }
2548 :
2549 8628 : static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
2550 : unsigned int alloc_flags)
2551 : {
2552 51169 : post_alloc_hook(page, order, gfp_flags);
2553 :
2554 8628 : if (order && (gfp_flags & __GFP_COMP))
2555 : prep_compound_page(page, order);
2556 :
2557 : /*
2558 : * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
2559 : * allocate the page. The expectation is that the caller is taking
2560 : * steps that will free more memory. The caller should avoid the page
2561 : * being used for !PFMEMALLOC purposes.
2562 : */
2563 8628 : if (alloc_flags & ALLOC_NO_WATERMARKS)
2564 0 : set_page_pfmemalloc(page);
2565 : else
2566 51169 : clear_page_pfmemalloc(page);
2567 8628 : }
2568 :
2569 : /*
2570 : * Go through the free lists for the given migratetype and remove
2571 : * the smallest available page from the freelists
2572 : */
2573 : static __always_inline
2574 : struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
2575 : int migratetype)
2576 : {
2577 : unsigned int current_order;
2578 : struct free_area *area;
2579 : struct page *page;
2580 :
2581 : /* Find a page of the appropriate size in the preferred list */
2582 24902 : for (current_order = order; current_order < MAX_ORDER; ++current_order) {
2583 12446 : area = &(zone->free_area[current_order]);
2584 12446 : page = get_page_from_free_area(area, migratetype);
2585 12446 : if (!page)
2586 6195 : continue;
2587 6251 : del_page_from_free_list(page, zone, current_order);
2588 12502 : expand(zone, page, order, current_order, migratetype);
2589 6251 : set_pcppage_migratetype(page, migratetype);
2590 : trace_mm_page_alloc_zone_locked(page, order, migratetype,
2591 : pcp_allowed_order(order) &&
2592 : migratetype < MIGRATE_PCPTYPES);
2593 : return page;
2594 : }
2595 :
2596 : return NULL;
2597 : }
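
The loop above simply scans the free areas from the requested order upwards and takes the first order that has a page of the wanted migratetype; expand() then splits off anything beyond the requested size. A minimal sketch of that search with an invented per-order free count array:

	#include <stdio.h>

	#define MAX_ORDER 11

	int main(void)
	{
		int nr_free[MAX_ORDER] = { 0, 0, 0, 5, 2, 0, 1, 0, 0, 0, 3 };
		unsigned int order = 1, current_order;

		for (current_order = order; current_order < MAX_ORDER; current_order++) {
			if (nr_free[current_order]) {
				printf("order-%u request satisfied from order-%u list\n",
				       order, current_order);
				return 0;
			}
		}
		printf("no page available\n");
		return 0;
	}
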
2598 :
2599 :
2600 : /*
2601 : * This array describes the order in which free lists are fallen back to
2602 : * when the free lists for the desired migratetype are depleted.
2603 : *
2604 : * The other migratetypes do not have fallbacks.
2605 : */
2606 : static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
2607 : [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE },
2608 : [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
2609 : [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE },
2610 : };
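
As a rough illustration of how the table above is consulted: for a depleted MIGRATE_MOVABLE request the candidate lists are tried in the listed order, reclaimable first, then unmovable. The enum and names below are local stand-ins, not the kernel definitions:

	#include <stdio.h>

	enum { UNMOVABLE, MOVABLE, RECLAIMABLE, PCPTYPES };

	static const int fallbacks[PCPTYPES][PCPTYPES - 1] = {
		[UNMOVABLE]   = { RECLAIMABLE, MOVABLE },
		[MOVABLE]     = { RECLAIMABLE, UNMOVABLE },
		[RECLAIMABLE] = { UNMOVABLE, MOVABLE },
	};

	static const char *names[] = { "unmovable", "movable", "reclaimable" };

	int main(void)
	{
		int i;

		for (i = 0; i < PCPTYPES - 1; i++)
			printf("movable falls back to %s\n",
			       names[fallbacks[MOVABLE][i]]);
		return 0;
	}
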
2611 :
2612 : #ifdef CONFIG_CMA
2613 : static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
2614 : unsigned int order)
2615 : {
2616 : return __rmqueue_smallest(zone, order, MIGRATE_CMA);
2617 : }
2618 : #else
2619 : static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
2620 : unsigned int order) { return NULL; }
2621 : #endif
2622 :
2623 : /*
2624 : * Move the free pages in a range to the freelist tail of the requested type.
2625 : * Note that start_pfn and end_pfn are not aligned on a pageblock
2626 : * boundary. If alignment is required, use move_freepages_block().
2627 : */
2628 0 : static int move_freepages(struct zone *zone,
2629 : unsigned long start_pfn, unsigned long end_pfn,
2630 : int migratetype, int *num_movable)
2631 : {
2632 : struct page *page;
2633 : unsigned long pfn;
2634 : unsigned int order;
2635 0 : int pages_moved = 0;
2636 :
2637 0 : for (pfn = start_pfn; pfn <= end_pfn;) {
2638 0 : page = pfn_to_page(pfn);
2639 0 : if (!PageBuddy(page)) {
2640 : /*
2641 : * We assume that pages that could be isolated for
2642 : * migration are movable. But we don't actually try
2643 : * isolating, as that would be expensive.
2644 : */
2645 0 : if (num_movable &&
2646 0 : (PageLRU(page) || __PageMovable(page)))
2647 0 : (*num_movable)++;
2648 0 : pfn++;
2649 0 : continue;
2650 : }
2651 :
2652 : /* Make sure we are not inadvertently changing nodes */
2653 : VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
2654 : VM_BUG_ON_PAGE(page_zone(page) != zone, page);
2655 :
2656 0 : order = buddy_order(page);
2657 0 : move_to_free_list(page, zone, order, migratetype);
2658 0 : pfn += 1 << order;
2659 0 : pages_moved += 1 << order;
2660 : }
2661 :
2662 0 : return pages_moved;
2663 : }
2664 :
2665 0 : int move_freepages_block(struct zone *zone, struct page *page,
2666 : int migratetype, int *num_movable)
2667 : {
2668 : unsigned long start_pfn, end_pfn, pfn;
2669 :
2670 0 : if (num_movable)
2671 0 : *num_movable = 0;
2672 :
2673 0 : pfn = page_to_pfn(page);
2674 0 : start_pfn = pageblock_start_pfn(pfn);
2675 0 : end_pfn = pageblock_end_pfn(pfn) - 1;
2676 :
2677 : /* Do not cross zone boundaries */
2678 0 : if (!zone_spans_pfn(zone, start_pfn))
2679 0 : start_pfn = pfn;
2680 0 : if (!zone_spans_pfn(zone, end_pfn))
2681 : return 0;
2682 :
2683 0 : return move_freepages(zone, start_pfn, end_pfn, migratetype,
2684 : num_movable);
2685 : }
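
move_freepages_block() rounds the pfn down and up to the enclosing pageblock and trims the start if the zone does not span it. A worked example with a 512-page pageblock and invented pfn/zone values (the early return for a block end outside the zone is omitted):

	#include <stdio.h>

	#define PAGEBLOCK_NR 512UL

	int main(void)
	{
		unsigned long pfn = 1000, zone_start = 768;
		unsigned long start = pfn & ~(PAGEBLOCK_NR - 1);	/* 512 */
		unsigned long end = start + PAGEBLOCK_NR - 1;		/* 1023 */

		if (start < zone_start)		/* zone does not span the block start */
			start = pfn;
		printf("move pfns [%lu, %lu]\n", start, end);
		return 0;
	}
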
2686 :
2687 : static void change_pageblock_range(struct page *pageblock_page,
2688 : int start_order, int migratetype)
2689 : {
2690 5 : int nr_pageblocks = 1 << (start_order - pageblock_order);
2691 :
2692 10 : while (nr_pageblocks--) {
2693 5 : set_pageblock_migratetype(pageblock_page, migratetype);
2694 5 : pageblock_page += pageblock_nr_pages;
2695 : }
2696 : }
2697 :
2698 : /*
2699 : * When we are falling back to another migratetype during allocation, try to
2700 : * steal extra free pages from the same pageblocks to satisfy further
2701 : * allocations, instead of polluting multiple pageblocks.
2702 : *
2703 : * If we are stealing a relatively large buddy page, it is likely there will
2704 : * be more free pages in the pageblock, so try to steal them all. For
2705 : * reclaimable and unmovable allocations, we steal regardless of page size,
2706 : * as fragmentation caused by those allocations polluting movable pageblocks
2707 : * is worse than movable allocations stealing from unmovable and reclaimable
2708 : * pageblocks.
2709 : */
2710 : static bool can_steal_fallback(unsigned int order, int start_mt)
2711 : {
2712 : /*
2713 : * This order check is intentionally kept even though the next check
2714 : * uses a more relaxed one. The reason is that we can steal the whole
2715 : * pageblock if this condition is met, whereas the check below does
2716 : * not guarantee that and is just a heuristic, so it could be changed
2717 : * anytime.
2718 : */
2719 5 : if (order >= pageblock_order)
2720 : return true;
2721 :
2722 0 : if (order >= pageblock_order / 2 ||
2723 0 : start_mt == MIGRATE_RECLAIMABLE ||
2724 0 : start_mt == MIGRATE_UNMOVABLE ||
2725 : page_group_by_mobility_disabled)
2726 : return true;
2727 :
2728 : return false;
2729 : }
2730 :
2731 0 : static inline bool boost_watermark(struct zone *zone)
2732 : {
2733 : unsigned long max_boost;
2734 :
2735 0 : if (!watermark_boost_factor)
2736 : return false;
2737 : /*
2738 : * Don't bother in zones that are unlikely to produce results.
2739 : * On small machines, including kdump capture kernels running
2740 : * in a small area, boosting the watermark can cause an out of
2741 : * memory situation immediately.
2742 : */
2743 0 : if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
2744 : return false;
2745 :
2746 0 : max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2747 : watermark_boost_factor, 10000);
2748 :
2749 : /*
2750 : * The high watermark may be uninitialised if fragmentation occurs
2751 : * very early in boot, so do not boost. We also do not fall
2752 : * through and boost by pageblock_nr_pages, as allocations failing
2753 : * that early mean that reclaim is not going to help, and it may
2754 : * even be impossible to reclaim the boosted watermark, resulting
2755 : * in a hang.
2756 : */
2757 0 : if (!max_boost)
2758 : return false;
2759 :
2760 0 : max_boost = max(pageblock_nr_pages, max_boost);
2761 :
2762 0 : zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2763 : max_boost);
2764 :
2765 0 : return true;
2766 : }
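
The boost arithmetic above scales the high watermark by watermark_boost_factor (in units where 10000 = 100%), then grows the boost by one pageblock per fallback event, capped at that maximum. A worked example with invented values (plain multiplication is used here; the kernel uses mult_frac() to avoid overflow):

	#include <stdio.h>

	int main(void)
	{
		unsigned long high_wmark = 12800;	/* pages, hypothetical */
		unsigned long factor = 15000;		/* watermark_boost_factor = 150% */
		unsigned long pageblock_nr_pages = 512;
		unsigned long max_boost, boost = 0;

		max_boost = high_wmark * factor / 10000;	/* 19200 pages */
		if (max_boost < pageblock_nr_pages)
			max_boost = pageblock_nr_pages;

		boost = boost + pageblock_nr_pages;		/* one fallback event */
		if (boost > max_boost)
			boost = max_boost;

		printf("watermark_boost = %lu (cap %lu)\n", boost, max_boost);
		return 0;
	}
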
2767 :
2768 : /*
2769 : * This function implements actual steal behaviour. If order is large enough,
2770 : * we can steal whole pageblock. If not, we first move freepages in this
2771 : * pageblock to our migratetype and determine how many already-allocated pages
2772 : * are there in the pageblock with a compatible migratetype. If at least half
2773 : * of pages are free or compatible, we can change migratetype of the pageblock
2774 : * itself, so pages freed in the future will be put on the correct free list.
2775 : */
2776 5 : static void steal_suitable_fallback(struct zone *zone, struct page *page,
2777 : unsigned int alloc_flags, int start_type, bool whole_block)
2778 : {
2779 10 : unsigned int current_order = buddy_order(page);
2780 : int free_pages, movable_pages, alike_pages;
2781 : int old_block_type;
2782 :
2783 10 : old_block_type = get_pageblock_migratetype(page);
2784 :
2785 : /*
2786 : * This can happen due to races and we want to prevent broken
2787 : * highatomic accounting.
2788 : */
2789 5 : if (is_migrate_highatomic(old_block_type))
2790 : goto single_page;
2791 :
2792 : /* Take ownership for orders >= pageblock_order */
2793 5 : if (current_order >= pageblock_order) {
2794 5 : change_pageblock_range(page, current_order, start_type);
2795 : goto single_page;
2796 : }
2797 :
2798 : /*
2799 : * Boost watermarks to increase reclaim pressure to reduce the
2800 : * likelihood of future fallbacks. Wake kswapd now as the node
2801 : * may be balanced overall and kswapd will not wake naturally.
2802 : */
2803 0 : if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
2804 0 : set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
2805 :
2806 : /* We are not allowed to try stealing from the whole block */
2807 0 : if (!whole_block)
2808 : goto single_page;
2809 :
2810 0 : free_pages = move_freepages_block(zone, page, start_type,
2811 : &movable_pages);
2812 : /*
2813 : * Determine how many pages are compatible with our allocation.
2814 : * For movable allocation, it's the number of movable pages which
2815 : * we just obtained. For other types it's a bit more tricky.
2816 : */
2817 0 : if (start_type == MIGRATE_MOVABLE) {
2818 0 : alike_pages = movable_pages;
2819 : } else {
2820 : /*
2821 : * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2822 : * to MOVABLE pageblock, consider all non-movable pages as
2823 : * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2824 : * vice versa, be conservative since we can't distinguish the
2825 : * exact migratetype of non-movable pages.
2826 : */
2827 0 : if (old_block_type == MIGRATE_MOVABLE)
2828 0 : alike_pages = pageblock_nr_pages
2829 0 : - (free_pages + movable_pages);
2830 : else
2831 : alike_pages = 0;
2832 : }
2833 :
2834 : /* moving whole block can fail due to zone boundary conditions */
2835 0 : if (!free_pages)
2836 : goto single_page;
2837 :
2838 : /*
2839 : * If a sufficient number of pages in the block are either free or of
2840 : * comparable migratability as our allocation, claim the whole block.
2841 : */
2842 0 : if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2843 : page_group_by_mobility_disabled)
2844 0 : set_pageblock_migratetype(page, start_type);
2845 :
2846 0 : return;
2847 :
2848 : single_page:
2849 5 : move_to_free_list(page, zone, current_order, start_type);
2850 : }
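
The claim decision at the end of the whole-block path boils down to a simple threshold: the pageblock is re-typed when free plus compatible pages reach at least half a pageblock. A worked example with invented counts and pageblock_order = 9:

	#include <stdio.h>

	int main(void)
	{
		int pageblock_order = 9;		/* 512-page pageblock */
		int free_pages = 180, alike_pages = 90;	/* hypothetical counts */

		if (free_pages + alike_pages >= (1 << (pageblock_order - 1)))
			printf("claim the whole pageblock\n");
		else
			printf("steal only the buddy page\n");
		return 0;
	}
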
2851 :
2852 : /*
2853 : * Check whether there is a suitable fallback freepage with the requested order.
2854 : * If only_stealable is true, this function returns fallback_mt only if
2855 : * we can steal the other free pages all together. This helps to reduce
2856 : * fragmentation due to mixed-migratetype pages in one pageblock.
2857 : */
2858 5 : int find_suitable_fallback(struct free_area *area, unsigned int order,
2859 : int migratetype, bool only_stealable, bool *can_steal)
2860 : {
2861 : int i;
2862 : int fallback_mt;
2863 :
2864 5 : if (area->nr_free == 0)
2865 : return -1;
2866 :
2867 5 : *can_steal = false;
2868 10 : for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
2869 10 : fallback_mt = fallbacks[migratetype][i];
2870 10 : if (free_area_empty(area, fallback_mt))
2871 5 : continue;
2872 :
2873 5 : if (can_steal_fallback(order, migratetype))
2874 5 : *can_steal = true;
2875 :
2876 5 : if (!only_stealable)
2877 : return fallback_mt;
2878 :
2879 0 : if (*can_steal)
2880 : return fallback_mt;
2881 : }
2882 :
2883 : return -1;
2884 : }
2885 :
2886 : /*
2887 : * Reserve a pageblock for exclusive use of high-order atomic allocations if
2888 : * there are no empty page blocks that contain a page with a suitable order
2889 : */
2890 0 : static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2891 : unsigned int alloc_order)
2892 : {
2893 : int mt;
2894 : unsigned long max_managed, flags;
2895 :
2896 : /*
2897 : * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2898 : * The check is race-prone but harmless.
2899 : */
2900 0 : max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
2901 0 : if (zone->nr_reserved_highatomic >= max_managed)
2902 : return;
2903 :
2904 0 : spin_lock_irqsave(&zone->lock, flags);
2905 :
2906 : /* Recheck the nr_reserved_highatomic limit under the lock */
2907 0 : if (zone->nr_reserved_highatomic >= max_managed)
2908 : goto out_unlock;
2909 :
2910 : /* Yoink! */
2911 0 : mt = get_pageblock_migratetype(page);
2912 : /* Only reserve normal pageblocks (i.e., they can merge with others) */
2913 0 : if (migratetype_is_mergeable(mt)) {
2914 0 : zone->nr_reserved_highatomic += pageblock_nr_pages;
2915 0 : set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2916 0 : move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
2917 : }
2918 :
2919 : out_unlock:
2920 0 : spin_unlock_irqrestore(&zone->lock, flags);
2921 : }
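
The reservation cap above works out to roughly 1% of the zone's managed pages plus one pageblock of slack. A worked example with invented zone numbers:

	#include <stdio.h>

	int main(void)
	{
		unsigned long managed_pages = 262144;	/* 1 GiB of 4 KiB pages */
		unsigned long pageblock_nr_pages = 512;
		unsigned long reserved = 2048;		/* currently reserved */
		unsigned long max_managed = managed_pages / 100 + pageblock_nr_pages;

		printf("cap = %lu pages; %s\n", max_managed,
		       reserved >= max_managed ? "stop reserving" : "may reserve more");
		return 0;
	}
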
2922 :
2923 : /*
2924 : * Used when an allocation is about to fail under memory pressure. This
2925 : * potentially hurts the reliability of high-order allocations when under
2926 : * intense memory pressure but failed atomic allocations should be easier
2927 : * to recover from than an OOM.
2928 : *
2929 : * If @force is true, try to unreserve a pageblock even though highatomic
2930 : * pageblock is exhausted.
2931 : */
2932 0 : static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2933 : bool force)
2934 : {
2935 0 : struct zonelist *zonelist = ac->zonelist;
2936 : unsigned long flags;
2937 : struct zoneref *z;
2938 : struct zone *zone;
2939 : struct page *page;
2940 : int order;
2941 : bool ret;
2942 :
2943 0 : for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
2944 : ac->nodemask) {
2945 : /*
2946 : * Preserve at least one pageblock unless memory pressure
2947 : * is really high.
2948 : */
2949 0 : if (!force && zone->nr_reserved_highatomic <=
2950 : pageblock_nr_pages)
2951 0 : continue;
2952 :
2953 0 : spin_lock_irqsave(&zone->lock, flags);
2954 0 : for (order = 0; order < MAX_ORDER; order++) {
2955 0 : struct free_area *area = &(zone->free_area[order]);
2956 :
2957 0 : page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
2958 0 : if (!page)
2959 0 : continue;
2960 :
2961 : /*
2962 : * In the page freeing path, the migratetype change is racy, so
2963 : * we may encounter several free pages in a pageblock in this
2964 : * loop even though we changed the pageblock type from
2965 : * highatomic to ac->migratetype. So we should adjust the
2966 : * count only once.
2967 : */
2968 0 : if (is_migrate_highatomic_page(page)) {
2969 : /*
2970 : * It should never happen but changes to
2971 : * locking could inadvertently allow a per-cpu
2972 : * drain to add pages to MIGRATE_HIGHATOMIC
2973 : * while unreserving so be safe and watch for
2974 : * underflows.
2975 : */
2976 0 : zone->nr_reserved_highatomic -= min(
2977 : pageblock_nr_pages,
2978 : zone->nr_reserved_highatomic);
2979 : }
2980 :
2981 : /*
2982 : * Convert to ac->migratetype and avoid the normal
2983 : * pageblock stealing heuristics. Minimally, the caller
2984 : * is doing the work and needs the pages. More
2985 : * importantly, if the block was always converted to
2986 : * MIGRATE_UNMOVABLE or another type then the number
2987 : * of pageblocks that cannot be completely freed
2988 : * may increase.
2989 : */
2990 0 : set_pageblock_migratetype(page, ac->migratetype);
2991 0 : ret = move_freepages_block(zone, page, ac->migratetype,
2992 : NULL);
2993 0 : if (ret) {
2994 0 : spin_unlock_irqrestore(&zone->lock, flags);
2995 0 : return ret;
2996 : }
2997 : }
2998 0 : spin_unlock_irqrestore(&zone->lock, flags);
2999 : }
3000 :
3001 : return false;
3002 : }
3003 :
3004 : /*
3005 : * Try finding a free buddy page on the fallback list and put it on the free
3006 : * list of requested migratetype, possibly along with other pages from the same
3007 : * block, depending on fragmentation avoidance heuristics. Returns true if
3008 : * fallback was found so that __rmqueue_smallest() can grab it.
3009 : *
3010 : * The use of signed ints for order and current_order is a deliberate
3011 : * deviation from the rest of this file, to make the for loop
3012 : * condition simpler.
3013 : */
3014 : static __always_inline bool
3015 : __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
3016 : unsigned int alloc_flags)
3017 : {
3018 : struct free_area *area;
3019 : int current_order;
3020 5 : int min_order = order;
3021 : struct page *page;
3022 : int fallback_mt;
3023 : bool can_steal;
3024 :
3025 : /*
3026 : * Do not steal pages from freelists belonging to other pageblocks
3027 : * i.e. orders < pageblock_order. If there are no local zones free,
3028 : * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
3029 : */
3030 : if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
3031 : min_order = pageblock_order;
3032 :
3033 : /*
3034 : * Find the largest available free page in the other list. This roughly
3035 : * approximates finding the pageblock with the most free pages, which
3036 : * would be too costly to do exactly.
3037 : */
3038 10 : for (current_order = MAX_ORDER - 1; current_order >= min_order;
3039 0 : --current_order) {
3040 5 : area = &(zone->free_area[current_order]);
3041 5 : fallback_mt = find_suitable_fallback(area, current_order,
3042 : start_migratetype, false, &can_steal);
3043 5 : if (fallback_mt == -1)
3044 0 : continue;
3045 :
3046 : /*
3047 : * We cannot steal all free pages from the pageblock and the
3048 : * requested migratetype is movable. In that case it's better to
3049 : * steal and split the smallest available page instead of the
3050 : * largest available page, because even if the next movable
3051 : * allocation falls back into a different pageblock than this
3052 : * one, it won't cause permanent fragmentation.
3053 : */
3054 5 : if (!can_steal && start_migratetype == MIGRATE_MOVABLE
3055 0 : && current_order > order)
3056 : goto find_smallest;
3057 :
3058 : goto do_steal;
3059 : }
3060 :
3061 : return false;
3062 :
3063 : find_smallest:
3064 0 : for (current_order = order; current_order < MAX_ORDER;
3065 0 : current_order++) {
3066 0 : area = &(zone->free_area[current_order]);
3067 0 : fallback_mt = find_suitable_fallback(area, current_order,
3068 : start_migratetype, false, &can_steal);
3069 0 : if (fallback_mt != -1)
3070 : break;
3071 : }
3072 :
3073 : /*
3074 : * This should not happen - we already found a suitable fallback
3075 : * when looking for the largest page.
3076 : */
3077 : VM_BUG_ON(current_order == MAX_ORDER);
3078 :
3079 : do_steal:
3080 5 : page = get_page_from_free_area(area, fallback_mt);
3081 :
3082 5 : steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
3083 : can_steal);
3084 :
3085 5 : trace_mm_page_alloc_extfrag(page, order, current_order,
3086 : start_migratetype, fallback_mt);
3087 :
3088 : return true;
3089 :
3090 : }
3091 :
3092 : /*
3093 : * Do the hard work of removing an element from the buddy allocator.
3094 : * Call me with the zone->lock already held.
3095 : */
3096 : static __always_inline struct page *
3097 : __rmqueue(struct zone *zone, unsigned int order, int migratetype,
3098 : unsigned int alloc_flags)
3099 : {
3100 : struct page *page;
3101 :
3102 : if (IS_ENABLED(CONFIG_CMA)) {
3103 : /*
3104 : * Balance movable allocations between regular and CMA areas by
3105 : * allocating from CMA when over half of the zone's free memory
3106 : * is in the CMA area.
3107 : */
3108 : if (alloc_flags & ALLOC_CMA &&
3109 : zone_page_state(zone, NR_FREE_CMA_PAGES) >
3110 : zone_page_state(zone, NR_FREE_PAGES) / 2) {
3111 : page = __rmqueue_cma_fallback(zone, order);
3112 : if (page)
3113 : return page;
3114 : }
3115 : }
3116 : retry:
3117 6256 : page = __rmqueue_smallest(zone, order, migratetype);
3118 6256 : if (unlikely(!page)) {
3119 5 : if (alloc_flags & ALLOC_CMA)
3120 0 : page = __rmqueue_cma_fallback(zone, order);
3121 :
3122 10 : if (!page && __rmqueue_fallback(zone, order, migratetype,
3123 : alloc_flags))
3124 : goto retry;
3125 : }
3126 : return page;
3127 : }
3128 :
3129 : /*
3130 : * Obtain a specified number of elements from the buddy allocator, all under
3131 : * a single hold of the lock, for efficiency. Add them to the supplied list.
3132 : * Returns the number of new pages which were placed at *list.
3133 : */
3134 115 : static int rmqueue_bulk(struct zone *zone, unsigned int order,
3135 : unsigned long count, struct list_head *list,
3136 : int migratetype, unsigned int alloc_flags)
3137 : {
3138 : unsigned long flags;
3139 115 : int i, allocated = 0;
3140 :
3141 115 : spin_lock_irqsave(&zone->lock, flags);
3142 6366 : for (i = 0; i < count; ++i) {
3143 6251 : struct page *page = __rmqueue(zone, order, migratetype,
3144 : alloc_flags);
3145 6251 : if (unlikely(page == NULL))
3146 : break;
3147 :
3148 6251 : if (unlikely(check_pcp_refill(page, order)))
3149 0 : continue;
3150 :
3151 : /*
3152 : * Split buddy pages returned by expand() are received here in
3153 : * physical page order. The page is added to the tail of the
3154 : * caller's list. From the caller's perspective, the linked list
3155 : * is ordered by page number under some conditions. This is
3156 : * useful for IO devices that can only move forward from the
3157 : * head of the list, and thus also in physical page order; such
3158 : * devices can merge IO requests if the physical pages are
3159 : * ordered properly.
3160 : */
3161 12502 : list_add_tail(&page->pcp_list, list);
3162 6251 : allocated++;
3163 : if (is_migrate_cma(get_pcppage_migratetype(page)))
3164 : __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
3165 : -(1 << order));
3166 : }
3167 :
3168 : /*
3169 : * i pages were removed from the buddy list even if some leaked due
3170 : * to check_pcp_refill() failing, so adjust NR_FREE_PAGES based
3171 : * on i. Do not confuse this with 'allocated', which is the number
3172 : * of pages added to the pcp list.
3173 : */
3174 230 : __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
3175 230 : spin_unlock_irqrestore(&zone->lock, flags);
3176 115 : return allocated;
3177 : }
3178 :
3179 : #ifdef CONFIG_NUMA
3180 : /*
3181 : * Called from the vmstat counter updater to drain pagesets of this
3182 : * currently executing processor on remote nodes after they have
3183 : * expired.
3184 : */
3185 : void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
3186 : {
3187 : int to_drain, batch;
3188 :
3189 : batch = READ_ONCE(pcp->batch);
3190 : to_drain = min(pcp->count, batch);
3191 : if (to_drain > 0) {
3192 : spin_lock(&pcp->lock);
3193 : free_pcppages_bulk(zone, to_drain, pcp, 0);
3194 : spin_unlock(&pcp->lock);
3195 : }
3196 : }
3197 : #endif
3198 :
3199 : /*
3200 : * Drain pcplists of the indicated processor and zone.
3201 : */
3202 0 : static void drain_pages_zone(unsigned int cpu, struct zone *zone)
3203 : {
3204 : struct per_cpu_pages *pcp;
3205 :
3206 0 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
3207 0 : if (pcp->count) {
3208 0 : spin_lock(&pcp->lock);
3209 0 : free_pcppages_bulk(zone, pcp->count, pcp, 0);
3210 0 : spin_unlock(&pcp->lock);
3211 : }
3212 0 : }
3213 :
3214 : /*
3215 : * Drain pcplists of all zones on the indicated processor.
3216 : */
3217 0 : static void drain_pages(unsigned int cpu)
3218 : {
3219 : struct zone *zone;
3220 :
3221 0 : for_each_populated_zone(zone) {
3222 0 : drain_pages_zone(cpu, zone);
3223 : }
3224 0 : }
3225 :
3226 : /*
3227 : * Spill all of this CPU's per-cpu pages back into the buddy allocator.
3228 : */
3229 0 : void drain_local_pages(struct zone *zone)
3230 : {
3231 0 : int cpu = smp_processor_id();
3232 :
3233 0 : if (zone)
3234 0 : drain_pages_zone(cpu, zone);
3235 : else
3236 0 : drain_pages(cpu);
3237 0 : }
3238 :
3239 : /*
3240 : * The implementation of drain_all_pages(), exposing an extra parameter to
3241 : * drain on all cpus.
3242 : *
3243 : * drain_all_pages() is optimized to only execute on cpus where pcplists are
3244 : * not empty. The check for non-emptiness can however race with a free to
3245 : * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
3246 : * that need the guarantee that every CPU has drained can disable the
3247 : * optimizing racy check.
3248 : */
3249 0 : static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
3250 : {
3251 : int cpu;
3252 :
3253 : /*
3254 : * Allocate in the BSS so we won't require allocation in
3255 : * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
3256 : */
3257 : static cpumask_t cpus_with_pcps;
3258 :
3259 : /*
3260 : * Do not drain if one is already in progress unless it's specific to
3261 : * a zone. Such callers are primarily CMA and memory hotplug and need
3262 : * the drain to be complete when the call returns.
3263 : */
3264 0 : if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
3265 0 : if (!zone)
3266 : return;
3267 0 : mutex_lock(&pcpu_drain_mutex);
3268 : }
3269 :
3270 : /*
3271 : * We don't care about racing with CPU hotplug events,
3272 : * as the offline notification will cause the notified
3273 : * cpu to drain that CPU's pcps, and on_each_cpu_mask
3274 : * disables preemption as part of its processing.
3275 : */
3276 0 : for_each_online_cpu(cpu) {
3277 : struct per_cpu_pages *pcp;
3278 : struct zone *z;
3279 0 : bool has_pcps = false;
3280 :
3281 0 : if (force_all_cpus) {
3282 : /*
3283 : * The pcp.count check is racy, some callers need a
3284 : * guarantee that no cpu is missed.
3285 : */
3286 : has_pcps = true;
3287 0 : } else if (zone) {
3288 0 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
3289 0 : if (pcp->count)
3290 0 : has_pcps = true;
3291 : } else {
3292 0 : for_each_populated_zone(z) {
3293 0 : pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
3294 0 : if (pcp->count) {
3295 : has_pcps = true;
3296 : break;
3297 : }
3298 : }
3299 : }
3300 :
3301 0 : if (has_pcps)
3302 0 : cpumask_set_cpu(cpu, &cpus_with_pcps);
3303 : else
3304 : cpumask_clear_cpu(cpu, &cpus_with_pcps);
3305 : }
3306 :
3307 0 : for_each_cpu(cpu, &cpus_with_pcps) {
3308 0 : if (zone)
3309 0 : drain_pages_zone(cpu, zone);
3310 : else
3311 0 : drain_pages(cpu);
3312 : }
3313 :
3314 0 : mutex_unlock(&pcpu_drain_mutex);
3315 : }
3316 :
3317 : /*
3318 : * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
3319 : *
3320 : * When zone parameter is non-NULL, spill just the single zone's pages.
3321 : */
3322 0 : void drain_all_pages(struct zone *zone)
3323 : {
3324 0 : __drain_all_pages(zone, false);
3325 0 : }
3326 :
3327 : #ifdef CONFIG_HIBERNATION
3328 :
3329 : /*
3330 : * Touch the watchdog for every WD_PAGE_COUNT pages.
3331 : */
3332 : #define WD_PAGE_COUNT (128*1024)
3333 :
3334 : void mark_free_pages(struct zone *zone)
3335 : {
3336 : unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
3337 : unsigned long flags;
3338 : unsigned int order, t;
3339 : struct page *page;
3340 :
3341 : if (zone_is_empty(zone))
3342 : return;
3343 :
3344 : spin_lock_irqsave(&zone->lock, flags);
3345 :
3346 : max_zone_pfn = zone_end_pfn(zone);
3347 : for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
3348 : if (pfn_valid(pfn)) {
3349 : page = pfn_to_page(pfn);
3350 :
3351 : if (!--page_count) {
3352 : touch_nmi_watchdog();
3353 : page_count = WD_PAGE_COUNT;
3354 : }
3355 :
3356 : if (page_zone(page) != zone)
3357 : continue;
3358 :
3359 : if (!swsusp_page_is_forbidden(page))
3360 : swsusp_unset_page_free(page);
3361 : }
3362 :
3363 : for_each_migratetype_order(order, t) {
3364 : list_for_each_entry(page,
3365 : &zone->free_area[order].free_list[t], buddy_list) {
3366 : unsigned long i;
3367 :
3368 : pfn = page_to_pfn(page);
3369 : for (i = 0; i < (1UL << order); i++) {
3370 : if (!--page_count) {
3371 : touch_nmi_watchdog();
3372 : page_count = WD_PAGE_COUNT;
3373 : }
3374 : swsusp_set_page_free(pfn_to_page(pfn + i));
3375 : }
3376 : }
3377 : }
3378 : spin_unlock_irqrestore(&zone->lock, flags);
3379 : }
3380 : #endif /* CONFIG_HIBERNATION */
3381 :
3382 50643 : static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
3383 : unsigned int order)
3384 : {
3385 : int migratetype;
3386 :
3387 50643 : if (!free_pcp_prepare(page, order))
3388 : return false;
3389 :
3390 50643 : migratetype = get_pfnblock_migratetype(page, pfn);
3391 101286 : set_pcppage_migratetype(page, migratetype);
3392 50643 : return true;
3393 : }
3394 :
3395 : static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
3396 : bool free_high)
3397 : {
3398 : int min_nr_free, max_nr_free;
3399 :
3400 : /* Free everything if batch freeing high-order pages. */
3401 14 : if (unlikely(free_high))
3402 : return pcp->count;
3403 :
3404 : /* Check for PCP disabled or boot pageset */
3405 14 : if (unlikely(high < batch))
3406 : return 1;
3407 :
3408 : /* Leave at least pcp->batch pages on the list */
3409 14 : min_nr_free = batch;
3410 14 : max_nr_free = high - batch;
3411 :
3412 : /*
3413 : * Double the number of pages freed each time there is subsequent
3414 : * freeing of pages without any allocation.
3415 : */
3416 14 : batch <<= pcp->free_factor;
3417 14 : if (batch < max_nr_free)
3418 13 : pcp->free_factor++;
3419 14 : batch = clamp(batch, min_nr_free, max_nr_free);
3420 :
3421 : return batch;
3422 : }
3423 :
3424 50643 : static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
3425 : bool free_high)
3426 : {
3427 50643 : int high = READ_ONCE(pcp->high);
3428 :
3429 50643 : if (unlikely(!high || free_high))
3430 : return 0;
3431 :
3432 101286 : if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
3433 : return high;
3434 :
3435 : /*
3436 : * If reclaim is active, limit the number of pages that can be
3437 : * stored on pcp lists
3438 : */
3439 0 : return min(READ_ONCE(pcp->batch) << 2, high);
3440 : }
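/*
 * Illustrative numbers for the cap above (not taken from any specific
 * configuration): with pcp->high = 250 and pcp->batch = 30, the pcplist is
 * normally allowed to grow to 250 pages before free_unref_page_commit()
 * trims it.  While ZONE_RECLAIM_ACTIVE is set the threshold drops to
 * min(30 << 2, 250) = 120, so freed pages are pushed back to the buddy
 * lists sooner instead of sitting in the per-cpu cache while reclaim is
 * trying to raise the free counts.  When free_high is set the threshold
 * is 0 and the next commit drains the list entirely.
 */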
3441 :
3442 50643 : static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
3443 : struct page *page, int migratetype,
3444 : unsigned int order)
3445 : {
3446 : int high;
3447 : int pindex;
3448 : bool free_high;
3449 :
3450 101286 : __count_vm_events(PGFREE, 1 << order);
3451 101286 : pindex = order_to_pindex(migratetype, order);
3452 101286 : list_add(&page->pcp_list, &pcp->lists[pindex]);
3453 50643 : pcp->count += 1 << order;
3454 :
3455 : /*
3456 : * As high-order pages other than THPs stored on PCP can contribute
3457 : * to fragmentation, limit the number stored when PCP is heavily
3458 : * freeing without allocation. The remainder after bulk freeing
3459 : * stops will be drained from vmstat refresh context.
3460 : */
3461 50643 : free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
3462 :
3463 50643 : high = nr_pcp_high(pcp, zone, free_high);
3464 50643 : if (pcp->count >= high) {
3465 14 : int batch = READ_ONCE(pcp->batch);
3466 :
3467 28 : free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
3468 : }
3469 50643 : }
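/*
 * A sketch of how the free_high path above behaves, with assumed values:
 * a caller frees a stream of order-3 pages with no allocations in between.
 * The first free sees pcp->free_factor == 0, so free_high is false and the
 * pages accumulate on the pcplist as usual.  Once free_factor becomes
 * non-zero (bumped by nr_pcp_free() on an earlier trim), every further
 * order-1..PAGE_ALLOC_COSTLY_ORDER free sets free_high, nr_pcp_high()
 * returns 0, nr_pcp_free() returns pcp->count, and the whole list is
 * flushed to the buddy allocator.  Order-0 frees and larger-than-costly
 * (e.g. THP sized) frees never take this path.
 */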
3470 :
3471 : /*
3472 : * Free a pcp page
3473 : */
3474 50643 : void free_unref_page(struct page *page, unsigned int order)
3475 : {
3476 : unsigned long __maybe_unused UP_flags;
3477 : struct per_cpu_pages *pcp;
3478 : struct zone *zone;
3479 50643 : unsigned long pfn = page_to_pfn(page);
3480 : int migratetype;
3481 :
3482 50643 : if (!free_unref_page_prepare(page, pfn, order))
3483 : return;
3484 :
3485 : /*
3486 : * We only track unmovable, reclaimable and movable on pcp lists.
3487 : * Place ISOLATE pages on the isolated list because they are being
3488 : * offlined but treat HIGHATOMIC as movable pages so we can get those
3489 : * areas back if necessary. Otherwise, we may have to free
3490 : * excessively into the page allocator
3491 : */
3492 101286 : migratetype = get_pcppage_migratetype(page);
3493 50643 : if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
3494 : if (unlikely(is_migrate_isolate(migratetype))) {
3495 : free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
3496 : return;
3497 : }
3498 0 : migratetype = MIGRATE_MOVABLE;
3499 : }
3500 :
3501 50643 : zone = page_zone(page);
3502 50643 : pcp_trylock_prepare(UP_flags);
3503 101286 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
3504 50643 : if (pcp) {
3505 50643 : free_unref_page_commit(zone, pcp, page, migratetype, order);
3506 101286 : pcp_spin_unlock(pcp);
3507 : } else {
3508 0 : free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
3509 : }
3510 50643 : pcp_trylock_finish(UP_flags);
3511 : }
3512 :
3513 : /*
3514 : * Free a list of 0-order pages
3515 : */
3516 0 : void free_unref_page_list(struct list_head *list)
3517 : {
3518 : unsigned long __maybe_unused UP_flags;
3519 : struct page *page, *next;
3520 0 : struct per_cpu_pages *pcp = NULL;
3521 0 : struct zone *locked_zone = NULL;
3522 0 : int batch_count = 0;
3523 : int migratetype;
3524 :
3525 : /* Prepare pages for freeing */
3526 0 : list_for_each_entry_safe(page, next, list, lru) {
3527 0 : unsigned long pfn = page_to_pfn(page);
3528 0 : if (!free_unref_page_prepare(page, pfn, 0)) {
3529 0 : list_del(&page->lru);
3530 0 : continue;
3531 : }
3532 :
3533 : /*
3534 : * Free isolated pages directly to the allocator, see
3535 : * comment in free_unref_page.
3536 : */
3537 : migratetype = get_pcppage_migratetype(page);
3538 : if (unlikely(is_migrate_isolate(migratetype))) {
3539 : list_del(&page->lru);
3540 : free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
3541 : continue;
3542 : }
3543 : }
3544 :
3545 0 : list_for_each_entry_safe(page, next, list, lru) {
3546 0 : struct zone *zone = page_zone(page);
3547 :
3548 0 : list_del(&page->lru);
3549 0 : migratetype = get_pcppage_migratetype(page);
3550 :
3551 : /*
3552 : * Either different zone requiring a different pcp lock or
3553 : * excessive lock hold times when freeing a large list of
3554 : * pages.
3555 : */
3556 0 : if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
3557 0 : if (pcp) {
3558 0 : pcp_spin_unlock(pcp);
3559 0 : pcp_trylock_finish(UP_flags);
3560 : }
3561 :
3562 0 : batch_count = 0;
3563 :
3564 : /*
3565 : * trylock is necessary as pages may be getting freed
3566 : * from IRQ or SoftIRQ context after an IO completion.
3567 : */
3568 0 : pcp_trylock_prepare(UP_flags);
3569 0 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
3570 0 : if (unlikely(!pcp)) {
3571 0 : pcp_trylock_finish(UP_flags);
3572 0 : free_one_page(zone, page, page_to_pfn(page),
3573 : 0, migratetype, FPI_NONE);
3574 0 : locked_zone = NULL;
3575 0 : continue;
3576 : }
3577 : locked_zone = zone;
3578 : }
3579 :
3580 : /*
3581 : * Non-isolated types over MIGRATE_PCPTYPES get added
3582 : * to the MIGRATE_MOVABLE pcp list.
3583 : */
3584 0 : if (unlikely(migratetype >= MIGRATE_PCPTYPES))
3585 0 : migratetype = MIGRATE_MOVABLE;
3586 :
3587 0 : trace_mm_page_free_batched(page);
3588 0 : free_unref_page_commit(zone, pcp, page, migratetype, 0);
3589 0 : batch_count++;
3590 : }
3591 :
3592 0 : if (pcp) {
3593 0 : pcp_spin_unlock(pcp);
3594 0 : pcp_trylock_finish(UP_flags);
3595 : }
3596 0 : }
3597 :
3598 : /*
3599 : * split_page takes a non-compound higher-order page, and splits it into
3600 : * n (1<<order) sub-pages: page[0..n]
3601 : * Each sub-page must be freed individually.
3602 : *
3603 : * Note: this is probably too low level an operation for use in drivers.
3604 : * Please consult with lkml before using this in your driver.
3605 : */
3606 0 : void split_page(struct page *page, unsigned int order)
3607 : {
3608 : int i;
3609 :
3610 : VM_BUG_ON_PAGE(PageCompound(page), page);
3611 : VM_BUG_ON_PAGE(!page_count(page), page);
3612 :
3613 0 : for (i = 1; i < (1 << order); i++)
3614 0 : set_page_refcounted(page + i);
3615 0 : split_page_owner(page, 1 << order);
3616 0 : split_page_memcg(page, 1 << order);
3617 0 : }
3618 : EXPORT_SYMBOL_GPL(split_page);
3619 :
3620 0 : int __isolate_free_page(struct page *page, unsigned int order)
3621 : {
3622 0 : struct zone *zone = page_zone(page);
3623 0 : int mt = get_pageblock_migratetype(page);
3624 :
3625 0 : if (!is_migrate_isolate(mt)) {
3626 : unsigned long watermark;
3627 : /*
3628 : * Obey watermarks as if the page was being allocated. We can
3629 : * emulate a high-order watermark check with a raised order-0
3630 : * watermark, because we already know our high-order page
3631 : * exists.
3632 : */
3633 0 : watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
3634 0 : if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
3635 : return 0;
3636 :
3637 0 : __mod_zone_freepage_state(zone, -(1UL << order), mt);
3638 : }
3639 :
3640 0 : del_page_from_free_list(page, zone, order);
3641 :
3642 : /*
3643 : * Set the pageblock if the isolated page is at least half of a
3644 : * pageblock
3645 : */
3646 0 : if (order >= pageblock_order - 1) {
3647 0 : struct page *endpage = page + (1 << order) - 1;
3648 0 : for (; page < endpage; page += pageblock_nr_pages) {
3649 0 : int mt = get_pageblock_migratetype(page);
3650 : /*
3651 : * Only change normal pageblocks (i.e., they can merge
3652 : * with others)
3653 : */
3654 0 : if (migratetype_is_mergeable(mt))
3655 0 : set_pageblock_migratetype(page,
3656 : MIGRATE_MOVABLE);
3657 : }
3658 : }
3659 :
3660 0 : return 1UL << order;
3661 : }
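/*
 * The watermark emulation above in numbers (illustrative only): isolating
 * an order-4 page from a zone whose min watermark is 1024 pages checks an
 * order-0 watermark of 1024 + 16 = 1040.  If that passes, removing the 16
 * pages still leaves the zone at or above its real min watermark, so the
 * isolation cannot push the zone into the reserves even though no
 * high-order watermark check was performed.
 */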
3662 :
3663 : /**
3664 : * __putback_isolated_page - Return a now-isolated page back where we got it
3665 : * @page: Page that was isolated
3666 : * @order: Order of the isolated page
3667 : * @mt: The page's pageblock's migratetype
3668 : *
3669 : * This function is meant to return a page pulled from the free lists via
3670 : * __isolate_free_page back to the free list it was pulled from.
3671 : */
3672 0 : void __putback_isolated_page(struct page *page, unsigned int order, int mt)
3673 : {
3674 0 : struct zone *zone = page_zone(page);
3675 :
3676 : /* zone lock should be held when this function is called */
3677 : lockdep_assert_held(&zone->lock);
3678 :
3679 : /* Return isolated page to tail of freelist. */
3680 0 : __free_one_page(page, page_to_pfn(page), zone, order, mt,
3681 : FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
3682 0 : }
3683 :
3684 : /*
3685 : * Update NUMA hit/miss statistics
3686 : */
3687 : static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
3688 : long nr_account)
3689 : {
3690 : #ifdef CONFIG_NUMA
3691 : enum numa_stat_item local_stat = NUMA_LOCAL;
3692 :
3693 : /* skip numa counters update if numa stats is disabled */
3694 : if (!static_branch_likely(&vm_numa_stat_key))
3695 : return;
3696 :
3697 : if (zone_to_nid(z) != numa_node_id())
3698 : local_stat = NUMA_OTHER;
3699 :
3700 : if (zone_to_nid(z) == zone_to_nid(preferred_zone))
3701 : __count_numa_events(z, NUMA_HIT, nr_account);
3702 : else {
3703 : __count_numa_events(z, NUMA_MISS, nr_account);
3704 : __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
3705 : }
3706 : __count_numa_events(z, local_stat, nr_account);
3707 : #endif
3708 : }
3709 :
3710 : static __always_inline
3711 : struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
3712 : unsigned int order, unsigned int alloc_flags,
3713 : int migratetype)
3714 : {
3715 : struct page *page;
3716 : unsigned long flags;
3717 :
3718 : do {
3719 0 : page = NULL;
3720 0 : spin_lock_irqsave(&zone->lock, flags);
3721 : /*
3722 : * order-0 request can reach here when the pcplist is skipped
3723 : * due to non-CMA allocation context. HIGHATOMIC area is
3724 : * reserved for high-order atomic allocation, so order-0
3725 : * request should skip it.
3726 : */
3727 0 : if (alloc_flags & ALLOC_HIGHATOMIC)
3728 : page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
3729 0 : if (!page) {
3730 0 : page = __rmqueue(zone, order, migratetype, alloc_flags);
3731 :
3732 : /*
3733 : * If the allocation fails, allow OOM handling access
3734 : * to HIGHATOMIC reserves as failing now is worse than
3735 : * failing a high-order atomic allocation in the
3736 : * future.
3737 : */
3738 0 : if (!page && (alloc_flags & ALLOC_OOM))
3739 : page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
3740 :
3741 0 : if (!page) {
3742 0 : spin_unlock_irqrestore(&zone->lock, flags);
3743 : return NULL;
3744 : }
3745 : }
3746 0 : __mod_zone_freepage_state(zone, -(1 << order),
3747 : get_pcppage_migratetype(page));
3748 0 : spin_unlock_irqrestore(&zone->lock, flags);
3749 0 : } while (check_new_pages(page, order));
3750 :
3751 0 : __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3752 : zone_statistics(preferred_zone, zone, 1);
3753 :
3754 : return page;
3755 : }
3756 :
3757 : /* Remove page from the per-cpu list, caller must protect the list */
3758 : static inline
3759 51169 : struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
3760 : int migratetype,
3761 : unsigned int alloc_flags,
3762 : struct per_cpu_pages *pcp,
3763 : struct list_head *list)
3764 : {
3765 : struct page *page;
3766 :
3767 : do {
3768 51169 : if (list_empty(list)) {
3769 115 : int batch = READ_ONCE(pcp->batch);
3770 : int alloced;
3771 :
3772 : /*
3773 : * Scale batch relative to order if batch implies
3774 : * free pages can be stored on the PCP. Batch can
3775 : * be 1 for small zones or for boot pagesets which
3776 : * should never store free pages as the pages may
3777 : * belong to arbitrary zones.
3778 : */
3779 115 : if (batch > 1)
3780 104 : batch = max(batch >> order, 2);
3781 115 : alloced = rmqueue_bulk(zone, order,
3782 : batch, list,
3783 : migratetype, alloc_flags);
3784 :
3785 115 : pcp->count += alloced << order;
3786 115 : if (unlikely(list_empty(list)))
3787 : return NULL;
3788 : }
3789 :
3790 51169 : page = list_first_entry(list, struct page, pcp_list);
3791 102338 : list_del(&page->pcp_list);
3792 51169 : pcp->count -= 1 << order;
3793 51169 : } while (check_new_pcp(page, order));
3794 :
3795 51169 : return page;
3796 : }
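/*
 * Example of the batch scaling above, assuming a zone pcp batch of 63:
 * an order-0 refill asks rmqueue_bulk() for 63 pages, an order-2 refill
 * for max(63 >> 2, 2) = 15 pages (60 base pages) and an order-3 refill
 * for max(63 >> 3, 2) = 7 pages (56 base pages), so the amount of memory
 * moved per refill stays roughly constant.  pcp->count is always adjusted
 * in base pages (alloced << order, 1 << order).  For a boot pageset with
 * batch == 1 the bulk call requests a single page, which is handed
 * straight to the caller, so nothing is left cached on the list.
 */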
3797 :
3798 : /* Lock and remove page from the per-cpu list */
3799 8628 : static struct page *rmqueue_pcplist(struct zone *preferred_zone,
3800 : struct zone *zone, unsigned int order,
3801 : int migratetype, unsigned int alloc_flags)
3802 : {
3803 : struct per_cpu_pages *pcp;
3804 : struct list_head *list;
3805 : struct page *page;
3806 : unsigned long __maybe_unused UP_flags;
3807 :
3808 : /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
3809 8628 : pcp_trylock_prepare(UP_flags);
3810 17256 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
3811 8628 : if (!pcp) {
3812 0 : pcp_trylock_finish(UP_flags);
3813 : return NULL;
3814 : }
3815 :
3816 : /*
3817 : * On allocation, reduce the number of pages that are batch freed.
3818 : * See nr_pcp_free() where free_factor is increased for subsequent
3819 : * frees.
3820 : */
3821 8628 : pcp->free_factor >>= 1;
3822 17256 : list = &pcp->lists[order_to_pindex(migratetype, order)];
3823 8628 : page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
3824 17256 : pcp_spin_unlock(pcp);
3825 17256 : pcp_trylock_finish(UP_flags);
3826 8628 : if (page) {
3827 17256 : __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
3828 : zone_statistics(preferred_zone, zone, 1);
3829 : }
3830 : return page;
3831 : }
3832 :
3833 : /*
3834 : * Allocate a page from the given zone.
3835 : * Use pcplists for THP or "cheap" high-order allocations.
3836 : */
3837 :
3838 : /*
3839 : * Do not instrument rmqueue() with KMSAN. This function may call
3840 : * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
3841 : * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
3842 : * may call rmqueue() again, which will result in a deadlock.
3843 : */
3844 : __no_sanitize_memory
3845 : static inline
3846 8628 : struct page *rmqueue(struct zone *preferred_zone,
3847 : struct zone *zone, unsigned int order,
3848 : gfp_t gfp_flags, unsigned int alloc_flags,
3849 : int migratetype)
3850 : {
3851 : struct page *page;
3852 :
3853 : /*
3854 : * We most definitely don't want callers attempting to
3855 : * allocate greater than order-1 page units with __GFP_NOFAIL.
3856 : */
3857 8628 : WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
3858 :
3859 8628 : if (likely(pcp_allowed_order(order))) {
3860 : /*
3861 : * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
3862 : * we need to skip it when CMA area isn't allowed.
3863 : */
3864 : if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
3865 : migratetype != MIGRATE_MOVABLE) {
3866 8628 : page = rmqueue_pcplist(preferred_zone, zone, order,
3867 : migratetype, alloc_flags);
3868 8628 : if (likely(page))
3869 : goto out;
3870 : }
3871 : }
3872 :
3873 : page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
3874 : migratetype);
3875 :
3876 : out:
3877 : /* Separate test+clear to avoid unnecessary atomics */
3878 17256 : if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
3879 0 : clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
3880 0 : wakeup_kswapd(zone, 0, 0, zone_idx(zone));
3881 : }
3882 :
3883 : VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
3884 8628 : return page;
3885 : }
3886 :
3887 : #ifdef CONFIG_FAIL_PAGE_ALLOC
3888 :
3889 : static struct {
3890 : struct fault_attr attr;
3891 :
3892 : bool ignore_gfp_highmem;
3893 : bool ignore_gfp_reclaim;
3894 : u32 min_order;
3895 : } fail_page_alloc = {
3896 : .attr = FAULT_ATTR_INITIALIZER,
3897 : .ignore_gfp_reclaim = true,
3898 : .ignore_gfp_highmem = true,
3899 : .min_order = 1,
3900 : };
3901 :
3902 : static int __init setup_fail_page_alloc(char *str)
3903 : {
3904 : return setup_fault_attr(&fail_page_alloc.attr, str);
3905 : }
3906 : __setup("fail_page_alloc=", setup_fail_page_alloc);
3907 :
3908 : static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3909 : {
3910 : int flags = 0;
3911 :
3912 : if (order < fail_page_alloc.min_order)
3913 : return false;
3914 : if (gfp_mask & __GFP_NOFAIL)
3915 : return false;
3916 : if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
3917 : return false;
3918 : if (fail_page_alloc.ignore_gfp_reclaim &&
3919 : (gfp_mask & __GFP_DIRECT_RECLAIM))
3920 : return false;
3921 :
3922 : /* See comment in __should_failslab() */
3923 : if (gfp_mask & __GFP_NOWARN)
3924 : flags |= FAULT_NOWARN;
3925 :
3926 : return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
3927 : }
3928 :
3929 : #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3930 :
3931 : static int __init fail_page_alloc_debugfs(void)
3932 : {
3933 : umode_t mode = S_IFREG | 0600;
3934 : struct dentry *dir;
3935 :
3936 : dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
3937 : &fail_page_alloc.attr);
3938 :
3939 : debugfs_create_bool("ignore-gfp-wait", mode, dir,
3940 : &fail_page_alloc.ignore_gfp_reclaim);
3941 : debugfs_create_bool("ignore-gfp-highmem", mode, dir,
3942 : &fail_page_alloc.ignore_gfp_highmem);
3943 : debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
3944 :
3945 : return 0;
3946 : }
3947 :
3948 : late_initcall(fail_page_alloc_debugfs);
3949 :
3950 : #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3951 :
3952 : #else /* CONFIG_FAIL_PAGE_ALLOC */
3953 :
3954 : static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3955 : {
3956 : return false;
3957 : }
3958 :
3959 : #endif /* CONFIG_FAIL_PAGE_ALLOC */
3960 :
3961 9224 : noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3962 : {
3963 9224 : return __should_fail_alloc_page(gfp_mask, order);
3964 : }
3965 : ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3966 :
3967 : static inline long __zone_watermark_unusable_free(struct zone *z,
3968 : unsigned int order, unsigned int alloc_flags)
3969 : {
3970 9226 : long unusable_free = (1 << order) - 1;
3971 :
3972 : /*
3973 : * If the caller does not have rights to reserves below the min
3974 : * watermark then subtract the high-atomic reserves. This will
3975 : * over-estimate the size of the atomic reserve but it avoids a search.
3976 : */
3977 9226 : if (likely(!(alloc_flags & ALLOC_RESERVES)))
3978 9226 : unusable_free += z->nr_reserved_highatomic;
3979 :
3980 : #ifdef CONFIG_CMA
3981 : /* If allocation can't use CMA areas don't use free CMA pages */
3982 : if (!(alloc_flags & ALLOC_CMA))
3983 : unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
3984 : #endif
3985 :
3986 : return unusable_free;
3987 : }
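/*
 * Illustrative arithmetic for the helper above: for an order-3 request
 * without reserve rights, in a zone holding 50 high-atomic reserved pages
 * and 2000 free CMA pages, unusable_free starts at (1 << 3) - 1 = 7, grows
 * to 7 + 50 = 57, and reaches 57 + 2000 = 2057 if the request may not use
 * CMA (no ALLOC_CMA).  The watermark checks then treat those pages as if
 * they were not free at all.
 */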
3988 :
3989 : /*
3990 : * Return true if free base pages are above 'mark'. For high-order checks it
3991 : * will return true of the order-0 watermark is reached and there is at least
3992 : * one free page of a suitable size. Checking now avoids taking the zone lock
3993 : * to check in the allocation paths if no pages are free.
3994 : */
3995 107 : bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3996 : int highest_zoneidx, unsigned int alloc_flags,
3997 : long free_pages)
3998 : {
3999 107 : long min = mark;
4000 : int o;
4001 :
4002 : /* free_pages may go negative - that's OK */
4003 214 : free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
4004 :
4005 107 : if (unlikely(alloc_flags & ALLOC_RESERVES)) {
4006 : /*
4007 : * __GFP_HIGH allows access to 50% of the min reserve as well
4008 : * as OOM.
4009 : */
4010 0 : if (alloc_flags & ALLOC_MIN_RESERVE) {
4011 0 : min -= min / 2;
4012 :
4013 : /*
4014 : * Non-blocking allocations (e.g. GFP_ATOMIC) can
4015 : * access more reserves than just __GFP_HIGH. Other
4016 : * non-blocking allocations requests such as GFP_NOWAIT
4017 : * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
4018 : * access to the min reserve.
4019 : */
4020 0 : if (alloc_flags & ALLOC_NON_BLOCK)
4021 0 : min -= min / 4;
4022 : }
4023 :
4024 : /*
4025 : * OOM victims can try even harder than the normal reserve
4026 : * users on the grounds that it's definitely going to be in
4027 : * the exit path shortly and free memory. Any allocation it
4028 : * makes during the free path will be small and short-lived.
4029 : */
4030 0 : if (alloc_flags & ALLOC_OOM)
4031 0 : min -= min / 2;
4032 : }
4033 :
4034 : /*
4035 : * Check watermarks for an order-0 allocation request. If these
4036 : * are not met, then a high-order request also cannot go ahead
4037 : * even if a suitable page happened to be free.
4038 : */
4039 107 : if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
4040 : return false;
4041 :
4042 : /* If this is an order-0 request then the watermark is fine */
4043 107 : if (!order)
4044 : return true;
4045 :
4046 : /* For a high-order request, check at least one suitable page is free */
4047 105 : for (o = order; o < MAX_ORDER; o++) {
4048 105 : struct free_area *area = &z->free_area[o];
4049 : int mt;
4050 :
4051 105 : if (!area->nr_free)
4052 0 : continue;
4053 :
4054 63 : for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
4055 168 : if (!free_area_empty(area, mt))
4056 : return true;
4057 : }
4058 :
4059 : #ifdef CONFIG_CMA
4060 : if ((alloc_flags & ALLOC_CMA) &&
4061 : !free_area_empty(area, MIGRATE_CMA)) {
4062 : return true;
4063 : }
4064 : #endif
4065 0 : if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
4066 0 : !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
4067 : return true;
4068 : }
4069 : }
4070 : return false;
4071 : }
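/*
 * A worked example of the reserve handling above, with an assumed min
 * watermark of 1024 pages: a normal request must see more than
 * 1024 + lowmem_reserve usable free pages.  A __GFP_HIGH request
 * (ALLOC_MIN_RESERVE) lowers that to 512; if it is also non-blocking
 * (ALLOC_NON_BLOCK, e.g. GFP_ATOMIC) it drops further to 512 - 128 = 384,
 * and an OOM victim (ALLOC_OOM) halves whatever is left once more.  For
 * order > 0 the pass/fail decision additionally requires at least one
 * free page of order >= the requested order on an eligible migratetype
 * list, checked in the loop above without taking the zone lock.
 */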
4072 :
4073 0 : bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
4074 : int highest_zoneidx, unsigned int alloc_flags)
4075 : {
4076 0 : return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
4077 0 : zone_page_state(z, NR_FREE_PAGES));
4078 : }
4079 :
4080 9224 : static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
4081 : unsigned long mark, int highest_zoneidx,
4082 : unsigned int alloc_flags, gfp_t gfp_mask)
4083 : {
4084 : long free_pages;
4085 :
4086 9224 : free_pages = zone_page_state(z, NR_FREE_PAGES);
4087 :
4088 : /*
4089 : * Fast check for order-0 only. If this fails then the reserves
4090 : * need to be calculated.
4091 : */
4092 9224 : if (!order) {
4093 : long usable_free;
4094 : long reserved;
4095 :
4096 9119 : usable_free = free_pages;
4097 18238 : reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
4098 :
4099 : /* reserved may over estimate high-atomic reserves. */
4100 9119 : usable_free -= min(usable_free, reserved);
4101 9119 : if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
4102 : return true;
4103 : }
4104 :
4105 105 : if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
4106 : free_pages))
4107 : return true;
4108 :
4109 : /*
4110 : * Ignore watermark boosting for __GFP_HIGH order-0 allocations
4111 : * when checking the min watermark. The min watermark is the
4112 : * point where boosting is ignored so that kswapd is woken up
4113 : * when below the low watermark.
4114 : */
4115 0 : if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
4116 : && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
4117 0 : mark = z->_watermark[WMARK_MIN];
4118 0 : return __zone_watermark_ok(z, order, mark, highest_zoneidx,
4119 : alloc_flags, free_pages);
4120 : }
4121 :
4122 : return false;
4123 : }
4124 :
4125 2 : bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
4126 : unsigned long mark, int highest_zoneidx)
4127 : {
4128 2 : long free_pages = zone_page_state(z, NR_FREE_PAGES);
4129 :
4130 2 : if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
4131 0 : free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
4132 :
4133 2 : return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
4134 : free_pages);
4135 : }
4136 :
4137 : #ifdef CONFIG_NUMA
4138 : int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
4139 :
4140 : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
4141 : {
4142 : return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
4143 : node_reclaim_distance;
4144 : }
4145 : #else /* CONFIG_NUMA */
4146 : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
4147 : {
4148 : return true;
4149 : }
4150 : #endif /* CONFIG_NUMA */
4151 :
4152 : /*
4153 : * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
4154 : * fragmentation is subtle. If the preferred zone was HIGHMEM then
4155 : * premature use of a lower zone may cause lowmem pressure problems that
4156 : * are worse than fragmentation. If the next zone is ZONE_DMA then it is
4157 : * probably too small. It only makes sense to spread allocations to avoid
4158 : * fragmentation between the Normal and DMA32 zones.
4159 : */
4160 : static inline unsigned int
4161 : alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
4162 : {
4163 : unsigned int alloc_flags;
4164 :
4165 : /*
4166 : * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4167 : * to save a branch.
4168 : */
4169 8628 : alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
4170 :
4171 : #ifdef CONFIG_ZONE_DMA32
4172 : if (!zone)
4173 : return alloc_flags;
4174 :
4175 : if (zone_idx(zone) != ZONE_NORMAL)
4176 : return alloc_flags;
4177 :
4178 : /*
4179 : * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
4180 : * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
4181 : * on UMA that if Normal is populated then so is DMA32.
4182 : */
4183 : BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
4184 : if (nr_online_nodes > 1 && !populated_zone(--zone))
4185 : return alloc_flags;
4186 :
4187 : alloc_flags |= ALLOC_NOFRAGMENT;
4188 : #endif /* CONFIG_ZONE_DMA32 */
4189 : return alloc_flags;
4190 : }
4191 :
4192 : /* Must be called after current_gfp_context() which can change gfp_mask */
4193 : static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
4194 : unsigned int alloc_flags)
4195 : {
4196 : #ifdef CONFIG_CMA
4197 : if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
4198 : alloc_flags |= ALLOC_CMA;
4199 : #endif
4200 : return alloc_flags;
4201 : }
4202 :
4203 : /*
4204 : * get_page_from_freelist goes through the zonelist trying to allocate
4205 : * a page.
4206 : */
4207 : static struct page *
4208 8628 : get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
4209 : const struct alloc_context *ac)
4210 : {
4211 : struct zoneref *z;
4212 : struct zone *zone;
4213 8628 : struct pglist_data *last_pgdat = NULL;
4214 8628 : bool last_pgdat_dirty_ok = false;
4215 : bool no_fallback;
4216 :
4217 : retry:
4218 : /*
4219 : * Scan zonelist, looking for a zone with enough free.
4220 : * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
4221 : */
4222 8628 : no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
4223 8628 : z = ac->preferred_zoneref;
4224 8628 : for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
4225 : ac->nodemask) {
4226 : struct page *page;
4227 : unsigned long mark;
4228 :
4229 : if (cpusets_enabled() &&
4230 : (alloc_flags & ALLOC_CPUSET) &&
4231 : !__cpuset_zone_allowed(zone, gfp_mask))
4232 : continue;
4233 : /*
4234 : * When allocating a page cache page for writing, we
4235 : * want to get it from a node that is within its dirty
4236 : * limit, such that no single node holds more than its
4237 : * proportional share of globally allowed dirty pages.
4238 : * The dirty limits take into account the node's
4239 : * lowmem reserves and high watermark so that kswapd
4240 : * should be able to balance it without having to
4241 : * write pages from its LRU list.
4242 : *
4243 : * XXX: For now, allow allocations to potentially
4244 : * exceed the per-node dirty limit in the slowpath
4245 : * (spread_dirty_pages unset) before going into reclaim,
4246 : * which is important when on a NUMA setup the allowed
4247 : * nodes are together not big enough to reach the
4248 : * global limit. The proper fix for these situations
4249 : * will require awareness of nodes in the
4250 : * dirty-throttling and the flusher threads.
4251 : */
4252 8628 : if (ac->spread_dirty_pages) {
4253 0 : if (last_pgdat != zone->zone_pgdat) {
4254 0 : last_pgdat = zone->zone_pgdat;
4255 0 : last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
4256 : }
4257 :
4258 0 : if (!last_pgdat_dirty_ok)
4259 0 : continue;
4260 : }
4261 :
4262 : if (no_fallback && nr_online_nodes > 1 &&
4263 : zone != ac->preferred_zoneref->zone) {
4264 : int local_nid;
4265 :
4266 : /*
4267 : * If moving to a remote node, retry but allow
4268 : * fragmenting fallbacks. Locality is more important
4269 : * than fragmentation avoidance.
4270 : */
4271 : local_nid = zone_to_nid(ac->preferred_zoneref->zone);
4272 : if (zone_to_nid(zone) != local_nid) {
4273 : alloc_flags &= ~ALLOC_NOFRAGMENT;
4274 : goto retry;
4275 : }
4276 : }
4277 :
4278 8628 : mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
4279 17256 : if (!zone_watermark_fast(zone, order, mark,
4280 8628 : ac->highest_zoneidx, alloc_flags,
4281 : gfp_mask)) {
4282 : int ret;
4283 :
4284 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
4285 : /*
4286 : * Watermark failed for this zone, but see if we can
4287 : * grow this zone if it contains deferred pages.
4288 : */
4289 : if (deferred_pages_enabled()) {
4290 : if (_deferred_grow_zone(zone, order))
4291 : goto try_this_zone;
4292 : }
4293 : #endif
4294 : /* Checked here to keep the fast path fast */
4295 : BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
4296 0 : if (alloc_flags & ALLOC_NO_WATERMARKS)
4297 : goto try_this_zone;
4298 :
4299 : if (!node_reclaim_enabled() ||
4300 : !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
4301 0 : continue;
4302 :
4303 : ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
4304 : switch (ret) {
4305 : case NODE_RECLAIM_NOSCAN:
4306 : /* did not scan */
4307 : continue;
4308 : case NODE_RECLAIM_FULL:
4309 : /* scanned but unreclaimable */
4310 : continue;
4311 : default:
4312 : /* did we reclaim enough */
4313 : if (zone_watermark_ok(zone, order, mark,
4314 : ac->highest_zoneidx, alloc_flags))
4315 : goto try_this_zone;
4316 :
4317 : continue;
4318 : }
4319 : }
4320 :
4321 : try_this_zone:
4322 8628 : page = rmqueue(ac->preferred_zoneref->zone, zone, order,
4323 : gfp_mask, alloc_flags, ac->migratetype);
4324 8628 : if (page) {
4325 8628 : prep_new_page(page, order, gfp_mask, alloc_flags);
4326 :
4327 : /*
4328 : * If this is a high-order atomic allocation then check
4329 : * if the pageblock should be reserved for the future
4330 : */
4331 8628 : if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
4332 0 : reserve_highatomic_pageblock(page, zone, order);
4333 :
4334 : return page;
4335 : } else {
4336 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
4337 : /* Try again if zone has deferred pages */
4338 : if (deferred_pages_enabled()) {
4339 : if (_deferred_grow_zone(zone, order))
4340 : goto try_this_zone;
4341 : }
4342 : #endif
4343 : }
4344 : }
4345 :
4346 : /*
4347 : * It's possible on a UMA machine to get through all zones that are
4348 : * fragmented. If avoiding fragmentation, reset and try again.
4349 : */
4350 : if (no_fallback) {
4351 : alloc_flags &= ~ALLOC_NOFRAGMENT;
4352 : goto retry;
4353 : }
4354 :
4355 : return NULL;
4356 : }
4357 :
4358 0 : static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
4359 : {
4360 0 : unsigned int filter = SHOW_MEM_FILTER_NODES;
4361 :
4362 : /*
4363 : * This documents exceptions given to allocations in certain
4364 : * contexts that are allowed to allocate outside current's set
4365 : * of allowed nodes.
4366 : */
4367 0 : if (!(gfp_mask & __GFP_NOMEMALLOC))
4368 0 : if (tsk_is_oom_victim(current) ||
4369 0 : (current->flags & (PF_MEMALLOC | PF_EXITING)))
4370 : filter &= ~SHOW_MEM_FILTER_NODES;
4371 0 : if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
4372 0 : filter &= ~SHOW_MEM_FILTER_NODES;
4373 :
4374 0 : __show_mem(filter, nodemask, gfp_zone(gfp_mask));
4375 0 : }
4376 :
4377 0 : void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
4378 : {
4379 : struct va_format vaf;
4380 : va_list args;
4381 : static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
4382 :
4383 0 : if ((gfp_mask & __GFP_NOWARN) ||
4384 0 : !__ratelimit(&nopage_rs) ||
4385 0 : ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
4386 0 : return;
4387 :
4388 0 : va_start(args, fmt);
4389 0 : vaf.fmt = fmt;
4390 0 : vaf.va = &args;
4391 0 : pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
4392 : current->comm, &vaf, gfp_mask, &gfp_mask,
4393 : nodemask_pr_args(nodemask));
4394 0 : va_end(args);
4395 :
4396 : cpuset_print_current_mems_allowed();
4397 0 : pr_cont("\n");
4398 0 : dump_stack();
4399 0 : warn_alloc_show_mem(gfp_mask, nodemask);
4400 : }
4401 :
4402 : static inline struct page *
4403 0 : __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
4404 : unsigned int alloc_flags,
4405 : const struct alloc_context *ac)
4406 : {
4407 : struct page *page;
4408 :
4409 0 : page = get_page_from_freelist(gfp_mask, order,
4410 0 : alloc_flags|ALLOC_CPUSET, ac);
4411 : /*
4412 : * fallback to ignore cpuset restriction if our nodes
4413 : * are depleted
4414 : */
4415 0 : if (!page)
4416 0 : page = get_page_from_freelist(gfp_mask, order,
4417 : alloc_flags, ac);
4418 :
4419 0 : return page;
4420 : }
4421 :
4422 : static inline struct page *
4423 0 : __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
4424 : const struct alloc_context *ac, unsigned long *did_some_progress)
4425 : {
4426 0 : struct oom_control oc = {
4427 0 : .zonelist = ac->zonelist,
4428 0 : .nodemask = ac->nodemask,
4429 : .memcg = NULL,
4430 : .gfp_mask = gfp_mask,
4431 : .order = order,
4432 : };
4433 : struct page *page;
4434 :
4435 0 : *did_some_progress = 0;
4436 :
4437 : /*
4438 : * Acquire the oom lock. If that fails, somebody else is
4439 : * making progress for us.
4440 : */
4441 0 : if (!mutex_trylock(&oom_lock)) {
4442 0 : *did_some_progress = 1;
4443 0 : schedule_timeout_uninterruptible(1);
4444 0 : return NULL;
4445 : }
4446 :
4447 : /*
4448 : * Go through the zonelist yet one more time, keep very high watermark
4449 : * here, this is only to catch a parallel oom killing, we must fail if
4450 : * we're still under heavy pressure. But make sure that this reclaim
4451 : * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
4452 : * allocation which will never fail due to oom_lock already held.
4453 : */
4454 0 : page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
4455 : ~__GFP_DIRECT_RECLAIM, order,
4456 : ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
4457 0 : if (page)
4458 : goto out;
4459 :
4460 : /* Coredumps can quickly deplete all memory reserves */
4461 0 : if (current->flags & PF_DUMPCORE)
4462 : goto out;
4463 : /* The OOM killer will not help higher order allocs */
4464 0 : if (order > PAGE_ALLOC_COSTLY_ORDER)
4465 : goto out;
4466 : /*
4467 : * We have already exhausted all our reclaim opportunities without any
4468 : * success so it is time to admit defeat. We will skip the OOM killer
4469 : * because it is very likely that the caller has a more reasonable
4470 : * fallback than shooting a random task.
4471 : *
4472 : * The OOM killer may not free memory on a specific node.
4473 : */
4474 0 : if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
4475 : goto out;
4476 : /* The OOM killer does not needlessly kill tasks for lowmem */
4477 : if (ac->highest_zoneidx < ZONE_NORMAL)
4478 : goto out;
4479 0 : if (pm_suspended_storage())
4480 : goto out;
4481 : /*
4482 : * XXX: GFP_NOFS allocations should rather fail than rely on
4483 : * other request to make a forward progress.
4484 : * We are in an unfortunate situation where out_of_memory cannot
4485 : * do much for this context but let's try it to at least get
4486 : * access to memory reserved if the current task is killed (see
4487 : * out_of_memory). Once filesystems are ready to handle allocation
4488 : * failures more gracefully we should just bail out here.
4489 : */
4490 :
4491 : /* Exhausted what can be done so it's blame time */
4492 0 : if (out_of_memory(&oc) ||
4493 0 : WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
4494 0 : *did_some_progress = 1;
4495 :
4496 : /*
4497 : * Help non-failing allocations by giving them access to memory
4498 : * reserves
4499 : */
4500 0 : if (gfp_mask & __GFP_NOFAIL)
4501 0 : page = __alloc_pages_cpuset_fallback(gfp_mask, order,
4502 : ALLOC_NO_WATERMARKS, ac);
4503 : }
4504 : out:
4505 0 : mutex_unlock(&oom_lock);
4506 0 : return page;
4507 : }
4508 :
4509 : /*
4510 : * Maximum number of compaction retries with progress before the OOM
4511 : * killer is considered the only way to move forward.
4512 : */
4513 : #define MAX_COMPACT_RETRIES 16
4514 :
4515 : #ifdef CONFIG_COMPACTION
4516 : /* Try memory compaction for high-order allocations before reclaim */
4517 : static struct page *
4518 0 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
4519 : unsigned int alloc_flags, const struct alloc_context *ac,
4520 : enum compact_priority prio, enum compact_result *compact_result)
4521 : {
4522 0 : struct page *page = NULL;
4523 : unsigned long pflags;
4524 : unsigned int noreclaim_flag;
4525 :
4526 0 : if (!order)
4527 : return NULL;
4528 :
4529 0 : psi_memstall_enter(&pflags);
4530 : delayacct_compact_start();
4531 0 : noreclaim_flag = memalloc_noreclaim_save();
4532 :
4533 0 : *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
4534 : prio, &page);
4535 :
4536 0 : memalloc_noreclaim_restore(noreclaim_flag);
4537 0 : psi_memstall_leave(&pflags);
4538 : delayacct_compact_end();
4539 :
4540 0 : if (*compact_result == COMPACT_SKIPPED)
4541 : return NULL;
4542 : /*
4543 : * At least in one zone compaction wasn't deferred or skipped, so let's
4544 : * count a compaction stall
4545 : */
4546 0 : count_vm_event(COMPACTSTALL);
4547 :
4548 : /* Prep a captured page if available */
4549 0 : if (page)
4550 0 : prep_new_page(page, order, gfp_mask, alloc_flags);
4551 :
4552 : /* Try get a page from the freelist if available */
4553 0 : if (!page)
4554 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4555 :
4556 0 : if (page) {
4557 0 : struct zone *zone = page_zone(page);
4558 :
4559 0 : zone->compact_blockskip_flush = false;
4560 0 : compaction_defer_reset(zone, order, true);
4561 0 : count_vm_event(COMPACTSUCCESS);
4562 0 : return page;
4563 : }
4564 :
4565 : /*
4566 : * It's bad if compaction run occurs and fails. The most likely reason
4567 : * is that pages exist, but not enough to satisfy watermarks.
4568 : */
4569 0 : count_vm_event(COMPACTFAIL);
4570 :
4571 0 : cond_resched();
4572 :
4573 0 : return NULL;
4574 : }
4575 :
4576 : static inline bool
4577 0 : should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
4578 : enum compact_result compact_result,
4579 : enum compact_priority *compact_priority,
4580 : int *compaction_retries)
4581 : {
4582 0 : int max_retries = MAX_COMPACT_RETRIES;
4583 : int min_priority;
4584 0 : bool ret = false;
4585 0 : int retries = *compaction_retries;
4586 0 : enum compact_priority priority = *compact_priority;
4587 :
4588 0 : if (!order)
4589 : return false;
4590 :
4591 0 : if (fatal_signal_pending(current))
4592 : return false;
4593 :
4594 0 : if (compaction_made_progress(compact_result))
4595 0 : (*compaction_retries)++;
4596 :
4597 : /*
4598 : * compaction considers all the zones as desperately out of memory
4599 : * so it doesn't really make much sense to retry except when the
4600 : * failure could be caused by insufficient priority
4601 : */
4602 0 : if (compaction_failed(compact_result))
4603 : goto check_priority;
4604 :
4605 : /*
4606 : * compaction was skipped because there are not enough order-0 pages
4607 : * to work with, so we retry only if it looks like reclaim can help.
4608 : */
4609 0 : if (compaction_needs_reclaim(compact_result)) {
4610 0 : ret = compaction_zonelist_suitable(ac, order, alloc_flags);
4611 0 : goto out;
4612 : }
4613 :
4614 : /*
4615 : * make sure the compaction wasn't deferred or didn't bail out early
4616 : * due to lock contention before we declare that we should give up.
4617 : * But the next retry should use a higher priority if allowed, so
4618 : * we don't just keep bailing out endlessly.
4619 : */
4620 0 : if (compaction_withdrawn(compact_result)) {
4621 : goto check_priority;
4622 : }
4623 :
4624 : /*
4625 : * !costly requests are much more important than __GFP_RETRY_MAYFAIL
4626 : * costly ones because they are de facto nofail and invoke OOM
4627 : * killer to move on while costly can fail and users are ready
4628 : * to cope with that. 1/4 retries is rather arbitrary but we
4629 : * would need much more detailed feedback from compaction to
4630 : * make a better decision.
4631 : */
4632 0 : if (order > PAGE_ALLOC_COSTLY_ORDER)
4633 0 : max_retries /= 4;
4634 0 : if (*compaction_retries <= max_retries) {
4635 : ret = true;
4636 : goto out;
4637 : }
4638 :
4639 : /*
4640 : * Make sure there are attempts at the highest priority if we exhausted
4641 : * all retries or failed at the lower priorities.
4642 : */
4643 : check_priority:
4644 0 : min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
4645 0 : MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
4646 :
4647 0 : if (*compact_priority > min_priority) {
4648 0 : (*compact_priority)--;
4649 0 : *compaction_retries = 0;
4650 0 : ret = true;
4651 : }
4652 : out:
4653 0 : trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
4654 0 : return ret;
4655 : }
4656 : #else
4657 : static inline struct page *
4658 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
4659 : unsigned int alloc_flags, const struct alloc_context *ac,
4660 : enum compact_priority prio, enum compact_result *compact_result)
4661 : {
4662 : *compact_result = COMPACT_SKIPPED;
4663 : return NULL;
4664 : }
4665 :
4666 : static inline bool
4667 : should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
4668 : enum compact_result compact_result,
4669 : enum compact_priority *compact_priority,
4670 : int *compaction_retries)
4671 : {
4672 : struct zone *zone;
4673 : struct zoneref *z;
4674 :
4675 : if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
4676 : return false;
4677 :
4678 : /*
4679 : * There are setups with compaction disabled which would prefer to loop
4680 : * inside the allocator rather than hit the oom killer prematurely.
4681 : * Let's give them some hope and keep retrying while the order-0
4682 : * watermarks are OK.
4683 : */
4684 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4685 : ac->highest_zoneidx, ac->nodemask) {
4686 : if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
4687 : ac->highest_zoneidx, alloc_flags))
4688 : return true;
4689 : }
4690 : return false;
4691 : }
4692 : #endif /* CONFIG_COMPACTION */
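/*
 * With MAX_COMPACT_RETRIES = 16, the retry budget above works out roughly
 * as follows (sketch, CONFIG_COMPACTION=y): a non-costly request
 * (order <= PAGE_ALLOC_COSTLY_ORDER) that keeps making compaction progress
 * may retry up to 16 times at a given priority, while a costly request
 * such as a THP-sized allocation only gets 16 / 4 = 4 attempts.  Once the
 * budget is exhausted, or compaction outright failed, the priority is
 * bumped ((*compact_priority)-- towards the relevant minimum) and the
 * retry counter starts again, until even the highest allowed priority has
 * been tried.
 */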
4693 :
4694 : #ifdef CONFIG_LOCKDEP
4695 : static struct lockdep_map __fs_reclaim_map =
4696 : STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
4697 :
4698 : static bool __need_reclaim(gfp_t gfp_mask)
4699 : {
4700 : /* no reclaim without waiting on it */
4701 : if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
4702 : return false;
4703 :
4704 : /* this guy won't enter reclaim */
4705 : if (current->flags & PF_MEMALLOC)
4706 : return false;
4707 :
4708 : if (gfp_mask & __GFP_NOLOCKDEP)
4709 : return false;
4710 :
4711 : return true;
4712 : }
4713 :
4714 : void __fs_reclaim_acquire(unsigned long ip)
4715 : {
4716 : lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
4717 : }
4718 :
4719 : void __fs_reclaim_release(unsigned long ip)
4720 : {
4721 : lock_release(&__fs_reclaim_map, ip);
4722 : }
4723 :
4724 : void fs_reclaim_acquire(gfp_t gfp_mask)
4725 : {
4726 : gfp_mask = current_gfp_context(gfp_mask);
4727 :
4728 : if (__need_reclaim(gfp_mask)) {
4729 : if (gfp_mask & __GFP_FS)
4730 : __fs_reclaim_acquire(_RET_IP_);
4731 :
4732 : #ifdef CONFIG_MMU_NOTIFIER
4733 : lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
4734 : lock_map_release(&__mmu_notifier_invalidate_range_start_map);
4735 : #endif
4736 :
4737 : }
4738 : }
4739 : EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
4740 :
4741 : void fs_reclaim_release(gfp_t gfp_mask)
4742 : {
4743 : gfp_mask = current_gfp_context(gfp_mask);
4744 :
4745 : if (__need_reclaim(gfp_mask)) {
4746 : if (gfp_mask & __GFP_FS)
4747 : __fs_reclaim_release(_RET_IP_);
4748 : }
4749 : }
4750 : EXPORT_SYMBOL_GPL(fs_reclaim_release);
4751 : #endif
4752 :
4753 : /*
4754 : * Zonelists may change due to hotplug during allocation. Detect when zonelists
4755 : * have been rebuilt so allocation retries. Reader side does not lock and
4756 : * retries the allocation if zonelist changes. Writer side is protected by the
4757 : * embedded spin_lock.
4758 : */
4759 : static DEFINE_SEQLOCK(zonelist_update_seq);
4760 :
4761 : static unsigned int zonelist_iter_begin(void)
4762 : {
4763 : if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4764 : return read_seqbegin(&zonelist_update_seq);
4765 :
4766 : return 0;
4767 : }
4768 :
4769 : static unsigned int check_retry_zonelist(unsigned int seq)
4770 : {
4771 : if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
4772 : return read_seqretry(&zonelist_update_seq, seq);
4773 :
4774 : return seq;
4775 : }
4776 :
4777 : /* Perform direct synchronous page reclaim */
4778 : static unsigned long
4779 0 : __perform_reclaim(gfp_t gfp_mask, unsigned int order,
4780 : const struct alloc_context *ac)
4781 : {
4782 : unsigned int noreclaim_flag;
4783 : unsigned long progress;
4784 :
4785 0 : cond_resched();
4786 :
4787 : /* We now go into synchronous reclaim */
4788 : cpuset_memory_pressure_bump();
4789 0 : fs_reclaim_acquire(gfp_mask);
4790 0 : noreclaim_flag = memalloc_noreclaim_save();
4791 :
4792 0 : progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
4793 : ac->nodemask);
4794 :
4795 0 : memalloc_noreclaim_restore(noreclaim_flag);
4796 0 : fs_reclaim_release(gfp_mask);
4797 :
4798 0 : cond_resched();
4799 :
4800 0 : return progress;
4801 : }
4802 :
4803 : /* The really slow allocator path where we enter direct reclaim */
4804 : static inline struct page *
4805 0 : __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
4806 : unsigned int alloc_flags, const struct alloc_context *ac,
4807 : unsigned long *did_some_progress)
4808 : {
4809 0 : struct page *page = NULL;
4810 : unsigned long pflags;
4811 0 : bool drained = false;
4812 :
4813 0 : psi_memstall_enter(&pflags);
4814 0 : *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
4815 0 : if (unlikely(!(*did_some_progress)))
4816 : goto out;
4817 :
4818 : retry:
4819 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4820 :
4821 : /*
4822 : * If an allocation failed after direct reclaim, it could be because
4823 : * pages are pinned on the per-cpu lists or in high alloc reserves.
4824 : * Shrink them and try again
4825 : */
4826 0 : if (!page && !drained) {
4827 0 : unreserve_highatomic_pageblock(ac, false);
4828 0 : drain_all_pages(NULL);
4829 0 : drained = true;
4830 0 : goto retry;
4831 : }
4832 : out:
4833 0 : psi_memstall_leave(&pflags);
4834 :
4835 0 : return page;
4836 : }
4837 :
4838 0 : static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
4839 : const struct alloc_context *ac)
4840 : {
4841 : struct zoneref *z;
4842 : struct zone *zone;
4843 0 : pg_data_t *last_pgdat = NULL;
4844 0 : enum zone_type highest_zoneidx = ac->highest_zoneidx;
4845 :
4846 0 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
4847 : ac->nodemask) {
4848 0 : if (!managed_zone(zone))
4849 0 : continue;
4850 0 : if (last_pgdat != zone->zone_pgdat) {
4851 0 : wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
4852 0 : last_pgdat = zone->zone_pgdat;
4853 : }
4854 : }
4855 0 : }
4856 :
4857 : static inline unsigned int
4858 0 : gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
4859 : {
4860 0 : unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
4861 :
4862 : /*
4863 : * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
4864 : * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
4865 : * to save two branches.
4866 : */
4867 : BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
4868 : BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
4869 :
4870 : /*
4871 : * The caller may dip into page reserves a bit more if the caller
4872 : * cannot run direct reclaim, or if the caller has realtime scheduling
4873 : * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
4874 : * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
4875 : */
4876 0 : alloc_flags |= (__force int)
4877 : (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
4878 :
4879 0 : if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
4880 : /*
4881 : * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
4882 : * if it can't schedule.
4883 : */
4884 0 : if (!(gfp_mask & __GFP_NOMEMALLOC)) {
4885 0 : alloc_flags |= ALLOC_NON_BLOCK;
4886 :
4887 0 : if (order > 0)
4888 0 : alloc_flags |= ALLOC_HIGHATOMIC;
4889 : }
4890 :
4891 : /*
4892 : * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
4893 : * GFP_ATOMIC) rather than fail, see the comment for
4894 : * __cpuset_node_allowed().
4895 : */
4896 0 : if (alloc_flags & ALLOC_MIN_RESERVE)
4897 0 : alloc_flags &= ~ALLOC_CPUSET;
4898 0 : } else if (unlikely(rt_task(current)) && in_task())
4899 0 : alloc_flags |= ALLOC_MIN_RESERVE;
4900 :
4901 0 : alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
4902 :
4903 0 : return alloc_flags;
4904 : }
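/*
 * Example translation (a sketch following the comments above, not an
 * exhaustive table): a GFP_ATOMIC request (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)
 * starts from ALLOC_WMARK_MIN | ALLOC_CPUSET, picks up ALLOC_MIN_RESERVE
 * and ALLOC_KSWAPD directly from the gfp bits, and because it cannot enter
 * direct reclaim also gains ALLOC_NON_BLOCK (plus ALLOC_HIGHATOMIC for
 * order > 0) and sheds ALLOC_CPUSET.  A plain GFP_KERNEL request keeps the
 * conservative ALLOC_WMARK_MIN | ALLOC_CPUSET | ALLOC_KSWAPD set, unless it
 * comes from an rt_task() in task context, which adds ALLOC_MIN_RESERVE.
 */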
4905 :
4906 : static bool oom_reserves_allowed(struct task_struct *tsk)
4907 : {
4908 0 : if (!tsk_is_oom_victim(tsk))
4909 : return false;
4910 :
4911 : /*
4912 : * !MMU doesn't have oom reaper so give access to memory reserves
4913 : * only to the thread with TIF_MEMDIE set
4914 : */
4915 : if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
4916 : return false;
4917 :
4918 : return true;
4919 : }
4920 :
4921 : /*
4922 : * Distinguish requests which really need access to full memory
4923 : * reserves from oom victims which can live with a portion of it
4924 : */
4925 0 : static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
4926 : {
4927 0 : if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
4928 : return 0;
4929 0 : if (gfp_mask & __GFP_MEMALLOC)
4930 : return ALLOC_NO_WATERMARKS;
4931 0 : if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
4932 : return ALLOC_NO_WATERMARKS;
4933 0 : if (!in_interrupt()) {
4934 0 : if (current->flags & PF_MEMALLOC)
4935 : return ALLOC_NO_WATERMARKS;
4936 0 : else if (oom_reserves_allowed(current))
4937 : return ALLOC_OOM;
4938 : }
4939 :
4940 : return 0;
4941 : }
4942 :
4943 0 : bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
4944 : {
4945 0 : return !!__gfp_pfmemalloc_flags(gfp_mask);
4946 : }
4947 :
4948 : /*
4949 : * Checks whether it makes sense to retry the reclaim to make a forward progress
4950 : * for the given allocation request.
4951 : *
4952 : * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
4953 : * without success, or when we couldn't even meet the watermark if we
4954 : * reclaimed all remaining pages on the LRU lists.
4955 : *
4956 : * Returns true if a retry is viable or false to enter the oom path.
4957 : */
4958 : static inline bool
4959 0 : should_reclaim_retry(gfp_t gfp_mask, unsigned order,
4960 : struct alloc_context *ac, int alloc_flags,
4961 : bool did_some_progress, int *no_progress_loops)
4962 : {
4963 : struct zone *zone;
4964 : struct zoneref *z;
4965 0 : bool ret = false;
4966 :
4967 : /*
4968 : * Costly allocations might have made progress but this doesn't mean
4969 : * their order will become available due to high fragmentation so
4970 : * always increment the no progress counter for them
4971 : */
4972 0 : if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
4973 0 : *no_progress_loops = 0;
4974 : else
4975 0 : (*no_progress_loops)++;
4976 :
4977 : /*
4978 : * Make sure we converge to OOM if we cannot make any progress
4979 : * several times in the row.
4980 : */
4981 0 : if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
4982 : /* Before OOM, exhaust highatomic_reserve */
4983 0 : return unreserve_highatomic_pageblock(ac, true);
4984 : }
4985 :
4986 : /*
4987 : * Keep reclaiming pages while there is a chance this will lead
4988 : * somewhere. If none of the target zones can satisfy our allocation
4989 : * request even if all reclaimable pages are considered then we are
4990 : * screwed and have to go OOM.
4991 : */
4992 0 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
4993 : ac->highest_zoneidx, ac->nodemask) {
4994 : unsigned long available;
4995 : unsigned long reclaimable;
4996 0 : unsigned long min_wmark = min_wmark_pages(zone);
4997 : bool wmark;
4998 :
4999 0 : available = reclaimable = zone_reclaimable_pages(zone);
5000 0 : available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
5001 :
5002 : /*
5003 : * Would the allocation succeed if we reclaimed all
5004 : * reclaimable pages?
5005 : */
5006 0 : wmark = __zone_watermark_ok(zone, order, min_wmark,
5007 0 : ac->highest_zoneidx, alloc_flags, available);
5008 0 : trace_reclaim_retry_zone(z, order, reclaimable,
5009 : available, min_wmark, *no_progress_loops, wmark);
5010 0 : if (wmark) {
5011 : ret = true;
5012 : break;
5013 : }
5014 : }
5015 :
5016 : /*
5017 : * Memory allocation/reclaim might be called from a WQ context and the
5018 : * current implementation of the WQ concurrency control doesn't
5019 : * recognize that a particular WQ is congested if the worker thread is
5020 : * looping without ever sleeping. Therefore we have to do a short sleep
5021 : * here rather than calling cond_resched().
5022 : */
5023 0 : if (current->flags & PF_WQ_WORKER)
5024 0 : schedule_timeout_uninterruptible(1);
5025 : else
5026 0 : cond_resched();
5027 : return ret;
5028 : }
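/*
 * Worked example for the watermark test above (hedged, illustrative numbers,
 * ignoring lowmem reserves): for an order-0 request in a zone with
 * min_wmark = 100, NR_FREE_PAGES = 40 and 80 reclaimable pages,
 * available = 40 + 80 = 120 >= 100, so __zone_watermark_ok() succeeds and
 * another reclaim retry is considered worthwhile.
 */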
5029 :
5030 : static inline bool
5031 : check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
5032 : {
5033 : /*
5034 : * It's possible that cpuset's mems_allowed and the nodemask from
5035 : * mempolicy don't intersect. This should normally be dealt with by
5036 : * policy_nodemask(), but it's possible to race with cpuset update in
5037 : * such a way the check therein was true, and then it became false
5038 : * before we got our cpuset_mems_cookie here.
5039 : * This assumes that for all allocations, ac->nodemask can come only
5040 : * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
5041 : * when it does not intersect with the cpuset restrictions) or the
5042 : * caller can deal with a violated nodemask.
5043 : */
5044 : if (cpusets_enabled() && ac->nodemask &&
5045 : !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
5046 : ac->nodemask = NULL;
5047 : return true;
5048 : }
5049 :
5050 : /*
5051 : * When updating a task's mems_allowed or mempolicy nodemask, it is
5052 : * possible to race with parallel threads in such a way that our
5053 : * allocation can fail while the mask is being updated. If we are about
5054 : * to fail, check if the cpuset changed during allocation and if so,
5055 : * retry.
5056 : */
5057 0 : if (read_mems_allowed_retry(cpuset_mems_cookie))
5058 : return true;
5059 :
5060 : return false;
5061 : }
5062 :
5063 : static inline struct page *
5064 0 : __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
5065 : struct alloc_context *ac)
5066 : {
5067 0 : bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
5068 0 : const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
5069 0 : struct page *page = NULL;
5070 : unsigned int alloc_flags;
5071 : unsigned long did_some_progress;
5072 : enum compact_priority compact_priority;
5073 : enum compact_result compact_result;
5074 : int compaction_retries;
5075 : int no_progress_loops;
5076 : unsigned int cpuset_mems_cookie;
5077 : unsigned int zonelist_iter_cookie;
5078 : int reserve_flags;
5079 :
5080 : restart:
5081 0 : compaction_retries = 0;
5082 0 : no_progress_loops = 0;
5083 0 : compact_priority = DEF_COMPACT_PRIORITY;
5084 0 : cpuset_mems_cookie = read_mems_allowed_begin();
5085 0 : zonelist_iter_cookie = zonelist_iter_begin();
5086 :
5087 : /*
5088 : * The fast path uses conservative alloc_flags to succeed only until
5089 : * kswapd needs to be woken up, and to avoid the cost of setting up
5090 : * alloc_flags precisely. So we do that now.
5091 : */
5092 0 : alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
5093 :
5094 : /*
5095 : * We need to recalculate the starting point for the zonelist iterator
5096 : * because we might have used different nodemask in the fast path, or
5097 : * there was a cpuset modification and we are retrying - otherwise we
5098 : * could end up iterating over non-eligible zones endlessly.
5099 : */
5100 0 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
5101 : ac->highest_zoneidx, ac->nodemask);
5102 0 : if (!ac->preferred_zoneref->zone)
5103 : goto nopage;
5104 :
5105 : /*
5106 : * Check for insane configurations where the cpuset doesn't contain
5107 : * any suitable zone to satisfy the request - e.g. non-movable
5108 : * GFP_HIGHUSER allocations from MOVABLE nodes only.
5109 : */
5110 : if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
5111 : struct zoneref *z = first_zones_zonelist(ac->zonelist,
5112 : ac->highest_zoneidx,
5113 : &cpuset_current_mems_allowed);
5114 : if (!z->zone)
5115 : goto nopage;
5116 : }
5117 :
5118 0 : if (alloc_flags & ALLOC_KSWAPD)
5119 0 : wake_all_kswapds(order, gfp_mask, ac);
5120 :
5121 : /*
5122 : * The adjusted alloc_flags might result in immediate success, so try
5123 : * that first
5124 : */
5125 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
5126 0 : if (page)
5127 : goto got_pg;
5128 :
5129 : /*
5130 : * For costly allocations, try direct compaction first, as it's likely
5131 : * that we have enough base pages and don't need to reclaim. For non-
5132 : * movable high-order allocations, do that as well, as compaction will
5133 : * try to prevent permanent fragmentation by migrating from blocks of the
5134 : * same migratetype.
5135 : * Don't try this for allocations that are allowed to ignore
5136 : * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
5137 : */
5138 0 : if (can_direct_reclaim &&
5139 0 : (costly_order ||
5140 0 : (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
5141 0 : && !gfp_pfmemalloc_allowed(gfp_mask)) {
5142 0 : page = __alloc_pages_direct_compact(gfp_mask, order,
5143 : alloc_flags, ac,
5144 : INIT_COMPACT_PRIORITY,
5145 : &compact_result);
5146 0 : if (page)
5147 : goto got_pg;
5148 :
5149 : /*
5150 : * Checks for costly allocations with __GFP_NORETRY, which
5151 : * includes some THP page fault allocations
5152 : */
5153 0 : if (costly_order && (gfp_mask & __GFP_NORETRY)) {
5154 : /*
5155 : * If allocating entire pageblock(s) and compaction
5156 : * failed because all zones are below low watermarks
5157 : * or is prohibited because it recently failed at this
5158 : * order, fail immediately unless the allocator has
5159 : * requested compaction and reclaim retry.
5160 : *
5161 : * Reclaim is
5162 : * - potentially very expensive because zones are far
5163 : * below their low watermarks or this is part of very
5164 : * bursty high order allocations,
5165 : * - not guaranteed to help because isolate_freepages()
5166 : * may not iterate over freed pages as part of its
5167 : * linear scan, and
5168 : * - unlikely to make entire pageblocks free on its
5169 : * own.
5170 : */
5171 0 : if (compact_result == COMPACT_SKIPPED ||
5172 : compact_result == COMPACT_DEFERRED)
5173 : goto nopage;
5174 :
5175 : /*
5176 : * Looks like reclaim/compaction is worth trying, but
5177 : * sync compaction could be very expensive, so keep
5178 : * using async compaction.
5179 : */
5180 0 : compact_priority = INIT_COMPACT_PRIORITY;
5181 : }
5182 : }
5183 :
5184 : retry:
5185 : /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
5186 0 : if (alloc_flags & ALLOC_KSWAPD)
5187 0 : wake_all_kswapds(order, gfp_mask, ac);
5188 :
5189 0 : reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
5190 0 : if (reserve_flags)
5191 0 : alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
5192 : (alloc_flags & ALLOC_KSWAPD);
5193 :
5194 : /*
5195 : * Reset the nodemask and zonelist iterators if memory policies can be
5196 : * ignored. These allocations are high priority and system rather than
5197 : * user oriented.
5198 : */
5199 0 : if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
5200 0 : ac->nodemask = NULL;
5201 0 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
5202 : ac->highest_zoneidx, ac->nodemask);
5203 : }
5204 :
5205 : /* Attempt with potentially adjusted zonelist and alloc_flags */
5206 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
5207 0 : if (page)
5208 : goto got_pg;
5209 :
5210 : /* Caller is not willing to reclaim, we can't balance anything */
5211 0 : if (!can_direct_reclaim)
5212 : goto nopage;
5213 :
5214 : /* Avoid recursion of direct reclaim */
5215 0 : if (current->flags & PF_MEMALLOC)
5216 : goto nopage;
5217 :
5218 : /* Try direct reclaim and then allocating */
5219 0 : page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
5220 : &did_some_progress);
5221 0 : if (page)
5222 : goto got_pg;
5223 :
5224 : /* Try direct compaction and then allocating */
5225 0 : page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
5226 : compact_priority, &compact_result);
5227 0 : if (page)
5228 : goto got_pg;
5229 :
5230 : /* Do not loop if specifically requested */
5231 0 : if (gfp_mask & __GFP_NORETRY)
5232 : goto nopage;
5233 :
5234 : /*
5235 : * Do not retry costly high order allocations unless they are
5236 : * __GFP_RETRY_MAYFAIL
5237 : */
5238 0 : if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
5239 : goto nopage;
5240 :
5241 0 : if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
5242 : did_some_progress > 0, &no_progress_loops))
5243 : goto retry;
5244 :
5245 : /*
5246 : * It doesn't make any sense to retry for the compaction if the order-0
5247 : * reclaim is not able to make any progress because the current
5248 : * implementation of the compaction depends on the sufficient amount
5249 : * of free memory (see __compaction_suitable)
5250 : */
5251 0 : if (did_some_progress > 0 &&
5252 0 : should_compact_retry(ac, order, alloc_flags,
5253 : compact_result, &compact_priority,
5254 : &compaction_retries))
5255 : goto retry;
5256 :
5257 :
5258 : /*
5259 : * Deal with possible cpuset update races or zonelist updates to avoid
5260 : * an unnecessary OOM kill.
5261 : */
5262 0 : if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5263 0 : check_retry_zonelist(zonelist_iter_cookie))
5264 : goto restart;
5265 :
5266 : /* Reclaim has failed us, start killing things */
5267 0 : page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
5268 0 : if (page)
5269 : goto got_pg;
5270 :
5271 : /* Avoid allocations with no watermarks from looping endlessly */
5272 0 : if (tsk_is_oom_victim(current) &&
5273 0 : (alloc_flags & ALLOC_OOM ||
5274 0 : (gfp_mask & __GFP_NOMEMALLOC)))
5275 : goto nopage;
5276 :
5277 : /* Retry as long as the OOM killer is making progress */
5278 0 : if (did_some_progress) {
5279 0 : no_progress_loops = 0;
5280 0 : goto retry;
5281 : }
5282 :
5283 : nopage:
5284 : /*
5285 : * Deal with possible cpuset update races or zonelist updates to avoid
5286 : * an unnecessary OOM kill.
5287 : */
5288 0 : if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
5289 0 : check_retry_zonelist(zonelist_iter_cookie))
5290 : goto restart;
5291 :
5292 : /*
5293 : * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
5294 : * we always retry
5295 : */
5296 0 : if (gfp_mask & __GFP_NOFAIL) {
5297 : /*
5298 : * All existing users of __GFP_NOFAIL are blockable, so warn
5299 : * about any new users that actually require GFP_NOWAIT
5300 : */
5301 0 : if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
5302 : goto fail;
5303 :
5304 : /*
5305 : * A PF_MEMALLOC request from this context is rather bizarre
5306 : * because we cannot reclaim anything and can only loop waiting
5307 : * for somebody else to do the work for us
5308 : */
5309 0 : WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
5310 :
5311 : /*
5312 : * Non-failing costly orders are a hard requirement which we
5313 : * are not well prepared for, so warn about these users so
5314 : * that we can identify them and convert them to something
5315 : * else.
5316 : */
5317 0 : WARN_ON_ONCE_GFP(costly_order, gfp_mask);
5318 :
5319 : /*
5320 : * Help non-failing allocations by giving some access to memory
5321 : * reserves normally used for high priority non-blocking
5322 : * allocations but do not use ALLOC_NO_WATERMARKS because this
5323 : * could deplete whole memory reserves which would just make
5324 : * the situation worse.
5325 : */
5326 0 : page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
5327 0 : if (page)
5328 : goto got_pg;
5329 :
5330 0 : cond_resched();
5331 0 : goto retry;
5332 : }
5333 : fail:
5334 0 : warn_alloc(gfp_mask, ac->nodemask,
5335 : "page allocation failure: order:%u", order);
5336 : got_pg:
5337 0 : return page;
5338 : }
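/*
 * Hedged caller-side sketch of gfp flags the slowpath above interprets:
 * an opportunistic high-order attempt that bails out quickly (__GFP_NORETRY,
 * no OOM kill) and falls back to order-0 pages. Illustrative only.
 *
 *	page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, 4);
 *	if (!page)
 *		page = alloc_pages(GFP_KERNEL, 0);
 */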
5339 :
5340 9224 : static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
5341 : int preferred_nid, nodemask_t *nodemask,
5342 : struct alloc_context *ac, gfp_t *alloc_gfp,
5343 : unsigned int *alloc_flags)
5344 : {
5345 9224 : ac->highest_zoneidx = gfp_zone(gfp_mask);
5346 18448 : ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
5347 9224 : ac->nodemask = nodemask;
5348 9224 : ac->migratetype = gfp_migratetype(gfp_mask);
5349 :
5350 : if (cpusets_enabled()) {
5351 : *alloc_gfp |= __GFP_HARDWALL;
5352 : /*
5353 : * When we are in interrupt context, the cpuset of the
5354 : * current task is irrelevant, so any node is OK.
5355 : */
5356 : if (in_task() && !ac->nodemask)
5357 : ac->nodemask = &cpuset_current_mems_allowed;
5358 : else
5359 : *alloc_flags |= ALLOC_CPUSET;
5360 : }
5361 :
5362 9224 : might_alloc(gfp_mask);
5363 :
5364 9224 : if (should_fail_alloc_page(gfp_mask, order))
5365 : return false;
5366 :
5367 9224 : *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
5368 :
5369 : /* Dirty zone balancing only done in the fast path */
5370 9224 : ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
5371 :
5372 : /*
5373 : * The preferred zone is used for statistics but crucially it is
5374 : * also used as the starting point for the zonelist iterator. It
5375 : * may get reset for allocations that ignore memory policies.
5376 : */
5377 18448 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
5378 : ac->highest_zoneidx, ac->nodemask);
5379 :
5380 : return true;
5381 : }
5382 :
5383 : /*
5384 : * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
5385 : * @gfp: GFP flags for the allocation
5386 : * @preferred_nid: The preferred NUMA node ID to allocate from
5387 : * @nodemask: Set of nodes to allocate from, may be NULL
5388 : * @nr_pages: The number of pages desired on the list or array
5389 : * @page_list: Optional list to store the allocated pages
5390 : * @page_array: Optional array to store the pages
5391 : *
5392 : * This is a batched version of the page allocator that attempts to
5393 : * allocate nr_pages quickly. Pages are added to page_list if page_list
5394 : * is not NULL, otherwise it is assumed that the page_array is valid.
5395 : *
5396 : * For lists, nr_pages is the number of pages that should be allocated.
5397 : *
5398 : * For arrays, only NULL elements are populated with pages and nr_pages
5399 : * is the maximum number of pages that will be stored in the array.
5400 : *
5401 : * Returns the number of pages on the list or array.
5402 : */
5403 596 : unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
5404 : nodemask_t *nodemask, int nr_pages,
5405 : struct list_head *page_list,
5406 : struct page **page_array)
5407 : {
5408 : struct page *page;
5409 : unsigned long __maybe_unused UP_flags;
5410 : struct zone *zone;
5411 : struct zoneref *z;
5412 : struct per_cpu_pages *pcp;
5413 : struct list_head *pcp_list;
5414 : struct alloc_context ac;
5415 : gfp_t alloc_gfp;
5416 596 : unsigned int alloc_flags = ALLOC_WMARK_LOW;
5417 596 : int nr_populated = 0, nr_account = 0;
5418 :
5419 : /*
5420 : * Skip populated array elements to determine if any pages need
5421 : * to be allocated before disabling IRQs.
5422 : */
5423 1192 : while (page_array && nr_populated < nr_pages && page_array[nr_populated])
5424 0 : nr_populated++;
5425 :
5426 : /* No pages requested? */
5427 596 : if (unlikely(nr_pages <= 0))
5428 : goto out;
5429 :
5430 : /* Already populated array? */
5431 596 : if (unlikely(page_array && nr_pages - nr_populated == 0))
5432 : goto out;
5433 :
5434 : /* Bulk allocator does not support memcg accounting. */
5435 : if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
5436 : goto failed;
5437 :
5438 : /* Use the single page allocator for one page. */
5439 596 : if (nr_pages - nr_populated == 1)
5440 : goto failed;
5441 :
5442 : #ifdef CONFIG_PAGE_OWNER
5443 : /*
5444 : * PAGE_OWNER may recurse into the allocator to allocate space to
5445 : * save the stack with pagesets.lock held. Releasing/reacquiring
5446 : * removes much of the performance benefit of bulk allocation so
5447 : * force the caller to allocate one page at a time; that performs
5448 : * similarly without adding complexity to the bulk allocator.
5449 : */
5450 : if (static_branch_unlikely(&page_owner_inited))
5451 : goto failed;
5452 : #endif
5453 :
5454 : /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
5455 596 : gfp &= gfp_allowed_mask;
5456 596 : alloc_gfp = gfp;
5457 596 : if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
5458 : goto out;
5459 596 : gfp = alloc_gfp;
5460 :
5461 : /* Find an allowed local zone that meets the low watermark. */
5462 1192 : for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
5463 : unsigned long mark;
5464 :
5465 : if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
5466 : !__cpuset_zone_allowed(zone, gfp)) {
5467 : continue;
5468 : }
5469 :
5470 : if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
5471 : zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
5472 : goto failed;
5473 : }
5474 :
5475 596 : mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
5476 596 : if (zone_watermark_fast(zone, 0, mark,
5477 : zonelist_zone_idx(ac.preferred_zoneref),
5478 : alloc_flags, gfp)) {
5479 : break;
5480 : }
5481 : }
5482 :
5483 : /*
5484 : * If there are no allowed local zones that meet the watermarks, then
5485 : * try to allocate a single page and reclaim if necessary.
5486 : */
5487 596 : if (unlikely(!zone))
5488 : goto failed;
5489 :
5490 : /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
5491 596 : pcp_trylock_prepare(UP_flags);
5492 1192 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
5493 596 : if (!pcp)
5494 : goto failed_irq;
5495 :
5496 : /* Attempt the batch allocation */
5497 1192 : pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
5498 43733 : while (nr_populated < nr_pages) {
5499 :
5500 : /* Skip existing pages */
5501 42541 : if (page_array && page_array[nr_populated]) {
5502 0 : nr_populated++;
5503 0 : continue;
5504 : }
5505 :
5506 42541 : page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
5507 : pcp, pcp_list);
5508 42541 : if (unlikely(!page)) {
5509 : /* Try and allocate at least one page */
5510 0 : if (!nr_account) {
5511 0 : pcp_spin_unlock(pcp);
5512 0 : goto failed_irq;
5513 : }
5514 : break;
5515 : }
5516 42541 : nr_account++;
5517 :
5518 42541 : prep_new_page(page, 0, gfp, 0);
5519 42541 : if (page_list)
5520 0 : list_add(&page->lru, page_list);
5521 : else
5522 42541 : page_array[nr_populated] = page;
5523 42541 : nr_populated++;
5524 : }
5525 :
5526 1192 : pcp_spin_unlock(pcp);
5527 1192 : pcp_trylock_finish(UP_flags);
5528 :
5529 1192 : __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
5530 596 : zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
5531 :
5532 : out:
5533 596 : return nr_populated;
5534 :
5535 : failed_irq:
5536 0 : pcp_trylock_finish(UP_flags);
5537 :
5538 : failed:
5539 0 : page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
5540 0 : if (page) {
5541 0 : if (page_list)
5542 0 : list_add(&page->lru, page_list);
5543 : else
5544 0 : page_array[nr_populated] = page;
5545 0 : nr_populated++;
5546 : }
5547 :
5548 : goto out;
5549 : }
5550 : EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
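/*
 * Hedged usage sketch for the bulk interface above: filling a small array
 * of order-0 pages; only NULL slots are populated and the return value is
 * the number of populated elements. Illustrative only.
 *
 *	struct page *pages[16] = { NULL };
 *	unsigned long nr;
 *
 *	nr = __alloc_pages_bulk(GFP_KERNEL, numa_node_id(), NULL,
 *				ARRAY_SIZE(pages), NULL, pages);
 */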
5551 :
5552 : /*
5553 : * This is the 'heart' of the zoned buddy allocator.
5554 : */
5555 8628 : struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
5556 : nodemask_t *nodemask)
5557 : {
5558 : struct page *page;
5559 8628 : unsigned int alloc_flags = ALLOC_WMARK_LOW;
5560 : gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
5561 8628 : struct alloc_context ac = { };
5562 :
5563 : /*
5564 : * There are several places where we assume that the order value is sane
5565 : * so bail out early if the request is out of bound.
5566 : */
5567 8628 : if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp))
5568 : return NULL;
5569 :
5570 8628 : gfp &= gfp_allowed_mask;
5571 : /*
5572 : * Apply scoped allocation constraints. This is mainly about GFP_NOFS
5573 : * resp. GFP_NOIO which has to be inherited for all allocation requests
5574 : * from a particular context which has been marked by
5575 : * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
5576 : * movable zones are not used during allocation.
5577 : */
5578 8628 : gfp = current_gfp_context(gfp);
5579 8628 : alloc_gfp = gfp;
5580 8628 : if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
5581 : &alloc_gfp, &alloc_flags))
5582 : return NULL;
5583 :
5584 : /*
5585 : * Forbid the first pass from falling back to types that fragment
5586 : * memory until all local zones are considered.
5587 : */
5588 17256 : alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
5589 :
5590 : /* First allocation attempt */
5591 8628 : page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
5592 8628 : if (likely(page))
5593 : goto out;
5594 :
5595 0 : alloc_gfp = gfp;
5596 0 : ac.spread_dirty_pages = false;
5597 :
5598 : /*
5599 : * Restore the original nodemask if it was potentially replaced with
5600 : * &cpuset_current_mems_allowed to optimize the fast-path attempt.
5601 : */
5602 0 : ac.nodemask = nodemask;
5603 :
5604 0 : page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
5605 :
5606 : out:
5607 : if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
5608 : unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
5609 : __free_pages(page, order);
5610 : page = NULL;
5611 : }
5612 :
5613 8628 : trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
5614 8628 : kmsan_alloc_page(page, order, alloc_gfp);
5615 :
5616 8628 : return page;
5617 : }
5618 : EXPORT_SYMBOL(__alloc_pages);
5619 :
5620 0 : struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
5621 : nodemask_t *nodemask)
5622 : {
5623 0 : struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
5624 : preferred_nid, nodemask);
5625 :
5626 : if (page && order > 1)
5627 : prep_transhuge_page(page);
5628 0 : return (struct folio *)page;
5629 : }
5630 : EXPORT_SYMBOL(__folio_alloc);
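/*
 * Hedged usage sketch: folio_alloc() (a NUMA-local wrapper around the
 * function above) paired with folio_put(). Illustrative only.
 *
 *	struct folio *folio = folio_alloc(GFP_KERNEL, 2);
 *	if (folio)
 *		folio_put(folio);
 */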
5631 :
5632 : /*
5633 : * Common helper functions. Never use with __GFP_HIGHMEM because the returned
5634 : * address cannot represent highmem pages. Use alloc_pages and then kmap if
5635 : * you need to access high mem.
5636 : */
5637 20 : unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
5638 : {
5639 : struct page *page;
5640 :
5641 40 : page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
5642 20 : if (!page)
5643 : return 0;
5644 20 : return (unsigned long) page_address(page);
5645 : }
5646 : EXPORT_SYMBOL(__get_free_pages);
5647 :
5648 0 : unsigned long get_zeroed_page(gfp_t gfp_mask)
5649 : {
5650 0 : return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
5651 : }
5652 : EXPORT_SYMBOL(get_zeroed_page);
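/*
 * Hedged example for the helpers above: a zeroed scratch page obtained as a
 * kernel virtual address and released with free_page(). Illustrative only.
 *
 *	unsigned long buf = get_zeroed_page(GFP_KERNEL);
 *
 *	if (buf) {
 *		// ... use the page ...
 *		free_page(buf);
 *	}
 */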
5653 :
5654 : /**
5655 : * __free_pages - Free pages allocated with alloc_pages().
5656 : * @page: The page pointer returned from alloc_pages().
5657 : * @order: The order of the allocation.
5658 : *
5659 : * This function can free multi-page allocations that are not compound
5660 : * pages. It does not check that the @order passed in matches that of
5661 : * the allocation, so it is easy to leak memory. Freeing more memory
5662 : * than was allocated will probably emit a warning.
5663 : *
5664 : * If the last reference to this page is speculative, it will be released
5665 : * by put_page() which only frees the first page of a non-compound
5666 : * allocation. To prevent the remaining pages from being leaked, we free
5667 : * the subsequent pages here. If you want to use the page's reference
5668 : * count to decide when to free the allocation, you should allocate a
5669 : * compound page, and use put_page() instead of __free_pages().
5670 : *
5671 : * Context: May be called in interrupt context or while holding a normal
5672 : * spinlock, but not in NMI context or while holding a raw spinlock.
5673 : */
5674 50643 : void __free_pages(struct page *page, unsigned int order)
5675 : {
5676 : /* get PageHead before we drop reference */
5677 50643 : int head = PageHead(page);
5678 :
5679 50643 : if (put_page_testzero(page))
5680 50643 : free_the_page(page, order);
5681 0 : else if (!head)
5682 0 : while (order-- > 0)
5683 0 : free_the_page(page + (1 << order), order);
5684 50643 : }
5685 : EXPORT_SYMBOL(__free_pages);
5686 :
5687 0 : void free_pages(unsigned long addr, unsigned int order)
5688 : {
5689 0 : if (addr != 0) {
5690 : VM_BUG_ON(!virt_addr_valid((void *)addr));
5691 0 : __free_pages(virt_to_page((void *)addr), order);
5692 : }
5693 0 : }
5694 :
5695 : EXPORT_SYMBOL(free_pages);
5696 :
5697 : /*
5698 : * Page Fragment:
5699 : * An arbitrary-length arbitrary-offset area of memory which resides
5700 : * within a 0 or higher order page. Multiple fragments within that page
5701 : * are individually refcounted, in the page's reference counter.
5702 : *
5703 : * The page_frag functions below provide a simple allocation framework for
5704 : * page fragments. This is used by the network stack and network device
5705 : * drivers to provide a backing region of memory for use as either an
5706 : * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
5707 : */
5708 0 : static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
5709 : gfp_t gfp_mask)
5710 : {
5711 0 : struct page *page = NULL;
5712 0 : gfp_t gfp = gfp_mask;
5713 :
5714 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
5715 0 : gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
5716 : __GFP_NOMEMALLOC;
5717 0 : page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
5718 0 : PAGE_FRAG_CACHE_MAX_ORDER);
5719 0 : nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
5720 : #endif
5721 0 : if (unlikely(!page))
5722 0 : page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
5723 :
5724 0 : nc->va = page ? page_address(page) : NULL;
5725 :
5726 0 : return page;
5727 : }
5728 :
5729 0 : void __page_frag_cache_drain(struct page *page, unsigned int count)
5730 : {
5731 : VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
5732 :
5733 0 : if (page_ref_sub_and_test(page, count))
5734 0 : free_the_page(page, compound_order(page));
5735 0 : }
5736 : EXPORT_SYMBOL(__page_frag_cache_drain);
5737 :
5738 0 : void *page_frag_alloc_align(struct page_frag_cache *nc,
5739 : unsigned int fragsz, gfp_t gfp_mask,
5740 : unsigned int align_mask)
5741 : {
5742 0 : unsigned int size = PAGE_SIZE;
5743 : struct page *page;
5744 : int offset;
5745 :
5746 0 : if (unlikely(!nc->va)) {
5747 : refill:
5748 0 : page = __page_frag_cache_refill(nc, gfp_mask);
5749 0 : if (!page)
5750 : return NULL;
5751 :
5752 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
5753 : /* if size can vary use size else just use PAGE_SIZE */
5754 0 : size = nc->size;
5755 : #endif
5756 : /* Even if we own the page, we do not use atomic_set().
5757 : * This would break get_page_unless_zero() users.
5758 : */
5759 0 : page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
5760 :
5761 : /* reset page count bias and offset to start of new frag */
5762 0 : nc->pfmemalloc = page_is_pfmemalloc(page);
5763 0 : nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
5764 0 : nc->offset = size;
5765 : }
5766 :
5767 0 : offset = nc->offset - fragsz;
5768 0 : if (unlikely(offset < 0)) {
5769 0 : page = virt_to_page(nc->va);
5770 :
5771 0 : if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
5772 : goto refill;
5773 :
5774 0 : if (unlikely(nc->pfmemalloc)) {
5775 0 : free_the_page(page, compound_order(page));
5776 0 : goto refill;
5777 : }
5778 :
5779 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
5780 : /* if size can vary use size else just use PAGE_SIZE */
5781 0 : size = nc->size;
5782 : #endif
5783 : /* OK, page count is 0, we can safely set it */
5784 0 : set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
5785 :
5786 : /* reset page count bias and offset to start of new frag */
5787 0 : nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
5788 0 : offset = size - fragsz;
5789 0 : if (unlikely(offset < 0)) {
5790 : /*
5791 : * The caller is trying to allocate a fragment
5792 : * with fragsz > PAGE_SIZE but the cache isn't big
5793 : * enough to satisfy the request, this may
5794 : * happen in low memory conditions.
5795 : * We don't release the cache page because
5796 : * it could make memory pressure worse
5797 : * so we simply return NULL here.
5798 : */
5799 : return NULL;
5800 : }
5801 : }
5802 :
5803 0 : nc->pagecnt_bias--;
5804 0 : offset &= align_mask;
5805 0 : nc->offset = offset;
5806 :
5807 0 : return nc->va + offset;
5808 : }
5809 : EXPORT_SYMBOL(page_frag_alloc_align);
5810 :
5811 : /*
5812 : * Frees a page fragment allocated out of either a compound or order 0 page.
5813 : */
5814 0 : void page_frag_free(void *addr)
5815 : {
5816 0 : struct page *page = virt_to_head_page(addr);
5817 :
5818 0 : if (unlikely(put_page_testzero(page)))
5819 0 : free_the_page(page, compound_order(page));
5820 0 : }
5821 : EXPORT_SYMBOL(page_frag_free);
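/*
 * Hedged driver-side sketch for the page_frag API above (cache placement and
 * sizes are hypothetical): small buffers carved from one backing page and
 * released individually with page_frag_free().
 *
 *	static struct page_frag_cache frag_cache;
 *
 *	void *buf = page_frag_alloc(&frag_cache, 256, GFP_ATOMIC);
 *	if (buf)
 *		page_frag_free(buf);
 */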
5822 :
5823 3 : static void *make_alloc_exact(unsigned long addr, unsigned int order,
5824 : size_t size)
5825 : {
5826 3 : if (addr) {
5827 3 : unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
5828 6 : struct page *page = virt_to_page((void *)addr);
5829 3 : struct page *last = page + nr;
5830 :
5831 3 : split_page_owner(page, 1 << order);
5832 3 : split_page_memcg(page, 1 << order);
5833 10 : while (page < --last)
5834 : set_page_refcounted(last);
5835 :
5836 3 : last = page + (1UL << order);
5837 3 : for (page += nr; page < last; page++)
5838 0 : __free_pages_ok(page, 0, FPI_TO_TAIL);
5839 : }
5840 3 : return (void *)addr;
5841 : }
5842 :
5843 : /**
5844 : * alloc_pages_exact - allocate an exact number physically-contiguous pages.
5845 : * @size: the number of bytes to allocate
5846 : * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
5847 : *
5848 : * This function is similar to alloc_pages(), except that it allocates the
5849 : * minimum number of pages to satisfy the request. alloc_pages() can only
5850 : * allocate memory in power-of-two pages.
5851 : *
5852 : * This function is also limited by MAX_ORDER.
5853 : *
5854 : * Memory allocated by this function must be released by free_pages_exact().
5855 : *
5856 : * Return: pointer to the allocated area or %NULL in case of error.
5857 : */
5858 3 : void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
5859 : {
5860 3 : unsigned int order = get_order(size);
5861 : unsigned long addr;
5862 :
5863 3 : if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
5864 0 : gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
5865 :
5866 3 : addr = __get_free_pages(gfp_mask, order);
5867 3 : return make_alloc_exact(addr, order, size);
5868 : }
5869 : EXPORT_SYMBOL(alloc_pages_exact);
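/*
 * Hedged example: exactly three physically contiguous pages, rather than the
 * four an order-2 alloc_pages() would hand back; the trailing page is freed
 * by make_alloc_exact(). Illustrative only.
 *
 *	void *buf = alloc_pages_exact(3 * PAGE_SIZE, GFP_KERNEL);
 *	if (buf)
 *		free_pages_exact(buf, 3 * PAGE_SIZE);
 */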
5870 :
5871 : /**
5872 : * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
5873 : * pages on a node.
5874 : * @nid: the preferred node ID where memory should be allocated
5875 : * @size: the number of bytes to allocate
5876 : * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
5877 : *
5878 : * Like alloc_pages_exact(), but try to allocate on node nid first before falling
5879 : * back.
5880 : *
5881 : * Return: pointer to the allocated area or %NULL in case of error.
5882 : */
5883 0 : void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
5884 : {
5885 0 : unsigned int order = get_order(size);
5886 : struct page *p;
5887 :
5888 0 : if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
5889 0 : gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
5890 :
5891 0 : p = alloc_pages_node(nid, gfp_mask, order);
5892 0 : if (!p)
5893 : return NULL;
5894 0 : return make_alloc_exact((unsigned long)page_address(p), order, size);
5895 : }
5896 :
5897 : /**
5898 : * free_pages_exact - release memory allocated via alloc_pages_exact()
5899 : * @virt: the value returned by alloc_pages_exact.
5900 : * @size: size of allocation, same value as passed to alloc_pages_exact().
5901 : *
5902 : * Release the memory allocated by a previous call to alloc_pages_exact.
5903 : */
5904 0 : void free_pages_exact(void *virt, size_t size)
5905 : {
5906 0 : unsigned long addr = (unsigned long)virt;
5907 0 : unsigned long end = addr + PAGE_ALIGN(size);
5908 :
5909 0 : while (addr < end) {
5910 0 : free_page(addr);
5911 0 : addr += PAGE_SIZE;
5912 : }
5913 0 : }
5914 : EXPORT_SYMBOL(free_pages_exact);
5915 :
5916 : /**
5917 : * nr_free_zone_pages - count number of pages beyond high watermark
5918 : * @offset: The zone index of the highest zone
5919 : *
5920 : * nr_free_zone_pages() counts the number of pages which are beyond the
5921 : * high watermark within all zones at or below a given zone index. For each
5922 : * zone, the number of pages is calculated as:
5923 : *
5924 : * nr_free_zone_pages = managed_pages - high_pages
5925 : *
5926 : * Return: number of pages beyond high watermark.
5927 : */
5928 3 : static unsigned long nr_free_zone_pages(int offset)
5929 : {
5930 : struct zoneref *z;
5931 : struct zone *zone;
5932 :
5933 : /* Just pick one node, since fallback list is circular */
5934 3 : unsigned long sum = 0;
5935 :
5936 6 : struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
5937 :
5938 12 : for_each_zone_zonelist(zone, z, zonelist, offset) {
5939 3 : unsigned long size = zone_managed_pages(zone);
5940 3 : unsigned long high = high_wmark_pages(zone);
5941 3 : if (size > high)
5942 3 : sum += size - high;
5943 : }
5944 :
5945 3 : return sum;
5946 : }
5947 :
5948 : /**
5949 : * nr_free_buffer_pages - count number of pages beyond high watermark
5950 : *
5951 : * nr_free_buffer_pages() counts the number of pages which are beyond the high
5952 : * watermark within ZONE_DMA and ZONE_NORMAL.
5953 : *
5954 : * Return: number of pages beyond high watermark within ZONE_DMA and
5955 : * ZONE_NORMAL.
5956 : */
5957 1 : unsigned long nr_free_buffer_pages(void)
5958 : {
5959 2 : return nr_free_zone_pages(gfp_zone(GFP_USER));
5960 : }
5961 : EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
5962 :
5963 : static inline void show_node(struct zone *zone)
5964 : {
5965 : if (IS_ENABLED(CONFIG_NUMA))
5966 : printk("Node %d ", zone_to_nid(zone));
5967 : }
5968 :
5969 0 : long si_mem_available(void)
5970 : {
5971 : long available;
5972 : unsigned long pagecache;
5973 0 : unsigned long wmark_low = 0;
5974 : unsigned long pages[NR_LRU_LISTS];
5975 : unsigned long reclaimable;
5976 : struct zone *zone;
5977 : int lru;
5978 :
5979 0 : for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
5980 0 : pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
5981 :
5982 0 : for_each_zone(zone)
5983 0 : wmark_low += low_wmark_pages(zone);
5984 :
5985 : /*
5986 : * Estimate the amount of memory available for userspace allocations,
5987 : * without causing swapping or OOM.
5988 : */
5989 0 : available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
5990 :
5991 : /*
5992 : * Not all the page cache can be freed, otherwise the system will
5993 : * start swapping or thrashing. Assume at least half of the page
5994 : * cache, or the low watermark worth of cache, needs to stay.
5995 : */
5996 0 : pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
5997 0 : pagecache -= min(pagecache / 2, wmark_low);
5998 0 : available += pagecache;
5999 :
6000 : /*
6001 : * Part of the reclaimable slab and other kernel memory consists of
6002 : * items that are in use, and cannot be freed. Cap this estimate at the
6003 : * low watermark.
6004 : */
6005 0 : reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
6006 0 : global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
6007 0 : available += reclaimable - min(reclaimable / 2, wmark_low);
6008 :
6009 0 : if (available < 0)
6010 0 : available = 0;
6011 0 : return available;
6012 : }
6013 : EXPORT_SYMBOL_GPL(si_mem_available);
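/*
 * Worked example for the estimate above (hedged, illustrative numbers):
 * with 1000 free pages, totalreserve_pages = 200, 600 file LRU pages,
 * 100 reclaimable kernel pages and a summed low watermark of 50:
 *
 *	available  = 1000 - 200              =  800
 *	available += 600 - min(600 / 2, 50)  = 1350
 *	available += 100 - min(100 / 2, 50)  = 1400 pages
 */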
6014 :
6015 2 : void si_meminfo(struct sysinfo *val)
6016 : {
6017 2 : val->totalram = totalram_pages();
6018 2 : val->sharedram = global_node_page_state(NR_SHMEM);
6019 2 : val->freeram = global_zone_page_state(NR_FREE_PAGES);
6020 2 : val->bufferram = nr_blockdev_pages();
6021 2 : val->totalhigh = totalhigh_pages();
6022 2 : val->freehigh = nr_free_highpages();
6023 2 : val->mem_unit = PAGE_SIZE;
6024 2 : }
6025 :
6026 : EXPORT_SYMBOL(si_meminfo);
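/*
 * Hedged usage sketch: querying the global counters filled in above.
 * Illustrative only.
 *
 *	struct sysinfo si;
 *
 *	si_meminfo(&si);
 *	pr_debug("%lu free pages of %u bytes\n", si.freeram, si.mem_unit);
 */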
6027 :
6028 : #ifdef CONFIG_NUMA
6029 : void si_meminfo_node(struct sysinfo *val, int nid)
6030 : {
6031 : int zone_type; /* needs to be signed */
6032 : unsigned long managed_pages = 0;
6033 : unsigned long managed_highpages = 0;
6034 : unsigned long free_highpages = 0;
6035 : pg_data_t *pgdat = NODE_DATA(nid);
6036 :
6037 : for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
6038 : managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
6039 : val->totalram = managed_pages;
6040 : val->sharedram = node_page_state(pgdat, NR_SHMEM);
6041 : val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
6042 : #ifdef CONFIG_HIGHMEM
6043 : for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
6044 : struct zone *zone = &pgdat->node_zones[zone_type];
6045 :
6046 : if (is_highmem(zone)) {
6047 : managed_highpages += zone_managed_pages(zone);
6048 : free_highpages += zone_page_state(zone, NR_FREE_PAGES);
6049 : }
6050 : }
6051 : val->totalhigh = managed_highpages;
6052 : val->freehigh = free_highpages;
6053 : #else
6054 : val->totalhigh = managed_highpages;
6055 : val->freehigh = free_highpages;
6056 : #endif
6057 : val->mem_unit = PAGE_SIZE;
6058 : }
6059 : #endif
6060 :
6061 : /*
6062 : * Determine whether the node should be displayed or not, depending on whether
6063 : * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
6064 : */
6065 0 : static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
6066 : {
6067 0 : if (!(flags & SHOW_MEM_FILTER_NODES))
6068 : return false;
6069 :
6070 : /*
6071 : * no node mask - aka implicit memory numa policy. Do not bother with
6072 : * the synchronization - read_mems_allowed_begin - because we do not
6073 : * have to be precise here.
6074 : */
6075 0 : if (!nodemask)
6076 0 : nodemask = &cpuset_current_mems_allowed;
6077 :
6078 0 : return !node_isset(nid, *nodemask);
6079 : }
6080 :
6081 : #define K(x) ((x) << (PAGE_SHIFT-10))
6082 :
6083 0 : static void show_migration_types(unsigned char type)
6084 : {
6085 : static const char types[MIGRATE_TYPES] = {
6086 : [MIGRATE_UNMOVABLE] = 'U',
6087 : [MIGRATE_MOVABLE] = 'M',
6088 : [MIGRATE_RECLAIMABLE] = 'E',
6089 : [MIGRATE_HIGHATOMIC] = 'H',
6090 : #ifdef CONFIG_CMA
6091 : [MIGRATE_CMA] = 'C',
6092 : #endif
6093 : #ifdef CONFIG_MEMORY_ISOLATION
6094 : [MIGRATE_ISOLATE] = 'I',
6095 : #endif
6096 : };
6097 : char tmp[MIGRATE_TYPES + 1];
6098 0 : char *p = tmp;
6099 : int i;
6100 :
6101 0 : for (i = 0; i < MIGRATE_TYPES; i++) {
6102 0 : if (type & (1 << i))
6103 0 : *p++ = types[i];
6104 : }
6105 :
6106 0 : *p = '\0';
6107 0 : printk(KERN_CONT "(%s) ", tmp);
6108 0 : }
6109 :
6110 : static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
6111 : {
6112 : int zone_idx;
6113 0 : for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
6114 0 : if (zone_managed_pages(pgdat->node_zones + zone_idx))
6115 : return true;
6116 : return false;
6117 : }
6118 :
6119 : /*
6120 : * Show free area list (used inside shift_scroll-lock stuff)
6121 : * We also calculate the percentage fragmentation. We do this by counting the
6122 : * memory on each free list with the exception of the first item on the list.
6123 : *
6124 : * Bits in @filter:
6125 : * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
6126 : * cpuset.
6127 : */
6128 0 : void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
6129 : {
6130 0 : unsigned long free_pcp = 0;
6131 : int cpu, nid;
6132 : struct zone *zone;
6133 : pg_data_t *pgdat;
6134 :
6135 0 : for_each_populated_zone(zone) {
6136 0 : if (zone_idx(zone) > max_zone_idx)
6137 0 : continue;
6138 0 : if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
6139 0 : continue;
6140 :
6141 0 : for_each_online_cpu(cpu)
6142 0 : free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
6143 : }
6144 :
6145 0 : printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
6146 : " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
6147 : " unevictable:%lu dirty:%lu writeback:%lu\n"
6148 : " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
6149 : " mapped:%lu shmem:%lu pagetables:%lu\n"
6150 : " sec_pagetables:%lu bounce:%lu\n"
6151 : " kernel_misc_reclaimable:%lu\n"
6152 : " free:%lu free_pcp:%lu free_cma:%lu\n",
6153 : global_node_page_state(NR_ACTIVE_ANON),
6154 : global_node_page_state(NR_INACTIVE_ANON),
6155 : global_node_page_state(NR_ISOLATED_ANON),
6156 : global_node_page_state(NR_ACTIVE_FILE),
6157 : global_node_page_state(NR_INACTIVE_FILE),
6158 : global_node_page_state(NR_ISOLATED_FILE),
6159 : global_node_page_state(NR_UNEVICTABLE),
6160 : global_node_page_state(NR_FILE_DIRTY),
6161 : global_node_page_state(NR_WRITEBACK),
6162 : global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
6163 : global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
6164 : global_node_page_state(NR_FILE_MAPPED),
6165 : global_node_page_state(NR_SHMEM),
6166 : global_node_page_state(NR_PAGETABLE),
6167 : global_node_page_state(NR_SECONDARY_PAGETABLE),
6168 : global_zone_page_state(NR_BOUNCE),
6169 : global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
6170 : global_zone_page_state(NR_FREE_PAGES),
6171 : free_pcp,
6172 : global_zone_page_state(NR_FREE_CMA_PAGES));
6173 :
6174 0 : for_each_online_pgdat(pgdat) {
6175 0 : if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
6176 0 : continue;
6177 0 : if (!node_has_managed_zones(pgdat, max_zone_idx))
6178 0 : continue;
6179 :
6180 0 : printk("Node %d"
6181 : " active_anon:%lukB"
6182 : " inactive_anon:%lukB"
6183 : " active_file:%lukB"
6184 : " inactive_file:%lukB"
6185 : " unevictable:%lukB"
6186 : " isolated(anon):%lukB"
6187 : " isolated(file):%lukB"
6188 : " mapped:%lukB"
6189 : " dirty:%lukB"
6190 : " writeback:%lukB"
6191 : " shmem:%lukB"
6192 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6193 : " shmem_thp: %lukB"
6194 : " shmem_pmdmapped: %lukB"
6195 : " anon_thp: %lukB"
6196 : #endif
6197 : " writeback_tmp:%lukB"
6198 : " kernel_stack:%lukB"
6199 : #ifdef CONFIG_SHADOW_CALL_STACK
6200 : " shadow_call_stack:%lukB"
6201 : #endif
6202 : " pagetables:%lukB"
6203 : " sec_pagetables:%lukB"
6204 : " all_unreclaimable? %s"
6205 : "\n",
6206 : pgdat->node_id,
6207 : K(node_page_state(pgdat, NR_ACTIVE_ANON)),
6208 : K(node_page_state(pgdat, NR_INACTIVE_ANON)),
6209 : K(node_page_state(pgdat, NR_ACTIVE_FILE)),
6210 : K(node_page_state(pgdat, NR_INACTIVE_FILE)),
6211 : K(node_page_state(pgdat, NR_UNEVICTABLE)),
6212 : K(node_page_state(pgdat, NR_ISOLATED_ANON)),
6213 : K(node_page_state(pgdat, NR_ISOLATED_FILE)),
6214 : K(node_page_state(pgdat, NR_FILE_MAPPED)),
6215 : K(node_page_state(pgdat, NR_FILE_DIRTY)),
6216 : K(node_page_state(pgdat, NR_WRITEBACK)),
6217 : K(node_page_state(pgdat, NR_SHMEM)),
6218 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6219 : K(node_page_state(pgdat, NR_SHMEM_THPS)),
6220 : K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
6221 : K(node_page_state(pgdat, NR_ANON_THPS)),
6222 : #endif
6223 : K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
6224 : node_page_state(pgdat, NR_KERNEL_STACK_KB),
6225 : #ifdef CONFIG_SHADOW_CALL_STACK
6226 : node_page_state(pgdat, NR_KERNEL_SCS_KB),
6227 : #endif
6228 : K(node_page_state(pgdat, NR_PAGETABLE)),
6229 : K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
6230 : pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
6231 : "yes" : "no");
6232 : }
6233 :
6234 0 : for_each_populated_zone(zone) {
6235 : int i;
6236 :
6237 0 : if (zone_idx(zone) > max_zone_idx)
6238 0 : continue;
6239 0 : if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
6240 0 : continue;
6241 :
6242 : free_pcp = 0;
6243 0 : for_each_online_cpu(cpu)
6244 0 : free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
6245 :
6246 0 : show_node(zone);
6247 0 : printk(KERN_CONT
6248 : "%s"
6249 : " free:%lukB"
6250 : " boost:%lukB"
6251 : " min:%lukB"
6252 : " low:%lukB"
6253 : " high:%lukB"
6254 : " reserved_highatomic:%luKB"
6255 : " active_anon:%lukB"
6256 : " inactive_anon:%lukB"
6257 : " active_file:%lukB"
6258 : " inactive_file:%lukB"
6259 : " unevictable:%lukB"
6260 : " writepending:%lukB"
6261 : " present:%lukB"
6262 : " managed:%lukB"
6263 : " mlocked:%lukB"
6264 : " bounce:%lukB"
6265 : " free_pcp:%lukB"
6266 : " local_pcp:%ukB"
6267 : " free_cma:%lukB"
6268 : "\n",
6269 : zone->name,
6270 : K(zone_page_state(zone, NR_FREE_PAGES)),
6271 : K(zone->watermark_boost),
6272 : K(min_wmark_pages(zone)),
6273 : K(low_wmark_pages(zone)),
6274 : K(high_wmark_pages(zone)),
6275 : K(zone->nr_reserved_highatomic),
6276 : K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
6277 : K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
6278 : K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
6279 : K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
6280 : K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
6281 : K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
6282 : K(zone->present_pages),
6283 : K(zone_managed_pages(zone)),
6284 : K(zone_page_state(zone, NR_MLOCK)),
6285 : K(zone_page_state(zone, NR_BOUNCE)),
6286 : K(free_pcp),
6287 : K(this_cpu_read(zone->per_cpu_pageset->count)),
6288 : K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
6289 0 : printk("lowmem_reserve[]:");
6290 0 : for (i = 0; i < MAX_NR_ZONES; i++)
6291 0 : printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
6292 0 : printk(KERN_CONT "\n");
6293 : }
6294 :
6295 0 : for_each_populated_zone(zone) {
6296 : unsigned int order;
6297 0 : unsigned long nr[MAX_ORDER], flags, total = 0;
6298 : unsigned char types[MAX_ORDER];
6299 :
6300 0 : if (zone_idx(zone) > max_zone_idx)
6301 0 : continue;
6302 0 : if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
6303 0 : continue;
6304 0 : show_node(zone);
6305 0 : printk(KERN_CONT "%s: ", zone->name);
6306 :
6307 0 : spin_lock_irqsave(&zone->lock, flags);
6308 0 : for (order = 0; order < MAX_ORDER; order++) {
6309 0 : struct free_area *area = &zone->free_area[order];
6310 : int type;
6311 :
6312 0 : nr[order] = area->nr_free;
6313 0 : total += nr[order] << order;
6314 :
6315 0 : types[order] = 0;
6316 0 : for (type = 0; type < MIGRATE_TYPES; type++) {
6317 0 : if (!free_area_empty(area, type))
6318 0 : types[order] |= 1 << type;
6319 : }
6320 : }
6321 0 : spin_unlock_irqrestore(&zone->lock, flags);
6322 0 : for (order = 0; order < MAX_ORDER; order++) {
6323 0 : printk(KERN_CONT "%lu*%lukB ",
6324 : nr[order], K(1UL) << order);
6325 0 : if (nr[order])
6326 0 : show_migration_types(types[order]);
6327 : }
6328 0 : printk(KERN_CONT "= %lukB\n", K(total));
6329 : }
6330 :
6331 0 : for_each_online_node(nid) {
6332 0 : if (show_mem_node_skip(filter, nid, nodemask))
6333 : continue;
6334 : hugetlb_show_meminfo_node(nid);
6335 : }
6336 :
6337 0 : printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
6338 :
6339 0 : show_swap_cache_info();
6340 0 : }
6341 :
6342 : static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
6343 : {
6344 1 : zoneref->zone = zone;
6345 1 : zoneref->zone_idx = zone_idx(zone);
6346 : }
6347 :
6348 : /*
6349 : * Builds allocation fallback zone lists.
6350 : *
6351 : * Add all populated zones of a node to the zonelist.
6352 : */
6353 : static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
6354 : {
6355 : struct zone *zone;
6356 1 : enum zone_type zone_type = MAX_NR_ZONES;
6357 1 : int nr_zones = 0;
6358 :
6359 : do {
6360 2 : zone_type--;
6361 2 : zone = pgdat->node_zones + zone_type;
6362 2 : if (populated_zone(zone)) {
6363 2 : zoneref_set_zone(zone, &zonerefs[nr_zones++]);
6364 1 : check_highest_zone(zone_type);
6365 : }
6366 2 : } while (zone_type);
6367 :
6368 : return nr_zones;
6369 : }
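/*
 * Illustrative result (hedged): for a node with populated DMA and Normal
 * zones, the loop above fills zonerefs highest zone first:
 *
 *	zonerefs[0] -> node_zones[ZONE_NORMAL]
 *	zonerefs[1] -> node_zones[ZONE_DMA]
 *	// returns 2
 */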
6370 :
6371 : #ifdef CONFIG_NUMA
6372 :
6373 : static int __parse_numa_zonelist_order(char *s)
6374 : {
6375 : /*
6376 : * We used to support different zonelist modes but they turned
6377 : * out to be just not useful. Keep the warning in place in case
6378 : * somebody still uses the cmd line parameter, so that we do
6379 : * not fail silently
6380 : */
6381 : if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
6382 : pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
6383 : return -EINVAL;
6384 : }
6385 : return 0;
6386 : }
6387 :
6388 : char numa_zonelist_order[] = "Node";
6389 :
6390 : /*
6391 : * sysctl handler for numa_zonelist_order
6392 : */
6393 : int numa_zonelist_order_handler(struct ctl_table *table, int write,
6394 : void *buffer, size_t *length, loff_t *ppos)
6395 : {
6396 : if (write)
6397 : return __parse_numa_zonelist_order(buffer);
6398 : return proc_dostring(table, write, buffer, length, ppos);
6399 : }
6400 :
6401 :
6402 : static int node_load[MAX_NUMNODES];
6403 :
6404 : /**
6405 : * find_next_best_node - find the next node that should appear in a given node's fallback list
6406 : * @node: node whose fallback list we're appending
6407 : * @used_node_mask: nodemask_t of already used nodes
6408 : *
6409 : * We use a number of factors to determine which is the next node that should
6410 : * appear on a given node's fallback list. The node should not have appeared
6411 : * already in @node's fallback list, and it should be the next closest node
6412 : * according to the distance array (which contains arbitrary distance values
6413 : * from each node to each node in the system), and should also prefer nodes
6414 : * with no CPUs, since presumably they'll have very little allocation pressure
6415 : * on them otherwise.
6416 : *
6417 : * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
6418 : */
6419 : int find_next_best_node(int node, nodemask_t *used_node_mask)
6420 : {
6421 : int n, val;
6422 : int min_val = INT_MAX;
6423 : int best_node = NUMA_NO_NODE;
6424 :
6425 : /* Use the local node if we haven't already */
6426 : if (!node_isset(node, *used_node_mask)) {
6427 : node_set(node, *used_node_mask);
6428 : return node;
6429 : }
6430 :
6431 : for_each_node_state(n, N_MEMORY) {
6432 :
6433 : /* Don't want a node to appear more than once */
6434 : if (node_isset(n, *used_node_mask))
6435 : continue;
6436 :
6437 : /* Use the distance array to find the distance */
6438 : val = node_distance(node, n);
6439 :
6440 : /* Penalize nodes under us ("prefer the next node") */
6441 : val += (n < node);
6442 :
6443 : /* Give preference to headless and unused nodes */
6444 : if (!cpumask_empty(cpumask_of_node(n)))
6445 : val += PENALTY_FOR_NODE_WITH_CPUS;
6446 :
6447 : /* Slight preference for less loaded node */
6448 : val *= MAX_NUMNODES;
6449 : val += node_load[n];
6450 :
6451 : if (val < min_val) {
6452 : min_val = val;
6453 : best_node = n;
6454 : }
6455 : }
6456 :
6457 : if (best_node >= 0)
6458 : node_set(best_node, *used_node_mask);
6459 :
6460 : return best_node;
6461 : }
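/*
 * Worked example for the scoring above (hedged, illustrative values with
 * MAX_NUMNODES = 4, node_load[] all zero and PENALTY_FOR_NODE_WITH_CPUS = 1):
 * from node 0, a CPU-less node 2 at distance 20 scores
 * (20 + 0 + 0) * 4 + 0 = 80, while node 1 with CPUs at distance 20 scores
 * (20 + 0 + 1) * 4 + 0 = 84, so the headless node is preferred.
 */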
6462 :
6463 :
6464 : /*
6465 : * Build zonelists ordered by node and zones within node.
6466 : * This results in maximum locality--normal zone overflows into local
6467 : * DMA zone, if any--but risks exhausting DMA zone.
6468 : */
6469 : static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
6470 : unsigned nr_nodes)
6471 : {
6472 : struct zoneref *zonerefs;
6473 : int i;
6474 :
6475 : zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
6476 :
6477 : for (i = 0; i < nr_nodes; i++) {
6478 : int nr_zones;
6479 :
6480 : pg_data_t *node = NODE_DATA(node_order[i]);
6481 :
6482 : nr_zones = build_zonerefs_node(node, zonerefs);
6483 : zonerefs += nr_zones;
6484 : }
6485 : zonerefs->zone = NULL;
6486 : zonerefs->zone_idx = 0;
6487 : }
6488 :
6489 : /*
6490 : * Build gfp_thisnode zonelists
6491 : */
6492 : static void build_thisnode_zonelists(pg_data_t *pgdat)
6493 : {
6494 : struct zoneref *zonerefs;
6495 : int nr_zones;
6496 :
6497 : zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
6498 : nr_zones = build_zonerefs_node(pgdat, zonerefs);
6499 : zonerefs += nr_zones;
6500 : zonerefs->zone = NULL;
6501 : zonerefs->zone_idx = 0;
6502 : }
6503 :
6504 : /*
6505 : * Build zonelists ordered by zone and nodes within zones.
6506 : * This results in conserving DMA zone[s] until all Normal memory is
6507 : * exhausted, but results in overflowing to remote node while memory
6508 : * may still exist in local DMA zone.
6509 : */
6510 :
6511 : static void build_zonelists(pg_data_t *pgdat)
6512 : {
6513 : static int node_order[MAX_NUMNODES];
6514 : int node, nr_nodes = 0;
6515 : nodemask_t used_mask = NODE_MASK_NONE;
6516 : int local_node, prev_node;
6517 :
6518 : /* NUMA-aware ordering of nodes */
6519 : local_node = pgdat->node_id;
6520 : prev_node = local_node;
6521 :
6522 : memset(node_order, 0, sizeof(node_order));
6523 : while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
6524 : /*
6525 : * We don't want to pressure a particular node.
6526 : * So add a penalty to the first node in the same
6527 : * distance group to make it round-robin.
6528 : */
6529 : if (node_distance(local_node, node) !=
6530 : node_distance(local_node, prev_node))
6531 : node_load[node] += 1;
6532 :
6533 : node_order[nr_nodes++] = node;
6534 : prev_node = node;
6535 : }
6536 :
6537 : build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
6538 : build_thisnode_zonelists(pgdat);
6539 : pr_info("Fallback order for Node %d: ", local_node);
6540 : for (node = 0; node < nr_nodes; node++)
6541 : pr_cont("%d ", node_order[node]);
6542 : pr_cont("\n");
6543 : }
6544 :
6545 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
6546 : /*
6547 : * Return node id of node used for "local" allocations.
6548 : * I.e., first node id of first zone in arg node's generic zonelist.
6549 : * Used for initializing percpu 'numa_mem', which is used primarily
6550 : * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
6551 : */
6552 : int local_memory_node(int node)
6553 : {
6554 : struct zoneref *z;
6555 :
6556 : z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
6557 : gfp_zone(GFP_KERNEL),
6558 : NULL);
6559 : return zone_to_nid(z->zone);
6560 : }
6561 : #endif
6562 :
6563 : static void setup_min_unmapped_ratio(void);
6564 : static void setup_min_slab_ratio(void);
6565 : #else /* CONFIG_NUMA */
6566 :
6567 1 : static void build_zonelists(pg_data_t *pgdat)
6568 : {
6569 : int node, local_node;
6570 : struct zoneref *zonerefs;
6571 : int nr_zones;
6572 :
6573 1 : local_node = pgdat->node_id;
6574 :
6575 1 : zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
6576 1 : nr_zones = build_zonerefs_node(pgdat, zonerefs);
6577 1 : zonerefs += nr_zones;
6578 :
6579 : /*
6580 : * Now we build the zonelist so that it contains the zones
6581 : * of all the other nodes.
6582 : * We don't want to pressure a particular node, so when
6583 : * building the zones for node N, we make sure that the
6584 : * zones coming right after the local ones are those from
6585 : * node N+1 (modulo N)
6586 : */
6587 1 : for (node = local_node + 1; node < MAX_NUMNODES; node++) {
6588 0 : if (!node_online(node))
6589 0 : continue;
6590 0 : nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
6591 0 : zonerefs += nr_zones;
6592 : }
6593 0 : for (node = 0; node < local_node; node++) {
6594 0 : if (!node_online(node))
6595 0 : continue;
6596 0 : nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
6597 0 : zonerefs += nr_zones;
6598 : }
6599 :
6600 1 : zonerefs->zone = NULL;
6601 1 : zonerefs->zone_idx = 0;
6602 1 : }
6603 :
6604 : #endif /* CONFIG_NUMA */
6605 :
6606 : /*
6607 : * Boot pageset table. One per cpu which is going to be used for all
6608 : * zones and all nodes. The parameters will be set in such a way
6609 : * that an item put on a list will immediately be handed over to
6610 : * the buddy list. This is safe since pageset manipulation is done
6611 : * with interrupts disabled.
6612 : *
6613 : * The boot_pagesets must be kept even after bootup is complete for
6614 : * unused processors and/or zones. They do play a role for bootstrapping
6615 : * hotplugged processors.
6616 : *
6617 : * zoneinfo_show() and maybe other functions do
6618 : * not check if the processor is online before following the pageset pointer.
6619 : * Other parts of the kernel may not check if the zone is available.
6620 : */
6621 : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
6622 : /* These effectively disable the pcplists in the boot pageset completely */
6623 : #define BOOT_PAGESET_HIGH 0
6624 : #define BOOT_PAGESET_BATCH 1
6625 : static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
6626 : static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
6627 : static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
6628 :
6629 1 : static void __build_all_zonelists(void *data)
6630 : {
6631 : int nid;
6632 : int __maybe_unused cpu;
6633 1 : pg_data_t *self = data;
6634 :
6635 1 : write_seqlock(&zonelist_update_seq);
6636 :
6637 : #ifdef CONFIG_NUMA
6638 : memset(node_load, 0, sizeof(node_load));
6639 : #endif
6640 :
6641 : /*
6642 : * This node is hotadded and no memory is yet present. So just
6643 : * building zonelists is fine - no need to touch other nodes.
6644 : */
6645 1 : if (self && !node_online(self->node_id)) {
6646 0 : build_zonelists(self);
6647 : } else {
6648 : /*
6649 : * All possible nodes have pgdat preallocated
6650 : * in free_area_init
6651 : */
6652 1 : for_each_node(nid) {
6653 1 : pg_data_t *pgdat = NODE_DATA(nid);
6654 :
6655 1 : build_zonelists(pgdat);
6656 : }
6657 :
6658 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
6659 : /*
6660 : * We now know the "local memory node" for each node--
6661 : * i.e., the node of the first zone in the generic zonelist.
6662 : * Set up numa_mem percpu variable for on-line cpus. During
6663 : * boot, only the boot cpu should be on-line; we'll init the
6664 : * secondary cpus' numa_mem as they come on-line. During
6665 : * node/memory hotplug, we'll fixup all on-line cpus.
6666 : */
6667 : for_each_online_cpu(cpu)
6668 : set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
6669 : #endif
6670 : }
6671 :
6672 1 : write_sequnlock(&zonelist_update_seq);
6673 1 : }
6674 :
6675 : static noinline void __init
6676 1 : build_all_zonelists_init(void)
6677 : {
6678 : int cpu;
6679 :
6680 1 : __build_all_zonelists(NULL);
6681 :
6682 : /*
6683 : * Initialize the boot_pagesets that are going to be used
6684 : * for bootstrapping processors. The real pagesets for
6685 : * each zone will be allocated later when the per cpu
6686 : * allocator is available.
6687 : *
6688 : * boot_pagesets are used also for bootstrapping offline
6689 : * cpus if the system is already booted because the pagesets
6690 : * are needed to initialize allocators on a specific cpu too.
6691 : * F.e. the percpu allocator needs the page allocator which
6692 : * needs the percpu allocator in order to allocate its pagesets
6693 : * (a chicken-egg dilemma).
6694 : */
6695 2 : for_each_possible_cpu(cpu)
6696 1 : per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
6697 :
6698 1 : mminit_verify_zonelist();
6699 : cpuset_init_current_mems_allowed();
6700 1 : }
6701 :
6702 : /*
6703 : * Called to build or rebuild the zonelists, e.g. on memory hotplug, unless system_state == SYSTEM_BOOTING, in which case the __init boot path below is used.
6704 : *
6705 : * __ref due to call of __init annotated helper build_all_zonelists_init
6706 : * [protected by SYSTEM_BOOTING].
6707 : */
6708 1 : void __ref build_all_zonelists(pg_data_t *pgdat)
6709 : {
6710 : unsigned long vm_total_pages;
6711 :
6712 1 : if (system_state == SYSTEM_BOOTING) {
6713 1 : build_all_zonelists_init();
6714 : } else {
6715 0 : __build_all_zonelists(pgdat);
6716 : /* cpuset refresh routine should be here */
6717 : }
6718 : /* Get the number of free pages beyond high watermark in all zones. */
6719 1 : vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
6720 : /*
6721 : * Disable grouping by mobility if the number of pages in the
6722 : * system is too low to allow the mechanism to work. It would be
6723 : * more accurate, but expensive to check per-zone. This check is
6724 : * made on memory-hotadd so a system can start with mobility
6725 : * disabled and enable it later
6726 : */
6727 1 : if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
6728 0 : page_group_by_mobility_disabled = 1;
6729 : else
6730 1 : page_group_by_mobility_disabled = 0;
6731 :
6732 1 : pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
6733 : nr_online_nodes,
6734 : page_group_by_mobility_disabled ? "off" : "on",
6735 : vm_total_pages);
6736 : #ifdef CONFIG_NUMA
6737 : pr_info("Policy zone: %s\n", zone_names[policy_zone]);
6738 : #endif
6739 1 : }
6740 :
6741 : /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
6742 : static bool __meminit
6743 262670 : overlap_memmap_init(unsigned long zone, unsigned long *pfn)
6744 : {
6745 : static struct memblock_region *r;
6746 :
6747 262670 : if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
6748 0 : if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
6749 0 : for_each_mem_region(r) {
6750 0 : if (*pfn < memblock_region_memory_end_pfn(r))
6751 : break;
6752 : }
6753 : }
6754 0 : if (*pfn >= memblock_region_memory_base_pfn(r) &&
6755 0 : memblock_is_mirror(r)) {
6756 0 : *pfn = memblock_region_memory_end_pfn(r);
6757 0 : return true;
6758 : }
6759 : }
6760 : return false;
6761 : }
6762 :
6763 : /*
6764 : * Initially all pages are reserved - free ones are freed
6765 : * up by memblock_free_all() once the early boot process is
6766 : * done. Non-atomic initialization, single-pass.
6767 : *
6768 : * All aligned pageblocks are initialized to the specified migratetype
6769 : * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
6770 : * zone stats (e.g., nr_isolate_pageblock) are touched.
6771 : */
6772 1 : void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
6773 : unsigned long start_pfn, unsigned long zone_end_pfn,
6774 : enum meminit_context context,
6775 : struct vmem_altmap *altmap, int migratetype)
6776 : {
6777 1 : unsigned long pfn, end_pfn = start_pfn + size;
6778 : struct page *page;
6779 :
6780 1 : if (highest_memmap_pfn < end_pfn - 1)
6781 1 : highest_memmap_pfn = end_pfn - 1;
6782 :
6783 : #ifdef CONFIG_ZONE_DEVICE
6784 : /*
6785 : * Honor reservation requested by the driver for this ZONE_DEVICE
6786 : * memory. We limit the total number of pages to initialize to just
6787 : * those that might contain the memory mapping. We will defer the
6788 : * ZONE_DEVICE page initialization until after we have released
6789 : * the hotplug lock.
6790 : */
6791 : if (zone == ZONE_DEVICE) {
6792 : if (!altmap)
6793 : return;
6794 :
6795 : if (start_pfn == altmap->base_pfn)
6796 : start_pfn += altmap->reserve;
6797 : end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6798 : }
6799 : #endif
6800 :
6801 262672 : for (pfn = start_pfn; pfn < end_pfn; ) {
6802 : /*
6803 : * There can be holes in boot-time mem_map[]s handed to this
6804 : * function. They do not exist on hotplugged memory.
6805 : */
6806 262670 : if (context == MEMINIT_EARLY) {
6807 262670 : if (overlap_memmap_init(zone, &pfn))
6808 0 : continue;
6809 : if (defer_init(nid, pfn, zone_end_pfn)) {
6810 : deferred_struct_pages = true;
6811 : break;
6812 : }
6813 : }
6814 :
6815 262670 : page = pfn_to_page(pfn);
6816 262670 : __init_single_page(page, pfn, zone, nid);
6817 262670 : if (context == MEMINIT_HOTPLUG)
6818 : __SetPageReserved(page);
6819 :
6820 : /*
6821 : * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
6822 : * such that unmovable allocations won't be scattered all
6823 : * over the place during system boot.
6824 : */
6825 262670 : if (pageblock_aligned(pfn)) {
6826 257 : set_pageblock_migratetype(page, migratetype);
6827 257 : cond_resched();
6828 : }
6829 262670 : pfn++;
6830 : }
6831 1 : }
6832 :
6833 : #ifdef CONFIG_ZONE_DEVICE
6834 : static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
6835 : unsigned long zone_idx, int nid,
6836 : struct dev_pagemap *pgmap)
6837 : {
6838 :
6839 : __init_single_page(page, pfn, zone_idx, nid);
6840 :
6841 : /*
6842 : * Mark page reserved as it will need to wait for onlining
6843 : * phase for it to be fully associated with a zone.
6844 : *
6845 : * We can use the non-atomic __set_bit operation for setting
6846 : * the flag as we are still initializing the pages.
6847 : */
6848 : __SetPageReserved(page);
6849 :
6850 : /*
6851 : * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
6852 : * and zone_device_data. It is a bug if a ZONE_DEVICE page is
6853 : * ever freed or placed on a driver-private list.
6854 : */
6855 : page->pgmap = pgmap;
6856 : page->zone_device_data = NULL;
6857 :
6858 : /*
6859 : * Mark the block movable so that blocks are reserved for
6860 : * movable at startup. This will force kernel allocations
6861 : * to reserve their blocks rather than leaking throughout
6862 : * the address space during boot when many long-lived
6863 : * kernel allocations are made.
6864 : *
6865 : * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
6866 : * because this is done early in section_activate()
6867 : */
6868 : if (pageblock_aligned(pfn)) {
6869 : set_pageblock_migratetype(page, MIGRATE_MOVABLE);
6870 : cond_resched();
6871 : }
6872 :
6873 : /*
6874 : * ZONE_DEVICE pages are released directly to the driver page allocator
6875 : * which will set the page count to 1 when allocating the page.
6876 : */
6877 : if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
6878 : pgmap->type == MEMORY_DEVICE_COHERENT)
6879 : set_page_count(page, 0);
6880 : }
6881 :
6882 : /*
6883 : * With compound page geometry and when struct pages are stored in ram most
6884 : * tail pages are reused. Consequently, the amount of unique struct pages to
6885 : * initialize is a lot smaller than the total amount of struct pages being
6886 : * mapped. This is a paired / mild layering violation with explicit knowledge
6887 : * of how the sparse_vmemmap internals handle compound pages in the absence
6888 : * of an altmap. See vmemmap_populate_compound_pages().
6889 : */
6890 : static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
6891 : unsigned long nr_pages)
6892 : {
6893 : return is_power_of_2(sizeof(struct page)) &&
6894 : !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
6895 : }
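/*
 * Worked example (illustrative, assuming 4 KiB pages and
 * sizeof(struct page) == 64): compound_nr_pages(NULL, 512) returns
 * 2 * (4096 / 64) = 128, so for a 2 MiB (512-page) device compound page
 * only the head and the first 127 tail struct pages are unique and get
 * initialised below; the remaining tails share those vmemmap pages.
 * With an altmap, all 512 struct pages are initialised.
 */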
6896 :
6897 : static void __ref memmap_init_compound(struct page *head,
6898 : unsigned long head_pfn,
6899 : unsigned long zone_idx, int nid,
6900 : struct dev_pagemap *pgmap,
6901 : unsigned long nr_pages)
6902 : {
6903 : unsigned long pfn, end_pfn = head_pfn + nr_pages;
6904 : unsigned int order = pgmap->vmemmap_shift;
6905 :
6906 : __SetPageHead(head);
6907 : for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
6908 : struct page *page = pfn_to_page(pfn);
6909 :
6910 : __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
6911 : prep_compound_tail(head, pfn - head_pfn);
6912 : set_page_count(page, 0);
6913 :
6914 : /*
6915 : * The first tail page stores important compound page info.
6916 : * Call prep_compound_head() after the first tail page has
6917 : * been initialized, to not have the data overwritten.
6918 : */
6919 : if (pfn == head_pfn + 1)
6920 : prep_compound_head(head, order);
6921 : }
6922 : }
6923 :
6924 : void __ref memmap_init_zone_device(struct zone *zone,
6925 : unsigned long start_pfn,
6926 : unsigned long nr_pages,
6927 : struct dev_pagemap *pgmap)
6928 : {
6929 : unsigned long pfn, end_pfn = start_pfn + nr_pages;
6930 : struct pglist_data *pgdat = zone->zone_pgdat;
6931 : struct vmem_altmap *altmap = pgmap_altmap(pgmap);
6932 : unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
6933 : unsigned long zone_idx = zone_idx(zone);
6934 : unsigned long start = jiffies;
6935 : int nid = pgdat->node_id;
6936 :
6937 : if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
6938 : return;
6939 :
6940 : /*
6941 : * The call to memmap_init should have already taken care
6942 : * of the pages reserved for the memmap, so we can just jump to
6943 : * the end of that region and start processing the device pages.
6944 : */
6945 : if (altmap) {
6946 : start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
6947 : nr_pages = end_pfn - start_pfn;
6948 : }
6949 :
6950 : for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
6951 : struct page *page = pfn_to_page(pfn);
6952 :
6953 : __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
6954 :
6955 : if (pfns_per_compound == 1)
6956 : continue;
6957 :
6958 : memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
6959 : compound_nr_pages(altmap, pfns_per_compound));
6960 : }
6961 :
6962 : pr_info("%s initialised %lu pages in %ums\n", __func__,
6963 : nr_pages, jiffies_to_msecs(jiffies - start));
6964 : }
6965 :
6966 : #endif
6967 1 : static void __meminit zone_init_free_lists(struct zone *zone)
6968 : {
6969 : unsigned int order, t;
6970 45 : for_each_migratetype_order(order, t) {
6971 88 : INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
6972 44 : zone->free_area[order].nr_free = 0;
6973 : }
6974 1 : }
6975 :
6976 : /*
6977 : * Only struct pages that correspond to ranges defined by memblock.memory
6978 : * are zeroed and initialized by going through __init_single_page() during
6979 : * memmap_init_zone_range().
6980 : *
6981 : * But, there could be struct pages that correspond to holes in
6982 : * memblock.memory. This can happen because of the following reasons:
6983 : * - physical memory bank size is not necessarily the exact multiple of the
6984 : * arbitrary section size
6985 : * - early reserved memory may not be listed in memblock.memory
6986 : * - memory layouts defined with memmap= kernel parameter may not align
6987 : * nicely with memmap sections
6988 : *
6989 : * Explicitly initialize those struct pages so that:
6990 : * - PG_Reserved is set
6991 : * - zone and node links point to zone and node that span the page if the
6992 : * hole is in the middle of a zone
6993 : * - zone and node links point to adjacent zone/node if the hole falls on
6994 : * the zone boundary; the pages in such holes will be prepended to the
6995 : * zone/node above the hole except for the trailing pages in the last
6996 : * section that will be appended to the zone/node below.
6997 : */
6998 1 : static void __init init_unavailable_range(unsigned long spfn,
6999 : unsigned long epfn,
7000 : int zone, int node)
7001 : {
7002 : unsigned long pfn;
7003 1 : u64 pgcnt = 0;
7004 :
7005 1 : for (pfn = spfn; pfn < epfn; pfn++) {
7006 0 : if (!pfn_valid(pageblock_start_pfn(pfn))) {
7007 0 : pfn = pageblock_end_pfn(pfn) - 1;
7008 0 : continue;
7009 : }
7010 0 : __init_single_page(pfn_to_page(pfn), pfn, zone, node);
7011 0 : __SetPageReserved(pfn_to_page(pfn));
7012 0 : pgcnt++;
7013 : }
7014 :
7015 1 : if (pgcnt)
7016 0 : pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
7017 : node, zone_names[zone], pgcnt);
7018 1 : }
7019 :
7020 1 : static void __init memmap_init_zone_range(struct zone *zone,
7021 : unsigned long start_pfn,
7022 : unsigned long end_pfn,
7023 : unsigned long *hole_pfn)
7024 : {
7025 1 : unsigned long zone_start_pfn = zone->zone_start_pfn;
7026 1 : unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
7027 1 : int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
7028 :
7029 1 : start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
7030 1 : end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
7031 :
7032 1 : if (start_pfn >= end_pfn)
7033 : return;
7034 :
7035 1 : memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
7036 : zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
7037 :
7038 1 : if (*hole_pfn < start_pfn)
7039 0 : init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
7040 :
7041 1 : *hole_pfn = end_pfn;
7042 : }
7043 :
7044 1 : static void __init memmap_init(void)
7045 : {
7046 : unsigned long start_pfn, end_pfn;
7047 1 : unsigned long hole_pfn = 0;
7048 1 : int i, j, zone_id = 0, nid;
7049 :
7050 2 : for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
7051 : struct pglist_data *node = NODE_DATA(nid);
7052 :
7053 2 : for (j = 0; j < MAX_NR_ZONES; j++) {
7054 2 : struct zone *zone = node->node_zones + j;
7055 :
7056 2 : if (!populated_zone(zone))
7057 1 : continue;
7058 :
7059 1 : memmap_init_zone_range(zone, start_pfn, end_pfn,
7060 : &hole_pfn);
7061 1 : zone_id = j;
7062 : }
7063 : }
7064 :
7065 : #ifdef CONFIG_SPARSEMEM
7066 : /*
7067 : * Initialize the memory map for hole in the range [memory_end,
7068 : * section_end].
7069 : * Append the pages in this hole to the highest zone in the last
7070 : * node.
7071 : * The call to init_unavailable_range() is outside the ifdef to
7072 : * silence the compiler warning about zone_id set but not used;
7073 : * for FLATMEM it is a nop anyway
7074 : */
7075 : end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
7076 : if (hole_pfn < end_pfn)
7077 : #endif
7078 1 : init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
7079 1 : }
7080 :
7081 1 : void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
7082 : phys_addr_t min_addr, int nid, bool exact_nid)
7083 : {
7084 : void *ptr;
7085 :
7086 1 : if (exact_nid)
7087 0 : ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
7088 : MEMBLOCK_ALLOC_ACCESSIBLE,
7089 : nid);
7090 : else
7091 1 : ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
7092 : MEMBLOCK_ALLOC_ACCESSIBLE,
7093 : nid);
7094 :
7095 : if (ptr && size > 0)
7096 : page_init_poison(ptr, size);
7097 :
7098 1 : return ptr;
7099 : }
7100 :
7101 3 : static int zone_batchsize(struct zone *zone)
7102 : {
7103 : #ifdef CONFIG_MMU
7104 : int batch;
7105 :
7106 : /*
7107 : * The number of pages to batch allocate is either ~0.1%
7108 : * of the zone or 1MB, whichever is smaller. The batch
7109 : * size is striking a balance between allocation latency
7110 : * and zone lock contention.
7111 : */
7112 3 : batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
7113 3 : batch /= 4; /* We effectively *= 4 below */
7114 3 : if (batch < 1)
7115 1 : batch = 1;
7116 :
7117 : /*
7118 : * Clamp the batch to a 2^n - 1 value. Having a power
7119 : * of 2 value was found to be more likely to have
7120 : * suboptimal cache aliasing properties in some cases.
7121 : *
7122 : * For example if 2 tasks are alternately allocating
7123 : * batches of pages, one task can end up with a lot
7124 : * of pages of one half of the possible page colors
7125 : * and the other with pages of the other colors.
7126 : */
7127 5 : batch = rounddown_pow_of_two(batch + batch/2) - 1;
7128 :
7129 3 : return batch;
7130 :
7131 : #else
7132 : /* The deferral and batching of frees should be suppressed under NOMMU
7133 : * conditions.
7134 : *
7135 : * The problem is that NOMMU needs to be able to allocate large chunks
7136 : * of contiguous memory as there's no hardware page translation to
7137 : * assemble apparent contiguous memory from discontiguous pages.
7138 : *
7139 : * Queueing large contiguous runs of pages for batching, however,
7140 : * causes the pages to actually be freed in smaller chunks. As there
7141 : * can be a significant delay between the individual batches being
7142 : * recycled, this leads to the once large chunks of space being
7143 : * fragmented and becoming unavailable for high-order allocations.
7144 : */
7145 : return 0;
7146 : #endif
7147 : }
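/*
 * Worked example (illustrative numbers): for a zone with 1 GiB of managed
 * memory and 4 KiB pages, zone_managed_pages() == 262144, so
 * batch = min(262144 >> 10, 256) = 256, then 256 / 4 = 64, and
 * rounddown_pow_of_two(64 + 32) - 1 = 64 - 1 = 63 pages per pcp batch.
 */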
7148 :
7149 3 : static int zone_highsize(struct zone *zone, int batch, int cpu_online)
7150 : {
7151 : #ifdef CONFIG_MMU
7152 : int high;
7153 : int nr_split_cpus;
7154 : unsigned long total_pages;
7155 :
7156 3 : if (!percpu_pagelist_high_fraction) {
7157 : /*
7158 : * By default, the high value of the pcp is based on the zone
7159 : * low watermark so that if they are full then background
7160 : * reclaim will not be started prematurely.
7161 : */
7162 3 : total_pages = low_wmark_pages(zone);
7163 : } else {
7164 : /*
7165 : * If percpu_pagelist_high_fraction is configured, the high
7166 : * value is based on a fraction of the managed pages in the
7167 : * zone.
7168 : */
7169 0 : total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
7170 : }
7171 :
7172 : /*
7173 : * Split the high value across all online CPUs local to the zone. Note
7174 : * that early in boot CPUs may not be online yet and that during
7175 : * CPU hotplug the cpumask is not yet updated when a CPU is being
7176 : * onlined. For memory nodes that have no CPUs, split pcp->high across
7177 : * all online CPUs to mitigate the risk that reclaim is triggered
7178 : * prematurely due to pages stored on pcp lists.
7179 : */
7180 6 : nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
7181 3 : if (!nr_split_cpus)
7182 0 : nr_split_cpus = num_online_cpus();
7183 3 : high = total_pages / nr_split_cpus;
7184 :
7185 : /*
7186 : * Ensure high is at least batch*4. The multiple is based on the
7187 : * historical relationship between high and batch.
7188 : */
7189 3 : high = max(high, batch << 2);
7190 :
7191 3 : return high;
7192 : #else
7193 : return 0;
7194 : #endif
7195 : }
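/*
 * Worked example (illustrative numbers): with
 * percpu_pagelist_high_fraction unset, a zone whose low watermark is
 * 4096 pages on a node with 4 online CPUs gets
 * high = max(4096 / 4, batch << 2) = max(1024, 252) = 1024 pages per
 * CPU pcplist (using batch == 63 from the example above).
 */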
7196 :
7197 : /*
7198 : * pcp->high and pcp->batch values are related and generally batch is lower
7199 : * than high. They are also related to pcp->count such that count is lower
7200 : * than high, and as soon as it reaches high, the pcplist is flushed.
7201 : *
7202 : * However, guaranteeing these relations at all times would require e.g. write
7203 : * barriers here but also careful usage of read barriers at the read side, and
7204 : * thus be prone to error and bad for performance. Thus the update only prevents
7205 : * store tearing. Any new users of pcp->batch and pcp->high should ensure they
7206 : * can cope with those fields changing asynchronously, and fully trust only the
7207 : * pcp->count field on the local CPU with interrupts disabled.
7208 : *
7209 : * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
7210 : * outside of boot time (or some other assurance that no concurrent updaters
7211 : * exist).
7212 : */
7213 : static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
7214 : unsigned long batch)
7215 : {
7216 3 : WRITE_ONCE(pcp->batch, batch);
7217 3 : WRITE_ONCE(pcp->high, high);
7218 : }
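/*
 * Illustrative sketch (hypothetical helper, not an existing API): a
 * reader outside the update path should snapshot the fields with
 * READ_ONCE(), pairing with the WRITE_ONCE() above, and must tolerate
 * the values changing at any time.
 */
static inline int pcp_high_snapshot(struct per_cpu_pages *pcp)
{
	/* pairs with WRITE_ONCE() in pageset_update() */
	return READ_ONCE(pcp->high);
}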
7219 :
7220 2 : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
7221 : {
7222 : int pindex;
7223 :
7224 2 : memset(pcp, 0, sizeof(*pcp));
7225 2 : memset(pzstats, 0, sizeof(*pzstats));
7226 :
7227 2 : spin_lock_init(&pcp->lock);
7228 26 : for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
7229 48 : INIT_LIST_HEAD(&pcp->lists[pindex]);
7230 :
7231 : /*
7232 : * Set batch and high values safe for a boot pageset. A true percpu
7233 : * pageset's initialization will update them subsequently. Here we don't
7234 : * need to be as careful as pageset_update() as nobody can access the
7235 : * pageset yet.
7236 : */
7237 2 : pcp->high = BOOT_PAGESET_HIGH;
7238 2 : pcp->batch = BOOT_PAGESET_BATCH;
7239 2 : pcp->free_factor = 0;
7240 2 : }
7241 :
7242 : static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
7243 : unsigned long batch)
7244 : {
7245 : struct per_cpu_pages *pcp;
7246 : int cpu;
7247 :
7248 3 : for_each_possible_cpu(cpu) {
7249 3 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
7250 3 : pageset_update(pcp, high, batch);
7251 : }
7252 : }
7253 :
7254 : /*
7255 : * Calculate and set new high and batch values for all per-cpu pagesets of a
7256 : * zone based on the zone's size.
7257 : */
7258 3 : static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
7259 : {
7260 : int new_high, new_batch;
7261 :
7262 3 : new_batch = max(1, zone_batchsize(zone));
7263 3 : new_high = zone_highsize(zone, new_batch, cpu_online);
7264 :
7265 3 : if (zone->pageset_high == new_high &&
7266 0 : zone->pageset_batch == new_batch)
7267 : return;
7268 :
7269 3 : zone->pageset_high = new_high;
7270 3 : zone->pageset_batch = new_batch;
7271 :
7272 3 : __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
7273 : }
7274 :
7275 1 : void __meminit setup_zone_pageset(struct zone *zone)
7276 : {
7277 : int cpu;
7278 :
7279 : /* Size may be 0 on !SMP && !NUMA */
7280 : if (sizeof(struct per_cpu_zonestat) > 0)
7281 : zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
7282 :
7283 1 : zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
7284 2 : for_each_possible_cpu(cpu) {
7285 : struct per_cpu_pages *pcp;
7286 : struct per_cpu_zonestat *pzstats;
7287 :
7288 1 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
7289 1 : pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
7290 1 : per_cpu_pages_init(pcp, pzstats);
7291 : }
7292 :
7293 1 : zone_set_pageset_high_and_batch(zone, 0);
7294 1 : }
7295 :
7296 : /*
7297 : * The zone indicated has a new number of managed_pages; batch sizes and percpu
7298 : * page high values need to be recalculated.
7299 : */
7300 2 : static void zone_pcp_update(struct zone *zone, int cpu_online)
7301 : {
7302 2 : mutex_lock(&pcp_batch_high_lock);
7303 2 : zone_set_pageset_high_and_batch(zone, cpu_online);
7304 2 : mutex_unlock(&pcp_batch_high_lock);
7305 2 : }
7306 :
7307 : /*
7308 : * Allocate per cpu pagesets and initialize them.
7309 : * Before this call only boot pagesets were available.
7310 : */
7311 1 : void __init setup_per_cpu_pageset(void)
7312 : {
7313 : struct pglist_data *pgdat;
7314 : struct zone *zone;
7315 : int __maybe_unused cpu;
7316 :
7317 3 : for_each_populated_zone(zone)
7318 1 : setup_zone_pageset(zone);
7319 :
7320 : #ifdef CONFIG_NUMA
7321 : /*
7322 : * Unpopulated zones continue using the boot pagesets.
7323 : * The numa stats for these pagesets need to be reset.
7324 : * Otherwise, they will end up skewing the stats of
7325 : * the nodes these zones are associated with.
7326 : */
7327 : for_each_possible_cpu(cpu) {
7328 : struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
7329 : memset(pzstats->vm_numa_event, 0,
7330 : sizeof(pzstats->vm_numa_event));
7331 : }
7332 : #endif
7333 :
7334 2 : for_each_online_pgdat(pgdat)
7335 1 : pgdat->per_cpu_nodestats =
7336 1 : alloc_percpu(struct per_cpu_nodestat);
7337 1 : }
7338 :
7339 : static __meminit void zone_pcp_init(struct zone *zone)
7340 : {
7341 : /*
7342 : * per cpu subsystem is not up at this point. The following code
7343 : * relies on the ability of the linker to provide the
7344 : * offset of a (static) per cpu variable into the per cpu area.
7345 : */
7346 2 : zone->per_cpu_pageset = &boot_pageset;
7347 2 : zone->per_cpu_zonestats = &boot_zonestats;
7348 2 : zone->pageset_high = BOOT_PAGESET_HIGH;
7349 2 : zone->pageset_batch = BOOT_PAGESET_BATCH;
7350 :
7351 2 : if (populated_zone(zone))
7352 : pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
7353 : zone->present_pages, zone_batchsize(zone));
7354 : }
7355 :
7356 1 : void __meminit init_currently_empty_zone(struct zone *zone,
7357 : unsigned long zone_start_pfn,
7358 : unsigned long size)
7359 : {
7360 1 : struct pglist_data *pgdat = zone->zone_pgdat;
7361 1 : int zone_idx = zone_idx(zone) + 1;
7362 :
7363 1 : if (zone_idx > pgdat->nr_zones)
7364 1 : pgdat->nr_zones = zone_idx;
7365 :
7366 1 : zone->zone_start_pfn = zone_start_pfn;
7367 :
7368 1 : mminit_dprintk(MMINIT_TRACE, "memmap_init",
7369 : "Initialising map node %d zone %lu pfns %lu -> %lu\n",
7370 : pgdat->node_id,
7371 : (unsigned long)zone_idx(zone),
7372 : zone_start_pfn, (zone_start_pfn + size));
7373 :
7374 1 : zone_init_free_lists(zone);
7375 1 : zone->initialized = 1;
7376 1 : }
7377 :
7378 : /**
7379 : * get_pfn_range_for_nid - Return the start and end page frames for a node
7380 : * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
7381 : * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
7382 : * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
7383 : *
7384 : * It returns the start and end page frame of a node based on information
7385 : * provided by memblock_set_node(). If called for a node
7386 : * with no available memory, a warning is printed and the start and end
7387 : * PFNs will be 0.
7388 : */
7389 1 : void __init get_pfn_range_for_nid(unsigned int nid,
7390 : unsigned long *start_pfn, unsigned long *end_pfn)
7391 : {
7392 : unsigned long this_start_pfn, this_end_pfn;
7393 : int i;
7394 :
7395 1 : *start_pfn = -1UL;
7396 1 : *end_pfn = 0;
7397 :
7398 2 : for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
7399 1 : *start_pfn = min(*start_pfn, this_start_pfn);
7400 1 : *end_pfn = max(*end_pfn, this_end_pfn);
7401 : }
7402 :
7403 1 : if (*start_pfn == -1UL)
7404 0 : *start_pfn = 0;
7405 1 : }
7406 :
7407 : /*
7408 : * This finds a zone that can be used for ZONE_MOVABLE pages. The
7409 : * assumption is made that zones within a node are ordered in monotonically
7410 : * increasing memory addresses so that the "highest" populated zone is used
7411 : */
7412 1 : static void __init find_usable_zone_for_movable(void)
7413 : {
7414 : int zone_index;
7415 2 : for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
7416 2 : if (zone_index == ZONE_MOVABLE)
7417 1 : continue;
7418 :
7419 2 : if (arch_zone_highest_possible_pfn[zone_index] >
7420 1 : arch_zone_lowest_possible_pfn[zone_index])
7421 : break;
7422 : }
7423 :
7424 : VM_BUG_ON(zone_index == -1);
7425 1 : movable_zone = zone_index;
7426 1 : }
7427 :
7428 : /*
7429 : * The zone ranges provided by the architecture do not include ZONE_MOVABLE
7430 : * because it is sized independent of architecture. Unlike the other zones,
7431 : * the starting point for ZONE_MOVABLE is not fixed. It may be different
7432 : * in each node depending on the size of each node and how evenly kernelcore
7433 : * is distributed. This helper function adjusts the zone ranges
7434 : * provided by the architecture for a given node by using the end of the
7435 : * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
7436 : * zones within a node are in order of monotonically increasing memory addresses
7437 : */
7438 4 : static void __init adjust_zone_range_for_zone_movable(int nid,
7439 : unsigned long zone_type,
7440 : unsigned long node_start_pfn,
7441 : unsigned long node_end_pfn,
7442 : unsigned long *zone_start_pfn,
7443 : unsigned long *zone_end_pfn)
7444 : {
7445 : /* Only adjust if ZONE_MOVABLE is on this node */
7446 4 : if (zone_movable_pfn[nid]) {
7447 : /* Size ZONE_MOVABLE */
7448 0 : if (zone_type == ZONE_MOVABLE) {
7449 0 : *zone_start_pfn = zone_movable_pfn[nid];
7450 0 : *zone_end_pfn = min(node_end_pfn,
7451 : arch_zone_highest_possible_pfn[movable_zone]);
7452 :
7453 : /* Adjust for ZONE_MOVABLE starting within this range */
7454 0 : } else if (!mirrored_kernelcore &&
7455 0 : *zone_start_pfn < zone_movable_pfn[nid] &&
7456 0 : *zone_end_pfn > zone_movable_pfn[nid]) {
7457 0 : *zone_end_pfn = zone_movable_pfn[nid];
7458 :
7459 : /* Check if this whole range is within ZONE_MOVABLE */
7460 0 : } else if (*zone_start_pfn >= zone_movable_pfn[nid])
7461 0 : *zone_start_pfn = *zone_end_pfn;
7462 : }
7463 4 : }
7464 :
7465 : /*
7466 : * Return the number of pages a zone spans in a node, including holes
7467 : * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
7468 : */
7469 2 : static unsigned long __init zone_spanned_pages_in_node(int nid,
7470 : unsigned long zone_type,
7471 : unsigned long node_start_pfn,
7472 : unsigned long node_end_pfn,
7473 : unsigned long *zone_start_pfn,
7474 : unsigned long *zone_end_pfn)
7475 : {
7476 2 : unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
7477 2 : unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
7478 : /* When hotadd a new node from cpu_up(), the node should be empty */
7479 2 : if (!node_start_pfn && !node_end_pfn)
7480 : return 0;
7481 :
7482 : /* Get the start and end of the zone */
7483 2 : *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
7484 2 : *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
7485 2 : adjust_zone_range_for_zone_movable(nid, zone_type,
7486 : node_start_pfn, node_end_pfn,
7487 : zone_start_pfn, zone_end_pfn);
7488 :
7489 : /* Check that this node has pages within the zone's required range */
7490 2 : if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
7491 : return 0;
7492 :
7493 : /* Move the zone boundaries inside the node if necessary */
7494 2 : *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
7495 2 : *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
7496 :
7497 : /* Return the spanned pages */
7498 2 : return *zone_end_pfn - *zone_start_pfn;
7499 : }
7500 :
7501 : /*
7502 : * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
7503 : * then all holes in the requested range will be accounted for.
7504 : */
7505 2 : unsigned long __init __absent_pages_in_range(int nid,
7506 : unsigned long range_start_pfn,
7507 : unsigned long range_end_pfn)
7508 : {
7509 2 : unsigned long nr_absent = range_end_pfn - range_start_pfn;
7510 : unsigned long start_pfn, end_pfn;
7511 : int i;
7512 :
7513 4 : for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
7514 2 : start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
7515 2 : end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
7516 2 : nr_absent -= end_pfn - start_pfn;
7517 : }
7518 2 : return nr_absent;
7519 : }
7520 :
7521 : /**
7522 : * absent_pages_in_range - Return number of page frames in holes within a range
7523 : * @start_pfn: The start PFN to start searching for holes
7524 : * @end_pfn: The end PFN to stop searching for holes
7525 : *
7526 : * Return: the number of page frames in memory holes within a range.
7527 : */
7528 0 : unsigned long __init absent_pages_in_range(unsigned long start_pfn,
7529 : unsigned long end_pfn)
7530 : {
7531 0 : return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
7532 : }
7533 :
7534 : /* Return the number of page frames in holes in a zone on a node */
7535 2 : static unsigned long __init zone_absent_pages_in_node(int nid,
7536 : unsigned long zone_type,
7537 : unsigned long node_start_pfn,
7538 : unsigned long node_end_pfn)
7539 : {
7540 2 : unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
7541 2 : unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
7542 : unsigned long zone_start_pfn, zone_end_pfn;
7543 : unsigned long nr_absent;
7544 :
7545 : /* When hotadd a new node from cpu_up(), the node should be empty */
7546 2 : if (!node_start_pfn && !node_end_pfn)
7547 : return 0;
7548 :
7549 2 : zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
7550 2 : zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
7551 :
7552 2 : adjust_zone_range_for_zone_movable(nid, zone_type,
7553 : node_start_pfn, node_end_pfn,
7554 : &zone_start_pfn, &zone_end_pfn);
7555 2 : nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
7556 :
7557 : /*
7558 : * ZONE_MOVABLE handling.
7559 : * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
7560 : * and vice versa.
7561 : */
7562 2 : if (mirrored_kernelcore && zone_movable_pfn[nid]) {
7563 : unsigned long start_pfn, end_pfn;
7564 : struct memblock_region *r;
7565 :
7566 0 : for_each_mem_region(r) {
7567 0 : start_pfn = clamp(memblock_region_memory_base_pfn(r),
7568 : zone_start_pfn, zone_end_pfn);
7569 0 : end_pfn = clamp(memblock_region_memory_end_pfn(r),
7570 : zone_start_pfn, zone_end_pfn);
7571 :
7572 0 : if (zone_type == ZONE_MOVABLE &&
7573 0 : memblock_is_mirror(r))
7574 0 : nr_absent += end_pfn - start_pfn;
7575 :
7576 0 : if (zone_type == ZONE_NORMAL &&
7577 0 : !memblock_is_mirror(r))
7578 0 : nr_absent += end_pfn - start_pfn;
7579 : }
7580 : }
7581 :
7582 : return nr_absent;
7583 : }
7584 :
7585 1 : static void __init calculate_node_totalpages(struct pglist_data *pgdat,
7586 : unsigned long node_start_pfn,
7587 : unsigned long node_end_pfn)
7588 : {
7589 1 : unsigned long realtotalpages = 0, totalpages = 0;
7590 : enum zone_type i;
7591 :
7592 3 : for (i = 0; i < MAX_NR_ZONES; i++) {
7593 2 : struct zone *zone = pgdat->node_zones + i;
7594 : unsigned long zone_start_pfn, zone_end_pfn;
7595 : unsigned long spanned, absent;
7596 : unsigned long size, real_size;
7597 :
7598 2 : spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
7599 : node_start_pfn,
7600 : node_end_pfn,
7601 : &zone_start_pfn,
7602 : &zone_end_pfn);
7603 2 : absent = zone_absent_pages_in_node(pgdat->node_id, i,
7604 : node_start_pfn,
7605 : node_end_pfn);
7606 :
7607 2 : size = spanned;
7608 2 : real_size = size - absent;
7609 :
7610 2 : if (size)
7611 1 : zone->zone_start_pfn = zone_start_pfn;
7612 : else
7613 1 : zone->zone_start_pfn = 0;
7614 2 : zone->spanned_pages = size;
7615 2 : zone->present_pages = real_size;
7616 : #if defined(CONFIG_MEMORY_HOTPLUG)
7617 : zone->present_early_pages = real_size;
7618 : #endif
7619 :
7620 2 : totalpages += size;
7621 2 : realtotalpages += real_size;
7622 : }
7623 :
7624 1 : pgdat->node_spanned_pages = totalpages;
7625 1 : pgdat->node_present_pages = realtotalpages;
7626 : pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
7627 1 : }
7628 :
7629 : #ifndef CONFIG_SPARSEMEM
7630 : /*
7631 : * Calculate the size of the zone->blockflags rounded to an unsigned long
7632 : * Start by making sure zonesize is a multiple of pageblock_order by rounding
7633 : * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
7634 : * round what is now in bits to nearest long in bits, then return it in
7635 : * bytes.
7636 : */
7637 1 : static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
7638 : {
7639 : unsigned long usemapsize;
7640 :
7641 1 : zonesize += zone_start_pfn & (pageblock_nr_pages-1);
7642 1 : usemapsize = roundup(zonesize, pageblock_nr_pages);
7643 1 : usemapsize = usemapsize >> pageblock_order;
7644 1 : usemapsize *= NR_PAGEBLOCK_BITS;
7645 1 : usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
7646 :
7647 1 : return usemapsize / 8;
7648 : }
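/*
 * Worked example (illustrative numbers): for a zone of 262144 pages
 * starting on a pageblock boundary, with pageblock_order == 9
 * (pageblock_nr_pages == 512) and NR_PAGEBLOCK_BITS == 4:
 * 262144 / 512 = 512 pageblocks, 512 * 4 = 2048 bits, already a
 * multiple of 64 bits, so 2048 / 8 = 256 bytes of pageblock flags.
 */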
7649 :
7650 1 : static void __ref setup_usemap(struct zone *zone)
7651 : {
7652 1 : unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
7653 : zone->spanned_pages);
7654 1 : zone->pageblock_flags = NULL;
7655 1 : if (usemapsize) {
7656 1 : zone->pageblock_flags =
7657 2 : memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
7658 : zone_to_nid(zone));
7659 1 : if (!zone->pageblock_flags)
7660 0 : panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
7661 : usemapsize, zone->name, zone_to_nid(zone));
7662 : }
7663 1 : }
7664 : #else
7665 : static inline void setup_usemap(struct zone *zone) {}
7666 : #endif /* CONFIG_SPARSEMEM */
7667 :
7668 : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
7669 :
7670 : /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
7671 : void __init set_pageblock_order(void)
7672 : {
7673 : unsigned int order = MAX_ORDER - 1;
7674 :
7675 : /* Check that pageblock_nr_pages has not already been setup */
7676 : if (pageblock_order)
7677 : return;
7678 :
7679 : /* Don't let pageblocks exceed the maximum allocation granularity. */
7680 : if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
7681 : order = HUGETLB_PAGE_ORDER;
7682 :
7683 : /*
7684 : * Assume the largest contiguous order of interest is a huge page.
7685 : * This value may be variable depending on boot parameters on IA64 and
7686 : * powerpc.
7687 : */
7688 : pageblock_order = order;
7689 : }
7690 : #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
7691 :
7692 : /*
7693 : * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
7694 : * is unused as pageblock_order is set at compile-time. See
7695 : * include/linux/pageblock-flags.h for the values of pageblock_order based on
7696 : * the kernel config
7697 : */
7698 0 : void __init set_pageblock_order(void)
7699 : {
7700 0 : }
7701 :
7702 : #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
7703 :
7704 : static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
7705 : unsigned long present_pages)
7706 : {
7707 2 : unsigned long pages = spanned_pages;
7708 :
7709 : /*
7710 : * Provide a more accurate estimation if there are holes within
7711 : * the zone and SPARSEMEM is in use. If there are holes within the
7712 : * zone, each populated memory region may cost us one or two extra
7713 : * memmap pages due to alignment because memmap pages for each
7714 : * populated regions may not be naturally aligned on page boundary.
7715 : * So the (present_pages >> 4) heuristic is a tradeoff for that.
7716 : */
7717 : if (spanned_pages > present_pages + (present_pages >> 4) &&
7718 : IS_ENABLED(CONFIG_SPARSEMEM))
7719 : pages = present_pages;
7720 :
7721 2 : return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
7722 : }
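/*
 * Worked example (illustrative, assuming sizeof(struct page) == 64 and
 * 4 KiB pages): a zone spanning 262144 pages with no large holes needs
 * PAGE_ALIGN(262144 * 64) >> PAGE_SHIFT = 4096 pages (16 MiB) of memmap,
 * i.e. roughly 1/64th of the zone.
 */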
7723 :
7724 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
7725 : static void pgdat_init_split_queue(struct pglist_data *pgdat)
7726 : {
7727 : struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
7728 :
7729 : spin_lock_init(&ds_queue->split_queue_lock);
7730 : INIT_LIST_HEAD(&ds_queue->split_queue);
7731 : ds_queue->split_queue_len = 0;
7732 : }
7733 : #else
7734 : static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
7735 : #endif
7736 :
7737 : #ifdef CONFIG_COMPACTION
7738 : static void pgdat_init_kcompactd(struct pglist_data *pgdat)
7739 : {
7740 1 : init_waitqueue_head(&pgdat->kcompactd_wait);
7741 : }
7742 : #else
7743 : static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
7744 : #endif
7745 :
7746 1 : static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
7747 : {
7748 : int i;
7749 :
7750 1 : pgdat_resize_init(pgdat);
7751 1 : pgdat_kswapd_lock_init(pgdat);
7752 :
7753 1 : pgdat_init_split_queue(pgdat);
7754 1 : pgdat_init_kcompactd(pgdat);
7755 :
7756 1 : init_waitqueue_head(&pgdat->kswapd_wait);
7757 1 : init_waitqueue_head(&pgdat->pfmemalloc_wait);
7758 :
7759 5 : for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
7760 4 : init_waitqueue_head(&pgdat->reclaim_wait[i]);
7761 :
7762 1 : pgdat_page_ext_init(pgdat);
7763 1 : lruvec_init(&pgdat->__lruvec);
7764 1 : }
7765 :
7766 2 : static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
7767 : unsigned long remaining_pages)
7768 : {
7769 4 : atomic_long_set(&zone->managed_pages, remaining_pages);
7770 2 : zone_set_nid(zone, nid);
7771 2 : zone->name = zone_names[idx];
7772 2 : zone->zone_pgdat = NODE_DATA(nid);
7773 2 : spin_lock_init(&zone->lock);
7774 2 : zone_seqlock_init(zone);
7775 2 : zone_pcp_init(zone);
7776 2 : }
7777 :
7778 : /*
7779 : * Set up the zone data structures
7780 : * - init pgdat internals
7781 : * - init all zones belonging to this node
7782 : *
7783 : * NOTE: this function is only called during memory hotplug
7784 : */
7785 : #ifdef CONFIG_MEMORY_HOTPLUG
7786 : void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
7787 : {
7788 : int nid = pgdat->node_id;
7789 : enum zone_type z;
7790 : int cpu;
7791 :
7792 : pgdat_init_internals(pgdat);
7793 :
7794 : if (pgdat->per_cpu_nodestats == &boot_nodestats)
7795 : pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
7796 :
7797 : /*
7798 : * Reset the nr_zones, order and highest_zoneidx before reuse.
7799 : * Note that kswapd will init kswapd_highest_zoneidx properly
7800 : * when it starts in the near future.
7801 : */
7802 : pgdat->nr_zones = 0;
7803 : pgdat->kswapd_order = 0;
7804 : pgdat->kswapd_highest_zoneidx = 0;
7805 : pgdat->node_start_pfn = 0;
7806 : for_each_online_cpu(cpu) {
7807 : struct per_cpu_nodestat *p;
7808 :
7809 : p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
7810 : memset(p, 0, sizeof(*p));
7811 : }
7812 :
7813 : for (z = 0; z < MAX_NR_ZONES; z++)
7814 : zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
7815 : }
7816 : #endif
7817 :
7818 : /*
7819 : * Set up the zone data structures:
7820 : * - mark all pages reserved
7821 : * - mark all memory queues empty
7822 : * - clear the memory bitmaps
7823 : *
7824 : * NOTE: pgdat should get zeroed by caller.
7825 : * NOTE: this function is only called during early init.
7826 : */
7827 1 : static void __init free_area_init_core(struct pglist_data *pgdat)
7828 : {
7829 : enum zone_type j;
7830 1 : int nid = pgdat->node_id;
7831 :
7832 1 : pgdat_init_internals(pgdat);
7833 1 : pgdat->per_cpu_nodestats = &boot_nodestats;
7834 :
7835 3 : for (j = 0; j < MAX_NR_ZONES; j++) {
7836 2 : struct zone *zone = pgdat->node_zones + j;
7837 : unsigned long size, freesize, memmap_pages;
7838 :
7839 2 : size = zone->spanned_pages;
7840 2 : freesize = zone->present_pages;
7841 :
7842 : /*
7843 : * Adjust freesize so that it accounts for how much memory
7844 : * is used by this zone for memmap. This affects the watermark
7845 : * and per-cpu initialisations
7846 : */
7847 4 : memmap_pages = calc_memmap_size(size, freesize);
7848 2 : if (!is_highmem_idx(j)) {
7849 2 : if (freesize >= memmap_pages) {
7850 2 : freesize -= memmap_pages;
7851 : if (memmap_pages)
7852 : pr_debug(" %s zone: %lu pages used for memmap\n",
7853 : zone_names[j], memmap_pages);
7854 : } else
7855 0 : pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
7856 : zone_names[j], memmap_pages, freesize);
7857 : }
7858 :
7859 : /* Account for reserved pages */
7860 2 : if (j == 0 && freesize > dma_reserve) {
7861 1 : freesize -= dma_reserve;
7862 : pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
7863 : }
7864 :
7865 2 : if (!is_highmem_idx(j))
7866 2 : nr_kernel_pages += freesize;
7867 : /* Charge for highmem memmap if there are enough kernel pages */
7868 : else if (nr_kernel_pages > memmap_pages * 2)
7869 : nr_kernel_pages -= memmap_pages;
7870 2 : nr_all_pages += freesize;
7871 :
7872 : /*
7873 : * Set an approximate value for lowmem here, it will be adjusted
7874 : * when the bootmem allocator frees pages into the buddy system.
7875 : * And all highmem pages will be managed by the buddy system.
7876 : */
7877 2 : zone_init_internals(zone, j, nid, freesize);
7878 :
7879 2 : if (!size)
7880 1 : continue;
7881 :
7882 : set_pageblock_order();
7883 1 : setup_usemap(zone);
7884 1 : init_currently_empty_zone(zone, zone->zone_start_pfn, size);
7885 : }
7886 1 : }
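/*
 * Worked example (illustrative numbers): a non-highmem zone with
 * spanned_pages == present_pages == 262144 and 4 KiB pages has
 * calc_memmap_size() == 4096, so freesize becomes 262144 - 4096 = 258048
 * pages (less dma_reserve for zone 0); that value is added to
 * nr_kernel_pages and handed to zone_init_internals() as the zone's
 * initial managed_pages until memblock frees pages into the buddy
 * allocator.
 */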
7887 :
7888 : #ifdef CONFIG_FLATMEM
7889 1 : static void __init alloc_node_mem_map(struct pglist_data *pgdat)
7890 : {
7891 1 : unsigned long __maybe_unused start = 0;
7892 1 : unsigned long __maybe_unused offset = 0;
7893 :
7894 : /* Skip empty nodes */
7895 1 : if (!pgdat->node_spanned_pages)
7896 : return;
7897 :
7898 1 : start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
7899 1 : offset = pgdat->node_start_pfn - start;
7900 : /* ia64 gets its own node_mem_map, before this, without bootmem */
7901 1 : if (!pgdat->node_mem_map) {
7902 : unsigned long size, end;
7903 : struct page *map;
7904 :
7905 : /*
7906 : * The zone's endpoints aren't required to be MAX_ORDER
7907 : * aligned, but the node_mem_map endpoints must be, in order
7908 : * for the buddy allocator to function correctly.
7909 : */
7910 2 : end = pgdat_end_pfn(pgdat);
7911 1 : end = ALIGN(end, MAX_ORDER_NR_PAGES);
7912 1 : size = (end - start) * sizeof(struct page);
7913 1 : map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
7914 : pgdat->node_id, false);
7915 1 : if (!map)
7916 0 : panic("Failed to allocate %ld bytes for node %d memory map\n",
7917 : size, pgdat->node_id);
7918 1 : pgdat->node_mem_map = map + offset;
7919 : }
7920 : pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
7921 : __func__, pgdat->node_id, (unsigned long)pgdat,
7922 : (unsigned long)pgdat->node_mem_map);
7923 : #ifndef CONFIG_NUMA
7924 : /*
7925 : * With no DISCONTIG, the global mem_map is just set as node 0's
7926 : */
7927 1 : if (pgdat == NODE_DATA(0)) {
7928 1 : mem_map = NODE_DATA(0)->node_mem_map;
7929 1 : if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
7930 0 : mem_map -= offset;
7931 : }
7932 : #endif
7933 : }
7934 : #else
7935 : static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
7936 : #endif /* CONFIG_FLATMEM */
7937 :
7938 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
7939 : static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
7940 : {
7941 : pgdat->first_deferred_pfn = ULONG_MAX;
7942 : }
7943 : #else
7944 : static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
7945 : #endif
7946 :
7947 1 : static void __init free_area_init_node(int nid)
7948 : {
7949 1 : pg_data_t *pgdat = NODE_DATA(nid);
7950 1 : unsigned long start_pfn = 0;
7951 1 : unsigned long end_pfn = 0;
7952 :
7953 : /* pg_data_t should be reset to zero when it's allocated */
7954 1 : WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
7955 :
7956 1 : get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
7957 :
7958 1 : pgdat->node_id = nid;
7959 1 : pgdat->node_start_pfn = start_pfn;
7960 1 : pgdat->per_cpu_nodestats = NULL;
7961 :
7962 1 : if (start_pfn != end_pfn) {
7963 1 : pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
7964 : (u64)start_pfn << PAGE_SHIFT,
7965 : end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
7966 : } else {
7967 0 : pr_info("Initmem setup node %d as memoryless\n", nid);
7968 : }
7969 :
7970 1 : calculate_node_totalpages(pgdat, start_pfn, end_pfn);
7971 :
7972 1 : alloc_node_mem_map(pgdat);
7973 : pgdat_set_deferred_range(pgdat);
7974 :
7975 1 : free_area_init_core(pgdat);
7976 : lru_gen_init_pgdat(pgdat);
7977 1 : }
7978 :
7979 : static void __init free_area_init_memoryless_node(int nid)
7980 : {
7981 : free_area_init_node(nid);
7982 : }
7983 :
7984 : #if MAX_NUMNODES > 1
7985 : /*
7986 : * Figure out the number of possible node ids.
7987 : */
7988 : void __init setup_nr_node_ids(void)
7989 : {
7990 : unsigned int highest;
7991 :
7992 : highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
7993 : nr_node_ids = highest + 1;
7994 : }
7995 : #endif
7996 :
7997 : /**
7998 : * node_map_pfn_alignment - determine the maximum internode alignment
7999 : *
8000 : * This function should be called after node map is populated and sorted.
8001 : * It calculates the maximum power of two alignment which can distinguish
8002 : * all the nodes.
8003 : *
8004 : * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
8005 : * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
8006 : * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
8007 : * shifted, 1GiB is enough and this function will indicate so.
8008 : *
8009 : * This is used to test whether pfn -> nid mapping of the chosen memory
8010 : * model has fine enough granularity to avoid incorrect mapping for the
8011 : * populated node map.
8012 : *
8013 : * Return: the determined alignment in pfn's. 0 if there is no alignment
8014 : * requirement (single node).
8015 : */
8016 0 : unsigned long __init node_map_pfn_alignment(void)
8017 : {
8018 0 : unsigned long accl_mask = 0, last_end = 0;
8019 : unsigned long start, end, mask;
8020 0 : int last_nid = NUMA_NO_NODE;
8021 : int i, nid;
8022 :
8023 0 : for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
8024 0 : if (!start || last_nid < 0 || last_nid == nid) {
8025 0 : last_nid = nid;
8026 0 : last_end = end;
8027 0 : continue;
8028 : }
8029 :
8030 : /*
8031 : * Start with a mask granular enough to pin-point to the
8032 : * start pfn and tick off bits one-by-one until it becomes
8033 : * too coarse to separate the current node from the last.
8034 : */
8035 0 : mask = ~((1 << __ffs(start)) - 1);
8036 0 : while (mask && last_end <= (start & (mask << 1)))
8037 : mask <<= 1;
8038 :
8039 : /* accumulate all internode masks */
8040 0 : accl_mask |= mask;
8041 : }
8042 :
8043 : /* convert mask to number of pages */
8044 0 : return ~accl_mask + 1;
8045 : }
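/*
 * Worked illustration: with 4 KiB pages a return value of
 * 1 << (30 - PAGE_SHIFT) == 262144 pfns corresponds to 1 GiB alignment
 * and 1 << (28 - PAGE_SHIFT) == 65536 pfns to 256 MiB, matching the
 * examples in the kernel-doc comment above.
 */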
8046 :
8047 : /*
8048 : * early_calculate_totalpages()
8049 : * Sum pages in active regions for movable zone.
8050 : * Populate N_MEMORY for calculating usable_nodes.
8051 : */
8052 1 : static unsigned long __init early_calculate_totalpages(void)
8053 : {
8054 1 : unsigned long totalpages = 0;
8055 : unsigned long start_pfn, end_pfn;
8056 : int i, nid;
8057 :
8058 2 : for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
8059 1 : unsigned long pages = end_pfn - start_pfn;
8060 :
8061 1 : totalpages += pages;
8062 : if (pages)
8063 : node_set_state(nid, N_MEMORY);
8064 : }
8065 1 : return totalpages;
8066 : }
8067 :
8068 : /*
8069 : * Find the PFN the Movable zone begins in each node. Kernel memory
8070 : * is spread evenly between nodes as long as the nodes have enough
8071 : * memory. When they don't, some nodes will have more kernelcore than
8072 : * others
8073 : */
8074 1 : static void __init find_zone_movable_pfns_for_nodes(void)
8075 : {
8076 : int i, nid;
8077 : unsigned long usable_startpfn;
8078 : unsigned long kernelcore_node, kernelcore_remaining;
8079 : /* save the state before borrow the nodemask */
8080 1 : nodemask_t saved_node_state = node_states[N_MEMORY];
8081 1 : unsigned long totalpages = early_calculate_totalpages();
8082 1 : int usable_nodes = nodes_weight(node_states[N_MEMORY]);
8083 : struct memblock_region *r;
8084 :
8085 : /* Need to find movable_zone earlier when movable_node is specified. */
8086 1 : find_usable_zone_for_movable();
8087 :
8088 : /*
8089 : * If movable_node is specified, ignore kernelcore and movablecore
8090 : * options.
8091 : */
8092 : if (movable_node_is_enabled()) {
8093 : for_each_mem_region(r) {
8094 : if (!memblock_is_hotpluggable(r))
8095 : continue;
8096 :
8097 : nid = memblock_get_region_node(r);
8098 :
8099 : usable_startpfn = PFN_DOWN(r->base);
8100 : zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
8101 : min(usable_startpfn, zone_movable_pfn[nid]) :
8102 : usable_startpfn;
8103 : }
8104 :
8105 : goto out2;
8106 : }
8107 :
8108 : /*
8109 : * If kernelcore=mirror is specified, ignore movablecore option
8110 : */
8111 1 : if (mirrored_kernelcore) {
8112 0 : bool mem_below_4gb_not_mirrored = false;
8113 :
8114 0 : for_each_mem_region(r) {
8115 0 : if (memblock_is_mirror(r))
8116 0 : continue;
8117 :
8118 0 : nid = memblock_get_region_node(r);
8119 :
8120 0 : usable_startpfn = memblock_region_memory_base_pfn(r);
8121 :
8122 0 : if (usable_startpfn < PHYS_PFN(SZ_4G)) {
8123 0 : mem_below_4gb_not_mirrored = true;
8124 0 : continue;
8125 : }
8126 :
8127 0 : zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
8128 0 : min(usable_startpfn, zone_movable_pfn[nid]) :
8129 : usable_startpfn;
8130 : }
8131 :
8132 0 : if (mem_below_4gb_not_mirrored)
8133 0 : pr_warn("This configuration results in unmirrored kernel memory.\n");
8134 :
8135 : goto out2;
8136 : }
8137 :
8138 : /*
8139 : * If kernelcore=nn% or movablecore=nn% was specified, calculate the
8140 : * amount of necessary memory.
8141 : */
8142 1 : if (required_kernelcore_percent)
8143 0 : required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
8144 : 10000UL;
8145 1 : if (required_movablecore_percent)
8146 0 : required_movablecore = (totalpages * 100 * required_movablecore_percent) /
8147 : 10000UL;
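	/*
	 * Worked example (illustrative numbers): with totalpages == 4194304
	 * (16 GiB of 4 KiB pages) and kernelcore=25%,
	 * required_kernelcore = (4194304 * 100 * 25) / 10000
	 *                     = 1048576 pages, i.e. 4 GiB.
	 */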
8148 :
8149 : /*
8150 : * If movablecore= was specified, calculate what size of
8151 : * kernelcore that corresponds so that memory usable for
8152 : * any allocation type is evenly spread. If both kernelcore
8153 : * and movablecore are specified, then the value of kernelcore
8154 : * will be used for required_kernelcore if it's greater than
8155 : * what movablecore would have allowed.
8156 : */
8157 1 : if (required_movablecore) {
8158 : unsigned long corepages;
8159 :
8160 : /*
8161 : * Round-up so that ZONE_MOVABLE is at least as large as what
8162 : * was requested by the user
8163 : */
8164 : required_movablecore =
8165 0 : roundup(required_movablecore, MAX_ORDER_NR_PAGES);
8166 0 : required_movablecore = min(totalpages, required_movablecore);
8167 0 : corepages = totalpages - required_movablecore;
8168 :
8169 0 : required_kernelcore = max(required_kernelcore, corepages);
8170 : }
8171 :
8172 : /*
8173 : * If kernelcore was not specified or kernelcore size is larger
8174 : * than totalpages, there is no ZONE_MOVABLE.
8175 : */
8176 1 : if (!required_kernelcore || required_kernelcore >= totalpages)
8177 : goto out;
8178 :
8179 : /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
8180 0 : usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
8181 :
8182 : restart:
8183 : /* Spread kernelcore memory as evenly as possible throughout nodes */
8184 0 : kernelcore_node = required_kernelcore / usable_nodes;
8185 0 : for_each_node_state(nid, N_MEMORY) {
8186 : unsigned long start_pfn, end_pfn;
8187 :
8188 : /*
8189 : * Recalculate kernelcore_node if the division per node
8190 : * now exceeds what is necessary to satisfy the requested
8191 : * amount of memory for the kernel
8192 : */
8193 0 : if (required_kernelcore < kernelcore_node)
8194 0 : kernelcore_node = required_kernelcore / usable_nodes;
8195 :
8196 : /*
8197 : * As the map is walked, we track how much memory is usable
8198 : * by the kernel using kernelcore_remaining. When it is
8199 : * 0, the rest of the node is usable by ZONE_MOVABLE
8200 : */
8201 0 : kernelcore_remaining = kernelcore_node;
8202 :
8203 : /* Go through each range of PFNs within this node */
8204 0 : for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
8205 : unsigned long size_pages;
8206 :
8207 0 : start_pfn = max(start_pfn, zone_movable_pfn[nid]);
8208 0 : if (start_pfn >= end_pfn)
8209 0 : continue;
8210 :
8211 : /* Account for what is only usable for kernelcore */
8212 0 : if (start_pfn < usable_startpfn) {
8213 : unsigned long kernel_pages;
8214 0 : kernel_pages = min(end_pfn, usable_startpfn)
8215 : - start_pfn;
8216 :
8217 0 : kernelcore_remaining -= min(kernel_pages,
8218 : kernelcore_remaining);
8219 0 : required_kernelcore -= min(kernel_pages,
8220 : required_kernelcore);
8221 :
8222 : /* Continue if range is now fully accounted */
8223 0 : if (end_pfn <= usable_startpfn) {
8224 :
8225 : /*
8226 : * Push zone_movable_pfn to the end so
8227 : * that if we have to rebalance
8228 : * kernelcore across nodes, we will
8229 : * not double account here
8230 : */
8231 0 : zone_movable_pfn[nid] = end_pfn;
8232 0 : continue;
8233 : }
8234 0 : start_pfn = usable_startpfn;
8235 : }
8236 :
8237 : /*
8238 : * The usable PFN range for ZONE_MOVABLE is from
8239 : * start_pfn->end_pfn. Calculate size_pages as the
8240 : * number of pages used as kernelcore
8241 : */
8242 0 : size_pages = end_pfn - start_pfn;
8243 0 : if (size_pages > kernelcore_remaining)
8244 0 : size_pages = kernelcore_remaining;
8245 0 : zone_movable_pfn[nid] = start_pfn + size_pages;
8246 :
8247 : /*
8248 : * Some kernelcore has been met, update counts and
8249 : * break if the kernelcore for this node has been
8250 : * satisfied
8251 : */
8252 0 : required_kernelcore -= min(required_kernelcore,
8253 : size_pages);
8254 0 : kernelcore_remaining -= size_pages;
8255 0 : if (!kernelcore_remaining)
8256 : break;
8257 : }
8258 : }
8259 :
8260 : /*
8261 : * If there is still required_kernelcore, we do another pass with one
8262 : * less node in the count. This will push zone_movable_pfn[nid] further
8263 : * along on the nodes that still have memory until kernelcore is
8264 : * satisfied
8265 : */
8266 0 : usable_nodes--;
8267 0 : if (usable_nodes && required_kernelcore > usable_nodes)
8268 : goto restart;
8269 :
8270 : out2:
8271 : /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
8272 0 : for (nid = 0; nid < MAX_NUMNODES; nid++) {
8273 : unsigned long start_pfn, end_pfn;
8274 :
8275 0 : zone_movable_pfn[nid] =
8276 0 : roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
8277 :
8278 0 : get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
8279 0 : if (zone_movable_pfn[nid] >= end_pfn)
8280 0 : zone_movable_pfn[nid] = 0;
8281 : }
8282 :
8283 : out:
8284 : /* restore the node_state */
8285 1 : node_states[N_MEMORY] = saved_node_state;
8286 1 : }
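
/*
 * A rough illustration of the spreading above (numbers are made up):
 * with kernelcore=2G on a machine that has two nodes of 4 GiB each,
 * each node keeps about 1 GiB for the kernel zones and the remaining
 * ~3 GiB per node becomes ZONE_MOVABLE, with zone_movable_pfn[nid]
 * marking where the movable part of that node starts.
 */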
8287 :
8288 : /* Any regular or high memory on that node? */
8289 : static void check_for_memory(pg_data_t *pgdat, int nid)
8290 : {
8291 : enum zone_type zone_type;
8292 :
8293 0 : for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
8294 1 : struct zone *zone = &pgdat->node_zones[zone_type];
8295 1 : if (populated_zone(zone)) {
8296 : if (IS_ENABLED(CONFIG_HIGHMEM))
8297 : node_set_state(nid, N_HIGH_MEMORY);
8298 : if (zone_type <= ZONE_NORMAL)
8299 : node_set_state(nid, N_NORMAL_MEMORY);
8300 : break;
8301 : }
8302 : }
8303 : }
8304 :
8305 : /*
8306 : * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
8307 : * such cases we allow max_zone_pfn to be sorted in descending order.
8308 : */
8309 1 : bool __weak arch_has_descending_max_zone_pfns(void)
8310 : {
8311 1 : return false;
8312 : }
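
/*
 * An architecture whose ZONE_HIGHMEM really does sit below ZONE_NORMAL
 * overrides the weak stub above with its own definition; a sketch of
 * such an override (not taken from any particular arch) is simply:
 *
 *	bool arch_has_descending_max_zone_pfns(void)
 *	{
 *		return true;
 *	}
 */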
8313 :
8314 : /**
8315 : * free_area_init - Initialise all pg_data_t and zone data
8316 : * @max_zone_pfn: an array of max PFNs for each zone
8317 : *
8318 : * This will call free_area_init_node() for each active node in the system.
8319 : * Using the page ranges provided by memblock_set_node(), the size of each
8320 : * zone in each node and their holes is calculated. If the maximum PFN
8321 : * zone in each node and their holes is calculated. If the maximum PFNs
8322 : * of two adjacent zones match, it is assumed that the zone is empty.
8323 : * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
8324 : * starts where the previous one ended. For example, ZONE_DMA32 starts
8325 : * at arch_max_dma_pfn.
8326 : */
8327 1 : void __init free_area_init(unsigned long *max_zone_pfn)
8328 : {
8329 : unsigned long start_pfn, end_pfn;
8330 : int i, nid, zone;
8331 : bool descending;
8332 :
8333 : /* Record where the zone boundaries are */
8334 1 : memset(arch_zone_lowest_possible_pfn, 0,
8335 : sizeof(arch_zone_lowest_possible_pfn));
8336 1 : memset(arch_zone_highest_possible_pfn, 0,
8337 : sizeof(arch_zone_highest_possible_pfn));
8338 :
8339 1 : start_pfn = PHYS_PFN(memblock_start_of_DRAM());
8340 1 : descending = arch_has_descending_max_zone_pfns();
8341 :
8342 3 : for (i = 0; i < MAX_NR_ZONES; i++) {
8343 2 : if (descending)
8344 0 : zone = MAX_NR_ZONES - i - 1;
8345 : else
8346 : zone = i;
8347 :
8348 2 : if (zone == ZONE_MOVABLE)
8349 1 : continue;
8350 :
8351 1 : end_pfn = max(max_zone_pfn[zone], start_pfn);
8352 1 : arch_zone_lowest_possible_pfn[zone] = start_pfn;
8353 1 : arch_zone_highest_possible_pfn[zone] = end_pfn;
8354 :
8355 1 : start_pfn = end_pfn;
8356 : }
8357 :
8358 : /* Find the PFNs that ZONE_MOVABLE begins at in each node */
8359 1 : memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
8360 1 : find_zone_movable_pfns_for_nodes();
8361 :
8362 : /* Print out the zone ranges */
8363 1 : pr_info("Zone ranges:\n");
8364 3 : for (i = 0; i < MAX_NR_ZONES; i++) {
8365 2 : if (i == ZONE_MOVABLE)
8366 1 : continue;
8367 1 : pr_info(" %-8s ", zone_names[i]);
8368 2 : if (arch_zone_lowest_possible_pfn[i] ==
8369 1 : arch_zone_highest_possible_pfn[i])
8370 0 : pr_cont("empty\n");
8371 : else
8372 1 : pr_cont("[mem %#018Lx-%#018Lx]\n",
8373 : (u64)arch_zone_lowest_possible_pfn[i]
8374 : << PAGE_SHIFT,
8375 : ((u64)arch_zone_highest_possible_pfn[i]
8376 : << PAGE_SHIFT) - 1);
8377 : }
8378 :
8379 : /* Print out the PFNs ZONE_MOVABLE begins at in each node */
8380 1 : pr_info("Movable zone start for each node\n");
8381 2 : for (i = 0; i < MAX_NUMNODES; i++) {
8382 1 : if (zone_movable_pfn[i])
8383 0 : pr_info(" Node %d: %#018Lx\n", i,
8384 : (u64)zone_movable_pfn[i] << PAGE_SHIFT);
8385 : }
8386 :
8387 : /*
8388 : * Print out the early node map, and initialize the
8389 : * subsection-map relative to active online memory ranges to
8390 : * enable future "sub-section" extensions of the memory map.
8391 : */
8392 1 : pr_info("Early memory node ranges\n");
8393 2 : for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
8394 1 : pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
8395 : (u64)start_pfn << PAGE_SHIFT,
8396 : ((u64)end_pfn << PAGE_SHIFT) - 1);
8397 : subsection_map_init(start_pfn, end_pfn - start_pfn);
8398 : }
8399 :
8400 : /* Initialise every node */
8401 1 : mminit_verify_pageflags_layout();
8402 : setup_nr_node_ids();
8403 2 : for_each_node(nid) {
8404 : pg_data_t *pgdat;
8405 :
8406 1 : if (!node_online(nid)) {
8407 : pr_info("Initializing node %d as memoryless\n", nid);
8408 :
8409 : /* Allocator not initialized yet */
8410 : pgdat = arch_alloc_nodedata(nid);
8411 : if (!pgdat)
8412 : panic("Cannot allocate %zuB for node %d.\n",
8413 : sizeof(*pgdat), nid);
8414 : arch_refresh_nodedata(nid, pgdat);
8415 : free_area_init_memoryless_node(nid);
8416 :
8417 : /*
8418 : * We do not want to confuse userspace with sysfs
8419 : * files/directories for a node without any memory
8420 : * attached to it, so this node is not marked as
8421 : * N_MEMORY and not marked online so that no sysfs
8422 : * hierarchy will be created via register_one_node for
8423 : * it. The pgdat will get fully initialized by
8424 : * hotadd_init_pgdat() when memory is hotplugged into
8425 : * this node.
8426 : */
8427 : continue;
8428 : }
8429 :
8430 1 : pgdat = NODE_DATA(nid);
8431 1 : free_area_init_node(nid);
8432 :
8433 : /* Any memory on that node */
8434 : if (pgdat->node_present_pages)
8435 : node_set_state(nid, N_MEMORY);
8436 2 : check_for_memory(pgdat, nid);
8437 : }
8438 :
8439 1 : memmap_init();
8440 1 : }
8441 :
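/*
 * A minimal sketch of how an architecture's early init code might fill
 * max_zone_pfn and hand it to free_area_init(). The zone split below is
 * an assumption for illustration, not any particular arch's layout.
 */
static void __init example_zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

#ifdef CONFIG_ZONE_DMA32
	/* everything below 4 GiB (pfn 0x100000 with 4 KiB pages) */
	max_zone_pfns[ZONE_DMA32] = min(0x100000UL, max_pfn);
#endif
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	free_area_init(max_zone_pfns);
}
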
8442 0 : static int __init cmdline_parse_core(char *p, unsigned long *core,
8443 : unsigned long *percent)
8444 : {
8445 : unsigned long long coremem;
8446 : char *endptr;
8447 :
8448 0 : if (!p)
8449 : return -EINVAL;
8450 :
8451 : /* Value may be a percentage of total memory, otherwise bytes */
8452 0 : coremem = simple_strtoull(p, &endptr, 0);
8453 0 : if (*endptr == '%') {
8454 : /* Paranoid check for percent values greater than 100 */
8455 0 : WARN_ON(coremem > 100);
8456 :
8457 0 : *percent = coremem;
8458 : } else {
8459 0 : coremem = memparse(p, &p);
8460 : /* Paranoid check that UL is enough for the coremem value */
8461 0 : WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
8462 :
8463 0 : *core = coremem >> PAGE_SHIFT;
8464 0 : *percent = 0UL;
8465 : }
8466 : return 0;
8467 : }
8468 :
8469 : /*
8470 : * kernelcore=size sets the amount of memory for use for allocations that
8471 : * cannot be reclaimed or migrated.
8472 : */
8473 0 : static int __init cmdline_parse_kernelcore(char *p)
8474 : {
8475 : /* parse kernelcore=mirror */
8476 0 : if (parse_option_str(p, "mirror")) {
8477 0 : mirrored_kernelcore = true;
8478 0 : return 0;
8479 : }
8480 :
8481 0 : return cmdline_parse_core(p, &required_kernelcore,
8482 : &required_kernelcore_percent);
8483 : }
8484 :
8485 : /*
8486 : * movablecore=size sets the amount of memory for use for allocations that
8487 : * can be reclaimed or migrated.
8488 : */
8489 0 : static int __init cmdline_parse_movablecore(char *p)
8490 : {
8491 0 : return cmdline_parse_core(p, &required_movablecore,
8492 : &required_movablecore_percent);
8493 : }
8494 :
8495 : early_param("kernelcore", cmdline_parse_kernelcore);
8496 : early_param("movablecore", cmdline_parse_movablecore);
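
/*
 * Example command lines (illustrative): "kernelcore=512M" reserves
 * 512 MiB for non-movable allocations, "kernelcore=30%" reserves 30%
 * of total memory, "kernelcore=mirror" keeps the kernel on mirrored
 * memory only, and "movablecore=1G" makes ZONE_MOVABLE at least 1 GiB.
 */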
8497 :
8498 0 : void adjust_managed_page_count(struct page *page, long count)
8499 : {
8500 0 : atomic_long_add(count, &page_zone(page)->managed_pages);
8501 0 : totalram_pages_add(count);
8502 : #ifdef CONFIG_HIGHMEM
8503 : if (PageHighMem(page))
8504 : totalhigh_pages_add(count);
8505 : #endif
8506 0 : }
8507 : EXPORT_SYMBOL(adjust_managed_page_count);
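
/*
 * A hedged sketch of the kind of caller adjust_managed_page_count() is
 * exported for: a balloon-style driver that hides a page from the guest
 * and later gives it back. The helper names are hypothetical.
 */
static void example_balloon_hide_page(struct page *page)
{
	/* page is now owned by the hypervisor, not usable by this kernel */
	adjust_managed_page_count(page, -1);
}

static void example_balloon_return_page(struct page *page)
{
	/* page is usable by this kernel again */
	adjust_managed_page_count(page, 1);
}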
8508 :
8509 0 : unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
8510 : {
8511 : void *pos;
8512 0 : unsigned long pages = 0;
8513 :
8514 0 : start = (void *)PAGE_ALIGN((unsigned long)start);
8515 0 : end = (void *)((unsigned long)end & PAGE_MASK);
8516 0 : for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
8517 0 : struct page *page = virt_to_page(pos);
8518 : void *direct_map_addr;
8519 :
8520 : /*
8521 : * 'direct_map_addr' might be different from 'pos'
8522 : * because some architectures' virt_to_page()
8523 : * work with aliases. Getting the direct map
8524 : * address ensures that we get a _writeable_
8525 : * alias for the memset().
8526 : */
8527 0 : direct_map_addr = page_address(page);
8528 : /*
8529 : * Perform a kasan-unchecked memset() since this memory
8530 : * has not been initialized.
8531 : */
8532 0 : direct_map_addr = kasan_reset_tag(direct_map_addr);
8533 0 : if ((unsigned int)poison <= 0xFF)
8534 0 : memset(direct_map_addr, poison, PAGE_SIZE);
8535 :
8536 0 : free_reserved_page(page);
8537 : }
8538 :
8539 0 : if (pages && s)
8540 0 : pr_info("Freeing %s memory: %ldK\n", s, K(pages));
8541 :
8542 0 : return pages;
8543 : }
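
/*
 * Typical use, sketched: handing the kernel's .init sections back to the
 * page allocator once boot has finished. The poison value and the label
 * are illustrative.
 */
static unsigned long example_free_initmem(void)
{
	return free_reserved_area(__init_begin, __init_end,
				  POISON_FREE_INITMEM, "unused kernel image");
}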
8544 :
8545 1 : void __init mem_init_print_info(void)
8546 : {
8547 : unsigned long physpages, codesize, datasize, rosize, bss_size;
8548 : unsigned long init_code_size, init_data_size;
8549 :
8550 1 : physpages = get_num_physpages();
8551 1 : codesize = _etext - _stext;
8552 1 : datasize = _edata - _sdata;
8553 1 : rosize = __end_rodata - __start_rodata;
8554 1 : bss_size = __bss_stop - __bss_start;
8555 1 : init_data_size = __init_end - __init_begin;
8556 1 : init_code_size = _einittext - _sinittext;
8557 :
8558 : /*
8559 : * Detect special cases and adjust section sizes accordingly:
8560 : * 1) .init.* may be embedded into .data sections
8561 : * 2) .init.text.* may be out of [__init_begin, __init_end],
8562 : * please refer to arch/tile/kernel/vmlinux.lds.S.
8563 : * 3) .rodata.* may be embedded into .text or .data sections.
8564 : */
8565 : #define adj_init_size(start, end, size, pos, adj) \
8566 : do { \
8567 : if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
8568 : size -= adj; \
8569 : } while (0)
8570 :
8571 1 : adj_init_size(__init_begin, __init_end, init_data_size,
8572 : _sinittext, init_code_size);
8573 1 : adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
8574 1 : adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
8575 1 : adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
8576 1 : adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
8577 :
8578 : #undef adj_init_size
8579 :
8580 3 : pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
8581 : #ifdef CONFIG_HIGHMEM
8582 : ", %luK highmem"
8583 : #endif
8584 : ")\n",
8585 : K(nr_free_pages()), K(physpages),
8586 : codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
8587 : (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
8588 : K(physpages - totalram_pages() - totalcma_pages),
8589 : K(totalcma_pages)
8590 : #ifdef CONFIG_HIGHMEM
8591 : , K(totalhigh_pages())
8592 : #endif
8593 : );
8594 1 : }
8595 :
8596 : /**
8597 : * set_dma_reserve - set the specified number of pages reserved in the first zone
8598 : * @new_dma_reserve: The number of pages to mark reserved
8599 : *
8600 : * The per-cpu batchsize and zone watermarks are determined by managed_pages.
8601 : * In the DMA zone, a significant percentage may be consumed by kernel image
8602 : * and other unfreeable allocations which can skew the watermarks badly. This
8603 : * function may optionally be used to account for unfreeable pages in the
8604 : * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
8605 : * smaller per-cpu batchsize.
8606 : */
8607 0 : void __init set_dma_reserve(unsigned long new_dma_reserve)
8608 : {
8609 0 : dma_reserve = new_dma_reserve;
8610 0 : }
8611 :
8612 0 : static int page_alloc_cpu_dead(unsigned int cpu)
8613 : {
8614 : struct zone *zone;
8615 :
8616 0 : lru_add_drain_cpu(cpu);
8617 0 : mlock_drain_remote(cpu);
8618 0 : drain_pages(cpu);
8619 :
8620 : /*
8621 : * Spill the event counters of the dead processor
8622 : * into the current processors event counters.
8623 : * This artificially elevates the count of the current
8624 : * processor.
8625 : */
8626 0 : vm_events_fold_cpu(cpu);
8627 :
8628 : /*
8629 : * Zero the differential counters of the dead processor
8630 : * so that the vm statistics are consistent.
8631 : *
8632 : * This is only okay since the processor is dead and cannot
8633 : * race with what we are doing.
8634 : */
8635 0 : cpu_vm_stats_fold(cpu);
8636 :
8637 0 : for_each_populated_zone(zone)
8638 0 : zone_pcp_update(zone, 0);
8639 :
8640 0 : return 0;
8641 : }
8642 :
8643 0 : static int page_alloc_cpu_online(unsigned int cpu)
8644 : {
8645 : struct zone *zone;
8646 :
8647 0 : for_each_populated_zone(zone)
8648 0 : zone_pcp_update(zone, 1);
8649 0 : return 0;
8650 : }
8651 :
8652 : #ifdef CONFIG_NUMA
8653 : int hashdist = HASHDIST_DEFAULT;
8654 :
8655 : static int __init set_hashdist(char *str)
8656 : {
8657 : if (!str)
8658 : return 0;
8659 : hashdist = simple_strtoul(str, &str, 0);
8660 : return 1;
8661 : }
8662 : __setup("hashdist=", set_hashdist);
8663 : #endif
8664 :
8665 1 : void __init page_alloc_init(void)
8666 : {
8667 : int ret;
8668 :
8669 : #ifdef CONFIG_NUMA
8670 : if (num_node_state(N_MEMORY) == 1)
8671 : hashdist = 0;
8672 : #endif
8673 :
8674 1 : ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
8675 : "mm/page_alloc:pcp",
8676 : page_alloc_cpu_online,
8677 : page_alloc_cpu_dead);
8678 1 : WARN_ON(ret < 0);
8679 1 : }
8680 :
8681 : /*
8682 : * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
8683 : * or min_free_kbytes changes.
8684 : */
8685 2 : static void calculate_totalreserve_pages(void)
8686 : {
8687 : struct pglist_data *pgdat;
8688 2 : unsigned long reserve_pages = 0;
8689 : enum zone_type i, j;
8690 :
8691 4 : for_each_online_pgdat(pgdat) {
8692 :
8693 2 : pgdat->totalreserve_pages = 0;
8694 :
8695 6 : for (i = 0; i < MAX_NR_ZONES; i++) {
8696 4 : struct zone *zone = pgdat->node_zones + i;
8697 4 : long max = 0;
8698 4 : unsigned long managed_pages = zone_managed_pages(zone);
8699 :
8700 : /* Find valid and maximum lowmem_reserve in the zone */
8701 10 : for (j = i; j < MAX_NR_ZONES; j++) {
8702 6 : if (zone->lowmem_reserve[j] > max)
8703 0 : max = zone->lowmem_reserve[j];
8704 : }
8705 :
8706 : /* we treat the high watermark as reserved pages. */
8707 4 : max += high_wmark_pages(zone);
8708 :
8709 4 : if (max > managed_pages)
8710 0 : max = managed_pages;
8711 :
8712 4 : pgdat->totalreserve_pages += max;
8713 :
8714 4 : reserve_pages += max;
8715 : }
8716 : }
8717 2 : totalreserve_pages = reserve_pages;
8718 2 : }
8719 :
8720 : /*
8721 : * setup_per_zone_lowmem_reserve - called whenever
8722 : * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
8723 : * has a correct pages reserved value, so an adequate number of
8724 : * pages are left in the zone after a successful __alloc_pages().
8725 : */
8726 1 : static void setup_per_zone_lowmem_reserve(void)
8727 : {
8728 : struct pglist_data *pgdat;
8729 : enum zone_type i, j;
8730 :
8731 2 : for_each_online_pgdat(pgdat) {
8732 2 : for (i = 0; i < MAX_NR_ZONES - 1; i++) {
8733 1 : struct zone *zone = &pgdat->node_zones[i];
8734 1 : int ratio = sysctl_lowmem_reserve_ratio[i];
8735 2 : bool clear = !ratio || !zone_managed_pages(zone);
8736 1 : unsigned long managed_pages = 0;
8737 :
8738 2 : for (j = i + 1; j < MAX_NR_ZONES; j++) {
8739 1 : struct zone *upper_zone = &pgdat->node_zones[j];
8740 :
8741 1 : managed_pages += zone_managed_pages(upper_zone);
8742 :
8743 1 : if (clear)
8744 0 : zone->lowmem_reserve[j] = 0;
8745 : else
8746 1 : zone->lowmem_reserve[j] = managed_pages / ratio;
8747 : }
8748 : }
8749 : }
8750 :
8751 : /* update totalreserve_pages */
8752 1 : calculate_totalreserve_pages();
8753 1 : }
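
/*
 * Worked example for the loop above (figures are illustrative): with the
 * default sysctl_lowmem_reserve_ratio[ZONE_DMA32] of 256 and a
 * ZONE_NORMAL of 1048576 managed pages (4 GiB), ZONE_DMA32 gets
 * lowmem_reserve[ZONE_NORMAL] = 1048576 / 256 = 4096 pages, i.e. roughly
 * 16 MiB that ZONE_NORMAL allocations falling back to ZONE_DMA32 must
 * leave free there.
 */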
8754 :
8755 1 : static void __setup_per_zone_wmarks(void)
8756 : {
8757 1 : unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
8758 1 : unsigned long lowmem_pages = 0;
8759 : struct zone *zone;
8760 : unsigned long flags;
8761 :
8762 : /* Calculate total number of !ZONE_HIGHMEM pages */
8763 3 : for_each_zone(zone) {
8764 2 : if (!is_highmem(zone))
8765 2 : lowmem_pages += zone_managed_pages(zone);
8766 : }
8767 :
8768 3 : for_each_zone(zone) {
8769 : u64 tmp;
8770 :
8771 2 : spin_lock_irqsave(&zone->lock, flags);
8772 2 : tmp = (u64)pages_min * zone_managed_pages(zone);
8773 2 : do_div(tmp, lowmem_pages);
8774 2 : if (is_highmem(zone)) {
8775 : /*
8776 : * __GFP_HIGH and PF_MEMALLOC allocations usually don't
8777 : * need highmem pages, so cap pages_min to a small
8778 : * value here.
8779 : *
8780 : * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
8781 : * deltas control async page reclaim, and so should
8782 : * not be capped for highmem.
8783 : */
8784 : unsigned long min_pages;
8785 :
8786 : min_pages = zone_managed_pages(zone) / 1024;
8787 : min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
8788 : zone->_watermark[WMARK_MIN] = min_pages;
8789 : } else {
8790 : /*
8791 : * If it's a lowmem zone, reserve a number of pages
8792 : * proportionate to the zone's size.
8793 : */
8794 2 : zone->_watermark[WMARK_MIN] = tmp;
8795 : }
8796 :
8797 : /*
8798 : * Set the kswapd watermarks distance according to the
8799 : * scale factor in proportion to available memory, but
8800 : * ensure a minimum size on small systems.
8801 : */
8802 6 : tmp = max_t(u64, tmp >> 2,
8803 : mult_frac(zone_managed_pages(zone),
8804 : watermark_scale_factor, 10000));
8805 :
8806 2 : zone->watermark_boost = 0;
8807 2 : zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
8808 2 : zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
8809 2 : zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
8810 :
8811 4 : spin_unlock_irqrestore(&zone->lock, flags);
8812 : }
8813 :
8814 : /* update totalreserve_pages */
8815 1 : calculate_totalreserve_pages();
8816 1 : }
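
/*
 * Worked example (illustrative, 4 KiB pages): a single 4 GiB zone of
 * 1048576 managed pages with min_free_kbytes = 8192 and the default
 * watermark_scale_factor of 10 gives pages_min = 8192 >> 2 = 2048, so
 * WMARK_MIN = 2048. The low/high step is
 * max(2048 >> 2, 1048576 * 10 / 10000) = max(512, 1048) = 1048,
 * yielding WMARK_LOW = 3096 and WMARK_HIGH = 4144.
 */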
8817 :
8818 : /**
8819 : * setup_per_zone_wmarks - called when min_free_kbytes changes
8820 : * or when memory is hot-{added|removed}
8821 : *
8822 : * Ensures that the watermark[min,low,high] values for each zone are set
8823 : * correctly with respect to min_free_kbytes.
8824 : */
8825 1 : void setup_per_zone_wmarks(void)
8826 : {
8827 : struct zone *zone;
8828 : static DEFINE_SPINLOCK(lock);
8829 :
8830 1 : spin_lock(&lock);
8831 1 : __setup_per_zone_wmarks();
8832 1 : spin_unlock(&lock);
8833 :
8834 : /*
8835 : * The watermark sizes have changed, so update the pcpu batch
8836 : * and high limits or the limits may be inappropriate.
8837 : */
8838 3 : for_each_zone(zone)
8839 2 : zone_pcp_update(zone, 0);
8840 1 : }
8841 :
8842 : /*
8843 : * Initialise min_free_kbytes.
8844 : *
8845 : * For small machines we want it small (128k min). For large machines
8846 : * we want it large (256MB max). But it is not linear, because network
8847 : * bandwidth does not increase linearly with machine size. We use
8848 : *
8849 : * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
8850 : * min_free_kbytes = sqrt(lowmem_kbytes * 16)
8851 : *
8852 : * which yields
8853 : *
8854 : * 16MB: 512k
8855 : * 32MB: 724k
8856 : * 64MB: 1024k
8857 : * 128MB: 1448k
8858 : * 256MB: 2048k
8859 : * 512MB: 2896k
8860 : * 1024MB: 4096k
8861 : * 2048MB: 5792k
8862 : * 4096MB: 8192k
8863 : * 8192MB: 11584k
8864 : * 16384MB: 16384k
8865 : */
8866 1 : void calculate_min_free_kbytes(void)
8867 : {
8868 : unsigned long lowmem_kbytes;
8869 : int new_min_free_kbytes;
8870 :
8871 1 : lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
8872 1 : new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
8873 :
8874 1 : if (new_min_free_kbytes > user_min_free_kbytes)
8875 1 : min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
8876 : else
8877 0 : pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
8878 : new_min_free_kbytes, user_min_free_kbytes);
8879 :
8880 1 : }
8881 :
8882 1 : int __meminit init_per_zone_wmark_min(void)
8883 : {
8884 1 : calculate_min_free_kbytes();
8885 1 : setup_per_zone_wmarks();
8886 : refresh_zone_stat_thresholds();
8887 1 : setup_per_zone_lowmem_reserve();
8888 :
8889 : #ifdef CONFIG_NUMA
8890 : setup_min_unmapped_ratio();
8891 : setup_min_slab_ratio();
8892 : #endif
8893 :
8894 : khugepaged_min_free_kbytes_update();
8895 :
8896 1 : return 0;
8897 : }
8898 : postcore_initcall(init_per_zone_wmark_min)
8899 :
8900 : /*
8901 : * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
8902 : * that we can call two helper functions whenever min_free_kbytes
8903 : * changes.
8904 : */
8905 0 : int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
8906 : void *buffer, size_t *length, loff_t *ppos)
8907 : {
8908 : int rc;
8909 :
8910 0 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8911 0 : if (rc)
8912 : return rc;
8913 :
8914 0 : if (write) {
8915 0 : user_min_free_kbytes = min_free_kbytes;
8916 0 : setup_per_zone_wmarks();
8917 : }
8918 : return 0;
8919 : }
8920 :
8921 0 : int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
8922 : void *buffer, size_t *length, loff_t *ppos)
8923 : {
8924 : int rc;
8925 :
8926 0 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8927 0 : if (rc)
8928 : return rc;
8929 :
8930 0 : if (write)
8931 0 : setup_per_zone_wmarks();
8932 :
8933 : return 0;
8934 : }
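
/*
 * Both knobs above are exposed under /proc/sys/vm/: writing to
 * "min_free_kbytes" or "watermark_scale_factor" goes through these
 * handlers and immediately recomputes the per-zone watermarks.
 */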
8935 :
8936 : #ifdef CONFIG_NUMA
8937 : static void setup_min_unmapped_ratio(void)
8938 : {
8939 : pg_data_t *pgdat;
8940 : struct zone *zone;
8941 :
8942 : for_each_online_pgdat(pgdat)
8943 : pgdat->min_unmapped_pages = 0;
8944 :
8945 : for_each_zone(zone)
8946 : zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
8947 : sysctl_min_unmapped_ratio) / 100;
8948 : }
8949 :
8950 :
8951 : int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
8952 : void *buffer, size_t *length, loff_t *ppos)
8953 : {
8954 : int rc;
8955 :
8956 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8957 : if (rc)
8958 : return rc;
8959 :
8960 : setup_min_unmapped_ratio();
8961 :
8962 : return 0;
8963 : }
8964 :
8965 : static void setup_min_slab_ratio(void)
8966 : {
8967 : pg_data_t *pgdat;
8968 : struct zone *zone;
8969 :
8970 : for_each_online_pgdat(pgdat)
8971 : pgdat->min_slab_pages = 0;
8972 :
8973 : for_each_zone(zone)
8974 : zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
8975 : sysctl_min_slab_ratio) / 100;
8976 : }
8977 :
8978 : int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
8979 : void *buffer, size_t *length, loff_t *ppos)
8980 : {
8981 : int rc;
8982 :
8983 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
8984 : if (rc)
8985 : return rc;
8986 :
8987 : setup_min_slab_ratio();
8988 :
8989 : return 0;
8990 : }
8991 : #endif
8992 :
8993 : /*
8994 : * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
8995 : * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
8996 : * whenever sysctl_lowmem_reserve_ratio changes.
8997 : *
8998 : * The reserve ratio has no relation to the minimum watermarks. The
8999 : * lowmem reserve ratio only makes sense in relation to the boot-time
9000 : * zone sizes.
9001 : */
9002 0 : int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
9003 : void *buffer, size_t *length, loff_t *ppos)
9004 : {
9005 : int i;
9006 :
9007 0 : proc_dointvec_minmax(table, write, buffer, length, ppos);
9008 :
9009 0 : for (i = 0; i < MAX_NR_ZONES; i++) {
9010 0 : if (sysctl_lowmem_reserve_ratio[i] < 1)
9011 0 : sysctl_lowmem_reserve_ratio[i] = 0;
9012 : }
9013 :
9014 0 : setup_per_zone_lowmem_reserve();
9015 0 : return 0;
9016 : }
9017 :
9018 : /*
9019 : * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
9020 : * cpu. It is the fraction of total pages in each zone that a hot per cpu
9021 : * pagelist can have before it gets flushed back to the buddy allocator.
9022 : */
9023 0 : int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
9024 : int write, void *buffer, size_t *length, loff_t *ppos)
9025 : {
9026 : struct zone *zone;
9027 : int old_percpu_pagelist_high_fraction;
9028 : int ret;
9029 :
9030 0 : mutex_lock(&pcp_batch_high_lock);
9031 0 : old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
9032 :
9033 0 : ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
9034 0 : if (!write || ret < 0)
9035 : goto out;
9036 :
9037 : /* Sanity checking to avoid pcp imbalance */
9038 0 : if (percpu_pagelist_high_fraction &&
9039 : percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
9040 0 : percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
9041 0 : ret = -EINVAL;
9042 0 : goto out;
9043 : }
9044 :
9045 : /* No change? */
9046 0 : if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
9047 : goto out;
9048 :
9049 0 : for_each_populated_zone(zone)
9050 0 : zone_set_pageset_high_and_batch(zone, 0);
9051 : out:
9052 0 : mutex_unlock(&pcp_batch_high_lock);
9053 0 : return ret;
9054 : }
9055 :
9056 : #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
9057 : /*
9058 : * Returns the number of pages that the arch has reserved but
9059 : * are not known to alloc_large_system_hash().
9060 : */
9061 : static unsigned long __init arch_reserved_kernel_pages(void)
9062 : {
9063 : return 0;
9064 : }
9065 : #endif
9066 :
9067 : /*
9068 : * Adaptive scale is meant to reduce sizes of hash tables on large memory
9069 : * machines. As memory size is increased the scale is also increased, but at
9070 : * a slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
9071 : * quadruples the scale is increased by one, which means the size of the hash
9072 : * table only doubles, instead of quadrupling as well.
9073 : * Because 32-bit systems cannot have large physical memory, where this scaling
9074 : * makes sense, it is disabled on such platforms.
9075 : */
9076 : #if __BITS_PER_LONG > 32
9077 : #define ADAPT_SCALE_BASE (64ul << 30)
9078 : #define ADAPT_SCALE_SHIFT 2
9079 : #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
9080 : #endif
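
/*
 * With 4 KiB pages ADAPT_SCALE_NPAGES is 16M pages (64 GiB), so the
 * adaptive loop in alloc_large_system_hash() below leaves @scale alone
 * up to 64 GiB of memory, bumps it by one on a 256 GiB machine and by
 * two at 1 TiB, halving the would-be table size at each step.
 */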
9081 :
9082 : /*
9083 : * allocate a large system hash table from bootmem
9084 : * - it is assumed that the hash table must contain an exact power-of-2
9085 : * quantity of entries
9086 : * - limit is the number of hash buckets, not the total allocation size
9087 : */
9088 5 : void *__init alloc_large_system_hash(const char *tablename,
9089 : unsigned long bucketsize,
9090 : unsigned long numentries,
9091 : int scale,
9092 : int flags,
9093 : unsigned int *_hash_shift,
9094 : unsigned int *_hash_mask,
9095 : unsigned long low_limit,
9096 : unsigned long high_limit)
9097 : {
9098 5 : unsigned long long max = high_limit;
9099 : unsigned long log2qty, size;
9100 : void *table;
9101 : gfp_t gfp_flags;
9102 : bool virt;
9103 : bool huge;
9104 :
9105 : /* allow the kernel cmdline to have a say */
9106 5 : if (!numentries) {
9107 : /* round applicable memory size up to nearest megabyte */
9108 4 : numentries = nr_kernel_pages;
9109 4 : numentries -= arch_reserved_kernel_pages();
9110 :
9111 : /* It isn't necessary when PAGE_SIZE >= 1MB */
9112 : if (PAGE_SIZE < SZ_1M)
9113 4 : numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
9114 :
9115 : #if __BITS_PER_LONG > 32
9116 4 : if (!high_limit) {
9117 : unsigned long adapt;
9118 :
9119 4 : for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
9120 0 : adapt <<= ADAPT_SCALE_SHIFT)
9121 0 : scale++;
9122 : }
9123 : #endif
9124 :
9125 : /* limit to 1 bucket per 2^scale bytes of low memory */
9126 4 : if (scale > PAGE_SHIFT)
9127 4 : numentries >>= (scale - PAGE_SHIFT);
9128 : else
9129 0 : numentries <<= (PAGE_SHIFT - scale);
9130 :
9131 : /* Make sure we've got at least a 0-order allocation.. */
9132 4 : if (unlikely(flags & HASH_SMALL)) {
9133 : /* Makes no sense without HASH_EARLY */
9134 0 : WARN_ON(!(flags & HASH_EARLY));
9135 0 : if (!(numentries >> *_hash_shift)) {
9136 0 : numentries = 1UL << *_hash_shift;
9137 0 : BUG_ON(!numentries);
9138 : }
9139 4 : } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
9140 0 : numentries = PAGE_SIZE / bucketsize;
9141 : }
9142 10 : numentries = roundup_pow_of_two(numentries);
9143 :
9144 : /* limit allocation size to 1/16 total memory by default */
9145 5 : if (max == 0) {
9146 4 : max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
9147 4 : do_div(max, bucketsize);
9148 : }
9149 5 : max = min(max, 0x80000000ULL);
9150 :
9151 5 : if (numentries < low_limit)
9152 0 : numentries = low_limit;
9153 5 : if (numentries > max)
9154 0 : numentries = max;
9155 :
9156 10 : log2qty = ilog2(numentries);
9157 :
9158 5 : gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
9159 : do {
9160 5 : virt = false;
9161 5 : size = bucketsize << log2qty;
9162 5 : if (flags & HASH_EARLY) {
9163 2 : if (flags & HASH_ZERO)
9164 2 : table = memblock_alloc(size, SMP_CACHE_BYTES);
9165 : else
9166 0 : table = memblock_alloc_raw(size,
9167 : SMP_CACHE_BYTES);
9168 3 : } else if (get_order(size) >= MAX_ORDER || hashdist) {
9169 0 : table = vmalloc_huge(size, gfp_flags);
9170 0 : virt = true;
9171 : if (table)
9172 : huge = is_vm_area_hugepages(table);
9173 : } else {
9174 : /*
9175 : * If bucketsize is not a power-of-two, we may free
9176 : * some pages at the end of hash table which
9177 : * alloc_pages_exact() automatically does
9178 : */
9179 3 : table = alloc_pages_exact(size, gfp_flags);
9180 3 : kmemleak_alloc(table, size, 1, gfp_flags);
9181 : }
9182 5 : } while (!table && size > PAGE_SIZE && --log2qty);
9183 :
9184 5 : if (!table)
9185 0 : panic("Failed to allocate %s hash table\n", tablename);
9186 :
9187 10 : pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
9188 : tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
9189 : virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
9190 :
9191 5 : if (_hash_shift)
9192 5 : *_hash_shift = log2qty;
9193 5 : if (_hash_mask)
9194 3 : *_hash_mask = (1 << log2qty) - 1;
9195 :
9196 5 : return table;
9197 : }
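
/*
 * A hedged sketch of an early-boot caller; the table name, bucket type
 * and scale are hypothetical, but the calling convention matches the
 * function above: size derived from memory, one bucket per 2^14 bytes,
 * zeroed, and allocated from memblock before the buddy allocator is up.
 */
static struct hlist_head *example_table;
static unsigned int example_hash_shift;

static void __init example_hash_init(void)
{
	example_table = alloc_large_system_hash("example",
						sizeof(struct hlist_head),
						0,	/* size from memory */
						14,	/* 1 bucket / 16 KiB */
						HASH_EARLY | HASH_ZERO,
						&example_hash_shift,
						NULL,	/* no mask needed */
						0, 0);
}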
9198 :
9199 : #ifdef CONFIG_CONTIG_ALLOC
9200 : #if defined(CONFIG_DYNAMIC_DEBUG) || \
9201 : (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
9202 : /* Usage: See admin-guide/dynamic-debug-howto.rst */
9203 : static void alloc_contig_dump_pages(struct list_head *page_list)
9204 : {
9205 : DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
9206 :
9207 : if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
9208 : struct page *page;
9209 :
9210 : dump_stack();
9211 : list_for_each_entry(page, page_list, lru)
9212 : dump_page(page, "migration failure");
9213 : }
9214 : }
9215 : #else
9216 : static inline void alloc_contig_dump_pages(struct list_head *page_list)
9217 : {
9218 : }
9219 : #endif
9220 :
9221 : /* [start, end) must belong to a single zone. */
9222 : int __alloc_contig_migrate_range(struct compact_control *cc,
9223 : unsigned long start, unsigned long end)
9224 : {
9225 : /* This function is based on compact_zone() from compaction.c. */
9226 : unsigned int nr_reclaimed;
9227 : unsigned long pfn = start;
9228 : unsigned int tries = 0;
9229 : int ret = 0;
9230 : struct migration_target_control mtc = {
9231 : .nid = zone_to_nid(cc->zone),
9232 : .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
9233 : };
9234 :
9235 : lru_cache_disable();
9236 :
9237 : while (pfn < end || !list_empty(&cc->migratepages)) {
9238 : if (fatal_signal_pending(current)) {
9239 : ret = -EINTR;
9240 : break;
9241 : }
9242 :
9243 : if (list_empty(&cc->migratepages)) {
9244 : cc->nr_migratepages = 0;
9245 : ret = isolate_migratepages_range(cc, pfn, end);
9246 : if (ret && ret != -EAGAIN)
9247 : break;
9248 : pfn = cc->migrate_pfn;
9249 : tries = 0;
9250 : } else if (++tries == 5) {
9251 : ret = -EBUSY;
9252 : break;
9253 : }
9254 :
9255 : nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
9256 : &cc->migratepages);
9257 : cc->nr_migratepages -= nr_reclaimed;
9258 :
9259 : ret = migrate_pages(&cc->migratepages, alloc_migration_target,
9260 : NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
9261 :
9262 : /*
9263 : * On -ENOMEM, migrate_pages() bails out right away. It is pointless
9264 : * to retry over this error, so do the same here.
9265 : */
9266 : if (ret == -ENOMEM)
9267 : break;
9268 : }
9269 :
9270 : lru_cache_enable();
9271 : if (ret < 0) {
9272 : if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
9273 : alloc_contig_dump_pages(&cc->migratepages);
9274 : putback_movable_pages(&cc->migratepages);
9275 : return ret;
9276 : }
9277 : return 0;
9278 : }
9279 :
9280 : /**
9281 : * alloc_contig_range() -- tries to allocate given range of pages
9282 : * @start: start PFN to allocate
9283 : * @end: one-past-the-last PFN to allocate
9284 : * @migratetype: migratetype of the underlying pageblocks (either
9285 : * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
9286 : * in range must have the same migratetype and it must
9287 : * be either of the two.
9288 : * @gfp_mask: GFP mask to use during compaction
9289 : *
9290 : * The PFN range does not have to be pageblock aligned. The PFN range must
9291 : * belong to a single zone.
9292 : *
9293 : * The first thing this routine does is attempt to MIGRATE_ISOLATE all
9294 : * pageblocks in the range. Once isolated, the pageblocks should not
9295 : * be modified by others.
9296 : *
9297 : * Return: zero on success or negative error code. On success all
9298 : * pages which PFN is in [start, end) are allocated for the caller and
9299 : * need to be freed with free_contig_range().
9300 : */
9301 : int alloc_contig_range(unsigned long start, unsigned long end,
9302 : unsigned migratetype, gfp_t gfp_mask)
9303 : {
9304 : unsigned long outer_start, outer_end;
9305 : int order;
9306 : int ret = 0;
9307 :
9308 : struct compact_control cc = {
9309 : .nr_migratepages = 0,
9310 : .order = -1,
9311 : .zone = page_zone(pfn_to_page(start)),
9312 : .mode = MIGRATE_SYNC,
9313 : .ignore_skip_hint = true,
9314 : .no_set_skip_hint = true,
9315 : .gfp_mask = current_gfp_context(gfp_mask),
9316 : .alloc_contig = true,
9317 : };
9318 : INIT_LIST_HEAD(&cc.migratepages);
9319 :
9320 : /*
9321 : * What we do here is we mark all pageblocks in range as
9322 : * MIGRATE_ISOLATE. Because pageblock and max order pages may
9323 : * have different sizes, and due to the way page allocator
9324 : * have different sizes, and due to the way the page allocator
9325 : * works, start_isolate_page_range() has special handling for this.
9326 : * Once the pageblocks are marked as MIGRATE_ISOLATE, we
9327 : * migrate the pages from an unaligned range (ie. pages that
9328 : * we are interested in). This will put all the pages in
9329 : * range back to page allocator as MIGRATE_ISOLATE.
9330 : *
9331 : * When this is done, we take the pages in range from page
9332 : * allocator removing them from the buddy system. This way
9333 : * page allocator will never consider using them.
9334 : *
9335 : * This lets us mark the pageblocks back as
9336 : * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
9337 : * aligned range but not in the unaligned, original range are
9338 : * put back to page allocator so that buddy can use them.
9339 : */
9340 :
9341 : ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
9342 : if (ret)
9343 : goto done;
9344 :
9345 : drain_all_pages(cc.zone);
9346 :
9347 : /*
9348 : * In case of -EBUSY, we'd like to know which page causes problem.
9349 : * In case of -EBUSY, we'd like to know which page causes the problem.
9350 : * which will report the busy page.
9351 : *
9352 : * It is possible that busy pages could become available before
9353 : * the call to test_pages_isolated, and the range will actually be
9354 : * allocated. So, if we fall through be sure to clear ret so that
9355 : * -EBUSY is not accidentally used or returned to caller.
9356 : */
9357 : ret = __alloc_contig_migrate_range(&cc, start, end);
9358 : if (ret && ret != -EBUSY)
9359 : goto done;
9360 : ret = 0;
9361 :
9362 : /*
9363 : * Pages from [start, end) are within a pageblock_nr_pages
9364 : * aligned blocks that are marked as MIGRATE_ISOLATE. What's
9365 : * more, all pages in [start, end) are free in page allocator.
9366 : * What we are going to do is to allocate all pages from
9367 : * [start, end) (that is remove them from page allocator).
9368 : *
9369 : * The only problem is that pages at the beginning and at the
9370 : * end of the interesting range may not be aligned with pages that
9371 : * the page allocator holds, i.e. they can be part of higher order
9372 : * pages. Because of this, we reserve the bigger range and
9373 : * once this is done free the pages we are not interested in.
9374 : *
9375 : * We don't have to hold zone->lock here because the pages are
9376 : * isolated thus they won't get removed from buddy.
9377 : */
9378 :
9379 : order = 0;
9380 : outer_start = start;
9381 : while (!PageBuddy(pfn_to_page(outer_start))) {
9382 : if (++order >= MAX_ORDER) {
9383 : outer_start = start;
9384 : break;
9385 : }
9386 : outer_start &= ~0UL << order;
9387 : }
9388 :
9389 : if (outer_start != start) {
9390 : order = buddy_order(pfn_to_page(outer_start));
9391 :
9392 : /*
9393 : * outer_start page could be small order buddy page and
9394 : * it doesn't include start page. Adjust outer_start
9395 : * in this case to report failed page properly
9396 : * on tracepoint in test_pages_isolated()
9397 : */
9398 : if (outer_start + (1UL << order) <= start)
9399 : outer_start = start;
9400 : }
9401 :
9402 : /* Make sure the range is really isolated. */
9403 : if (test_pages_isolated(outer_start, end, 0)) {
9404 : ret = -EBUSY;
9405 : goto done;
9406 : }
9407 :
9408 : /* Grab isolated pages from freelists. */
9409 : outer_end = isolate_freepages_range(&cc, outer_start, end);
9410 : if (!outer_end) {
9411 : ret = -EBUSY;
9412 : goto done;
9413 : }
9414 :
9415 : /* Free head and tail (if any) */
9416 : if (start != outer_start)
9417 : free_contig_range(outer_start, start - outer_start);
9418 : if (end != outer_end)
9419 : free_contig_range(end, outer_end - end);
9420 :
9421 : done:
9422 : undo_isolate_page_range(start, end, migratetype);
9423 : return ret;
9424 : }
9425 : EXPORT_SYMBOL(alloc_contig_range);
9426 :
9427 : static int __alloc_contig_pages(unsigned long start_pfn,
9428 : unsigned long nr_pages, gfp_t gfp_mask)
9429 : {
9430 : unsigned long end_pfn = start_pfn + nr_pages;
9431 :
9432 : return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
9433 : gfp_mask);
9434 : }
9435 :
9436 : static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
9437 : unsigned long nr_pages)
9438 : {
9439 : unsigned long i, end_pfn = start_pfn + nr_pages;
9440 : struct page *page;
9441 :
9442 : for (i = start_pfn; i < end_pfn; i++) {
9443 : page = pfn_to_online_page(i);
9444 : if (!page)
9445 : return false;
9446 :
9447 : if (page_zone(page) != z)
9448 : return false;
9449 :
9450 : if (PageReserved(page))
9451 : return false;
9452 : }
9453 : return true;
9454 : }
9455 :
9456 : static bool zone_spans_last_pfn(const struct zone *zone,
9457 : unsigned long start_pfn, unsigned long nr_pages)
9458 : {
9459 : unsigned long last_pfn = start_pfn + nr_pages - 1;
9460 :
9461 : return zone_spans_pfn(zone, last_pfn);
9462 : }
9463 :
9464 : /**
9465 : * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
9466 : * @nr_pages: Number of contiguous pages to allocate
9467 : * @gfp_mask: GFP mask to limit search and used during compaction
9468 : * @nid: Target node
9469 : * @nodemask: Mask for other possible nodes
9470 : *
9471 : * This routine is a wrapper around alloc_contig_range(). It scans over zones
9472 : * on an applicable zonelist to find a contiguous pfn range which can then be
9473 : * tried for allocation with alloc_contig_range(). This routine is intended
9474 : * for allocation requests which cannot be fulfilled with the buddy allocator.
9475 : *
9476 : * The allocated memory is always aligned to a page boundary. If nr_pages is a
9477 : * power of two, then allocated range is also guaranteed to be aligned to same
9478 : * nr_pages (e.g. 1GB request would be aligned to 1GB).
9479 : *
9480 : * Allocated pages can be freed with free_contig_range() or by manually calling
9481 : * __free_page() on each allocated page.
9482 : *
9483 : * Return: pointer to contiguous pages on success, or NULL if not successful.
9484 : */
9485 : struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
9486 : int nid, nodemask_t *nodemask)
9487 : {
9488 : unsigned long ret, pfn, flags;
9489 : struct zonelist *zonelist;
9490 : struct zone *zone;
9491 : struct zoneref *z;
9492 :
9493 : zonelist = node_zonelist(nid, gfp_mask);
9494 : for_each_zone_zonelist_nodemask(zone, z, zonelist,
9495 : gfp_zone(gfp_mask), nodemask) {
9496 : spin_lock_irqsave(&zone->lock, flags);
9497 :
9498 : pfn = ALIGN(zone->zone_start_pfn, nr_pages);
9499 : while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
9500 : if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
9501 : /*
9502 : * We release the zone lock here because
9503 : * alloc_contig_range() will also lock the zone
9504 : * at some point. If there's an allocation
9505 : * spinning on this lock, it may win the race
9506 : * and cause alloc_contig_range() to fail...
9507 : */
9508 : spin_unlock_irqrestore(&zone->lock, flags);
9509 : ret = __alloc_contig_pages(pfn, nr_pages,
9510 : gfp_mask);
9511 : if (!ret)
9512 : return pfn_to_page(pfn);
9513 : spin_lock_irqsave(&zone->lock, flags);
9514 : }
9515 : pfn += nr_pages;
9516 : }
9517 : spin_unlock_irqrestore(&zone->lock, flags);
9518 : }
9519 : return NULL;
9520 : }
9521 : #endif /* CONFIG_CONTIG_ALLOC */
9522 :
9523 0 : void free_contig_range(unsigned long pfn, unsigned long nr_pages)
9524 : {
9525 0 : unsigned long count = 0;
9526 :
9527 0 : for (; nr_pages--; pfn++) {
9528 0 : struct page *page = pfn_to_page(pfn);
9529 :
9530 0 : count += page_count(page) != 1;
9531 0 : __free_page(page);
9532 : }
9533 0 : WARN(count != 0, "%lu pages are still in use!\n", count);
9534 0 : }
9535 : EXPORT_SYMBOL(free_contig_range);
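
/*
 * A hedged sketch of how the two interfaces pair up (alloc_contig_pages()
 * exists only with CONFIG_CONTIG_ALLOC); the size and GFP mask are
 * illustrative. A real user, e.g. a gigantic-page allocator, would hold
 * the pages far longer than this round trip does.
 */
static int example_contig_roundtrip(int nid)
{
	const unsigned long nr_pages = SZ_64M >> PAGE_SHIFT;
	struct page *page;

	page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_NOWARN,
				  nid, NULL);
	if (!page)
		return -ENOMEM;

	free_contig_range(page_to_pfn(page), nr_pages);
	return 0;
}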
9536 :
9537 : /*
9538 : * Effectively disable pcplists for the zone by setting the high limit to 0
9539 : * and draining all cpus. A concurrent page freeing on another CPU that's about
9540 : * to put the page on pcplist will either finish before the drain and the page
9541 : * will be drained, or observe the new high limit and skip the pcplist.
9542 : *
9543 : * Must be paired with a call to zone_pcp_enable().
9544 : */
9545 0 : void zone_pcp_disable(struct zone *zone)
9546 : {
9547 0 : mutex_lock(&pcp_batch_high_lock);
9548 0 : __zone_set_pageset_high_and_batch(zone, 0, 1);
9549 0 : __drain_all_pages(zone, true);
9550 0 : }
9551 :
9552 0 : void zone_pcp_enable(struct zone *zone)
9553 : {
9554 0 : __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
9555 0 : mutex_unlock(&pcp_batch_high_lock);
9556 0 : }
9557 :
9558 0 : void zone_pcp_reset(struct zone *zone)
9559 : {
9560 : int cpu;
9561 : struct per_cpu_zonestat *pzstats;
9562 :
9563 0 : if (zone->per_cpu_pageset != &boot_pageset) {
9564 : for_each_online_cpu(cpu) {
9565 : pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
9566 : drain_zonestat(zone, pzstats);
9567 : }
9568 0 : free_percpu(zone->per_cpu_pageset);
9569 0 : zone->per_cpu_pageset = &boot_pageset;
9570 0 : if (zone->per_cpu_zonestats != &boot_zonestats) {
9571 0 : free_percpu(zone->per_cpu_zonestats);
9572 0 : zone->per_cpu_zonestats = &boot_zonestats;
9573 : }
9574 : }
9575 0 : }
9576 :
9577 : #ifdef CONFIG_MEMORY_HOTREMOVE
9578 : /*
9579 : * All pages in the range must be in a single zone, must not contain holes,
9580 : * must span full sections, and must be isolated before calling this function.
9581 : */
9582 : void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
9583 : {
9584 : unsigned long pfn = start_pfn;
9585 : struct page *page;
9586 : struct zone *zone;
9587 : unsigned int order;
9588 : unsigned long flags;
9589 :
9590 : offline_mem_sections(pfn, end_pfn);
9591 : zone = page_zone(pfn_to_page(pfn));
9592 : spin_lock_irqsave(&zone->lock, flags);
9593 : while (pfn < end_pfn) {
9594 : page = pfn_to_page(pfn);
9595 : /*
9596 : * The HWPoisoned page may not be in the buddy system, and
9597 : * page_count() is not 0.
9598 : */
9599 : if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
9600 : pfn++;
9601 : continue;
9602 : }
9603 : /*
9604 : * At this point all remaining PageOffline() pages have a
9605 : * reference count of 0 and can simply be skipped.
9606 : */
9607 : if (PageOffline(page)) {
9608 : BUG_ON(page_count(page));
9609 : BUG_ON(PageBuddy(page));
9610 : pfn++;
9611 : continue;
9612 : }
9613 :
9614 : BUG_ON(page_count(page));
9615 : BUG_ON(!PageBuddy(page));
9616 : order = buddy_order(page);
9617 : del_page_from_free_list(page, zone, order);
9618 : pfn += (1 << order);
9619 : }
9620 : spin_unlock_irqrestore(&zone->lock, flags);
9621 : }
9622 : #endif
9623 :
9624 : /*
9625 : * This function returns a stable result only if called under zone lock.
9626 : */
9627 0 : bool is_free_buddy_page(struct page *page)
9628 : {
9629 0 : unsigned long pfn = page_to_pfn(page);
9630 : unsigned int order;
9631 :
9632 0 : for (order = 0; order < MAX_ORDER; order++) {
9633 0 : struct page *page_head = page - (pfn & ((1 << order) - 1));
9634 :
9635 0 : if (PageBuddy(page_head) &&
9636 0 : buddy_order_unsafe(page_head) >= order)
9637 : break;
9638 : }
9639 :
9640 0 : return order < MAX_ORDER;
9641 : }
9642 : EXPORT_SYMBOL(is_free_buddy_page);
9643 :
9644 : #ifdef CONFIG_MEMORY_FAILURE
9645 : /*
9646 : * Break down a higher-order page into sub-pages, and keep our target out of
9647 : * the buddy allocator.
9648 : */
9649 : static void break_down_buddy_pages(struct zone *zone, struct page *page,
9650 : struct page *target, int low, int high,
9651 : int migratetype)
9652 : {
9653 : unsigned long size = 1 << high;
9654 : struct page *current_buddy, *next_page;
9655 :
9656 : while (high > low) {
9657 : high--;
9658 : size >>= 1;
9659 :
9660 : if (target >= &page[size]) {
9661 : next_page = page + size;
9662 : current_buddy = page;
9663 : } else {
9664 : next_page = page;
9665 : current_buddy = page + size;
9666 : }
9667 :
9668 : if (set_page_guard(zone, current_buddy, high, migratetype))
9669 : continue;
9670 :
9671 : if (current_buddy != target) {
9672 : add_to_free_list(current_buddy, zone, high, migratetype);
9673 : set_buddy_order(current_buddy, high);
9674 : page = next_page;
9675 : }
9676 : }
9677 : }
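
/*
 * Example of the split above: to take one poisoned page out of a free
 * order-3 buddy (8 pages), the block is cut in half repeatedly; at each
 * step the half that does not contain the target goes back on the free
 * list (as order-2, then order-1, then order-0), and only the target
 * page itself stays off the buddy allocator.
 */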
9678 :
9679 : /*
9680 : * Take a page that will be marked as poisoned off the buddy allocator.
9681 : */
9682 : bool take_page_off_buddy(struct page *page)
9683 : {
9684 : struct zone *zone = page_zone(page);
9685 : unsigned long pfn = page_to_pfn(page);
9686 : unsigned long flags;
9687 : unsigned int order;
9688 : bool ret = false;
9689 :
9690 : spin_lock_irqsave(&zone->lock, flags);
9691 : for (order = 0; order < MAX_ORDER; order++) {
9692 : struct page *page_head = page - (pfn & ((1 << order) - 1));
9693 : int page_order = buddy_order(page_head);
9694 :
9695 : if (PageBuddy(page_head) && page_order >= order) {
9696 : unsigned long pfn_head = page_to_pfn(page_head);
9697 : int migratetype = get_pfnblock_migratetype(page_head,
9698 : pfn_head);
9699 :
9700 : del_page_from_free_list(page_head, zone, page_order);
9701 : break_down_buddy_pages(zone, page_head, page, 0,
9702 : page_order, migratetype);
9703 : SetPageHWPoisonTakenOff(page);
9704 : if (!is_migrate_isolate(migratetype))
9705 : __mod_zone_freepage_state(zone, -1, migratetype);
9706 : ret = true;
9707 : break;
9708 : }
9709 : if (page_count(page_head) > 0)
9710 : break;
9711 : }
9712 : spin_unlock_irqrestore(&zone->lock, flags);
9713 : return ret;
9714 : }
9715 :
9716 : /*
9717 : * Cancel takeoff done by take_page_off_buddy().
9718 : */
9719 : bool put_page_back_buddy(struct page *page)
9720 : {
9721 : struct zone *zone = page_zone(page);
9722 : unsigned long pfn = page_to_pfn(page);
9723 : unsigned long flags;
9724 : int migratetype = get_pfnblock_migratetype(page, pfn);
9725 : bool ret = false;
9726 :
9727 : spin_lock_irqsave(&zone->lock, flags);
9728 : if (put_page_testzero(page)) {
9729 : ClearPageHWPoisonTakenOff(page);
9730 : __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
9731 : if (TestClearPageHWPoison(page)) {
9732 : ret = true;
9733 : }
9734 : }
9735 : spin_unlock_irqrestore(&zone->lock, flags);
9736 :
9737 : return ret;
9738 : }
9739 : #endif
9740 :
9741 : #ifdef CONFIG_ZONE_DMA
9742 : bool has_managed_dma(void)
9743 : {
9744 : struct pglist_data *pgdat;
9745 :
9746 : for_each_online_pgdat(pgdat) {
9747 : struct zone *zone = &pgdat->node_zones[ZONE_DMA];
9748 :
9749 : if (managed_zone(zone))
9750 : return true;
9751 : }
9752 : return false;
9753 : }
9754 : #endif /* CONFIG_ZONE_DMA */
|