Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/mm/page_alloc.c
4 : *
5 : * Manages the free list, the system allocates free pages here.
6 : * Note that kmalloc() lives in slab.c
7 : *
8 : * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
9 : * Swap reorganised 29.12.95, Stephen Tweedie
10 : * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 : * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
12 : * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
13 : * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
14 : * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
15 : * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
16 : */
17 :
18 : #include <linux/stddef.h>
19 : #include <linux/mm.h>
20 : #include <linux/highmem.h>
21 : #include <linux/interrupt.h>
22 : #include <linux/jiffies.h>
23 : #include <linux/compiler.h>
24 : #include <linux/kernel.h>
25 : #include <linux/kasan.h>
26 : #include <linux/kmsan.h>
27 : #include <linux/module.h>
28 : #include <linux/suspend.h>
29 : #include <linux/ratelimit.h>
30 : #include <linux/oom.h>
31 : #include <linux/topology.h>
32 : #include <linux/sysctl.h>
33 : #include <linux/cpu.h>
34 : #include <linux/cpuset.h>
35 : #include <linux/memory_hotplug.h>
36 : #include <linux/nodemask.h>
37 : #include <linux/vmstat.h>
38 : #include <linux/fault-inject.h>
39 : #include <linux/compaction.h>
40 : #include <trace/events/kmem.h>
41 : #include <trace/events/oom.h>
42 : #include <linux/prefetch.h>
43 : #include <linux/mm_inline.h>
44 : #include <linux/mmu_notifier.h>
45 : #include <linux/migrate.h>
46 : #include <linux/sched/mm.h>
47 : #include <linux/page_owner.h>
48 : #include <linux/page_table_check.h>
49 : #include <linux/memcontrol.h>
50 : #include <linux/ftrace.h>
51 : #include <linux/lockdep.h>
52 : #include <linux/psi.h>
53 : #include <linux/khugepaged.h>
54 : #include <linux/delayacct.h>
55 : #include <asm/div64.h>
56 : #include "internal.h"
57 : #include "shuffle.h"
58 : #include "page_reporting.h"
59 :
60 : /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
61 : typedef int __bitwise fpi_t;
62 :
63 : /* No special request */
64 : #define FPI_NONE ((__force fpi_t)0)
65 :
66 : /*
67 : * Skip free page reporting notification for the (possibly merged) page.
68 : * This does not hinder free page reporting from grabbing the page,
69 : * reporting it and marking it "reported" - it only skips notifying
70 : * the free page reporting infrastructure about a newly freed page. For
71 : * example, used when temporarily pulling a page from a freelist and
72 : * putting it back unmodified.
73 : */
74 : #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
75 :
76 : /*
77 : * Place the (possibly merged) page to the tail of the freelist. Will ignore
78 : * page shuffling (relevant code - e.g., memory onlining - is expected to
79 : * shuffle the whole zone).
80 : *
81 : * Note: No code should rely on this flag for correctness - it's purely
82 : * to allow for optimizations when handing back either fresh pages
83 : * (memory onlining) or untouched pages (page isolation, free page
84 : * reporting).
85 : */
86 : #define FPI_TO_TAIL ((__force fpi_t)BIT(1))
87 :
88 : /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
89 : static DEFINE_MUTEX(pcp_batch_high_lock);
90 : #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
91 :
92 : #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
93 : /*
94 : * On SMP, spin_trylock is sufficient protection.
95 : * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
96 : */
97 : #define pcp_trylock_prepare(flags) do { } while (0)
98 : #define pcp_trylock_finish(flag) do { } while (0)
99 : #else
100 :
101 : /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
102 : #define pcp_trylock_prepare(flags) local_irq_save(flags)
103 : #define pcp_trylock_finish(flags) local_irq_restore(flags)
104 : #endif
105 :
106 : /*
107 : * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
108 : * a migration causing the wrong PCP to be locked and remote memory being
109 : * potentially allocated, pin the task to the CPU for the lookup+lock.
110 : * preempt_disable is used on !RT because it is faster than migrate_disable.
111 : * migrate_disable is used on RT because otherwise RT spinlock usage is
112 : * interfered with and a high priority task cannot preempt the allocator.
113 : */
114 : #ifndef CONFIG_PREEMPT_RT
115 : #define pcpu_task_pin() preempt_disable()
116 : #define pcpu_task_unpin() preempt_enable()
117 : #else
118 : #define pcpu_task_pin() migrate_disable()
119 : #define pcpu_task_unpin() migrate_enable()
120 : #endif
121 :
122 : /*
123 : * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
124 : * Return value should be used with equivalent unlock helper.
125 : */
126 : #define pcpu_spin_lock(type, member, ptr) \
127 : ({ \
128 : type *_ret; \
129 : pcpu_task_pin(); \
130 : _ret = this_cpu_ptr(ptr); \
131 : spin_lock(&_ret->member); \
132 : _ret; \
133 : })
134 :
135 : #define pcpu_spin_trylock(type, member, ptr) \
136 : ({ \
137 : type *_ret; \
138 : pcpu_task_pin(); \
139 : _ret = this_cpu_ptr(ptr); \
140 : if (!spin_trylock(&_ret->member)) { \
141 : pcpu_task_unpin(); \
142 : _ret = NULL; \
143 : } \
144 : _ret; \
145 : })
146 :
147 : #define pcpu_spin_unlock(member, ptr) \
148 : ({ \
149 : spin_unlock(&ptr->member); \
150 : pcpu_task_unpin(); \
151 : })
152 :
153 : /* struct per_cpu_pages specific helpers. */
154 : #define pcp_spin_lock(ptr) \
155 : pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
156 :
157 : #define pcp_spin_trylock(ptr) \
158 : pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
159 :
160 : #define pcp_spin_unlock(ptr) \
161 : pcpu_spin_unlock(lock, ptr)
162 :
163 : #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
164 : DEFINE_PER_CPU(int, numa_node);
165 : EXPORT_PER_CPU_SYMBOL(numa_node);
166 : #endif
167 :
168 : DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
169 :
170 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
171 : /*
172 : * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
173 : * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
174 : * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
175 : * defined in <linux/topology.h>.
176 : */
177 : DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
178 : EXPORT_PER_CPU_SYMBOL(_numa_mem_);
179 : #endif
180 :
181 : static DEFINE_MUTEX(pcpu_drain_mutex);
182 :
183 : #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
184 : volatile unsigned long latent_entropy __latent_entropy;
185 : EXPORT_SYMBOL(latent_entropy);
186 : #endif
187 :
188 : /*
189 : * Array of node states.
190 : */
191 : nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
192 : [N_POSSIBLE] = NODE_MASK_ALL,
193 : [N_ONLINE] = { { [0] = 1UL } },
194 : #ifndef CONFIG_NUMA
195 : [N_NORMAL_MEMORY] = { { [0] = 1UL } },
196 : #ifdef CONFIG_HIGHMEM
197 : [N_HIGH_MEMORY] = { { [0] = 1UL } },
198 : #endif
199 : [N_MEMORY] = { { [0] = 1UL } },
200 : [N_CPU] = { { [0] = 1UL } },
201 : #endif /* NUMA */
202 : };
203 : EXPORT_SYMBOL(node_states);
204 :
205 : gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
206 :
207 : /*
208 : * A cached value of the page's pageblock's migratetype, used when the page is
209 : * put on a pcplist. Used to avoid the pageblock migratetype lookup when
210 : * freeing from pcplists in most cases, at the cost of possibly becoming stale.
211 : * Also the migratetype set in the page does not necessarily match the pcplist
212 : * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
213 : * other index - this ensures that it will be put on the correct CMA freelist.
214 : */
215 : static inline int get_pcppage_migratetype(struct page *page)
216 : {
217 0 : return page->index;
218 : }
219 :
220 : static inline void set_pcppage_migratetype(struct page *page, int migratetype)
221 : {
222 677 : page->index = migratetype;
223 : }
224 :
225 : #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
226 : unsigned int pageblock_order __read_mostly;
227 : #endif
228 :
229 : static void __free_pages_ok(struct page *page, unsigned int order,
230 : fpi_t fpi_flags);
231 :
232 : /*
233 : * results with 256, 32 in the lowmem_reserve sysctl:
234 : * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
235 : * 1G machine -> (16M dma, 784M normal, 224M high)
236 : * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
237 : * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
238 : * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
239 : *
240 : * TBD: should special case ZONE_DMA32 machines here - in those we normally
241 : * don't need any ZONE_NORMAL reservation
242 : */
243 : static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
244 : #ifdef CONFIG_ZONE_DMA
245 : [ZONE_DMA] = 256,
246 : #endif
247 : #ifdef CONFIG_ZONE_DMA32
248 : [ZONE_DMA32] = 256,
249 : #endif
250 : [ZONE_NORMAL] = 32,
251 : #ifdef CONFIG_HIGHMEM
252 : [ZONE_HIGHMEM] = 0,
253 : #endif
254 : [ZONE_MOVABLE] = 0,
255 : };
256 :
257 : char * const zone_names[MAX_NR_ZONES] = {
258 : #ifdef CONFIG_ZONE_DMA
259 : "DMA",
260 : #endif
261 : #ifdef CONFIG_ZONE_DMA32
262 : "DMA32",
263 : #endif
264 : "Normal",
265 : #ifdef CONFIG_HIGHMEM
266 : "HighMem",
267 : #endif
268 : "Movable",
269 : #ifdef CONFIG_ZONE_DEVICE
270 : "Device",
271 : #endif
272 : };
273 :
274 : const char * const migratetype_names[MIGRATE_TYPES] = {
275 : "Unmovable",
276 : "Movable",
277 : "Reclaimable",
278 : "HighAtomic",
279 : #ifdef CONFIG_CMA
280 : "CMA",
281 : #endif
282 : #ifdef CONFIG_MEMORY_ISOLATION
283 : "Isolate",
284 : #endif
285 : };
286 :
287 : static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
288 : [NULL_COMPOUND_DTOR] = NULL,
289 : [COMPOUND_PAGE_DTOR] = free_compound_page,
290 : #ifdef CONFIG_HUGETLB_PAGE
291 : [HUGETLB_PAGE_DTOR] = free_huge_page,
292 : #endif
293 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
294 : [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
295 : #endif
296 : };
297 :
298 : int min_free_kbytes = 1024;
299 : int user_min_free_kbytes = -1;
300 : static int watermark_boost_factor __read_mostly = 15000;
301 : static int watermark_scale_factor = 10;
302 :
303 : /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
304 : int movable_zone;
305 : EXPORT_SYMBOL(movable_zone);
306 :
307 : #if MAX_NUMNODES > 1
308 : unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
309 : unsigned int nr_online_nodes __read_mostly = 1;
310 : EXPORT_SYMBOL(nr_node_ids);
311 : EXPORT_SYMBOL(nr_online_nodes);
312 : #endif
313 :
314 : static bool page_contains_unaccepted(struct page *page, unsigned int order);
315 : static void accept_page(struct page *page, unsigned int order);
316 : static bool try_to_accept_memory(struct zone *zone, unsigned int order);
317 : static inline bool has_unaccepted_memory(void);
318 : static bool __free_unaccepted(struct page *page);
319 :
320 : int page_group_by_mobility_disabled __read_mostly;
321 :
322 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
323 : /*
324 : * During boot we initialize deferred pages on-demand, as needed, but once
325 : * page_alloc_init_late() has finished, the deferred pages are all initialized,
326 : * and we can permanently disable that path.
327 : */
328 : DEFINE_STATIC_KEY_TRUE(deferred_pages);
329 :
330 : static inline bool deferred_pages_enabled(void)
331 : {
332 : return static_branch_unlikely(&deferred_pages);
333 : }
334 :
335 : /*
336 : * deferred_grow_zone() is __init, but it is called from
337 : * get_page_from_freelist() during early boot until deferred_pages permanently
338 : * disables this call. This is why we have refdata wrapper to avoid warning,
339 : * and to ensure that the function body gets unloaded.
340 : */
341 : static bool __ref
342 : _deferred_grow_zone(struct zone *zone, unsigned int order)
343 : {
344 : return deferred_grow_zone(zone, order);
345 : }
346 : #else
347 : static inline bool deferred_pages_enabled(void)
348 : {
349 : return false;
350 : }
351 : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
352 :
353 : /* Return a pointer to the bitmap storing bits affecting a block of pages */
354 : static inline unsigned long *get_pageblock_bitmap(const struct page *page,
355 : unsigned long pfn)
356 : {
357 : #ifdef CONFIG_SPARSEMEM
358 : return section_to_usemap(__pfn_to_section(pfn));
359 : #else
360 523 : return page_zone(page)->pageblock_flags;
361 : #endif /* CONFIG_SPARSEMEM */
362 : }
363 :
364 : static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
365 : {
366 : #ifdef CONFIG_SPARSEMEM
367 : pfn &= (PAGES_PER_SECTION-1);
368 : #else
369 523 : pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
370 : #endif /* CONFIG_SPARSEMEM */
371 523 : return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
372 : }
373 :
374 : static __always_inline
375 : unsigned long __get_pfnblock_flags_mask(const struct page *page,
376 : unsigned long pfn,
377 : unsigned long mask)
378 : {
379 : unsigned long *bitmap;
380 : unsigned long bitidx, word_bitidx;
381 : unsigned long word;
382 :
383 522 : bitmap = get_pageblock_bitmap(page, pfn);
384 261 : bitidx = pfn_to_bitidx(page, pfn);
385 261 : word_bitidx = bitidx / BITS_PER_LONG;
386 261 : bitidx &= (BITS_PER_LONG-1);
387 : /*
388 : * This races, without locks, with set_pfnblock_flags_mask(). Ensure
389 : * a consistent read of the memory array, so that results, even though
390 : * racy, are not corrupted.
391 : */
392 261 : word = READ_ONCE(bitmap[word_bitidx]);
393 261 : return (word >> bitidx) & mask;
394 : }
395 :
396 : /**
397 : * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
398 : * @page: The page within the block of interest
399 : * @pfn: The target page frame number
400 : * @mask: mask of bits that the caller is interested in
401 : *
402 : * Return: pageblock_bits flags
403 : */
404 0 : unsigned long get_pfnblock_flags_mask(const struct page *page,
405 : unsigned long pfn, unsigned long mask)
406 : {
407 2 : return __get_pfnblock_flags_mask(page, pfn, mask);
408 : }
409 :
410 : static __always_inline int get_pfnblock_migratetype(const struct page *page,
411 : unsigned long pfn)
412 : {
413 259 : return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
414 : }
415 :
416 : /**
417 : * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
418 : * @page: The page within the block of interest
419 : * @flags: The flags to set
420 : * @pfn: The target page frame number
421 : * @mask: mask of bits that the caller is interested in
422 : */
423 262 : void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
424 : unsigned long pfn,
425 : unsigned long mask)
426 : {
427 : unsigned long *bitmap;
428 : unsigned long bitidx, word_bitidx;
429 : unsigned long word;
430 :
431 : BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
432 : BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
433 :
434 524 : bitmap = get_pageblock_bitmap(page, pfn);
435 262 : bitidx = pfn_to_bitidx(page, pfn);
436 262 : word_bitidx = bitidx / BITS_PER_LONG;
437 262 : bitidx &= (BITS_PER_LONG-1);
438 :
439 : VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
440 :
441 262 : mask <<= bitidx;
442 262 : flags <<= bitidx;
443 :
444 262 : word = READ_ONCE(bitmap[word_bitidx]);
445 : do {
446 786 : } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
447 262 : }
448 :
449 262 : void set_pageblock_migratetype(struct page *page, int migratetype)
450 : {
451 262 : if (unlikely(page_group_by_mobility_disabled &&
452 : migratetype < MIGRATE_PCPTYPES))
453 0 : migratetype = MIGRATE_UNMOVABLE;
454 :
455 262 : set_pfnblock_flags_mask(page, (unsigned long)migratetype,
456 262 : page_to_pfn(page), MIGRATETYPE_MASK);
457 262 : }
458 :
459 : #ifdef CONFIG_DEBUG_VM
460 : static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
461 : {
462 : int ret = 0;
463 : unsigned seq;
464 : unsigned long pfn = page_to_pfn(page);
465 : unsigned long sp, start_pfn;
466 :
467 : do {
468 : seq = zone_span_seqbegin(zone);
469 : start_pfn = zone->zone_start_pfn;
470 : sp = zone->spanned_pages;
471 : if (!zone_spans_pfn(zone, pfn))
472 : ret = 1;
473 : } while (zone_span_seqretry(zone, seq));
474 :
475 : if (ret)
476 : pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
477 : pfn, zone_to_nid(zone), zone->name,
478 : start_pfn, start_pfn + sp);
479 :
480 : return ret;
481 : }
482 :
483 : /*
484 : * Temporary debugging check for pages not lying within a given zone.
485 : */
486 : static int __maybe_unused bad_range(struct zone *zone, struct page *page)
487 : {
488 : if (page_outside_zone_boundaries(zone, page))
489 : return 1;
490 : if (zone != page_zone(page))
491 : return 1;
492 :
493 : return 0;
494 : }
495 : #else
496 : static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
497 : {
498 : return 0;
499 : }
500 : #endif
501 :
502 0 : static void bad_page(struct page *page, const char *reason)
503 : {
504 : static unsigned long resume;
505 : static unsigned long nr_shown;
506 : static unsigned long nr_unshown;
507 :
508 : /*
509 : * Allow a burst of 60 reports, then keep quiet for that minute;
510 : * or allow a steady drip of one report per second.
511 : */
512 0 : if (nr_shown == 60) {
513 0 : if (time_before(jiffies, resume)) {
514 0 : nr_unshown++;
515 0 : goto out;
516 : }
517 0 : if (nr_unshown) {
518 0 : pr_alert(
519 : "BUG: Bad page state: %lu messages suppressed\n",
520 : nr_unshown);
521 0 : nr_unshown = 0;
522 : }
523 0 : nr_shown = 0;
524 : }
525 0 : if (nr_shown++ == 0)
526 0 : resume = jiffies + 60 * HZ;
527 :
528 0 : pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
529 : current->comm, page_to_pfn(page));
530 0 : dump_page(page, reason);
531 :
532 : print_modules();
533 0 : dump_stack();
534 : out:
535 : /* Leave bad fields for debug, except PageBuddy could make trouble */
536 0 : page_mapcount_reset(page); /* remove PageBuddy */
537 0 : add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
538 0 : }
539 :
540 : static inline unsigned int order_to_pindex(int migratetype, int order)
541 : {
542 457 : int base = order;
543 :
544 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
545 : if (order > PAGE_ALLOC_COSTLY_ORDER) {
546 : VM_BUG_ON(order != pageblock_order);
547 : return NR_LOWORDER_PCP_LISTS;
548 : }
549 : #else
550 : VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
551 : #endif
552 :
553 457 : return (MIGRATE_PCPTYPES * base) + migratetype;
554 : }
555 :
556 : static inline int pindex_to_order(unsigned int pindex)
557 : {
558 0 : int order = pindex / MIGRATE_PCPTYPES;
559 :
560 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
561 : if (pindex == NR_LOWORDER_PCP_LISTS)
562 : order = pageblock_order;
563 : #else
564 : VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
565 : #endif
566 :
567 : return order;
568 : }
569 :
570 : static inline bool pcp_allowed_order(unsigned int order)
571 : {
572 441 : if (order <= PAGE_ALLOC_COSTLY_ORDER)
573 : return true;
574 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
575 : if (order == pageblock_order)
576 : return true;
577 : #endif
578 : return false;
579 : }
580 :
581 0 : static inline void free_the_page(struct page *page, unsigned int order)
582 : {
583 0 : if (pcp_allowed_order(order)) /* Via pcp? */
584 0 : free_unref_page(page, order);
585 : else
586 0 : __free_pages_ok(page, order, FPI_NONE);
587 0 : }
588 :
589 : /*
590 : * Higher-order pages are called "compound pages". They are structured thusly:
591 : *
592 : * The first PAGE_SIZE page is called the "head page" and have PG_head set.
593 : *
594 : * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
595 : * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
596 : *
597 : * The first tail page's ->compound_dtor holds the offset in array of compound
598 : * page destructors. See compound_page_dtors.
599 : *
600 : * The first tail page's ->compound_order holds the order of allocation.
601 : * This usage means that zero-order pages may not be compound.
602 : */
603 :
604 0 : void free_compound_page(struct page *page)
605 : {
606 0 : mem_cgroup_uncharge(page_folio(page));
607 0 : free_the_page(page, compound_order(page));
608 0 : }
609 :
610 96 : void prep_compound_page(struct page *page, unsigned int order)
611 : {
612 : int i;
613 96 : int nr_pages = 1 << order;
614 :
615 96 : __SetPageHead(page);
616 238 : for (i = 1; i < nr_pages; i++)
617 142 : prep_compound_tail(page, i);
618 :
619 96 : prep_compound_head(page, order);
620 96 : }
621 :
622 0 : void destroy_large_folio(struct folio *folio)
623 : {
624 0 : enum compound_dtor_id dtor = folio->_folio_dtor;
625 :
626 : VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
627 0 : compound_page_dtors[dtor](&folio->page);
628 0 : }
629 :
630 : static inline void set_buddy_order(struct page *page, unsigned int order)
631 : {
632 1886 : set_page_private(page, order);
633 943 : __SetPageBuddy(page);
634 : }
635 :
636 : #ifdef CONFIG_COMPACTION
637 259 : static inline struct capture_control *task_capc(struct zone *zone)
638 : {
639 259 : struct capture_control *capc = current->capture_control;
640 :
641 259 : return unlikely(capc) &&
642 0 : !(current->flags & PF_KTHREAD) &&
643 0 : !capc->page &&
644 518 : capc->cc->zone == zone ? capc : NULL;
645 : }
646 :
647 : static inline bool
648 : compaction_capture(struct capture_control *capc, struct page *page,
649 : int order, int migratetype)
650 : {
651 11 : if (!capc || order != capc->cc->order)
652 : return false;
653 :
654 : /* Do not accidentally pollute CMA or isolated regions*/
655 : if (is_migrate_cma(migratetype) ||
656 0 : is_migrate_isolate(migratetype))
657 : return false;
658 :
659 : /*
660 : * Do not let lower order allocations pollute a movable pageblock.
661 : * This might let an unmovable request use a reclaimable pageblock
662 : * and vice-versa but no more than normal fallback logic which can
663 : * have trouble finding a high-order free page.
664 : */
665 0 : if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
666 : return false;
667 :
668 0 : capc->page = page;
669 : return true;
670 : }
671 :
672 : #else
673 : static inline struct capture_control *task_capc(struct zone *zone)
674 : {
675 : return NULL;
676 : }
677 :
678 : static inline bool
679 : compaction_capture(struct capture_control *capc, struct page *page,
680 : int order, int migratetype)
681 : {
682 : return false;
683 : }
684 : #endif /* CONFIG_COMPACTION */
685 :
686 : /* Used for pages not on another list */
687 : static inline void add_to_free_list(struct page *page, struct zone *zone,
688 : unsigned int order, int migratetype)
689 : {
690 684 : struct free_area *area = &zone->free_area[order];
691 :
692 1368 : list_add(&page->buddy_list, &area->free_list[migratetype]);
693 684 : area->nr_free++;
694 : }
695 :
696 : /* Used for pages not on another list */
697 : static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
698 : unsigned int order, int migratetype)
699 : {
700 259 : struct free_area *area = &zone->free_area[order];
701 :
702 518 : list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
703 259 : area->nr_free++;
704 : }
705 :
706 : /*
707 : * Used for pages which are on another list. Move the pages to the tail
708 : * of the list - so the moved pages won't immediately be considered for
709 : * allocation again (e.g., optimization for memory onlining).
710 : */
711 : static inline void move_to_free_list(struct page *page, struct zone *zone,
712 : unsigned int order, int migratetype)
713 : {
714 2 : struct free_area *area = &zone->free_area[order];
715 :
716 4 : list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
717 : }
718 :
719 : static inline void del_page_from_free_list(struct page *page, struct zone *zone,
720 : unsigned int order)
721 : {
722 : /* clear reported state and update reported page count */
723 : if (page_reported(page))
724 : __ClearPageReported(page);
725 :
726 1354 : list_del(&page->buddy_list);
727 677 : __ClearPageBuddy(page);
728 1354 : set_page_private(page, 0);
729 677 : zone->free_area[order].nr_free--;
730 : }
731 :
732 : static inline struct page *get_page_from_free_area(struct free_area *area,
733 : int migratetype)
734 : {
735 1384 : return list_first_entry_or_null(&area->free_list[migratetype],
736 : struct page, buddy_list);
737 : }
738 :
739 : /*
740 : * If this is not the largest possible page, check if the buddy
741 : * of the next-highest order is free. If it is, it's possible
742 : * that pages are being freed that will coalesce soon. In case,
743 : * that is happening, add the free page to the tail of the list
744 : * so it's less likely to be used soon and more likely to be merged
745 : * as a higher order page
746 : */
747 : static inline bool
748 0 : buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
749 : struct page *page, unsigned int order)
750 : {
751 : unsigned long higher_page_pfn;
752 : struct page *higher_page;
753 :
754 0 : if (order >= MAX_ORDER - 1)
755 : return false;
756 :
757 0 : higher_page_pfn = buddy_pfn & pfn;
758 0 : higher_page = page + (higher_page_pfn - pfn);
759 :
760 0 : return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
761 0 : NULL) != NULL;
762 : }
763 :
764 : /*
765 : * Freeing function for a buddy system allocator.
766 : *
767 : * The concept of a buddy system is to maintain direct-mapped table
768 : * (containing bit values) for memory blocks of various "orders".
769 : * The bottom level table contains the map for the smallest allocatable
770 : * units of memory (here, pages), and each level above it describes
771 : * pairs of units from the levels below, hence, "buddies".
772 : * At a high level, all that happens here is marking the table entry
773 : * at the bottom level available, and propagating the changes upward
774 : * as necessary, plus some accounting needed to play nicely with other
775 : * parts of the VM system.
776 : * At each level, we keep a list of pages, which are heads of continuous
777 : * free pages of length of (1 << order) and marked with PageBuddy.
778 : * Page's order is recorded in page_private(page) field.
779 : * So when we are allocating or freeing one, we can derive the state of the
780 : * other. That is, if we allocate a small block, and both were
781 : * free, the remainder of the region must be split into blocks.
782 : * If a block is freed, and its buddy is also free, then this
783 : * triggers coalescing into a block of larger size.
784 : *
785 : * -- nyc
786 : */
787 :
788 259 : static inline void __free_one_page(struct page *page,
789 : unsigned long pfn,
790 : struct zone *zone, unsigned int order,
791 : int migratetype, fpi_t fpi_flags)
792 : {
793 259 : struct capture_control *capc = task_capc(zone);
794 259 : unsigned long buddy_pfn = 0;
795 : unsigned long combined_pfn;
796 : struct page *buddy;
797 : bool to_tail;
798 :
799 : VM_BUG_ON(!zone_is_initialized(zone));
800 : VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
801 :
802 : VM_BUG_ON(migratetype == -1);
803 259 : if (likely(!is_migrate_isolate(migratetype)))
804 259 : __mod_zone_freepage_state(zone, 1 << order, migratetype);
805 :
806 : VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
807 : VM_BUG_ON_PAGE(bad_range(zone, page), page);
808 :
809 259 : while (order < MAX_ORDER) {
810 22 : if (compaction_capture(capc, page, order, migratetype)) {
811 0 : __mod_zone_freepage_state(zone, -(1 << order),
812 : migratetype);
813 0 : return;
814 : }
815 :
816 11 : buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
817 11 : if (!buddy)
818 : goto done_merging;
819 :
820 : if (unlikely(order >= pageblock_order)) {
821 : /*
822 : * We want to prevent merge between freepages on pageblock
823 : * without fallbacks and normal pageblock. Without this,
824 : * pageblock isolation could cause incorrect freepage or CMA
825 : * accounting or HIGHATOMIC accounting.
826 : */
827 : int buddy_mt = get_pageblock_migratetype(buddy);
828 :
829 : if (migratetype != buddy_mt
830 : && (!migratetype_is_mergeable(migratetype) ||
831 : !migratetype_is_mergeable(buddy_mt)))
832 : goto done_merging;
833 : }
834 :
835 : /*
836 : * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
837 : * merge with it and move up one order.
838 : */
839 : if (page_is_guard(buddy))
840 : clear_page_guard(zone, buddy, order, migratetype);
841 : else
842 : del_page_from_free_list(buddy, zone, order);
843 0 : combined_pfn = buddy_pfn & pfn;
844 0 : page = page + (combined_pfn - pfn);
845 0 : pfn = combined_pfn;
846 0 : order++;
847 : }
848 :
849 : done_merging:
850 259 : set_buddy_order(page, order);
851 :
852 259 : if (fpi_flags & FPI_TO_TAIL)
853 : to_tail = true;
854 0 : else if (is_shuffle_order(order))
855 : to_tail = shuffle_pick_tail();
856 : else
857 0 : to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
858 :
859 259 : if (to_tail)
860 : add_to_free_list_tail(page, zone, order, migratetype);
861 : else
862 : add_to_free_list(page, zone, order, migratetype);
863 :
864 : /* Notify page reporting subsystem of freed page */
865 : if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
866 : page_reporting_notify_free(order);
867 : }
868 :
869 : /**
870 : * split_free_page() -- split a free page at split_pfn_offset
871 : * @free_page: the original free page
872 : * @order: the order of the page
873 : * @split_pfn_offset: split offset within the page
874 : *
875 : * Return -ENOENT if the free page is changed, otherwise 0
876 : *
877 : * It is used when the free page crosses two pageblocks with different migratetypes
878 : * at split_pfn_offset within the page. The split free page will be put into
879 : * separate migratetype lists afterwards. Otherwise, the function achieves
880 : * nothing.
881 : */
882 0 : int split_free_page(struct page *free_page,
883 : unsigned int order, unsigned long split_pfn_offset)
884 : {
885 0 : struct zone *zone = page_zone(free_page);
886 0 : unsigned long free_page_pfn = page_to_pfn(free_page);
887 : unsigned long pfn;
888 : unsigned long flags;
889 : int free_page_order;
890 : int mt;
891 0 : int ret = 0;
892 :
893 0 : if (split_pfn_offset == 0)
894 : return ret;
895 :
896 0 : spin_lock_irqsave(&zone->lock, flags);
897 :
898 0 : if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
899 : ret = -ENOENT;
900 : goto out;
901 : }
902 :
903 0 : mt = get_pageblock_migratetype(free_page);
904 0 : if (likely(!is_migrate_isolate(mt)))
905 0 : __mod_zone_freepage_state(zone, -(1UL << order), mt);
906 :
907 0 : del_page_from_free_list(free_page, zone, order);
908 0 : for (pfn = free_page_pfn;
909 0 : pfn < free_page_pfn + (1UL << order);) {
910 0 : int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
911 :
912 0 : free_page_order = min_t(unsigned int,
913 : pfn ? __ffs(pfn) : order,
914 : __fls(split_pfn_offset));
915 0 : __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
916 : mt, FPI_NONE);
917 0 : pfn += 1UL << free_page_order;
918 0 : split_pfn_offset -= (1UL << free_page_order);
919 : /* we have done the first part, now switch to second part */
920 0 : if (split_pfn_offset == 0)
921 0 : split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
922 : }
923 : out:
924 0 : spin_unlock_irqrestore(&zone->lock, flags);
925 0 : return ret;
926 : }
927 : /*
928 : * A bad page could be due to a number of fields. Instead of multiple branches,
929 : * try and check multiple fields with one check. The caller must do a detailed
930 : * check if necessary.
931 : */
932 : static inline bool page_expected_state(struct page *page,
933 : unsigned long check_flags)
934 : {
935 0 : if (unlikely(atomic_read(&page->_mapcount) != -1))
936 : return false;
937 :
938 0 : if (unlikely((unsigned long)page->mapping |
939 : page_ref_count(page) |
940 : #ifdef CONFIG_MEMCG
941 : page->memcg_data |
942 : #endif
943 : (page->flags & check_flags)))
944 : return false;
945 :
946 : return true;
947 : }
948 :
949 : static const char *page_bad_reason(struct page *page, unsigned long flags)
950 : {
951 0 : const char *bad_reason = NULL;
952 :
953 0 : if (unlikely(atomic_read(&page->_mapcount) != -1))
954 0 : bad_reason = "nonzero mapcount";
955 0 : if (unlikely(page->mapping != NULL))
956 0 : bad_reason = "non-NULL mapping";
957 0 : if (unlikely(page_ref_count(page) != 0))
958 0 : bad_reason = "nonzero _refcount";
959 0 : if (unlikely(page->flags & flags)) {
960 : if (flags == PAGE_FLAGS_CHECK_AT_PREP)
961 : bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
962 : else
963 0 : bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
964 : }
965 : #ifdef CONFIG_MEMCG
966 : if (unlikely(page->memcg_data))
967 : bad_reason = "page still charged to cgroup";
968 : #endif
969 : return bad_reason;
970 : }
971 :
972 0 : static void free_page_is_bad_report(struct page *page)
973 : {
974 0 : bad_page(page,
975 : page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
976 0 : }
977 :
978 0 : static inline bool free_page_is_bad(struct page *page)
979 : {
980 0 : if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
981 : return false;
982 :
983 : /* Something has gone sideways, find it */
984 0 : free_page_is_bad_report(page);
985 0 : return true;
986 : }
987 :
988 0 : static inline bool is_check_pages_enabled(void)
989 : {
990 255250 : return static_branch_unlikely(&check_pages_enabled);
991 : }
992 :
993 0 : static int free_tail_page_prepare(struct page *head_page, struct page *page)
994 : {
995 0 : struct folio *folio = (struct folio *)head_page;
996 0 : int ret = 1;
997 :
998 : /*
999 : * We rely page->lru.next never has bit 0 set, unless the page
1000 : * is PageTail(). Let's make sure that's true even for poisoned ->lru.
1001 : */
1002 : BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
1003 :
1004 0 : if (!is_check_pages_enabled()) {
1005 : ret = 0;
1006 : goto out;
1007 : }
1008 0 : switch (page - head_page) {
1009 : case 1:
1010 : /* the first tail page: these may be in place of ->mapping */
1011 0 : if (unlikely(folio_entire_mapcount(folio))) {
1012 0 : bad_page(page, "nonzero entire_mapcount");
1013 0 : goto out;
1014 : }
1015 0 : if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
1016 0 : bad_page(page, "nonzero nr_pages_mapped");
1017 0 : goto out;
1018 : }
1019 0 : if (unlikely(atomic_read(&folio->_pincount))) {
1020 0 : bad_page(page, "nonzero pincount");
1021 0 : goto out;
1022 : }
1023 : break;
1024 : case 2:
1025 : /*
1026 : * the second tail page: ->mapping is
1027 : * deferred_list.next -- ignore value.
1028 : */
1029 : break;
1030 : default:
1031 0 : if (page->mapping != TAIL_MAPPING) {
1032 0 : bad_page(page, "corrupted mapping in tail page");
1033 0 : goto out;
1034 : }
1035 : break;
1036 : }
1037 0 : if (unlikely(!PageTail(page))) {
1038 0 : bad_page(page, "PageTail not set");
1039 0 : goto out;
1040 : }
1041 0 : if (unlikely(compound_head(page) != head_page)) {
1042 0 : bad_page(page, "compound_head not consistent");
1043 0 : goto out;
1044 : }
1045 : ret = 0;
1046 : out:
1047 0 : page->mapping = NULL;
1048 0 : clear_compound_head(page);
1049 0 : return ret;
1050 : }
1051 :
1052 : /*
1053 : * Skip KASAN memory poisoning when either:
1054 : *
1055 : * 1. For generic KASAN: deferred memory initialization has not yet completed.
1056 : * Tag-based KASAN modes skip pages freed via deferred memory initialization
1057 : * using page tags instead (see below).
1058 : * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
1059 : * that error detection is disabled for accesses via the page address.
1060 : *
1061 : * Pages will have match-all tags in the following circumstances:
1062 : *
1063 : * 1. Pages are being initialized for the first time, including during deferred
1064 : * memory init; see the call to page_kasan_tag_reset in __init_single_page.
1065 : * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
1066 : * exception of pages unpoisoned by kasan_unpoison_vmalloc.
1067 : * 3. The allocation was excluded from being checked due to sampling,
1068 : * see the call to kasan_unpoison_pages.
1069 : *
1070 : * Poisoning pages during deferred memory init will greatly lengthen the
1071 : * process and cause problem in large memory systems as the deferred pages
1072 : * initialization is done with interrupt disabled.
1073 : *
1074 : * Assuming that there will be no reference to those newly initialized
1075 : * pages before they are ever allocated, this should have no effect on
1076 : * KASAN memory tracking as the poison will be properly inserted at page
1077 : * allocation time. The only corner case is when pages are allocated by
1078 : * on-demand allocation and then freed again before the deferred pages
1079 : * initialization is done, but this is not likely to happen.
1080 : */
1081 : static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
1082 : {
1083 : if (IS_ENABLED(CONFIG_KASAN_GENERIC))
1084 : return deferred_pages_enabled();
1085 :
1086 259 : return page_kasan_tag(page) == 0xff;
1087 : }
1088 :
1089 0 : static void kernel_init_pages(struct page *page, int numpages)
1090 : {
1091 : int i;
1092 :
1093 : /* s390's use of memset() could override KASAN redzones. */
1094 : kasan_disable_current();
1095 74 : for (i = 0; i < numpages; i++)
1096 74 : clear_highpage_kasan_tagged(page + i);
1097 : kasan_enable_current();
1098 0 : }
1099 :
1100 : static __always_inline bool free_pages_prepare(struct page *page,
1101 : unsigned int order, fpi_t fpi_flags)
1102 : {
1103 259 : int bad = 0;
1104 518 : bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
1105 259 : bool init = want_init_on_free();
1106 :
1107 : VM_BUG_ON_PAGE(PageTail(page), page);
1108 :
1109 259 : trace_mm_page_free(page, order);
1110 259 : kmsan_free_page(page, order);
1111 :
1112 259 : if (unlikely(PageHWPoison(page)) && !order) {
1113 : /*
1114 : * Do not let hwpoison pages hit pcplists/buddy
1115 : * Untie memcg state and reset page's owner
1116 : */
1117 : if (memcg_kmem_online() && PageMemcgKmem(page))
1118 : __memcg_kmem_uncharge_page(page, order);
1119 : reset_page_owner(page, order);
1120 : page_table_check_free(page, order);
1121 : return false;
1122 : }
1123 :
1124 : /*
1125 : * Check tail pages before head page information is cleared to
1126 : * avoid checking PageCompound for order-0 pages.
1127 : */
1128 259 : if (unlikely(order)) {
1129 258 : bool compound = PageCompound(page);
1130 : int i;
1131 :
1132 : VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1133 :
1134 : if (compound)
1135 : ClearPageHasHWPoisoned(page);
1136 254486 : for (i = 1; i < (1 << order); i++) {
1137 254486 : if (compound)
1138 0 : bad += free_tail_page_prepare(page, page + i);
1139 254486 : if (is_check_pages_enabled()) {
1140 0 : if (free_page_is_bad(page + i)) {
1141 0 : bad++;
1142 0 : continue;
1143 : }
1144 : }
1145 254486 : (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1146 : }
1147 : }
1148 259 : if (PageMappingFlags(page))
1149 0 : page->mapping = NULL;
1150 : if (memcg_kmem_online() && PageMemcgKmem(page))
1151 : __memcg_kmem_uncharge_page(page, order);
1152 259 : if (is_check_pages_enabled()) {
1153 0 : if (free_page_is_bad(page))
1154 0 : bad++;
1155 0 : if (bad)
1156 : return false;
1157 : }
1158 :
1159 259 : page_cpupid_reset_last(page);
1160 259 : page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1161 : reset_page_owner(page, order);
1162 259 : page_table_check_free(page, order);
1163 :
1164 259 : if (!PageHighMem(page)) {
1165 : debug_check_no_locks_freed(page_address(page),
1166 : PAGE_SIZE << order);
1167 : debug_check_no_obj_freed(page_address(page),
1168 : PAGE_SIZE << order);
1169 : }
1170 :
1171 259 : kernel_poison_pages(page, 1 << order);
1172 :
1173 : /*
1174 : * As memory initialization might be integrated into KASAN,
1175 : * KASAN poisoning and memory initialization code must be
1176 : * kept together to avoid discrepancies in behavior.
1177 : *
1178 : * With hardware tag-based KASAN, memory tags must be set before the
1179 : * page becomes unavailable via debug_pagealloc or arch_free_page.
1180 : */
1181 : if (!skip_kasan_poison) {
1182 : kasan_poison_pages(page, order, init);
1183 :
1184 : /* Memory is already initialized if KASAN did it internally. */
1185 : if (kasan_has_integrated_init())
1186 : init = false;
1187 : }
1188 259 : if (init)
1189 0 : kernel_init_pages(page, 1 << order);
1190 :
1191 : /*
1192 : * arch_free_page() can make the page's contents inaccessible. s390
1193 : * does this. So nothing which can access the page's contents should
1194 : * happen after this.
1195 : */
1196 : arch_free_page(page, order);
1197 :
1198 : debug_pagealloc_unmap_pages(page, 1 << order);
1199 :
1200 : return true;
1201 : }
1202 :
1203 : /*
1204 : * Frees a number of pages from the PCP lists
1205 : * Assumes all pages on list are in same zone.
1206 : * count is the number of pages to free.
1207 : */
1208 0 : static void free_pcppages_bulk(struct zone *zone, int count,
1209 : struct per_cpu_pages *pcp,
1210 : int pindex)
1211 : {
1212 : unsigned long flags;
1213 0 : int min_pindex = 0;
1214 0 : int max_pindex = NR_PCP_LISTS - 1;
1215 : unsigned int order;
1216 : bool isolated_pageblocks;
1217 : struct page *page;
1218 :
1219 : /*
1220 : * Ensure proper count is passed which otherwise would stuck in the
1221 : * below while (list_empty(list)) loop.
1222 : */
1223 0 : count = min(pcp->count, count);
1224 :
1225 : /* Ensure requested pindex is drained first. */
1226 0 : pindex = pindex - 1;
1227 :
1228 0 : spin_lock_irqsave(&zone->lock, flags);
1229 0 : isolated_pageblocks = has_isolate_pageblock(zone);
1230 :
1231 0 : while (count > 0) {
1232 : struct list_head *list;
1233 : int nr_pages;
1234 :
1235 : /* Remove pages from lists in a round-robin fashion. */
1236 : do {
1237 0 : if (++pindex > max_pindex)
1238 0 : pindex = min_pindex;
1239 0 : list = &pcp->lists[pindex];
1240 0 : if (!list_empty(list))
1241 : break;
1242 :
1243 0 : if (pindex == max_pindex)
1244 0 : max_pindex--;
1245 0 : if (pindex == min_pindex)
1246 0 : min_pindex++;
1247 : } while (1);
1248 :
1249 0 : order = pindex_to_order(pindex);
1250 0 : nr_pages = 1 << order;
1251 : do {
1252 : int mt;
1253 :
1254 0 : page = list_last_entry(list, struct page, pcp_list);
1255 0 : mt = get_pcppage_migratetype(page);
1256 :
1257 : /* must delete to avoid corrupting pcp list */
1258 0 : list_del(&page->pcp_list);
1259 0 : count -= nr_pages;
1260 0 : pcp->count -= nr_pages;
1261 :
1262 : /* MIGRATE_ISOLATE page should not go to pcplists */
1263 : VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1264 : /* Pageblock could have been isolated meanwhile */
1265 : if (unlikely(isolated_pageblocks))
1266 : mt = get_pageblock_migratetype(page);
1267 :
1268 0 : __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
1269 0 : trace_mm_page_pcpu_drain(page, order, mt);
1270 0 : } while (count > 0 && !list_empty(list));
1271 : }
1272 :
1273 0 : spin_unlock_irqrestore(&zone->lock, flags);
1274 0 : }
1275 :
1276 0 : static void free_one_page(struct zone *zone,
1277 : struct page *page, unsigned long pfn,
1278 : unsigned int order,
1279 : int migratetype, fpi_t fpi_flags)
1280 : {
1281 : unsigned long flags;
1282 :
1283 0 : spin_lock_irqsave(&zone->lock, flags);
1284 0 : if (unlikely(has_isolate_pageblock(zone) ||
1285 : is_migrate_isolate(migratetype))) {
1286 : migratetype = get_pfnblock_migratetype(page, pfn);
1287 : }
1288 0 : __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
1289 0 : spin_unlock_irqrestore(&zone->lock, flags);
1290 0 : }
1291 :
1292 259 : static void __free_pages_ok(struct page *page, unsigned int order,
1293 : fpi_t fpi_flags)
1294 : {
1295 : unsigned long flags;
1296 : int migratetype;
1297 259 : unsigned long pfn = page_to_pfn(page);
1298 259 : struct zone *zone = page_zone(page);
1299 :
1300 259 : if (!free_pages_prepare(page, order, fpi_flags))
1301 : return;
1302 :
1303 : /*
1304 : * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here
1305 : * is used to avoid calling get_pfnblock_migratetype() under the lock.
1306 : * This will reduce the lock holding time.
1307 : */
1308 259 : migratetype = get_pfnblock_migratetype(page, pfn);
1309 :
1310 259 : spin_lock_irqsave(&zone->lock, flags);
1311 : if (unlikely(has_isolate_pageblock(zone) ||
1312 : is_migrate_isolate(migratetype))) {
1313 : migratetype = get_pfnblock_migratetype(page, pfn);
1314 : }
1315 259 : __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
1316 518 : spin_unlock_irqrestore(&zone->lock, flags);
1317 :
1318 259 : __count_vm_events(PGFREE, 1 << order);
1319 : }
1320 :
1321 259 : void __free_pages_core(struct page *page, unsigned int order)
1322 : {
1323 259 : unsigned int nr_pages = 1 << order;
1324 259 : struct page *p = page;
1325 : unsigned int loop;
1326 :
1327 : /*
1328 : * When initializing the memmap, __init_single_page() sets the refcount
1329 : * of all pages to 1 ("allocated"/"not free"). We have to set the
1330 : * refcount of all involved pages to 0.
1331 : */
1332 259 : prefetchw(p);
1333 254745 : for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1334 254486 : prefetchw(p + 1);
1335 254486 : __ClearPageReserved(p);
1336 254486 : set_page_count(p, 0);
1337 : }
1338 259 : __ClearPageReserved(p);
1339 259 : set_page_count(p, 0);
1340 :
1341 518 : atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1342 :
1343 259 : if (page_contains_unaccepted(page, order)) {
1344 : if (order == MAX_ORDER && __free_unaccepted(page))
1345 : return;
1346 :
1347 : accept_page(page, order);
1348 : }
1349 :
1350 : /*
1351 : * Bypass PCP and place fresh pages right to the tail, primarily
1352 : * relevant for memory onlining.
1353 : */
1354 259 : __free_pages_ok(page, order, FPI_TO_TAIL);
1355 : }
1356 :
1357 : /*
1358 : * Check that the whole (or subset of) a pageblock given by the interval of
1359 : * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1360 : * with the migration of free compaction scanner.
1361 : *
1362 : * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1363 : *
1364 : * It's possible on some configurations to have a setup like node0 node1 node0
1365 : * i.e. it's possible that all pages within a zones range of pages do not
1366 : * belong to a single zone. We assume that a border between node0 and node1
1367 : * can occur within a single pageblock, but not a node0 node1 node0
1368 : * interleaving within a single pageblock. It is therefore sufficient to check
1369 : * the first and last page of a pageblock and avoid checking each individual
1370 : * page in a pageblock.
1371 : *
1372 : * Note: the function may return non-NULL struct page even for a page block
1373 : * which contains a memory hole (i.e. there is no physical memory for a subset
1374 : * of the pfn range). For example, if the pageblock order is MAX_ORDER, which
1375 : * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
1376 : * even though the start pfn is online and valid. This should be safe most of
1377 : * the time because struct pages are still initialized via init_unavailable_range()
1378 : * and pfn walkers shouldn't touch any physical memory range for which they do
1379 : * not recognize any specific metadata in struct pages.
1380 : */
1381 260 : struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1382 : unsigned long end_pfn, struct zone *zone)
1383 : {
1384 : struct page *start_page;
1385 : struct page *end_page;
1386 :
1387 : /* end_pfn is one past the range we are checking */
1388 260 : end_pfn--;
1389 :
1390 260 : if (!pfn_valid(end_pfn))
1391 : return NULL;
1392 :
1393 520 : start_page = pfn_to_online_page(start_pfn);
1394 260 : if (!start_page)
1395 : return NULL;
1396 :
1397 260 : if (page_zone(start_page) != zone)
1398 : return NULL;
1399 :
1400 260 : end_page = pfn_to_page(end_pfn);
1401 :
1402 : /* This gives a shorter code than deriving page_zone(end_page) */
1403 780 : if (page_zone_id(start_page) != page_zone_id(end_page))
1404 : return NULL;
1405 :
1406 260 : return start_page;
1407 : }
1408 :
1409 : /*
1410 : * The order of subdivision here is critical for the IO subsystem.
1411 : * Please do not alter this order without good reasons and regression
1412 : * testing. Specifically, as large blocks of memory are subdivided,
1413 : * the order in which smaller blocks are delivered depends on the order
1414 : * they're subdivided in this function. This is the primary factor
1415 : * influencing the order in which pages are delivered to the IO
1416 : * subsystem according to empirical testing, and this is also justified
1417 : * by considering the behavior of a buddy system containing a single
1418 : * large block of memory acted on by a series of small allocations.
1419 : * This behavior is a critical factor in sglist merging's success.
1420 : *
1421 : * -- nyc
1422 : */
1423 : static inline void expand(struct zone *zone, struct page *page,
1424 : int low, int high, int migratetype)
1425 : {
1426 677 : unsigned long size = 1 << high;
1427 :
1428 1361 : while (high > low) {
1429 684 : high--;
1430 684 : size >>= 1;
1431 : VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1432 :
1433 : /*
1434 : * Mark as guard pages (or page), that will allow to
1435 : * merge back to allocator when buddy will be freed.
1436 : * Corresponding page table entries will not be touched,
1437 : * pages will stay not present in virtual address space
1438 : */
1439 684 : if (set_page_guard(zone, &page[size], high, migratetype))
1440 : continue;
1441 :
1442 1368 : add_to_free_list(&page[size], zone, high, migratetype);
1443 684 : set_buddy_order(&page[size], high);
1444 : }
1445 : }
1446 :
1447 0 : static void check_new_page_bad(struct page *page)
1448 : {
1449 : if (unlikely(page->flags & __PG_HWPOISON)) {
1450 : /* Don't complain about hwpoisoned pages */
1451 : page_mapcount_reset(page); /* remove PageBuddy */
1452 : return;
1453 : }
1454 :
1455 0 : bad_page(page,
1456 : page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
1457 : }
1458 :
1459 : /*
1460 : * This page is about to be returned from the page allocator
1461 : */
1462 0 : static int check_new_page(struct page *page)
1463 : {
1464 0 : if (likely(page_expected_state(page,
1465 : PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
1466 : return 0;
1467 :
1468 0 : check_new_page_bad(page);
1469 0 : return 1;
1470 : }
1471 :
1472 505 : static inline bool check_new_pages(struct page *page, unsigned int order)
1473 : {
1474 505 : if (is_check_pages_enabled()) {
1475 0 : for (int i = 0; i < (1 << order); i++) {
1476 0 : struct page *p = page + i;
1477 :
1478 0 : if (check_new_page(p))
1479 : return true;
1480 : }
1481 : }
1482 :
1483 : return false;
1484 : }
1485 :
1486 : static inline bool should_skip_kasan_unpoison(gfp_t flags)
1487 : {
1488 : /* Don't skip if a software KASAN mode is enabled. */
1489 : if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
1490 : IS_ENABLED(CONFIG_KASAN_SW_TAGS))
1491 : return false;
1492 :
1493 : /* Skip, if hardware tag-based KASAN is not enabled. */
1494 : if (!kasan_hw_tags_enabled())
1495 : return true;
1496 :
1497 : /*
1498 : * With hardware tag-based KASAN enabled, skip if this has been
1499 : * requested via __GFP_SKIP_KASAN.
1500 : */
1501 : return flags & __GFP_SKIP_KASAN;
1502 : }
1503 :
1504 : static inline bool should_skip_init(gfp_t flags)
1505 : {
1506 : /* Don't skip, if hardware tag-based KASAN is not enabled. */
1507 : if (!kasan_hw_tags_enabled())
1508 : return false;
1509 :
1510 : /* For hardware tag-based KASAN, skip if requested. */
1511 : return (flags & __GFP_SKIP_ZERO);
1512 : }
1513 :
1514 505 : inline void post_alloc_hook(struct page *page, unsigned int order,
1515 : gfp_t gfp_flags)
1516 : {
1517 1010 : bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
1518 : !should_skip_init(gfp_flags);
1519 505 : bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
1520 : int i;
1521 :
1522 1010 : set_page_private(page, 0);
1523 505 : set_page_refcounted(page);
1524 :
1525 505 : arch_alloc_page(page, order);
1526 505 : debug_pagealloc_map_pages(page, 1 << order);
1527 :
1528 : /*
1529 : * Page unpoisoning must happen before memory initialization.
1530 : * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
1531 : * allocations and the page unpoisoning code will complain.
1532 : */
1533 505 : kernel_unpoison_pages(page, 1 << order);
1534 :
1535 : /*
1536 : * As memory initialization might be integrated into KASAN,
1537 : * KASAN unpoisoning and memory initializion code must be
1538 : * kept together to avoid discrepancies in behavior.
1539 : */
1540 :
1541 : /*
1542 : * If memory tags should be zeroed
1543 : * (which happens only when memory should be initialized as well).
1544 : */
1545 505 : if (zero_tags) {
1546 : /* Initialize both memory and memory tags. */
1547 : for (i = 0; i != 1 << order; ++i)
1548 : tag_clear_highpage(page + i);
1549 :
1550 : /* Take note that memory was initialized by the loop above. */
1551 : init = false;
1552 : }
1553 505 : if (!should_skip_kasan_unpoison(gfp_flags) &&
1554 : kasan_unpoison_pages(page, order, init)) {
1555 : /* Take note that memory was initialized by KASAN. */
1556 : if (kasan_has_integrated_init())
1557 : init = false;
1558 : } else {
1559 : /*
1560 : * If memory tags have not been set by KASAN, reset the page
1561 : * tags to ensure page_address() dereferencing does not fault.
1562 : */
1563 505 : for (i = 0; i != 1 << order; ++i)
1564 : page_kasan_tag_reset(page + i);
1565 : }
1566 : /* If memory is still not initialized, initialize it now. */
1567 505 : if (init)
1568 : kernel_init_pages(page, 1 << order);
1569 :
1570 505 : set_page_owner(page, order, gfp_flags);
1571 505 : page_table_check_alloc(page, order);
1572 505 : }
1573 :
1574 441 : static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1575 : unsigned int alloc_flags)
1576 : {
1577 505 : post_alloc_hook(page, order, gfp_flags);
1578 :
1579 441 : if (order && (gfp_flags & __GFP_COMP))
1580 96 : prep_compound_page(page, order);
1581 :
1582 : /*
1583 : * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1584 : * allocate the page. The expectation is that the caller is taking
1585 : * steps that will free more memory. The caller should avoid the page
1586 : * being used for !PFMEMALLOC purposes.
1587 : */
1588 441 : if (alloc_flags & ALLOC_NO_WATERMARKS)
1589 0 : set_page_pfmemalloc(page);
1590 : else
1591 505 : clear_page_pfmemalloc(page);
1592 441 : }
1593 :
1594 : /*
1595 : * Go through the free lists for the given migratetype and remove
1596 : * the smallest available page from the freelists
1597 : */
1598 : static __always_inline
1599 : struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1600 : int migratetype)
1601 : {
1602 : unsigned int current_order;
1603 : struct free_area *area;
1604 : struct page *page;
1605 :
1606 : /* Find a page of the appropriate size in the preferred list */
1607 2768 : for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
1608 1382 : area = &(zone->free_area[current_order]);
1609 1382 : page = get_page_from_free_area(area, migratetype);
1610 1382 : if (!page)
1611 705 : continue;
1612 677 : del_page_from_free_list(page, zone, current_order);
1613 1354 : expand(zone, page, order, current_order, migratetype);
1614 677 : set_pcppage_migratetype(page, migratetype);
1615 : trace_mm_page_alloc_zone_locked(page, order, migratetype,
1616 : pcp_allowed_order(order) &&
1617 : migratetype < MIGRATE_PCPTYPES);
1618 : return page;
1619 : }
1620 :
1621 : return NULL;
1622 : }
1623 :
1624 :
1625 : /*
1626 : * This array describes the order lists are fallen back to when
1627 : * the free lists for the desirable migrate type are depleted
1628 : *
1629 : * The other migratetypes do not have fallbacks.
1630 : */
1631 : static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
1632 : [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE },
1633 : [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
1634 : [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE },
1635 : };
1636 :
1637 : #ifdef CONFIG_CMA
1638 : static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1639 : unsigned int order)
1640 : {
1641 : return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1642 : }
1643 : #else
1644 : static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1645 : unsigned int order) { return NULL; }
1646 : #endif
1647 :
1648 : /*
1649 : * Move the free pages in a range to the freelist tail of the requested type.
1650 : * Note that start_page and end_pages are not aligned on a pageblock
1651 : * boundary. If alignment is required, use move_freepages_block()
1652 : */
1653 0 : static int move_freepages(struct zone *zone,
1654 : unsigned long start_pfn, unsigned long end_pfn,
1655 : int migratetype, int *num_movable)
1656 : {
1657 : struct page *page;
1658 : unsigned long pfn;
1659 : unsigned int order;
1660 0 : int pages_moved = 0;
1661 :
1662 0 : for (pfn = start_pfn; pfn <= end_pfn;) {
1663 0 : page = pfn_to_page(pfn);
1664 0 : if (!PageBuddy(page)) {
1665 : /*
1666 : * We assume that pages that could be isolated for
1667 : * migration are movable. But we don't actually try
1668 : * isolating, as that would be expensive.
1669 : */
1670 0 : if (num_movable &&
1671 0 : (PageLRU(page) || __PageMovable(page)))
1672 0 : (*num_movable)++;
1673 0 : pfn++;
1674 0 : continue;
1675 : }
1676 :
1677 : /* Make sure we are not inadvertently changing nodes */
1678 : VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
1679 : VM_BUG_ON_PAGE(page_zone(page) != zone, page);
1680 :
1681 0 : order = buddy_order(page);
1682 0 : move_to_free_list(page, zone, order, migratetype);
1683 0 : pfn += 1 << order;
1684 0 : pages_moved += 1 << order;
1685 : }
1686 :
1687 0 : return pages_moved;
1688 : }
1689 :
1690 0 : int move_freepages_block(struct zone *zone, struct page *page,
1691 : int migratetype, int *num_movable)
1692 : {
1693 : unsigned long start_pfn, end_pfn, pfn;
1694 :
1695 0 : if (num_movable)
1696 0 : *num_movable = 0;
1697 :
1698 0 : pfn = page_to_pfn(page);
1699 0 : start_pfn = pageblock_start_pfn(pfn);
1700 0 : end_pfn = pageblock_end_pfn(pfn) - 1;
1701 :
1702 : /* Do not cross zone boundaries */
1703 0 : if (!zone_spans_pfn(zone, start_pfn))
1704 0 : start_pfn = pfn;
1705 0 : if (!zone_spans_pfn(zone, end_pfn))
1706 : return 0;
1707 :
1708 0 : return move_freepages(zone, start_pfn, end_pfn, migratetype,
1709 : num_movable);
1710 : }
1711 :
1712 : static void change_pageblock_range(struct page *pageblock_page,
1713 : int start_order, int migratetype)
1714 : {
1715 2 : int nr_pageblocks = 1 << (start_order - pageblock_order);
1716 :
1717 4 : while (nr_pageblocks--) {
1718 2 : set_pageblock_migratetype(pageblock_page, migratetype);
1719 2 : pageblock_page += pageblock_nr_pages;
1720 : }
1721 : }
1722 :
1723 : /*
1724 : * When we are falling back to another migratetype during allocation, try to
1725 : * steal extra free pages from the same pageblocks to satisfy further
1726 : * allocations, instead of polluting multiple pageblocks.
1727 : *
1728 : * If we are stealing a relatively large buddy page, it is likely there will
1729 : * be more free pages in the pageblock, so try to steal them all. For
1730 : * reclaimable and unmovable allocations, we steal regardless of page size,
1731 : * as fragmentation caused by those allocations polluting movable pageblocks
1732 : * is worse than movable allocations stealing from unmovable and reclaimable
1733 : * pageblocks.
1734 : */
1735 : static bool can_steal_fallback(unsigned int order, int start_mt)
1736 : {
1737 : /*
1738 : * Leaving this order check is intended, although there is
1739 : * relaxed order check in next check. The reason is that
1740 : * we can actually steal whole pageblock if this condition met,
1741 : * but, below check doesn't guarantee it and that is just heuristic
1742 : * so could be changed anytime.
1743 : */
1744 2 : if (order >= pageblock_order)
1745 : return true;
1746 :
1747 0 : if (order >= pageblock_order / 2 ||
1748 0 : start_mt == MIGRATE_RECLAIMABLE ||
1749 0 : start_mt == MIGRATE_UNMOVABLE ||
1750 : page_group_by_mobility_disabled)
1751 : return true;
1752 :
1753 : return false;
1754 : }
1755 :
1756 0 : static inline bool boost_watermark(struct zone *zone)
1757 : {
1758 : unsigned long max_boost;
1759 :
1760 0 : if (!watermark_boost_factor)
1761 : return false;
1762 : /*
1763 : * Don't bother in zones that are unlikely to produce results.
1764 : * On small machines, including kdump capture kernels running
1765 : * in a small area, boosting the watermark can cause an out of
1766 : * memory situation immediately.
1767 : */
1768 0 : if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
1769 : return false;
1770 :
1771 0 : max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
1772 : watermark_boost_factor, 10000);
1773 :
1774 : /*
1775 : * high watermark may be uninitialised if fragmentation occurs
1776 : * very early in boot so do not boost. We do not fall
1777 : * through and boost by pageblock_nr_pages as failing
1778 : * allocations that early means that reclaim is not going
1779 : * to help and it may even be impossible to reclaim the
1780 : * boosted watermark resulting in a hang.
1781 : */
1782 0 : if (!max_boost)
1783 : return false;
1784 :
1785 0 : max_boost = max(pageblock_nr_pages, max_boost);
1786 :
1787 0 : zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
1788 : max_boost);
1789 :
1790 0 : return true;
1791 : }
1792 :
1793 : /*
1794 : * This function implements actual steal behaviour. If order is large enough,
1795 : * we can steal whole pageblock. If not, we first move freepages in this
1796 : * pageblock to our migratetype and determine how many already-allocated pages
1797 : * are there in the pageblock with a compatible migratetype. If at least half
1798 : * of pages are free or compatible, we can change migratetype of the pageblock
1799 : * itself, so pages freed in the future will be put on the correct free list.
1800 : */
1801 2 : static void steal_suitable_fallback(struct zone *zone, struct page *page,
1802 : unsigned int alloc_flags, int start_type, bool whole_block)
1803 : {
1804 4 : unsigned int current_order = buddy_order(page);
1805 : int free_pages, movable_pages, alike_pages;
1806 : int old_block_type;
1807 :
1808 4 : old_block_type = get_pageblock_migratetype(page);
1809 :
1810 : /*
1811 : * This can happen due to races and we want to prevent broken
1812 : * highatomic accounting.
1813 : */
1814 2 : if (is_migrate_highatomic(old_block_type))
1815 : goto single_page;
1816 :
1817 : /* Take ownership for orders >= pageblock_order */
1818 2 : if (current_order >= pageblock_order) {
1819 2 : change_pageblock_range(page, current_order, start_type);
1820 : goto single_page;
1821 : }
1822 :
1823 : /*
1824 : * Boost watermarks to increase reclaim pressure to reduce the
1825 : * likelihood of future fallbacks. Wake kswapd now as the node
1826 : * may be balanced overall and kswapd will not wake naturally.
1827 : */
1828 0 : if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
1829 0 : set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
1830 :
1831 : /* We are not allowed to try stealing from the whole block */
1832 0 : if (!whole_block)
1833 : goto single_page;
1834 :
1835 0 : free_pages = move_freepages_block(zone, page, start_type,
1836 : &movable_pages);
1837 : /*
1838 : * Determine how many pages are compatible with our allocation.
1839 : * For movable allocation, it's the number of movable pages which
1840 : * we just obtained. For other types it's a bit more tricky.
1841 : */
1842 0 : if (start_type == MIGRATE_MOVABLE) {
1843 0 : alike_pages = movable_pages;
1844 : } else {
1845 : /*
1846 : * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
1847 : * to MOVABLE pageblock, consider all non-movable pages as
1848 : * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
1849 : * vice versa, be conservative since we can't distinguish the
1850 : * exact migratetype of non-movable pages.
1851 : */
1852 0 : if (old_block_type == MIGRATE_MOVABLE)
1853 0 : alike_pages = pageblock_nr_pages
1854 0 : - (free_pages + movable_pages);
1855 : else
1856 : alike_pages = 0;
1857 : }
1858 :
1859 : /* moving whole block can fail due to zone boundary conditions */
1860 0 : if (!free_pages)
1861 : goto single_page;
1862 :
1863 : /*
1864 : * If a sufficient number of pages in the block are either free or of
1865 : * comparable migratability as our allocation, claim the whole block.
1866 : */
1867 0 : if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
1868 : page_group_by_mobility_disabled)
1869 0 : set_pageblock_migratetype(page, start_type);
1870 :
1871 0 : return;
1872 :
1873 : single_page:
1874 2 : move_to_free_list(page, zone, current_order, start_type);
1875 : }
1876 :
1877 : /*
1878 : * Check whether there is a suitable fallback freepage with requested order.
1879 : * If only_stealable is true, this function returns fallback_mt only if
1880 : * we can steal other freepages all together. This would help to reduce
1881 : * fragmentation due to mixed migratetype pages in one pageblock.
1882 : */
1883 2 : int find_suitable_fallback(struct free_area *area, unsigned int order,
1884 : int migratetype, bool only_stealable, bool *can_steal)
1885 : {
1886 : int i;
1887 : int fallback_mt;
1888 :
1889 2 : if (area->nr_free == 0)
1890 : return -1;
1891 :
1892 2 : *can_steal = false;
1893 4 : for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
1894 4 : fallback_mt = fallbacks[migratetype][i];
1895 4 : if (free_area_empty(area, fallback_mt))
1896 2 : continue;
1897 :
1898 2 : if (can_steal_fallback(order, migratetype))
1899 2 : *can_steal = true;
1900 :
1901 2 : if (!only_stealable)
1902 : return fallback_mt;
1903 :
1904 0 : if (*can_steal)
1905 : return fallback_mt;
1906 : }
1907 :
1908 : return -1;
1909 : }
1910 :
1911 : /*
1912 : * Reserve a pageblock for exclusive use of high-order atomic allocations if
1913 : * there are no empty page blocks that contain a page with a suitable order
1914 : */
1915 0 : static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
1916 : unsigned int alloc_order)
1917 : {
1918 : int mt;
1919 : unsigned long max_managed, flags;
1920 :
1921 : /*
1922 : * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
1923 : * Check is race-prone but harmless.
1924 : */
1925 0 : max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
1926 0 : if (zone->nr_reserved_highatomic >= max_managed)
1927 : return;
1928 :
1929 0 : spin_lock_irqsave(&zone->lock, flags);
1930 :
1931 : /* Recheck the nr_reserved_highatomic limit under the lock */
1932 0 : if (zone->nr_reserved_highatomic >= max_managed)
1933 : goto out_unlock;
1934 :
1935 : /* Yoink! */
1936 0 : mt = get_pageblock_migratetype(page);
1937 : /* Only reserve normal pageblocks (i.e., they can merge with others) */
1938 0 : if (migratetype_is_mergeable(mt)) {
1939 0 : zone->nr_reserved_highatomic += pageblock_nr_pages;
1940 0 : set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
1941 0 : move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
1942 : }
1943 :
1944 : out_unlock:
1945 0 : spin_unlock_irqrestore(&zone->lock, flags);
1946 : }
1947 :
1948 : /*
1949 : * Used when an allocation is about to fail under memory pressure. This
1950 : * potentially hurts the reliability of high-order allocations when under
1951 : * intense memory pressure but failed atomic allocations should be easier
1952 : * to recover from than an OOM.
1953 : *
1954 : * If @force is true, try to unreserve a pageblock even though highatomic
1955 : * pageblock is exhausted.
1956 : */
1957 0 : static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
1958 : bool force)
1959 : {
1960 0 : struct zonelist *zonelist = ac->zonelist;
1961 : unsigned long flags;
1962 : struct zoneref *z;
1963 : struct zone *zone;
1964 : struct page *page;
1965 : int order;
1966 : bool ret;
1967 :
1968 0 : for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
1969 : ac->nodemask) {
1970 : /*
1971 : * Preserve at least one pageblock unless memory pressure
1972 : * is really high.
1973 : */
1974 0 : if (!force && zone->nr_reserved_highatomic <=
1975 : pageblock_nr_pages)
1976 0 : continue;
1977 :
1978 0 : spin_lock_irqsave(&zone->lock, flags);
1979 0 : for (order = 0; order <= MAX_ORDER; order++) {
1980 0 : struct free_area *area = &(zone->free_area[order]);
1981 :
1982 0 : page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
1983 0 : if (!page)
1984 0 : continue;
1985 :
1986 : /*
1987 : * In page freeing path, migratetype change is racy so
1988 : * we can counter several free pages in a pageblock
1989 : * in this loop although we changed the pageblock type
1990 : * from highatomic to ac->migratetype. So we should
1991 : * adjust the count once.
1992 : */
1993 0 : if (is_migrate_highatomic_page(page)) {
1994 : /*
1995 : * It should never happen but changes to
1996 : * locking could inadvertently allow a per-cpu
1997 : * drain to add pages to MIGRATE_HIGHATOMIC
1998 : * while unreserving so be safe and watch for
1999 : * underflows.
2000 : */
2001 0 : zone->nr_reserved_highatomic -= min(
2002 : pageblock_nr_pages,
2003 : zone->nr_reserved_highatomic);
2004 : }
2005 :
2006 : /*
2007 : * Convert to ac->migratetype and avoid the normal
2008 : * pageblock stealing heuristics. Minimally, the caller
2009 : * is doing the work and needs the pages. More
2010 : * importantly, if the block was always converted to
2011 : * MIGRATE_UNMOVABLE or another type then the number
2012 : * of pageblocks that cannot be completely freed
2013 : * may increase.
2014 : */
2015 0 : set_pageblock_migratetype(page, ac->migratetype);
2016 0 : ret = move_freepages_block(zone, page, ac->migratetype,
2017 : NULL);
2018 0 : if (ret) {
2019 0 : spin_unlock_irqrestore(&zone->lock, flags);
2020 0 : return ret;
2021 : }
2022 : }
2023 0 : spin_unlock_irqrestore(&zone->lock, flags);
2024 : }
2025 :
2026 : return false;
2027 : }
2028 :
2029 : /*
2030 : * Try finding a free buddy page on the fallback list and put it on the free
2031 : * list of requested migratetype, possibly along with other pages from the same
2032 : * block, depending on fragmentation avoidance heuristics. Returns true if
2033 : * fallback was found so that __rmqueue_smallest() can grab it.
2034 : *
2035 : * The use of signed ints for order and current_order is a deliberate
2036 : * deviation from the rest of this file, to make the for loop
2037 : * condition simpler.
2038 : */
2039 : static __always_inline bool
2040 : __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2041 : unsigned int alloc_flags)
2042 : {
2043 : struct free_area *area;
2044 : int current_order;
2045 2 : int min_order = order;
2046 : struct page *page;
2047 : int fallback_mt;
2048 : bool can_steal;
2049 :
2050 : /*
2051 : * Do not steal pages from freelists belonging to other pageblocks
2052 : * i.e. orders < pageblock_order. If there are no local zones free,
2053 : * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2054 : */
2055 : if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
2056 : min_order = pageblock_order;
2057 :
2058 : /*
2059 : * Find the largest available free page in the other list. This roughly
2060 : * approximates finding the pageblock with the most free pages, which
2061 : * would be too costly to do exactly.
2062 : */
2063 4 : for (current_order = MAX_ORDER; current_order >= min_order;
2064 0 : --current_order) {
2065 2 : area = &(zone->free_area[current_order]);
2066 2 : fallback_mt = find_suitable_fallback(area, current_order,
2067 : start_migratetype, false, &can_steal);
2068 2 : if (fallback_mt == -1)
2069 0 : continue;
2070 :
2071 : /*
2072 : * We cannot steal all free pages from the pageblock and the
2073 : * requested migratetype is movable. In that case it's better to
2074 : * steal and split the smallest available page instead of the
2075 : * largest available page, because even if the next movable
2076 : * allocation falls back into a different pageblock than this
2077 : * one, it won't cause permanent fragmentation.
2078 : */
2079 2 : if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2080 0 : && current_order > order)
2081 : goto find_smallest;
2082 :
2083 : goto do_steal;
2084 : }
2085 :
2086 : return false;
2087 :
2088 : find_smallest:
2089 0 : for (current_order = order; current_order <= MAX_ORDER;
2090 0 : current_order++) {
2091 0 : area = &(zone->free_area[current_order]);
2092 0 : fallback_mt = find_suitable_fallback(area, current_order,
2093 : start_migratetype, false, &can_steal);
2094 0 : if (fallback_mt != -1)
2095 : break;
2096 : }
2097 :
2098 : /*
2099 : * This should not happen - we already found a suitable fallback
2100 : * when looking for the largest page.
2101 : */
2102 : VM_BUG_ON(current_order > MAX_ORDER);
2103 :
2104 : do_steal:
2105 2 : page = get_page_from_free_area(area, fallback_mt);
2106 :
2107 2 : steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2108 : can_steal);
2109 :
2110 2 : trace_mm_page_alloc_extfrag(page, order, current_order,
2111 : start_migratetype, fallback_mt);
2112 :
2113 : return true;
2114 :
2115 : }
2116 :
2117 : /*
2118 : * Do the hard work of removing an element from the buddy allocator.
2119 : * Call me with the zone->lock already held.
2120 : */
2121 : static __always_inline struct page *
2122 : __rmqueue(struct zone *zone, unsigned int order, int migratetype,
2123 : unsigned int alloc_flags)
2124 : {
2125 : struct page *page;
2126 :
2127 : if (IS_ENABLED(CONFIG_CMA)) {
2128 : /*
2129 : * Balance movable allocations between regular and CMA areas by
2130 : * allocating from CMA when over half of the zone's free memory
2131 : * is in the CMA area.
2132 : */
2133 : if (alloc_flags & ALLOC_CMA &&
2134 : zone_page_state(zone, NR_FREE_CMA_PAGES) >
2135 : zone_page_state(zone, NR_FREE_PAGES) / 2) {
2136 : page = __rmqueue_cma_fallback(zone, order);
2137 : if (page)
2138 : return page;
2139 : }
2140 : }
2141 : retry:
2142 679 : page = __rmqueue_smallest(zone, order, migratetype);
2143 679 : if (unlikely(!page)) {
2144 2 : if (alloc_flags & ALLOC_CMA)
2145 0 : page = __rmqueue_cma_fallback(zone, order);
2146 :
2147 4 : if (!page && __rmqueue_fallback(zone, order, migratetype,
2148 : alloc_flags))
2149 : goto retry;
2150 : }
2151 : return page;
2152 : }
2153 :
2154 : /*
2155 : * Obtain a specified number of elements from the buddy allocator, all under
2156 : * a single hold of the lock, for efficiency. Add them to the supplied list.
2157 : * Returns the number of new pages which were placed at *list.
2158 : */
2159 27 : static int rmqueue_bulk(struct zone *zone, unsigned int order,
2160 : unsigned long count, struct list_head *list,
2161 : int migratetype, unsigned int alloc_flags)
2162 : {
2163 : unsigned long flags;
2164 : int i;
2165 :
2166 27 : spin_lock_irqsave(&zone->lock, flags);
2167 704 : for (i = 0; i < count; ++i) {
2168 677 : struct page *page = __rmqueue(zone, order, migratetype,
2169 : alloc_flags);
2170 677 : if (unlikely(page == NULL))
2171 : break;
2172 :
2173 : /*
2174 : * Split buddy pages returned by expand() are received here in
2175 : * physical page order. The page is added to the tail of
2176 : * caller's list. From the callers perspective, the linked list
2177 : * is ordered by page number under some conditions. This is
2178 : * useful for IO devices that can forward direction from the
2179 : * head, thus also in the physical page order. This is useful
2180 : * for IO devices that can merge IO requests if the physical
2181 : * pages are ordered properly.
2182 : */
2183 1354 : list_add_tail(&page->pcp_list, list);
2184 : if (is_migrate_cma(get_pcppage_migratetype(page)))
2185 : __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2186 : -(1 << order));
2187 : }
2188 :
2189 54 : __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2190 54 : spin_unlock_irqrestore(&zone->lock, flags);
2191 :
2192 27 : return i;
2193 : }
2194 :
2195 : #ifdef CONFIG_NUMA
2196 : /*
2197 : * Called from the vmstat counter updater to drain pagesets of this
2198 : * currently executing processor on remote nodes after they have
2199 : * expired.
2200 : */
2201 : void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2202 : {
2203 : int to_drain, batch;
2204 :
2205 : batch = READ_ONCE(pcp->batch);
2206 : to_drain = min(pcp->count, batch);
2207 : if (to_drain > 0) {
2208 : spin_lock(&pcp->lock);
2209 : free_pcppages_bulk(zone, to_drain, pcp, 0);
2210 : spin_unlock(&pcp->lock);
2211 : }
2212 : }
2213 : #endif
2214 :
2215 : /*
2216 : * Drain pcplists of the indicated processor and zone.
2217 : */
2218 0 : static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2219 : {
2220 : struct per_cpu_pages *pcp;
2221 :
2222 0 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2223 0 : if (pcp->count) {
2224 0 : spin_lock(&pcp->lock);
2225 0 : free_pcppages_bulk(zone, pcp->count, pcp, 0);
2226 0 : spin_unlock(&pcp->lock);
2227 : }
2228 0 : }
2229 :
2230 : /*
2231 : * Drain pcplists of all zones on the indicated processor.
2232 : */
2233 0 : static void drain_pages(unsigned int cpu)
2234 : {
2235 : struct zone *zone;
2236 :
2237 0 : for_each_populated_zone(zone) {
2238 0 : drain_pages_zone(cpu, zone);
2239 : }
2240 0 : }
2241 :
2242 : /*
2243 : * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2244 : */
2245 0 : void drain_local_pages(struct zone *zone)
2246 : {
2247 0 : int cpu = smp_processor_id();
2248 :
2249 0 : if (zone)
2250 0 : drain_pages_zone(cpu, zone);
2251 : else
2252 0 : drain_pages(cpu);
2253 0 : }
2254 :
2255 : /*
2256 : * The implementation of drain_all_pages(), exposing an extra parameter to
2257 : * drain on all cpus.
2258 : *
2259 : * drain_all_pages() is optimized to only execute on cpus where pcplists are
2260 : * not empty. The check for non-emptiness can however race with a free to
2261 : * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
2262 : * that need the guarantee that every CPU has drained can disable the
2263 : * optimizing racy check.
2264 : */
2265 0 : static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
2266 : {
2267 : int cpu;
2268 :
2269 : /*
2270 : * Allocate in the BSS so we won't require allocation in
2271 : * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2272 : */
2273 : static cpumask_t cpus_with_pcps;
2274 :
2275 : /*
2276 : * Do not drain if one is already in progress unless it's specific to
2277 : * a zone. Such callers are primarily CMA and memory hotplug and need
2278 : * the drain to be complete when the call returns.
2279 : */
2280 0 : if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2281 0 : if (!zone)
2282 : return;
2283 0 : mutex_lock(&pcpu_drain_mutex);
2284 : }
2285 :
2286 : /*
2287 : * We don't care about racing with CPU hotplug event
2288 : * as offline notification will cause the notified
2289 : * cpu to drain that CPU pcps and on_each_cpu_mask
2290 : * disables preemption as part of its processing
2291 : */
2292 0 : for_each_online_cpu(cpu) {
2293 : struct per_cpu_pages *pcp;
2294 : struct zone *z;
2295 0 : bool has_pcps = false;
2296 :
2297 0 : if (force_all_cpus) {
2298 : /*
2299 : * The pcp.count check is racy, some callers need a
2300 : * guarantee that no cpu is missed.
2301 : */
2302 : has_pcps = true;
2303 0 : } else if (zone) {
2304 0 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
2305 0 : if (pcp->count)
2306 0 : has_pcps = true;
2307 : } else {
2308 0 : for_each_populated_zone(z) {
2309 0 : pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
2310 0 : if (pcp->count) {
2311 : has_pcps = true;
2312 : break;
2313 : }
2314 : }
2315 : }
2316 :
2317 0 : if (has_pcps)
2318 0 : cpumask_set_cpu(cpu, &cpus_with_pcps);
2319 : else
2320 : cpumask_clear_cpu(cpu, &cpus_with_pcps);
2321 : }
2322 :
2323 0 : for_each_cpu(cpu, &cpus_with_pcps) {
2324 0 : if (zone)
2325 0 : drain_pages_zone(cpu, zone);
2326 : else
2327 0 : drain_pages(cpu);
2328 : }
2329 :
2330 0 : mutex_unlock(&pcpu_drain_mutex);
2331 : }
2332 :
2333 : /*
2334 : * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2335 : *
2336 : * When zone parameter is non-NULL, spill just the single zone's pages.
2337 : */
2338 0 : void drain_all_pages(struct zone *zone)
2339 : {
2340 0 : __drain_all_pages(zone, false);
2341 0 : }
2342 :
2343 0 : static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
2344 : unsigned int order)
2345 : {
2346 : int migratetype;
2347 :
2348 0 : if (!free_pages_prepare(page, order, FPI_NONE))
2349 : return false;
2350 :
2351 0 : migratetype = get_pfnblock_migratetype(page, pfn);
2352 0 : set_pcppage_migratetype(page, migratetype);
2353 0 : return true;
2354 : }
2355 :
2356 : static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
2357 : bool free_high)
2358 : {
2359 : int min_nr_free, max_nr_free;
2360 :
2361 : /* Free everything if batch freeing high-order pages. */
2362 0 : if (unlikely(free_high))
2363 : return pcp->count;
2364 :
2365 : /* Check for PCP disabled or boot pageset */
2366 0 : if (unlikely(high < batch))
2367 : return 1;
2368 :
2369 : /* Leave at least pcp->batch pages on the list */
2370 0 : min_nr_free = batch;
2371 0 : max_nr_free = high - batch;
2372 :
2373 : /*
2374 : * Double the number of pages freed each time there is subsequent
2375 : * freeing of pages without any allocation.
2376 : */
2377 0 : batch <<= pcp->free_factor;
2378 0 : if (batch < max_nr_free)
2379 0 : pcp->free_factor++;
2380 0 : batch = clamp(batch, min_nr_free, max_nr_free);
2381 :
2382 : return batch;
2383 : }
2384 :
2385 0 : static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
2386 : bool free_high)
2387 : {
2388 0 : int high = READ_ONCE(pcp->high);
2389 :
2390 0 : if (unlikely(!high || free_high))
2391 : return 0;
2392 :
2393 0 : if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
2394 : return high;
2395 :
2396 : /*
2397 : * If reclaim is active, limit the number of pages that can be
2398 : * stored on pcp lists
2399 : */
2400 0 : return min(READ_ONCE(pcp->batch) << 2, high);
2401 : }
2402 :
2403 0 : static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
2404 : struct page *page, int migratetype,
2405 : unsigned int order)
2406 : {
2407 : int high;
2408 : int pindex;
2409 : bool free_high;
2410 :
2411 0 : __count_vm_events(PGFREE, 1 << order);
2412 0 : pindex = order_to_pindex(migratetype, order);
2413 0 : list_add(&page->pcp_list, &pcp->lists[pindex]);
2414 0 : pcp->count += 1 << order;
2415 :
2416 : /*
2417 : * As high-order pages other than THP's stored on PCP can contribute
2418 : * to fragmentation, limit the number stored when PCP is heavily
2419 : * freeing without allocation. The remainder after bulk freeing
2420 : * stops will be drained from vmstat refresh context.
2421 : */
2422 0 : free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
2423 :
2424 0 : high = nr_pcp_high(pcp, zone, free_high);
2425 0 : if (pcp->count >= high) {
2426 0 : int batch = READ_ONCE(pcp->batch);
2427 :
2428 0 : free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
2429 : }
2430 0 : }
2431 :
2432 : /*
2433 : * Free a pcp page
2434 : */
2435 0 : void free_unref_page(struct page *page, unsigned int order)
2436 : {
2437 : unsigned long __maybe_unused UP_flags;
2438 : struct per_cpu_pages *pcp;
2439 : struct zone *zone;
2440 0 : unsigned long pfn = page_to_pfn(page);
2441 : int migratetype;
2442 :
2443 0 : if (!free_unref_page_prepare(page, pfn, order))
2444 : return;
2445 :
2446 : /*
2447 : * We only track unmovable, reclaimable and movable on pcp lists.
2448 : * Place ISOLATE pages on the isolated list because they are being
2449 : * offlined but treat HIGHATOMIC as movable pages so we can get those
2450 : * areas back if necessary. Otherwise, we may have to free
2451 : * excessively into the page allocator
2452 : */
2453 0 : migratetype = get_pcppage_migratetype(page);
2454 0 : if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
2455 : if (unlikely(is_migrate_isolate(migratetype))) {
2456 : free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
2457 : return;
2458 : }
2459 0 : migratetype = MIGRATE_MOVABLE;
2460 : }
2461 :
2462 0 : zone = page_zone(page);
2463 0 : pcp_trylock_prepare(UP_flags);
2464 0 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2465 0 : if (pcp) {
2466 0 : free_unref_page_commit(zone, pcp, page, migratetype, order);
2467 0 : pcp_spin_unlock(pcp);
2468 : } else {
2469 0 : free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
2470 : }
2471 0 : pcp_trylock_finish(UP_flags);
2472 : }
2473 :
2474 : /*
2475 : * Free a list of 0-order pages
2476 : */
2477 0 : void free_unref_page_list(struct list_head *list)
2478 : {
2479 : unsigned long __maybe_unused UP_flags;
2480 : struct page *page, *next;
2481 0 : struct per_cpu_pages *pcp = NULL;
2482 0 : struct zone *locked_zone = NULL;
2483 0 : int batch_count = 0;
2484 : int migratetype;
2485 :
2486 : /* Prepare pages for freeing */
2487 0 : list_for_each_entry_safe(page, next, list, lru) {
2488 0 : unsigned long pfn = page_to_pfn(page);
2489 0 : if (!free_unref_page_prepare(page, pfn, 0)) {
2490 0 : list_del(&page->lru);
2491 0 : continue;
2492 : }
2493 :
2494 : /*
2495 : * Free isolated pages directly to the allocator, see
2496 : * comment in free_unref_page.
2497 : */
2498 : migratetype = get_pcppage_migratetype(page);
2499 : if (unlikely(is_migrate_isolate(migratetype))) {
2500 : list_del(&page->lru);
2501 : free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
2502 : continue;
2503 : }
2504 : }
2505 :
2506 0 : list_for_each_entry_safe(page, next, list, lru) {
2507 0 : struct zone *zone = page_zone(page);
2508 :
2509 0 : list_del(&page->lru);
2510 0 : migratetype = get_pcppage_migratetype(page);
2511 :
2512 : /*
2513 : * Either different zone requiring a different pcp lock or
2514 : * excessive lock hold times when freeing a large list of
2515 : * pages.
2516 : */
2517 0 : if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
2518 0 : if (pcp) {
2519 0 : pcp_spin_unlock(pcp);
2520 0 : pcp_trylock_finish(UP_flags);
2521 : }
2522 :
2523 0 : batch_count = 0;
2524 :
2525 : /*
2526 : * trylock is necessary as pages may be getting freed
2527 : * from IRQ or SoftIRQ context after an IO completion.
2528 : */
2529 0 : pcp_trylock_prepare(UP_flags);
2530 0 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2531 0 : if (unlikely(!pcp)) {
2532 0 : pcp_trylock_finish(UP_flags);
2533 0 : free_one_page(zone, page, page_to_pfn(page),
2534 : 0, migratetype, FPI_NONE);
2535 0 : locked_zone = NULL;
2536 0 : continue;
2537 : }
2538 : locked_zone = zone;
2539 : }
2540 :
2541 : /*
2542 : * Non-isolated types over MIGRATE_PCPTYPES get added
2543 : * to the MIGRATE_MOVABLE pcp list.
2544 : */
2545 0 : if (unlikely(migratetype >= MIGRATE_PCPTYPES))
2546 0 : migratetype = MIGRATE_MOVABLE;
2547 :
2548 0 : trace_mm_page_free_batched(page);
2549 0 : free_unref_page_commit(zone, pcp, page, migratetype, 0);
2550 0 : batch_count++;
2551 : }
2552 :
2553 0 : if (pcp) {
2554 0 : pcp_spin_unlock(pcp);
2555 0 : pcp_trylock_finish(UP_flags);
2556 : }
2557 0 : }
2558 :
2559 : /*
2560 : * split_page takes a non-compound higher-order page, and splits it into
2561 : * n (1<<order) sub-pages: page[0..n]
2562 : * Each sub-page must be freed individually.
2563 : *
2564 : * Note: this is probably too low level an operation for use in drivers.
2565 : * Please consult with lkml before using this in your driver.
2566 : */
2567 0 : void split_page(struct page *page, unsigned int order)
2568 : {
2569 : int i;
2570 :
2571 : VM_BUG_ON_PAGE(PageCompound(page), page);
2572 : VM_BUG_ON_PAGE(!page_count(page), page);
2573 :
2574 0 : for (i = 1; i < (1 << order); i++)
2575 0 : set_page_refcounted(page + i);
2576 0 : split_page_owner(page, 1 << order);
2577 0 : split_page_memcg(page, 1 << order);
2578 0 : }
2579 : EXPORT_SYMBOL_GPL(split_page);
2580 :
2581 0 : int __isolate_free_page(struct page *page, unsigned int order)
2582 : {
2583 0 : struct zone *zone = page_zone(page);
2584 0 : int mt = get_pageblock_migratetype(page);
2585 :
2586 0 : if (!is_migrate_isolate(mt)) {
2587 : unsigned long watermark;
2588 : /*
2589 : * Obey watermarks as if the page was being allocated. We can
2590 : * emulate a high-order watermark check with a raised order-0
2591 : * watermark, because we already know our high-order page
2592 : * exists.
2593 : */
2594 0 : watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
2595 0 : if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2596 : return 0;
2597 :
2598 0 : __mod_zone_freepage_state(zone, -(1UL << order), mt);
2599 : }
2600 :
2601 0 : del_page_from_free_list(page, zone, order);
2602 :
2603 : /*
2604 : * Set the pageblock if the isolated page is at least half of a
2605 : * pageblock
2606 : */
2607 0 : if (order >= pageblock_order - 1) {
2608 0 : struct page *endpage = page + (1 << order) - 1;
2609 0 : for (; page < endpage; page += pageblock_nr_pages) {
2610 0 : int mt = get_pageblock_migratetype(page);
2611 : /*
2612 : * Only change normal pageblocks (i.e., they can merge
2613 : * with others)
2614 : */
2615 0 : if (migratetype_is_mergeable(mt))
2616 0 : set_pageblock_migratetype(page,
2617 : MIGRATE_MOVABLE);
2618 : }
2619 : }
2620 :
2621 0 : return 1UL << order;
2622 : }
2623 :
2624 : /**
2625 : * __putback_isolated_page - Return a now-isolated page back where we got it
2626 : * @page: Page that was isolated
2627 : * @order: Order of the isolated page
2628 : * @mt: The page's pageblock's migratetype
2629 : *
2630 : * This function is meant to return a page pulled from the free lists via
2631 : * __isolate_free_page back to the free lists they were pulled from.
2632 : */
2633 0 : void __putback_isolated_page(struct page *page, unsigned int order, int mt)
2634 : {
2635 0 : struct zone *zone = page_zone(page);
2636 :
2637 : /* zone lock should be held when this function is called */
2638 : lockdep_assert_held(&zone->lock);
2639 :
2640 : /* Return isolated page to tail of freelist. */
2641 0 : __free_one_page(page, page_to_pfn(page), zone, order, mt,
2642 : FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
2643 0 : }
2644 :
2645 : /*
2646 : * Update NUMA hit/miss statistics
2647 : */
2648 : static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
2649 : long nr_account)
2650 : {
2651 : #ifdef CONFIG_NUMA
2652 : enum numa_stat_item local_stat = NUMA_LOCAL;
2653 :
2654 : /* skip numa counters update if numa stats is disabled */
2655 : if (!static_branch_likely(&vm_numa_stat_key))
2656 : return;
2657 :
2658 : if (zone_to_nid(z) != numa_node_id())
2659 : local_stat = NUMA_OTHER;
2660 :
2661 : if (zone_to_nid(z) == zone_to_nid(preferred_zone))
2662 : __count_numa_events(z, NUMA_HIT, nr_account);
2663 : else {
2664 : __count_numa_events(z, NUMA_MISS, nr_account);
2665 : __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
2666 : }
2667 : __count_numa_events(z, local_stat, nr_account);
2668 : #endif
2669 : }
2670 :
2671 : static __always_inline
2672 : struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
2673 : unsigned int order, unsigned int alloc_flags,
2674 : int migratetype)
2675 : {
2676 : struct page *page;
2677 : unsigned long flags;
2678 :
2679 : do {
2680 0 : page = NULL;
2681 0 : spin_lock_irqsave(&zone->lock, flags);
2682 : /*
2683 : * order-0 request can reach here when the pcplist is skipped
2684 : * due to non-CMA allocation context. HIGHATOMIC area is
2685 : * reserved for high-order atomic allocation, so order-0
2686 : * request should skip it.
2687 : */
2688 0 : if (alloc_flags & ALLOC_HIGHATOMIC)
2689 : page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2690 0 : if (!page) {
2691 0 : page = __rmqueue(zone, order, migratetype, alloc_flags);
2692 :
2693 : /*
2694 : * If the allocation fails, allow OOM handling access
2695 : * to HIGHATOMIC reserves as failing now is worse than
2696 : * failing a high-order atomic allocation in the
2697 : * future.
2698 : */
2699 0 : if (!page && (alloc_flags & ALLOC_OOM))
2700 : page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2701 :
2702 0 : if (!page) {
2703 0 : spin_unlock_irqrestore(&zone->lock, flags);
2704 : return NULL;
2705 : }
2706 : }
2707 0 : __mod_zone_freepage_state(zone, -(1 << order),
2708 : get_pcppage_migratetype(page));
2709 0 : spin_unlock_irqrestore(&zone->lock, flags);
2710 0 : } while (check_new_pages(page, order));
2711 :
2712 0 : __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2713 : zone_statistics(preferred_zone, zone, 1);
2714 :
2715 : return page;
2716 : }
2717 :
2718 : /* Remove page from the per-cpu list, caller must protect the list */
2719 : static inline
2720 505 : struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
2721 : int migratetype,
2722 : unsigned int alloc_flags,
2723 : struct per_cpu_pages *pcp,
2724 : struct list_head *list)
2725 : {
2726 : struct page *page;
2727 :
2728 : do {
2729 505 : if (list_empty(list)) {
2730 27 : int batch = READ_ONCE(pcp->batch);
2731 : int alloced;
2732 :
2733 : /*
2734 : * Scale batch relative to order if batch implies
2735 : * free pages can be stored on the PCP. Batch can
2736 : * be 1 for small zones or for boot pagesets which
2737 : * should never store free pages as the pages may
2738 : * belong to arbitrary zones.
2739 : */
2740 27 : if (batch > 1)
2741 15 : batch = max(batch >> order, 2);
2742 27 : alloced = rmqueue_bulk(zone, order,
2743 : batch, list,
2744 : migratetype, alloc_flags);
2745 :
2746 27 : pcp->count += alloced << order;
2747 27 : if (unlikely(list_empty(list)))
2748 : return NULL;
2749 : }
2750 :
2751 505 : page = list_first_entry(list, struct page, pcp_list);
2752 1010 : list_del(&page->pcp_list);
2753 505 : pcp->count -= 1 << order;
2754 505 : } while (check_new_pages(page, order));
2755 :
2756 : return page;
2757 : }
2758 :
2759 : /* Lock and remove page from the per-cpu list */
2760 441 : static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2761 : struct zone *zone, unsigned int order,
2762 : int migratetype, unsigned int alloc_flags)
2763 : {
2764 : struct per_cpu_pages *pcp;
2765 : struct list_head *list;
2766 : struct page *page;
2767 : unsigned long __maybe_unused UP_flags;
2768 :
2769 : /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
2770 441 : pcp_trylock_prepare(UP_flags);
2771 882 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
2772 441 : if (!pcp) {
2773 0 : pcp_trylock_finish(UP_flags);
2774 : return NULL;
2775 : }
2776 :
2777 : /*
2778 : * On allocation, reduce the number of pages that are batch freed.
2779 : * See nr_pcp_free() where free_factor is increased for subsequent
2780 : * frees.
2781 : */
2782 441 : pcp->free_factor >>= 1;
2783 882 : list = &pcp->lists[order_to_pindex(migratetype, order)];
2784 441 : page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
2785 882 : pcp_spin_unlock(pcp);
2786 882 : pcp_trylock_finish(UP_flags);
2787 441 : if (page) {
2788 882 : __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2789 : zone_statistics(preferred_zone, zone, 1);
2790 : }
2791 : return page;
2792 : }
2793 :
2794 : /*
2795 : * Allocate a page from the given zone.
2796 : * Use pcplists for THP or "cheap" high-order allocations.
2797 : */
2798 :
2799 : /*
2800 : * Do not instrument rmqueue() with KMSAN. This function may call
2801 : * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
2802 : * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
2803 : * may call rmqueue() again, which will result in a deadlock.
2804 : */
2805 : __no_sanitize_memory
2806 : static inline
2807 441 : struct page *rmqueue(struct zone *preferred_zone,
2808 : struct zone *zone, unsigned int order,
2809 : gfp_t gfp_flags, unsigned int alloc_flags,
2810 : int migratetype)
2811 : {
2812 : struct page *page;
2813 :
2814 : /*
2815 : * We most definitely don't want callers attempting to
2816 : * allocate greater than order-1 page units with __GFP_NOFAIL.
2817 : */
2818 441 : WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2819 :
2820 441 : if (likely(pcp_allowed_order(order))) {
2821 : /*
2822 : * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
2823 : * we need to skip it when CMA area isn't allowed.
2824 : */
2825 : if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
2826 : migratetype != MIGRATE_MOVABLE) {
2827 441 : page = rmqueue_pcplist(preferred_zone, zone, order,
2828 : migratetype, alloc_flags);
2829 441 : if (likely(page))
2830 : goto out;
2831 : }
2832 : }
2833 :
2834 : page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
2835 : migratetype);
2836 :
2837 : out:
2838 : /* Separate test+clear to avoid unnecessary atomics */
2839 774 : if ((alloc_flags & ALLOC_KSWAPD) &&
2840 666 : unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
2841 0 : clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
2842 0 : wakeup_kswapd(zone, 0, 0, zone_idx(zone));
2843 : }
2844 :
2845 : VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
2846 441 : return page;
2847 : }
2848 :
2849 457 : noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2850 : {
2851 457 : return __should_fail_alloc_page(gfp_mask, order);
2852 : }
2853 : ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
2854 :
2855 : static inline long __zone_watermark_unusable_free(struct zone *z,
2856 : unsigned int order, unsigned int alloc_flags)
2857 : {
2858 458 : long unusable_free = (1 << order) - 1;
2859 :
2860 : /*
2861 : * If the caller does not have rights to reserves below the min
2862 : * watermark then subtract the high-atomic reserves. This will
2863 : * over-estimate the size of the atomic reserve but it avoids a search.
2864 : */
2865 458 : if (likely(!(alloc_flags & ALLOC_RESERVES)))
2866 458 : unusable_free += z->nr_reserved_highatomic;
2867 :
2868 : #ifdef CONFIG_CMA
2869 : /* If allocation can't use CMA areas don't use free CMA pages */
2870 : if (!(alloc_flags & ALLOC_CMA))
2871 : unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
2872 : #endif
2873 : #ifdef CONFIG_UNACCEPTED_MEMORY
2874 : unusable_free += zone_page_state(z, NR_UNACCEPTED);
2875 : #endif
2876 :
2877 : return unusable_free;
2878 : }
2879 :
2880 : /*
2881 : * Return true if free base pages are above 'mark'. For high-order checks it
2882 : * will return true of the order-0 watermark is reached and there is at least
2883 : * one free page of a suitable size. Checking now avoids taking the zone lock
2884 : * to check in the allocation paths if no pages are free.
2885 : */
2886 100 : bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2887 : int highest_zoneidx, unsigned int alloc_flags,
2888 : long free_pages)
2889 : {
2890 100 : long min = mark;
2891 : int o;
2892 :
2893 : /* free_pages may go negative - that's OK */
2894 200 : free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
2895 :
2896 100 : if (unlikely(alloc_flags & ALLOC_RESERVES)) {
2897 : /*
2898 : * __GFP_HIGH allows access to 50% of the min reserve as well
2899 : * as OOM.
2900 : */
2901 0 : if (alloc_flags & ALLOC_MIN_RESERVE) {
2902 0 : min -= min / 2;
2903 :
2904 : /*
2905 : * Non-blocking allocations (e.g. GFP_ATOMIC) can
2906 : * access more reserves than just __GFP_HIGH. Other
2907 : * non-blocking allocations requests such as GFP_NOWAIT
2908 : * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
2909 : * access to the min reserve.
2910 : */
2911 0 : if (alloc_flags & ALLOC_NON_BLOCK)
2912 0 : min -= min / 4;
2913 : }
2914 :
2915 : /*
2916 : * OOM victims can try even harder than the normal reserve
2917 : * users on the grounds that it's definitely going to be in
2918 : * the exit path shortly and free memory. Any allocation it
2919 : * makes during the free path will be small and short-lived.
2920 : */
2921 0 : if (alloc_flags & ALLOC_OOM)
2922 0 : min -= min / 2;
2923 : }
2924 :
2925 : /*
2926 : * Check watermarks for an order-0 allocation request. If these
2927 : * are not met, then a high-order request also cannot go ahead
2928 : * even if a suitable page happened to be free.
2929 : */
2930 100 : if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
2931 : return false;
2932 :
2933 : /* If this is an order-0 request then the watermark is fine */
2934 100 : if (!order)
2935 : return true;
2936 :
2937 : /* For a high-order request, check at least one suitable page is free */
2938 103 : for (o = order; o <= MAX_ORDER; o++) {
2939 103 : struct free_area *area = &z->free_area[o];
2940 : int mt;
2941 :
2942 103 : if (!area->nr_free)
2943 4 : continue;
2944 :
2945 75 : for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
2946 174 : if (!free_area_empty(area, mt))
2947 : return true;
2948 : }
2949 :
2950 : #ifdef CONFIG_CMA
2951 : if ((alloc_flags & ALLOC_CMA) &&
2952 : !free_area_empty(area, MIGRATE_CMA)) {
2953 : return true;
2954 : }
2955 : #endif
2956 0 : if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
2957 0 : !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
2958 : return true;
2959 : }
2960 : }
2961 : return false;
2962 : }
2963 :
2964 0 : bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2965 : int highest_zoneidx, unsigned int alloc_flags)
2966 : {
2967 0 : return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
2968 0 : zone_page_state(z, NR_FREE_PAGES));
2969 : }
2970 :
2971 457 : static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
2972 : unsigned long mark, int highest_zoneidx,
2973 : unsigned int alloc_flags, gfp_t gfp_mask)
2974 : {
2975 : long free_pages;
2976 :
2977 457 : free_pages = zone_page_state(z, NR_FREE_PAGES);
2978 :
2979 : /*
2980 : * Fast check for order-0 only. If this fails then the reserves
2981 : * need to be calculated.
2982 : */
2983 457 : if (!order) {
2984 : long usable_free;
2985 : long reserved;
2986 :
2987 358 : usable_free = free_pages;
2988 716 : reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
2989 :
2990 : /* reserved may over estimate high-atomic reserves. */
2991 358 : usable_free -= min(usable_free, reserved);
2992 358 : if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
2993 : return true;
2994 : }
2995 :
2996 99 : if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
2997 : free_pages))
2998 : return true;
2999 :
3000 : /*
3001 : * Ignore watermark boosting for __GFP_HIGH order-0 allocations
3002 : * when checking the min watermark. The min watermark is the
3003 : * point where boosting is ignored so that kswapd is woken up
3004 : * when below the low watermark.
3005 : */
3006 0 : if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost
3007 : && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
3008 0 : mark = z->_watermark[WMARK_MIN];
3009 0 : return __zone_watermark_ok(z, order, mark, highest_zoneidx,
3010 : alloc_flags, free_pages);
3011 : }
3012 :
3013 : return false;
3014 : }
3015 :
3016 1 : bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3017 : unsigned long mark, int highest_zoneidx)
3018 : {
3019 1 : long free_pages = zone_page_state(z, NR_FREE_PAGES);
3020 :
3021 1 : if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3022 0 : free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3023 :
3024 1 : return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
3025 : free_pages);
3026 : }
3027 :
3028 : #ifdef CONFIG_NUMA
3029 : int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
3030 :
3031 : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3032 : {
3033 : return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3034 : node_reclaim_distance;
3035 : }
3036 : #else /* CONFIG_NUMA */
3037 : static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3038 : {
3039 : return true;
3040 : }
3041 : #endif /* CONFIG_NUMA */
3042 :
3043 : /*
3044 : * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3045 : * fragmentation is subtle. If the preferred zone was HIGHMEM then
3046 : * premature use of a lower zone may cause lowmem pressure problems that
3047 : * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3048 : * probably too small. It only makes sense to spread allocations to avoid
3049 : * fragmentation between the Normal and DMA32 zones.
3050 : */
3051 : static inline unsigned int
3052 : alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3053 : {
3054 : unsigned int alloc_flags;
3055 :
3056 : /*
3057 : * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3058 : * to save a branch.
3059 : */
3060 441 : alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
3061 :
3062 : #ifdef CONFIG_ZONE_DMA32
3063 : if (!zone)
3064 : return alloc_flags;
3065 :
3066 : if (zone_idx(zone) != ZONE_NORMAL)
3067 : return alloc_flags;
3068 :
3069 : /*
3070 : * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3071 : * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3072 : * on UMA that if Normal is populated then so is DMA32.
3073 : */
3074 : BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3075 : if (nr_online_nodes > 1 && !populated_zone(--zone))
3076 : return alloc_flags;
3077 :
3078 : alloc_flags |= ALLOC_NOFRAGMENT;
3079 : #endif /* CONFIG_ZONE_DMA32 */
3080 : return alloc_flags;
3081 : }
3082 :
3083 : /* Must be called after current_gfp_context() which can change gfp_mask */
3084 : static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
3085 : unsigned int alloc_flags)
3086 : {
3087 : #ifdef CONFIG_CMA
3088 : if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3089 : alloc_flags |= ALLOC_CMA;
3090 : #endif
3091 : return alloc_flags;
3092 : }
3093 :
3094 : /*
3095 : * get_page_from_freelist goes through the zonelist trying to allocate
3096 : * a page.
3097 : */
3098 : static struct page *
3099 441 : get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3100 : const struct alloc_context *ac)
3101 : {
3102 : struct zoneref *z;
3103 : struct zone *zone;
3104 441 : struct pglist_data *last_pgdat = NULL;
3105 441 : bool last_pgdat_dirty_ok = false;
3106 : bool no_fallback;
3107 :
3108 : retry:
3109 : /*
3110 : * Scan zonelist, looking for a zone with enough free.
3111 : * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
3112 : */
3113 441 : no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3114 441 : z = ac->preferred_zoneref;
3115 441 : for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
3116 : ac->nodemask) {
3117 : struct page *page;
3118 : unsigned long mark;
3119 :
3120 : if (cpusets_enabled() &&
3121 : (alloc_flags & ALLOC_CPUSET) &&
3122 : !__cpuset_zone_allowed(zone, gfp_mask))
3123 : continue;
3124 : /*
3125 : * When allocating a page cache page for writing, we
3126 : * want to get it from a node that is within its dirty
3127 : * limit, such that no single node holds more than its
3128 : * proportional share of globally allowed dirty pages.
3129 : * The dirty limits take into account the node's
3130 : * lowmem reserves and high watermark so that kswapd
3131 : * should be able to balance it without having to
3132 : * write pages from its LRU list.
3133 : *
3134 : * XXX: For now, allow allocations to potentially
3135 : * exceed the per-node dirty limit in the slowpath
3136 : * (spread_dirty_pages unset) before going into reclaim,
3137 : * which is important when on a NUMA setup the allowed
3138 : * nodes are together not big enough to reach the
3139 : * global limit. The proper fix for these situations
3140 : * will require awareness of nodes in the
3141 : * dirty-throttling and the flusher threads.
3142 : */
3143 441 : if (ac->spread_dirty_pages) {
3144 0 : if (last_pgdat != zone->zone_pgdat) {
3145 0 : last_pgdat = zone->zone_pgdat;
3146 0 : last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
3147 : }
3148 :
3149 0 : if (!last_pgdat_dirty_ok)
3150 0 : continue;
3151 : }
3152 :
3153 : if (no_fallback && nr_online_nodes > 1 &&
3154 : zone != ac->preferred_zoneref->zone) {
3155 : int local_nid;
3156 :
3157 : /*
3158 : * If moving to a remote node, retry but allow
3159 : * fragmenting fallbacks. Locality is more important
3160 : * than fragmentation avoidance.
3161 : */
3162 : local_nid = zone_to_nid(ac->preferred_zoneref->zone);
3163 : if (zone_to_nid(zone) != local_nid) {
3164 : alloc_flags &= ~ALLOC_NOFRAGMENT;
3165 : goto retry;
3166 : }
3167 : }
3168 :
3169 441 : mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
3170 882 : if (!zone_watermark_fast(zone, order, mark,
3171 441 : ac->highest_zoneidx, alloc_flags,
3172 : gfp_mask)) {
3173 : int ret;
3174 :
3175 : if (has_unaccepted_memory()) {
3176 : if (try_to_accept_memory(zone, order))
3177 : goto try_this_zone;
3178 : }
3179 :
3180 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3181 : /*
3182 : * Watermark failed for this zone, but see if we can
3183 : * grow this zone if it contains deferred pages.
3184 : */
3185 : if (deferred_pages_enabled()) {
3186 : if (_deferred_grow_zone(zone, order))
3187 : goto try_this_zone;
3188 : }
3189 : #endif
3190 : /* Checked here to keep the fast path fast */
3191 : BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3192 0 : if (alloc_flags & ALLOC_NO_WATERMARKS)
3193 : goto try_this_zone;
3194 :
3195 : if (!node_reclaim_enabled() ||
3196 : !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
3197 0 : continue;
3198 :
3199 : ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3200 : switch (ret) {
3201 : case NODE_RECLAIM_NOSCAN:
3202 : /* did not scan */
3203 : continue;
3204 : case NODE_RECLAIM_FULL:
3205 : /* scanned but unreclaimable */
3206 : continue;
3207 : default:
3208 : /* did we reclaim enough */
3209 : if (zone_watermark_ok(zone, order, mark,
3210 : ac->highest_zoneidx, alloc_flags))
3211 : goto try_this_zone;
3212 :
3213 : continue;
3214 : }
3215 : }
3216 :
3217 : try_this_zone:
3218 441 : page = rmqueue(ac->preferred_zoneref->zone, zone, order,
3219 : gfp_mask, alloc_flags, ac->migratetype);
3220 441 : if (page) {
3221 441 : prep_new_page(page, order, gfp_mask, alloc_flags);
3222 :
3223 : /*
3224 : * If this is a high-order atomic allocation then check
3225 : * if the pageblock should be reserved for the future
3226 : */
3227 441 : if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
3228 0 : reserve_highatomic_pageblock(page, zone, order);
3229 :
3230 : return page;
3231 : } else {
3232 : if (has_unaccepted_memory()) {
3233 : if (try_to_accept_memory(zone, order))
3234 : goto try_this_zone;
3235 : }
3236 :
3237 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3238 : /* Try again if zone has deferred pages */
3239 : if (deferred_pages_enabled()) {
3240 : if (_deferred_grow_zone(zone, order))
3241 : goto try_this_zone;
3242 : }
3243 : #endif
3244 : }
3245 : }
3246 :
3247 : /*
3248 : * It's possible on a UMA machine to get through all zones that are
3249 : * fragmented. If avoiding fragmentation, reset and try again.
3250 : */
3251 : if (no_fallback) {
3252 : alloc_flags &= ~ALLOC_NOFRAGMENT;
3253 : goto retry;
3254 : }
3255 :
3256 : return NULL;
3257 : }
3258 :
3259 0 : static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3260 : {
3261 0 : unsigned int filter = SHOW_MEM_FILTER_NODES;
3262 :
3263 : /*
3264 : * This documents exceptions given to allocations in certain
3265 : * contexts that are allowed to allocate outside current's set
3266 : * of allowed nodes.
3267 : */
3268 0 : if (!(gfp_mask & __GFP_NOMEMALLOC))
3269 0 : if (tsk_is_oom_victim(current) ||
3270 0 : (current->flags & (PF_MEMALLOC | PF_EXITING)))
3271 : filter &= ~SHOW_MEM_FILTER_NODES;
3272 0 : if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3273 0 : filter &= ~SHOW_MEM_FILTER_NODES;
3274 :
3275 0 : __show_mem(filter, nodemask, gfp_zone(gfp_mask));
3276 0 : }
3277 :
3278 0 : void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3279 : {
3280 : struct va_format vaf;
3281 : va_list args;
3282 : static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
3283 :
3284 0 : if ((gfp_mask & __GFP_NOWARN) ||
3285 0 : !__ratelimit(&nopage_rs) ||
3286 0 : ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
3287 0 : return;
3288 :
3289 0 : va_start(args, fmt);
3290 0 : vaf.fmt = fmt;
3291 0 : vaf.va = &args;
3292 0 : pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
3293 : current->comm, &vaf, gfp_mask, &gfp_mask,
3294 : nodemask_pr_args(nodemask));
3295 0 : va_end(args);
3296 :
3297 : cpuset_print_current_mems_allowed();
3298 0 : pr_cont("\n");
3299 0 : dump_stack();
3300 0 : warn_alloc_show_mem(gfp_mask, nodemask);
3301 : }
3302 :
3303 : static inline struct page *
3304 0 : __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3305 : unsigned int alloc_flags,
3306 : const struct alloc_context *ac)
3307 : {
3308 : struct page *page;
3309 :
3310 0 : page = get_page_from_freelist(gfp_mask, order,
3311 0 : alloc_flags|ALLOC_CPUSET, ac);
3312 : /*
3313 : * fallback to ignore cpuset restriction if our nodes
3314 : * are depleted
3315 : */
3316 0 : if (!page)
3317 0 : page = get_page_from_freelist(gfp_mask, order,
3318 : alloc_flags, ac);
3319 :
3320 0 : return page;
3321 : }
3322 :
3323 : static inline struct page *
3324 0 : __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3325 : const struct alloc_context *ac, unsigned long *did_some_progress)
3326 : {
3327 0 : struct oom_control oc = {
3328 0 : .zonelist = ac->zonelist,
3329 0 : .nodemask = ac->nodemask,
3330 : .memcg = NULL,
3331 : .gfp_mask = gfp_mask,
3332 : .order = order,
3333 : };
3334 : struct page *page;
3335 :
3336 0 : *did_some_progress = 0;
3337 :
3338 : /*
3339 : * Acquire the oom lock. If that fails, somebody else is
3340 : * making progress for us.
3341 : */
3342 0 : if (!mutex_trylock(&oom_lock)) {
3343 0 : *did_some_progress = 1;
3344 0 : schedule_timeout_uninterruptible(1);
3345 0 : return NULL;
3346 : }
3347 :
3348 : /*
3349 : * Go through the zonelist yet one more time, keep very high watermark
3350 : * here, this is only to catch a parallel oom killing, we must fail if
3351 : * we're still under heavy pressure. But make sure that this reclaim
3352 : * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3353 : * allocation which will never fail due to oom_lock already held.
3354 : */
3355 0 : page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
3356 : ~__GFP_DIRECT_RECLAIM, order,
3357 : ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
3358 0 : if (page)
3359 : goto out;
3360 :
3361 : /* Coredumps can quickly deplete all memory reserves */
3362 0 : if (current->flags & PF_DUMPCORE)
3363 : goto out;
3364 : /* The OOM killer will not help higher order allocs */
3365 0 : if (order > PAGE_ALLOC_COSTLY_ORDER)
3366 : goto out;
3367 : /*
3368 : * We have already exhausted all our reclaim opportunities without any
3369 : * success so it is time to admit defeat. We will skip the OOM killer
3370 : * because it is very likely that the caller has a more reasonable
3371 : * fallback than shooting a random task.
3372 : *
3373 : * The OOM killer may not free memory on a specific node.
3374 : */
3375 0 : if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
3376 : goto out;
3377 : /* The OOM killer does not needlessly kill tasks for lowmem */
3378 : if (ac->highest_zoneidx < ZONE_NORMAL)
3379 : goto out;
3380 0 : if (pm_suspended_storage())
3381 : goto out;
3382 : /*
3383 : * XXX: GFP_NOFS allocations should rather fail than rely on
3384 : * other request to make a forward progress.
3385 : * We are in an unfortunate situation where out_of_memory cannot
3386 : * do much for this context but let's try it to at least get
3387 : * access to memory reserved if the current task is killed (see
3388 : * out_of_memory). Once filesystems are ready to handle allocation
3389 : * failures more gracefully we should just bail out here.
3390 : */
3391 :
3392 : /* Exhausted what can be done so it's blame time */
3393 0 : if (out_of_memory(&oc) ||
3394 0 : WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
3395 0 : *did_some_progress = 1;
3396 :
3397 : /*
3398 : * Help non-failing allocations by giving them access to memory
3399 : * reserves
3400 : */
3401 0 : if (gfp_mask & __GFP_NOFAIL)
3402 0 : page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3403 : ALLOC_NO_WATERMARKS, ac);
3404 : }
3405 : out:
3406 0 : mutex_unlock(&oom_lock);
3407 0 : return page;
3408 : }
3409 :
3410 : /*
3411 : * Maximum number of compaction retries with a progress before OOM
3412 : * killer is consider as the only way to move forward.
3413 : */
3414 : #define MAX_COMPACT_RETRIES 16
3415 :
3416 : #ifdef CONFIG_COMPACTION
3417 : /* Try memory compaction for high-order allocations before reclaim */
3418 : static struct page *
3419 0 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3420 : unsigned int alloc_flags, const struct alloc_context *ac,
3421 : enum compact_priority prio, enum compact_result *compact_result)
3422 : {
3423 0 : struct page *page = NULL;
3424 : unsigned long pflags;
3425 : unsigned int noreclaim_flag;
3426 :
3427 0 : if (!order)
3428 : return NULL;
3429 :
3430 0 : psi_memstall_enter(&pflags);
3431 : delayacct_compact_start();
3432 0 : noreclaim_flag = memalloc_noreclaim_save();
3433 :
3434 0 : *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3435 : prio, &page);
3436 :
3437 0 : memalloc_noreclaim_restore(noreclaim_flag);
3438 0 : psi_memstall_leave(&pflags);
3439 : delayacct_compact_end();
3440 :
3441 0 : if (*compact_result == COMPACT_SKIPPED)
3442 : return NULL;
3443 : /*
3444 : * At least in one zone compaction wasn't deferred or skipped, so let's
3445 : * count a compaction stall
3446 : */
3447 0 : count_vm_event(COMPACTSTALL);
3448 :
3449 : /* Prep a captured page if available */
3450 0 : if (page)
3451 0 : prep_new_page(page, order, gfp_mask, alloc_flags);
3452 :
3453 : /* Try get a page from the freelist if available */
3454 0 : if (!page)
3455 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3456 :
3457 0 : if (page) {
3458 0 : struct zone *zone = page_zone(page);
3459 :
3460 0 : zone->compact_blockskip_flush = false;
3461 0 : compaction_defer_reset(zone, order, true);
3462 0 : count_vm_event(COMPACTSUCCESS);
3463 0 : return page;
3464 : }
3465 :
3466 : /*
3467 : * It's bad if compaction run occurs and fails. The most likely reason
3468 : * is that pages exist, but not enough to satisfy watermarks.
3469 : */
3470 0 : count_vm_event(COMPACTFAIL);
3471 :
3472 0 : cond_resched();
3473 :
3474 0 : return NULL;
3475 : }
3476 :
3477 : static inline bool
3478 0 : should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3479 : enum compact_result compact_result,
3480 : enum compact_priority *compact_priority,
3481 : int *compaction_retries)
3482 : {
3483 0 : int max_retries = MAX_COMPACT_RETRIES;
3484 : int min_priority;
3485 0 : bool ret = false;
3486 0 : int retries = *compaction_retries;
3487 0 : enum compact_priority priority = *compact_priority;
3488 :
3489 0 : if (!order)
3490 : return false;
3491 :
3492 0 : if (fatal_signal_pending(current))
3493 : return false;
3494 :
3495 : /*
3496 : * Compaction was skipped due to a lack of free order-0
3497 : * migration targets. Continue if reclaim can help.
3498 : */
3499 0 : if (compact_result == COMPACT_SKIPPED) {
3500 0 : ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3501 0 : goto out;
3502 : }
3503 :
3504 : /*
3505 : * Compaction managed to coalesce some page blocks, but the
3506 : * allocation failed presumably due to a race. Retry some.
3507 : */
3508 0 : if (compact_result == COMPACT_SUCCESS) {
3509 : /*
3510 : * !costly requests are much more important than
3511 : * __GFP_RETRY_MAYFAIL costly ones because they are de
3512 : * facto nofail and invoke OOM killer to move on while
3513 : * costly can fail and users are ready to cope with
3514 : * that. 1/4 retries is rather arbitrary but we would
3515 : * need much more detailed feedback from compaction to
3516 : * make a better decision.
3517 : */
3518 0 : if (order > PAGE_ALLOC_COSTLY_ORDER)
3519 0 : max_retries /= 4;
3520 :
3521 0 : if (++(*compaction_retries) <= max_retries) {
3522 : ret = true;
3523 : goto out;
3524 : }
3525 : }
3526 :
3527 : /*
3528 : * Compaction failed. Retry with increasing priority.
3529 : */
3530 0 : min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3531 0 : MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3532 :
3533 0 : if (*compact_priority > min_priority) {
3534 0 : (*compact_priority)--;
3535 0 : *compaction_retries = 0;
3536 0 : ret = true;
3537 : }
3538 : out:
3539 0 : trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3540 0 : return ret;
3541 : }
3542 : #else
3543 : static inline struct page *
3544 : __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3545 : unsigned int alloc_flags, const struct alloc_context *ac,
3546 : enum compact_priority prio, enum compact_result *compact_result)
3547 : {
3548 : *compact_result = COMPACT_SKIPPED;
3549 : return NULL;
3550 : }
3551 :
3552 : static inline bool
3553 : should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3554 : enum compact_result compact_result,
3555 : enum compact_priority *compact_priority,
3556 : int *compaction_retries)
3557 : {
3558 : struct zone *zone;
3559 : struct zoneref *z;
3560 :
3561 : if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
3562 : return false;
3563 :
3564 : /*
3565 : * There are setups with compaction disabled which would prefer to loop
3566 : * inside the allocator rather than hit the oom killer prematurely.
3567 : * Let's give them a good hope and keep retrying while the order-0
3568 : * watermarks are OK.
3569 : */
3570 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3571 : ac->highest_zoneidx, ac->nodemask) {
3572 : if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3573 : ac->highest_zoneidx, alloc_flags))
3574 : return true;
3575 : }
3576 : return false;
3577 : }
3578 : #endif /* CONFIG_COMPACTION */
3579 :
3580 : #ifdef CONFIG_LOCKDEP
3581 : static struct lockdep_map __fs_reclaim_map =
3582 : STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3583 :
3584 : static bool __need_reclaim(gfp_t gfp_mask)
3585 : {
3586 : /* no reclaim without waiting on it */
3587 : if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3588 : return false;
3589 :
3590 : /* this guy won't enter reclaim */
3591 : if (current->flags & PF_MEMALLOC)
3592 : return false;
3593 :
3594 : if (gfp_mask & __GFP_NOLOCKDEP)
3595 : return false;
3596 :
3597 : return true;
3598 : }
3599 :
3600 : void __fs_reclaim_acquire(unsigned long ip)
3601 : {
3602 : lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
3603 : }
3604 :
3605 : void __fs_reclaim_release(unsigned long ip)
3606 : {
3607 : lock_release(&__fs_reclaim_map, ip);
3608 : }
3609 :
3610 : void fs_reclaim_acquire(gfp_t gfp_mask)
3611 : {
3612 : gfp_mask = current_gfp_context(gfp_mask);
3613 :
3614 : if (__need_reclaim(gfp_mask)) {
3615 : if (gfp_mask & __GFP_FS)
3616 : __fs_reclaim_acquire(_RET_IP_);
3617 :
3618 : #ifdef CONFIG_MMU_NOTIFIER
3619 : lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
3620 : lock_map_release(&__mmu_notifier_invalidate_range_start_map);
3621 : #endif
3622 :
3623 : }
3624 : }
3625 : EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
3626 :
3627 : void fs_reclaim_release(gfp_t gfp_mask)
3628 : {
3629 : gfp_mask = current_gfp_context(gfp_mask);
3630 :
3631 : if (__need_reclaim(gfp_mask)) {
3632 : if (gfp_mask & __GFP_FS)
3633 : __fs_reclaim_release(_RET_IP_);
3634 : }
3635 : }
3636 : EXPORT_SYMBOL_GPL(fs_reclaim_release);
3637 : #endif
3638 :
3639 : /*
3640 : * Zonelists may change due to hotplug during allocation. Detect when zonelists
3641 : * have been rebuilt so allocation retries. Reader side does not lock and
3642 : * retries the allocation if zonelist changes. Writer side is protected by the
3643 : * embedded spin_lock.
3644 : */
3645 : static DEFINE_SEQLOCK(zonelist_update_seq);
3646 :
3647 : static unsigned int zonelist_iter_begin(void)
3648 : {
3649 : if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
3650 : return read_seqbegin(&zonelist_update_seq);
3651 :
3652 : return 0;
3653 : }
3654 :
3655 : static unsigned int check_retry_zonelist(unsigned int seq)
3656 : {
3657 : if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
3658 : return read_seqretry(&zonelist_update_seq, seq);
3659 :
3660 : return seq;
3661 : }
3662 :
3663 : /* Perform direct synchronous page reclaim */
3664 : static unsigned long
3665 0 : __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3666 : const struct alloc_context *ac)
3667 : {
3668 : unsigned int noreclaim_flag;
3669 : unsigned long progress;
3670 :
3671 0 : cond_resched();
3672 :
3673 : /* We now go into synchronous reclaim */
3674 : cpuset_memory_pressure_bump();
3675 0 : fs_reclaim_acquire(gfp_mask);
3676 0 : noreclaim_flag = memalloc_noreclaim_save();
3677 :
3678 0 : progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
3679 : ac->nodemask);
3680 :
3681 0 : memalloc_noreclaim_restore(noreclaim_flag);
3682 0 : fs_reclaim_release(gfp_mask);
3683 :
3684 0 : cond_resched();
3685 :
3686 0 : return progress;
3687 : }
3688 :
3689 : /* The really slow allocator path where we enter direct reclaim */
3690 : static inline struct page *
3691 0 : __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
3692 : unsigned int alloc_flags, const struct alloc_context *ac,
3693 : unsigned long *did_some_progress)
3694 : {
3695 0 : struct page *page = NULL;
3696 : unsigned long pflags;
3697 0 : bool drained = false;
3698 :
3699 0 : psi_memstall_enter(&pflags);
3700 0 : *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
3701 0 : if (unlikely(!(*did_some_progress)))
3702 : goto out;
3703 :
3704 : retry:
3705 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3706 :
3707 : /*
3708 : * If an allocation failed after direct reclaim, it could be because
3709 : * pages are pinned on the per-cpu lists or in high alloc reserves.
3710 : * Shrink them and try again
3711 : */
3712 0 : if (!page && !drained) {
3713 0 : unreserve_highatomic_pageblock(ac, false);
3714 0 : drain_all_pages(NULL);
3715 0 : drained = true;
3716 0 : goto retry;
3717 : }
3718 : out:
3719 0 : psi_memstall_leave(&pflags);
3720 :
3721 0 : return page;
3722 : }
3723 :
3724 0 : static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
3725 : const struct alloc_context *ac)
3726 : {
3727 : struct zoneref *z;
3728 : struct zone *zone;
3729 0 : pg_data_t *last_pgdat = NULL;
3730 0 : enum zone_type highest_zoneidx = ac->highest_zoneidx;
3731 :
3732 0 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
3733 : ac->nodemask) {
3734 0 : if (!managed_zone(zone))
3735 0 : continue;
3736 0 : if (last_pgdat != zone->zone_pgdat) {
3737 0 : wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
3738 0 : last_pgdat = zone->zone_pgdat;
3739 : }
3740 : }
3741 0 : }
3742 :
3743 : static inline unsigned int
3744 0 : gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
3745 : {
3746 0 : unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
3747 :
3748 : /*
3749 : * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
3750 : * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
3751 : * to save two branches.
3752 : */
3753 : BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
3754 : BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
3755 :
3756 : /*
3757 : * The caller may dip into page reserves a bit more if the caller
3758 : * cannot run direct reclaim, or if the caller has realtime scheduling
3759 : * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
3760 : * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
3761 : */
3762 0 : alloc_flags |= (__force int)
3763 : (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
3764 :
3765 0 : if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
3766 : /*
3767 : * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
3768 : * if it can't schedule.
3769 : */
3770 0 : if (!(gfp_mask & __GFP_NOMEMALLOC)) {
3771 0 : alloc_flags |= ALLOC_NON_BLOCK;
3772 :
3773 0 : if (order > 0)
3774 0 : alloc_flags |= ALLOC_HIGHATOMIC;
3775 : }
3776 :
3777 : /*
3778 : * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
3779 : * GFP_ATOMIC) rather than fail, see the comment for
3780 : * cpuset_node_allowed().
3781 : */
3782 0 : if (alloc_flags & ALLOC_MIN_RESERVE)
3783 0 : alloc_flags &= ~ALLOC_CPUSET;
3784 0 : } else if (unlikely(rt_task(current)) && in_task())
3785 0 : alloc_flags |= ALLOC_MIN_RESERVE;
3786 :
3787 0 : alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
3788 :
3789 0 : return alloc_flags;
3790 : }
3791 :
3792 : static bool oom_reserves_allowed(struct task_struct *tsk)
3793 : {
3794 0 : if (!tsk_is_oom_victim(tsk))
3795 : return false;
3796 :
3797 : /*
3798 : * !MMU doesn't have oom reaper so give access to memory reserves
3799 : * only to the thread with TIF_MEMDIE set
3800 : */
3801 : if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
3802 : return false;
3803 :
3804 : return true;
3805 : }
3806 :
3807 : /*
3808 : * Distinguish requests which really need access to full memory
3809 : * reserves from oom victims which can live with a portion of it
3810 : */
3811 0 : static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
3812 : {
3813 0 : if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3814 : return 0;
3815 0 : if (gfp_mask & __GFP_MEMALLOC)
3816 : return ALLOC_NO_WATERMARKS;
3817 0 : if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3818 : return ALLOC_NO_WATERMARKS;
3819 0 : if (!in_interrupt()) {
3820 0 : if (current->flags & PF_MEMALLOC)
3821 : return ALLOC_NO_WATERMARKS;
3822 0 : else if (oom_reserves_allowed(current))
3823 : return ALLOC_OOM;
3824 : }
3825 :
3826 : return 0;
3827 : }
3828 :
3829 0 : bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3830 : {
3831 0 : return !!__gfp_pfmemalloc_flags(gfp_mask);
3832 : }
3833 :
3834 : /*
3835 : * Checks whether it makes sense to retry the reclaim to make a forward progress
3836 : * for the given allocation request.
3837 : *
3838 : * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
3839 : * without success, or when we couldn't even meet the watermark if we
3840 : * reclaimed all remaining pages on the LRU lists.
3841 : *
3842 : * Returns true if a retry is viable or false to enter the oom path.
3843 : */
3844 : static inline bool
3845 0 : should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3846 : struct alloc_context *ac, int alloc_flags,
3847 : bool did_some_progress, int *no_progress_loops)
3848 : {
3849 : struct zone *zone;
3850 : struct zoneref *z;
3851 0 : bool ret = false;
3852 :
3853 : /*
3854 : * Costly allocations might have made a progress but this doesn't mean
3855 : * their order will become available due to high fragmentation so
3856 : * always increment the no progress counter for them
3857 : */
3858 0 : if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
3859 0 : *no_progress_loops = 0;
3860 : else
3861 0 : (*no_progress_loops)++;
3862 :
3863 : /*
3864 : * Make sure we converge to OOM if we cannot make any progress
3865 : * several times in the row.
3866 : */
3867 0 : if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
3868 : /* Before OOM, exhaust highatomic_reserve */
3869 0 : return unreserve_highatomic_pageblock(ac, true);
3870 : }
3871 :
3872 : /*
3873 : * Keep reclaiming pages while there is a chance this will lead
3874 : * somewhere. If none of the target zones can satisfy our allocation
3875 : * request even if all reclaimable pages are considered then we are
3876 : * screwed and have to go OOM.
3877 : */
3878 0 : for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3879 : ac->highest_zoneidx, ac->nodemask) {
3880 : unsigned long available;
3881 : unsigned long reclaimable;
3882 0 : unsigned long min_wmark = min_wmark_pages(zone);
3883 : bool wmark;
3884 :
3885 0 : available = reclaimable = zone_reclaimable_pages(zone);
3886 0 : available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
3887 :
3888 : /*
3889 : * Would the allocation succeed if we reclaimed all
3890 : * reclaimable pages?
3891 : */
3892 0 : wmark = __zone_watermark_ok(zone, order, min_wmark,
3893 0 : ac->highest_zoneidx, alloc_flags, available);
3894 0 : trace_reclaim_retry_zone(z, order, reclaimable,
3895 : available, min_wmark, *no_progress_loops, wmark);
3896 0 : if (wmark) {
3897 : ret = true;
3898 : break;
3899 : }
3900 : }
3901 :
3902 : /*
3903 : * Memory allocation/reclaim might be called from a WQ context and the
3904 : * current implementation of the WQ concurrency control doesn't
3905 : * recognize that a particular WQ is congested if the worker thread is
3906 : * looping without ever sleeping. Therefore we have to do a short sleep
3907 : * here rather than calling cond_resched().
3908 : */
3909 0 : if (current->flags & PF_WQ_WORKER)
3910 0 : schedule_timeout_uninterruptible(1);
3911 : else
3912 0 : cond_resched();
3913 : return ret;
3914 : }
3915 :
3916 : static inline bool
3917 : check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
3918 : {
3919 : /*
3920 : * It's possible that cpuset's mems_allowed and the nodemask from
3921 : * mempolicy don't intersect. This should be normally dealt with by
3922 : * policy_nodemask(), but it's possible to race with cpuset update in
3923 : * such a way the check therein was true, and then it became false
3924 : * before we got our cpuset_mems_cookie here.
3925 : * This assumes that for all allocations, ac->nodemask can come only
3926 : * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
3927 : * when it does not intersect with the cpuset restrictions) or the
3928 : * caller can deal with a violated nodemask.
3929 : */
3930 : if (cpusets_enabled() && ac->nodemask &&
3931 : !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
3932 : ac->nodemask = NULL;
3933 : return true;
3934 : }
3935 :
3936 : /*
3937 : * When updating a task's mems_allowed or mempolicy nodemask, it is
3938 : * possible to race with parallel threads in such a way that our
3939 : * allocation can fail while the mask is being updated. If we are about
3940 : * to fail, check if the cpuset changed during allocation and if so,
3941 : * retry.
3942 : */
3943 0 : if (read_mems_allowed_retry(cpuset_mems_cookie))
3944 : return true;
3945 :
3946 : return false;
3947 : }
3948 :
3949 : static inline struct page *
3950 0 : __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3951 : struct alloc_context *ac)
3952 : {
3953 0 : bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
3954 0 : const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
3955 0 : struct page *page = NULL;
3956 : unsigned int alloc_flags;
3957 : unsigned long did_some_progress;
3958 : enum compact_priority compact_priority;
3959 : enum compact_result compact_result;
3960 : int compaction_retries;
3961 : int no_progress_loops;
3962 : unsigned int cpuset_mems_cookie;
3963 : unsigned int zonelist_iter_cookie;
3964 : int reserve_flags;
3965 :
3966 : restart:
3967 0 : compaction_retries = 0;
3968 0 : no_progress_loops = 0;
3969 0 : compact_priority = DEF_COMPACT_PRIORITY;
3970 0 : cpuset_mems_cookie = read_mems_allowed_begin();
3971 0 : zonelist_iter_cookie = zonelist_iter_begin();
3972 :
3973 : /*
3974 : * The fast path uses conservative alloc_flags to succeed only until
3975 : * kswapd needs to be woken up, and to avoid the cost of setting up
3976 : * alloc_flags precisely. So we do that now.
3977 : */
3978 0 : alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
3979 :
3980 : /*
3981 : * We need to recalculate the starting point for the zonelist iterator
3982 : * because we might have used different nodemask in the fast path, or
3983 : * there was a cpuset modification and we are retrying - otherwise we
3984 : * could end up iterating over non-eligible zones endlessly.
3985 : */
3986 0 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3987 : ac->highest_zoneidx, ac->nodemask);
3988 0 : if (!ac->preferred_zoneref->zone)
3989 : goto nopage;
3990 :
3991 : /*
3992 : * Check for insane configurations where the cpuset doesn't contain
3993 : * any suitable zone to satisfy the request - e.g. non-movable
3994 : * GFP_HIGHUSER allocations from MOVABLE nodes only.
3995 : */
3996 : if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
3997 : struct zoneref *z = first_zones_zonelist(ac->zonelist,
3998 : ac->highest_zoneidx,
3999 : &cpuset_current_mems_allowed);
4000 : if (!z->zone)
4001 : goto nopage;
4002 : }
4003 :
4004 0 : if (alloc_flags & ALLOC_KSWAPD)
4005 0 : wake_all_kswapds(order, gfp_mask, ac);
4006 :
4007 : /*
4008 : * The adjusted alloc_flags might result in immediate success, so try
4009 : * that first
4010 : */
4011 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4012 0 : if (page)
4013 : goto got_pg;
4014 :
4015 : /*
4016 : * For costly allocations, try direct compaction first, as it's likely
4017 : * that we have enough base pages and don't need to reclaim. For non-
4018 : * movable high-order allocations, do that as well, as compaction will
4019 : * try prevent permanent fragmentation by migrating from blocks of the
4020 : * same migratetype.
4021 : * Don't try this for allocations that are allowed to ignore
4022 : * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
4023 : */
4024 0 : if (can_direct_reclaim &&
4025 0 : (costly_order ||
4026 0 : (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
4027 0 : && !gfp_pfmemalloc_allowed(gfp_mask)) {
4028 0 : page = __alloc_pages_direct_compact(gfp_mask, order,
4029 : alloc_flags, ac,
4030 : INIT_COMPACT_PRIORITY,
4031 : &compact_result);
4032 0 : if (page)
4033 : goto got_pg;
4034 :
4035 : /*
4036 : * Checks for costly allocations with __GFP_NORETRY, which
4037 : * includes some THP page fault allocations
4038 : */
4039 0 : if (costly_order && (gfp_mask & __GFP_NORETRY)) {
4040 : /*
4041 : * If allocating entire pageblock(s) and compaction
4042 : * failed because all zones are below low watermarks
4043 : * or is prohibited because it recently failed at this
4044 : * order, fail immediately unless the allocator has
4045 : * requested compaction and reclaim retry.
4046 : *
4047 : * Reclaim is
4048 : * - potentially very expensive because zones are far
4049 : * below their low watermarks or this is part of very
4050 : * bursty high order allocations,
4051 : * - not guaranteed to help because isolate_freepages()
4052 : * may not iterate over freed pages as part of its
4053 : * linear scan, and
4054 : * - unlikely to make entire pageblocks free on its
4055 : * own.
4056 : */
4057 0 : if (compact_result == COMPACT_SKIPPED ||
4058 : compact_result == COMPACT_DEFERRED)
4059 : goto nopage;
4060 :
4061 : /*
4062 : * Looks like reclaim/compaction is worth trying, but
4063 : * sync compaction could be very expensive, so keep
4064 : * using async compaction.
4065 : */
4066 0 : compact_priority = INIT_COMPACT_PRIORITY;
4067 : }
4068 : }
4069 :
4070 : retry:
4071 : /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4072 0 : if (alloc_flags & ALLOC_KSWAPD)
4073 0 : wake_all_kswapds(order, gfp_mask, ac);
4074 :
4075 0 : reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
4076 0 : if (reserve_flags)
4077 0 : alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
4078 : (alloc_flags & ALLOC_KSWAPD);
4079 :
4080 : /*
4081 : * Reset the nodemask and zonelist iterators if memory policies can be
4082 : * ignored. These allocations are high priority and system rather than
4083 : * user oriented.
4084 : */
4085 0 : if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
4086 0 : ac->nodemask = NULL;
4087 0 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4088 : ac->highest_zoneidx, ac->nodemask);
4089 : }
4090 :
4091 : /* Attempt with potentially adjusted zonelist and alloc_flags */
4092 0 : page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
4093 0 : if (page)
4094 : goto got_pg;
4095 :
4096 : /* Caller is not willing to reclaim, we can't balance anything */
4097 0 : if (!can_direct_reclaim)
4098 : goto nopage;
4099 :
4100 : /* Avoid recursion of direct reclaim */
4101 0 : if (current->flags & PF_MEMALLOC)
4102 : goto nopage;
4103 :
4104 : /* Try direct reclaim and then allocating */
4105 0 : page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4106 : &did_some_progress);
4107 0 : if (page)
4108 : goto got_pg;
4109 :
4110 : /* Try direct compaction and then allocating */
4111 0 : page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4112 : compact_priority, &compact_result);
4113 0 : if (page)
4114 : goto got_pg;
4115 :
4116 : /* Do not loop if specifically requested */
4117 0 : if (gfp_mask & __GFP_NORETRY)
4118 : goto nopage;
4119 :
4120 : /*
4121 : * Do not retry costly high order allocations unless they are
4122 : * __GFP_RETRY_MAYFAIL
4123 : */
4124 0 : if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
4125 : goto nopage;
4126 :
4127 0 : if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4128 : did_some_progress > 0, &no_progress_loops))
4129 : goto retry;
4130 :
4131 : /*
4132 : * It doesn't make any sense to retry for the compaction if the order-0
4133 : * reclaim is not able to make any progress because the current
4134 : * implementation of the compaction depends on the sufficient amount
4135 : * of free memory (see __compaction_suitable)
4136 : */
4137 0 : if (did_some_progress > 0 &&
4138 0 : should_compact_retry(ac, order, alloc_flags,
4139 : compact_result, &compact_priority,
4140 : &compaction_retries))
4141 : goto retry;
4142 :
4143 :
4144 : /*
4145 : * Deal with possible cpuset update races or zonelist updates to avoid
4146 : * a unnecessary OOM kill.
4147 : */
4148 0 : if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
4149 0 : check_retry_zonelist(zonelist_iter_cookie))
4150 : goto restart;
4151 :
4152 : /* Reclaim has failed us, start killing things */
4153 0 : page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4154 0 : if (page)
4155 : goto got_pg;
4156 :
4157 : /* Avoid allocations with no watermarks from looping endlessly */
4158 0 : if (tsk_is_oom_victim(current) &&
4159 0 : (alloc_flags & ALLOC_OOM ||
4160 0 : (gfp_mask & __GFP_NOMEMALLOC)))
4161 : goto nopage;
4162 :
4163 : /* Retry as long as the OOM killer is making progress */
4164 0 : if (did_some_progress) {
4165 0 : no_progress_loops = 0;
4166 0 : goto retry;
4167 : }
4168 :
4169 : nopage:
4170 : /*
4171 : * Deal with possible cpuset update races or zonelist updates to avoid
4172 : * a unnecessary OOM kill.
4173 : */
4174 0 : if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
4175 0 : check_retry_zonelist(zonelist_iter_cookie))
4176 : goto restart;
4177 :
4178 : /*
4179 : * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4180 : * we always retry
4181 : */
4182 0 : if (gfp_mask & __GFP_NOFAIL) {
4183 : /*
4184 : * All existing users of the __GFP_NOFAIL are blockable, so warn
4185 : * of any new users that actually require GFP_NOWAIT
4186 : */
4187 0 : if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
4188 : goto fail;
4189 :
4190 : /*
4191 : * PF_MEMALLOC request from this context is rather bizarre
4192 : * because we cannot reclaim anything and only can loop waiting
4193 : * for somebody to do a work for us
4194 : */
4195 0 : WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
4196 :
4197 : /*
4198 : * non failing costly orders are a hard requirement which we
4199 : * are not prepared for much so let's warn about these users
4200 : * so that we can identify them and convert them to something
4201 : * else.
4202 : */
4203 0 : WARN_ON_ONCE_GFP(costly_order, gfp_mask);
4204 :
4205 : /*
4206 : * Help non-failing allocations by giving some access to memory
4207 : * reserves normally used for high priority non-blocking
4208 : * allocations but do not use ALLOC_NO_WATERMARKS because this
4209 : * could deplete whole memory reserves which would just make
4210 : * the situation worse.
4211 : */
4212 0 : page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
4213 0 : if (page)
4214 : goto got_pg;
4215 :
4216 0 : cond_resched();
4217 0 : goto retry;
4218 : }
4219 : fail:
4220 0 : warn_alloc(gfp_mask, ac->nodemask,
4221 : "page allocation failure: order:%u", order);
4222 : got_pg:
4223 0 : return page;
4224 : }
4225 :
4226 457 : static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4227 : int preferred_nid, nodemask_t *nodemask,
4228 : struct alloc_context *ac, gfp_t *alloc_gfp,
4229 : unsigned int *alloc_flags)
4230 : {
4231 457 : ac->highest_zoneidx = gfp_zone(gfp_mask);
4232 914 : ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
4233 457 : ac->nodemask = nodemask;
4234 457 : ac->migratetype = gfp_migratetype(gfp_mask);
4235 :
4236 : if (cpusets_enabled()) {
4237 : *alloc_gfp |= __GFP_HARDWALL;
4238 : /*
4239 : * When we are in the interrupt context, it is irrelevant
4240 : * to the current task context. It means that any node ok.
4241 : */
4242 : if (in_task() && !ac->nodemask)
4243 : ac->nodemask = &cpuset_current_mems_allowed;
4244 : else
4245 : *alloc_flags |= ALLOC_CPUSET;
4246 : }
4247 :
4248 457 : might_alloc(gfp_mask);
4249 :
4250 457 : if (should_fail_alloc_page(gfp_mask, order))
4251 : return false;
4252 :
4253 457 : *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
4254 :
4255 : /* Dirty zone balancing only done in the fast path */
4256 457 : ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
4257 :
4258 : /*
4259 : * The preferred zone is used for statistics but crucially it is
4260 : * also used as the starting point for the zonelist iterator. It
4261 : * may get reset for allocations that ignore memory policies.
4262 : */
4263 914 : ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4264 : ac->highest_zoneidx, ac->nodemask);
4265 :
4266 : return true;
4267 : }
4268 :
4269 : /*
4270 : * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
4271 : * @gfp: GFP flags for the allocation
4272 : * @preferred_nid: The preferred NUMA node ID to allocate from
4273 : * @nodemask: Set of nodes to allocate from, may be NULL
4274 : * @nr_pages: The number of pages desired on the list or array
4275 : * @page_list: Optional list to store the allocated pages
4276 : * @page_array: Optional array to store the pages
4277 : *
4278 : * This is a batched version of the page allocator that attempts to
4279 : * allocate nr_pages quickly. Pages are added to page_list if page_list
4280 : * is not NULL, otherwise it is assumed that the page_array is valid.
4281 : *
4282 : * For lists, nr_pages is the number of pages that should be allocated.
4283 : *
4284 : * For arrays, only NULL elements are populated with pages and nr_pages
4285 : * is the maximum number of pages that will be stored in the array.
4286 : *
4287 : * Returns the number of pages on the list or array.
4288 : */
4289 16 : unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
4290 : nodemask_t *nodemask, int nr_pages,
4291 : struct list_head *page_list,
4292 : struct page **page_array)
4293 : {
4294 : struct page *page;
4295 : unsigned long __maybe_unused UP_flags;
4296 : struct zone *zone;
4297 : struct zoneref *z;
4298 : struct per_cpu_pages *pcp;
4299 : struct list_head *pcp_list;
4300 : struct alloc_context ac;
4301 : gfp_t alloc_gfp;
4302 16 : unsigned int alloc_flags = ALLOC_WMARK_LOW;
4303 16 : int nr_populated = 0, nr_account = 0;
4304 :
4305 : /*
4306 : * Skip populated array elements to determine if any pages need
4307 : * to be allocated before disabling IRQs.
4308 : */
4309 32 : while (page_array && nr_populated < nr_pages && page_array[nr_populated])
4310 0 : nr_populated++;
4311 :
4312 : /* No pages requested? */
4313 16 : if (unlikely(nr_pages <= 0))
4314 : goto out;
4315 :
4316 : /* Already populated array? */
4317 16 : if (unlikely(page_array && nr_pages - nr_populated == 0))
4318 : goto out;
4319 :
4320 : /* Bulk allocator does not support memcg accounting. */
4321 : if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT))
4322 : goto failed;
4323 :
4324 : /* Use the single page allocator for one page. */
4325 16 : if (nr_pages - nr_populated == 1)
4326 : goto failed;
4327 :
4328 : #ifdef CONFIG_PAGE_OWNER
4329 : /*
4330 : * PAGE_OWNER may recurse into the allocator to allocate space to
4331 : * save the stack with pagesets.lock held. Releasing/reacquiring
4332 : * removes much of the performance benefit of bulk allocation so
4333 : * force the caller to allocate one page at a time as it'll have
4334 : * similar performance to added complexity to the bulk allocator.
4335 : */
4336 : if (static_branch_unlikely(&page_owner_inited))
4337 : goto failed;
4338 : #endif
4339 :
4340 : /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
4341 16 : gfp &= gfp_allowed_mask;
4342 16 : alloc_gfp = gfp;
4343 16 : if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
4344 : goto out;
4345 16 : gfp = alloc_gfp;
4346 :
4347 : /* Find an allowed local zone that meets the low watermark. */
4348 32 : for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
4349 : unsigned long mark;
4350 :
4351 : if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
4352 : !__cpuset_zone_allowed(zone, gfp)) {
4353 : continue;
4354 : }
4355 :
4356 : if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
4357 : zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
4358 : goto failed;
4359 : }
4360 :
4361 16 : mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
4362 16 : if (zone_watermark_fast(zone, 0, mark,
4363 : zonelist_zone_idx(ac.preferred_zoneref),
4364 : alloc_flags, gfp)) {
4365 : break;
4366 : }
4367 : }
4368 :
4369 : /*
4370 : * If there are no allowed local zones that meets the watermarks then
4371 : * try to allocate a single page and reclaim if necessary.
4372 : */
4373 16 : if (unlikely(!zone))
4374 : goto failed;
4375 :
4376 : /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
4377 16 : pcp_trylock_prepare(UP_flags);
4378 32 : pcp = pcp_spin_trylock(zone->per_cpu_pageset);
4379 16 : if (!pcp)
4380 : goto failed_irq;
4381 :
4382 : /* Attempt the batch allocation */
4383 32 : pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
4384 96 : while (nr_populated < nr_pages) {
4385 :
4386 : /* Skip existing pages */
4387 64 : if (page_array && page_array[nr_populated]) {
4388 0 : nr_populated++;
4389 0 : continue;
4390 : }
4391 :
4392 64 : page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
4393 : pcp, pcp_list);
4394 64 : if (unlikely(!page)) {
4395 : /* Try and allocate at least one page */
4396 0 : if (!nr_account) {
4397 0 : pcp_spin_unlock(pcp);
4398 0 : goto failed_irq;
4399 : }
4400 : break;
4401 : }
4402 64 : nr_account++;
4403 :
4404 64 : prep_new_page(page, 0, gfp, 0);
4405 64 : if (page_list)
4406 0 : list_add(&page->lru, page_list);
4407 : else
4408 64 : page_array[nr_populated] = page;
4409 64 : nr_populated++;
4410 : }
4411 :
4412 32 : pcp_spin_unlock(pcp);
4413 32 : pcp_trylock_finish(UP_flags);
4414 :
4415 32 : __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
4416 16 : zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
4417 :
4418 : out:
4419 16 : return nr_populated;
4420 :
4421 : failed_irq:
4422 0 : pcp_trylock_finish(UP_flags);
4423 :
4424 : failed:
4425 0 : page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
4426 0 : if (page) {
4427 0 : if (page_list)
4428 0 : list_add(&page->lru, page_list);
4429 : else
4430 0 : page_array[nr_populated] = page;
4431 0 : nr_populated++;
4432 : }
4433 :
4434 : goto out;
4435 : }
4436 : EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
4437 :
4438 : /*
4439 : * This is the 'heart' of the zoned buddy allocator.
4440 : */
4441 441 : struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
4442 : nodemask_t *nodemask)
4443 : {
4444 : struct page *page;
4445 441 : unsigned int alloc_flags = ALLOC_WMARK_LOW;
4446 : gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
4447 441 : struct alloc_context ac = { };
4448 :
4449 : /*
4450 : * There are several places where we assume that the order value is sane
4451 : * so bail out early if the request is out of bound.
4452 : */
4453 441 : if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
4454 : return NULL;
4455 :
4456 441 : gfp &= gfp_allowed_mask;
4457 : /*
4458 : * Apply scoped allocation constraints. This is mainly about GFP_NOFS
4459 : * resp. GFP_NOIO which has to be inherited for all allocation requests
4460 : * from a particular context which has been marked by
4461 : * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
4462 : * movable zones are not used during allocation.
4463 : */
4464 441 : gfp = current_gfp_context(gfp);
4465 441 : alloc_gfp = gfp;
4466 441 : if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
4467 : &alloc_gfp, &alloc_flags))
4468 : return NULL;
4469 :
4470 : /*
4471 : * Forbid the first pass from falling back to types that fragment
4472 : * memory until all local zones are considered.
4473 : */
4474 882 : alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
4475 :
4476 : /* First allocation attempt */
4477 441 : page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
4478 441 : if (likely(page))
4479 : goto out;
4480 :
4481 0 : alloc_gfp = gfp;
4482 0 : ac.spread_dirty_pages = false;
4483 :
4484 : /*
4485 : * Restore the original nodemask if it was potentially replaced with
4486 : * &cpuset_current_mems_allowed to optimize the fast-path attempt.
4487 : */
4488 0 : ac.nodemask = nodemask;
4489 :
4490 0 : page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
4491 :
4492 : out:
4493 : if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
4494 : unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
4495 : __free_pages(page, order);
4496 : page = NULL;
4497 : }
4498 :
4499 441 : trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
4500 441 : kmsan_alloc_page(page, order, alloc_gfp);
4501 :
4502 441 : return page;
4503 : }
4504 : EXPORT_SYMBOL(__alloc_pages);
4505 :
4506 0 : struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
4507 : nodemask_t *nodemask)
4508 : {
4509 0 : struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
4510 : preferred_nid, nodemask);
4511 :
4512 : if (page && order > 1)
4513 : prep_transhuge_page(page);
4514 0 : return (struct folio *)page;
4515 : }
4516 : EXPORT_SYMBOL(__folio_alloc);
4517 :
4518 : /*
4519 : * Common helper functions. Never use with __GFP_HIGHMEM because the returned
4520 : * address cannot represent highmem pages. Use alloc_pages and then kmap if
4521 : * you need to access high mem.
4522 : */
4523 4 : unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
4524 : {
4525 : struct page *page;
4526 :
4527 8 : page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
4528 4 : if (!page)
4529 : return 0;
4530 4 : return (unsigned long) page_address(page);
4531 : }
4532 : EXPORT_SYMBOL(__get_free_pages);
4533 :
4534 0 : unsigned long get_zeroed_page(gfp_t gfp_mask)
4535 : {
4536 0 : return __get_free_page(gfp_mask | __GFP_ZERO);
4537 : }
4538 : EXPORT_SYMBOL(get_zeroed_page);
4539 :
4540 : /**
4541 : * __free_pages - Free pages allocated with alloc_pages().
4542 : * @page: The page pointer returned from alloc_pages().
4543 : * @order: The order of the allocation.
4544 : *
4545 : * This function can free multi-page allocations that are not compound
4546 : * pages. It does not check that the @order passed in matches that of
4547 : * the allocation, so it is easy to leak memory. Freeing more memory
4548 : * than was allocated will probably emit a warning.
4549 : *
4550 : * If the last reference to this page is speculative, it will be released
4551 : * by put_page() which only frees the first page of a non-compound
4552 : * allocation. To prevent the remaining pages from being leaked, we free
4553 : * the subsequent pages here. If you want to use the page's reference
4554 : * count to decide when to free the allocation, you should allocate a
4555 : * compound page, and use put_page() instead of __free_pages().
4556 : *
4557 : * Context: May be called in interrupt context or while holding a normal
4558 : * spinlock, but not in NMI context or while holding a raw spinlock.
4559 : */
4560 0 : void __free_pages(struct page *page, unsigned int order)
4561 : {
4562 : /* get PageHead before we drop reference */
4563 0 : int head = PageHead(page);
4564 :
4565 0 : if (put_page_testzero(page))
4566 0 : free_the_page(page, order);
4567 0 : else if (!head)
4568 0 : while (order-- > 0)
4569 0 : free_the_page(page + (1 << order), order);
4570 0 : }
4571 : EXPORT_SYMBOL(__free_pages);
4572 :
4573 0 : void free_pages(unsigned long addr, unsigned int order)
4574 : {
4575 0 : if (addr != 0) {
4576 : VM_BUG_ON(!virt_addr_valid((void *)addr));
4577 0 : __free_pages(virt_to_page((void *)addr), order);
4578 : }
4579 0 : }
4580 :
4581 : EXPORT_SYMBOL(free_pages);
4582 :
4583 : /*
4584 : * Page Fragment:
4585 : * An arbitrary-length arbitrary-offset area of memory which resides
4586 : * within a 0 or higher order page. Multiple fragments within that page
4587 : * are individually refcounted, in the page's reference counter.
4588 : *
4589 : * The page_frag functions below provide a simple allocation framework for
4590 : * page fragments. This is used by the network stack and network device
4591 : * drivers to provide a backing region of memory for use as either an
4592 : * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
4593 : */
4594 0 : static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
4595 : gfp_t gfp_mask)
4596 : {
4597 0 : struct page *page = NULL;
4598 0 : gfp_t gfp = gfp_mask;
4599 :
4600 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4601 0 : gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
4602 : __GFP_NOMEMALLOC;
4603 0 : page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
4604 0 : PAGE_FRAG_CACHE_MAX_ORDER);
4605 0 : nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
4606 : #endif
4607 0 : if (unlikely(!page))
4608 0 : page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
4609 :
4610 0 : nc->va = page ? page_address(page) : NULL;
4611 :
4612 0 : return page;
4613 : }
4614 :
4615 0 : void __page_frag_cache_drain(struct page *page, unsigned int count)
4616 : {
4617 : VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4618 :
4619 0 : if (page_ref_sub_and_test(page, count))
4620 0 : free_the_page(page, compound_order(page));
4621 0 : }
4622 : EXPORT_SYMBOL(__page_frag_cache_drain);
4623 :
4624 0 : void *page_frag_alloc_align(struct page_frag_cache *nc,
4625 : unsigned int fragsz, gfp_t gfp_mask,
4626 : unsigned int align_mask)
4627 : {
4628 0 : unsigned int size = PAGE_SIZE;
4629 : struct page *page;
4630 : int offset;
4631 :
4632 0 : if (unlikely(!nc->va)) {
4633 : refill:
4634 0 : page = __page_frag_cache_refill(nc, gfp_mask);
4635 0 : if (!page)
4636 : return NULL;
4637 :
4638 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4639 : /* if size can vary use size else just use PAGE_SIZE */
4640 0 : size = nc->size;
4641 : #endif
4642 : /* Even if we own the page, we do not use atomic_set().
4643 : * This would break get_page_unless_zero() users.
4644 : */
4645 0 : page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
4646 :
4647 : /* reset page count bias and offset to start of new frag */
4648 0 : nc->pfmemalloc = page_is_pfmemalloc(page);
4649 0 : nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
4650 0 : nc->offset = size;
4651 : }
4652 :
4653 0 : offset = nc->offset - fragsz;
4654 0 : if (unlikely(offset < 0)) {
4655 0 : page = virt_to_page(nc->va);
4656 :
4657 0 : if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
4658 : goto refill;
4659 :
4660 0 : if (unlikely(nc->pfmemalloc)) {
4661 0 : free_the_page(page, compound_order(page));
4662 0 : goto refill;
4663 : }
4664 :
4665 : #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4666 : /* if size can vary use size else just use PAGE_SIZE */
4667 0 : size = nc->size;
4668 : #endif
4669 : /* OK, page count is 0, we can safely set it */
4670 0 : set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
4671 :
4672 : /* reset page count bias and offset to start of new frag */
4673 0 : nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
4674 0 : offset = size - fragsz;
4675 0 : if (unlikely(offset < 0)) {
4676 : /*
4677 : * The caller is trying to allocate a fragment
4678 : * with fragsz > PAGE_SIZE but the cache isn't big
4679 : * enough to satisfy the request, this may
4680 : * happen in low memory conditions.
4681 : * We don't release the cache page because
4682 : * it could make memory pressure worse
4683 : * so we simply return NULL here.
4684 : */
4685 : return NULL;
4686 : }
4687 : }
4688 :
4689 0 : nc->pagecnt_bias--;
4690 0 : offset &= align_mask;
4691 0 : nc->offset = offset;
4692 :
4693 0 : return nc->va + offset;
4694 : }
4695 : EXPORT_SYMBOL(page_frag_alloc_align);
4696 :
4697 : /*
4698 : * Frees a page fragment allocated out of either a compound or order 0 page.
4699 : */
4700 0 : void page_frag_free(void *addr)
4701 : {
4702 0 : struct page *page = virt_to_head_page(addr);
4703 :
4704 0 : if (unlikely(put_page_testzero(page)))
4705 0 : free_the_page(page, compound_order(page));
4706 0 : }
4707 : EXPORT_SYMBOL(page_frag_free);
4708 :
4709 3 : static void *make_alloc_exact(unsigned long addr, unsigned int order,
4710 : size_t size)
4711 : {
4712 3 : if (addr) {
4713 3 : unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
4714 6 : struct page *page = virt_to_page((void *)addr);
4715 3 : struct page *last = page + nr;
4716 :
4717 3 : split_page_owner(page, 1 << order);
4718 3 : split_page_memcg(page, 1 << order);
4719 10 : while (page < --last)
4720 : set_page_refcounted(last);
4721 :
4722 3 : last = page + (1UL << order);
4723 3 : for (page += nr; page < last; page++)
4724 0 : __free_pages_ok(page, 0, FPI_TO_TAIL);
4725 : }
4726 3 : return (void *)addr;
4727 : }
4728 :
4729 : /**
4730 : * alloc_pages_exact - allocate an exact number physically-contiguous pages.
4731 : * @size: the number of bytes to allocate
4732 : * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
4733 : *
4734 : * This function is similar to alloc_pages(), except that it allocates the
4735 : * minimum number of pages to satisfy the request. alloc_pages() can only
4736 : * allocate memory in power-of-two pages.
4737 : *
4738 : * This function is also limited by MAX_ORDER.
4739 : *
4740 : * Memory allocated by this function must be released by free_pages_exact().
4741 : *
4742 : * Return: pointer to the allocated area or %NULL in case of error.
4743 : */
4744 3 : void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4745 : {
4746 3 : unsigned int order = get_order(size);
4747 : unsigned long addr;
4748 :
4749 3 : if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
4750 0 : gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
4751 :
4752 3 : addr = __get_free_pages(gfp_mask, order);
4753 3 : return make_alloc_exact(addr, order, size);
4754 : }
4755 : EXPORT_SYMBOL(alloc_pages_exact);
4756 :
4757 : /**
4758 : * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
4759 : * pages on a node.
4760 : * @nid: the preferred node ID where memory should be allocated
4761 : * @size: the number of bytes to allocate
4762 : * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
4763 : *
4764 : * Like alloc_pages_exact(), but try to allocate on node nid first before falling
4765 : * back.
4766 : *
4767 : * Return: pointer to the allocated area or %NULL in case of error.
4768 : */
4769 0 : void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
4770 : {
4771 0 : unsigned int order = get_order(size);
4772 : struct page *p;
4773 :
4774 0 : if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
4775 0 : gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
4776 :
4777 0 : p = alloc_pages_node(nid, gfp_mask, order);
4778 0 : if (!p)
4779 : return NULL;
4780 0 : return make_alloc_exact((unsigned long)page_address(p), order, size);
4781 : }
4782 :
4783 : /**
4784 : * free_pages_exact - release memory allocated via alloc_pages_exact()
4785 : * @virt: the value returned by alloc_pages_exact.
4786 : * @size: size of allocation, same value as passed to alloc_pages_exact().
4787 : *
4788 : * Release the memory allocated by a previous call to alloc_pages_exact.
4789 : */
4790 0 : void free_pages_exact(void *virt, size_t size)
4791 : {
4792 0 : unsigned long addr = (unsigned long)virt;
4793 0 : unsigned long end = addr + PAGE_ALIGN(size);
4794 :
4795 0 : while (addr < end) {
4796 0 : free_page(addr);
4797 0 : addr += PAGE_SIZE;
4798 : }
4799 0 : }
4800 : EXPORT_SYMBOL(free_pages_exact);
4801 :
4802 : /**
4803 : * nr_free_zone_pages - count number of pages beyond high watermark
4804 : * @offset: The zone index of the highest zone
4805 : *
4806 : * nr_free_zone_pages() counts the number of pages which are beyond the
4807 : * high watermark within all zones at or below a given zone index. For each
4808 : * zone, the number of pages is calculated as:
4809 : *
4810 : * nr_free_zone_pages = managed_pages - high_pages
4811 : *
4812 : * Return: number of pages beyond high watermark.
4813 : */
4814 3 : static unsigned long nr_free_zone_pages(int offset)
4815 : {
4816 : struct zoneref *z;
4817 : struct zone *zone;
4818 :
4819 : /* Just pick one node, since fallback list is circular */
4820 3 : unsigned long sum = 0;
4821 :
4822 6 : struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
4823 :
4824 12 : for_each_zone_zonelist(zone, z, zonelist, offset) {
4825 3 : unsigned long size = zone_managed_pages(zone);
4826 3 : unsigned long high = high_wmark_pages(zone);
4827 3 : if (size > high)
4828 3 : sum += size - high;
4829 : }
4830 :
4831 3 : return sum;
4832 : }
4833 :
4834 : /**
4835 : * nr_free_buffer_pages - count number of pages beyond high watermark
4836 : *
4837 : * nr_free_buffer_pages() counts the number of pages which are beyond the high
4838 : * watermark within ZONE_DMA and ZONE_NORMAL.
4839 : *
4840 : * Return: number of pages beyond high watermark within ZONE_DMA and
4841 : * ZONE_NORMAL.
4842 : */
4843 1 : unsigned long nr_free_buffer_pages(void)
4844 : {
4845 2 : return nr_free_zone_pages(gfp_zone(GFP_USER));
4846 : }
4847 : EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4848 :
4849 : static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
4850 : {
4851 1 : zoneref->zone = zone;
4852 1 : zoneref->zone_idx = zone_idx(zone);
4853 : }
4854 :
4855 : /*
4856 : * Builds allocation fallback zone lists.
4857 : *
4858 : * Add all populated zones of a node to the zonelist.
4859 : */
4860 : static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
4861 : {
4862 : struct zone *zone;
4863 1 : enum zone_type zone_type = MAX_NR_ZONES;
4864 1 : int nr_zones = 0;
4865 :
4866 : do {
4867 2 : zone_type--;
4868 2 : zone = pgdat->node_zones + zone_type;
4869 2 : if (populated_zone(zone)) {
4870 2 : zoneref_set_zone(zone, &zonerefs[nr_zones++]);
4871 1 : check_highest_zone(zone_type);
4872 : }
4873 2 : } while (zone_type);
4874 :
4875 : return nr_zones;
4876 : }
4877 :
4878 : #ifdef CONFIG_NUMA
4879 :
4880 : static int __parse_numa_zonelist_order(char *s)
4881 : {
4882 : /*
4883 : * We used to support different zonelists modes but they turned
4884 : * out to be just not useful. Let's keep the warning in place
4885 : * if somebody still use the cmd line parameter so that we do
4886 : * not fail it silently
4887 : */
4888 : if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
4889 : pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
4890 : return -EINVAL;
4891 : }
4892 : return 0;
4893 : }
4894 :
4895 : static char numa_zonelist_order[] = "Node";
4896 : #define NUMA_ZONELIST_ORDER_LEN 16
4897 : /*
4898 : * sysctl handler for numa_zonelist_order
4899 : */
4900 : static int numa_zonelist_order_handler(struct ctl_table *table, int write,
4901 : void *buffer, size_t *length, loff_t *ppos)
4902 : {
4903 : if (write)
4904 : return __parse_numa_zonelist_order(buffer);
4905 : return proc_dostring(table, write, buffer, length, ppos);
4906 : }
4907 :
4908 : static int node_load[MAX_NUMNODES];
4909 :
4910 : /**
4911 : * find_next_best_node - find the next node that should appear in a given node's fallback list
4912 : * @node: node whose fallback list we're appending
4913 : * @used_node_mask: nodemask_t of already used nodes
4914 : *
4915 : * We use a number of factors to determine which is the next node that should
4916 : * appear on a given node's fallback list. The node should not have appeared
4917 : * already in @node's fallback list, and it should be the next closest node
4918 : * according to the distance array (which contains arbitrary distance values
4919 : * from each node to each node in the system), and should also prefer nodes
4920 : * with no CPUs, since presumably they'll have very little allocation pressure
4921 : * on them otherwise.
4922 : *
4923 : * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
4924 : */
4925 : int find_next_best_node(int node, nodemask_t *used_node_mask)
4926 : {
4927 : int n, val;
4928 : int min_val = INT_MAX;
4929 : int best_node = NUMA_NO_NODE;
4930 :
4931 : /* Use the local node if we haven't already */
4932 : if (!node_isset(node, *used_node_mask)) {
4933 : node_set(node, *used_node_mask);
4934 : return node;
4935 : }
4936 :
4937 : for_each_node_state(n, N_MEMORY) {
4938 :
4939 : /* Don't want a node to appear more than once */
4940 : if (node_isset(n, *used_node_mask))
4941 : continue;
4942 :
4943 : /* Use the distance array to find the distance */
4944 : val = node_distance(node, n);
4945 :
4946 : /* Penalize nodes under us ("prefer the next node") */
4947 : val += (n < node);
4948 :
4949 : /* Give preference to headless and unused nodes */
4950 : if (!cpumask_empty(cpumask_of_node(n)))
4951 : val += PENALTY_FOR_NODE_WITH_CPUS;
4952 :
4953 : /* Slight preference for less loaded node */
4954 : val *= MAX_NUMNODES;
4955 : val += node_load[n];
4956 :
4957 : if (val < min_val) {
4958 : min_val = val;
4959 : best_node = n;
4960 : }
4961 : }
4962 :
4963 : if (best_node >= 0)
4964 : node_set(best_node, *used_node_mask);
4965 :
4966 : return best_node;
4967 : }
4968 :
4969 :
4970 : /*
4971 : * Build zonelists ordered by node and zones within node.
4972 : * This results in maximum locality--normal zone overflows into local
4973 : * DMA zone, if any--but risks exhausting DMA zone.
4974 : */
4975 : static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
4976 : unsigned nr_nodes)
4977 : {
4978 : struct zoneref *zonerefs;
4979 : int i;
4980 :
4981 : zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
4982 :
4983 : for (i = 0; i < nr_nodes; i++) {
4984 : int nr_zones;
4985 :
4986 : pg_data_t *node = NODE_DATA(node_order[i]);
4987 :
4988 : nr_zones = build_zonerefs_node(node, zonerefs);
4989 : zonerefs += nr_zones;
4990 : }
4991 : zonerefs->zone = NULL;
4992 : zonerefs->zone_idx = 0;
4993 : }
4994 :
4995 : /*
4996 : * Build gfp_thisnode zonelists
4997 : */
4998 : static void build_thisnode_zonelists(pg_data_t *pgdat)
4999 : {
5000 : struct zoneref *zonerefs;
5001 : int nr_zones;
5002 :
5003 : zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5004 : nr_zones = build_zonerefs_node(pgdat, zonerefs);
5005 : zonerefs += nr_zones;
5006 : zonerefs->zone = NULL;
5007 : zonerefs->zone_idx = 0;
5008 : }
5009 :
5010 : /*
5011 : * Build zonelists ordered by zone and nodes within zones.
5012 : * This results in conserving DMA zone[s] until all Normal memory is
5013 : * exhausted, but results in overflowing to remote node while memory
5014 : * may still exist in local DMA zone.
5015 : */
5016 :
5017 : static void build_zonelists(pg_data_t *pgdat)
5018 : {
5019 : static int node_order[MAX_NUMNODES];
5020 : int node, nr_nodes = 0;
5021 : nodemask_t used_mask = NODE_MASK_NONE;
5022 : int local_node, prev_node;
5023 :
5024 : /* NUMA-aware ordering of nodes */
5025 : local_node = pgdat->node_id;
5026 : prev_node = local_node;
5027 :
5028 : memset(node_order, 0, sizeof(node_order));
5029 : while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5030 : /*
5031 : * We don't want to pressure a particular node.
5032 : * So adding penalty to the first node in same
5033 : * distance group to make it round-robin.
5034 : */
5035 : if (node_distance(local_node, node) !=
5036 : node_distance(local_node, prev_node))
5037 : node_load[node] += 1;
5038 :
5039 : node_order[nr_nodes++] = node;
5040 : prev_node = node;
5041 : }
5042 :
5043 : build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5044 : build_thisnode_zonelists(pgdat);
5045 : pr_info("Fallback order for Node %d: ", local_node);
5046 : for (node = 0; node < nr_nodes; node++)
5047 : pr_cont("%d ", node_order[node]);
5048 : pr_cont("\n");
5049 : }
5050 :
5051 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5052 : /*
5053 : * Return node id of node used for "local" allocations.
5054 : * I.e., first node id of first zone in arg node's generic zonelist.
5055 : * Used for initializing percpu 'numa_mem', which is used primarily
5056 : * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
5057 : */
5058 : int local_memory_node(int node)
5059 : {
5060 : struct zoneref *z;
5061 :
5062 : z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
5063 : gfp_zone(GFP_KERNEL),
5064 : NULL);
5065 : return zone_to_nid(z->zone);
5066 : }
5067 : #endif
5068 :
5069 : static void setup_min_unmapped_ratio(void);
5070 : static void setup_min_slab_ratio(void);
5071 : #else /* CONFIG_NUMA */
5072 :
5073 1 : static void build_zonelists(pg_data_t *pgdat)
5074 : {
5075 : int node, local_node;
5076 : struct zoneref *zonerefs;
5077 : int nr_zones;
5078 :
5079 1 : local_node = pgdat->node_id;
5080 :
5081 1 : zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5082 1 : nr_zones = build_zonerefs_node(pgdat, zonerefs);
5083 1 : zonerefs += nr_zones;
5084 :
5085 : /*
5086 : * Now we build the zonelist so that it contains the zones
5087 : * of all the other nodes.
5088 : * We don't want to pressure a particular node, so when
5089 : * building the zones for node N, we make sure that the
5090 : * zones coming right after the local ones are those from
5091 : * node N+1 (modulo N)
5092 : */
5093 1 : for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5094 0 : if (!node_online(node))
5095 0 : continue;
5096 0 : nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5097 0 : zonerefs += nr_zones;
5098 : }
5099 0 : for (node = 0; node < local_node; node++) {
5100 0 : if (!node_online(node))
5101 0 : continue;
5102 0 : nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5103 0 : zonerefs += nr_zones;
5104 : }
5105 :
5106 1 : zonerefs->zone = NULL;
5107 1 : zonerefs->zone_idx = 0;
5108 1 : }
5109 :
5110 : #endif /* CONFIG_NUMA */
5111 :
5112 : /*
5113 : * Boot pageset table. One per cpu which is going to be used for all
5114 : * zones and all nodes. The parameters will be set in such a way
5115 : * that an item put on a list will immediately be handed over to
5116 : * the buddy list. This is safe since pageset manipulation is done
5117 : * with interrupts disabled.
5118 : *
5119 : * The boot_pagesets must be kept even after bootup is complete for
5120 : * unused processors and/or zones. They do play a role for bootstrapping
5121 : * hotplugged processors.
5122 : *
5123 : * zoneinfo_show() and maybe other functions do
5124 : * not check if the processor is online before following the pageset pointer.
5125 : * Other parts of the kernel may not check if the zone is available.
5126 : */
5127 : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats);
5128 : /* These effectively disable the pcplists in the boot pageset completely */
5129 : #define BOOT_PAGESET_HIGH 0
5130 : #define BOOT_PAGESET_BATCH 1
5131 : static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
5132 : static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
5133 :
5134 1 : static void __build_all_zonelists(void *data)
5135 : {
5136 : int nid;
5137 : int __maybe_unused cpu;
5138 1 : pg_data_t *self = data;
5139 : unsigned long flags;
5140 :
5141 : /*
5142 : * Explicitly disable this CPU's interrupts before taking seqlock
5143 : * to prevent any IRQ handler from calling into the page allocator
5144 : * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
5145 : */
5146 1 : local_irq_save(flags);
5147 : /*
5148 : * Explicitly disable this CPU's synchronous printk() before taking
5149 : * seqlock to prevent any printk() from trying to hold port->lock, for
5150 : * tty_insert_flip_string_and_push_buffer() on other CPU might be
5151 : * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
5152 : */
5153 1 : printk_deferred_enter();
5154 1 : write_seqlock(&zonelist_update_seq);
5155 :
5156 : #ifdef CONFIG_NUMA
5157 : memset(node_load, 0, sizeof(node_load));
5158 : #endif
5159 :
5160 : /*
5161 : * This node is hotadded and no memory is yet present. So just
5162 : * building zonelists is fine - no need to touch other nodes.
5163 : */
5164 1 : if (self && !node_online(self->node_id)) {
5165 0 : build_zonelists(self);
5166 : } else {
5167 : /*
5168 : * All possible nodes have pgdat preallocated
5169 : * in free_area_init
5170 : */
5171 1 : for_each_node(nid) {
5172 1 : pg_data_t *pgdat = NODE_DATA(nid);
5173 :
5174 1 : build_zonelists(pgdat);
5175 : }
5176 :
5177 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5178 : /*
5179 : * We now know the "local memory node" for each node--
5180 : * i.e., the node of the first zone in the generic zonelist.
5181 : * Set up numa_mem percpu variable for on-line cpus. During
5182 : * boot, only the boot cpu should be on-line; we'll init the
5183 : * secondary cpus' numa_mem as they come on-line. During
5184 : * node/memory hotplug, we'll fixup all on-line cpus.
5185 : */
5186 : for_each_online_cpu(cpu)
5187 : set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5188 : #endif
5189 : }
5190 :
5191 1 : write_sequnlock(&zonelist_update_seq);
5192 1 : printk_deferred_exit();
5193 2 : local_irq_restore(flags);
5194 1 : }
5195 :
5196 : static noinline void __init
5197 1 : build_all_zonelists_init(void)
5198 : {
5199 : int cpu;
5200 :
5201 1 : __build_all_zonelists(NULL);
5202 :
5203 : /*
5204 : * Initialize the boot_pagesets that are going to be used
5205 : * for bootstrapping processors. The real pagesets for
5206 : * each zone will be allocated later when the per cpu
5207 : * allocator is available.
5208 : *
5209 : * boot_pagesets are used also for bootstrapping offline
5210 : * cpus if the system is already booted because the pagesets
5211 : * are needed to initialize allocators on a specific cpu too.
5212 : * F.e. the percpu allocator needs the page allocator which
5213 : * needs the percpu allocator in order to allocate its pagesets
5214 : * (a chicken-egg dilemma).
5215 : */
5216 2 : for_each_possible_cpu(cpu)
5217 1 : per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
5218 :
5219 1 : mminit_verify_zonelist();
5220 : cpuset_init_current_mems_allowed();
5221 1 : }
5222 :
5223 : /*
5224 : * unless system_state == SYSTEM_BOOTING.
5225 : *
5226 : * __ref due to call of __init annotated helper build_all_zonelists_init
5227 : * [protected by SYSTEM_BOOTING].
5228 : */
5229 1 : void __ref build_all_zonelists(pg_data_t *pgdat)
5230 : {
5231 : unsigned long vm_total_pages;
5232 :
5233 1 : if (system_state == SYSTEM_BOOTING) {
5234 1 : build_all_zonelists_init();
5235 : } else {
5236 0 : __build_all_zonelists(pgdat);
5237 : /* cpuset refresh routine should be here */
5238 : }
5239 : /* Get the number of free pages beyond high watermark in all zones. */
5240 1 : vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
5241 : /*
5242 : * Disable grouping by mobility if the number of pages in the
5243 : * system is too low to allow the mechanism to work. It would be
5244 : * more accurate, but expensive to check per-zone. This check is
5245 : * made on memory-hotadd so a system can start with mobility
5246 : * disabled and enable it later
5247 : */
5248 1 : if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
5249 0 : page_group_by_mobility_disabled = 1;
5250 : else
5251 1 : page_group_by_mobility_disabled = 0;
5252 :
5253 1 : pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
5254 : nr_online_nodes,
5255 : page_group_by_mobility_disabled ? "off" : "on",
5256 : vm_total_pages);
5257 : #ifdef CONFIG_NUMA
5258 : pr_info("Policy zone: %s\n", zone_names[policy_zone]);
5259 : #endif
5260 1 : }
5261 :
5262 3 : static int zone_batchsize(struct zone *zone)
5263 : {
5264 : #ifdef CONFIG_MMU
5265 : int batch;
5266 :
5267 : /*
5268 : * The number of pages to batch allocate is either ~0.1%
5269 : * of the zone or 1MB, whichever is smaller. The batch
5270 : * size is striking a balance between allocation latency
5271 : * and zone lock contention.
5272 : */
5273 3 : batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
5274 3 : batch /= 4; /* We effectively *= 4 below */
5275 3 : if (batch < 1)
5276 1 : batch = 1;
5277 :
5278 : /*
5279 : * Clamp the batch to a 2^n - 1 value. Having a power
5280 : * of 2 value was found to be more likely to have
5281 : * suboptimal cache aliasing properties in some cases.
5282 : *
5283 : * For example if 2 tasks are alternately allocating
5284 : * batches of pages, one task can end up with a lot
5285 : * of pages of one half of the possible page colors
5286 : * and the other with pages of the other colors.
5287 : */
5288 5 : batch = rounddown_pow_of_two(batch + batch/2) - 1;
5289 :
5290 3 : return batch;
5291 :
5292 : #else
5293 : /* The deferral and batching of frees should be suppressed under NOMMU
5294 : * conditions.
5295 : *
5296 : * The problem is that NOMMU needs to be able to allocate large chunks
5297 : * of contiguous memory as there's no hardware page translation to
5298 : * assemble apparent contiguous memory from discontiguous pages.
5299 : *
5300 : * Queueing large contiguous runs of pages for batching, however,
5301 : * causes the pages to actually be freed in smaller chunks. As there
5302 : * can be a significant delay between the individual batches being
5303 : * recycled, this leads to the once large chunks of space being
5304 : * fragmented and becoming unavailable for high-order allocations.
5305 : */
5306 : return 0;
5307 : #endif
5308 : }
5309 :
5310 : static int percpu_pagelist_high_fraction;
5311 3 : static int zone_highsize(struct zone *zone, int batch, int cpu_online)
5312 : {
5313 : #ifdef CONFIG_MMU
5314 : int high;
5315 : int nr_split_cpus;
5316 : unsigned long total_pages;
5317 :
5318 3 : if (!percpu_pagelist_high_fraction) {
5319 : /*
5320 : * By default, the high value of the pcp is based on the zone
5321 : * low watermark so that if they are full then background
5322 : * reclaim will not be started prematurely.
5323 : */
5324 3 : total_pages = low_wmark_pages(zone);
5325 : } else {
5326 : /*
5327 : * If percpu_pagelist_high_fraction is configured, the high
5328 : * value is based on a fraction of the managed pages in the
5329 : * zone.
5330 : */
5331 0 : total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
5332 : }
5333 :
5334 : /*
5335 : * Split the high value across all online CPUs local to the zone. Note
5336 : * that early in boot that CPUs may not be online yet and that during
5337 : * CPU hotplug that the cpumask is not yet updated when a CPU is being
5338 : * onlined. For memory nodes that have no CPUs, split pcp->high across
5339 : * all online CPUs to mitigate the risk that reclaim is triggered
5340 : * prematurely due to pages stored on pcp lists.
5341 : */
5342 6 : nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
5343 3 : if (!nr_split_cpus)
5344 0 : nr_split_cpus = num_online_cpus();
5345 3 : high = total_pages / nr_split_cpus;
5346 :
5347 : /*
5348 : * Ensure high is at least batch*4. The multiple is based on the
5349 : * historical relationship between high and batch.
5350 : */
5351 3 : high = max(high, batch << 2);
5352 :
5353 3 : return high;
5354 : #else
5355 : return 0;
5356 : #endif
5357 : }
5358 :
5359 : /*
5360 : * pcp->high and pcp->batch values are related and generally batch is lower
5361 : * than high. They are also related to pcp->count such that count is lower
5362 : * than high, and as soon as it reaches high, the pcplist is flushed.
5363 : *
5364 : * However, guaranteeing these relations at all times would require e.g. write
5365 : * barriers here but also careful usage of read barriers at the read side, and
5366 : * thus be prone to error and bad for performance. Thus the update only prevents
5367 : * store tearing. Any new users of pcp->batch and pcp->high should ensure they
5368 : * can cope with those fields changing asynchronously, and fully trust only the
5369 : * pcp->count field on the local CPU with interrupts disabled.
5370 : *
5371 : * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
5372 : * outside of boot time (or some other assurance that no concurrent updaters
5373 : * exist).
5374 : */
5375 : static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
5376 : unsigned long batch)
5377 : {
5378 3 : WRITE_ONCE(pcp->batch, batch);
5379 3 : WRITE_ONCE(pcp->high, high);
5380 : }
5381 :
5382 2 : static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
5383 : {
5384 : int pindex;
5385 :
5386 4 : memset(pcp, 0, sizeof(*pcp));
5387 2 : memset(pzstats, 0, sizeof(*pzstats));
5388 :
5389 2 : spin_lock_init(&pcp->lock);
5390 26 : for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
5391 48 : INIT_LIST_HEAD(&pcp->lists[pindex]);
5392 :
5393 : /*
5394 : * Set batch and high values safe for a boot pageset. A true percpu
5395 : * pageset's initialization will update them subsequently. Here we don't
5396 : * need to be as careful as pageset_update() as nobody can access the
5397 : * pageset yet.
5398 : */
5399 2 : pcp->high = BOOT_PAGESET_HIGH;
5400 2 : pcp->batch = BOOT_PAGESET_BATCH;
5401 2 : pcp->free_factor = 0;
5402 2 : }
5403 :
5404 : static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
5405 : unsigned long batch)
5406 : {
5407 : struct per_cpu_pages *pcp;
5408 : int cpu;
5409 :
5410 3 : for_each_possible_cpu(cpu) {
5411 3 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
5412 3 : pageset_update(pcp, high, batch);
5413 : }
5414 : }
5415 :
5416 : /*
5417 : * Calculate and set new high and batch values for all per-cpu pagesets of a
5418 : * zone based on the zone's size.
5419 : */
5420 3 : static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
5421 : {
5422 : int new_high, new_batch;
5423 :
5424 3 : new_batch = max(1, zone_batchsize(zone));
5425 3 : new_high = zone_highsize(zone, new_batch, cpu_online);
5426 :
5427 3 : if (zone->pageset_high == new_high &&
5428 0 : zone->pageset_batch == new_batch)
5429 : return;
5430 :
5431 3 : zone->pageset_high = new_high;
5432 3 : zone->pageset_batch = new_batch;
5433 :
5434 3 : __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
5435 : }
5436 :
5437 1 : void __meminit setup_zone_pageset(struct zone *zone)
5438 : {
5439 : int cpu;
5440 :
5441 : /* Size may be 0 on !SMP && !NUMA */
5442 : if (sizeof(struct per_cpu_zonestat) > 0)
5443 : zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);
5444 :
5445 1 : zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);
5446 2 : for_each_possible_cpu(cpu) {
5447 : struct per_cpu_pages *pcp;
5448 : struct per_cpu_zonestat *pzstats;
5449 :
5450 1 : pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
5451 1 : pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
5452 1 : per_cpu_pages_init(pcp, pzstats);
5453 : }
5454 :
5455 1 : zone_set_pageset_high_and_batch(zone, 0);
5456 1 : }
5457 :
5458 : /*
5459 : * The zone indicated has a new number of managed_pages; batch sizes and percpu
5460 : * page high values need to be recalculated.
5461 : */
5462 2 : static void zone_pcp_update(struct zone *zone, int cpu_online)
5463 : {
5464 2 : mutex_lock(&pcp_batch_high_lock);
5465 2 : zone_set_pageset_high_and_batch(zone, cpu_online);
5466 2 : mutex_unlock(&pcp_batch_high_lock);
5467 2 : }
5468 :
5469 : /*
5470 : * Allocate per cpu pagesets and initialize them.
5471 : * Before this call only boot pagesets were available.
5472 : */
5473 1 : void __init setup_per_cpu_pageset(void)
5474 : {
5475 : struct pglist_data *pgdat;
5476 : struct zone *zone;
5477 : int __maybe_unused cpu;
5478 :
5479 3 : for_each_populated_zone(zone)
5480 1 : setup_zone_pageset(zone);
5481 :
5482 : #ifdef CONFIG_NUMA
5483 : /*
5484 : * Unpopulated zones continue using the boot pagesets.
5485 : * The numa stats for these pagesets need to be reset.
5486 : * Otherwise, they will end up skewing the stats of
5487 : * the nodes these zones are associated with.
5488 : */
5489 : for_each_possible_cpu(cpu) {
5490 : struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
5491 : memset(pzstats->vm_numa_event, 0,
5492 : sizeof(pzstats->vm_numa_event));
5493 : }
5494 : #endif
5495 :
5496 2 : for_each_online_pgdat(pgdat)
5497 1 : pgdat->per_cpu_nodestats =
5498 1 : alloc_percpu(struct per_cpu_nodestat);
5499 1 : }
5500 :
5501 2 : __meminit void zone_pcp_init(struct zone *zone)
5502 : {
5503 : /*
5504 : * per cpu subsystem is not up at this point. The following code
5505 : * relies on the ability of the linker to provide the
5506 : * offset of a (static) per cpu variable into the per cpu area.
5507 : */
5508 2 : zone->per_cpu_pageset = &boot_pageset;
5509 2 : zone->per_cpu_zonestats = &boot_zonestats;
5510 2 : zone->pageset_high = BOOT_PAGESET_HIGH;
5511 2 : zone->pageset_batch = BOOT_PAGESET_BATCH;
5512 :
5513 2 : if (populated_zone(zone))
5514 : pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
5515 : zone->present_pages, zone_batchsize(zone));
5516 2 : }
5517 :
5518 0 : void adjust_managed_page_count(struct page *page, long count)
5519 : {
5520 0 : atomic_long_add(count, &page_zone(page)->managed_pages);
5521 0 : totalram_pages_add(count);
5522 : #ifdef CONFIG_HIGHMEM
5523 : if (PageHighMem(page))
5524 : totalhigh_pages_add(count);
5525 : #endif
5526 0 : }
5527 : EXPORT_SYMBOL(adjust_managed_page_count);
5528 :
5529 0 : unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
5530 : {
5531 : void *pos;
5532 0 : unsigned long pages = 0;
5533 :
5534 0 : start = (void *)PAGE_ALIGN((unsigned long)start);
5535 0 : end = (void *)((unsigned long)end & PAGE_MASK);
5536 0 : for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5537 0 : struct page *page = virt_to_page(pos);
5538 : void *direct_map_addr;
5539 :
5540 : /*
5541 : * 'direct_map_addr' might be different from 'pos'
5542 : * because some architectures' virt_to_page()
5543 : * work with aliases. Getting the direct map
5544 : * address ensures that we get a _writeable_
5545 : * alias for the memset().
5546 : */
5547 0 : direct_map_addr = page_address(page);
5548 : /*
5549 : * Perform a kasan-unchecked memset() since this memory
5550 : * has not been initialized.
5551 : */
5552 0 : direct_map_addr = kasan_reset_tag(direct_map_addr);
5553 0 : if ((unsigned int)poison <= 0xFF)
5554 0 : memset(direct_map_addr, poison, PAGE_SIZE);
5555 :
5556 0 : free_reserved_page(page);
5557 : }
5558 :
5559 0 : if (pages && s)
5560 0 : pr_info("Freeing %s memory: %ldK\n", s, K(pages));
5561 :
5562 0 : return pages;
5563 : }
5564 :
5565 0 : static int page_alloc_cpu_dead(unsigned int cpu)
5566 : {
5567 : struct zone *zone;
5568 :
5569 0 : lru_add_drain_cpu(cpu);
5570 0 : mlock_drain_remote(cpu);
5571 0 : drain_pages(cpu);
5572 :
5573 : /*
5574 : * Spill the event counters of the dead processor
5575 : * into the current processors event counters.
5576 : * This artificially elevates the count of the current
5577 : * processor.
5578 : */
5579 0 : vm_events_fold_cpu(cpu);
5580 :
5581 : /*
5582 : * Zero the differential counters of the dead processor
5583 : * so that the vm statistics are consistent.
5584 : *
5585 : * This is only okay since the processor is dead and cannot
5586 : * race with what we are doing.
5587 : */
5588 0 : cpu_vm_stats_fold(cpu);
5589 :
5590 0 : for_each_populated_zone(zone)
5591 0 : zone_pcp_update(zone, 0);
5592 :
5593 0 : return 0;
5594 : }
5595 :
5596 0 : static int page_alloc_cpu_online(unsigned int cpu)
5597 : {
5598 : struct zone *zone;
5599 :
5600 0 : for_each_populated_zone(zone)
5601 0 : zone_pcp_update(zone, 1);
5602 0 : return 0;
5603 : }
5604 :
5605 1 : void __init page_alloc_init_cpuhp(void)
5606 : {
5607 : int ret;
5608 :
5609 1 : ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
5610 : "mm/page_alloc:pcp",
5611 : page_alloc_cpu_online,
5612 : page_alloc_cpu_dead);
5613 1 : WARN_ON(ret < 0);
5614 1 : }
5615 :
5616 : /*
5617 : * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
5618 : * or min_free_kbytes changes.
5619 : */
5620 2 : static void calculate_totalreserve_pages(void)
5621 : {
5622 : struct pglist_data *pgdat;
5623 2 : unsigned long reserve_pages = 0;
5624 : enum zone_type i, j;
5625 :
5626 4 : for_each_online_pgdat(pgdat) {
5627 :
5628 2 : pgdat->totalreserve_pages = 0;
5629 :
5630 6 : for (i = 0; i < MAX_NR_ZONES; i++) {
5631 4 : struct zone *zone = pgdat->node_zones + i;
5632 4 : long max = 0;
5633 4 : unsigned long managed_pages = zone_managed_pages(zone);
5634 :
5635 : /* Find valid and maximum lowmem_reserve in the zone */
5636 10 : for (j = i; j < MAX_NR_ZONES; j++) {
5637 6 : if (zone->lowmem_reserve[j] > max)
5638 0 : max = zone->lowmem_reserve[j];
5639 : }
5640 :
5641 : /* we treat the high watermark as reserved pages. */
5642 4 : max += high_wmark_pages(zone);
5643 :
5644 4 : if (max > managed_pages)
5645 0 : max = managed_pages;
5646 :
5647 4 : pgdat->totalreserve_pages += max;
5648 :
5649 4 : reserve_pages += max;
5650 : }
5651 : }
5652 2 : totalreserve_pages = reserve_pages;
5653 2 : }
5654 :
5655 : /*
5656 : * setup_per_zone_lowmem_reserve - called whenever
5657 : * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
5658 : * has a correct pages reserved value, so an adequate number of
5659 : * pages are left in the zone after a successful __alloc_pages().
5660 : */
5661 1 : static void setup_per_zone_lowmem_reserve(void)
5662 : {
5663 : struct pglist_data *pgdat;
5664 : enum zone_type i, j;
5665 :
5666 2 : for_each_online_pgdat(pgdat) {
5667 2 : for (i = 0; i < MAX_NR_ZONES - 1; i++) {
5668 1 : struct zone *zone = &pgdat->node_zones[i];
5669 1 : int ratio = sysctl_lowmem_reserve_ratio[i];
5670 2 : bool clear = !ratio || !zone_managed_pages(zone);
5671 1 : unsigned long managed_pages = 0;
5672 :
5673 2 : for (j = i + 1; j < MAX_NR_ZONES; j++) {
5674 1 : struct zone *upper_zone = &pgdat->node_zones[j];
5675 :
5676 1 : managed_pages += zone_managed_pages(upper_zone);
5677 :
5678 1 : if (clear)
5679 0 : zone->lowmem_reserve[j] = 0;
5680 : else
5681 1 : zone->lowmem_reserve[j] = managed_pages / ratio;
5682 : }
5683 : }
5684 : }
5685 :
5686 : /* update totalreserve_pages */
5687 1 : calculate_totalreserve_pages();
5688 1 : }
5689 :
5690 1 : static void __setup_per_zone_wmarks(void)
5691 : {
5692 1 : unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5693 1 : unsigned long lowmem_pages = 0;
5694 : struct zone *zone;
5695 : unsigned long flags;
5696 :
5697 : /* Calculate total number of !ZONE_HIGHMEM pages */
5698 3 : for_each_zone(zone) {
5699 2 : if (!is_highmem(zone))
5700 2 : lowmem_pages += zone_managed_pages(zone);
5701 : }
5702 :
5703 3 : for_each_zone(zone) {
5704 : u64 tmp;
5705 :
5706 2 : spin_lock_irqsave(&zone->lock, flags);
5707 2 : tmp = (u64)pages_min * zone_managed_pages(zone);
5708 2 : do_div(tmp, lowmem_pages);
5709 2 : if (is_highmem(zone)) {
5710 : /*
5711 : * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5712 : * need highmem pages, so cap pages_min to a small
5713 : * value here.
5714 : *
5715 : * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5716 : * deltas control async page reclaim, and so should
5717 : * not be capped for highmem.
5718 : */
5719 : unsigned long min_pages;
5720 :
5721 : min_pages = zone_managed_pages(zone) / 1024;
5722 : min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5723 : zone->_watermark[WMARK_MIN] = min_pages;
5724 : } else {
5725 : /*
5726 : * If it's a lowmem zone, reserve a number of pages
5727 : * proportionate to the zone's size.
5728 : */
5729 2 : zone->_watermark[WMARK_MIN] = tmp;
5730 : }
5731 :
5732 : /*
5733 : * Set the kswapd watermarks distance according to the
5734 : * scale factor in proportion to available memory, but
5735 : * ensure a minimum size on small systems.
5736 : */
5737 4 : tmp = max_t(u64, tmp >> 2,
5738 : mult_frac(zone_managed_pages(zone),
5739 : watermark_scale_factor, 10000));
5740 :
5741 2 : zone->watermark_boost = 0;
5742 2 : zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
5743 2 : zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
5744 2 : zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
5745 :
5746 4 : spin_unlock_irqrestore(&zone->lock, flags);
5747 : }
5748 :
5749 : /* update totalreserve_pages */
5750 1 : calculate_totalreserve_pages();
5751 1 : }
5752 :
5753 : /**
5754 : * setup_per_zone_wmarks - called when min_free_kbytes changes
5755 : * or when memory is hot-{added|removed}
5756 : *
5757 : * Ensures that the watermark[min,low,high] values for each zone are set
5758 : * correctly with respect to min_free_kbytes.
5759 : */
5760 1 : void setup_per_zone_wmarks(void)
5761 : {
5762 : struct zone *zone;
5763 : static DEFINE_SPINLOCK(lock);
5764 :
5765 1 : spin_lock(&lock);
5766 1 : __setup_per_zone_wmarks();
5767 1 : spin_unlock(&lock);
5768 :
5769 : /*
5770 : * The watermark size have changed so update the pcpu batch
5771 : * and high limits or the limits may be inappropriate.
5772 : */
5773 3 : for_each_zone(zone)
5774 2 : zone_pcp_update(zone, 0);
5775 1 : }
5776 :
5777 : /*
5778 : * Initialise min_free_kbytes.
5779 : *
5780 : * For small machines we want it small (128k min). For large machines
5781 : * we want it large (256MB max). But it is not linear, because network
5782 : * bandwidth does not increase linearly with machine size. We use
5783 : *
5784 : * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5785 : * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5786 : *
5787 : * which yields
5788 : *
5789 : * 16MB: 512k
5790 : * 32MB: 724k
5791 : * 64MB: 1024k
5792 : * 128MB: 1448k
5793 : * 256MB: 2048k
5794 : * 512MB: 2896k
5795 : * 1024MB: 4096k
5796 : * 2048MB: 5792k
5797 : * 4096MB: 8192k
5798 : * 8192MB: 11584k
5799 : * 16384MB: 16384k
5800 : */
5801 1 : void calculate_min_free_kbytes(void)
5802 : {
5803 : unsigned long lowmem_kbytes;
5804 : int new_min_free_kbytes;
5805 :
5806 1 : lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5807 1 : new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5808 :
5809 1 : if (new_min_free_kbytes > user_min_free_kbytes)
5810 1 : min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
5811 : else
5812 0 : pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5813 : new_min_free_kbytes, user_min_free_kbytes);
5814 :
5815 1 : }
5816 :
5817 1 : int __meminit init_per_zone_wmark_min(void)
5818 : {
5819 1 : calculate_min_free_kbytes();
5820 1 : setup_per_zone_wmarks();
5821 : refresh_zone_stat_thresholds();
5822 1 : setup_per_zone_lowmem_reserve();
5823 :
5824 : #ifdef CONFIG_NUMA
5825 : setup_min_unmapped_ratio();
5826 : setup_min_slab_ratio();
5827 : #endif
5828 :
5829 : khugepaged_min_free_kbytes_update();
5830 :
5831 1 : return 0;
5832 : }
5833 : postcore_initcall(init_per_zone_wmark_min)
5834 :
5835 : /*
5836 : * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5837 : * that we can call two helper functions whenever min_free_kbytes
5838 : * changes.
5839 : */
5840 0 : static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
5841 : void *buffer, size_t *length, loff_t *ppos)
5842 : {
5843 : int rc;
5844 :
5845 0 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5846 0 : if (rc)
5847 : return rc;
5848 :
5849 0 : if (write) {
5850 0 : user_min_free_kbytes = min_free_kbytes;
5851 0 : setup_per_zone_wmarks();
5852 : }
5853 : return 0;
5854 : }
5855 :
5856 0 : static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
5857 : void *buffer, size_t *length, loff_t *ppos)
5858 : {
5859 : int rc;
5860 :
5861 0 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5862 0 : if (rc)
5863 : return rc;
5864 :
5865 0 : if (write)
5866 0 : setup_per_zone_wmarks();
5867 :
5868 : return 0;
5869 : }
5870 :
5871 : #ifdef CONFIG_NUMA
5872 : static void setup_min_unmapped_ratio(void)
5873 : {
5874 : pg_data_t *pgdat;
5875 : struct zone *zone;
5876 :
5877 : for_each_online_pgdat(pgdat)
5878 : pgdat->min_unmapped_pages = 0;
5879 :
5880 : for_each_zone(zone)
5881 : zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
5882 : sysctl_min_unmapped_ratio) / 100;
5883 : }
5884 :
5885 :
5886 : static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
5887 : void *buffer, size_t *length, loff_t *ppos)
5888 : {
5889 : int rc;
5890 :
5891 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5892 : if (rc)
5893 : return rc;
5894 :
5895 : setup_min_unmapped_ratio();
5896 :
5897 : return 0;
5898 : }
5899 :
5900 : static void setup_min_slab_ratio(void)
5901 : {
5902 : pg_data_t *pgdat;
5903 : struct zone *zone;
5904 :
5905 : for_each_online_pgdat(pgdat)
5906 : pgdat->min_slab_pages = 0;
5907 :
5908 : for_each_zone(zone)
5909 : zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
5910 : sysctl_min_slab_ratio) / 100;
5911 : }
5912 :
5913 : static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
5914 : void *buffer, size_t *length, loff_t *ppos)
5915 : {
5916 : int rc;
5917 :
5918 : rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5919 : if (rc)
5920 : return rc;
5921 :
5922 : setup_min_slab_ratio();
5923 :
5924 : return 0;
5925 : }
5926 : #endif
5927 :
5928 : /*
5929 : * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5930 : * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5931 : * whenever sysctl_lowmem_reserve_ratio changes.
5932 : *
5933 : * The reserve ratio obviously has absolutely no relation with the
5934 : * minimum watermarks. The lowmem reserve ratio can only make sense
5935 : * if in function of the boot time zone sizes.
5936 : */
5937 0 : static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table,
5938 : int write, void *buffer, size_t *length, loff_t *ppos)
5939 : {
5940 : int i;
5941 :
5942 0 : proc_dointvec_minmax(table, write, buffer, length, ppos);
5943 :
5944 0 : for (i = 0; i < MAX_NR_ZONES; i++) {
5945 0 : if (sysctl_lowmem_reserve_ratio[i] < 1)
5946 0 : sysctl_lowmem_reserve_ratio[i] = 0;
5947 : }
5948 :
5949 0 : setup_per_zone_lowmem_reserve();
5950 0 : return 0;
5951 : }
5952 :
5953 : /*
5954 : * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
5955 : * cpu. It is the fraction of total pages in each zone that a hot per cpu
5956 : * pagelist can have before it gets flushed back to buddy allocator.
5957 : */
5958 0 : static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
5959 : int write, void *buffer, size_t *length, loff_t *ppos)
5960 : {
5961 : struct zone *zone;
5962 : int old_percpu_pagelist_high_fraction;
5963 : int ret;
5964 :
5965 0 : mutex_lock(&pcp_batch_high_lock);
5966 0 : old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
5967 :
5968 0 : ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5969 0 : if (!write || ret < 0)
5970 : goto out;
5971 :
5972 : /* Sanity checking to avoid pcp imbalance */
5973 0 : if (percpu_pagelist_high_fraction &&
5974 : percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
5975 0 : percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
5976 0 : ret = -EINVAL;
5977 0 : goto out;
5978 : }
5979 :
5980 : /* No change? */
5981 0 : if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
5982 : goto out;
5983 :
5984 0 : for_each_populated_zone(zone)
5985 0 : zone_set_pageset_high_and_batch(zone, 0);
5986 : out:
5987 0 : mutex_unlock(&pcp_batch_high_lock);
5988 0 : return ret;
5989 : }
5990 :
5991 : static struct ctl_table page_alloc_sysctl_table[] = {
5992 : {
5993 : .procname = "min_free_kbytes",
5994 : .data = &min_free_kbytes,
5995 : .maxlen = sizeof(min_free_kbytes),
5996 : .mode = 0644,
5997 : .proc_handler = min_free_kbytes_sysctl_handler,
5998 : .extra1 = SYSCTL_ZERO,
5999 : },
6000 : {
6001 : .procname = "watermark_boost_factor",
6002 : .data = &watermark_boost_factor,
6003 : .maxlen = sizeof(watermark_boost_factor),
6004 : .mode = 0644,
6005 : .proc_handler = proc_dointvec_minmax,
6006 : .extra1 = SYSCTL_ZERO,
6007 : },
6008 : {
6009 : .procname = "watermark_scale_factor",
6010 : .data = &watermark_scale_factor,
6011 : .maxlen = sizeof(watermark_scale_factor),
6012 : .mode = 0644,
6013 : .proc_handler = watermark_scale_factor_sysctl_handler,
6014 : .extra1 = SYSCTL_ONE,
6015 : .extra2 = SYSCTL_THREE_THOUSAND,
6016 : },
6017 : {
6018 : .procname = "percpu_pagelist_high_fraction",
6019 : .data = &percpu_pagelist_high_fraction,
6020 : .maxlen = sizeof(percpu_pagelist_high_fraction),
6021 : .mode = 0644,
6022 : .proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
6023 : .extra1 = SYSCTL_ZERO,
6024 : },
6025 : {
6026 : .procname = "lowmem_reserve_ratio",
6027 : .data = &sysctl_lowmem_reserve_ratio,
6028 : .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
6029 : .mode = 0644,
6030 : .proc_handler = lowmem_reserve_ratio_sysctl_handler,
6031 : },
6032 : #ifdef CONFIG_NUMA
6033 : {
6034 : .procname = "numa_zonelist_order",
6035 : .data = &numa_zonelist_order,
6036 : .maxlen = NUMA_ZONELIST_ORDER_LEN,
6037 : .mode = 0644,
6038 : .proc_handler = numa_zonelist_order_handler,
6039 : },
6040 : {
6041 : .procname = "min_unmapped_ratio",
6042 : .data = &sysctl_min_unmapped_ratio,
6043 : .maxlen = sizeof(sysctl_min_unmapped_ratio),
6044 : .mode = 0644,
6045 : .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
6046 : .extra1 = SYSCTL_ZERO,
6047 : .extra2 = SYSCTL_ONE_HUNDRED,
6048 : },
6049 : {
6050 : .procname = "min_slab_ratio",
6051 : .data = &sysctl_min_slab_ratio,
6052 : .maxlen = sizeof(sysctl_min_slab_ratio),
6053 : .mode = 0644,
6054 : .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
6055 : .extra1 = SYSCTL_ZERO,
6056 : .extra2 = SYSCTL_ONE_HUNDRED,
6057 : },
6058 : #endif
6059 : {}
6060 : };
6061 :
6062 1 : void __init page_alloc_sysctl_init(void)
6063 : {
6064 1 : register_sysctl_init("vm", page_alloc_sysctl_table);
6065 1 : }
6066 :
6067 : #ifdef CONFIG_CONTIG_ALLOC
6068 : /* Usage: See admin-guide/dynamic-debug-howto.rst */
6069 : static void alloc_contig_dump_pages(struct list_head *page_list)
6070 : {
6071 : DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
6072 :
6073 : if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
6074 : struct page *page;
6075 :
6076 : dump_stack();
6077 : list_for_each_entry(page, page_list, lru)
6078 : dump_page(page, "migration failure");
6079 : }
6080 : }
6081 :
6082 : /* [start, end) must belong to a single zone. */
6083 : int __alloc_contig_migrate_range(struct compact_control *cc,
6084 : unsigned long start, unsigned long end)
6085 : {
6086 : /* This function is based on compact_zone() from compaction.c. */
6087 : unsigned int nr_reclaimed;
6088 : unsigned long pfn = start;
6089 : unsigned int tries = 0;
6090 : int ret = 0;
6091 : struct migration_target_control mtc = {
6092 : .nid = zone_to_nid(cc->zone),
6093 : .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
6094 : };
6095 :
6096 : lru_cache_disable();
6097 :
6098 : while (pfn < end || !list_empty(&cc->migratepages)) {
6099 : if (fatal_signal_pending(current)) {
6100 : ret = -EINTR;
6101 : break;
6102 : }
6103 :
6104 : if (list_empty(&cc->migratepages)) {
6105 : cc->nr_migratepages = 0;
6106 : ret = isolate_migratepages_range(cc, pfn, end);
6107 : if (ret && ret != -EAGAIN)
6108 : break;
6109 : pfn = cc->migrate_pfn;
6110 : tries = 0;
6111 : } else if (++tries == 5) {
6112 : ret = -EBUSY;
6113 : break;
6114 : }
6115 :
6116 : nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6117 : &cc->migratepages);
6118 : cc->nr_migratepages -= nr_reclaimed;
6119 :
6120 : ret = migrate_pages(&cc->migratepages, alloc_migration_target,
6121 : NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
6122 :
6123 : /*
6124 : * On -ENOMEM, migrate_pages() bails out right away. It is pointless
6125 : * to retry again over this error, so do the same here.
6126 : */
6127 : if (ret == -ENOMEM)
6128 : break;
6129 : }
6130 :
6131 : lru_cache_enable();
6132 : if (ret < 0) {
6133 : if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
6134 : alloc_contig_dump_pages(&cc->migratepages);
6135 : putback_movable_pages(&cc->migratepages);
6136 : return ret;
6137 : }
6138 : return 0;
6139 : }
6140 :
6141 : /**
6142 : * alloc_contig_range() -- tries to allocate given range of pages
6143 : * @start: start PFN to allocate
6144 : * @end: one-past-the-last PFN to allocate
6145 : * @migratetype: migratetype of the underlying pageblocks (either
6146 : * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6147 : * in range must have the same migratetype and it must
6148 : * be either of the two.
6149 : * @gfp_mask: GFP mask to use during compaction
6150 : *
6151 : * The PFN range does not have to be pageblock aligned. The PFN range must
6152 : * belong to a single zone.
6153 : *
6154 : * The first thing this routine does is attempt to MIGRATE_ISOLATE all
6155 : * pageblocks in the range. Once isolated, the pageblocks should not
6156 : * be modified by others.
6157 : *
6158 : * Return: zero on success or negative error code. On success all
6159 : * pages which PFN is in [start, end) are allocated for the caller and
6160 : * need to be freed with free_contig_range().
6161 : */
6162 : int alloc_contig_range(unsigned long start, unsigned long end,
6163 : unsigned migratetype, gfp_t gfp_mask)
6164 : {
6165 : unsigned long outer_start, outer_end;
6166 : int order;
6167 : int ret = 0;
6168 :
6169 : struct compact_control cc = {
6170 : .nr_migratepages = 0,
6171 : .order = -1,
6172 : .zone = page_zone(pfn_to_page(start)),
6173 : .mode = MIGRATE_SYNC,
6174 : .ignore_skip_hint = true,
6175 : .no_set_skip_hint = true,
6176 : .gfp_mask = current_gfp_context(gfp_mask),
6177 : .alloc_contig = true,
6178 : };
6179 : INIT_LIST_HEAD(&cc.migratepages);
6180 :
6181 : /*
6182 : * What we do here is we mark all pageblocks in range as
6183 : * MIGRATE_ISOLATE. Because pageblock and max order pages may
6184 : * have different sizes, and due to the way page allocator
6185 : * work, start_isolate_page_range() has special handlings for this.
6186 : *
6187 : * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6188 : * migrate the pages from an unaligned range (ie. pages that
6189 : * we are interested in). This will put all the pages in
6190 : * range back to page allocator as MIGRATE_ISOLATE.
6191 : *
6192 : * When this is done, we take the pages in range from page
6193 : * allocator removing them from the buddy system. This way
6194 : * page allocator will never consider using them.
6195 : *
6196 : * This lets us mark the pageblocks back as
6197 : * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6198 : * aligned range but not in the unaligned, original range are
6199 : * put back to page allocator so that buddy can use them.
6200 : */
6201 :
6202 : ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
6203 : if (ret)
6204 : goto done;
6205 :
6206 : drain_all_pages(cc.zone);
6207 :
6208 : /*
6209 : * In case of -EBUSY, we'd like to know which page causes problem.
6210 : * So, just fall through. test_pages_isolated() has a tracepoint
6211 : * which will report the busy page.
6212 : *
6213 : * It is possible that busy pages could become available before
6214 : * the call to test_pages_isolated, and the range will actually be
6215 : * allocated. So, if we fall through be sure to clear ret so that
6216 : * -EBUSY is not accidentally used or returned to caller.
6217 : */
6218 : ret = __alloc_contig_migrate_range(&cc, start, end);
6219 : if (ret && ret != -EBUSY)
6220 : goto done;
6221 : ret = 0;
6222 :
6223 : /*
6224 : * Pages from [start, end) are within a pageblock_nr_pages
6225 : * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6226 : * more, all pages in [start, end) are free in page allocator.
6227 : * What we are going to do is to allocate all pages from
6228 : * [start, end) (that is remove them from page allocator).
6229 : *
6230 : * The only problem is that pages at the beginning and at the
6231 : * end of interesting range may be not aligned with pages that
6232 : * page allocator holds, ie. they can be part of higher order
6233 : * pages. Because of this, we reserve the bigger range and
6234 : * once this is done free the pages we are not interested in.
6235 : *
6236 : * We don't have to hold zone->lock here because the pages are
6237 : * isolated thus they won't get removed from buddy.
6238 : */
6239 :
6240 : order = 0;
6241 : outer_start = start;
6242 : while (!PageBuddy(pfn_to_page(outer_start))) {
6243 : if (++order > MAX_ORDER) {
6244 : outer_start = start;
6245 : break;
6246 : }
6247 : outer_start &= ~0UL << order;
6248 : }
6249 :
6250 : if (outer_start != start) {
6251 : order = buddy_order(pfn_to_page(outer_start));
6252 :
6253 : /*
6254 : * outer_start page could be small order buddy page and
6255 : * it doesn't include start page. Adjust outer_start
6256 : * in this case to report failed page properly
6257 : * on tracepoint in test_pages_isolated()
6258 : */
6259 : if (outer_start + (1UL << order) <= start)
6260 : outer_start = start;
6261 : }
6262 :
6263 : /* Make sure the range is really isolated. */
6264 : if (test_pages_isolated(outer_start, end, 0)) {
6265 : ret = -EBUSY;
6266 : goto done;
6267 : }
6268 :
6269 : /* Grab isolated pages from freelists. */
6270 : outer_end = isolate_freepages_range(&cc, outer_start, end);
6271 : if (!outer_end) {
6272 : ret = -EBUSY;
6273 : goto done;
6274 : }
6275 :
6276 : /* Free head and tail (if any) */
6277 : if (start != outer_start)
6278 : free_contig_range(outer_start, start - outer_start);
6279 : if (end != outer_end)
6280 : free_contig_range(end, outer_end - end);
6281 :
6282 : done:
6283 : undo_isolate_page_range(start, end, migratetype);
6284 : return ret;
6285 : }
6286 : EXPORT_SYMBOL(alloc_contig_range);
6287 :
6288 : static int __alloc_contig_pages(unsigned long start_pfn,
6289 : unsigned long nr_pages, gfp_t gfp_mask)
6290 : {
6291 : unsigned long end_pfn = start_pfn + nr_pages;
6292 :
6293 : return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
6294 : gfp_mask);
6295 : }
6296 :
6297 : static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
6298 : unsigned long nr_pages)
6299 : {
6300 : unsigned long i, end_pfn = start_pfn + nr_pages;
6301 : struct page *page;
6302 :
6303 : for (i = start_pfn; i < end_pfn; i++) {
6304 : page = pfn_to_online_page(i);
6305 : if (!page)
6306 : return false;
6307 :
6308 : if (page_zone(page) != z)
6309 : return false;
6310 :
6311 : if (PageReserved(page))
6312 : return false;
6313 :
6314 : if (PageHuge(page))
6315 : return false;
6316 : }
6317 : return true;
6318 : }
6319 :
6320 : static bool zone_spans_last_pfn(const struct zone *zone,
6321 : unsigned long start_pfn, unsigned long nr_pages)
6322 : {
6323 : unsigned long last_pfn = start_pfn + nr_pages - 1;
6324 :
6325 : return zone_spans_pfn(zone, last_pfn);
6326 : }
6327 :
6328 : /**
6329 : * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
6330 : * @nr_pages: Number of contiguous pages to allocate
6331 : * @gfp_mask: GFP mask to limit search and used during compaction
6332 : * @nid: Target node
6333 : * @nodemask: Mask for other possible nodes
6334 : *
6335 : * This routine is a wrapper around alloc_contig_range(). It scans over zones
6336 : * on an applicable zonelist to find a contiguous pfn range which can then be
6337 : * tried for allocation with alloc_contig_range(). This routine is intended
6338 : * for allocation requests which can not be fulfilled with the buddy allocator.
6339 : *
6340 : * The allocated memory is always aligned to a page boundary. If nr_pages is a
6341 : * power of two, then allocated range is also guaranteed to be aligned to same
6342 : * nr_pages (e.g. 1GB request would be aligned to 1GB).
6343 : *
6344 : * Allocated pages can be freed with free_contig_range() or by manually calling
6345 : * __free_page() on each allocated page.
6346 : *
6347 : * Return: pointer to contiguous pages on success, or NULL if not successful.
6348 : */
6349 : struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
6350 : int nid, nodemask_t *nodemask)
6351 : {
6352 : unsigned long ret, pfn, flags;
6353 : struct zonelist *zonelist;
6354 : struct zone *zone;
6355 : struct zoneref *z;
6356 :
6357 : zonelist = node_zonelist(nid, gfp_mask);
6358 : for_each_zone_zonelist_nodemask(zone, z, zonelist,
6359 : gfp_zone(gfp_mask), nodemask) {
6360 : spin_lock_irqsave(&zone->lock, flags);
6361 :
6362 : pfn = ALIGN(zone->zone_start_pfn, nr_pages);
6363 : while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
6364 : if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
6365 : /*
6366 : * We release the zone lock here because
6367 : * alloc_contig_range() will also lock the zone
6368 : * at some point. If there's an allocation
6369 : * spinning on this lock, it may win the race
6370 : * and cause alloc_contig_range() to fail...
6371 : */
6372 : spin_unlock_irqrestore(&zone->lock, flags);
6373 : ret = __alloc_contig_pages(pfn, nr_pages,
6374 : gfp_mask);
6375 : if (!ret)
6376 : return pfn_to_page(pfn);
6377 : spin_lock_irqsave(&zone->lock, flags);
6378 : }
6379 : pfn += nr_pages;
6380 : }
6381 : spin_unlock_irqrestore(&zone->lock, flags);
6382 : }
6383 : return NULL;
6384 : }
6385 : #endif /* CONFIG_CONTIG_ALLOC */
6386 :
6387 0 : void free_contig_range(unsigned long pfn, unsigned long nr_pages)
6388 : {
6389 0 : unsigned long count = 0;
6390 :
6391 0 : for (; nr_pages--; pfn++) {
6392 0 : struct page *page = pfn_to_page(pfn);
6393 :
6394 0 : count += page_count(page) != 1;
6395 0 : __free_page(page);
6396 : }
6397 0 : WARN(count != 0, "%lu pages are still in use!\n", count);
6398 0 : }
6399 : EXPORT_SYMBOL(free_contig_range);
6400 :
6401 : /*
6402 : * Effectively disable pcplists for the zone by setting the high limit to 0
6403 : * and draining all cpus. A concurrent page freeing on another CPU that's about
6404 : * to put the page on pcplist will either finish before the drain and the page
6405 : * will be drained, or observe the new high limit and skip the pcplist.
6406 : *
6407 : * Must be paired with a call to zone_pcp_enable().
6408 : */
6409 0 : void zone_pcp_disable(struct zone *zone)
6410 : {
6411 0 : mutex_lock(&pcp_batch_high_lock);
6412 0 : __zone_set_pageset_high_and_batch(zone, 0, 1);
6413 0 : __drain_all_pages(zone, true);
6414 0 : }
6415 :
6416 0 : void zone_pcp_enable(struct zone *zone)
6417 : {
6418 0 : __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
6419 0 : mutex_unlock(&pcp_batch_high_lock);
6420 0 : }
6421 :
6422 0 : void zone_pcp_reset(struct zone *zone)
6423 : {
6424 : int cpu;
6425 : struct per_cpu_zonestat *pzstats;
6426 :
6427 0 : if (zone->per_cpu_pageset != &boot_pageset) {
6428 : for_each_online_cpu(cpu) {
6429 : pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
6430 : drain_zonestat(zone, pzstats);
6431 : }
6432 0 : free_percpu(zone->per_cpu_pageset);
6433 0 : zone->per_cpu_pageset = &boot_pageset;
6434 0 : if (zone->per_cpu_zonestats != &boot_zonestats) {
6435 0 : free_percpu(zone->per_cpu_zonestats);
6436 0 : zone->per_cpu_zonestats = &boot_zonestats;
6437 : }
6438 : }
6439 0 : }
6440 :
6441 : #ifdef CONFIG_MEMORY_HOTREMOVE
6442 : /*
6443 : * All pages in the range must be in a single zone, must not contain holes,
6444 : * must span full sections, and must be isolated before calling this function.
6445 : */
6446 : void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6447 : {
6448 : unsigned long pfn = start_pfn;
6449 : struct page *page;
6450 : struct zone *zone;
6451 : unsigned int order;
6452 : unsigned long flags;
6453 :
6454 : offline_mem_sections(pfn, end_pfn);
6455 : zone = page_zone(pfn_to_page(pfn));
6456 : spin_lock_irqsave(&zone->lock, flags);
6457 : while (pfn < end_pfn) {
6458 : page = pfn_to_page(pfn);
6459 : /*
6460 : * The HWPoisoned page may be not in buddy system, and
6461 : * page_count() is not 0.
6462 : */
6463 : if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6464 : pfn++;
6465 : continue;
6466 : }
6467 : /*
6468 : * At this point all remaining PageOffline() pages have a
6469 : * reference count of 0 and can simply be skipped.
6470 : */
6471 : if (PageOffline(page)) {
6472 : BUG_ON(page_count(page));
6473 : BUG_ON(PageBuddy(page));
6474 : pfn++;
6475 : continue;
6476 : }
6477 :
6478 : BUG_ON(page_count(page));
6479 : BUG_ON(!PageBuddy(page));
6480 : order = buddy_order(page);
6481 : del_page_from_free_list(page, zone, order);
6482 : pfn += (1 << order);
6483 : }
6484 : spin_unlock_irqrestore(&zone->lock, flags);
6485 : }
6486 : #endif
6487 :
6488 : /*
6489 : * This function returns a stable result only if called under zone lock.
6490 : */
6491 0 : bool is_free_buddy_page(struct page *page)
6492 : {
6493 0 : unsigned long pfn = page_to_pfn(page);
6494 : unsigned int order;
6495 :
6496 0 : for (order = 0; order <= MAX_ORDER; order++) {
6497 0 : struct page *page_head = page - (pfn & ((1 << order) - 1));
6498 :
6499 0 : if (PageBuddy(page_head) &&
6500 0 : buddy_order_unsafe(page_head) >= order)
6501 : break;
6502 : }
6503 :
6504 0 : return order <= MAX_ORDER;
6505 : }
6506 : EXPORT_SYMBOL(is_free_buddy_page);
6507 :
6508 : #ifdef CONFIG_MEMORY_FAILURE
6509 : /*
6510 : * Break down a higher-order page in sub-pages, and keep our target out of
6511 : * buddy allocator.
6512 : */
6513 : static void break_down_buddy_pages(struct zone *zone, struct page *page,
6514 : struct page *target, int low, int high,
6515 : int migratetype)
6516 : {
6517 : unsigned long size = 1 << high;
6518 : struct page *current_buddy, *next_page;
6519 :
6520 : while (high > low) {
6521 : high--;
6522 : size >>= 1;
6523 :
6524 : if (target >= &page[size]) {
6525 : next_page = page + size;
6526 : current_buddy = page;
6527 : } else {
6528 : next_page = page;
6529 : current_buddy = page + size;
6530 : }
6531 :
6532 : if (set_page_guard(zone, current_buddy, high, migratetype))
6533 : continue;
6534 :
6535 : if (current_buddy != target) {
6536 : add_to_free_list(current_buddy, zone, high, migratetype);
6537 : set_buddy_order(current_buddy, high);
6538 : page = next_page;
6539 : }
6540 : }
6541 : }
6542 :
6543 : /*
6544 : * Take a page that will be marked as poisoned off the buddy allocator.
6545 : */
6546 : bool take_page_off_buddy(struct page *page)
6547 : {
6548 : struct zone *zone = page_zone(page);
6549 : unsigned long pfn = page_to_pfn(page);
6550 : unsigned long flags;
6551 : unsigned int order;
6552 : bool ret = false;
6553 :
6554 : spin_lock_irqsave(&zone->lock, flags);
6555 : for (order = 0; order <= MAX_ORDER; order++) {
6556 : struct page *page_head = page - (pfn & ((1 << order) - 1));
6557 : int page_order = buddy_order(page_head);
6558 :
6559 : if (PageBuddy(page_head) && page_order >= order) {
6560 : unsigned long pfn_head = page_to_pfn(page_head);
6561 : int migratetype = get_pfnblock_migratetype(page_head,
6562 : pfn_head);
6563 :
6564 : del_page_from_free_list(page_head, zone, page_order);
6565 : break_down_buddy_pages(zone, page_head, page, 0,
6566 : page_order, migratetype);
6567 : SetPageHWPoisonTakenOff(page);
6568 : if (!is_migrate_isolate(migratetype))
6569 : __mod_zone_freepage_state(zone, -1, migratetype);
6570 : ret = true;
6571 : break;
6572 : }
6573 : if (page_count(page_head) > 0)
6574 : break;
6575 : }
6576 : spin_unlock_irqrestore(&zone->lock, flags);
6577 : return ret;
6578 : }
6579 :
6580 : /*
6581 : * Cancel takeoff done by take_page_off_buddy().
6582 : */
6583 : bool put_page_back_buddy(struct page *page)
6584 : {
6585 : struct zone *zone = page_zone(page);
6586 : unsigned long pfn = page_to_pfn(page);
6587 : unsigned long flags;
6588 : int migratetype = get_pfnblock_migratetype(page, pfn);
6589 : bool ret = false;
6590 :
6591 : spin_lock_irqsave(&zone->lock, flags);
6592 : if (put_page_testzero(page)) {
6593 : ClearPageHWPoisonTakenOff(page);
6594 : __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
6595 : if (TestClearPageHWPoison(page)) {
6596 : ret = true;
6597 : }
6598 : }
6599 : spin_unlock_irqrestore(&zone->lock, flags);
6600 :
6601 : return ret;
6602 : }
6603 : #endif
6604 :
6605 : #ifdef CONFIG_ZONE_DMA
6606 : bool has_managed_dma(void)
6607 : {
6608 : struct pglist_data *pgdat;
6609 :
6610 : for_each_online_pgdat(pgdat) {
6611 : struct zone *zone = &pgdat->node_zones[ZONE_DMA];
6612 :
6613 : if (managed_zone(zone))
6614 : return true;
6615 : }
6616 : return false;
6617 : }
6618 : #endif /* CONFIG_ZONE_DMA */
6619 :
6620 : #ifdef CONFIG_UNACCEPTED_MEMORY
6621 :
6622 : /* Counts number of zones with unaccepted pages. */
6623 : static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
6624 :
6625 : static bool lazy_accept = true;
6626 :
6627 : static int __init accept_memory_parse(char *p)
6628 : {
6629 : if (!strcmp(p, "lazy")) {
6630 : lazy_accept = true;
6631 : return 0;
6632 : } else if (!strcmp(p, "eager")) {
6633 : lazy_accept = false;
6634 : return 0;
6635 : } else {
6636 : return -EINVAL;
6637 : }
6638 : }
6639 : early_param("accept_memory", accept_memory_parse);
6640 :
6641 : static bool page_contains_unaccepted(struct page *page, unsigned int order)
6642 : {
6643 : phys_addr_t start = page_to_phys(page);
6644 : phys_addr_t end = start + (PAGE_SIZE << order);
6645 :
6646 : return range_contains_unaccepted_memory(start, end);
6647 : }
6648 :
6649 : static void accept_page(struct page *page, unsigned int order)
6650 : {
6651 : phys_addr_t start = page_to_phys(page);
6652 :
6653 : accept_memory(start, start + (PAGE_SIZE << order));
6654 : }
6655 :
6656 : static bool try_to_accept_memory_one(struct zone *zone)
6657 : {
6658 : unsigned long flags;
6659 : struct page *page;
6660 : bool last;
6661 :
6662 : if (list_empty(&zone->unaccepted_pages))
6663 : return false;
6664 :
6665 : spin_lock_irqsave(&zone->lock, flags);
6666 : page = list_first_entry_or_null(&zone->unaccepted_pages,
6667 : struct page, lru);
6668 : if (!page) {
6669 : spin_unlock_irqrestore(&zone->lock, flags);
6670 : return false;
6671 : }
6672 :
6673 : list_del(&page->lru);
6674 : last = list_empty(&zone->unaccepted_pages);
6675 :
6676 : __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
6677 : __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
6678 : spin_unlock_irqrestore(&zone->lock, flags);
6679 :
6680 : accept_page(page, MAX_ORDER);
6681 :
6682 : __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
6683 :
6684 : if (last)
6685 : static_branch_dec(&zones_with_unaccepted_pages);
6686 :
6687 : return true;
6688 : }
6689 :
6690 : static bool try_to_accept_memory(struct zone *zone, unsigned int order)
6691 : {
6692 : long to_accept;
6693 : int ret = false;
6694 :
6695 : /* How much to accept to get to high watermark? */
6696 : to_accept = high_wmark_pages(zone) -
6697 : (zone_page_state(zone, NR_FREE_PAGES) -
6698 : __zone_watermark_unusable_free(zone, order, 0));
6699 :
6700 : /* Accept at least one page */
6701 : do {
6702 : if (!try_to_accept_memory_one(zone))
6703 : break;
6704 : ret = true;
6705 : to_accept -= MAX_ORDER_NR_PAGES;
6706 : } while (to_accept > 0);
6707 :
6708 : return ret;
6709 : }
6710 :
6711 : static inline bool has_unaccepted_memory(void)
6712 : {
6713 : return static_branch_unlikely(&zones_with_unaccepted_pages);
6714 : }
6715 :
6716 : static bool __free_unaccepted(struct page *page)
6717 : {
6718 : struct zone *zone = page_zone(page);
6719 : unsigned long flags;
6720 : bool first = false;
6721 :
6722 : if (!lazy_accept)
6723 : return false;
6724 :
6725 : spin_lock_irqsave(&zone->lock, flags);
6726 : first = list_empty(&zone->unaccepted_pages);
6727 : list_add_tail(&page->lru, &zone->unaccepted_pages);
6728 : __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
6729 : __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
6730 : spin_unlock_irqrestore(&zone->lock, flags);
6731 :
6732 : if (first)
6733 : static_branch_inc(&zones_with_unaccepted_pages);
6734 :
6735 : return true;
6736 : }
6737 :
6738 : #else
6739 :
6740 : static bool page_contains_unaccepted(struct page *page, unsigned int order)
6741 : {
6742 : return false;
6743 : }
6744 :
6745 : static void accept_page(struct page *page, unsigned int order)
6746 : {
6747 : }
6748 :
6749 : static bool try_to_accept_memory(struct zone *zone, unsigned int order)
6750 : {
6751 : return false;
6752 : }
6753 :
6754 : static inline bool has_unaccepted_memory(void)
6755 : {
6756 : return false;
6757 : }
6758 :
6759 : static bool __free_unaccepted(struct page *page)
6760 : {
6761 : BUILD_BUG();
6762 : return false;
6763 : }
6764 :
6765 : #endif /* CONFIG_UNACCEPTED_MEMORY */
|