Line data Source code
1 : /* SPDX-License-Identifier: GPL-2.0 */
2 : #ifndef _LINUX_MMZONE_H
3 : #define _LINUX_MMZONE_H
4 :
5 : #ifndef __ASSEMBLY__
6 : #ifndef __GENERATING_BOUNDS_H
7 :
8 : #include <linux/spinlock.h>
9 : #include <linux/list.h>
10 : #include <linux/list_nulls.h>
11 : #include <linux/wait.h>
12 : #include <linux/bitops.h>
13 : #include <linux/cache.h>
14 : #include <linux/threads.h>
15 : #include <linux/numa.h>
16 : #include <linux/init.h>
17 : #include <linux/seqlock.h>
18 : #include <linux/nodemask.h>
19 : #include <linux/pageblock-flags.h>
20 : #include <linux/page-flags-layout.h>
21 : #include <linux/atomic.h>
22 : #include <linux/mm_types.h>
23 : #include <linux/page-flags.h>
24 : #include <linux/local_lock.h>
25 : #include <asm/page.h>
26 :
27 : /* Free memory management - zoned buddy allocator. */
28 : #ifndef CONFIG_ARCH_FORCE_MAX_ORDER
29 : #define MAX_ORDER 10
30 : #else
31 : #define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
32 : #endif
33 : #define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
34 :
35 : #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
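/*
 * Illustrative sketch (not part of this header): with the default MAX_ORDER
 * of 10 and an assumed 4 KiB PAGE_SIZE, the largest buddy block spans
 * MAX_ORDER_NR_PAGES = 1 << 10 = 1024 pages, i.e. 4 MiB. The hypothetical
 * helper below only demonstrates the arithmetic.
 */
static inline unsigned long example_max_buddy_bytes(void)
{
	/* pages in the largest buddy block times the page size */
	return (unsigned long)MAX_ORDER_NR_PAGES << PAGE_SHIFT;
}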
36 :
37 : /*
38 : * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
39 : * costly to service. That is between allocation orders which should
40 : * coalesce naturally under reasonable reclaim pressure and those which
41 : * will not.
42 : */
43 : #define PAGE_ALLOC_COSTLY_ORDER 3
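/*
 * Hypothetical illustration (not a kernel API): orders up to
 * PAGE_ALLOC_COSTLY_ORDER (8 pages with the default of 3) are expected to
 * be satisfiable under reasonable reclaim pressure, while larger orders are
 * treated as "costly" and handled more conservatively by the allocator.
 */
static inline bool example_order_is_costly(unsigned int order)
{
	return order > PAGE_ALLOC_COSTLY_ORDER;
}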
44 :
45 : enum migratetype {
46 : MIGRATE_UNMOVABLE,
47 : MIGRATE_MOVABLE,
48 : MIGRATE_RECLAIMABLE,
49 : MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
50 : MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
51 : #ifdef CONFIG_CMA
52 : /*
53 : * MIGRATE_CMA migration type is designed to mimic the way
54 : * ZONE_MOVABLE works. Only movable pages can be allocated
55 : * from MIGRATE_CMA pageblocks and the page allocator never
56 : * implicitly changes the migration type of a MIGRATE_CMA pageblock.
57 : *
58 : * The way to use it is to change the migratetype of a range of
59 : * pageblocks to MIGRATE_CMA, which can be done by the
60 : * __free_pageblock_cma() function.
61 : */
62 : MIGRATE_CMA,
63 : #endif
64 : #ifdef CONFIG_MEMORY_ISOLATION
65 : MIGRATE_ISOLATE, /* can't allocate from here */
66 : #endif
67 : MIGRATE_TYPES
68 : };
69 :
70 : /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
71 : extern const char * const migratetype_names[MIGRATE_TYPES];
72 :
73 : #ifdef CONFIG_CMA
74 : # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
75 : # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
76 : #else
77 : # define is_migrate_cma(migratetype) false
78 : # define is_migrate_cma_page(_page) false
79 : #endif
80 :
81 : static inline bool is_migrate_movable(int mt)
82 : {
83 0 : return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
84 : }
85 :
86 : /*
87 : * Check whether a migratetype can be merged with another migratetype.
88 : *
89 : * It is only mergeable when it can fall back to other migratetypes for
90 : * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c.
91 : */
92 : static inline bool migratetype_is_mergeable(int mt)
93 : {
94 : return mt < MIGRATE_PCPTYPES;
95 : }
96 :
97 : #define for_each_migratetype_order(order, type) \
98 : for (order = 0; order <= MAX_ORDER; order++) \
99 : for (type = 0; type < MIGRATE_TYPES; type++)
100 :
101 : extern int page_group_by_mobility_disabled;
102 :
103 : #define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)
104 :
105 : #define get_pageblock_migratetype(page) \
106 : get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)
107 :
108 : struct free_area {
109 : struct list_head free_list[MIGRATE_TYPES];
110 : unsigned long nr_free;
111 : };
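/*
 * Usage sketch (illustrative only, not a kernel API): initialise every
 * per-order, per-migratetype free list of a free_area array of MAX_ORDER + 1
 * entries, mirroring what the page allocator does at zone init time.
 */
static inline void example_init_free_areas(struct free_area *areas)
{
	unsigned int order, type;

	for_each_migratetype_order(order, type) {
		INIT_LIST_HEAD(&areas[order].free_list[type]);
		areas[order].nr_free = 0;
	}
}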
112 :
113 : struct pglist_data;
114 :
115 : #ifdef CONFIG_NUMA
116 : enum numa_stat_item {
117 : NUMA_HIT, /* allocated in intended node */
118 : NUMA_MISS, /* allocated in non-intended node */
119 : NUMA_FOREIGN, /* was intended here, hit elsewhere */
120 : NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */
121 : NUMA_LOCAL, /* allocation from local node */
122 : NUMA_OTHER, /* allocation from other node */
123 : NR_VM_NUMA_EVENT_ITEMS
124 : };
125 : #else
126 : #define NR_VM_NUMA_EVENT_ITEMS 0
127 : #endif
128 :
129 : enum zone_stat_item {
130 : /* First 128 byte cacheline (assuming 64 bit words) */
131 : NR_FREE_PAGES,
132 : NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
133 : NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
134 : NR_ZONE_ACTIVE_ANON,
135 : NR_ZONE_INACTIVE_FILE,
136 : NR_ZONE_ACTIVE_FILE,
137 : NR_ZONE_UNEVICTABLE,
138 : NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
139 : NR_MLOCK, /* mlock()ed pages found and moved off LRU */
140 : /* Second 128 byte cacheline */
141 : NR_BOUNCE,
142 : #if IS_ENABLED(CONFIG_ZSMALLOC)
143 : NR_ZSPAGES, /* allocated in zsmalloc */
144 : #endif
145 : NR_FREE_CMA_PAGES,
146 : NR_VM_ZONE_STAT_ITEMS };
147 :
148 : enum node_stat_item {
149 : NR_LRU_BASE,
150 : NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
151 : NR_ACTIVE_ANON, /* " " " " " */
152 : NR_INACTIVE_FILE, /* " " " " " */
153 : NR_ACTIVE_FILE, /* " " " " " */
154 : NR_UNEVICTABLE, /* " " " " " */
155 : NR_SLAB_RECLAIMABLE_B,
156 : NR_SLAB_UNRECLAIMABLE_B,
157 : NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
158 : NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
159 : WORKINGSET_NODES,
160 : WORKINGSET_REFAULT_BASE,
161 : WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE,
162 : WORKINGSET_REFAULT_FILE,
163 : WORKINGSET_ACTIVATE_BASE,
164 : WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE,
165 : WORKINGSET_ACTIVATE_FILE,
166 : WORKINGSET_RESTORE_BASE,
167 : WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE,
168 : WORKINGSET_RESTORE_FILE,
169 : WORKINGSET_NODERECLAIM,
170 : NR_ANON_MAPPED, /* Mapped anonymous pages */
171 : NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
172 : only modified from process context */
173 : NR_FILE_PAGES,
174 : NR_FILE_DIRTY,
175 : NR_WRITEBACK,
176 : NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */
177 : NR_SHMEM, /* shmem pages (includes tmpfs/GEM pages) */
178 : NR_SHMEM_THPS,
179 : NR_SHMEM_PMDMAPPED,
180 : NR_FILE_THPS,
181 : NR_FILE_PMDMAPPED,
182 : NR_ANON_THPS,
183 : NR_VMSCAN_WRITE,
184 : NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
185 : NR_DIRTIED, /* page dirtyings since bootup */
186 : NR_WRITTEN, /* page writings since bootup */
187 : NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */
188 : NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
189 : NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */
190 : NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */
191 : NR_KERNEL_STACK_KB, /* measured in KiB */
192 : #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
193 : NR_KERNEL_SCS_KB, /* measured in KiB */
194 : #endif
195 : NR_PAGETABLE, /* used for pagetables */
196 : NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */
197 : #ifdef CONFIG_SWAP
198 : NR_SWAPCACHE,
199 : #endif
200 : #ifdef CONFIG_NUMA_BALANCING
201 : PGPROMOTE_SUCCESS, /* pages promoted successfully */
202 : PGPROMOTE_CANDIDATE, /* candidate pages to promote */
203 : #endif
204 : NR_VM_NODE_STAT_ITEMS
205 : };
206 :
207 : /*
208 : * Returns true if the item should be printed in THPs: /proc/vmstat
209 : * currently prints the number of anon, file and shmem THPs, but the
210 : * item itself is charged in pages.
211 : */
212 : static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
213 : {
214 : if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
215 : return false;
216 :
217 : return item == NR_ANON_THPS ||
218 : item == NR_FILE_THPS ||
219 : item == NR_SHMEM_THPS ||
220 : item == NR_SHMEM_PMDMAPPED ||
221 : item == NR_FILE_PMDMAPPED;
222 : }
223 :
224 : /*
225 : * Returns true if the value is measured in bytes (most vmstat values are
226 : * measured in pages). This defines the API part; the internal representation
227 : * might be different.
228 : */
229 : static __always_inline bool vmstat_item_in_bytes(int idx)
230 : {
231 : /*
232 : * Global and per-node slab counters track slab pages.
233 : * It's expected that changes are multiples of PAGE_SIZE.
234 : * Internally values are stored in pages.
235 : *
236 : * Per-memcg and per-lruvec counters track memory, consumed
237 : * by individual slab objects. These counters are actually
238 : * byte-precise.
239 : */
240 3947 : return (idx == NR_SLAB_RECLAIMABLE_B ||
241 : idx == NR_SLAB_UNRECLAIMABLE_B);
242 : }
243 :
244 : /*
245 : * We do arithmetic on the LRU lists in various places in the code,
246 : * so it is important to keep the active lists LRU_ACTIVE higher in
247 : * the array than the corresponding inactive lists, and to keep
248 : * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
249 : *
250 : * This has to be kept in sync with the statistics in zone_stat_item
251 : * above and the descriptions in vmstat_text in mm/vmstat.c
252 : */
253 : #define LRU_BASE 0
254 : #define LRU_ACTIVE 1
255 : #define LRU_FILE 2
256 :
257 : enum lru_list {
258 : LRU_INACTIVE_ANON = LRU_BASE,
259 : LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
260 : LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
261 : LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
262 : LRU_UNEVICTABLE,
263 : NR_LRU_LISTS
264 : };
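/*
 * Worked sketch of the LRU index arithmetic described above (illustrative
 * helper, not a kernel API): the list for a given page is LRU_BASE, plus
 * LRU_FILE if it is file-backed, plus LRU_ACTIVE if it is active. An active
 * file page therefore lands on 0 + 2 + 1 = LRU_ACTIVE_FILE. Unevictable
 * pages are handled separately via LRU_UNEVICTABLE.
 */
static inline enum lru_list example_lru_index(bool file, bool active)
{
	enum lru_list lru = LRU_BASE;

	if (file)
		lru += LRU_FILE;
	if (active)
		lru += LRU_ACTIVE;
	return lru;
}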
265 :
266 : enum vmscan_throttle_state {
267 : VMSCAN_THROTTLE_WRITEBACK,
268 : VMSCAN_THROTTLE_ISOLATED,
269 : VMSCAN_THROTTLE_NOPROGRESS,
270 : VMSCAN_THROTTLE_CONGESTED,
271 : NR_VMSCAN_THROTTLE,
272 : };
273 :
274 : #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
275 :
276 : #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
277 :
278 : static inline bool is_file_lru(enum lru_list lru)
279 : {
280 0 : return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
281 : }
282 :
283 : static inline bool is_active_lru(enum lru_list lru)
284 : {
285 0 : return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
286 : }
287 :
288 : #define WORKINGSET_ANON 0
289 : #define WORKINGSET_FILE 1
290 : #define ANON_AND_FILE 2
291 :
292 : enum lruvec_flags {
293 : LRUVEC_CONGESTED, /* lruvec has many dirty pages
294 : * backed by a congested BDI
295 : */
296 : };
297 :
298 : #endif /* !__GENERATING_BOUNDS_H */
299 :
300 : /*
301 : * Evictable pages are divided into multiple generations. The youngest and the
302 : * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
303 : * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
304 : * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
305 : * corresponding generation. The gen counter in folio->flags stores gen+1 while
306 : * a page is on one of lrugen->folios[]. Otherwise it stores 0.
307 : *
308 : * A page is added to the youngest generation on faulting. The aging needs to
309 : * check the accessed bit at least twice before handing this page over to the
310 : * eviction. The first check takes care of the accessed bit set on the initial
311 : * fault; the second check makes sure this page hasn't been used since then.
312 : * This process, AKA second chance, requires a minimum of two generations,
313 : * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
314 : * LRU, e.g., /proc/vmstat, these two generations are considered active; the
315 : * rest of generations, if they exist, are considered inactive. See
316 : * lru_gen_is_active().
317 : *
318 : * PG_active is always cleared while a page is on one of lrugen->folios[] so
319 : * that the aging need not worry about it. And it's set again when a page
320 : * considered active is isolated for non-reclaiming purposes, e.g., migration.
321 : * See lru_gen_add_folio() and lru_gen_del_folio().
322 : *
323 : * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
324 : * number of categories of the active/inactive LRU when keeping track of
325 : * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
326 : * in folio->flags.
327 : */
328 : #define MIN_NR_GENS 2U
329 : #define MAX_NR_GENS 4U
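/*
 * Worked sketch of the generation bookkeeping described above (hypothetical
 * helpers, not a kernel API): a sequence number maps to an LRU list index by
 * taking it modulo MAX_NR_GENS, so the window [min_seq, max_seq] reuses at
 * most four lists per type and zone. While a folio sits on lrugen->folios[],
 * the gen counter in folio->flags holds that index plus one; zero means the
 * folio is not on a multi-gen list.
 */
static inline unsigned int example_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

static inline unsigned long example_gen_counter_value(unsigned long seq)
{
	/* value stored in folio->flags while the folio is on folios[gen] */
	return example_gen_from_seq(seq) + 1;
}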
330 :
331 : /*
332 : * Each generation is divided into multiple tiers. A page accessed N times
333 : * through file descriptors is in tier order_base_2(N). A page in the first tier
334 : * (N=0,1) is marked by PG_referenced unless it was faulted in through page
335 : * tables or read ahead. A page in any other tier (N>1) is marked by
336 : * PG_referenced and PG_workingset. This implies a minimum of two tiers is
337 : * supported without using additional bits in folio->flags.
338 : *
339 : * In contrast to moving across generations which requires the LRU lock, moving
340 : * across tiers only involves atomic operations on folio->flags and therefore
341 : * has a negligible cost in the buffered access path. In the eviction path,
342 : * comparisons of refaulted/(evicted+protected) from the first tier and the
343 : * rest infer whether pages accessed multiple times through file descriptors
344 : * are statistically hot and thus worth protecting.
345 : *
346 : * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
347 : * number of categories of the active/inactive LRU when keeping track of
348 : * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
349 : * folio->flags.
350 : */
351 : #define MAX_NR_TIERS 4U
352 :
353 : #ifndef __GENERATING_BOUNDS_H
354 :
355 : struct lruvec;
356 : struct page_vma_mapped_walk;
357 :
358 : #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
359 : #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
360 :
361 : #ifdef CONFIG_LRU_GEN
362 :
363 : enum {
364 : LRU_GEN_ANON,
365 : LRU_GEN_FILE,
366 : };
367 :
368 : enum {
369 : LRU_GEN_CORE,
370 : LRU_GEN_MM_WALK,
371 : LRU_GEN_NONLEAF_YOUNG,
372 : NR_LRU_GEN_CAPS
373 : };
374 :
375 : #define MIN_LRU_BATCH BITS_PER_LONG
376 : #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
377 :
378 : /* whether to keep historical stats from evicted generations */
379 : #ifdef CONFIG_LRU_GEN_STATS
380 : #define NR_HIST_GENS MAX_NR_GENS
381 : #else
382 : #define NR_HIST_GENS 1U
383 : #endif
384 :
385 : /*
386 : * The youngest generation number is stored in max_seq for both anon and file
387 : * types as they are aged on an equal footing. The oldest generation numbers are
388 : * stored in min_seq[] separately for anon and file types as clean file pages
389 : * can be evicted regardless of swap constraints.
390 : *
391 : * Normally anon and file min_seq are in sync. But if swapping is constrained,
392 : * e.g., out of swap space, file min_seq is allowed to advance and leave anon
393 : * min_seq behind.
394 : *
395 : * The number of pages in each generation is eventually consistent and therefore
396 : * can be transiently negative when reset_batch_size() is pending.
397 : */
398 : struct lru_gen_folio {
399 : /* the aging increments the youngest generation number */
400 : unsigned long max_seq;
401 : /* the eviction increments the oldest generation numbers */
402 : unsigned long min_seq[ANON_AND_FILE];
403 : /* the birth time of each generation in jiffies */
404 : unsigned long timestamps[MAX_NR_GENS];
405 : /* the multi-gen LRU lists, lazily sorted on eviction */
406 : struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
407 : /* the multi-gen LRU sizes, eventually consistent */
408 : long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
409 : /* the exponential moving average of refaulted */
410 : unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
411 : /* the exponential moving average of evicted+protected */
412 : unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
413 : /* the first tier doesn't need protection, hence the minus one */
414 : unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
415 : /* can be modified without holding the LRU lock */
416 : atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
417 : atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
418 : /* whether the multi-gen LRU is enabled */
419 : bool enabled;
420 : #ifdef CONFIG_MEMCG
421 : /* the memcg generation this lru_gen_folio belongs to */
422 : u8 gen;
423 : /* the list segment this lru_gen_folio belongs to */
424 : u8 seg;
425 : /* per-node lru_gen_folio list for global reclaim */
426 : struct hlist_nulls_node list;
427 : #endif
428 : };
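/*
 * Sketch (illustrative only, not a kernel API): the multi-gen LRU lists
 * above are indexed by (generation, type, zone): the generation index is the
 * sequence number modulo MAX_NR_GENS, the type is LRU_GEN_ANON or
 * LRU_GEN_FILE, and the zone index comes from the folio. A lookup therefore
 * boils down to:
 */
static inline struct list_head *
example_lrugen_list(struct lru_gen_folio *lrugen, unsigned long seq,
		    int type, int zone)
{
	return &lrugen->folios[seq % MAX_NR_GENS][type][zone];
}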
429 :
430 : enum {
431 : MM_LEAF_TOTAL, /* total leaf entries */
432 : MM_LEAF_OLD, /* old leaf entries */
433 : MM_LEAF_YOUNG, /* young leaf entries */
434 : MM_NONLEAF_TOTAL, /* total non-leaf entries */
435 : MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
436 : MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
437 : NR_MM_STATS
438 : };
439 :
440 : /* double-buffering Bloom filters */
441 : #define NR_BLOOM_FILTERS 2
442 :
443 : struct lru_gen_mm_state {
444 : /* set to max_seq after each iteration */
445 : unsigned long seq;
446 : /* where the current iteration continues after */
447 : struct list_head *head;
448 : /* where the last iteration ended before */
449 : struct list_head *tail;
450 : /* Bloom filters flip after each iteration */
451 : unsigned long *filters[NR_BLOOM_FILTERS];
452 : /* the mm stats for debugging */
453 : unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
454 : };
455 :
456 : struct lru_gen_mm_walk {
457 : /* the lruvec under reclaim */
458 : struct lruvec *lruvec;
459 : /* unstable max_seq from lru_gen_folio */
460 : unsigned long max_seq;
461 : /* the next address within an mm to scan */
462 : unsigned long next_addr;
463 : /* to batch promoted pages */
464 : int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
465 : /* to batch the mm stats */
466 : int mm_stats[NR_MM_STATS];
467 : /* total batched items */
468 : int batched;
469 : bool can_swap;
470 : bool force_scan;
471 : };
472 :
473 : void lru_gen_init_lruvec(struct lruvec *lruvec);
474 : void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
475 :
476 : #ifdef CONFIG_MEMCG
477 :
478 : /*
479 : * For each node, memcgs are divided into two generations: the old and the
480 : * young. For each generation, memcgs are randomly sharded into multiple bins
481 : * to improve scalability. For each bin, the hlist_nulls is virtually divided
482 : * into three segments: the head, the tail and the default.
483 : *
484 : * An onlining memcg is added to the tail of a random bin in the old generation.
485 : * The eviction starts at the head of a random bin in the old generation. The
486 : * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
487 : * the old generation, is incremented when all its bins become empty.
488 : *
489 : * There are four operations:
490 : * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its
491 : * current generation (old or young) and updates its "seg" to "head";
492 : * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its
493 : * current generation (old or young) and updates its "seg" to "tail";
494 : * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old
495 : * generation, updates its "gen" to "old" and resets its "seg" to "default";
496 : * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the
497 : * young generation, updates its "gen" to "young" and resets its "seg" to
498 : * "default".
499 : *
500 : * The events that trigger the above operations are:
501 : * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
502 : * 2. The first attempt to reclaim a memcg below low, which triggers
503 : * MEMCG_LRU_TAIL;
504 : * 3. The first attempt to reclaim a memcg below the reclaimable size threshold,
505 : * which triggers MEMCG_LRU_TAIL;
506 : * 4. The second attempt to reclaim a memcg below the reclaimable size threshold,
507 : * which triggers MEMCG_LRU_YOUNG;
508 : * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
509 : * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
510 : * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
511 : *
512 : * Note that memcg LRU only applies to global reclaim, and the round-robin
513 : * incrementing of their max_seq counters ensures the eventual fairness to all
514 : * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
515 : */
516 : #define MEMCG_NR_GENS 2
517 : #define MEMCG_NR_BINS 8
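/*
 * Worked sketch of the memcg LRU indexing described above (hypothetical
 * helpers, not a kernel API): the per-node counter taken modulo
 * MEMCG_NR_GENS selects the old generation, and the other generation is the
 * young one. With MEMCG_NR_GENS == 2 the two simply alternate as the counter
 * is incremented.
 */
static inline int example_memcg_old_gen(unsigned long seq)
{
	return seq % MEMCG_NR_GENS;
}

static inline int example_memcg_young_gen(unsigned long seq)
{
	return (seq + 1) % MEMCG_NR_GENS;
}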
518 :
519 : struct lru_gen_memcg {
520 : /* the per-node memcg generation counter */
521 : unsigned long seq;
522 : /* each memcg has one lru_gen_folio per node */
523 : unsigned long nr_memcgs[MEMCG_NR_GENS];
524 : /* per-node lru_gen_folio list for global reclaim */
525 : struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
526 : /* protects the above */
527 : spinlock_t lock;
528 : };
529 :
530 : void lru_gen_init_pgdat(struct pglist_data *pgdat);
531 :
532 : void lru_gen_init_memcg(struct mem_cgroup *memcg);
533 : void lru_gen_exit_memcg(struct mem_cgroup *memcg);
534 : void lru_gen_online_memcg(struct mem_cgroup *memcg);
535 : void lru_gen_offline_memcg(struct mem_cgroup *memcg);
536 : void lru_gen_release_memcg(struct mem_cgroup *memcg);
537 : void lru_gen_soft_reclaim(struct lruvec *lruvec);
538 :
539 : #else /* !CONFIG_MEMCG */
540 :
541 : #define MEMCG_NR_GENS 1
542 :
543 : struct lru_gen_memcg {
544 : };
545 :
546 : static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
547 : {
548 : }
549 :
550 : #endif /* CONFIG_MEMCG */
551 :
552 : #else /* !CONFIG_LRU_GEN */
553 :
554 : static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
555 : {
556 : }
557 :
558 : static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
559 : {
560 : }
561 :
562 : static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
563 : {
564 : }
565 :
566 : #ifdef CONFIG_MEMCG
567 :
568 : static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
569 : {
570 : }
571 :
572 : static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
573 : {
574 : }
575 :
576 : static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
577 : {
578 : }
579 :
580 : static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
581 : {
582 : }
583 :
584 : static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
585 : {
586 : }
587 :
588 : static inline void lru_gen_soft_reclaim(struct lruvec *lruvec)
589 : {
590 : }
591 :
592 : #endif /* CONFIG_MEMCG */
593 :
594 : #endif /* CONFIG_LRU_GEN */
595 :
596 : struct lruvec {
597 : struct list_head lists[NR_LRU_LISTS];
598 : /* per lruvec lru_lock for memcg */
599 : spinlock_t lru_lock;
600 : /*
601 : * These track the cost of reclaiming one LRU - file or anon -
602 : * over the other. As the observed cost of reclaiming one LRU
603 : * increases, the reclaim scan balance tips toward the other.
604 : */
605 : unsigned long anon_cost;
606 : unsigned long file_cost;
607 : /* Non-resident age, driven by LRU movement */
608 : atomic_long_t nonresident_age;
609 : /* Refaults at the time of last reclaim cycle */
610 : unsigned long refaults[ANON_AND_FILE];
611 : /* Various lruvec state flags (enum lruvec_flags) */
612 : unsigned long flags;
613 : #ifdef CONFIG_LRU_GEN
614 : /* evictable pages divided into generations */
615 : struct lru_gen_folio lrugen;
616 : /* to concurrently iterate lru_gen_mm_list */
617 : struct lru_gen_mm_state mm_state;
618 : #endif
619 : #ifdef CONFIG_MEMCG
620 : struct pglist_data *pgdat;
621 : #endif
622 : };
623 :
624 : /* Isolate unmapped pages */
625 : #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
626 : /* Isolate for asynchronous migration */
627 : #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4)
628 : /* Isolate unevictable pages */
629 : #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8)
630 :
631 : /* LRU Isolation modes. */
632 : typedef unsigned __bitwise isolate_mode_t;
633 :
634 : enum zone_watermarks {
635 : WMARK_MIN,
636 : WMARK_LOW,
637 : WMARK_HIGH,
638 : WMARK_PROMO,
639 : NR_WMARK
640 : };
641 :
642 : /*
643 : * One list per pcp migratetype for each order up to PAGE_ALLOC_COSTLY_ORDER,
644 : * plus one additional list for THP, which will usually be __GFP_MOVABLE.
645 : * Even if it is another type, it should not contribute to serious
646 : * fragmentation causing THP allocation failures.
647 : */
648 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
649 : #define NR_PCP_THP 1
650 : #else
651 : #define NR_PCP_THP 0
652 : #endif
653 : #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
654 : #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
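/*
 * Worked example of the sizing above, assuming a default configuration:
 * MIGRATE_PCPTYPES is 3 (unmovable, movable, reclaimable) and
 * PAGE_ALLOC_COSTLY_ORDER is 3, so NR_LOWORDER_PCP_LISTS = 3 * (3 + 1) = 12
 * lists covering orders 0-3; with CONFIG_TRANSPARENT_HUGEPAGE one extra THP
 * list brings NR_PCP_LISTS to 13.
 */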
655 :
656 : #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
657 : #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
658 : #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
659 : #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
660 :
661 : /* Fields and list protected by pagesets local_lock in page_alloc.c */
662 : struct per_cpu_pages {
663 : spinlock_t lock; /* Protects lists field */
664 : int count; /* number of pages in the list */
665 : int high; /* high watermark, emptying needed */
666 : int batch; /* chunk size for buddy add/remove */
667 : short free_factor; /* batch scaling factor during free */
668 : #ifdef CONFIG_NUMA
669 : short expire; /* When 0, remote pagesets are drained */
670 : #endif
671 :
672 : /* Lists of pages, one per migrate type stored on the pcp-lists */
673 : struct list_head lists[NR_PCP_LISTS];
674 : } ____cacheline_aligned_in_smp;
675 :
676 : struct per_cpu_zonestat {
677 : #ifdef CONFIG_SMP
678 : s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
679 : s8 stat_threshold;
680 : #endif
681 : #ifdef CONFIG_NUMA
682 : /*
683 : * Low priority inaccurate counters that are only folded
684 : * on demand. Use a large type to avoid the overhead of
685 : * folding during refresh_cpu_vm_stats.
686 : */
687 : unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
688 : #endif
689 : };
690 :
691 : struct per_cpu_nodestat {
692 : s8 stat_threshold;
693 : s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
694 : };
695 :
696 : #endif /* !__GENERATING_BOUNDS_H */
697 :
698 : enum zone_type {
699 : /*
700 : * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
701 : * to DMA to all of the addressable memory (ZONE_NORMAL).
702 : * On architectures where this area covers the whole 32 bit address
703 : * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
704 : * DMA addressing constraints. This distinction is important as a 32bit
705 : * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
706 : * platforms may need both zones as they support peripherals with
707 : * different DMA addressing limitations.
708 : */
709 : #ifdef CONFIG_ZONE_DMA
710 : ZONE_DMA,
711 : #endif
712 : #ifdef CONFIG_ZONE_DMA32
713 : ZONE_DMA32,
714 : #endif
715 : /*
716 : * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
717 : * performed on pages in ZONE_NORMAL if the DMA devices support
718 : * transfers to all addressable memory.
719 : */
720 : ZONE_NORMAL,
721 : #ifdef CONFIG_HIGHMEM
722 : /*
723 : * A memory area that is only addressable by the kernel through
724 : * mapping portions into its own address space. This is for example
725 : * used by i386 to allow the kernel to address the memory beyond
726 : * 900MB. The kernel will set up special mappings (page
727 : * table entries on i386) for each page that the kernel needs to
728 : * access.
729 : */
730 : ZONE_HIGHMEM,
731 : #endif
732 : /*
733 : * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
734 : * movable pages with few exceptional cases described below. Main use
735 : * cases for ZONE_MOVABLE are to make memory offlining/unplug more
736 : * likely to succeed, and to locally limit unmovable allocations - e.g.,
737 : * to increase the number of THP/huge pages. Notable special cases are:
738 : *
739 : * 1. Pinned pages: (long-term) pinning of movable pages might
740 : * essentially turn such pages unmovable. Therefore, we do not allow
741 : * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
742 : * faulted, they come from the right zone right away. However, it is
743 : * still possible that the address space already has pages in
744 : * ZONE_MOVABLE at the time when pages are pinned (i.e. the user has
745 : * touched that memory before pinning). In such a case we migrate them
746 : * to a different zone. When migration fails, pinning fails.
747 : * 2. memblock allocations: kernelcore/movablecore setups might create
748 : * situations where ZONE_MOVABLE contains unmovable allocations
749 : * after boot. Memory offlining and allocations fail early.
750 : * 3. Memory holes: kernelcore/movablecore setups might create very rare
751 : * situations where ZONE_MOVABLE contains memory holes after boot,
752 : * for example, if we have sections that are only partially
753 : * populated. Memory offlining and allocations fail early.
754 : * 4. PG_hwpoison pages: while poisoned pages can be skipped during
755 : * memory offlining, such pages cannot be allocated.
756 : * 5. Unmovable PG_offline pages: in paravirtualized environments,
757 : * hotplugged memory blocks might only partially be managed by the
758 : * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
759 : * parts not managed by the buddy are unmovable PG_offline pages. In
760 : * some cases (virtio-mem), such pages can be skipped during
761 : * memory offlining, however, cannot be moved/allocated. These
762 : * techniques might use alloc_contig_range() to hide previously
763 : * exposed pages from the buddy again (e.g., to implement some sort
764 : * of memory unplug in virtio-mem).
765 : * 6. ZERO_PAGE(0): kernelcore/movablecore setups might create
766 : * situations where ZERO_PAGE(0), which is allocated differently
767 : * on different platforms, may end up in a movable zone. ZERO_PAGE(0)
768 : * cannot be migrated.
769 : * 7. Memory-hotplug: when using memmap_on_memory and onlining the
770 : * memory to the MOVABLE zone, the vmemmap pages are also placed in
771 : * such zone. Such pages cannot be really moved around as they are
772 : * self-stored in the range, but they are treated as movable when
773 : * the range they describe is about to be offlined.
774 : *
775 : * In general, no unmovable allocations that degrade memory offlining
776 : * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
777 : * have to expect that migrating pages in ZONE_MOVABLE can fail (even
778 : * if has_unmovable_pages() states that there are no unmovable pages,
779 : * there can be false negatives).
780 : */
781 : ZONE_MOVABLE,
782 : #ifdef CONFIG_ZONE_DEVICE
783 : ZONE_DEVICE,
784 : #endif
785 : __MAX_NR_ZONES
786 :
787 : };
788 :
789 : #ifndef __GENERATING_BOUNDS_H
790 :
791 : #define ASYNC_AND_SYNC 2
792 :
793 : struct zone {
794 : /* Read-mostly fields */
795 :
796 : /* zone watermarks, access with *_wmark_pages(zone) macros */
797 : unsigned long _watermark[NR_WMARK];
798 : unsigned long watermark_boost;
799 :
800 : unsigned long nr_reserved_highatomic;
801 :
802 : /*
803 : * We don't know if the memory that we're going to allocate will be
804 : * freeable and/or whether it will be released eventually, so to avoid
805 : * totally wasting several GB of RAM we must reserve some of the lower
806 : * zone memory (otherwise we risk running OOM on the lower zones despite
807 : * there being tons of freeable RAM in the higher zones). This array is
808 : * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
809 : * changes.
810 : */
811 : long lowmem_reserve[MAX_NR_ZONES];
812 :
813 : #ifdef CONFIG_NUMA
814 : int node;
815 : #endif
816 : struct pglist_data *zone_pgdat;
817 : struct per_cpu_pages __percpu *per_cpu_pageset;
818 : struct per_cpu_zonestat __percpu *per_cpu_zonestats;
819 : /*
820 : * the high and batch values are copied to individual pagesets for
821 : * faster access
822 : */
823 : int pageset_high;
824 : int pageset_batch;
825 :
826 : #ifndef CONFIG_SPARSEMEM
827 : /*
828 : * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
829 : * In SPARSEMEM, this map is stored in struct mem_section
830 : */
831 : unsigned long *pageblock_flags;
832 : #endif /* CONFIG_SPARSEMEM */
833 :
834 : /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
835 : unsigned long zone_start_pfn;
836 :
837 : /*
838 : * spanned_pages is the total pages spanned by the zone, including
839 : * holes, which is calculated as:
840 : * spanned_pages = zone_end_pfn - zone_start_pfn;
841 : *
842 : * present_pages is physical pages existing within the zone, which
843 : * is calculated as:
844 : * present_pages = spanned_pages - absent_pages(pages in holes);
845 : *
846 : * present_early_pages is present pages existing within the zone
847 : * located on memory available since early boot, excluding hotplugged
848 : * memory.
849 : *
850 : * managed_pages is present pages managed by the buddy system, which
851 : * is calculated as (reserved_pages includes pages allocated by the
852 : * bootmem allocator):
853 : * managed_pages = present_pages - reserved_pages;
854 : *
855 : * cma_pages is present pages that are assigned for CMA use
856 : * (MIGRATE_CMA).
857 : *
858 : * So present_pages may be used by memory hotplug or memory power
859 : * management logic to figure out unmanaged pages by checking
860 : * (present_pages - managed_pages). And managed_pages should be used
861 : * by page allocator and vm scanner to calculate all kinds of watermarks
862 : * and thresholds.
863 : *
864 : * Locking rules:
865 : *
866 : * zone_start_pfn and spanned_pages are protected by span_seqlock.
867 : * It is a seqlock because it has to be read outside of zone->lock,
868 : * and it is done in the main allocator path. But, it is written
869 : * quite infrequently.
870 : *
871 : * The span_seq lock is declared along with zone->lock because it is
872 : * frequently read in proximity to zone->lock. It's good to
873 : * give them a chance of being in the same cacheline.
874 : *
875 : * Write access to present_pages at runtime should be protected by
876 : * mem_hotplug_begin/done(). Any reader who can't tolerate drift of
877 : * present_pages should use get_online_mems() to get a stable value.
878 : */
879 : atomic_long_t managed_pages;
880 : unsigned long spanned_pages;
881 : unsigned long present_pages;
882 : #if defined(CONFIG_MEMORY_HOTPLUG)
883 : unsigned long present_early_pages;
884 : #endif
885 : #ifdef CONFIG_CMA
886 : unsigned long cma_pages;
887 : #endif
888 :
889 : const char *name;
890 :
891 : #ifdef CONFIG_MEMORY_ISOLATION
892 : /*
893 : * Number of isolated pageblocks. It is used to solve an incorrect
894 : * freepage counting problem caused by racily retrieving the
895 : * migratetype of a pageblock. Protected by zone->lock.
896 : */
897 : unsigned long nr_isolate_pageblock;
898 : #endif
899 :
900 : #ifdef CONFIG_MEMORY_HOTPLUG
901 : /* see spanned/present_pages for more description */
902 : seqlock_t span_seqlock;
903 : #endif
904 :
905 : int initialized;
906 :
907 : /* Write-intensive fields used from the page allocator */
908 : CACHELINE_PADDING(_pad1_);
909 :
910 : /* free areas of different sizes */
911 : struct free_area free_area[MAX_ORDER + 1];
912 :
913 : /* zone flags, see below */
914 : unsigned long flags;
915 :
916 : /* Primarily protects free_area */
917 : spinlock_t lock;
918 :
919 : /* Write-intensive fields used by compaction and vmstats. */
920 : CACHELINE_PADDING(_pad2_);
921 :
922 : /*
923 : * When free pages are below this point, additional steps are taken
924 : * when reading the number of free pages to avoid per-cpu counter
925 : * drift allowing watermarks to be breached
926 : */
927 : unsigned long percpu_drift_mark;
928 :
929 : #if defined CONFIG_COMPACTION || defined CONFIG_CMA
930 : /* pfn where compaction free scanner should start */
931 : unsigned long compact_cached_free_pfn;
932 : /* pfn where compaction migration scanner should start */
933 : unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC];
934 : unsigned long compact_init_migrate_pfn;
935 : unsigned long compact_init_free_pfn;
936 : #endif
937 :
938 : #ifdef CONFIG_COMPACTION
939 : /*
940 : * On compaction failure, 1<<compact_defer_shift compactions
941 : * are skipped before trying again. The number attempted since
942 : * last failure is tracked with compact_considered.
943 : * compact_order_failed is the minimum compaction failed order.
944 : */
945 : unsigned int compact_considered;
946 : unsigned int compact_defer_shift;
947 : int compact_order_failed;
948 : #endif
949 :
950 : #if defined CONFIG_COMPACTION || defined CONFIG_CMA
951 : /* Set to true when the PG_migrate_skip bits should be cleared */
952 : bool compact_blockskip_flush;
953 : #endif
954 :
955 : bool contiguous;
956 :
957 : CACHELINE_PADDING(_pad3_);
958 : /* Zone statistics */
959 : atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
960 : atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
961 : } ____cacheline_internodealigned_in_smp;
962 :
963 : enum pgdat_flags {
964 : PGDAT_DIRTY, /* reclaim scanning has recently found
965 : * many dirty file pages at the tail
966 : * of the LRU.
967 : */
968 : PGDAT_WRITEBACK, /* reclaim scanning has recently found
969 : * many pages under writeback
970 : */
971 : PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
972 : };
973 :
974 : enum zone_flags {
975 : ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks.
976 : * Cleared when kswapd is woken.
977 : */
978 : ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */
979 : };
980 :
981 : static inline unsigned long zone_managed_pages(struct zone *zone)
982 : {
983 44 : return (unsigned long)atomic_long_read(&zone->managed_pages);
984 : }
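/*
 * Sketch (illustrative only, not a kernel API): a simplified check in the
 * spirit of the *_wmark_pages() macros above, comparing a caller-supplied
 * free page count against the boosted minimum watermark. The real check,
 * __zone_watermark_ok(), additionally accounts for lowmem reserves,
 * highatomic reserves and the allocation order.
 */
static inline bool example_above_min_wmark(struct zone *z,
					   unsigned long free_pages)
{
	return free_pages > min_wmark_pages(z);
}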
985 :
986 : static inline unsigned long zone_cma_pages(struct zone *zone)
987 : {
988 : #ifdef CONFIG_CMA
989 : return zone->cma_pages;
990 : #else
991 : return 0;
992 : #endif
993 : }
994 :
995 : static inline unsigned long zone_end_pfn(const struct zone *zone)
996 : {
997 261 : return zone->zone_start_pfn + zone->spanned_pages;
998 : }
999 :
1000 : static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
1001 : {
1002 0 : return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
1003 : }
1004 :
1005 : static inline bool zone_is_initialized(struct zone *zone)
1006 : {
1007 : return zone->initialized;
1008 : }
1009 :
1010 : static inline bool zone_is_empty(struct zone *zone)
1011 : {
1012 : return zone->spanned_pages == 0;
1013 : }
1014 :
1015 : #ifndef BUILD_VDSO32_64
1016 : /*
1017 : * The zone field is never updated after free_area_init_core()
1018 : * sets it, so none of the operations on it need to be atomic.
1019 : */
1020 :
1021 : /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
1022 : #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
1023 : #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
1024 : #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
1025 : #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
1026 : #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
1027 : #define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
1028 : #define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
1029 :
1030 : /*
1031 : * Define the bit shifts to access each section. For non-existent
1032 : * sections we define the shift as 0; that plus a 0 mask ensures
1033 : * the compiler will optimise away references to them.
1034 : */
1035 : #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
1036 : #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
1037 : #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
1038 : #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
1039 : #define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))
1040 :
1041 : /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
1042 : #ifdef NODE_NOT_IN_PAGE_FLAGS
1043 : #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)
1044 : #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \
1045 : SECTIONS_PGOFF : ZONES_PGOFF)
1046 : #else
1047 : #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT)
1048 : #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? \
1049 : NODES_PGOFF : ZONES_PGOFF)
1050 : #endif
1051 :
1052 : #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0))
1053 :
1054 : #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
1055 : #define NODES_MASK ((1UL << NODES_WIDTH) - 1)
1056 : #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
1057 : #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1)
1058 : #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1)
1059 : #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
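/*
 * Sketch (illustrative only): the combined zone ID is extracted from
 * page->flags the same way the individual fields are, by shifting with
 * ZONEID_PGSHIFT and masking with ZONEID_MASK; the in-tree helper
 * page_zone_id() in linux/mm.h does the same. Shown here only to
 * demonstrate the layout described above.
 */
static inline int example_page_zone_id(const struct page *page)
{
	return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
}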
1060 :
1061 : static inline enum zone_type page_zonenum(const struct page *page)
1062 : {
1063 136766 : ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
1064 92002 : return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
1065 : }
1066 :
1067 : static inline enum zone_type folio_zonenum(const struct folio *folio)
1068 : {
1069 0 : return page_zonenum(&folio->page);
1070 : }
1071 :
1072 : #ifdef CONFIG_ZONE_DEVICE
1073 : static inline bool is_zone_device_page(const struct page *page)
1074 : {
1075 : return page_zonenum(page) == ZONE_DEVICE;
1076 : }
1077 :
1078 : /*
1079 : * Consecutive zone device pages should not be merged into the same sgl
1080 : * or bvec segment with other types of pages or if they belong to different
1081 : * pgmaps. Otherwise getting the pgmap of a given segment is not possible
1082 : * without scanning the entire segment. This helper returns true either if
1083 : * both pages are not zone device pages or both pages are zone device pages
1084 : * with the same pgmap.
1085 : */
1086 : static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
1087 : const struct page *b)
1088 : {
1089 : if (is_zone_device_page(a) != is_zone_device_page(b))
1090 : return false;
1091 : if (!is_zone_device_page(a))
1092 : return true;
1093 : return a->pgmap == b->pgmap;
1094 : }
1095 :
1096 : extern void memmap_init_zone_device(struct zone *, unsigned long,
1097 : unsigned long, struct dev_pagemap *);
1098 : #else
1099 : static inline bool is_zone_device_page(const struct page *page)
1100 : {
1101 : return false;
1102 : }
1103 : static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
1104 : const struct page *b)
1105 : {
1106 : return true;
1107 : }
1108 : #endif
1109 :
1110 : static inline bool folio_is_zone_device(const struct folio *folio)
1111 : {
1112 0 : return is_zone_device_page(&folio->page);
1113 : }
1114 :
1115 : static inline bool is_zone_movable_page(const struct page *page)
1116 : {
1117 0 : return page_zonenum(page) == ZONE_MOVABLE;
1118 : }
1119 : #endif
1120 :
1121 : /*
1122 : * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
1123 : * Return true if the [start_pfn, start_pfn + nr_pages) range has a non-empty
1124 : */
1125 : static inline bool zone_intersects(struct zone *zone,
1126 : unsigned long start_pfn, unsigned long nr_pages)
1127 : {
1128 : if (zone_is_empty(zone))
1129 : return false;
1130 : if (start_pfn >= zone_end_pfn(zone) ||
1131 : start_pfn + nr_pages <= zone->zone_start_pfn)
1132 : return false;
1133 :
1134 : return true;
1135 : }
1136 :
1137 : /*
1138 : * The "priority" of VM scanning is how much of the queues we will scan in one
1139 : * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
1140 : * queues ("queue_length >> 12") during an aging round.
1141 : */
1142 : #define DEF_PRIORITY 12
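/*
 * Worked sketch of the priority arithmetic described above (illustrative
 * helper, not a kernel API): at priority p, reclaim considers roughly
 * lru_size >> p pages per pass, so DEF_PRIORITY (12) starts at 1/4096th of a
 * list and each drop in priority doubles the scan target.
 */
static inline unsigned long example_scan_target(unsigned long lru_size,
						int priority)
{
	return lru_size >> priority;
}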
1143 :
1144 : /* Maximum number of zones on a zonelist */
1145 : #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
1146 :
1147 : enum {
1148 : ZONELIST_FALLBACK, /* zonelist with fallback */
1149 : #ifdef CONFIG_NUMA
1150 : /*
1151 : * The NUMA zonelists are doubled because we need zonelists that
1152 : * restrict the allocations to a single node for __GFP_THISNODE.
1153 : */
1154 : ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */
1155 : #endif
1156 : MAX_ZONELISTS
1157 : };
1158 :
1159 : /*
1160 : * This struct contains information about a zone in a zonelist. It is stored
1161 : * here to avoid dereferences into large structures and lookups of tables
1162 : */
1163 : struct zoneref {
1164 : struct zone *zone; /* Pointer to actual zone */
1165 : int zone_idx; /* zone_idx(zoneref->zone) */
1166 : };
1167 :
1168 : /*
1169 : * One allocation request operates on a zonelist. A zonelist
1170 : * is a list of zones, the first one is the 'goal' of the
1171 : * allocation, the other zones are fallback zones, in decreasing
1172 : * priority.
1173 : *
1174 : * To speed the reading of the zonelist, the zonerefs contain the zone index
1175 : * of the entry being read. Helper functions to access information given
1176 : * a struct zoneref are
1177 : *
1178 : * zonelist_zone() - Return the struct zone * for an entry in _zonerefs
1179 : * zonelist_zone_idx() - Return the index of the zone for an entry
1180 : * zonelist_node_idx() - Return the index of the node for an entry
1181 : */
1182 : struct zonelist {
1183 : struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
1184 : };
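/*
 * Sketch (illustrative only, not a kernel API): walk the zoneref array of a
 * zonelist up to the terminating entry whose zone pointer is NULL, counting
 * candidate zones at or below a maximum zone index. The allocator itself
 * uses the next_zones_zonelist()/zonelist_zone() helpers declared further
 * down instead of open-coding this.
 */
static inline int example_count_usable_zones(struct zonelist *zl,
					     enum zone_type highest_zoneidx)
{
	struct zoneref *z;
	int nr = 0;

	for (z = zl->_zonerefs; z->zone; z++)
		if (z->zone_idx <= highest_zoneidx)
			nr++;
	return nr;
}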
1185 :
1186 : /*
1187 : * The array of struct pages for flatmem.
1188 : * It must be declared for SPARSEMEM as well because there are configurations
1189 : * that rely on that.
1190 : */
1191 : extern struct page *mem_map;
1192 :
1193 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1194 : struct deferred_split {
1195 : spinlock_t split_queue_lock;
1196 : struct list_head split_queue;
1197 : unsigned long split_queue_len;
1198 : };
1199 : #endif
1200 :
1201 : #ifdef CONFIG_MEMORY_FAILURE
1202 : /*
1203 : * Per NUMA node memory failure handling statistics.
1204 : */
1205 : struct memory_failure_stats {
1206 : /*
1207 : * Number of raw pages poisoned.
1208 : * Cases not accounted: memory outside kernel control, offline page,
1209 : * arch-specific memory_failure (SGX), hwpoison_filter() filtered
1210 : * error events, and unpoison actions from hwpoison_unpoison.
1211 : */
1212 : unsigned long total;
1213 : /*
1214 : * Recovery results of poisoned raw pages handled by memory_failure,
1215 : * in sync with mf_result.
1216 : * total = ignored + failed + delayed + recovered.
1217 : * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted.
1218 : */
1219 : unsigned long ignored;
1220 : unsigned long failed;
1221 : unsigned long delayed;
1222 : unsigned long recovered;
1223 : };
1224 : #endif
1225 :
1226 : /*
1227 : * On NUMA machines, each NUMA node has a pg_data_t to describe
1228 : * its memory layout. On UMA machines there is a single pglist_data which
1229 : * describes the whole memory.
1230 : *
1231 : * Memory statistics and page replacement data structures are maintained on a
1232 : * per-zone basis.
1233 : */
1234 : typedef struct pglist_data {
1235 : /*
1236 : * node_zones contains just the zones for THIS node. Not all of the
1237 : * zones may be populated, but it is the full list. It is referenced by
1238 : * this node's node_zonelists as well as other nodes' node_zonelists.
1239 : */
1240 : struct zone node_zones[MAX_NR_ZONES];
1241 :
1242 : /*
1243 : * node_zonelists contains references to all zones in all nodes.
1244 : * Generally the first zones will be references to this node's
1245 : * node_zones.
1246 : */
1247 : struct zonelist node_zonelists[MAX_ZONELISTS];
1248 :
1249 : int nr_zones; /* number of populated zones in this node */
1250 : #ifdef CONFIG_FLATMEM /* means !SPARSEMEM */
1251 : struct page *node_mem_map;
1252 : #ifdef CONFIG_PAGE_EXTENSION
1253 : struct page_ext *node_page_ext;
1254 : #endif
1255 : #endif
1256 : #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
1257 : /*
1258 : * Must be held any time you expect node_start_pfn,
1259 : * node_present_pages, node_spanned_pages or nr_zones to stay constant.
1260 : * Also synchronizes pgdat->first_deferred_pfn during deferred page
1261 : * init.
1262 : *
1263 : * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
1264 : * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
1265 : * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
1266 : *
1267 : * Nests above zone->lock and zone->span_seqlock
1268 : */
1269 : spinlock_t node_size_lock;
1270 : #endif
1271 : unsigned long node_start_pfn;
1272 : unsigned long node_present_pages; /* total number of physical pages */
1273 : unsigned long node_spanned_pages; /* total size of physical page
1274 : range, including holes */
1275 : int node_id;
1276 : wait_queue_head_t kswapd_wait;
1277 : wait_queue_head_t pfmemalloc_wait;
1278 :
1279 : /* workqueues for throttling reclaim for different reasons. */
1280 : wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];
1281 :
1282 : atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
1283 : unsigned long nr_reclaim_start; /* nr pages written while throttled
1284 : * when throttling started. */
1285 : #ifdef CONFIG_MEMORY_HOTPLUG
1286 : struct mutex kswapd_lock;
1287 : #endif
1288 : struct task_struct *kswapd; /* Protected by kswapd_lock */
1289 : int kswapd_order;
1290 : enum zone_type kswapd_highest_zoneidx;
1291 :
1292 : int kswapd_failures; /* Number of 'reclaimed == 0' runs */
1293 :
1294 : #ifdef CONFIG_COMPACTION
1295 : int kcompactd_max_order;
1296 : enum zone_type kcompactd_highest_zoneidx;
1297 : wait_queue_head_t kcompactd_wait;
1298 : struct task_struct *kcompactd;
1299 : bool proactive_compact_trigger;
1300 : #endif
1301 : /*
1302 : * This is a per-node reserve of pages that are not available
1303 : * to userspace allocations.
1304 : */
1305 : unsigned long totalreserve_pages;
1306 :
1307 : #ifdef CONFIG_NUMA
1308 : /*
1309 : * node reclaim becomes active if more unmapped pages exist.
1310 : */
1311 : unsigned long min_unmapped_pages;
1312 : unsigned long min_slab_pages;
1313 : #endif /* CONFIG_NUMA */
1314 :
1315 : /* Write-intensive fields used by page reclaim */
1316 : CACHELINE_PADDING(_pad1_);
1317 :
1318 : #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1319 : /*
1320 : * If memory initialisation on large machines is deferred then this
1321 : * is the first PFN that needs to be initialised.
1322 : */
1323 : unsigned long first_deferred_pfn;
1324 : #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1325 :
1326 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1327 : struct deferred_split deferred_split_queue;
1328 : #endif
1329 :
1330 : #ifdef CONFIG_NUMA_BALANCING
1331 : /* start time in ms of current promote rate limit period */
1332 : unsigned int nbp_rl_start;
1333 : /* number of promote candidate pages at start time of current rate limit period */
1334 : unsigned long nbp_rl_nr_cand;
1335 : /* promote threshold in ms */
1336 : unsigned int nbp_threshold;
1337 : /* start time in ms of current promote threshold adjustment period */
1338 : unsigned int nbp_th_start;
1339 : /*
1340 : * number of promote candidate pages at start time of current promote
1341 : * threshold adjustment period
1342 : */
1343 : unsigned long nbp_th_nr_cand;
1344 : #endif
1345 : /* Fields commonly accessed by the page reclaim scanner */
1346 :
1347 : /*
1348 : * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
1349 : *
1350 : * Use mem_cgroup_lruvec() to look up lruvecs.
1351 : */
1352 : struct lruvec __lruvec;
1353 :
1354 : unsigned long flags;
1355 :
1356 : #ifdef CONFIG_LRU_GEN
1357 : /* kswap mm walk data */
1358 : struct lru_gen_mm_walk mm_walk;
1359 : /* lru_gen_folio list */
1360 : struct lru_gen_memcg memcg_lru;
1361 : #endif
1362 :
1363 : CACHELINE_PADDING(_pad2_);
1364 :
1365 : /* Per-node vmstats */
1366 : struct per_cpu_nodestat __percpu *per_cpu_nodestats;
1367 : atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
1368 : #ifdef CONFIG_NUMA
1369 : struct memory_tier __rcu *memtier;
1370 : #endif
1371 : #ifdef CONFIG_MEMORY_FAILURE
1372 : struct memory_failure_stats mf_stats;
1373 : #endif
1374 : } pg_data_t;
1375 :
1376 : #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
1377 : #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
1378 :
1379 : #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
1380 : #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
1381 :
1382 : static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
1383 : {
1384 1 : return pgdat->node_start_pfn + pgdat->node_spanned_pages;
1385 : }
1386 :
1387 : #include <linux/memory_hotplug.h>
1388 :
1389 : void build_all_zonelists(pg_data_t *pgdat);
1390 : void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
1391 : enum zone_type highest_zoneidx);
1392 : bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
1393 : int highest_zoneidx, unsigned int alloc_flags,
1394 : long free_pages);
1395 : bool zone_watermark_ok(struct zone *z, unsigned int order,
1396 : unsigned long mark, int highest_zoneidx,
1397 : unsigned int alloc_flags);
1398 : bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
1399 : unsigned long mark, int highest_zoneidx);
1400 : /*
1401 : * Memory initialization context, used to differentiate memory added by
1402 : * the platform statically or via the memory hotplug interface.
1403 : */
1404 : enum meminit_context {
1405 : MEMINIT_EARLY,
1406 : MEMINIT_HOTPLUG,
1407 : };
1408 :
1409 : extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
1410 : unsigned long size);
1411 :
1412 : extern void lruvec_init(struct lruvec *lruvec);
1413 :
1414 : static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
1415 : {
1416 : #ifdef CONFIG_MEMCG
1417 : return lruvec->pgdat;
1418 : #else
1419 0 : return container_of(lruvec, struct pglist_data, __lruvec);
1420 : #endif
1421 : }
1422 :
1423 : #ifdef CONFIG_HAVE_MEMORYLESS_NODES
1424 : int local_memory_node(int node_id);
1425 : #else
1426 : static inline int local_memory_node(int node_id) { return node_id; };
1427 : #endif
1428 :
1429 : /*
1430 : * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
1431 : */
1432 : #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
1433 :
1434 : #ifdef CONFIG_ZONE_DEVICE
1435 : static inline bool zone_is_zone_device(struct zone *zone)
1436 : {
1437 : return zone_idx(zone) == ZONE_DEVICE;
1438 : }
1439 : #else
1440 : static inline bool zone_is_zone_device(struct zone *zone)
1441 : {
1442 : return false;
1443 : }
1444 : #endif
1445 :
1446 : /*
1447 : * Returns true if a zone has pages managed by the buddy allocator.
1448 : * All the reclaim decisions have to use this function rather than
1449 : * populated_zone(). If the whole zone is reserved then we can easily
1450 : * end up with populated_zone() && !managed_zone().
1451 : */
1452 : static inline bool managed_zone(struct zone *zone)
1453 : {
1454 2 : return zone_managed_pages(zone);
1455 : }
1456 :
1457 : /* Returns true if a zone has memory */
1458 : static inline bool populated_zone(struct zone *zone)
1459 : {
1460 : return zone->present_pages;
1461 : }
1462 :
1463 : #ifdef CONFIG_NUMA
1464 : static inline int zone_to_nid(struct zone *zone)
1465 : {
1466 : return zone->node;
1467 : }
1468 :
1469 : static inline void zone_set_nid(struct zone *zone, int nid)
1470 : {
1471 : zone->node = nid;
1472 : }
1473 : #else
1474 : static inline int zone_to_nid(struct zone *zone)
1475 : {
1476 : return 0;
1477 : }
1478 :
1479 : static inline void zone_set_nid(struct zone *zone, int nid) {}
1480 : #endif
1481 :
1482 : extern int movable_zone;
1483 :
1484 : static inline int is_highmem_idx(enum zone_type idx)
1485 : {
1486 : #ifdef CONFIG_HIGHMEM
1487 : return (idx == ZONE_HIGHMEM ||
1488 : (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM));
1489 : #else
1490 : return 0;
1491 : #endif
1492 : }
1493 :
1494 : /**
1495 : * is_highmem - helper function to quickly check if a struct zone is a
1496 : * highmem zone or not. This is an attempt to keep references
1497 : * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
1498 : * @zone: pointer to struct zone variable
1499 : * Return: 1 for a highmem zone, 0 otherwise
1500 : */
1501 : static inline int is_highmem(struct zone *zone)
1502 : {
1503 4 : return is_highmem_idx(zone_idx(zone));
1504 : }
1505 :
1506 : #ifdef CONFIG_ZONE_DMA
1507 : bool has_managed_dma(void);
1508 : #else
1509 : static inline bool has_managed_dma(void)
1510 : {
1511 : return false;
1512 : }
1513 : #endif
1514 :
1515 : /* These two functions are used to setup the per zone pages min values */
1516 : struct ctl_table;
1517 :
1518 : int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *,
1519 : loff_t *);
1520 : int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
1521 : size_t *, loff_t *);
1522 : extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
1523 : int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
1524 : size_t *, loff_t *);
1525 : int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int,
1526 : void *, size_t *, loff_t *);
1527 : int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
1528 : void *, size_t *, loff_t *);
1529 : int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
1530 : void *, size_t *, loff_t *);
1531 : int numa_zonelist_order_handler(struct ctl_table *, int,
1532 : void *, size_t *, loff_t *);
1533 : extern int percpu_pagelist_high_fraction;
1534 : extern char numa_zonelist_order[];
1535 : #define NUMA_ZONELIST_ORDER_LEN 16
1536 :
1537 : #ifndef CONFIG_NUMA
1538 :
1539 : extern struct pglist_data contig_page_data;
1540 : static inline struct pglist_data *NODE_DATA(int nid)
1541 : {
1542 : return &contig_page_data;
1543 : }
1544 :
1545 : #else /* CONFIG_NUMA */
1546 :
1547 : #include <asm/mmzone.h>
1548 :
1549 : #endif /* !CONFIG_NUMA */
1550 :
1551 : extern struct pglist_data *first_online_pgdat(void);
1552 : extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
1553 : extern struct zone *next_zone(struct zone *zone);
1554 :
1555 : /**
1556 : * for_each_online_pgdat - helper macro to iterate over all online nodes
1557 : * @pgdat: pointer to a pg_data_t variable
1558 : */
1559 : #define for_each_online_pgdat(pgdat) \
1560 : for (pgdat = first_online_pgdat(); \
1561 : pgdat; \
1562 : pgdat = next_online_pgdat(pgdat))
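As a quick illustration of the macro above, here is a hedged sketch (hypothetical function name; a kernel context with <linux/printk.h> available is assumed) that reports the present pages of every online node.

	static void example_dump_nodes(void)
	{
		struct pglist_data *pgdat;

		for_each_online_pgdat(pgdat)
			pr_info("node %d: %lu present pages\n",
				pgdat->node_id, pgdat->node_present_pages);
	}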
1563 : /**
1564 : * for_each_zone - helper macro to iterate over all memory zones
1565 : * @zone: pointer to struct zone variable
1566 : *
1567 : * The user only needs to declare the zone variable; for_each_zone
1568 : * fills it in.
1569 : */
1570 : #define for_each_zone(zone) \
1571 : for (zone = (first_online_pgdat())->node_zones; \
1572 : zone; \
1573 : zone = next_zone(zone))
1574 :
1575 : #define for_each_populated_zone(zone) \
1576 : for (zone = (first_online_pgdat())->node_zones; \
1577 : zone; \
1578 : zone = next_zone(zone)) \
1579 : if (!populated_zone(zone)) \
1580 : ; /* do nothing */ \
1581 : else
1582 :
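For illustration, a hedged usage sketch of for_each_populated_zone() (the name example_dump_zone_sizes is hypothetical; zone_managed_pages() and pr_info() are assumed available in the including kernel context):

	static void example_dump_zone_sizes(void)
	{
		struct zone *zone;

		for_each_populated_zone(zone)
			pr_info("node %d zone %-8s: %lu managed pages\n",
				zone_to_nid(zone), zone->name,
				zone_managed_pages(zone));
	}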
1583 : static inline struct zone *zonelist_zone(struct zoneref *zoneref)
1584 : {
1585 : return zoneref->zone;
1586 : }
1587 :
1588 : static inline int zonelist_zone_idx(struct zoneref *zoneref)
1589 : {
1590 : return zoneref->zone_idx;
1591 : }
1592 :
1593 : static inline int zonelist_node_idx(struct zoneref *zoneref)
1594 : {
1595 : return zone_to_nid(zoneref->zone);
1596 : }
1597 :
1598 : struct zoneref *__next_zones_zonelist(struct zoneref *z,
1599 : enum zone_type highest_zoneidx,
1600 : nodemask_t *nodes);
1601 :
1602 : /**
1603 : * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
1604 : * @z: The cursor used as a starting point for the search
1605 : * @highest_zoneidx: The zone index of the highest zone to return
1606 : * @nodes: An optional nodemask to filter the zonelist with
1607 : *
1608 : * This function returns the next zone at or below a given zone index that is
1609 : * within the allowed nodemask using a cursor as the starting point for the
1610 : * search. The zoneref returned is a cursor that represents the current zone
1611 : * being examined. It should be advanced by one before calling
1612 : * next_zones_zonelist again.
1613 : *
1614 : * Return: the next zone at or below highest_zoneidx within the allowed
1615 : * nodemask using a cursor within a zonelist as a starting point
1616 : */
1617 : static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
1618 : enum zone_type highest_zoneidx,
1619 : nodemask_t *nodes)
1620 : {
1621 3420 : if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
1622 : return z;
1623 0 : return __next_zones_zonelist(z, highest_zoneidx, nodes);
1624 : }
1625 :
1626 : /**
1627 : * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
1628 : * @zonelist: The zonelist to search for a suitable zone
1629 : * @highest_zoneidx: The zone index of the highest zone to return
1630 : * @nodes: An optional nodemask to filter the zonelist with
1631 : *
1632 : * This function returns the first zone at or below a given zone index that is
1633 : * within the allowed nodemask. The zoneref returned is a cursor that can be
1634 : * used to iterate the zonelist with next_zones_zonelist by advancing it by
1635 : * one before calling.
1636 : *
1637 : * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
1638 : * never NULL). This may happen either genuinely, or because the nodemask was
1639 : * concurrently updated by a cpuset modification.
1640 : *
1641 : * Return: Zoneref pointer for the first suitable zone found
1642 : */
1643 : static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
1644 : enum zone_type highest_zoneidx,
1645 : nodemask_t *nodes)
1646 : {
1647 6834 : return next_zones_zonelist(zonelist->_zonerefs,
1648 : highest_zoneidx, nodes);
1649 : }
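The cursor semantics described above can be summed up in a short, hedged sketch (hypothetical helper name; node_zonelist() and gfp_zone() come from <linux/gfp.h>, which is assumed to be included):

	static struct zone *example_preferred_zone(int nid, nodemask_t *nodes)
	{
		struct zonelist *zonelist = node_zonelist(nid, GFP_KERNEL);
		struct zoneref *z = first_zones_zonelist(zonelist,
							 gfp_zone(GFP_KERNEL),
							 nodes);

		/* The zoneref itself is never NULL, but z->zone may be. */
		return z->zone;
	}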
1650 :
1651 : /**
1652 : * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
1653 : * @zone: The current zone in the iterator
1654 : * @z: The current pointer within zonelist->_zonerefs being iterated
1655 : * @zlist: The zonelist being iterated
1656 : * @highidx: The zone index of the highest zone to return
1657 : * @nodemask: Nodemask allowed by the allocator
1658 : *
1659 : * This iterator iterates through all zones at or below a given zone index and
1660 : * within a given nodemask.
1661 : */
1662 : #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
1663 : for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \
1664 : zone; \
1665 : z = next_zones_zonelist(++z, highidx, nodemask), \
1666 : zone = zonelist_zone(z))
1667 :
1668 : #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
1669 : for (zone = z->zone; \
1670 : zone; \
1671 : z = next_zones_zonelist(++z, highidx, nodemask), \
1672 : zone = zonelist_zone(z))
1673 :
1674 :
1675 : /**
1676 : * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
1677 : * @zone: The current zone in the iterator
1678 : * @z: The current pointer within zonelist->zones being iterated
1679 : * @zlist: The zonelist being iterated
1680 : * @highidx: The zone index of the highest zone to return
1681 : *
1682 : * This iterator iterates though all zones at or below a given zone index.
1683 : */
1684 : #define for_each_zone_zonelist(zone, z, zlist, highidx) \
1685 : for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
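A hedged usage sketch of the zonelist iterators (hypothetical function name; node_zonelist() is from <linux/gfp.h> and zone_page_state()/NR_FREE_PAGES from <linux/vmstat.h>, all assumed included): count the free pages visible to an allocation limited to ZONE_NORMAL and below on a given node.

	static unsigned long example_free_pages_below_normal(int nid)
	{
		struct zonelist *zonelist = node_zonelist(nid, GFP_KERNEL);
		struct zoneref *z;
		struct zone *zone;
		unsigned long free = 0;

		for_each_zone_zonelist(zone, z, zonelist, ZONE_NORMAL)
			free += zone_page_state(zone, NR_FREE_PAGES);

		return free;
	}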
1686 :
1687 : /* Whether the 'nodes' are all movable nodes */
1688 : static inline bool movable_only_nodes(nodemask_t *nodes)
1689 : {
1690 : struct zonelist *zonelist;
1691 : struct zoneref *z;
1692 : int nid;
1693 :
1694 : if (nodes_empty(*nodes))
1695 : return false;
1696 :
1697 : /*
1698 : * We can choose an arbitrary node from the nodemask to get a
1699 : * zonelist, as the per-node zonelists are interlinked. We just need to find
1700 : * at least one zone that can satisfy kernel allocations.
1701 : */
1702 : nid = first_node(*nodes);
1703 : zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
1704 : z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes);
1705 : return !z->zone;
1706 : }
1707 :
1708 :
1709 : #ifdef CONFIG_SPARSEMEM
1710 : #include <asm/sparsemem.h>
1711 : #endif
1712 :
1713 : #ifdef CONFIG_FLATMEM
1714 : #define pfn_to_nid(pfn) (0)
1715 : #endif
1716 :
1717 : #ifdef CONFIG_SPARSEMEM
1718 :
1719 : /*
1720 : * PA_SECTION_SHIFT physical address to/from section number
1721 : * PFN_SECTION_SHIFT pfn to/from section number
1722 : */
1723 : #define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
1724 : #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
1725 :
1726 : #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT)
1727 :
1728 : #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
1729 : #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
1730 :
1731 : #define SECTION_BLOCKFLAGS_BITS \
1732 : ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
1733 :
1734 : #if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
1735 : #error Allocator MAX_ORDER exceeds SECTION_SIZE
1736 : #endif
1737 :
1738 : static inline unsigned long pfn_to_section_nr(unsigned long pfn)
1739 : {
1740 : return pfn >> PFN_SECTION_SHIFT;
1741 : }
1742 : static inline unsigned long section_nr_to_pfn(unsigned long sec)
1743 : {
1744 : return sec << PFN_SECTION_SHIFT;
1745 : }
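To make the shift arithmetic concrete, here is a standalone userspace check that mirrors the two helpers above, assuming x86_64's SECTION_SIZE_BITS of 27 and a PAGE_SHIFT of 12 (so a section spans 128 MiB, i.e. 32768 pages); the constants are assumptions, not values taken from this header.

	#include <stdio.h>

	#define PAGE_SHIFT		12	/* assumed: 4 KiB pages */
	#define SECTION_SIZE_BITS	27	/* assumed: x86_64 */
	#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
	#define PAGES_PER_SECTION	(1UL << PFN_SECTION_SHIFT)

	int main(void)
	{
		unsigned long pfn = 0x123456;

		printf("pages per section: %lu\n", PAGES_PER_SECTION);	/* 32768 */
		printf("pfn 0x%lx -> section %lu\n", pfn, pfn >> PFN_SECTION_SHIFT);
		printf("section 5 starts at pfn 0x%lx\n", 5UL << PFN_SECTION_SHIFT);
		return 0;
	}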
1746 :
1747 : #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
1748 : #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
1749 :
1750 : #define SUBSECTION_SHIFT 21
1751 : #define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)
1752 :
1753 : #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
1754 : #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
1755 : #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))
1756 :
1757 : #if SUBSECTION_SHIFT > SECTION_SIZE_BITS
1758 : #error Subsection size exceeds section size
1759 : #else
1760 : #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
1761 : #endif
1762 :
1763 : #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
1764 : #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
1765 :
1766 : struct mem_section_usage {
1767 : #ifdef CONFIG_SPARSEMEM_VMEMMAP
1768 : DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
1769 : #endif
1770 : /* See declaration of similar field in struct zone */
1771 : unsigned long pageblock_flags[0];
1772 : };
1773 :
1774 : void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
1775 :
1776 : struct page;
1777 : struct page_ext;
1778 : struct mem_section {
1779 : /*
1780 : * This is, logically, a pointer to an array of struct
1781 : * pages. However, it is stored with some other magic.
1782 : * (see sparse.c::sparse_init_one_section())
1783 : *
1784 : * Additionally during early boot we encode node id of
1785 : * the location of the section here to guide allocation.
1786 : * (see sparse.c::memory_present())
1787 : *
1788 : * Making it an unsigned long at least forces a cast
1789 : * before it can be misused.
1790 : */
1791 : unsigned long section_mem_map;
1792 :
1793 : struct mem_section_usage *usage;
1794 : #ifdef CONFIG_PAGE_EXTENSION
1795 : /*
1796 : * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
1797 : * section. (see page_ext.h about this.)
1798 : */
1799 : struct page_ext *page_ext;
1800 : unsigned long pad;
1801 : #endif
1802 : /*
1803 : * WARNING: mem_section must be a power-of-2 in size for the
1804 : * calculation and use of SECTION_ROOT_MASK to make sense.
1805 : */
1806 : };
1807 :
1808 : #ifdef CONFIG_SPARSEMEM_EXTREME
1809 : #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
1810 : #else
1811 : #define SECTIONS_PER_ROOT 1
1812 : #endif
1813 :
1814 : #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
1815 : #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
1816 : #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
1817 :
1818 : #ifdef CONFIG_SPARSEMEM_EXTREME
1819 : extern struct mem_section **mem_section;
1820 : #else
1821 : extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
1822 : #endif
1823 :
1824 : static inline unsigned long *section_to_usemap(struct mem_section *ms)
1825 : {
1826 : return ms->usage->pageblock_flags;
1827 : }
1828 :
1829 : static inline struct mem_section *__nr_to_section(unsigned long nr)
1830 : {
1831 : unsigned long root = SECTION_NR_TO_ROOT(nr);
1832 :
1833 : if (unlikely(root >= NR_SECTION_ROOTS))
1834 : return NULL;
1835 :
1836 : #ifdef CONFIG_SPARSEMEM_EXTREME
1837 : if (!mem_section || !mem_section[root])
1838 : return NULL;
1839 : #endif
1840 : return &mem_section[root][nr & SECTION_ROOT_MASK];
1841 : }
1842 : extern size_t mem_section_usage_size(void);
1843 :
1844 : /*
1845 : * We use the lower bits of the mem_map pointer to store
1846 : * a little bit of information. The pointer is calculated
1847 : * as mem_map - section_nr_to_pfn(pnum). The result is
1848 : * aligned to the minimum alignment of the two values:
1849 : * 1. All mem_map arrays are page-aligned.
1850 : * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
1851 : * lowest bits. PFN_SECTION_SHIFT is arch-specific
1852 : * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
1853 : * worst combination is powerpc with 256k pages,
1854 : * which results in PFN_SECTION_SHIFT equal 6.
1855 : * To sum it up, at least 6 bits are available on all architectures.
1856 : * However, we can exceed 6 bits on some other architectures except
1857 : * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
1858 : * with the worst case of 64K pages on arm64) if we make sure the
1859 : * exceeded bit is not applicable to powerpc.
1860 : */
1861 : enum {
1862 : SECTION_MARKED_PRESENT_BIT,
1863 : SECTION_HAS_MEM_MAP_BIT,
1864 : SECTION_IS_ONLINE_BIT,
1865 : SECTION_IS_EARLY_BIT,
1866 : #ifdef CONFIG_ZONE_DEVICE
1867 : SECTION_TAINT_ZONE_DEVICE_BIT,
1868 : #endif
1869 : SECTION_MAP_LAST_BIT,
1870 : };
1871 :
1872 : #define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT)
1873 : #define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT)
1874 : #define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT)
1875 : #define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT)
1876 : #ifdef CONFIG_ZONE_DEVICE
1877 : #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
1878 : #endif
1879 : #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1))
1880 : #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT
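The node-id encoding mentioned in the comment on section_mem_map works by shifting the nid above the flag bits. The following hedged sketch mirrors the early-boot helpers in mm/sparse.c (the example_* names are hypothetical):

	static inline unsigned long example_encode_early_nid(int nid)
	{
		/* nid lives above SECTION_MAP_LAST_BIT; low bits hold flags */
		return (unsigned long)nid << SECTION_NID_SHIFT;
	}

	static inline int example_early_nid(struct mem_section *section)
	{
		return section->section_mem_map >> SECTION_NID_SHIFT;
	}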
1881 :
1882 : static inline struct page *__section_mem_map_addr(struct mem_section *section)
1883 : {
1884 : unsigned long map = section->section_mem_map;
1885 : map &= SECTION_MAP_MASK;
1886 : return (struct page *)map;
1887 : }
1888 :
1889 : static inline int present_section(struct mem_section *section)
1890 : {
1891 : return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
1892 : }
1893 :
1894 : static inline int present_section_nr(unsigned long nr)
1895 : {
1896 : return present_section(__nr_to_section(nr));
1897 : }
1898 :
1899 : static inline int valid_section(struct mem_section *section)
1900 : {
1901 : return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
1902 : }
1903 :
1904 : static inline int early_section(struct mem_section *section)
1905 : {
1906 : return (section && (section->section_mem_map & SECTION_IS_EARLY));
1907 : }
1908 :
1909 : static inline int valid_section_nr(unsigned long nr)
1910 : {
1911 : return valid_section(__nr_to_section(nr));
1912 : }
1913 :
1914 : static inline int online_section(struct mem_section *section)
1915 : {
1916 : return (section && (section->section_mem_map & SECTION_IS_ONLINE));
1917 : }
1918 :
1919 : #ifdef CONFIG_ZONE_DEVICE
1920 : static inline int online_device_section(struct mem_section *section)
1921 : {
1922 : unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
1923 :
1924 : return section && ((section->section_mem_map & flags) == flags);
1925 : }
1926 : #else
1927 : static inline int online_device_section(struct mem_section *section)
1928 : {
1929 : return 0;
1930 : }
1931 : #endif
1932 :
1933 : static inline int online_section_nr(unsigned long nr)
1934 : {
1935 : return online_section(__nr_to_section(nr));
1936 : }
1937 :
1938 : #ifdef CONFIG_MEMORY_HOTPLUG
1939 : void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
1940 : void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
1941 : #endif
1942 :
1943 : static inline struct mem_section *__pfn_to_section(unsigned long pfn)
1944 : {
1945 : return __nr_to_section(pfn_to_section_nr(pfn));
1946 : }
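Putting the section helpers together, this hedged sketch resolves a pfn to its struct page the way the classic (non-vmemmap) SPARSEMEM __pfn_to_page() in asm-generic/memory_model.h does, with an extra defensive validity check; the function name is hypothetical.

	static struct page *example_pfn_to_page(unsigned long pfn)
	{
		struct mem_section *ms = __pfn_to_section(pfn);

		if (!ms || !valid_section(ms))
			return NULL;

		/*
		 * section_mem_map stores mem_map - section_nr_to_pfn(pnum),
		 * so indexing by the full pfn lands on the right struct page.
		 */
		return __section_mem_map_addr(ms) + pfn;
	}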
1947 :
1948 : extern unsigned long __highest_present_section_nr;
1949 :
1950 : static inline int subsection_map_index(unsigned long pfn)
1951 : {
1952 : return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION;
1953 : }
1954 :
1955 : #ifdef CONFIG_SPARSEMEM_VMEMMAP
1956 : static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
1957 : {
1958 : int idx = subsection_map_index(pfn);
1959 :
1960 : return test_bit(idx, ms->usage->subsection_map);
1961 : }
1962 : #else
1963 : static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
1964 : {
1965 : return 1;
1966 : }
1967 : #endif
1968 :
1969 : #ifndef CONFIG_HAVE_ARCH_PFN_VALID
1970 : /**
1971 : * pfn_valid - check if there is a valid memory map entry for a PFN
1972 : * @pfn: the page frame number to check
1973 : *
1974 : * Check if there is a valid memory map entry aka struct page for the @pfn.
1975 : * Note, that availability of the memory map entry does not imply that
1976 : * there is actual usable memory at that @pfn. The struct page may
1977 : * represent a hole or an unusable page frame.
1978 : *
1979 : * Return: 1 for PFNs that have memory map entries and 0 otherwise
1980 : */
1981 : static inline int pfn_valid(unsigned long pfn)
1982 : {
1983 : struct mem_section *ms;
1984 :
1985 : /*
1986 : * Ensure the upper PAGE_SHIFT bits are clear in the
1987 : * pfn. Else it might lead to false positives when
1988 : * some of the upper bits are set, but the lower bits
1989 : * match a valid pfn.
1990 : */
1991 : if (PHYS_PFN(PFN_PHYS(pfn)) != pfn)
1992 : return 0;
1993 :
1994 : if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
1995 : return 0;
1996 : ms = __pfn_to_section(pfn);
1997 : if (!valid_section(ms))
1998 : return 0;
1999 : /*
2000 : * Traditionally early sections always returned pfn_valid() for
2001 : * the entire section-sized span.
2002 : */
2003 : return early_section(ms) || pfn_section_valid(ms, pfn);
2004 : }
2005 : #endif
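A hedged usage sketch of pfn_valid() (hypothetical function name; PageReserved() and pfn_to_page() are assumed available via the headers already included here): walk a pfn range and only dereference struct pages that actually exist.

	static unsigned long example_count_reserved(unsigned long start_pfn,
						    unsigned long end_pfn)
	{
		unsigned long pfn, nr = 0;

		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
			if (!pfn_valid(pfn))
				continue;	/* hole: no memory map entry */
			if (PageReserved(pfn_to_page(pfn)))
				nr++;
		}
		return nr;
	}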
2006 :
2007 : static inline int pfn_in_present_section(unsigned long pfn)
2008 : {
2009 : if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
2010 : return 0;
2011 : return present_section(__pfn_to_section(pfn));
2012 : }
2013 :
2014 : static inline unsigned long next_present_section_nr(unsigned long section_nr)
2015 : {
2016 : while (++section_nr <= __highest_present_section_nr) {
2017 : if (present_section_nr(section_nr))
2018 : return section_nr;
2019 : }
2020 :
2021 : return -1;
2022 : }
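For illustration, a hedged sketch of the iteration pattern mm/sparse.c builds around this helper (the function name is hypothetical; -1 is the sentinel returned when no further section is present):

	static void example_walk_present_sections(void)
	{
		unsigned long nr;

		for (nr = next_present_section_nr(-1UL);
		     nr != (unsigned long)-1;
		     nr = next_present_section_nr(nr))
			pr_info("section %lu is present\n", nr);
	}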
2023 :
2024 : /*
2025 : * These are _only_ used during initialisation, therefore they
2026 : * can use __initdata ... They could have names to indicate
2027 : * this restriction.
2028 : */
2029 : #ifdef CONFIG_NUMA
2030 : #define pfn_to_nid(pfn) \
2031 : ({ \
2032 : unsigned long __pfn_to_nid_pfn = (pfn); \
2033 : page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \
2034 : })
2035 : #else
2036 : #define pfn_to_nid(pfn) (0)
2037 : #endif
2038 :
2039 : void sparse_init(void);
2040 : #else
2041 : #define sparse_init() do {} while (0)
2042 : #define sparse_index_init(_sec, _nid) do {} while (0)
2043 : #define pfn_in_present_section pfn_valid
2044 : #define subsection_map_init(_pfn, _nr_pages) do {} while (0)
2045 : #endif /* CONFIG_SPARSEMEM */
2046 :
2047 : #endif /* !__GENERATING_BOUNDS.H */
2048 : #endif /* !__ASSEMBLY__ */
2049 : #endif /* _LINUX_MMZONE_H */