Line data Source code
1 : /* SPDX-License-Identifier: GPL-2.0-or-later */
2 : /* internal.h: mm/ internal definitions
3 : *
4 : * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
5 : * Written by David Howells (dhowells@redhat.com)
6 : */
7 : #ifndef __MM_INTERNAL_H
8 : #define __MM_INTERNAL_H
9 :
10 : #include <linux/fs.h>
11 : #include <linux/mm.h>
12 : #include <linux/pagemap.h>
13 : #include <linux/rmap.h>
14 : #include <linux/tracepoint-defs.h>
15 :
16 : struct folio_batch;
17 :
18 : /*
19 : * The set of flags that only affect watermark checking and reclaim
20 : * behaviour. This is used by the MM to obey the caller constraints
21 : * about IO, FS and watermark checking while ignoring placement
22 : * hints such as HIGHMEM usage.
23 : */
24 : #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
25 : __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
26 : __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
27 : __GFP_NOLOCKDEP)
28 :
29 : /* The GFP flags allowed during early boot */
30 : #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
31 :
32 : /* Control allocation cpuset and node placement constraints */
33 : #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
34 :
35 : /* Do not use these with a slab allocator */
36 : #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
37 :
38 : /*
39 : * Unlike WARN_ON_ONCE(), no warning will be issued
40 : * when __GFP_NOWARN is specified.
41 : */
42 : #define WARN_ON_ONCE_GFP(cond, gfp) ({ \
43 : static bool __section(".data.once") __warned; \
44 : int __ret_warn_once = !!(cond); \
45 : \
46 : if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
47 : __warned = true; \
48 : WARN_ON(1); \
49 : } \
50 : unlikely(__ret_warn_once); \
51 : })
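/*
 * Hedged, illustrative usage sketch (not part of this header): a hypothetical
 * allocation wrapper that warns once about an impossible request unless the
 * caller opted out with __GFP_NOWARN. The helper name and the order check are
 * assumptions made for the example, loosely modelled on the page allocator.
 */
static inline struct page *example_alloc_checked(gfp_t gfp, unsigned int order)
{
	/* An order beyond MAX_ORDER can never be satisfied; warn unless asked not to. */
	if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
		return NULL;

	return alloc_pages(gfp, order);
}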
52 :
53 : void page_writeback_init(void);
54 :
55 : /*
56 : * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
57 : * its nr_pages_mapped would be 0x400000: choose the COMPOUND_MAPPED bit
58 : * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently
59 : * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
60 : */
61 : #define COMPOUND_MAPPED 0x800000
62 : #define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1)
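/*
 * Illustrative, standalone arithmetic check (plain userspace C, not kernel
 * code): a 16GB hugetlb folio mapped entirely by 4kB PTEs accounts for
 * 2^34 / 2^12 = 2^22 = 0x400000 per-page mappings, so the next-higher bit,
 * 0x800000, is free to serve as COMPOUND_MAPPED.
 */
#include <stdio.h>

int main(void)
{
	unsigned long max_pte_mappings = (1UL << 34) / (1UL << 12); /* 16GB / 4kB */

	printf("max nr_pages_mapped: %#lx\n", max_pte_mappings);  /* 0x400000 */
	printf("COMPOUND_MAPPED:     %#lx\n", 1UL << 23);         /* 0x800000 */
	printf("FOLIO_PAGES_MAPPED:  %#lx\n", (1UL << 23) - 1);   /* 0x7fffff */
	return 0;
}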
63 :
64 : /*
65 : * How many individual pages have an elevated _mapcount. Excludes
66 : * the folio's entire_mapcount.
67 : */
68 : static inline int folio_nr_pages_mapped(struct folio *folio)
69 : {
70 0 : return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
71 : }
72 :
73 : static inline void *folio_raw_mapping(struct folio *folio)
74 : {
75 0 : unsigned long mapping = (unsigned long)folio->mapping;
76 :
77 0 : return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
78 : }
79 :
80 : void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
81 : int nr_throttled);
82 : static inline void acct_reclaim_writeback(struct folio *folio)
83 : {
84 0 : pg_data_t *pgdat = folio_pgdat(folio);
85 0 : int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
86 :
87 0 : if (nr_throttled)
88 0 : __acct_reclaim_writeback(pgdat, folio, nr_throttled);
89 : }
90 :
91 : static inline void wake_throttle_isolated(pg_data_t *pgdat)
92 : {
93 : wait_queue_head_t *wqh;
94 :
95 0 : wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
96 0 : if (waitqueue_active(wqh))
97 0 : wake_up(wqh);
98 : }
99 :
100 : vm_fault_t do_swap_page(struct vm_fault *vmf);
101 : void folio_rotate_reclaimable(struct folio *folio);
102 : bool __folio_end_writeback(struct folio *folio);
103 : void deactivate_file_folio(struct folio *folio);
104 : void folio_activate(struct folio *folio);
105 :
106 : void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
107 : struct vm_area_struct *start_vma, unsigned long floor,
108 : unsigned long ceiling);
109 : void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
110 :
111 : struct zap_details;
112 : void unmap_page_range(struct mmu_gather *tlb,
113 : struct vm_area_struct *vma,
114 : unsigned long addr, unsigned long end,
115 : struct zap_details *details);
116 :
117 : void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
118 : unsigned int order);
119 : void force_page_cache_ra(struct readahead_control *, unsigned long nr);
120 : static inline void force_page_cache_readahead(struct address_space *mapping,
121 : struct file *file, pgoff_t index, unsigned long nr_to_read)
122 : {
123 0 : DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
124 0 : force_page_cache_ra(&ractl, nr_to_read);
125 : }
126 :
127 : unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
128 : pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
129 : unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
130 : pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
131 : void filemap_free_folio(struct address_space *mapping, struct folio *folio);
132 : int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
133 : bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
134 : loff_t end);
135 : long invalidate_inode_page(struct page *page);
136 : unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
137 : pgoff_t start, pgoff_t end, unsigned long *nr_pagevec);
138 :
139 : /**
140 : * folio_evictable - Test whether a folio is evictable.
141 : * @folio: The folio to test.
142 : *
143 : * Test whether @folio is evictable -- i.e., should be placed on
144 : * active/inactive lists vs unevictable list.
145 : *
146 : * Reasons folio might not be evictable:
147 : * 1. folio's mapping marked unevictable
148 : * 2. One of the pages in the folio is part of an mlocked VMA
149 : */
150 0 : static inline bool folio_evictable(struct folio *folio)
151 : {
152 : bool ret;
153 :
154 : /* Prevent address_space of inode and swap cache from being freed */
155 : rcu_read_lock();
156 0 : ret = !mapping_unevictable(folio_mapping(folio)) &&
157 0 : !folio_test_mlocked(folio);
158 : rcu_read_unlock();
159 0 : return ret;
160 : }
161 :
162 : /*
163 : * Turn a non-refcounted page (->_refcount == 0) into refcounted with
164 : * a count of one.
165 : */
166 : static inline void set_page_refcounted(struct page *page)
167 : {
168 : VM_BUG_ON_PAGE(PageTail(page), page);
169 : VM_BUG_ON_PAGE(page_ref_count(page), page);
170 51176 : set_page_count(page, 1);
171 : }
172 :
173 : extern unsigned long highest_memmap_pfn;
174 :
175 : /*
176 : * Maximum number of reclaim retries without progress before the OOM
177 : * killer is considered the only way forward.
178 : */
179 : #define MAX_RECLAIM_RETRIES 16
180 :
181 : /*
182 : * in mm/early_ioremap.c
183 : */
184 : pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
185 : unsigned long size, pgprot_t prot);
186 :
187 : /*
188 : * in mm/vmscan.c:
189 : */
190 : bool isolate_lru_page(struct page *page);
191 : bool folio_isolate_lru(struct folio *folio);
192 : void putback_lru_page(struct page *page);
193 : void folio_putback_lru(struct folio *folio);
194 : extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
195 :
196 : /*
197 : * in mm/rmap.c:
198 : */
199 : pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
200 :
201 : /*
202 : * in mm/page_alloc.c
203 : */
204 :
205 : /*
206 : * Structure for holding the mostly immutable allocation parameters passed
207 : * between functions involved in allocations, including the alloc_pages*
208 : * family of functions.
209 : *
210 : * nodemask, migratetype and highest_zoneidx are initialized only once in
211 : * __alloc_pages() and then never change.
212 : *
213 : * zonelist, preferred_zone and highest_zoneidx are set first in
214 : * __alloc_pages() for the fast path, and might be later changed
215 : * in __alloc_pages_slowpath(). All other functions pass the whole structure
216 : * by a const pointer.
217 : */
218 : struct alloc_context {
219 : struct zonelist *zonelist;
220 : nodemask_t *nodemask;
221 : struct zoneref *preferred_zoneref;
222 : int migratetype;
223 :
224 : /*
225 : * highest_zoneidx represents the highest usable zone index for
226 : * the allocation request. Memory in zones lower than
227 : * highest_zoneidx is protected by
228 : * lowmem_reserve[highest_zoneidx].
229 : *
230 : * highest_zoneidx is also used by reclaim/compaction to limit
231 : * the target zone, since zones higher than this index cannot be
232 : * used for this allocation request.
233 : */
234 : enum zone_type highest_zoneidx;
235 : bool spread_dirty_pages;
236 : };
237 :
238 : /*
239 : * This function returns the order of a free page in the buddy system. In
240 : * general, page_zone(page)->lock must be held by the caller to prevent the
241 : * page from being allocated in parallel and returning garbage as the order.
242 : * If a caller does not hold page_zone(page)->lock, it must guarantee that the
243 : * page cannot be allocated or merged in parallel. Alternatively, it must
244 : * handle invalid values gracefully, and use buddy_order_unsafe() below.
245 : */
246 : static inline unsigned int buddy_order(struct page *page)
247 : {
248 : /* PageBuddy() must be checked by the caller */
249 8214 : return page_private(page);
250 : }
251 :
252 : /*
253 : * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
254 : * PageBuddy() should be checked first by the caller to minimize race window,
255 : * and invalid values must be handled gracefully.
256 : *
257 : * READ_ONCE is used so that if the caller assigns the result into a local
258 : * variable and e.g. tests it for valid range before using, the compiler cannot
259 : * decide to remove the variable and inline the page_private(page) multiple
260 : * times, potentially observing different values in the tests and the actual
261 : * use of the result.
262 : */
263 : #define buddy_order_unsafe(page) READ_ONCE(page_private(page))
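/*
 * Hedged sketch of the intended caller pattern (kernel context, illustrative
 * only; loosely modelled on how the compaction scanner skips over free
 * pages). The helper name is hypothetical: read the racy order once into a
 * local, range-check it, and only then use it.
 */
static unsigned long example_skip_free_block(struct page *page, unsigned long pfn)
{
	if (PageBuddy(page)) {
		const unsigned long freepage_order = buddy_order_unsafe(page);

		/*
		 * Thanks to READ_ONCE() the value checked here is the same
		 * value used below; the compiler cannot reload
		 * page_private() in between.
		 */
		if (freepage_order > 0 && freepage_order < MAX_ORDER)
			return pfn + (1UL << freepage_order) - 1;
	}

	return pfn;
}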
264 :
265 : /*
266 : * This function checks whether a page is free and is the buddy of
267 : * another page. We can coalesce a page and its buddy if
268 : * (a) the buddy is not in a hole (check before calling!) &&
269 : * (b) the buddy is in the buddy system &&
270 : * (c) a page and its buddy have the same order &&
271 : * (d) a page and its buddy are in the same zone.
272 : *
273 : * For recording whether a page is in the buddy system, we set PageBuddy.
274 : * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
275 : *
276 : * For recording page's order, we use page_private(page).
277 : */
278 : static inline bool page_is_buddy(struct page *page, struct page *buddy,
279 : unsigned int order)
280 : {
281 29522 : if (!page_is_guard(buddy) && !PageBuddy(buddy))
282 : return false;
283 :
284 16418 : if (buddy_order(buddy) != order)
285 : return false;
286 :
287 : /*
288 : * zone check is done late to avoid uselessly calculating
289 : * zone/node ids for pages that could never merge.
290 : */
291 17856 : if (page_zone_id(page) != page_zone_id(buddy))
292 : return false;
293 :
294 : VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
295 :
296 : return true;
297 : }
298 :
299 : /*
300 : * Locate the struct page for both the matching buddy in our
301 : * pair (buddy1) and the combined O(n+1) page they form (page).
302 : *
303 : * 1) Any buddy B1 will have an order O twin B2 which satisfies
304 : * the following equation:
305 : * B2 = B1 ^ (1 << O)
306 : * For example, if the starting buddy (buddy2) is #8 its order
307 : * 1 buddy is #10:
308 : * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
309 : *
310 : * 2) Any buddy B will have an order O+1 parent P which
311 : * satisfies the following equation:
312 : * P = B & ~(1 << O)
313 : *
314 : * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
315 : */
316 : static inline unsigned long
317 : __find_buddy_pfn(unsigned long page_pfn, unsigned int order)
318 : {
319 14761 : return page_pfn ^ (1 << order);
320 : }
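/*
 * Illustrative, standalone check of the buddy identities above (plain
 * userspace C, not kernel code): for order 1, pfn 8 and pfn 10 are buddies
 * and combine into the order-2 page starting at pfn 8.
 */
#include <assert.h>
#include <stdio.h>

static unsigned long find_buddy(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);		/* B2 = B1 ^ (1 << O) */
}

static unsigned long find_parent(unsigned long pfn, unsigned int order)
{
	return pfn & ~(1UL << order);		/* P = B & ~(1 << O) */
}

int main(void)
{
	assert(find_buddy(8, 1) == 10);		/* 8 ^ 2 == 10 */
	assert(find_buddy(10, 1) == 8);		/* the relation is symmetric */
	assert(find_parent(10, 1) == 8);	/* combined order-2 page starts at 8 */
	printf("buddy identities hold\n");
	return 0;
}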
321 :
322 : /*
323 : * Find the buddy of @page and validate it.
324 : * @page: The input page
325 : * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
326 : * function is used in the performance-critical __free_one_page().
327 : * @order: The order of the page
328 : * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
329 : * page_to_pfn().
330 : *
331 : * The found buddy can be a non-PageBuddy page, lie outside @page's zone, or
332 : * have a different order than @page, so it must be validated before use.
333 : *
334 : * Return: the found buddy page or NULL if not found.
335 : */
336 : static inline struct page *find_buddy_page_pfn(struct page *page,
337 : unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
338 : {
339 14761 : unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order);
340 : struct page *buddy;
341 :
342 14761 : buddy = page + (__buddy_pfn - pfn);
343 : if (buddy_pfn)
344 9787 : *buddy_pfn = __buddy_pfn;
345 :
346 14761 : if (page_is_buddy(page, buddy, order))
347 : return buddy;
348 : return NULL;
349 : }
350 :
351 : extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
352 : unsigned long end_pfn, struct zone *zone);
353 :
354 : static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
355 : unsigned long end_pfn, struct zone *zone)
356 : {
357 0 : if (zone->contiguous)
358 0 : return pfn_to_page(start_pfn);
359 :
360 0 : return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
361 : }
362 :
363 : extern int __isolate_free_page(struct page *page, unsigned int order);
364 : extern void __putback_isolated_page(struct page *page, unsigned int order,
365 : int mt);
366 : extern void memblock_free_pages(struct page *page, unsigned long pfn,
367 : unsigned int order);
368 : extern void __free_pages_core(struct page *page, unsigned int order);
369 : extern void prep_compound_page(struct page *page, unsigned int order);
370 : extern void post_alloc_hook(struct page *page, unsigned int order,
371 : gfp_t gfp_flags);
372 : extern int user_min_free_kbytes;
373 :
374 : extern void free_unref_page(struct page *page, unsigned int order);
375 : extern void free_unref_page_list(struct list_head *list);
376 :
377 : extern void zone_pcp_reset(struct zone *zone);
378 : extern void zone_pcp_disable(struct zone *zone);
379 : extern void zone_pcp_enable(struct zone *zone);
380 :
381 : extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
382 : phys_addr_t min_addr,
383 : int nid, bool exact_nid);
384 :
385 : int split_free_page(struct page *free_page,
386 : unsigned int order, unsigned long split_pfn_offset);
387 :
388 : /*
389 : * This will have no effect, other than possibly generating a warning, if the
390 : * caller passes in a non-large folio.
391 : */
392 : static inline void folio_set_order(struct folio *folio, unsigned int order)
393 : {
394 : if (WARN_ON_ONCE(!folio_test_large(folio)))
395 : return;
396 :
397 : folio->_folio_order = order;
398 : #ifdef CONFIG_64BIT
399 : /*
400 : * When hugetlb dissolves a folio, we need to clear the tail
401 : * page, rather than setting nr_pages to 1.
402 : */
403 : folio->_folio_nr_pages = order ? 1U << order : 0;
404 : #endif
405 : }
406 :
407 : #if defined CONFIG_COMPACTION || defined CONFIG_CMA
408 :
409 : /*
410 : * in mm/compaction.c
411 : */
412 : /*
413 : * compact_control is used to track pages being migrated and the free pages
414 : * they are being migrated to during memory compaction. The free_pfn starts
415 : * at the end of a zone and migrate_pfn begins at the start. Movable pages
416 : * are moved to the end of a zone during a compaction run and the run
417 : * completes when free_pfn <= migrate_pfn
418 : */
419 : struct compact_control {
420 : struct list_head freepages; /* List of free pages to migrate to */
421 : struct list_head migratepages; /* List of pages being migrated */
422 : unsigned int nr_freepages; /* Number of isolated free pages */
423 : unsigned int nr_migratepages; /* Number of pages to migrate */
424 : unsigned long free_pfn; /* isolate_freepages search base */
425 : /*
426 : * Acts as an in/out parameter to page isolation for migration.
427 : * isolate_migratepages uses it as a search base.
428 : * isolate_migratepages_block will update the value to the next pfn
429 : * after the last isolated one.
430 : */
431 : unsigned long migrate_pfn;
432 : unsigned long fast_start_pfn; /* a pfn to start linear scan from */
433 : struct zone *zone;
434 : unsigned long total_migrate_scanned;
435 : unsigned long total_free_scanned;
436 : unsigned short fast_search_fail;/* failures to use free list searches */
437 : short search_order; /* order to start a fast search at */
438 : const gfp_t gfp_mask; /* gfp mask of a direct compactor */
439 : int order; /* order a direct compactor needs */
440 : int migratetype; /* migratetype of direct compactor */
441 : const unsigned int alloc_flags; /* alloc flags of a direct compactor */
442 : const int highest_zoneidx; /* zone index of a direct compactor */
443 : enum migrate_mode mode; /* Async or sync migration mode */
444 : bool ignore_skip_hint; /* Scan blocks even if marked skip */
445 : bool no_set_skip_hint; /* Don't mark blocks for skipping */
446 : bool ignore_block_suitable; /* Scan blocks considered unsuitable */
447 : bool direct_compaction; /* False from kcompactd or /proc/... */
448 : bool proactive_compaction; /* kcompactd proactive compaction */
449 : bool whole_zone; /* Whole zone should/has been scanned */
450 : bool contended; /* Signal lock contention */
451 : bool finish_pageblock; /* Scan the remainder of a pageblock. Used
452 : * when there are potentially transient
453 : * isolation or migration failures to
454 : * ensure forward progress.
455 : */
456 : bool alloc_contig; /* alloc_contig_range allocation */
457 : };
458 :
459 : /*
460 : * Used in direct compaction when a page should be taken from the freelists
461 : * immediately when one is created during the free path.
462 : */
463 : struct capture_control {
464 : struct compact_control *cc;
465 : struct page *page;
466 : };
467 :
468 : unsigned long
469 : isolate_freepages_range(struct compact_control *cc,
470 : unsigned long start_pfn, unsigned long end_pfn);
471 : int
472 : isolate_migratepages_range(struct compact_control *cc,
473 : unsigned long low_pfn, unsigned long end_pfn);
474 :
475 : int __alloc_contig_migrate_range(struct compact_control *cc,
476 : unsigned long start, unsigned long end);
477 : #endif
478 : int find_suitable_fallback(struct free_area *area, unsigned int order,
479 : int migratetype, bool only_stealable, bool *can_steal);
480 :
481 : /*
482 : * These three helpers classifies VMAs for virtual memory accounting.
483 : */
484 :
485 : /*
486 : * Executable code area - executable, not writable, not stack
487 : */
488 : static inline bool is_exec_mapping(vm_flags_t flags)
489 : {
490 0 : return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
491 : }
492 :
493 : /*
494 : * Stack area - automatically grows in one direction
495 : *
496 : * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
497 : * do_mmap() forbids all other combinations.
498 : */
499 : static inline bool is_stack_mapping(vm_flags_t flags)
500 : {
501 0 : return (flags & VM_STACK) == VM_STACK;
502 : }
503 :
504 : /*
505 : * Data area - private, writable, not stack
506 : */
507 : static inline bool is_data_mapping(vm_flags_t flags)
508 : {
509 0 : return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
510 : }
511 :
512 : /* mm/util.c */
513 : struct anon_vma *folio_anon_vma(struct folio *folio);
514 :
515 : #ifdef CONFIG_MMU
516 : void unmap_mapping_folio(struct folio *folio);
517 : extern long populate_vma_page_range(struct vm_area_struct *vma,
518 : unsigned long start, unsigned long end, int *locked);
519 : extern long faultin_vma_page_range(struct vm_area_struct *vma,
520 : unsigned long start, unsigned long end,
521 : bool write, int *locked);
522 : extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
523 : unsigned long len);
524 : /*
525 : * mlock_vma_folio() and munlock_vma_folio():
526 : * should be called with vma's mmap_lock held for read or write,
527 : * under page table lock for the pte/pmd being added or removed.
528 : *
529 : * mlock is usually called at the end of page_add_*_rmap(), munlock at
530 : * the end of page_remove_rmap(); but new anon folios are managed by
531 : * folio_add_lru_vma() calling mlock_new_folio().
532 : *
533 : * @compound is used to include pmd mappings of THPs, but filter out
534 : * pte mappings of THPs, which cannot be consistently counted: a pte
535 : * mapping of the THP head cannot be distinguished by the page alone.
536 : */
537 : void mlock_folio(struct folio *folio);
538 0 : static inline void mlock_vma_folio(struct folio *folio,
539 : struct vm_area_struct *vma, bool compound)
540 : {
541 : /*
542 : * The VM_SPECIAL check here serves two purposes.
543 : * 1) VM_IO check prevents migration from double-counting during mlock.
544 : * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
545 : * is never left set on a VM_SPECIAL vma, there is an interval while
546 : * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
547 : * still be set while VM_SPECIAL bits are added: so ignore it then.
548 : */
549 0 : if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
550 0 : (compound || !folio_test_large(folio)))
551 0 : mlock_folio(folio);
552 0 : }
553 :
554 : void munlock_folio(struct folio *folio);
555 0 : static inline void munlock_vma_folio(struct folio *folio,
556 : struct vm_area_struct *vma, bool compound)
557 : {
558 0 : if (unlikely(vma->vm_flags & VM_LOCKED) &&
559 0 : (compound || !folio_test_large(folio)))
560 0 : munlock_folio(folio);
561 0 : }
562 :
563 : void mlock_new_folio(struct folio *folio);
564 : bool need_mlock_drain(int cpu);
565 : void mlock_drain_local(void);
566 : void mlock_drain_remote(int cpu);
567 :
568 : extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
569 :
570 : /*
571 : * Return the user virtual address corresponding to the given page offset
572 : * within a vma.
573 : */
574 : static inline unsigned long
575 : vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
576 : struct vm_area_struct *vma)
577 : {
578 : unsigned long address;
579 :
580 0 : if (pgoff >= vma->vm_pgoff) {
581 0 : address = vma->vm_start +
582 0 : ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
583 : /* Check for address beyond vma (or wrapped through 0?) */
584 0 : if (address < vma->vm_start || address >= vma->vm_end)
585 0 : address = -EFAULT;
586 0 : } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
587 : /* Test above avoids possibility of wrap to 0 on 32-bit */
588 0 : address = vma->vm_start;
589 : } else {
590 : address = -EFAULT;
591 : }
592 : return address;
593 : }
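/*
 * Illustrative, standalone arithmetic (plain userspace C, not kernel code),
 * with made-up numbers: a vma starting at 0x700000000000 that maps its file
 * from page offset 0x10; file page offset 0x13 is then three pages into it.
 */
#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12	/* assume 4kB pages for the example */

int main(void)
{
	unsigned long vm_start = 0x700000000000UL;
	unsigned long vm_pgoff = 0x10;
	unsigned long pgoff = 0x13;

	unsigned long address = vm_start +
		((pgoff - vm_pgoff) << EXAMPLE_PAGE_SHIFT);

	printf("address = %#lx\n", address);	/* 0x700000003000 */
	return 0;
}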
594 :
595 : /*
596 : * Return the starting user virtual address of a page within a vma.
597 : * Returns -EFAULT if all of the page is outside the range of vma.
598 : * If page is a compound head, the entire compound page is considered.
599 : */
600 : static inline unsigned long
601 0 : vma_address(struct page *page, struct vm_area_struct *vma)
602 : {
603 : VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
604 0 : return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
605 : }
606 :
607 : /*
608 : * Then at what user virtual address will none of the range be found in vma?
609 : * Assumes that vma_address() already returned a good starting address.
610 : */
611 : static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
612 : {
613 0 : struct vm_area_struct *vma = pvmw->vma;
614 : pgoff_t pgoff;
615 : unsigned long address;
616 :
617 : /* Common case, plus ->pgoff is invalid for KSM */
618 0 : if (pvmw->nr_pages == 1)
619 0 : return pvmw->address + PAGE_SIZE;
620 :
621 0 : pgoff = pvmw->pgoff + pvmw->nr_pages;
622 0 : address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
623 : /* Check for address beyond vma (or wrapped through 0?) */
624 0 : if (address < vma->vm_start || address > vma->vm_end)
625 0 : address = vma->vm_end;
626 : return address;
627 : }
628 :
629 0 : static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
630 : struct file *fpin)
631 : {
632 0 : int flags = vmf->flags;
633 :
634 0 : if (fpin)
635 : return fpin;
636 :
637 : /*
638 : * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
639 : * anything, so we only pin the file and drop the mmap_lock if only
640 : * FAULT_FLAG_ALLOW_RETRY is set and this is the first attempt.
641 : */
642 0 : if (fault_flag_allow_retry_first(flags) &&
643 0 : !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
644 0 : fpin = get_file(vmf->vma->vm_file);
645 0 : mmap_read_unlock(vmf->vma->vm_mm);
646 : }
647 : return fpin;
648 : }
649 : #else /* !CONFIG_MMU */
650 : static inline void unmap_mapping_folio(struct folio *folio) { }
651 : static inline void mlock_new_folio(struct folio *folio) { }
652 : static inline bool need_mlock_drain(int cpu) { return false; }
653 : static inline void mlock_drain_local(void) { }
654 : static inline void mlock_drain_remote(int cpu) { }
655 : static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
656 : {
657 : }
658 : #endif /* !CONFIG_MMU */
659 :
660 : /* Memory initialisation debug and verification */
661 : enum mminit_level {
662 : MMINIT_WARNING,
663 : MMINIT_VERIFY,
664 : MMINIT_TRACE
665 : };
666 :
667 : #ifdef CONFIG_DEBUG_MEMORY_INIT
668 :
669 : extern int mminit_loglevel;
670 :
671 : #define mminit_dprintk(level, prefix, fmt, arg...) \
672 : do { \
673 : if (level < mminit_loglevel) { \
674 : if (level <= MMINIT_WARNING) \
675 : pr_warn("mminit::" prefix " " fmt, ##arg); \
676 : else \
677 : printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
678 : } \
679 : } while (0)
680 :
681 : extern void mminit_verify_pageflags_layout(void);
682 : extern void mminit_verify_zonelist(void);
683 : #else
684 :
685 : static inline void mminit_dprintk(enum mminit_level level,
686 : const char *prefix, const char *fmt, ...)
687 : {
688 : }
689 :
690 : static inline void mminit_verify_pageflags_layout(void)
691 : {
692 : }
693 :
694 : static inline void mminit_verify_zonelist(void)
695 : {
696 : }
697 : #endif /* CONFIG_DEBUG_MEMORY_INIT */
698 :
699 : #define NODE_RECLAIM_NOSCAN -2
700 : #define NODE_RECLAIM_FULL -1
701 : #define NODE_RECLAIM_SOME 0
702 : #define NODE_RECLAIM_SUCCESS 1
703 :
704 : #ifdef CONFIG_NUMA
705 : extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
706 : extern int find_next_best_node(int node, nodemask_t *used_node_mask);
707 : #else
708 : static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
709 : unsigned int order)
710 : {
711 : return NODE_RECLAIM_NOSCAN;
712 : }
713 : static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
714 : {
715 : return NUMA_NO_NODE;
716 : }
717 : #endif
718 :
719 : /*
720 : * mm/memory-failure.c
721 : */
722 : extern int hwpoison_filter(struct page *p);
723 :
724 : extern u32 hwpoison_filter_dev_major;
725 : extern u32 hwpoison_filter_dev_minor;
726 : extern u64 hwpoison_filter_flags_mask;
727 : extern u64 hwpoison_filter_flags_value;
728 : extern u64 hwpoison_filter_memcg;
729 : extern u32 hwpoison_filter_enable;
730 :
731 : extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
732 : unsigned long, unsigned long,
733 : unsigned long, unsigned long);
734 :
735 : extern void set_pageblock_order(void);
736 : unsigned int reclaim_clean_pages_from_list(struct zone *zone,
737 : struct list_head *page_list);
738 : /* The ALLOC_WMARK bits are used as an index to zone->watermark */
739 : #define ALLOC_WMARK_MIN WMARK_MIN
740 : #define ALLOC_WMARK_LOW WMARK_LOW
741 : #define ALLOC_WMARK_HIGH WMARK_HIGH
742 : #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
743 :
744 : /* Mask to get the watermark bits */
745 : #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
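/*
 * Hedged sketch (kernel context, illustrative only): because ALLOC_WMARK_MIN,
 * ALLOC_WMARK_LOW and ALLOC_WMARK_HIGH occupy the low bits, masking the
 * alloc_flags with ALLOC_WMARK_MASK recovers the index into the per-zone
 * watermark array. The helper name is hypothetical.
 */
static inline unsigned long example_requested_watermark(struct zone *zone,
							 unsigned int alloc_flags)
{
	return wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
}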
746 :
747 : /*
748 : * Only MMU archs have async OOM victim reclaim (the oom_reaper), so we
749 : * cannot assume that reduced access to memory reserves is sufficient on
750 : * !MMU.
751 : */
752 : #ifdef CONFIG_MMU
753 : #define ALLOC_OOM 0x08
754 : #else
755 : #define ALLOC_OOM ALLOC_NO_WATERMARKS
756 : #endif
757 :
758 : #define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access
759 : * to 25% of the min watermark or
760 : * 62.5% if __GFP_HIGH is set.
761 : */
762 : #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50%
763 : * of the min watermark.
764 : */
765 : #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
766 : #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
767 : #ifdef CONFIG_ZONE_DMA32
768 : #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
769 : #else
770 : #define ALLOC_NOFRAGMENT 0x0
771 : #endif
772 : #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
773 : #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
774 :
775 : /* Flags that allow allocations below the min watermark. */
776 : #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
777 :
778 : enum ttu_flags;
779 : struct tlbflush_unmap_batch;
780 :
781 :
782 : /*
783 : * only for MM internal work items which do not depend on
784 : * any allocations or locks which might depend on allocations
785 : */
786 : extern struct workqueue_struct *mm_percpu_wq;
787 :
788 : #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
789 : void try_to_unmap_flush(void);
790 : void try_to_unmap_flush_dirty(void);
791 : void flush_tlb_batched_pending(struct mm_struct *mm);
792 : #else
793 : static inline void try_to_unmap_flush(void)
794 : {
795 : }
796 : static inline void try_to_unmap_flush_dirty(void)
797 : {
798 : }
799 : static inline void flush_tlb_batched_pending(struct mm_struct *mm)
800 : {
801 : }
802 : #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
803 :
804 : extern const struct trace_print_flags pageflag_names[];
805 : extern const struct trace_print_flags vmaflag_names[];
806 : extern const struct trace_print_flags gfpflag_names[];
807 :
808 : static inline bool is_migrate_highatomic(enum migratetype migratetype)
809 : {
810 : return migratetype == MIGRATE_HIGHATOMIC;
811 : }
812 :
813 0 : static inline bool is_migrate_highatomic_page(struct page *page)
814 : {
815 0 : return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
816 : }
817 :
818 : void setup_zone_pageset(struct zone *zone);
819 :
820 : struct migration_target_control {
821 : int nid; /* preferred node id */
822 : nodemask_t *nmask;
823 : gfp_t gfp_mask;
824 : };
825 :
826 : /*
827 : * mm/filemap.c
828 : */
829 : size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
830 : struct folio *folio, loff_t fpos, size_t size);
831 :
832 : /*
833 : * mm/vmalloc.c
834 : */
835 : #ifdef CONFIG_MMU
836 : int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
837 : pgprot_t prot, struct page **pages, unsigned int page_shift);
838 : #else
839 : static inline
840 : int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
841 : pgprot_t prot, struct page **pages, unsigned int page_shift)
842 : {
843 : return -EINVAL;
844 : }
845 : #endif
846 :
847 : int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
848 : pgprot_t prot, struct page **pages,
849 : unsigned int page_shift);
850 :
851 : void vunmap_range_noflush(unsigned long start, unsigned long end);
852 :
853 : void __vunmap_range_noflush(unsigned long start, unsigned long end);
854 :
855 : int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
856 : unsigned long addr, int page_nid, int *flags);
857 :
858 : void free_zone_device_page(struct page *page);
859 : int migrate_device_coherent_page(struct page *page);
860 :
861 : /*
862 : * mm/gup.c
863 : */
864 : struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
865 : int __must_check try_grab_page(struct page *page, unsigned int flags);
866 :
867 : enum {
868 : /* mark page accessed */
869 : FOLL_TOUCH = 1 << 16,
870 : /* a retry, previous pass started an IO */
871 : FOLL_TRIED = 1 << 17,
872 : /* we are working on non-current tsk/mm */
873 : FOLL_REMOTE = 1 << 18,
874 : /* pages must be released via unpin_user_page */
875 : FOLL_PIN = 1 << 19,
876 : /* gup_fast: prevent fall-back to slow gup */
877 : FOLL_FAST_ONLY = 1 << 20,
878 : /* allow unlocking the mmap lock */
879 : FOLL_UNLOCKABLE = 1 << 21,
880 : };
881 :
882 : /*
883 : * Indicates whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE for
884 : * pages that are write-protected in the page table, such that the GUP pin
885 : * will remain consistent with the pages mapped into the page tables
886 : * of the MM.
887 : *
888 : * Temporary unmapping of PageAnonExclusive() pages or clearing of
889 : * PageAnonExclusive() has to protect against concurrent GUP:
890 : * * Ordinary GUP: Using the PT lock
891 : * * GUP-fast and fork(): mm->write_protect_seq
892 : * * GUP-fast and KSM or temporary unmapping (swap, migration): see
893 : * page_try_share_anon_rmap()
894 : *
895 : * Must be called with the (sub)page that's actually referenced via the
896 : * page table entry, which might not necessarily be the head page for a
897 : * PTE-mapped THP.
898 : *
899 : * If the vma is NULL, we're coming from the GUP-fast path and might have
900 : * to fallback to the slow path just to lookup the vma.
901 : */
902 0 : static inline bool gup_must_unshare(struct vm_area_struct *vma,
903 : unsigned int flags, struct page *page)
904 : {
905 : /*
906 : * FOLL_WRITE is implicitly handled correctly as the page table entry
907 : * has to be writable -- and if it references (part of) an anonymous
908 : * folio, that part is required to be marked exclusive.
909 : */
910 0 : if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN)
911 : return false;
912 : /*
913 : * Note: PageAnon(page) is stable until the page is actually getting
914 : * freed.
915 : */
916 0 : if (!PageAnon(page)) {
917 : /*
918 : * We only care about R/O long-term pinning: R/O short-term
919 : * pinning does not have the semantics to observe successive
920 : * changes through the process page tables.
921 : */
922 0 : if (!(flags & FOLL_LONGTERM))
923 : return false;
924 :
925 : /* We really need the vma ... */
926 0 : if (!vma)
927 : return true;
928 :
929 : /*
930 : * ... because we only care about writable private ("COW")
931 : * mappings where we have to break COW early.
932 : */
933 0 : return is_cow_mapping(vma->vm_flags);
934 : }
935 :
936 : /* Paired with a memory barrier in page_try_share_anon_rmap(). */
937 : if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
938 : smp_rmb();
939 :
940 : /*
941 : * Note that PageKsm() pages cannot be exclusive, and consequently,
942 : * cannot get pinned.
943 : */
944 0 : return !PageAnonExclusive(page);
945 : }
946 :
947 : extern bool mirrored_kernelcore;
948 :
949 : static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
950 : {
951 : /*
952 : * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty
953 : * enablements, because when soft-dirty is not compiled in,
954 : * VM_SOFTDIRTY is defined as 0x0, so !(vm_flags & VM_SOFTDIRTY)
955 : * would always be true.
956 : */
957 : if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
958 : return false;
959 :
960 : /*
961 : * Soft-dirty is kind of special: its tracking is enabled when the
962 : * VM_SOFTDIRTY vma flag is *not* set.
963 : */
964 : return !(vma->vm_flags & VM_SOFTDIRTY);
965 : }
966 :
967 : /*
968 : * VMA Iterator functions shared between nommu and mmap
969 : */
970 : static inline int vma_iter_prealloc(struct vma_iterator *vmi)
971 : {
972 0 : return mas_preallocate(&vmi->mas, GFP_KERNEL);
973 : }
974 :
975 : static inline void vma_iter_clear(struct vma_iterator *vmi,
976 : unsigned long start, unsigned long end)
977 : {
978 0 : mas_set_range(&vmi->mas, start, end - 1);
979 0 : mas_store_prealloc(&vmi->mas, NULL);
980 : }
981 :
982 : static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
983 : {
984 0 : return mas_walk(&vmi->mas);
985 : }
986 :
987 : /* Store a VMA with preallocated memory */
988 0 : static inline void vma_iter_store(struct vma_iterator *vmi,
989 : struct vm_area_struct *vma)
990 : {
991 :
992 : #if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
993 : if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) {
994 : printk("%lu > %lu\n", vmi->mas.index, vma->vm_start);
995 : printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end);
996 : printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last);
997 : mt_dump(vmi->mas.tree);
998 : }
999 : if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) {
1000 : printk("%lu < %lu\n", vmi->mas.last, vma->vm_start);
1001 : printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end);
1002 : printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last);
1003 : mt_dump(vmi->mas.tree);
1004 : }
1005 : #endif
1006 :
1007 0 : if (vmi->mas.node != MAS_START &&
1008 0 : ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
1009 : vma_iter_invalidate(vmi);
1010 :
1011 0 : vmi->mas.index = vma->vm_start;
1012 0 : vmi->mas.last = vma->vm_end - 1;
1013 0 : mas_store_prealloc(&vmi->mas, vma);
1014 0 : }
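/*
 * Hedged usage sketch (kernel context, illustrative only): the expected
 * preallocate-then-store pattern, so the store itself cannot fail. The
 * helper name is hypothetical; the caller is assumed to hold mmap_lock
 * for writing.
 */
static inline int example_insert_vma(struct mm_struct *mm,
				     struct vm_area_struct *vma)
{
	VMA_ITERATOR(vmi, mm, vma->vm_start);

	if (vma_iter_prealloc(&vmi))
		return -ENOMEM;

	vma_iter_store(&vmi, vma);
	return 0;
}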
1015 :
1016 0 : static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
1017 : struct vm_area_struct *vma, gfp_t gfp)
1018 : {
1019 0 : if (vmi->mas.node != MAS_START &&
1020 0 : ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
1021 : vma_iter_invalidate(vmi);
1022 :
1023 0 : vmi->mas.index = vma->vm_start;
1024 0 : vmi->mas.last = vma->vm_end - 1;
1025 0 : mas_store_gfp(&vmi->mas, vma, gfp);
1026 0 : if (unlikely(mas_is_err(&vmi->mas)))
1027 : return -ENOMEM;
1028 :
1029 0 : return 0;
1030 : }
1031 :
1032 : /*
1033 : * VMA lock generalization
1034 : */
1035 : struct vma_prepare {
1036 : struct vm_area_struct *vma;
1037 : struct vm_area_struct *adj_next;
1038 : struct file *file;
1039 : struct address_space *mapping;
1040 : struct anon_vma *anon_vma;
1041 : struct vm_area_struct *insert;
1042 : struct vm_area_struct *remove;
1043 : struct vm_area_struct *remove2;
1044 : };
1045 : #endif /* __MM_INTERNAL_H */