Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : #include <linux/kernel.h>
3 : #include <linux/errno.h>
4 : #include <linux/err.h>
5 : #include <linux/spinlock.h>
6 :
7 : #include <linux/mm.h>
8 : #include <linux/memremap.h>
9 : #include <linux/pagemap.h>
10 : #include <linux/rmap.h>
11 : #include <linux/swap.h>
12 : #include <linux/swapops.h>
13 : #include <linux/secretmem.h>
14 :
15 : #include <linux/sched/signal.h>
16 : #include <linux/rwsem.h>
17 : #include <linux/hugetlb.h>
18 : #include <linux/migrate.h>
19 : #include <linux/mm_inline.h>
20 : #include <linux/sched/mm.h>
21 :
22 : #include <asm/mmu_context.h>
23 : #include <asm/tlbflush.h>
24 :
25 : #include "internal.h"
26 :
27 : struct follow_page_context {
28 : struct dev_pagemap *pgmap;
29 : unsigned int page_mask;
30 : };
31 :
32 : static inline void sanity_check_pinned_pages(struct page **pages,
33 : unsigned long npages)
34 : {
35 : if (!IS_ENABLED(CONFIG_DEBUG_VM))
36 : return;
37 :
38 : /*
39 : * We only pin anonymous pages if they are exclusive. Once pinned, they
40 : * can no longer be turned possibly-shared, and PageAnonExclusive() will
41 : * stick around until the page is freed.
42 : *
43 : * We'd like to verify that our pinned anonymous pages are still mapped
44 : * exclusively. The issue with anon THP is that we don't know how
45 : * they are/were mapped when pinning them. However, for anon
46 : * THP we can assume that either the given page (PTE-mapped THP) or
47 : * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
48 : * neither is the case, there is certainly something wrong.
49 : */
50 : for (; npages; npages--, pages++) {
51 : struct page *page = *pages;
52 : struct folio *folio = page_folio(page);
53 :
54 : if (!folio_test_anon(folio))
55 : continue;
56 : if (!folio_test_large(folio) || folio_test_hugetlb(folio))
57 : VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
58 : else
59 : /* Either a PTE-mapped or a PMD-mapped THP. */
60 : VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
61 : !PageAnonExclusive(page), page);
62 : }
63 : }
64 :
65 : /*
66 : * Return the folio with ref appropriately incremented,
67 : * or NULL if that failed.
68 : */
69 0 : static inline struct folio *try_get_folio(struct page *page, int refs)
70 : {
71 : struct folio *folio;
72 :
73 : retry:
74 0 : folio = page_folio(page);
75 0 : if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
76 : return NULL;
77 0 : if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
78 : return NULL;
79 :
80 : /*
81 : * At this point we have a stable reference to the folio; but it
82 : * could be that between calling page_folio() and the refcount
83 : * increment, the folio was split, in which case we'd end up
84 : * holding a reference on a folio that has nothing to do with the page
85 : * we were given anymore.
86 : * So now that the folio is stable, recheck that the page still
87 : * belongs to this folio.
88 : */
89 0 : if (unlikely(page_folio(page) != folio)) {
90 0 : if (!put_devmap_managed_page_refs(&folio->page, refs))
91 : folio_put_refs(folio, refs);
92 : goto retry;
93 : }
94 :
95 : return folio;
96 : }
97 :
98 : /**
99 : * try_grab_folio() - Attempt to get or pin a folio.
100 : * @page: pointer to page to be grabbed
101 : * @refs: the value to (effectively) add to the folio's refcount
102 : * @flags: gup flags: these are the FOLL_* flag values.
103 : *
104 : * "grab" names in this file mean, "look at flags to decide whether to use
105 : * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount."
106 : *
107 : * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
108 : * same time. (That's true throughout the get_user_pages*() and
109 : * pin_user_pages*() APIs.) Cases:
110 : *
111 : * FOLL_GET: folio's refcount will be incremented by @refs.
112 : *
113 : * FOLL_PIN on large folios: folio's refcount will be incremented by
114 : * @refs, and its pincount will be incremented by @refs.
115 : *
116 : * FOLL_PIN on single-page folios: folio's refcount will be incremented by
117 : * @refs * GUP_PIN_COUNTING_BIAS.
118 : *
119 : * Return: The folio containing @page (with refcount appropriately
120 : * incremented) for success, or NULL upon failure. If neither FOLL_GET
121 : * nor FOLL_PIN was set, that's considered failure, and furthermore,
122 : * a likely bug in the caller, so a warning is also emitted.
123 : */
124 0 : struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
125 : {
126 : if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
127 : return NULL;
128 :
129 0 : if (flags & FOLL_GET)
130 0 : return try_get_folio(page, refs);
131 0 : else if (flags & FOLL_PIN) {
132 : struct folio *folio;
133 :
134 : /*
135 : * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in the
136 : * right zone, so fail and let the caller fall back to the slow
137 : * path.
138 : */
139 0 : if (unlikely((flags & FOLL_LONGTERM) &&
140 : !is_longterm_pinnable_page(page)))
141 : return NULL;
142 :
143 : /*
144 : * CAUTION: Don't use compound_head() on the page before this
145 : * point, the result won't be stable.
146 : */
147 0 : folio = try_get_folio(page, refs);
148 0 : if (!folio)
149 : return NULL;
150 :
151 : /*
152 : * When pinning a large folio, use an exact count to track it.
153 : *
154 : * However, be sure to *also* increment the normal folio
155 : * refcount field at least once, so that the folio really
156 : * is pinned. That's why the refcount from the earlier
157 : * try_get_folio() is left intact.
158 : */
159 0 : if (folio_test_large(folio))
160 0 : atomic_add(refs, &folio->_pincount);
161 : else
162 0 : folio_ref_add(folio,
163 0 : refs * (GUP_PIN_COUNTING_BIAS - 1));
164 : /*
165 : * Adjust the pincount before re-checking the PTE for changes.
166 : * This is essentially a smp_mb() and is paired with a memory
167 : * barrier in page_try_share_anon_rmap().
168 : */
169 0 : smp_mb__after_atomic();
170 :
171 0 : node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
172 :
173 0 : return folio;
174 : }
175 :
176 0 : WARN_ON_ONCE(1);
177 : return NULL;
178 : }
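/*
 * Illustrative sketch, not part of mm/gup.c: the refcount delta a successful
 * grab applies, restating the try_grab_folio() rules documented above.
 * grab_refcount_delta() is a hypothetical helper; it only assumes the FOLL_*
 * flags and GUP_PIN_COUNTING_BIAS already visible in this file.
 */
static inline int grab_refcount_delta(bool large_folio, int refs,
				      unsigned int flags)
{
	if (flags & FOLL_GET)
		return refs;
	if (flags & FOLL_PIN) {
		/*
		 * Large folios track pins separately in _pincount, so the
		 * refcount only grows by @refs; single-page folios encode
		 * pins in the refcount itself via the counting bias.
		 */
		return large_folio ? refs : refs * GUP_PIN_COUNTING_BIAS;
	}
	return 0;	/* neither flag set: no reference is taken */
}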
179 :
180 0 : static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
181 : {
182 0 : if (flags & FOLL_PIN) {
183 0 : node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
184 0 : if (folio_test_large(folio))
185 0 : atomic_sub(refs, &folio->_pincount);
186 : else
187 0 : refs *= GUP_PIN_COUNTING_BIAS;
188 : }
189 :
190 0 : if (!put_devmap_managed_page_refs(&folio->page, refs))
191 : folio_put_refs(folio, refs);
192 0 : }
193 :
194 : /**
195 : * try_grab_page() - elevate a page's refcount by a flag-dependent amount
196 : * @page: pointer to page to be grabbed
197 : * @flags: gup flags: these are the FOLL_* flag values.
198 : *
199 : * This might not do anything at all, depending on the flags argument.
200 : *
201 : * "grab" names in this file mean, "look at flags to decide whether to use
202 : * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount."
203 : *
204 : * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
205 : * time. Cases: please see the try_grab_folio() documentation, with
206 : * "refs=1".
207 : *
208 : * Return: 0 for success, or if no action was required (if neither FOLL_PIN
209 : * nor FOLL_GET was set, nothing is done). A negative error code for failure:
210 : *
211 : * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not
212 : * be grabbed. -EREMOTEIO is returned for a PCI P2PDMA page when FOLL_PCI_P2PDMA is not set.
213 : */
214 0 : int __must_check try_grab_page(struct page *page, unsigned int flags)
215 : {
216 0 : struct folio *folio = page_folio(page);
217 :
218 0 : if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
219 : return -ENOMEM;
220 :
221 : if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
222 : return -EREMOTEIO;
223 :
224 0 : if (flags & FOLL_GET)
225 : folio_ref_inc(folio);
226 0 : else if (flags & FOLL_PIN) {
227 : /*
228 : * Similar to try_grab_folio(): be sure to *also*
229 : * increment the normal page refcount field at least once,
230 : * so that the page really is pinned.
231 : */
232 0 : if (folio_test_large(folio)) {
233 0 : folio_ref_add(folio, 1);
234 0 : atomic_add(1, &folio->_pincount);
235 : } else {
236 : folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
237 : }
238 :
239 0 : node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
240 : }
241 :
242 : return 0;
243 : }
244 :
245 : /**
246 : * unpin_user_page() - release a dma-pinned page
247 : * @page: pointer to page to be released
248 : *
249 : * Pages that were pinned via pin_user_pages*() must be released via either
250 : * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
251 : * that such pages can be separately tracked and uniquely handled. In
252 : * particular, interactions with RDMA and filesystems need special handling.
253 : */
254 0 : void unpin_user_page(struct page *page)
255 : {
256 0 : sanity_check_pinned_pages(&page, 1);
257 0 : gup_put_folio(page_folio(page), 1, FOLL_PIN);
258 0 : }
259 : EXPORT_SYMBOL(unpin_user_page);
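/*
 * Illustrative sketch, not part of mm/gup.c: the basic pairing that
 * unpin_user_page() expects. A hypothetical caller pins a single page via
 * pin_user_pages_fast(), uses it, and releases it with unpin_user_page()
 * rather than put_page().
 */
static int example_pin_one_page(unsigned long uaddr)
{
	struct page *page;
	int ret;

	ret = pin_user_pages_fast(uaddr, 1, FOLL_WRITE, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... access the page, e.g. via kmap_local_page(), or set up DMA ... */

	unpin_user_page(page);
	return 0;
}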
260 :
261 0 : static inline struct folio *gup_folio_range_next(struct page *start,
262 : unsigned long npages, unsigned long i, unsigned int *ntails)
263 : {
264 0 : struct page *next = nth_page(start, i);
265 0 : struct folio *folio = page_folio(next);
266 0 : unsigned int nr = 1;
267 :
268 0 : if (folio_test_large(folio))
269 0 : nr = min_t(unsigned int, npages - i,
270 : folio_nr_pages(folio) - folio_page_idx(folio, next));
271 :
272 0 : *ntails = nr;
273 0 : return folio;
274 : }
275 :
276 0 : static inline struct folio *gup_folio_next(struct page **list,
277 : unsigned long npages, unsigned long i, unsigned int *ntails)
278 : {
279 0 : struct folio *folio = page_folio(list[i]);
280 : unsigned int nr;
281 :
282 0 : for (nr = i + 1; nr < npages; nr++) {
283 0 : if (page_folio(list[nr]) != folio)
284 : break;
285 : }
286 :
287 0 : *ntails = nr - i;
288 0 : return folio;
289 : }
290 :
291 : /**
292 : * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
293 : * @pages: array of pages to be maybe marked dirty, and definitely released.
294 : * @npages: number of pages in the @pages array.
295 : * @make_dirty: whether to mark the pages dirty
296 : *
297 : * "gup-pinned page" refers to a page that has had one of the get_user_pages()
298 : * variants called on that page.
299 : *
300 : * For each page in the @pages array, make that page (or its head page, if a
301 : * compound page) dirty, if @make_dirty is true, and if the page was previously
302 : * listed as clean. In any case, releases all pages using unpin_user_page(),
303 : * possibly via unpin_user_pages(), for the non-dirty case.
304 : *
305 : * Please see the unpin_user_page() documentation for details.
306 : *
307 : * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
308 : * required, then the caller should a) verify that this is really correct,
309 : * because _lock() is usually required, and b) hand code it:
310 : * set_page_dirty(), unpin_user_page().
311 : *
312 : */
313 0 : void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
314 : bool make_dirty)
315 : {
316 : unsigned long i;
317 : struct folio *folio;
318 : unsigned int nr;
319 :
320 0 : if (!make_dirty) {
321 0 : unpin_user_pages(pages, npages);
322 0 : return;
323 : }
324 :
325 : sanity_check_pinned_pages(pages, npages);
326 0 : for (i = 0; i < npages; i += nr) {
327 0 : folio = gup_folio_next(pages, npages, i, &nr);
328 : /*
329 : * Checking PageDirty at this point may race with
330 : * clear_page_dirty_for_io(), but that's OK. Two key
331 : * cases:
332 : *
333 : * 1) This code sees the page as already dirty, so it
334 : * skips the call to set_page_dirty(). That could happen
335 : * because clear_page_dirty_for_io() called
336 : * page_mkclean(), followed by set_page_dirty().
337 : * However, now the page is going to get written back,
338 : * which meets the original intention of setting it
339 : * dirty, so all is well: clear_page_dirty_for_io() goes
340 : * on to call TestClearPageDirty(), and write the page
341 : * back.
342 : *
343 : * 2) This code sees the page as clean, so it calls
344 : * set_page_dirty(). The page stays dirty, despite being
345 : * written back, so it gets written back again in the
346 : * next writeback cycle. This is harmless.
347 : */
348 0 : if (!folio_test_dirty(folio)) {
349 0 : folio_lock(folio);
350 0 : folio_mark_dirty(folio);
351 0 : folio_unlock(folio);
352 : }
353 0 : gup_put_folio(folio, nr, FOLL_PIN);
354 : }
355 : }
356 : EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
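/*
 * Illustrative sketch, not part of mm/gup.c: the read-into-user-buffer
 * pattern that unpin_user_pages_dirty_lock() supports. A hypothetical driver
 * pins the destination buffer, lets the device write into it, then drops the
 * pins and marks the pages dirty in a single call.
 */
static int example_fill_user_buffer(unsigned long uaddr, int npages,
				    struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
	if (pinned < 0)
		return pinned;
	if (pinned != npages) {
		/* Partial pin: release what was pinned and let the caller retry. */
		unpin_user_pages(pages, pinned);
		return -EFAULT;
	}

	/* ... let the device DMA into pages[0 .. npages-1] ... */

	/* Data was written behind the kernel's back, so mark the pages dirty. */
	unpin_user_pages_dirty_lock(pages, npages, true);
	return 0;
}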
357 :
358 : /**
359 : * unpin_user_page_range_dirty_lock() - release and optionally dirty
360 : * gup-pinned page range
361 : *
362 : * @page: the starting page of a range maybe marked dirty, and definitely released.
363 : * @npages: number of consecutive pages to release.
364 : * @make_dirty: whether to mark the pages dirty
365 : *
366 : * "gup-pinned page range" refers to a range of pages that has had one of the
367 : * pin_user_pages() variants called on that page.
368 : *
369 : * For the page range defined by [page .. page+npages-1], make that range (or
370 : * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
371 : * page range was previously listed as clean.
372 : *
373 : * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
374 : * required, then the caller should a) verify that this is really correct,
375 : * because _lock() is usually required, and b) hand code it:
376 : * set_page_dirty(), unpin_user_page().
377 : *
378 : */
379 0 : void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
380 : bool make_dirty)
381 : {
382 : unsigned long i;
383 : struct folio *folio;
384 : unsigned int nr;
385 :
386 0 : for (i = 0; i < npages; i += nr) {
387 0 : folio = gup_folio_range_next(page, npages, i, &nr);
388 0 : if (make_dirty && !folio_test_dirty(folio)) {
389 0 : folio_lock(folio);
390 0 : folio_mark_dirty(folio);
391 0 : folio_unlock(folio);
392 : }
393 0 : gup_put_folio(folio, nr, FOLL_PIN);
394 : }
395 0 : }
396 : EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
397 :
398 : static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
399 : {
400 : unsigned long i;
401 : struct folio *folio;
402 : unsigned int nr;
403 :
404 : /*
405 : * Don't perform any sanity checks because we might have raced with
406 : * fork() and some anonymous pages might now actually be shared --
407 : * which is why we're unpinning after all.
408 : */
409 : for (i = 0; i < npages; i += nr) {
410 : folio = gup_folio_next(pages, npages, i, &nr);
411 : gup_put_folio(folio, nr, FOLL_PIN);
412 : }
413 : }
414 :
415 : /**
416 : * unpin_user_pages() - release an array of gup-pinned pages.
417 : * @pages: array of pages to be released.
418 : * @npages: number of pages in the @pages array.
419 : *
420 : * For each page in the @pages array, release the page using unpin_user_page().
421 : *
422 : * Please see the unpin_user_page() documentation for details.
423 : */
424 0 : void unpin_user_pages(struct page **pages, unsigned long npages)
425 : {
426 : unsigned long i;
427 : struct folio *folio;
428 : unsigned int nr;
429 :
430 : /*
431 : * If this WARN_ON() fires, then the system *might* be leaking pages (by
432 : * leaving them pinned), but probably not. More likely, gup/pup returned
433 : * a hard -ERRNO error to the caller, who erroneously passed it here.
434 : */
435 0 : if (WARN_ON(IS_ERR_VALUE(npages)))
436 0 : return;
437 :
438 : sanity_check_pinned_pages(pages, npages);
439 0 : for (i = 0; i < npages; i += nr) {
440 0 : folio = gup_folio_next(pages, npages, i, &nr);
441 0 : gup_put_folio(folio, nr, FOLL_PIN);
442 : }
443 : }
444 : EXPORT_SYMBOL(unpin_user_pages);
445 :
446 : /*
447 : * Set the MMF_HAS_PINNED flag if not set yet; once set, it stays for the mm's
448 : * lifetime. Avoid setting the bit unless necessary, or it might cause write
449 : * cache bouncing on large SMP machines for concurrent pinned gups.
450 : */
451 0 : static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
452 : {
453 0 : if (!test_bit(MMF_HAS_PINNED, mm_flags))
454 : set_bit(MMF_HAS_PINNED, mm_flags);
455 0 : }
456 :
457 : #ifdef CONFIG_MMU
458 : static struct page *no_page_table(struct vm_area_struct *vma,
459 : unsigned int flags)
460 : {
461 : /*
462 : * When core dumping an enormous anonymous area that nobody
463 : * has touched so far, we don't want to allocate unnecessary pages or
464 : * page tables. Return error instead of NULL to skip handle_mm_fault,
465 : * then get_dump_page() will return NULL to leave a hole in the dump.
466 : * But we can only make this optimization where a hole would surely
467 : * be zero-filled if handle_mm_fault() actually did handle it.
468 : */
469 0 : if ((flags & FOLL_DUMP) &&
470 0 : (vma_is_anonymous(vma) || !vma->vm_ops->fault))
471 : return ERR_PTR(-EFAULT);
472 : return NULL;
473 : }
474 :
475 0 : static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
476 : pte_t *pte, unsigned int flags)
477 : {
478 0 : if (flags & FOLL_TOUCH) {
479 0 : pte_t entry = *pte;
480 :
481 0 : if (flags & FOLL_WRITE)
482 : entry = pte_mkdirty(entry);
483 0 : entry = pte_mkyoung(entry);
484 :
485 0 : if (!pte_same(*pte, entry)) {
486 0 : set_pte_at(vma->vm_mm, address, pte, entry);
487 : update_mmu_cache(vma, address, pte);
488 : }
489 : }
490 :
491 : /* Proper page table entry exists, but no corresponding struct page */
492 0 : return -EEXIST;
493 : }
494 :
495 : /* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
496 0 : static inline bool can_follow_write_pte(pte_t pte, struct page *page,
497 : struct vm_area_struct *vma,
498 : unsigned int flags)
499 : {
500 : /* If the pte is writable, we can write to the page. */
501 0 : if (pte_write(pte))
502 : return true;
503 :
504 : /* Maybe FOLL_FORCE is set to override it? */
505 0 : if (!(flags & FOLL_FORCE))
506 : return false;
507 :
508 : /* But FOLL_FORCE has no effect on shared mappings */
509 0 : if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
510 : return false;
511 :
512 : /* ... or read-only private ones */
513 0 : if (!(vma->vm_flags & VM_MAYWRITE))
514 : return false;
515 :
516 : /* ... or already writable ones that just need to take a write fault */
517 0 : if (vma->vm_flags & VM_WRITE)
518 : return false;
519 :
520 : /*
521 : * See can_change_pte_writable(): we broke COW and could map the page
522 : * writable if we have an exclusive anonymous page ...
523 : */
524 0 : if (!page || !PageAnon(page) || !PageAnonExclusive(page))
525 : return false;
526 :
527 : /* ... and a write-fault isn't required for other reasons. */
528 : if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
529 : return false;
530 : return !userfaultfd_pte_wp(vma, pte);
531 : }
532 :
533 0 : static struct page *follow_page_pte(struct vm_area_struct *vma,
534 : unsigned long address, pmd_t *pmd, unsigned int flags,
535 : struct dev_pagemap **pgmap)
536 : {
537 0 : struct mm_struct *mm = vma->vm_mm;
538 : struct page *page;
539 : spinlock_t *ptl;
540 : pte_t *ptep, pte;
541 : int ret;
542 :
543 : /* FOLL_GET and FOLL_PIN are mutually exclusive. */
544 0 : if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
545 : (FOLL_PIN | FOLL_GET)))
546 : return ERR_PTR(-EINVAL);
547 0 : if (unlikely(pmd_bad(*pmd)))
548 : return no_page_table(vma, flags);
549 :
550 0 : ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
551 0 : pte = *ptep;
552 0 : if (!pte_present(pte))
553 : goto no_page;
554 0 : if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
555 : goto no_page;
556 :
557 0 : page = vm_normal_page(vma, address, pte);
558 :
559 : /*
560 : * We only care about anon pages in can_follow_write_pte() and don't
561 : * have to worry about pte_devmap() because they are never anon.
562 : */
563 0 : if ((flags & FOLL_WRITE) &&
564 0 : !can_follow_write_pte(pte, page, vma, flags)) {
565 : page = NULL;
566 : goto out;
567 : }
568 :
569 : if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
570 : /*
571 : * Only return device mapping pages in the FOLL_GET or FOLL_PIN
572 : * case since they are only valid while holding the pgmap
573 : * reference.
574 : */
575 : *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
576 : if (*pgmap)
577 : page = pte_page(pte);
578 : else
579 : goto no_page;
580 0 : } else if (unlikely(!page)) {
581 0 : if (flags & FOLL_DUMP) {
582 : /* Avoid special (like zero) pages in core dumps */
583 : page = ERR_PTR(-EFAULT);
584 : goto out;
585 : }
586 :
587 0 : if (is_zero_pfn(pte_pfn(pte))) {
588 0 : page = pte_page(pte);
589 : } else {
590 0 : ret = follow_pfn_pte(vma, address, ptep, flags);
591 0 : page = ERR_PTR(ret);
592 : goto out;
593 : }
594 : }
595 :
596 0 : if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
597 : page = ERR_PTR(-EMLINK);
598 : goto out;
599 : }
600 :
601 : VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
602 : !PageAnonExclusive(page), page);
603 :
604 : /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
605 0 : ret = try_grab_page(page, flags);
606 0 : if (unlikely(ret)) {
607 0 : page = ERR_PTR(ret);
608 : goto out;
609 : }
610 :
611 : /*
612 : * We need to make the page accessible if and only if we are going
613 : * to access its content (the FOLL_PIN case). Please see
614 : * Documentation/core-api/pin_user_pages.rst for details.
615 : */
616 : if (flags & FOLL_PIN) {
617 : ret = arch_make_page_accessible(page);
618 : if (ret) {
619 : unpin_user_page(page);
620 : page = ERR_PTR(ret);
621 : goto out;
622 : }
623 : }
624 0 : if (flags & FOLL_TOUCH) {
625 0 : if ((flags & FOLL_WRITE) &&
626 0 : !pte_dirty(pte) && !PageDirty(page))
627 0 : set_page_dirty(page);
628 : /*
629 : * pte_mkyoung() would be more correct here, but atomic care
630 : * is needed to avoid losing the dirty bit: it is easier to use
631 : * mark_page_accessed().
632 : */
633 0 : mark_page_accessed(page);
634 : }
635 : out:
636 0 : pte_unmap_unlock(ptep, ptl);
637 : return page;
638 : no_page:
639 0 : pte_unmap_unlock(ptep, ptl);
640 0 : if (!pte_none(pte))
641 : return NULL;
642 : return no_page_table(vma, flags);
643 : }
644 :
645 0 : static struct page *follow_pmd_mask(struct vm_area_struct *vma,
646 : unsigned long address, pud_t *pudp,
647 : unsigned int flags,
648 : struct follow_page_context *ctx)
649 : {
650 : pmd_t *pmd, pmdval;
651 : spinlock_t *ptl;
652 : struct page *page;
653 0 : struct mm_struct *mm = vma->vm_mm;
654 :
655 0 : pmd = pmd_offset(pudp, address);
656 : /*
657 : * The READ_ONCE() will stabilize the pmdval in a register or
658 : * on the stack so that it will stop changing under the code.
659 : */
660 0 : pmdval = READ_ONCE(*pmd);
661 0 : if (pmd_none(pmdval))
662 : return no_page_table(vma, flags);
663 0 : if (!pmd_present(pmdval))
664 : return no_page_table(vma, flags);
665 0 : if (pmd_devmap(pmdval)) {
666 : ptl = pmd_lock(mm, pmd);
667 : page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
668 : spin_unlock(ptl);
669 : if (page)
670 : return page;
671 : }
672 0 : if (likely(!pmd_trans_huge(pmdval)))
673 0 : return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
674 :
675 : if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
676 : return no_page_table(vma, flags);
677 :
678 : ptl = pmd_lock(mm, pmd);
679 : if (unlikely(!pmd_present(*pmd))) {
680 : spin_unlock(ptl);
681 : return no_page_table(vma, flags);
682 : }
683 : if (unlikely(!pmd_trans_huge(*pmd))) {
684 : spin_unlock(ptl);
685 : return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
686 : }
687 : if (flags & FOLL_SPLIT_PMD) {
688 : int ret;
689 : page = pmd_page(*pmd);
690 : if (is_huge_zero_page(page)) {
691 : spin_unlock(ptl);
692 : ret = 0;
693 : split_huge_pmd(vma, pmd, address);
694 : if (pmd_trans_unstable(pmd))
695 : ret = -EBUSY;
696 : } else {
697 : spin_unlock(ptl);
698 : split_huge_pmd(vma, pmd, address);
699 : ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
700 : }
701 :
702 : return ret ? ERR_PTR(ret) :
703 : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
704 : }
705 : page = follow_trans_huge_pmd(vma, address, pmd, flags);
706 : spin_unlock(ptl);
707 : ctx->page_mask = HPAGE_PMD_NR - 1;
708 : return page;
709 : }
710 :
711 0 : static struct page *follow_pud_mask(struct vm_area_struct *vma,
712 : unsigned long address, p4d_t *p4dp,
713 : unsigned int flags,
714 : struct follow_page_context *ctx)
715 : {
716 : pud_t *pud;
717 : spinlock_t *ptl;
718 : struct page *page;
719 0 : struct mm_struct *mm = vma->vm_mm;
720 :
721 0 : pud = pud_offset(p4dp, address);
722 0 : if (pud_none(*pud))
723 : return no_page_table(vma, flags);
724 : if (pud_devmap(*pud)) {
725 : ptl = pud_lock(mm, pud);
726 : page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
727 : spin_unlock(ptl);
728 : if (page)
729 : return page;
730 : }
731 0 : if (unlikely(pud_bad(*pud)))
732 : return no_page_table(vma, flags);
733 :
734 0 : return follow_pmd_mask(vma, address, pud, flags, ctx);
735 : }
736 :
737 : static struct page *follow_p4d_mask(struct vm_area_struct *vma,
738 : unsigned long address, pgd_t *pgdp,
739 : unsigned int flags,
740 : struct follow_page_context *ctx)
741 : {
742 : p4d_t *p4d;
743 :
744 0 : p4d = p4d_offset(pgdp, address);
745 : if (p4d_none(*p4d))
746 : return no_page_table(vma, flags);
747 : BUILD_BUG_ON(p4d_huge(*p4d));
748 : if (unlikely(p4d_bad(*p4d)))
749 : return no_page_table(vma, flags);
750 :
751 0 : return follow_pud_mask(vma, address, p4d, flags, ctx);
752 : }
753 :
754 : /**
755 : * follow_page_mask - look up a page descriptor from a user-virtual address
756 : * @vma: vm_area_struct mapping @address
757 : * @address: virtual address to look up
758 : * @flags: flags modifying lookup behaviour
759 : * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
760 : * pointer to output page_mask
761 : *
762 : * @flags can have FOLL_ flags set, defined in <linux/mm.h>
763 : *
764 : * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
765 : * the device's dev_pagemap metadata to avoid repeating expensive lookups.
766 : *
767 : * When getting an anonymous page and the caller has to trigger unsharing
768 : * of a shared anonymous page first, -EMLINK is returned. The caller should
769 : * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
770 : * relevant with FOLL_PIN and !FOLL_WRITE.
771 : *
772 : * On output, the @ctx->page_mask is set according to the size of the page.
773 : *
774 : * Return: the mapped (struct page *), %NULL if no mapping exists, or
775 : * an error pointer if there is a mapping to something not represented
776 : * by a page descriptor (see also vm_normal_page()).
777 : */
778 : static struct page *follow_page_mask(struct vm_area_struct *vma,
779 : unsigned long address, unsigned int flags,
780 : struct follow_page_context *ctx)
781 : {
782 : pgd_t *pgd;
783 : struct page *page;
784 0 : struct mm_struct *mm = vma->vm_mm;
785 :
786 0 : ctx->page_mask = 0;
787 :
788 : /*
789 : * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
790 : * special hugetlb page table walking code. This eliminates the
791 : * need to check for hugetlb entries in the general walking code.
792 : *
793 : * hugetlb_follow_page_mask is only for follow_page() handling here.
794 : * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
795 : */
796 0 : if (is_vm_hugetlb_page(vma)) {
797 : page = hugetlb_follow_page_mask(vma, address, flags);
798 : if (!page)
799 : page = no_page_table(vma, flags);
800 : return page;
801 : }
802 :
803 0 : pgd = pgd_offset(mm, address);
804 :
805 : if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
806 : return no_page_table(vma, flags);
807 :
808 0 : return follow_p4d_mask(vma, address, pgd, flags, ctx);
809 : }
810 :
811 0 : struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
812 : unsigned int foll_flags)
813 : {
814 0 : struct follow_page_context ctx = { NULL };
815 : struct page *page;
816 :
817 0 : if (vma_is_secretmem(vma))
818 : return NULL;
819 :
820 0 : if (WARN_ON_ONCE(foll_flags & FOLL_PIN))
821 : return NULL;
822 :
823 0 : page = follow_page_mask(vma, address, foll_flags, &ctx);
824 : if (ctx.pgmap)
825 : put_dev_pagemap(ctx.pgmap);
826 0 : return page;
827 : }
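/*
 * Illustrative sketch, not part of mm/gup.c: a hypothetical follow_page()
 * caller. The mmap_lock must be held, and with FOLL_GET the returned page
 * carries a reference that the caller drops later with put_page().
 */
static struct page *example_lookup_mapped_page(struct vm_area_struct *vma,
					       unsigned long addr)
{
	struct page *page;

	mmap_assert_locked(vma->vm_mm);
	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		return NULL;
	return page;	/* caller releases with put_page() */
}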
828 :
829 : static int get_gate_page(struct mm_struct *mm, unsigned long address,
830 : unsigned int gup_flags, struct vm_area_struct **vma,
831 : struct page **page)
832 : {
833 : pgd_t *pgd;
834 : p4d_t *p4d;
835 : pud_t *pud;
836 : pmd_t *pmd;
837 : pte_t *pte;
838 : int ret = -EFAULT;
839 :
840 : /* user gate pages are read-only */
841 : if (gup_flags & FOLL_WRITE)
842 : return -EFAULT;
843 : if (address > TASK_SIZE)
844 : pgd = pgd_offset_k(address);
845 : else
846 : pgd = pgd_offset_gate(mm, address);
847 : if (pgd_none(*pgd))
848 : return -EFAULT;
849 : p4d = p4d_offset(pgd, address);
850 : if (p4d_none(*p4d))
851 : return -EFAULT;
852 : pud = pud_offset(p4d, address);
853 : if (pud_none(*pud))
854 : return -EFAULT;
855 : pmd = pmd_offset(pud, address);
856 : if (!pmd_present(*pmd))
857 : return -EFAULT;
858 : VM_BUG_ON(pmd_trans_huge(*pmd));
859 : pte = pte_offset_map(pmd, address);
860 : if (pte_none(*pte))
861 : goto unmap;
862 : *vma = get_gate_vma(mm);
863 : if (!page)
864 : goto out;
865 : *page = vm_normal_page(*vma, address, *pte);
866 : if (!*page) {
867 : if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
868 : goto unmap;
869 : *page = pte_page(*pte);
870 : }
871 : ret = try_grab_page(*page, gup_flags);
872 : if (unlikely(ret))
873 : goto unmap;
874 : out:
875 : ret = 0;
876 : unmap:
877 : pte_unmap(pte);
878 : return ret;
879 : }
880 :
881 : /*
882 : * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not
883 : * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set
884 : * to 0 and -EBUSY returned.
885 : */
886 0 : static int faultin_page(struct vm_area_struct *vma,
887 : unsigned long address, unsigned int *flags, bool unshare,
888 : int *locked)
889 : {
890 0 : unsigned int fault_flags = 0;
891 : vm_fault_t ret;
892 :
893 0 : if (*flags & FOLL_NOFAULT)
894 : return -EFAULT;
895 0 : if (*flags & FOLL_WRITE)
896 0 : fault_flags |= FAULT_FLAG_WRITE;
897 0 : if (*flags & FOLL_REMOTE)
898 0 : fault_flags |= FAULT_FLAG_REMOTE;
899 0 : if (*flags & FOLL_UNLOCKABLE) {
900 0 : fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
901 : /*
902 : * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
903 : * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
904 : * That's because some callers may not be prepared to
905 : * handle early exits caused by non-fatal signals.
906 : */
907 0 : if (*flags & FOLL_INTERRUPTIBLE)
908 0 : fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
909 : }
910 0 : if (*flags & FOLL_NOWAIT)
911 0 : fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
912 0 : if (*flags & FOLL_TRIED) {
913 : /*
914 : * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
915 : * can co-exist
916 : */
917 0 : fault_flags |= FAULT_FLAG_TRIED;
918 : }
919 0 : if (unshare) {
920 0 : fault_flags |= FAULT_FLAG_UNSHARE;
921 : /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
922 : VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
923 : }
924 :
925 0 : ret = handle_mm_fault(vma, address, fault_flags, NULL);
926 :
927 0 : if (ret & VM_FAULT_COMPLETED) {
928 : /*
929 : * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
930 : * mmap lock in the page fault handler. Sanity check this.
931 : */
932 0 : WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
933 0 : *locked = 0;
934 :
935 : /*
936 : * We should do the same as VM_FAULT_RETRY, but let's not
937 : * return -EBUSY since that's not reflecting the reality of
938 : * what has happened - we've just fully completed a page
939 : * fault, with the mmap lock released. Use -EAGAIN to show
940 : * that we want to take the mmap lock _again_.
941 : */
942 0 : return -EAGAIN;
943 : }
944 :
945 0 : if (ret & VM_FAULT_ERROR) {
946 0 : int err = vm_fault_to_errno(ret, *flags);
947 :
948 0 : if (err)
949 : return err;
950 0 : BUG();
951 : }
952 :
953 0 : if (ret & VM_FAULT_RETRY) {
954 0 : if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
955 0 : *locked = 0;
956 : return -EBUSY;
957 : }
958 :
959 : return 0;
960 : }
961 :
962 0 : static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
963 : {
964 0 : vm_flags_t vm_flags = vma->vm_flags;
965 0 : int write = (gup_flags & FOLL_WRITE);
966 0 : int foreign = (gup_flags & FOLL_REMOTE);
967 :
968 0 : if (vm_flags & (VM_IO | VM_PFNMAP))
969 : return -EFAULT;
970 :
971 0 : if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
972 : return -EFAULT;
973 :
974 : if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
975 : return -EOPNOTSUPP;
976 :
977 0 : if (vma_is_secretmem(vma))
978 : return -EFAULT;
979 :
980 0 : if (write) {
981 0 : if (!(vm_flags & VM_WRITE)) {
982 0 : if (!(gup_flags & FOLL_FORCE))
983 : return -EFAULT;
984 : /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
985 0 : if (is_vm_hugetlb_page(vma))
986 : return -EFAULT;
987 : /*
988 : * We used to let the write,force case do COW in a
989 : * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
990 : * set a breakpoint in a read-only mapping of an
991 : * executable, without corrupting the file (yet only
992 : * when that file had been opened for writing!).
993 : * Anon pages in shared mappings are surprising: now
994 : * just reject it.
995 : */
996 0 : if (!is_cow_mapping(vm_flags))
997 : return -EFAULT;
998 : }
999 0 : } else if (!(vm_flags & VM_READ)) {
1000 0 : if (!(gup_flags & FOLL_FORCE))
1001 : return -EFAULT;
1002 : /*
1003 : * Is there actually any vma we can reach here which does not
1004 : * have VM_MAYREAD set?
1005 : */
1006 0 : if (!(vm_flags & VM_MAYREAD))
1007 : return -EFAULT;
1008 : }
1009 : /*
1010 : * gups are always data accesses, not instruction
1011 : * fetches, so execute=false here
1012 : */
1013 0 : if (!arch_vma_access_permitted(vma, write, false, foreign))
1014 : return -EFAULT;
1015 0 : return 0;
1016 : }
1017 :
1018 : /**
1019 : * __get_user_pages() - pin user pages in memory
1020 : * @mm: mm_struct of target mm
1021 : * @start: starting user address
1022 : * @nr_pages: number of pages from start to pin
1023 : * @gup_flags: flags modifying pin behaviour
1024 : * @pages: array that receives pointers to the pages pinned.
1025 : * Should be at least nr_pages long. Or NULL, if caller
1026 : * only intends to ensure the pages are faulted in.
1027 : * @vmas: array of pointers to vmas corresponding to each page.
1028 : * Or NULL if the caller does not require them.
1029 : * @locked: whether we're still with the mmap_lock held
1030 : *
1031 : * Returns either number of pages pinned (which may be less than the
1032 : * number requested), or an error. Details about the return value:
1033 : *
1034 : * -- If nr_pages is 0, returns 0.
1035 : * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1036 : * -- If nr_pages is >0, and some pages were pinned, returns the number of
1037 : * pages pinned. Again, this may be less than nr_pages.
1038 : * -- 0 return value is possible when the fault would need to be retried.
1039 : *
1040 : * The caller is responsible for releasing returned @pages, via put_page().
1041 : *
1042 : * @vmas are valid only as long as mmap_lock is held.
1043 : *
1044 : * Must be called with mmap_lock held. It may be released. See below.
1045 : *
1046 : * __get_user_pages walks a process's page tables and takes a reference to
1047 : * each struct page that each user address corresponds to at a given
1048 : * instant. That is, it takes the page that would be accessed if a user
1049 : * thread accesses the given user virtual address at that instant.
1050 : *
1051 : * This does not guarantee that the page exists in the user mappings when
1052 : * __get_user_pages returns, and there may even be a completely different
1053 : * page there in some cases (eg. if mmapped pagecache has been invalidated
1054 : * and subsequently re-faulted). However it does guarantee that the page
1055 : * won't be freed completely. And mostly callers simply care that the page
1056 : * contains data that was valid *at some point in time*. Typically, an IO
1057 : * or similar operation cannot guarantee anything stronger anyway because
1058 : * locks can't be held over the syscall boundary.
1059 : *
1060 : * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1061 : * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1062 : * appropriate) must be called after the page is finished with, and
1063 : * before put_page is called.
1064 : *
1065 : * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
1066 : * be released. If this happens *@locked will be set to 0 on return.
1067 : *
1068 : * A caller using such a combination of @gup_flags must therefore hold the
1069 : * mmap_lock for reading only, and recognize when it's been released. Otherwise,
1070 : * it must be held for either reading or writing and will not be released.
1071 : *
1072 : * In most cases, get_user_pages or get_user_pages_fast should be used
1073 : * instead of __get_user_pages. __get_user_pages should be used only if
1074 : * you need some special @gup_flags.
1075 : */
1076 0 : static long __get_user_pages(struct mm_struct *mm,
1077 : unsigned long start, unsigned long nr_pages,
1078 : unsigned int gup_flags, struct page **pages,
1079 : struct vm_area_struct **vmas, int *locked)
1080 : {
1081 0 : long ret = 0, i = 0;
1082 0 : struct vm_area_struct *vma = NULL;
1083 0 : struct follow_page_context ctx = { NULL };
1084 :
1085 0 : if (!nr_pages)
1086 : return 0;
1087 :
1088 : start = untagged_addr(start);
1089 :
1090 : VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
1091 :
1092 : do {
1093 : struct page *page;
1094 0 : unsigned int foll_flags = gup_flags;
1095 : unsigned int page_increm;
1096 :
1097 : /* first iteration or cross vma bound */
1098 0 : if (!vma || start >= vma->vm_end) {
1099 0 : vma = find_extend_vma(mm, start);
1100 : if (!vma && in_gate_area(mm, start)) {
1101 : ret = get_gate_page(mm, start & PAGE_MASK,
1102 : gup_flags, &vma,
1103 : pages ? &pages[i] : NULL);
1104 : if (ret)
1105 : goto out;
1106 : ctx.page_mask = 0;
1107 : goto next_page;
1108 : }
1109 :
1110 0 : if (!vma) {
1111 : ret = -EFAULT;
1112 : goto out;
1113 : }
1114 0 : ret = check_vma_flags(vma, gup_flags);
1115 0 : if (ret)
1116 : goto out;
1117 :
1118 : if (is_vm_hugetlb_page(vma)) {
1119 : i = follow_hugetlb_page(mm, vma, pages, vmas,
1120 : &start, &nr_pages, i,
1121 : gup_flags, locked);
1122 : if (!*locked) {
1123 : /*
1124 : * We've got a VM_FAULT_RETRY
1125 : * and we've lost mmap_lock.
1126 : * We must stop here.
1127 : */
1128 : BUG_ON(gup_flags & FOLL_NOWAIT);
1129 : goto out;
1130 : }
1131 : continue;
1132 : }
1133 : }
1134 : retry:
1135 : /*
1136 : * If we have a pending SIGKILL, don't keep faulting pages and
1137 : * potentially allocating memory.
1138 : */
1139 0 : if (fatal_signal_pending(current)) {
1140 : ret = -EINTR;
1141 : goto out;
1142 : }
1143 0 : cond_resched();
1144 :
1145 0 : page = follow_page_mask(vma, start, foll_flags, &ctx);
1146 0 : if (!page || PTR_ERR(page) == -EMLINK) {
1147 0 : ret = faultin_page(vma, start, &foll_flags,
1148 0 : PTR_ERR(page) == -EMLINK, locked);
1149 0 : switch (ret) {
1150 : case 0:
1151 : goto retry;
1152 : case -EBUSY:
1153 : case -EAGAIN:
1154 0 : ret = 0;
1155 : fallthrough;
1156 : case -EFAULT:
1157 : case -ENOMEM:
1158 : case -EHWPOISON:
1159 : goto out;
1160 : }
1161 0 : BUG();
1162 0 : } else if (PTR_ERR(page) == -EEXIST) {
1163 : /*
1164 : * Proper page table entry exists, but no corresponding
1165 : * struct page. If the caller expects **pages to be
1166 : * filled in, bail out now, because that can't be done
1167 : * for this page.
1168 : */
1169 0 : if (pages) {
1170 : ret = PTR_ERR(page);
1171 : goto out;
1172 : }
1173 :
1174 : goto next_page;
1175 0 : } else if (IS_ERR(page)) {
1176 : ret = PTR_ERR(page);
1177 : goto out;
1178 : }
1179 0 : if (pages) {
1180 0 : pages[i] = page;
1181 0 : flush_anon_page(vma, page, start);
1182 : flush_dcache_page(page);
1183 0 : ctx.page_mask = 0;
1184 : }
1185 : next_page:
1186 0 : if (vmas) {
1187 0 : vmas[i] = vma;
1188 0 : ctx.page_mask = 0;
1189 : }
1190 0 : page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
1191 0 : if (page_increm > nr_pages)
1192 0 : page_increm = nr_pages;
1193 0 : i += page_increm;
1194 0 : start += page_increm * PAGE_SIZE;
1195 0 : nr_pages -= page_increm;
1196 0 : } while (nr_pages);
1197 : out:
1198 : if (ctx.pgmap)
1199 : put_dev_pagemap(ctx.pgmap);
1200 0 : return i ? i : ret;
1201 : }
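/*
 * Illustrative sketch, not part of mm/gup.c: how callers of the public GUP
 * entry points typically cope with the "may return fewer pages than
 * requested" contract documented above, by advancing and retrying.
 * example_pin_all() is hypothetical; @gup_flags must not contain FOLL_GET or
 * FOLL_PIN, since pin_user_pages_fast() sets FOLL_PIN internally.
 */
static int example_pin_all(unsigned long start, int nr_pages,
			   unsigned int gup_flags, struct page **pages)
{
	int pinned = 0;

	while (pinned < nr_pages) {
		int ret = pin_user_pages_fast(start + pinned * PAGE_SIZE,
					      nr_pages - pinned, gup_flags,
					      pages + pinned);
		if (ret <= 0) {
			/* Error or no progress: undo everything pinned so far. */
			unpin_user_pages(pages, pinned);
			return ret ? ret : -EFAULT;
		}
		pinned += ret;
	}
	return 0;
}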
1202 :
1203 : static bool vma_permits_fault(struct vm_area_struct *vma,
1204 : unsigned int fault_flags)
1205 : {
1206 0 : bool write = !!(fault_flags & FAULT_FLAG_WRITE);
1207 0 : bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
1208 0 : vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
1209 :
1210 0 : if (!(vm_flags & vma->vm_flags))
1211 : return false;
1212 :
1213 : /*
1214 : * The architecture might have a hardware protection
1215 : * mechanism other than read/write that can deny access.
1216 : *
1217 : * gup always represents data access, not instruction
1218 : * fetches, so execute=false here:
1219 : */
1220 0 : if (!arch_vma_access_permitted(vma, write, false, foreign))
1221 : return false;
1222 :
1223 : return true;
1224 : }
1225 :
1226 : /**
1227 : * fixup_user_fault() - manually resolve a user page fault
1228 : * @mm: mm_struct of target mm
1229 : * @address: user address
1230 : * @fault_flags:flags to pass down to handle_mm_fault()
1231 : * @unlocked: did we unlock the mmap_lock while retrying; may be NULL if the caller
1232 : * does not allow retry. If NULL, the caller must guarantee
1233 : * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
1234 : *
1235 : * This is meant to be called in the specific scenario where for locking reasons
1236 : * we try to access user memory in atomic context (within a pagefault_disable()
1237 : * section), this returns -EFAULT, and we want to resolve the user fault before
1238 : * trying again.
1239 : *
1240 : * Typically this is meant to be used by the futex code.
1241 : *
1242 : * The main difference with get_user_pages() is that this function will
1243 : * unconditionally call handle_mm_fault() which will in turn perform all the
1244 : * necessary SW fixup of the dirty and young bits in the PTE, while
1245 : * get_user_pages() only guarantees to update these in the struct page.
1246 : *
1247 : * This is important for some architectures where those bits also gate the
1248 : * access permission to the page because they are maintained in software. On
1249 : * such architectures, gup() will not be enough to make a subsequent access
1250 : * succeed.
1251 : *
1252 : * This function will not return with an unlocked mmap_lock. So it does not have
1253 : * the same semantics wrt the @mm->mmap_lock as filemap_fault() does.
1254 : */
1255 0 : int fixup_user_fault(struct mm_struct *mm,
1256 : unsigned long address, unsigned int fault_flags,
1257 : bool *unlocked)
1258 : {
1259 : struct vm_area_struct *vma;
1260 : vm_fault_t ret;
1261 :
1262 0 : address = untagged_addr(address);
1263 :
1264 0 : if (unlocked)
1265 0 : fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
1266 :
1267 : retry:
1268 0 : vma = find_extend_vma(mm, address);
1269 0 : if (!vma || address < vma->vm_start)
1270 : return -EFAULT;
1271 :
1272 0 : if (!vma_permits_fault(vma, fault_flags))
1273 : return -EFAULT;
1274 :
1275 0 : if ((fault_flags & FAULT_FLAG_KILLABLE) &&
1276 0 : fatal_signal_pending(current))
1277 : return -EINTR;
1278 :
1279 0 : ret = handle_mm_fault(vma, address, fault_flags, NULL);
1280 :
1281 0 : if (ret & VM_FAULT_COMPLETED) {
1282 : /*
1283 : * NOTE: it's a pity that we need to retake the lock here
1284 : * to pair with the unlock() in the callers. Ideally we
1285 : * could tell the callers so they do not need to unlock.
1286 : */
1287 0 : mmap_read_lock(mm);
1288 0 : *unlocked = true;
1289 0 : return 0;
1290 : }
1291 :
1292 0 : if (ret & VM_FAULT_ERROR) {
1293 0 : int err = vm_fault_to_errno(ret, 0);
1294 :
1295 0 : if (err)
1296 : return err;
1297 0 : BUG();
1298 : }
1299 :
1300 0 : if (ret & VM_FAULT_RETRY) {
1301 0 : mmap_read_lock(mm);
1302 0 : *unlocked = true;
1303 0 : fault_flags |= FAULT_FLAG_TRIED;
1304 0 : goto retry;
1305 : }
1306 :
1307 : return 0;
1308 : }
1309 : EXPORT_SYMBOL_GPL(fixup_user_fault);
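/*
 * Illustrative sketch, not part of mm/gup.c: the scenario described in the
 * fixup_user_fault() comment above. A hypothetical caller has just failed a
 * user access under pagefault_disable(), so it takes the mmap_lock, resolves
 * the fault, and then retries the atomic access.
 */
static int example_fixup_and_retry(struct mm_struct *mm, unsigned long uaddr)
{
	bool unlocked = false;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE, &unlocked);
	/*
	 * fixup_user_fault() never returns with the lock dropped; @unlocked
	 * only records that the lock was temporarily released while retrying.
	 */
	mmap_read_unlock(mm);
	return ret;	/* 0 on success; the caller retries its access */
}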
1310 :
1311 : /*
1312 : * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is
1313 : * specified, it'll also respond to generic signals. The caller of GUP
1314 : * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
1315 : */
1316 0 : static bool gup_signal_pending(unsigned int flags)
1317 : {
1318 0 : if (fatal_signal_pending(current))
1319 : return true;
1320 :
1321 0 : if (!(flags & FOLL_INTERRUPTIBLE))
1322 : return false;
1323 :
1324 0 : return signal_pending(current);
1325 : }
1326 :
1327 : /*
1328 : * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
1329 : * the caller. This function may drop the mmap_lock. If it does so, then it will
1330 : * set (*locked = 0).
1331 : *
1332 : * (*locked == 0) means that the caller expects this function to acquire and
1333 : * drop the mmap_lock. Therefore, the value of *locked will still be zero when
1334 : * the function returns, even though it may have changed temporarily during
1335 : * function execution.
1336 : *
1337 : * Please note that this function, unlike __get_user_pages(), will not return 0
1338 : * for nr_pages > 0, unless FOLL_NOWAIT is used.
1339 : */
1340 : static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
1341 : unsigned long start,
1342 : unsigned long nr_pages,
1343 : struct page **pages,
1344 : struct vm_area_struct **vmas,
1345 : int *locked,
1346 : unsigned int flags)
1347 : {
1348 : long ret, pages_done;
1349 0 : bool must_unlock = false;
1350 :
1351 : /*
1352 : * The internal caller expects GUP to manage the lock internally and the
1353 : * lock must be released when this returns.
1354 : */
1355 0 : if (!*locked) {
1356 0 : if (mmap_read_lock_killable(mm))
1357 : return -EAGAIN;
1358 0 : must_unlock = true;
1359 0 : *locked = 1;
1360 : }
1361 : else
1362 : mmap_assert_locked(mm);
1363 :
1364 0 : if (flags & FOLL_PIN)
1365 0 : mm_set_has_pinned_flag(&mm->flags);
1366 :
1367 : /*
1368 : * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
1369 : * is to set FOLL_GET if the caller wants pages[] filled in (but has
1370 : * carelessly failed to specify FOLL_GET), so keep doing that, but only
1371 : * for FOLL_GET, not for the newer FOLL_PIN.
1372 : *
1373 : * FOLL_PIN always expects pages to be non-null, but no need to assert
1374 : * that here, as any failures will be obvious enough.
1375 : */
1376 0 : if (pages && !(flags & FOLL_PIN))
1377 0 : flags |= FOLL_GET;
1378 :
1379 0 : pages_done = 0;
1380 : for (;;) {
1381 0 : ret = __get_user_pages(mm, start, nr_pages, flags, pages,
1382 : vmas, locked);
1383 0 : if (!(flags & FOLL_UNLOCKABLE)) {
1384 : /* VM_FAULT_RETRY couldn't trigger, bypass */
1385 : pages_done = ret;
1386 : break;
1387 : }
1388 :
1389 : /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
1390 0 : if (!*locked) {
1391 0 : BUG_ON(ret < 0);
1392 0 : BUG_ON(ret >= nr_pages);
1393 : }
1394 :
1395 0 : if (ret > 0) {
1396 0 : nr_pages -= ret;
1397 0 : pages_done += ret;
1398 0 : if (!nr_pages)
1399 : break;
1400 : }
1401 0 : if (*locked) {
1402 : /*
1403 : * VM_FAULT_RETRY didn't trigger or it was a
1404 : * FOLL_NOWAIT.
1405 : */
1406 0 : if (!pages_done)
1407 0 : pages_done = ret;
1408 : break;
1409 : }
1410 : /*
1411 : * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1412 : * For the prefault case (!pages) we only update counts.
1413 : */
1414 0 : if (likely(pages))
1415 0 : pages += ret;
1416 0 : start += ret << PAGE_SHIFT;
1417 :
1418 : /* The lock was temporarily dropped, so we must unlock later */
1419 0 : must_unlock = true;
1420 :
1421 : retry:
1422 : /*
1423 : * Repeat on the address that fired VM_FAULT_RETRY
1424 : * with both FAULT_FLAG_ALLOW_RETRY and
1425 : * FAULT_FLAG_TRIED. Note that GUP can be interrupted
1426 : * by fatal signals of even common signals, depending on
1427 : * the caller's request. So we need to check it before we
1428 : * start trying again otherwise it can loop forever.
1429 : */
1430 0 : if (gup_signal_pending(flags)) {
1431 0 : if (!pages_done)
1432 0 : pages_done = -EINTR;
1433 : break;
1434 : }
1435 :
1436 0 : ret = mmap_read_lock_killable(mm);
1437 0 : if (ret) {
1438 0 : BUG_ON(ret > 0);
1439 0 : if (!pages_done)
1440 0 : pages_done = ret;
1441 : break;
1442 : }
1443 :
1444 0 : *locked = 1;
1445 0 : ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
1446 : pages, NULL, locked);
1447 0 : if (!*locked) {
1448 : /* Continue to retry until we succeeded */
1449 0 : BUG_ON(ret != 0);
1450 : goto retry;
1451 : }
1452 0 : if (ret != 1) {
1453 0 : BUG_ON(ret > 1);
1454 0 : if (!pages_done)
1455 0 : pages_done = ret;
1456 : break;
1457 : }
1458 0 : nr_pages--;
1459 0 : pages_done++;
1460 0 : if (!nr_pages)
1461 : break;
1462 0 : if (likely(pages))
1463 0 : pages++;
1464 0 : start += PAGE_SIZE;
1465 : }
1466 0 : if (must_unlock && *locked) {
1467 : /*
1468 : * We either temporarily dropped the lock, or the caller
1469 : * requested that we both acquire and drop the lock. Either way,
1470 : * we must now unlock, and notify the caller of that state.
1471 : */
1472 0 : mmap_read_unlock(mm);
1473 0 : *locked = 0;
1474 : }
1475 : return pages_done;
1476 : }
1477 :
1478 : /**
1479 : * populate_vma_page_range() - populate a range of pages in the vma.
1480 : * @vma: target vma
1481 : * @start: start address
1482 : * @end: end address
1483 : * @locked: whether the mmap_lock is still held
1484 : *
1485 : * This takes care of mlocking the pages too if VM_LOCKED is set.
1486 : *
1487 : * Return either number of pages pinned in the vma, or a negative error
1488 : * code on error.
1489 : *
1490 : * vma->vm_mm->mmap_lock must be held.
1491 : *
1492 : * If @locked is NULL, it may be held for read or write and will
1493 : * be unperturbed.
1494 : *
1495 : * If @locked is non-NULL, it must be held for read only and may be
1496 : * released. If it's released, *@locked will be set to 0.
1497 : */
1498 0 : long populate_vma_page_range(struct vm_area_struct *vma,
1499 : unsigned long start, unsigned long end, int *locked)
1500 : {
1501 0 : struct mm_struct *mm = vma->vm_mm;
1502 0 : unsigned long nr_pages = (end - start) / PAGE_SIZE;
1503 0 : int local_locked = 1;
1504 : int gup_flags;
1505 : long ret;
1506 :
1507 : VM_BUG_ON(!PAGE_ALIGNED(start));
1508 : VM_BUG_ON(!PAGE_ALIGNED(end));
1509 : VM_BUG_ON_VMA(start < vma->vm_start, vma);
1510 : VM_BUG_ON_VMA(end > vma->vm_end, vma);
1511 0 : mmap_assert_locked(mm);
1512 :
1513 : /*
1514 : * Rightly or wrongly, the VM_LOCKONFAULT case has never used
1515 : * faultin_page() to break COW, so it has no work to do here.
1516 : */
1517 0 : if (vma->vm_flags & VM_LOCKONFAULT)
1518 0 : return nr_pages;
1519 :
1520 0 : gup_flags = FOLL_TOUCH;
1521 : /*
1522 : * We want to touch writable mappings with a write fault in order
1523 : * to break COW, except for shared mappings because these don't COW
1524 : * and we would not want to dirty them for nothing.
1525 : */
1526 0 : if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1527 0 : gup_flags |= FOLL_WRITE;
1528 :
1529 : /*
1530 : * We want mlock to succeed for regions that have any permissions
1531 : * other than PROT_NONE.
1532 : */
1533 0 : if (vma_is_accessible(vma))
1534 0 : gup_flags |= FOLL_FORCE;
1535 :
1536 0 : if (locked)
1537 0 : gup_flags |= FOLL_UNLOCKABLE;
1538 :
1539 : /*
1540 : * We made sure addr is within a VMA, so the following will
1541 : * not result in a stack expansion that recurses back here.
1542 : */
1543 0 : ret = __get_user_pages(mm, start, nr_pages, gup_flags,
1544 : NULL, NULL, locked ? locked : &local_locked);
1545 0 : lru_add_drain();
1546 0 : return ret;
1547 : }
1548 :
1549 : /*
1550 : * faultin_vma_page_range() - populate (prefault) page tables inside the
1551 : * given VMA range readable/writable
1552 : *
1553 : * This takes care of mlocking the pages, too, if VM_LOCKED is set.
1554 : *
1555 : * @vma: target vma
1556 : * @start: start address
1557 : * @end: end address
1558 : * @write: whether to prefault readable or writable
1559 : * @locked: whether the mmap_lock is still held
1560 : *
1561 : * Returns either number of processed pages in the vma, or a negative error
1562 : * code on error (see __get_user_pages()).
1563 : *
1564 : * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
1565 : * covered by the VMA. If it's released, *@locked will be set to 0.
1566 : */
1567 0 : long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
1568 : unsigned long end, bool write, int *locked)
1569 : {
1570 0 : struct mm_struct *mm = vma->vm_mm;
1571 0 : unsigned long nr_pages = (end - start) / PAGE_SIZE;
1572 : int gup_flags;
1573 : long ret;
1574 :
1575 : VM_BUG_ON(!PAGE_ALIGNED(start));
1576 : VM_BUG_ON(!PAGE_ALIGNED(end));
1577 : VM_BUG_ON_VMA(start < vma->vm_start, vma);
1578 : VM_BUG_ON_VMA(end > vma->vm_end, vma);
1579 0 : mmap_assert_locked(mm);
1580 :
1581 : /*
1582 : * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
1583 : * the page dirty with FOLL_WRITE -- which doesn't make a
1584 : * difference with !FOLL_FORCE, because the page is writable
1585 : * in the page table.
1586 : * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
1587 : * a poisoned page.
1588 : * !FOLL_FORCE: Require proper access permissions.
1589 : */
1590 0 : gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
1591 0 : if (write)
1592 0 : gup_flags |= FOLL_WRITE;
1593 :
1594 : /*
1595 : * We want to report -EINVAL instead of -EFAULT for any permission
1596 : * problems or incompatible mappings.
1597 : */
1598 0 : if (check_vma_flags(vma, gup_flags))
1599 : return -EINVAL;
1600 :
1601 0 : ret = __get_user_pages(mm, start, nr_pages, gup_flags,
1602 : NULL, NULL, locked);
1603 0 : lru_add_drain();
1604 0 : return ret;
1605 : }
1606 :
1607 : /*
1608 : * __mm_populate - populate and/or mlock pages within a range of address space.
1609 : *
1610 : * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1611 : * flags. VMAs must be already marked with the desired vm_flags, and
1612 : * mmap_lock must not be held.
1613 : */
1614 0 : int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1615 : {
1616 0 : struct mm_struct *mm = current->mm;
1617 : unsigned long end, nstart, nend;
1618 0 : struct vm_area_struct *vma = NULL;
1619 0 : int locked = 0;
1620 0 : long ret = 0;
1621 :
1622 0 : end = start + len;
1623 :
1624 0 : for (nstart = start; nstart < end; nstart = nend) {
1625 : /*
1626 : * We want to fault in pages for [nstart; end) address range.
1627 : * Find first corresponding VMA.
1628 : */
1629 0 : if (!locked) {
1630 0 : locked = 1;
1631 0 : mmap_read_lock(mm);
1632 0 : vma = find_vma_intersection(mm, nstart, end);
1633 0 : } else if (nstart >= vma->vm_end)
1634 0 : vma = find_vma_intersection(mm, vma->vm_end, end);
1635 :
1636 0 : if (!vma)
1637 : break;
1638 : /*
1639 : * Set [nstart; nend) to intersection of desired address
1640 : * range with the first VMA. Also, skip undesirable VMA types.
1641 : */
1642 0 : nend = min(end, vma->vm_end);
1643 0 : if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1644 0 : continue;
1645 0 : if (nstart < vma->vm_start)
1646 0 : nstart = vma->vm_start;
1647 : /*
1648 : * Now fault in a range of pages. populate_vma_page_range()
1649 : * double checks the vma flags, so that it won't mlock pages
1650 : * if the vma was already munlocked.
1651 : */
1652 0 : ret = populate_vma_page_range(vma, nstart, nend, &locked);
1653 0 : if (ret < 0) {
1654 0 : if (ignore_errors) {
1655 0 : ret = 0;
1656 0 : continue; /* continue at next VMA */
1657 : }
1658 : break;
1659 : }
1660 0 : nend = nstart + ret * PAGE_SIZE;
1661 0 : ret = 0;
1662 : }
1663 0 : if (locked)
1664 : mmap_read_unlock(mm);
1665 0 : return ret; /* 0 or negative error code */
1666 : }
1667 : #else /* CONFIG_MMU */
1668 : static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
1669 : unsigned long nr_pages, struct page **pages,
1670 : struct vm_area_struct **vmas, int *locked,
1671 : unsigned int foll_flags)
1672 : {
1673 : struct vm_area_struct *vma;
1674 : bool must_unlock = false;
1675 : unsigned long vm_flags;
1676 : long i;
1677 :
1678 : if (!nr_pages)
1679 : return 0;
1680 :
1681 : /*
1682 : * The internal caller expects GUP to manage the lock internally and the
1683 : * lock must be released when this returns.
1684 : */
1685 : if (!*locked) {
1686 : if (mmap_read_lock_killable(mm))
1687 : return -EAGAIN;
1688 : must_unlock = true;
1689 : *locked = 1;
1690 : }
1691 :
1692 : /* calculate required read or write permissions.
1693 : * If FOLL_FORCE is set, we only require the "MAY" flags.
1694 : */
1695 : vm_flags = (foll_flags & FOLL_WRITE) ?
1696 : (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1697 : vm_flags &= (foll_flags & FOLL_FORCE) ?
1698 : (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1699 :
1700 : for (i = 0; i < nr_pages; i++) {
1701 : vma = find_vma(mm, start);
1702 : if (!vma)
1703 : break;
1704 :
1705 : /* protect what we can, including chardevs */
1706 : if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1707 : !(vm_flags & vma->vm_flags))
1708 : break;
1709 :
1710 : if (pages) {
1711 : pages[i] = virt_to_page((void *)start);
1712 : if (pages[i])
1713 : get_page(pages[i]);
1714 : }
1715 : if (vmas)
1716 : vmas[i] = vma;
1717 : start = (start + PAGE_SIZE) & PAGE_MASK;
1718 : }
1719 :
1720 : if (must_unlock && *locked) {
1721 : mmap_read_unlock(mm);
1722 : *locked = 0;
1723 : }
1724 :
1725 : return i ? : -EFAULT;
1726 : }
1727 : #endif /* !CONFIG_MMU */
1728 :
1729 : /**
1730 : * fault_in_writeable - fault in userspace address range for writing
1731 : * @uaddr: start of address range
1732 : * @size: size of address range
1733 : *
1734 : * Returns the number of bytes not faulted in (like copy_to_user() and
1735 : * copy_from_user()).
1736 : */
1737 0 : size_t fault_in_writeable(char __user *uaddr, size_t size)
1738 : {
1739 0 : char __user *start = uaddr, *end;
1740 :
1741 0 : if (unlikely(size == 0))
1742 : return 0;
1743 0 : if (!user_write_access_begin(uaddr, size))
1744 : return size;
1745 0 : if (!PAGE_ALIGNED(uaddr)) {
1746 0 : unsafe_put_user(0, uaddr, out);
1747 0 : uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
1748 : }
1749 0 : end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
1750 0 : if (unlikely(end < start))
1751 0 : end = NULL;
1752 0 : while (uaddr != end) {
1753 0 : unsafe_put_user(0, uaddr, out);
1754 0 : uaddr += PAGE_SIZE;
1755 : }
1756 :
1757 : out:
1758 : user_write_access_end();
1759 0 : if (size > uaddr - start)
1760 0 : return size - (uaddr - start);
1761 : return 0;
1762 : }
1763 : EXPORT_SYMBOL(fault_in_writeable);
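
/*
 * Illustrative sketch, not part of gup.c: the typical caller pattern for
 * fault_in_writeable() is "attempt the copy with page faults disabled, then
 * prefault the destination and retry". The helper below is hypothetical;
 * copy_to_user(), pagefault_disable()/pagefault_enable() (<linux/uaccess.h>)
 * and fault_in_writeable() (<linux/pagemap.h>) are the interfaces being
 * demonstrated.
 */
static ssize_t example_copy_out(char __user *ubuf, const void *kbuf, size_t len)
{
	size_t not_copied;

	do {
		pagefault_disable();
		not_copied = copy_to_user(ubuf, kbuf, len);
		pagefault_enable();
		if (!not_copied)
			return len;
		/*
		 * Prefault the whole destination range; a non-zero return
		 * means part of it cannot be made writable, so retrying
		 * would not help.
		 */
	} while (!fault_in_writeable(ubuf, len));

	return -EFAULT;
}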
1764 :
1765 : /**
1766 : * fault_in_subpage_writeable - fault in an address range for writing
1767 : * @uaddr: start of address range
1768 : * @size: size of address range
1769 : *
1770 : * Fault in a user address range for writing while checking for permissions at
1771 : * sub-page granularity (e.g. arm64 MTE). This function should be used when
1772 : * the caller cannot guarantee forward progress of a copy_to_user() loop.
1773 : *
1774 : * Returns the number of bytes not faulted in (like copy_to_user() and
1775 : * copy_from_user()).
1776 : */
1777 0 : size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
1778 : {
1779 : size_t faulted_in;
1780 :
1781 : /*
1782 : * Attempt faulting in at page granularity first for page table
1783 : * permission checking. The arch-specific probe_subpage_writeable()
1784 : * functions may not check for this.
1785 : */
1786 0 : faulted_in = size - fault_in_writeable(uaddr, size);
1787 : if (faulted_in)
1788 : faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
1789 :
1790 0 : return size - faulted_in;
1791 : }
1792 : EXPORT_SYMBOL(fault_in_subpage_writeable);
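
/*
 * Illustrative note, not part of gup.c: with sub-page permission checks such
 * as arm64 MTE, copy_to_user() can keep failing even though the page tables
 * say the destination is writable, so a retry loop built on
 * fault_in_writeable() alone may spin forever. Substituting the sub-page
 * variant restores forward progress; this hypothetical helper mirrors
 * example_copy_out() above.
 */
static ssize_t example_copy_out_subpage(char __user *ubuf, const void *kbuf,
					size_t len)
{
	size_t not_copied;

	do {
		pagefault_disable();
		not_copied = copy_to_user(ubuf, kbuf, len);
		pagefault_enable();
		if (!not_copied)
			return len;
	} while (!fault_in_subpage_writeable(ubuf, len));

	return -EFAULT;
}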
1793 :
1794 : /*
1795 : * fault_in_safe_writeable - fault in an address range for writing
1796 : * @uaddr: start of address range
1797 : * @size: length of address range
1798 : *
1799 : * Faults in an address range for writing. This is primarily useful when we
1800 : * already know that some or all of the pages in the address range aren't in
1801 : * memory.
1802 : *
1803 : * Unlike fault_in_writeable(), this function is non-destructive.
1804 : *
1805 : * Note that we don't pin or otherwise hold references to the pages we fault
1806 : * in. There's no guarantee that they'll stay in memory for any duration of
1807 : * time.
1808 : *
1809 : * Returns the number of bytes not faulted in, like copy_to_user() and
1810 : * copy_from_user().
1811 : */
1812 0 : size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
1813 : {
1814 0 : unsigned long start = (unsigned long)uaddr, end;
1815 0 : struct mm_struct *mm = current->mm;
1816 0 : bool unlocked = false;
1817 :
1818 0 : if (unlikely(size == 0))
1819 : return 0;
1820 0 : end = PAGE_ALIGN(start + size);
1821 0 : if (end < start)
1822 0 : end = 0;
1823 :
1824 : mmap_read_lock(mm);
1825 : do {
1826 0 : if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
1827 : break;
1828 0 : start = (start + PAGE_SIZE) & PAGE_MASK;
1829 0 : } while (start != end);
1830 0 : mmap_read_unlock(mm);
1831 :
1832 0 : if (size > (unsigned long)uaddr - start)
1833 0 : return size - ((unsigned long)uaddr - start);
1834 : return 0;
1835 : }
1836 : EXPORT_SYMBOL(fault_in_safe_writeable);
1837 :
1838 : /**
1839 : * fault_in_readable - fault in userspace address range for reading
1840 : * @uaddr: start of user address range
1841 : * @size: size of user address range
1842 : *
1843 : * Returns the number of bytes not faulted in (like copy_to_user() and
1844 : * copy_from_user()).
1845 : */
1846 0 : size_t fault_in_readable(const char __user *uaddr, size_t size)
1847 : {
1848 0 : const char __user *start = uaddr, *end;
1849 : volatile char c;
1850 :
1851 0 : if (unlikely(size == 0))
1852 : return 0;
1853 0 : if (!user_read_access_begin(uaddr, size))
1854 : return size;
1855 0 : if (!PAGE_ALIGNED(uaddr)) {
1856 0 : unsafe_get_user(c, uaddr, out);
1857 0 : uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
1858 : }
1859 0 : end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
1860 0 : if (unlikely(end < start))
1861 0 : end = NULL;
1862 0 : while (uaddr != end) {
1863 0 : unsafe_get_user(c, uaddr, out);
1864 0 : uaddr += PAGE_SIZE;
1865 : }
1866 :
1867 : out:
1868 : user_read_access_end();
1869 0 : (void)c;
1870 0 : if (size > uaddr - start)
1871 0 : return size - (uaddr - start);
1872 : return 0;
1873 : }
1874 : EXPORT_SYMBOL(fault_in_readable);
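
/*
 * Illustrative sketch, not part of gup.c: the read-side counterpart. A write
 * path that must not fault while holding locks (a locked pagecache page, for
 * instance) prefaults the source buffer first, then copies with page faults
 * disabled and retries on a short copy. The helper name is hypothetical;
 * fault_in_readable() and copy_from_user() are the real interfaces.
 */
static ssize_t example_copy_in(void *kbuf, const char __user *ubuf, size_t len)
{
	size_t not_copied;

	do {
		if (fault_in_readable(ubuf, len))
			return -EFAULT;

		/* e.g. take a lock here that forbids sleeping on a fault */
		pagefault_disable();
		not_copied = copy_from_user(kbuf, ubuf, len);
		pagefault_enable();
		/* ... and drop that lock again before retrying */
	} while (not_copied);

	return len;
}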
1875 :
1876 : /**
1877 : * get_dump_page() - pin user page in memory while writing it to core dump
1878 : * @addr: user address
1879 : *
1880 : * Returns struct page pointer of user page pinned for dump,
1881 : * to be freed afterwards by put_page().
1882 : *
1883 : * Returns NULL on any kind of failure - a hole must then be inserted into
1884 : * the corefile, to preserve alignment with its headers; and also returns
1885 : * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1886 : * allowing a hole to be left in the corefile to save disk space.
1887 : *
1888 : * Called without mmap_lock (takes and releases the mmap_lock by itself).
1889 : */
1890 : #ifdef CONFIG_ELF_CORE
1891 0 : struct page *get_dump_page(unsigned long addr)
1892 : {
1893 : struct page *page;
1894 0 : int locked = 0;
1895 : int ret;
1896 :
1897 0 : ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL,
1898 : &locked,
1899 : FOLL_FORCE | FOLL_DUMP | FOLL_GET);
1900 0 : return (ret == 1) ? page : NULL;
1901 : }
1902 : #endif /* CONFIG_ELF_CORE */
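
/*
 * Illustrative sketch, not part of gup.c: a coredump writer walks a VMA one
 * page at a time, emits whatever get_dump_page() returns and writes a hole
 * for every NULL so that file offsets stay aligned with the ELF headers.
 * emit_page() and emit_hole() are hypothetical stand-ins for the coredump
 * output helpers; get_dump_page()/put_page() are the real calls (and only
 * exist under CONFIG_ELF_CORE).
 */
extern int emit_page(struct page *page);	/* hypothetical */
extern int emit_hole(size_t size);		/* hypothetical */

static int example_dump_range(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		struct page *page = get_dump_page(addr);
		int err;

		if (page) {
			err = emit_page(page);
			put_page(page);
		} else {
			err = emit_hole(PAGE_SIZE);
		}
		if (err)
			return err;
	}
	return 0;
}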
1903 :
1904 : #ifdef CONFIG_MIGRATION
1905 : /*
1906 : * Returns the number of collected pages. Return value is always >= 0.
1907 : */
1908 0 : static unsigned long collect_longterm_unpinnable_pages(
1909 : struct list_head *movable_page_list,
1910 : unsigned long nr_pages,
1911 : struct page **pages)
1912 : {
1913 0 : unsigned long i, collected = 0;
1914 0 : struct folio *prev_folio = NULL;
1915 0 : bool drain_allow = true;
1916 :
1917 0 : for (i = 0; i < nr_pages; i++) {
1918 0 : struct folio *folio = page_folio(pages[i]);
1919 :
1920 0 : if (folio == prev_folio)
1921 0 : continue;
1922 0 : prev_folio = folio;
1923 :
1924 0 : if (folio_is_longterm_pinnable(folio))
1925 0 : continue;
1926 :
1927 0 : collected++;
1928 :
1929 0 : if (folio_is_device_coherent(folio))
1930 : continue;
1931 :
1932 0 : if (folio_test_hugetlb(folio)) {
1933 : isolate_hugetlb(folio, movable_page_list);
1934 : continue;
1935 : }
1936 :
1937 0 : if (!folio_test_lru(folio) && drain_allow) {
1938 0 : lru_add_drain_all();
1939 0 : drain_allow = false;
1940 : }
1941 :
1942 0 : if (!folio_isolate_lru(folio))
1943 0 : continue;
1944 :
1945 0 : list_add_tail(&folio->lru, movable_page_list);
1946 0 : node_stat_mod_folio(folio,
1947 0 : NR_ISOLATED_ANON + folio_is_file_lru(folio),
1948 : folio_nr_pages(folio));
1949 : }
1950 :
1951 0 : return collected;
1952 : }
1953 :
1954 : /*
1955 : * Unpins all pages and migrates device coherent pages and movable_page_list.
1956 : * Returns -EAGAIN if all pages were successfully migrated or -errno for failure
1957 : * (or partial success).
1958 : */
1959 0 : static int migrate_longterm_unpinnable_pages(
1960 : struct list_head *movable_page_list,
1961 : unsigned long nr_pages,
1962 : struct page **pages)
1963 : {
1964 : int ret;
1965 : unsigned long i;
1966 :
1967 0 : for (i = 0; i < nr_pages; i++) {
1968 0 : struct folio *folio = page_folio(pages[i]);
1969 :
1970 0 : if (folio_is_device_coherent(folio)) {
1971 : /*
1972 : * Migration will fail if the page is pinned, so convert
1973 : * the pin on the source page to a normal reference.
1974 : */
1975 : pages[i] = NULL;
1976 : folio_get(folio);
1977 : gup_put_folio(folio, 1, FOLL_PIN);
1978 :
1979 : if (migrate_device_coherent_page(&folio->page)) {
1980 : ret = -EBUSY;
1981 : goto err;
1982 : }
1983 :
1984 : continue;
1985 : }
1986 :
1987 : /*
1988 : * We can't migrate pages with unexpected references, so drop
1989 : * the reference obtained by __get_user_pages_locked().
1990 : * Migrating pages have been added to movable_page_list after
1991 : * calling folio_isolate_lru() which takes a reference so the
1992 : * page won't be freed if it's migrating.
1993 : */
1994 0 : unpin_user_page(pages[i]);
1995 0 : pages[i] = NULL;
1996 : }
1997 :
1998 0 : if (!list_empty(movable_page_list)) {
1999 0 : struct migration_target_control mtc = {
2000 : .nid = NUMA_NO_NODE,
2001 : .gfp_mask = GFP_USER | __GFP_NOWARN,
2002 : };
2003 :
2004 0 : if (migrate_pages(movable_page_list, alloc_migration_target,
2005 : NULL, (unsigned long)&mtc, MIGRATE_SYNC,
2006 : MR_LONGTERM_PIN, NULL)) {
2007 0 : ret = -ENOMEM;
2008 0 : goto err;
2009 : }
2010 : }
2011 :
2012 0 : putback_movable_pages(movable_page_list);
2013 :
2014 0 : return -EAGAIN;
2015 :
2016 : err:
2017 0 : for (i = 0; i < nr_pages; i++)
2018 0 : if (pages[i])
2019 0 : unpin_user_page(pages[i]);
2020 0 : putback_movable_pages(movable_page_list);
2021 :
2022 0 : return ret;
2023 : }
2024 :
2025 : /*
2026 : * Check whether all pages are *allowed* to be pinned. Rather confusingly, all
2027 : * pages in the range are required to be pinned via FOLL_PIN, before calling
2028 : * this routine.
2029 : *
2030 : * If any pages in the range are not allowed to be pinned, then this routine
2031 : * will migrate those pages away, unpin all the pages in the range and return
2032 : * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
2033 : * call this routine again.
2034 : *
2035 : * If an error other than -EAGAIN occurs, this indicates a migration failure.
2036 : * The caller should give up, and propagate the error back up the call stack.
2037 : *
2038 : * If everything is OK and all pages in the range are allowed to be pinned, then
2039 : * this routine leaves all pages pinned and returns zero for success.
2040 : */
2041 0 : static long check_and_migrate_movable_pages(unsigned long nr_pages,
2042 : struct page **pages)
2043 : {
2044 : unsigned long collected;
2045 0 : LIST_HEAD(movable_page_list);
2046 :
2047 0 : collected = collect_longterm_unpinnable_pages(&movable_page_list,
2048 : nr_pages, pages);
2049 0 : if (!collected)
2050 : return 0;
2051 :
2052 0 : return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages,
2053 : pages);
2054 : }
2055 : #else
2056 : static long check_and_migrate_movable_pages(unsigned long nr_pages,
2057 : struct page **pages)
2058 : {
2059 : return 0;
2060 : }
2061 : #endif /* CONFIG_MIGRATION */
2062 :
2063 : /*
2064 : * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
2065 : * allows us to process the FOLL_LONGTERM flag.
2066 : */
2067 0 : static long __gup_longterm_locked(struct mm_struct *mm,
2068 : unsigned long start,
2069 : unsigned long nr_pages,
2070 : struct page **pages,
2071 : struct vm_area_struct **vmas,
2072 : int *locked,
2073 : unsigned int gup_flags)
2074 : {
2075 : unsigned int flags;
2076 : long rc, nr_pinned_pages;
2077 :
2078 0 : if (!(gup_flags & FOLL_LONGTERM))
2079 : return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
2080 : locked, gup_flags);
2081 :
2082 0 : flags = memalloc_pin_save();
2083 : do {
2084 0 : nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
2085 : pages, vmas, locked,
2086 : gup_flags);
2087 0 : if (nr_pinned_pages <= 0) {
2088 : rc = nr_pinned_pages;
2089 : break;
2090 : }
2091 :
2092 : /* FOLL_LONGTERM implies FOLL_PIN */
2093 0 : rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
2094 0 : } while (rc == -EAGAIN);
2095 0 : memalloc_pin_restore(flags);
2096 0 : return rc ? rc : nr_pinned_pages;
2097 : }
2098 :
2099 : /*
2100 : * Check that the given flags are valid for the exported gup/pup interface, and
2101 : * update them with the required flags that the caller must have set.
2102 : */
2103 0 : static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
2104 : int *locked, unsigned int *gup_flags_p,
2105 : unsigned int to_set)
2106 : {
2107 0 : unsigned int gup_flags = *gup_flags_p;
2108 :
2109 : /*
2110 : * These flags are not allowed to be specified externally to the gup
2111 : * interfaces:
2112 : * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
2113 : * - FOLL_REMOTE is internal only and used on follow_page()
2114 : * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
2115 : */
2116 0 : if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE |
2117 : FOLL_REMOTE | FOLL_FAST_ONLY)))
2118 : return false;
2119 :
2120 0 : gup_flags |= to_set;
2121 0 : if (locked) {
2122 : /* At the external interface locked must be set */
2123 0 : if (WARN_ON_ONCE(*locked != 1))
2124 : return false;
2125 :
2126 0 : gup_flags |= FOLL_UNLOCKABLE;
2127 : }
2128 :
2129 : /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2130 0 : if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
2131 : (FOLL_PIN | FOLL_GET)))
2132 : return false;
2133 :
2134 : /* LONGTERM can only be specified when pinning */
2135 0 : if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))
2136 : return false;
2137 :
2138 : /* Pages input must be given if using GET/PIN */
2139 0 : if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))
2140 : return false;
2141 :
2142 : /* We want to allow the pgmap to be hot-unplugged at all times */
2143 0 : if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&
2144 : (gup_flags & FOLL_PCI_P2PDMA)))
2145 : return false;
2146 :
2147 : /*
2148 : * Can't use VMAs with locked, as locked allows GUP to unlock
2149 : * which invalidates the vmas array
2150 : */
2151 0 : if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE)))
2152 : return false;
2153 :
2154 0 : *gup_flags_p = gup_flags;
2155 0 : return true;
2156 : }
2157 :
2158 : #ifdef CONFIG_MMU
2159 : /**
2160 : * get_user_pages_remote() - pin user pages in memory
2161 : * @mm: mm_struct of target mm
2162 : * @start: starting user address
2163 : * @nr_pages: number of pages from start to pin
2164 : * @gup_flags: flags modifying lookup behaviour
2165 : * @pages: array that receives pointers to the pages pinned.
2166 : * Should be at least nr_pages long. Or NULL, if caller
2167 : * only intends to ensure the pages are faulted in.
2168 : * @vmas: array of pointers to vmas corresponding to each page.
2169 : * Or NULL if the caller does not require them.
2170 : * @locked: pointer to lock flag indicating whether lock is held and
2171 : * subsequently whether VM_FAULT_RETRY functionality can be
2172 : * utilised. Lock must initially be held.
2173 : *
2174 : * Returns either number of pages pinned (which may be less than the
2175 : * number requested), or an error. Details about the return value:
2176 : *
2177 : * -- If nr_pages is 0, returns 0.
2178 : * -- If nr_pages is >0, but no pages were pinned, returns -errno.
2179 : * -- If nr_pages is >0, and some pages were pinned, returns the number of
2180 : * pages pinned. Again, this may be less than nr_pages.
2181 : *
2182 : * The caller is responsible for releasing returned @pages, via put_page().
2183 : *
2184 : * @vmas are valid only as long as mmap_lock is held.
2185 : *
2186 : * Must be called with mmap_lock held for read or write.
2187 : *
2188 : * get_user_pages_remote walks a process's page tables and takes a reference
2189 : * to each struct page that each user address corresponds to at a given
2190 : * instant. That is, it takes the page that would be accessed if a user
2191 : * thread accesses the given user virtual address at that instant.
2192 : *
2193 : * This does not guarantee that the page exists in the user mappings when
2194 : * get_user_pages_remote returns, and there may even be a completely different
2195 : * page there in some cases (e.g. if mmapped pagecache has been invalidated
2196 : * and subsequently re-faulted). However, it does guarantee that the page
2197 : * won't be freed completely. And mostly callers simply care that the page
2198 : * contains data that was valid *at some point in time*. Typically, an IO
2199 : * or similar operation cannot guarantee anything stronger anyway because
2200 : * locks can't be held over the syscall boundary.
2201 : *
2202 : * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
2203 : * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
2204 : * be called after the page is finished with, and before put_page is called.
2205 : *
2206 : * get_user_pages_remote is typically used for fewer-copy IO operations,
2207 : * to get a handle on the memory by some means other than accesses
2208 : * via the user virtual addresses. The pages may be submitted for
2209 : * DMA to devices or accessed via their kernel linear mapping (via the
2210 : * kmap APIs). Care should be taken to use the correct cache flushing APIs.
2211 : *
2212 : * See also get_user_pages_fast, for performance critical applications.
2213 : *
2214 : * get_user_pages_remote should be phased out in favor of
2215 : * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
2216 : * should use get_user_pages_remote because it cannot pass
2217 : * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
2218 : */
2219 0 : long get_user_pages_remote(struct mm_struct *mm,
2220 : unsigned long start, unsigned long nr_pages,
2221 : unsigned int gup_flags, struct page **pages,
2222 : struct vm_area_struct **vmas, int *locked)
2223 : {
2224 0 : int local_locked = 1;
2225 :
2226 0 : if (!is_valid_gup_args(pages, vmas, locked, &gup_flags,
2227 : FOLL_TOUCH | FOLL_REMOTE))
2228 : return -EINVAL;
2229 :
2230 0 : return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
2231 : locked ? locked : &local_locked,
2232 : gup_flags);
2233 : }
2234 : EXPORT_SYMBOL(get_user_pages_remote);
2235 :
2236 : #else /* CONFIG_MMU */
2237 : long get_user_pages_remote(struct mm_struct *mm,
2238 : unsigned long start, unsigned long nr_pages,
2239 : unsigned int gup_flags, struct page **pages,
2240 : struct vm_area_struct **vmas, int *locked)
2241 : {
2242 : return 0;
2243 : }
2244 : #endif /* !CONFIG_MMU */
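
/*
 * Illustrative sketch, not part of gup.c: a minimal get_user_pages_remote()
 * caller following the rules in the kerneldoc above -- mmap_lock is held
 * across the call, a page that was written to is dirtied with
 * set_page_dirty_lock() after use, and the reference is dropped with
 * put_page(). This loosely mirrors ptrace-style remote access; the helper
 * itself is hypothetical, kmap_local_page() needs <linux/highmem.h>, and
 * real code would also handle cache flushing (e.g. via copy_to_user_page()).
 */
static long example_poke_remote(struct mm_struct *mm, unsigned long addr,
				const void *src, size_t len)
{
	unsigned int gup_flags = FOLL_WRITE | FOLL_FORCE;
	struct page *page;
	int locked = 1;
	void *kaddr;
	long got;

	len = min_t(size_t, len, PAGE_SIZE - offset_in_page(addr));

	mmap_read_lock(mm);
	got = get_user_pages_remote(mm, addr & PAGE_MASK, 1, gup_flags,
				    &page, NULL, &locked);
	if (got == 1) {
		kaddr = kmap_local_page(page);
		memcpy(kaddr + offset_in_page(addr), src, len);
		kunmap_local(kaddr);
		set_page_dirty_lock(page);
		put_page(page);
	}
	/* GUP may have dropped the lock; only unlock if it is still held. */
	if (locked)
		mmap_read_unlock(mm);
	return got == 1 ? (long)len : (got < 0 ? got : -EFAULT);
}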
2245 :
2246 : /**
2247 : * get_user_pages() - pin user pages in memory
2248 : * @start: starting user address
2249 : * @nr_pages: number of pages from start to pin
2250 : * @gup_flags: flags modifying lookup behaviour
2251 : * @pages: array that receives pointers to the pages pinned.
2252 : * Should be at least nr_pages long. Or NULL, if caller
2253 : * only intends to ensure the pages are faulted in.
2254 : * @vmas: array of pointers to vmas corresponding to each page.
2255 : * Or NULL if the caller does not require them.
2256 : *
2257 : * This is the same as get_user_pages_remote(), just with a less-flexible
2258 : * calling convention where we assume that the mm being operated on belongs to
2259 : * the current task, and doesn't allow passing of a locked parameter. We also
2260 : * obviously don't pass FOLL_REMOTE in here.
2261 : */
2262 0 : long get_user_pages(unsigned long start, unsigned long nr_pages,
2263 : unsigned int gup_flags, struct page **pages,
2264 : struct vm_area_struct **vmas)
2265 : {
2266 0 : int locked = 1;
2267 :
2268 0 : if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH))
2269 : return -EINVAL;
2270 :
2271 0 : return __get_user_pages_locked(current->mm, start, nr_pages, pages,
2272 : vmas, &locked, gup_flags);
2273 : }
2274 : EXPORT_SYMBOL(get_user_pages);
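
/*
 * Illustrative sketch, not part of gup.c: like the remote variant,
 * get_user_pages() expects the caller to hold current->mm's mmap_lock, and
 * every page it returns must eventually be released with put_page(). The
 * helper is hypothetical.
 */
static long example_get_one_page(unsigned long addr, struct page **page)
{
	long got;

	mmap_read_lock(current->mm);
	got = get_user_pages(addr & PAGE_MASK, 1, FOLL_WRITE, page, NULL);
	mmap_read_unlock(current->mm);

	return got;	/* 1 on success; the caller put_page()s when done */
}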
2275 :
2276 : /*
2277 : * get_user_pages_unlocked() is suitable to replace the form:
2278 : *
2279 : * mmap_read_lock(mm);
2280 : * get_user_pages(mm, ..., pages, NULL);
2281 : * mmap_read_unlock(mm);
2282 : *
2283 : * with:
2284 : *
2285 : * get_user_pages_unlocked(mm, ..., pages);
2286 : *
2287 : * It is functionally equivalent to get_user_pages_fast so
2288 : * get_user_pages_fast should be used instead if specific gup_flags
2289 : * (e.g. FOLL_FORCE) are not required.
2290 : */
2291 0 : long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2292 : struct page **pages, unsigned int gup_flags)
2293 : {
2294 0 : int locked = 0;
2295 :
2296 0 : if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
2297 : FOLL_TOUCH | FOLL_UNLOCKABLE))
2298 : return -EINVAL;
2299 :
2300 0 : return __get_user_pages_locked(current->mm, start, nr_pages, pages,
2301 : NULL, &locked, gup_flags);
2302 : }
2303 : EXPORT_SYMBOL(get_user_pages_unlocked);
2304 :
2305 : /*
2306 : * Fast GUP
2307 : *
2308 : * get_user_pages_fast attempts to pin user pages by walking the page
2309 : * tables directly and avoids taking locks. Thus the walker needs to be
2310 : * protected from page table pages being freed from under it, and should
2311 : * block any THP splits.
2312 : *
2313 : * One way to achieve this is to have the walker disable interrupts, and
2314 : * rely on IPIs from the TLB flushing code blocking before the page table
2315 : * pages are freed. This is unsuitable for architectures that do not need
2316 : * to broadcast an IPI when invalidating TLBs.
2317 : *
2318 : * Another way to achieve this is to batch up page table containing pages
2319 : * belonging to more than one mm_user, then rcu_sched a callback to free those
2320 : * pages. Disabling interrupts will allow the fast_gup walker to both block
2321 : * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
2322 : * (which is a relatively rare event). The code below adopts this strategy.
2323 : *
2324 : * Before activating this code, please be aware that the following assumptions
2325 : * are currently made:
2326 : *
2327 : * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
2328 : * free pages containing page tables or TLB flushing requires IPI broadcast.
2329 : *
2330 : * *) ptes can be read atomically by the architecture.
2331 : *
2332 : * *) access_ok is sufficient to validate userspace address ranges.
2333 : *
2334 : * The last two assumptions can be relaxed by the addition of helper functions.
2335 : *
2336 : * This code is based heavily on the PowerPC implementation by Nick Piggin.
2337 : */
2338 : #ifdef CONFIG_HAVE_FAST_GUP
2339 :
2340 : static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
2341 : unsigned int flags,
2342 : struct page **pages)
2343 : {
2344 : while ((*nr) - nr_start) {
2345 : struct page *page = pages[--(*nr)];
2346 :
2347 : ClearPageReferenced(page);
2348 : if (flags & FOLL_PIN)
2349 : unpin_user_page(page);
2350 : else
2351 : put_page(page);
2352 : }
2353 : }
2354 :
2355 : #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
2356 : /*
2357 : * Fast-gup relies on pte change detection to avoid concurrent pgtable
2358 : * operations.
2359 : *
2360 : * To pin the page, fast-gup needs to do the following in order:
2361 : * (1) pin the page (by prefetching pte), then (2) check pte not changed.
2362 : *
2363 : * For the rest of pgtable operations where pgtable updates can be racy
2364 : * with fast-gup, we need to do (1) clear pte, then (2) check whether page
2365 : * is pinned.
2366 : *
2367 : * Above will work for all pte-level operations, including THP split.
2368 : *
2369 : * For THP collapse, it's a bit more complicated because fast-gup may be
2370 : * walking a pgtable page that is being freed (pte is still valid but pmd
2371 : * can be cleared already). To avoid race in such condition, we need to
2372 : * also check pmd here to make sure pmd doesn't change (corresponds to
2373 : * pmdp_collapse_flush() in the THP collapse code path).
2374 : */
2375 : static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
2376 : unsigned long end, unsigned int flags,
2377 : struct page **pages, int *nr)
2378 : {
2379 : struct dev_pagemap *pgmap = NULL;
2380 : int nr_start = *nr, ret = 0;
2381 : pte_t *ptep, *ptem;
2382 :
2383 : ptem = ptep = pte_offset_map(&pmd, addr);
2384 : do {
2385 : pte_t pte = ptep_get_lockless(ptep);
2386 : struct page *page;
2387 : struct folio *folio;
2388 :
2389 : if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
2390 : goto pte_unmap;
2391 :
2392 : if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2393 : goto pte_unmap;
2394 :
2395 : if (pte_devmap(pte)) {
2396 : if (unlikely(flags & FOLL_LONGTERM))
2397 : goto pte_unmap;
2398 :
2399 : pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
2400 : if (unlikely(!pgmap)) {
2401 : undo_dev_pagemap(nr, nr_start, flags, pages);
2402 : goto pte_unmap;
2403 : }
2404 : } else if (pte_special(pte))
2405 : goto pte_unmap;
2406 :
2407 : VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2408 : page = pte_page(pte);
2409 :
2410 : folio = try_grab_folio(page, 1, flags);
2411 : if (!folio)
2412 : goto pte_unmap;
2413 :
2414 : if (unlikely(page_is_secretmem(page))) {
2415 : gup_put_folio(folio, 1, flags);
2416 : goto pte_unmap;
2417 : }
2418 :
2419 : if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
2420 : unlikely(pte_val(pte) != pte_val(*ptep))) {
2421 : gup_put_folio(folio, 1, flags);
2422 : goto pte_unmap;
2423 : }
2424 :
2425 : if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
2426 : gup_put_folio(folio, 1, flags);
2427 : goto pte_unmap;
2428 : }
2429 :
2430 : /*
2431 : * We need to make the page accessible if and only if we are
2432 : * going to access its content (the FOLL_PIN case). Please
2433 : * see Documentation/core-api/pin_user_pages.rst for
2434 : * details.
2435 : */
2436 : if (flags & FOLL_PIN) {
2437 : ret = arch_make_page_accessible(page);
2438 : if (ret) {
2439 : gup_put_folio(folio, 1, flags);
2440 : goto pte_unmap;
2441 : }
2442 : }
2443 : folio_set_referenced(folio);
2444 : pages[*nr] = page;
2445 : (*nr)++;
2446 : } while (ptep++, addr += PAGE_SIZE, addr != end);
2447 :
2448 : ret = 1;
2449 :
2450 : pte_unmap:
2451 : if (pgmap)
2452 : put_dev_pagemap(pgmap);
2453 : pte_unmap(ptem);
2454 : return ret;
2455 : }
2456 : #else
2457 :
2458 : /*
2459 : * If we can't determine whether or not a pte is special, then fail immediately
2460 : * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
2461 : * to be special.
2462 : *
2463 : * For a futex to be placed on a THP tail page, get_futex_key requires a
2464 : * get_user_pages_fast_only implementation that can pin pages. Thus it's still
2465 : * useful to have gup_huge_pmd even if we can't operate on ptes.
2466 : */
2467 : static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
2468 : unsigned long end, unsigned int flags,
2469 : struct page **pages, int *nr)
2470 : {
2471 : return 0;
2472 : }
2473 : #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
2474 :
2475 : #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
2476 : static int __gup_device_huge(unsigned long pfn, unsigned long addr,
2477 : unsigned long end, unsigned int flags,
2478 : struct page **pages, int *nr)
2479 : {
2480 : int nr_start = *nr;
2481 : struct dev_pagemap *pgmap = NULL;
2482 :
2483 : do {
2484 : struct page *page = pfn_to_page(pfn);
2485 :
2486 : pgmap = get_dev_pagemap(pfn, pgmap);
2487 : if (unlikely(!pgmap)) {
2488 : undo_dev_pagemap(nr, nr_start, flags, pages);
2489 : break;
2490 : }
2491 :
2492 : if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
2493 : undo_dev_pagemap(nr, nr_start, flags, pages);
2494 : break;
2495 : }
2496 :
2497 : SetPageReferenced(page);
2498 : pages[*nr] = page;
2499 : if (unlikely(try_grab_page(page, flags))) {
2500 : undo_dev_pagemap(nr, nr_start, flags, pages);
2501 : break;
2502 : }
2503 : (*nr)++;
2504 : pfn++;
2505 : } while (addr += PAGE_SIZE, addr != end);
2506 :
2507 : put_dev_pagemap(pgmap);
2508 : return addr == end;
2509 : }
2510 :
2511 : static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2512 : unsigned long end, unsigned int flags,
2513 : struct page **pages, int *nr)
2514 : {
2515 : unsigned long fault_pfn;
2516 : int nr_start = *nr;
2517 :
2518 : fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
2519 : if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
2520 : return 0;
2521 :
2522 : if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
2523 : undo_dev_pagemap(nr, nr_start, flags, pages);
2524 : return 0;
2525 : }
2526 : return 1;
2527 : }
2528 :
2529 : static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
2530 : unsigned long end, unsigned int flags,
2531 : struct page **pages, int *nr)
2532 : {
2533 : unsigned long fault_pfn;
2534 : int nr_start = *nr;
2535 :
2536 : fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
2537 : if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
2538 : return 0;
2539 :
2540 : if (unlikely(pud_val(orig) != pud_val(*pudp))) {
2541 : undo_dev_pagemap(nr, nr_start, flags, pages);
2542 : return 0;
2543 : }
2544 : return 1;
2545 : }
2546 : #else
2547 : static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2548 : unsigned long end, unsigned int flags,
2549 : struct page **pages, int *nr)
2550 : {
2551 : BUILD_BUG();
2552 : return 0;
2553 : }
2554 :
2555 : static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
2556 : unsigned long end, unsigned int flags,
2557 : struct page **pages, int *nr)
2558 : {
2559 : BUILD_BUG();
2560 : return 0;
2561 : }
2562 : #endif
2563 :
2564 : static int record_subpages(struct page *page, unsigned long addr,
2565 : unsigned long end, struct page **pages)
2566 : {
2567 : int nr;
2568 :
2569 : for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
2570 : pages[nr] = nth_page(page, nr);
2571 :
2572 : return nr;
2573 : }
2574 :
2575 : #ifdef CONFIG_ARCH_HAS_HUGEPD
2576 : static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
2577 : unsigned long sz)
2578 : {
2579 : unsigned long __boundary = (addr + sz) & ~(sz-1);
2580 : return (__boundary - 1 < end - 1) ? __boundary : end;
2581 : }
2582 :
2583 : static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
2584 : unsigned long end, unsigned int flags,
2585 : struct page **pages, int *nr)
2586 : {
2587 : unsigned long pte_end;
2588 : struct page *page;
2589 : struct folio *folio;
2590 : pte_t pte;
2591 : int refs;
2592 :
2593 : pte_end = (addr + sz) & ~(sz-1);
2594 : if (pte_end < end)
2595 : end = pte_end;
2596 :
2597 : pte = huge_ptep_get(ptep);
2598 :
2599 : if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2600 : return 0;
2601 :
2602 : /* hugepages are never "special" */
2603 : VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2604 :
2605 : page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
2606 : refs = record_subpages(page, addr, end, pages + *nr);
2607 :
2608 : folio = try_grab_folio(page, refs, flags);
2609 : if (!folio)
2610 : return 0;
2611 :
2612 : if (unlikely(pte_val(pte) != pte_val(*ptep))) {
2613 : gup_put_folio(folio, refs, flags);
2614 : return 0;
2615 : }
2616 :
2617 : if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
2618 : gup_put_folio(folio, refs, flags);
2619 : return 0;
2620 : }
2621 :
2622 : *nr += refs;
2623 : folio_set_referenced(folio);
2624 : return 1;
2625 : }
2626 :
2627 : static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2628 : unsigned int pdshift, unsigned long end, unsigned int flags,
2629 : struct page **pages, int *nr)
2630 : {
2631 : pte_t *ptep;
2632 : unsigned long sz = 1UL << hugepd_shift(hugepd);
2633 : unsigned long next;
2634 :
2635 : ptep = hugepte_offset(hugepd, addr, pdshift);
2636 : do {
2637 : next = hugepte_addr_end(addr, end, sz);
2638 : if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
2639 : return 0;
2640 : } while (ptep++, addr = next, addr != end);
2641 :
2642 : return 1;
2643 : }
2644 : #else
2645 : static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2646 : unsigned int pdshift, unsigned long end, unsigned int flags,
2647 : struct page **pages, int *nr)
2648 : {
2649 : return 0;
2650 : }
2651 : #endif /* CONFIG_ARCH_HAS_HUGEPD */
2652 :
2653 : static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
2654 : unsigned long end, unsigned int flags,
2655 : struct page **pages, int *nr)
2656 : {
2657 : struct page *page;
2658 : struct folio *folio;
2659 : int refs;
2660 :
2661 : if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
2662 : return 0;
2663 :
2664 : if (pmd_devmap(orig)) {
2665 : if (unlikely(flags & FOLL_LONGTERM))
2666 : return 0;
2667 : return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
2668 : pages, nr);
2669 : }
2670 :
2671 : page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
2672 : refs = record_subpages(page, addr, end, pages + *nr);
2673 :
2674 : folio = try_grab_folio(page, refs, flags);
2675 : if (!folio)
2676 : return 0;
2677 :
2678 : if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
2679 : gup_put_folio(folio, refs, flags);
2680 : return 0;
2681 : }
2682 :
2683 : if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
2684 : gup_put_folio(folio, refs, flags);
2685 : return 0;
2686 : }
2687 :
2688 : *nr += refs;
2689 : folio_set_referenced(folio);
2690 : return 1;
2691 : }
2692 :
2693 : static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
2694 : unsigned long end, unsigned int flags,
2695 : struct page **pages, int *nr)
2696 : {
2697 : struct page *page;
2698 : struct folio *folio;
2699 : int refs;
2700 :
2701 : if (!pud_access_permitted(orig, flags & FOLL_WRITE))
2702 : return 0;
2703 :
2704 : if (pud_devmap(orig)) {
2705 : if (unlikely(flags & FOLL_LONGTERM))
2706 : return 0;
2707 : return __gup_device_huge_pud(orig, pudp, addr, end, flags,
2708 : pages, nr);
2709 : }
2710 :
2711 : page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
2712 : refs = record_subpages(page, addr, end, pages + *nr);
2713 :
2714 : folio = try_grab_folio(page, refs, flags);
2715 : if (!folio)
2716 : return 0;
2717 :
2718 : if (unlikely(pud_val(orig) != pud_val(*pudp))) {
2719 : gup_put_folio(folio, refs, flags);
2720 : return 0;
2721 : }
2722 :
2723 : if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
2724 : gup_put_folio(folio, refs, flags);
2725 : return 0;
2726 : }
2727 :
2728 : *nr += refs;
2729 : folio_set_referenced(folio);
2730 : return 1;
2731 : }
2732 :
2733 : static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
2734 : unsigned long end, unsigned int flags,
2735 : struct page **pages, int *nr)
2736 : {
2737 : int refs;
2738 : struct page *page;
2739 : struct folio *folio;
2740 :
2741 : if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
2742 : return 0;
2743 :
2744 : BUILD_BUG_ON(pgd_devmap(orig));
2745 :
2746 : page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
2747 : refs = record_subpages(page, addr, end, pages + *nr);
2748 :
2749 : folio = try_grab_folio(page, refs, flags);
2750 : if (!folio)
2751 : return 0;
2752 :
2753 : if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
2754 : gup_put_folio(folio, refs, flags);
2755 : return 0;
2756 : }
2757 :
2758 : *nr += refs;
2759 : folio_set_referenced(folio);
2760 : return 1;
2761 : }
2762 :
2763 : static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
2764 : unsigned int flags, struct page **pages, int *nr)
2765 : {
2766 : unsigned long next;
2767 : pmd_t *pmdp;
2768 :
2769 : pmdp = pmd_offset_lockless(pudp, pud, addr);
2770 : do {
2771 : pmd_t pmd = pmdp_get_lockless(pmdp);
2772 :
2773 : next = pmd_addr_end(addr, end);
2774 : if (!pmd_present(pmd))
2775 : return 0;
2776 :
2777 : if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
2778 : pmd_devmap(pmd))) {
2779 : if (pmd_protnone(pmd) &&
2780 : !gup_can_follow_protnone(flags))
2781 : return 0;
2782 :
2783 : if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
2784 : pages, nr))
2785 : return 0;
2786 :
2787 : } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
2788 : /*
2789 : * architectures can have different formats for the
2790 : * hugetlbfs pmd and the THP pmd
2791 : */
2792 : if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
2793 : PMD_SHIFT, next, flags, pages, nr))
2794 : return 0;
2795 : } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
2796 : return 0;
2797 : } while (pmdp++, addr = next, addr != end);
2798 :
2799 : return 1;
2800 : }
2801 :
2802 : static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
2803 : unsigned int flags, struct page **pages, int *nr)
2804 : {
2805 : unsigned long next;
2806 : pud_t *pudp;
2807 :
2808 : pudp = pud_offset_lockless(p4dp, p4d, addr);
2809 : do {
2810 : pud_t pud = READ_ONCE(*pudp);
2811 :
2812 : next = pud_addr_end(addr, end);
2813 : if (unlikely(!pud_present(pud)))
2814 : return 0;
2815 : if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
2816 : if (!gup_huge_pud(pud, pudp, addr, next, flags,
2817 : pages, nr))
2818 : return 0;
2819 : } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
2820 : if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
2821 : PUD_SHIFT, next, flags, pages, nr))
2822 : return 0;
2823 : } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
2824 : return 0;
2825 : } while (pudp++, addr = next, addr != end);
2826 :
2827 : return 1;
2828 : }
2829 :
2830 : static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
2831 : unsigned int flags, struct page **pages, int *nr)
2832 : {
2833 : unsigned long next;
2834 : p4d_t *p4dp;
2835 :
2836 : p4dp = p4d_offset_lockless(pgdp, pgd, addr);
2837 : do {
2838 : p4d_t p4d = READ_ONCE(*p4dp);
2839 :
2840 : next = p4d_addr_end(addr, end);
2841 : if (p4d_none(p4d))
2842 : return 0;
2843 : BUILD_BUG_ON(p4d_huge(p4d));
2844 : if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
2845 : if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
2846 : P4D_SHIFT, next, flags, pages, nr))
2847 : return 0;
2848 : } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
2849 : return 0;
2850 : } while (p4dp++, addr = next, addr != end);
2851 :
2852 : return 1;
2853 : }
2854 :
2855 : static void gup_pgd_range(unsigned long addr, unsigned long end,
2856 : unsigned int flags, struct page **pages, int *nr)
2857 : {
2858 : unsigned long next;
2859 : pgd_t *pgdp;
2860 :
2861 : pgdp = pgd_offset(current->mm, addr);
2862 : do {
2863 : pgd_t pgd = READ_ONCE(*pgdp);
2864 :
2865 : next = pgd_addr_end(addr, end);
2866 : if (pgd_none(pgd))
2867 : return;
2868 : if (unlikely(pgd_huge(pgd))) {
2869 : if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
2870 : pages, nr))
2871 : return;
2872 : } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
2873 : if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
2874 : PGDIR_SHIFT, next, flags, pages, nr))
2875 : return;
2876 : } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
2877 : return;
2878 : } while (pgdp++, addr = next, addr != end);
2879 : }
2880 : #else
2881 : static inline void gup_pgd_range(unsigned long addr, unsigned long end,
2882 : unsigned int flags, struct page **pages, int *nr)
2883 : {
2884 : }
2885 : #endif /* CONFIG_HAVE_FAST_GUP */
2886 :
2887 : #ifndef gup_fast_permitted
2888 : /*
2889 : * Check if it's allowed to use get_user_pages_fast_only() for the range, or
2890 : * we need to fall back to the slow version:
2891 : */
2892 : static bool gup_fast_permitted(unsigned long start, unsigned long end)
2893 : {
2894 : return true;
2895 : }
2896 : #endif
2897 :
2898 : static unsigned long lockless_pages_from_mm(unsigned long start,
2899 : unsigned long end,
2900 : unsigned int gup_flags,
2901 : struct page **pages)
2902 : {
2903 : unsigned long flags;
2904 0 : int nr_pinned = 0;
2905 : unsigned seq;
2906 :
2907 : if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
2908 : !gup_fast_permitted(start, end))
2909 : return 0;
2910 :
2911 : if (gup_flags & FOLL_PIN) {
2912 : seq = raw_read_seqcount(&current->mm->write_protect_seq);
2913 : if (seq & 1)
2914 : return 0;
2915 : }
2916 :
2917 : /*
2918 : * Disable interrupts. The nested form is used, in order to allow full,
2919 : * general purpose use of this routine.
2920 : *
2921 : * With interrupts disabled, we block page table pages from being freed
2922 : * from under us. See struct mmu_table_batch comments in
2923 : * include/asm-generic/tlb.h for more details.
2924 : *
2925 : * We do not adopt an rcu_read_lock() here as we also want to block IPIs
2926 : * that come from THPs splitting.
2927 : */
2928 : local_irq_save(flags);
2929 : gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
2930 : local_irq_restore(flags);
2931 :
2932 : /*
2933 : * When pinning pages for DMA there could be a concurrent write protect
2934 : * from fork() via copy_page_range(), in this case always fail fast GUP.
2935 : */
2936 : if (gup_flags & FOLL_PIN) {
2937 : if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
2938 : unpin_user_pages_lockless(pages, nr_pinned);
2939 : return 0;
2940 : } else {
2941 : sanity_check_pinned_pages(pages, nr_pinned);
2942 : }
2943 : }
2944 : return nr_pinned;
2945 : }
2946 :
2947 0 : static int internal_get_user_pages_fast(unsigned long start,
2948 : unsigned long nr_pages,
2949 : unsigned int gup_flags,
2950 : struct page **pages)
2951 : {
2952 : unsigned long len, end;
2953 : unsigned long nr_pinned;
2954 0 : int locked = 0;
2955 : int ret;
2956 :
2957 0 : if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
2958 : FOLL_FORCE | FOLL_PIN | FOLL_GET |
2959 : FOLL_FAST_ONLY | FOLL_NOFAULT |
2960 : FOLL_PCI_P2PDMA)))
2961 : return -EINVAL;
2962 :
2963 0 : if (gup_flags & FOLL_PIN)
2964 0 : mm_set_has_pinned_flag(&current->mm->flags);
2965 :
2966 0 : if (!(gup_flags & FOLL_FAST_ONLY))
2967 : might_lock_read(&current->mm->mmap_lock);
2968 :
2969 0 : start = untagged_addr(start) & PAGE_MASK;
2970 0 : len = nr_pages << PAGE_SHIFT;
2971 0 : if (check_add_overflow(start, len, &end))
2972 : return 0;
2973 0 : if (unlikely(!access_ok((void __user *)start, len)))
2974 : return -EFAULT;
2975 :
2976 0 : nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
2977 0 : if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
2978 : return nr_pinned;
2979 :
2980 : /* Slow path: try to get the remaining pages with get_user_pages */
2981 0 : start += nr_pinned << PAGE_SHIFT;
2982 0 : pages += nr_pinned;
2983 0 : ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
2984 : pages, NULL, &locked,
2985 : gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
2986 : if (ret < 0) {
2987 : /*
2988 : * The caller has to unpin the pages we already pinned so
2989 : * returning -errno is not an option
2990 : */
2991 : if (nr_pinned)
2992 : return nr_pinned;
2993 : return ret;
2994 : }
2995 : return ret + nr_pinned;
2996 : }
2997 :
2998 : /**
2999 : * get_user_pages_fast_only() - pin user pages in memory
3000 : * @start: starting user address
3001 : * @nr_pages: number of pages from start to pin
3002 : * @gup_flags: flags modifying pin behaviour
3003 : * @pages: array that receives pointers to the pages pinned.
3004 : * Should be at least nr_pages long.
3005 : *
3006 : * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
3007 : * the regular GUP.
3008 : *
3009 : * If the architecture does not support this function, simply return with no
3010 : * pages pinned.
3011 : *
3012 : * Careful, careful! COW breaking can go either way, so a non-write
3013 : * access can get ambiguous page results. If you call this function without
3014 : * 'write' set, you'd better be sure that you're ok with that ambiguity.
3015 : */
3016 0 : int get_user_pages_fast_only(unsigned long start, int nr_pages,
3017 : unsigned int gup_flags, struct page **pages)
3018 : {
3019 : /*
3020 : * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
3021 : * because gup fast is always a "pin with a +1 page refcount" request.
3022 : *
3023 : * FOLL_FAST_ONLY is required in order to match the API description of
3024 : * this routine: no fall back to regular ("slow") GUP.
3025 : */
3026 0 : if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
3027 : FOLL_GET | FOLL_FAST_ONLY))
3028 : return -EINVAL;
3029 :
3030 0 : return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
3031 : }
3032 : EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
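
/*
 * Illustrative sketch, not part of gup.c: because it never falls back to the
 * sleeping slow path, get_user_pages_fast_only() can be tried from contexts
 * that cannot take mmap_lock; on failure the caller simply defers to a
 * context where the regular GUP variants are usable. The helper is
 * hypothetical.
 */
static struct page *example_try_pin_nowait(unsigned long addr)
{
	struct page *page;

	if (get_user_pages_fast_only(addr & PAGE_MASK, 1, FOLL_WRITE, &page) == 1)
		return page;	/* caller must put_page() when done */

	return NULL;		/* caller falls back to a sleepable context */
}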
3033 :
3034 : /**
3035 : * get_user_pages_fast() - pin user pages in memory
3036 : * @start: starting user address
3037 : * @nr_pages: number of pages from start to pin
3038 : * @gup_flags: flags modifying pin behaviour
3039 : * @pages: array that receives pointers to the pages pinned.
3040 : * Should be at least nr_pages long.
3041 : *
3042 : * Attempt to pin user pages in memory without taking mm->mmap_lock.
3043 : * If not successful, it will fall back to taking the lock and
3044 : * calling get_user_pages().
3045 : *
3046 : * Returns number of pages pinned. This may be fewer than the number requested.
3047 : * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
3048 : * -errno.
3049 : */
3050 0 : int get_user_pages_fast(unsigned long start, int nr_pages,
3051 : unsigned int gup_flags, struct page **pages)
3052 : {
3053 : /*
3054 : * The caller may or may not have explicitly set FOLL_GET; either way is
3055 : * OK. However, internally (within mm/gup.c), gup fast variants must set
3056 : * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
3057 : * request.
3058 : */
3059 0 : if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET))
3060 : return -EINVAL;
3061 0 : return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
3062 : }
3063 : EXPORT_SYMBOL_GPL(get_user_pages_fast);
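
/*
 * Illustrative sketch, not part of gup.c: a common get_user_pages_fast()
 * pattern -- pin every page backing a user buffer, and on a partial result
 * release what was pinned and fail. The helper is hypothetical; @pages is
 * assumed to have room for the whole range and @len to be non-zero.
 */
static int example_get_buffer_pages(unsigned long uaddr, size_t len,
				    struct page **pages)
{
	unsigned long first = uaddr >> PAGE_SHIFT;
	unsigned long last = (uaddr + len - 1) >> PAGE_SHIFT;
	int nr_pages = last - first + 1;
	int got, i;

	got = get_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (got == nr_pages)
		return nr_pages;

	/* Partial pin (or error): drop any references we did take. */
	for (i = 0; i < got; i++)
		put_page(pages[i]);
	return got < 0 ? got : -EFAULT;
}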
3064 :
3065 : /**
3066 : * pin_user_pages_fast() - pin user pages in memory without taking locks
3067 : *
3068 : * @start: starting user address
3069 : * @nr_pages: number of pages from start to pin
3070 : * @gup_flags: flags modifying pin behaviour
3071 : * @pages: array that receives pointers to the pages pinned.
3072 : * Should be at least nr_pages long.
3073 : *
3074 : * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
3075 : * get_user_pages_fast() for documentation on the function arguments, because
3076 : * the arguments here are identical.
3077 : *
3078 : * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
3079 : * see Documentation/core-api/pin_user_pages.rst for further details.
3080 : */
3081 0 : int pin_user_pages_fast(unsigned long start, int nr_pages,
3082 : unsigned int gup_flags, struct page **pages)
3083 : {
3084 0 : if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN))
3085 : return -EINVAL;
3086 0 : return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
3087 : }
3088 : EXPORT_SYMBOL_GPL(pin_user_pages_fast);
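
/*
 * Illustrative sketch, not part of gup.c: the FOLL_PIN life cycle for a
 * buffer a device will DMA into -- pin with FOLL_LONGTERM, run the transfer,
 * then mark the pages dirty and drop the pins in one call with
 * unpin_user_pages_dirty_lock(). The helper and the DMA step are
 * hypothetical; the pin/unpin calls are the real API.
 */
static int example_pin_for_dma(unsigned long uaddr, int nr_pages,
			       struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast(uaddr, nr_pages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned != nr_pages) {
		if (pinned > 0)
			unpin_user_pages(pages, pinned);
		return pinned < 0 ? pinned : -EFAULT;
	}

	/* ... map for DMA and run the transfer here ... */

	/* The device wrote to the pages: dirty them and drop the pins. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}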
3089 :
3090 : /**
3091 : * pin_user_pages_remote() - pin pages of a remote process
3092 : *
3093 : * @mm: mm_struct of target mm
3094 : * @start: starting user address
3095 : * @nr_pages: number of pages from start to pin
3096 : * @gup_flags: flags modifying lookup behaviour
3097 : * @pages: array that receives pointers to the pages pinned.
3098 : * Should be at least nr_pages long.
3099 : * @vmas: array of pointers to vmas corresponding to each page.
3100 : * Or NULL if the caller does not require them.
3101 : * @locked: pointer to lock flag indicating whether lock is held and
3102 : * subsequently whether VM_FAULT_RETRY functionality can be
3103 : * utilised. Lock must initially be held.
3104 : *
3105 : * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
3106 : * get_user_pages_remote() for documentation on the function arguments, because
3107 : * the arguments here are identical.
3108 : *
3109 : * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
3110 : * see Documentation/core-api/pin_user_pages.rst for details.
3111 : */
3112 0 : long pin_user_pages_remote(struct mm_struct *mm,
3113 : unsigned long start, unsigned long nr_pages,
3114 : unsigned int gup_flags, struct page **pages,
3115 : struct vm_area_struct **vmas, int *locked)
3116 : {
3117 0 : int local_locked = 1;
3118 :
3119 0 : if (!is_valid_gup_args(pages, vmas, locked, &gup_flags,
3120 : FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
3121 : return 0;
3122 0 : return __gup_longterm_locked(mm, start, nr_pages, pages, vmas,
3123 : locked ? locked : &local_locked,
3124 : gup_flags);
3125 : }
3126 : EXPORT_SYMBOL(pin_user_pages_remote);
3127 :
3128 : /**
3129 : * pin_user_pages() - pin user pages in memory for use by other devices
3130 : *
3131 : * @start: starting user address
3132 : * @nr_pages: number of pages from start to pin
3133 : * @gup_flags: flags modifying lookup behaviour
3134 : * @pages: array that receives pointers to the pages pinned.
3135 : * Should be at least nr_pages long.
3136 : * @vmas: array of pointers to vmas corresponding to each page.
3137 : * Or NULL if the caller does not require them.
3138 : *
3139 : * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
3140 : * FOLL_PIN is set.
3141 : *
3142 : * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
3143 : * see Documentation/core-api/pin_user_pages.rst for details.
3144 : */
3145 0 : long pin_user_pages(unsigned long start, unsigned long nr_pages,
3146 : unsigned int gup_flags, struct page **pages,
3147 : struct vm_area_struct **vmas)
3148 : {
3149 0 : int locked = 1;
3150 :
3151 0 : if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN))
3152 : return 0;
3153 0 : return __gup_longterm_locked(current->mm, start, nr_pages,
3154 : pages, vmas, &locked, gup_flags);
3155 : }
3156 : EXPORT_SYMBOL(pin_user_pages);
3157 :
3158 : /*
3159 : * pin_user_pages_unlocked() is the FOLL_PIN variant of
3160 : * get_user_pages_unlocked(). Behavior is the same, except that this one sets
3161 : * FOLL_PIN and rejects FOLL_GET.
3162 : */
3163 0 : long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
3164 : struct page **pages, unsigned int gup_flags)
3165 : {
3166 0 : int locked = 0;
3167 :
3168 0 : if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
3169 : FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
3170 : return 0;
3171 :
3172 0 : return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL,
3173 : &locked, gup_flags);
3174 : }
3175 : EXPORT_SYMBOL(pin_user_pages_unlocked);
|