// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
        if (depth == 3 && PTRS_PER_PMD == 1)
                depth = 2;
        if (depth == 2 && PTRS_PER_PUD == 1)
                depth = 1;
        if (depth == 1 && PTRS_PER_P4D == 1)
                depth = 0;
        return depth;
}
static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
{
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        for (;;) {
                err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                if (addr >= end - PAGE_SIZE)
                        break;
                addr += PAGE_SIZE;
                pte++;
        }
        return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;
        spinlock_t *ptl;

        if (walk->no_vma) {
                /*
                 * pte_offset_map() might apply user-specific validation.
                 */
                if (walk->mm == &init_mm)
                        pte = pte_offset_kernel(pmd, addr);
                else
                        pte = pte_offset_map(pmd, addr);
                if (pte) {
                        err = walk_pte_range_inner(pte, addr, end, walk);
                        if (walk->mm != &init_mm)
                                pte_unmap(pte);
                }
        } else {
                pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
                if (pte) {
                        err = walk_pte_range_inner(pte, addr, end, walk);
                        pte_unmap_unlock(pte, ptl);
                }
        }
        if (!pte)
                walk->action = ACTION_AGAIN;
        return err;
}

#ifdef CONFIG_ARCH_HAS_HUGEPD
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
                             unsigned long end, struct mm_walk *walk, int pdshift)
{
        int err = 0;
        const struct mm_walk_ops *ops = walk->ops;
        int shift = hugepd_shift(*phpd);
        int page_size = 1 << shift;

        if (!ops->pte_entry)
                return 0;

        if (addr & (page_size - 1))
                return 0;

        for (;;) {
                pte_t *pte;

                spin_lock(&walk->mm->page_table_lock);
                pte = hugepte_offset(*phpd, addr, pdshift);
                err = ops->pte_entry(pte, addr, addr + page_size, walk);
                spin_unlock(&walk->mm->page_table_lock);

                if (err)
                        break;
                if (addr >= end - page_size)
                        break;
                addr += page_size;
        }
        return err;
}
#else
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
                             unsigned long end, struct mm_walk *walk, int pdshift)
{
        return 0;
}
#endif

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(3);

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }

                walk->action = ACTION_SUBTREE;

                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 */
                if (ops->pmd_entry)
                        err = ops->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pte_entry))
                        continue;

                if (walk->vma)
                        split_huge_pmd(walk->vma, pmd, addr);

                if (is_hugepd(__hugepd(pmd_val(*pmd))))
                        err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
                else
                        err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

        } while (pmd++, addr = next, addr != end);

        return err;
}

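/*
 * Illustrative sketch (not part of the original file): as the note in
 * walk_pmd_range() above says, a ->pmd_entry() handler must be prepared to
 * see pmd_trans_huge() pmds. A hypothetical callback could count THP
 * mappings and skip descending into them; pmd locking is elided here for
 * brevity.
 */
#if 0
static int count_thp_pmd(pmd_t *pmd, unsigned long addr,
                         unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_thp = walk->private;

        if (pmd_trans_huge(*pmd)) {
                (*nr_thp)++;
                /* Don't split the pmd or walk the underlying ptes. */
                walk->action = ACTION_CONTINUE;
        }
        return 0;
}
#endif
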
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(2);

        pud = pud_offset(p4d, addr);
        do {
again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }

                walk->action = ACTION_SUBTREE;

                if (ops->pud_entry)
                        err = ops->pud_entry(pud, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

                if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pmd_entry || ops->pte_entry))
                        continue;

                if (walk->vma)
                        split_huge_pud(walk->vma, pud, addr);
                if (pud_none(*pud))
                        goto again;

                if (is_hugepd(__hugepd(pud_val(*pud))))
                        err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
                else
                        err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(1);

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->p4d_entry) {
                        err = ops->p4d_entry(p4d, addr, next, walk);
                        if (err)
                                break;
                }
                if (is_hugepd(__hugepd(p4d_val(*p4d))))
                        err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
                else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
                        err = walk_pud_range(p4d, addr, next, walk);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        if (walk->pgd)
                pgd = walk->pgd + pgd_index(addr);
        else
                pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, 0, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->pgd_entry) {
                        err = ops->pgd_entry(pgd, addr, next, walk);
                        if (err)
                                break;
                }
                if (is_hugepd(__hugepd(pgd_val(*pgd))))
                        err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
                else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
                        err = walk_p4d_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        unsigned long sz = huge_page_size(h);
        pte_t *pte;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        hugetlb_vma_lock_read(vma);
        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = hugetlb_walk(vma, addr & hmask, sz);
                if (pte)
                        err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
                else if (ops->pte_hole)
                        err = ops->pte_hole(addr, next, -1, walk);
                if (err)
                        break;
        } while (addr = next, addr != end);
        hugetlb_vma_unlock_read(vma);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it, via the returned value. Return 0 if we walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                          struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (ops->test_walk)
                return ops->test_walk(start, end, walk);

        /*
         * A vma(VM_PFNMAP) range has no valid struct pages behind it, so we
         * don't walk over it as we do for normal vmas. However, some callers
         * are interested in handling holes and don't want any address range
         * to simply be ignored. Such users are expected to define a
         * ->pte_hole() callback, so delegate vma(VM_PFNMAP) ranges to it.
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;
                if (ops->pte_hole)
                        err = ops->pte_hole(start, end, -1, walk);
                return err ? err : 1;
        }
        return 0;
}

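/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * ->test_walk() callback following the convention documented above for
 * walk_page_test(): return 1 to skip the vma, 0 to walk it, and a negative
 * errno to abort the walk.
 */
#if 0
static int skip_special_test(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        /* Skip vmas that cannot contain normal user pages. */
        if (walk->vma->vm_flags & (VM_PFNMAP | VM_IO))
                return 1;
        return 0;
}
#endif
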
static int __walk_page_range(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (ops->pre_vma) {
                err = ops->pre_vma(start, end, walk);
                if (err)
                        return err;
        }

        if (is_vm_hugetlb_page(vma)) {
                if (ops->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        if (ops->post_vma)
                ops->post_vma(walk);

        return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During the walk, caller-
 * specific work can be done for each entry by setting up the pmd_entry(),
 * pte_entry(), and/or hugetlb_entry() callbacks. Entries/pages whose
 * callbacks are not set up are simply ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : the current entry was handled; if the end address has not been
 *         reached yet, continue the walk.
 *  - >0 : the current entry was handled; return to the caller with this
 *         caller-specific value.
 *  - <0 : handling the current entry failed; return to the caller with
 *         this error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access vma data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
                    unsigned long end, const struct mm_walk_ops *ops,
                    void *private)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;
        struct mm_walk walk = {
                .ops = ops,
                .mm = mm,
                .private = private,
        };

        if (start >= end)
                return -EINVAL;

        if (!walk.mm)
                return -EINVAL;

        mmap_assert_locked(walk.mm);

        vma = find_vma(walk.mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk.vma = NULL;
                        next = end;
                        if (ops->pte_hole)
                                err = ops->pte_hole(start, next, -1, &walk);
                } else if (start < vma->vm_start) { /* outside vma */
                        walk.vma = NULL;
                        next = min(end, vma->vm_start);
                        if (ops->pte_hole)
                                err = ops->pte_hole(start, next, -1, &walk);
                } else { /* inside vma */
                        walk.vma = vma;
                        next = min(end, vma->vm_end);
                        vma = find_vma(mm, vma->vm_end);

                        err = walk_page_test(start, next, &walk);
                        if (err > 0) {
                                /*
                                 * Positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                        err = __walk_page_range(start, next, &walk);
                }
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}

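/*
 * Illustrative sketch (not part of the original file): a minimal, hypothetical
 * user of walk_page_range(). It counts present ptes in a range by supplying a
 * ->pte_entry() callback and passing the counter through @private. The caller
 * takes mmap_lock for reading, as walk_page_range() asserts it is held.
 */
#if 0
static int count_present_pte(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_present = walk->private;

        if (pte_present(ptep_get(pte)))
                (*nr_present)++;
        return 0;       /* keep walking */
}

static const struct mm_walk_ops count_present_ops = {
        .pte_entry = count_present_pte,
};

static unsigned long count_present(struct mm_struct *mm, unsigned long start,
                                   unsigned long end)
{
        unsigned long nr_present = 0;

        mmap_read_lock(mm);
        walk_page_range(mm, start, end, &count_present_ops, &nr_present);
        mmap_read_unlock(mm);
        return nr_present;
}
#endif
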
/**
 * walk_page_range_novma - walk a range of pagetables not backed by a vma
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @pgd: pgd to walk if different from mm->pgd
 * @private: private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
                          unsigned long end, const struct mm_walk_ops *ops,
                          pgd_t *pgd,
                          void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .mm = mm,
                .pgd = pgd,
                .private = private,
                .no_vma = true
        };

        if (start >= end || !walk.mm)
                return -EINVAL;

        mmap_assert_write_locked(walk.mm);

        return walk_pgd_range(start, end, &walk);
}

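/*
 * Illustrative sketch (not part of the original file): a hypothetical use of
 * walk_page_range_novma() to visit kernel page-table entries in a range.
 * Kernel mappings have no vma, so only a ->pte_entry() callback and the mmap
 * write lock on init_mm are used here; the printed format is purely an
 * example.
 */
#if 0
static int kernel_pte_entry(pte_t *pte, unsigned long addr,
                            unsigned long next, struct mm_walk *walk)
{
        pr_info("pte %#lx: %#llx\n", addr,
                (unsigned long long)pte_val(ptep_get(pte)));
        return 0;
}

static const struct mm_walk_ops kernel_dump_ops = {
        .pte_entry = kernel_pte_entry,
};

static void dump_kernel_range(unsigned long start, unsigned long end)
{
        mmap_write_lock(&init_mm);
        walk_page_range_novma(&init_mm, start, end, &kernel_dump_ops,
                              NULL, NULL);
        mmap_write_unlock(&init_mm);
}
#endif
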
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, const struct mm_walk_ops *ops,
                        void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .mm = vma->vm_mm,
                .vma = vma,
                .private = private,
        };

        if (start >= end || !walk.mm)
                return -EINVAL;
        if (start < vma->vm_start || end > vma->vm_end)
                return -EINVAL;

        mmap_assert_locked(walk.mm);
        return __walk_page_range(start, end, &walk);
}

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
                  void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .mm = vma->vm_mm,
                .vma = vma,
                .private = private,
        };

        if (!walk.mm)
                return -EINVAL;

        mmap_assert_locked(walk.mm);
        return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications for the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
                      pgoff_t nr, const struct mm_walk_ops *ops,
                      void *private)
{
        struct mm_walk walk = {
                .ops = ops,
                .private = private,
        };
        struct vm_area_struct *vma;
        pgoff_t vba, vea, cba, cea;
        unsigned long start_addr, end_addr;
        int err = 0;

        lockdep_assert_held(&mapping->i_mmap_rwsem);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
                                  first_index + nr - 1) {
                /* Clip to the vma */
                vba = vma->vm_pgoff;
                vea = vba + vma_pages(vma);
                cba = first_index;
                cba = max(cba, vba);
                cea = first_index + nr;
                cea = min(cea, vea);

                start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
                end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
                if (start_addr >= end_addr)
                        continue;

                walk.vma = vma;
                walk.mm = vma->vm_mm;

                err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
                if (err > 0) {
                        err = 0;
                        break;
                } else if (err < 0)
                        break;

                err = __walk_page_range(start_addr, end_addr, &walk);
                if (err)
                        break;
        }

        return err;
}
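
/*
 * Illustrative sketch (not part of the original file): a hypothetical caller
 * of walk_page_mapping(). Per the locking comment above, the caller holds
 * @mapping->i_mmap_rwsem rather than any mmap_lock; "some_ops" and "data"
 * stand in for a caller-defined mm_walk_ops and private state.
 */
#if 0
static int walk_mapping_pages(struct address_space *mapping,
                              pgoff_t first_index, pgoff_t nr,
                              const struct mm_walk_ops *some_ops, void *data)
{
        int err;

        i_mmap_lock_read(mapping);
        err = walk_page_mapping(mapping, first_index, nr, some_ops, data);
        i_mmap_unlock_read(mapping);
        return err;
}
#endif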