Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * mm/mremap.c
4 : *
5 : * (C) Copyright 1996 Linus Torvalds
6 : *
7 : * Address space accounting code <alan@lxorguk.ukuu.org.uk>
8 : * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
9 : */
10 :
11 : #include <linux/mm.h>
12 : #include <linux/mm_inline.h>
13 : #include <linux/hugetlb.h>
14 : #include <linux/shm.h>
15 : #include <linux/ksm.h>
16 : #include <linux/mman.h>
17 : #include <linux/swap.h>
18 : #include <linux/capability.h>
19 : #include <linux/fs.h>
20 : #include <linux/swapops.h>
21 : #include <linux/highmem.h>
22 : #include <linux/security.h>
23 : #include <linux/syscalls.h>
24 : #include <linux/mmu_notifier.h>
25 : #include <linux/uaccess.h>
26 : #include <linux/userfaultfd_k.h>
27 : #include <linux/mempolicy.h>
28 :
29 : #include <asm/cacheflush.h>
30 : #include <asm/tlb.h>
31 : #include <asm/pgalloc.h>
32 :
33 : #include "internal.h"
34 :
35 : static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
36 : {
37 : pgd_t *pgd;
38 : p4d_t *p4d;
39 : pud_t *pud;
40 :
41 0 : pgd = pgd_offset(mm, addr);
42 : if (pgd_none_or_clear_bad(pgd))
43 : return NULL;
44 :
45 0 : p4d = p4d_offset(pgd, addr);
46 : if (p4d_none_or_clear_bad(p4d))
47 : return NULL;
48 :
49 0 : pud = pud_offset(p4d, addr);
50 0 : if (pud_none_or_clear_bad(pud))
51 : return NULL;
52 :
53 : return pud;
54 : }
55 :
56 0 : static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
57 : {
58 : pud_t *pud;
59 : pmd_t *pmd;
60 :
61 0 : pud = get_old_pud(mm, addr);
62 0 : if (!pud)
63 : return NULL;
64 :
65 0 : pmd = pmd_offset(pud, addr);
66 0 : if (pmd_none(*pmd))
67 : return NULL;
68 :
69 0 : return pmd;
70 : }
71 :
72 : static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
73 : unsigned long addr)
74 : {
75 : pgd_t *pgd;
76 : p4d_t *p4d;
77 :
78 0 : pgd = pgd_offset(mm, addr);
79 0 : p4d = p4d_alloc(mm, pgd, addr);
80 0 : if (!p4d)
81 : return NULL;
82 :
83 0 : return pud_alloc(mm, p4d, addr);
84 : }
85 :
86 0 : static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
87 : unsigned long addr)
88 : {
89 : pud_t *pud;
90 : pmd_t *pmd;
91 :
92 0 : pud = alloc_new_pud(mm, vma, addr);
93 0 : if (!pud)
94 : return NULL;
95 :
96 0 : pmd = pmd_alloc(mm, pud, addr);
97 0 : if (!pmd)
98 : return NULL;
99 :
100 : VM_BUG_ON(pmd_trans_huge(*pmd));
101 :
102 : return pmd;
103 : }
104 :
105 0 : static void take_rmap_locks(struct vm_area_struct *vma)
106 : {
107 0 : if (vma->vm_file)
108 0 : i_mmap_lock_write(vma->vm_file->f_mapping);
109 0 : if (vma->anon_vma)
110 0 : anon_vma_lock_write(vma->anon_vma);
111 0 : }
112 :
113 0 : static void drop_rmap_locks(struct vm_area_struct *vma)
114 : {
115 0 : if (vma->anon_vma)
116 0 : anon_vma_unlock_write(vma->anon_vma);
117 0 : if (vma->vm_file)
118 0 : i_mmap_unlock_write(vma->vm_file->f_mapping);
119 0 : }
120 :
121 : static pte_t move_soft_dirty_pte(pte_t pte)
122 : {
123 : /*
124 : * Set the soft dirty bit so userspace can notice
125 : * that the ptes were moved.
126 : */
127 : #ifdef CONFIG_MEM_SOFT_DIRTY
128 : if (pte_present(pte))
129 : pte = pte_mksoft_dirty(pte);
130 : else if (is_swap_pte(pte))
131 : pte = pte_swp_mksoft_dirty(pte);
132 : #endif
133 : return pte;
134 : }
135 :
136 0 : static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
137 : unsigned long old_addr, unsigned long old_end,
138 : struct vm_area_struct *new_vma, pmd_t *new_pmd,
139 : unsigned long new_addr, bool need_rmap_locks)
140 : {
141 0 : struct mm_struct *mm = vma->vm_mm;
142 : pte_t *old_pte, *new_pte, pte;
143 : spinlock_t *old_ptl, *new_ptl;
144 0 : bool force_flush = false;
145 0 : unsigned long len = old_end - old_addr;
146 :
147 : /*
148 : * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
149 : * locks to ensure that rmap will always observe either the old or the
150 : * new ptes. This is the easiest way to avoid races with
151 : * truncate_pagecache(), page migration, etc...
152 : *
153 : * When need_rmap_locks is false, we use other ways to avoid
154 : * such races:
155 : *
156 : * - During exec() shift_arg_pages(), we use a specially tagged vma
157 : * which rmap call sites look for using vma_is_temporary_stack().
158 : *
159 : * - During mremap(), new_vma is often known to be placed after vma
160 : * in rmap traversal order. This ensures rmap will always observe
161 : * either the old pte, or the new pte, or both (the page table locks
162 : * serialize access to individual ptes, but only rmap traversal
163 : * order guarantees that we won't miss both the old and new ptes).
164 : */
165 0 : if (need_rmap_locks)
166 0 : take_rmap_locks(vma);
167 :
168 : /*
169 : * We don't have to worry about the ordering of src and dst
170 : * pte locks because exclusive mmap_lock prevents deadlock.
171 : */
172 0 : old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
173 0 : new_pte = pte_offset_map(new_pmd, new_addr);
174 0 : new_ptl = pte_lockptr(mm, new_pmd);
175 : if (new_ptl != old_ptl)
176 : spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
177 0 : flush_tlb_batched_pending(vma->vm_mm);
178 : arch_enter_lazy_mmu_mode();
179 :
180 0 : for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
181 0 : new_pte++, new_addr += PAGE_SIZE) {
182 0 : if (pte_none(*old_pte))
183 0 : continue;
184 :
185 0 : pte = ptep_get_and_clear(mm, old_addr, old_pte);
186 : /*
187 : * If we are remapping a valid PTE, make sure
188 : * to flush TLB before we drop the PTL for the
189 : * PTE.
190 : *
191 : * NOTE! Both old and new PTL matter: the old one
192 : * for racing with page_mkclean(), the new one to
193 : * make sure the physical page stays valid until
194 : * the TLB entry for the old mapping has been
195 : * flushed.
196 : */
197 0 : if (pte_present(pte))
198 0 : force_flush = true;
199 0 : pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
200 : pte = move_soft_dirty_pte(pte);
201 0 : set_pte_at(mm, new_addr, new_pte, pte);
202 : }
203 :
204 : arch_leave_lazy_mmu_mode();
205 0 : if (force_flush)
206 0 : flush_tlb_range(vma, old_end - len, old_end);
207 : if (new_ptl != old_ptl)
208 : spin_unlock(new_ptl);
209 : pte_unmap(new_pte - 1);
210 0 : pte_unmap_unlock(old_pte - 1, old_ptl);
211 0 : if (need_rmap_locks)
212 0 : drop_rmap_locks(vma);
213 0 : }
214 :
215 : #ifndef arch_supports_page_table_move
216 : #define arch_supports_page_table_move arch_supports_page_table_move
217 : static inline bool arch_supports_page_table_move(void)
218 : {
219 : return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
220 : IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
221 : }
222 : #endif
223 :
224 : #ifdef CONFIG_HAVE_MOVE_PMD
225 : static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
226 : unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
227 : {
228 : spinlock_t *old_ptl, *new_ptl;
229 : struct mm_struct *mm = vma->vm_mm;
230 : pmd_t pmd;
231 :
232 : if (!arch_supports_page_table_move())
233 : return false;
234 : /*
235 : * The destination pmd shouldn't be established; free_pgtables()
236 : * should have released it.
237 : *
238 : * However, there's a case during execve() where we use mremap
239 : * to move the initial stack, and in that case the target area
240 : * may overlap the source area (always moving down).
241 : *
242 : * If everything is PMD-aligned, that works fine, as moving
243 : * each pmd down will clear the source pmd. But if we first
244 : * have a few 4kB-only pages that get moved down, and then
245 : * hit the "now the rest is PMD-aligned, let's do everything
246 : * one pmd at a time", we will still have the old (now empty
247 : * of any 4kB pages, but still there) PMD in the page table
248 : * tree.
249 : *
250 : * Warn on it once - because we really should try to figure
251 : * out how to do this better - but then say "I won't move
252 : * this pmd".
253 : *
254 : * One alternative might be to just unmap the target pmd at
255 : * this point, and verify that it really is empty. We'll see.
256 : */
257 : if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
258 : return false;
259 :
260 : /*
261 : * We don't have to worry about the ordering of src and dst
262 : * ptlocks because exclusive mmap_lock prevents deadlock.
263 : */
264 : old_ptl = pmd_lock(vma->vm_mm, old_pmd);
265 : new_ptl = pmd_lockptr(mm, new_pmd);
266 : if (new_ptl != old_ptl)
267 : spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
268 :
269 : /* Clear the pmd */
270 : pmd = *old_pmd;
271 : pmd_clear(old_pmd);
272 :
273 : VM_BUG_ON(!pmd_none(*new_pmd));
274 :
275 : pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
276 : flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
277 : if (new_ptl != old_ptl)
278 : spin_unlock(new_ptl);
279 : spin_unlock(old_ptl);
280 :
281 : return true;
282 : }
283 : #else
284 : static inline bool move_normal_pmd(struct vm_area_struct *vma,
285 : unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
286 : pmd_t *new_pmd)
287 : {
288 : return false;
289 : }
290 : #endif
291 :
292 : #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
293 : static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
294 : unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
295 : {
296 : spinlock_t *old_ptl, *new_ptl;
297 : struct mm_struct *mm = vma->vm_mm;
298 : pud_t pud;
299 :
300 : if (!arch_supports_page_table_move())
301 : return false;
302 : /*
303 : * The destination pud shouldn't be established; free_pgtables()
304 : * should have released it.
305 : */
306 : if (WARN_ON_ONCE(!pud_none(*new_pud)))
307 : return false;
308 :
309 : /*
310 : * We don't have to worry about the ordering of src and dst
311 : * ptlocks because exclusive mmap_lock prevents deadlock.
312 : */
313 : old_ptl = pud_lock(vma->vm_mm, old_pud);
314 : new_ptl = pud_lockptr(mm, new_pud);
315 : if (new_ptl != old_ptl)
316 : spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
317 :
318 : /* Clear the pud */
319 : pud = *old_pud;
320 : pud_clear(old_pud);
321 :
322 : VM_BUG_ON(!pud_none(*new_pud));
323 :
324 : pud_populate(mm, new_pud, pud_pgtable(pud));
325 : flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
326 : if (new_ptl != old_ptl)
327 : spin_unlock(new_ptl);
328 : spin_unlock(old_ptl);
329 :
330 : return true;
331 : }
332 : #else
333 : static inline bool move_normal_pud(struct vm_area_struct *vma,
334 : unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
335 : pud_t *new_pud)
336 : {
337 : return false;
338 : }
339 : #endif
340 :
341 : #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
342 : static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
343 : unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
344 : {
345 : spinlock_t *old_ptl, *new_ptl;
346 : struct mm_struct *mm = vma->vm_mm;
347 : pud_t pud;
348 :
349 : /*
350 : * The destination pud shouldn't be established; free_pgtables()
351 : * should have released it.
352 : */
353 : if (WARN_ON_ONCE(!pud_none(*new_pud)))
354 : return false;
355 :
356 : /*
357 : * We don't have to worry about the ordering of src and dst
358 : * ptlocks because exclusive mmap_lock prevents deadlock.
359 : */
360 : old_ptl = pud_lock(vma->vm_mm, old_pud);
361 : new_ptl = pud_lockptr(mm, new_pud);
362 : if (new_ptl != old_ptl)
363 : spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
364 :
365 : /* Clear the pud */
366 : pud = *old_pud;
367 : pud_clear(old_pud);
368 :
369 : VM_BUG_ON(!pud_none(*new_pud));
370 :
371 : /* Set the new pud */
372 : /* mark soft_dirty when we add pud-level soft dirty support */
373 : set_pud_at(mm, new_addr, new_pud, pud);
374 : flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
375 : if (new_ptl != old_ptl)
376 : spin_unlock(new_ptl);
377 : spin_unlock(old_ptl);
378 :
379 : return true;
380 : }
381 : #else
382 : static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
383 : unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
384 : {
385 : WARN_ON_ONCE(1);
386 : return false;
387 :
388 : }
389 : #endif
390 :
391 : enum pgt_entry {
392 : NORMAL_PMD,
393 : HPAGE_PMD,
394 : NORMAL_PUD,
395 : HPAGE_PUD,
396 : };
397 :
398 : /*
399 : * Returns an extent of the corresponding size for the pgt_entry specified if
400 : * valid. Else returns a smaller extent bounded by the end of the source and
401 : * destination pgt_entry.
402 : */
403 : static __always_inline unsigned long get_extent(enum pgt_entry entry,
404 : unsigned long old_addr, unsigned long old_end,
405 : unsigned long new_addr)
406 : {
407 : unsigned long next, extent, mask, size;
408 :
409 : switch (entry) {
410 : case HPAGE_PMD:
411 : case NORMAL_PMD:
412 : mask = PMD_MASK;
413 : size = PMD_SIZE;
414 : break;
415 : case HPAGE_PUD:
416 : case NORMAL_PUD:
417 0 : mask = PUD_MASK;
418 0 : size = PUD_SIZE;
419 : break;
420 : default:
421 : BUILD_BUG();
422 : break;
423 : }
424 :
425 0 : next = (old_addr + size) & mask;
426 : /* even if next overflowed, extent below will be ok */
427 0 : extent = next - old_addr;
428 0 : if (extent > old_end - old_addr)
429 0 : extent = old_end - old_addr;
430 0 : next = (new_addr + size) & mask;
431 0 : if (extent > next - new_addr)
432 0 : extent = next - new_addr;
433 : return extent;
434 : }
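/*
 * Illustrative worked example (not part of mm/mremap.c; addresses are
 * hypothetical), assuming 4 KiB pages and a 2 MiB PMD_SIZE, for the
 * NORMAL_PMD case of get_extent() above:
 *
 *   old_addr = 0x201000, old_end = 0x600000, new_addr = 0x7ff000
 *   next   = (0x201000 + 0x200000) & PMD_MASK = 0x400000
 *   extent = 0x400000 - 0x201000 = 0x1ff000   (within old_end - old_addr)
 *   next   = (0x7ff000 + 0x200000) & PMD_MASK = 0x800000
 *   next - new_addr = 0x1000 < 0x1ff000, so extent = 0x1000
 *
 * i.e. the returned extent never crosses a PMD boundary on either the
 * source or the destination side.
 */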
435 :
436 : /*
437 : * Attempts to speed up the move by moving the entry at the level corresponding to
438 : * pgt_entry. Returns true if the move was successful, else false.
439 : */
440 : static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
441 : unsigned long old_addr, unsigned long new_addr,
442 : void *old_entry, void *new_entry, bool need_rmap_locks)
443 : {
444 : bool moved = false;
445 :
446 : /* See comment in move_ptes() */
447 : if (need_rmap_locks)
448 : take_rmap_locks(vma);
449 :
450 : switch (entry) {
451 : case NORMAL_PMD:
452 : moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
453 : new_entry);
454 : break;
455 : case NORMAL_PUD:
456 : moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
457 : new_entry);
458 : break;
459 : case HPAGE_PMD:
460 : moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
461 : move_huge_pmd(vma, old_addr, new_addr, old_entry,
462 : new_entry);
463 : break;
464 : case HPAGE_PUD:
465 : moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
466 : move_huge_pud(vma, old_addr, new_addr, old_entry,
467 : new_entry);
468 : break;
469 :
470 : default:
471 : WARN_ON_ONCE(1);
472 : break;
473 : }
474 :
475 : if (need_rmap_locks)
476 : drop_rmap_locks(vma);
477 :
478 : return moved;
479 : }
480 :
481 0 : unsigned long move_page_tables(struct vm_area_struct *vma,
482 : unsigned long old_addr, struct vm_area_struct *new_vma,
483 : unsigned long new_addr, unsigned long len,
484 : bool need_rmap_locks)
485 : {
486 : unsigned long extent, old_end;
487 : struct mmu_notifier_range range;
488 : pmd_t *old_pmd, *new_pmd;
489 : pud_t *old_pud, *new_pud;
490 :
491 0 : if (!len)
492 : return 0;
493 :
494 0 : old_end = old_addr + len;
495 :
496 0 : if (is_vm_hugetlb_page(vma))
497 : return move_hugetlb_page_tables(vma, new_vma, old_addr,
498 : new_addr, len);
499 :
500 0 : flush_cache_range(vma, old_addr, old_end);
501 : mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
502 : old_addr, old_end);
503 : mmu_notifier_invalidate_range_start(&range);
504 :
505 0 : for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
506 0 : cond_resched();
507 : /*
508 : * If the extent is PUD-sized, try to speed up the move by moving at the
509 : * PUD level if possible.
510 : */
511 0 : extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
512 :
513 0 : old_pud = get_old_pud(vma->vm_mm, old_addr);
514 0 : if (!old_pud)
515 0 : continue;
516 0 : new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
517 0 : if (!new_pud)
518 : break;
519 : if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
520 : if (extent == HPAGE_PUD_SIZE) {
521 : move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
522 : old_pud, new_pud, need_rmap_locks);
523 : /* We ignore and continue on error? */
524 : continue;
525 : }
526 : } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
527 :
528 : if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
529 : old_pud, new_pud, true))
530 : continue;
531 : }
532 :
533 0 : extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
534 0 : old_pmd = get_old_pmd(vma->vm_mm, old_addr);
535 0 : if (!old_pmd)
536 0 : continue;
537 0 : new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
538 0 : if (!new_pmd)
539 : break;
540 0 : if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
541 0 : pmd_devmap(*old_pmd)) {
542 : if (extent == HPAGE_PMD_SIZE &&
543 : move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
544 : old_pmd, new_pmd, need_rmap_locks))
545 : continue;
546 : split_huge_pmd(vma, old_pmd, old_addr);
547 : if (pmd_trans_unstable(old_pmd))
548 : continue;
549 : } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
550 : extent == PMD_SIZE) {
551 : /*
552 : * If the extent is PMD-sized, try to speed the move by
553 : * moving at the PMD level if possible.
554 : */
555 : if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
556 : old_pmd, new_pmd, true))
557 : continue;
558 : }
559 :
560 0 : if (pte_alloc(new_vma->vm_mm, new_pmd))
561 : break;
562 0 : move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
563 : new_pmd, new_addr, need_rmap_locks);
564 : }
565 :
566 0 : mmu_notifier_invalidate_range_end(&range);
567 :
568 0 : return len + old_addr - old_end; /* how much done */
569 : }
570 :
571 0 : static unsigned long move_vma(struct vm_area_struct *vma,
572 : unsigned long old_addr, unsigned long old_len,
573 : unsigned long new_len, unsigned long new_addr,
574 : bool *locked, unsigned long flags,
575 : struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
576 : {
577 0 : long to_account = new_len - old_len;
578 0 : struct mm_struct *mm = vma->vm_mm;
579 : struct vm_area_struct *new_vma;
580 0 : unsigned long vm_flags = vma->vm_flags;
581 : unsigned long new_pgoff;
582 : unsigned long moved_len;
583 0 : unsigned long account_start = 0;
584 0 : unsigned long account_end = 0;
585 : unsigned long hiwater_vm;
586 0 : int err = 0;
587 : bool need_rmap_locks;
588 : struct vma_iterator vmi;
589 :
590 : /*
591 : * We'd prefer to avoid failure later on in do_munmap:
592 : * which may split one vma into three before unmapping.
593 : */
594 0 : if (mm->map_count >= sysctl_max_map_count - 3)
595 : return -ENOMEM;
596 :
597 0 : if (unlikely(flags & MREMAP_DONTUNMAP))
598 0 : to_account = new_len;
599 :
600 0 : if (vma->vm_ops && vma->vm_ops->may_split) {
601 0 : if (vma->vm_start != old_addr)
602 0 : err = vma->vm_ops->may_split(vma, old_addr);
603 0 : if (!err && vma->vm_end != old_addr + old_len)
604 0 : err = vma->vm_ops->may_split(vma, old_addr + old_len);
605 0 : if (err)
606 0 : return err;
607 : }
608 :
609 : /*
610 : * Advise KSM to break any KSM pages in the area to be moved:
611 : * it would be confusing if they were to turn up at the new
612 : * location, where they happen to coincide with different KSM
613 : * pages recently unmapped. But leave vma->vm_flags as it was,
614 : * so KSM can come around to merge on vma and new_vma afterwards.
615 : */
616 0 : err = ksm_madvise(vma, old_addr, old_addr + old_len,
617 : MADV_UNMERGEABLE, &vm_flags);
618 : if (err)
619 : return err;
620 :
621 0 : if (vm_flags & VM_ACCOUNT) {
622 0 : if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
623 : return -ENOMEM;
624 : }
625 :
626 0 : new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
627 0 : new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
628 : &need_rmap_locks);
629 0 : if (!new_vma) {
630 0 : if (vm_flags & VM_ACCOUNT)
631 0 : vm_unacct_memory(to_account >> PAGE_SHIFT);
632 : return -ENOMEM;
633 : }
634 :
635 0 : moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
636 : need_rmap_locks);
637 0 : if (moved_len < old_len) {
638 : err = -ENOMEM;
639 0 : } else if (vma->vm_ops && vma->vm_ops->mremap) {
640 0 : err = vma->vm_ops->mremap(new_vma);
641 : }
642 :
643 0 : if (unlikely(err)) {
644 : /*
645 : * On error, move entries back from new area to old,
646 : * which will succeed since the page tables are still there,
647 : * and then proceed to unmap the new area instead of the old one.
648 : */
649 0 : move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
650 : true);
651 0 : vma = new_vma;
652 0 : old_len = new_len;
653 0 : old_addr = new_addr;
654 0 : new_addr = err;
655 : } else {
656 : mremap_userfaultfd_prep(new_vma, uf);
657 : }
658 :
659 0 : if (is_vm_hugetlb_page(vma)) {
660 : clear_vma_resv_huge_pages(vma);
661 : }
662 :
663 : /* Conceal VM_ACCOUNT so old reservation is not undone */
664 0 : if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
665 0 : vm_flags_clear(vma, VM_ACCOUNT);
666 0 : if (vma->vm_start < old_addr)
667 0 : account_start = vma->vm_start;
668 0 : if (vma->vm_end > old_addr + old_len)
669 0 : account_end = vma->vm_end;
670 : }
671 :
672 : /*
673 : * If we failed to move page tables we still do total_vm increment
674 : * since do_munmap() will decrement it by old_len == new_len.
675 : *
676 : * Since total_vm is about to be raised artificially high for a
677 : * moment, we need to restore high watermark afterwards: if stats
678 : * are taken meanwhile, total_vm and hiwater_vm appear too high.
679 : * If this were a serious issue, we'd add a flag to do_munmap().
680 : */
681 0 : hiwater_vm = mm->hiwater_vm;
682 0 : vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
683 :
684 : /* Tell the pfn-tracking code that the pfnmap has moved from this vma */
685 0 : if (unlikely(vma->vm_flags & VM_PFNMAP))
686 : untrack_pfn_moved(vma);
687 :
688 0 : if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
689 : /* We always clear VM_LOCKED[ONFAULT] on the old vma */
690 0 : vm_flags_clear(vma, VM_LOCKED_MASK);
691 :
692 : /*
693 : * anon_vma links of the old vma are no longer needed after its page
694 : * table has been moved.
695 : */
696 0 : if (new_vma != vma && vma->vm_start == old_addr &&
697 0 : vma->vm_end == (old_addr + old_len))
698 0 : unlink_anon_vmas(vma);
699 :
700 : /* Because we won't unmap we don't need to touch locked_vm */
701 : return new_addr;
702 : }
703 :
704 0 : vma_iter_init(&vmi, mm, old_addr);
705 0 : if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
706 : /* OOM: unable to split vma, just get accounts right */
707 0 : if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
708 0 : vm_acct_memory(old_len >> PAGE_SHIFT);
709 : account_start = account_end = 0;
710 : }
711 :
712 0 : if (vm_flags & VM_LOCKED) {
713 0 : mm->locked_vm += new_len >> PAGE_SHIFT;
714 0 : *locked = true;
715 : }
716 :
717 0 : mm->hiwater_vm = hiwater_vm;
718 :
719 : /* Restore VM_ACCOUNT if one or two pieces of vma left */
720 0 : if (account_start) {
721 0 : vma = vma_prev(&vmi);
722 0 : vm_flags_set(vma, VM_ACCOUNT);
723 : }
724 :
725 0 : if (account_end) {
726 0 : vma = vma_next(&vmi);
727 0 : vm_flags_set(vma, VM_ACCOUNT);
728 : }
729 :
730 : return new_addr;
731 : }
732 :
733 0 : static struct vm_area_struct *vma_to_resize(unsigned long addr,
734 : unsigned long old_len, unsigned long new_len, unsigned long flags)
735 : {
736 0 : struct mm_struct *mm = current->mm;
737 : struct vm_area_struct *vma;
738 : unsigned long pgoff;
739 :
740 0 : vma = vma_lookup(mm, addr);
741 0 : if (!vma)
742 : return ERR_PTR(-EFAULT);
743 :
744 : /*
745 : * !old_len is a special case where an attempt is made to 'duplicate'
746 : * a mapping. This makes no sense for private mappings as it will
747 : * instead create a fresh/new mapping unrelated to the original. This
748 : * is contrary to the basic idea of mremap which creates new mappings
749 : * based on the original. There are no known use cases for this
750 : * behavior. As a result, fail such attempts.
751 : */
752 0 : if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
753 0 : pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
754 : return ERR_PTR(-EINVAL);
755 : }
756 :
757 0 : if ((flags & MREMAP_DONTUNMAP) &&
758 0 : (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
759 : return ERR_PTR(-EINVAL);
760 :
761 : /* We can't remap across vm area boundaries */
762 0 : if (old_len > vma->vm_end - addr)
763 : return ERR_PTR(-EFAULT);
764 :
765 0 : if (new_len == old_len)
766 : return vma;
767 :
768 : /* Need to be careful about a growing mapping */
769 0 : pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
770 0 : pgoff += vma->vm_pgoff;
771 0 : if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
772 : return ERR_PTR(-EINVAL);
773 :
774 0 : if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
775 : return ERR_PTR(-EFAULT);
776 :
777 0 : if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
778 : return ERR_PTR(-EAGAIN);
779 :
780 0 : if (!may_expand_vm(mm, vma->vm_flags,
781 : (new_len - old_len) >> PAGE_SHIFT))
782 : return ERR_PTR(-ENOMEM);
783 :
784 0 : return vma;
785 : }
786 :
787 0 : static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
788 : unsigned long new_addr, unsigned long new_len, bool *locked,
789 : unsigned long flags, struct vm_userfaultfd_ctx *uf,
790 : struct list_head *uf_unmap_early,
791 : struct list_head *uf_unmap)
792 : {
793 0 : struct mm_struct *mm = current->mm;
794 : struct vm_area_struct *vma;
795 0 : unsigned long ret = -EINVAL;
796 0 : unsigned long map_flags = 0;
797 :
798 0 : if (offset_in_page(new_addr))
799 : goto out;
800 :
801 0 : if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
802 : goto out;
803 :
804 : /* Ensure the old/new locations do not overlap */
805 0 : if (addr + old_len > new_addr && new_addr + new_len > addr)
806 : goto out;
807 :
808 : /*
809 : * move_vma() needs us to stay 4 maps below the threshold, otherwise
810 : * it will bail out at the very beginning.
811 : * That is a problem if we have already unmapped the regions here
812 : * (new_addr and old_addr), because userspace will not know the
813 : * state of the vmas after it gets -ENOMEM.
814 : * So, to avoid such a scenario, we pre-compute whether the whole
815 : * operation has a good chance of succeeding map-wise.
816 : * The worst case is when both vmas (new_addr and old_addr) get
817 : * split in 3 before being unmapped.
818 : * That means 2 more maps (1 for each) on top of the ones we already hold.
819 : * Check whether the current map count plus 2 still leaves us 4 maps below
820 : * the threshold; otherwise return -ENOMEM here to be safe.
821 : */
822 0 : if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
823 : return -ENOMEM;
824 :
825 0 : if (flags & MREMAP_FIXED) {
826 0 : ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
827 0 : if (ret)
828 : goto out;
829 : }
830 :
831 0 : if (old_len > new_len) {
832 0 : ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
833 0 : if (ret)
834 : goto out;
835 : old_len = new_len;
836 : }
837 :
838 0 : vma = vma_to_resize(addr, old_len, new_len, flags);
839 0 : if (IS_ERR(vma)) {
840 : ret = PTR_ERR(vma);
841 : goto out;
842 : }
843 :
844 : /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
845 0 : if (flags & MREMAP_DONTUNMAP &&
846 0 : !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
847 : ret = -ENOMEM;
848 : goto out;
849 : }
850 :
851 0 : if (flags & MREMAP_FIXED)
852 0 : map_flags |= MAP_FIXED;
853 :
854 0 : if (vma->vm_flags & VM_MAYSHARE)
855 0 : map_flags |= MAP_SHARED;
856 :
857 0 : ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
858 0 : ((addr - vma->vm_start) >> PAGE_SHIFT),
859 : map_flags);
860 0 : if (IS_ERR_VALUE(ret))
861 : goto out;
862 :
863 : /* We got a new mapping */
864 0 : if (!(flags & MREMAP_FIXED))
865 0 : new_addr = ret;
866 :
867 0 : ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
868 : uf_unmap);
869 :
870 : out:
871 : return ret;
872 : }
873 :
874 0 : static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
875 : {
876 0 : unsigned long end = vma->vm_end + delta;
877 :
878 0 : if (end < vma->vm_end) /* overflow */
879 : return 0;
880 0 : if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
881 : return 0;
882 0 : if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
883 0 : 0, MAP_FIXED) & ~PAGE_MASK)
884 : return 0;
885 0 : return 1;
886 : }
887 :
888 : /*
889 : * Expand (or shrink) an existing mapping, potentially moving it at the
890 : * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
891 : *
892 : * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
893 : * This option implies MREMAP_MAYMOVE.
894 : */
895 0 : SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
896 : unsigned long, new_len, unsigned long, flags,
897 : unsigned long, new_addr)
898 : {
899 0 : struct mm_struct *mm = current->mm;
900 : struct vm_area_struct *vma;
901 0 : unsigned long ret = -EINVAL;
902 0 : bool locked = false;
903 0 : bool downgraded = false;
904 : struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
905 0 : LIST_HEAD(uf_unmap_early);
906 0 : LIST_HEAD(uf_unmap);
907 :
908 : /*
909 : * There is a deliberate asymmetry here: we strip the pointer tag
910 : * from the old address but leave the new address alone. This is
911 : * for consistency with mmap(), where we prevent the creation of
912 : * aliasing mappings in userspace by leaving the tag bits of the
913 : * mapping address intact. A non-zero tag will cause the subsequent
914 : * range checks to reject the address as invalid.
915 : *
916 : * See Documentation/arm64/tagged-address-abi.rst for more information.
917 : */
918 0 : addr = untagged_addr(addr);
919 :
920 0 : if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
921 : return ret;
922 :
923 0 : if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
924 : return ret;
925 :
926 : /*
927 : * MREMAP_DONTUNMAP is always a move and it does not allow resizing
928 : * in the process.
929 : */
930 0 : if (flags & MREMAP_DONTUNMAP &&
931 0 : (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
932 : return ret;
933 :
934 :
935 0 : if (offset_in_page(addr))
936 : return ret;
937 :
938 0 : old_len = PAGE_ALIGN(old_len);
939 0 : new_len = PAGE_ALIGN(new_len);
940 :
941 : /*
942 : * We allow a zero old-len as a special case
943 : * for the DOS-emu "duplicate shm area" thing. But
944 : * a zero new-len is nonsensical.
945 : */
946 0 : if (!new_len)
947 : return ret;
948 :
949 0 : if (mmap_write_lock_killable(current->mm))
950 : return -EINTR;
951 0 : vma = vma_lookup(mm, addr);
952 0 : if (!vma) {
953 : ret = -EFAULT;
954 : goto out;
955 : }
956 :
957 : if (is_vm_hugetlb_page(vma)) {
958 : struct hstate *h __maybe_unused = hstate_vma(vma);
959 :
960 : old_len = ALIGN(old_len, huge_page_size(h));
961 : new_len = ALIGN(new_len, huge_page_size(h));
962 :
963 : /* addrs must be huge page aligned */
964 : if (addr & ~huge_page_mask(h))
965 : goto out;
966 : if (new_addr & ~huge_page_mask(h))
967 : goto out;
968 :
969 : /*
970 : * Don't allow remap expansion, because the underlying hugetlb
971 : * reservation is not yet capable of handling split reservations.
972 : */
973 : if (new_len > old_len)
974 : goto out;
975 : }
976 :
977 0 : if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
978 0 : ret = mremap_to(addr, old_len, new_addr, new_len,
979 : &locked, flags, &uf, &uf_unmap_early,
980 : &uf_unmap);
981 0 : goto out;
982 : }
983 :
984 : /*
985 : * Always allow a shrinking remap: that just unmaps
986 : * the unnecessary pages.
987 : * do_vmi_munmap does all the needed commit accounting, and
988 : * downgrades mmap_lock to read if so directed.
989 : */
990 0 : if (old_len >= new_len) {
991 : int retval;
992 0 : VMA_ITERATOR(vmi, mm, addr + new_len);
993 :
994 0 : retval = do_vmi_munmap(&vmi, mm, addr + new_len,
995 : old_len - new_len, &uf_unmap, true);
996 : /* Returning 1 indicates mmap_lock is downgraded to read. */
997 0 : if (retval == 1) {
998 : downgraded = true;
999 0 : } else if (retval < 0 && old_len != new_len) {
1000 0 : ret = retval;
1001 0 : goto out;
1002 : }
1003 :
1004 : ret = addr;
1005 : goto out;
1006 : }
1007 :
1008 : /*
1009 : * Ok, we need to grow..
1010 : */
1011 0 : vma = vma_to_resize(addr, old_len, new_len, flags);
1012 0 : if (IS_ERR(vma)) {
1013 : ret = PTR_ERR(vma);
1014 : goto out;
1015 : }
1016 :
1017 : /* old_len exactly to the end of the area..
1018 : */
1019 0 : if (old_len == vma->vm_end - addr) {
1020 : /* can we just expand the current mapping? */
1021 0 : if (vma_expandable(vma, new_len - old_len)) {
1022 0 : long pages = (new_len - old_len) >> PAGE_SHIFT;
1023 0 : unsigned long extension_start = addr + old_len;
1024 0 : unsigned long extension_end = addr + new_len;
1025 0 : pgoff_t extension_pgoff = vma->vm_pgoff +
1026 0 : ((extension_start - vma->vm_start) >> PAGE_SHIFT);
1027 0 : VMA_ITERATOR(vmi, mm, extension_start);
1028 :
1029 0 : if (vma->vm_flags & VM_ACCOUNT) {
1030 0 : if (security_vm_enough_memory_mm(mm, pages)) {
1031 : ret = -ENOMEM;
1032 : goto out;
1033 : }
1034 : }
1035 :
1036 : /*
1037 : * Function vma_merge() is called on the extension we
1038 : * are adding to the already existing vma; vma_merge()
1039 : * will merge this extension with the already existing
1040 : * vma (expand operation itself) and possibly also with
1041 : * the next vma if it becomes adjacent to the expanded
1042 : * vma and otherwise compatible.
1043 : *
1044 : * However, vma_merge() can currently fail due to the
1045 : * is_mergeable_vma() check for vm_ops->close (see the
1046 : * comment there). Yet this should not prevent the vma from
1047 : * expanding, so perform a simple expand for such a vma.
1048 : * Ideally the check for the close op should only be done
1049 : * when a vma would actually be removed due to a merge.
1050 : */
1051 0 : if (!vma->vm_ops || !vma->vm_ops->close) {
1052 0 : vma = vma_merge(&vmi, mm, vma, extension_start,
1053 : extension_end, vma->vm_flags, vma->anon_vma,
1054 : vma->vm_file, extension_pgoff, vma_policy(vma),
1055 : vma->vm_userfaultfd_ctx, anon_vma_name(vma));
1056 0 : } else if (vma_expand(&vmi, vma, vma->vm_start,
1057 : addr + new_len, vma->vm_pgoff, NULL)) {
1058 0 : vma = NULL;
1059 : }
1060 0 : if (!vma) {
1061 0 : vm_unacct_memory(pages);
1062 0 : ret = -ENOMEM;
1063 0 : goto out;
1064 : }
1065 :
1066 0 : vm_stat_account(mm, vma->vm_flags, pages);
1067 0 : if (vma->vm_flags & VM_LOCKED) {
1068 0 : mm->locked_vm += pages;
1069 0 : locked = true;
1070 0 : new_addr = addr;
1071 : }
1072 : ret = addr;
1073 : goto out;
1074 : }
1075 : }
1076 :
1077 : /*
1078 : * We weren't able to just expand or shrink the area,
1079 : * so we need to create a new one and move it.
1080 : */
1081 0 : ret = -ENOMEM;
1082 0 : if (flags & MREMAP_MAYMOVE) {
1083 0 : unsigned long map_flags = 0;
1084 0 : if (vma->vm_flags & VM_MAYSHARE)
1085 0 : map_flags |= MAP_SHARED;
1086 :
1087 0 : new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
1088 0 : vma->vm_pgoff +
1089 0 : ((addr - vma->vm_start) >> PAGE_SHIFT),
1090 : map_flags);
1091 0 : if (IS_ERR_VALUE(new_addr)) {
1092 : ret = new_addr;
1093 : goto out;
1094 : }
1095 :
1096 0 : ret = move_vma(vma, addr, old_len, new_len, new_addr,
1097 : &locked, flags, &uf, &uf_unmap);
1098 : }
1099 : out:
1100 0 : if (offset_in_page(ret))
1101 0 : locked = false;
1102 0 : if (downgraded)
1103 0 : mmap_read_unlock(current->mm);
1104 : else
1105 0 : mmap_write_unlock(current->mm);
1106 0 : if (locked && new_len > old_len)
1107 0 : mm_populate(new_addr + old_len, new_len - old_len);
1108 0 : userfaultfd_unmap_complete(mm, &uf_unmap_early);
1109 0 : mremap_userfaultfd_complete(&uf, addr, ret, old_len);
1110 0 : userfaultfd_unmap_complete(mm, &uf_unmap);
1111 0 : return ret;
1112 : }
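/*
 * Minimal userspace sketch (not part of mm/mremap.c; names and sizes are
 * illustrative) showing the MREMAP_MAYMOVE path that the syscall above
 * implements: grow an anonymous mapping, letting the kernel relocate it
 * if it cannot be expanded in place.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 4096, new_len = 8 * 4096;

	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* MREMAP_MAYMOVE allows the kernel to return a different address. */
	void *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED)
		return 1;

	printf("remapped %s\n", q == p ? "in place" : "to a new address");
	munmap(q, new_len);
	return 0;
}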