Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * mm/mremap.c
4 : *
5 : * (C) Copyright 1996 Linus Torvalds
6 : *
7 : * Address space accounting code <alan@lxorguk.ukuu.org.uk>
8 : * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
9 : */
10 :
11 : #include <linux/mm.h>
12 : #include <linux/mm_inline.h>
13 : #include <linux/hugetlb.h>
14 : #include <linux/shm.h>
15 : #include <linux/ksm.h>
16 : #include <linux/mman.h>
17 : #include <linux/swap.h>
18 : #include <linux/capability.h>
19 : #include <linux/fs.h>
20 : #include <linux/swapops.h>
21 : #include <linux/highmem.h>
22 : #include <linux/security.h>
23 : #include <linux/syscalls.h>
24 : #include <linux/mmu_notifier.h>
25 : #include <linux/uaccess.h>
26 : #include <linux/userfaultfd_k.h>
27 : #include <linux/mempolicy.h>
28 :
29 : #include <asm/cacheflush.h>
30 : #include <asm/tlb.h>
31 : #include <asm/pgalloc.h>
32 :
33 : #include "internal.h"
34 :
35 : static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
36 : {
37 : pgd_t *pgd;
38 : p4d_t *p4d;
39 : pud_t *pud;
40 :
41 0 : pgd = pgd_offset(mm, addr);
42 : if (pgd_none_or_clear_bad(pgd))
43 : return NULL;
44 :
45 0 : p4d = p4d_offset(pgd, addr);
46 : if (p4d_none_or_clear_bad(p4d))
47 : return NULL;
48 :
49 0 : pud = pud_offset(p4d, addr);
50 0 : if (pud_none_or_clear_bad(pud))
51 : return NULL;
52 :
53 : return pud;
54 : }
55 :
56 0 : static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
57 : {
58 : pud_t *pud;
59 : pmd_t *pmd;
60 :
61 0 : pud = get_old_pud(mm, addr);
62 0 : if (!pud)
63 : return NULL;
64 :
65 0 : pmd = pmd_offset(pud, addr);
66 0 : if (pmd_none(*pmd))
67 : return NULL;
68 :
69 0 : return pmd;
70 : }
71 :
72 : static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
73 : unsigned long addr)
74 : {
75 : pgd_t *pgd;
76 : p4d_t *p4d;
77 :
78 0 : pgd = pgd_offset(mm, addr);
79 0 : p4d = p4d_alloc(mm, pgd, addr);
80 0 : if (!p4d)
81 : return NULL;
82 :
83 0 : return pud_alloc(mm, p4d, addr);
84 : }
85 :
86 0 : static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
87 : unsigned long addr)
88 : {
89 : pud_t *pud;
90 : pmd_t *pmd;
91 :
92 0 : pud = alloc_new_pud(mm, vma, addr);
93 0 : if (!pud)
94 : return NULL;
95 :
96 0 : pmd = pmd_alloc(mm, pud, addr);
97 0 : if (!pmd)
98 : return NULL;
99 :
100 : VM_BUG_ON(pmd_trans_huge(*pmd));
101 :
102 : return pmd;
103 : }
104 :
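    : /*
    : * Lock ordering: take the file's i_mmap_rwsem before the anon_vma lock;
    : * drop_rmap_locks() below releases them in the reverse order.
    : */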
105 0 : static void take_rmap_locks(struct vm_area_struct *vma)
106 : {
107 0 : if (vma->vm_file)
108 0 : i_mmap_lock_write(vma->vm_file->f_mapping);
109 0 : if (vma->anon_vma)
110 0 : anon_vma_lock_write(vma->anon_vma);
111 0 : }
112 :
113 0 : static void drop_rmap_locks(struct vm_area_struct *vma)
114 : {
115 0 : if (vma->anon_vma)
116 0 : anon_vma_unlock_write(vma->anon_vma);
117 0 : if (vma->vm_file)
118 0 : i_mmap_unlock_write(vma->vm_file->f_mapping);
119 0 : }
120 :
121 : static pte_t move_soft_dirty_pte(pte_t pte)
122 : {
123 : /*
124 : * Set the soft dirty bit so we can notice
125 : * in userspace that the ptes were moved.
126 : */
127 : #ifdef CONFIG_MEM_SOFT_DIRTY
128 : if (pte_present(pte))
129 : pte = pte_mksoft_dirty(pte);
130 : else if (is_swap_pte(pte))
131 : pte = pte_swp_mksoft_dirty(pte);
132 : #endif
133 : return pte;
134 : }
135 :
136 0 : static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
137 : unsigned long old_addr, unsigned long old_end,
138 : struct vm_area_struct *new_vma, pmd_t *new_pmd,
139 : unsigned long new_addr, bool need_rmap_locks)
140 : {
141 0 : struct mm_struct *mm = vma->vm_mm;
142 : pte_t *old_pte, *new_pte, pte;
143 : spinlock_t *old_ptl, *new_ptl;
144 0 : bool force_flush = false;
145 0 : unsigned long len = old_end - old_addr;
146 :
147 : /*
148 : * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
149 : * locks to ensure that rmap will always observe either the old or the
150 : * new ptes. This is the easiest way to avoid races with
151 : * truncate_pagecache(), page migration, etc...
152 : *
153 : * When need_rmap_locks is false, we use other ways to avoid
154 : * such races:
155 : *
156 : * - During exec() shift_arg_pages(), we use a specially tagged vma
157 : * which rmap call sites look for using vma_is_temporary_stack().
158 : *
159 : * - During mremap(), new_vma is often known to be placed after vma
160 : * in rmap traversal order. This ensures rmap will always observe
161 : * either the old pte, or the new pte, or both (the page table locks
162 : * serialize access to individual ptes, but only rmap traversal
163 : * order guarantees that we won't miss both the old and new ptes).
164 : */
165 0 : if (need_rmap_locks)
166 0 : take_rmap_locks(vma);
167 :
168 : /*
169 : * We don't have to worry about the ordering of src and dst
170 : * pte locks because exclusive mmap_lock prevents deadlock.
171 : */
172 0 : old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
173 0 : new_pte = pte_offset_map(new_pmd, new_addr);
174 0 : new_ptl = pte_lockptr(mm, new_pmd);
175 : if (new_ptl != old_ptl)
176 : spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
177 0 : flush_tlb_batched_pending(vma->vm_mm);
178 : arch_enter_lazy_mmu_mode();
179 :
180 0 : for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
181 0 : new_pte++, new_addr += PAGE_SIZE) {
182 0 : if (pte_none(*old_pte))
183 0 : continue;
184 :
185 0 : pte = ptep_get_and_clear(mm, old_addr, old_pte);
186 : /*
187 : * If we are remapping a valid PTE, make sure
188 : * to flush TLB before we drop the PTL for the
189 : * PTE.
190 : *
191 : * NOTE! Both old and new PTL matter: the old one
192 : * for racing with page_mkclean(), the new one to
193 : * make sure the physical page stays valid until
194 : * the TLB entry for the old mapping has been
195 : * flushed.
196 : */
197 0 : if (pte_present(pte))
198 0 : force_flush = true;
199 0 : pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
200 : pte = move_soft_dirty_pte(pte);
201 0 : set_pte_at(mm, new_addr, new_pte, pte);
202 : }
203 :
204 : arch_leave_lazy_mmu_mode();
205 0 : if (force_flush)
206 0 : flush_tlb_range(vma, old_end - len, old_end);
207 : if (new_ptl != old_ptl)
208 : spin_unlock(new_ptl);
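    : /* old_pte/new_pte point one past the last entry processed by the loop above, hence the -1. */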
209 : pte_unmap(new_pte - 1);
210 0 : pte_unmap_unlock(old_pte - 1, old_ptl);
211 0 : if (need_rmap_locks)
212 0 : drop_rmap_locks(vma);
213 0 : }
214 :
215 : #ifndef arch_supports_page_table_move
216 : #define arch_supports_page_table_move arch_supports_page_table_move
217 : static inline bool arch_supports_page_table_move(void)
218 : {
219 : return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
220 : IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
221 : }
222 : #endif
223 :
224 : #ifdef CONFIG_HAVE_MOVE_PMD
225 : static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
226 : unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
227 : {
228 : spinlock_t *old_ptl, *new_ptl;
229 : struct mm_struct *mm = vma->vm_mm;
230 : pmd_t pmd;
231 :
232 : if (!arch_supports_page_table_move())
233 : return false;
234 : /*
235 : * The destination pmd shouldn't be established, free_pgtables()
236 : * should have released it.
237 : *
238 : * However, there's a case during execve() where we use mremap
239 : * to move the initial stack, and in that case the target area
240 : * may overlap the source area (always moving down).
241 : *
242 : * If everything is PMD-aligned, that works fine, as moving
243 : * each pmd down will clear the source pmd. But if we first
244 : * have a few 4kB-only pages that get moved down, and then
245 : * hit the "now the rest is PMD-aligned, let's do everything
246 : * one pmd at a time", we will still have the old (now empty
247 : * of any 4kB pages, but still there) PMD in the page table
248 : * tree.
249 : *
250 : * Warn on it once - because we really should try to figure
251 : * out how to do this better - but then say "I won't move
252 : * this pmd".
253 : *
254 : * One alternative might be to just unmap the target pmd at
255 : * this point, and verify that it really is empty. We'll see.
256 : */
257 : if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
258 : return false;
259 :
260 : /*
261 : * We don't have to worry about the ordering of src and dst
262 : * ptlocks because exclusive mmap_lock prevents deadlock.
263 : */
264 : old_ptl = pmd_lock(vma->vm_mm, old_pmd);
265 : new_ptl = pmd_lockptr(mm, new_pmd);
266 : if (new_ptl != old_ptl)
267 : spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
268 :
269 : /* Clear the pmd */
270 : pmd = *old_pmd;
271 : pmd_clear(old_pmd);
272 :
273 : VM_BUG_ON(!pmd_none(*new_pmd));
274 :
275 : pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
276 : flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
277 : if (new_ptl != old_ptl)
278 : spin_unlock(new_ptl);
279 : spin_unlock(old_ptl);
280 :
281 : return true;
282 : }
283 : #else
284 : static inline bool move_normal_pmd(struct vm_area_struct *vma,
285 : unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
286 : pmd_t *new_pmd)
287 : {
288 : return false;
289 : }
290 : #endif
291 :
292 : #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
293 : static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
294 : unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
295 : {
296 : spinlock_t *old_ptl, *new_ptl;
297 : struct mm_struct *mm = vma->vm_mm;
298 : pud_t pud;
299 :
300 : if (!arch_supports_page_table_move())
301 : return false;
302 : /*
303 : * The destination pud shouldn't be established, free_pgtables()
304 : * should have released it.
305 : */
306 : if (WARN_ON_ONCE(!pud_none(*new_pud)))
307 : return false;
308 :
309 : /*
310 : * We don't have to worry about the ordering of src and dst
311 : * ptlocks because exclusive mmap_lock prevents deadlock.
312 : */
313 : old_ptl = pud_lock(vma->vm_mm, old_pud);
314 : new_ptl = pud_lockptr(mm, new_pud);
315 : if (new_ptl != old_ptl)
316 : spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
317 :
318 : /* Clear the pud */
319 : pud = *old_pud;
320 : pud_clear(old_pud);
321 :
322 : VM_BUG_ON(!pud_none(*new_pud));
323 :
324 : pud_populate(mm, new_pud, pud_pgtable(pud));
325 : flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
326 : if (new_ptl != old_ptl)
327 : spin_unlock(new_ptl);
328 : spin_unlock(old_ptl);
329 :
330 : return true;
331 : }
332 : #else
333 : static inline bool move_normal_pud(struct vm_area_struct *vma,
334 : unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
335 : pud_t *new_pud)
336 : {
337 : return false;
338 : }
339 : #endif
340 :
341 : #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
342 : static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
343 : unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
344 : {
345 : spinlock_t *old_ptl, *new_ptl;
346 : struct mm_struct *mm = vma->vm_mm;
347 : pud_t pud;
348 :
349 : /*
350 : * The destination pud shouldn't be established, free_pgtables()
351 : * should have released it.
352 : */
353 : if (WARN_ON_ONCE(!pud_none(*new_pud)))
354 : return false;
355 :
356 : /*
357 : * We don't have to worry about the ordering of src and dst
358 : * ptlocks because exclusive mmap_lock prevents deadlock.
359 : */
360 : old_ptl = pud_lock(vma->vm_mm, old_pud);
361 : new_ptl = pud_lockptr(mm, new_pud);
362 : if (new_ptl != old_ptl)
363 : spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
364 :
365 : /* Clear the pud */
366 : pud = *old_pud;
367 : pud_clear(old_pud);
368 :
369 : VM_BUG_ON(!pud_none(*new_pud));
370 :
371 : /* Set the new pud */
372 : /* mark soft_dirty when we add pud level soft dirty support */
373 : set_pud_at(mm, new_addr, new_pud, pud);
374 : flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
375 : if (new_ptl != old_ptl)
376 : spin_unlock(new_ptl);
377 : spin_unlock(old_ptl);
378 :
379 : return true;
380 : }
381 : #else
382 : static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
383 : unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
384 : {
385 : WARN_ON_ONCE(1);
386 : return false;
387 :
388 : }
389 : #endif
390 :
391 : enum pgt_entry {
392 : NORMAL_PMD,
393 : HPAGE_PMD,
394 : NORMAL_PUD,
395 : HPAGE_PUD,
396 : };
397 :
398 : /*
399 : * Returns an extent of the corresponding size for the pgt_entry specified if
400 : * valid. Else returns a smaller extent bounded by the end of the source and
401 : * destination pgt_entry.
402 : */
403 : static __always_inline unsigned long get_extent(enum pgt_entry entry,
404 : unsigned long old_addr, unsigned long old_end,
405 : unsigned long new_addr)
406 : {
407 : unsigned long next, extent, mask, size;
408 :
409 : switch (entry) {
410 : case HPAGE_PMD:
411 : case NORMAL_PMD:
412 : mask = PMD_MASK;
413 : size = PMD_SIZE;
414 : break;
415 : case HPAGE_PUD:
416 : case NORMAL_PUD:
417 0 : mask = PUD_MASK;
418 0 : size = PUD_SIZE;
419 : break;
420 : default:
421 : BUILD_BUG();
422 : break;
423 : }
424 :
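    : /*
    : * Worked example for the PMD case (assuming a 2MB PMD, as on x86-64
    : * with 4K pages): old_addr = 0x2ff000 gives
    : * next = (0x2ff000 + 0x200000) & PMD_MASK = 0x400000, so extent =
    : * 0x101000 before being clamped by old_end and by the destination's
    : * own PMD boundary below.
    : */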
425 0 : next = (old_addr + size) & mask;
426 : /* even if next overflowed, extent below will be ok */
427 0 : extent = next - old_addr;
428 0 : if (extent > old_end - old_addr)
429 0 : extent = old_end - old_addr;
430 0 : next = (new_addr + size) & mask;
431 0 : if (extent > next - new_addr)
432 0 : extent = next - new_addr;
433 : return extent;
434 : }
435 :
436 : /*
437 : * Attempts to speedup the move by moving entry at the level corresponding to
438 : * pgt_entry. Returns true if the move was successful, else false.
439 : */
440 : static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
441 : unsigned long old_addr, unsigned long new_addr,
442 : void *old_entry, void *new_entry, bool need_rmap_locks)
443 : {
444 : bool moved = false;
445 :
446 : /* See comment in move_ptes() */
447 : if (need_rmap_locks)
448 : take_rmap_locks(vma);
449 :
450 : switch (entry) {
451 : case NORMAL_PMD:
452 : moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
453 : new_entry);
454 : break;
455 : case NORMAL_PUD:
456 : moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
457 : new_entry);
458 : break;
459 : case HPAGE_PMD:
460 : moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
461 : move_huge_pmd(vma, old_addr, new_addr, old_entry,
462 : new_entry);
463 : break;
464 : case HPAGE_PUD:
465 : moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
466 : move_huge_pud(vma, old_addr, new_addr, old_entry,
467 : new_entry);
468 : break;
469 :
470 : default:
471 : WARN_ON_ONCE(1);
472 : break;
473 : }
474 :
475 : if (need_rmap_locks)
476 : drop_rmap_locks(vma);
477 :
478 : return moved;
479 : }
480 :
481 0 : unsigned long move_page_tables(struct vm_area_struct *vma,
482 : unsigned long old_addr, struct vm_area_struct *new_vma,
483 : unsigned long new_addr, unsigned long len,
484 : bool need_rmap_locks)
485 : {
486 : unsigned long extent, old_end;
487 : struct mmu_notifier_range range;
488 : pmd_t *old_pmd, *new_pmd;
489 : pud_t *old_pud, *new_pud;
490 :
491 0 : if (!len)
492 : return 0;
493 :
494 0 : old_end = old_addr + len;
495 :
496 0 : if (is_vm_hugetlb_page(vma))
497 : return move_hugetlb_page_tables(vma, new_vma, old_addr,
498 : new_addr, len);
499 :
500 0 : flush_cache_range(vma, old_addr, old_end);
501 : mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
502 : old_addr, old_end);
503 : mmu_notifier_invalidate_range_start(&range);
504 :
505 0 : for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
506 0 : cond_resched();
507 : /*
508 : * If extent is PUD-sized try to speed up the move by moving at the
509 : * PUD level if possible.
510 : */
511 0 : extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
512 :
513 0 : old_pud = get_old_pud(vma->vm_mm, old_addr);
514 0 : if (!old_pud)
515 0 : continue;
516 0 : new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
517 0 : if (!new_pud)
518 : break;
519 : if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
520 : if (extent == HPAGE_PUD_SIZE) {
521 : move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
522 : old_pud, new_pud, need_rmap_locks);
523 : /* We ignore and continue on error? */
524 : continue;
525 : }
526 : } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
527 :
528 : if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
529 : old_pud, new_pud, true))
530 : continue;
531 : }
532 :
533 0 : extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
534 0 : old_pmd = get_old_pmd(vma->vm_mm, old_addr);
535 0 : if (!old_pmd)
536 0 : continue;
537 0 : new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
538 0 : if (!new_pmd)
539 : break;
540 0 : if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
541 0 : pmd_devmap(*old_pmd)) {
542 : if (extent == HPAGE_PMD_SIZE &&
543 : move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
544 : old_pmd, new_pmd, need_rmap_locks))
545 : continue;
546 : split_huge_pmd(vma, old_pmd, old_addr);
547 : if (pmd_trans_unstable(old_pmd))
548 : continue;
549 : } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
550 : extent == PMD_SIZE) {
551 : /*
552 : * If the extent is PMD-sized, try to speed the move by
553 : * moving at the PMD level if possible.
554 : */
555 : if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
556 : old_pmd, new_pmd, true))
557 : continue;
558 : }
559 :
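    : /* Make sure the destination pmd has a pte page table before moving individual ptes; bail out if allocation fails. */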
560 0 : if (pte_alloc(new_vma->vm_mm, new_pmd))
561 : break;
562 0 : move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
563 : new_pmd, new_addr, need_rmap_locks);
564 : }
565 :
566 0 : mmu_notifier_invalidate_range_end(&range);
567 :
568 0 : return len + old_addr - old_end; /* how much done */
569 : }
570 :
571 0 : static unsigned long move_vma(struct vm_area_struct *vma,
572 : unsigned long old_addr, unsigned long old_len,
573 : unsigned long new_len, unsigned long new_addr,
574 : bool *locked, unsigned long flags,
575 : struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
576 : {
577 0 : long to_account = new_len - old_len;
578 0 : struct mm_struct *mm = vma->vm_mm;
579 : struct vm_area_struct *new_vma;
580 0 : unsigned long vm_flags = vma->vm_flags;
581 : unsigned long new_pgoff;
582 : unsigned long moved_len;
583 0 : unsigned long account_start = 0;
584 0 : unsigned long account_end = 0;
585 : unsigned long hiwater_vm;
586 0 : int err = 0;
587 : bool need_rmap_locks;
588 : struct vma_iterator vmi;
589 :
590 : /*
591 : * We'd prefer to avoid failure later on in do_munmap,
592 : * which may split one vma into three before unmapping.
593 : */
594 0 : if (mm->map_count >= sysctl_max_map_count - 3)
595 : return -ENOMEM;
596 :
597 0 : if (unlikely(flags & MREMAP_DONTUNMAP))
598 0 : to_account = new_len;
599 :
600 0 : if (vma->vm_ops && vma->vm_ops->may_split) {
601 0 : if (vma->vm_start != old_addr)
602 0 : err = vma->vm_ops->may_split(vma, old_addr);
603 0 : if (!err && vma->vm_end != old_addr + old_len)
604 0 : err = vma->vm_ops->may_split(vma, old_addr + old_len);
605 0 : if (err)
606 0 : return err;
607 : }
608 :
609 : /*
610 : * Advise KSM to break any KSM pages in the area to be moved:
611 : * it would be confusing if they were to turn up at the new
612 : * location, where they happen to coincide with different KSM
613 : * pages recently unmapped. But leave vma->vm_flags as it was,
614 : * so KSM can come around to merge on vma and new_vma afterwards.
615 : */
616 0 : err = ksm_madvise(vma, old_addr, old_addr + old_len,
617 : MADV_UNMERGEABLE, &vm_flags);
618 : if (err)
619 : return err;
620 :
621 0 : if (vm_flags & VM_ACCOUNT) {
622 0 : if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
623 : return -ENOMEM;
624 : }
625 :
626 0 : vma_start_write(vma);
627 0 : new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
628 0 : new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
629 : &need_rmap_locks);
630 0 : if (!new_vma) {
631 0 : if (vm_flags & VM_ACCOUNT)
632 0 : vm_unacct_memory(to_account >> PAGE_SHIFT);
633 : return -ENOMEM;
634 : }
635 :
636 0 : moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
637 : need_rmap_locks);
638 0 : if (moved_len < old_len) {
639 : err = -ENOMEM;
640 0 : } else if (vma->vm_ops && vma->vm_ops->mremap) {
641 0 : err = vma->vm_ops->mremap(new_vma);
642 : }
643 :
644 0 : if (unlikely(err)) {
645 : /*
646 : * On error, move entries back from new area to old,
647 : * which will succeed since page tables still there,
648 : * and then proceed to unmap new area instead of old.
649 : */
650 0 : move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
651 : true);
652 0 : vma = new_vma;
653 0 : old_len = new_len;
654 0 : old_addr = new_addr;
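    : /* Hand the error back through new_addr; the final "return new_addr" propagates it. */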
655 0 : new_addr = err;
656 : } else {
657 : mremap_userfaultfd_prep(new_vma, uf);
658 : }
659 :
660 0 : if (is_vm_hugetlb_page(vma)) {
661 : clear_vma_resv_huge_pages(vma);
662 : }
663 :
664 : /* Conceal VM_ACCOUNT so old reservation is not undone */
665 0 : if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
666 0 : vm_flags_clear(vma, VM_ACCOUNT);
667 0 : if (vma->vm_start < old_addr)
668 0 : account_start = vma->vm_start;
669 0 : if (vma->vm_end > old_addr + old_len)
670 0 : account_end = vma->vm_end;
671 : }
672 :
673 : /*
674 : * If we failed to move page tables we still do total_vm increment
675 : * since do_munmap() will decrement it by old_len == new_len.
676 : *
677 : * Since total_vm is about to be raised artificially high for a
678 : * moment, we need to restore high watermark afterwards: if stats
679 : * are taken meanwhile, total_vm and hiwater_vm appear too high.
680 : * If this were a serious issue, we'd add a flag to do_munmap().
681 : */
682 0 : hiwater_vm = mm->hiwater_vm;
683 0 : vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
684 :
685 : /* Tell the pfnmap tracking code that the mapping has moved from this vma */
686 0 : if (unlikely(vma->vm_flags & VM_PFNMAP))
687 : untrack_pfn_clear(vma);
688 :
689 0 : if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
690 : /* We always clear VM_LOCKED[ONFAULT] on the old vma */
691 0 : vm_flags_clear(vma, VM_LOCKED_MASK);
692 :
693 : /*
694 : * anon_vma links of the old vma are no longer needed after its page
695 : * table has been moved.
696 : */
697 0 : if (new_vma != vma && vma->vm_start == old_addr &&
698 0 : vma->vm_end == (old_addr + old_len))
699 0 : unlink_anon_vmas(vma);
700 :
701 : /* Because we won't unmap we don't need to touch locked_vm */
702 : return new_addr;
703 : }
704 :
705 0 : vma_iter_init(&vmi, mm, old_addr);
706 0 : if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
707 : /* OOM: unable to split vma, just get accounts right */
708 0 : if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
709 0 : vm_acct_memory(old_len >> PAGE_SHIFT);
710 : account_start = account_end = 0;
711 : }
712 :
713 0 : if (vm_flags & VM_LOCKED) {
714 0 : mm->locked_vm += new_len >> PAGE_SHIFT;
715 0 : *locked = true;
716 : }
717 :
718 0 : mm->hiwater_vm = hiwater_vm;
719 :
720 : /* Restore VM_ACCOUNT if one or two pieces of vma left */
721 0 : if (account_start) {
722 0 : vma = vma_prev(&vmi);
723 0 : vm_flags_set(vma, VM_ACCOUNT);
724 : }
725 :
726 0 : if (account_end) {
727 0 : vma = vma_next(&vmi);
728 0 : vm_flags_set(vma, VM_ACCOUNT);
729 : }
730 :
731 : return new_addr;
732 : }
733 :
734 0 : static struct vm_area_struct *vma_to_resize(unsigned long addr,
735 : unsigned long old_len, unsigned long new_len, unsigned long flags)
736 : {
737 0 : struct mm_struct *mm = current->mm;
738 : struct vm_area_struct *vma;
739 : unsigned long pgoff;
740 :
741 0 : vma = vma_lookup(mm, addr);
742 0 : if (!vma)
743 : return ERR_PTR(-EFAULT);
744 :
745 : /*
746 : * !old_len is a special case where an attempt is made to 'duplicate'
747 : * a mapping. This makes no sense for private mappings as it will
748 : * instead create a fresh/new mapping unrelated to the original. This
749 : * is contrary to the basic idea of mremap which creates new mappings
750 : * based on the original. There are no known use cases for this
751 : * behavior. As a result, fail such attempts.
752 : */
753 0 : if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
754 0 : pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
755 : return ERR_PTR(-EINVAL);
756 : }
757 :
758 0 : if ((flags & MREMAP_DONTUNMAP) &&
759 0 : (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
760 : return ERR_PTR(-EINVAL);
761 :
762 : /* We can't remap across vm area boundaries */
763 0 : if (old_len > vma->vm_end - addr)
764 : return ERR_PTR(-EFAULT);
765 :
766 0 : if (new_len == old_len)
767 : return vma;
768 :
769 : /* Need to be careful about a growing mapping */
770 0 : pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
771 0 : pgoff += vma->vm_pgoff;
772 0 : if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
773 : return ERR_PTR(-EINVAL);
774 :
775 0 : if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
776 : return ERR_PTR(-EFAULT);
777 :
778 0 : if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
779 : return ERR_PTR(-EAGAIN);
780 :
781 0 : if (!may_expand_vm(mm, vma->vm_flags,
782 : (new_len - old_len) >> PAGE_SHIFT))
783 : return ERR_PTR(-ENOMEM);
784 :
785 0 : return vma;
786 : }
787 :
788 0 : static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
789 : unsigned long new_addr, unsigned long new_len, bool *locked,
790 : unsigned long flags, struct vm_userfaultfd_ctx *uf,
791 : struct list_head *uf_unmap_early,
792 : struct list_head *uf_unmap)
793 : {
794 0 : struct mm_struct *mm = current->mm;
795 : struct vm_area_struct *vma;
796 0 : unsigned long ret = -EINVAL;
797 0 : unsigned long map_flags = 0;
798 :
799 0 : if (offset_in_page(new_addr))
800 : goto out;
801 :
802 0 : if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
803 : goto out;
804 :
805 : /* Ensure the old/new locations do not overlap */
806 0 : if (addr + old_len > new_addr && new_addr + new_len > addr)
807 : goto out;
808 :
809 : /*
810 : * move_vma() needs us to stay 4 maps below the threshold, otherwise
811 : * it will bail out at the very beginning.
812 : * That is a problem if we have already unmapped the regions here
813 : * (new_addr and old_addr), because userspace will not know the
814 : * state of the vma's after it gets -ENOMEM.
815 : * So, to avoid such a scenario, we pre-compute whether the whole
816 : * operation has a high chance of succeeding map-wise.
817 : * The worst case is when both vma's (new_addr and old_addr) get
818 : * split in 3 before being unmapped.
819 : * That means 2 more maps (1 for each) on top of the ones we already hold.
820 : * Check whether the current map count plus 2 still leaves us 4 maps below
821 : * the threshold; otherwise return -ENOMEM here to be safe.
822 : */
823 0 : if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
824 : return -ENOMEM;
825 :
826 0 : if (flags & MREMAP_FIXED) {
827 0 : ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
828 0 : if (ret)
829 : goto out;
830 : }
831 :
832 0 : if (old_len > new_len) {
833 0 : ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
834 0 : if (ret)
835 : goto out;
836 : old_len = new_len;
837 : }
838 :
839 0 : vma = vma_to_resize(addr, old_len, new_len, flags);
840 0 : if (IS_ERR(vma)) {
841 : ret = PTR_ERR(vma);
842 : goto out;
843 : }
844 :
845 : /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
846 0 : if (flags & MREMAP_DONTUNMAP &&
847 0 : !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
848 : ret = -ENOMEM;
849 : goto out;
850 : }
851 :
852 0 : if (flags & MREMAP_FIXED)
853 0 : map_flags |= MAP_FIXED;
854 :
855 0 : if (vma->vm_flags & VM_MAYSHARE)
856 0 : map_flags |= MAP_SHARED;
857 :
858 0 : ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
859 0 : ((addr - vma->vm_start) >> PAGE_SHIFT),
860 : map_flags);
861 0 : if (IS_ERR_VALUE(ret))
862 : goto out;
863 :
864 : /* We got a new mapping */
865 0 : if (!(flags & MREMAP_FIXED))
866 0 : new_addr = ret;
867 :
868 0 : ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
869 : uf_unmap);
870 :
871 : out:
872 : return ret;
873 : }
874 :
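    : /*
    : * Can this vma grow in place by @delta bytes? The extended range must not
    : * wrap, must not run into another vma, and get_unmapped_area() must accept
    : * a MAP_FIXED request for the enlarged size at the current start address.
    : */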
875 0 : static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
876 : {
877 0 : unsigned long end = vma->vm_end + delta;
878 :
879 0 : if (end < vma->vm_end) /* overflow */
880 : return 0;
881 0 : if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
882 : return 0;
883 0 : if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
884 0 : 0, MAP_FIXED) & ~PAGE_MASK)
885 : return 0;
886 0 : return 1;
887 : }
888 :
889 : /*
890 : * Expand (or shrink) an existing mapping, potentially moving it at the
891 : * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
892 : *
893 : * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
894 : * This option implies MREMAP_MAYMOVE.
895 : */
896 0 : SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
897 : unsigned long, new_len, unsigned long, flags,
898 : unsigned long, new_addr)
899 : {
900 0 : struct mm_struct *mm = current->mm;
901 : struct vm_area_struct *vma;
902 0 : unsigned long ret = -EINVAL;
903 0 : bool locked = false;
904 0 : bool downgraded = false;
905 : struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
906 0 : LIST_HEAD(uf_unmap_early);
907 0 : LIST_HEAD(uf_unmap);
908 :
909 : /*
910 : * There is a deliberate asymmetry here: we strip the pointer tag
911 : * from the old address but leave the new address alone. This is
912 : * for consistency with mmap(), where we prevent the creation of
913 : * aliasing mappings in userspace by leaving the tag bits of the
914 : * mapping address intact. A non-zero tag will cause the subsequent
915 : * range checks to reject the address as invalid.
916 : *
917 : * See Documentation/arm64/tagged-address-abi.rst for more information.
918 : */
919 0 : addr = untagged_addr(addr);
920 :
921 0 : if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
922 : return ret;
923 :
924 0 : if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
925 : return ret;
926 :
927 : /*
928 : * MREMAP_DONTUNMAP is always a move and it does not allow resizing
929 : * in the process.
930 : */
931 0 : if (flags & MREMAP_DONTUNMAP &&
932 0 : (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
933 : return ret;
934 :
935 :
936 0 : if (offset_in_page(addr))
937 : return ret;
938 :
939 0 : old_len = PAGE_ALIGN(old_len);
940 0 : new_len = PAGE_ALIGN(new_len);
941 :
942 : /*
943 : * We allow a zero old-len as a special case
944 : * for DOS-emu "duplicate shm area" thing. But
945 : * a zero new-len is nonsensical.
946 : */
947 0 : if (!new_len)
948 : return ret;
949 :
950 0 : if (mmap_write_lock_killable(current->mm))
951 : return -EINTR;
952 0 : vma = vma_lookup(mm, addr);
953 0 : if (!vma) {
954 : ret = -EFAULT;
955 : goto out;
956 : }
957 :
958 : if (is_vm_hugetlb_page(vma)) {
959 : struct hstate *h __maybe_unused = hstate_vma(vma);
960 :
961 : old_len = ALIGN(old_len, huge_page_size(h));
962 : new_len = ALIGN(new_len, huge_page_size(h));
963 :
964 : /* addrs must be huge page aligned */
965 : if (addr & ~huge_page_mask(h))
966 : goto out;
967 : if (new_addr & ~huge_page_mask(h))
968 : goto out;
969 :
970 : /*
971 : * Don't allow remap expansion, because the underlying hugetlb
972 : * reservation is not yet capable of handling split reservations.
973 : */
974 : if (new_len > old_len)
975 : goto out;
976 : }
977 :
978 0 : if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
979 0 : ret = mremap_to(addr, old_len, new_addr, new_len,
980 : &locked, flags, &uf, &uf_unmap_early,
981 : &uf_unmap);
982 0 : goto out;
983 : }
984 :
985 : /*
986 : * Always allow a shrinking remap: that just unmaps
987 : * the unnecessary pages.
988 : * do_vmi_munmap does all the needed commit accounting, and
989 : * downgrades mmap_lock to read if so directed.
990 : */
991 0 : if (old_len >= new_len) {
992 : int retval;
993 0 : VMA_ITERATOR(vmi, mm, addr + new_len);
994 :
995 0 : retval = do_vmi_munmap(&vmi, mm, addr + new_len,
996 : old_len - new_len, &uf_unmap, true);
997 : /* Returning 1 indicates mmap_lock is downgraded to read. */
998 0 : if (retval == 1) {
999 : downgraded = true;
1000 0 : } else if (retval < 0 && old_len != new_len) {
1001 0 : ret = retval;
1002 0 : goto out;
1003 : }
1004 :
1005 : ret = addr;
1006 : goto out;
1007 : }
1008 :
1009 : /*
1010 : * Ok, we need to grow.
1011 : */
1012 0 : vma = vma_to_resize(addr, old_len, new_len, flags);
1013 0 : if (IS_ERR(vma)) {
1014 : ret = PTR_ERR(vma);
1015 : goto out;
1016 : }
1017 :
1018 : /* old_len reaches exactly to the end of the area.
1019 : */
1020 0 : if (old_len == vma->vm_end - addr) {
1021 : /* can we just expand the current mapping? */
1022 0 : if (vma_expandable(vma, new_len - old_len)) {
1023 0 : long pages = (new_len - old_len) >> PAGE_SHIFT;
1024 0 : unsigned long extension_start = addr + old_len;
1025 0 : unsigned long extension_end = addr + new_len;
1026 0 : pgoff_t extension_pgoff = vma->vm_pgoff +
1027 0 : ((extension_start - vma->vm_start) >> PAGE_SHIFT);
1028 0 : VMA_ITERATOR(vmi, mm, extension_start);
1029 :
1030 0 : if (vma->vm_flags & VM_ACCOUNT) {
1031 0 : if (security_vm_enough_memory_mm(mm, pages)) {
1032 : ret = -ENOMEM;
1033 : goto out;
1034 : }
1035 : }
1036 :
1037 : /*
1038 : * vma_merge() is called on the extension we are adding to
1039 : * the already existing vma; it will merge this extension
1040 : * with the existing vma (the expand operation itself) and
1041 : * possibly also with the next vma, if that vma becomes
1042 : * adjacent to the expanded vma and is otherwise
1043 : * compatible.
1044 : */
1045 0 : vma = vma_merge(&vmi, mm, vma, extension_start,
1046 : extension_end, vma->vm_flags, vma->anon_vma,
1047 : vma->vm_file, extension_pgoff, vma_policy(vma),
1048 : vma->vm_userfaultfd_ctx, anon_vma_name(vma));
1049 0 : if (!vma) {
1050 0 : vm_unacct_memory(pages);
1051 0 : ret = -ENOMEM;
1052 0 : goto out;
1053 : }
1054 :
1055 0 : vm_stat_account(mm, vma->vm_flags, pages);
1056 0 : if (vma->vm_flags & VM_LOCKED) {
1057 0 : mm->locked_vm += pages;
1058 0 : locked = true;
1059 0 : new_addr = addr;
1060 : }
1061 : ret = addr;
1062 : goto out;
1063 : }
1064 : }
1065 :
1066 : /*
1067 : * We weren't able to just expand or shrink the area,
1068 : * so we need to create a new one and move it.
1069 : */
1070 0 : ret = -ENOMEM;
1071 0 : if (flags & MREMAP_MAYMOVE) {
1072 0 : unsigned long map_flags = 0;
1073 0 : if (vma->vm_flags & VM_MAYSHARE)
1074 0 : map_flags |= MAP_SHARED;
1075 :
1076 0 : new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
1077 0 : vma->vm_pgoff +
1078 0 : ((addr - vma->vm_start) >> PAGE_SHIFT),
1079 : map_flags);
1080 0 : if (IS_ERR_VALUE(new_addr)) {
1081 : ret = new_addr;
1082 : goto out;
1083 : }
1084 :
1085 0 : ret = move_vma(vma, addr, old_len, new_len, new_addr,
1086 : &locked, flags, &uf, &uf_unmap);
1087 : }
1088 : out:
1089 0 : if (offset_in_page(ret))
1090 0 : locked = false;
1091 0 : if (downgraded)
1092 0 : mmap_read_unlock(current->mm);
1093 : else
1094 0 : mmap_write_unlock(current->mm);
1095 0 : if (locked && new_len > old_len)
1096 0 : mm_populate(new_addr + old_len, new_len - old_len);
1097 0 : userfaultfd_unmap_complete(mm, &uf_unmap_early);
1098 0 : mremap_userfaultfd_complete(&uf, addr, ret, old_len);
1099 0 : userfaultfd_unmap_complete(mm, &uf_unmap);
1100 0 : return ret;
1101 : }