LCOV - code coverage report
Current view: top level - mm - rmap.c (source / functions)
Test: coverage.info          Date: 2023-07-19 18:55:55
                 Hit    Total    Coverage
Lines:             4      561      0.7 %
Functions:         1       40      2.5 %

          Line data    Source code
       1             : /*
       2             :  * mm/rmap.c - physical to virtual reverse mappings
       3             :  *
       4             :  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
       5             :  * Released under the General Public License (GPL).
       6             :  *
       7             :  * Simple, low overhead reverse mapping scheme.
       8             :  * Please try to keep this thing as modular as possible.
       9             :  *
      10             :  * Provides methods for unmapping each kind of mapped page:
      11             :  * the anon methods track anonymous pages, and
      12             :  * the file methods track pages belonging to an inode.
      13             :  *
      14             :  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
      15             :  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
      16             :  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
      17             :  * Contributions by Hugh Dickins 2003, 2004
      18             :  */
      19             : 
      20             : /*
      21             :  * Lock ordering in mm:
      22             :  *
      23             :  * inode->i_rwsem    (while writing or truncating, not reading or faulting)
      24             :  *   mm->mmap_lock
      25             :  *     mapping->invalidate_lock (in filemap_fault)
      26             :  *       page->flags PG_locked (lock_page)
      27             :  *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
      28             :  *           vma_start_write
      29             :  *             mapping->i_mmap_rwsem
      30             :  *               anon_vma->rwsem
      31             :  *                 mm->page_table_lock or pte_lock
      32             :  *                   swap_lock (in swap_duplicate, swap_info_get)
      33             :  *                     mmlist_lock (in mmput, drain_mmlist and others)
      34             :  *                     mapping->private_lock (in block_dirty_folio)
      35             :  *                       folio_lock_memcg move_lock (in block_dirty_folio)
      36             :  *                         i_pages lock (widely used)
      37             :  *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
      38             :  *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
      39             :  *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
      40             :  *                       sb_lock (within inode_lock in fs/fs-writeback.c)
      41             :  *                       i_pages lock (widely used, in set_page_dirty,
      42             :  *                                 in arch-dependent flush_dcache_mmap_lock,
      43             :  *                                 within bdi.wb->list_lock in __sync_single_inode)
      44             :  *
      45             :  * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
      46             :  *   ->tasklist_lock
      47             :  *     pte map lock
      48             :  *
      49             :  * hugetlbfs PageHuge() take locks in this order:
      50             :  *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
      51             :  *     vma_lock (hugetlb specific lock for pmd_sharing)
      52             :  *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
      53             :  *         page->flags PG_locked (lock_page)
      54             :  */
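
As a rough illustration of the ordering above, a hypothetical caller working on a file-backed VMA would nest a subset of these locks as sketched below (illustrative only, not part of rmap.c; the function name is made up):

static void example_nested_locking(struct vm_area_struct *vma,
				   struct folio *folio)
{
	struct address_space *mapping = vma->vm_file->f_mapping;

	mmap_read_lock(vma->vm_mm);		/* mm->mmap_lock */
	folio_lock(folio);			/* page->flags PG_locked */
	i_mmap_lock_read(mapping);		/* mapping->i_mmap_rwsem */
	/* pte_lock and everything below it nests inside this point */
	i_mmap_unlock_read(mapping);
	folio_unlock(folio);
	mmap_read_unlock(vma->vm_mm);
}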
      55             : 
      56             : #include <linux/mm.h>
      57             : #include <linux/sched/mm.h>
      58             : #include <linux/sched/task.h>
      59             : #include <linux/pagemap.h>
      60             : #include <linux/swap.h>
      61             : #include <linux/swapops.h>
      62             : #include <linux/slab.h>
      63             : #include <linux/init.h>
      64             : #include <linux/ksm.h>
      65             : #include <linux/rmap.h>
      66             : #include <linux/rcupdate.h>
      67             : #include <linux/export.h>
      68             : #include <linux/memcontrol.h>
      69             : #include <linux/mmu_notifier.h>
      70             : #include <linux/migrate.h>
      71             : #include <linux/hugetlb.h>
      72             : #include <linux/huge_mm.h>
      73             : #include <linux/backing-dev.h>
      74             : #include <linux/page_idle.h>
      75             : #include <linux/memremap.h>
      76             : #include <linux/userfaultfd_k.h>
      77             : #include <linux/mm_inline.h>
      78             : 
      79             : #include <asm/tlbflush.h>
      80             : 
      81             : #define CREATE_TRACE_POINTS
      82             : #include <trace/events/tlb.h>
      83             : #include <trace/events/migrate.h>
      84             : 
      85             : #include "internal.h"
      86             : 
      87             : static struct kmem_cache *anon_vma_cachep;
      88             : static struct kmem_cache *anon_vma_chain_cachep;
      89             : 
      90           0 : static inline struct anon_vma *anon_vma_alloc(void)
      91             : {
      92             :         struct anon_vma *anon_vma;
      93             : 
      94           0 :         anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
      95           0 :         if (anon_vma) {
      96           0 :                 atomic_set(&anon_vma->refcount, 1);
      97           0 :                 anon_vma->num_children = 0;
      98           0 :                 anon_vma->num_active_vmas = 0;
      99           0 :                 anon_vma->parent = anon_vma;
     100             :                 /*
     101             :                  * Initialise the anon_vma root to point to itself. If called
      102             :                  * from fork, the root will be reset to the parent's anon_vma.
     103             :                  */
     104           0 :                 anon_vma->root = anon_vma;
     105             :         }
     106             : 
     107           0 :         return anon_vma;
     108             : }
     109             : 
     110           0 : static inline void anon_vma_free(struct anon_vma *anon_vma)
     111             : {
     112             :         VM_BUG_ON(atomic_read(&anon_vma->refcount));
     113             : 
     114             :         /*
     115             :          * Synchronize against folio_lock_anon_vma_read() such that
     116             :          * we can safely hold the lock without the anon_vma getting
     117             :          * freed.
     118             :          *
     119             :          * Relies on the full mb implied by the atomic_dec_and_test() from
     120             :          * put_anon_vma() against the acquire barrier implied by
     121             :          * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
     122             :          *
     123             :          * folio_lock_anon_vma_read()   VS      put_anon_vma()
     124             :          *   down_read_trylock()                  atomic_dec_and_test()
     125             :          *   LOCK                                 MB
     126             :          *   atomic_read()                        rwsem_is_locked()
     127             :          *
     128             :          * LOCK should suffice since the actual taking of the lock must
     129             :          * happen _before_ what follows.
     130             :          */
     131             :         might_sleep();
     132           0 :         if (rwsem_is_locked(&anon_vma->root->rwsem)) {
     133           0 :                 anon_vma_lock_write(anon_vma);
     134           0 :                 anon_vma_unlock_write(anon_vma);
     135             :         }
     136             : 
     137           0 :         kmem_cache_free(anon_vma_cachep, anon_vma);
     138           0 : }
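
For reference, the put_anon_vma() side of the ordering described above is roughly the following (a sketch; the real helper lives in include/linux/rmap.h and may differ in detail):

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}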
     139             : 
     140             : static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
     141             : {
     142           0 :         return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
     143             : }
     144             : 
     145             : static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
     146             : {
     147           0 :         kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
     148             : }
     149             : 
     150             : static void anon_vma_chain_link(struct vm_area_struct *vma,
     151             :                                 struct anon_vma_chain *avc,
     152             :                                 struct anon_vma *anon_vma)
     153             : {
     154           0 :         avc->vma = vma;
     155           0 :         avc->anon_vma = anon_vma;
     156           0 :         list_add(&avc->same_vma, &vma->anon_vma_chain);
     157           0 :         anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
     158             : }
     159             : 
     160             : /**
     161             :  * __anon_vma_prepare - attach an anon_vma to a memory region
     162             :  * @vma: the memory region in question
     163             :  *
     164             :  * This makes sure the memory mapping described by 'vma' has
     165             :  * an 'anon_vma' attached to it, so that we can associate the
     166             :  * anonymous pages mapped into it with that anon_vma.
     167             :  *
     168             :  * The common case will be that we already have one, which
     169             :  * is handled inline by anon_vma_prepare(). But if
     170             :  * not we either need to find an adjacent mapping that we
     171             :  * can re-use the anon_vma from (very common when the only
     172             :  * reason for splitting a vma has been mprotect()), or we
     173             :  * allocate a new one.
     174             :  *
     175             :  * Anon-vma allocations are very subtle, because we may have
     176             :  * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
     177             :  * and that may actually touch the rwsem even in the newly
     178             :  * allocated vma (it depends on RCU to make sure that the
     179             :  * anon_vma isn't actually destroyed).
     180             :  *
     181             :  * As a result, we need to do proper anon_vma locking even
     182             :  * for the new allocation. At the same time, we do not want
     183             :  * to do any locking for the common case of already having
     184             :  * an anon_vma.
     185             :  *
     186             :  * This must be called with the mmap_lock held for reading.
     187             :  */
     188           0 : int __anon_vma_prepare(struct vm_area_struct *vma)
     189             : {
     190           0 :         struct mm_struct *mm = vma->vm_mm;
     191             :         struct anon_vma *anon_vma, *allocated;
     192             :         struct anon_vma_chain *avc;
     193             : 
     194             :         might_sleep();
     195             : 
     196           0 :         avc = anon_vma_chain_alloc(GFP_KERNEL);
     197           0 :         if (!avc)
     198             :                 goto out_enomem;
     199             : 
     200           0 :         anon_vma = find_mergeable_anon_vma(vma);
     201           0 :         allocated = NULL;
     202           0 :         if (!anon_vma) {
     203           0 :                 anon_vma = anon_vma_alloc();
     204           0 :                 if (unlikely(!anon_vma))
     205             :                         goto out_enomem_free_avc;
     206           0 :                 anon_vma->num_children++; /* self-parent link for new root */
     207           0 :                 allocated = anon_vma;
     208             :         }
     209             : 
     210           0 :         anon_vma_lock_write(anon_vma);
     211             :         /* page_table_lock to protect against threads */
     212           0 :         spin_lock(&mm->page_table_lock);
     213           0 :         if (likely(!vma->anon_vma)) {
     214           0 :                 vma->anon_vma = anon_vma;
     215           0 :                 anon_vma_chain_link(vma, avc, anon_vma);
     216           0 :                 anon_vma->num_active_vmas++;
     217           0 :                 allocated = NULL;
     218           0 :                 avc = NULL;
     219             :         }
     220           0 :         spin_unlock(&mm->page_table_lock);
     221           0 :         anon_vma_unlock_write(anon_vma);
     222             : 
     223           0 :         if (unlikely(allocated))
     224             :                 put_anon_vma(allocated);
     225           0 :         if (unlikely(avc))
     226             :                 anon_vma_chain_free(avc);
     227             : 
     228             :         return 0;
     229             : 
     230             :  out_enomem_free_avc:
     231             :         anon_vma_chain_free(avc);
     232             :  out_enomem:
     233             :         return -ENOMEM;
     234             : }
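
For context, the inline fast path that the comment above refers to looks roughly like this (a sketch; the real anon_vma_prepare() wrapper lives in include/linux/rmap.h):

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}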
     235             : 
     236             : /*
     237             :  * This is a useful helper function for locking the anon_vma root as
     238             :  * we traverse the vma->anon_vma_chain, looping over anon_vma's that
     239             :  * have the same vma.
     240             :  *
     241             :  * Such anon_vma's should have the same root, so you'd expect to see
      242             :  * just a single down_write of the root rwsem for the whole traversal.
     243             :  */
     244           0 : static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
     245             : {
     246           0 :         struct anon_vma *new_root = anon_vma->root;
     247           0 :         if (new_root != root) {
     248           0 :                 if (WARN_ON_ONCE(root))
     249           0 :                         up_write(&root->rwsem);
     250           0 :                 root = new_root;
     251           0 :                 down_write(&root->rwsem);
     252             :         }
     253           0 :         return root;
     254             : }
     255             : 
     256             : static inline void unlock_anon_vma_root(struct anon_vma *root)
     257             : {
     258           0 :         if (root)
     259           0 :                 up_write(&root->rwsem);
     260             : }
     261             : 
     262             : /*
     263             :  * Attach the anon_vmas from src to dst.
     264             :  * Returns 0 on success, -ENOMEM on failure.
     265             :  *
     266             :  * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
     267             :  * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
     268             :  * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
     269             :  * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
     270             :  * call, we can identify this case by checking (!dst->anon_vma &&
     271             :  * src->anon_vma).
     272             :  *
     273             :  * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
     274             :  * and reuse existing anon_vma which has no vmas and only one child anon_vma.
      275             :  * This prevents degradation of the anon_vma hierarchy to an endless linear
      276             :  * chain in the case of a constantly forking task. On the other hand, an
      277             :  * anon_vma with more than one child isn't reused even if there is no live
      278             :  * vma, so the rmap walker has a good chance of avoiding scanning the whole
      279             :  * hierarchy when it searches for where a page is mapped.
     280             :  */
     281           0 : int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
     282             : {
     283             :         struct anon_vma_chain *avc, *pavc;
     284           0 :         struct anon_vma *root = NULL;
     285             : 
     286           0 :         list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
     287             :                 struct anon_vma *anon_vma;
     288             : 
     289           0 :                 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
     290           0 :                 if (unlikely(!avc)) {
     291           0 :                         unlock_anon_vma_root(root);
     292           0 :                         root = NULL;
     293           0 :                         avc = anon_vma_chain_alloc(GFP_KERNEL);
     294           0 :                         if (!avc)
     295             :                                 goto enomem_failure;
     296             :                 }
     297           0 :                 anon_vma = pavc->anon_vma;
     298           0 :                 root = lock_anon_vma_root(root, anon_vma);
     299           0 :                 anon_vma_chain_link(dst, avc, anon_vma);
     300             : 
     301             :                 /*
     302             :                  * Reuse existing anon_vma if it has no vma and only one
     303             :                  * anon_vma child.
     304             :                  *
     305             :                  * Root anon_vma is never reused:
     306             :                  * it has self-parent reference and at least one child.
     307             :                  */
     308           0 :                 if (!dst->anon_vma && src->anon_vma &&
     309           0 :                     anon_vma->num_children < 2 &&
     310           0 :                     anon_vma->num_active_vmas == 0)
     311           0 :                         dst->anon_vma = anon_vma;
     312             :         }
     313           0 :         if (dst->anon_vma)
     314           0 :                 dst->anon_vma->num_active_vmas++;
     315             :         unlock_anon_vma_root(root);
     316             :         return 0;
     317             : 
     318             :  enomem_failure:
     319             :         /*
     320             :          * dst->anon_vma is dropped here otherwise its num_active_vmas can
     321             :          * be incorrectly decremented in unlink_anon_vmas().
     322             :          * We can safely do this because callers of anon_vma_clone() don't care
     323             :          * about dst->anon_vma if anon_vma_clone() failed.
     324             :          */
     325           0 :         dst->anon_vma = NULL;
     326           0 :         unlink_anon_vmas(dst);
     327           0 :         return -ENOMEM;
     328             : }
     329             : 
     330             : /*
     331             :  * Attach vma to its own anon_vma, as well as to the anon_vmas that
     332             :  * the corresponding VMA in the parent process is attached to.
     333             :  * Returns 0 on success, non-zero on failure.
     334             :  */
     335           0 : int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
     336             : {
     337             :         struct anon_vma_chain *avc;
     338             :         struct anon_vma *anon_vma;
     339             :         int error;
     340             : 
     341             :         /* Don't bother if the parent process has no anon_vma here. */
     342           0 :         if (!pvma->anon_vma)
     343             :                 return 0;
     344             : 
     345             :         /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
     346           0 :         vma->anon_vma = NULL;
     347             : 
     348             :         /*
     349             :          * First, attach the new VMA to the parent VMA's anon_vmas,
     350             :          * so rmap can find non-COWed pages in child processes.
     351             :          */
     352           0 :         error = anon_vma_clone(vma, pvma);
     353           0 :         if (error)
     354             :                 return error;
     355             : 
     356             :         /* An existing anon_vma has been reused, all done then. */
     357           0 :         if (vma->anon_vma)
     358             :                 return 0;
     359             : 
     360             :         /* Then add our own anon_vma. */
     361           0 :         anon_vma = anon_vma_alloc();
     362           0 :         if (!anon_vma)
     363             :                 goto out_error;
     364           0 :         anon_vma->num_active_vmas++;
     365           0 :         avc = anon_vma_chain_alloc(GFP_KERNEL);
     366           0 :         if (!avc)
     367             :                 goto out_error_free_anon_vma;
     368             : 
     369             :         /*
     370             :          * The root anon_vma's rwsem is the lock actually used when we
     371             :          * lock any of the anon_vmas in this anon_vma tree.
     372             :          */
     373           0 :         anon_vma->root = pvma->anon_vma->root;
     374           0 :         anon_vma->parent = pvma->anon_vma;
     375             :         /*
     376             :          * With refcounts, an anon_vma can stay around longer than the
     377             :          * process it belongs to. The root anon_vma needs to be pinned until
     378             :          * this anon_vma is freed, because the lock lives in the root.
     379             :          */
     380           0 :         get_anon_vma(anon_vma->root);
     381             :         /* Mark this anon_vma as the one where our new (COWed) pages go. */
     382           0 :         vma->anon_vma = anon_vma;
     383           0 :         anon_vma_lock_write(anon_vma);
     384           0 :         anon_vma_chain_link(vma, avc, anon_vma);
     385           0 :         anon_vma->parent->num_children++;
     386           0 :         anon_vma_unlock_write(anon_vma);
     387             : 
     388           0 :         return 0;
     389             : 
     390             :  out_error_free_anon_vma:
     391             :         put_anon_vma(anon_vma);
     392             :  out_error:
     393           0 :         unlink_anon_vmas(vma);
     394           0 :         return -ENOMEM;
     395             : }
     396             : 
     397           0 : void unlink_anon_vmas(struct vm_area_struct *vma)
     398             : {
     399             :         struct anon_vma_chain *avc, *next;
     400           0 :         struct anon_vma *root = NULL;
     401             : 
     402             :         /*
     403             :          * Unlink each anon_vma chained to the VMA.  This list is ordered
     404             :          * from newest to oldest, ensuring the root anon_vma gets freed last.
     405             :          */
     406           0 :         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
     407           0 :                 struct anon_vma *anon_vma = avc->anon_vma;
     408             : 
     409           0 :                 root = lock_anon_vma_root(root, anon_vma);
     410           0 :                 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
     411             : 
     412             :                 /*
     413             :                  * Leave empty anon_vmas on the list - we'll need
     414             :                  * to free them outside the lock.
     415             :                  */
     416           0 :                 if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
     417           0 :                         anon_vma->parent->num_children--;
     418           0 :                         continue;
     419             :                 }
     420             : 
     421           0 :                 list_del(&avc->same_vma);
     422             :                 anon_vma_chain_free(avc);
     423             :         }
     424           0 :         if (vma->anon_vma) {
     425           0 :                 vma->anon_vma->num_active_vmas--;
     426             : 
     427             :                 /*
      428             :                  * The vma is still needed after unlink; a new anon_vma will
      429             :                  * be prepared when a fault is next handled.
     430             :                  */
     431           0 :                 vma->anon_vma = NULL;
     432             :         }
     433           0 :         unlock_anon_vma_root(root);
     434             : 
     435             :         /*
     436             :          * Iterate the list once more, it now only contains empty and unlinked
     437             :          * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
     438             :          * needing to write-acquire the anon_vma->root->rwsem.
     439             :          */
     440           0 :         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
     441           0 :                 struct anon_vma *anon_vma = avc->anon_vma;
     442             : 
     443             :                 VM_WARN_ON(anon_vma->num_children);
     444             :                 VM_WARN_ON(anon_vma->num_active_vmas);
     445           0 :                 put_anon_vma(anon_vma);
     446             : 
     447           0 :                 list_del(&avc->same_vma);
     448           0 :                 anon_vma_chain_free(avc);
     449             :         }
     450           0 : }
     451             : 
     452           0 : static void anon_vma_ctor(void *data)
     453             : {
     454           0 :         struct anon_vma *anon_vma = data;
     455             : 
     456           0 :         init_rwsem(&anon_vma->rwsem);
     457           0 :         atomic_set(&anon_vma->refcount, 0);
     458           0 :         anon_vma->rb_root = RB_ROOT_CACHED;
     459           0 : }
     460             : 
     461           1 : void __init anon_vma_init(void)
     462             : {
     463           1 :         anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
     464             :                         0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
     465             :                         anon_vma_ctor);
     466           1 :         anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
     467             :                         SLAB_PANIC|SLAB_ACCOUNT);
     468           1 : }
     469             : 
     470             : /*
     471             :  * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
     472             :  *
      473             :  * Since there is no serialization whatsoever against page_remove_rmap(),
      474             :  * the best this function can do is return a refcount-increased anon_vma
     475             :  * that might have been relevant to this page.
     476             :  *
     477             :  * The page might have been remapped to a different anon_vma or the anon_vma
     478             :  * returned may already be freed (and even reused).
     479             :  *
     480             :  * In case it was remapped to a different anon_vma, the new anon_vma will be a
     481             :  * child of the old anon_vma, and the anon_vma lifetime rules will therefore
     482             :  * ensure that any anon_vma obtained from the page will still be valid for as
     483             :  * long as we observe page_mapped() [ hence all those page_mapped() tests ].
     484             :  *
     485             :  * All users of this function must be very careful when walking the anon_vma
     486             :  * chain and verify that the page in question is indeed mapped in it
     487             :  * [ something equivalent to page_mapped_in_vma() ].
     488             :  *
     489             :  * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
     490             :  * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
     491             :  * if there is a mapcount, we can dereference the anon_vma after observing
     492             :  * those.
     493             :  */
     494           0 : struct anon_vma *folio_get_anon_vma(struct folio *folio)
     495             : {
     496           0 :         struct anon_vma *anon_vma = NULL;
     497             :         unsigned long anon_mapping;
     498             : 
     499             :         rcu_read_lock();
     500           0 :         anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
     501           0 :         if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     502             :                 goto out;
     503           0 :         if (!folio_mapped(folio))
     504             :                 goto out;
     505             : 
     506           0 :         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
     507           0 :         if (!atomic_inc_not_zero(&anon_vma->refcount)) {
     508             :                 anon_vma = NULL;
     509             :                 goto out;
     510             :         }
     511             : 
     512             :         /*
     513             :          * If this folio is still mapped, then its anon_vma cannot have been
     514             :          * freed.  But if it has been unmapped, we have no security against the
     515             :          * anon_vma structure being freed and reused (for another anon_vma:
     516             :          * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
     517             :          * above cannot corrupt).
     518             :          */
     519           0 :         if (!folio_mapped(folio)) {
     520           0 :                 rcu_read_unlock();
     521             :                 put_anon_vma(anon_vma);
     522             :                 return NULL;
     523             :         }
     524             : out:
     525             :         rcu_read_unlock();
     526             : 
     527           0 :         return anon_vma;
     528             : }
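
A hypothetical caller pattern for folio_get_anon_vma() (sketch only; the function name is made up), showing the careful re-checking that the comment above demands:

static void example_walk_anon_vma(struct folio *folio)
{
	struct anon_vma *anon_vma = folio_get_anon_vma(folio);

	if (!anon_vma)
		return;

	anon_vma_lock_read(anon_vma);
	/*
	 * Walk the interval tree here, verifying for each vma that the
	 * folio is really mapped in it (cf. page_mapped_in_vma()).
	 */
	anon_vma_unlock_read(anon_vma);
	put_anon_vma(anon_vma);
}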
     529             : 
     530             : /*
     531             :  * Similar to folio_get_anon_vma() except it locks the anon_vma.
     532             :  *
      533             :  * It's a little more complex as it tries to keep the fast path to a single
      534             :  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
      535             :  * reference like with folio_get_anon_vma() and then block on the rwsem
      536             :  * in the !rwc->try_lock case.
     537             :  */
     538           0 : struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
     539             :                                           struct rmap_walk_control *rwc)
     540             : {
     541           0 :         struct anon_vma *anon_vma = NULL;
     542             :         struct anon_vma *root_anon_vma;
     543             :         unsigned long anon_mapping;
     544             : 
     545             :         rcu_read_lock();
     546           0 :         anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
     547           0 :         if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     548             :                 goto out;
     549           0 :         if (!folio_mapped(folio))
     550             :                 goto out;
     551             : 
     552           0 :         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
     553           0 :         root_anon_vma = READ_ONCE(anon_vma->root);
     554           0 :         if (down_read_trylock(&root_anon_vma->rwsem)) {
     555             :                 /*
     556             :                  * If the folio is still mapped, then this anon_vma is still
     557             :                  * its anon_vma, and holding the mutex ensures that it will
     558             :                  * not go away, see anon_vma_free().
     559             :                  */
     560           0 :                 if (!folio_mapped(folio)) {
     561           0 :                         up_read(&root_anon_vma->rwsem);
     562           0 :                         anon_vma = NULL;
     563             :                 }
     564             :                 goto out;
     565             :         }
     566             : 
     567           0 :         if (rwc && rwc->try_lock) {
     568           0 :                 anon_vma = NULL;
     569           0 :                 rwc->contended = true;
     570           0 :                 goto out;
     571             :         }
     572             : 
      573             :         /* trylock failed, we have to sleep */
     574           0 :         if (!atomic_inc_not_zero(&anon_vma->refcount)) {
     575             :                 anon_vma = NULL;
     576             :                 goto out;
     577             :         }
     578             : 
     579           0 :         if (!folio_mapped(folio)) {
     580           0 :                 rcu_read_unlock();
     581             :                 put_anon_vma(anon_vma);
     582             :                 return NULL;
     583             :         }
     584             : 
      585             :         /* we pinned the anon_vma, it's safe to sleep */
     586             :         rcu_read_unlock();
     587           0 :         anon_vma_lock_read(anon_vma);
     588             : 
     589           0 :         if (atomic_dec_and_test(&anon_vma->refcount)) {
     590             :                 /*
     591             :                  * Oops, we held the last refcount, release the lock
     592             :                  * and bail -- can't simply use put_anon_vma() because
     593             :                  * we'll deadlock on the anon_vma_lock_write() recursion.
     594             :                  */
     595           0 :                 anon_vma_unlock_read(anon_vma);
     596           0 :                 __put_anon_vma(anon_vma);
     597           0 :                 anon_vma = NULL;
     598             :         }
     599             : 
     600             :         return anon_vma;
     601             : 
     602             : out:
     603             :         rcu_read_unlock();
     604           0 :         return anon_vma;
     605             : }
     606             : 
     607             : #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
     608             : /*
      609             :  * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
      610             :  * was dirty when it was unmapped, it is important that it is flushed before
      611             :  * any IO is initiated on the page, to prevent lost writes. Similarly,
     612             :  * it must be flushed before freeing to prevent data leakage.
     613             :  */
     614             : void try_to_unmap_flush(void)
     615             : {
     616             :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     617             : 
     618             :         if (!tlb_ubc->flush_required)
     619             :                 return;
     620             : 
     621             :         arch_tlbbatch_flush(&tlb_ubc->arch);
     622             :         tlb_ubc->flush_required = false;
     623             :         tlb_ubc->writable = false;
     624             : }
     625             : 
     626             : /* Flush iff there are potentially writable TLB entries that can race with IO */
     627             : void try_to_unmap_flush_dirty(void)
     628             : {
     629             :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     630             : 
     631             :         if (tlb_ubc->writable)
     632             :                 try_to_unmap_flush();
     633             : }
     634             : 
     635             : /*
     636             :  * Bits 0-14 of mm->tlb_flush_batched record pending generations.
      637             :  * Bits 16-30 of mm->tlb_flush_batched record flushed generations.
     638             :  */
     639             : #define TLB_FLUSH_BATCH_FLUSHED_SHIFT   16
     640             : #define TLB_FLUSH_BATCH_PENDING_MASK                    \
     641             :         ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
     642             : #define TLB_FLUSH_BATCH_PENDING_LARGE                   \
     643             :         (TLB_FLUSH_BATCH_PENDING_MASK / 2)
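
A worked example of this encoding (illustrative only):

/*
 * If mm->tlb_flush_batched reads 0x00030005, then
 *
 *	pending = 0x00030005 & TLB_FLUSH_BATCH_PENDING_MASK  = 5
 *	flushed = 0x00030005 >> TLB_FLUSH_BATCH_FLUSHED_SHIFT = 3
 *
 * pending != flushed, so flush_tlb_batched_pending() below flushes the mm
 * and then stores pending | (pending << 16) = 0x00050005.
 */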
     644             : 
     645             : static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
     646             : {
     647             :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     648             :         int batch;
     649             :         bool writable = pte_dirty(pteval);
     650             : 
     651             :         if (!pte_accessible(mm, pteval))
     652             :                 return;
     653             : 
     654             :         arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
     655             :         tlb_ubc->flush_required = true;
     656             : 
     657             :         /*
     658             :          * Ensure compiler does not re-order the setting of tlb_flush_batched
     659             :          * before the PTE is cleared.
     660             :          */
     661             :         barrier();
     662             :         batch = atomic_read(&mm->tlb_flush_batched);
     663             : retry:
     664             :         if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
     665             :                 /*
     666             :                  * Prevent `pending' from catching up with `flushed' because of
     667             :                  * overflow.  Reset `pending' and `flushed' to be 1 and 0 if
     668             :                  * `pending' becomes large.
     669             :                  */
     670             :                 if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
     671             :                         goto retry;
     672             :         } else {
     673             :                 atomic_inc(&mm->tlb_flush_batched);
     674             :         }
     675             : 
     676             :         /*
     677             :          * If the PTE was dirty then it's best to assume it's writable. The
     678             :          * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
     679             :          * before the page is queued for IO.
     680             :          */
     681             :         if (writable)
     682             :                 tlb_ubc->writable = true;
     683             : }
     684             : 
     685             : /*
     686             :  * Returns true if the TLB flush should be deferred to the end of a batch of
     687             :  * unmap operations to reduce IPIs.
     688             :  */
     689             : static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
     690             : {
     691             :         bool should_defer = false;
     692             : 
     693             :         if (!(flags & TTU_BATCH_FLUSH))
     694             :                 return false;
     695             : 
      696             :         /* If remote CPUs need to be flushed then defer the flush by batching it */
     697             :         if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
     698             :                 should_defer = true;
     699             :         put_cpu();
     700             : 
     701             :         return should_defer;
     702             : }
     703             : 
     704             : /*
     705             :  * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
     706             :  * releasing the PTL if TLB flushes are batched. It's possible for a parallel
     707             :  * operation such as mprotect or munmap to race between reclaim unmapping
     708             :  * the page and flushing the page. If this race occurs, it potentially allows
     709             :  * access to data via a stale TLB entry. Tracking all mm's that have TLB
     710             :  * batching in flight would be expensive during reclaim so instead track
     711             :  * whether TLB batching occurred in the past and if so then do a flush here
     712             :  * if required. This will cost one additional flush per reclaim cycle paid
      713             :  * by the first operation at risk, such as mprotect and munmap.
     714             :  *
     715             :  * This must be called under the PTL so that an access to tlb_flush_batched
     716             :  * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
     717             :  * via the PTL.
     718             :  */
     719             : void flush_tlb_batched_pending(struct mm_struct *mm)
     720             : {
     721             :         int batch = atomic_read(&mm->tlb_flush_batched);
     722             :         int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
     723             :         int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
     724             : 
     725             :         if (pending != flushed) {
     726             :                 flush_tlb_mm(mm);
     727             :                 /*
     728             :                  * If the new TLB flushing is pending during flushing, leave
     729             :                  * mm->tlb_flush_batched as is, to avoid losing flushing.
     730             :                  */
     731             :                 atomic_cmpxchg(&mm->tlb_flush_batched, batch,
     732             :                                pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
     733             :         }
     734             : }
     735             : #else
     736             : static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval)
     737             : {
     738             : }
     739             : 
     740             : static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
     741             : {
     742             :         return false;
     743             : }
     744             : #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
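
A hypothetical reclaim-style user of the batching API above (sketch only; the function name is made up): unmap a batch of folios with TTU_BATCH_FLUSH, then issue one combined flush before any of them is written back or freed.

static void example_batched_unmap(struct list_head *folio_list)
{
	struct folio *folio;

	list_for_each_entry(folio, folio_list, lru)
		try_to_unmap(folio, TTU_BATCH_FLUSH);

	/* One flush for the whole batch instead of one per folio. */
	try_to_unmap_flush();
}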
     745             : 
     746             : /*
     747             :  * At what user virtual address is page expected in vma?
     748             :  * Caller should check the page is actually part of the vma.
     749             :  */
     750           0 : unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
     751             : {
     752           0 :         struct folio *folio = page_folio(page);
     753           0 :         if (folio_test_anon(folio)) {
     754           0 :                 struct anon_vma *page__anon_vma = folio_anon_vma(folio);
     755             :                 /*
     756             :                  * Note: swapoff's unuse_vma() is more efficient with this
     757             :                  * check, and needs it to match anon_vma when KSM is active.
     758             :                  */
     759           0 :                 if (!vma->anon_vma || !page__anon_vma ||
     760           0 :                     vma->anon_vma->root != page__anon_vma->root)
     761             :                         return -EFAULT;
     762           0 :         } else if (!vma->vm_file) {
     763             :                 return -EFAULT;
     764           0 :         } else if (vma->vm_file->f_mapping != folio->mapping) {
     765             :                 return -EFAULT;
     766             :         }
     767             : 
     768           0 :         return vma_address(page, vma);
     769             : }
     770             : 
     771             : /*
     772             :  * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
     773             :  * NULL if it doesn't exist.  No guarantees / checks on what the pmd_t*
     774             :  * represents.
     775             :  */
     776           0 : pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
     777             : {
     778             :         pgd_t *pgd;
     779             :         p4d_t *p4d;
     780             :         pud_t *pud;
     781           0 :         pmd_t *pmd = NULL;
     782             : 
     783           0 :         pgd = pgd_offset(mm, address);
     784             :         if (!pgd_present(*pgd))
     785             :                 goto out;
     786             : 
     787           0 :         p4d = p4d_offset(pgd, address);
     788             :         if (!p4d_present(*p4d))
     789             :                 goto out;
     790             : 
     791           0 :         pud = pud_offset(p4d, address);
     792           0 :         if (!pud_present(*pud))
     793             :                 goto out;
     794             : 
     795           0 :         pmd = pmd_offset(pud, address);
     796             : out:
     797           0 :         return pmd;
     798             : }
     799             : 
     800             : struct folio_referenced_arg {
     801             :         int mapcount;
     802             :         int referenced;
     803             :         unsigned long vm_flags;
     804             :         struct mem_cgroup *memcg;
     805             : };
     806             : /*
     807             :  * arg: folio_referenced_arg will be passed
     808             :  */
     809           0 : static bool folio_referenced_one(struct folio *folio,
     810             :                 struct vm_area_struct *vma, unsigned long address, void *arg)
     811             : {
     812           0 :         struct folio_referenced_arg *pra = arg;
     813           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
     814           0 :         int referenced = 0;
     815             : 
     816           0 :         while (page_vma_mapped_walk(&pvmw)) {
     817           0 :                 address = pvmw.address;
     818             : 
     819           0 :                 if ((vma->vm_flags & VM_LOCKED) &&
     820           0 :                     (!folio_test_large(folio) || !pvmw.pte)) {
     821             :                         /* Restore the mlock which got missed */
     822           0 :                         mlock_vma_folio(folio, vma, !pvmw.pte);
     823           0 :                         page_vma_mapped_walk_done(&pvmw);
     824           0 :                         pra->vm_flags |= VM_LOCKED;
     825           0 :                         return false; /* To break the loop */
     826             :                 }
     827             : 
     828           0 :                 if (pvmw.pte) {
     829             :                         if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
     830             :                                 lru_gen_look_around(&pvmw);
     831             :                                 referenced++;
     832             :                         }
     833             : 
     834           0 :                         if (ptep_clear_flush_young_notify(vma, address,
     835             :                                                 pvmw.pte))
     836           0 :                                 referenced++;
     837             :                 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
     838             :                         if (pmdp_clear_flush_young_notify(vma, address,
     839             :                                                 pvmw.pmd))
     840             :                                 referenced++;
     841             :                 } else {
     842             :                         /* unexpected pmd-mapped folio? */
     843           0 :                         WARN_ON_ONCE(1);
     844             :                 }
     845             : 
     846           0 :                 pra->mapcount--;
     847             :         }
     848             : 
     849             :         if (referenced)
     850             :                 folio_clear_idle(folio);
     851           0 :         if (folio_test_clear_young(folio))
     852             :                 referenced++;
     853             : 
     854           0 :         if (referenced) {
     855           0 :                 pra->referenced++;
     856           0 :                 pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
     857             :         }
     858             : 
     859           0 :         if (!pra->mapcount)
     860             :                 return false; /* To break the loop */
     861             : 
     862           0 :         return true;
     863             : }
     864             : 
     865           0 : static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
     866             : {
     867           0 :         struct folio_referenced_arg *pra = arg;
     868           0 :         struct mem_cgroup *memcg = pra->memcg;
     869             : 
     870             :         /*
     871             :          * Ignore references from this mapping if it has no recency. If the
     872             :          * folio has been used in another mapping, we will catch it; if this
     873             :          * other mapping is already gone, the unmap path will have set the
     874             :          * referenced flag or activated the folio in zap_pte_range().
     875             :          */
     876           0 :         if (!vma_has_recency(vma))
     877             :                 return true;
     878             : 
     879             :         /*
      880             :          * If we are reclaiming on behalf of a cgroup, skip counting
      881             :          * references from other cgroups.
     882             :          */
     883             :         if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
     884             :                 return true;
     885             : 
     886             :         return false;
     887             : }
     888             : 
     889             : /**
     890             :  * folio_referenced() - Test if the folio was referenced.
     891             :  * @folio: The folio to test.
     892             :  * @is_locked: Caller holds lock on the folio.
     893             :  * @memcg: target memory cgroup
     894             :  * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
     895             :  *
     896             :  * Quick test_and_clear_referenced for all mappings of a folio,
      897             :  * Quick test_and_clear_referenced for all mappings of a folio.
     898             :  * Return: The number of mappings which referenced the folio. Return -1 if
     899             :  * the function bailed out due to rmap lock contention.
     900             :  */
     901           0 : int folio_referenced(struct folio *folio, int is_locked,
     902             :                      struct mem_cgroup *memcg, unsigned long *vm_flags)
     903             : {
     904           0 :         int we_locked = 0;
     905           0 :         struct folio_referenced_arg pra = {
     906           0 :                 .mapcount = folio_mapcount(folio),
     907             :                 .memcg = memcg,
     908             :         };
     909           0 :         struct rmap_walk_control rwc = {
     910             :                 .rmap_one = folio_referenced_one,
     911             :                 .arg = (void *)&pra,
     912             :                 .anon_lock = folio_lock_anon_vma_read,
     913             :                 .try_lock = true,
     914             :                 .invalid_vma = invalid_folio_referenced_vma,
     915             :         };
     916             : 
     917           0 :         *vm_flags = 0;
     918           0 :         if (!pra.mapcount)
     919             :                 return 0;
     920             : 
     921           0 :         if (!folio_raw_mapping(folio))
     922             :                 return 0;
     923             : 
     924           0 :         if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
     925           0 :                 we_locked = folio_trylock(folio);
     926           0 :                 if (!we_locked)
     927             :                         return 1;
     928             :         }
     929             : 
     930           0 :         rmap_walk(folio, &rwc);
     931           0 :         *vm_flags = pra.vm_flags;
     932             : 
     933           0 :         if (we_locked)
     934           0 :                 folio_unlock(folio);
     935             : 
     936           0 :         return rwc.contended ? -1 : pra.referenced;
     937             : }
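
A hypothetical caller (sketch only; the function name is made up) illustrating the return convention documented above, where -1 signals rmap lock contention:

static bool example_folio_was_referenced(struct folio *folio,
					 struct mem_cgroup *memcg)
{
	unsigned long vm_flags;
	int refs = folio_referenced(folio, folio_test_locked(folio),
				    memcg, &vm_flags);

	/* On contention, err on the side of keeping the folio around. */
	if (refs == -1)
		return true;

	return refs > 0;
}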
     938             : 
     939           0 : static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
     940             : {
     941           0 :         int cleaned = 0;
     942           0 :         struct vm_area_struct *vma = pvmw->vma;
     943             :         struct mmu_notifier_range range;
     944           0 :         unsigned long address = pvmw->address;
     945             : 
     946             :         /*
      947             :          * We have to assume the worst case, i.e. a pmd mapping, for
      948             :          * invalidation. Note that the folio cannot be freed from this function.
     949             :          */
     950             :         mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
     951             :                                 vma->vm_mm, address, vma_address_end(pvmw));
     952             :         mmu_notifier_invalidate_range_start(&range);
     953             : 
     954           0 :         while (page_vma_mapped_walk(pvmw)) {
     955           0 :                 int ret = 0;
     956             : 
     957           0 :                 address = pvmw->address;
     958           0 :                 if (pvmw->pte) {
     959             :                         pte_t entry;
     960           0 :                         pte_t *pte = pvmw->pte;
     961             : 
     962           0 :                         if (!pte_dirty(*pte) && !pte_write(*pte))
     963           0 :                                 continue;
     964             : 
     965           0 :                         flush_cache_page(vma, address, pte_pfn(*pte));
     966           0 :                         entry = ptep_clear_flush(vma, address, pte);
     967           0 :                         entry = pte_wrprotect(entry);
     968           0 :                         entry = pte_mkclean(entry);
     969           0 :                         set_pte_at(vma->vm_mm, address, pte, entry);
     970             :                         ret = 1;
     971             :                 } else {
     972             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     973             :                         pmd_t *pmd = pvmw->pmd;
     974             :                         pmd_t entry;
     975             : 
     976             :                         if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
     977             :                                 continue;
     978             : 
     979             :                         flush_cache_range(vma, address,
     980             :                                           address + HPAGE_PMD_SIZE);
     981             :                         entry = pmdp_invalidate(vma, address, pmd);
     982             :                         entry = pmd_wrprotect(entry);
     983             :                         entry = pmd_mkclean(entry);
     984             :                         set_pmd_at(vma->vm_mm, address, pmd, entry);
     985             :                         ret = 1;
     986             : #else
     987             :                         /* unexpected pmd-mapped folio? */
     988           0 :                         WARN_ON_ONCE(1);
     989             : #endif
     990             :                 }
     991             : 
     992             :                 /*
     993             :                  * No need to call mmu_notifier_invalidate_range() as we are
     994             :                  * downgrading page table protection, not changing it to point
     995             :                  * to a new page.
     996             :                  *
     997             :                  * See Documentation/mm/mmu_notifier.rst
     998             :                  */
     999           0 :                 if (ret)
    1000           0 :                         cleaned++;
    1001             :         }
    1002             : 
    1003           0 :         mmu_notifier_invalidate_range_end(&range);
    1004             : 
    1005           0 :         return cleaned;
    1006             : }
    1007             : 
    1008           0 : static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
    1009             :                              unsigned long address, void *arg)
    1010             : {
    1011           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
    1012           0 :         int *cleaned = arg;
    1013             : 
    1014           0 :         *cleaned += page_vma_mkclean_one(&pvmw);
    1015             : 
    1016           0 :         return true;
    1017             : }
    1018             : 
    1019           0 : static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
    1020             : {
    1021           0 :         if (vma->vm_flags & VM_SHARED)
    1022             :                 return false;
    1023             : 
    1024           0 :         return true;
    1025             : }
    1026             : 
    1027           0 : int folio_mkclean(struct folio *folio)
    1028             : {
    1029           0 :         int cleaned = 0;
    1030             :         struct address_space *mapping;
    1031           0 :         struct rmap_walk_control rwc = {
    1032             :                 .arg = (void *)&cleaned,
    1033             :                 .rmap_one = page_mkclean_one,
    1034             :                 .invalid_vma = invalid_mkclean_vma,
    1035             :         };
    1036             : 
    1037           0 :         BUG_ON(!folio_test_locked(folio));
    1038             : 
    1039           0 :         if (!folio_mapped(folio))
    1040             :                 return 0;
    1041             : 
    1042           0 :         mapping = folio_mapping(folio);
    1043           0 :         if (!mapping)
    1044             :                 return 0;
    1045             : 
    1046           0 :         rmap_walk(folio, &rwc);
    1047             : 
    1048           0 :         return cleaned;
    1049             : }
    1050             : EXPORT_SYMBOL_GPL(folio_mkclean);
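                      : 
                      : /*
                      :  * Editor's note -- illustrative sketch only, not part of mm/rmap.c.
                      :  * "example_clean_folio" is a hypothetical helper showing the calling
                      :  * convention of folio_mkclean(): the folio must be locked, and the
                      :  * return value is the number of PTEs/PMDs that were cleaned and
                      :  * write-protected (0 for unmapped or anonymous folios).
                      :  */
                      : static int example_clean_folio(struct folio *folio)
                      : {
                      :         int cleaned;
                      : 
                      :         folio_lock(folio);
                      :         cleaned = folio_mkclean(folio);
                      :         folio_unlock(folio);
                      : 
                      :         return cleaned;
                      : }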
    1051             : 
    1052             : /**
    1053             :  * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapping the range
    1054             :  *                     [@pfn, @pfn + @nr_pages) at the given offset (@pgoff)
    1055             :  *                     within @vma, for shared mappings. Since clean PTEs
    1056             :  *                     should also be read-only, they are write-protected too.
    1057             :  * @pfn: start pfn.
    1058             :  * @nr_pages: number of physically contiguous pages starting with @pfn.
    1059             :  * @pgoff: page offset at which @pfn is mapped.
    1060             :  * @vma: vma within which @pfn is mapped.
    1061             :  *
    1062             :  * Returns the number of cleaned PTEs (including PMDs).
    1063             :  */
    1064           0 : int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
    1065             :                       struct vm_area_struct *vma)
    1066             : {
    1067           0 :         struct page_vma_mapped_walk pvmw = {
    1068             :                 .pfn            = pfn,
    1069             :                 .nr_pages       = nr_pages,
    1070             :                 .pgoff          = pgoff,
    1071             :                 .vma            = vma,
    1072             :                 .flags          = PVMW_SYNC,
    1073             :         };
    1074             : 
    1075           0 :         if (invalid_mkclean_vma(vma, NULL))
    1076             :                 return 0;
    1077             : 
    1078           0 :         pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);
    1079             :         VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
    1080             : 
    1081           0 :         return page_vma_mkclean_one(&pvmw);
    1082             : }
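                      : 
                      : /*
                      :  * Editor's note -- illustrative sketch only, not part of mm/rmap.c.
                      :  * One plausible shape for a DAX-style writeback caller that cleans every
                      :  * shared mapping of a physically contiguous file range. The helper name
                      :  * and parameters are assumptions, not an existing kernel API.
                      :  */
                      : static int example_mkclean_shared_range(struct address_space *mapping,
                      :                                         pgoff_t pgoff, unsigned long pfn,
                      :                                         unsigned long nr_pages)
                      : {
                      :         struct vm_area_struct *vma;
                      :         int cleaned = 0;
                      : 
                      :         i_mmap_lock_read(mapping);
                      :         vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
                      :                                   pgoff + nr_pages - 1)
                      :                 cleaned += pfn_mkclean_range(pfn, nr_pages, pgoff, vma);
                      :         i_mmap_unlock_read(mapping);
                      : 
                      :         return cleaned;
                      : }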
    1083             : 
    1084           0 : int folio_total_mapcount(struct folio *folio)
    1085             : {
    1086           0 :         int mapcount = folio_entire_mapcount(folio);
    1087             :         int nr_pages;
    1088             :         int i;
    1089             : 
    1090             :         /* In the common case, avoid the loop when no pages mapped by PTE */
    1091           0 :         if (folio_nr_pages_mapped(folio) == 0)
    1092             :                 return mapcount;
    1093             :         /*
    1094             :          * Add all the PTE mappings of those pages mapped by PTE.
    1095             :          * Limit the loop to folio_nr_pages_mapped()?
    1096             :          * Perhaps: given all the raciness, that may be a good or a bad idea.
    1097             :          */
    1098           0 :         nr_pages = folio_nr_pages(folio);
    1099           0 :         for (i = 0; i < nr_pages; i++)
    1100           0 :                 mapcount += atomic_read(&folio_page(folio, i)->_mapcount);
    1101             : 
    1102             :         /* But each of those _mapcounts was based on -1 */
    1103           0 :         mapcount += nr_pages;
    1104           0 :         return mapcount;
    1105             : }
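                      : 
                      : /*
                      :  * Editor's note -- worked example for the arithmetic above, not part of
                      :  * mm/rmap.c. Take a 4-page folio mapped once by a PMD, with two of its
                      :  * pages also mapped by PTEs: folio_entire_mapcount() evaluates to 1, the
                      :  * per-page _mapcount values are 0, 0, -1, -1 (each starts at -1), so the
                      :  * loop adds -2 and the final "+= nr_pages" adds 4, for a total mapcount
                      :  * of 3: one PMD mapping plus two PTE mappings.
                      :  */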
    1106             : 
    1107             : /**
    1108             :  * page_move_anon_rmap - move a page to our anon_vma
    1109             :  * @page:       the page to move to our anon_vma
    1110             :  * @vma:        the vma the page belongs to
    1111             :  *
    1112             :  * When a page belongs exclusively to one process after a COW event,
    1113             :  * that page can be moved into the anon_vma that belongs to just that
    1114             :  * process, so the rmap code will not search the parent or sibling
    1115             :  * processes.
    1116             :  */
    1117           0 : void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
    1118             : {
    1119           0 :         void *anon_vma = vma->anon_vma;
    1120           0 :         struct folio *folio = page_folio(page);
    1121             : 
    1122             :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
    1123             :         VM_BUG_ON_VMA(!anon_vma, vma);
    1124             : 
    1125           0 :         anon_vma += PAGE_MAPPING_ANON;
    1126             :         /*
    1127             :          * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
    1128             :          * simultaneously, so a concurrent reader (eg folio_referenced()'s
    1129             :          * folio_test_anon()) will not see one without the other.
    1130             :          */
    1131           0 :         WRITE_ONCE(folio->mapping, anon_vma);
    1132           0 :         SetPageAnonExclusive(page);
    1133           0 : }
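                      : 
                      : /*
                      :  * Editor's note -- illustrative sketch only, not part of mm/rmap.c.
                      :  * Shows how a reader recovers the anon_vma from the tagged mapping
                      :  * pointer written above, mirroring what folio_anon_vma() does; the
                      :  * helper name is hypothetical.
                      :  */
                      : static struct anon_vma *example_mapping_to_anon_vma(struct folio *folio)
                      : {
                      :         unsigned long mapping = (unsigned long)READ_ONCE(folio->mapping);
                      : 
                      :         if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                      :                 return NULL;    /* file-backed, KSM or unmapped */
                      : 
                      :         return (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
                      : }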
    1134             : 
    1135             : /**
    1136             :  * __page_set_anon_rmap - set up new anonymous rmap
    1137             :  * @folio:      Folio which contains page.
    1138             :  * @page:       Page to add to rmap.
    1139             :  * @vma:        VM area to add page to.
    1140             :  * @address:    User virtual address of the mapping
    1141             :  * @exclusive:  the page is exclusively owned by the current process
    1142             :  */
    1143           0 : static void __page_set_anon_rmap(struct folio *folio, struct page *page,
    1144             :         struct vm_area_struct *vma, unsigned long address, int exclusive)
    1145             : {
    1146           0 :         struct anon_vma *anon_vma = vma->anon_vma;
    1147             : 
    1148           0 :         BUG_ON(!anon_vma);
    1149             : 
    1150           0 :         if (folio_test_anon(folio))
    1151             :                 goto out;
    1152             : 
    1153             :         /*
    1154             :          * If the page isn't exclusively mapped into this vma,
    1155             :          * we must use the _oldest_ possible anon_vma for the
    1156             :          * page mapping!
    1157             :          */
    1158           0 :         if (!exclusive)
    1159           0 :                 anon_vma = anon_vma->root;
    1160             : 
    1161             :         /*
    1162             :          * page_idle does a lockless/optimistic rmap scan on folio->mapping.
    1163             :          * Make sure the compiler doesn't split the stores of anon_vma and
    1164             :          * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
    1165             :          * could mistake the mapping for a struct address_space and crash.
    1166             :          */
    1167           0 :         anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    1168           0 :         WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
    1169           0 :         folio->index = linear_page_index(vma, address);
    1170             : out:
    1171           0 :         if (exclusive)
    1172             :                 SetPageAnonExclusive(page);
    1173           0 : }
    1174             : 
    1175             : /**
    1176             :  * __page_check_anon_rmap - sanity check anonymous rmap addition
    1177             :  * @page:       the page to add the mapping to
    1178             :  * @vma:        the vm area in which the mapping is added
    1179             :  * @address:    the user virtual address mapped
    1180             :  */
    1181             : static void __page_check_anon_rmap(struct page *page,
    1182             :         struct vm_area_struct *vma, unsigned long address)
    1183             : {
    1184           0 :         struct folio *folio = page_folio(page);
    1185             :         /*
    1186             :          * The page's anon-rmap details (mapping and index) are guaranteed to
    1187             :          * be set up correctly at this point.
    1188             :          *
    1189             :          * We have exclusion against page_add_anon_rmap because the caller
    1190             :          * always holds the page locked.
    1191             :          *
    1192             :          * We have exclusion against page_add_new_anon_rmap because those pages
    1193             :          * are initially only visible via the pagetables, and the pte is locked
    1194             :          * over the call to page_add_new_anon_rmap.
    1195             :          */
    1196             :         VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
    1197             :                         folio);
    1198             :         VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
    1199             :                        page);
    1200             : }
    1201             : 
    1202             : /**
    1203             :  * page_add_anon_rmap - add pte mapping to an anonymous page
    1204             :  * @page:       the page to add the mapping to
    1205             :  * @vma:        the vm area in which the mapping is added
    1206             :  * @address:    the user virtual address mapped
    1207             :  * @flags:      the rmap flags
    1208             :  *
    1209             :  * The caller needs to hold the pte lock, and the page must be locked in
    1210             :  * the anon_vma case: to serialize mapping/index checking after setting,
    1211             :  * and to ensure that PageAnon is not being upgraded racily to PageKsm
    1212             :  * (but PageKsm is never downgraded to PageAnon).
    1213             :  */
    1214           0 : void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
    1215             :                 unsigned long address, rmap_t flags)
    1216             : {
    1217           0 :         struct folio *folio = page_folio(page);
    1218           0 :         atomic_t *mapped = &folio->_nr_pages_mapped;
    1219           0 :         int nr = 0, nr_pmdmapped = 0;
    1220           0 :         bool compound = flags & RMAP_COMPOUND;
    1221           0 :         bool first = true;
    1222             : 
    1223             :         /* Is page being mapped by PTE? Is this its first map to be added? */
    1224           0 :         if (likely(!compound)) {
    1225           0 :                 first = atomic_inc_and_test(&page->_mapcount);
    1226           0 :                 nr = first;
    1227           0 :                 if (first && folio_test_large(folio)) {
    1228           0 :                         nr = atomic_inc_return_relaxed(mapped);
    1229           0 :                         nr = (nr < COMPOUND_MAPPED);
    1230             :                 }
    1231             :         } else if (folio_test_pmd_mappable(folio)) {
    1232             :                 /* That test is redundant: it's for safety or to optimize out */
    1233             : 
    1234             :                 first = atomic_inc_and_test(&folio->_entire_mapcount);
    1235             :                 if (first) {
    1236             :                         nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
    1237             :                         if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
    1238             :                                 nr_pmdmapped = folio_nr_pages(folio);
    1239             :                                 nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
    1240             :                                 /* Raced ahead of a remove and another add? */
    1241             :                                 if (unlikely(nr < 0))
    1242             :                                         nr = 0;
    1243             :                         } else {
    1244             :                                 /* Raced ahead of a remove of COMPOUND_MAPPED */
    1245             :                                 nr = 0;
    1246             :                         }
    1247             :                 }
    1248             :         }
    1249             : 
    1250             :         VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
    1251             :         VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
    1252             : 
    1253             :         if (nr_pmdmapped)
    1254             :                 __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);
    1255           0 :         if (nr)
    1256           0 :                 __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
    1257             : 
    1258           0 :         if (likely(!folio_test_ksm(folio))) {
    1259             :                 /* address might be in next vma when migration races vma_merge */
    1260           0 :                 if (first)
    1261           0 :                         __page_set_anon_rmap(folio, page, vma, address,
    1262           0 :                                              !!(flags & RMAP_EXCLUSIVE));
    1263             :                 else
    1264           0 :                         __page_check_anon_rmap(page, vma, address);
    1265             :         }
    1266             : 
    1267           0 :         mlock_vma_folio(folio, vma, compound);
    1268           0 : }
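                      : 
                      : /*
                      :  * Editor's note -- explanatory comment, not part of mm/rmap.c. As read
                      :  * from the code above: the low bits of _nr_pages_mapped count how many
                      :  * pages of the folio are mapped by at least one PTE, and COMPOUND_MAPPED
                      :  * is set while the folio has an entire (PMD) mapping. "nr = (nr <
                      :  * COMPOUND_MAPPED)" therefore bumps NR_ANON_MAPPED by one page only when
                      :  * that page is not already accounted through a PMD mapping, while the
                      :  * PMD branch adds nr_pmdmapped minus the pages already counted via PTEs,
                      :  * so each page is accounted once however it is mapped. The same pattern
                      :  * repeats in page_add_file_rmap() and page_remove_rmap() below.
                      :  */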
    1269             : 
    1270             : /**
    1271             :  * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
    1272             :  * @folio:      The folio to add the mapping to.
    1273             :  * @vma:        the vm area in which the mapping is added
    1274             :  * @address:    the user virtual address mapped
    1275             :  *
    1276             :  * Like page_add_anon_rmap() but must only be called on *new* folios.
    1277             :  * This means the inc-and-test can be bypassed.
    1278             :  * The folio does not have to be locked.
    1279             :  *
    1280             :  * If the folio is large, it is accounted as a THP.  As the folio
    1281             :  * is new, it's assumed to be mapped exclusively by a single process.
    1282             :  */
    1283           0 : void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
    1284             :                 unsigned long address)
    1285             : {
    1286             :         int nr;
    1287             : 
    1288             :         VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
    1289           0 :         __folio_set_swapbacked(folio);
    1290             : 
    1291           0 :         if (likely(!folio_test_pmd_mappable(folio))) {
    1292             :                 /* increment count (starts at -1) */
    1293           0 :                 atomic_set(&folio->_mapcount, 0);
    1294           0 :                 nr = 1;
    1295             :         } else {
    1296             :                 /* increment count (starts at -1) */
    1297             :                 atomic_set(&folio->_entire_mapcount, 0);
    1298             :                 atomic_set(&folio->_nr_pages_mapped, COMPOUND_MAPPED);
    1299             :                 nr = folio_nr_pages(folio);
    1300             :                 __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
    1301             :         }
    1302             : 
    1303           0 :         __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
    1304           0 :         __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
    1305           0 : }
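                      : 
                      : /*
                      :  * Editor's note -- illustrative sketch only, not part of mm/rmap.c.
                      :  * The shape of an anonymous-fault caller: a freshly allocated folio is
                      :  * marked uptodate, mapped exclusively and placed on the LRU. Charging,
                      :  * pte setup and error handling are omitted, and the helper name is
                      :  * hypothetical.
                      :  */
                      : static void example_map_new_anon_folio(struct folio *folio,
                      :                                        struct vm_area_struct *vma,
                      :                                        unsigned long address)
                      : {
                      :         __folio_mark_uptodate(folio);
                      :         folio_add_new_anon_rmap(folio, vma, address);
                      :         folio_add_lru_vma(folio, vma);
                      : }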
    1306             : 
    1307             : /**
    1308             :  * page_add_file_rmap - add pte mapping to a file page
    1309             :  * @page:       the page to add the mapping to
    1310             :  * @vma:        the vm area in which the mapping is added
    1311             :  * @compound:   charge the page as compound or small page
    1312             :  *
    1313             :  * The caller needs to hold the pte lock.
    1314             :  */
    1315           0 : void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
    1316             :                 bool compound)
    1317             : {
    1318           0 :         struct folio *folio = page_folio(page);
    1319           0 :         atomic_t *mapped = &folio->_nr_pages_mapped;
    1320           0 :         int nr = 0, nr_pmdmapped = 0;
    1321             :         bool first;
    1322             : 
    1323             :         VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
    1324             : 
    1325             :         /* Is page being mapped by PTE? Is this its first map to be added? */
    1326           0 :         if (likely(!compound)) {
    1327           0 :                 first = atomic_inc_and_test(&page->_mapcount);
    1328           0 :                 nr = first;
    1329           0 :                 if (first && folio_test_large(folio)) {
    1330           0 :                         nr = atomic_inc_return_relaxed(mapped);
    1331           0 :                         nr = (nr < COMPOUND_MAPPED);
    1332             :                 }
    1333             :         } else if (folio_test_pmd_mappable(folio)) {
    1334             :                 /* That test is redundant: it's for safety or to optimize out */
    1335             : 
    1336             :                 first = atomic_inc_and_test(&folio->_entire_mapcount);
    1337             :                 if (first) {
    1338             :                         nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
    1339             :                         if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
    1340             :                                 nr_pmdmapped = folio_nr_pages(folio);
    1341             :                                 nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
    1342             :                                 /* Raced ahead of a remove and another add? */
    1343             :                                 if (unlikely(nr < 0))
    1344             :                                         nr = 0;
    1345             :                         } else {
    1346             :                                 /* Raced ahead of a remove of COMPOUND_MAPPED */
    1347             :                                 nr = 0;
    1348             :                         }
    1349             :                 }
    1350             :         }
    1351             : 
    1352             :         if (nr_pmdmapped)
    1353             :                 __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ?
    1354             :                         NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped);
    1355           0 :         if (nr)
    1356           0 :                 __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);
    1357             : 
    1358           0 :         mlock_vma_folio(folio, vma, compound);
    1359           0 : }
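                      : 
                      : /*
                      :  * Editor's note -- illustrative sketch only, not part of mm/rmap.c.
                      :  * The shape of a file-fault call site: with the pte lock held and the
                      :  * new pte prepared, the rmap and NR_FILE_MAPPED statistics are updated
                      :  * for a single small page. The helper name is hypothetical.
                      :  */
                      : static void example_account_file_mapping(struct page *page,
                      :                                          struct vm_area_struct *vma)
                      : {
                      :         /* compound == false: a small page or one subpage of a folio */
                      :         page_add_file_rmap(page, vma, false);
                      : }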
    1360             : 
    1361             : /**
    1362             :  * page_remove_rmap - take down pte mapping from a page
    1363             :  * @page:       page to remove mapping from
    1364             :  * @vma:        the vm area from which the mapping is removed
    1365             :  * @compound:   uncharge the page as compound or small page
    1366             :  *
    1367             :  * The caller needs to hold the pte lock.
    1368             :  */
    1369           0 : void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
    1370             :                 bool compound)
    1371             : {
    1372           0 :         struct folio *folio = page_folio(page);
    1373           0 :         atomic_t *mapped = &folio->_nr_pages_mapped;
    1374           0 :         int nr = 0, nr_pmdmapped = 0;
    1375             :         bool last;
    1376             :         enum node_stat_item idx;
    1377             : 
    1378             :         VM_BUG_ON_PAGE(compound && !PageHead(page), page);
    1379             : 
    1380             :         /* Hugetlb pages are not counted in NR_*MAPPED */
    1381           0 :         if (unlikely(folio_test_hugetlb(folio))) {
    1382             :                 /* hugetlb pages are always mapped with pmds */
    1383             :                 atomic_dec(&folio->_entire_mapcount);
    1384             :                 return;
    1385             :         }
    1386             : 
    1387             :         /* Is page being unmapped by PTE? Is this its last map to be removed? */
    1388           0 :         if (likely(!compound)) {
    1389           0 :                 last = atomic_add_negative(-1, &page->_mapcount);
    1390           0 :                 nr = last;
    1391           0 :                 if (last && folio_test_large(folio)) {
    1392           0 :                         nr = atomic_dec_return_relaxed(mapped);
    1393           0 :                         nr = (nr < COMPOUND_MAPPED);
    1394             :                 }
    1395             :         } else if (folio_test_pmd_mappable(folio)) {
    1396             :                 /* That test is redundant: it's for safety or to optimize out */
    1397             : 
    1398             :                 last = atomic_add_negative(-1, &folio->_entire_mapcount);
    1399             :                 if (last) {
    1400             :                         nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped);
    1401             :                         if (likely(nr < COMPOUND_MAPPED)) {
    1402             :                                 nr_pmdmapped = folio_nr_pages(folio);
    1403             :                                 nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
    1404             :                                 /* Raced ahead of another remove and an add? */
    1405             :                                 if (unlikely(nr < 0))
    1406             :                                         nr = 0;
    1407             :                         } else {
    1408             :                                 /* An add of COMPOUND_MAPPED raced ahead */
    1409             :                                 nr = 0;
    1410             :                         }
    1411             :                 }
    1412             :         }
    1413             : 
    1414             :         if (nr_pmdmapped) {
    1415             :                 if (folio_test_anon(folio))
    1416             :                         idx = NR_ANON_THPS;
    1417             :                 else if (folio_test_swapbacked(folio))
    1418             :                         idx = NR_SHMEM_PMDMAPPED;
    1419             :                 else
    1420             :                         idx = NR_FILE_PMDMAPPED;
    1421             :                 __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped);
    1422             :         }
    1423           0 :         if (nr) {
    1424           0 :                 idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
    1425           0 :                 __lruvec_stat_mod_folio(folio, idx, -nr);
    1426             : 
    1427             :                 /*
    1428             :                  * Queue anon THP for deferred split if at least one
    1429             :                  * page of the folio is unmapped and at least one page
    1430             :                  * is still mapped.
    1431             :                  */
    1432           0 :                 if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
    1433             :                         if (!compound || nr < nr_pmdmapped)
    1434             :                                 deferred_split_folio(folio);
    1435             :         }
    1436             : 
    1437             :         /*
    1438             :          * It would be tidy to reset folio_test_anon mapping when fully
    1439             :          * unmapped, but that might overwrite a racing page_add_anon_rmap
    1440             :          * which increments mapcount after us but sets mapping before us:
    1441             :          * so leave the reset to free_pages_prepare, and remember that
    1442             :          * it's only reliable while mapped.
    1443             :          */
    1444             : 
    1445           0 :         munlock_vma_folio(folio, vma, compound);
    1446             : }
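                      : 
                      : /*
                      :  * Editor's note -- illustrative sketch only, not part of mm/rmap.c.
                      :  * A minimal unmap step in the style of a zap path: the pte is cleared,
                      :  * dirty state is transferred to the folio, the reverse mapping and
                      :  * statistics are dropped, and the TLB entry is flushed. Assumes the pte
                      :  * lock is held; batching and rss accounting are omitted, and the helper
                      :  * name is hypothetical.
                      :  */
                      : static void example_zap_one_pte(struct vm_area_struct *vma, pte_t *pte,
                      :                                 unsigned long address, struct page *page)
                      : {
                      :         pte_t pteval = ptep_get_and_clear(vma->vm_mm, address, pte);
                      : 
                      :         if (pte_dirty(pteval))
                      :                 folio_mark_dirty(page_folio(page));
                      :         page_remove_rmap(page, vma, false);     /* a small-page mapping */
                      :         flush_tlb_page(vma, address);
                      : }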
    1447             : 
    1448             : /*
    1449             :  * @arg: enum ttu_flags will be passed to this argument
    1450             :  */
    1451           0 : static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
    1452             :                      unsigned long address, void *arg)
    1453             : {
    1454           0 :         struct mm_struct *mm = vma->vm_mm;
    1455           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
    1456             :         pte_t pteval;
    1457             :         struct page *subpage;
    1458           0 :         bool anon_exclusive, ret = true;
    1459             :         struct mmu_notifier_range range;
    1460           0 :         enum ttu_flags flags = (enum ttu_flags)(long)arg;
    1461             : 
    1462             :         /*
    1463             :          * When racing against e.g. zap_pte_range() on another cpu,
    1464             :          * in between its ptep_get_and_clear_full() and page_remove_rmap(),
    1465             :          * try_to_unmap() may return before page_mapped() has become false,
    1466             :          * if page table locking is skipped: use TTU_SYNC to wait for that.
    1467             :          */
    1468           0 :         if (flags & TTU_SYNC)
    1469           0 :                 pvmw.flags = PVMW_SYNC;
    1470             : 
    1471             :         if (flags & TTU_SPLIT_HUGE_PMD)
    1472             :                 split_huge_pmd_address(vma, address, false, folio);
    1473             : 
    1474             :         /*
     1475             :          * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
    1476             :          * For hugetlb, it could be much worse if we need to do pud
    1477             :          * invalidation in the case of pmd sharing.
    1478             :          *
     1479             :          * Note that the folio cannot be freed in this function, as the caller
     1480             :          * of try_to_unmap() must hold a reference on the folio.
    1481             :          */
    1482             :         range.end = vma_address_end(&pvmw);
    1483             :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
    1484             :                                 address, range.end);
    1485             :         if (folio_test_hugetlb(folio)) {
    1486             :                 /*
    1487             :                  * If sharing is possible, start and end will be adjusted
    1488             :                  * accordingly.
    1489             :                  */
    1490             :                 adjust_range_if_pmd_sharing_possible(vma, &range.start,
    1491             :                                                      &range.end);
    1492             :         }
    1493             :         mmu_notifier_invalidate_range_start(&range);
    1494             : 
    1495           0 :         while (page_vma_mapped_walk(&pvmw)) {
    1496             :                 /* Unexpected PMD-mapped THP? */
    1497             :                 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
    1498             : 
    1499             :                 /*
    1500             :                  * If the folio is in an mlock()d vma, we must not swap it out.
    1501             :                  */
    1502           0 :                 if (!(flags & TTU_IGNORE_MLOCK) &&
    1503           0 :                     (vma->vm_flags & VM_LOCKED)) {
    1504             :                         /* Restore the mlock which got missed */
    1505           0 :                         mlock_vma_folio(folio, vma, false);
    1506           0 :                         page_vma_mapped_walk_done(&pvmw);
    1507             :                         ret = false;
    1508             :                         break;
    1509             :                 }
    1510             : 
    1511           0 :                 subpage = folio_page(folio,
    1512             :                                         pte_pfn(*pvmw.pte) - folio_pfn(folio));
    1513           0 :                 address = pvmw.address;
    1514           0 :                 anon_exclusive = folio_test_anon(folio) &&
    1515           0 :                                  PageAnonExclusive(subpage);
    1516             : 
    1517           0 :                 if (folio_test_hugetlb(folio)) {
    1518             :                         bool anon = folio_test_anon(folio);
    1519             : 
    1520             :                         /*
     1521             :                          * try_to_unmap() is only passed a hugetlb page in
     1522             :                          * the case where the hugetlb page is poisoned.
    1523             :                          */
    1524             :                         VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
    1525             :                         /*
    1526             :                          * huge_pmd_unshare may unmap an entire PMD page.
    1527             :                          * There is no way of knowing exactly which PMDs may
    1528             :                          * be cached for this mm, so we must flush them all.
    1529             :                          * start/end were already adjusted above to cover this
    1530             :                          * range.
    1531             :                          */
    1532             :                         flush_cache_range(vma, range.start, range.end);
    1533             : 
    1534             :                         /*
    1535             :                          * To call huge_pmd_unshare, i_mmap_rwsem must be
    1536             :                          * held in write mode.  Caller needs to explicitly
    1537             :                          * do this outside rmap routines.
    1538             :                          *
    1539             :                          * We also must hold hugetlb vma_lock in write mode.
    1540             :                          * Lock order dictates acquiring vma_lock BEFORE
    1541             :                          * i_mmap_rwsem.  We can only try lock here and fail
    1542             :                          * if unsuccessful.
    1543             :                          */
    1544             :                         if (!anon) {
    1545             :                                 VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
    1546             :                                 if (!hugetlb_vma_trylock_write(vma)) {
    1547             :                                         page_vma_mapped_walk_done(&pvmw);
    1548             :                                         ret = false;
    1549             :                                         break;
    1550             :                                 }
    1551             :                                 if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
    1552             :                                         hugetlb_vma_unlock_write(vma);
    1553             :                                         flush_tlb_range(vma,
    1554             :                                                 range.start, range.end);
    1555             :                                         mmu_notifier_invalidate_range(mm,
    1556             :                                                 range.start, range.end);
    1557             :                                         /*
    1558             :                                          * The ref count of the PMD page was
    1559             :                                          * dropped which is part of the way map
    1560             :                                          * counting is done for shared PMDs.
    1561             :                                          * Return 'true' here.  When there is
    1562             :                                          * no other sharing, huge_pmd_unshare
    1563             :                                          * returns false and we will unmap the
    1564             :                                          * actual page and drop map count
    1565             :                                          * to zero.
    1566             :                                          */
    1567             :                                         page_vma_mapped_walk_done(&pvmw);
    1568             :                                         break;
    1569             :                                 }
    1570             :                                 hugetlb_vma_unlock_write(vma);
    1571             :                         }
    1572             :                         pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
    1573             :                 } else {
    1574           0 :                         flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
    1575             :                         /* Nuke the page table entry. */
    1576           0 :                         if (should_defer_flush(mm, flags)) {
    1577             :                                 /*
    1578             :                                  * We clear the PTE but do not flush so potentially
    1579             :                                  * a remote CPU could still be writing to the folio.
    1580             :                                  * If the entry was previously clean then the
    1581             :                                  * architecture must guarantee that a clear->dirty
    1582             :                                  * transition on a cached TLB entry is written through
    1583             :                                  * and traps if the PTE is unmapped.
    1584             :                                  */
    1585             :                                 pteval = ptep_get_and_clear(mm, address, pvmw.pte);
    1586             : 
    1587             :                                 set_tlb_ubc_flush_pending(mm, pteval);
    1588             :                         } else {
    1589           0 :                                 pteval = ptep_clear_flush(vma, address, pvmw.pte);
    1590             :                         }
    1591             :                 }
    1592             : 
    1593             :                 /*
    1594             :                  * Now the pte is cleared. If this pte was uffd-wp armed,
    1595             :                  * we may want to replace a none pte with a marker pte if
    1596             :                  * it's file-backed, so we don't lose the tracking info.
    1597             :                  */
    1598           0 :                 pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
    1599             : 
    1600             :                 /* Set the dirty flag on the folio now the pte is gone. */
    1601           0 :                 if (pte_dirty(pteval))
    1602           0 :                         folio_mark_dirty(folio);
    1603             : 
    1604             :                 /* Update high watermark before we lower rss */
    1605           0 :                 update_hiwater_rss(mm);
    1606             : 
    1607           0 :                 if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
    1608             :                         pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
    1609             :                         if (folio_test_hugetlb(folio)) {
    1610             :                                 hugetlb_count_sub(folio_nr_pages(folio), mm);
    1611             :                                 set_huge_pte_at(mm, address, pvmw.pte, pteval);
    1612             :                         } else {
    1613             :                                 dec_mm_counter(mm, mm_counter(&folio->page));
    1614             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1615             :                         }
    1616             : 
    1617           0 :                 } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
    1618             :                         /*
    1619             :                          * The guest indicated that the page content is of no
    1620             :                          * interest anymore. Simply discard the pte, vmscan
    1621             :                          * will take care of the rest.
    1622             :                          * A future reference will then fault in a new zero
    1623             :                          * page. When userfaultfd is active, we must not drop
    1624             :                          * this page though, as its main user (postcopy
    1625             :                          * migration) will not expect userfaults on already
    1626             :                          * copied pages.
    1627             :                          */
    1628             :                         dec_mm_counter(mm, mm_counter(&folio->page));
    1629             :                         /* We have to invalidate as we cleared the pte */
    1630             :                         mmu_notifier_invalidate_range(mm, address,
    1631             :                                                       address + PAGE_SIZE);
    1632           0 :                 } else if (folio_test_anon(folio)) {
    1633           0 :                         swp_entry_t entry = { .val = page_private(subpage) };
    1634             :                         pte_t swp_pte;
    1635             :                         /*
    1636             :                          * Store the swap location in the pte.
    1637             :                          * See handle_pte_fault() ...
    1638             :                          */
    1639           0 :                         if (unlikely(folio_test_swapbacked(folio) !=
    1640             :                                         folio_test_swapcache(folio))) {
    1641           0 :                                 WARN_ON_ONCE(1);
    1642           0 :                                 ret = false;
    1643             :                                 /* We have to invalidate as we cleared the pte */
    1644           0 :                                 mmu_notifier_invalidate_range(mm, address,
    1645             :                                                         address + PAGE_SIZE);
    1646           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1647             :                                 break;
    1648             :                         }
    1649             : 
    1650             :                         /* MADV_FREE page check */
    1651           0 :                         if (!folio_test_swapbacked(folio)) {
    1652             :                                 int ref_count, map_count;
    1653             : 
    1654             :                                 /*
    1655             :                                  * Synchronize with gup_pte_range():
    1656             :                                  * - clear PTE; barrier; read refcount
    1657             :                                  * - inc refcount; barrier; read PTE
    1658             :                                  */
    1659           0 :                                 smp_mb();
    1660             : 
    1661           0 :                                 ref_count = folio_ref_count(folio);
    1662           0 :                                 map_count = folio_mapcount(folio);
    1663             : 
    1664             :                                 /*
    1665             :                                  * Order reads for page refcount and dirty flag
    1666             :                                  * (see comments in __remove_mapping()).
    1667             :                                  */
    1668           0 :                                 smp_rmb();
    1669             : 
    1670             :                                 /*
    1671             :                                  * The only page refs must be one from isolation
    1672             :                                  * plus the rmap(s) (dropped by discard:).
    1673             :                                  */
    1674           0 :                                 if (ref_count == 1 + map_count &&
    1675           0 :                                     !folio_test_dirty(folio)) {
    1676             :                                         /* Invalidate as we cleared the pte */
    1677           0 :                                         mmu_notifier_invalidate_range(mm,
    1678             :                                                 address, address + PAGE_SIZE);
    1679           0 :                                         dec_mm_counter(mm, MM_ANONPAGES);
    1680           0 :                                         goto discard;
    1681             :                                 }
    1682             : 
    1683             :                                 /*
    1684             :                                  * If the folio was redirtied, it cannot be
    1685             :                                  * discarded. Remap the page to page table.
    1686             :                                  */
    1687           0 :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1688           0 :                                 folio_set_swapbacked(folio);
    1689           0 :                                 ret = false;
    1690           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1691             :                                 break;
    1692             :                         }
    1693             : 
    1694           0 :                         if (swap_duplicate(entry) < 0) {
    1695           0 :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1696           0 :                                 ret = false;
    1697           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1698             :                                 break;
    1699             :                         }
    1700           0 :                         if (arch_unmap_one(mm, vma, address, pteval) < 0) {
    1701             :                                 swap_free(entry);
    1702             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1703             :                                 ret = false;
    1704             :                                 page_vma_mapped_walk_done(&pvmw);
    1705             :                                 break;
    1706             :                         }
    1707             : 
    1708             :                         /* See page_try_share_anon_rmap(): clear PTE first. */
    1709           0 :                         if (anon_exclusive &&
    1710           0 :                             page_try_share_anon_rmap(subpage)) {
    1711           0 :                                 swap_free(entry);
    1712           0 :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1713           0 :                                 ret = false;
    1714           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1715             :                                 break;
    1716             :                         }
    1717           0 :                         if (list_empty(&mm->mmlist)) {
    1718           0 :                                 spin_lock(&mmlist_lock);
    1719           0 :                                 if (list_empty(&mm->mmlist))
    1720           0 :                                         list_add(&mm->mmlist, &init_mm.mmlist);
    1721             :                                 spin_unlock(&mmlist_lock);
    1722             :                         }
    1723           0 :                         dec_mm_counter(mm, MM_ANONPAGES);
    1724           0 :                         inc_mm_counter(mm, MM_SWAPENTS);
    1725           0 :                         swp_pte = swp_entry_to_pte(entry);
    1726           0 :                         if (anon_exclusive)
    1727             :                                 swp_pte = pte_swp_mkexclusive(swp_pte);
    1728           0 :                         if (pte_soft_dirty(pteval))
    1729             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    1730             :                         if (pte_uffd_wp(pteval))
    1731             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    1732           0 :                         set_pte_at(mm, address, pvmw.pte, swp_pte);
    1733             :                         /* Invalidate as we cleared the pte */
    1734           0 :                         mmu_notifier_invalidate_range(mm, address,
    1735             :                                                       address + PAGE_SIZE);
    1736             :                 } else {
    1737             :                         /*
    1738             :                          * This is a locked file-backed folio,
    1739             :                          * so it cannot be removed from the page
    1740             :                          * cache and replaced by a new folio before
    1741             :                          * mmu_notifier_invalidate_range_end, so no
    1742             :                          * concurrent thread might update its page table
    1743             :                          * to point at a new folio while a device is
    1744             :                          * still using this folio.
    1745             :                          *
    1746             :                          * See Documentation/mm/mmu_notifier.rst
    1747             :                          */
    1748           0 :                         dec_mm_counter(mm, mm_counter_file(&folio->page));
    1749             :                 }
    1750             : discard:
    1751             :                 /*
     1752             :                  * No need to call mmu_notifier_invalidate_range(): it has been
     1753             :                  * done above for all cases requiring it to happen under the
     1754             :                  * page table lock, before mmu_notifier_invalidate_range_end().
    1755             :                  *
    1756             :                  * See Documentation/mm/mmu_notifier.rst
    1757             :                  */
    1758           0 :                 page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
    1759           0 :                 if (vma->vm_flags & VM_LOCKED)
    1760           0 :                         mlock_drain_local();
    1761             :                 folio_put(folio);
    1762             :         }
    1763             : 
    1764           0 :         mmu_notifier_invalidate_range_end(&range);
    1765             : 
    1766           0 :         return ret;
    1767             : }
    1768             : 
    1769           0 : static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
    1770             : {
    1771           0 :         return vma_is_temporary_stack(vma);
    1772             : }
    1773             : 
    1774           0 : static int folio_not_mapped(struct folio *folio)
    1775             : {
    1776           0 :         return !folio_mapped(folio);
    1777             : }
    1778             : 
    1779             : /**
    1780             :  * try_to_unmap - Try to remove all page table mappings to a folio.
    1781             :  * @folio: The folio to unmap.
    1782             :  * @flags: action and flags
    1783             :  *
    1784             :  * Tries to remove all the page table entries which are mapping this
    1785             :  * folio.  It is the caller's responsibility to check if the folio is
    1786             :  * still mapped if needed (use TTU_SYNC to prevent accounting races).
    1787             :  *
    1788             :  * Context: Caller must hold the folio lock.
    1789             :  */
    1790           0 : void try_to_unmap(struct folio *folio, enum ttu_flags flags)
    1791             : {
    1792           0 :         struct rmap_walk_control rwc = {
    1793             :                 .rmap_one = try_to_unmap_one,
    1794           0 :                 .arg = (void *)flags,
    1795             :                 .done = folio_not_mapped,
    1796             :                 .anon_lock = folio_lock_anon_vma_read,
    1797             :         };
    1798             : 
    1799           0 :         if (flags & TTU_RMAP_LOCKED)
    1800           0 :                 rmap_walk_locked(folio, &rwc);
    1801             :         else
    1802           0 :                 rmap_walk(folio, &rwc);
    1803           0 : }
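                      : 
                      : /*
                      :  * Editor's note -- illustrative sketch only, not part of mm/rmap.c.
                      :  * A reclaim-style caller: the folio is locked around the call and
                      :  * TTU_SYNC is passed so the folio_mapped() check afterwards is not racy.
                      :  * The helper name is hypothetical.
                      :  */
                      : static bool example_fully_unmap_folio(struct folio *folio)
                      : {
                      :         bool unmapped;
                      : 
                      :         folio_lock(folio);
                      :         try_to_unmap(folio, TTU_SYNC);
                      :         unmapped = !folio_mapped(folio);
                      :         folio_unlock(folio);
                      : 
                      :         return unmapped;
                      : }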
    1804             : 
    1805             : /*
    1806             :  * @arg: enum ttu_flags will be passed to this argument.
    1807             :  *
    1808             :  * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
    1809             :  * containing migration entries.
    1810             :  */
    1811           0 : static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
    1812             :                      unsigned long address, void *arg)
    1813             : {
    1814           0 :         struct mm_struct *mm = vma->vm_mm;
    1815           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
    1816             :         pte_t pteval;
    1817             :         struct page *subpage;
    1818           0 :         bool anon_exclusive, ret = true;
    1819             :         struct mmu_notifier_range range;
    1820           0 :         enum ttu_flags flags = (enum ttu_flags)(long)arg;
    1821             : 
    1822             :         /*
    1823             :          * When racing against e.g. zap_pte_range() on another cpu,
    1824             :          * in between its ptep_get_and_clear_full() and page_remove_rmap(),
    1825             :          * try_to_migrate() may return before page_mapped() has become false,
    1826             :          * if page table locking is skipped: use TTU_SYNC to wait for that.
    1827             :          */
    1828           0 :         if (flags & TTU_SYNC)
    1829           0 :                 pvmw.flags = PVMW_SYNC;
    1830             : 
    1831             :         /*
    1832             :          * unmap_page() in mm/huge_memory.c is the only user of migration with
    1833             :          * TTU_SPLIT_HUGE_PMD and it wants to freeze.
    1834             :          */
    1835             :         if (flags & TTU_SPLIT_HUGE_PMD)
    1836             :                 split_huge_pmd_address(vma, address, true, folio);
    1837             : 
    1838             :         /*
     1839             :          * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
    1840             :          * For hugetlb, it could be much worse if we need to do pud
    1841             :          * invalidation in the case of pmd sharing.
    1842             :          *
     1843             :          * Note that the page cannot be freed in this function, as the caller
     1844             :          * of try_to_migrate() must hold a reference on the page.
    1845             :          */
    1846             :         range.end = vma_address_end(&pvmw);
    1847             :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
    1848             :                                 address, range.end);
    1849             :         if (folio_test_hugetlb(folio)) {
    1850             :                 /*
    1851             :                  * If sharing is possible, start and end will be adjusted
    1852             :                  * accordingly.
    1853             :                  */
    1854             :                 adjust_range_if_pmd_sharing_possible(vma, &range.start,
    1855             :                                                      &range.end);
    1856             :         }
    1857             :         mmu_notifier_invalidate_range_start(&range);
    1858             : 
    1859           0 :         while (page_vma_mapped_walk(&pvmw)) {
    1860             : #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
    1861             :                 /* PMD-mapped THP migration entry */
    1862             :                 if (!pvmw.pte) {
    1863             :                         subpage = folio_page(folio,
    1864             :                                 pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
    1865             :                         VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
    1866             :                                         !folio_test_pmd_mappable(folio), folio);
    1867             : 
    1868             :                         if (set_pmd_migration_entry(&pvmw, subpage)) {
    1869             :                                 ret = false;
    1870             :                                 page_vma_mapped_walk_done(&pvmw);
    1871             :                                 break;
    1872             :                         }
    1873             :                         continue;
    1874             :                 }
    1875             : #endif
    1876             : 
    1877             :                 /* Unexpected PMD-mapped THP? */
    1878             :                 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
    1879             : 
    1880           0 :                 if (folio_is_zone_device(folio)) {
    1881             :                         /*
    1882             :                          * Our PTE is a non-present device exclusive entry and
    1883             :                          * calculating the subpage as for the common case would
    1884             :                          * result in an invalid pointer.
    1885             :                          *
    1886             :                          * Since only PAGE_SIZE pages can currently be
    1887             :                          * migrated, just set it to page. This will need to be
    1888             :                          * changed when hugepage migrations to device private
    1889             :                          * memory are supported.
    1890             :                          */
    1891             :                         VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
    1892             :                         subpage = &folio->page;
    1893             :                 } else {
    1894           0 :                         subpage = folio_page(folio,
    1895             :                                         pte_pfn(*pvmw.pte) - folio_pfn(folio));
    1896             :                 }
    1897           0 :                 address = pvmw.address;
    1898           0 :                 anon_exclusive = folio_test_anon(folio) &&
    1899           0 :                                  PageAnonExclusive(subpage);
    1900             : 
    1901           0 :                 if (folio_test_hugetlb(folio)) {
    1902             :                         bool anon = folio_test_anon(folio);
    1903             : 
    1904             :                         /*
    1905             :                          * huge_pmd_unshare may unmap an entire PMD page.
    1906             :                          * There is no way of knowing exactly which PMDs may
    1907             :                          * be cached for this mm, so we must flush them all.
    1908             :                          * start/end were already adjusted above to cover this
    1909             :                          * range.
    1910             :                          */
    1911             :                         flush_cache_range(vma, range.start, range.end);
    1912             : 
    1913             :                         /*
    1914             :                          * To call huge_pmd_unshare, i_mmap_rwsem must be
    1915             :                          * held in write mode.  Caller needs to explicitly
    1916             :                          * do this outside rmap routines.
    1917             :                          *
    1918             :                          * We also must hold hugetlb vma_lock in write mode.
    1919             :                          * Lock order dictates acquiring vma_lock BEFORE
    1920             :                          * i_mmap_rwsem.  We can only try lock here and
    1921             :                          * fail if unsuccessful.
    1922             :                          */
    1923             :                         if (!anon) {
    1924             :                                 VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
    1925             :                                 if (!hugetlb_vma_trylock_write(vma)) {
    1926             :                                         page_vma_mapped_walk_done(&pvmw);
    1927             :                                         ret = false;
    1928             :                                         break;
    1929             :                                 }
    1930             :                                 if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
    1931             :                                         hugetlb_vma_unlock_write(vma);
    1932             :                                         flush_tlb_range(vma,
    1933             :                                                 range.start, range.end);
    1934             :                                         mmu_notifier_invalidate_range(mm,
    1935             :                                                 range.start, range.end);
    1936             : 
    1937             :                                         /*
    1938             :                                          * The ref count of the PMD page was
    1939             :                                          * dropped, which is part of the way map
    1940             :                                          * counting is done for shared PMDs.
    1941             :                                          * Return 'true' here.  When there is
    1942             :                                          * no other sharing, huge_pmd_unshare
    1943             :                                          * returns false and we will unmap the
    1944             :                                          * actual page and drop map count
    1945             :                                          * to zero.
    1946             :                                          */
    1947             :                                         page_vma_mapped_walk_done(&pvmw);
    1948             :                                         break;
    1949             :                                 }
    1950             :                                 hugetlb_vma_unlock_write(vma);
    1951             :                         }
    1952             :                         /* Nuke the hugetlb page table entry */
    1953             :                         pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
    1954             :                 } else {
    1955           0 :                         flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
    1956             :                         /* Nuke the page table entry. */
    1957           0 :                         if (should_defer_flush(mm, flags)) {
    1958             :                                 /*
    1959             :                                  * We clear the PTE but do not flush so potentially
    1960             :                                  * a remote CPU could still be writing to the folio.
    1961             :                                  * If the entry was previously clean then the
    1962             :                                  * architecture must guarantee that a clear->dirty
    1963             :                                  * transition on a cached TLB entry is written through
    1964             :                                  * and traps if the PTE is unmapped.
    1965             :                                  */
    1966             :                                 pteval = ptep_get_and_clear(mm, address, pvmw.pte);
    1967             : 
    1968             :                                 set_tlb_ubc_flush_pending(mm, pteval);
    1969             :                         } else {
    1970           0 :                                 pteval = ptep_clear_flush(vma, address, pvmw.pte);
    1971             :                         }
    1972             :                 }
    1973             : 
    1974             :                 /* Set the dirty flag on the folio now the pte is gone. */
    1975           0 :                 if (pte_dirty(pteval))
    1976           0 :                         folio_mark_dirty(folio);
    1977             : 
    1978             :                 /* Update high watermark before we lower rss */
    1979           0 :                 update_hiwater_rss(mm);
    1980             : 
    1981           0 :                 if (folio_is_device_private(folio)) {
    1982             :                         unsigned long pfn = folio_pfn(folio);
    1983             :                         swp_entry_t entry;
    1984             :                         pte_t swp_pte;
    1985             : 
    1986             :                         if (anon_exclusive)
    1987             :                                 BUG_ON(page_try_share_anon_rmap(subpage));
    1988             : 
    1989             :                         /*
    1990             :                          * Store the pfn of the page in a special migration
    1991             :                          * pte. do_swap_page() will wait until the migration
    1992             :                          * pte is removed and then restart fault handling.
    1993             :                          */
    1994             :                         entry = pte_to_swp_entry(pteval);
    1995             :                         if (is_writable_device_private_entry(entry))
    1996             :                                 entry = make_writable_migration_entry(pfn);
    1997             :                         else if (anon_exclusive)
    1998             :                                 entry = make_readable_exclusive_migration_entry(pfn);
    1999             :                         else
    2000             :                                 entry = make_readable_migration_entry(pfn);
    2001             :                         swp_pte = swp_entry_to_pte(entry);
    2002             : 
    2003             :                         /*
    2004             :                          * pteval maps a zone device page and is therefore
    2005             :                          * a swap pte.
    2006             :                          */
    2007             :                         if (pte_swp_soft_dirty(pteval))
    2008             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    2009             :                         if (pte_swp_uffd_wp(pteval))
    2010             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    2011             :                         set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
    2012             :                         trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
    2013             :                                                 compound_order(&folio->page));
    2014             :                         /*
    2015             :                          * No need to invalidate here: it will synchronize
    2016             :                          * against the special swap migration pte.
    2017             :                          */
    2018           0 :                 } else if (PageHWPoison(subpage)) {
    2019             :                         pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
    2020             :                         if (folio_test_hugetlb(folio)) {
    2021             :                                 hugetlb_count_sub(folio_nr_pages(folio), mm);
    2022             :                                 set_huge_pte_at(mm, address, pvmw.pte, pteval);
    2023             :                         } else {
    2024             :                                 dec_mm_counter(mm, mm_counter(&folio->page));
    2025             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    2026             :                         }
    2027             : 
    2028           0 :                 } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
    2029             :                         /*
    2030             :                          * The guest indicated that the page content is of no
    2031             :                          * interest anymore. Simply discard the pte, vmscan
    2032             :                          * will take care of the rest.
    2033             :                          * A future reference will then fault in a new zero
    2034             :                          * page. When userfaultfd is active, we must not drop
    2035             :                          * this page though, as its main user (postcopy
    2036             :                          * migration) will not expect userfaults on already
    2037             :                          * copied pages.
    2038             :                          */
    2039             :                         dec_mm_counter(mm, mm_counter(&folio->page));
    2040             :                         /* We have to invalidate as we cleared the pte */
    2041             :                         mmu_notifier_invalidate_range(mm, address,
    2042             :                                                       address + PAGE_SIZE);
    2043             :                 } else {
    2044             :                         swp_entry_t entry;
    2045             :                         pte_t swp_pte;
    2046             : 
    2047           0 :                         if (arch_unmap_one(mm, vma, address, pteval) < 0) {
    2048             :                                 if (folio_test_hugetlb(folio))
    2049             :                                         set_huge_pte_at(mm, address, pvmw.pte, pteval);
    2050             :                                 else
    2051             :                                         set_pte_at(mm, address, pvmw.pte, pteval);
    2052             :                                 ret = false;
    2053             :                                 page_vma_mapped_walk_done(&pvmw);
    2054             :                                 break;
    2055             :                         }
    2056             :                         VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
    2057             :                                        !anon_exclusive, subpage);
    2058             : 
    2059             :                         /* See page_try_share_anon_rmap(): clear PTE first. */
    2060           0 :                         if (anon_exclusive &&
    2061           0 :                             page_try_share_anon_rmap(subpage)) {
    2062           0 :                                 if (folio_test_hugetlb(folio))
    2063             :                                         set_huge_pte_at(mm, address, pvmw.pte, pteval);
    2064             :                                 else
    2065           0 :                                         set_pte_at(mm, address, pvmw.pte, pteval);
    2066           0 :                                 ret = false;
    2067           0 :                                 page_vma_mapped_walk_done(&pvmw);
    2068             :                                 break;
    2069             :                         }
    2070             : 
    2071             :                         /*
    2072             :                          * Store the pfn of the page in a special migration
    2073             :                          * pte. do_swap_page() will wait until the migration
    2074             :                          * pte is removed and then restart fault handling.
    2075             :                          */
    2076           0 :                         if (pte_write(pteval))
    2077           0 :                                 entry = make_writable_migration_entry(
    2078           0 :                                                         page_to_pfn(subpage));
    2079           0 :                         else if (anon_exclusive)
    2080           0 :                                 entry = make_readable_exclusive_migration_entry(
    2081           0 :                                                         page_to_pfn(subpage));
    2082             :                         else
    2083           0 :                                 entry = make_readable_migration_entry(
    2084           0 :                                                         page_to_pfn(subpage));
    2085           0 :                         if (pte_young(pteval))
    2086             :                                 entry = make_migration_entry_young(entry);
    2087           0 :                         if (pte_dirty(pteval))
    2088             :                                 entry = make_migration_entry_dirty(entry);
    2089           0 :                         swp_pte = swp_entry_to_pte(entry);
    2090           0 :                         if (pte_soft_dirty(pteval))
    2091             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    2092             :                         if (pte_uffd_wp(pteval))
    2093             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    2094           0 :                         if (folio_test_hugetlb(folio))
    2095             :                                 set_huge_pte_at(mm, address, pvmw.pte, swp_pte);
    2096             :                         else
    2097           0 :                                 set_pte_at(mm, address, pvmw.pte, swp_pte);
    2098           0 :                         trace_set_migration_pte(address, pte_val(swp_pte),
    2099           0 :                                                 compound_order(&folio->page));
    2100             :                         /*
    2101             :                          * No need to invalidate here: it will synchronize
    2102             :                          * against the special swap migration pte.
    2103             :                          */
    2104             :                 }
    2105             : 
    2106             :                 /*
    2107             :                  * No need to call mmu_notifier_invalidate_range(): it has been
    2108             :                  * done above for all cases requiring it to happen under page
    2109             :                  * table lock before mmu_notifier_invalidate_range_end().
    2110             :                  *
    2111             :                  * See Documentation/mm/mmu_notifier.rst
    2112             :                  */
    2113           0 :                 page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
    2114           0 :                 if (vma->vm_flags & VM_LOCKED)
    2115           0 :                         mlock_drain_local();
    2116             :                 folio_put(folio);
    2117             :         }
    2118             : 
    2119           0 :         mmu_notifier_invalidate_range_end(&range);
    2120             : 
    2121           0 :         return ret;
    2122             : }
    2123             : 
    2124             : /**
    2125             :  * try_to_migrate - try to replace all page table mappings with swap entries
    2126             :  * @folio: the folio to replace page table entries for
    2127             :  * @flags: action and flags
    2128             :  *
    2129             :  * Tries to remove all the page table entries which are mapping this folio and
    2130             :  * replace them with special swap entries. Caller must hold the folio lock.
    2131             :  */
    2132           0 : void try_to_migrate(struct folio *folio, enum ttu_flags flags)
    2133             : {
    2134           0 :         struct rmap_walk_control rwc = {
    2135             :                 .rmap_one = try_to_migrate_one,
    2136           0 :                 .arg = (void *)flags,
    2137             :                 .done = folio_not_mapped,
    2138             :                 .anon_lock = folio_lock_anon_vma_read,
    2139             :         };
    2140             : 
    2141             :         /*
    2142             :          * Migration always ignores mlock and only supports the TTU_RMAP_LOCKED,
    2143             :          * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags.
    2144             :          */
    2145           0 :         if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
    2146             :                                         TTU_SYNC | TTU_BATCH_FLUSH)))
    2147           0 :                 return;
    2148             : 
    2149           0 :         if (folio_is_zone_device(folio) &&
    2150             :             (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
    2151             :                 return;
    2152             : 
    2153             :         /*
    2154             :          * During exec, a temporary VMA is set up and later moved.
    2155             :          * The VMA is moved under the anon_vma lock but not the
    2156             :          * page tables leading to a race where migration cannot
    2157             :          * find the migration ptes. Rather than increasing the
    2158             :          * locking requirements of exec(), migration skips
    2159             :          * temporary VMAs until after exec() completes.
    2160             :          */
    2161           0 :         if (!folio_test_ksm(folio) && folio_test_anon(folio))
    2162           0 :                 rwc.invalid_vma = invalid_migration_vma;
    2163             : 
    2164           0 :         if (flags & TTU_RMAP_LOCKED)
    2165           0 :                 rmap_walk_locked(folio, &rwc);
    2166             :         else
    2167           0 :                 rmap_walk(folio, &rwc);
    2168             : }
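
The kernel-doc above states the contract of try_to_migrate() but not how a caller typically drives it. The sketch below shows the general shape of such a caller, loosely modelled on the unmap step of folio migration; it is illustrative only, and example_unmap_for_migration() is an assumed name, not a function in this file or in mm/migrate.c.

	/*
	 * Illustrative only: replace a folio's mappings with migration entries
	 * before copying it. Error handling and the actual copy/restore
	 * machinery are omitted.
	 */
	static bool example_unmap_for_migration(struct folio *src)
	{
		if (!folio_trylock(src))	/* try_to_migrate() needs the folio lock */
			return false;

		if (folio_mapped(src))
			try_to_migrate(src, 0);	/* install migration entries */

		/* Only a fully unmapped folio can actually be migrated. */
		if (folio_mapped(src)) {
			folio_unlock(src);
			return false;
		}

		/* ... copy the folio, then remove_migration_ptes() on the copy ... */
		folio_unlock(src);
		return true;
	}
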
    2169             : 
    2170             : #ifdef CONFIG_DEVICE_PRIVATE
    2171             : struct make_exclusive_args {
    2172             :         struct mm_struct *mm;
    2173             :         unsigned long address;
    2174             :         void *owner;
    2175             :         bool valid;
    2176             : };
    2177             : 
    2178             : static bool page_make_device_exclusive_one(struct folio *folio,
    2179             :                 struct vm_area_struct *vma, unsigned long address, void *priv)
    2180             : {
    2181             :         struct mm_struct *mm = vma->vm_mm;
    2182             :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
    2183             :         struct make_exclusive_args *args = priv;
    2184             :         pte_t pteval;
    2185             :         struct page *subpage;
    2186             :         bool ret = true;
    2187             :         struct mmu_notifier_range range;
    2188             :         swp_entry_t entry;
    2189             :         pte_t swp_pte;
    2190             : 
    2191             :         mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
    2192             :                                       vma->vm_mm, address, min(vma->vm_end,
    2193             :                                       address + folio_size(folio)),
    2194             :                                       args->owner);
    2195             :         mmu_notifier_invalidate_range_start(&range);
    2196             : 
    2197             :         while (page_vma_mapped_walk(&pvmw)) {
    2198             :                 /* Unexpected PMD-mapped THP? */
    2199             :                 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
    2200             : 
    2201             :                 if (!pte_present(*pvmw.pte)) {
    2202             :                         ret = false;
    2203             :                         page_vma_mapped_walk_done(&pvmw);
    2204             :                         break;
    2205             :                 }
    2206             : 
    2207             :                 subpage = folio_page(folio,
    2208             :                                 pte_pfn(*pvmw.pte) - folio_pfn(folio));
    2209             :                 address = pvmw.address;
    2210             : 
    2211             :                 /* Nuke the page table entry. */
    2212             :                 flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
    2213             :                 pteval = ptep_clear_flush(vma, address, pvmw.pte);
    2214             : 
    2215             :                 /* Set the dirty flag on the folio now the pte is gone. */
    2216             :                 if (pte_dirty(pteval))
    2217             :                         folio_mark_dirty(folio);
    2218             : 
    2219             :                 /*
    2220             :                  * Check that our target page is still mapped at the expected
    2221             :                  * address.
    2222             :                  */
    2223             :                 if (args->mm == mm && args->address == address &&
    2224             :                     pte_write(pteval))
    2225             :                         args->valid = true;
    2226             : 
    2227             :                 /*
    2228             :                  * Store the pfn of the page in a special device-exclusive
    2229             :                  * swap pte. On the next CPU fault, do_swap_page() will
    2230             :                  * restore the original pte and then restart fault handling.
    2231             :                  */
    2232             :                 if (pte_write(pteval))
    2233             :                         entry = make_writable_device_exclusive_entry(
    2234             :                                                         page_to_pfn(subpage));
    2235             :                 else
    2236             :                         entry = make_readable_device_exclusive_entry(
    2237             :                                                         page_to_pfn(subpage));
    2238             :                 swp_pte = swp_entry_to_pte(entry);
    2239             :                 if (pte_soft_dirty(pteval))
    2240             :                         swp_pte = pte_swp_mksoft_dirty(swp_pte);
    2241             :                 if (pte_uffd_wp(pteval))
    2242             :                         swp_pte = pte_swp_mkuffd_wp(swp_pte);
    2243             : 
    2244             :                 set_pte_at(mm, address, pvmw.pte, swp_pte);
    2245             : 
    2246             :                 /*
    2247             :                  * There is a reference on the page for the swap entry which has
    2248             :                  * been removed, so we shouldn't take another.
    2249             :                  */
    2250             :                 page_remove_rmap(subpage, vma, false);
    2251             :         }
    2252             : 
    2253             :         mmu_notifier_invalidate_range_end(&range);
    2254             : 
    2255             :         return ret;
    2256             : }
    2257             : 
    2258             : /**
    2259             :  * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
    2260             :  * @folio: The folio to replace page table entries for.
    2261             :  * @mm: The mm_struct where the folio is expected to be mapped.
    2262             :  * @address: Address where the folio is expected to be mapped.
    2263             :  * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
    2264             :  *
    2265             :  * Tries to remove all the page table entries which are mapping this
    2266             :  * folio and replace them with special device exclusive swap entries to
    2267             :  * grant a device exclusive access to the folio.
    2268             :  *
    2269             :  * Context: Caller must hold the folio lock.
    2270             :  * Return: false if the page is still mapped, or if it could not be unmapped
    2271             :  * from the expected address. Otherwise returns true (success).
    2272             :  */
    2273             : static bool folio_make_device_exclusive(struct folio *folio,
    2274             :                 struct mm_struct *mm, unsigned long address, void *owner)
    2275             : {
    2276             :         struct make_exclusive_args args = {
    2277             :                 .mm = mm,
    2278             :                 .address = address,
    2279             :                 .owner = owner,
    2280             :                 .valid = false,
    2281             :         };
    2282             :         struct rmap_walk_control rwc = {
    2283             :                 .rmap_one = page_make_device_exclusive_one,
    2284             :                 .done = folio_not_mapped,
    2285             :                 .anon_lock = folio_lock_anon_vma_read,
    2286             :                 .arg = &args,
    2287             :         };
    2288             : 
    2289             :         /*
    2290             :          * Restrict to anonymous folios for now to avoid potential writeback
    2291             :          * issues.
    2292             :          */
    2293             :         if (!folio_test_anon(folio))
    2294             :                 return false;
    2295             : 
    2296             :         rmap_walk(folio, &rwc);
    2297             : 
    2298             :         return args.valid && !folio_mapcount(folio);
    2299             : }
    2300             : 
    2301             : /**
    2302             :  * make_device_exclusive_range() - Mark a range for exclusive use by a device
    2303             :  * @mm: mm_struct of associated target process
    2304             :  * @start: start of the region to mark for exclusive device access
    2305             :  * @end: end address of region
    2306             :  * @pages: returns the pages which were successfully marked for exclusive access
    2307             :  * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
    2308             :  *
    2309             :  * Returns: number of pages found in the range by GUP. A page is marked for
    2310             :  * exclusive access only if the page pointer is non-NULL.
    2311             :  *
    2312             :  * This function finds the ptes mapping page(s) in the given address range,
    2313             :  * locks them and replaces the mappings with special swap entries preventing
    2314             :  * userspace CPU access. On fault these entries are replaced with the original
    2315             :  * mapping after calling MMU notifiers.
    2316             :  *
    2317             :  * A driver using this to program access from a device must use an mmu notifier
    2318             :  * critical section to hold a device specific lock during programming. Once
    2319             :  * programming is complete it should drop the page lock and reference, after
    2320             :  * which point CPU access to the page will revoke the exclusive access.
    2321             :  */
    2322             : int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
    2323             :                                 unsigned long end, struct page **pages,
    2324             :                                 void *owner)
    2325             : {
    2326             :         long npages = (end - start) >> PAGE_SHIFT;
    2327             :         long i;
    2328             : 
    2329             :         npages = get_user_pages_remote(mm, start, npages,
    2330             :                                        FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
    2331             :                                        pages, NULL, NULL);
    2332             :         if (npages < 0)
    2333             :                 return npages;
    2334             : 
    2335             :         for (i = 0; i < npages; i++, start += PAGE_SIZE) {
    2336             :                 struct folio *folio = page_folio(pages[i]);
    2337             :                 if (PageTail(pages[i]) || !folio_trylock(folio)) {
    2338             :                         folio_put(folio);
    2339             :                         pages[i] = NULL;
    2340             :                         continue;
    2341             :                 }
    2342             : 
    2343             :                 if (!folio_make_device_exclusive(folio, mm, start, owner)) {
    2344             :                         folio_unlock(folio);
    2345             :                         folio_put(folio);
    2346             :                         pages[i] = NULL;
    2347             :                 }
    2348             :         }
    2349             : 
    2350             :         return npages;
    2351             : }
    2352             : EXPORT_SYMBOL_GPL(make_device_exclusive_range);
    2353             : #endif
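
The kernel-doc for make_device_exclusive_range() above describes the driver-side protocol in prose. The sketch below illustrates it under stated assumptions: the example_* name, the error handling and the device-programming step are placeholders and the driver's MMU-notifier critical section is elided; only the helpers visible in this file are real. Since the helper uses get_user_pages_remote() internally, the caller must hold mmap_lock.

	/*
	 * Illustrative only: mark a user address range for exclusive device
	 * access, program the device, then release the pages returned locked
	 * and referenced by make_device_exclusive_range().
	 */
	static int example_grant_device_exclusive(struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end, void *owner)
	{
		long npages = (end - start) >> PAGE_SHIFT;
		struct page **pages;
		int ret;
		long i;

		pages = kvcalloc(npages, sizeof(*pages), GFP_KERNEL);
		if (!pages)
			return -ENOMEM;

		mmap_read_lock(mm);
		ret = make_device_exclusive_range(mm, start, end, pages, owner);
		mmap_read_unlock(mm);
		if (ret < 0)
			goto out;

		for (i = 0; i < ret; i++) {
			struct folio *folio;

			if (!pages[i])
				continue;	/* could not be made exclusive, skip */
			folio = page_folio(pages[i]);
			/* ... program the device mapping for pages[i] here ... */
			folio_unlock(folio);	/* locked by the helper above */
			folio_put(folio);	/* drop the FOLL_GET reference */
		}
		ret = 0;
	out:
		kvfree(pages);
		return ret;
	}
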
    2354             : 
    2355           0 : void __put_anon_vma(struct anon_vma *anon_vma)
    2356             : {
    2357           0 :         struct anon_vma *root = anon_vma->root;
    2358             : 
    2359           0 :         anon_vma_free(anon_vma);
    2360           0 :         if (root != anon_vma && atomic_dec_and_test(&root->refcount))
    2361           0 :                 anon_vma_free(root);
    2362           0 : }
    2363             : 
    2364           0 : static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
    2365             :                                             struct rmap_walk_control *rwc)
    2366             : {
    2367             :         struct anon_vma *anon_vma;
    2368             : 
    2369           0 :         if (rwc->anon_lock)
    2370           0 :                 return rwc->anon_lock(folio, rwc);
    2371             : 
    2372             :         /*
    2373             :          * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
    2374             :          * because that depends on page_mapped(); but not all its usages
    2375             :          * are holding mmap_lock. Users without mmap_lock are required to
    2376             :          * take a reference count to prevent the anon_vma from disappearing.
    2377             :          */
    2378           0 :         anon_vma = folio_anon_vma(folio);
    2379           0 :         if (!anon_vma)
    2380             :                 return NULL;
    2381             : 
    2382           0 :         if (anon_vma_trylock_read(anon_vma))
    2383             :                 goto out;
    2384             : 
    2385           0 :         if (rwc->try_lock) {
    2386           0 :                 anon_vma = NULL;
    2387           0 :                 rwc->contended = true;
    2388           0 :                 goto out;
    2389             :         }
    2390             : 
    2391           0 :         anon_vma_lock_read(anon_vma);
    2392             : out:
    2393             :         return anon_vma;
    2394             : }
    2395             : 
    2396             : /*
    2397             :  * rmap_walk_anon - do something to an anonymous folio using the object-based
    2398             :  * rmap method
    2399             :  * @folio: the folio to be handled
    2400             :  * @rwc: control variable according to each walk type
    2401             :  *
    2402             :  * Find all the mappings of a page using the mapping pointer and the vma chains
    2403             :  * contained in the anon_vma struct it points to.
    2404             :  */
    2405           0 : static void rmap_walk_anon(struct folio *folio,
    2406             :                 struct rmap_walk_control *rwc, bool locked)
    2407             : {
    2408             :         struct anon_vma *anon_vma;
    2409             :         pgoff_t pgoff_start, pgoff_end;
    2410             :         struct anon_vma_chain *avc;
    2411             : 
    2412           0 :         if (locked) {
    2413           0 :                 anon_vma = folio_anon_vma(folio);
    2414             :                 /* anon_vma disappear under us? */
    2415             :                 VM_BUG_ON_FOLIO(!anon_vma, folio);
    2416             :         } else {
    2417           0 :                 anon_vma = rmap_walk_anon_lock(folio, rwc);
    2418             :         }
    2419           0 :         if (!anon_vma)
    2420             :                 return;
    2421             : 
    2422           0 :         pgoff_start = folio_pgoff(folio);
    2423           0 :         pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
    2424           0 :         anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
    2425             :                         pgoff_start, pgoff_end) {
    2426           0 :                 struct vm_area_struct *vma = avc->vma;
    2427           0 :                 unsigned long address = vma_address(&folio->page, vma);
    2428             : 
    2429             :                 VM_BUG_ON_VMA(address == -EFAULT, vma);
    2430           0 :                 cond_resched();
    2431             : 
    2432           0 :                 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
    2433           0 :                         continue;
    2434             : 
    2435           0 :                 if (!rwc->rmap_one(folio, vma, address, rwc->arg))
    2436             :                         break;
    2437           0 :                 if (rwc->done && rwc->done(folio))
    2438             :                         break;
    2439             :         }
    2440             : 
    2441           0 :         if (!locked)
    2442           0 :                 anon_vma_unlock_read(anon_vma);
    2443             : }
    2444             : 
    2445             : /*
    2446             :  * rmap_walk_file - do something to a file-backed folio using the object-based rmap method
    2447             :  * @folio: the folio to be handled
    2448             :  * @rwc: control variable according to each walk type
    2449             :  *
    2450             :  * Find all the mappings of a page using the mapping pointer and the vma chains
    2451             :  * contained in the address_space struct it points to.
    2452             :  */
    2453           0 : static void rmap_walk_file(struct folio *folio,
    2454             :                 struct rmap_walk_control *rwc, bool locked)
    2455             : {
    2456           0 :         struct address_space *mapping = folio_mapping(folio);
    2457             :         pgoff_t pgoff_start, pgoff_end;
    2458             :         struct vm_area_struct *vma;
    2459             : 
    2460             :         /*
    2461             :          * The page lock not only makes sure that page->mapping cannot
    2462             :          * suddenly be NULLified by truncation, it makes sure that the
    2463             :          * structure at mapping cannot be freed and reused yet,
    2464             :          * so we can safely take mapping->i_mmap_rwsem.
    2465             :          */
    2466             :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
    2467             : 
    2468           0 :         if (!mapping)
    2469             :                 return;
    2470             : 
    2471           0 :         pgoff_start = folio_pgoff(folio);
    2472           0 :         pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
    2473           0 :         if (!locked) {
    2474           0 :                 if (i_mmap_trylock_read(mapping))
    2475             :                         goto lookup;
    2476             : 
    2477           0 :                 if (rwc->try_lock) {
    2478           0 :                         rwc->contended = true;
    2479           0 :                         return;
    2480             :                 }
    2481             : 
    2482             :                 i_mmap_lock_read(mapping);
    2483             :         }
    2484             : lookup:
    2485           0 :         vma_interval_tree_foreach(vma, &mapping->i_mmap,
    2486             :                         pgoff_start, pgoff_end) {
    2487           0 :                 unsigned long address = vma_address(&folio->page, vma);
    2488             : 
    2489             :                 VM_BUG_ON_VMA(address == -EFAULT, vma);
    2490           0 :                 cond_resched();
    2491             : 
    2492           0 :                 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
    2493           0 :                         continue;
    2494             : 
    2495           0 :                 if (!rwc->rmap_one(folio, vma, address, rwc->arg))
    2496             :                         goto done;
    2497           0 :                 if (rwc->done && rwc->done(folio))
    2498             :                         goto done;
    2499             :         }
    2500             : 
    2501             : done:
    2502           0 :         if (!locked)
    2503             :                 i_mmap_unlock_read(mapping);
    2504             : }
    2505             : 
    2506           0 : void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
    2507             : {
    2508           0 :         if (unlikely(folio_test_ksm(folio)))
    2509             :                 rmap_walk_ksm(folio, rwc);
    2510           0 :         else if (folio_test_anon(folio))
    2511           0 :                 rmap_walk_anon(folio, rwc, false);
    2512             :         else
    2513           0 :                 rmap_walk_file(folio, rwc, false);
    2514           0 : }
    2515             : 
    2516             : /* Like rmap_walk, but caller holds relevant rmap lock */
    2517           0 : void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
    2518             : {
    2519             :         /* no ksm support for now */
    2520             :         VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
    2521           0 :         if (folio_test_anon(folio))
    2522           0 :                 rmap_walk_anon(folio, rwc, true);
    2523             :         else
    2524           0 :                 rmap_walk_file(folio, rwc, true);
    2525           0 : }
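
rmap_walk() and rmap_walk_locked() are driven entirely by the rmap_walk_control that the caller fills in, as the walks above show. A minimal sketch of that contract follows; the example_* callback and its counting purpose are invented for illustration and are not part of this file.

	/*
	 * Illustrative only: count how many VMAs still map a folio. Shows the
	 * rmap_one contract (return true to continue the walk, false to stop).
	 */
	static bool example_count_one(struct folio *folio, struct vm_area_struct *vma,
				      unsigned long address, void *arg)
	{
		int *nr_vmas = arg;

		(*nr_vmas)++;
		return true;		/* keep walking the remaining VMAs */
	}

	static int example_count_mappings(struct folio *folio)
	{
		int nr_vmas = 0;
		struct rmap_walk_control rwc = {
			.rmap_one = example_count_one,
			.arg = &nr_vmas,
			.anon_lock = folio_lock_anon_vma_read,
		};

		/* The folio must be locked, exactly as for the walks in this file. */
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
		rmap_walk(folio, &rwc);
		return nr_vmas;
	}
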
    2526             : 
    2527             : #ifdef CONFIG_HUGETLB_PAGE
    2528             : /*
    2529             :  * The following two functions are for anonymous (privately mapped) hugepages.
    2530             :  * Unlike common anonymous pages, anonymous hugepages have no accounting code
    2531             :  * and no lru code, because we handle hugepages differently from common pages.
    2532             :  *
    2533             :  * RMAP_COMPOUND is ignored.
    2534             :  */
    2535             : void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
    2536             :                             unsigned long address, rmap_t flags)
    2537             : {
    2538             :         struct folio *folio = page_folio(page);
    2539             :         struct anon_vma *anon_vma = vma->anon_vma;
    2540             :         int first;
    2541             : 
    2542             :         BUG_ON(!folio_test_locked(folio));
    2543             :         BUG_ON(!anon_vma);
    2544             :         /* address might be in next vma when migration races vma_merge */
    2545             :         first = atomic_inc_and_test(&folio->_entire_mapcount);
    2546             :         VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
    2547             :         VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
    2548             :         if (first)
    2549             :                 __page_set_anon_rmap(folio, page, vma, address,
    2550             :                                      !!(flags & RMAP_EXCLUSIVE));
    2551             : }
    2552             : 
    2553             : void hugepage_add_new_anon_rmap(struct folio *folio,
    2554             :                         struct vm_area_struct *vma, unsigned long address)
    2555             : {
    2556             :         BUG_ON(address < vma->vm_start || address >= vma->vm_end);
    2557             :         /* increment count (starts at -1) */
    2558             :         atomic_set(&folio->_entire_mapcount, 0);
    2559             :         folio_clear_hugetlb_restore_reserve(folio);
    2560             :         __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
    2561             : }
    2562             : #endif /* CONFIG_HUGETLB_PAGE */
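
For orientation, the sketch below shows the rough shape of a caller that installs a freshly allocated anonymous hugetlb folio with hugepage_add_new_anon_rmap(). It is a simplified illustration, not the actual fault path in mm/hugetlb.c; the example_* name is assumed, and the caller is assumed to hold the page table lock and to have built new_pte, with reservation and counter updates omitted.

	/*
	 * Illustrative only: map a newly allocated anonymous hugetlb folio at
	 * haddr. The new folio has no other users yet, so it becomes
	 * anon-exclusive when the rmap is added.
	 */
	static void example_map_new_anon_hugepage(struct vm_area_struct *vma,
						  unsigned long haddr, pte_t *ptep,
						  struct folio *new_folio, pte_t new_pte)
	{
		hugepage_add_new_anon_rmap(new_folio, vma, haddr);
		set_huge_pte_at(vma->vm_mm, haddr, ptep, new_pte);
	}
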

Generated by: LCOV version 1.14