LCOV - code coverage report
Current view: top level - mm - rmap.c (source / functions)
Test: coverage.info
Date: 2023-04-06 08:38:28
Coverage:   Lines: 4 of 561 (0.7 %)   Functions: 1 of 40 (2.5 %)

          Line data    Source code
       1             : /*
       2             :  * mm/rmap.c - physical to virtual reverse mappings
       3             :  *
       4             :  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
       5             :  * Released under the General Public License (GPL).
       6             :  *
       7             :  * Simple, low overhead reverse mapping scheme.
       8             :  * Please try to keep this thing as modular as possible.
       9             :  *
      10             :  * Provides methods for unmapping each kind of mapped page:
      11             :  * the anon methods track anonymous pages, and
      12             :  * the file methods track pages belonging to an inode.
      13             :  *
      14             :  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
      15             :  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
      16             :  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
      17             :  * Contributions by Hugh Dickins 2003, 2004
      18             :  */
      19             : 
      20             : /*
      21             :  * Lock ordering in mm:
      22             :  *
      23             :  * inode->i_rwsem    (while writing or truncating, not reading or faulting)
      24             :  *   mm->mmap_lock
      25             :  *     mapping->invalidate_lock (in filemap_fault)
      26             :  *       page->flags PG_locked (lock_page)
      27             :  *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
      28             :  *           mapping->i_mmap_rwsem
      29             :  *             anon_vma->rwsem
      30             :  *               mm->page_table_lock or pte_lock
      31             :  *                 swap_lock (in swap_duplicate, swap_info_get)
      32             :  *                   mmlist_lock (in mmput, drain_mmlist and others)
      33             :  *                   mapping->private_lock (in block_dirty_folio)
      34             :  *                     folio_lock_memcg move_lock (in block_dirty_folio)
      35             :  *                       i_pages lock (widely used)
      36             :  *                         lruvec->lru_lock (in folio_lruvec_lock_irq)
      37             :  *                   inode->i_lock (in set_page_dirty's __mark_inode_dirty)
      38             :  *                   bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
      39             :  *                     sb_lock (within inode_lock in fs/fs-writeback.c)
      40             :  *                     i_pages lock (widely used, in set_page_dirty,
      41             :  *                               in arch-dependent flush_dcache_mmap_lock,
      42             :  *                               within bdi.wb->list_lock in __sync_single_inode)
      43             :  *
      44             :  * anon_vma->rwsem, mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
      45             :  *   ->tasklist_lock
      46             :  *     pte map lock
      47             :  *
      48             :  * hugetlbfs PageHuge() takes locks in this order:
      49             :  *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
      50             :  *     vma_lock (hugetlb specific lock for pmd_sharing)
      51             :  *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
      52             :  *         page->flags PG_locked (lock_page)
      53             :  */
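
/*
 * Editor's note: a minimal, purely illustrative sketch (not part of rmap.c;
 * the function name is hypothetical) of a nesting that is consistent with
 * the ordering documented above: mmap_lock, then the folio lock, then the
 * anon_vma rwsem, with the pte lock innermost.
 */
static void lock_ordering_sketch(struct vm_area_struct *vma, struct folio *folio)
{
        mmap_read_lock(vma->vm_mm);             /* mm->mmap_lock */
        folio_lock(folio);                      /* page->flags PG_locked */
        anon_vma_lock_read(vma->anon_vma);      /* anon_vma->rwsem */
        /* ... pte_offset_map_lock() would nest innermost here ... */
        anon_vma_unlock_read(vma->anon_vma);
        folio_unlock(folio);
        mmap_read_unlock(vma->vm_mm);
}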
      54             : 
      55             : #include <linux/mm.h>
      56             : #include <linux/sched/mm.h>
      57             : #include <linux/sched/task.h>
      58             : #include <linux/pagemap.h>
      59             : #include <linux/swap.h>
      60             : #include <linux/swapops.h>
      61             : #include <linux/slab.h>
      62             : #include <linux/init.h>
      63             : #include <linux/ksm.h>
      64             : #include <linux/rmap.h>
      65             : #include <linux/rcupdate.h>
      66             : #include <linux/export.h>
      67             : #include <linux/memcontrol.h>
      68             : #include <linux/mmu_notifier.h>
      69             : #include <linux/migrate.h>
      70             : #include <linux/hugetlb.h>
      71             : #include <linux/huge_mm.h>
      72             : #include <linux/backing-dev.h>
      73             : #include <linux/page_idle.h>
      74             : #include <linux/memremap.h>
      75             : #include <linux/userfaultfd_k.h>
      76             : #include <linux/mm_inline.h>
      77             : 
      78             : #include <asm/tlbflush.h>
      79             : 
      80             : #define CREATE_TRACE_POINTS
      81             : #include <trace/events/tlb.h>
      82             : #include <trace/events/migrate.h>
      83             : 
      84             : #include "internal.h"
      85             : 
      86             : static struct kmem_cache *anon_vma_cachep;
      87             : static struct kmem_cache *anon_vma_chain_cachep;
      88             : 
      89           0 : static inline struct anon_vma *anon_vma_alloc(void)
      90             : {
      91             :         struct anon_vma *anon_vma;
      92             : 
      93           0 :         anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
      94           0 :         if (anon_vma) {
      95           0 :                 atomic_set(&anon_vma->refcount, 1);
      96           0 :                 anon_vma->num_children = 0;
      97           0 :                 anon_vma->num_active_vmas = 0;
      98           0 :                 anon_vma->parent = anon_vma;
      99             :                 /*
     100             :                  * Initialise the anon_vma root to point to itself. If called
      101             :                  * from fork, the root will be reset to the parent's anon_vma.
     102             :                  */
     103           0 :                 anon_vma->root = anon_vma;
     104             :         }
     105             : 
     106           0 :         return anon_vma;
     107             : }
     108             : 
     109           0 : static inline void anon_vma_free(struct anon_vma *anon_vma)
     110             : {
     111             :         VM_BUG_ON(atomic_read(&anon_vma->refcount));
     112             : 
     113             :         /*
     114             :          * Synchronize against folio_lock_anon_vma_read() such that
     115             :          * we can safely hold the lock without the anon_vma getting
     116             :          * freed.
     117             :          *
     118             :          * Relies on the full mb implied by the atomic_dec_and_test() from
     119             :          * put_anon_vma() against the acquire barrier implied by
     120             :          * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
     121             :          *
     122             :          * folio_lock_anon_vma_read()   VS      put_anon_vma()
     123             :          *   down_read_trylock()                  atomic_dec_and_test()
     124             :          *   LOCK                                 MB
     125             :          *   atomic_read()                        rwsem_is_locked()
     126             :          *
     127             :          * LOCK should suffice since the actual taking of the lock must
     128             :          * happen _before_ what follows.
     129             :          */
     130             :         might_sleep();
     131           0 :         if (rwsem_is_locked(&anon_vma->root->rwsem)) {
     132           0 :                 anon_vma_lock_write(anon_vma);
     133           0 :                 anon_vma_unlock_write(anon_vma);
     134             :         }
     135             : 
     136           0 :         kmem_cache_free(anon_vma_cachep, anon_vma);
     137           0 : }
     138             : 
     139             : static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
     140             : {
     141           0 :         return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
     142             : }
     143             : 
     144             : static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
     145             : {
     146           0 :         kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
     147             : }
     148             : 
     149             : static void anon_vma_chain_link(struct vm_area_struct *vma,
     150             :                                 struct anon_vma_chain *avc,
     151             :                                 struct anon_vma *anon_vma)
     152             : {
     153           0 :         avc->vma = vma;
     154           0 :         avc->anon_vma = anon_vma;
     155           0 :         list_add(&avc->same_vma, &vma->anon_vma_chain);
     156           0 :         anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
     157             : }
     158             : 
     159             : /**
     160             :  * __anon_vma_prepare - attach an anon_vma to a memory region
     161             :  * @vma: the memory region in question
     162             :  *
     163             :  * This makes sure the memory mapping described by 'vma' has
     164             :  * an 'anon_vma' attached to it, so that we can associate the
     165             :  * anonymous pages mapped into it with that anon_vma.
     166             :  *
     167             :  * The common case will be that we already have one, which
     168             :  * is handled inline by anon_vma_prepare(). But if
      169             :  * not, we either need to find an adjacent mapping that we
     170             :  * can re-use the anon_vma from (very common when the only
     171             :  * reason for splitting a vma has been mprotect()), or we
     172             :  * allocate a new one.
     173             :  *
     174             :  * Anon-vma allocations are very subtle, because we may have
     175             :  * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
     176             :  * and that may actually touch the rwsem even in the newly
     177             :  * allocated vma (it depends on RCU to make sure that the
     178             :  * anon_vma isn't actually destroyed).
     179             :  *
     180             :  * As a result, we need to do proper anon_vma locking even
     181             :  * for the new allocation. At the same time, we do not want
     182             :  * to do any locking for the common case of already having
     183             :  * an anon_vma.
     184             :  *
     185             :  * This must be called with the mmap_lock held for reading.
     186             :  */
     187           0 : int __anon_vma_prepare(struct vm_area_struct *vma)
     188             : {
     189           0 :         struct mm_struct *mm = vma->vm_mm;
     190             :         struct anon_vma *anon_vma, *allocated;
     191             :         struct anon_vma_chain *avc;
     192             : 
     193             :         might_sleep();
     194             : 
     195           0 :         avc = anon_vma_chain_alloc(GFP_KERNEL);
     196           0 :         if (!avc)
     197             :                 goto out_enomem;
     198             : 
     199           0 :         anon_vma = find_mergeable_anon_vma(vma);
     200           0 :         allocated = NULL;
     201           0 :         if (!anon_vma) {
     202           0 :                 anon_vma = anon_vma_alloc();
     203           0 :                 if (unlikely(!anon_vma))
     204             :                         goto out_enomem_free_avc;
     205           0 :                 anon_vma->num_children++; /* self-parent link for new root */
     206           0 :                 allocated = anon_vma;
     207             :         }
     208             : 
     209           0 :         anon_vma_lock_write(anon_vma);
     210             :         /* page_table_lock to protect against threads */
     211           0 :         spin_lock(&mm->page_table_lock);
     212           0 :         if (likely(!vma->anon_vma)) {
     213           0 :                 vma->anon_vma = anon_vma;
     214           0 :                 anon_vma_chain_link(vma, avc, anon_vma);
     215           0 :                 anon_vma->num_active_vmas++;
     216           0 :                 allocated = NULL;
     217           0 :                 avc = NULL;
     218             :         }
     219           0 :         spin_unlock(&mm->page_table_lock);
     220           0 :         anon_vma_unlock_write(anon_vma);
     221             : 
     222           0 :         if (unlikely(allocated))
     223             :                 put_anon_vma(allocated);
     224           0 :         if (unlikely(avc))
     225             :                 anon_vma_chain_free(avc);
     226             : 
     227             :         return 0;
     228             : 
     229             :  out_enomem_free_avc:
     230             :         anon_vma_chain_free(avc);
     231             :  out_enomem:
     232             :         return -ENOMEM;
     233             : }
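
/*
 * Editor's note: the inline fast path referred to in the comment above lives
 * in include/linux/rmap.h; quoted from memory, so treat it as a sketch. It
 * only calls __anon_vma_prepare() when no anon_vma is attached yet.
 */
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
        if (likely(vma->anon_vma))
                return 0;

        return __anon_vma_prepare(vma);
}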
     234             : 
     235             : /*
     236             :  * This is a useful helper function for locking the anon_vma root as
      237             :  * we traverse the vma->anon_vma_chain, looping over anon_vmas that
      238             :  * have the same vma.
      239             :  *
      240             :  * Such anon_vmas should have the same root, so you'd expect to see
      241             :  * just a single lock of the root rwsem for the whole traversal.
     242             :  */
     243           0 : static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
     244             : {
     245           0 :         struct anon_vma *new_root = anon_vma->root;
     246           0 :         if (new_root != root) {
     247           0 :                 if (WARN_ON_ONCE(root))
     248           0 :                         up_write(&root->rwsem);
     249           0 :                 root = new_root;
     250           0 :                 down_write(&root->rwsem);
     251             :         }
     252           0 :         return root;
     253             : }
     254             : 
     255             : static inline void unlock_anon_vma_root(struct anon_vma *root)
     256             : {
     257           0 :         if (root)
     258           0 :                 up_write(&root->rwsem);
     259             : }
     260             : 
     261             : /*
     262             :  * Attach the anon_vmas from src to dst.
     263             :  * Returns 0 on success, -ENOMEM on failure.
     264             :  *
     265             :  * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
     266             :  * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
     267             :  * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
     268             :  * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
      269             :  * the call, we can identify this case by checking (!dst->anon_vma &&
     270             :  * src->anon_vma).
     271             :  *
     272             :  * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
     273             :  * and reuse existing anon_vma which has no vmas and only one child anon_vma.
      274             :  * This prevents degradation of the anon_vma hierarchy into an endless linear
      275             :  * chain in the case of a constantly forking task. On the other hand, an
      276             :  * anon_vma with more than one child isn't reused even if there is no live
      277             :  * vma, so the rmap walker has a good chance of avoiding a scan of the whole
      278             :  * hierarchy when it searches for where a page is mapped.
     279             :  */
     280           0 : int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
     281             : {
     282             :         struct anon_vma_chain *avc, *pavc;
     283           0 :         struct anon_vma *root = NULL;
     284             : 
     285           0 :         list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
     286             :                 struct anon_vma *anon_vma;
     287             : 
     288           0 :                 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
     289           0 :                 if (unlikely(!avc)) {
     290           0 :                         unlock_anon_vma_root(root);
     291           0 :                         root = NULL;
     292           0 :                         avc = anon_vma_chain_alloc(GFP_KERNEL);
     293           0 :                         if (!avc)
     294             :                                 goto enomem_failure;
     295             :                 }
     296           0 :                 anon_vma = pavc->anon_vma;
     297           0 :                 root = lock_anon_vma_root(root, anon_vma);
     298           0 :                 anon_vma_chain_link(dst, avc, anon_vma);
     299             : 
     300             :                 /*
     301             :                  * Reuse existing anon_vma if it has no vma and only one
     302             :                  * anon_vma child.
     303             :                  *
     304             :                  * Root anon_vma is never reused:
     305             :                  * it has self-parent reference and at least one child.
     306             :                  */
     307           0 :                 if (!dst->anon_vma && src->anon_vma &&
     308           0 :                     anon_vma->num_children < 2 &&
     309           0 :                     anon_vma->num_active_vmas == 0)
     310           0 :                         dst->anon_vma = anon_vma;
     311             :         }
     312           0 :         if (dst->anon_vma)
     313           0 :                 dst->anon_vma->num_active_vmas++;
     314             :         unlock_anon_vma_root(root);
     315             :         return 0;
     316             : 
     317             :  enomem_failure:
     318             :         /*
     319             :          * dst->anon_vma is dropped here otherwise its num_active_vmas can
     320             :          * be incorrectly decremented in unlink_anon_vmas().
     321             :          * We can safely do this because callers of anon_vma_clone() don't care
     322             :          * about dst->anon_vma if anon_vma_clone() failed.
     323             :          */
     324           0 :         dst->anon_vma = NULL;
     325           0 :         unlink_anon_vmas(dst);
     326           0 :         return -ENOMEM;
     327             : }
     328             : 
     329             : /*
     330             :  * Attach vma to its own anon_vma, as well as to the anon_vmas that
     331             :  * the corresponding VMA in the parent process is attached to.
     332             :  * Returns 0 on success, non-zero on failure.
     333             :  */
     334           0 : int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
     335             : {
     336             :         struct anon_vma_chain *avc;
     337             :         struct anon_vma *anon_vma;
     338             :         int error;
     339             : 
     340             :         /* Don't bother if the parent process has no anon_vma here. */
     341           0 :         if (!pvma->anon_vma)
     342             :                 return 0;
     343             : 
     344             :         /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
     345           0 :         vma->anon_vma = NULL;
     346             : 
     347             :         /*
     348             :          * First, attach the new VMA to the parent VMA's anon_vmas,
     349             :          * so rmap can find non-COWed pages in child processes.
     350             :          */
     351           0 :         error = anon_vma_clone(vma, pvma);
     352           0 :         if (error)
     353             :                 return error;
     354             : 
     355             :         /* An existing anon_vma has been reused, all done then. */
     356           0 :         if (vma->anon_vma)
     357             :                 return 0;
     358             : 
     359             :         /* Then add our own anon_vma. */
     360           0 :         anon_vma = anon_vma_alloc();
     361           0 :         if (!anon_vma)
     362             :                 goto out_error;
     363           0 :         anon_vma->num_active_vmas++;
     364           0 :         avc = anon_vma_chain_alloc(GFP_KERNEL);
     365           0 :         if (!avc)
     366             :                 goto out_error_free_anon_vma;
     367             : 
     368             :         /*
     369             :          * The root anon_vma's rwsem is the lock actually used when we
     370             :          * lock any of the anon_vmas in this anon_vma tree.
     371             :          */
     372           0 :         anon_vma->root = pvma->anon_vma->root;
     373           0 :         anon_vma->parent = pvma->anon_vma;
     374             :         /*
     375             :          * With refcounts, an anon_vma can stay around longer than the
     376             :          * process it belongs to. The root anon_vma needs to be pinned until
     377             :          * this anon_vma is freed, because the lock lives in the root.
     378             :          */
     379           0 :         get_anon_vma(anon_vma->root);
     380             :         /* Mark this anon_vma as the one where our new (COWed) pages go. */
     381           0 :         vma->anon_vma = anon_vma;
     382           0 :         anon_vma_lock_write(anon_vma);
     383           0 :         anon_vma_chain_link(vma, avc, anon_vma);
     384           0 :         anon_vma->parent->num_children++;
     385           0 :         anon_vma_unlock_write(anon_vma);
     386             : 
     387           0 :         return 0;
     388             : 
     389             :  out_error_free_anon_vma:
     390             :         put_anon_vma(anon_vma);
     391             :  out_error:
     392           0 :         unlink_anon_vmas(vma);
     393           0 :         return -ENOMEM;
     394             : }
     395             : 
     396           0 : void unlink_anon_vmas(struct vm_area_struct *vma)
     397             : {
     398             :         struct anon_vma_chain *avc, *next;
     399           0 :         struct anon_vma *root = NULL;
     400             : 
     401             :         /*
     402             :          * Unlink each anon_vma chained to the VMA.  This list is ordered
     403             :          * from newest to oldest, ensuring the root anon_vma gets freed last.
     404             :          */
     405           0 :         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
     406           0 :                 struct anon_vma *anon_vma = avc->anon_vma;
     407             : 
     408           0 :                 root = lock_anon_vma_root(root, anon_vma);
     409           0 :                 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
     410             : 
     411             :                 /*
     412             :                  * Leave empty anon_vmas on the list - we'll need
     413             :                  * to free them outside the lock.
     414             :                  */
     415           0 :                 if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
     416           0 :                         anon_vma->parent->num_children--;
     417           0 :                         continue;
     418             :                 }
     419             : 
     420           0 :                 list_del(&avc->same_vma);
     421             :                 anon_vma_chain_free(avc);
     422             :         }
     423           0 :         if (vma->anon_vma) {
     424           0 :                 vma->anon_vma->num_active_vmas--;
     425             : 
     426             :                 /*
      427             :                  * The vma may still be used after unlink; a fresh anon_vma
      428             :                  * will be prepared when a fault is next handled.
     429             :                  */
     430           0 :                 vma->anon_vma = NULL;
     431             :         }
     432           0 :         unlock_anon_vma_root(root);
     433             : 
     434             :         /*
      435             :          * Iterate the list once more; it now contains only empty and unlinked
      436             :          * anon_vmas, so destroy them. This could not be done earlier because
      437             :          * __put_anon_vma() needs to write-acquire the anon_vma->root->rwsem.
     438             :          */
     439           0 :         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
     440           0 :                 struct anon_vma *anon_vma = avc->anon_vma;
     441             : 
     442             :                 VM_WARN_ON(anon_vma->num_children);
     443             :                 VM_WARN_ON(anon_vma->num_active_vmas);
     444           0 :                 put_anon_vma(anon_vma);
     445             : 
     446           0 :                 list_del(&avc->same_vma);
     447           0 :                 anon_vma_chain_free(avc);
     448             :         }
     449           0 : }
     450             : 
     451           0 : static void anon_vma_ctor(void *data)
     452             : {
     453           0 :         struct anon_vma *anon_vma = data;
     454             : 
     455           0 :         init_rwsem(&anon_vma->rwsem);
     456           0 :         atomic_set(&anon_vma->refcount, 0);
     457           0 :         anon_vma->rb_root = RB_ROOT_CACHED;
     458           0 : }
     459             : 
     460           1 : void __init anon_vma_init(void)
     461             : {
     462           1 :         anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
     463             :                         0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
     464             :                         anon_vma_ctor);
     465           1 :         anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
     466             :                         SLAB_PANIC|SLAB_ACCOUNT);
     467           1 : }
     468             : 
     469             : /*
     470             :  * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
     471             :  *
      472             :  * Since there is no serialization whatsoever against page_remove_rmap(),
      473             :  * the best this function can do is return a refcount-increased anon_vma
     474             :  * that might have been relevant to this page.
     475             :  *
     476             :  * The page might have been remapped to a different anon_vma or the anon_vma
     477             :  * returned may already be freed (and even reused).
     478             :  *
     479             :  * In case it was remapped to a different anon_vma, the new anon_vma will be a
     480             :  * child of the old anon_vma, and the anon_vma lifetime rules will therefore
     481             :  * ensure that any anon_vma obtained from the page will still be valid for as
     482             :  * long as we observe page_mapped() [ hence all those page_mapped() tests ].
     483             :  *
     484             :  * All users of this function must be very careful when walking the anon_vma
     485             :  * chain and verify that the page in question is indeed mapped in it
     486             :  * [ something equivalent to page_mapped_in_vma() ].
     487             :  *
     488             :  * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
     489             :  * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
     490             :  * if there is a mapcount, we can dereference the anon_vma after observing
     491             :  * those.
     492             :  */
     493           0 : struct anon_vma *folio_get_anon_vma(struct folio *folio)
     494             : {
     495           0 :         struct anon_vma *anon_vma = NULL;
     496             :         unsigned long anon_mapping;
     497             : 
     498             :         rcu_read_lock();
     499           0 :         anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
     500           0 :         if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     501             :                 goto out;
     502           0 :         if (!folio_mapped(folio))
     503             :                 goto out;
     504             : 
     505           0 :         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
     506           0 :         if (!atomic_inc_not_zero(&anon_vma->refcount)) {
     507             :                 anon_vma = NULL;
     508             :                 goto out;
     509             :         }
     510             : 
     511             :         /*
     512             :          * If this folio is still mapped, then its anon_vma cannot have been
     513             :          * freed.  But if it has been unmapped, we have no security against the
     514             :          * anon_vma structure being freed and reused (for another anon_vma:
     515             :          * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
     516             :          * above cannot corrupt).
     517             :          */
     518           0 :         if (!folio_mapped(folio)) {
     519           0 :                 rcu_read_unlock();
     520             :                 put_anon_vma(anon_vma);
     521             :                 return NULL;
     522             :         }
     523             : out:
     524             :         rcu_read_unlock();
     525             : 
     526           0 :         return anon_vma;
     527             : }
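
/*
 * Editor's note: a hypothetical caller-side sketch of the pattern required
 * by the comment above: take a reference, lock, and re-check folio_mapped()
 * before trusting anything reached through the returned anon_vma.
 */
static void anon_vma_walk_sketch(struct folio *folio)
{
        struct anon_vma *anon_vma = folio_get_anon_vma(folio);

        if (!anon_vma)
                return;

        anon_vma_lock_read(anon_vma);
        if (folio_mapped(folio)) {
                /*
                 * ... walk anon_vma->rb_root, verifying the folio is really
                 * mapped in each vma (cf. page_mapped_in_vma()) ...
                 */
        }
        anon_vma_unlock_read(anon_vma);
        put_anon_vma(anon_vma);
}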
     528             : 
     529             : /*
     530             :  * Similar to folio_get_anon_vma() except it locks the anon_vma.
     531             :  *
      532             :  * It's a little more complex as it tries to keep the fast path to a single
      533             :  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
      534             :  * reference like with folio_get_anon_vma() and then block on the rwsem
      535             :  * in the !rwc->try_lock case.
     536             :  */
     537           0 : struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
     538             :                                           struct rmap_walk_control *rwc)
     539             : {
     540           0 :         struct anon_vma *anon_vma = NULL;
     541             :         struct anon_vma *root_anon_vma;
     542             :         unsigned long anon_mapping;
     543             : 
     544             :         rcu_read_lock();
     545           0 :         anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
     546           0 :         if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     547             :                 goto out;
     548           0 :         if (!folio_mapped(folio))
     549             :                 goto out;
     550             : 
     551           0 :         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
     552           0 :         root_anon_vma = READ_ONCE(anon_vma->root);
     553           0 :         if (down_read_trylock(&root_anon_vma->rwsem)) {
     554             :                 /*
     555             :                  * If the folio is still mapped, then this anon_vma is still
      556             :                  * its anon_vma, and holding the rwsem ensures that it will
     557             :                  * not go away, see anon_vma_free().
     558             :                  */
     559           0 :                 if (!folio_mapped(folio)) {
     560           0 :                         up_read(&root_anon_vma->rwsem);
     561           0 :                         anon_vma = NULL;
     562             :                 }
     563             :                 goto out;
     564             :         }
     565             : 
     566           0 :         if (rwc && rwc->try_lock) {
     567           0 :                 anon_vma = NULL;
     568           0 :                 rwc->contended = true;
     569           0 :                 goto out;
     570             :         }
     571             : 
      572             :         /* trylock failed, we have to sleep */
     573           0 :         if (!atomic_inc_not_zero(&anon_vma->refcount)) {
     574             :                 anon_vma = NULL;
     575             :                 goto out;
     576             :         }
     577             : 
     578           0 :         if (!folio_mapped(folio)) {
     579           0 :                 rcu_read_unlock();
     580             :                 put_anon_vma(anon_vma);
     581             :                 return NULL;
     582             :         }
     583             : 
      584             :         /* we pinned the anon_vma, it's safe to sleep */
     585             :         rcu_read_unlock();
     586           0 :         anon_vma_lock_read(anon_vma);
     587             : 
     588           0 :         if (atomic_dec_and_test(&anon_vma->refcount)) {
     589             :                 /*
     590             :                  * Oops, we held the last refcount, release the lock
     591             :                  * and bail -- can't simply use put_anon_vma() because
     592             :                  * we'll deadlock on the anon_vma_lock_write() recursion.
     593             :                  */
     594           0 :                 anon_vma_unlock_read(anon_vma);
     595           0 :                 __put_anon_vma(anon_vma);
     596           0 :                 anon_vma = NULL;
     597             :         }
     598             : 
     599             :         return anon_vma;
     600             : 
     601             : out:
     602             :         rcu_read_unlock();
     603           0 :         return anon_vma;
     604             : }
     605             : 
     606             : #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
     607             : /*
      608             :  * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
      609             :  * was dirty when it was unmapped, it is important that it is flushed before
      610             :  * any IO is initiated on the page, to prevent lost writes. Similarly, it
      611             :  * must be flushed before freeing to prevent data leakage.
     612             :  */
     613             : void try_to_unmap_flush(void)
     614             : {
     615             :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     616             : 
     617             :         if (!tlb_ubc->flush_required)
     618             :                 return;
     619             : 
     620             :         arch_tlbbatch_flush(&tlb_ubc->arch);
     621             :         tlb_ubc->flush_required = false;
     622             :         tlb_ubc->writable = false;
     623             : }
     624             : 
     625             : /* Flush iff there are potentially writable TLB entries that can race with IO */
     626             : void try_to_unmap_flush_dirty(void)
     627             : {
     628             :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     629             : 
     630             :         if (tlb_ubc->writable)
     631             :                 try_to_unmap_flush();
     632             : }
     633             : 
     634             : /*
     635             :  * Bits 0-14 of mm->tlb_flush_batched record pending generations.
      636             :  * Bits 16-30 of mm->tlb_flush_batched record flushed generations.
     637             :  */
     638             : #define TLB_FLUSH_BATCH_FLUSHED_SHIFT   16
     639             : #define TLB_FLUSH_BATCH_PENDING_MASK                    \
     640             :         ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
     641             : #define TLB_FLUSH_BATCH_PENDING_LARGE                   \
     642             :         (TLB_FLUSH_BATCH_PENDING_MASK / 2)
     643             : 
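/*
 * Editor's note: a minimal sketch (hypothetical helper, not in rmap.c)
 * showing how the two generation counters are unpacked from the encoding
 * described above; flush_tlb_batched_pending() below does the same inline.
 */
static inline void tlb_flush_batched_decode(int batch, int *pending, int *flushed)
{
        *pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;        /* bits 0-14 */
        *flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;      /* bits 16-30 */
}
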
     644             : static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
     645             : {
     646             :         struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
     647             :         int batch, nbatch;
     648             : 
     649             :         arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
     650             :         tlb_ubc->flush_required = true;
     651             : 
     652             :         /*
     653             :          * Ensure compiler does not re-order the setting of tlb_flush_batched
     654             :          * before the PTE is cleared.
     655             :          */
     656             :         barrier();
     657             :         batch = atomic_read(&mm->tlb_flush_batched);
     658             : retry:
     659             :         if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
     660             :                 /*
     661             :                  * Prevent `pending' from catching up with `flushed' because of
     662             :                  * overflow.  Reset `pending' and `flushed' to be 1 and 0 if
     663             :                  * `pending' becomes large.
     664             :                  */
     665             :                 nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1);
     666             :                 if (nbatch != batch) {
     667             :                         batch = nbatch;
     668             :                         goto retry;
     669             :                 }
     670             :         } else {
     671             :                 atomic_inc(&mm->tlb_flush_batched);
     672             :         }
     673             : 
     674             :         /*
     675             :          * If the PTE was dirty then it's best to assume it's writable. The
     676             :          * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
     677             :          * before the page is queued for IO.
     678             :          */
     679             :         if (writable)
     680             :                 tlb_ubc->writable = true;
     681             : }
     682             : 
     683             : /*
     684             :  * Returns true if the TLB flush should be deferred to the end of a batch of
     685             :  * unmap operations to reduce IPIs.
     686             :  */
     687             : static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
     688             : {
     689             :         bool should_defer = false;
     690             : 
     691             :         if (!(flags & TTU_BATCH_FLUSH))
     692             :                 return false;
     693             : 
      694             :         /* If remote CPUs need to be flushed then defer the flush by batching it */
     695             :         if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
     696             :                 should_defer = true;
     697             :         put_cpu();
     698             : 
     699             :         return should_defer;
     700             : }
     701             : 
     702             : /*
      703             :  * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
     704             :  * releasing the PTL if TLB flushes are batched. It's possible for a parallel
     705             :  * operation such as mprotect or munmap to race between reclaim unmapping
     706             :  * the page and flushing the page. If this race occurs, it potentially allows
     707             :  * access to data via a stale TLB entry. Tracking all mm's that have TLB
     708             :  * batching in flight would be expensive during reclaim so instead track
     709             :  * whether TLB batching occurred in the past and if so then do a flush here
     710             :  * if required. This will cost one additional flush per reclaim cycle paid
      711             :  * by the first operation at risk, such as mprotect or munmap.
     712             :  *
     713             :  * This must be called under the PTL so that an access to tlb_flush_batched
     714             :  * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
     715             :  * via the PTL.
     716             :  */
     717             : void flush_tlb_batched_pending(struct mm_struct *mm)
     718             : {
     719             :         int batch = atomic_read(&mm->tlb_flush_batched);
     720             :         int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
     721             :         int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
     722             : 
     723             :         if (pending != flushed) {
     724             :                 flush_tlb_mm(mm);
     725             :                 /*
     726             :                  * If the new TLB flushing is pending during flushing, leave
     727             :                  * mm->tlb_flush_batched as is, to avoid losing flushing.
     728             :                  */
     729             :                 atomic_cmpxchg(&mm->tlb_flush_batched, batch,
     730             :                                pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
     731             :         }
     732             : }
     733             : #else
     734             : static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
     735             : {
     736             : }
     737             : 
     738             : static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
     739             : {
     740             :         return false;
     741             : }
     742             : #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
     743             : 
     744             : /*
     745             :  * At what user virtual address is page expected in vma?
     746             :  * Caller should check the page is actually part of the vma.
     747             :  */
     748           0 : unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
     749             : {
     750           0 :         struct folio *folio = page_folio(page);
     751           0 :         if (folio_test_anon(folio)) {
     752           0 :                 struct anon_vma *page__anon_vma = folio_anon_vma(folio);
     753             :                 /*
     754             :                  * Note: swapoff's unuse_vma() is more efficient with this
     755             :                  * check, and needs it to match anon_vma when KSM is active.
     756             :                  */
     757           0 :                 if (!vma->anon_vma || !page__anon_vma ||
     758           0 :                     vma->anon_vma->root != page__anon_vma->root)
     759             :                         return -EFAULT;
     760           0 :         } else if (!vma->vm_file) {
     761             :                 return -EFAULT;
     762           0 :         } else if (vma->vm_file->f_mapping != folio->mapping) {
     763             :                 return -EFAULT;
     764             :         }
     765             : 
     766           0 :         return vma_address(page, vma);
     767             : }
     768             : 
     769             : /*
     770             :  * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
     771             :  * NULL if it doesn't exist.  No guarantees / checks on what the pmd_t*
     772             :  * represents.
     773             :  */
     774           0 : pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
     775             : {
     776             :         pgd_t *pgd;
     777             :         p4d_t *p4d;
     778             :         pud_t *pud;
     779           0 :         pmd_t *pmd = NULL;
     780             : 
     781           0 :         pgd = pgd_offset(mm, address);
     782             :         if (!pgd_present(*pgd))
     783             :                 goto out;
     784             : 
     785           0 :         p4d = p4d_offset(pgd, address);
     786             :         if (!p4d_present(*p4d))
     787             :                 goto out;
     788             : 
     789           0 :         pud = pud_offset(p4d, address);
     790           0 :         if (!pud_present(*pud))
     791             :                 goto out;
     792             : 
     793           0 :         pmd = pmd_offset(pud, address);
     794             : out:
     795           0 :         return pmd;
     796             : }
     797             : 
     798             : struct folio_referenced_arg {
     799             :         int mapcount;
     800             :         int referenced;
     801             :         unsigned long vm_flags;
     802             :         struct mem_cgroup *memcg;
     803             : };
     804             : /*
     805             :  * arg: folio_referenced_arg will be passed
     806             :  */
     807           0 : static bool folio_referenced_one(struct folio *folio,
     808             :                 struct vm_area_struct *vma, unsigned long address, void *arg)
     809             : {
     810           0 :         struct folio_referenced_arg *pra = arg;
     811           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
     812           0 :         int referenced = 0;
     813             : 
     814           0 :         while (page_vma_mapped_walk(&pvmw)) {
     815           0 :                 address = pvmw.address;
     816             : 
     817           0 :                 if ((vma->vm_flags & VM_LOCKED) &&
     818           0 :                     (!folio_test_large(folio) || !pvmw.pte)) {
     819             :                         /* Restore the mlock which got missed */
     820           0 :                         mlock_vma_folio(folio, vma, !pvmw.pte);
     821           0 :                         page_vma_mapped_walk_done(&pvmw);
     822           0 :                         pra->vm_flags |= VM_LOCKED;
     823           0 :                         return false; /* To break the loop */
     824             :                 }
     825             : 
     826           0 :                 if (pvmw.pte) {
     827             :                         if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
     828             :                                 lru_gen_look_around(&pvmw);
     829             :                                 referenced++;
     830             :                         }
     831             : 
     832           0 :                         if (ptep_clear_flush_young_notify(vma, address,
     833             :                                                 pvmw.pte))
     834           0 :                                 referenced++;
     835             :                 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
     836             :                         if (pmdp_clear_flush_young_notify(vma, address,
     837             :                                                 pvmw.pmd))
     838             :                                 referenced++;
     839             :                 } else {
     840             :                         /* unexpected pmd-mapped folio? */
     841           0 :                         WARN_ON_ONCE(1);
     842             :                 }
     843             : 
     844           0 :                 pra->mapcount--;
     845             :         }
     846             : 
     847             :         if (referenced)
     848             :                 folio_clear_idle(folio);
     849           0 :         if (folio_test_clear_young(folio))
     850             :                 referenced++;
     851             : 
     852           0 :         if (referenced) {
     853           0 :                 pra->referenced++;
     854           0 :                 pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
     855             :         }
     856             : 
     857           0 :         if (!pra->mapcount)
     858             :                 return false; /* To break the loop */
     859             : 
     860           0 :         return true;
     861             : }
     862             : 
     863           0 : static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
     864             : {
     865           0 :         struct folio_referenced_arg *pra = arg;
     866           0 :         struct mem_cgroup *memcg = pra->memcg;
     867             : 
     868             :         /*
     869             :          * Ignore references from this mapping if it has no recency. If the
     870             :          * folio has been used in another mapping, we will catch it; if this
     871             :          * other mapping is already gone, the unmap path will have set the
     872             :          * referenced flag or activated the folio in zap_pte_range().
     873             :          */
     874           0 :         if (!vma_has_recency(vma))
     875             :                 return true;
     876             : 
     877             :         /*
     878             :          * If we are reclaiming on behalf of a cgroup, skip counting on behalf
     879             :          * of references from different cgroups.
     880             :          */
     881             :         if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
     882             :                 return true;
     883             : 
     884             :         return false;
     885             : }
     886             : 
     887             : /**
     888             :  * folio_referenced() - Test if the folio was referenced.
     889             :  * @folio: The folio to test.
     890             :  * @is_locked: Caller holds lock on the folio.
     891             :  * @memcg: target memory cgroup
     892             :  * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
     893             :  *
      894             :  * Quick test_and_clear_referenced for all mappings of a folio.
     895             :  *
     896             :  * Return: The number of mappings which referenced the folio. Return -1 if
     897             :  * the function bailed out due to rmap lock contention.
     898             :  */
     899           0 : int folio_referenced(struct folio *folio, int is_locked,
     900             :                      struct mem_cgroup *memcg, unsigned long *vm_flags)
     901             : {
     902           0 :         int we_locked = 0;
     903           0 :         struct folio_referenced_arg pra = {
     904           0 :                 .mapcount = folio_mapcount(folio),
     905             :                 .memcg = memcg,
     906             :         };
     907           0 :         struct rmap_walk_control rwc = {
     908             :                 .rmap_one = folio_referenced_one,
     909             :                 .arg = (void *)&pra,
     910             :                 .anon_lock = folio_lock_anon_vma_read,
     911             :                 .try_lock = true,
     912             :                 .invalid_vma = invalid_folio_referenced_vma,
     913             :         };
     914             : 
     915           0 :         *vm_flags = 0;
     916           0 :         if (!pra.mapcount)
     917             :                 return 0;
     918             : 
     919           0 :         if (!folio_raw_mapping(folio))
     920             :                 return 0;
     921             : 
     922           0 :         if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
     923           0 :                 we_locked = folio_trylock(folio);
     924           0 :                 if (!we_locked)
     925             :                         return 1;
     926             :         }
     927             : 
     928           0 :         rmap_walk(folio, &rwc);
     929           0 :         *vm_flags = pra.vm_flags;
     930             : 
     931           0 :         if (we_locked)
     932           0 :                 folio_unlock(folio);
     933             : 
     934           0 :         return rwc.contended ? -1 : pra.referenced;
     935             : }
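
/*
 * Editor's note: hypothetical caller sketch (names are illustrative, not
 * from mm/vmscan.c) showing how the -1 "rmap lock contended" return is
 * treated differently from "zero references".
 */
static bool folio_recently_used_sketch(struct folio *folio, struct mem_cgroup *memcg)
{
        unsigned long vm_flags;
        int refs = folio_referenced(folio, folio_test_locked(folio),
                                    memcg, &vm_flags);

        if (refs == -1)
                return true;    /* contended: be conservative and keep the folio */

        return refs > 0;
}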
     936             : 
     937           0 : static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
     938             : {
     939           0 :         int cleaned = 0;
     940           0 :         struct vm_area_struct *vma = pvmw->vma;
     941             :         struct mmu_notifier_range range;
     942           0 :         unsigned long address = pvmw->address;
     943             : 
     944             :         /*
      945             :          * We have to assume the worst case, i.e. pmd, for invalidation. Note that
      946             :          * the folio cannot be freed from this function.
     947             :          */
     948             :         mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
     949             :                                 vma->vm_mm, address, vma_address_end(pvmw));
     950             :         mmu_notifier_invalidate_range_start(&range);
     951             : 
     952           0 :         while (page_vma_mapped_walk(pvmw)) {
     953           0 :                 int ret = 0;
     954             : 
     955           0 :                 address = pvmw->address;
     956           0 :                 if (pvmw->pte) {
     957             :                         pte_t entry;
     958           0 :                         pte_t *pte = pvmw->pte;
     959             : 
     960           0 :                         if (!pte_dirty(*pte) && !pte_write(*pte))
     961           0 :                                 continue;
     962             : 
     963           0 :                         flush_cache_page(vma, address, pte_pfn(*pte));
     964           0 :                         entry = ptep_clear_flush(vma, address, pte);
     965           0 :                         entry = pte_wrprotect(entry);
     966           0 :                         entry = pte_mkclean(entry);
     967           0 :                         set_pte_at(vma->vm_mm, address, pte, entry);
     968             :                         ret = 1;
     969             :                 } else {
     970             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     971             :                         pmd_t *pmd = pvmw->pmd;
     972             :                         pmd_t entry;
     973             : 
     974             :                         if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
     975             :                                 continue;
     976             : 
     977             :                         flush_cache_range(vma, address,
     978             :                                           address + HPAGE_PMD_SIZE);
     979             :                         entry = pmdp_invalidate(vma, address, pmd);
     980             :                         entry = pmd_wrprotect(entry);
     981             :                         entry = pmd_mkclean(entry);
     982             :                         set_pmd_at(vma->vm_mm, address, pmd, entry);
     983             :                         ret = 1;
     984             : #else
     985             :                         /* unexpected pmd-mapped folio? */
     986           0 :                         WARN_ON_ONCE(1);
     987             : #endif
     988             :                 }
     989             : 
     990             :                 /*
     991             :                  * No need to call mmu_notifier_invalidate_range() as we are
     992             :                  * downgrading page table protection, not changing it to point
     993             :                  * to a new page.
     994             :                  *
     995             :                  * See Documentation/mm/mmu_notifier.rst
     996             :                  */
     997           0 :                 if (ret)
     998           0 :                         cleaned++;
     999             :         }
    1000             : 
    1001           0 :         mmu_notifier_invalidate_range_end(&range);
    1002             : 
    1003           0 :         return cleaned;
    1004             : }
    1005             : 
    1006           0 : static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
    1007             :                              unsigned long address, void *arg)
    1008             : {
    1009           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
    1010           0 :         int *cleaned = arg;
    1011             : 
    1012           0 :         *cleaned += page_vma_mkclean_one(&pvmw);
    1013             : 
    1014           0 :         return true;
    1015             : }
    1016             : 
    1017           0 : static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
    1018             : {
    1019           0 :         if (vma->vm_flags & VM_SHARED)
    1020             :                 return false;
    1021             : 
    1022           0 :         return true;
    1023             : }
    1024             : 
    1025           0 : int folio_mkclean(struct folio *folio)
    1026             : {
    1027           0 :         int cleaned = 0;
    1028             :         struct address_space *mapping;
    1029           0 :         struct rmap_walk_control rwc = {
    1030             :                 .arg = (void *)&cleaned,
    1031             :                 .rmap_one = page_mkclean_one,
    1032             :                 .invalid_vma = invalid_mkclean_vma,
    1033             :         };
    1034             : 
    1035           0 :         BUG_ON(!folio_test_locked(folio));
    1036             : 
    1037           0 :         if (!folio_mapped(folio))
    1038             :                 return 0;
    1039             : 
    1040           0 :         mapping = folio_mapping(folio);
    1041           0 :         if (!mapping)
    1042             :                 return 0;
    1043             : 
    1044           0 :         rmap_walk(folio, &rwc);
    1045             : 
    1046           0 :         return cleaned;
    1047             : }
    1048             : EXPORT_SYMBOL_GPL(folio_mkclean);
    1049             : 
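folio_mkclean() walks every shared, file-backed mapping of a locked folio and both cleans and write-protects the entries it finds, returning how many it touched; anonymous or unmapped folios fall out early with 0. A minimal sketch of the calling convention, assuming the caller already holds the folio lock (the helper is hypothetical; real users sit in the writeback path, e.g. folio_clear_dirty_for_io()):

        #include <linux/rmap.h>
        #include <linux/mm.h>

        /* Sketch: write-protect all shared mappings before starting writeback. */
        static int protect_for_writeback(struct folio *folio)
        {
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

                /* Returns the number of PTEs/PMDs cleaned; 0 if unmapped or anon. */
                return folio_mkclean(folio);
        }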
    1050             : /**
    1051             :  * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
    1052             :  *                     [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
    1053             :  *                     within the @vma of shared mappings. Since clean PTEs
    1054             :  *                     should also be read-only, they are write-protected too.
    1055             :  * @pfn: start pfn.
    1056             :  * @nr_pages: number of physically contiguous pages starting with @pfn.
    1057             :  * @pgoff: page offset that @pfn is mapped with.
    1058             :  * @vma: vma that @pfn is mapped within.
    1059             :  *
    1060             :  * Returns the number of cleaned PTEs (including PMDs).
    1061             :  */
    1062           0 : int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
    1063             :                       struct vm_area_struct *vma)
    1064             : {
    1065           0 :         struct page_vma_mapped_walk pvmw = {
    1066             :                 .pfn            = pfn,
    1067             :                 .nr_pages       = nr_pages,
    1068             :                 .pgoff          = pgoff,
    1069             :                 .vma            = vma,
    1070             :                 .flags          = PVMW_SYNC,
    1071             :         };
    1072             : 
    1073           0 :         if (invalid_mkclean_vma(vma, NULL))
    1074             :                 return 0;
    1075             : 
    1076           0 :         pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);
    1077             :         VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
    1078             : 
    1079           0 :         return page_vma_mkclean_one(&pvmw);
    1080             : }
    1081             : 
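Unlike folio_mkclean(), pfn_mkclean_range() starts from a raw pfn range rather than a folio, so the caller supplies the pgoff/vma pair itself; this is the shape used by pfn-backed mappings such as DAX. A hedged sketch of cleaning all shared mappings of a range by iterating the owning address_space, assuming the mapping's i_mmap tree is the right source of VMAs (modelled loosely on fs/dax.c, not copied from it):

        #include <linux/rmap.h>
        #include <linux/fs.h>
        #include <linux/mm.h>

        /* Sketch: clean every shared mapping of @nr_pages pfns starting at @pfn. */
        static int clean_pfn_range(struct address_space *mapping, unsigned long pfn,
                                   unsigned long nr_pages, pgoff_t pgoff)
        {
                struct vm_area_struct *vma;
                int cleaned = 0;

                i_mmap_lock_read(mapping);
                vma_interval_tree_foreach(vma, &mapping->i_mmap,
                                          pgoff, pgoff + nr_pages - 1)
                        cleaned += pfn_mkclean_range(pfn, nr_pages, pgoff, vma);
                i_mmap_unlock_read(mapping);

                return cleaned;
        }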
    1082           0 : int folio_total_mapcount(struct folio *folio)
    1083             : {
    1084           0 :         int mapcount = folio_entire_mapcount(folio);
    1085             :         int nr_pages;
    1086             :         int i;
    1087             : 
    1088             :         /* In the common case, avoid the loop when no pages mapped by PTE */
    1089           0 :         if (folio_nr_pages_mapped(folio) == 0)
    1090             :                 return mapcount;
    1091             :         /*
    1092             :          * Add all the PTE mappings of those pages mapped by PTE.
    1093             :          * Limit the loop to folio_nr_pages_mapped()?
    1094             :          * Perhaps: given all the raciness, that may be a good or a bad idea.
    1095             :          */
    1096           0 :         nr_pages = folio_nr_pages(folio);
    1097           0 :         for (i = 0; i < nr_pages; i++)
    1098           0 :                 mapcount += atomic_read(&folio_page(folio, i)->_mapcount);
    1099             : 
    1100             :         /* But each of those _mapcounts was based on -1 */
    1101           0 :         mapcount += nr_pages;
    1102           0 :         return mapcount;
    1103             : }
    1104             : 
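The compensation step at the end of folio_total_mapcount() exists because each per-page _mapcount is based at -1. A worked example under an assumed state (illustration only, not data from this report): a 4-page folio with no PMD mapping whose first two pages are each mapped by a single PTE.

        /*
         * Assumed state:
         *   folio_entire_mapcount()        =  0             (no PMD mapping)
         *   atomic_read(&page->_mapcount)  =  0, 0, -1, -1  (fields based at -1)
         *
         * folio_total_mapcount() = 0 + (0 + 0 - 1 - 1) + 4 = 2  (two PTE mappings)
         */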
    1105             : /**
    1106             :  * page_move_anon_rmap - move a page to our anon_vma
    1107             :  * @page:       the page to move to our anon_vma
    1108             :  * @vma:        the vma the page belongs to
    1109             :  *
    1110             :  * When a page belongs exclusively to one process after a COW event,
    1111             :  * that page can be moved into the anon_vma that belongs to just that
    1112             :  * process, so the rmap code will not search the parent or sibling
    1113             :  * processes.
    1114             :  */
    1115           0 : void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
    1116             : {
    1117           0 :         void *anon_vma = vma->anon_vma;
    1118           0 :         struct folio *folio = page_folio(page);
    1119             : 
    1120             :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
    1121             :         VM_BUG_ON_VMA(!anon_vma, vma);
    1122             : 
    1123           0 :         anon_vma += PAGE_MAPPING_ANON;
    1124             :         /*
    1125             :          * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
    1126             :          * simultaneously, so a concurrent reader (eg folio_referenced()'s
    1127             :          * folio_test_anon()) will not see one without the other.
    1128             :          */
    1129           0 :         WRITE_ONCE(folio->mapping, anon_vma);
    1130           0 :         SetPageAnonExclusive(page);
    1131           0 : }
    1132             : 
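page_move_anon_rmap() is used on the write-fault reuse path: once a COW page is exclusively owned, rebinding it to this process's anon_vma keeps later rmap walks from visiting parent or sibling processes. A hedged sketch of that decision point; the helper and the page_count() test are simplifying assumptions standing in for the real reuse checks of the write-fault code:

        #include <linux/rmap.h>
        #include <linux/mm.h>

        /*
         * Sketch: if the COW page turned out to be exclusively ours, rebind its
         * rmap before reusing it.  Requires the folio lock; the page_count()
         * check is a stand-in for the real reuse test in the write-fault path.
         */
        static void maybe_take_over_anon_page(struct page *page,
                                              struct vm_area_struct *vma)
        {
                if (page_count(page) == 1)
                        page_move_anon_rmap(page, vma);
        }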
    1133             : /**
    1134             :  * __page_set_anon_rmap - set up new anonymous rmap
    1135             :  * @folio:      Folio which contains page.
    1136             :  * @page:       Page to add to rmap.
    1137             :  * @vma:        VM area to add page to.
    1138             :  * @address:    User virtual address of the mapping
    1139             :  * @exclusive:  the page is exclusively owned by the current process
    1140             :  */
    1141           0 : static void __page_set_anon_rmap(struct folio *folio, struct page *page,
    1142             :         struct vm_area_struct *vma, unsigned long address, int exclusive)
    1143             : {
    1144           0 :         struct anon_vma *anon_vma = vma->anon_vma;
    1145             : 
    1146           0 :         BUG_ON(!anon_vma);
    1147             : 
    1148           0 :         if (folio_test_anon(folio))
    1149             :                 goto out;
    1150             : 
    1151             :         /*
    1152             :          * If the page isn't exclusively mapped into this vma,
    1153             :          * we must use the _oldest_ possible anon_vma for the
    1154             :          * page mapping!
    1155             :          */
    1156           0 :         if (!exclusive)
    1157           0 :                 anon_vma = anon_vma->root;
    1158             : 
    1159             :         /*
    1160             :          * page_idle does a lockless/optimistic rmap scan on folio->mapping.
    1161             :          * Make sure the compiler doesn't split the stores of anon_vma and
    1162             :          * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
    1163             :          * could mistake the mapping for a struct address_space and crash.
    1164             :          */
    1165           0 :         anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    1166           0 :         WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
    1167           0 :         folio->index = linear_page_index(vma, address);
    1168             : out:
    1169           0 :         if (exclusive)
    1170             :                 SetPageAnonExclusive(page);
    1171           0 : }
    1172             : 
    1173             : /**
    1174             :  * __page_check_anon_rmap - sanity check anonymous rmap addition
    1175             :  * @page:       the page to add the mapping to
    1176             :  * @vma:        the vm area in which the mapping is added
    1177             :  * @address:    the user virtual address mapped
    1178             :  */
    1179             : static void __page_check_anon_rmap(struct page *page,
    1180             :         struct vm_area_struct *vma, unsigned long address)
    1181             : {
    1182           0 :         struct folio *folio = page_folio(page);
    1183             :         /*
    1184             :          * The page's anon-rmap details (mapping and index) are guaranteed to
    1185             :          * be set up correctly at this point.
    1186             :          *
    1187             :          * We have exclusion against page_add_anon_rmap because the caller
    1188             :          * always holds the page locked.
    1189             :          *
    1190             :          * We have exclusion against page_add_new_anon_rmap because those pages
    1191             :          * are initially only visible via the pagetables, and the pte is locked
    1192             :          * over the call to page_add_new_anon_rmap.
    1193             :          */
    1194             :         VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
    1195             :                         folio);
    1196             :         VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
    1197             :                        page);
    1198             : }
    1199             : 
    1200             : /**
    1201             :  * page_add_anon_rmap - add pte mapping to an anonymous page
    1202             :  * @page:       the page to add the mapping to
    1203             :  * @vma:        the vm area in which the mapping is added
    1204             :  * @address:    the user virtual address mapped
    1205             :  * @flags:      the rmap flags
    1206             :  *
    1207             :  * The caller needs to hold the pte lock, and the page must be locked in
    1208             :  * the anon_vma case: to serialize mapping, index checking after setting,
    1209             :  * and to ensure that PageAnon is not being upgraded racily to PageKsm
    1210             :  * (but PageKsm is never downgraded to PageAnon).
    1211             :  */
    1212           0 : void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
    1213             :                 unsigned long address, rmap_t flags)
    1214             : {
    1215           0 :         struct folio *folio = page_folio(page);
    1216           0 :         atomic_t *mapped = &folio->_nr_pages_mapped;
    1217           0 :         int nr = 0, nr_pmdmapped = 0;
    1218           0 :         bool compound = flags & RMAP_COMPOUND;
    1219           0 :         bool first = true;
    1220             : 
    1221             :         /* Is page being mapped by PTE? Is this its first map to be added? */
    1222           0 :         if (likely(!compound)) {
    1223           0 :                 first = atomic_inc_and_test(&page->_mapcount);
    1224           0 :                 nr = first;
    1225           0 :                 if (first && folio_test_large(folio)) {
    1226           0 :                         nr = atomic_inc_return_relaxed(mapped);
    1227           0 :                         nr = (nr < COMPOUND_MAPPED);
    1228             :                 }
    1229             :         } else if (folio_test_pmd_mappable(folio)) {
    1230             :                 /* That test is redundant: it's for safety or to optimize out */
    1231             : 
    1232             :                 first = atomic_inc_and_test(&folio->_entire_mapcount);
    1233             :                 if (first) {
    1234             :                         nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
    1235             :                         if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
    1236             :                                 nr_pmdmapped = folio_nr_pages(folio);
    1237             :                                 nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
    1238             :                                 /* Raced ahead of a remove and another add? */
    1239             :                                 if (unlikely(nr < 0))
    1240             :                                         nr = 0;
    1241             :                         } else {
    1242             :                                 /* Raced ahead of a remove of COMPOUND_MAPPED */
    1243             :                                 nr = 0;
    1244             :                         }
    1245             :                 }
    1246             :         }
    1247             : 
    1248             :         VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
    1249             :         VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
    1250             : 
    1251             :         if (nr_pmdmapped)
    1252             :                 __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);
    1253           0 :         if (nr)
    1254           0 :                 __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
    1255             : 
    1256           0 :         if (likely(!folio_test_ksm(folio))) {
    1257             :                 /* address might be in next vma when migration races vma_merge */
    1258           0 :                 if (first)
    1259           0 :                         __page_set_anon_rmap(folio, page, vma, address,
    1260           0 :                                              !!(flags & RMAP_EXCLUSIVE));
    1261             :                 else
    1262           0 :                         __page_check_anon_rmap(page, vma, address);
    1263             :         }
    1264             : 
    1265           0 :         mlock_vma_folio(folio, vma, compound);
    1266           0 : }
    1267             : 
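page_add_anon_rmap() above must be called with the pte lock held, and with the page locked when it may establish a new anon mapping. A hedged sketch of the pattern a swap-in style fault path follows when re-inserting an existing anonymous page; the helper and its surrounding locking are simplified assumptions, not a copy of do_swap_page():

        #include <linux/rmap.h>
        #include <linux/mm.h>

        /*
         * Sketch: re-insert an existing anonymous page at @addr.  Assumes the pte
         * lock is held for @pte and that the caller has established exclusive
         * ownership (hence RMAP_EXCLUSIVE).
         */
        static void map_existing_anon_page(struct page *page,
                                           struct vm_area_struct *vma,
                                           unsigned long addr, pte_t *pte)
        {
                pte_t entry = mk_pte(page, vma->vm_page_prot);

                page_add_anon_rmap(page, vma, addr, RMAP_EXCLUSIVE);
                set_pte_at(vma->vm_mm, addr, pte, entry);
                update_mmu_cache(vma, addr, pte);
        }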
    1268             : /**
    1269             :  * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
    1270             :  * @folio:      The folio to add the mapping to.
    1271             :  * @vma:        the vm area in which the mapping is added
    1272             :  * @address:    the user virtual address mapped
    1273             :  *
    1274             :  * Like page_add_anon_rmap() but must only be called on *new* folios.
    1275             :  * This means the inc-and-test can be bypassed.
    1276             :  * The folio does not have to be locked.
    1277             :  *
    1278             :  * If the folio is large, it is accounted as a THP.  As the folio
    1279             :  * is new, it's assumed to be mapped exclusively by a single process.
    1280             :  */
    1281           0 : void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
    1282             :                 unsigned long address)
    1283             : {
    1284             :         int nr;
    1285             : 
    1286             :         VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
    1287           0 :         __folio_set_swapbacked(folio);
    1288             : 
    1289           0 :         if (likely(!folio_test_pmd_mappable(folio))) {
    1290             :                 /* increment count (starts at -1) */
    1291           0 :                 atomic_set(&folio->_mapcount, 0);
    1292           0 :                 nr = 1;
    1293             :         } else {
    1294             :                 /* increment count (starts at -1) */
    1295             :                 atomic_set(&folio->_entire_mapcount, 0);
    1296             :                 atomic_set(&folio->_nr_pages_mapped, COMPOUND_MAPPED);
    1297             :                 nr = folio_nr_pages(folio);
    1298             :                 __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
    1299             :         }
    1300             : 
    1301           0 :         __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
    1302           0 :         __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
    1303           0 : }
    1304             : 
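folio_add_new_anon_rmap() above skips the inc-and-test because the folio is brand new and cannot yet be mapped anywhere else. A hedged sketch of the usual anonymous-fault pattern around it; error handling and the userfaultfd checks of the real path are omitted, and the helper name is an assumption:

        #include <linux/rmap.h>
        #include <linux/mm.h>
        #include <linux/swap.h>         /* folio_add_lru_vma() */
        #include <linux/highmem.h>      /* vma_alloc_zeroed_movable_folio() */

        /*
         * Sketch: install a freshly allocated, zeroed folio at @addr.  Assumes
         * the pte lock is held and the slot is still pte_none().
         */
        static void map_new_anon_folio(struct vm_area_struct *vma,
                                       unsigned long addr, pte_t *pte)
        {
                struct folio *folio = vma_alloc_zeroed_movable_folio(vma, addr);
                pte_t entry;

                if (!folio)
                        return;

                entry = mk_pte(&folio->page, vma->vm_page_prot);
                if (vma->vm_flags & VM_WRITE)
                        entry = pte_mkwrite(pte_mkdirty(entry));

                inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
                folio_add_new_anon_rmap(folio, vma, addr);
                folio_add_lru_vma(folio, vma);
                set_pte_at(vma->vm_mm, addr, pte, entry);
        }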
    1305             : /**
    1306             :  * page_add_file_rmap - add pte mapping to a file page
    1307             :  * @page:       the page to add the mapping to
    1308             :  * @vma:        the vm area in which the mapping is added
    1309             :  * @compound:   charge the page as compound or small page
    1310             :  *
    1311             :  * The caller needs to hold the pte lock.
    1312             :  */
    1313           0 : void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
    1314             :                 bool compound)
    1315             : {
    1316           0 :         struct folio *folio = page_folio(page);
    1317           0 :         atomic_t *mapped = &folio->_nr_pages_mapped;
    1318           0 :         int nr = 0, nr_pmdmapped = 0;
    1319             :         bool first;
    1320             : 
    1321             :         VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
    1322             : 
    1323             :         /* Is page being mapped by PTE? Is this its first map to be added? */
    1324           0 :         if (likely(!compound)) {
    1325           0 :                 first = atomic_inc_and_test(&page->_mapcount);
    1326           0 :                 nr = first;
    1327           0 :                 if (first && folio_test_large(folio)) {
    1328           0 :                         nr = atomic_inc_return_relaxed(mapped);
    1329           0 :                         nr = (nr < COMPOUND_MAPPED);
    1330             :                 }
    1331             :         } else if (folio_test_pmd_mappable(folio)) {
    1332             :                 /* That test is redundant: it's for safety or to optimize out */
    1333             : 
    1334             :                 first = atomic_inc_and_test(&folio->_entire_mapcount);
    1335             :                 if (first) {
    1336             :                         nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
    1337             :                         if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
    1338             :                                 nr_pmdmapped = folio_nr_pages(folio);
    1339             :                                 nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
    1340             :                                 /* Raced ahead of a remove and another add? */
    1341             :                                 if (unlikely(nr < 0))
    1342             :                                         nr = 0;
    1343             :                         } else {
    1344             :                                 /* Raced ahead of a remove of COMPOUND_MAPPED */
    1345             :                                 nr = 0;
    1346             :                         }
    1347             :                 }
    1348             :         }
    1349             : 
    1350             :         if (nr_pmdmapped)
    1351             :                 __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ?
    1352             :                         NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped);
    1353           0 :         if (nr)
    1354           0 :                 __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);
    1355             : 
    1356           0 :         mlock_vma_folio(folio, vma, compound);
    1357           0 : }
    1358             : 
    1359             : /**
    1360             :  * page_remove_rmap - take down pte mapping from a page
    1361             :  * @page:       page to remove mapping from
    1362             :  * @vma:        the vm area from which the mapping is removed
    1363             :  * @compound:   uncharge the page as compound or small page
    1364             :  *
    1365             :  * The caller needs to hold the pte lock.
    1366             :  */
    1367           0 : void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
    1368             :                 bool compound)
    1369             : {
    1370           0 :         struct folio *folio = page_folio(page);
    1371           0 :         atomic_t *mapped = &folio->_nr_pages_mapped;
    1372           0 :         int nr = 0, nr_pmdmapped = 0;
    1373             :         bool last;
    1374             :         enum node_stat_item idx;
    1375             : 
    1376             :         VM_BUG_ON_PAGE(compound && !PageHead(page), page);
    1377             : 
    1378             :         /* Hugetlb pages are not counted in NR_*MAPPED */
    1379           0 :         if (unlikely(folio_test_hugetlb(folio))) {
    1380             :                 /* hugetlb pages are always mapped with pmds */
    1381             :                 atomic_dec(&folio->_entire_mapcount);
    1382             :                 return;
    1383             :         }
    1384             : 
    1385             :         /* Is page being unmapped by PTE? Is this its last map to be removed? */
    1386           0 :         if (likely(!compound)) {
    1387           0 :                 last = atomic_add_negative(-1, &page->_mapcount);
    1388           0 :                 nr = last;
    1389           0 :                 if (last && folio_test_large(folio)) {
    1390           0 :                         nr = atomic_dec_return_relaxed(mapped);
    1391           0 :                         nr = (nr < COMPOUND_MAPPED);
    1392             :                 }
    1393             :         } else if (folio_test_pmd_mappable(folio)) {
    1394             :                 /* That test is redundant: it's for safety or to optimize out */
    1395             : 
    1396             :                 last = atomic_add_negative(-1, &folio->_entire_mapcount);
    1397             :                 if (last) {
    1398             :                         nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped);
    1399             :                         if (likely(nr < COMPOUND_MAPPED)) {
    1400             :                                 nr_pmdmapped = folio_nr_pages(folio);
    1401             :                                 nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
    1402             :                                 /* Raced ahead of another remove and an add? */
    1403             :                                 if (unlikely(nr < 0))
    1404             :                                         nr = 0;
    1405             :                         } else {
    1406             :                                 /* An add of COMPOUND_MAPPED raced ahead */
    1407             :                                 nr = 0;
    1408             :                         }
    1409             :                 }
    1410             :         }
    1411             : 
    1412             :         if (nr_pmdmapped) {
    1413             :                 if (folio_test_anon(folio))
    1414             :                         idx = NR_ANON_THPS;
    1415             :                 else if (folio_test_swapbacked(folio))
    1416             :                         idx = NR_SHMEM_PMDMAPPED;
    1417             :                 else
    1418             :                         idx = NR_FILE_PMDMAPPED;
    1419             :                 __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped);
    1420             :         }
    1421           0 :         if (nr) {
    1422           0 :                 idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
    1423           0 :                 __lruvec_stat_mod_folio(folio, idx, -nr);
    1424             : 
    1425             :                 /*
    1426             :                  * Queue anon THP for deferred split if at least one
    1427             :                  * page of the folio is unmapped and at least one page
    1428             :                  * is still mapped.
    1429             :                  */
    1430           0 :                 if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
    1431             :                         if (!compound || nr < nr_pmdmapped)
    1432             :                                 deferred_split_folio(folio);
    1433             :         }
    1434             : 
    1435             :         /*
    1436             :          * It would be tidy to reset folio_test_anon mapping when fully
    1437             :          * unmapped, but that might overwrite a racing page_add_anon_rmap
    1438             :          * which increments mapcount after us but sets mapping before us:
    1439             :          * so leave the reset to free_pages_prepare, and remember that
    1440             :          * it's only reliable while mapped.
    1441             :          */
    1442             : 
    1443           0 :         munlock_vma_folio(folio, vma, compound);
    1444             : }
    1445             : 
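page_remove_rmap() above is the inverse bookkeeping step: it is called under the pte lock after the mapping itself has been torn down. A hedged sketch of the zap-style sequence it sits in; the TLB batching and rss accounting of the real zap path are collapsed into single calls, and the helper is hypothetical:

        #include <linux/rmap.h>
        #include <linux/mm.h>

        /*
         * Sketch: tear down a single present pte at @addr and drop its rmap.
         * Assumes the pte lock is held and @page is the page the pte maps.
         */
        static void unmap_one_pte(struct vm_area_struct *vma, unsigned long addr,
                                  pte_t *pte, struct page *page)
        {
                pte_t pteval = ptep_clear_flush(vma, addr, pte);

                if (pte_dirty(pteval))
                        set_page_dirty(page);
                dec_mm_counter(vma->vm_mm, mm_counter(page));
                page_remove_rmap(page, vma, false);
                put_page(page);
        }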
    1446             : /*
    1447             :  * @arg: enum ttu_flags is passed in this argument
    1448             :  */
    1449           0 : static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
    1450             :                      unsigned long address, void *arg)
    1451             : {
    1452           0 :         struct mm_struct *mm = vma->vm_mm;
    1453           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
    1454             :         pte_t pteval;
    1455             :         struct page *subpage;
    1456           0 :         bool anon_exclusive, ret = true;
    1457             :         struct mmu_notifier_range range;
    1458           0 :         enum ttu_flags flags = (enum ttu_flags)(long)arg;
    1459             : 
    1460             :         /*
    1461             :          * When racing against e.g. zap_pte_range() on another cpu,
    1462             :          * in between its ptep_get_and_clear_full() and page_remove_rmap(),
    1463             :          * try_to_unmap() may return before page_mapped() has become false,
    1464             :          * if page table locking is skipped: use TTU_SYNC to wait for that.
    1465             :          */
    1466           0 :         if (flags & TTU_SYNC)
    1467           0 :                 pvmw.flags = PVMW_SYNC;
    1468             : 
    1469             :         if (flags & TTU_SPLIT_HUGE_PMD)
    1470             :                 split_huge_pmd_address(vma, address, false, folio);
    1471             : 
    1472             :         /*
    1473             :          * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
    1474             :          * For hugetlb, it could be much worse if we need to do pud
    1475             :          * invalidation in the case of pmd sharing.
    1476             :          *
    1477             :          * Note that the folio cannot be freed in this function, as the caller
    1478             :          * of try_to_unmap() must hold a reference on the folio.
    1479             :          */
    1480             :         range.end = vma_address_end(&pvmw);
    1481             :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
    1482             :                                 address, range.end);
    1483             :         if (folio_test_hugetlb(folio)) {
    1484             :                 /*
    1485             :                  * If sharing is possible, start and end will be adjusted
    1486             :                  * accordingly.
    1487             :                  */
    1488             :                 adjust_range_if_pmd_sharing_possible(vma, &range.start,
    1489             :                                                      &range.end);
    1490             :         }
    1491             :         mmu_notifier_invalidate_range_start(&range);
    1492             : 
    1493           0 :         while (page_vma_mapped_walk(&pvmw)) {
    1494             :                 /* Unexpected PMD-mapped THP? */
    1495             :                 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
    1496             : 
    1497             :                 /*
    1498             :                  * If the folio is in an mlock()d vma, we must not swap it out.
    1499             :                  */
    1500           0 :                 if (!(flags & TTU_IGNORE_MLOCK) &&
    1501           0 :                     (vma->vm_flags & VM_LOCKED)) {
    1502             :                         /* Restore the mlock which got missed */
    1503           0 :                         mlock_vma_folio(folio, vma, false);
    1504           0 :                         page_vma_mapped_walk_done(&pvmw);
    1505             :                         ret = false;
    1506             :                         break;
    1507             :                 }
    1508             : 
    1509           0 :                 subpage = folio_page(folio,
    1510             :                                         pte_pfn(*pvmw.pte) - folio_pfn(folio));
    1511           0 :                 address = pvmw.address;
    1512           0 :                 anon_exclusive = folio_test_anon(folio) &&
    1513           0 :                                  PageAnonExclusive(subpage);
    1514             : 
    1515           0 :                 if (folio_test_hugetlb(folio)) {
    1516             :                         bool anon = folio_test_anon(folio);
    1517             : 
    1518             :                         /*
    1519             :                          * try_to_unmap() is only passed a hugetlb page
    1520             :                          * in the case where the hugetlb page is poisoned.
    1521             :                          */
    1522             :                         VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
    1523             :                         /*
    1524             :                          * huge_pmd_unshare may unmap an entire PMD page.
    1525             :                          * There is no way of knowing exactly which PMDs may
    1526             :                          * be cached for this mm, so we must flush them all.
    1527             :                          * start/end were already adjusted above to cover this
    1528             :                          * range.
    1529             :                          */
    1530             :                         flush_cache_range(vma, range.start, range.end);
    1531             : 
    1532             :                         /*
    1533             :                          * To call huge_pmd_unshare, i_mmap_rwsem must be
    1534             :                          * held in write mode.  Caller needs to explicitly
    1535             :                          * do this outside rmap routines.
    1536             :                          *
    1537             :                          * We also must hold hugetlb vma_lock in write mode.
    1538             :                          * Lock order dictates acquiring vma_lock BEFORE
    1539             :                          * i_mmap_rwsem.  We can only try lock here and fail
    1540             :                          * if unsuccessful.
    1541             :                          */
    1542             :                         if (!anon) {
    1543             :                                 VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
    1544             :                                 if (!hugetlb_vma_trylock_write(vma)) {
    1545             :                                         page_vma_mapped_walk_done(&pvmw);
    1546             :                                         ret = false;
    1547             :                                         break;
    1548             :                                 }
    1549             :                                 if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
    1550             :                                         hugetlb_vma_unlock_write(vma);
    1551             :                                         flush_tlb_range(vma,
    1552             :                                                 range.start, range.end);
    1553             :                                         mmu_notifier_invalidate_range(mm,
    1554             :                                                 range.start, range.end);
    1555             :                                         /*
    1556             :                                          * The ref count of the PMD page was
    1557             :                                          * dropped which is part of the way map
    1558             :                                          * counting is done for shared PMDs.
    1559             :                                          * Return 'true' here.  When there is
    1560             :                                          * no other sharing, huge_pmd_unshare
    1561             :                                          * returns false and we will unmap the
    1562             :                                          * actual page and drop map count
    1563             :                                          * to zero.
    1564             :                                          */
    1565             :                                         page_vma_mapped_walk_done(&pvmw);
    1566             :                                         break;
    1567             :                                 }
    1568             :                                 hugetlb_vma_unlock_write(vma);
    1569             :                         }
    1570             :                         pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
    1571             :                 } else {
    1572           0 :                         flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
    1573             :                         /* Nuke the page table entry. */
    1574           0 :                         if (should_defer_flush(mm, flags)) {
    1575             :                                 /*
    1576             :                                  * We clear the PTE but do not flush so potentially
    1577             :                                  * a remote CPU could still be writing to the folio.
    1578             :                                  * If the entry was previously clean then the
    1579             :                                  * architecture must guarantee that a clear->dirty
    1580             :                                  * transition on a cached TLB entry is written through
    1581             :                                  * and traps if the PTE is unmapped.
    1582             :                                  */
    1583             :                                 pteval = ptep_get_and_clear(mm, address, pvmw.pte);
    1584             : 
    1585             :                                 set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
    1586             :                         } else {
    1587           0 :                                 pteval = ptep_clear_flush(vma, address, pvmw.pte);
    1588             :                         }
    1589             :                 }
    1590             : 
    1591             :                 /*
    1592             :                  * Now the pte is cleared. If this pte was uffd-wp armed,
    1593             :                  * we may want to replace a none pte with a marker pte if
    1594             :                  * it's file-backed, so we don't lose the tracking info.
    1595             :                  */
    1596           0 :                 pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
    1597             : 
    1598             :                 /* Set the dirty flag on the folio now the pte is gone. */
    1599           0 :                 if (pte_dirty(pteval))
    1600           0 :                         folio_mark_dirty(folio);
    1601             : 
    1602             :                 /* Update high watermark before we lower rss */
    1603           0 :                 update_hiwater_rss(mm);
    1604             : 
    1605           0 :                 if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
    1606             :                         pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
    1607             :                         if (folio_test_hugetlb(folio)) {
    1608             :                                 hugetlb_count_sub(folio_nr_pages(folio), mm);
    1609             :                                 set_huge_pte_at(mm, address, pvmw.pte, pteval);
    1610             :                         } else {
    1611             :                                 dec_mm_counter(mm, mm_counter(&folio->page));
    1612             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1613             :                         }
    1614             : 
    1615           0 :                 } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
    1616             :                         /*
    1617             :                          * The guest indicated that the page content is of no
    1618             :                          * interest anymore. Simply discard the pte, vmscan
    1619             :                          * will take care of the rest.
    1620             :                          * A future reference will then fault in a new zero
    1621             :                          * page. When userfaultfd is active, we must not drop
    1622             :                          * this page though, as its main user (postcopy
    1623             :                          * migration) will not expect userfaults on already
    1624             :                          * copied pages.
    1625             :                          */
    1626             :                         dec_mm_counter(mm, mm_counter(&folio->page));
    1627             :                         /* We have to invalidate as we cleared the pte */
    1628             :                         mmu_notifier_invalidate_range(mm, address,
    1629             :                                                       address + PAGE_SIZE);
    1630           0 :                 } else if (folio_test_anon(folio)) {
    1631           0 :                         swp_entry_t entry = { .val = page_private(subpage) };
    1632             :                         pte_t swp_pte;
    1633             :                         /*
    1634             :                          * Store the swap location in the pte.
    1635             :                          * See handle_pte_fault() ...
    1636             :                          */
    1637           0 :                         if (unlikely(folio_test_swapbacked(folio) !=
    1638             :                                         folio_test_swapcache(folio))) {
    1639           0 :                                 WARN_ON_ONCE(1);
    1640           0 :                                 ret = false;
    1641             :                                 /* We have to invalidate as we cleared the pte */
    1642           0 :                                 mmu_notifier_invalidate_range(mm, address,
    1643             :                                                         address + PAGE_SIZE);
    1644           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1645             :                                 break;
    1646             :                         }
    1647             : 
    1648             :                         /* MADV_FREE page check */
    1649           0 :                         if (!folio_test_swapbacked(folio)) {
    1650             :                                 int ref_count, map_count;
    1651             : 
    1652             :                                 /*
    1653             :                                  * Synchronize with gup_pte_range():
    1654             :                                  * - clear PTE; barrier; read refcount
    1655             :                                  * - inc refcount; barrier; read PTE
    1656             :                                  */
    1657           0 :                                 smp_mb();
    1658             : 
    1659           0 :                                 ref_count = folio_ref_count(folio);
    1660           0 :                                 map_count = folio_mapcount(folio);
    1661             : 
    1662             :                                 /*
    1663             :                                  * Order reads for page refcount and dirty flag
    1664             :                                  * (see comments in __remove_mapping()).
    1665             :                                  */
    1666           0 :                                 smp_rmb();
    1667             : 
    1668             :                                 /*
    1669             :                                  * The only page refs must be one from isolation
    1670             :                                  * plus the rmap(s) (dropped by discard:).
    1671             :                                  */
    1672           0 :                                 if (ref_count == 1 + map_count &&
    1673           0 :                                     !folio_test_dirty(folio)) {
    1674             :                                         /* Invalidate as we cleared the pte */
    1675           0 :                                         mmu_notifier_invalidate_range(mm,
    1676             :                                                 address, address + PAGE_SIZE);
    1677           0 :                                         dec_mm_counter(mm, MM_ANONPAGES);
    1678           0 :                                         goto discard;
    1679             :                                 }
    1680             : 
    1681             :                                 /*
    1682             :                                  * If the folio was redirtied, it cannot be
    1683             :                                  * discarded. Remap the page to page table.
    1684             :                                  */
    1685           0 :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1686           0 :                                 folio_set_swapbacked(folio);
    1687           0 :                                 ret = false;
    1688           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1689             :                                 break;
    1690             :                         }
    1691             : 
    1692           0 :                         if (swap_duplicate(entry) < 0) {
    1693           0 :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1694           0 :                                 ret = false;
    1695           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1696             :                                 break;
    1697             :                         }
    1698           0 :                         if (arch_unmap_one(mm, vma, address, pteval) < 0) {
    1699             :                                 swap_free(entry);
    1700             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1701             :                                 ret = false;
    1702             :                                 page_vma_mapped_walk_done(&pvmw);
    1703             :                                 break;
    1704             :                         }
    1705             : 
    1706             :                         /* See page_try_share_anon_rmap(): clear PTE first. */
    1707           0 :                         if (anon_exclusive &&
    1708           0 :                             page_try_share_anon_rmap(subpage)) {
    1709           0 :                                 swap_free(entry);
    1710           0 :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    1711           0 :                                 ret = false;
    1712           0 :                                 page_vma_mapped_walk_done(&pvmw);
    1713             :                                 break;
    1714             :                         }
    1715           0 :                         if (list_empty(&mm->mmlist)) {
    1716           0 :                                 spin_lock(&mmlist_lock);
    1717           0 :                                 if (list_empty(&mm->mmlist))
    1718           0 :                                         list_add(&mm->mmlist, &init_mm.mmlist);
    1719             :                                 spin_unlock(&mmlist_lock);
    1720             :                         }
    1721           0 :                         dec_mm_counter(mm, MM_ANONPAGES);
    1722           0 :                         inc_mm_counter(mm, MM_SWAPENTS);
    1723           0 :                         swp_pte = swp_entry_to_pte(entry);
    1724           0 :                         if (anon_exclusive)
    1725             :                                 swp_pte = pte_swp_mkexclusive(swp_pte);
    1726           0 :                         if (pte_soft_dirty(pteval))
    1727             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    1728             :                         if (pte_uffd_wp(pteval))
    1729             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    1730           0 :                         set_pte_at(mm, address, pvmw.pte, swp_pte);
    1731             :                         /* Invalidate as we cleared the pte */
    1732           0 :                         mmu_notifier_invalidate_range(mm, address,
    1733             :                                                       address + PAGE_SIZE);
    1734             :                 } else {
    1735             :                         /*
    1736             :                          * This is a locked file-backed folio,
    1737             :                          * so it cannot be removed from the page
    1738             :                          * cache and replaced by a new folio before
    1739             :                          * mmu_notifier_invalidate_range_end, so no
    1740             :                          * concurrent thread might update its page table
    1741             :                          * to point at a new folio while a device is
    1742             :                          * still using this folio.
    1743             :                          *
    1744             :                          * See Documentation/mm/mmu_notifier.rst
    1745             :                          */
    1746           0 :                         dec_mm_counter(mm, mm_counter_file(&folio->page));
    1747             :                 }
    1748             : discard:
    1749             :                 /*
    1750             :                  * No need to call mmu_notifier_invalidate_range(); it has been
    1751             :                  * done above for all cases requiring it to happen under page
    1752             :                  * table lock before mmu_notifier_invalidate_range_end()
    1753             :                  *
    1754             :                  * See Documentation/mm/mmu_notifier.rst
    1755             :                  */
    1756           0 :                 page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
    1757           0 :                 if (vma->vm_flags & VM_LOCKED)
    1758           0 :                         mlock_drain_local();
    1759             :                 folio_put(folio);
    1760             :         }
    1761             : 
    1762           0 :         mmu_notifier_invalidate_range_end(&range);
    1763             : 
    1764           0 :         return ret;
    1765             : }
    1766             : 
    1767           0 : static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
    1768             : {
    1769           0 :         return vma_is_temporary_stack(vma);
    1770             : }
    1771             : 
    1772           0 : static int folio_not_mapped(struct folio *folio)
    1773             : {
    1774           0 :         return !folio_mapped(folio);
    1775             : }
    1776             : 
    1777             : /**
    1778             :  * try_to_unmap - Try to remove all page table mappings to a folio.
    1779             :  * @folio: The folio to unmap.
    1780             :  * @flags: action and flags
    1781             :  *
    1782             :  * Tries to remove all the page table entries which are mapping this
    1783             :  * folio.  It is the caller's responsibility to check if the folio is
    1784             :  * still mapped if needed (use TTU_SYNC to prevent accounting races).
    1785             :  *
    1786             :  * Context: Caller must hold the folio lock.
    1787             :  */
    1788           0 : void try_to_unmap(struct folio *folio, enum ttu_flags flags)
    1789             : {
    1790           0 :         struct rmap_walk_control rwc = {
    1791             :                 .rmap_one = try_to_unmap_one,
    1792           0 :                 .arg = (void *)flags,
    1793             :                 .done = folio_not_mapped,
    1794             :                 .anon_lock = folio_lock_anon_vma_read,
    1795             :         };
    1796             : 
    1797           0 :         if (flags & TTU_RMAP_LOCKED)
    1798           0 :                 rmap_walk_locked(folio, &rwc);
    1799             :         else
    1800           0 :                 rmap_walk(folio, &rwc);
    1801           0 : }
    1802             : 
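try_to_unmap() above does not report success directly; after the walk the caller re-checks folio_mapped(), passing TTU_SYNC when it must not race with a concurrent zap that skipped the page table lock. A hedged caller sketch in the style of the reclaim and memory-failure users; the helper and the flag combination are illustrative choices, not required ones:

        #include <linux/rmap.h>
        #include <linux/mm.h>

        /*
         * Sketch: try to remove every mapping of a locked folio and report
         * whether that succeeded.  Assumes the caller holds the folio lock
         * and a reference, as try_to_unmap() requires.
         */
        static bool unmap_folio_fully(struct folio *folio)
        {
                if (folio_mapped(folio))
                        try_to_unmap(folio, TTU_SYNC | TTU_IGNORE_MLOCK);

                return !folio_mapped(folio);
        }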
    1803             : /*
    1804             :  * @arg: enum ttu_flags is passed in this argument.
    1805             :  *
    1806             :  * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
    1807             :  * containing migration entries.
    1808             :  */
    1809           0 : static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
    1810             :                      unsigned long address, void *arg)
    1811             : {
    1812           0 :         struct mm_struct *mm = vma->vm_mm;
    1813           0 :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
    1814             :         pte_t pteval;
    1815             :         struct page *subpage;
    1816           0 :         bool anon_exclusive, ret = true;
    1817             :         struct mmu_notifier_range range;
    1818           0 :         enum ttu_flags flags = (enum ttu_flags)(long)arg;
    1819             : 
    1820             :         /*
    1821             :          * When racing against e.g. zap_pte_range() on another cpu,
    1822             :          * in between its ptep_get_and_clear_full() and page_remove_rmap(),
    1823             :          * try_to_migrate() may return before page_mapped() has become false,
    1824             :          * if page table locking is skipped: use TTU_SYNC to wait for that.
    1825             :          */
    1826           0 :         if (flags & TTU_SYNC)
    1827           0 :                 pvmw.flags = PVMW_SYNC;
    1828             : 
    1829             :         /*
    1830             :          * unmap_page() in mm/huge_memory.c is the only user of migration with
    1831             :          * TTU_SPLIT_HUGE_PMD and it wants to freeze.
    1832             :          */
    1833             :         if (flags & TTU_SPLIT_HUGE_PMD)
    1834             :                 split_huge_pmd_address(vma, address, true, folio);
    1835             : 
    1836             :         /*
    1837             :          * For THP, we have to assume the worse case ie pmd for invalidation.
    1838             :          * For hugetlb, it could be much worse if we need to do pud
    1839             :          * invalidation in the case of pmd sharing.
    1840             :          *
    1841             :          * Note that the page cannot be freed in this function, as the caller
    1842             :          * of try_to_migrate() must hold a reference on the page.
    1843             :          */
    1844             :         range.end = vma_address_end(&pvmw);
    1845             :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
    1846             :                                 address, range.end);
    1847             :         if (folio_test_hugetlb(folio)) {
    1848             :                 /*
    1849             :                  * If sharing is possible, start and end will be adjusted
    1850             :                  * accordingly.
    1851             :                  */
    1852             :                 adjust_range_if_pmd_sharing_possible(vma, &range.start,
    1853             :                                                      &range.end);
    1854             :         }
    1855             :         mmu_notifier_invalidate_range_start(&range);
    1856             : 
    1857           0 :         while (page_vma_mapped_walk(&pvmw)) {
    1858             : #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
    1859             :                 /* PMD-mapped THP migration entry */
    1860             :                 if (!pvmw.pte) {
    1861             :                         subpage = folio_page(folio,
    1862             :                                 pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
    1863             :                         VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
    1864             :                                         !folio_test_pmd_mappable(folio), folio);
    1865             : 
    1866             :                         if (set_pmd_migration_entry(&pvmw, subpage)) {
    1867             :                                 ret = false;
    1868             :                                 page_vma_mapped_walk_done(&pvmw);
    1869             :                                 break;
    1870             :                         }
    1871             :                         continue;
    1872             :                 }
    1873             : #endif
    1874             : 
    1875             :                 /* Unexpected PMD-mapped THP? */
    1876             :                 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
    1877             : 
    1878           0 :                 if (folio_is_zone_device(folio)) {
    1879             :                         /*
    1880             :                          * Our PTE is a non-present device exclusive entry and
    1881             :                          * calculating the subpage as for the common case would
    1882             :                          * result in an invalid pointer.
    1883             :                          *
    1884             :                          * Since only PAGE_SIZE pages can currently be
    1885             :                          * migrated, just set it to page. This will need to be
    1886             :                          * changed when hugepage migrations to device private
    1887             :                          * memory are supported.
    1888             :                          */
    1889             :                         VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
    1890             :                         subpage = &folio->page;
    1891             :                 } else {
    1892           0 :                         subpage = folio_page(folio,
    1893             :                                         pte_pfn(*pvmw.pte) - folio_pfn(folio));
    1894             :                 }
    1895           0 :                 address = pvmw.address;
    1896           0 :                 anon_exclusive = folio_test_anon(folio) &&
    1897           0 :                                  PageAnonExclusive(subpage);
    1898             : 
    1899           0 :                 if (folio_test_hugetlb(folio)) {
    1900             :                         bool anon = folio_test_anon(folio);
    1901             : 
    1902             :                         /*
    1903             :                          * huge_pmd_unshare may unmap an entire PMD page.
    1904             :                          * There is no way of knowing exactly which PMDs may
    1905             :                          * be cached for this mm, so we must flush them all.
    1906             :                          * start/end were already adjusted above to cover this
    1907             :                          * range.
    1908             :                          */
    1909             :                         flush_cache_range(vma, range.start, range.end);
    1910             : 
    1911             :                         /*
    1912             :                          * To call huge_pmd_unshare, i_mmap_rwsem must be
    1913             :                          * held in write mode.  Caller needs to explicitly
    1914             :                          * do this outside rmap routines.
    1915             :                          *
    1916             :                          * We also must hold hugetlb vma_lock in write mode.
    1917             :                          * Lock order dictates acquiring vma_lock BEFORE
    1918             :                          * i_mmap_rwsem.  We can only try lock here and
    1919             :                          * fail if unsuccessful.
    1920             :                          */
    1921             :                         if (!anon) {
    1922             :                                 VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
    1923             :                                 if (!hugetlb_vma_trylock_write(vma)) {
    1924             :                                         page_vma_mapped_walk_done(&pvmw);
    1925             :                                         ret = false;
    1926             :                                         break;
    1927             :                                 }
    1928             :                                 if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
    1929             :                                         hugetlb_vma_unlock_write(vma);
    1930             :                                         flush_tlb_range(vma,
    1931             :                                                 range.start, range.end);
    1932             :                                         mmu_notifier_invalidate_range(mm,
    1933             :                                                 range.start, range.end);
    1934             : 
    1935             :                                         /*
    1936             :                                          * The ref count of the PMD page was
    1937             :                                          * dropped which is part of the way map
    1938             :                                          * counting is done for shared PMDs.
    1939             :                                          * Return 'true' here.  When there is
    1940             :                                          * no other sharing, huge_pmd_unshare
    1941             :                                          * returns false and we will unmap the
    1942             :                                          * actual page and drop map count
    1943             :                                          * to zero.
    1944             :                                          */
    1945             :                                         page_vma_mapped_walk_done(&pvmw);
    1946             :                                         break;
    1947             :                                 }
    1948             :                                 hugetlb_vma_unlock_write(vma);
    1949             :                         }
    1950             :                         /* Nuke the hugetlb page table entry */
    1951             :                         pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
    1952             :                 } else {
    1953           0 :                         flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
    1954             :                         /* Nuke the page table entry. */
    1955           0 :                         if (should_defer_flush(mm, flags)) {
    1956             :                                 /*
    1957             :                                  * We clear the PTE but do not flush so potentially
    1958             :                                  * a remote CPU could still be writing to the folio.
    1959             :                                  * If the entry was previously clean then the
    1960             :                                  * architecture must guarantee that a clear->dirty
    1961             :                                  * transition on a cached TLB entry is written through
    1962             :                                  * and traps if the PTE is unmapped.
    1963             :                                  */
    1964             :                                 pteval = ptep_get_and_clear(mm, address, pvmw.pte);
    1965             : 
    1966             :                                 set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
    1967             :                         } else {
    1968           0 :                                 pteval = ptep_clear_flush(vma, address, pvmw.pte);
    1969             :                         }
    1970             :                 }
    1971             : 
    1972             :                 /* Set the dirty flag on the folio now the pte is gone. */
    1973           0 :                 if (pte_dirty(pteval))
    1974           0 :                         folio_mark_dirty(folio);
    1975             : 
    1976             :                 /* Update high watermark before we lower rss */
    1977           0 :                 update_hiwater_rss(mm);
    1978             : 
    1979           0 :                 if (folio_is_device_private(folio)) {
    1980             :                         unsigned long pfn = folio_pfn(folio);
    1981             :                         swp_entry_t entry;
    1982             :                         pte_t swp_pte;
    1983             : 
    1984             :                         if (anon_exclusive)
    1985             :                                 BUG_ON(page_try_share_anon_rmap(subpage));
    1986             : 
    1987             :                         /*
    1988             :                          * Store the pfn of the page in a special migration
    1989             :                          * pte. do_swap_page() will wait until the migration
    1990             :                          * pte is removed and then restart fault handling.
    1991             :                          */
    1992             :                         entry = pte_to_swp_entry(pteval);
    1993             :                         if (is_writable_device_private_entry(entry))
    1994             :                                 entry = make_writable_migration_entry(pfn);
    1995             :                         else if (anon_exclusive)
    1996             :                                 entry = make_readable_exclusive_migration_entry(pfn);
    1997             :                         else
    1998             :                                 entry = make_readable_migration_entry(pfn);
    1999             :                         swp_pte = swp_entry_to_pte(entry);
    2000             : 
    2001             :                         /*
    2002             :                          * pteval maps a zone device page and is therefore
    2003             :                          * a swap pte.
    2004             :                          */
    2005             :                         if (pte_swp_soft_dirty(pteval))
    2006             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    2007             :                         if (pte_swp_uffd_wp(pteval))
    2008             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    2009             :                         set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
    2010             :                         trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
    2011             :                                                 compound_order(&folio->page));
    2012             :                         /*
    2013             :                          * No need to invalidate here; it will synchronize
    2014             :                          * against the special swap migration pte.
    2015             :                          */
    2016           0 :                 } else if (PageHWPoison(subpage)) {
    2017             :                         pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
    2018             :                         if (folio_test_hugetlb(folio)) {
    2019             :                                 hugetlb_count_sub(folio_nr_pages(folio), mm);
    2020             :                                 set_huge_pte_at(mm, address, pvmw.pte, pteval);
    2021             :                         } else {
    2022             :                                 dec_mm_counter(mm, mm_counter(&folio->page));
    2023             :                                 set_pte_at(mm, address, pvmw.pte, pteval);
    2024             :                         }
    2025             : 
    2026           0 :                 } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
    2027             :                         /*
    2028             :                          * The guest indicated that the page content is of no
    2029             :                          * interest anymore. Simply discard the pte, vmscan
    2030             :                          * will take care of the rest.
    2031             :                          * A future reference will then fault in a new zero
    2032             :                          * page. When userfaultfd is active, we must not drop
    2033             :                          * this page though, as its main user (postcopy
    2034             :                          * migration) will not expect userfaults on already
    2035             :                          * copied pages.
    2036             :                          */
    2037             :                         dec_mm_counter(mm, mm_counter(&folio->page));
    2038             :                         /* We have to invalidate as we cleared the pte */
    2039             :                         mmu_notifier_invalidate_range(mm, address,
    2040             :                                                       address + PAGE_SIZE);
    2041             :                 } else {
    2042             :                         swp_entry_t entry;
    2043             :                         pte_t swp_pte;
    2044             : 
    2045           0 :                         if (arch_unmap_one(mm, vma, address, pteval) < 0) {
    2046             :                                 if (folio_test_hugetlb(folio))
    2047             :                                         set_huge_pte_at(mm, address, pvmw.pte, pteval);
    2048             :                                 else
    2049             :                                         set_pte_at(mm, address, pvmw.pte, pteval);
    2050             :                                 ret = false;
    2051             :                                 page_vma_mapped_walk_done(&pvmw);
    2052             :                                 break;
    2053             :                         }
    2054             :                         VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
    2055             :                                        !anon_exclusive, subpage);
    2056             : 
    2057             :                         /* See page_try_share_anon_rmap(): clear PTE first. */
    2058           0 :                         if (anon_exclusive &&
    2059           0 :                             page_try_share_anon_rmap(subpage)) {
    2060           0 :                                 if (folio_test_hugetlb(folio))
    2061             :                                         set_huge_pte_at(mm, address, pvmw.pte, pteval);
    2062             :                                 else
    2063           0 :                                         set_pte_at(mm, address, pvmw.pte, pteval);
    2064           0 :                                 ret = false;
    2065           0 :                                 page_vma_mapped_walk_done(&pvmw);
    2066             :                                 break;
    2067             :                         }
    2068             : 
    2069             :                         /*
    2070             :                          * Store the pfn of the page in a special migration
    2071             :                          * pte. do_swap_page() will wait until the migration
    2072             :                          * pte is removed and then restart fault handling.
    2073             :                          */
    2074           0 :                         if (pte_write(pteval))
    2075           0 :                                 entry = make_writable_migration_entry(
    2076           0 :                                                         page_to_pfn(subpage));
    2077           0 :                         else if (anon_exclusive)
    2078           0 :                                 entry = make_readable_exclusive_migration_entry(
    2079           0 :                                                         page_to_pfn(subpage));
    2080             :                         else
    2081           0 :                                 entry = make_readable_migration_entry(
    2082           0 :                                                         page_to_pfn(subpage));
    2083           0 :                         if (pte_young(pteval))
    2084             :                                 entry = make_migration_entry_young(entry);
    2085           0 :                         if (pte_dirty(pteval))
    2086             :                                 entry = make_migration_entry_dirty(entry);
    2087           0 :                         swp_pte = swp_entry_to_pte(entry);
    2088           0 :                         if (pte_soft_dirty(pteval))
    2089             :                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
    2090             :                         if (pte_uffd_wp(pteval))
    2091             :                                 swp_pte = pte_swp_mkuffd_wp(swp_pte);
    2092           0 :                         if (folio_test_hugetlb(folio))
    2093             :                                 set_huge_pte_at(mm, address, pvmw.pte, swp_pte);
    2094             :                         else
    2095           0 :                                 set_pte_at(mm, address, pvmw.pte, swp_pte);
    2096           0 :                         trace_set_migration_pte(address, pte_val(swp_pte),
    2097           0 :                                                 compound_order(&folio->page));
    2098             :                         /*
    2099             :                          * No need to invalidate here; it will synchronize
    2100             :                          * against the special swap migration pte.
    2101             :                          */
    2102             :                 }
    2103             : 
    2104             :                 /*
    2105             :                  * No need to call mmu_notifier_invalidate_range(); it has been
    2106             :                  * done above for all cases requiring it to happen under the page
    2107             :                  * table lock, before mmu_notifier_invalidate_range_end().
    2108             :                  *
    2109             :                  * See Documentation/mm/mmu_notifier.rst
    2110             :                  */
    2111           0 :                 page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
    2112           0 :                 if (vma->vm_flags & VM_LOCKED)
    2113           0 :                         mlock_drain_local();
    2114             :                 folio_put(folio);
    2115             :         }
    2116             : 
    2117           0 :         mmu_notifier_invalidate_range_end(&range);
    2118             : 
    2119           0 :         return ret;
    2120             : }
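
/*
 * Editor's illustrative sketch (not part of mm/rmap.c, no coverage data):
 * the migration-entry selection performed inside try_to_migrate_one(),
 * pulled out as a standalone helper for readability.  It relies only on
 * the swapops/pgtable helpers already used in the loop above; the helper
 * name itself is hypothetical.
 */
static swp_entry_t sketch_migration_entry(struct page *subpage, pte_t pteval,
                                          bool anon_exclusive)
{
        swp_entry_t entry;

        if (pte_write(pteval))
                entry = make_writable_migration_entry(page_to_pfn(subpage));
        else if (anon_exclusive)
                entry = make_readable_exclusive_migration_entry(
                                                page_to_pfn(subpage));
        else
                entry = make_readable_migration_entry(page_to_pfn(subpage));

        /* Preserve young/dirty so they can be restored when remapping. */
        if (pte_young(pteval))
                entry = make_migration_entry_young(entry);
        if (pte_dirty(pteval))
                entry = make_migration_entry_dirty(entry);

        return entry;
}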
    2121             : 
    2122             : /**
    2123             :  * try_to_migrate - try to replace all page table mappings with swap entries
    2124             :  * @folio: the folio to replace page table entries for
    2125             :  * @flags: action and flags
    2126             :  *
    2127             :  * Tries to remove all the page table entries which are mapping this folio and
    2128             :  * replace them with special swap entries. Caller must hold the folio lock.
    2129             :  */
    2130           0 : void try_to_migrate(struct folio *folio, enum ttu_flags flags)
    2131             : {
    2132           0 :         struct rmap_walk_control rwc = {
    2133             :                 .rmap_one = try_to_migrate_one,
    2134           0 :                 .arg = (void *)flags,
    2135             :                 .done = folio_not_mapped,
    2136             :                 .anon_lock = folio_lock_anon_vma_read,
    2137             :         };
    2138             : 
    2139             :         /*
    2140             :          * Migration always ignores mlock and only supports the TTU_RMAP_LOCKED,
    2141             :          * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags.
    2142             :          */
    2143           0 :         if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
    2144             :                                         TTU_SYNC | TTU_BATCH_FLUSH)))
    2145           0 :                 return;
    2146             : 
    2147           0 :         if (folio_is_zone_device(folio) &&
    2148             :             (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
    2149             :                 return;
    2150             : 
    2151             :         /*
    2152             :          * During exec, a temporary VMA is set up and later moved.
    2153             :          * The VMA is moved under the anon_vma lock but not the
    2154             :          * page tables, leading to a race where migration cannot
    2155             :          * find the migration ptes. Rather than increasing the
    2156             :          * locking requirements of exec(), migration skips
    2157             :          * temporary VMAs until after exec() completes.
    2158             :          */
    2159           0 :         if (!folio_test_ksm(folio) && folio_test_anon(folio))
    2160           0 :                 rwc.invalid_vma = invalid_migration_vma;
    2161             : 
    2162           0 :         if (flags & TTU_RMAP_LOCKED)
    2163           0 :                 rmap_walk_locked(folio, &rwc);
    2164             :         else
    2165           0 :                 rmap_walk(folio, &rwc);
    2166             : }
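
/*
 * Editor's illustrative sketch (not part of mm/rmap.c): how a caller is
 * expected to drive try_to_migrate().  The folio must be locked, and
 * success is observed via folio_mapped() rather than a return value.
 * The real migration path keeps the folio locked across the copy; this
 * hypothetical helper only shows the calling convention.
 */
static bool sketch_unmap_for_migration(struct folio *folio)
{
        bool unmapped;

        if (!folio_trylock(folio))
                return false;

        try_to_migrate(folio, 0);               /* no extra TTU_* flags */
        unmapped = !folio_mapped(folio);

        folio_unlock(folio);
        return unmapped;
}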
    2167             : 
    2168             : #ifdef CONFIG_DEVICE_PRIVATE
    2169             : struct make_exclusive_args {
    2170             :         struct mm_struct *mm;
    2171             :         unsigned long address;
    2172             :         void *owner;
    2173             :         bool valid;
    2174             : };
    2175             : 
    2176             : static bool page_make_device_exclusive_one(struct folio *folio,
    2177             :                 struct vm_area_struct *vma, unsigned long address, void *priv)
    2178             : {
    2179             :         struct mm_struct *mm = vma->vm_mm;
    2180             :         DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
    2181             :         struct make_exclusive_args *args = priv;
    2182             :         pte_t pteval;
    2183             :         struct page *subpage;
    2184             :         bool ret = true;
    2185             :         struct mmu_notifier_range range;
    2186             :         swp_entry_t entry;
    2187             :         pte_t swp_pte;
    2188             : 
    2189             :         mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
    2190             :                                       vma->vm_mm, address, min(vma->vm_end,
    2191             :                                       address + folio_size(folio)),
    2192             :                                       args->owner);
    2193             :         mmu_notifier_invalidate_range_start(&range);
    2194             : 
    2195             :         while (page_vma_mapped_walk(&pvmw)) {
    2196             :                 /* Unexpected PMD-mapped THP? */
    2197             :                 VM_BUG_ON_FOLIO(!pvmw.pte, folio);
    2198             : 
    2199             :                 if (!pte_present(*pvmw.pte)) {
    2200             :                         ret = false;
    2201             :                         page_vma_mapped_walk_done(&pvmw);
    2202             :                         break;
    2203             :                 }
    2204             : 
    2205             :                 subpage = folio_page(folio,
    2206             :                                 pte_pfn(*pvmw.pte) - folio_pfn(folio));
    2207             :                 address = pvmw.address;
    2208             : 
    2209             :                 /* Nuke the page table entry. */
    2210             :                 flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
    2211             :                 pteval = ptep_clear_flush(vma, address, pvmw.pte);
    2212             : 
    2213             :                 /* Set the dirty flag on the folio now the pte is gone. */
    2214             :                 if (pte_dirty(pteval))
    2215             :                         folio_mark_dirty(folio);
    2216             : 
    2217             :                 /*
    2218             :                  * Check that our target page is still mapped at the expected
    2219             :                  * address.
    2220             :                  */
    2221             :                 if (args->mm == mm && args->address == address &&
    2222             :                     pte_write(pteval))
    2223             :                         args->valid = true;
    2224             : 
    2225             :                 /*
    2226             :                  * Store the pfn of the page in a special device-exclusive
    2227             :                  * swap entry. On CPU fault the entry is replaced with the
    2228             :                  * original mapping, after calling the MMU notifiers.
    2229             :                  */
    2230             :                 if (pte_write(pteval))
    2231             :                         entry = make_writable_device_exclusive_entry(
    2232             :                                                         page_to_pfn(subpage));
    2233             :                 else
    2234             :                         entry = make_readable_device_exclusive_entry(
    2235             :                                                         page_to_pfn(subpage));
    2236             :                 swp_pte = swp_entry_to_pte(entry);
    2237             :                 if (pte_soft_dirty(pteval))
    2238             :                         swp_pte = pte_swp_mksoft_dirty(swp_pte);
    2239             :                 if (pte_uffd_wp(pteval))
    2240             :                         swp_pte = pte_swp_mkuffd_wp(swp_pte);
    2241             : 
    2242             :                 set_pte_at(mm, address, pvmw.pte, swp_pte);
    2243             : 
    2244             :                 /*
    2245             :                  * There is a reference on the page for the swap entry which has
    2246             :                  * been removed, so we shouldn't take another.
    2247             :                  */
    2248             :                 page_remove_rmap(subpage, vma, false);
    2249             :         }
    2250             : 
    2251             :         mmu_notifier_invalidate_range_end(&range);
    2252             : 
    2253             :         return ret;
    2254             : }
    2255             : 
    2256             : /**
    2257             :  * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
    2258             :  * @folio: The folio to replace page table entries for.
    2259             :  * @mm: The mm_struct where the folio is expected to be mapped.
    2260             :  * @address: Address where the folio is expected to be mapped.
    2261             :  * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
    2262             :  *
    2263             :  * Tries to remove all the page table entries which are mapping this
    2264             :  * folio and replace them with special device exclusive swap entries to
    2265             :  * grant a device exclusive access to the folio.
    2266             :  *
    2267             :  * Context: Caller must hold the folio lock.
    2268             :  * Return: false if the page is still mapped, or if it could not be unmapped
    2269             :  * from the expected address. Otherwise returns true (success).
    2270             :  */
    2271             : static bool folio_make_device_exclusive(struct folio *folio,
    2272             :                 struct mm_struct *mm, unsigned long address, void *owner)
    2273             : {
    2274             :         struct make_exclusive_args args = {
    2275             :                 .mm = mm,
    2276             :                 .address = address,
    2277             :                 .owner = owner,
    2278             :                 .valid = false,
    2279             :         };
    2280             :         struct rmap_walk_control rwc = {
    2281             :                 .rmap_one = page_make_device_exclusive_one,
    2282             :                 .done = folio_not_mapped,
    2283             :                 .anon_lock = folio_lock_anon_vma_read,
    2284             :                 .arg = &args,
    2285             :         };
    2286             : 
    2287             :         /*
    2288             :          * Restrict to anonymous folios for now to avoid potential writeback
    2289             :          * issues.
    2290             :          */
    2291             :         if (!folio_test_anon(folio))
    2292             :                 return false;
    2293             : 
    2294             :         rmap_walk(folio, &rwc);
    2295             : 
    2296             :         return args.valid && !folio_mapcount(folio);
    2297             : }
    2298             : 
    2299             : /**
    2300             :  * make_device_exclusive_range() - Mark a range for exclusive use by a device
    2301             :  * @mm: mm_struct of associated target process
    2302             :  * @start: start of the region to mark for exclusive device access
    2303             :  * @end: end address of region
    2304             :  * @pages: returns the pages which were successfully marked for exclusive access
    2305             :  * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
    2306             :  *
    2307             :  * Returns: number of pages found in the range by GUP. A page is marked for
    2308             :  * exclusive access only if its entry in @pages is non-NULL.
    2309             :  *
    2310             :  * This function finds the ptes mapping page(s) in the given address range,
    2311             :  * locks the pages, and replaces the mappings with special swap entries that
    2312             :  * prevent userspace CPU access. On fault these entries are replaced with the
    2313             :  * original mapping after calling the MMU notifiers.
    2314             :  *
    2315             :  * A driver using this to program access from a device must use an mmu notifier
    2316             :  * critical section to hold a device-specific lock during programming. Once
    2317             :  * programming is complete it should drop the page lock and reference, after
    2318             :  * which point CPU access to the page will revoke the exclusive access.
    2319             :  */
    2320             : int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
    2321             :                                 unsigned long end, struct page **pages,
    2322             :                                 void *owner)
    2323             : {
    2324             :         long npages = (end - start) >> PAGE_SHIFT;
    2325             :         long i;
    2326             : 
    2327             :         npages = get_user_pages_remote(mm, start, npages,
    2328             :                                        FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
    2329             :                                        pages, NULL, NULL);
    2330             :         if (npages < 0)
    2331             :                 return npages;
    2332             : 
    2333             :         for (i = 0; i < npages; i++, start += PAGE_SIZE) {
    2334             :                 struct folio *folio = page_folio(pages[i]);
    2335             :                 if (PageTail(pages[i]) || !folio_trylock(folio)) {
    2336             :                         folio_put(folio);
    2337             :                         pages[i] = NULL;
    2338             :                         continue;
    2339             :                 }
    2340             : 
    2341             :                 if (!folio_make_device_exclusive(folio, mm, start, owner)) {
    2342             :                         folio_unlock(folio);
    2343             :                         folio_put(folio);
    2344             :                         pages[i] = NULL;
    2345             :                 }
    2346             :         }
    2347             : 
    2348             :         return npages;
    2349             : }
    2350             : EXPORT_SYMBOL_GPL(make_device_exclusive_range);
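
/*
 * Editor's illustrative sketch (hypothetical driver code, not part of
 * mm/rmap.c): one way a device driver might mark a single page for
 * exclusive device access.  get_user_pages_remote() requires the mmap
 * lock, and a successfully marked page is returned locked and with a
 * reference held.  The device-programming step, and the MMU-notifier
 * critical section / driver lock around it, are deliberately elided.
 */
static int sketch_grab_page_for_device(struct mm_struct *mm,
                                       unsigned long addr, void *owner)
{
        struct page *page = NULL;
        int npages;

        mmap_read_lock(mm);
        npages = make_device_exclusive_range(mm, addr, addr + PAGE_SIZE,
                                             &page, owner);
        mmap_read_unlock(mm);

        if (npages < 0)
                return npages;
        if (!page)
                return -EBUSY;  /* not made exclusive; caller may retry */

        /* ... program the device mapping under the driver's lock ... */

        unlock_page(page);
        put_page(page);
        return 0;
}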
    2351             : #endif
    2352             : 
    2353           0 : void __put_anon_vma(struct anon_vma *anon_vma)
    2354             : {
    2355           0 :         struct anon_vma *root = anon_vma->root;
    2356             : 
    2357           0 :         anon_vma_free(anon_vma);
    2358           0 :         if (root != anon_vma && atomic_dec_and_test(&root->refcount))
    2359           0 :                 anon_vma_free(root);
    2360           0 : }
    2361             : 
    2362           0 : static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
    2363             :                                             struct rmap_walk_control *rwc)
    2364             : {
    2365             :         struct anon_vma *anon_vma;
    2366             : 
    2367           0 :         if (rwc->anon_lock)
    2368           0 :                 return rwc->anon_lock(folio, rwc);
    2369             : 
    2370             :         /*
    2371             :          * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
    2372             :          * because that depends on page_mapped(); but not all its usages
    2373             :          * are holding mmap_lock. Users without mmap_lock are required to
    2374             :          * take a reference count to prevent the anon_vma from disappearing.
    2375             :          */
    2376           0 :         anon_vma = folio_anon_vma(folio);
    2377           0 :         if (!anon_vma)
    2378             :                 return NULL;
    2379             : 
    2380           0 :         if (anon_vma_trylock_read(anon_vma))
    2381             :                 goto out;
    2382             : 
    2383           0 :         if (rwc->try_lock) {
    2384           0 :                 anon_vma = NULL;
    2385           0 :                 rwc->contended = true;
    2386           0 :                 goto out;
    2387             :         }
    2388             : 
    2389           0 :         anon_vma_lock_read(anon_vma);
    2390             : out:
    2391             :         return anon_vma;
    2392             : }
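
/*
 * Editor's illustrative sketch (not part of mm/rmap.c): the try_lock /
 * contended contract honoured above.  A caller that must not sleep on the
 * anon_vma or i_mmap lock (folio_referenced() uses this pattern) sets
 * try_lock and checks contended after the walk; the helper name is
 * hypothetical.
 */
static bool sketch_walk_without_blocking(struct folio *folio,
                                         struct rmap_walk_control *rwc)
{
        rwc->try_lock = true;
        rwc->contended = false;

        rmap_walk(folio, rwc);

        /* false: the rmap lock was contended and the walk was skipped */
        return !rwc->contended;
}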
    2393             : 
    2394             : /*
    2395             :  * rmap_walk_anon - do something to an anonymous folio using the object-based
    2396             :  * rmap method
    2397             :  * @folio: the folio to be handled
    2398             :  * @rwc: control variable according to each walk type
    2399             :  *
    2400             :  * Find all the mappings of a page using the mapping pointer and the vma chains
    2401             :  * contained in the anon_vma struct it points to.
    2402             :  */
    2403           0 : static void rmap_walk_anon(struct folio *folio,
    2404             :                 struct rmap_walk_control *rwc, bool locked)
    2405             : {
    2406             :         struct anon_vma *anon_vma;
    2407             :         pgoff_t pgoff_start, pgoff_end;
    2408             :         struct anon_vma_chain *avc;
    2409             : 
    2410           0 :         if (locked) {
    2411           0 :                 anon_vma = folio_anon_vma(folio);
    2412             :                 /* anon_vma disappear under us? */
    2413             :                 VM_BUG_ON_FOLIO(!anon_vma, folio);
    2414             :         } else {
    2415           0 :                 anon_vma = rmap_walk_anon_lock(folio, rwc);
    2416             :         }
    2417           0 :         if (!anon_vma)
    2418             :                 return;
    2419             : 
    2420           0 :         pgoff_start = folio_pgoff(folio);
    2421           0 :         pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
    2422           0 :         anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
    2423             :                         pgoff_start, pgoff_end) {
    2424           0 :                 struct vm_area_struct *vma = avc->vma;
    2425           0 :                 unsigned long address = vma_address(&folio->page, vma);
    2426             : 
    2427             :                 VM_BUG_ON_VMA(address == -EFAULT, vma);
    2428           0 :                 cond_resched();
    2429             : 
    2430           0 :                 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
    2431           0 :                         continue;
    2432             : 
    2433           0 :                 if (!rwc->rmap_one(folio, vma, address, rwc->arg))
    2434             :                         break;
    2435           0 :                 if (rwc->done && rwc->done(folio))
    2436             :                         break;
    2437             :         }
    2438             : 
    2439           0 :         if (!locked)
    2440           0 :                 anon_vma_unlock_read(anon_vma);
    2441             : }
    2442             : 
    2443             : /*
    2444             :  * rmap_walk_file - do something to a file-backed folio using the object-based rmap method
    2445             :  * @folio: the folio to be handled
    2446             :  * @rwc: control variable according to each walk type
    2447             :  *
    2448             :  * Find all the mappings of a page using the mapping pointer and the vma chains
    2449             :  * contained in the address_space struct it points to.
    2450             :  */
    2451           0 : static void rmap_walk_file(struct folio *folio,
    2452             :                 struct rmap_walk_control *rwc, bool locked)
    2453             : {
    2454           0 :         struct address_space *mapping = folio_mapping(folio);
    2455             :         pgoff_t pgoff_start, pgoff_end;
    2456             :         struct vm_area_struct *vma;
    2457             : 
    2458             :         /*
    2459             :          * The page lock not only makes sure that page->mapping cannot
    2460             :          * suddenly be NULLified by truncation, it makes sure that the
    2461             :          * structure at mapping cannot be freed and reused yet,
    2462             :          * so we can safely take mapping->i_mmap_rwsem.
    2463             :          */
    2464             :         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
    2465             : 
    2466           0 :         if (!mapping)
    2467             :                 return;
    2468             : 
    2469           0 :         pgoff_start = folio_pgoff(folio);
    2470           0 :         pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
    2471           0 :         if (!locked) {
    2472           0 :                 if (i_mmap_trylock_read(mapping))
    2473             :                         goto lookup;
    2474             : 
    2475           0 :                 if (rwc->try_lock) {
    2476           0 :                         rwc->contended = true;
    2477           0 :                         return;
    2478             :                 }
    2479             : 
    2480             :                 i_mmap_lock_read(mapping);
    2481             :         }
    2482             : lookup:
    2483           0 :         vma_interval_tree_foreach(vma, &mapping->i_mmap,
    2484             :                         pgoff_start, pgoff_end) {
    2485           0 :                 unsigned long address = vma_address(&folio->page, vma);
    2486             : 
    2487             :                 VM_BUG_ON_VMA(address == -EFAULT, vma);
    2488           0 :                 cond_resched();
    2489             : 
    2490           0 :                 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
    2491           0 :                         continue;
    2492             : 
    2493           0 :                 if (!rwc->rmap_one(folio, vma, address, rwc->arg))
    2494             :                         goto done;
    2495           0 :                 if (rwc->done && rwc->done(folio))
    2496             :                         goto done;
    2497             :         }
    2498             : 
    2499             : done:
    2500           0 :         if (!locked)
    2501             :                 i_mmap_unlock_read(mapping);
    2502             : }
    2503             : 
    2504           0 : void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
    2505             : {
    2506           0 :         if (unlikely(folio_test_ksm(folio)))
    2507             :                 rmap_walk_ksm(folio, rwc);
    2508           0 :         else if (folio_test_anon(folio))
    2509           0 :                 rmap_walk_anon(folio, rwc, false);
    2510             :         else
    2511           0 :                 rmap_walk_file(folio, rwc, false);
    2512           0 : }
    2513             : 
    2514             : /* Like rmap_walk, but caller holds relevant rmap lock */
    2515           0 : void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
    2516             : {
    2517             :         /* no ksm support for now */
    2518             :         VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
    2519           0 :         if (folio_test_anon(folio))
    2520           0 :                 rmap_walk_anon(folio, rwc, true);
    2521             :         else
    2522           0 :                 rmap_walk_file(folio, rwc, true);
    2523           0 : }
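
/*
 * Editor's illustrative sketch (not part of mm/rmap.c): a minimal
 * rmap_walk() client.  The rmap_one callback is invoked once per VMA that
 * maps the folio and returns true to continue the walk; here it simply
 * counts the VMAs.  The caller must hold the folio lock, as
 * rmap_walk_file() asserts for file-backed folios.  All sketch_* names
 * are hypothetical.
 */
struct sketch_count_arg {
        int nr_vmas;
};

static bool sketch_count_one(struct folio *folio, struct vm_area_struct *vma,
                             unsigned long address, void *arg)
{
        struct sketch_count_arg *sca = arg;

        sca->nr_vmas++;
        return true;            /* keep walking */
}

static int sketch_count_mapping_vmas(struct folio *folio)
{
        struct sketch_count_arg sca = { .nr_vmas = 0 };
        struct rmap_walk_control rwc = {
                .rmap_one = sketch_count_one,
                .arg = &sca,
        };

        rmap_walk(folio, &rwc);
        return sca.nr_vmas;
}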
    2524             : 
    2525             : #ifdef CONFIG_HUGETLB_PAGE
    2526             : /*
    2527             :  * The following two functions are for anonymous (private mapped) hugepages.
    2528             :  * Unlike common anonymous pages, anonymous hugepages have no accounting code
    2529             :  * and no lru code, because we handle hugepages differently from common pages.
    2530             :  *
    2531             :  * RMAP_COMPOUND is ignored.
    2532             :  */
    2533             : void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
    2534             :                             unsigned long address, rmap_t flags)
    2535             : {
    2536             :         struct folio *folio = page_folio(page);
    2537             :         struct anon_vma *anon_vma = vma->anon_vma;
    2538             :         int first;
    2539             : 
    2540             :         BUG_ON(!folio_test_locked(folio));
    2541             :         BUG_ON(!anon_vma);
    2542             :         /* address might be in the next vma when migration races with vma_merge */
    2543             :         first = atomic_inc_and_test(&folio->_entire_mapcount);
    2544             :         VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
    2545             :         VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
    2546             :         if (first)
    2547             :                 __page_set_anon_rmap(folio, page, vma, address,
    2548             :                                      !!(flags & RMAP_EXCLUSIVE));
    2549             : }
    2550             : 
    2551             : void hugepage_add_new_anon_rmap(struct folio *folio,
    2552             :                         struct vm_area_struct *vma, unsigned long address)
    2553             : {
    2554             :         BUG_ON(address < vma->vm_start || address >= vma->vm_end);
    2555             :         /* increment count (starts at -1) */
    2556             :         atomic_set(&folio->_entire_mapcount, 0);
    2557             :         folio_clear_hugetlb_restore_reserve(folio);
    2558             :         __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
    2559             : }
    2560             : #endif /* CONFIG_HUGETLB_PAGE */

Generated by: LCOV version 1.14