LCOV - coverage.info - mm/memory.c

LCOV - code coverage report

Current view:	top level - mm - memory.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	23	1486	1.5 %
Date:	2023-08-24 13:40:31	Functions:	3	102	2.9 %

          Line data    Source code

       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  linux/mm/memory.c
       4             :  *
       5             :  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
       6             :  */
       7             : 
       8             : /*
       9             :  * demand-loading started 01.12.91 - seems it is high on the list of
      10             :  * things wanted, and it should be easy to implement. - Linus
      11             :  */
      12             : 
      13             : /*
      14             :  * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
      15             :  * pages started 02.12.91, seems to work. - Linus.
      16             :  *
      17             :  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
      18             :  * would have taken more than the 6M I have free, but it worked well as
      19             :  * far as I could see.
      20             :  *
      21             :  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
      22             :  */
      23             : 
      24             : /*
      25             :  * Real VM (paging to/from disk) started 18.12.91. Much more work and
      26             :  * thought has to go into this. Oh, well..
      27             :  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
      28             :  *              Found it. Everything seems to work now.
      29             :  * 20.12.91  -  Ok, making the swap-device changeable like the root.
      30             :  */
      31             : 
      32             : /*
      33             :  * 05.04.94  -  Multi-page memory management added for v1.1.
      34             :  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
      35             :  *
      36             :  * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
      37             :  *              (Gerhard.Wichert@pdb.siemens.de)
      38             :  *
      39             :  * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
      40             :  */
      41             : 
      42             : #include <linux/kernel_stat.h>
      43             : #include <linux/mm.h>
      44             : #include <linux/mm_inline.h>
      45             : #include <linux/sched/mm.h>
      46             : #include <linux/sched/coredump.h>
      47             : #include <linux/sched/numa_balancing.h>
      48             : #include <linux/sched/task.h>
      49             : #include <linux/hugetlb.h>
      50             : #include <linux/mman.h>
      51             : #include <linux/swap.h>
      52             : #include <linux/highmem.h>
      53             : #include <linux/pagemap.h>
      54             : #include <linux/memremap.h>
      55             : #include <linux/kmsan.h>
      56             : #include <linux/ksm.h>
      57             : #include <linux/rmap.h>
      58             : #include <linux/export.h>
      59             : #include <linux/delayacct.h>
      60             : #include <linux/init.h>
      61             : #include <linux/pfn_t.h>
      62             : #include <linux/writeback.h>
      63             : #include <linux/memcontrol.h>
      64             : #include <linux/mmu_notifier.h>
      65             : #include <linux/swapops.h>
      66             : #include <linux/elf.h>
      67             : #include <linux/gfp.h>
      68             : #include <linux/migrate.h>
      69             : #include <linux/string.h>
      70             : #include <linux/memory-tiers.h>
      71             : #include <linux/debugfs.h>
      72             : #include <linux/userfaultfd_k.h>
      73             : #include <linux/dax.h>
      74             : #include <linux/oom.h>
      75             : #include <linux/numa.h>
      76             : #include <linux/perf_event.h>
      77             : #include <linux/ptrace.h>
      78             : #include <linux/vmalloc.h>
      79             : #include <linux/sched/sysctl.h>
      80             : #include <linux/net_mm.h>
      81             : 
      82             : #include <trace/events/kmem.h>
      83             : 
      84             : #include <asm/io.h>
      85             : #include <asm/mmu_context.h>
      86             : #include <asm/pgalloc.h>
      87             : #include <linux/uaccess.h>
      88             : #include <asm/tlb.h>
      89             : #include <asm/tlbflush.h>
      90             : 
      91             : #include "pgalloc-track.h"
      92             : #include "internal.h"
      93             : #include "swap.h"
      94             : 
      95             : #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
      96             : #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
      97             : #endif
      98             : 
      99             : #ifndef CONFIG_NUMA
     100             : unsigned long max_mapnr;
     101             : EXPORT_SYMBOL(max_mapnr);
     102             : 
     103             : struct page *mem_map;
     104             : EXPORT_SYMBOL(mem_map);
     105             : #endif
     106             : 
     107             : static vm_fault_t do_fault(struct vm_fault *vmf);
     108             : static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
     109             : static bool vmf_pte_changed(struct vm_fault *vmf);
     110             : 
     111             : /*
     112             :  * Return true if the original pte was a uffd-wp pte marker (so the pte was
     113             :  * wr-protected).
     114             :  */
     115             : static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
     116             : {
     117             :         if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
     118             :                 return false;
     119             : 
     120             :         return pte_marker_uffd_wp(vmf->orig_pte);
     121             : }
     122             : 
     123             : /*
     124             :  * A number of key systems in x86 including ioremap() rely on the assumption
     125             :  * that high_memory defines the upper bound on direct map memory, then end
     126             :  * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
     127             :  * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
     128             :  * and ZONE_HIGHMEM.
     129             :  */
     130             : void *high_memory;
     131             : EXPORT_SYMBOL(high_memory);
     132             : 
     133             : /*
     134             :  * Randomize the address space (stacks, mmaps, brk, etc.).
     135             :  *
     136             :  * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
     137             :  *   as ancient (libc5 based) binaries can segfault. )
     138             :  */
     139             : int randomize_va_space __read_mostly =
     140             : #ifdef CONFIG_COMPAT_BRK
     141             :                                         1;
     142             : #else
     143             :                                         2;
     144             : #endif
     145             : 
     146             : #ifndef arch_wants_old_prefaulted_pte
     147             : static inline bool arch_wants_old_prefaulted_pte(void)
     148             : {
     149             :         /*
     150             :          * Transitioning a PTE from 'old' to 'young' can be expensive on
     151             :          * some architectures, even if it's performed in hardware. By
     152             :          * default, "false" means prefaulted entries will be 'young'.
     153             :          */
     154             :         return false;
     155             : }
     156             : #endif
     157             : 
     158           0 : static int __init disable_randmaps(char *s)
     159             : {
     160           0 :         randomize_va_space = 0;
     161           0 :         return 1;
     162             : }
     163             : __setup("norandmaps", disable_randmaps);
     164             : 
     165             : unsigned long zero_pfn __read_mostly;
     166             : EXPORT_SYMBOL(zero_pfn);
     167             : 
     168             : unsigned long highest_memmap_pfn __read_mostly;
     169             : 
     170             : /*
     171             :  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
     172             :  */
     173           1 : static int __init init_zero_pfn(void)
     174             : {
     175           2 :         zero_pfn = page_to_pfn(ZERO_PAGE(0));
     176           1 :         return 0;
     177             : }
     178             : early_initcall(init_zero_pfn);
     179             : 
     180           0 : void mm_trace_rss_stat(struct mm_struct *mm, int member)
     181             : {
     182           0 :         trace_rss_stat(mm, member);
     183           0 : }
     184             : 
     185             : /*
     186             :  * Note: this doesn't free the actual pages themselves. That
     187             :  * has been handled earlier when unmapping all the memory regions.
     188             :  */
     189           0 : static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
     190             :                            unsigned long addr)
     191             : {
     192           0 :         pgtable_t token = pmd_pgtable(*pmd);
     193           0 :         pmd_clear(pmd);
     194           0 :         pte_free_tlb(tlb, token, addr);
     195           0 :         mm_dec_nr_ptes(tlb->mm);
     196           0 : }
     197             : 
     198           0 : static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
     199             :                                 unsigned long addr, unsigned long end,
     200             :                                 unsigned long floor, unsigned long ceiling)
     201             : {
     202             :         pmd_t *pmd;
     203             :         unsigned long next;
     204             :         unsigned long start;
     205             : 
     206           0 :         start = addr;
     207           0 :         pmd = pmd_offset(pud, addr);
     208             :         do {
     209           0 :                 next = pmd_addr_end(addr, end);
     210           0 :                 if (pmd_none_or_clear_bad(pmd))
     211           0 :                         continue;
     212           0 :                 free_pte_range(tlb, pmd, addr);
     213           0 :         } while (pmd++, addr = next, addr != end);
     214             : 
     215           0 :         start &= PUD_MASK;
     216           0 :         if (start < floor)
     217             :                 return;
     218           0 :         if (ceiling) {
     219           0 :                 ceiling &= PUD_MASK;
     220           0 :                 if (!ceiling)
     221             :                         return;
     222             :         }
     223           0 :         if (end - 1 > ceiling - 1)
     224             :                 return;
     225             : 
     226           0 :         pmd = pmd_offset(pud, start);
     227           0 :         pud_clear(pud);
     228           0 :         pmd_free_tlb(tlb, pmd, start);
     229           0 :         mm_dec_nr_pmds(tlb->mm);
     230             : }
     231             : 
     232           0 : static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
     233             :                                 unsigned long addr, unsigned long end,
     234             :                                 unsigned long floor, unsigned long ceiling)
     235             : {
     236             :         pud_t *pud;
     237             :         unsigned long next;
     238             :         unsigned long start;
     239             : 
     240           0 :         start = addr;
     241           0 :         pud = pud_offset(p4d, addr);
     242             :         do {
     243           0 :                 next = pud_addr_end(addr, end);
     244           0 :                 if (pud_none_or_clear_bad(pud))
     245           0 :                         continue;
     246           0 :                 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
     247           0 :         } while (pud++, addr = next, addr != end);
     248             : 
     249           0 :         start &= P4D_MASK;
     250             :         if (start < floor)
     251             :                 return;
     252             :         if (ceiling) {
     253             :                 ceiling &= P4D_MASK;
     254             :                 if (!ceiling)
     255             :                         return;
     256             :         }
     257             :         if (end - 1 > ceiling - 1)
     258             :                 return;
     259             : 
     260             :         pud = pud_offset(p4d, start);
     261             :         p4d_clear(p4d);
     262             :         pud_free_tlb(tlb, pud, start);
     263             :         mm_dec_nr_puds(tlb->mm);
     264             : }
     265             : 
     266             : static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
     267             :                                 unsigned long addr, unsigned long end,
     268             :                                 unsigned long floor, unsigned long ceiling)
     269             : {
     270             :         p4d_t *p4d;
     271             :         unsigned long next;
     272             :         unsigned long start;
     273             : 
     274           0 :         start = addr;
     275           0 :         p4d = p4d_offset(pgd, addr);
     276             :         do {
     277           0 :                 next = p4d_addr_end(addr, end);
     278           0 :                 if (p4d_none_or_clear_bad(p4d))
     279             :                         continue;
     280           0 :                 free_pud_range(tlb, p4d, addr, next, floor, ceiling);
     281           0 :         } while (p4d++, addr = next, addr != end);
     282             : 
     283           0 :         start &= PGDIR_MASK;
     284             :         if (start < floor)
     285             :                 return;
     286             :         if (ceiling) {
     287             :                 ceiling &= PGDIR_MASK;
     288             :                 if (!ceiling)
     289             :                         return;
     290             :         }
     291             :         if (end - 1 > ceiling - 1)
     292             :                 return;
     293             : 
     294             :         p4d = p4d_offset(pgd, start);
     295             :         pgd_clear(pgd);
     296             :         p4d_free_tlb(tlb, p4d, start);
     297             : }
     298             : 
     299             : /*
     300             :  * This function frees user-level page tables of a process.
     301             :  */
     302           0 : void free_pgd_range(struct mmu_gather *tlb,
     303             :                         unsigned long addr, unsigned long end,
     304             :                         unsigned long floor, unsigned long ceiling)
     305             : {
     306             :         pgd_t *pgd;
     307             :         unsigned long next;
     308             : 
     309             :         /*
     310             :          * The next few lines have given us lots of grief...
     311             :          *
     312             :          * Why are we testing PMD* at this top level?  Because often
     313             :          * there will be no work to do at all, and we'd prefer not to
     314             :          * go all the way down to the bottom just to discover that.
     315             :          *
     316             :          * Why all these "- 1"s?  Because 0 represents both the bottom
     317             :          * of the address space and the top of it (using -1 for the
     318             :          * top wouldn't help much: the masks would do the wrong thing).
     319             :          * The rule is that addr 0 and floor 0 refer to the bottom of
     320             :          * the address space, but end 0 and ceiling 0 refer to the top
     321             :          * Comparisons need to use "end - 1" and "ceiling - 1" (though
     322             :          * that end 0 case should be mythical).
     323             :          *
     324             :          * Wherever addr is brought up or ceiling brought down, we must
     325             :          * be careful to reject "the opposite 0" before it confuses the
     326             :          * subsequent tests.  But what about where end is brought down
     327             :          * by PMD_SIZE below? no, end can't go down to 0 there.
     328             :          *
     329             :          * Whereas we round start (addr) and ceiling down, by different
     330             :          * masks at different levels, in order to test whether a table
     331             :          * now has no other vmas using it, so can be freed, we don't
     332             :          * bother to round floor or end up - the tests don't need that.
     333             :          */
     334             : 
     335           0 :         addr &= PMD_MASK;
     336           0 :         if (addr < floor) {
     337           0 :                 addr += PMD_SIZE;
     338           0 :                 if (!addr)
     339             :                         return;
     340             :         }
     341           0 :         if (ceiling) {
     342           0 :                 ceiling &= PMD_MASK;
     343           0 :                 if (!ceiling)
     344             :                         return;
     345             :         }
     346           0 :         if (end - 1 > ceiling - 1)
     347           0 :                 end -= PMD_SIZE;
     348           0 :         if (addr > end - 1)
     349             :                 return;
     350             :         /*
     351             :          * We add page table cache pages with PAGE_SIZE,
     352             :          * (see pte_free_tlb()), flush the tlb if we need
     353             :          */
     354           0 :         tlb_change_page_size(tlb, PAGE_SIZE);
     355           0 :         pgd = pgd_offset(tlb->mm, addr);
     356             :         do {
     357           0 :                 next = pgd_addr_end(addr, end);
     358           0 :                 if (pgd_none_or_clear_bad(pgd))
     359             :                         continue;
     360             :                 free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
     361           0 :         } while (pgd++, addr = next, addr != end);
     362             : }
     363             : 
     364           0 : void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
     365             :                    struct vm_area_struct *vma, unsigned long floor,
     366             :                    unsigned long ceiling, bool mm_wr_locked)
     367             : {
     368           0 :         MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
     369             : 
     370             :         do {
     371           0 :                 unsigned long addr = vma->vm_start;
     372             :                 struct vm_area_struct *next;
     373             : 
     374             :                 /*
     375             :                  * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
     376             :                  * be 0.  This will underflow and is okay.
     377             :                  */
     378           0 :                 next = mas_find(&mas, ceiling - 1);
     379             : 
     380             :                 /*
     381             :                  * Hide vma from rmap and truncate_pagecache before freeing
     382             :                  * pgtables
     383             :                  */
     384             :                 if (mm_wr_locked)
     385             :                         vma_start_write(vma);
     386           0 :                 unlink_anon_vmas(vma);
     387           0 :                 unlink_file_vma(vma);
     388             : 
     389           0 :                 if (is_vm_hugetlb_page(vma)) {
     390             :                         hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
     391             :                                 floor, next ? next->vm_start : ceiling);
     392             :                 } else {
     393             :                         /*
     394             :                          * Optimization: gather nearby vmas into one call down
     395             :                          */
     396           0 :                         while (next && next->vm_start <= vma->vm_end + PMD_SIZE
     397           0 :                                && !is_vm_hugetlb_page(next)) {
     398           0 :                                 vma = next;
     399           0 :                                 next = mas_find(&mas, ceiling - 1);
     400             :                                 if (mm_wr_locked)
     401             :                                         vma_start_write(vma);
     402           0 :                                 unlink_anon_vmas(vma);
     403           0 :                                 unlink_file_vma(vma);
     404             :                         }
     405           0 :                         free_pgd_range(tlb, addr, vma->vm_end,
     406             :                                 floor, next ? next->vm_start : ceiling);
     407             :                 }
     408           0 :                 vma = next;
     409           0 :         } while (vma);
     410           0 : }
     411             : 
     412           0 : void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
     413             : {
     414           0 :         spinlock_t *ptl = pmd_lock(mm, pmd);
     415             : 
     416           0 :         if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
     417           0 :                 mm_inc_nr_ptes(mm);
     418             :                 /*
     419             :                  * Ensure all pte setup (eg. pte page lock and page clearing) are
     420             :                  * visible before the pte is made visible to other CPUs by being
     421             :                  * put into page tables.
     422             :                  *
     423             :                  * The other side of the story is the pointer chasing in the page
     424             :                  * table walking code (when walking the page table without locking;
     425             :                  * ie. most of the time). Fortunately, these data accesses consist
     426             :                  * of a chain of data-dependent loads, meaning most CPUs (alpha
     427             :                  * being the notable exception) will already guarantee loads are
     428             :                  * seen in-order. See the alpha page table accessors for the
     429             :                  * smp_rmb() barriers in page table walking code.
     430             :                  */
     431           0 :                 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
     432           0 :                 pmd_populate(mm, pmd, *pte);
     433           0 :                 *pte = NULL;
     434             :         }
     435           0 :         spin_unlock(ptl);
     436           0 : }
     437             : 
     438           0 : int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
     439             : {
     440           0 :         pgtable_t new = pte_alloc_one(mm);
     441           0 :         if (!new)
     442             :                 return -ENOMEM;
     443             : 
     444           0 :         pmd_install(mm, pmd, &new);
     445           0 :         if (new)
     446           0 :                 pte_free(mm, new);
     447             :         return 0;
     448             : }
     449             : 
     450           1 : int __pte_alloc_kernel(pmd_t *pmd)
     451             : {
     452           2 :         pte_t *new = pte_alloc_one_kernel(&init_mm);
     453           1 :         if (!new)
     454             :                 return -ENOMEM;
     455             : 
     456           1 :         spin_lock(&init_mm.page_table_lock);
     457           1 :         if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
     458           1 :                 smp_wmb(); /* See comment in pmd_install() */
     459           1 :                 pmd_populate_kernel(&init_mm, pmd, new);
     460           1 :                 new = NULL;
     461             :         }
     462           1 :         spin_unlock(&init_mm.page_table_lock);
     463           1 :         if (new)
     464           0 :                 pte_free_kernel(&init_mm, new);
     465             :         return 0;
     466             : }
     467             : 
     468           0 : static inline void init_rss_vec(int *rss)
     469             : {
     470           0 :         memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
     471           0 : }
     472             : 
     473           0 : static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
     474             : {
     475             :         int i;
     476             : 
     477           0 :         if (current->mm == mm)
     478             :                 sync_mm_rss(mm);
     479           0 :         for (i = 0; i < NR_MM_COUNTERS; i++)
     480           0 :                 if (rss[i])
     481           0 :                         add_mm_counter(mm, i, rss[i]);
     482           0 : }
     483             : 
     484             : /*
     485             :  * This function is called to print an error when a bad pte
     486             :  * is found. For example, we might have a PFN-mapped pte in
     487             :  * a region that doesn't allow it.
     488             :  *
     489             :  * The calling function must still handle the error.
     490             :  */
     491           0 : static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
     492             :                           pte_t pte, struct page *page)
     493             : {
     494           0 :         pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
     495           0 :         p4d_t *p4d = p4d_offset(pgd, addr);
     496           0 :         pud_t *pud = pud_offset(p4d, addr);
     497           0 :         pmd_t *pmd = pmd_offset(pud, addr);
     498             :         struct address_space *mapping;
     499             :         pgoff_t index;
     500             :         static unsigned long resume;
     501             :         static unsigned long nr_shown;
     502             :         static unsigned long nr_unshown;
     503             : 
     504             :         /*
     505             :          * Allow a burst of 60 reports, then keep quiet for that minute;
     506             :          * or allow a steady drip of one report per second.
     507             :          */
     508           0 :         if (nr_shown == 60) {
     509           0 :                 if (time_before(jiffies, resume)) {
     510           0 :                         nr_unshown++;
     511           0 :                         return;
     512             :                 }
     513           0 :                 if (nr_unshown) {
     514           0 :                         pr_alert("BUG: Bad page map: %lu messages suppressed\n",
     515             :                                  nr_unshown);
     516           0 :                         nr_unshown = 0;
     517             :                 }
     518           0 :                 nr_shown = 0;
     519             :         }
     520           0 :         if (nr_shown++ == 0)
     521           0 :                 resume = jiffies + 60 * HZ;
     522             : 
     523           0 :         mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
     524           0 :         index = linear_page_index(vma, addr);
     525             : 
     526           0 :         pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
     527             :                  current->comm,
     528             :                  (long long)pte_val(pte), (long long)pmd_val(*pmd));
     529           0 :         if (page)
     530           0 :                 dump_page(page, "bad pte");
     531           0 :         pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
     532             :                  (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
     533           0 :         pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
     534             :                  vma->vm_file,
     535             :                  vma->vm_ops ? vma->vm_ops->fault : NULL,
     536             :                  vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
     537             :                  mapping ? mapping->a_ops->read_folio : NULL);
     538           0 :         dump_stack();
     539           0 :         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
     540             : }
     541             : 
     542             : /*
     543             :  * vm_normal_page -- This function gets the "struct page" associated with a pte.
     544             :  *
     545             :  * "Special" mappings do not wish to be associated with a "struct page" (either
     546             :  * it doesn't exist, or it exists but they don't want to touch it). In this
     547             :  * case, NULL is returned here. "Normal" mappings do have a struct page.
     548             :  *
     549             :  * There are 2 broad cases. Firstly, an architecture may define a pte_special()
     550             :  * pte bit, in which case this function is trivial. Secondly, an architecture
     551             :  * may not have a spare pte bit, which requires a more complicated scheme,
     552             :  * described below.
     553             :  *
     554             :  * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
     555             :  * special mapping (even if there are underlying and valid "struct pages").
     556             :  * COWed pages of a VM_PFNMAP are always normal.
     557             :  *
     558             :  * The way we recognize COWed pages within VM_PFNMAP mappings is through the
     559             :  * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
     560             :  * set, and the vm_pgoff will point to the first PFN mapped: thus every special
     561             :  * mapping will always honor the rule
     562             :  *
     563             :  *      pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
     564             :  *
     565             :  * And for normal mappings this is false.
     566             :  *
     567             :  * This restricts such mappings to be a linear translation from virtual address
     568             :  * to pfn. To get around this restriction, we allow arbitrary mappings so long
     569             :  * as the vma is not a COW mapping; in that case, we know that all ptes are
     570             :  * special (because none can have been COWed).
     571             :  *
     572             :  *
     573             :  * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
     574             :  *
     575             :  * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
     576             :  * page" backing, however the difference is that _all_ pages with a struct
     577             :  * page (that is, those where pfn_valid is true) are refcounted and considered
     578             :  * normal pages by the VM. The disadvantage is that pages are refcounted
     579             :  * (which can be slower and simply not an option for some PFNMAP users). The
     580             :  * advantage is that we don't have to follow the strict linearity rule of
     581             :  * PFNMAP mappings in order to support COWable mappings.
     582             :  *
     583             :  */
     584           0 : struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
     585             :                             pte_t pte)
     586             : {
     587           0 :         unsigned long pfn = pte_pfn(pte);
     588             : 
                               /*
                                * When the architecture has a pte_special() bit, every path
                                * through this block either returns or jumps to check_pfn, so
                                * the code after the block only runs for the other case.
                                */
     589             :         if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
                                       /* Not special: only the pfn range check remains. */
     590             :                 if (likely(!pte_special(pte)))
     591             :                         goto check_pfn;
                                       /* Special pte: a vma hook, if present, picks the page. */
     592             :                 if (vma->vm_ops && vma->vm_ops->find_special_page)
     593             :                         return vma->vm_ops->find_special_page(vma, addr);
     594             :                 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
     595             :                         return NULL;
     596             :                 if (is_zero_pfn(pfn))
     597             :                         return NULL;
     598             :                 if (pte_devmap(pte))
     599             :                 /*
     600             :                  * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
     601             :                  * and will have refcounts incremented on their struct pages
     602             :                  * when they are inserted into PTEs, thus they are safe to
     603             :                  * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
     604             :                  * do not have refcounts. Example of legacy ZONE_DEVICE is
     605             :                  * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
     606             :                  */
     607             :                         return NULL;
     608             : 
                                       /*
                                        * A special pte that matched none of the cases above is
                                        * reported and treated as having no page.
                                        */
     609             :                 print_bad_pte(vma, addr, pte, NULL);
     610             :                 return NULL;
     611             :         }
     612             : 
     613             :         /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
     614             : 
     615           0 :         if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
     616           0 :                 if (vma->vm_flags & VM_MIXEDMAP) {
                                               /*
                                                * MIXEDMAP: a pfn that passes pfn_valid() goes
                                                * straight to "out", skipping both the zero-pfn
                                                * and the highest_memmap_pfn checks below.
                                                */
     617           0 :                         if (!pfn_valid(pfn))
     618             :                                 return NULL;
     619             :                         goto out;
     620             :                 } else {
                                               /*
                                                * PFNMAP: NULL if the pfn follows the linear
                                                * vm_pgoff rule, or if the vma is not a COW
                                                * mapping; otherwise fall through to the
                                                * common checks.
                                                */
     621             :                         unsigned long off;
     622           0 :                         off = (addr - vma->vm_start) >> PAGE_SHIFT;
     623           0 :                         if (pfn == vma->vm_pgoff + off)
     624             :                                 return NULL;
     625           0 :                         if (!is_cow_mapping(vma->vm_flags))
     626             :                                 return NULL;
     627             :                 }
     628             :         }
     629             : 
     630           0 :         if (is_zero_pfn(pfn))
     631             :                 return NULL;
     632             : 
     633             : check_pfn:
                               /* A pfn above highest_memmap_pfn is reported; no page is returned. */
     634           0 :         if (unlikely(pfn > highest_memmap_pfn)) {
     635           0 :                 print_bad_pte(vma, addr, pte, NULL);
     636           0 :                 return NULL;
     637             :         }
     638             : 
     639             :         /*
     640             :          * NOTE! We still have PageReserved() pages in the page tables.
     641             :          * eg. VDSO mappings can cause them to exist.
     642             :          */
     643             : out:
     644           0 :         return pfn_to_page(pfn);
     645             : }
     646             : 
     647           0 : struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
     648             :                             pte_t pte)
     649             : {
                               /*
                                * Folio-returning wrapper around vm_normal_page(): the page it
                                * finds is converted with page_folio(); NULL is passed through
                                * unchanged.
                                */
     650           0 :         struct page *page = vm_normal_page(vma, addr, pte);
     651             : 
     652           0 :         if (page)
     653           0 :                 return page_folio(page);
     654             :         return NULL;
     655             : }
     656             : 
     657             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     658             : struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
     659             :                                 pmd_t pmd)
     660             : {
                               /*
                                * Returns the struct page for the pfn held in @pmd, or NULL
                                * when one of the checks below rejects it.
                                */
     661             :         unsigned long pfn = pmd_pfn(pmd);
     662             : 
     663             :         /*
     664             :          * There is no pmd_special() but there may be special pmds, e.g.
     665             :          * in a direct-access (dax) mapping, so let's just replicate the
     666             :          * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
     667             :          */
     668             :         if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
     669             :                 if (vma->vm_flags & VM_MIXEDMAP) {
                                               /* A valid pfn skips every check below. */
     670             :                         if (!pfn_valid(pfn))
     671             :                                 return NULL;
     672             :                         goto out;
     673             :                 } else {
     674             :                         unsigned long off;
     675             :                         off = (addr - vma->vm_start) >> PAGE_SHIFT;
     676             :                         if (pfn == vma->vm_pgoff + off)
     677             :                                 return NULL;
     678             :                         if (!is_cow_mapping(vma->vm_flags))
     679             :                                 return NULL;
     680             :                 }
     681             :         }
     682             : 
     683             :         if (pmd_devmap(pmd))
     684             :                 return NULL;
     685             :         if (is_huge_zero_pmd(pmd))
     686             :                 return NULL;
                               /*
                                * Unlike vm_normal_page(), an out-of-range pfn is rejected
                                * here without calling print_bad_pte().
                                */
     687             :         if (unlikely(pfn > highest_memmap_pfn))
     688             :                 return NULL;
     689             : 
     690             :         /*
     691             :          * NOTE! We still have PageReserved() pages in the page tables.
     692             :          * eg. VDSO mappings can cause them to exist.
     693             :          */
     694             : out:
     695             :         return pfn_to_page(pfn);
     696             : }
     697             : #endif
     698             : 
     699             : static void restore_exclusive_pte(struct vm_area_struct *vma,
     700             :                                   struct page *page, unsigned long address,
     701             :                                   pte_t *ptep)
     702             : {
                               /*
                                * Replace the swap-format pte at @ptep with a pte that maps
                                * @page, built from vma->vm_page_prot. The only caller in view,
                                * try_restore_exclusive_pte(), holds the page lock across this
                                * call.
                                */
     703             :         pte_t orig_pte;
     704             :         pte_t pte;
     705             :         swp_entry_t entry;
     706             : 
     707             :         orig_pte = ptep_get(ptep);
                               /* The new pte starts out old; soft-dirty is carried over. */
     708             :         pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
     709             :         if (pte_swp_soft_dirty(orig_pte))
     710             :                 pte = pte_mksoft_dirty(pte);
     711             : 
     712             :         entry = pte_to_swp_entry(orig_pte);
                               /*
                                * uffd-wp takes precedence: the dirty/maybe_mkwrite step is
                                * applied only when the old pte was not uffd-wp and the entry
                                * was a writable device exclusive entry.
                                */
     713             :         if (pte_swp_uffd_wp(orig_pte))
     714             :                 pte = pte_mkuffd_wp(pte);
     715             :         else if (is_writable_device_exclusive_entry(entry))
     716             :                 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
     717             : 
                               /* A writable pte is only expected for an exclusive anon page. */
     718             :         VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page)));
     719             : 
     720             :         /*
     721             :          * No need to take a page reference as one was already
     722             :          * created when the swap entry was made.
     723             :          */
     724             :         if (PageAnon(page))
     725             :                 page_add_anon_rmap(page, vma, address, RMAP_NONE);
     726             :         else
     727             :                 /*
     728             :                  * Currently device exclusive access only supports anonymous
     729             :                  * memory so the entry shouldn't point to a filebacked page.
     730             :                  */
     731             :                 WARN_ON_ONCE(1);
     732             : 
                               /* The pte is installed even when the warning above fired. */
     733             :         set_pte_at(vma->vm_mm, address, ptep, pte);
     734             : 
     735             :         /*
     736             :          * No need to invalidate - it was non-present before. However
     737             :          * secondary CPUs may have mappings that need invalidating.
     738             :          */
     739             :         update_mmu_cache(vma, address, ptep);
     740             : }
     741             : 
     742             : /*
     743             :  * Tries to restore an exclusive pte if the page lock can be acquired without
     744             :  * sleeping.
     745             :  */
     746             : static int
     747             : try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
     748             :                         unsigned long addr)
     749             : {
                               /*
                                * Returns 0 once the pte has been restored, or -EBUSY when
                                * trylock_page() fails; in the latter case the pte is left
                                * untouched.
                                */
     750             :         swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
     751             :         struct page *page = pfn_swap_entry_to_page(entry);
     752             : 
     753             :         if (trylock_page(page)) {
     754             :                 restore_exclusive_pte(vma, page, addr, src_pte);
     755             :                 unlock_page(page);
     756             :                 return 0;
     757             :         }
     758             : 
     759             :         return -EBUSY;
     760             : }
     761             : 
     762             : /*
     763             :  * Copy one vm_area from one task to the other. Assumes that the page
     764             :  * tables already present in the new task have been cleared in the whole
     765             :  * range covered by this vma.
     766             :  */
     767             : 
     768             : static unsigned long
     769           0 : copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
     770             :                 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
     771             :                 struct vm_area_struct *src_vma, unsigned long addr, int *rss)
     772             : {
                               /*
                                * Copy one non-present (swap-format) pte from src to dst.
                                *
                                * Return values:
                                *   0        the entry was handled; dst_pte was written, except
                                *            for a pte marker that needs no copy.
                                *   -EIO     swap_duplicate() failed; nothing was written.
                                *   -EBUSY   a device exclusive entry could not be restored.
                                *   -ENOENT  a device exclusive entry was restored in src; the
                                *            caller is expected to copy it as a present pte.
                                *
                                * NOTE(review): negative errnos are returned through an
                                * unsigned long return type; copy_pte_range() stores the result
                                * in an int before comparing. Confirm whether the return type
                                * should be int.
                                */
     773           0 :         unsigned long vm_flags = dst_vma->vm_flags;
     774           0 :         pte_t orig_pte = ptep_get(src_pte);
     775           0 :         pte_t pte = orig_pte;
     776             :         struct page *page;
     777           0 :         swp_entry_t entry = pte_to_swp_entry(orig_pte);
     778             : 
     779           0 :         if (likely(!non_swap_entry(entry))) {
     780           0 :                 if (swap_duplicate(entry) < 0)
     781             :                         return -EIO;
     782             : 
     783             :                 /* make sure dst_mm is on swapoff's mmlist. */
     784           0 :                 if (unlikely(list_empty(&dst_mm->mmlist))) {
     785           0 :                         spin_lock(&mmlist_lock);
                                               /* Re-check now that mmlist_lock is held. */
     786           0 :                         if (list_empty(&dst_mm->mmlist))
     787           0 :                                 list_add(&dst_mm->mmlist,
     788             :                                                 &src_mm->mmlist);
     789             :                         spin_unlock(&mmlist_lock);
     790             :                 }
     791             :                 /* Mark the swap entry as shared. */
     792           0 :                 if (pte_swp_exclusive(orig_pte)) {
     793           0 :                         pte = pte_swp_clear_exclusive(orig_pte);
     794           0 :                         set_pte_at(src_mm, addr, src_pte, pte);
     795             :                 }
     796           0 :                 rss[MM_SWAPENTS]++;
     797           0 :         } else if (is_migration_entry(entry)) {
     798           0 :                 page = pfn_swap_entry_to_page(entry);
     799             : 
     800           0 :                 rss[mm_counter(page)]++;
     801             : 
     802           0 :                 if (!is_readable_migration_entry(entry) &&
     803           0 :                                 is_cow_mapping(vm_flags)) {
     804             :                         /*
     805             :                          * COW mappings require pages in both parent and child
     806             :                          * to be set to read. A previously exclusive entry is
     807             :                          * now shared.
     808             :                          */
     809           0 :                         entry = make_readable_migration_entry(
     810             :                                                         swp_offset(entry));
     811           0 :                         pte = swp_entry_to_pte(entry);
                                               /* Carry soft-dirty and uffd-wp over to the new pte. */
     812           0 :                         if (pte_swp_soft_dirty(orig_pte))
     813             :                                 pte = pte_swp_mksoft_dirty(pte);
     814             :                         if (pte_swp_uffd_wp(orig_pte))
     815             :                                 pte = pte_swp_mkuffd_wp(pte);
     816           0 :                         set_pte_at(src_mm, addr, src_pte, pte);
     817             :                 }
     818           0 :         } else if (is_device_private_entry(entry)) {
     819             :                 page = pfn_swap_entry_to_page(entry);
     820             : 
     821             :                 /*
     822             :                  * Update rss count even for unaddressable pages, as
     823             :                  * they should be treated just like normal pages in this
     824             :                  * respect.
     825             :                  *
     826             :                  * We will likely want to have some new rss counters
     827             :                  * for unaddressable pages, at some point. But for now
     828             :                  * keep things as they are.
     829             :                  */
     830             :                 get_page(page);
     831             :                 rss[mm_counter(page)]++;
     832             :                 /* Cannot fail as these pages cannot get pinned. */
     833             :                 BUG_ON(page_try_dup_anon_rmap(page, false, src_vma));
     834             : 
     835             :                 /*
     836             :                  * We do not preserve soft-dirty information, because so
     837             :                  * far, checkpoint/restore is the only feature that
     838             :                  * requires that. And checkpoint/restore does not work
     839             :                  * when a device driver is involved (you cannot easily
     840             :                  * save and restore device driver state).
     841             :                  */
     842             :                 if (is_writable_device_private_entry(entry) &&
     843             :                     is_cow_mapping(vm_flags)) {
     844             :                         entry = make_readable_device_private_entry(
     845             :                                                         swp_offset(entry));
     846             :                         pte = swp_entry_to_pte(entry);
     847             :                         if (pte_swp_uffd_wp(orig_pte))
     848             :                                 pte = pte_swp_mkuffd_wp(pte);
     849             :                         set_pte_at(src_mm, addr, src_pte, pte);
     850             :                 }
     851           0 :         } else if (is_device_exclusive_entry(entry)) {
     852             :                 /*
     853             :                  * Make device exclusive entries present by restoring the
     854             :                  * original entry then copying as for a present pte. Device
     855             :                  * exclusive entries currently only support private writable
     856             :                  * (ie. COW) mappings.
     857             :                  */
     858             :                 VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
     859             :                 if (try_restore_exclusive_pte(src_pte, src_vma, addr))
     860             :                         return -EBUSY;
     861             :                 return -ENOENT;
     862           0 :         } else if (is_pte_marker_entry(entry)) {
                                       /*
                                        * A marker is copied to dst only for a swapin-error
                                        * entry or when dst_vma has uffd-wp; either way this
                                        * branch returns 0 without touching the uffd-wp bit.
                                        */
     863           0 :                 if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
     864           0 :                         set_pte_at(dst_mm, addr, dst_pte, pte);
     865             :                 return 0;
     866             :         }
                               /*
                                * Common tail for the branches that fall through: drop the
                                * uffd-wp bit unless dst_vma has uffd-wp, then install the
                                * (possibly modified) swap pte in the destination.
                                */
     867           0 :         if (!userfaultfd_wp(dst_vma))
     868             :                 pte = pte_swp_clear_uffd_wp(pte);
     869           0 :         set_pte_at(dst_mm, addr, dst_pte, pte);
     870             :         return 0;
     871             : }
     872             : 
     873             : /*
     874             :  * Copy a present and normal page.
     875             :  *
     876             :  * NOTE! The usual case is that this isn't required;
     877             :  * instead, the caller can just increase the page refcount
     878             :  * and re-use the pte the traditional way.
     879             :  *
     880             :  * And if we need a pre-allocated page but don't yet have
     881             :  * one, return a negative error to let the preallocation
     882             :  * code know so that it can do so outside the page table
     883             :  * lock.
     884             :  */
     885             : static inline int
     886           0 : copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
     887             :                   pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
     888             :                   struct folio **prealloc, struct page *page)
     889             : {
     890             :         struct folio *new_folio;
     891             :         pte_t pte;
     892             : 
                               /* Without a preallocated folio nothing is modified. */
     893           0 :         new_folio = *prealloc;
     894           0 :         if (!new_folio)
     895             :                 return -EAGAIN;
     896             : 
     897             :         /*
     898             :          * We have a prealloc page, all good!  Take it
     899             :          * over and copy the page & arm it.
     900             :          */
                               /* Clearing *prealloc tells the caller the folio was consumed. */
     901           0 :         *prealloc = NULL;
     902           0 :         copy_user_highpage(&new_folio->page, page, addr, src_vma);
     903           0 :         __folio_mark_uptodate(new_folio);
     904           0 :         folio_add_new_anon_rmap(new_folio, dst_vma, addr);
     905           0 :         folio_add_lru_vma(new_folio, dst_vma);
     906           0 :         rss[MM_ANONPAGES]++;
     907             : 
     908             :         /* All done, just insert the new page copy in the child */
                               /*
                                * The pte is built from dst_vma->vm_page_prot and passed
                                * through pte_mkdirty() and maybe_mkwrite() before being set.
                                */
     909           0 :         pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
     910           0 :         pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
     911           0 :         if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
     912             :                 /* Uffd-wp needs to be delivered to dest pte as well */
     913             :                 pte = pte_mkuffd_wp(pte);
     914           0 :         set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
     915             :         return 0;
     916             : }
     917             : 
     918             : /*
     919             :  * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
     920             :  * is required to copy this pte.
     921             :  */
     922             : static inline int
     923           0 : copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
     924             :                  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
     925             :                  struct folio **prealloc)
     926             : {
     927           0 :         struct mm_struct *src_mm = src_vma->vm_mm;
     928           0 :         unsigned long vm_flags = src_vma->vm_flags;
     929           0 :         pte_t pte = ptep_get(src_pte);
     930             :         struct page *page;
     931             :         struct folio *folio;
     932             : 
                               /*
                                * folio is assigned only when vm_normal_page() returns a page;
                                * every later use of folio is guarded by a check of page.
                                */
     933           0 :         page = vm_normal_page(src_vma, addr, pte);
     934           0 :         if (page)
     935           0 :                 folio = page_folio(page);
     936           0 :         if (page && folio_test_anon(folio)) {
     937             :                 /*
     938             :                  * If this page may have been pinned by the parent process,
     939             :                  * copy the page immediately for the child so that we'll always
     940             :                  * guarantee the pinned page won't be randomly replaced in the
     941             :                  * future.
     942             :                  */
     943           0 :                 folio_get(folio);
     944           0 :                 if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
     945             :                         /* Page may be pinned, we have to copy. */
                                               /*
                                                * The reference taken above is dropped and the
                                                * result of copy_present_page() (0 or -EAGAIN)
                                                * becomes this function's return value.
                                                */
     946           0 :                         folio_put(folio);
     947           0 :                         return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
     948             :                                                  addr, rss, prealloc, page);
     949             :                 }
     950           0 :                 rss[MM_ANONPAGES]++;
     951           0 :         } else if (page) {
                                       /* Non-anon page: take a reference and dup the file rmap. */
     952           0 :                 folio_get(folio);
     953           0 :                 page_dup_file_rmap(page, false);
     954           0 :                 rss[mm_counter_file(page)]++;
     955             :         }
                               /* With no normal page, only the pte itself is copied below. */
     956             : 
     957             :         /*
     958             :          * If it's a COW mapping, write protect it both
     959             :          * in the parent and the child
     960             :          */
     961           0 :         if (is_cow_mapping(vm_flags) && pte_write(pte)) {
     962           0 :                 ptep_set_wrprotect(src_mm, addr, src_pte);
     963             :                 pte = pte_wrprotect(pte);
     964             :         }
     965             :         VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
     966             : 
     967             :         /*
     968             :          * If it's a shared mapping, mark it clean in
     969             :          * the child
     970             :          */
     971           0 :         if (vm_flags & VM_SHARED)
     972             :                 pte = pte_mkclean(pte);
     973           0 :         pte = pte_mkold(pte);
     974             : 
                               /* The uffd-wp bit is kept only when dst_vma has uffd-wp. */
     975           0 :         if (!userfaultfd_wp(dst_vma))
     976             :                 pte = pte_clear_uffd_wp(pte);
     977             : 
     978           0 :         set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
     979             :         return 0;
     980             : }
     981             : 
     982             : static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
     983             :                 struct vm_area_struct *vma, unsigned long addr)
     984             : {
                               /*
                                * Allocate a folio for @vma/@addr and charge it to @src_mm.
                                * Returns the folio, or NULL if either the allocation or the
                                * charge fails.
                                */
     985             :         struct folio *new_folio;
     986             : 
     987           0 :         new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
     988           0 :         if (!new_folio)
     989             :                 return NULL;
     990             : 
                               /* A failed charge drops the folio reference before returning. */
     991           0 :         if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) {
     992             :                 folio_put(new_folio);
     993             :                 return NULL;
     994             :         }
     995           0 :         folio_throttle_swaprate(new_folio, GFP_KERNEL);
     996             : 
     997             :         return new_folio;
     998             : }
     999             : 
    1000             : static int
    1001           0 : copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
    1002             :                pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
    1003             :                unsigned long end)
    1004             : {
    1005           0 :         struct mm_struct *dst_mm = dst_vma->vm_mm;
    1006           0 :         struct mm_struct *src_mm = src_vma->vm_mm;
    1007             :         pte_t *orig_src_pte, *orig_dst_pte;
    1008             :         pte_t *src_pte, *dst_pte;
    1009             :         pte_t ptent;
    1010             :         spinlock_t *src_ptl, *dst_ptl;
    1011           0 :         int progress, ret = 0;
    1012             :         int rss[NR_MM_COUNTERS];
    1013           0 :         swp_entry_t entry = (swp_entry_t){0};
    1014           0 :         struct folio *prealloc = NULL;
    1015             : 
    1016             : again:
    1017           0 :         progress = 0;
    1018           0 :         init_rss_vec(rss);
    1019             : 
    1020             :         /*
    1021             :          * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
    1022             :          * error handling here, assume that exclusive mmap_lock on dst and src
    1023             :          * protects anon from unexpected THP transitions; with shmem and file
    1024             :          * protected by mmap_lock-less collapse skipping areas with anon_vma
    1025             :          * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
    1026             :          * can remove such assumptions later, but this is good enough for now.
    1027             :          */
    1028           0 :         dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
    1029           0 :         if (!dst_pte) {
    1030             :                 ret = -ENOMEM;
    1031             :                 goto out;
    1032             :         }
    1033           0 :         src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
    1034           0 :         if (!src_pte) {
    1035           0 :                 pte_unmap_unlock(dst_pte, dst_ptl);
    1036             :                 /* ret == 0 */
    1037             :                 goto out;
    1038             :         }
    1039           0 :         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
    1040           0 :         orig_src_pte = src_pte;
    1041           0 :         orig_dst_pte = dst_pte;
    1042             :         arch_enter_lazy_mmu_mode();
    1043             : 
    1044             :         do {
    1045             :                 /*
    1046             :                  * We are holding two locks at this point - either of them
    1047             :                  * could generate latencies in another task on another CPU.
    1048             :                  */
    1049           0 :                 if (progress >= 32) {
    1050           0 :                         progress = 0;
    1051           0 :                         if (need_resched() ||
    1052             :                             spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
    1053             :                                 break;
    1054             :                 }
    1055           0 :                 ptent = ptep_get(src_pte);
    1056           0 :                 if (pte_none(ptent)) {
    1057           0 :                         progress++;
    1058           0 :                         continue;
    1059             :                 }
    1060           0 :                 if (unlikely(!pte_present(ptent))) {
    1061           0 :                         ret = copy_nonpresent_pte(dst_mm, src_mm,
    1062             :                                                   dst_pte, src_pte,
    1063             :                                                   dst_vma, src_vma,
    1064             :                                                   addr, rss);
    1065           0 :                         if (ret == -EIO) {
    1066           0 :                                 entry = pte_to_swp_entry(ptep_get(src_pte));
    1067             :                                 break;
    1068           0 :                         } else if (ret == -EBUSY) {
    1069             :                                 break;
    1070           0 :                         } else if (!ret) {
    1071           0 :                                 progress += 8;
    1072           0 :                                 continue;
    1073             :                         }
    1074             : 
    1075             :                         /*
    1076             :                          * Device exclusive entry restored, continue by copying
    1077             :                          * the now present pte.
    1078             :                          */
    1079           0 :                         WARN_ON_ONCE(ret != -ENOENT);
    1080             :                 }
    1081             :                 /* copy_present_pte() will clear `*prealloc' if consumed */
    1082           0 :                 ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
    1083             :                                        addr, rss, &prealloc);
    1084             :                 /*
    1085             :                  * If we need a pre-allocated page for this pte, drop the
    1086             :                  * locks, allocate, and try again.
    1087             :                  */
    1088           0 :                 if (unlikely(ret == -EAGAIN))
    1089             :                         break;
    1090           0 :                 if (unlikely(prealloc)) {
    1091             :                         /*
    1092             :                          * pre-alloc page cannot be reused by next time so as
    1093             :                          * to strictly follow mempolicy (e.g., alloc_page_vma()
    1094             :                          * will allocate page according to address).  This
    1095             :                          * could only happen if one pinned pte changed.
    1096             :                          */
    1097           0 :                         folio_put(prealloc);
    1098           0 :                         prealloc = NULL;
    1099             :                 }
    1100           0 :                 progress += 8;
    1101           0 :         } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
    1102             : 
    1103             :         arch_leave_lazy_mmu_mode();
    1104           0 :         pte_unmap_unlock(orig_src_pte, src_ptl);
    1105           0 :         add_mm_rss_vec(dst_mm, rss);
    1106           0 :         pte_unmap_unlock(orig_dst_pte, dst_ptl);
    1107           0 :         cond_resched();
    1108             : 
    1109           0 :         if (ret == -EIO) {
    1110             :                 VM_WARN_ON_ONCE(!entry.val);
    1111           0 :                 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
    1112             :                         ret = -ENOMEM;
    1113             :                         goto out;
    1114             :                 }
    1115           0 :                 entry.val = 0;
    1116           0 :         } else if (ret == -EBUSY) {
    1117             :                 goto out;
    1118           0 :         } else if (ret ==  -EAGAIN) {
    1119           0 :                 prealloc = page_copy_prealloc(src_mm, src_vma, addr);
    1120           0 :                 if (!prealloc)
    1121             :                         return -ENOMEM;
    1122             :         } else if (ret) {
    1123             :                 VM_WARN_ON_ONCE(1);
    1124             :         }
    1125             : 
    1126             :         /* We've captured and resolved the error. Reset, try again. */
    1127           0 :         ret = 0;
    1128             : 
    1129           0 :         if (addr != end)
    1130             :                 goto again;
    1131             : out:
    1132           0 :         if (unlikely(prealloc))
    1133           0 :                 folio_put(prealloc);
    1134             :         return ret;
    1135             : }
    1136             : 
    1137             : static inline int
    1138           0 : copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
    1139             :                pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
    1140             :                unsigned long end)
    1141             : {
                               /*
                                * Copy the PMD-level entries covering [addr, end) from the
                                * table under src_pud to the table under dst_pud.  The
                                * destination PMD table is obtained with pmd_alloc() and both
                                * tables are then walked in lock step.
                                *
                                * Returns 0 on success, or -ENOMEM if pmd_alloc() fails, if
                                * copy_huge_pmd() returns -ENOMEM, or if copy_pte_range()
                                * returns any non-zero value.
                                */
    1142           0 :         struct mm_struct *dst_mm = dst_vma->vm_mm;
    1143           0 :         struct mm_struct *src_mm = src_vma->vm_mm;
    1144             :         pmd_t *src_pmd, *dst_pmd;
    1145             :         unsigned long next;
    1146             : 
    1147           0 :         dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
    1148           0 :         if (!dst_pmd)
    1149             :                 return -ENOMEM;
    1150           0 :         src_pmd = pmd_offset(src_pud, addr);
    1151             :         do {
    1152           0 :                 next = pmd_addr_end(addr, end);
                                       /*
                                        * Entries matching any of these predicates are first
                                        * offered to copy_huge_pmd().  A zero return means the
                                        * entry has been dealt with; any other value except
                                        * -ENOMEM falls through to the regular checks below.
                                        */
    1153           0 :                 if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
    1154           0 :                         || pmd_devmap(*src_pmd)) {
    1155             :                         int err;
    1156             :                         VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
    1157             :                         err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
    1158             :                                             addr, dst_vma, src_vma);
    1159             :                         if (err == -ENOMEM)
    1160             :                                 return -ENOMEM;
                                               /*
                                                * 'continue' in a do-while jumps to the loop
                                                * condition, so both pointers and addr still
                                                * advance to the next entry.
                                                */
    1161             :                         if (!err)
    1162             :                                 continue;
    1163             :                         /* fall through */
    1164             :                 }
    1165           0 :                 if (pmd_none_or_clear_bad(src_pmd))
    1166           0 :                         continue;
                                       /* Any failure from the pte level is reported as -ENOMEM. */
    1167           0 :                 if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
    1168             :                                    addr, next))
    1169             :                         return -ENOMEM;
    1170           0 :         } while (dst_pmd++, src_pmd++, addr = next, addr != end);
    1171             :         return 0;
    1172             : }
    1173             : 
    1174             : static inline int
    1175           0 : copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
    1176             :                p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
    1177             :                unsigned long end)
    1178             : {
                               /*
                                * Copy the PUD-level entries covering [addr, end) from the
                                * table under src_p4d to the table under dst_p4d.  The
                                * destination PUD table is obtained with pud_alloc() and both
                                * tables are then walked in lock step.
                                *
                                * Returns 0 on success, or -ENOMEM if pud_alloc() fails, if
                                * copy_huge_pud() returns -ENOMEM, or if copy_pmd_range()
                                * returns any non-zero value.
                                */
    1179           0 :         struct mm_struct *dst_mm = dst_vma->vm_mm;
    1180           0 :         struct mm_struct *src_mm = src_vma->vm_mm;
    1181             :         pud_t *src_pud, *dst_pud;
    1182             :         unsigned long next;
    1183             : 
    1184           0 :         dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
    1185           0 :         if (!dst_pud)
    1186             :                 return -ENOMEM;
    1187           0 :         src_pud = pud_offset(src_p4d, addr);
    1188             :         do {
    1189           0 :                 next = pud_addr_end(addr, end);
                                       /*
                                        * Huge/devmap entries are first offered to
                                        * copy_huge_pud(), which is passed src_vma only.  A zero
                                        * return means the entry has been dealt with; any other
                                        * value except -ENOMEM falls through to the checks below.
                                        */
    1190           0 :                 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
    1191             :                         int err;
    1192             : 
    1193             :                         VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
    1194             :                         err = copy_huge_pud(dst_mm, src_mm,
    1195             :                                             dst_pud, src_pud, addr, src_vma);
    1196             :                         if (err == -ENOMEM)
    1197             :                                 return -ENOMEM;
                                               /*
                                                * 'continue' jumps to the loop condition, which
                                                * advances both pointers and addr.
                                                */
    1198             :                         if (!err)
    1199             :                                 continue;
    1200             :                         /* fall through */
    1201             :                 }
    1202           0 :                 if (pud_none_or_clear_bad(src_pud))
    1203           0 :                         continue;
                                       /* Any failure from the pmd level is reported as -ENOMEM. */
    1204           0 :                 if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
    1205             :                                    addr, next))
    1206             :                         return -ENOMEM;
    1207           0 :         } while (dst_pud++, src_pud++, addr = next, addr != end);
    1208           0 :         return 0;
    1209             : }
    1210             : 
    1211             : static inline int
    1212             : copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
    1213             :                pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
    1214             :                unsigned long end)
    1215             : {
                               /*
                                * Copy the P4D-level entries covering [addr, end) from the
                                * table under src_pgd to the table under dst_pgd.  The
                                * destination P4D table is obtained with p4d_alloc() and both
                                * tables are then walked in lock step.
                                *
                                * Returns 0 on success, or -ENOMEM if p4d_alloc() fails or if
                                * copy_pud_range() returns any non-zero value.
                                */
    1216           0 :         struct mm_struct *dst_mm = dst_vma->vm_mm;
    1217             :         p4d_t *src_p4d, *dst_p4d;
    1218             :         unsigned long next;
    1219             : 
    1220           0 :         dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
    1221           0 :         if (!dst_p4d)
    1222             :                 return -ENOMEM;
    1223           0 :         src_p4d = p4d_offset(src_pgd, addr);
    1224             :         do {
    1225           0 :                 next = p4d_addr_end(addr, end);
                                       /*
                                        * Skip source entries rejected by
                                        * p4d_none_or_clear_bad(); 'continue' jumps to the loop
                                        * condition, so the pointers and addr still advance.
                                        */
    1226           0 :                 if (p4d_none_or_clear_bad(src_p4d))
    1227             :                         continue;
    1228           0 :                 if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
    1229             :                                    addr, next))
    1230             :                         return -ENOMEM;
    1231           0 :         } while (dst_p4d++, src_p4d++, addr = next, addr != end);
    1232             :         return 0;
    1233             : }
    1234             : 
    1235             : /*
    1236             :  * Return true if the vma needs to copy the pgtable during this fork().  Return
    1237             :  * false when we can speed up fork() by allowing lazy page faults later until
    1238             :  * when the child accesses the memory range.
                        *
                        * The checks below are evaluated in order and the first one that
                        * matches decides the result: uffd-wp on dst_vma, VM_PFNMAP or
                        * VM_MIXEDMAP on src_vma, then a non-NULL src_vma->anon_vma.
    1239             :  */
    1240             : static bool
    1241             : vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
    1242             : {
    1243             :         /*
    1244             :          * Always copy pgtables when dst_vma has uffd-wp enabled even if it's
    1245             :          * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable
    1246             :          * contains uffd-wp protection information, that's something we can't
    1247             :          * retrieve from page cache, and skip copying will lose those info.
    1248             :          */
    1249           0 :         if (userfaultfd_wp(dst_vma))
    1250             :                 return true;
    1251             : 
                               /*
                                * NOTE(review): presumably PFN-mapped / mixed-map ranges cannot
                                * be repopulated by a later page fault, hence the unconditional
                                * copy -- confirm.
                                */
    1252           0 :         if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
    1253             :                 return true;
    1254             : 
                               /*
                                * NOTE(review): presumably a non-NULL anon_vma means the vma may
                                * hold anonymous pages that are reachable only through these
                                * page tables -- confirm.
                                */
    1255           0 :         if (src_vma->anon_vma)
    1256             :                 return true;
    1257             : 
    1258             :         /*
    1259             :          * Don't copy ptes where a page fault will fill them correctly.  Fork
    1260             :          * becomes much lighter when there are big shared or private readonly
    1261             :          * mappings. The tradeoff is that copy_page_range is more efficient
    1262             :          * than faulting.
    1263             :          */
    1264             :         return false;
    1265             : }
    1266             : 
    1267             : int
    1268           0 : copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
    1269             : {
                               /*
                                * Copy the page tables for src_vma's range [vm_start, vm_end)
                                * from src_vma->vm_mm into dst_vma->vm_mm.
                                *
                                * Returns 0 immediately when vma_needs_copy() reports that no
                                * copy is needed, the result of copy_hugetlb_page_range() when
                                * is_vm_hugetlb_page(src_vma) is true, the error from
                                * track_pfn_copy() if that call fails, and otherwise 0 or
                                * -ENOMEM depending on whether every copy_p4d_range() call
                                * succeeded.
                                */
    1270             :         pgd_t *src_pgd, *dst_pgd;
    1271             :         unsigned long next;
    1272           0 :         unsigned long addr = src_vma->vm_start;
    1273           0 :         unsigned long end = src_vma->vm_end;
    1274           0 :         struct mm_struct *dst_mm = dst_vma->vm_mm;
    1275           0 :         struct mm_struct *src_mm = src_vma->vm_mm;
    1276             :         struct mmu_notifier_range range;
    1277             :         bool is_cow;
    1278             :         int ret;
    1279             : 
    1280           0 :         if (!vma_needs_copy(dst_vma, src_vma))
    1281             :                 return 0;
    1282             : 
    1283           0 :         if (is_vm_hugetlb_page(src_vma))
    1284             :                 return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
    1285             : 
    1286             :         if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
    1287             :                 /*
    1288             :                  * We do not free on error cases below as remove_vma
    1289             :                  * gets called on error from higher level routine
    1290             :                  */
    1291             :                 ret = track_pfn_copy(src_vma);
    1292             :                 if (ret)
    1293             :                         return ret;
    1294             :         }
    1295             : 
    1296             :         /*
    1297             :          * We need to invalidate the secondary MMU mappings only when
    1298             :          * there could be a permission downgrade on the ptes of the
    1299             :          * parent mm. And a permission downgrade will only happen if
    1300             :          * is_cow_mapping() returns true.
    1301             :          */
    1302           0 :         is_cow = is_cow_mapping(src_vma->vm_flags);
    1303             : 
    1304           0 :         if (is_cow) {
    1305           0 :                 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
    1306             :                                         0, src_mm, addr, end);
    1307           0 :                 mmu_notifier_invalidate_range_start(&range);
    1308             :                 /*
    1309             :                  * Disabling preemption is not needed for the write side, as
    1310             :                  * the read side doesn't spin, but goes to the mmap_lock.
    1311             :                  *
    1312             :                  * Use the raw variant of the seqcount_t write API to avoid
    1313             :                  * lockdep complaining about preemptibility.
    1314             :                  */
    1315           0 :                 mmap_assert_write_locked(src_mm);
    1316           0 :                 raw_write_seqcount_begin(&src_mm->write_protect_seq);
    1317             :         }
    1318             : 
    1319           0 :         ret = 0;
    1320           0 :         dst_pgd = pgd_offset(dst_mm, addr);
    1321           0 :         src_pgd = pgd_offset(src_mm, addr);
    1322             :         do {
    1323           0 :                 next = pgd_addr_end(addr, end);
    1324           0 :                 if (pgd_none_or_clear_bad(src_pgd))
    1325             :                         continue;
                                       /*
                                        * On failure, break instead of returning so that the
                                        * is_cow teardown after the loop still runs.  Any non-zero
                                        * result from copy_p4d_range() is reported as -ENOMEM.
                                        */
    1326           0 :                 if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
    1327             :                                             addr, next))) {
    1328             :                         untrack_pfn_clear(dst_vma);
    1329             :                         ret = -ENOMEM;
    1330             :                         break;
    1331             :                 }
    1332           0 :         } while (dst_pgd++, src_pgd++, addr = next, addr != end);
    1333             : 
                               /* Undo the is_cow setup above, in reverse order. */
    1334           0 :         if (is_cow) {
    1335           0 :                 raw_write_seqcount_end(&src_mm->write_protect_seq);
    1336           0 :                 mmu_notifier_invalidate_range_end(&range);
    1337             :         }
    1338             :         return ret;
    1339             : }
    1340             : 
    1341             : /* Whether we should zap all COWed (private) pages too */
                       /*
                        * @details may be NULL.  Returns true in that case, otherwise the
                        * value of details->even_cows.
                        */
    1342             : static inline bool should_zap_cows(struct zap_details *details)
    1343             : {
    1344             :         /* By default, zap all pages */
    1345           0 :         if (!details)
    1346             :                 return true;
    1347             : 
    1348             :         /* Or, we zap COWed pages only if the caller wants to */
    1349           0 :         return details->even_cows;
    1350             : }
    1351             : 
    1352             : /* Decides whether we should zap this page with the page pointer specified */
                       /*
                        * Returns true when should_zap_cows(details) is true or when @page is
                        * NULL; otherwise returns true only for pages for which PageAnon() is
                        * false.
                        */
    1353           0 : static inline bool should_zap_page(struct zap_details *details, struct page *page)
    1354             : {
    1355             :         /* If we can make a decision without *page.. */
    1356           0 :         if (should_zap_cows(details))
    1357             :                 return true;
    1358             : 
    1359             :         /* E.g. the caller passes NULL for the case of a zero page */
    1360           0 :         if (!page)
    1361             :                 return true;
    1362             : 
    1363             :         /* Otherwise we should only zap non-anon pages */
    1364           0 :         return !PageAnon(page);
    1365             : }
    1366             : 
    1367             : static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
    1368             : {
                               /*
                                * Returns false when @details is NULL; otherwise reports whether
                                * ZAP_FLAG_DROP_MARKER is set in details->zap_flags.
                                */
    1369             :         if (!details)
    1370             :                 return false;
    1371             : 
                               /* The masked flag value is converted to bool by the return type. */
    1372             :         return details->zap_flags & ZAP_FLAG_DROP_MARKER;
    1373             : }
    1374             : 
    1375             : /*
    1376             :  * This function makes sure that we'll replace the none pte with an uffd-wp
    1377             :  * swap special pte marker when necessary. Must be with the pgtable lock held.
                        *
                        * Nothing is done when vma_is_anonymous(vma) is true or when
                        * zap_drop_file_uffd_wp(details) is true; in every other case the
                        * decision is delegated to pte_install_uffd_wp_if_needed(), which is
                        * given the pte value @pteval supplied by the caller.
    1378             :  */
    1379             : static inline void
    1380             : zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
    1381             :                               unsigned long addr, pte_t *pte,
    1382             :                               struct zap_details *details, pte_t pteval)
    1383             : {
    1384             :         /* Zap on anonymous always means dropping everything */
    1385           0 :         if (vma_is_anonymous(vma))
    1386             :                 return;
    1387             : 
                               /* The caller explicitly asked for the marker to be dropped. */
    1388             :         if (zap_drop_file_uffd_wp(details))
    1389             :                 return;
    1390             : 
    1391             :         pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
    1392             : }
    1393             : 
    1394           0 : static unsigned long zap_pte_range(struct mmu_gather *tlb,
    1395             :                                 struct vm_area_struct *vma, pmd_t *pmd,
    1396             :                                 unsigned long addr, unsigned long end,
    1397             :                                 struct zap_details *details)
    1398             : {
                               /*
                                * Zap the ptes mapped by @pmd in [addr, end).  Pages that are
                                * unmapped are handed to __tlb_remove_page(), and per-counter
                                * rss deltas are accumulated locally and applied with
                                * add_mm_rss_vec() once the loop is done.
                                *
                                * Returns the address at which processing stopped.  This is
                                * @addr unchanged when pte_offset_map_lock() returns NULL, and
                                * may be short of @end when the loop breaks early (on
                                * need_resched() or on a non-zero __tlb_remove_page() return).
                                */
    1399           0 :         struct mm_struct *mm = tlb->mm;
    1400           0 :         int force_flush = 0;
    1401             :         int rss[NR_MM_COUNTERS];
    1402             :         spinlock_t *ptl;
    1403             :         pte_t *start_pte;
    1404             :         pte_t *pte;
    1405             :         swp_entry_t entry;
    1406             : 
    1407           0 :         tlb_change_page_size(tlb, PAGE_SIZE);
    1408           0 :         init_rss_vec(rss);
    1409           0 :         start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
    1410           0 :         if (!pte)
    1411             :                 return addr;
    1412             : 
    1413             :         flush_tlb_batched_pending(mm);
    1414             :         arch_enter_lazy_mmu_mode();
    1415             :         do {
    1416           0 :                 pte_t ptent = ptep_get(pte);
    1417             :                 struct page *page;
    1418             : 
    1419           0 :                 if (pte_none(ptent))
    1420           0 :                         continue;
    1421             : 
                                       /*
                                        * Stop early; addr still refers to this pte, which has
                                        * not been modified.
                                        */
    1422           0 :                 if (need_resched())
    1423             :                         break;
    1424             : 
    1425           0 :                 if (pte_present(ptent)) {
    1426             :                         unsigned int delay_rmap;
    1427             : 
    1428           0 :                         page = vm_normal_page(vma, addr, ptent);
    1429           0 :                         if (unlikely(!should_zap_page(details, page)))
    1430           0 :                                 continue;
    1431           0 :                         ptent = ptep_get_and_clear_full(mm, addr, pte,
    1432           0 :                                                         tlb->fullmm);
    1433           0 :                         tlb_remove_tlb_entry(tlb, pte, addr);
    1434           0 :                         zap_install_uffd_wp_if_needed(vma, addr, pte, details,
    1435             :                                                       ptent);
                                               /*
                                                * vm_normal_page() returned NULL: the pte is
                                                * already cleared and there is no page to account
                                                * for or release.
                                                */
    1436           0 :                         if (unlikely(!page))
    1437           0 :                                 continue;
    1438             : 
    1439           0 :                         delay_rmap = 0;
    1440           0 :                         if (!PageAnon(page)) {
    1441           0 :                                 if (pte_dirty(ptent)) {
    1442           0 :                                         set_page_dirty(page);
                                                               /*
                                                                * When tlb_delay_rmap() says so, the
                                                                * rmap removal below is skipped and a
                                                                * flush is forced before ptl is
                                                                * dropped (see the end of the function).
                                                                */
    1443             :                                         if (tlb_delay_rmap(tlb)) {
    1444             :                                                 delay_rmap = 1;
    1445             :                                                 force_flush = 1;
    1446             :                                         }
    1447             :                                 }
    1448           0 :                                 if (pte_young(ptent) && likely(vma_has_recency(vma)))
    1449           0 :                                         mark_page_accessed(page);
    1450             :                         }
    1451           0 :                         rss[mm_counter(page)]--;
    1452             :                         if (!delay_rmap) {
    1453           0 :                                 page_remove_rmap(page, vma, false);
    1454           0 :                                 if (unlikely(page_mapcount(page) < 0))
    1455           0 :                                         print_bad_pte(vma, addr, ptent, page);
    1456             :                         }
                                               /*
                                                * This pte has already been cleared, so step addr
                                                * past it by hand: 'break' skips the loop's own
                                                * addr += PAGE_SIZE.
                                                */
    1457           0 :                         if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
    1458             :                                 force_flush = 1;
    1459             :                                 addr += PAGE_SIZE;
    1460             :                                 break;
    1461             :                         }
    1462           0 :                         continue;
    1463             :                 }
    1464             : 
                                       /*
                                        * Not present: decode the pte as a swap entry and
                                        * dispatch on its type.  Branches that decide to keep the
                                        * entry 'continue'; all others fall out of the chain to
                                        * the common clearing code at the bottom of the loop.
                                        */
    1465           0 :                 entry = pte_to_swp_entry(ptent);
    1466           0 :                 if (is_device_private_entry(entry) ||
    1467           0 :                     is_device_exclusive_entry(entry)) {
    1468             :                         page = pfn_swap_entry_to_page(entry);
    1469             :                         if (unlikely(!should_zap_page(details, page)))
    1470             :                                 continue;
    1471             :                         /*
    1472             :                          * Both device private/exclusive mappings should only
    1473             :                          * work with anonymous page so far, so we don't need to
    1474             :                          * consider uffd-wp bit when zap. For more information,
    1475             :                          * see zap_install_uffd_wp_if_needed().
    1476             :                          */
    1477             :                         WARN_ON_ONCE(!vma_is_anonymous(vma));
    1478             :                         rss[mm_counter(page)]--;
    1479             :                         if (is_device_private_entry(entry))
    1480             :                                 page_remove_rmap(page, vma, false);
    1481             :                         put_page(page);
    1482           0 :                 } else if (!non_swap_entry(entry)) {
    1483             :                         /* Genuine swap entry, hence a private anon page */
    1484           0 :                         if (!should_zap_cows(details))
    1485           0 :                                 continue;
    1486           0 :                         rss[MM_SWAPENTS]--;
    1487           0 :                         if (unlikely(!free_swap_and_cache(entry)))
    1488           0 :                                 print_bad_pte(vma, addr, ptent, NULL);
    1489           0 :                 } else if (is_migration_entry(entry)) {
    1490           0 :                         page = pfn_swap_entry_to_page(entry);
    1491           0 :                         if (!should_zap_page(details, page))
    1492           0 :                                 continue;
    1493           0 :                         rss[mm_counter(page)]--;
    1494           0 :                 } else if (pte_marker_entry_uffd_wp(entry)) {
    1495             :                         /*
    1496             :                          * For anon: always drop the marker; for file: only
    1497             :                          * drop the marker if explicitly requested.
    1498             :                          */
    1499             :                         if (!vma_is_anonymous(vma) &&
    1500             :                             !zap_drop_file_uffd_wp(details))
    1501             :                                 continue;
    1502           0 :                 } else if (is_hwpoison_entry(entry) ||
    1503           0 :                            is_swapin_error_entry(entry)) {
    1504           0 :                         if (!should_zap_cows(details))
    1505           0 :                                 continue;
    1506             :                 } else {
    1507             :                         /* We should have covered all the swap entry types */
    1508           0 :                         WARN_ON_ONCE(1);
    1509             :                 }
                                       /* Common tail for every non-present entry being dropped. */
    1510           0 :                 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
    1511           0 :                 zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
    1512           0 :         } while (pte++, addr += PAGE_SIZE, addr != end);
    1513             : 
    1514           0 :         add_mm_rss_vec(mm, rss);
    1515             :         arch_leave_lazy_mmu_mode();
    1516             : 
    1517             :         /* Do the actual TLB flush before dropping ptl */
    1518           0 :         if (force_flush) {
    1519           0 :                 tlb_flush_mmu_tlbonly(tlb);
    1520           0 :                 tlb_flush_rmaps(tlb, vma);
    1521             :         }
                               /*
                                * Unlock via start_pte: pte itself may have been advanced by
                                * the loop.
                                */
    1522           0 :         pte_unmap_unlock(start_pte, ptl);
    1523             : 
    1524             :         /*
    1525             :          * If we forced a TLB flush (either due to running out of
    1526             :          * batch buffers or because we needed to flush dirty TLB
    1527             :          * entries before releasing the ptl), free the batched
    1528             :          * memory too. Come back again if we didn't do everything.
    1529             :          */
    1530           0 :         if (force_flush)
    1531           0 :                 tlb_flush_mmu(tlb);
    1532             : 
    1533             :         return addr;
    1534             : }
    1535             : 
    1536           0 : static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
    1537             :                                 struct vm_area_struct *vma, pud_t *pud,
    1538             :                                 unsigned long addr, unsigned long end,
    1539             :                                 struct zap_details *details)
    1540             : {
                               /*
                                * Zap the range [addr, end) under @pud, one PMD entry at a time.
                                *
                                * Unlike the copy_*_range() walkers, the loop condition here
                                * does not advance addr, so every path through the body sets
                                * addr itself.  The loop only exits once addr == end, and that
                                * value is returned.
                                */
    1541             :         pmd_t *pmd;
    1542             :         unsigned long next;
    1543             : 
    1544           0 :         pmd = pmd_offset(pud, addr);
    1545             :         do {
    1546           0 :                 next = pmd_addr_end(addr, end);
    1547           0 :                 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                                               /*
                                                * A range covering only part of the huge entry
                                                * gets the entry split; a full-size range is
                                                * offered to zap_huge_pmd(), and a non-zero return
                                                * from it means this entry is finished.
                                                */
    1548             :                         if (next - addr != HPAGE_PMD_SIZE)
    1549             :                                 __split_huge_pmd(vma, pmd, addr, false, NULL);
    1550             :                         else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
    1551             :                                 addr = next;
    1552             :                                 continue;
    1553             :                         }
    1554             :                         /* fall through */
    1555             :                 } else if (details && details->single_folio &&
    1556             :                            folio_test_pmd_mappable(details->single_folio) &&
    1557             :                            next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
    1558             :                         spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
    1559             :                         /*
    1560             :                          * Take and drop THP pmd lock so that we cannot return
    1561             :                          * prematurely, while zap_huge_pmd() has cleared *pmd,
    1562             :                          * but not yet decremented compound_mapcount().
    1563             :                          */
    1564             :                         spin_unlock(ptl);
    1565             :                 }
    1566           0 :                 if (pmd_none(*pmd)) {
    1567           0 :                         addr = next;
    1568           0 :                         continue;
    1569             :                 }
    1570           0 :                 addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
                                       /*
                                        * zap_pte_range() stopped before reaching next: cancel
                                        * the pmd++ in the loop condition so that the same entry
                                        * is revisited, starting from the new addr.
                                        */
    1571           0 :                 if (addr != next)
    1572           0 :                         pmd--;
    1573           0 :         } while (pmd++, cond_resched(), addr != end);
    1574             : 
    1575           0 :         return addr;
    1576             : }
    1577             : 
    1578           0 : static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
    1579             :                                 struct vm_area_struct *vma, p4d_t *p4d,
    1580             :                                 unsigned long addr, unsigned long end,
    1581             :                                 struct zap_details *details)
    1582             : {
                               /*
                                * Zap the range [addr, end) under @p4d, one PUD entry at a time.
                                * The loop only exits once addr == end, and that value is
                                * returned.
                                */
    1583             :         pud_t *pud;
    1584             :         unsigned long next;
    1585             : 
    1586           0 :         pud = pud_offset(p4d, addr);
    1587             :         do {
    1588           0 :                 next = pud_addr_end(addr, end);
    1589           0 :                 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
                                               /*
                                                * A range covering only part of the huge entry
                                                * gets the entry split; a full-size range is
                                                * offered to zap_huge_pud(), and a non-zero return
                                                * from it means this entry is finished.
                                                */
    1590             :                         if (next - addr != HPAGE_PUD_SIZE) {
    1591             :                                 mmap_assert_locked(tlb->mm);
    1592             :                                 split_huge_pud(vma, pud, addr);
    1593             :                         } else if (zap_huge_pud(tlb, vma, pud, addr))
    1594             :                                 goto next;
    1595             :                         /* fall through */
    1596             :                 }
                                       /*
                                        * 'continue' jumps straight to the loop condition, so the
                                        * cond_resched() at the 'next' label is skipped for
                                        * entries rejected here.
                                        */
    1597           0 :                 if (pud_none_or_clear_bad(pud))
    1598           0 :                         continue;
                                       /*
                                        * The value returned by zap_pmd_range() replaces next and
                                        * becomes addr in the loop condition.
                                        */
    1599           0 :                 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
    1600             : next:
    1601           0 :                 cond_resched();
    1602           0 :         } while (pud++, addr = next, addr != end);
    1603             : 
    1604           0 :         return addr;
    1605             : }
    1606             : /* Walk the P4D entries under @pgd covering [addr, end), calling zap_pud_range() for each entry that passes p4d_none_or_clear_bad(); returns the final addr. */
    1607             : static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
    1608             :                                 struct vm_area_struct *vma, pgd_t *pgd,
    1609             :                                 unsigned long addr, unsigned long end,
    1610             :                                 struct zap_details *details)
    1611             : {
    1612             :         p4d_t *p4d;
    1613             :         unsigned long next;
    1614             : 
    1615             :         p4d = p4d_offset(pgd, addr);
    1616             :         do {
    1617           0 :                 next = p4d_addr_end(addr, end);
    1618           0 :                 if (p4d_none_or_clear_bad(p4d))
    1619             :                         continue; /* the loop condition still advances p4d and addr */
    1620           0 :                 next = zap_pud_range(tlb, vma, p4d, addr, next, details);
    1621           0 :         } while (p4d++, addr = next, addr != end);
    1622             : 
    1623             :         return addr;
    1624             : }
    1625             : /* Zap the page-table range [addr, end) of vma->vm_mm, top level down, between tlb_start_vma() and tlb_end_vma(). */
    1626           0 : void unmap_page_range(struct mmu_gather *tlb,
    1627             :                              struct vm_area_struct *vma,
    1628             :                              unsigned long addr, unsigned long end,
    1629             :                              struct zap_details *details)
    1630             : {
    1631             :         pgd_t *pgd;
    1632             :         unsigned long next;
    1633             :         /* An empty or inverted range is treated as a caller bug. */
    1634           0 :         BUG_ON(addr >= end);
    1635           0 :         tlb_start_vma(tlb, vma);
    1636           0 :         pgd = pgd_offset(vma->vm_mm, addr);
    1637             :         do {
    1638           0 :                 next = pgd_addr_end(addr, end);
    1639           0 :                 if (pgd_none_or_clear_bad(pgd))
    1640             :                         continue; /* the loop condition still advances pgd and addr */
    1641           0 :                 next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
    1642           0 :         } while (pgd++, addr = next, addr != end);
    1643           0 :         tlb_end_vma(tlb, vma);
    1644           0 : }
    1645             : 
    1646             : /* Unmap the part of [start_addr, end_addr) that overlaps @vma; returns without doing anything when there is no overlap. */
    1647           0 : static void unmap_single_vma(struct mmu_gather *tlb,
    1648             :                 struct vm_area_struct *vma, unsigned long start_addr,
    1649             :                 unsigned long end_addr,
    1650             :                 struct zap_details *details, bool mm_wr_locked)
    1651             : {
    1652           0 :         unsigned long start = max(vma->vm_start, start_addr); /* clamp to the vma's start */
    1653             :         unsigned long end;
    1654             : 
    1655           0 :         if (start >= vma->vm_end)
    1656             :                 return; /* request begins at or beyond the end of this vma */
    1657           0 :         end = min(vma->vm_end, end_addr); /* clamp to the vma's end */
    1658           0 :         if (end <= vma->vm_start)
    1659             :                 return; /* request ends at or before the start of this vma */
    1660             :         /* File-backed vmas get uprobe_munmap() over the clamped range. */
    1661             :         if (vma->vm_file)
    1662             :                 uprobe_munmap(vma, start, end);
    1663             :         /* VM_PFNMAP vmas additionally get untrack_pfn(), with mm_wr_locked passed through. */
    1664             :         if (unlikely(vma->vm_flags & VM_PFNMAP))
    1665             :                 untrack_pfn(vma, 0, 0, mm_wr_locked);
    1666             : 
    1667           0 :         if (start != end) {
    1668           0 :                 if (unlikely(is_vm_hugetlb_page(vma))) {
    1669             :                         /*
    1670             :                          * It is undesirable to test vma->vm_file as it
    1671             :                          * should be non-null for valid hugetlb area.
    1672             :                          * However, vm_file will be NULL in the error
    1673             :                          * cleanup path of mmap_region. When
    1674             :                          * hugetlbfs ->mmap method fails,
    1675             :                          * mmap_region() nullifies vma->vm_file
    1676             :                          * before calling this function to clean up.
    1677             :                          * Since no pte has actually been setup, it is
    1678             :                          * safe to do nothing in this case.
    1679             :                          */
    1680             :                         if (vma->vm_file) {
    1681             :                                 zap_flags_t zap_flags = details ?
    1682             :                                     details->zap_flags : 0; /* a NULL @details means no zap flags */
    1683             :                                 __unmap_hugepage_range_final(tlb, vma, start, end,
    1684             :                                                              NULL, zap_flags);
    1685             :                         }
    1686             :                 } else /* not a hugetlb vma: walk the page tables */
    1687           0 :                         unmap_page_range(tlb, vma, start, end, details);
    1688             :         }
    1689             : }
    1690             : 
    1691             : /**
    1692             :  * unmap_vmas - unmap a range of memory covered by a list of vma's
    1693             :  * @tlb: address of the caller's struct mmu_gather
    1694             :  * @mt: the maple tree
    1695             :  * @vma: the starting vma
    1696             :  * @start_addr: virtual address at which to start unmapping
    1697             :  * @end_addr: virtual address at which to end unmapping
    1698             :  * @mm_wr_locked: passed through to unmap_single_vma() for every vma (presumably: mmap lock held for write - confirm)
    1699             :  *
    1700             :  * Unmap all pages in the vma list.
    1701             :  * Only addresses between @start_addr and @end_addr will be unmapped.
    1702             :  *
    1703             :  * The VMA list must be sorted in ascending virtual address order.
    1704             :  *
    1705             :  * unmap_vmas() assumes that the caller will flush the whole unmapped address
    1706             :  * range after unmap_vmas() returns.  So the only responsibility here is to
    1707             :  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
    1708             :  * drops the lock and schedules.
    1709             :  */
    1710           0 : void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
    1711             :                 struct vm_area_struct *vma, unsigned long start_addr,
    1712             :                 unsigned long end_addr, bool mm_wr_locked)
    1713             : {
    1714             :         struct mmu_notifier_range range;
    1715           0 :         struct zap_details details = {
    1716             :                 .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
    1717             :                 /* Careful - we need to zap private pages too! */
    1718             :                 .even_cows = true,
    1719             :         };
    1720           0 :         MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
    1721             :         /* The notifier start/end calls bracket the whole vma walk. */
    1722             :         mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
    1723             :                                 start_addr, end_addr);
    1724             :         mmu_notifier_invalidate_range_start(&range);
    1725             :         do {
    1726           0 :                 unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
    1727             :                                  mm_wr_locked);
    1728           0 :         } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); /* repeat until mas_find() yields no further vma */
    1729           0 :         mmu_notifier_invalidate_range_end(&range);
    1730           0 : }
    1731             : 
    1732             : /**
    1733             :  * zap_page_range_single - remove user pages in a given range
    1734             :  * @vma: vm_area_struct holding the applicable pages
    1735             :  * @address: starting address of pages to zap
    1736             :  * @size: number of bytes to zap
    1737             :  * @details: details of shared cache invalidation (zap_vma_ptes() passes NULL)
    1738             :  *
    1739             :  * The range must fit into one VMA.
    1740             :  */
    1741           0 : void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
    1742             :                 unsigned long size, struct zap_details *details)
    1743             : {
    1744           0 :         const unsigned long end = address + size;
    1745             :         struct mmu_notifier_range range;
    1746             :         struct mmu_gather tlb;
    1747             : 
    1748           0 :         lru_add_drain();
    1749           0 :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
    1750             :                                 address, end);
    1751           0 :         if (is_vm_hugetlb_page(vma)) /* may change range.start/range.end, see the comment below */
    1752             :                 adjust_range_if_pmd_sharing_possible(vma, &range.start,
    1753             :                                                      &range.end);
    1754           0 :         tlb_gather_mmu(&tlb, vma->vm_mm);
    1755           0 :         update_hiwater_rss(vma->vm_mm);
    1756           0 :         mmu_notifier_invalidate_range_start(&range);
    1757             :         /*
    1758             :          * unmap 'address-end' not 'range.start-range.end' as range
    1759             :          * could have been expanded for hugetlb pmd sharing.
    1760             :          */
    1761           0 :         unmap_single_vma(&tlb, vma, address, end, details, false); /* final argument is mm_wr_locked */
    1762           0 :         mmu_notifier_invalidate_range_end(&range);
    1763           0 :         tlb_finish_mmu(&tlb);
    1764           0 : }
    1765             : 
    1766             : /**
    1767             :  * zap_vma_ptes - remove ptes mapping the vma
    1768             :  * @vma: vm_area_struct holding ptes to be zapped
    1769             :  * @address: starting address of pages to zap
    1770             :  * @size: number of bytes to zap
    1771             :  *
    1772             :  * This function only unmaps ptes assigned to VM_PFNMAP vmas.
    1773             :  *
    1774             :  * The entire address range must be fully contained within the vma.
    1775             :  *
    1776             :  * If either condition does not hold, the function returns without zapping.
    1777           0 : void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
    1778             :                 unsigned long size)
    1779             : {
    1780           0 :         if (!range_in_vma(vma, address, address + size) ||
    1781           0 :                         !(vma->vm_flags & VM_PFNMAP))
    1782             :                 return;
    1783             : 
    1784           0 :         zap_page_range_single(vma, address, size, NULL);
    1785             : }
    1786             : EXPORT_SYMBOL_GPL(zap_vma_ptes);
    1787             : /* Return the pmd for @addr in @mm, obtained through p4d_alloc(), pud_alloc() and pmd_alloc(); NULL if any of them returns NULL. */
    1788           0 : static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
    1789             : {
    1790             :         pgd_t *pgd;
    1791             :         p4d_t *p4d;
    1792             :         pud_t *pud;
    1793             :         pmd_t *pmd;
    1794             : 
    1795           0 :         pgd = pgd_offset(mm, addr);
    1796           0 :         p4d = p4d_alloc(mm, pgd, addr);
    1797           0 :         if (!p4d)
    1798             :                 return NULL;
    1799           0 :         pud = pud_alloc(mm, p4d, addr);
    1800             :         if (!pud)
    1801             :                 return NULL;
    1802           0 :         pmd = pmd_alloc(mm, pud, addr);
    1803           0 :         if (!pmd)
    1804             :                 return NULL;
    1805             :         /* Assertion: the pmd handed back must not be a transparent huge pmd. */
    1806             :         VM_BUG_ON(pmd_trans_huge(*pmd));
    1807           0 :         return pmd;
    1808             : }
    1809             : /* Find the pmd for @addr with walk_to_pmd() and return what pte_alloc_map_lock() yields for it (@ptl is passed through); NULL if no pmd could be obtained. */
    1810           0 : pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
    1811             :                         spinlock_t **ptl)
    1812             : {
    1813           0 :         pmd_t *pmd = walk_to_pmd(mm, addr);
    1814             : 
    1815           0 :         if (!pmd)
    1816             :                 return NULL;
    1817           0 :         return pte_alloc_map_lock(mm, pmd, addr, ptl);
    1818             : }
    1819             : /* Return -EINVAL for anon, slab or typed pages; otherwise call flush_dcache_page() on @page and return 0. */
    1820           0 : static int validate_page_before_insert(struct page *page)
    1821             : {
    1822           0 :         if (PageAnon(page) || PageSlab(page) || page_has_type(page))
    1823             :                 return -EINVAL;
    1824             :         flush_dcache_page(page);
    1825             :         return 0;
    1826             : }
    1827             : /* Install @page at @pte for @addr; returns -EBUSY if the pte is not none, 0 otherwise. insert_page() and insert_pages() call this while holding the pte lock. */
    1828           0 : static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
    1829             :                         unsigned long addr, struct page *page, pgprot_t prot)
    1830             : {
    1831           0 :         if (!pte_none(ptep_get(pte)))
    1832             :                 return -EBUSY;
    1833             :         /* Ok, finally just insert the thing.. */
    1834           0 :         get_page(page); /* the new mapping takes its own page reference */
    1835           0 :         inc_mm_counter(vma->vm_mm, mm_counter_file(page));
    1836           0 :         page_add_file_rmap(page, vma, false);
    1837           0 :         set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
    1838             :         return 0;
    1839             : }
    1840             : 
    1841             : /*
    1842             :  * This is the old fallback for page remapping.
    1843             :  *
    1844             :  * For historical reasons, it only allows reserved pages. Only
    1845             :  * old drivers should use this, and they needed to mark their
    1846             :  * pages reserved for the old functions anyway.
    1847             :  */
    1848           0 : static int insert_page(struct vm_area_struct *vma, unsigned long addr,
    1849             :                         struct page *page, pgprot_t prot)
    1850             : {
    1851             :         int retval;
    1852             :         pte_t *pte;
    1853             :         spinlock_t *ptl;
    1854             :         /* Validate the page first; the pte is only looked up for acceptable pages. */
    1855           0 :         retval = validate_page_before_insert(page);
    1856           0 :         if (retval)
    1857             :                 goto out;
    1858           0 :         retval = -ENOMEM; /* returned if get_locked_pte() gives no pte */
    1859           0 :         pte = get_locked_pte(vma->vm_mm, addr, &ptl);
    1860           0 :         if (!pte)
    1861             :                 goto out;
    1862           0 :         retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
    1863           0 :         pte_unmap_unlock(pte, ptl); /* released whether or not the insert succeeded */
    1864             : out:
    1865           0 :         return retval;
    1866             : }
    1867             : 
    1868             : #ifdef pte_index
    1869           0 : static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
    1870             :                         unsigned long addr, struct page *page, pgprot_t prot)
    1871             : {
    1872             :         int err;
    1873             :         /* Per-page step used by insert_pages(): refcount check, page validation, then the actual insert. */
    1874           0 :         if (!page_count(page))
    1875             :                 return -EINVAL; /* pages with a zero refcount are refused */
    1876           0 :         err = validate_page_before_insert(page);
    1877           0 :         if (err)
    1878             :                 return err;
    1879           0 :         return insert_page_into_pte_locked(vma, pte, addr, page, prot);
    1880             : }
    1881             : 
    1882             : /* insert_pages() amortizes the cost of spinlock operations
    1883             :  * when inserting pages in a loop. Arch *must* define pte_index.
    1884             :  */
    1885           0 : static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
    1886             :                         struct page **pages, unsigned long *num, pgprot_t prot)
    1887             : {
    1888           0 :         pmd_t *pmd = NULL;
    1889             :         pte_t *start_pte, *pte;
    1890             :         spinlock_t *pte_lock;
    1891           0 :         struct mm_struct *const mm = vma->vm_mm;
    1892           0 :         unsigned long curr_page_idx = 0;
    1893           0 :         unsigned long remaining_pages_total = *num;
    1894             :         unsigned long pages_to_write_in_pmd;
    1895             :         int ret;
    1896             : more: /* re-entered once per pmd while pages remain */
    1897           0 :         ret = -EFAULT;
    1898           0 :         pmd = walk_to_pmd(mm, addr);
    1899           0 :         if (!pmd)
    1900             :                 goto out;
    1901             :         /* Limit this pass to the ptes left in the current pmd, counted from pte_index(addr). */
    1902           0 :         pages_to_write_in_pmd = min_t(unsigned long,
    1903             :                 remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
    1904             : 
    1905             :         /* Allocate the PTE if necessary; takes PMD lock once only. */
    1906           0 :         ret = -ENOMEM;
    1907           0 :         if (pte_alloc(mm, pmd))
    1908             :                 goto out;
    1909             : 
    1910           0 :         while (pages_to_write_in_pmd) {
    1911           0 :                 int pte_idx = 0;
    1912           0 :                 const int batch_size = min_t(int, pages_to_write_in_pmd, 8); /* at most 8 ptes per lock hold */
    1913             : 
    1914           0 :                 start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
    1915           0 :                 if (!start_pte) {
    1916             :                         ret = -EFAULT;
    1917             :                         goto out;
    1918             :                 }
    1919           0 :                 for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
    1920           0 :                         int err = insert_page_in_batch_locked(vma, pte,
    1921           0 :                                 addr, pages[curr_page_idx], prot);
    1922           0 :                         if (unlikely(err)) {
    1923           0 :                                 pte_unmap_unlock(start_pte, pte_lock);
    1924           0 :                                 ret = err;
    1925           0 :                                 remaining_pages_total -= pte_idx; /* count the pages already inserted in this batch */
    1926           0 :                                 goto out;
    1927             :                         }
    1928           0 :                         addr += PAGE_SIZE;
    1929           0 :                         ++curr_page_idx;
    1930             :                 }
    1931           0 :                 pte_unmap_unlock(start_pte, pte_lock);
    1932           0 :                 pages_to_write_in_pmd -= batch_size;
    1933           0 :                 remaining_pages_total -= batch_size;
    1934             :         }
    1935           0 :         if (remaining_pages_total)
    1936             :                 goto more;
    1937             :         ret = 0;
    1938             : out:
    1939           0 :         *num = remaining_pages_total; /* tell the caller how many pages were not inserted */
    1940           0 :         return ret;
    1941             : }
    1942             : #endif  /* ifdef pte_index */
    1943             : 
    1944             : /**
    1945             :  * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
    1946             :  * @vma: user vma to map to
    1947             :  * @addr: target start user address of these pages
    1948             :  * @pages: source kernel pages
    1949             :  * @num: in: number of pages to map. out: number of pages that were *not*
    1950             :  * mapped. (0 means all pages were successfully mapped).
    1951             :  *
    1952             :  * Preferred over vm_insert_page() when inserting multiple pages; the same
    1953             :  * restrictions apply as in vm_insert_page().
    1954             :  *
    1955             :  * In case of error, we may have mapped a subset of the provided pages. It is
    1956             :  * the caller's responsibility to account for this case.
    1957             :  * Return: 0 on success, negative error code otherwise.
    1958             :  */
    1959           0 : int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
    1960             :                         struct page **pages, unsigned long *num)
    1961             : {
    1962             : #ifdef pte_index
    1963           0 :         const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; /* address of the last byte to be mapped */
    1964             :         /* Both ends of the requested span must lie inside the vma. */
    1965           0 :         if (addr < vma->vm_start || end_addr >= vma->vm_end)
    1966             :                 return -EFAULT;
    1967           0 :         if (!(vma->vm_flags & VM_MIXEDMAP)) {
    1968           0 :                 BUG_ON(mmap_read_trylock(vma->vm_mm));
    1969           0 :                 BUG_ON(vma->vm_flags & VM_PFNMAP);
    1970           0 :                 vm_flags_set(vma, VM_MIXEDMAP);
    1971             :         }
    1972             :         /* Defer page refcount checking till we're about to map that page. */
    1973           0 :         return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
    1974             : #else
    1975             :         unsigned long idx = 0, pgcount = *num;
    1976             :         int err = -EINVAL; /* NOTE(review): returned unchanged when *num is 0, since the loop body never runs */
    1977             : 
    1978             :         for (; idx < pgcount; ++idx) {
    1979             :                 err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
    1980             :                 if (err)
    1981             :                         break;
    1982             :         }
    1983             :         *num = pgcount - idx; /* pages not inserted */
    1984             :         return err;
    1985             : #endif  /* ifdef pte_index */
    1986             : }
    1987             : EXPORT_SYMBOL(vm_insert_pages);
    1988             : 
    1989             : /**
    1990             :  * vm_insert_page - insert single page into user vma
    1991             :  * @vma: user vma to map to
    1992             :  * @addr: target user address of this page
    1993             :  * @page: source kernel page
    1994             :  *
    1995             :  * This allows drivers to insert individual pages they've allocated
    1996             :  * into a user vma.
    1997             :  *
    1998             :  * The page has to be a nice clean _individual_ kernel allocation.
    1999             :  * If you allocate a compound page, you need to have marked it as
    2000             :  * such (__GFP_COMP), or manually just split the page up yourself
    2001             :  * (see split_page()).
    2002             :  *
    2003             :  * NOTE! Traditionally this was done with "remap_pfn_range()" which
    2004             :  * took an arbitrary page protection parameter. This doesn't allow
    2005             :  * that. Your vma protection will have to be set up correctly, which
    2006             :  * means that if you want a shared writable mapping, you'd better
    2007             :  * ask for a shared writable mapping!
    2008             :  *
    2009             :  * The page does not need to be reserved.
    2010             :  *
    2011             :  * Usually this function is called from f_op->mmap() handler
    2012             :  * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
    2013             :  * Caller must set VM_MIXEDMAP on vma if it wants to call this
    2014             :  * function from other places, for example from page-fault handler.
    2015             :  *
    2016             :  * Return: %0 on success, negative error code otherwise.
    2017             :  */
    2018           0 : int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
    2019             :                         struct page *page)
    2020             : {
    2021           0 :         if (addr < vma->vm_start || addr >= vma->vm_end)
    2022             :                 return -EFAULT; /* addr lies outside the vma */
    2023           0 :         if (!page_count(page))
    2024             :                 return -EINVAL; /* pages with a zero refcount are refused */
    2025           0 :         if (!(vma->vm_flags & VM_MIXEDMAP)) { /* flag not set yet: set it here */
    2026           0 :                 BUG_ON(mmap_read_trylock(vma->vm_mm)); /* a successful read-trylock is treated as a bug */
    2027           0 :                 BUG_ON(vma->vm_flags & VM_PFNMAP); /* VM_PFNMAP vmas must not get here */
    2028           0 :                 vm_flags_set(vma, VM_MIXEDMAP);
    2029             :         }
    2030           0 :         return insert_page(vma, addr, page, vma->vm_page_prot);
    2031             : }
    2032             : EXPORT_SYMBOL(vm_insert_page);
    2033             : 
    2034             : /*
    2035             :  * __vm_map_pages - maps range of kernel pages into user vma
    2036             :  * @vma: user vma to map to
    2037             :  * @pages: pointer to array of source kernel pages
    2038             :  * @num: number of pages in page array
    2039             :  * @offset: user's requested vm_pgoff
    2040             :  *
    2041             :  * This allows drivers to map range of kernel pages into a user vma.
    2042             :  *
    2043             :  * Return: 0 on success and error code otherwise.
    2044             :  */
    2045           0 : static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
    2046             :                                 unsigned long num, unsigned long offset)
    2047             : {
    2048           0 :         unsigned long count = vma_pages(vma);
    2049           0 :         unsigned long uaddr = vma->vm_start;
    2050             :         int ret, i;
    2051             : 
    2052             :         /* Fail if the user requested offset is beyond the end of the object */
    2053           0 :         if (offset >= num)
    2054             :                 return -ENXIO;
    2055             : 
    2056             :         /* Fail if the user requested size exceeds available object size */
    2057           0 :         if (count > num - offset) /* offset < num was checked above, so the subtraction cannot wrap */
    2058             :                 return -ENXIO;
    2059             :         /* Insert pages[offset .. offset + count - 1] at consecutive page addresses from vm_start. */
    2060           0 :         for (i = 0; i < count; i++) { /* NOTE(review): i is int while count is unsigned long */
    2061           0 :                 ret = vm_insert_page(vma, uaddr, pages[offset + i]);
    2062           0 :                 if (ret < 0)
    2063             :                         return ret; /* pages inserted so far are left in place */
    2064           0 :                 uaddr += PAGE_SIZE;
    2065             :         }
    2066             : 
    2067             :         return 0;
    2068             : }
    2069             : 
    2070             : /**
    2071             :  * vm_map_pages - map range of kernel pages, honouring the vma's vm_pgoff
    2072             :  * @vma: user vma to map to
    2073             :  * @pages: pointer to array of source kernel pages
    2074             :  * @num: number of pages in page array
    2075             :  *
    2076             :  * Maps an object consisting of @num pages, catering for the user's
    2077             :  * requested vm_pgoff
    2078             :  *
    2079             :  * If we fail to insert any page into the vma, the function will return
    2080             :  * immediately leaving any previously inserted pages present.  Callers
    2081             :  * from the mmap handler may immediately return the error as their caller
    2082             :  * will destroy the vma, removing any successfully inserted pages. Other
    2083             :  * callers should make their own arrangements for calling unmap_region().
    2084             :  *
    2085             :  * Context: Process context. Called by mmap handlers.
    2086             :  * Return: 0 on success and error code otherwise.
    2087             :  */
    2088           0 : int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
    2089             :                                 unsigned long num)
    2090             : {
    2091           0 :         return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
    2092             : }
    2093             : EXPORT_SYMBOL(vm_map_pages);
    2094             : 
    2095             : /**
    2096             :  * vm_map_pages_zero - map range of kernel pages starting at offset zero
    2097             :  * @vma: user vma to map to
    2098             :  * @pages: pointer to array of source kernel pages
    2099             :  * @num: number of pages in page array
    2100             :  *
    2101             :  * Similar to vm_map_pages(), except that it explicitly sets the offset
    2102             :  * to 0. This function is intended for the drivers that did not consider
    2103             :  * vm_pgoff.
    2104             :  *
    2105             :  * Context: Process context. Called by mmap handlers.
    2106             :  * Return: 0 on success and error code otherwise.
    2107             :  */
    2108           0 : int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
    2109             :                                 unsigned long num)
    2110             : {
    2111           0 :         return __vm_map_pages(vma, pages, num, 0);
    2112             : }
    2113             : EXPORT_SYMBOL(vm_map_pages_zero);
    2114             : /* Map @pfn at @addr as a special (or devmap) pte; an already-present pte is only touched when @mkwrite and its pfn matches. Returns VM_FAULT_OOM if no pte could be obtained, VM_FAULT_NOPAGE otherwise. */
    2115           0 : static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
    2116             :                         pfn_t pfn, pgprot_t prot, bool mkwrite)
    2117             : {
    2118           0 :         struct mm_struct *mm = vma->vm_mm;
    2119             :         pte_t *pte, entry;
    2120             :         spinlock_t *ptl;
    2121             : 
    2122           0 :         pte = get_locked_pte(mm, addr, &ptl);
    2123           0 :         if (!pte)
    2124             :                 return VM_FAULT_OOM;
    2125           0 :         entry = ptep_get(pte);
    2126           0 :         if (!pte_none(entry)) { /* something is already mapped at addr */
    2127           0 :                 if (mkwrite) {
    2128             :                         /*
    2129             :                          * For read faults on private mappings the PFN passed
    2130             :                          * in may not match the PFN we have mapped if the
    2131             :                          * mapped PFN is a writeable COW page.  In the mkwrite
    2132             :                          * case we are creating a writable PTE for a shared
    2133             :                          * mapping and we expect the PFNs to match. If they
    2134             :                          * don't match, we are likely racing with block
    2135             :                          * allocation and mapping invalidation so just skip the
    2136             :                          * update.
    2137             :                          */
    2138           0 :                         if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
    2139           0 :                                 WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); /* only a zero-pfn mismatch passes without a warning */
    2140             :                                 goto out_unlock;
    2141             :                         }
    2142           0 :                         entry = pte_mkyoung(entry);
    2143           0 :                         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
    2144           0 :                         if (ptep_set_access_flags(vma, addr, pte, entry, 1))
    2145             :                                 update_mmu_cache(vma, addr, pte);
    2146             :                 }
    2147             :                 goto out_unlock; /* existing pte: never replaced by a new one */
    2148             :         }
    2149             : 
    2150             :         /* Ok, finally just insert the thing.. */
    2151           0 :         if (pfn_t_devmap(pfn))
    2152             :                 entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
    2153             :         else
    2154           0 :                 entry = pte_mkspecial(pfn_t_pte(pfn, prot));
    2155             : 
    2156           0 :         if (mkwrite) {
    2157           0 :                 entry = pte_mkyoung(entry);
    2158           0 :                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
    2159             :         }
    2160             : 
    2161           0 :         set_pte_at(mm, addr, pte, entry);
    2162             :         update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
    2163             : 
    2164             : out_unlock:
    2165           0 :         pte_unmap_unlock(pte, ptl);
    2166           0 :         return VM_FAULT_NOPAGE; /* also returned on the skip paths above */
    2167             : }
    2168             : 
    2169             : /**
    2170             :  * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
    2171             :  * @vma: user vma to map to
    2172             :  * @addr: target user address of this page
    2173             :  * @pfn: source kernel pfn
    2174             :  * @pgprot: pgprot flags for the inserted page
    2175             :  *
    2176             :  * This is exactly like vmf_insert_pfn(), except that it allows drivers
    2177             :  * to override pgprot on a per-page basis.
    2178             :  *
    2179             :  * This only makes sense for IO mappings, and it makes no sense for
    2180             :  * COW mappings.  In general, using multiple vmas is preferable;
    2181             :  * vmf_insert_pfn_prot should only be used if using multiple VMAs is
    2182             :  * impractical.
    2183             :  *
    2184             :  * pgprot typically only differs from @vma->vm_page_prot when drivers set
    2185             :  * caching- and encryption bits different than those of @vma->vm_page_prot,
    2186             :  * because the caching- or encryption mode may not be known at mmap() time.
    2187             :  *
    2188             :  * This is ok as long as @vma->vm_page_prot is not used by the core vm
    2189             :  * to set caching and encryption bits for those vmas (except for COW pages).
    2190             :  * This is ensured by core vm only modifying these page table entries using
    2191             :  * functions that don't touch caching- or encryption bits, using pte_modify()
    2192             :  * if needed. (See for example mprotect()).
    2193             :  *
    2194             :  * Also when new page-table entries are created, this is only done using the
    2195             :  * fault() callback, and never using the value of vma->vm_page_prot,
    2196             :  * except for page-table entries that point to anonymous pages as the result
    2197             :  * of COW.
    2198             :  *
    2199             :  * Context: Process context.  May allocate using %GFP_KERNEL.
    2200             :  * Return: VM_FAULT_SIGBUS if @addr lies outside @vma or if
                     :  * pfn_modify_allowed() rejects @pfn with @pgprot; otherwise the value
                     :  * returned by insert_pfn() for a non-mkwrite insertion.
    2201             :  */
    2202           0 : vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
    2203             :                         unsigned long pfn, pgprot_t pgprot)
    2204             : {
    2205             :         /*
    2206             :          * Technically, architectures with pte_special can avoid all these
    2207             :          * restrictions (same for remap_pfn_range).  However we would like
    2208             :          * consistency in testing and feature parity among all, so we should
    2209             :          * try to keep these invariants in place for everybody.
                     :          *
                     :          * The BUG_ON()s below enforce, in order: at least one of
                     :          * VM_PFNMAP and VM_MIXEDMAP is set; not both are set; a VM_PFNMAP
                     :          * vma is not a COW mapping; a VM_MIXEDMAP vma is not handed a
                     :          * @pfn for which pfn_valid() is true.
    2210             :          */
    2211           0 :         BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
    2212           0 :         BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
    2213             :                                                 (VM_PFNMAP|VM_MIXEDMAP));
    2214           0 :         BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
    2215           0 :         BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
    2216             : 
    2217           0 :         if (addr < vma->vm_start || addr >= vma->vm_end)
    2218             :                 return VM_FAULT_SIGBUS;
    2219             : 
    2220           0 :         if (!pfn_modify_allowed(pfn, pgprot))
    2221             :                 return VM_FAULT_SIGBUS;
    2222             : 
    2223           0 :         track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); /* pgprot passed by pointer */
    2224             : 
    2225           0 :         return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
    2226             :                         false);
    2227             : }
    2228             : EXPORT_SYMBOL(vmf_insert_pfn_prot);
    2229             : 
    2230             : /**
    2231             :  * vmf_insert_pfn - insert single pfn into user vma
    2232             :  * @vma: user vma to map to
    2233             :  * @addr: target user address of this page
    2234             :  * @pfn: source kernel pfn
    2235             :  *
    2236             :  * Similar to vm_insert_page, this allows drivers to insert individual pages
    2237             :  * they've allocated into a user vma. Same comments apply.
    2238             :  *
    2239             :  * This function should only be called from a vm_ops->fault handler, and
    2240             :  * in that case the handler should return the result of this function.
    2241             :  *
    2242             :  * vma cannot be a COW mapping.
    2243             :  *
    2244             :  * As this is called only for pages that do not currently exist, we
    2245             :  * do not need to flush old virtual caches or the TLB.
    2246             :  *
                     :  * This is vmf_insert_pfn_prot() called with @vma->vm_page_prot as the
                     :  * page protection, so the checks made there apply here unchanged.
                     :  *
    2247             :  * Context: Process context.  May allocate using %GFP_KERNEL.
    2248             :  * Return: the vm_fault_t value returned by vmf_insert_pfn_prot().
    2249             :  */
    2250           0 : vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
    2251             :                         unsigned long pfn)
    2252             : {
    2253           0 :         return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
    2254             : }
    2255             : EXPORT_SYMBOL(vmf_insert_pfn);
    2256             : /*
                     :  * vm_mixed_ok - precondition asserted by __vm_insert_mixed().
                     :  *
                     :  * Returns true when @vma has VM_MIXEDMAP set, or when @pfn is a devmap
                     :  * pfn, a special pfn, or the zero pfn.  Returns false otherwise.
                     :  */
    2257             : static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
    2258             : {
    2259             :         /* these checks mirror the abort conditions in vm_normal_page */
    2260           0 :         if (vma->vm_flags & VM_MIXEDMAP)
    2261             :                 return true;
    2262           0 :         if (pfn_t_devmap(pfn))
    2263             :                 return true;
    2264           0 :         if (pfn_t_special(pfn))
    2265             :                 return true;
    2266           0 :         if (is_zero_pfn(pfn_t_to_pfn(pfn)))
    2267             :                 return true;
    2268             :         return false;
    2269             : }
    2270             : /*
                     :  * __vm_insert_mixed - common body of vmf_insert_mixed() and
                     :  * vmf_insert_mixed_mkwrite().
                     :  *
                     :  * BUG()s unless vm_mixed_ok() accepts @vma and @pfn.  Starts from
                     :  * @vma->vm_page_prot, hands a pointer to it to track_pfn_insert(), and
                     :  * then inserts through one of two paths:
                     :  *
                     :  *  - insert_page(), when CONFIG_ARCH_HAS_PTE_SPECIAL is not enabled and
                     :  *    @pfn is a valid, non-devmap pfn.  @mkwrite is not used on this path.
                     :  *  - insert_pfn() otherwise, with @mkwrite forwarded.
                     :  *
                     :  * Returns VM_FAULT_SIGBUS if @addr is outside @vma or if
                     :  * pfn_modify_allowed() refuses the pfn.  On the insert_pfn() path the
                     :  * callee result is returned as is.  On the insert_page() path -ENOMEM
                     :  * becomes VM_FAULT_OOM, any other negative error except -EBUSY becomes
                     :  * VM_FAULT_SIGBUS, and everything else (including -EBUSY) becomes
                     :  * VM_FAULT_NOPAGE.
                     :  */
    2271           0 : static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
    2272             :                 unsigned long addr, pfn_t pfn, bool mkwrite)
    2273             : {
    2274           0 :         pgprot_t pgprot = vma->vm_page_prot;
    2275             :         int err;
    2276             : 
    2277           0 :         BUG_ON(!vm_mixed_ok(vma, pfn));
    2278             : 
    2279           0 :         if (addr < vma->vm_start || addr >= vma->vm_end)
    2280             :                 return VM_FAULT_SIGBUS;
    2281             : 
    2282           0 :         track_pfn_insert(vma, &pgprot, pfn);
    2283             : 
    2284           0 :         if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
    2285             :                 return VM_FAULT_SIGBUS;
    2286             : 
    2287             :         /*
    2288             :          * If we don't have pte special, then we have to use the pfn_valid()
    2289             :          * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
    2290             :          * refcount the page if pfn_valid is true (hence insert_page rather
    2291             :          * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
    2292             :          * without pte special, it would there be refcounted as a normal page.
    2293             :          */
    2294             :         if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
    2295           0 :             !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
    2296             :                 struct page *page;
    2297             : 
    2298             :                 /*
    2299             :                  * At this point we are committed to insert_page()
    2300             :                  * regardless of whether the caller specified flags that
    2301             :                  * result in pfn_t_has_page() == false.
    2302             :                  */
    2303           0 :                 page = pfn_to_page(pfn_t_to_pfn(pfn));
    2304           0 :                 err = insert_page(vma, addr, page, pgprot);
    2305             :         } else {
    2306           0 :                 return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
    2307             :         }
    2308             : 
    2309           0 :         if (err == -ENOMEM)
    2310             :                 return VM_FAULT_OOM;
    2311           0 :         if (err < 0 && err != -EBUSY) /* -EBUSY falls through to NOPAGE */
    2312             :                 return VM_FAULT_SIGBUS;
    2313             : 
    2314           0 :         return VM_FAULT_NOPAGE;
    2315             : }
    2316             : /*
                     :  * vmf_insert_mixed - insert @pfn at @addr in @vma through
                     :  * __vm_insert_mixed() with mkwrite == false.
                     :  *
                     :  * Returns whatever __vm_insert_mixed() returns.
                     :  */
    2317           0 : vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
    2318             :                 pfn_t pfn)
    2319             : {
    2320           0 :         return __vm_insert_mixed(vma, addr, pfn, false);
    2321             : }
    2322             : EXPORT_SYMBOL(vmf_insert_mixed);
    2323             : 
    2324             : /*
                     :  *  vmf_insert_mixed_mkwrite - same as vmf_insert_mixed(), but calls
                     :  *  __vm_insert_mixed() with mkwrite == true.
                     :  *
    2325             :  *  If the insertion of PTE failed because someone else already added a
    2326             :  *  different entry in the mean time, we treat that as success as we assume
    2327             :  *  the same entry was actually inserted.
                     :  *
                     :  *  Returns whatever __vm_insert_mixed() returns.
    2328             :  */
    2329           0 : vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
    2330             :                 unsigned long addr, pfn_t pfn)
    2331             : {
    2332           0 :         return __vm_insert_mixed(vma, addr, pfn, true);
    2333             : }
    2334             : EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
    2335             : 
    2336             : /*
    2337             :  * maps a range of physical memory into the requested pages. the old
    2338             :  * mappings are removed. any references to nonexistent pages results
    2339             :  * in null mappings (currently treated as "copy-on-access")
                     :  *
                     :  * NOTE(review): the description above does not match the code below,
                     :  * which removes nothing and BUG()s if a PTE in the range is already
                     :  * populated.
                     :  *
                     :  * What the code does: allocate/map and lock the PTE table for @addr under
                     :  * @pmd, then write one special PTE per page of [addr, end), using
                     :  * consecutive page frame numbers starting at @pfn with protection @prot.
                     :  *
                     :  * Returns -ENOMEM if pte_alloc_map_lock() fails and -EACCES as soon as
                     :  * pfn_modify_allowed() rejects a pfn (entries written before that point
                     :  * are not undone here); 0 otherwise.
    2340             :  */
    2341           0 : static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
    2342             :                         unsigned long addr, unsigned long end,
    2343             :                         unsigned long pfn, pgprot_t prot)
    2344             : {
    2345             :         pte_t *pte, *mapped_pte;
    2346             :         spinlock_t *ptl;
    2347           0 :         int err = 0;
    2348             : 
    2349           0 :         mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); /* mapped_pte keeps the start for the unlock */
    2350           0 :         if (!pte)
    2351             :                 return -ENOMEM;
    2352             :         arch_enter_lazy_mmu_mode();
    2353             :         do {
    2354           0 :                 BUG_ON(!pte_none(ptep_get(pte)));
    2355           0 :                 if (!pfn_modify_allowed(pfn, prot)) {
    2356             :                         err = -EACCES;
    2357             :                         break;
    2358             :                 }
    2359           0 :                 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
    2360           0 :                 pfn++;
    2361           0 :         } while (pte++, addr += PAGE_SIZE, addr != end);
    2362             :         arch_leave_lazy_mmu_mode();
    2363           0 :         pte_unmap_unlock(mapped_pte, ptl);
    2364           0 :         return err;
    2365             : }
    2366             : /*
                     :  * remap_pmd_range - PMD level of the remap_pfn_range_notrack() walk.
                     :  *
                     :  * Allocates the PMD table for @addr under @pud with pmd_alloc() and calls
                     :  * remap_pte_range() for each PMD-sized piece of [addr, end).
                     :  *
                     :  * @pfn is the page frame for @addr on entry.  It is first biased by
                     :  * -(addr >> PAGE_SHIFT) so that pfn + (addr >> PAGE_SHIFT) gives the
                     :  * frame for whatever value addr has later in the loop.
                     :  *
                     :  * Returns -ENOMEM if pmd_alloc() fails, the first non-zero value returned
                     :  * by remap_pte_range(), or 0.
                     :  */
    2367           0 : static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
    2368             :                         unsigned long addr, unsigned long end,
    2369             :                         unsigned long pfn, pgprot_t prot)
    2370             : {
    2371             :         pmd_t *pmd;
    2372             :         unsigned long next;
    2373             :         int err;
    2374             : 
    2375           0 :         pfn -= addr >> PAGE_SHIFT;
    2376           0 :         pmd = pmd_alloc(mm, pud, addr);
    2377           0 :         if (!pmd)
    2378             :                 return -ENOMEM;
    2379             :         VM_BUG_ON(pmd_trans_huge(*pmd));
    2380             :         do {
    2381           0 :                 next = pmd_addr_end(addr, end);
    2382           0 :                 err = remap_pte_range(mm, pmd, addr, next,
    2383           0 :                                 pfn + (addr >> PAGE_SHIFT), prot);
    2384           0 :                 if (err)
    2385             :                         return err;
    2386           0 :         } while (pmd++, addr = next, addr != end);
    2387             :         return 0;
    2388             : }
    2389             : /*
                     :  * remap_pud_range - PUD level of the remap_pfn_range_notrack() walk.
                     :  *
                     :  * Allocates the PUD table for @addr under @p4d with pud_alloc() and calls
                     :  * remap_pmd_range() for each PUD-sized piece of [addr, end).  @pfn is
                     :  * biased by -(addr >> PAGE_SHIFT) on entry so that
                     :  * pfn + (addr >> PAGE_SHIFT) tracks addr as the loop advances.
                     :  *
                     :  * Returns -ENOMEM if pud_alloc() fails, the first non-zero value returned
                     :  * by remap_pmd_range(), or 0.
                     :  */
    2390             : static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
    2391             :                         unsigned long addr, unsigned long end,
    2392             :                         unsigned long pfn, pgprot_t prot)
    2393             : {
    2394             :         pud_t *pud;
    2395             :         unsigned long next;
    2396             :         int err;
    2397             : 
    2398           0 :         pfn -= addr >> PAGE_SHIFT;
    2399           0 :         pud = pud_alloc(mm, p4d, addr);
    2400             :         if (!pud)
    2401             :                 return -ENOMEM;
    2402             :         do {
    2403           0 :                 next = pud_addr_end(addr, end);
    2404           0 :                 err = remap_pmd_range(mm, pud, addr, next,
    2405             :                                 pfn + (addr >> PAGE_SHIFT), prot);
    2406           0 :                 if (err)
    2407             :                         return err;
    2408           0 :         } while (pud++, addr = next, addr != end);
    2409             :         return 0;
    2410             : }
    2411             : /*
                     :  * remap_p4d_range - P4D level of the remap_pfn_range_notrack() walk.
                     :  *
                     :  * Allocates the P4D table for @addr under @pgd with p4d_alloc() and calls
                     :  * remap_pud_range() for each P4D-sized piece of [addr, end).  @pfn is
                     :  * biased by -(addr >> PAGE_SHIFT) on entry so that
                     :  * pfn + (addr >> PAGE_SHIFT) tracks addr as the loop advances.
                     :  *
                     :  * Returns -ENOMEM if p4d_alloc() fails, the first non-zero value returned
                     :  * by remap_pud_range(), or 0.
                     :  */
    2412           0 : static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
    2413             :                         unsigned long addr, unsigned long end,
    2414             :                         unsigned long pfn, pgprot_t prot)
    2415             : {
    2416             :         p4d_t *p4d;
    2417             :         unsigned long next;
    2418             :         int err;
    2419             : 
    2420           0 :         pfn -= addr >> PAGE_SHIFT;
    2421           0 :         p4d = p4d_alloc(mm, pgd, addr);
    2422           0 :         if (!p4d)
    2423             :                 return -ENOMEM;
    2424             :         do {
    2425           0 :                 next = p4d_addr_end(addr, end);
    2426           0 :                 err = remap_pud_range(mm, p4d, addr, next,
    2427             :                                 pfn + (addr >> PAGE_SHIFT), prot);
    2428           0 :                 if (err)
    2429             :                         return err;
    2430           0 :         } while (p4d++, addr = next, addr != end);
    2431           0 :         return 0;
    2432             : }
    2433             : 
    2434             : /*
    2435             :  * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
    2436             :  * must have pre-validated the caching bits of the pgprot_t.
                     :  *
                     :  * Maps PAGE_ALIGN(@size) bytes starting at @addr in @vma to consecutive
                     :  * page frames starting at @pfn, walking the page tables from the PGD
                     :  * level down through remap_p4d_range().
                     :  *
                     :  * Returns -EINVAL (with a one-time warning) if @addr is not page aligned,
                     :  * -EINVAL if @vma is a COW mapping and the range is not exactly
                     :  * [vma->vm_start, vma->vm_end), the first non-zero value returned by
                     :  * remap_p4d_range(), or 0.
                     :  *
                     :  * The vm_pgoff assignment (COW case) and the vm_flags_set() call happen
                     :  * before the walk and are not reverted in this function if the walk
                     :  * fails.
    2437             :  */
    2438           0 : int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
    2439             :                 unsigned long pfn, unsigned long size, pgprot_t prot)
    2440             : {
    2441             :         pgd_t *pgd;
    2442             :         unsigned long next;
    2443           0 :         unsigned long end = addr + PAGE_ALIGN(size);
    2444           0 :         struct mm_struct *mm = vma->vm_mm;
    2445             :         int err;
    2446             : 
    2447           0 :         if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
    2448             :                 return -EINVAL;
    2449             : 
    2450             :         /*
    2451             :          * Physically remapped pages are special. Tell the
    2452             :          * rest of the world about it:
    2453             :          *   VM_IO tells people not to look at these pages
    2454             :          *      (accesses can have side effects).
    2455             :          *   VM_PFNMAP tells the core MM that the base pages are just
    2456             :          *      raw PFN mappings, and do not have a "struct page" associated
    2457             :          *      with them.
    2458             :          *   VM_DONTEXPAND
    2459             :          *      Disable vma merging and expanding with mremap().
    2460             :          *   VM_DONTDUMP
    2461             :          *      Omit vma from core dump, even when VM_IO turned off.
    2462             :          *
    2463             :          * There's a horrible special case to handle copy-on-write
    2464             :          * behaviour that some programs depend on. We mark the "original"
    2465             :          * un-COW'ed pages by matching them up with "vma->vm_pgoff".
    2466             :          * See vm_normal_page() for details.
    2467             :          */
    2468           0 :         if (is_cow_mapping(vma->vm_flags)) {
    2469           0 :                 if (addr != vma->vm_start || end != vma->vm_end)
    2470             :                         return -EINVAL;
    2471           0 :                 vma->vm_pgoff = pfn;
    2472             :         }
    2473             : 
    2474           0 :         vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
    2475             : 
    2476           0 :         BUG_ON(addr >= end);
    2477           0 :         pfn -= addr >> PAGE_SHIFT; /* bias so pfn + (addr >> PAGE_SHIFT) follows addr */
    2478           0 :         pgd = pgd_offset(mm, addr);
    2479           0 :         flush_cache_range(vma, addr, end);
    2480             :         do {
    2481           0 :                 next = pgd_addr_end(addr, end);
    2482           0 :                 err = remap_p4d_range(mm, pgd, addr, next,
    2483           0 :                                 pfn + (addr >> PAGE_SHIFT), prot);
    2484           0 :                 if (err)
    2485             :                         return err;
    2486           0 :         } while (pgd++, addr = next, addr != end);
    2487             : 
    2488             :         return 0;
    2489             : }
    2490             : 
    2491             : /**
    2492             :  * remap_pfn_range - remap kernel memory to userspace
    2493             :  * @vma: user vma to map to
    2494             :  * @addr: target page aligned user address to start at
    2495             :  * @pfn: page frame number of kernel physical memory address
    2496             :  * @size: size of mapping area
    2497             :  * @prot: page protection flags for this mapping
    2498             :  *
                     :  * Calls track_pfn_remap() for the page-aligned size first, then
                     :  * remap_pfn_range_notrack().  If the latter fails, the tracking is
                     :  * dropped again with untrack_pfn().
                     :  *
    2499             :  * Note: this is only safe if the mm semaphore is held when called.
    2500             :  *
    2501             :  * Return: %0 on success, negative error code otherwise.  Any failure of
                     :  * track_pfn_remap() is reported as -EINVAL; a failure of
                     :  * remap_pfn_range_notrack() is returned unchanged.
    2502             :  */
    2503           0 : int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
    2504             :                     unsigned long pfn, unsigned long size, pgprot_t prot)
    2505             : {
    2506             :         int err;
    2507             : 
    2508           0 :         err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
    2509             :         if (err)
    2510             :                 return -EINVAL;
    2511             : 
    2512           0 :         err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
    2513             :         if (err)
    2514             :                 untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
    2515             :         return err;
    2516             : }
    2517             : EXPORT_SYMBOL(remap_pfn_range);
    2518             : 
    2519             : /**
    2520             :  * vm_iomap_memory - remap memory to userspace
    2521             :  * @vma: user vma to map to
    2522             :  * @start: start of the physical memory to be mapped
    2523             :  * @len: size of area
    2524             :  *
    2525             :  * This is a simplified io_remap_pfn_range() for common driver use. The
    2526             :  * driver just needs to give us the physical memory range to be mapped,
    2527             :  * we'll figure out the rest from the vma information.
    2528             :  *
    2529             :  * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
    2530             :  * whatever write-combining details or similar.
    2531             :  *
                     :  * The whole vma, [vma->vm_start, vma->vm_end), is mapped with
                     :  * vma->vm_page_prot, starting vma->vm_pgoff pages into the physical
                     :  * area.
                     :  *
    2532             :  * Return: %0 on success, negative error code otherwise.  -EINVAL is
                     :  * returned when @start + @len wraps, when the resulting pfn range wraps,
                     :  * when vma->vm_pgoff is larger than the number of pages in the area, or
                     :  * when the vma spans more pages than remain after skipping vm_pgoff
                     :  * pages.  Otherwise the result of io_remap_pfn_range() is returned.
    2533             :  */
    2534           0 : int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
    2535             : {
    2536             :         unsigned long vm_len, pfn, pages;
    2537             : 
    2538             :         /* Check that the physical memory area passed in looks valid */
    2539           0 :         if (start + len < start)
    2540             :                 return -EINVAL;
    2541             :         /*
    2542             :          * You *really* shouldn't map things that aren't page-aligned,
    2543             :          * but we've historically allowed it because IO memory might
    2544             :          * just have smaller alignment.
                     :          *
                     :          * The sub-page offset of start is added to len, and the page
                     :          * count is then len rounded up to whole pages.
    2545             :          */
    2546           0 :         len += start & ~PAGE_MASK;
    2547           0 :         pfn = start >> PAGE_SHIFT;
    2548           0 :         pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
    2549           0 :         if (pfn + pages < pfn)
    2550             :                 return -EINVAL;
    2551             : 
    2552             :         /* We start the mapping 'vm_pgoff' pages into the area */
    2553           0 :         if (vma->vm_pgoff > pages)
    2554             :                 return -EINVAL;
    2555           0 :         pfn += vma->vm_pgoff;
    2556           0 :         pages -= vma->vm_pgoff;
    2557             : 
    2558             :         /* Can we fit all of the mapping? */
    2559           0 :         vm_len = vma->vm_end - vma->vm_start;
    2560           0 :         if (vm_len >> PAGE_SHIFT > pages)
    2561             :                 return -EINVAL;
    2562             : 
    2563             :         /* Ok, let it rip */
    2564           0 :         return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
    2565             : }
    2566             : EXPORT_SYMBOL(vm_iomap_memory);
    2567             : /*
                     :  * apply_to_pte_range - call @fn on the PTEs covering [addr, end) under
                     :  * @pmd.
                     :  *
                     :  * With @create set the PTE table is allocated if needed and @fn is called
                     :  * for every PTE in the range.  Without @create only an existing table is
                     :  * looked up and @fn is called only for PTEs that are not pte_none().
                     :  *
                     :  * For init_mm the kernel variants of the allocate/offset helpers are
                     :  * used and no PTE lock is taken or released here; for any other mm the
                     :  * table is mapped and locked, and unmapped and unlocked before return.
                     :  *
                     :  * @fn may be NULL, in which case no PTE is visited.  PGTBL_PTE_MODIFIED
                     :  * is set in *@mask once the table has been obtained, whether or not @fn
                     :  * ran or failed.
                     :  *
                     :  * Returns -ENOMEM if the table cannot be allocated (@create), -EINVAL if
                     :  * it cannot be looked up (!@create), the first non-zero value returned by
                     :  * @fn, or 0.
                     :  */
    2568           0 : static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
    2569             :                                      unsigned long addr, unsigned long end,
    2570             :                                      pte_fn_t fn, void *data, bool create,
    2571             :                                      pgtbl_mod_mask *mask)
    2572             : {
    2573             :         pte_t *pte, *mapped_pte;
    2574           0 :         int err = 0;
    2575             :         spinlock_t *ptl;
    2576             : 
    2577           0 :         if (create) {
    2578           0 :                 mapped_pte = pte = (mm == &init_mm) ?
    2579           0 :                         pte_alloc_kernel_track(pmd, addr, mask) :
    2580           0 :                         pte_alloc_map_lock(mm, pmd, addr, &ptl);
    2581           0 :                 if (!pte)
    2582             :                         return -ENOMEM;
    2583             :         } else {
    2584           0 :                 mapped_pte = pte = (mm == &init_mm) ?
    2585           0 :                         pte_offset_kernel(pmd, addr) :
    2586             :                         pte_offset_map_lock(mm, pmd, addr, &ptl);
    2587           0 :                 if (!pte)
    2588             :                         return -EINVAL;
    2589             :         }
    2590             : 
    2591             :         arch_enter_lazy_mmu_mode();
    2592             : 
    2593           0 :         if (fn) {
    2594             :                 do {
    2595           0 :                         if (create || !pte_none(ptep_get(pte))) {
    2596           0 :                                 err = fn(pte, addr, data);
    2597           0 :                                 if (err)
    2598             :                                         break;
    2599             :                         }
                     :                         /*
                     :                          * Advance pte together with addr on every iteration.
                     :                          * Advancing it only when fn is called would leave pte
                     :                          * stuck on the first empty entry in the !create case,
                     :                          * so later populated entries would never be visited.
                     :                          */
    2600           0 :                 } while (pte++, addr += PAGE_SIZE, addr != end);
    2601             :         }
    2602           0 :         *mask |= PGTBL_PTE_MODIFIED;
    2603             : 
    2604             :         arch_leave_lazy_mmu_mode();
    2605             : 
    2606           0 :         if (mm != &init_mm)
    2607           0 :                 pte_unmap_unlock(mapped_pte, ptl);
    2608             :         return err;
    2609             : }
    2610             : /*
                     :  * apply_to_pmd_range - PMD level of the apply-to-page-range walk.
                     :  *
                     :  * BUG()s if *@pud is a huge PUD.  With @create the PMD table is allocated
                     :  * through pmd_alloc_track(); otherwise it is only looked up.  For each
                     :  * PMD entry covering [addr, end):
                     :  *
                     :  *  - an empty entry is skipped when !@create;
                     :  *  - a leaf entry triggers a one-time warning and an immediate -EINVAL;
                     :  *  - a bad non-empty entry triggers a one-time warning and is skipped
                     :  *    when !@create, or cleared with pmd_clear_bad() when @create;
                     :  *  - otherwise apply_to_pte_range() is called for that piece.
                     :  *
                     :  * Returns -ENOMEM if the table allocation fails, -EINVAL for a leaf
                     :  * entry, the first non-zero value returned by apply_to_pte_range(), or 0.
                     :  */
    2611           0 : static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
    2612             :                                      unsigned long addr, unsigned long end,
    2613             :                                      pte_fn_t fn, void *data, bool create,
    2614             :                                      pgtbl_mod_mask *mask)
    2615             : {
    2616             :         pmd_t *pmd;
    2617             :         unsigned long next;
    2618           0 :         int err = 0;
    2619             : 
    2620           0 :         BUG_ON(pud_huge(*pud));
    2621             : 
    2622           0 :         if (create) {
    2623           0 :                 pmd = pmd_alloc_track(mm, pud, addr, mask);
    2624           0 :                 if (!pmd)
    2625             :                         return -ENOMEM;
    2626             :         } else {
    2627           0 :                 pmd = pmd_offset(pud, addr);
    2628             :         }
    2629             :         do {
    2630           0 :                 next = pmd_addr_end(addr, end);
    2631           0 :                 if (pmd_none(*pmd) && !create)
    2632           0 :                         continue; /* still runs the while() step: pmd++, addr = next */
    2633           0 :                 if (WARN_ON_ONCE(pmd_leaf(*pmd)))
    2634             :                         return -EINVAL;
    2635           0 :                 if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
    2636           0 :                         if (!create)
    2637           0 :                                 continue;
    2638           0 :                         pmd_clear_bad(pmd);
    2639             :                 }
    2640           0 :                 err = apply_to_pte_range(mm, pmd, addr, next,
    2641             :                                          fn, data, create, mask);
    2642           0 :                 if (err)
    2643             :                         break;
    2644           0 :         } while (pmd++, addr = next, addr != end);
    2645             : 
    2646             :         return err;
    2647             : }
    2648             : /*
                     :  * apply_to_pud_range - PUD level of the apply-to-page-range walk.
                     :  *
                     :  * With @create the PUD table is allocated through pud_alloc_track();
                     :  * otherwise it is only looked up.  For each PUD entry covering
                     :  * [addr, end):
                     :  *
                     :  *  - an empty entry is skipped when !@create;
                     :  *  - a leaf entry triggers a one-time warning and an immediate -EINVAL;
                     :  *  - a bad non-empty entry triggers a one-time warning and is skipped
                     :  *    when !@create, or cleared with pud_clear_bad() when @create;
                     :  *  - otherwise apply_to_pmd_range() is called for that piece.
                     :  *
                     :  * Returns -ENOMEM if the table allocation fails, -EINVAL for a leaf
                     :  * entry, the first non-zero value returned by apply_to_pmd_range(), or 0.
                     :  */
    2649           0 : static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
    2650             :                                      unsigned long addr, unsigned long end,
    2651             :                                      pte_fn_t fn, void *data, bool create,
    2652             :                                      pgtbl_mod_mask *mask)
    2653             : {
    2654             :         pud_t *pud;
    2655             :         unsigned long next;
    2656           0 :         int err = 0;
    2657             : 
    2658           0 :         if (create) {
    2659           0 :                 pud = pud_alloc_track(mm, p4d, addr, mask);
    2660           0 :                 if (!pud)
    2661             :                         return -ENOMEM;
    2662             :         } else {
    2663             :                 pud = pud_offset(p4d, addr);
    2664             :         }
    2665             :         do {
    2666           0 :                 next = pud_addr_end(addr, end);
    2667           0 :                 if (pud_none(*pud) && !create)
    2668           0 :                         continue; /* still runs the while() step: pud++, addr = next */
    2669           0 :                 if (WARN_ON_ONCE(pud_leaf(*pud)))
    2670             :                         return -EINVAL;
    2671           0 :                 if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
    2672           0 :                         if (!create)
    2673           0 :                                 continue;
    2674             :                         pud_clear_bad(pud);
    2675             :                 }
    2676           0 :                 err = apply_to_pmd_range(mm, pud, addr, next,
    2677             :                                          fn, data, create, mask);
    2678           0 :                 if (err)
    2679             :                         break;
    2680           0 :         } while (pud++, addr = next, addr != end);
    2681             : 
    2682             :         return err;
    2683             : }
    2684             : /*
                     :  * apply_to_p4d_range - P4D level of the apply-to-page-range walk.
                     :  *
                     :  * With @create the P4D table is allocated through p4d_alloc_track();
                     :  * otherwise it is only looked up.  For each P4D entry covering
                     :  * [addr, end):
                     :  *
                     :  *  - an empty entry is skipped when !@create;
                     :  *  - a leaf entry triggers a one-time warning and an immediate -EINVAL;
                     :  *  - a bad non-empty entry triggers a one-time warning and is skipped
                     :  *    when !@create, or cleared with p4d_clear_bad() when @create;
                     :  *  - otherwise apply_to_pud_range() is called for that piece.
                     :  *
                     :  * Returns -ENOMEM if the table allocation fails, -EINVAL for a leaf
                     :  * entry, the first non-zero value returned by apply_to_pud_range(), or 0.
                     :  */
    2685             : static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
    2686             :                                      unsigned long addr, unsigned long end,
    2687             :                                      pte_fn_t fn, void *data, bool create,
    2688             :                                      pgtbl_mod_mask *mask)
    2689             : {
    2690             :         p4d_t *p4d;
    2691             :         unsigned long next;
    2692           0 :         int err = 0;
    2693             : 
    2694           0 :         if (create) {
    2695           0 :                 p4d = p4d_alloc_track(mm, pgd, addr, mask);
    2696           0 :                 if (!p4d)
    2697             :                         return -ENOMEM;
    2698             :         } else {
    2699             :                 p4d = p4d_offset(pgd, addr);
    2700             :         }
    2701             :         do {
    2702           0 :                 next = p4d_addr_end(addr, end);
    2703           0 :                 if (p4d_none(*p4d) && !create)
    2704             :                         continue; /* still runs the while() step: p4d++, addr = next */
    2705           0 :                 if (WARN_ON_ONCE(p4d_leaf(*p4d)))
    2706             :                         return -EINVAL;
    2707           0 :                 if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
    2708             :                         if (!create)
    2709             :                                 continue;
    2710             :                         p4d_clear_bad(p4d);
    2711             :                 }
    2712           0 :                 err = apply_to_pud_range(mm, p4d, addr, next,
    2713             :                                          fn, data, create, mask);
    2714             :                 if (err)
    2715             :                         break;
    2716             :         } while (p4d++, addr = next, addr != end);
    2717             : 
    2718             :         return err;
    2719             : }
    2720             : 
    2721           0 : static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
    2722             :                                  unsigned long size, pte_fn_t fn,
    2723             :                                  void *data, bool create)
    2724             : { /* Top-level walk over the pgd entries of @mm for [addr, addr + size); @create selects alloc vs. lookup-only in the lower levels. */
    2725             :         pgd_t *pgd;
    2726           0 :         unsigned long start = addr, next; /* start keeps the original addr for the sync call at the end */
    2727           0 :         unsigned long end = addr + size;
    2728           0 :         pgtbl_mod_mask mask = 0; /* passed by pointer to the lower levels, tested after the walk */
    2729           0 :         int err = 0;
    2730             : 
    2731           0 :         if (WARN_ON(addr >= end)) /* true for size == 0 and when addr + size wraps */
    2732             :                 return -EINVAL;
    2733             : 
    2734           0 :         pgd = pgd_offset(mm, addr);
    2735             :         do {
    2736           0 :                 next = pgd_addr_end(addr, end);
    2737           0 :                 if (pgd_none(*pgd) && !create)
    2738             :                         continue; /* continue still evaluates the while() expression, so pgd and addr advance */
    2739           0 :                 if (WARN_ON_ONCE(pgd_leaf(*pgd)))
    2740             :                         return -EINVAL; /* NOTE(review): unlike the break on err below, this return skips the arch_sync_kernel_mappings() check - confirm that is intended */
    2741           0 :                 if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
    2742             :                         if (!create)
    2743             :                                 continue; /* bad entry, not creating: warn once and skip it */
    2744             :                         pgd_clear_bad(pgd); /* bad entry, creating: clear it, then descend below */
    2745             :                 }
    2746           0 :                 err = apply_to_p4d_range(mm, pgd, addr, next,
    2747             :                                          fn, data, create, &mask);
    2748           0 :                 if (err)
    2749             :                         break; /* stop walking but still fall through to the sync check */
    2750           0 :         } while (pgd++, addr = next, addr != end);
    2751             : 
    2752             :         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
    2753             :                 arch_sync_kernel_mappings(start, start + size); /* whole requested range, even if the walk stopped early */
    2754             : 
    2755             :         return err;
    2756             : }
    2757             : 
    2758             : /*
    2759             :  * Scan a region of virtual memory, filling in page tables as necessary
    2760             :  * and calling a provided function on each leaf page table.
    2761             :  */
    2762           0 : int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
    2763             :                         unsigned long size, pte_fn_t fn, void *data)
    2764             : { /* Thin wrapper: returns whatever __apply_to_page_range() returns (0 or a negative errno). */
    2765           0 :         return __apply_to_page_range(mm, addr, size, fn, data, true); /* create == true */
    2766             : }
    2767             : EXPORT_SYMBOL_GPL(apply_to_page_range);
    2768             : 
    2769             : /*
    2770             :  * Scan a region of virtual memory, calling a provided function on
    2771             :  * each leaf page table where it exists.
    2772             :  *
    2773             :  * Unlike apply_to_page_range, this does _not_ fill in page tables
    2774             :  * where they are absent.
    2775             :  */
    2776           0 : int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
    2777             :                                  unsigned long size, pte_fn_t fn, void *data)
    2778             : { /* Thin wrapper: returns whatever __apply_to_page_range() returns (0 or a negative errno). */
    2779           0 :         return __apply_to_page_range(mm, addr, size, fn, data, false); /* create == false */
    2780             : }
    2781             : EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
    2782             : 
    2783             : /*
    2784             :  * handle_pte_fault chooses page fault handler according to an entry which was
    2785             :  * read non-atomically.  Before making any commitment, on those architectures
    2786             :  * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
    2787             :  * parts, do_swap_page must check under lock before unmapping the pte and
    2788             :  * proceeding (but do_wp_page is only called after already making such a check;
    2789             :  * and do_anonymous_page can safely check later on).
    2790             :  */
    2791             : static inline int pte_unmap_same(struct vm_fault *vmf)
    2792             : { /* Returns 1 if the pte still equals vmf->orig_pte (or the check is compiled out / skipped), 0 otherwise; always unmaps vmf->pte. */
    2793           0 :         int same = 1;
    2794             : #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
    2795             :         if (sizeof(pte_t) > sizeof(unsigned long)) { /* re-check only when pte_t is wider than unsigned long */
    2796             :                 spin_lock(vmf->ptl);
    2797             :                 same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
    2798             :                 spin_unlock(vmf->ptl);
    2799             :         }
    2800             : #endif
    2801           0 :         pte_unmap(vmf->pte);
    2802           0 :         vmf->pte = NULL; /* the mapping is gone, so do not leave a stale pointer in vmf */
    2803             :         return same;
    2804             : }
    2805             : 
    2806             : /*
    2807             :  * Return:
    2808             :  *      0:              copy succeeded
    2809             :  *      -EHWPOISON:     copy failed due to hwpoison in source page
    2810             :  *      -EAGAIN:        copy failed (some other reason)
    2811             :  */
    2812           0 : static inline int __wp_page_copy_user(struct page *dst, struct page *src,
    2813             :                                       struct vm_fault *vmf)
    2814             : {
    2815             :         int ret;
    2816             :         void *kaddr;
    2817             :         void __user *uaddr;
    2818           0 :         struct vm_area_struct *vma = vmf->vma;
    2819           0 :         struct mm_struct *mm = vma->vm_mm;
    2820           0 :         unsigned long addr = vmf->address;
    2821             : 
    2822           0 :         if (likely(src)) { /* common case: the source has a struct page, copy page to page */
    2823           0 :                 if (copy_mc_user_highpage(dst, src, addr, vma)) {
    2824             :                         memory_failure_queue(page_to_pfn(src), 0);
    2825             :                         return -EHWPOISON;
    2826             :                 }
    2827           0 :                 return 0; /* both returns above happen before kmap_atomic(), so nothing to undo */
    2828             :         }
    2829             : 
    2830             :         /*
    2831             :          * If the source page was a PFN mapping, we don't have
    2832             :          * a "struct page" for it. We do a best-effort copy by
    2833             :          * just copying from the original user address. If that
    2834             :          * fails, we just zero-fill it. Live with it.
    2835             :          */
    2836           0 :         kaddr = kmap_atomic(dst); /* paired with kunmap_atomic() at pte_unlock */
    2837           0 :         uaddr = (void __user *)(addr & PAGE_MASK);
    2838             : 
    2839             :         /*
    2840             :          * On architectures with software "accessed" bits, we would
    2841             :          * take a double page fault, so mark it accessed here.
    2842             :          */
    2843           0 :         vmf->pte = NULL; /* from here on, a non-NULL vmf->pte means the PTL is held */
    2844           0 :         if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
    2845             :                 pte_t entry;
    2846             : 
    2847           0 :                 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
    2848           0 :                 if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
    2849             :                         /*
    2850             :                          * Another thread has already handled the fault;
    2851             :                          * only update the local TLB.
    2852             :                          */
    2853             :                         if (vmf->pte)
    2854             :                                 update_mmu_tlb(vma, addr, vmf->pte);
    2855           0 :                         ret = -EAGAIN;
    2856           0 :                         goto pte_unlock;
    2857             :                 }
    2858             : 
    2859           0 :                 entry = pte_mkyoung(vmf->orig_pte);
    2860           0 :                 if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
    2861             :                         update_mmu_cache(vma, addr, vmf->pte);
    2862             :         }
    2863             : 
    2864             :         /*
    2865             :          * This really shouldn't fail, because the page is there
    2866             :          * in the page tables. But it might just be unreadable,
    2867             :          * in which case we just give up and fill the result with
    2868             :          * zeroes.
    2869             :          */
    2870           0 :         if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
    2871           0 :                 if (vmf->pte)
    2872             :                         goto warn; /* PTL already held and pte already verified above: no second attempt */
    2873             : 
    2874             :                 /* Re-validate under PTL if the page is still mapped */
    2875           0 :                 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
    2876           0 :                 if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
    2877             :                         /* The PTE changed under us, update local tlb */
    2878             :                         if (vmf->pte)
    2879             :                                 update_mmu_tlb(vma, addr, vmf->pte);
    2880             :                         ret = -EAGAIN;
    2881             :                         goto pte_unlock;
    2882             :                 }
    2883             : 
    2884             :                 /*
    2885             :                  * The same page can be mapped back since last copy attempt.
    2886             :                  * Try to copy again under PTL.
    2887             :                  */
    2888           0 :                 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
    2889             :                         /*
    2890             :                          * Give a warn in case there can be some obscure
    2891             :                          * use-case
    2892             :                          */
    2893             : warn:
    2894           0 :                         WARN_ON_ONCE(1);
    2895           0 :                         clear_page(kaddr); /* zero-fill fallback; the function still returns 0 */
    2896             :                 }
    2897             :         }
    2898             : 
    2899             :         ret = 0;
    2900             : 
    2901             : pte_unlock:
    2902           0 :         if (vmf->pte) /* only unlock if one of the pte_offset_map_lock() calls above succeeded */
    2903           0 :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    2904           0 :         kunmap_atomic(kaddr);
    2905           0 :         flush_dcache_page(dst); /* reached on both the 0 and the -EAGAIN path */
    2906             : 
    2907           0 :         return ret;
    2908             : }
    2909             : 
    2910             : static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
    2911             : { /* Pick the gfp mask for a fault on @vma: derived from the file's mapping if there is one, GFP_KERNEL otherwise. */
    2912           0 :         struct file *vm_file = vma->vm_file;
    2913             : 
    2914           0 :         if (vm_file)
    2915           0 :                 return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; /* mapping's mask with FS and IO forced on */
    2916             : 
    2917             :         /*
    2918             :          * Special mappings (e.g. VDSO) do not have any file so fake
    2919             :          * a default GFP_KERNEL for them.
    2920             :          */
    2921             :         return GFP_KERNEL;
    2922             : }
    2923             : 
    2924             : /*
    2925             :  * Notify the address space that the page is about to become writable so that
    2926             :  * it can prohibit this or wait for the page to get into an appropriate state.
    2927             :  *
    2928             :  * We do this without the lock held, so that it can sleep if it needs to.
    2929             :  */
    2930           0 : static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
    2931             : { /* Calls vma->vm_ops->page_mkwrite(); on success the result includes VM_FAULT_LOCKED, 0 means retry. */
    2932             :         vm_fault_t ret;
    2933           0 :         struct page *page = vmf->page;
    2934           0 :         unsigned int old_flags = vmf->flags;
    2935             : 
    2936           0 :         vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; /* flags the handler sees; restored after the call */
    2937             : 
    2938           0 :         if (vmf->vma->vm_file &&
    2939           0 :             IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
    2940             :                 return VM_FAULT_SIGBUS; /* NOTE(review): returns with vmf->flags still overwritten (old_flags not restored) - confirm callers do not rely on it */
    2941             : 
    2942           0 :         ret = vmf->vma->vm_ops->page_mkwrite(vmf);
    2943             :         /* Restore original flags so that caller is not surprised */
    2944           0 :         vmf->flags = old_flags;
    2945           0 :         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
    2946             :                 return ret;
    2947           0 :         if (unlikely(!(ret & VM_FAULT_LOCKED))) { /* handler did not report the page as locked: lock it here */
    2948           0 :                 lock_page(page);
    2949           0 :                 if (!page->mapping) { /* page has no mapping any more once we hold the lock */
    2950           0 :                         unlock_page(page);
    2951           0 :                         return 0; /* retry */
    2952             :                 }
    2953           0 :                 ret |= VM_FAULT_LOCKED;
    2954             :         } else
    2955             :                 VM_BUG_ON_PAGE(!PageLocked(page), page); /* handler reported LOCKED, so the page must be locked */
    2956             :         return ret;
    2957             : }
    2958             : 
    2959             : /*
    2960             :  * Handle dirtying of a page in shared file mapping on a write fault.
    2961             :  *
    2962             :  * The function expects the page to be locked and unlocks it.
    2963             :  */
    2964           0 : static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
    2965             : { /* Returns VM_FAULT_COMPLETED when maybe_unlock_mmap_for_io() handed back a file, 0 otherwise. */
    2966           0 :         struct vm_area_struct *vma = vmf->vma;
    2967             :         struct address_space *mapping;
    2968           0 :         struct page *page = vmf->page;
    2969             :         bool dirtied;
    2970           0 :         bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; /* does the vma have a ->page_mkwrite handler? */
    2971             : 
    2972           0 :         dirtied = set_page_dirty(page);
    2973             :         VM_BUG_ON_PAGE(PageAnon(page), page);
    2974             :         /*
    2975             :          * Take a local copy of the address_space - page.mapping may be zeroed
    2976             :          * by truncate after unlock_page().   The address_space itself remains
    2977             :          * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
    2978             :          * release semantics to prevent the compiler from undoing this copying.
    2979             :          */
    2980           0 :         mapping = page_rmapping(page);
    2981           0 :         unlock_page(page);
    2982             : 
    2983           0 :         if (!page_mkwrite)
    2984           0 :                 file_update_time(vma->vm_file); /* done here only when there is no ->page_mkwrite handler */
    2985             : 
    2986             :         /*
    2987             :          * Throttle page dirtying rate down to writeback speed.
    2988             :          *
    2989             :          * mapping may be NULL here because some device drivers do not
    2990             :          * set page.mapping but still dirty their pages
    2991             :          *
    2992             :          * Drop the mmap_lock before waiting on IO, if we can. The file
    2993             :          * is pinning the mapping, as per above.
    2994             :          */
    2995           0 :         if ((dirtied || page_mkwrite) && mapping) {
    2996             :                 struct file *fpin;
    2997             : 
    2998           0 :                 fpin = maybe_unlock_mmap_for_io(vmf, NULL);
    2999           0 :                 balance_dirty_pages_ratelimited(mapping);
    3000           0 :                 if (fpin) { /* presumably non-NULL only when mmap_lock was dropped - verify against the helper */
    3001           0 :                         fput(fpin);
    3002           0 :                         return VM_FAULT_COMPLETED;
    3003             :                 }
    3004             :         }
    3005             : 
    3006             :         return 0;
    3007             : }
    3008             : 
    3009             : /*
    3010             :  * Handle write page faults for pages that can be reused in the current vma
    3011             :  *
    3012             :  * This can happen either due to the mapping being with the VM_SHARED flag,
    3013             :  * or due to us being the last reference standing to the page. In either
    3014             :  * case, all we need to do here is to mark the page as writable and update
    3015             :  * any related book-keeping.
    3016             :  */
    3017           0 : static inline void wp_page_reuse(struct vm_fault *vmf)
    3018             :         __releases(vmf->ptl)
    3019             : { /* Entered with vmf->ptl held; the lock is dropped by pte_unmap_unlock() before returning. */
    3020           0 :         struct vm_area_struct *vma = vmf->vma;
    3021           0 :         struct page *page = vmf->page; /* may be NULL: every use below is guarded */
    3022             :         pte_t entry;
    3023             : 
    3024             :         VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
    3025             :         VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page));
    3026             : 
    3027             :         /*
    3028             :          * Clear the page's cpupid information as the existing
    3029             :          * information potentially belongs to a now completely
    3030             :          * unrelated process.
    3031             :          */
    3032             :         if (page)
    3033             :                 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
    3034             : 
    3035           0 :         flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
    3036           0 :         entry = pte_mkyoung(vmf->orig_pte); /* new entry starts from the original pte */
    3037           0 :         entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* dirty, then maybe_mkwrite() decides on the write bit */
    3038           0 :         if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
    3039             :                 update_mmu_cache(vma, vmf->address, vmf->pte);
    3040           0 :         pte_unmap_unlock(vmf->pte, vmf->ptl);
    3041           0 :         count_vm_event(PGREUSE);
    3042           0 : }
    3043             : 
    3044             : /*
    3045             :  * Handle the case of a page which we actually need to copy to a new page,
    3046             :  * either due to COW or unsharing.
    3047             :  *
    3048             :  * Called with mmap_lock locked and the old page referenced, but
    3049             :  * without the ptl held.
    3050             :  *
    3051             :  * High level logic flow:
    3052             :  *
    3053             :  * - Allocate a page, copy the content of the old page to the new one.
    3054             :  * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
    3055             :  * - Take the PTL. If the pte changed, bail out and release the allocated page
    3056             :  * - If the pte is still the way we remember it, update the page table and all
    3057             :  *   relevant references. This includes dropping the reference the page-table
    3058             :  *   held to the old page, as well as updating the rmap.
    3059             :  * - In any case, unlock the PTL and drop the reference we took to the old page.
    3060             :  */
    3061           0 : static vm_fault_t wp_page_copy(struct vm_fault *vmf)
    3062             : { /* Returns 0, VM_FAULT_HWPOISON (copy hit a poisoned source page) or VM_FAULT_OOM. */
    3063           0 :         const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
    3064           0 :         struct vm_area_struct *vma = vmf->vma;
    3065           0 :         struct mm_struct *mm = vma->vm_mm;
    3066           0 :         struct folio *old_folio = NULL; /* stays NULL when vmf->page is NULL */
    3067           0 :         struct folio *new_folio = NULL;
    3068             :         pte_t entry;
    3069           0 :         int page_copied = 0; /* set once the pte has been switched to the new folio */
    3070             :         struct mmu_notifier_range range;
    3071             :         int ret;
    3072             : 
    3073             :         delayacct_wpcopy_start(); /* every return path below calls delayacct_wpcopy_end() */
    3074             : 
    3075           0 :         if (vmf->page)
    3076           0 :                 old_folio = page_folio(vmf->page);
    3077           0 :         if (unlikely(anon_vma_prepare(vma)))
    3078             :                 goto oom;
    3079             : 
    3080           0 :         if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { /* zero-pfn source: take a zeroed folio, no copy step */
    3081           0 :                 new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
    3082           0 :                 if (!new_folio)
    3083             :                         goto oom;
    3084             :         } else {
    3085           0 :                 new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
    3086             :                                 vmf->address, false);
    3087           0 :                 if (!new_folio)
    3088             :                         goto oom;
    3089             : 
    3090           0 :                 ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
    3091           0 :                 if (ret) {
    3092             :                         /*
    3093             :                          * COW failed, if the fault was solved by other,
    3094             :                          * it's fine. If not, userspace would re-fault on
    3095             :                          * the same address and we will handle the fault
    3096             :                          * from the second attempt.
    3097             :                          * The -EHWPOISON case will not be retried.
    3098             :                          */
    3099           0 :                         folio_put(new_folio);
    3100           0 :                         if (old_folio)
    3101             :                                 folio_put(old_folio);
    3102             : 
    3103             :                         delayacct_wpcopy_end();
    3104           0 :                         return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; /* any other error (-EAGAIN) is reported as 0 */
    3105             :                 }
    3106             :                 kmsan_copy_page_meta(&new_folio->page, vmf->page);
    3107             :         }
    3108             : 
    3109           0 :         if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
    3110             :                 goto oom_free_new;
    3111           0 :         folio_throttle_swaprate(new_folio, GFP_KERNEL);
    3112             : 
    3113           0 :         __folio_mark_uptodate(new_folio);
    3114             : 
    3115           0 :         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
    3116             :                                 vmf->address & PAGE_MASK,
    3117             :                                 (vmf->address & PAGE_MASK) + PAGE_SIZE); /* exactly the one page containing the fault address */
    3118           0 :         mmu_notifier_invalidate_range_start(&range);
    3119             : 
    3120             :         /*
    3121             :          * Re-check the pte - we dropped the lock
    3122             :          */
    3123           0 :         vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
    3124           0 :         if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
    3125           0 :                 if (old_folio) {
    3126           0 :                         if (!folio_test_anon(old_folio)) { /* non-anon old page replaced by an anon one: move the count */
    3127           0 :                                 dec_mm_counter(mm, mm_counter_file(&old_folio->page));
    3128             :                                 inc_mm_counter(mm, MM_ANONPAGES);
    3129             :                         }
    3130             :                 } else {
    3131             :                         inc_mm_counter(mm, MM_ANONPAGES);
    3132             :                 }
    3133           0 :                 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
    3134           0 :                 entry = mk_pte(&new_folio->page, vma->vm_page_prot);
    3135             :                 entry = pte_sw_mkyoung(entry);
    3136           0 :                 if (unlikely(unshare)) { /* unshare: carry over soft-dirty/uffd-wp, never add write (see BUG_ON below) */
    3137             :                         if (pte_soft_dirty(vmf->orig_pte))
    3138             :                                 entry = pte_mksoft_dirty(entry);
    3139             :                         if (pte_uffd_wp(vmf->orig_pte))
    3140             :                                 entry = pte_mkuffd_wp(entry);
    3141             :                 } else {
    3142           0 :                         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
    3143             :                 }
    3144             : 
    3145             :                 /*
    3146             :                  * Clear the pte entry and flush it first, before updating the
    3147             :                  * pte with the new entry, to keep TLBs on different CPUs in
    3148             :                  * sync. This code used to set the new PTE then flush TLBs, but
    3149             :                  * that left a window where the new PTE could be loaded into
    3150             :                  * some TLBs while the old PTE remains in others.
    3151             :                  */
    3152           0 :                 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
    3153           0 :                 folio_add_new_anon_rmap(new_folio, vma, vmf->address);
    3154           0 :                 folio_add_lru_vma(new_folio, vma);
    3155             :                 /*
    3156             :                  * We call the notify macro here because, when using secondary
    3157             :                  * mmu page tables (such as kvm shadow page tables), we want the
    3158             :                  * new page to be mapped directly into the secondary page table.
    3159             :                  */
    3160           0 :                 BUG_ON(unshare && pte_write(entry));
    3161           0 :                 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
    3162             :                 update_mmu_cache(vma, vmf->address, vmf->pte);
    3163           0 :                 if (old_folio) {
    3164             :                         /*
    3165             :                          * Only after switching the pte to the new page may
    3166             :                          * we remove the mapcount here. Otherwise another
    3167             :                          * process may come and find the rmap count decremented
    3168             :                          * before the pte is switched to the new page, and
    3169             :                          * "reuse" the old page writing into it while our pte
    3170             :                          * here still points into it and can be read by other
    3171             :                          * threads.
    3172             :                          *
    3173             :                          * The critical issue is to order this
    3174             :                          * page_remove_rmap with the ptep_clear_flush above.
    3175             :                          * Those stores are ordered by (if nothing else,)
    3176             :                          * the barrier present in the atomic_add_negative
    3177             :                          * in page_remove_rmap.
    3178             :                          *
    3179             :                          * Then the TLB flush in ptep_clear_flush ensures that
    3180             :                          * no process can access the old page before the
    3181             :                          * decremented mapcount is visible. And the old page
    3182             :                          * cannot be reused until after the decremented
    3183             :                          * mapcount is visible. So transitively, TLBs to
    3184             :                          * old page will be flushed before it can be reused.
    3185             :                          */
    3186           0 :                         page_remove_rmap(vmf->page, vma, false);
    3187             :                 }
    3188             : 
    3189             :                 /* Free the old page.. */
    3190           0 :                 new_folio = old_folio; /* the folio_put(new_folio) below now acts on the old folio (skipped if NULL) */
    3191           0 :                 page_copied = 1;
    3192           0 :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    3193           0 :         } else if (vmf->pte) { /* pte changed while unlocked: keep the old mapping, new_folio is put below */
    3194           0 :                 update_mmu_tlb(vma, vmf->address, vmf->pte);
    3195           0 :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    3196             :         }
    3197             : 
    3198             :         /*
    3199             :          * No need to double call mmu_notifier->invalidate_range() callback as
    3200             :          * the above ptep_clear_flush_notify() did already call it.
    3201             :          */
    3202           0 :         mmu_notifier_invalidate_range_only_end(&range);
    3203             : 
    3204           0 :         if (new_folio)
    3205             :                 folio_put(new_folio);
    3206           0 :         if (old_folio) {
    3207           0 :                 if (page_copied)
    3208           0 :                         free_swap_cache(&old_folio->page);
    3209             :                 folio_put(old_folio);
    3210             :         }
    3211             : 
    3212             :         delayacct_wpcopy_end();
    3213             :         return 0;
    3214             : oom_free_new: /* charge failed: the new folio exists and must be put */
    3215             :         folio_put(new_folio);
    3216             : oom: /* nothing allocated yet (or already put above): only the old folio reference remains */
    3217           0 :         if (old_folio)
    3218             :                 folio_put(old_folio);
    3219             : 
    3220             :         delayacct_wpcopy_end();
    3221             :         return VM_FAULT_OOM;
    3222             : }
    3223             : 
    3224             : /**
    3225             :  * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
    3226             :  *                        writeable once the page is prepared
    3227             :  *
    3228             :  * @vmf: structure describing the fault
    3229             :  *
    3230             :  * This function handles all that is needed to finish a write page fault in a
    3231             :  * shared mapping due to PTE being read-only once the mapped page is prepared.
    3232             :  * It handles locking of PTE and modifying it.
    3233             :  *
    3234             :  * The function expects the page to be locked or other protection against
    3235             :  * concurrent faults / writeback (such as DAX radix tree locks).
    3236             :  *
    3237             :  * Return: %0 on success, %VM_FAULT_NOPAGE when the PTE could not be mapped
    3238             :  * and locked, or when it got changed before we acquired the PTE lock.
    3239             :  */
    3240           0 : vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
    3241             : {
    3242           0 :         WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); /* only meant for VM_SHARED mappings */
    3243           0 :         vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
    3244             :                                        &vmf->ptl);
    3245           0 :         if (!vmf->pte)
    3246             :                 return VM_FAULT_NOPAGE; /* pte_offset_map_lock() returned NULL */
    3247             :         /*
    3248             :          * We might have raced with another page fault while we released the
    3249             :          * pte_offset_map_lock.
    3250             :          */
    3251           0 :         if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
    3252           0 :                 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
    3253           0 :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    3254           0 :                 return VM_FAULT_NOPAGE;
    3255             :         }
    3256           0 :         wp_page_reuse(vmf); /* NOTE(review): presumably also unmaps/unlocks the PTE - confirm */
    3257           0 :         return 0;
    3258             : }
    3259             : 
    3260             : /*
    3261             :  * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
    3262             :  * mapping; ->pfn_mkwrite(), if provided, is called with the PTE lock dropped.
    3263             :  */
    3264           0 : static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
    3265             : {
    3266           0 :         struct vm_area_struct *vma = vmf->vma;
    3267             : 
    3268           0 :         if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
    3269             :                 vm_fault_t ret;
    3270             : 
    3271           0 :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    3272           0 :                 vmf->flags |= FAULT_FLAG_MKWRITE;
    3273           0 :                 ret = vma->vm_ops->pfn_mkwrite(vmf);
    3274           0 :                 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
    3275             :                         return ret; /* callback failed or asked for no further work */
    3276           0 :                 return finish_mkwrite_fault(vmf); /* re-takes the PTE lock and re-checks the PTE */
    3277             :         }
    3278           0 :         wp_page_reuse(vmf); /* no ->pfn_mkwrite(): reuse the mapping directly */
    3279           0 :         return 0;
    3280             : }
    3281             : /* Write fault on a page of a shared mapping: run ->page_mkwrite() if provided, reuse the PTE, then dirty the page. */
    3282           0 : static vm_fault_t wp_page_shared(struct vm_fault *vmf)
    3283             :         __releases(vmf->ptl)
    3284             : {
    3285           0 :         struct vm_area_struct *vma = vmf->vma;
    3286           0 :         vm_fault_t ret = 0;
    3287             : /* The page reference taken here is dropped by put_page() on every return path below. */
    3288           0 :         get_page(vmf->page);
    3289             : 
    3290           0 :         if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
    3291             :                 vm_fault_t tmp;
    3292             : 
    3293           0 :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    3294           0 :                 tmp = do_page_mkwrite(vmf);
    3295           0 :                 if (unlikely(!tmp || (tmp &
    3296             :                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
    3297           0 :                         put_page(vmf->page);
    3298           0 :                         return tmp;
    3299             :                 }
    3300           0 :                 tmp = finish_mkwrite_fault(vmf);
    3301           0 :                 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
    3302           0 :                         unlock_page(vmf->page); /* NOTE(review): page presumably locked by do_page_mkwrite() - confirm */
    3303           0 :                         put_page(vmf->page);
    3304           0 :                         return tmp;
    3305             :                 }
    3306             :         } else {
    3307           0 :                 wp_page_reuse(vmf);
    3308           0 :                 lock_page(vmf->page); /* both branches reach fault_dirty_shared_page() with the page locked */
    3309             :         }
    3310           0 :         ret |= fault_dirty_shared_page(vmf);
    3311           0 :         put_page(vmf->page);
    3312             : 
    3313           0 :         return ret;
    3314             : }
    3315             : 
    3316             : /*
    3317             :  * This routine handles present pages, when
    3318             :  * * users try to write to a shared page (FAULT_FLAG_WRITE)
    3319             :  * * GUP wants to take a R/O pin on a possibly shared anonymous page
    3320             :  *   (FAULT_FLAG_UNSHARE)
    3321             :  *
    3322             :  * It is done by copying the page to a new address and decrementing the
    3323             :  * shared-page counter for the old page.
    3324             :  *
    3325             :  * Note that this routine assumes that the protection checks have been
    3326             :  * done by the caller (the low-level page fault routine in most cases).
    3327             :  * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
    3328             :  * done any necessary COW.
    3329             :  *
    3330             :  * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
    3331             :  * though the page will change only once the write actually happens. This
    3332             :  * avoids a few races, and potentially makes it more efficient.
    3333             :  *
    3334             :  * We enter with non-exclusive mmap_lock (to exclude vma changes,
    3335             :  * but allow concurrent faults), with pte both mapped and locked.
    3336             :  * We return with mmap_lock still held, but pte unmapped and unlocked.
    3337             :  */
    3338           0 : static vm_fault_t do_wp_page(struct vm_fault *vmf)
    3339             :         __releases(vmf->ptl)
    3340             : {
    3341           0 :         const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
    3342           0 :         struct vm_area_struct *vma = vmf->vma;
    3343           0 :         struct folio *folio = NULL;
    3344             : 
    3345           0 :         if (likely(!unshare)) {
    3346           0 :                 if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
    3347             :                         pte_unmap_unlock(vmf->pte, vmf->ptl);
    3348             :                         return handle_userfault(vmf, VM_UFFD_WP);
    3349             :                 }
    3350             : 
    3351             :                 /*
    3352             :                  * Userfaultfd write-protect can defer flushes. Ensure the TLB
    3353             :                  * is flushed in this case before copying.
    3354             :                  */
    3355           0 :                 if (unlikely(userfaultfd_wp(vmf->vma) &&
    3356             :                              mm_tlb_flush_pending(vmf->vma->vm_mm)))
    3357             :                         flush_tlb_page(vmf->vma, vmf->address);
    3358             :         }
    3359             : 
    3360           0 :         vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
    3361             : 
    3362             :         /*
    3363             :          * Shared mapping: we are guaranteed to have VM_WRITE and
    3364             :          * FAULT_FLAG_WRITE set at this point.
    3365             :          */
    3366           0 :         if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
    3367             :                 /*
    3368             :                  * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
    3369             :                  * VM_PFNMAP VMA.
    3370             :                  *
    3371             :                  * We should not cow pages in a shared writeable mapping.
    3372             :                  * Just mark the pages writable and/or call ops->pfn_mkwrite.
    3373             :                  */
    3374           0 :                 if (!vmf->page)
    3375           0 :                         return wp_pfn_shared(vmf);
    3376           0 :                 return wp_page_shared(vmf);
    3377             :         }
    3378             : /* folio stays NULL when vm_normal_page() found no page; the checks below then fall through to the copy path. */
    3379           0 :         if (vmf->page)
    3380           0 :                 folio = page_folio(vmf->page);
    3381             : 
    3382             :         /*
    3383             :          * Private mapping: create an exclusive anonymous page copy if reuse
    3384             :          * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
    3385             :          */
    3386           0 :         if (folio && folio_test_anon(folio)) {
    3387             :                 /*
    3388             :                  * If the page is exclusive to this process we must reuse the
    3389             :                  * page without further checks.
    3390             :                  */
    3391           0 :                 if (PageAnonExclusive(vmf->page))
    3392             :                         goto reuse;
    3393             : 
    3394             :                 /*
    3395             :                  * We have to verify under folio lock: these early checks are
    3396             :                  * just an optimization to avoid locking the folio and freeing
    3397             :                  * the swapcache if there is little hope that we can reuse.
    3398             :                  *
    3399             :                  * KSM doesn't necessarily raise the folio refcount.
    3400             :                  */
    3401           0 :                 if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
    3402             :                         goto copy;
    3403           0 :                 if (!folio_test_lru(folio))
    3404             :                         /*
    3405             :                          * We cannot easily detect+handle references from
    3406             :                          * remote LRU caches or references to LRU folios.
    3407             :                          */
    3408           0 :                         lru_add_drain();
    3409           0 :                 if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
    3410             :                         goto copy;
    3411           0 :                 if (!folio_trylock(folio))
    3412             :                         goto copy;
    3413           0 :                 if (folio_test_swapcache(folio))
    3414           0 :                         folio_free_swap(folio);
    3415           0 :                 if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
    3416           0 :                         folio_unlock(folio);
    3417           0 :                         goto copy;
    3418             :                 }
    3419             :                 /*
    3420             :                  * Ok, we've got the only folio reference from our mapping
    3421             :                  * and the folio is locked, it's dark out, and we're wearing
    3422             :                  * sunglasses. Hit it.
    3423             :                  */
    3424           0 :                 page_move_anon_rmap(vmf->page, vma);
    3425           0 :                 folio_unlock(folio);
    3426             : reuse:
    3427           0 :                 if (unlikely(unshare)) {
    3428           0 :                         pte_unmap_unlock(vmf->pte, vmf->ptl); /* unshare: no PTE change, just drop the lock */
    3429           0 :                         return 0;
    3430             :                 }
    3431           0 :                 wp_page_reuse(vmf);
    3432           0 :                 return 0;
    3433             :         }
    3434             : copy:
    3435             :         /*
    3436             :          * Ok, we need to copy. Oh, well..
    3437             :          */
    3438           0 :         if (folio)
    3439             :                 folio_get(folio);
    3440             : /* The folio reference (if any) is taken before the PTE lock is dropped and is still held when wp_page_copy() runs. */
    3441           0 :         pte_unmap_unlock(vmf->pte, vmf->ptl);
    3442             : #ifdef CONFIG_KSM
    3443             :         if (folio && folio_test_ksm(folio))
    3444             :                 count_vm_event(COW_KSM);
    3445             : #endif
    3446           0 :         return wp_page_copy(vmf);
    3447             : }
    3448             : /* Zap the address range [start_addr, end_addr) of @vma by passing start and length to zap_page_range_single(). */
    3449             : static void unmap_mapping_range_vma(struct vm_area_struct *vma,
    3450             :                 unsigned long start_addr, unsigned long end_addr,
    3451             :                 struct zap_details *details)
    3452             : {
    3453           0 :         zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
    3454             : }
    3455             : /* For each VMA yielded by vma_interval_tree_foreach() for [first_index, last_index], unmap the part inside that page-offset range. */
    3456           0 : static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
    3457             :                                             pgoff_t first_index,
    3458             :                                             pgoff_t last_index,
    3459             :                                             struct zap_details *details)
    3460             : {
    3461             :         struct vm_area_struct *vma;
    3462             :         pgoff_t vba, vea, zba, zea;
    3463             : 
    3464           0 :         vma_interval_tree_foreach(vma, root, first_index, last_index) {
    3465           0 :                 vba = vma->vm_pgoff; /* first page offset mapped by this VMA */
    3466           0 :                 vea = vba + vma_pages(vma) - 1; /* last page offset mapped by this VMA (inclusive) */
    3467           0 :                 zba = max(first_index, vba); /* clamp the requested range to the VMA */
    3468           0 :                 zea = min(last_index, vea);
    3469             : /* Convert the clamped page offsets into addresses within this VMA; the end address is exclusive. */
    3470           0 :                 unmap_mapping_range_vma(vma,
    3471           0 :                         ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
    3472           0 :                         ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
    3473             :                                 details);
    3474             :         }
    3475           0 : }
    3476             : 
    3477             : /**
    3478             :  * unmap_mapping_folio() - Unmap single folio from processes.
    3479             :  * @folio: The locked folio to be unmapped.
    3480             :  *
    3481             :  * Unmap this folio from any userspace process which still has it mmaped.
    3482             :  * Typically, for efficiency, the range of nearby pages has already been
    3483             :  * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
    3484             :  * truncation or invalidation holds the lock on a folio, it may find that
    3485             :  * the page has been remapped again: and then uses unmap_mapping_folio()
    3486             :  * to unmap it finally.
    3487             :  */
    3488           0 : void unmap_mapping_folio(struct folio *folio)
    3489             : {
    3490           0 :         struct address_space *mapping = folio->mapping;
    3491           0 :         struct zap_details details = { };
    3492             :         pgoff_t first_index;
    3493             :         pgoff_t last_index;
    3494             : 
    3495             :         VM_BUG_ON(!folio_test_locked(folio));
    3496             : /* Page-offset range covered by the folio; last_index is inclusive. */
    3497           0 :         first_index = folio->index;
    3498           0 :         last_index = folio->index + folio_nr_pages(folio) - 1;
    3499             : 
    3500             :         details.even_cows = false;
    3501           0 :         details.single_folio = folio; /* NOTE(review): presumably limits zapping to this folio - confirm in zap code */
    3502           0 :         details.zap_flags = ZAP_FLAG_DROP_MARKER;
    3503             : /* Walk the mapping's i_mmap tree under the i_mmap read lock. */
    3504           0 :         i_mmap_lock_read(mapping);
    3505           0 :         if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
    3506           0 :                 unmap_mapping_range_tree(&mapping->i_mmap, first_index,
    3507             :                                          last_index, &details);
    3508           0 :         i_mmap_unlock_read(mapping);
    3509           0 : }
    3510             : 
    3511             : /**
    3512             :  * unmap_mapping_pages() - Unmap pages from processes.
    3513             :  * @mapping: The address space containing pages to be unmapped.
    3514             :  * @start: Index of first page to be unmapped.
    3515             :  * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
    3516             :  * @even_cows: Whether to unmap even private COWed pages.
    3517             :  *
    3518             :  * Unmap the pages in this address space from any userspace process which
    3519             :  * has them mmaped.  Generally, you want to remove COWed pages as well when
    3520             :  * a file is being truncated, but not when invalidating pages from the page
    3521             :  * cache.
    3522             :  */
    3523           0 : void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
    3524             :                 pgoff_t nr, bool even_cows)
    3525             : {
    3526           0 :         struct zap_details details = { };
    3527           0 :         pgoff_t first_index = start;
    3528           0 :         pgoff_t last_index = start + nr - 1; /* inclusive; may wrap, handled below */
    3529             : /* If start + nr - 1 wrapped below start (e.g. nr == 0), extend the range to ULONG_MAX. */
    3530           0 :         details.even_cows = even_cows;
    3531           0 :         if (last_index < first_index)
    3532           0 :                 last_index = ULONG_MAX;
    3533             : 
    3534           0 :         i_mmap_lock_read(mapping);
    3535           0 :         if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
    3536           0 :                 unmap_mapping_range_tree(&mapping->i_mmap, first_index,
    3537             :                                          last_index, &details);
    3538           0 :         i_mmap_unlock_read(mapping);
    3539           0 : }
    3540             : EXPORT_SYMBOL_GPL(unmap_mapping_pages);
    3541             : 
    3542             : /**
    3543             :  * unmap_mapping_range - unmap the portion of all mmaps in the specified
    3544             :  * address_space corresponding to the specified byte range in the underlying
    3545             :  * file.
    3546             :  *
    3547             :  * @mapping: the address space containing mmaps to be unmapped.
    3548             :  * @holebegin: byte in first page to unmap, relative to the start of
    3549             :  * the underlying file.  This will be rounded down to a PAGE_SIZE
    3550             :  * boundary.  Note that this is different from truncate_pagecache(), which
    3551             :  * must keep the partial page.  In contrast, we must get rid of
    3552             :  * partial pages.
    3553             :  * @holelen: size of prospective hole in bytes.  This will be rounded
    3554             :  * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
    3555             :  * end of the file.
    3556             :  * @even_cows: 1 when truncating a file, unmap even private COWed pages;
    3557             :  * but 0 when invalidating pagecache, don't throw away private data.
    3558             :  */
    3559           0 : void unmap_mapping_range(struct address_space *mapping,
    3560             :                 loff_t const holebegin, loff_t const holelen, int even_cows)
    3561             : {
    3562           0 :         pgoff_t hba = holebegin >> PAGE_SHIFT; /* first page index, rounded down */
    3563           0 :         pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; /* length in pages, rounded up */
    3564             : 
    3565             :         /* Check for overflow (only possible if loff_t is wider than pgoff_t). */
    3566             :         if (sizeof(holelen) > sizeof(hlen)) {
    3567             :                 long long holeend =
    3568             :                         (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
    3569             :                 if (holeend & ~(long long)ULONG_MAX)
    3570             :                         hlen = ULONG_MAX - hba + 1; /* clamp so that hba + hlen - 1 == ULONG_MAX */
    3571             :         }
    3572             : /* Hand the page-granular range over to unmap_mapping_pages(). */
    3573           0 :         unmap_mapping_pages(mapping, hba, hlen, even_cows);
    3574           0 : }
    3575             : EXPORT_SYMBOL(unmap_mapping_range);
    3576             : 
    3577             : /*
    3578             :  * Restore a potential device exclusive pte to a working pte entry. Returns VM_FAULT_RETRY if folio_lock_or_retry() fails, 0 otherwise.
    3579             :  */
    3580             : static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
    3581             : {
    3582             :         struct folio *folio = page_folio(vmf->page);
    3583             :         struct vm_area_struct *vma = vmf->vma;
    3584             :         struct mmu_notifier_range range;
    3585             : 
    3586             :         /*
    3587             :          * We need a reference to lock the folio because we don't hold
    3588             :          * the PTL so a racing thread can remove the device-exclusive
    3589             :          * entry and unmap it. If the folio is free the entry must
    3590             :          * have been removed already. If it happens to have already
    3591             :          * been re-allocated after being freed all we do is lock and
    3592             :          * unlock it.
    3593             :          */
    3594             :         if (!folio_try_get(folio))
    3595             :                 return 0;
    3596             : 
    3597             :         if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
    3598             :                 folio_put(folio);
    3599             :                 return VM_FAULT_RETRY;
    3600             :         }
    3601             :         mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
    3602             :                                 vma->vm_mm, vmf->address & PAGE_MASK,
    3603             :                                 (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
    3604             :         mmu_notifier_invalidate_range_start(&range);
    3605             : /* Take the PTE lock and restore the entry only if the PTE still equals vmf->orig_pte. */
    3606             :         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
    3607             :                                 &vmf->ptl);
    3608             :         if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
    3609             :                 restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
    3610             : 
    3611             :         if (vmf->pte)
    3612             :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    3613             :         folio_unlock(folio);
    3614             :         folio_put(folio);
    3615             : 
    3616             :         mmu_notifier_invalidate_range_end(&range);
    3617             :         return 0;
    3618             : }
    3619             : /* Heuristic: should the fault path try to free @folio's swap cache entry? Always false if the folio is not in the swap cache. */
    3620           0 : static inline bool should_try_to_free_swap(struct folio *folio,
    3621             :                                            struct vm_area_struct *vma,
    3622             :                                            unsigned int fault_flags)
    3623             : {
    3624           0 :         if (!folio_test_swapcache(folio))
    3625             :                 return false;
    3626           0 :         if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
    3627           0 :             folio_test_mlocked(folio))
    3628             :                 return true;
    3629             :         /*
    3630             :          * If we want to map a page that's in the swapcache writable, we
    3631             :          * have to detect via the refcount if we're really the exclusive
    3632             :          * user. Try freeing the swapcache to get rid of the swapcache
    3633             :          * reference only in case it's likely that we'll be the exclusive user.
    3634             :          */
    3635           0 :         return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
    3636           0 :                 folio_ref_count(folio) == 2;
    3637             : }
    3638             : /* Clear the pte at the faulting address if it still equals vmf->orig_pte; returns 0 on every path. */
    3639             : static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
    3640             : {
    3641             :         vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
    3642             :                                        vmf->address, &vmf->ptl);
    3643             :         if (!vmf->pte)
    3644             :                 return 0; /* nothing mapped/locked, so nothing to clear or unlock */
    3645             :         /*
    3646             :          * Be careful so that we will only recover a special uffd-wp pte into a
    3647             :          * none pte.  Otherwise it means the pte could have changed, so retry.
    3648             :          *
    3649             :          * This should also cover the case where e.g. the pte changed
    3650             :          * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
    3651             :          * So is_pte_marker() check is not enough to safely drop the pte.
    3652             :          */
    3653             :         if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
    3654             :                 pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
    3655             :         pte_unmap_unlock(vmf->pte, vmf->ptl);
    3656             :         return 0;
    3657             : }
    3658             : /* Dispatch a missing-pte fault: anonymous VMAs go to do_anonymous_page(), all others to do_fault(). */
    3659           0 : static vm_fault_t do_pte_missing(struct vm_fault *vmf)
    3660             : {
    3661           0 :         if (vma_is_anonymous(vmf->vma))
    3662           0 :                 return do_anonymous_page(vmf);
    3663             :         else
    3664           0 :                 return do_fault(vmf);
    3665             : }
    3666             : 
    3667             : /*
    3668             :  * This is actually a page-missing access, but with uffd-wp special pte
    3669             :  * installed.  It means this pte was wr-protected before being unmapped.
    3670             :  */
    3671             : static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
    3672             : {
    3673             :         /*
    3674             :          * Just in case there're leftover special ptes even after the region
    3675             :          * got unregistered - we can simply clear them.
    3676             :          */
    3677             :         if (unlikely(!userfaultfd_wp(vmf->vma)))
    3678             :                 return pte_marker_clear(vmf);
    3679             : /* The VMA is still uffd-wp registered: handle the fault like any other missing pte. */
    3680             :         return do_pte_missing(vmf);
    3681             : }
    3682             : /* Handle a fault on a pte marker entry: SIGBUS for empty, swapin-error or unknown markers; uffd-wp markers go to pte_marker_handle_uffd_wp(). */
    3683           0 : static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
    3684             : {
    3685           0 :         swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
    3686           0 :         unsigned long marker = pte_marker_get(entry);
    3687             : 
    3688             :         /*
    3689             :          * PTE markers should never be empty.  If anything weird happened,
    3690             :          * the best thing to do is to kill the process along with its mm.
    3691             :          */
    3692           0 :         if (WARN_ON_ONCE(!marker))
    3693             :                 return VM_FAULT_SIGBUS;
    3694             : 
    3695             :         /* Higher priority than uffd-wp when data corrupted */
    3696             :         if (marker & PTE_MARKER_SWAPIN_ERROR)
    3697             :                 return VM_FAULT_SIGBUS;
    3698             : 
    3699             :         if (pte_marker_entry_uffd_wp(entry))
    3700             :                 return pte_marker_handle_uffd_wp(vmf);
    3701             : 
    3702             :         /* This is an unknown pte marker */
    3703             :         return VM_FAULT_SIGBUS;
    3704             : }
    3705             : 
    3706             : /*
    3707             :  * We enter with non-exclusive mmap_lock (to exclude vma changes,
    3708             :  * but allow concurrent faults), and pte mapped but not yet locked.
    3709             :  * We return with pte unmapped and unlocked.
    3710             :  *
    3711             :  * We return with the mmap_lock locked or unlocked in the same cases
    3712             :  * as does filemap_fault().
    3713             :  */
    3714           0 : vm_fault_t do_swap_page(struct vm_fault *vmf)
    3715             : {
                               /*
                                * Service a fault on a PTE whose saved value (vmf->orig_pte)
                                * decodes to a swap entry.  Special "non-swap" entries are
                                * dispatched to their own handlers; for a real swap entry the
                                * folio is taken from the swap cache or read in, and then
                                * mapped under the PT lock.  The result is accumulated in ret
                                * and returned as a vm_fault_t bitmask.
                                */
    3716           0 :         struct vm_area_struct *vma = vmf->vma;
    3717           0 :         struct folio *swapcache, *folio = NULL;
    3718             :         struct page *page;
    3719           0 :         struct swap_info_struct *si = NULL;
    3720           0 :         rmap_t rmap_flags = RMAP_NONE;
    3721           0 :         bool exclusive = false;
    3722             :         swp_entry_t entry;
    3723             :         pte_t pte;
    3724             :         int locked;
    3725           0 :         vm_fault_t ret = 0;
    3726           0 :         void *shadow = NULL;
    3727             : 
                               /*
                                * NOTE(review): pte_unmap_same() is defined outside this view;
                                * presumably it fails when the PTE no longer equals
                                * vmf->orig_pte -- confirm.  On failure we return ret == 0.
                                */
    3728           0 :         if (!pte_unmap_same(vmf))
    3729             :                 goto out;
    3730             : 
                               /* Faults carrying FAULT_FLAG_VMA_LOCK are not served here: retry. */
    3731           0 :         if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
    3732             :                 ret = VM_FAULT_RETRY;
    3733             :                 goto out;
    3734             :         }
    3735             : 
    3736           0 :         entry = pte_to_swp_entry(vmf->orig_pte);
                               /*
                                * Special (non-swap) entries.  Every branch finishes through
                                * "unlock"/"out" while si is still NULL, so no swap device
                                * reference is dropped on these paths.
                                */
    3737           0 :         if (unlikely(non_swap_entry(entry))) {
    3738           0 :                 if (is_migration_entry(entry)) {
    3739           0 :                         migration_entry_wait(vma->vm_mm, vmf->pmd,
    3740             :                                              vmf->address);
    3741           0 :                 } else if (is_device_exclusive_entry(entry)) {
    3742             :                         vmf->page = pfn_swap_entry_to_page(entry);
    3743             :                         ret = remove_device_exclusive_entry(vmf);
    3744           0 :                 } else if (is_device_private_entry(entry)) {
    3745             :                         vmf->page = pfn_swap_entry_to_page(entry);
    3746             :                         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
    3747             :                                         vmf->address, &vmf->ptl);
                                               /* Re-check the PTE under the lock before touching the page. */
    3748             :                         if (unlikely(!vmf->pte ||
    3749             :                                      !pte_same(ptep_get(vmf->pte),
    3750             :                                                         vmf->orig_pte)))
    3751             :                                 goto unlock;
    3752             : 
    3753             :                         /*
    3754             :                          * Get a page reference while we know the page can't be
    3755             :                          * freed.
    3756             :                          */
    3757             :                         get_page(vmf->page);
    3758             :                         pte_unmap_unlock(vmf->pte, vmf->ptl);
    3759             :                         ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
    3760             :                         put_page(vmf->page);
    3761           0 :                 } else if (is_hwpoison_entry(entry)) {
    3762             :                         ret = VM_FAULT_HWPOISON;
    3763           0 :                 } else if (is_pte_marker_entry(entry)) {
    3764           0 :                         ret = handle_pte_marker(vmf);
    3765             :                 } else {
                                               /* Unrecognised entry type: report it and fail the fault. */
    3766           0 :                         print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
    3767           0 :                         ret = VM_FAULT_SIGBUS;
    3768             :                 }
    3769             :                 goto out;
    3770             :         }
    3771             : 
    3772             :         /* Prevent swapoff from happening to us. */
    3773           0 :         si = get_swap_device(entry);
    3774           0 :         if (unlikely(!si))
    3775             :                 goto out;
    3776             : 
                               /*
                                * Swap cache lookup.  "page" is assigned only on a hit; on a
                                * miss it is set by one of the two read-in paths below before
                                * it is used.  "swapcache" records the folio obtained through
                                * the swap cache (NULL on a miss, updated by the readahead path).
                                */
    3777           0 :         folio = swap_cache_get_folio(entry, vma, vmf->address);
    3778           0 :         if (folio)
    3779           0 :                 page = folio_file_page(folio, swp_offset(entry));
    3780           0 :         swapcache = folio;
    3781             : 
    3782           0 :         if (!folio) {
                                       /*
                                        * Miss.  For SWP_SYNCHRONOUS_IO devices with a swap count
                                        * of exactly 1, read into a freshly allocated folio and
                                        * leave swapcache NULL; otherwise use swapin_readahead().
                                        */
    3783           0 :                 if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
    3784           0 :                     __swap_count(entry) == 1) {
    3785             :                         /* skip swapcache */
    3786           0 :                         folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
    3787             :                                                 vma, vmf->address, false);
                                               /*
                                                * Computed before the NULL check; page is only
                                                * used inside the if (folio) block below.
                                                */
    3788           0 :                         page = &folio->page;
    3789           0 :                         if (folio) {
    3790           0 :                                 __folio_set_locked(folio);
    3791           0 :                                 __folio_set_swapbacked(folio);
    3792             : 
    3793           0 :                                 if (mem_cgroup_swapin_charge_folio(folio,
    3794             :                                                         vma->vm_mm, GFP_KERNEL,
    3795             :                                                         entry)) {
                                                               /*
                                                                * swapcache is NULL here, so out_page
                                                                * only unlocks and drops this folio.
                                                                */
    3796             :                                         ret = VM_FAULT_OOM;
    3797             :                                         goto out_page;
    3798             :                                 }
    3799           0 :                                 mem_cgroup_swapin_uncharge_swap(entry);
    3800             : 
    3801           0 :                                 shadow = get_shadow_from_swap_cache(entry);
    3802           0 :                                 if (shadow)
    3803           0 :                                         workingset_refault(folio, shadow);
    3804             : 
    3805           0 :                                 folio_add_lru(folio);
    3806             : 
    3807             :                                 /* To provide entry to swap_readpage() */
    3808           0 :                                 folio_set_swap_entry(folio, entry);
    3809           0 :                                 swap_readpage(page, true, NULL);
                                                       /*
                                                        * NOTE(review): presumably undoes the
                                                        * folio_set_swap_entry() above -- confirm.
                                                        */
    3810           0 :                                 folio->private = NULL;
    3811             :                         }
    3812             :                 } else {
    3813           0 :                         page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
    3814             :                                                 vmf);
    3815           0 :                         if (page)
    3816           0 :                                 folio = page_folio(page);
    3817             :                         swapcache = folio;
    3818             :                 }
    3819             : 
                                       /* Neither read-in path produced a folio. */
    3820           0 :                 if (!folio) {
    3821             :                         /*
    3822             :                          * Back out if somebody else faulted in this pte
    3823             :                          * while we released the pte lock.
    3824             :                          */
    3825           0 :                         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
    3826             :                                         vmf->address, &vmf->ptl);
    3827           0 :                         if (likely(vmf->pte &&
    3828             :                                    pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
    3829           0 :                                 ret = VM_FAULT_OOM;
    3830             :                         goto unlock;
    3831             :                 }
    3832             : 
    3833             :                 /* Had to read the page from swap area: Major fault */
    3834           0 :                 ret = VM_FAULT_MAJOR;
    3835           0 :                 count_vm_event(PGMAJFAULT);
    3836           0 :                 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
    3837             :         } else if (PageHWPoison(page)) {
    3838             :                 /*
    3839             :                  * hwpoisoned dirty swapcache pages are kept for killing
    3840             :                  * owner processes (which may be unknown at hwpoison time)
    3841             :                  */
    3842             :                 ret = VM_FAULT_HWPOISON;
    3843             :                 goto out_release;
    3844             :         }
    3845             : 
                               /*
                                * A zero return means the folio lock was not taken: add
                                * VM_FAULT_RETRY to ret (keeping VM_FAULT_MAJOR if it was set)
                                * and drop our folio reference without unlocking.
                                */
    3846           0 :         locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
    3847             : 
    3848           0 :         if (!locked) {
    3849           0 :                 ret |= VM_FAULT_RETRY;
    3850           0 :                 goto out_release;
    3851             :         }
    3852             : 
    3853           0 :         if (swapcache) {
    3854             :                 /*
    3855             :                  * Make sure folio_free_swap() or swapoff did not release the
    3856             :                  * swapcache from under us.  The page pin, and pte_same test
    3857             :                  * below, are not enough to exclude that.  Even if it is still
    3858             :                  * swapcache, we need to check that the page's swap has not
    3859             :                  * changed.
    3860             :                  */
    3861           0 :                 if (unlikely(!folio_test_swapcache(folio) ||
    3862             :                              page_private(page) != entry.val))
    3863             :                         goto out_page;
    3864             : 
    3865             :                 /*
    3866             :                  * KSM sometimes has to copy on read faults, for example, if
    3867             :                  * page->index of !PageKSM() pages would be nonlinear inside the
    3868             :                  * anon VMA -- PageKSM() is lost on actual swapout.
    3869             :                  */
    3870           0 :                 page = ksm_might_need_to_copy(page, vma, vmf->address);
    3871           0 :                 if (unlikely(!page)) {
    3872             :                         ret = VM_FAULT_OOM;
    3873             :                         goto out_page;
    3874           0 :                 } else if (unlikely(PTR_ERR(page) == -EHWPOISON)) {
    3875             :                         ret = VM_FAULT_HWPOISON;
    3876             :                         goto out_page;
    3877             :                 }
                                       /*
                                        * page may now be a copy, so folio can differ from
                                        * swapcache from here on; later code tests for that.
                                        */
    3878           0 :                 folio = page_folio(page);
    3879             : 
    3880             :                 /*
    3881             :                  * If we want to map a page that's in the swapcache writable, we
    3882             :                  * have to detect via the refcount if we're really the exclusive
    3883             :                  * owner. Try removing the extra reference from the local LRU
    3884             :                  * caches if required.
    3885             :                  */
    3886           0 :                 if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
    3887           0 :                     !folio_test_ksm(folio) && !folio_test_lru(folio))
    3888           0 :                         lru_add_drain();
    3889             :         }
    3890             : 
    3891           0 :         folio_throttle_swaprate(folio, GFP_KERNEL);
    3892             : 
    3893             :         /*
    3894             :          * Back out if somebody else already faulted in this pte.
    3895             :          */
    3896           0 :         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
    3897             :                         &vmf->ptl);
    3898           0 :         if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
    3899             :                 goto out_nomap;
    3900             : 
                               /* Folio contents are not up to date: fail the fault. */
    3901           0 :         if (unlikely(!folio_test_uptodate(folio))) {
    3902             :                 ret = VM_FAULT_SIGBUS;
    3903             :                 goto out_nomap;
    3904             :         }
    3905             : 
    3906             :         /*
    3907             :          * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
    3908             :          * must never point at an anonymous page in the swapcache that is
    3909             :          * PG_anon_exclusive. Sanity check that this holds and especially, that
    3910             :          * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
    3911             :          * check after taking the PT lock and making sure that nobody
    3912             :          * concurrently faulted in this page and set PG_anon_exclusive.
    3913             :          */
    3914           0 :         BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
    3915           0 :         BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
    3916             : 
    3917             :         /*
    3918             :          * Check under PT lock (to protect against concurrent fork() sharing
    3919             :          * the swap entry concurrently) for certainly exclusive pages.
    3920             :          */
    3921           0 :         if (!folio_test_ksm(folio)) {
    3922           0 :                 exclusive = pte_swp_exclusive(vmf->orig_pte);
    3923           0 :                 if (folio != swapcache) {
    3924             :                         /*
    3925             :                          * We have a fresh page that is not exposed to the
    3926             :                          * swapcache -> certainly exclusive.
    3927             :                          */
    3928             :                         exclusive = true;
    3929           0 :                 } else if (exclusive && folio_test_writeback(folio) &&
    3930           0 :                           data_race(si->flags & SWP_STABLE_WRITES)) {
    3931             :                         /*
    3932             :                          * This is tricky: not all swap backends support
    3933             :                          * concurrent page modifications while under writeback.
    3934             :                          *
    3935             :                          * So if we stumble over such a page in the swapcache
    3936             :                          * we must not set the page exclusive, otherwise we can
    3937             :                          * map it writable without further checks and modify it
    3938             :                          * while still under writeback.
    3939             :                          *
    3940             :                          * For these problematic swap backends, simply drop the
    3941             :                          * exclusive marker: this is perfectly fine as we start
    3942             :                          * writeback only if we fully unmapped the page and
    3943             :                          * there are no unexpected references on the page after
    3944             :                          * unmapping succeeded. After fully unmapped, no
    3945             :                          * further GUP references (FOLL_GET and FOLL_PIN) can
    3946             :                          * appear, so dropping the exclusive marker and mapping
    3947             :                          * it only R/O is fine.
    3948             :                          */
    3949           0 :                         exclusive = false;
    3950             :                 }
    3951             :         }
    3952             : 
    3953             :         /*
    3954             :          * Some architectures may have to restore extra metadata to the page
    3955             :          * when reading from swap. This metadata may be indexed by swap entry
    3956             :          * so this must be called before swap_free().
    3957             :          */
    3958           0 :         arch_swap_restore(entry, folio);
    3959             : 
    3960             :         /*
    3961             :          * Remove the swap entry and conditionally try to free up the swapcache.
    3962             :          * We're already holding a reference on the page but haven't mapped it
    3963             :          * yet.
    3964             :          */
    3965           0 :         swap_free(entry);
    3966           0 :         if (should_try_to_free_swap(folio, vma, vmf->flags))
    3967           0 :                 folio_free_swap(folio);
    3968             : 
                               /* One swap entry becomes one anonymous page in the mm counters. */
    3969           0 :         inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
    3970           0 :         dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
    3971           0 :         pte = mk_pte(page, vma->vm_page_prot);
    3972             : 
    3973             :         /*
    3974             :          * Same logic as in do_wp_page(); however, optimize for pages that are
    3975             :          * certainly not shared either because we just allocated them without
    3976             :          * exposing them to the swapcache or because the swap entry indicates
    3977             :          * exclusivity.
    3978             :          */
    3979           0 :         if (!folio_test_ksm(folio) &&
    3980           0 :             (exclusive || folio_ref_count(folio) == 1)) {
    3981           0 :                 if (vmf->flags & FAULT_FLAG_WRITE) {
                                               /*
                                                * The write is satisfied right here; clearing the
                                                * flag makes the do_wp_page() call below a no-op.
                                                */
    3982           0 :                         pte = maybe_mkwrite(pte_mkdirty(pte), vma);
    3983           0 :                         vmf->flags &= ~FAULT_FLAG_WRITE;
    3984             :                 }
    3985             :                 rmap_flags |= RMAP_EXCLUSIVE;
    3986             :         }
    3987           0 :         flush_icache_page(vma, page);
                               /* Carry the soft-dirty and uffd-wp bits over from the swap PTE. */
    3988           0 :         if (pte_swp_soft_dirty(vmf->orig_pte))
    3989             :                 pte = pte_mksoft_dirty(pte);
    3990             :         if (pte_swp_uffd_wp(vmf->orig_pte))
    3991             :                 pte = pte_mkuffd_wp(pte);
                               /*
                                * NOTE(review): from here vmf->orig_pte equals pte, so the last
                                * two arguments of arch_do_swap_page() below carry the same value.
                                */
    3992           0 :         vmf->orig_pte = pte;
    3993             : 
    3994             :         /* ksm created a completely new copy */
    3995           0 :         if (unlikely(folio != swapcache && swapcache)) {
    3996           0 :                 page_add_new_anon_rmap(page, vma, vmf->address);
    3997           0 :                 folio_add_lru_vma(folio, vma);
    3998             :         } else {
    3999           0 :                 page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
    4000             :         }
    4001             : 
    4002             :         VM_BUG_ON(!folio_test_anon(folio) ||
    4003             :                         (pte_write(pte) && !PageAnonExclusive(page)));
    4004           0 :         set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
    4005           0 :         arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
    4006             : 
    4007           0 :         folio_unlock(folio);
    4008           0 :         if (folio != swapcache && swapcache) {
    4009             :                 /*
    4010             :                  * Hold the lock to avoid the swap entry to be reused
    4011             :                  * until we take the PT lock for the pte_same() check
    4012             :                  * (to avoid false positives from pte_same). For
    4013             :                  * further safety release the lock after the swap_free
    4014             :                  * so that the swap count won't change under a
    4015             :                  * parallel locked swapcache.
    4016             :                  */
    4017           0 :                 folio_unlock(swapcache);
    4018             :                 folio_put(swapcache);
    4019             :         }
    4020             : 
                               /*
                                * FAULT_FLAG_WRITE is still set only when the PTE was not made
                                * writable above; hand the write to do_wp_page().  On error
                                * only the error bits of ret are kept.
                                * NOTE(review): "goto out" skips the pte_unmap_unlock() at
                                * "unlock"; presumably do_wp_page() releases the PT lock
                                * itself -- confirm.
                                */
    4021           0 :         if (vmf->flags & FAULT_FLAG_WRITE) {
    4022           0 :                 ret |= do_wp_page(vmf);
    4023           0 :                 if (ret & VM_FAULT_ERROR)
    4024           0 :                         ret &= VM_FAULT_ERROR;
    4025             :                 goto out;
    4026             :         }
    4027             : 
    4028             :         /* No need to invalidate - it was non-present before */
    4029             :         update_mmu_cache(vma, vmf->address, vmf->pte);
                       /* Exit with the PT lock possibly held (vmf->pte non-NULL): release it. */
    4030             : unlock:
    4031           0 :         if (vmf->pte)
    4032           0 :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
                       /* Common exit: drop the get_swap_device() reference if one was taken. */
    4033             : out:
    4034           0 :         if (si)
    4035             :                 put_swap_device(si);
    4036             :         return ret;
                       /*
                        * Failure exits, entered at the label matching the state still held:
                        * out_nomap (PT lock), out_page (folio lock), out_release (folio ref).
                        */
    4037             : out_nomap:
    4038           0 :         if (vmf->pte)
    4039           0 :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    4040             : out_page:
    4041           0 :         folio_unlock(folio);
    4042             : out_release:
    4043           0 :         folio_put(folio);
                               /* A KSM copy leaves the original swapcache folio locked and pinned too. */
    4044           0 :         if (folio != swapcache && swapcache) {
    4045           0 :                 folio_unlock(swapcache);
    4046             :                 folio_put(swapcache);
    4047             :         }
    4048           0 :         if (si)
    4049             :                 put_swap_device(si);
    4050             :         return ret;
    4051             : }
    4052             : 
    4053             : /*
    4054             :  * We enter with non-exclusive mmap_lock (to exclude vma changes,
    4055             :  * but allow concurrent faults), and pte mapped but not yet locked.
    4056             :  * We return with mmap_lock still held, but pte unmapped and unlocked.
    4057             :  */
    4058           0 : static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
    4059             : {
                               /*
                                * Install a PTE at vmf->address: the zero page for a read fault
                                * (unless mm_forbids_zeropage() says otherwise), or else a newly
                                * allocated, charged and zeroed folio.  Returns a vm_fault_t.
                                */
                               /* Sampled up front; applied to the new entry at "setpte". */
    4060           0 :         bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
    4061           0 :         struct vm_area_struct *vma = vmf->vma;
    4062             :         struct folio *folio;
    4063           0 :         vm_fault_t ret = 0;
    4064             :         pte_t entry;
    4065             : 
    4066             :         /* File mapping without ->vm_ops ? */
    4067           0 :         if (vma->vm_flags & VM_SHARED)
    4068             :                 return VM_FAULT_SIGBUS;
    4069             : 
    4070             :         /*
    4071             :          * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
    4072             :          * be distinguished from a transient failure of pte_offset_map().
    4073             :          */
    4074           0 :         if (pte_alloc(vma->vm_mm, vmf->pmd))
    4075             :                 return VM_FAULT_OOM;
    4076             : 
    4077             :         /* Use the zero-page for reads */
    4078           0 :         if (!(vmf->flags & FAULT_FLAG_WRITE) &&
    4079             :                         !mm_forbids_zeropage(vma->vm_mm)) {
    4080           0 :                 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
    4081             :                                                 vma->vm_page_prot));
    4082           0 :                 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
    4083             :                                 vmf->address, &vmf->ptl);
                                       /* Could not map/lock the PTE: return with ret still 0. */
    4084           0 :                 if (!vmf->pte)
    4085             :                         goto unlock;
                                       /* The PTE changed under us: leave it alone. */
    4086           0 :                 if (vmf_pte_changed(vmf)) {
    4087             :                         update_mmu_tlb(vma, vmf->address, vmf->pte);
    4088             :                         goto unlock;
    4089             :                 }
    4090           0 :                 ret = check_stable_address_space(vma->vm_mm);
    4091           0 :                 if (ret)
    4092             :                         goto unlock;
    4093             :                 /* Deliver the page fault to userland, check inside PT lock */
    4094             :                 if (userfaultfd_missing(vma)) {
    4095             :                         pte_unmap_unlock(vmf->pte, vmf->ptl);
    4096             :                         return handle_userfault(vmf, VM_UFFD_MISSING);
    4097             :                 }
                                       /*
                                        * The zero-page entry is ready: jump past the folio
                                        * allocation and the counter/rmap/LRU updates.
                                        */
    4098             :                 goto setpte;
    4099             :         }
    4100             : 
    4101             :         /* Allocate our own private page. */
    4102           0 :         if (unlikely(anon_vma_prepare(vma)))
    4103             :                 goto oom;
    4104           0 :         folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
    4105           0 :         if (!folio)
    4106             :                 goto oom;
    4107             : 
    4108           0 :         if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
    4109             :                 goto oom_free_page;
    4110           0 :         folio_throttle_swaprate(folio, GFP_KERNEL);
    4111             : 
    4112             :         /*
    4113             :          * The memory barrier inside __folio_mark_uptodate makes sure that
    4114             :          * preceding stores to the page contents become visible before
    4115             :          * the set_pte_at() write.
    4116             :          */
    4117           0 :         __folio_mark_uptodate(folio);
    4118             : 
    4119           0 :         entry = mk_pte(&folio->page, vma->vm_page_prot);
    4120             :         entry = pte_sw_mkyoung(entry);
                               /* In a VM_WRITE VMA the new entry is made dirty and writable. */
    4121           0 :         if (vma->vm_flags & VM_WRITE)
    4122           0 :                 entry = pte_mkwrite(pte_mkdirty(entry));
    4123             : 
    4124           0 :         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
    4125             :                         &vmf->ptl);
                               /* From here on, every bail-out must also drop the new folio. */
    4126           0 :         if (!vmf->pte)
    4127             :                 goto release;
    4128           0 :         if (vmf_pte_changed(vmf)) {
    4129             :                 update_mmu_tlb(vma, vmf->address, vmf->pte);
    4130             :                 goto release;
    4131             :         }
    4132             : 
    4133           0 :         ret = check_stable_address_space(vma->vm_mm);
    4134           0 :         if (ret)
    4135             :                 goto release;
    4136             : 
    4137             :         /* Deliver the page fault to userland, check inside PT lock */
    4138           0 :         if (userfaultfd_missing(vma)) {
    4139             :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    4140             :                 folio_put(folio);
    4141             :                 return handle_userfault(vmf, VM_UFFD_MISSING);
    4142             :         }
    4143             : 
    4144           0 :         inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
    4145           0 :         folio_add_new_anon_rmap(folio, vma, vmf->address);
    4146           0 :         folio_add_lru_vma(folio, vma);
                       /* Shared tail for the zero-page and new-folio paths; PT lock is held. */
    4147             : setpte:
    4148             :         if (uffd_wp)
    4149             :                 entry = pte_mkuffd_wp(entry);
    4150           0 :         set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
    4151             : 
    4152             :         /* No need to invalidate - it was non-present before */
    4153             :         update_mmu_cache(vma, vmf->address, vmf->pte);
    4154             : unlock:
    4155           0 :         if (vmf->pte)
    4156           0 :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    4157             :         return ret;
                       /* Drop the folio that was allocated but not mapped, then unlock. */
    4158             : release:
    4159             :         folio_put(folio);
    4160             :         goto unlock;
                       /* Charging failed: free the folio, then report OOM. */
    4161             : oom_free_page:
    4162             :         folio_put(folio);
    4163             : oom:
    4164             :         return VM_FAULT_OOM;
    4165             : }
    4166             : 
    4167             : /*
    4168             :  * The mmap_lock must have been held on entry, and may have been
    4169             :  * released depending on flags and vma->vm_ops->fault() return value.
    4170             :  * See filemap_fault() and __folio_lock_or_retry().
    4171             :  */
    4172           0 : static vm_fault_t __do_fault(struct vm_fault *vmf)
    4173             : {
                               /*
                                * Call the VMA's ->fault() handler and post-process its result:
                                * error/NOPAGE/RETRY/DONE_COW results are passed straight back,
                                * a hwpoisoned page is released and reported, and otherwise
                                * vmf->page is locked by the time this function returns.
                                */
    4174           0 :         struct vm_area_struct *vma = vmf->vma;
    4175             :         vm_fault_t ret;
    4176             : 
    4177             :         /*
    4178             :          * Preallocate pte before we take page_lock because this might lead to
    4179             :          * deadlocks for memcg reclaim which waits for pages under writeback:
    4180             :          *                              lock_page(A)
    4181             :          *                              SetPageWriteback(A)
    4182             :          *                              unlock_page(A)
    4183             :          * lock_page(B)
    4184             :          *                              lock_page(B)
    4185             :          * pte_alloc_one
    4186             :          *   shrink_page_list
    4187             :          *     wait_on_page_writeback(A)
    4188             :          *                              SetPageWriteback(B)
    4189             :          *                              unlock_page(B)
    4190             :          *                              # flush A, B to clear the writeback
    4191             :          */
    4192           0 :         if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
    4193           0 :                 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
    4194           0 :                 if (!vmf->prealloc_pte)
    4195             :                         return VM_FAULT_OOM;
    4196             :         }
    4197             : 
    4198           0 :         ret = vma->vm_ops->fault(vmf);
                               /* These results are returned as-is, before vmf->page is examined. */
    4199           0 :         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
    4200             :                             VM_FAULT_DONE_COW)))
    4201             :                 return ret;
    4202             : 
    4203           0 :         if (unlikely(PageHWPoison(vmf->page))) {
    4204             :                 struct page *page = vmf->page;
    4205             :                 vm_fault_t poisonret = VM_FAULT_HWPOISON;
                                       /* Only touch the mapping if ->fault() returned the page locked. */
    4206             :                 if (ret & VM_FAULT_LOCKED) {
    4207             :                         if (page_mapped(page))
    4208             :                                 unmap_mapping_pages(page_mapping(page),
    4209             :                                                     page->index, 1, false);
    4210             :                         /* Retry if a clean page was removed from the cache. */
    4211             :                         if (invalidate_inode_page(page))
    4212             :                                 poisonret = VM_FAULT_NOPAGE;
    4213             :                         unlock_page(page);
    4214             :                 }
                                       /* Drop the page reference and clear vmf->page before returning. */
    4215             :                 put_page(page);
    4216             :                 vmf->page = NULL;
    4217             :                 return poisonret;
    4218             :         }
    4219             : 
                               /*
                                * Handler returned the page unlocked: lock it here.  Otherwise
                                * just sanity-check that it really is locked.
                                */
    4220           0 :         if (unlikely(!(ret & VM_FAULT_LOCKED)))
    4221           0 :                 lock_page(vmf->page);
    4222             :         else
    4223             :                 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
    4224             : 
    4225             :         return ret;
    4226             : }
    4227             : 
    4228             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    4229             : static void deposit_prealloc_pte(struct vm_fault *vmf)
    4230             : {
                               /*
                                * Hand vmf->prealloc_pte to pgtable_trans_huge_deposit() for
                                * vmf->pmd, account for it, and clear the pointer in vmf.
                                */
    4231             :         struct vm_area_struct *vma = vmf->vma;
    4232             : 
    4233             :         pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
    4234             :         /*
    4235             :          * We are going to consume the prealloc table,
    4236             :          * count that as nr_ptes.
    4237             :          */
    4238             :         mm_inc_nr_ptes(vma->vm_mm);
                               /* The table now belongs to the deposit; forget it in vmf. */
    4239             :         vmf->prealloc_pte = NULL;
    4240             : }
    4241             : 
    4242             : vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
    4243             : {
    4244             :         struct vm_area_struct *vma = vmf->vma;
    4245             :         bool write = vmf->flags & FAULT_FLAG_WRITE;
    4246             :         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
    4247             :         pmd_t entry;
    4248             :         int i;
    4249             :         vm_fault_t ret = VM_FAULT_FALLBACK;
    4250             : 
    4251             :         if (!transhuge_vma_suitable(vma, haddr))
    4252             :                 return ret;
    4253             : 
    4254             :         page = compound_head(page);
    4255             :         if (compound_order(page) != HPAGE_PMD_ORDER)
    4256             :                 return ret;
    4257             : 
    4258             :         /*
    4259             :          * Just back off if any subpage of a THP is corrupted, otherwise
    4260             :          * the corrupted page may be mapped by PMD silently and escape the
    4261             :          * check.  This kind of THP can only be PTE mapped.  Access to
    4262             :          * the corrupted subpage should trigger SIGBUS as expected.
    4263             :          */
    4264             :         if (unlikely(PageHasHWPoisoned(page)))
    4265             :                 return ret;
    4266             : 
    4267             :         /*
    4268             :          * Archs like ppc64 need additional space to store information
    4269             :          * related to pte entry. Use the preallocated table for that.
    4270             :          */
    4271             :         if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
    4272             :                 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
    4273             :                 if (!vmf->prealloc_pte)
    4274             :                         return VM_FAULT_OOM;
    4275             :         }
    4276             : 
    4277             :         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
    4278             :         if (unlikely(!pmd_none(*vmf->pmd)))
    4279             :                 goto out;
    4280             : 
    4281             :         for (i = 0; i < HPAGE_PMD_NR; i++)
    4282             :                 flush_icache_page(vma, page + i);
    4283             : 
    4284             :         entry = mk_huge_pmd(page, vma->vm_page_prot);
    4285             :         if (write)
    4286             :                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
    4287             : 
    4288             :         add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
    4289             :         page_add_file_rmap(page, vma, true);
    4290             : 
    4291             :         /*
    4292             :          * deposit and withdraw with pmd lock held
    4293             :          */
    4294             :         if (arch_needs_pgtable_deposit())
    4295             :                 deposit_prealloc_pte(vmf);
    4296             : 
    4297             :         set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
    4298             : 
    4299             :         update_mmu_cache_pmd(vma, haddr, vmf->pmd);
    4300             : 
    4301             :         /* fault is handled */
    4302             :         ret = 0;
    4303             :         count_vm_event(THP_FILE_MAPPED);
    4304             : out:
    4305             :         spin_unlock(vmf->ptl);
    4306             :         return ret;
    4307             : }
    4308             : #else
    4309           0 : vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
    4310             : {
    4311           0 :         return VM_FAULT_FALLBACK;
    4312             : }
    4313             : #endif
    4314             : 
    4315           0 : void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
    4316             : {
    4317           0 :         struct vm_area_struct *vma = vmf->vma;
    4318           0 :         bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
    4319           0 :         bool write = vmf->flags & FAULT_FLAG_WRITE;
    4320           0 :         bool prefault = vmf->address != addr;
    4321             :         pte_t entry;
    4322             : 
    4323           0 :         flush_icache_page(vma, page);
    4324           0 :         entry = mk_pte(page, vma->vm_page_prot);
    4325             : 
    4326             :         if (prefault && arch_wants_old_prefaulted_pte())
    4327             :                 entry = pte_mkold(entry);
    4328             :         else
    4329             :                 entry = pte_sw_mkyoung(entry);
    4330             : 
    4331           0 :         if (write)
    4332           0 :                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
    4333             :         if (unlikely(uffd_wp))
    4334             :                 entry = pte_mkuffd_wp(entry);
    4335             :         /* copy-on-write page */
    4336           0 :         if (write && !(vma->vm_flags & VM_SHARED)) {
    4337           0 :                 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
    4338           0 :                 page_add_new_anon_rmap(page, vma, addr);
    4339           0 :                 lru_cache_add_inactive_or_unevictable(page, vma);
    4340             :         } else {
    4341           0 :                 inc_mm_counter(vma->vm_mm, mm_counter_file(page));
    4342           0 :                 page_add_file_rmap(page, vma, false);
    4343             :         }
    4344           0 :         set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
    4345           0 : }
    4346             : 
    4347             : static bool vmf_pte_changed(struct vm_fault *vmf)
    4348             : {
    4349           0 :         if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
    4350           0 :                 return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
    4351             : 
    4352           0 :         return !pte_none(ptep_get(vmf->pte));
    4353             : }
    4354             : 
    4355             : /**
    4356             :  * finish_fault - finish page fault once we have prepared the page to fault
    4357             :  *
    4358             :  * @vmf: structure describing the fault
    4359             :  *
    4360             :  * This function handles all that is needed to finish a page fault once the
    4361             :  * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
    4362             :  * given page, adds reverse page mapping, handles memcg charges and LRU
    4363             :  * addition.
    4364             :  *
    4365             :  * The function expects the page to be locked and on success it consumes a
    4366             :  * reference of a page being mapped (for the PTE which maps it).
    4367             :  *
    4368             :  * Return: %0 on success, %VM_FAULT_ code in case of error.
    4369             :  */
    4370           0 : vm_fault_t finish_fault(struct vm_fault *vmf)
    4371             : {
    4372           0 :         struct vm_area_struct *vma = vmf->vma;
    4373             :         struct page *page;
    4374             :         vm_fault_t ret;
    4375             : 
    4376             :         /* Did we COW the page? */
    4377           0 :         if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
    4378           0 :                 page = vmf->cow_page;
    4379             :         else
    4380           0 :                 page = vmf->page;
    4381             : 
    4382             :         /*
    4383             :          * check even for read faults because we might have lost our CoWed
    4384             :          * page
    4385             :          */
    4386           0 :         if (!(vma->vm_flags & VM_SHARED)) {
    4387           0 :                 ret = check_stable_address_space(vma->vm_mm);
    4388           0 :                 if (ret)
    4389             :                         return ret;
    4390             :         }
    4391             : 
    4392           0 :         if (pmd_none(*vmf->pmd)) {
    4393           0 :                 if (PageTransCompound(page)) {
    4394             :                         ret = do_set_pmd(vmf, page);
    4395             :                         if (ret != VM_FAULT_FALLBACK)
    4396             :                                 return ret;
    4397             :                 }
    4398             : 
    4399           0 :                 if (vmf->prealloc_pte)
    4400           0 :                         pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
    4401           0 :                 else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
    4402             :                         return VM_FAULT_OOM;
    4403             :         }
    4404             : 
    4405           0 :         vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
    4406             :                                       vmf->address, &vmf->ptl);
    4407           0 :         if (!vmf->pte)
    4408             :                 return VM_FAULT_NOPAGE;
    4409             : 
    4410             :         /* Re-check under ptl */
    4411           0 :         if (likely(!vmf_pte_changed(vmf))) {
    4412           0 :                 do_set_pte(vmf, page, vmf->address);
    4413             : 
    4414             :                 /* no need to invalidate: a not-present page won't be cached */
    4415             :                 update_mmu_cache(vma, vmf->address, vmf->pte);
    4416             : 
    4417           0 :                 ret = 0;
    4418             :         } else {
    4419             :                 update_mmu_tlb(vma, vmf->address, vmf->pte);
    4420             :                 ret = VM_FAULT_NOPAGE;
    4421             :         }
    4422             : 
    4423           0 :         pte_unmap_unlock(vmf->pte, vmf->ptl);
    4424           0 :         return ret;
    4425             : }
    4426             : 
    4427             : static unsigned long fault_around_pages __read_mostly =
    4428             :         65536 >> PAGE_SHIFT;
    4429             : 
    4430             : #ifdef CONFIG_DEBUG_FS
    4431             : static int fault_around_bytes_get(void *data, u64 *val)
    4432             : {
    4433             :         *val = fault_around_pages << PAGE_SHIFT;
    4434             :         return 0;
    4435             : }
    4436             : 
    4437             : /*
    4438             :  * fault_around_bytes must be rounded down to the nearest page order as it's
    4439             :  * what do_fault_around() expects to see.
    4440             :  */
    4441             : static int fault_around_bytes_set(void *data, u64 val)
    4442             : {
    4443             :         if (val / PAGE_SIZE > PTRS_PER_PTE)
    4444             :                 return -EINVAL;
    4445             : 
    4446             :         /*
    4447             :          * The minimum value is 1 page, however this results in no fault-around
    4448             :          * at all. See should_fault_around().
    4449             :          */
    4450             :         fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
    4451             : 
    4452             :         return 0;
    4453             : }
    4454             : DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
    4455             :                 fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
    4456             : 
    4457             : static int __init fault_around_debugfs(void)
    4458             : {
    4459             :         debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
    4460             :                                    &fault_around_bytes_fops);
    4461             :         return 0;
    4462             : }
    4463             : late_initcall(fault_around_debugfs);
    4464             : #endif
    4465             : 
    4466             : /*
    4467             :  * do_fault_around() tries to map a few pages around the fault address. The hope
    4468             :  * is that the pages will be needed soon and this will lower the number of
    4469             :  * faults to handle.
    4470             :  *
    4471             :  * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
    4472             :  * not ready to be mapped: not up-to-date, locked, etc.
    4473             :  *
    4474             :  * This function doesn't cross VMA or page table boundaries, in order to call
    4475             :  * map_pages() and acquire a PTE lock only once.
    4476             :  *
    4477             :  * fault_around_pages defines how many pages we'll try to map.
    4478             :  * do_fault_around() expects it to be set to a power of two less than or equal
    4479             :  * to PTRS_PER_PTE.
    4480             :  *
    4481             :  * The virtual address of the area that we map is naturally aligned to
    4482             :  * fault_around_pages * PAGE_SIZE rounded down to the machine page size
    4483             :  * (and therefore to page order).  This way it's easier to guarantee
    4484             :  * that we don't cross page table boundaries.
    4485             :  */
    4486           0 : static vm_fault_t do_fault_around(struct vm_fault *vmf)
    4487             : {
    4488           0 :         pgoff_t nr_pages = READ_ONCE(fault_around_pages);
    4489           0 :         pgoff_t pte_off = pte_index(vmf->address);
    4490             :         /* The page offset of vmf->address within the VMA. */
    4491           0 :         pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
    4492             :         pgoff_t from_pte, to_pte;
    4493             :         vm_fault_t ret;
    4494             : 
    4495             :         /* The PTE offset of the start address, clamped to the VMA. */
    4496           0 :         from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
    4497             :                        pte_off - min(pte_off, vma_off));
    4498             : 
    4499             :         /* The PTE offset of the end address, clamped to the VMA and PTE. */
    4500           0 :         to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
    4501             :                       pte_off + vma_pages(vmf->vma) - vma_off) - 1;
    4502             : 
    4503           0 :         if (pmd_none(*vmf->pmd)) {
    4504           0 :                 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
    4505           0 :                 if (!vmf->prealloc_pte)
    4506             :                         return VM_FAULT_OOM;
    4507             :         }
    4508             : 
    4509             :         rcu_read_lock();
    4510           0 :         ret = vmf->vma->vm_ops->map_pages(vmf,
    4511           0 :                         vmf->pgoff + from_pte - pte_off,
    4512           0 :                         vmf->pgoff + to_pte - pte_off);
    4513             :         rcu_read_unlock();
    4514             : 
    4515           0 :         return ret;
    4516             : }
    4517             : 
    4518             : /* Return true if we should do read fault-around, false otherwise */
    4519             : static inline bool should_fault_around(struct vm_fault *vmf)
    4520             : {
    4521             :         /* No ->map_pages?  No way to fault around... */
    4522           0 :         if (!vmf->vma->vm_ops->map_pages)
    4523             :                 return false;
    4524             : 
    4525           0 :         if (uffd_disable_fault_around(vmf->vma))
    4526             :                 return false;
    4527             : 
    4528             :         /* A single page implies no faulting 'around' at all. */
    4529           0 :         return fault_around_pages > 1;
    4530             : }
    4531             : 
    4532           0 : static vm_fault_t do_read_fault(struct vm_fault *vmf)
    4533             : {
    4534           0 :         vm_fault_t ret = 0;
    4535             : 
    4536             :         /*
    4537             :          * Let's call ->map_pages() first and use ->fault() as fallback
    4538             :          * if page by the offset is not ready to be mapped (cold cache or
    4539             :          * something).
    4540             :          */
    4541           0 :         if (should_fault_around(vmf)) {
    4542           0 :                 ret = do_fault_around(vmf);
    4543           0 :                 if (ret)
    4544             :                         return ret;
    4545             :         }
    4546             : 
    4547           0 :         ret = __do_fault(vmf);
    4548           0 :         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
    4549             :                 return ret;
    4550             : 
    4551           0 :         ret |= finish_fault(vmf);
    4552           0 :         unlock_page(vmf->page);
    4553           0 :         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
    4554           0 :                 put_page(vmf->page);
    4555             :         return ret;
    4556             : }
    4557             : 
    4558           0 : static vm_fault_t do_cow_fault(struct vm_fault *vmf)
    4559             : {
    4560           0 :         struct vm_area_struct *vma = vmf->vma;
    4561             :         vm_fault_t ret;
    4562             : 
    4563           0 :         if (unlikely(anon_vma_prepare(vma)))
    4564             :                 return VM_FAULT_OOM;
    4565             : 
    4566           0 :         vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
    4567           0 :         if (!vmf->cow_page)
    4568             :                 return VM_FAULT_OOM;
    4569             : 
    4570           0 :         if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
    4571             :                                 GFP_KERNEL)) {
    4572             :                 put_page(vmf->cow_page);
    4573             :                 return VM_FAULT_OOM;
    4574             :         }
    4575           0 :         folio_throttle_swaprate(page_folio(vmf->cow_page), GFP_KERNEL);
    4576             : 
    4577           0 :         ret = __do_fault(vmf);
    4578           0 :         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
    4579             :                 goto uncharge_out;
    4580           0 :         if (ret & VM_FAULT_DONE_COW)
    4581             :                 return ret;
    4582             : 
    4583           0 :         copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
    4584           0 :         __SetPageUptodate(vmf->cow_page);
    4585             : 
    4586           0 :         ret |= finish_fault(vmf);
    4587           0 :         unlock_page(vmf->page);
    4588           0 :         put_page(vmf->page);
    4589           0 :         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
    4590             :                 goto uncharge_out;
    4591             :         return ret;
    4592             : uncharge_out:
    4593           0 :         put_page(vmf->cow_page);
    4594           0 :         return ret;
    4595             : }
    4596             : 
    4597           0 : static vm_fault_t do_shared_fault(struct vm_fault *vmf)
    4598             : {
    4599           0 :         struct vm_area_struct *vma = vmf->vma;
    4600             :         vm_fault_t ret, tmp;
    4601             : 
    4602           0 :         ret = __do_fault(vmf);
    4603           0 :         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
    4604             :                 return ret;
    4605             : 
    4606             :         /*
    4607             :          * Check if the backing address space wants to know that the page is
    4608             :          * about to become writable
    4609             :          */
    4610           0 :         if (vma->vm_ops->page_mkwrite) {
    4611           0 :                 unlock_page(vmf->page);
    4612           0 :                 tmp = do_page_mkwrite(vmf);
    4613           0 :                 if (unlikely(!tmp ||
    4614             :                                 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
    4615           0 :                         put_page(vmf->page);
    4616           0 :                         return tmp;
    4617             :                 }
    4618             :         }
    4619             : 
    4620           0 :         ret |= finish_fault(vmf);
    4621           0 :         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
    4622             :                                         VM_FAULT_RETRY))) {
    4623           0 :                 unlock_page(vmf->page);
    4624           0 :                 put_page(vmf->page);
    4625           0 :                 return ret;
    4626             :         }
    4627             : 
    4628           0 :         ret |= fault_dirty_shared_page(vmf);
    4629           0 :         return ret;
    4630             : }
    4631             : 
    4632             : /*
    4633             :  * We enter with non-exclusive mmap_lock (to exclude vma changes,
    4634             :  * but allow concurrent faults).
    4635             :  * The mmap_lock may have been released depending on flags and our
    4636             :  * return value.  See filemap_fault() and __folio_lock_or_retry().
    4637             :  * If mmap_lock is released, vma may become invalid (for example
    4638             :  * by other thread calling munmap()).
    4639             :  */
    4640           0 : static vm_fault_t do_fault(struct vm_fault *vmf)
    4641             : {
    4642           0 :         struct vm_area_struct *vma = vmf->vma;
    4643           0 :         struct mm_struct *vm_mm = vma->vm_mm;
    4644             :         vm_fault_t ret;
    4645             : 
    4646             :         /*
    4647             :          * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
    4648             :          */
    4649           0 :         if (!vma->vm_ops->fault) {
    4650           0 :                 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
    4651             :                                                vmf->address, &vmf->ptl);
    4652           0 :                 if (unlikely(!vmf->pte))
    4653             :                         ret = VM_FAULT_SIGBUS;
    4654             :                 else {
    4655             :                         /*
    4656             :                          * Make sure this is not a temporary clearing of pte
    4657             :                          * by holding ptl and checking again. A R/M/W update
    4658             :                          * of pte involves: take ptl, clearing the pte so that
    4659             :                          * we don't have concurrent modification by hardware
    4660             :                          * followed by an update.
    4661             :                          */
    4662           0 :                         if (unlikely(pte_none(ptep_get(vmf->pte))))
    4663             :                                 ret = VM_FAULT_SIGBUS;
    4664             :                         else
    4665           0 :                                 ret = VM_FAULT_NOPAGE;
    4666             : 
    4667           0 :                         pte_unmap_unlock(vmf->pte, vmf->ptl);
    4668             :                 }
    4669           0 :         } else if (!(vmf->flags & FAULT_FLAG_WRITE))
    4670           0 :                 ret = do_read_fault(vmf);
    4671           0 :         else if (!(vma->vm_flags & VM_SHARED))
    4672           0 :                 ret = do_cow_fault(vmf);
    4673             :         else
    4674           0 :                 ret = do_shared_fault(vmf);
    4675             : 
    4676             :         /* preallocated pagetable is unused: free it */
    4677           0 :         if (vmf->prealloc_pte) {
    4678           0 :                 pte_free(vm_mm, vmf->prealloc_pte);
    4679           0 :                 vmf->prealloc_pte = NULL;
    4680             :         }
    4681           0 :         return ret;
    4682             : }
    4683             : 
    4684           0 : int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
    4685             :                       unsigned long addr, int page_nid, int *flags)
    4686             : {
    4687           0 :         get_page(page);
    4688             : 
    4689             :         /* Record the current PID accessing VMA */
    4690           0 :         vma_set_access_pid_bit(vma);
    4691             : 
    4692             :         count_vm_numa_event(NUMA_HINT_FAULTS);
    4693           0 :         if (page_nid == numa_node_id()) {
    4694             :                 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
    4695           0 :                 *flags |= TNF_FAULT_LOCAL;
    4696             :         }
    4697             : 
    4698           0 :         return mpol_misplaced(page, vma, addr);
    4699             : }
    4700             : 
    4701             : static vm_fault_t do_numa_page(struct vm_fault *vmf)
    4702             : {
    4703             :         struct vm_area_struct *vma = vmf->vma;
    4704             :         struct page *page = NULL;
    4705             :         int page_nid = NUMA_NO_NODE;
    4706             :         bool writable = false;
    4707             :         int last_cpupid;
    4708             :         int target_nid;
    4709             :         pte_t pte, old_pte;
    4710             :         int flags = 0;
    4711             : 
    4712             :         /*
    4713             :          * The "pte" at this point cannot be used safely without
    4714             :          * validation through pte_unmap_same(). It's of NUMA type but
    4715             :          * the pfn may be screwed if the read is non atomic.
    4716             :          */
    4717             :         spin_lock(vmf->ptl);
    4718             :         if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
    4719             :                 pte_unmap_unlock(vmf->pte, vmf->ptl);
    4720             :                 goto out;
    4721             :         }
    4722             : 
    4723             :         /* Get the normal PTE  */
    4724             :         old_pte = ptep_get(vmf->pte);
    4725             :         pte = pte_modify(old_pte, vma->vm_page_prot);
    4726             : 
    4727             :         /*
    4728             :          * Detect now whether the PTE could be writable; this information
    4729             :          * is only valid while holding the PT lock.
    4730             :          */
    4731             :         writable = pte_write(pte);
    4732             :         if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
    4733             :             can_change_pte_writable(vma, vmf->address, pte))
    4734             :                 writable = true;
    4735             : 
    4736             :         page = vm_normal_page(vma, vmf->address, pte);
    4737             :         if (!page || is_zone_device_page(page))
    4738             :                 goto out_map;
    4739             : 
    4740             :         /* TODO: handle PTE-mapped THP */
    4741             :         if (PageCompound(page))
    4742             :                 goto out_map;
    4743             : 
    4744             :         /*
    4745             :          * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
    4746             :          * much anyway since they can be in shared cache state. This misses
    4747             :          * the case where a mapping is writable but the process never writes
    4748             :          * to it but pte_write gets cleared during protection updates and
    4749             :          * pte_dirty has unpredictable behaviour between PTE scan updates,
    4750             :          * background writeback, dirty balancing and application behaviour.
    4751             :          */
    4752             :         if (!writable)
    4753             :                 flags |= TNF_NO_GROUP;
    4754             : 
    4755             :         /*
    4756             :          * Flag if the page is shared between multiple address spaces. This
    4757             :          * is later used when determining whether to group tasks together
    4758             :          */
    4759             :         if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
    4760             :                 flags |= TNF_SHARED;
    4761             : 
    4762             :         page_nid = page_to_nid(page);
    4763             :         /*
    4764             :          * For memory tiering mode, cpupid of slow memory page is used
    4765             :          * to record page access time.  So use default value.
    4766             :          */
    4767             :         if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
    4768             :             !node_is_toptier(page_nid))
    4769             :                 last_cpupid = (-1 & LAST_CPUPID_MASK);
    4770             :         else
    4771             :                 last_cpupid = page_cpupid_last(page);
    4772             :         target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
    4773             :                         &flags);
    4774             :         if (target_nid == NUMA_NO_NODE) {
    4775             :                 put_page(page);
    4776             :                 goto out_map;
    4777             :         }
    4778             :         pte_unmap_unlock(vmf->pte, vmf->ptl);
    4779             :         writable = false;
    4780             : 
    4781             :         /* Migrate to the requested node */
    4782             :         if (migrate_misplaced_page(page, vma, target_nid)) {
    4783             :                 page_nid = target_nid;
    4784             :                 flags |= TNF_MIGRATED;
    4785             :         } else {
    4786             :                 flags |= TNF_MIGRATE_FAIL;
    4787             :                 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
    4788             :                                                vmf->address, &vmf->ptl);
    4789             :                 if (unlikely(!vmf->pte))
    4790             :                         goto out;
    4791             :                 if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
    4792             :                         pte_unmap_unlock(vmf->pte, vmf->ptl);
    4793             :                         goto out;
    4794             :                 }
    4795             :                 goto out_map;
    4796             :         }
    4797             : 
    4798             : out:
    4799             :         if (page_nid != NUMA_NO_NODE)
    4800             :                 task_numa_fault(last_cpupid, page_nid, 1, flags);
    4801             :         return 0;
    4802             : out_map:
    4803             :         /*
    4804             :          * Make it present again, depending on how arch implements
    4805             :          * non-accessible ptes, some can allow access by kernel mode.
    4806             :          */
    4807             :         old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
    4808             :         pte = pte_modify(old_pte, vma->vm_page_prot);
    4809             :         pte = pte_mkyoung(pte);
    4810             :         if (writable)
    4811             :                 pte = pte_mkwrite(pte);
    4812             :         ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
    4813             :         update_mmu_cache(vma, vmf->address, vmf->pte);
    4814             :         pte_unmap_unlock(vmf->pte, vmf->ptl);
    4815             :         goto out;
    4816             : }
    4817             : 
    4818             : static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
    4819             : {
    4820             :         if (vma_is_anonymous(vmf->vma))
    4821             :                 return do_huge_pmd_anonymous_page(vmf);
    4822             :         if (vmf->vma->vm_ops->huge_fault)
    4823             :                 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
    4824             :         return VM_FAULT_FALLBACK;
    4825             : }
    4826             : 
    4827             : /* `inline' is required to avoid gcc 4.1.2 build error */
    4828             : static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
    4829             : {
    4830             :         const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
    4831             :         vm_fault_t ret;
    4832             : 
    4833             :         if (vma_is_anonymous(vmf->vma)) {
    4834             :                 if (likely(!unshare) &&
    4835             :                     userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
    4836             :                         return handle_userfault(vmf, VM_UFFD_WP);
    4837             :                 return do_huge_pmd_wp_page(vmf);
    4838             :         }
    4839             : 
    4840             :         if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
    4841             :                 if (vmf->vma->vm_ops->huge_fault) {
    4842             :                         ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
    4843             :                         if (!(ret & VM_FAULT_FALLBACK))
    4844             :                                 return ret;
    4845             :                 }
    4846             :         }
    4847             : 
    4848             :         /* COW or write-notify handled on pte level: split pmd. */
    4849             :         __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
    4850             : 
    4851             :         return VM_FAULT_FALLBACK;
    4852             : }
    4853             : 
    4854             : static vm_fault_t create_huge_pud(struct vm_fault *vmf)
    4855             : {
    4856             : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                     \
    4857             :         defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
    4858             :         /* No support for anonymous transparent PUD pages yet */
    4859             :         if (vma_is_anonymous(vmf->vma))
    4860             :                 return VM_FAULT_FALLBACK;
    4861             :         if (vmf->vma->vm_ops->huge_fault)
    4862             :                 return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
    4863             : #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    4864             :         return VM_FAULT_FALLBACK;
    4865             : }
    4866             : 
    4867             : static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
    4868             : {
    4869             : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                     \
    4870             :         defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
    4871             :         vm_fault_t ret;
    4872             : 
    4873             :         /* No support for anonymous transparent PUD pages yet */
    4874             :         if (vma_is_anonymous(vmf->vma))
    4875             :                 goto split;
    4876             :         if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
    4877             :                 if (vmf->vma->vm_ops->huge_fault) {
    4878             :                         ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
    4879             :                         if (!(ret & VM_FAULT_FALLBACK))
    4880             :                                 return ret;
    4881             :                 }
    4882             :         }
    4883             : split:
    4884             :         /* COW or write-notify not handled on PUD level: split pud.*/
    4885             :         __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
    4886             : #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
    4887             :         return VM_FAULT_FALLBACK;
    4888             : }
    4889             : 
    4890             : /*
    4891             :  * These routines also need to handle stuff like marking pages dirty
    4892             :  * and/or accessed for architectures that don't do it in hardware (most
    4893             :  * RISC architectures).  The early dirtying is also good on the i386.
    4894             :  *
    4895             :  * There is also a hook called "update_mmu_cache()" that architectures
    4896             :  * with external mmu caches can use to update those (ie the Sparc or
    4897             :  * PowerPC hashed page tables that act as extended TLBs).
    4898             :  *
    4899             :  * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
    4900             :  * concurrent faults).
    4901             :  *
    4902             :  * The mmap_lock may have been released depending on flags and our return value.
    4903             :  * See filemap_fault() and __folio_lock_or_retry().
    4904             :  */
    4905           0 : static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
    4906             : {
    4907             :         pte_t entry;
    4908             : 
    4909           0 :         if (unlikely(pmd_none(*vmf->pmd))) {
    4910             :                 /*
    4911             :                  * Leave __pte_alloc() until later: because vm_ops->fault may
    4912             :                  * want to allocate huge page, and if we expose page table
    4913             :                  * for an instant, it will be difficult to retract from
    4914             :                  * concurrent faults and from rmap lookups.
    4915             :                  */
    4916           0 :                 vmf->pte = NULL;
    4917           0 :                 vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
    4918             :         } else {
    4919             :                 /*
    4920             :                  * A regular pmd is established and it can't morph into a huge
    4921             :                  * pmd by anon khugepaged, since that takes mmap_lock in write
    4922             :                  * mode; but shmem or file collapse to THP could still morph
    4923             :                  * it into a huge pmd: just retry later if so.
    4924             :                  */
    4925           0 :                 vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
    4926             :                                                  vmf->address, &vmf->ptl);
    4927           0 :                 if (unlikely(!vmf->pte))
    4928             :                         return 0;
    4929           0 :                 vmf->orig_pte = ptep_get_lockless(vmf->pte);
    4930           0 :                 vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
    4931             : 
    4932           0 :                 if (pte_none(vmf->orig_pte)) {
    4933           0 :                         pte_unmap(vmf->pte);
    4934           0 :                         vmf->pte = NULL;
    4935             :                 }
    4936             :         }
    4937             : 
    4938           0 :         if (!vmf->pte)
    4939           0 :                 return do_pte_missing(vmf);
    4940             : 
    4941           0 :         if (!pte_present(vmf->orig_pte))
    4942           0 :                 return do_swap_page(vmf);
    4943             : 
    4944           0 :         if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
    4945             :                 return do_numa_page(vmf);
    4946             : 
    4947           0 :         spin_lock(vmf->ptl);
    4948           0 :         entry = vmf->orig_pte;
    4949           0 :         if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
    4950             :                 update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
    4951             :                 goto unlock;
    4952             :         }
    4953           0 :         if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
    4954           0 :                 if (!pte_write(entry))
    4955           0 :                         return do_wp_page(vmf);
    4956           0 :                 else if (likely(vmf->flags & FAULT_FLAG_WRITE))
    4957             :                         entry = pte_mkdirty(entry);
    4958             :         }
    4959           0 :         entry = pte_mkyoung(entry);
    4960           0 :         if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
    4961           0 :                                 vmf->flags & FAULT_FLAG_WRITE)) {
    4962             :                 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
    4963             :         } else {
    4964             :                 /* Skip spurious TLB flush for retried page fault */
    4965           0 :                 if (vmf->flags & FAULT_FLAG_TRIED)
    4966             :                         goto unlock;
    4967             :                 /*
    4968             :                  * This is needed only for protection faults but the arch code
    4969             :                  * is not yet telling us if this is a protection fault or not.
    4970             :                  * This still avoids useless tlb flushes for .text page faults
    4971             :                  * with threads.
    4972             :                  */
    4973           0 :                 if (vmf->flags & FAULT_FLAG_WRITE)
    4974           0 :                         flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
    4975             :                                                      vmf->pte);
    4976             :         }
    4977             : unlock:
    4978           0 :         pte_unmap_unlock(vmf->pte, vmf->ptl);
    4979           0 :         return 0;
    4980             : }
    4981             : 
    4982             : /*
    4983             :  * By the time we get here, we already hold the mm semaphore
    4984             :  *
    4985             :  * The mmap_lock may have been released depending on flags and our
    4986             :  * return value.  See filemap_fault() and __folio_lock_or_retry().
    4987             :  */
    4988           0 : static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
    4989             :                 unsigned long address, unsigned int flags)
    4990             : {
    4991           0 :         struct vm_fault vmf = {
    4992             :                 .vma = vma,
    4993           0 :                 .address = address & PAGE_MASK,
    4994             :                 .real_address = address,
    4995             :                 .flags = flags,
    4996           0 :                 .pgoff = linear_page_index(vma, address),
    4997           0 :                 .gfp_mask = __get_fault_gfp_mask(vma),
    4998             :         };
    4999           0 :         struct mm_struct *mm = vma->vm_mm;
    5000           0 :         unsigned long vm_flags = vma->vm_flags;
    5001             :         pgd_t *pgd;
    5002             :         p4d_t *p4d;
    5003             :         vm_fault_t ret;
    5004             : 
    5005           0 :         pgd = pgd_offset(mm, address);
    5006           0 :         p4d = p4d_alloc(mm, pgd, address);
    5007           0 :         if (!p4d)
    5008             :                 return VM_FAULT_OOM;
    5009             : 
    5010           0 :         vmf.pud = pud_alloc(mm, p4d, address);
    5011             :         if (!vmf.pud)
    5012             :                 return VM_FAULT_OOM;
    5013             : retry_pud:
    5014             :         if (pud_none(*vmf.pud) &&
    5015             :             hugepage_vma_check(vma, vm_flags, false, true, true)) {
    5016             :                 ret = create_huge_pud(&vmf);
    5017             :                 if (!(ret & VM_FAULT_FALLBACK))
    5018             :                         return ret;
    5019             :         } else {
    5020             :                 pud_t orig_pud = *vmf.pud;
    5021             : 
    5022           0 :                 barrier();
    5023           0 :                 if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
    5024             : 
    5025             :                         /*
    5026             :                          * TODO once we support anonymous PUDs: NUMA case and
    5027             :                          * FAULT_FLAG_UNSHARE handling.
    5028             :                          */
    5029             :                         if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
    5030             :                                 ret = wp_huge_pud(&vmf, orig_pud);
    5031             :                                 if (!(ret & VM_FAULT_FALLBACK))
    5032             :                                         return ret;
    5033             :                         } else {
    5034             :                                 huge_pud_set_accessed(&vmf, orig_pud);
    5035             :                                 return 0;
    5036             :                         }
    5037             :                 }
    5038             :         }
    5039             : 
    5040           0 :         vmf.pmd = pmd_alloc(mm, vmf.pud, address);
    5041           0 :         if (!vmf.pmd)
    5042             :                 return VM_FAULT_OOM;
    5043             : 
    5044             :         /* Huge pud page fault raced with pmd_alloc? */
    5045           0 :         if (pud_trans_unstable(vmf.pud))
    5046             :                 goto retry_pud;
    5047             : 
    5048             :         if (pmd_none(*vmf.pmd) &&
    5049             :             hugepage_vma_check(vma, vm_flags, false, true, true)) {
    5050             :                 ret = create_huge_pmd(&vmf);
    5051             :                 if (!(ret & VM_FAULT_FALLBACK))
    5052             :                         return ret;
    5053             :         } else {
    5054           0 :                 vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
    5055             : 
    5056           0 :                 if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
    5057             :                         VM_BUG_ON(thp_migration_supported() &&
    5058             :                                           !is_pmd_migration_entry(vmf.orig_pmd));
    5059             :                         if (is_pmd_migration_entry(vmf.orig_pmd))
    5060             :                                 pmd_migration_entry_wait(mm, vmf.pmd);
    5061             :                         return 0;
    5062             :                 }
    5063           0 :                 if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
    5064             :                         if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
    5065             :                                 return do_huge_pmd_numa_page(&vmf);
    5066             : 
    5067             :                         if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
    5068             :                             !pmd_write(vmf.orig_pmd)) {
    5069             :                                 ret = wp_huge_pmd(&vmf);
    5070             :                                 if (!(ret & VM_FAULT_FALLBACK))
    5071             :                                         return ret;
    5072             :                         } else {
    5073             :                                 huge_pmd_set_accessed(&vmf);
    5074             :                                 return 0;
    5075             :                         }
    5076             :                 }
    5077             :         }
    5078             : 
    5079           0 :         return handle_pte_fault(&vmf);
    5080             : }
    5081             : 
    5082             : /**
    5083             :  * mm_account_fault - Do page fault accounting
    5084             :  *
    5085             :  * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
    5086             :  *        of perf event counters, but we'll still do the per-task accounting to
    5087             :  *        the task who triggered this page fault.
    5088             :  * @address: the faulted address.
    5089             :  * @flags: the fault flags.
    5090             :  * @ret: the fault retcode.
    5091             :  *
    5092             :  * This will take care of most of the page fault accounting.  Meanwhile, it
    5093             :  * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
    5094             :  * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
    5095             :  * still be in per-arch page fault handlers at the entry of page fault.
    5096             :  */
    5097           0 : static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
    5098             :                                     unsigned long address, unsigned int flags,
    5099             :                                     vm_fault_t ret)
    5100             : {
    5101             :         bool major;
    5102             : 
    5103             :         /* Incomplete faults will be accounted upon completion. */
    5104           0 :         if (ret & VM_FAULT_RETRY)
    5105             :                 return;
    5106             : 
    5107             :         /*
    5108             :          * To preserve the behavior of older kernels, PGFAULT counters record
    5109             :          * both successful and failed faults, as opposed to perf counters,
    5110             :          * which ignore failed cases.
    5111             :          */
    5112           0 :         count_vm_event(PGFAULT);
    5113           0 :         count_memcg_event_mm(mm, PGFAULT);
    5114             : 
    5115             :         /*
    5116             :          * Do not account for unsuccessful faults (e.g. when the address wasn't
    5117             :          * valid).  That includes arch_vma_access_permitted() failing before
    5118             :          * reaching here. So this is not a "this many hardware page faults"
    5119             :          * counter.  We should use the hw profiling for that.
    5120             :          */
    5121           0 :         if (ret & VM_FAULT_ERROR)
    5122             :                 return;
    5123             : 
    5124             :         /*
    5125             :          * We define the fault as a major fault when the final successful fault
    5126             :          * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
    5127             :          * handle it immediately previously).
    5128             :          */
    5129           0 :         major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
    5130             : 
    5131           0 :         if (major)
    5132           0 :                 current->maj_flt++;
    5133             :         else
    5134           0 :                 current->min_flt++;
    5135             : 
    5136             :         /*
    5137             :          * If the fault is done for GUP, regs will be NULL.  We only do the
    5138             :          * accounting for the per thread fault counters who triggered the
    5139             :          * fault, and we skip the perf event updates.
    5140             :          */
    5141             :         if (!regs)
    5142             :                 return;
    5143             : 
    5144             :         if (major)
    5145             :                 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
    5146             :         else
    5147             :                 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
    5148             : }
    5149             : 
    5150             : #ifdef CONFIG_LRU_GEN
    5151             : static void lru_gen_enter_fault(struct vm_area_struct *vma)
    5152             : {
    5153             :         /* the LRU algorithm only applies to accesses with recency */
    5154             :         current->in_lru_fault = vma_has_recency(vma);
    5155             : }
    5156             : 
    5157             : static void lru_gen_exit_fault(void)
    5158             : {
    5159             :         current->in_lru_fault = false;
    5160             : }
    5161             : #else
    5162             : static void lru_gen_enter_fault(struct vm_area_struct *vma)
    5163             : {
    5164             : }
    5165             : 
    5166             : static void lru_gen_exit_fault(void)
    5167             : {
    5168             : }
    5169             : #endif /* CONFIG_LRU_GEN */
    5170             : 
    5171           0 : static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
    5172             :                                        unsigned int *flags)
    5173             : {
    5174           0 :         if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
    5175           0 :                 if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
    5176             :                         return VM_FAULT_SIGSEGV;
    5177             :                 /*
    5178             :                  * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
    5179             :                  * just treat it like an ordinary read-fault otherwise.
    5180             :                  */
    5181           0 :                 if (!is_cow_mapping(vma->vm_flags))
    5182           0 :                         *flags &= ~FAULT_FLAG_UNSHARE;
    5183           0 :         } else if (*flags & FAULT_FLAG_WRITE) {
    5184             :                 /* Write faults on read-only mappings are impossible ... */
    5185           0 :                 if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
    5186             :                         return VM_FAULT_SIGSEGV;
    5187             :                 /* ... and FOLL_FORCE only applies to COW mappings. */
    5188           0 :                 if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
    5189             :                                  !is_cow_mapping(vma->vm_flags)))
    5190             :                         return VM_FAULT_SIGSEGV;
    5191             :         }
    5192             :         return 0;
    5193             : }
    5194             : 
    5195             : /*
    5196             :  * By the time we get here, we already hold the mm semaphore
    5197             :  *
    5198             :  * The mmap_lock may have been released depending on flags and our
    5199             :  * return value.  See filemap_fault() and __folio_lock_or_retry().
    5200             :  */
    5201           0 : vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
    5202             :                            unsigned int flags, struct pt_regs *regs)
    5203             : {
    5204             :         /* If the fault handler drops the mmap_lock, vma may be freed */
    5205           0 :         struct mm_struct *mm = vma->vm_mm;
    5206             :         vm_fault_t ret;
    5207             : 
    5208           0 :         __set_current_state(TASK_RUNNING);
    5209             : 
    5210           0 :         ret = sanitize_fault_flags(vma, &flags);
    5211           0 :         if (ret)
    5212             :                 goto out;
    5213             : 
    5214           0 :         if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
    5215           0 :                                             flags & FAULT_FLAG_INSTRUCTION,
    5216           0 :                                             flags & FAULT_FLAG_REMOTE)) {
    5217             :                 ret = VM_FAULT_SIGSEGV;
    5218             :                 goto out;
    5219             :         }
    5220             : 
    5221             :         /*
    5222             :          * Enable the memcg OOM handling for faults triggered in user
    5223             :          * space.  Kernel faults are handled more gracefully.
    5224             :          */
    5225             :         if (flags & FAULT_FLAG_USER)
    5226             :                 mem_cgroup_enter_user_fault();
    5227             : 
    5228           0 :         lru_gen_enter_fault(vma);
    5229             : 
    5230           0 :         if (unlikely(is_vm_hugetlb_page(vma)))
    5231             :                 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
    5232             :         else
    5233           0 :                 ret = __handle_mm_fault(vma, address, flags);
    5234             : 
    5235             :         lru_gen_exit_fault();
    5236             : 
    5237           0 :         if (flags & FAULT_FLAG_USER) {
    5238             :                 mem_cgroup_exit_user_fault();
    5239             :                 /*
    5240             :                  * The task may have entered a memcg OOM situation but
    5241             :                  * if the allocation error was handled gracefully (no
    5242             :                  * VM_FAULT_OOM), there is no need to kill anything.
    5243             :                  * Just clean up the OOM state peacefully.
    5244             :                  */
    5245           0 :                 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
    5246             :                         mem_cgroup_oom_synchronize(false);
    5247             :         }
    5248             : out:
    5249           0 :         mm_account_fault(mm, regs, address, flags, ret);
    5250             : 
    5251           0 :         return ret;
    5252             : }
    5253             : EXPORT_SYMBOL_GPL(handle_mm_fault);
    5254             : 
    5255             : #ifdef CONFIG_LOCK_MM_AND_FIND_VMA
    5256             : #include <linux/extable.h>
    5257             : 
    5258             : static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
    5259             : {
    5260             :         /* Even if this succeeds, make it clear we *might* have slept */
    5261             :         if (likely(mmap_read_trylock(mm))) {
    5262             :                 might_sleep();
    5263             :                 return true;
    5264             :         }
    5265             : 
    5266             :         if (regs && !user_mode(regs)) {
    5267             :                 unsigned long ip = instruction_pointer(regs);
    5268             :                 if (!search_exception_tables(ip))
    5269             :                         return false;
    5270             :         }
    5271             : 
    5272             :         return !mmap_read_lock_killable(mm);
    5273             : }
    5274             : 
    5275             : static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
    5276             : {
    5277             :         /*
    5278             :          * We don't have this operation yet.
    5279             :          *
    5280             :          * It should be easy enough to do: it's basically a
    5281             :          *    atomic_long_try_cmpxchg_acquire()
    5282             :          * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
    5283             :          * it also needs the proper lockdep magic etc.
    5284             :          */
    5285             :         return false;
    5286             : }
    5287             : 
    5288             : static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
    5289             : {
    5290             :         mmap_read_unlock(mm);
    5291             :         if (regs && !user_mode(regs)) {
    5292             :                 unsigned long ip = instruction_pointer(regs);
    5293             :                 if (!search_exception_tables(ip))
    5294             :                         return false;
    5295             :         }
    5296             :         return !mmap_write_lock_killable(mm);
    5297             : }
    5298             : 
    5299             : /*
    5300             :  * Helper for page fault handling.
    5301             :  *
    5302             :  * This is kind of equivalend to "mmap_read_lock()" followed
    5303             :  * by "find_extend_vma()", except it's a lot more careful about
    5304             :  * the locking (and will drop the lock on failure).
    5305             :  *
    5306             :  * For example, if we have a kernel bug that causes a page
    5307             :  * fault, we don't want to just use mmap_read_lock() to get
    5308             :  * the mm lock, because that would deadlock if the bug were
    5309             :  * to happen while we're holding the mm lock for writing.
    5310             :  *
    5311             :  * So this checks the exception tables on kernel faults in
    5312             :  * order to only do this all for instructions that are actually
    5313             :  * expected to fault.
    5314             :  *
    5315             :  * We can also actually take the mm lock for writing if we
    5316             :  * need to extend the vma, which helps the VM layer a lot.
    5317             :  */
    5318             : struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
    5319             :                         unsigned long addr, struct pt_regs *regs)
    5320             : {       /* On success the vma is returned with the mmap lock held for read. */
    5321             :         struct vm_area_struct *vma;
    5322             : 
    5323             :         if (!get_mmap_lock_carefully(mm, regs))
    5324             :                 return NULL;    /* no unlock on this path: presumably the lock was never taken */
    5325             : 
    5326             :         vma = find_vma(mm, addr);
    5327             :         if (likely(vma && (vma->vm_start <= addr)))     /* addr already lies inside the vma */
    5328             :                 return vma;     /* fast path: still holding the lock for read */
    5329             : 
    5330             :         /*
    5331             :          * Well, dang. We might still be successful, but only
    5332             :          * if we can extend a vma to do so.
    5333             :          */
    5334             :         if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {  /* nothing above addr that may grow down */
    5335             :                 mmap_read_unlock(mm);
    5336             :                 return NULL;
    5337             :         }
    5338             : 
    5339             :         /*
    5340             :          * We can try to upgrade the mmap lock atomically,
    5341             :          * in which case we can continue to use the vma
    5342             :          * we already looked up.
    5343             :          *
    5344             :          * Otherwise we'll have to drop the mmap lock and
    5345             :          * re-take it, and also look up the vma again,
    5346             :          * re-checking it.
    5347             :          */
    5348             :         if (!mmap_upgrade_trylock(mm)) {
    5349             :                 if (!upgrade_mmap_lock_carefully(mm, regs))
    5350             :                         return NULL;    /* no unlock on this path: presumably the helper left the lock dropped */
    5351             : 
    5352             :                 vma = find_vma(mm, addr);       /* earlier lookup is stale once the lock was dropped */
    5353             :                 if (!vma)
    5354             :                         goto fail;
    5355             :                 if (vma->vm_start <= addr)      /* addr is covered now; no expansion needed */
    5356             :                         goto success;
    5357             :                 if (!(vma->vm_flags & VM_GROWSDOWN))
    5358             :                         goto fail;
    5359             :         }
    5360             : 
    5361             :         if (expand_stack_locked(vma, addr))     /* non-zero return is treated as failure */
    5362             :                 goto fail;
    5363             : 
    5364             : success:
    5365             :         mmap_write_downgrade(mm);       /* write lock -> read lock before handing the vma back */
    5366             :         return vma;
    5367             : 
    5368             : fail:
    5369             :         mmap_write_unlock(mm);  /* both labels are only reached with the write lock held */
    5370             :         return NULL;
    5371             : }
    5372             : #endif
    5373             : 
    5374             : #ifdef CONFIG_PER_VMA_LOCK
    5375             : /*
    5376             :  * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
    5377             :  * stable and not isolated. If the VMA is not found or is being modified the
    5378             :  * function returns NULL.
    5379             :  */
    5380             : struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
    5381             :                                           unsigned long address)
    5382             : {
    5383             :         MA_STATE(mas, &mm->mm_mt, address, address);    /* lookup state over mm->mm_mt for @address */
    5384             :         struct vm_area_struct *vma;
    5385             : 
    5386             :         rcu_read_lock();
    5387             : retry:
    5388             :         vma = mas_walk(&mas);
    5389             :         if (!vma)
    5390             :                 goto inval;
    5391             : 
    5392             :         /* Only anonymous and tcp vmas are supported for now */
    5393             :         if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
    5394             :                 goto inval;
    5395             : 
    5396             :         /* find_mergeable_anon_vma uses adjacent vmas which are not locked */
    5397             :         if (!vma->anon_vma && !vma_is_tcp(vma))
    5398             :                 goto inval;
    5399             : 
    5400             :         if (!vma_start_read(vma))       /* checks above ran unlocked; from here on failures must vma_end_read() */
    5401             :                 goto inval;
    5402             : 
    5403             :         /*
    5404             :          * Due to the possibility of userfault handler dropping mmap_lock, avoid
    5405             :          * it for now and fall back to page fault handling under mmap_lock.
    5406             :          */
    5407             :         if (userfaultfd_armed(vma)) {
    5408             :                 vma_end_read(vma);
    5409             :                 goto inval;
    5410             :         }
    5411             : 
    5412             :         /* Check since vm_start/vm_end might change before we lock the VMA */
    5413             :         if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
    5414             :                 vma_end_read(vma);
    5415             :                 goto inval;
    5416             :         }
    5417             : 
    5418             :         /* Check if the VMA got isolated after we found it */
    5419             :         if (vma->detached) {
    5420             :                 vma_end_read(vma);
    5421             :                 count_vm_vma_lock_event(VMA_LOCK_MISS);
    5422             :                 /* The area was replaced with another one */
    5423             :                 goto retry;     /* look up again, still inside the same RCU read section */
    5424             :         }
    5425             : 
    5426             :         rcu_read_unlock();
    5427             :         return vma;     /* success: no vma_end_read() on this path, so the vma stays read-locked */
    5428             : inval:
    5429             :         rcu_read_unlock();      /* any vma read lock taken above has already been released */
    5430             :         count_vm_vma_lock_event(VMA_LOCK_ABORT);
    5431             :         return NULL;
    5432             : }
    5433             : #endif /* CONFIG_PER_VMA_LOCK */
    5434             : 
    5435             : #ifndef __PAGETABLE_P4D_FOLDED
    5436             : /*
    5437             :  * Allocate p4d page table.
    5438             :  * We've already handled the fast-path in-line.
    5439             :  */
    5440             : int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
    5441             : {       /* Returns 0 once *pgd is populated (by us or by someone else), -ENOMEM if allocation fails. */
    5442             :         p4d_t *new = p4d_alloc_one(mm, address);        /* allocated before the lock is taken */
    5443             :         if (!new)
    5444             :                 return -ENOMEM;
    5445             : 
    5446             :         spin_lock(&mm->page_table_lock);        /* check-and-populate runs under mm->page_table_lock */
    5447             :         if (pgd_present(*pgd)) {        /* Another has populated it */
    5448             :                 p4d_free(mm, new);      /* our table is not needed; still reported as success */
    5449             :         } else {
    5450             :                 smp_wmb(); /* See comment in pmd_install() */
    5451             :                 pgd_populate(mm, pgd, new);
    5452             :         }
    5453             :         spin_unlock(&mm->page_table_lock);
    5454             :         return 0;
    5455             : }
    5456             : #endif /* __PAGETABLE_P4D_FOLDED */
    5457             : 
    5458             : #ifndef __PAGETABLE_PUD_FOLDED
    5459             : /*
    5460             :  * Allocate page upper directory.
    5461             :  * We've already handled the fast-path in-line.
    5462             :  */
    5463             : int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
    5464             : {       /* Returns 0 once *p4d is populated (by us or by someone else), -ENOMEM if allocation fails. */
    5465             :         pud_t *new = pud_alloc_one(mm, address);        /* allocated before the lock is taken */
    5466             :         if (!new)
    5467             :                 return -ENOMEM;
    5468             : 
    5469             :         spin_lock(&mm->page_table_lock);        /* check-and-populate runs under mm->page_table_lock */
    5470             :         if (!p4d_present(*p4d)) {
    5471             :                 mm_inc_nr_puds(mm);     /* accounting is bumped only when our table is actually installed */
    5472             :                 smp_wmb(); /* See comment in pmd_install() */
    5473             :                 p4d_populate(mm, p4d, new);
    5474             :         } else  /* Another has populated it */
    5475             :                 pud_free(mm, new);      /* our table is not needed; still reported as success */
    5476             :         spin_unlock(&mm->page_table_lock);
    5477             :         return 0;
    5478             : }
    5479             : #endif /* __PAGETABLE_PUD_FOLDED */
    5480             : 
    5481             : #ifndef __PAGETABLE_PMD_FOLDED
    5482             : /*
    5483             :  * Allocate page middle directory.
    5484             :  * We've already handled the fast-path in-line.
    5485             :  */
    5486           1 : int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
    5487             : {       /* Returns 0 once *pud is populated (by us or by someone else), -ENOMEM if allocation fails. */
    5488             :         spinlock_t *ptl;
    5489           1 :         pmd_t *new = pmd_alloc_one(mm, address);        /* allocated before the lock is taken */
    5490           1 :         if (!new)
    5491             :                 return -ENOMEM;
    5492             : 
    5493           2 :         ptl = pud_lock(mm, pud);        /* unlike the p4d/pud variants, the lock comes from pud_lock() */
    5494           1 :         if (!pud_present(*pud)) {
    5495           1 :                 mm_inc_nr_pmds(mm);     /* accounting is bumped only when our table is actually installed */
    5496           1 :                 smp_wmb(); /* See comment in pmd_install() */
    5497           1 :                 pud_populate(mm, pud, new);
    5498             :         } else {        /* Another has populated it */
    5499           0 :                 pmd_free(mm, new);      /* our table is not needed; still reported as success */
    5500             :         }
    5501           1 :         spin_unlock(ptl);
    5502           1 :         return 0;
    5503             : }
    5504             : #endif /* __PAGETABLE_PMD_FOLDED */
    5505             : 
    5506             : /**
    5507             :  * follow_pte - look up PTE at a user virtual address
    5508             :  * @mm: the mm_struct of the target address space
    5509             :  * @address: user virtual address
    5510             :  * @ptepp: location to store found PTE
    5511             :  * @ptlp: location to store the lock for the PTE
    5512             :  *
    5513             :  * On a successful return, the pointer to the PTE is stored in @ptepp;
    5514             :  * the corresponding lock is taken and its location is stored in @ptlp.
    5515             :  * The contents of the PTE are only stable until @ptlp is released;
    5516             :  * any further use, if any, must be protected against invalidation
    5517             :  * with MMU notifiers.
    5518             :  *
    5519             :  * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
    5520             :  * should be taken for read.
    5521             :  *
    5522             :  * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
    5523             :  * it is not a good general-purpose API.
    5524             :  *
    5525             :  * Return: zero on success, -ve otherwise.
    5526             :  */
    5527           0 : int follow_pte(struct mm_struct *mm, unsigned long address,
    5528             :                pte_t **ptepp, spinlock_t **ptlp)
    5529             : {
    5530             :         pgd_t *pgd;
    5531             :         p4d_t *p4d;
    5532             :         pud_t *pud;
    5533             :         pmd_t *pmd;
    5534             :         pte_t *ptep;
    5535             : 
    5536           0 :         pgd = pgd_offset(mm, address);  /* walk down one level at a time; a none/bad entry aborts */
    5537             :         if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
    5538             :                 goto out;
    5539             : 
    5540           0 :         p4d = p4d_offset(pgd, address);
    5541             :         if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
    5542             :                 goto out;
    5543             : 
    5544           0 :         pud = pud_offset(p4d, address);
    5545           0 :         if (pud_none(*pud) || unlikely(pud_bad(*pud)))
    5546             :                 goto out;
    5547             : 
    5548           0 :         pmd = pmd_offset(pud, address);
    5549             :         VM_BUG_ON(pmd_trans_huge(*pmd));        /* a transparent huge pmd is not expected here */
    5550             : 
    5551           0 :         ptep = pte_offset_map_lock(mm, pmd, address, ptlp);     /* maps the pte and stores its lock in *ptlp */
    5552           0 :         if (!ptep)
    5553             :                 goto out;
    5554           0 :         if (!pte_present(ptep_get(ptep)))       /* non-present ptes are reported as failure */
    5555             :                 goto unlock;
    5556           0 :         *ptepp = ptep;
    5557           0 :         return 0;       /* success: returns with the pte still mapped and *ptlp still held */
    5558             : unlock:
    5559           0 :         pte_unmap_unlock(ptep, *ptlp);
    5560             : out:
    5561             :         return -EINVAL; /* every failure collapses to -EINVAL; *ptepp is left untouched */
    5562             : }
    5563             : EXPORT_SYMBOL_GPL(follow_pte);
    5564             : 
    5565             : /**
    5566             :  * follow_pfn - look up PFN at a user virtual address
    5567             :  * @vma: memory mapping
    5568             :  * @address: user virtual address
    5569             :  * @pfn: location to store found PFN
    5570             :  *
    5571             :  * Only IO mappings and raw PFN mappings are allowed.
    5572             :  *
    5573             :  * This function does not allow the caller to read the permissions
    5574             :  * of the PTE.  Do not use it.
    5575             :  *
    5576             :  * Return: zero and the pfn at @pfn on success, -ve otherwise.
    5577             :  */
    5578           0 : int follow_pfn(struct vm_area_struct *vma, unsigned long address,
    5579             :         unsigned long *pfn)
    5580             : {
    5581           0 :         int ret = -EINVAL;
    5582             :         spinlock_t *ptl;
    5583             :         pte_t *ptep;
    5584             : 
    5585           0 :         if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))     /* reject vmas that are neither VM_IO nor VM_PFNMAP */
    5586             :                 return ret;
    5587             : 
    5588           0 :         ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
    5589           0 :         if (ret)
    5590             :                 return ret;     /* follow_pte() holds no lock when it fails */
    5591           0 :         *pfn = pte_pfn(ptep_get(ptep)); /* read the pfn while the pte lock is still held */
    5592           0 :         pte_unmap_unlock(ptep, ptl);    /* release what follow_pte() acquired */
    5593           0 :         return 0;
    5594             : }
    5595             : EXPORT_SYMBOL(follow_pfn);
    5596             : 
    5597             : #ifdef CONFIG_HAVE_IOREMAP_PROT
    5598             : int follow_phys(struct vm_area_struct *vma,
    5599             :                 unsigned long address, unsigned int flags,
    5600             :                 unsigned long *prot, resource_size_t *phys)
    5601             : {       /* Store the pte's protection bits in *prot and its physical address in *phys; 0 on success, else -EINVAL. */
    5602             :         int ret = -EINVAL;
    5603             :         pte_t *ptep, pte;
    5604             :         spinlock_t *ptl;
    5605             : 
    5606             :         if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))     /* reject vmas that are neither VM_IO nor VM_PFNMAP */
    5607             :                 goto out;
    5608             : 
    5609             :         if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
    5610             :                 goto out;
    5611             :         pte = ptep_get(ptep);   /* local copy of the pte, taken under the pte lock */
    5612             : 
    5613             :         if ((flags & FOLL_WRITE) && !pte_write(pte))    /* write requested but the pte is not writable */
    5614             :                 goto unlock;
    5615             : 
    5616             :         *prot = pgprot_val(pte_pgprot(pte));
    5617             :         *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;    /* cast first so the shift is done in resource_size_t */
    5618             : 
    5619             :         ret = 0;
    5620             : unlock:
    5621             :         pte_unmap_unlock(ptep, ptl);    /* outputs are left unwritten when we arrive here via goto */
    5622             : out:
    5623             :         return ret;
    5624             : }
    5625             : 
    5626             : /**
    5627             :  * generic_access_phys - generic implementation for iomem mmap access
    5628             :  * @vma: the vma to access
    5629             :  * @addr: userspace address, not relative offset within @vma
    5630             :  * @buf: buffer to read/write
    5631             :  * @len: length of transfer
    5632             :  * @write: set to FOLL_WRITE when writing, otherwise reading
    5633             :  *
    5634             :  * This is a generic implementation for &vm_operations_struct.access for an
    5635             :  * iomem mapping. This callback is used by access_process_vm() when the @vma is
    5636             :  * not page based.
    5637             :  */
    5638             : int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
    5639             :                         void *buf, int len, int write)
    5640             : {       /* Returns @len on success, -EINVAL or -ENOMEM on failure. */
    5641             :         resource_size_t phys_addr;
    5642             :         unsigned long prot = 0;
    5643             :         void __iomem *maddr;
    5644             :         pte_t *ptep, pte;
    5645             :         spinlock_t *ptl;
    5646             :         int offset = offset_in_page(addr);
    5647             :         int ret = -EINVAL;
    5648             : 
    5649             :         if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))     /* reject vmas that are neither VM_IO nor VM_PFNMAP */
    5650             :                 return -EINVAL;
    5651             : 
    5652             : retry:
    5653             :         if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
    5654             :                 return -EINVAL;
    5655             :         pte = ptep_get(ptep);   /* snapshot the pte ... */
    5656             :         pte_unmap_unlock(ptep, ptl);    /* ... and drop the pte lock before ioremap_prot() */
    5657             : 
    5658             :         prot = pgprot_val(pte_pgprot(pte));
    5659             :         phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
    5660             : 
    5661             :         if ((write & FOLL_WRITE) && !pte_write(pte))    /* write requested but the pte is not writable */
    5662             :                 return -EINVAL;
    5663             : 
    5664             :         maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);        /* size covers offset + len, page-rounded */
    5665             :         if (!maddr)
    5666             :                 return -ENOMEM;
    5667             : 
    5668             :         if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))  /* second lookup: retake the pte lock to revalidate */
    5669             :                 goto out_unmap; /* ret is still -EINVAL here */
    5670             : 
    5671             :         if (!pte_same(pte, ptep_get(ptep))) {   /* pte changed while unlocked: the mapping above is stale */
    5672             :                 pte_unmap_unlock(ptep, ptl);
    5673             :                 iounmap(maddr);
    5674             : 
    5675             :                 goto retry;
    5676             :         }
    5677             : 
    5678             :         if (write)      /* the copy is performed with the pte lock held */
    5679             :                 memcpy_toio(maddr + offset, buf, len);
    5680             :         else
    5681             :                 memcpy_fromio(buf, maddr + offset, len);
    5682             :         ret = len;      /* the whole request is reported as transferred */
    5683             :         pte_unmap_unlock(ptep, ptl);
    5684             : out_unmap:
    5685             :         iounmap(maddr);
    5686             : 
    5687             :         return ret;
    5688             : }
    5689             : EXPORT_SYMBOL_GPL(generic_access_phys);
    5690             : #endif
    5691             : 
    5692             : /*
    5693             :  * Access another process' address space as given in mm.
    5694             :  */
    5695           0 : int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
    5696             :                        int len, unsigned int gup_flags)
    5697             : {       /* Returns the number of bytes transferred, which may be less than @len (0 if nothing was copied). */
    5698           0 :         void *old_buf = buf;    /* remembered so progress can be computed as buf - old_buf */
    5699           0 :         int write = gup_flags & FOLL_WRITE;
    5700             : 
    5701           0 :         if (mmap_read_lock_killable(mm))
    5702             :                 return 0;       /* non-zero means the lock was not obtained */
    5703             : 
    5704             :         /* Avoid triggering the temporary warning in __get_user_pages */
    5705           0 :         if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
    5706             :                 return 0;       /* no unlock: expand_stack() drops mmap_lock on failure (see note in the loop) */
    5707             : 
    5708             :         /* ignore errors, just check how much was successfully transferred */
    5709           0 :         while (len) {
    5710             :                 int bytes, offset;
    5711             :                 void *maddr;
    5712           0 :                 struct vm_area_struct *vma = NULL;
    5713           0 :                 struct page *page = get_user_page_vma_remote(mm, addr,
    5714             :                                                              gup_flags, &vma);
    5715             : 
    5716           0 :                 if (IS_ERR_OR_NULL(page)) {
    5717             :                         /* We might need to expand the stack to access it */
    5718           0 :                         vma = vma_lookup(mm, addr);
    5719           0 :                         if (!vma) {
    5720           0 :                                 vma = expand_stack(mm, addr);
    5721             : 
    5722             :                                 /* mmap_lock was dropped on failure */
    5723           0 :                                 if (!vma)
    5724           0 :                                         return buf - old_buf;   /* so return directly instead of reaching the unlock below */
    5725             : 
    5726             :                                 /* Try again if stack expansion worked */
    5727           0 :                                 continue;
    5728             :                         }
    5729             : 
    5730             : 
    5731             :                         /*
    5732             :                          * Check if this is a VM_IO | VM_PFNMAP VMA, which
    5733             :                          * we can access using slightly different code.
    5734             :                          */
    5735           0 :                         bytes = 0;
    5736             : #ifdef CONFIG_HAVE_IOREMAP_PROT
    5737             :                         if (vma->vm_ops && vma->vm_ops->access)
    5738             :                                 bytes = vma->vm_ops->access(vma, addr, buf,
    5739             :                                                             len, write);
    5740             : #endif
    5741             :                         if (bytes <= 0) /* no ->access hook, or it transferred nothing / failed */
    5742             :                                 break;
    5743             :                 } else {
    5744           0 :                         bytes = len;    /* clamp this chunk so it does not run past the end of the page */
    5745           0 :                         offset = addr & (PAGE_SIZE-1);
    5746           0 :                         if (bytes > PAGE_SIZE-offset)
    5747           0 :                                 bytes = PAGE_SIZE-offset;
    5748             : 
    5749           0 :                         maddr = kmap(page);
    5750           0 :                         if (write) {
    5751           0 :                                 copy_to_user_page(vma, page, addr,
    5752             :                                                   maddr + offset, buf, bytes);
    5753           0 :                                 set_page_dirty_lock(page);      /* the page content was modified */
    5754             :                         } else {
    5755           0 :                                 copy_from_user_page(vma, page, addr,
    5756             :                                                     buf, maddr + offset, bytes);
    5757             :                         }
    5758           0 :                         kunmap(page);
    5759           0 :                         put_page(page); /* presumably drops the reference taken by get_user_page_vma_remote() */
    5760             :                 }
    5761           0 :                 len -= bytes;   /* advance past the chunk just handled */
    5762           0 :                 buf += bytes;
    5763           0 :                 addr += bytes;
    5764             :         }
    5765           0 :         mmap_read_unlock(mm);   /* reached on completion and on break, both with the lock still held */
    5766             : 
    5767           0 :         return buf - old_buf;
    5768             : }
    5769             : 
    5770             : /**
    5771             :  * access_remote_vm - access another process' address space
    5772             :  * @mm:         the mm_struct of the target address space
    5773             :  * @addr:       start address to access
    5774             :  * @buf:        source or destination buffer
    5775             :  * @len:        number of bytes to transfer
    5776             :  * @gup_flags:  flags modifying lookup behaviour
    5777             :  *
    5778             :  * The caller must hold a reference on @mm.
    5779             :  *
    5780             :  * Return: number of bytes copied from source to destination.
    5781             :  */
    5782           0 : int access_remote_vm(struct mm_struct *mm, unsigned long addr,
    5783             :                 void *buf, int len, unsigned int gup_flags)
    5784             : {
    5785           0 :         return __access_remote_vm(mm, addr, buf, len, gup_flags);       /* thin wrapper: arguments forwarded unchanged */
    5786             : }
    5787             : 
    5788             : /*
    5789             :  * Access another process' address space.
    5790             :  * Source/target buffer must be kernel space,
    5791             :  * Do not walk the page table directly, use get_user_pages
    5792             :  */
    5793           0 : int access_process_vm(struct task_struct *tsk, unsigned long addr,
    5794             :                 void *buf, int len, unsigned int gup_flags)
    5795             : {       /* Returns the byte count reported by __access_remote_vm(), or 0 if @tsk has no mm. */
    5796             :         struct mm_struct *mm;
    5797             :         int ret;
    5798             : 
    5799           0 :         mm = get_task_mm(tsk);
    5800           0 :         if (!mm)
    5801             :                 return 0;       /* no mm to access: nothing transferred */
    5802             : 
    5803           0 :         ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
    5804             : 
    5805           0 :         mmput(mm);      /* pairs with get_task_mm() above */
    5806             : 
    5807           0 :         return ret;
    5808             : }
    5809             : EXPORT_SYMBOL_GPL(access_process_vm);
    5810             : 
    5811             : /*
    5812             :  * Print the name of a VMA.
    5813             :  */
    5814           0 : void print_vma_addr(char *prefix, unsigned long ip)
    5815             : {       /* Prints "<prefix><basename>[<start>+<length>]" for a file-backed vma found for @ip in current->mm. */
    5816           0 :         struct mm_struct *mm = current->mm;
    5817             :         struct vm_area_struct *vma;
    5818             : 
    5819             :         /*
    5820             :          * we might be running from an atomic context so we cannot sleep
    5821             :          */
    5822           0 :         if (!mmap_read_trylock(mm))
    5823             :                 return; /* lock is contended: print nothing rather than wait */
    5824             : 
    5825           0 :         vma = find_vma(mm, ip);
    5826           0 :         if (vma && vma->vm_file) {      /* vmas without a backing file are skipped silently */
    5827           0 :                 struct file *f = vma->vm_file;
    5828           0 :                 char *buf = (char *)__get_free_page(GFP_NOWAIT);        /* one-page scratch buffer for the path */
    5829           0 :                 if (buf) {      /* allocation failure also results in no output */
    5830             :                         char *p;
    5831             : 
    5832           0 :                         p = file_path(f, buf, PAGE_SIZE);
    5833           0 :                         if (IS_ERR(p))
    5834           0 :                                 p = "?";        /* placeholder when the path cannot be produced */
    5835           0 :                         printk("%s%s[%lx+%lx]", prefix, kbasename(p),
    5836             :                                         vma->vm_start,
    5837             :                                         vma->vm_end - vma->vm_start);
    5838           0 :                         free_page((unsigned long)buf);
    5839             :                 }
    5840             :         }
    5841             :         mmap_read_unlock(mm);
    5842             : }
    5843             : 
    5844             : #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
    5845             : void __might_fault(const char *file, int line)
    5846             : {       /* Debug annotation for code that may fault; @file/@line are passed on to __might_sleep(). */
    5847             :         if (pagefault_disabled())       /* no checks are done while page faults are disabled */
    5848             :                 return;
    5849             :         __might_sleep(file, line);
    5850             : #if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
    5851             :         if (current->mm)        /* only when the current task has an mm */
    5852             :                 might_lock_read(&current->mm->mmap_lock);       /* annotate a possible read acquisition of mmap_lock */
    5853             : #endif
    5854             : }
    5855             : EXPORT_SYMBOL(__might_fault);
    5856             : #endif
    5857             : 
    5858             : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
    5859             : /*
    5860             :  * Process all subpages of the specified huge page with the specified
    5861             :  * operation.  The target subpage will be processed last to keep its
    5862             :  * cache lines hot.
    5863             :  */
    5864             : static inline int process_huge_page(
    5865             :         unsigned long addr_hint, unsigned int pages_per_huge_page,
    5866             :         int (*process_subpage)(unsigned long addr, int idx, void *arg),
    5867             :         void *arg)
    5868             : {       /* Returns 0, or the first non-zero value returned by @process_subpage (processing stops there). */
    5869             :         int i, n, base, l, ret;
    5870             :         unsigned long addr = addr_hint &
    5871             :                 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);      /* huge page base; mask assumes a power-of-two page count - TODO confirm */
    5872             : 
    5873             :         /* Process target subpage last to keep its cache lines hot */
    5874             :         might_sleep();
    5875             :         n = (addr_hint - addr) / PAGE_SIZE;     /* index of the target subpage within the huge page */
    5876             :         if (2 * n <= pages_per_huge_page) {
    5877             :                 /* If target subpage in first half of huge page */
    5878             :                 base = 0;
    5879             :                 l = n;  /* the window [0, 2n) is left for the converging loop below */
    5880             :                 /* Process subpages at the end of huge page */
    5881             :                 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
    5882             :                         cond_resched();
    5883             :                         ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
    5884             :                         if (ret)
    5885             :                                 return ret;
    5886             :                 }
    5887             :         } else {
    5888             :                 /* If target subpage in second half of huge page */
    5889             :                 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
    5890             :                 l = pages_per_huge_page - n;    /* the window [base, pages_per_huge_page) is left for the loop below */
    5891             :                 /* Process subpages at the beginning of huge page */
    5892             :                 for (i = 0; i < base; i++) {
    5893             :                         cond_resched();
    5894             :                         ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
    5895             :                         if (ret)
    5896             :                                 return ret;
    5897             :                 }
    5898             :         }
    5899             :         /*
    5900             :          * Process remaining subpages in left-right-left-right pattern
    5901             :          * towards the target subpage
    5902             :          */
    5903             :         for (i = 0; i < l; i++) {
    5904             :                 int left_idx = base + i;
    5905             :                 int right_idx = base + 2 * l - 1 - i;   /* at i == l - 1 this equals base + l, i.e. the target index n */
    5906             : 
    5907             :                 cond_resched();
    5908             :                 ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
    5909             :                 if (ret)
    5910             :                         return ret;
    5911             :                 cond_resched();
    5912             :                 ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
    5913             :                 if (ret)
    5914             :                         return ret;
    5915             :         }
    5916             :         return 0;
    5917             : }
    5918             : 
    5919             : static void clear_gigantic_page(struct page *page,
    5920             :                                 unsigned long addr,
    5921             :                                 unsigned int pages_per_huge_page)
    5922             : {       /* Clear the subpages one by one in ascending order, starting at @page / @addr. */
    5923             :         int i;
    5924             :         struct page *p;
    5925             : 
    5926             :         might_sleep();
    5927             :         for (i = 0; i < pages_per_huge_page; i++) {
    5928             :                 p = nth_page(page, i);  /* nth_page() instead of page + i - presumably struct pages may be discontiguous here; confirm */
    5929             :                 cond_resched(); /* offer to reschedule before each subpage */
    5930             :                 clear_user_highpage(p, addr + i * PAGE_SIZE);
    5931             :         }
    5932             : }
    5933             : 
    /*
     * clear_subpage() - per-subpage callback handed to process_huge_page()
     * by clear_huge_page().
     * @addr: address passed through to clear_user_highpage() for this subpage
     * @idx: index of the subpage within the huge page
     * @arg: opaque callback argument; here the first struct page of the
     *       huge page
     *
     * Clears the page at (page + idx) using plain pointer arithmetic.
     *
     * Return: always 0.
     */
    5934             : static int clear_subpage(unsigned long addr, int idx, void *arg)
    5935             : {
    5936             :         struct page *page = arg;
    5937             : 
    5938             :         clear_user_highpage(page + idx, addr);
    5939             :         return 0;
    5940             : }
    5941             : 
    /*
     * clear_huge_page() - clear all subpages of a huge page.
     * @page: first struct page of the huge page
     * @addr_hint: an address inside the huge page's mapping; it is forwarded
     *             unmodified to process_huge_page() and rounded down for the
     *             gigantic path
     * @pages_per_huge_page: number of subpages in the huge page
     *
     * When pages_per_huge_page exceeds MAX_ORDER_NR_PAGES the page is cleared
     * sequentially by clear_gigantic_page() starting at the rounded-down
     * address. Otherwise the work is delegated to process_huge_page() with
     * clear_subpage() as the callback and @page as its argument.
     *
     * NOTE(review): the rounding mask is only exact when pages_per_huge_page
     * is a power of two - presumably always the case; confirm against callers.
     */
    5942             : void clear_huge_page(struct page *page,
    5943             :                      unsigned long addr_hint, unsigned int pages_per_huge_page)
    5944             : {
    5945             :         unsigned long addr = addr_hint &
    5946             :                 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
    5947             : 
    5948             :         if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
    5949             :                 clear_gigantic_page(page, addr, pages_per_huge_page);
    5950             :                 return;
    5951             :         }
    5952             : 
    5953             :         process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
    5954             : }
    5955             : 
    /*
     * copy_user_gigantic_page() - copy a large folio subpage by subpage, in
     * ascending order.
     * @dst: destination folio; subpage i is folio_page(dst, i)
     * @src: source folio; subpage i is folio_page(src, i)
     * @addr: address belonging to subpage 0; addr + i * PAGE_SIZE is passed
     *        to copy_mc_user_highpage() for subpage i
     * @vma: passed through to copy_mc_user_highpage()
     * @pages_per_huge_page: number of subpages to copy
     *
     * cond_resched() is called before each subpage. The copy stops at the
     * first subpage for which copy_mc_user_highpage() returns non-zero; that
     * source page's pfn is queued with memory_failure_queue() (flags 0).
     * Subpages copied before the failure are left as they are.
     *
     * Return: 0 if every subpage was copied, -EHWPOISON on the first failure.
     */
    5956             : static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
    5957             :                                      unsigned long addr,
    5958             :                                      struct vm_area_struct *vma,
    5959             :                                      unsigned int pages_per_huge_page)
    5960             : {
    5961             :         int i;
    5962             :         struct page *dst_page;
    5963             :         struct page *src_page;
    5964             : 
    5965             :         for (i = 0; i < pages_per_huge_page; i++) {
    5966             :                 dst_page = folio_page(dst, i);
    5967             :                 src_page = folio_page(src, i);
    5968             : 
    5969             :                 cond_resched();
    5970             :                 if (copy_mc_user_highpage(dst_page, src_page,
    5971             :                                           addr + i*PAGE_SIZE, vma)) {
    5972             :                         memory_failure_queue(page_to_pfn(src_page), 0);
    5973             :                         return -EHWPOISON;
    5974             :                 }
    5975             :         }
    5976             :         return 0;
    5977             : }
    5978             : 
    /*
     * struct copy_subpage_arg - bundle handed to copy_subpage() through the
     * void *arg of process_huge_page().
     * @dst: first page of the destination (copy_user_large_folio() stores
     *       &dst->page here)
     * @src: first page of the source (copy_user_large_folio() stores
     *       &src->page here)
     * @vma: forwarded to copy_mc_user_highpage()
     */
    5979             : struct copy_subpage_arg {
    5980             :         struct page *dst;
    5981             :         struct page *src;
    5982             :         struct vm_area_struct *vma;
    5983             : };
    5984             : 
    /*
     * copy_subpage() - per-subpage callback handed to process_huge_page()
     * by copy_user_large_folio().
     * @addr: address passed through to copy_mc_user_highpage() for this subpage
     * @idx: index of the subpage; applied to both dst and src with plain
     *       pointer arithmetic
     * @arg: pointer to a struct copy_subpage_arg
     *
     * If copy_mc_user_highpage() returns non-zero, the source subpage's pfn is
     * queued with memory_failure_queue() (flags 0) and the error is reported.
     *
     * Return: 0 on success, -EHWPOISON if the copy failed.
     */
    5985             : static int copy_subpage(unsigned long addr, int idx, void *arg)
    5986             : {
    5987             :         struct copy_subpage_arg *copy_arg = arg;
    5988             : 
    5989             :         if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
    5990             :                                   addr, copy_arg->vma)) {
    5991             :                 memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
    5992             :                 return -EHWPOISON;
    5993             :         }
    5994             :         return 0;
    5995             : }
    5996             : 
    /*
     * copy_user_large_folio() - copy the contents of one large folio to another.
     * @dst: destination folio; its folio_nr_pages() determines how many
     *       subpages are copied
     * @src: source folio (its size is not checked here)
     * @addr_hint: an address inside the mapping; forwarded unmodified to
     *             process_huge_page() and rounded down for the gigantic path
     * @vma: forwarded to the per-subpage copy
     *
     * When the page count exceeds MAX_ORDER_NR_PAGES the copy is done
     * sequentially by copy_user_gigantic_page() from the rounded-down address.
     * Otherwise process_huge_page() drives copy_subpage() with an on-stack
     * struct copy_subpage_arg.
     *
     * Return: whatever the chosen path returns; both visible per-subpage
     * helpers yield 0 on success and -EHWPOISON on a failed copy.
     */
    5997             : int copy_user_large_folio(struct folio *dst, struct folio *src,
    5998             :                           unsigned long addr_hint, struct vm_area_struct *vma)
    5999             : {
    6000             :         unsigned int pages_per_huge_page = folio_nr_pages(dst);
    6001             :         unsigned long addr = addr_hint &
    6002             :                 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
    6003             :         struct copy_subpage_arg arg = {
    6004             :                 .dst = &dst->page,
    6005             :                 .src = &src->page,
    6006             :                 .vma = vma,
    6007             :         };
    6008             : 
    6009             :         if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES))
    6010             :                 return copy_user_gigantic_page(dst, src, addr, vma,
    6011             :                                                pages_per_huge_page);
    6012             : 
    6013             :         return process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
    6014             : }
    6015             : 
    /*
     * copy_folio_from_user() - fill a folio from a user-space buffer, one page
     * at a time.
     * @dst_folio: destination folio; folio_nr_pages() * PAGE_SIZE bytes are
     *             requested in total
     * @usr_src: user-space source; page i is read from usr_src + i * PAGE_SIZE
     * @allow_pagefault: when false, each copy_from_user() call is bracketed by
     *                   pagefault_disable()/pagefault_enable()
     *
     * Each subpage is mapped with kmap_local_page(), filled, and unmapped
     * again. After a fully copied page flush_dcache_page() and cond_resched()
     * are called. The loop stops at the first page for which copy_from_user()
     * returns non-zero; that page is not flushed.
     *
     * Return: ret_val starts at the total size and is reduced by
     * (PAGE_SIZE - rc) per processed page, i.e. the number of bytes that were
     * NOT copied; 0 means the whole folio was filled.
     */
    6016             : long copy_folio_from_user(struct folio *dst_folio,
    6017             :                            const void __user *usr_src,
    6018             :                            bool allow_pagefault)
    6019             : {
    6020             :         void *kaddr;
    6021             :         unsigned long i, rc = 0;
    6022             :         unsigned int nr_pages = folio_nr_pages(dst_folio);
    6023             :         unsigned long ret_val = nr_pages * PAGE_SIZE;
    6024             :         struct page *subpage;
    6025             : 
    6026             :         for (i = 0; i < nr_pages; i++) {
    6027             :                 subpage = folio_page(dst_folio, i);
    6028             :                 kaddr = kmap_local_page(subpage);
    6029             :                 if (!allow_pagefault)
    6030             :                         pagefault_disable();
    6031             :                 rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE);
    6032             :                 if (!allow_pagefault)
    6033             :                         pagefault_enable();
    6034             :                 kunmap_local(kaddr);
    6035             : 
    6036             :                 ret_val -= (PAGE_SIZE - rc); /* account for the bytes that did arrive */
    6037             :                 if (rc)
    6038             :                         break;
    6039             : 
    6040             :                 flush_dcache_page(subpage);
    6041             : 
    6042             :                 cond_resched();
    6043             :         }
    6044             :         return ret_val;
    6045             : }
    6046             : #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
    6047             : 
    6048             : #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
    6049             : 
    /*
     * Slab cache for the spinlock_t objects stored in page->ptl; created by
     * ptlock_cache_init() and used by ptlock_alloc()/ptlock_free().
     */
    6050             : static struct kmem_cache *page_ptl_cachep;
    6051             : 
    /*
     * ptlock_cache_init() - create the "page->ptl" slab cache.
     *
     * The cache holds objects of sizeof(spinlock_t), with alignment argument 0
     * and no constructor. The result is stored in page_ptl_cachep without a
     * NULL check.
     * NOTE(review): SLAB_PANIC is passed - presumably creation failure panics
     * instead of returning NULL; confirm.
     */
    6052             : void __init ptlock_cache_init(void)
    6053             : {
    6054             :         page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
    6055             :                         SLAB_PANIC, NULL);
    6056             : }
    6057             : 
    /*
     * ptlock_alloc() - allocate a spinlock for a page and attach it.
     * @page: page whose ->ptl pointer receives the new lock
     *
     * Allocates from page_ptl_cachep with GFP_KERNEL. The lock object is not
     * initialised here - presumably the caller does that; confirm.
     *
     * Return: true if page->ptl was set, false if the allocation failed (in
     * which case page->ptl is left untouched).
     */
    6058             : bool ptlock_alloc(struct page *page)
    6059             : {
    6060             :         spinlock_t *ptl;
    6061             : 
    6062             :         ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
    6063             :         if (!ptl)
    6064             :                 return false;
    6065             :         page->ptl = ptl;
    6066             :         return true;
    6067             : }
    6068             : 
    /*
     * ptlock_free() - return a page's spinlock to page_ptl_cachep.
     * @page: page whose ->ptl object is freed
     *
     * page->ptl itself is not cleared, so the pointer is stale afterwards.
     */
    6069             : void ptlock_free(struct page *page)
    6070             : {
    6071             :         kmem_cache_free(page_ptl_cachep, page->ptl);
    6072             : }
    6073             : #endif

Generated by: LCOV version 1.14