LCOV - code coverage report
Current view: top level - mm - vmalloc.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 476 1134 42.0 %
Date: 2023-07-19 18:55:55 Functions: 36 95 37.9 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  Copyright (C) 1993  Linus Torvalds
       4             :  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
       5             :  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
       6             :  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
       7             :  *  Numa awareness, Christoph Lameter, SGI, June 2005
       8             :  *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
       9             :  */
      10             : 
      11             : #include <linux/vmalloc.h>
      12             : #include <linux/mm.h>
      13             : #include <linux/module.h>
      14             : #include <linux/highmem.h>
      15             : #include <linux/sched/signal.h>
      16             : #include <linux/slab.h>
      17             : #include <linux/spinlock.h>
      18             : #include <linux/interrupt.h>
      19             : #include <linux/proc_fs.h>
      20             : #include <linux/seq_file.h>
      21             : #include <linux/set_memory.h>
      22             : #include <linux/debugobjects.h>
      23             : #include <linux/kallsyms.h>
      24             : #include <linux/list.h>
      25             : #include <linux/notifier.h>
      26             : #include <linux/rbtree.h>
      27             : #include <linux/xarray.h>
      28             : #include <linux/io.h>
      29             : #include <linux/rcupdate.h>
      30             : #include <linux/pfn.h>
      31             : #include <linux/kmemleak.h>
      32             : #include <linux/atomic.h>
      33             : #include <linux/compiler.h>
      34             : #include <linux/memcontrol.h>
      35             : #include <linux/llist.h>
      36             : #include <linux/uio.h>
      37             : #include <linux/bitops.h>
      38             : #include <linux/rbtree_augmented.h>
      39             : #include <linux/overflow.h>
      40             : #include <linux/pgtable.h>
      41             : #include <linux/hugetlb.h>
      42             : #include <linux/sched/mm.h>
      43             : #include <asm/tlbflush.h>
      44             : #include <asm/shmparam.h>
      45             : 
      46             : #define CREATE_TRACE_POINTS
      47             : #include <trace/events/vmalloc.h>
      48             : 
      49             : #include "internal.h"
      50             : #include "pgalloc-track.h"
      51             : 
      52             : #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
      53             : static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
      54             : 
      55             : static int __init set_nohugeiomap(char *str)
      56             : {
      57             :         ioremap_max_page_shift = PAGE_SHIFT;
      58             :         return 0;
      59             : }
      60             : early_param("nohugeiomap", set_nohugeiomap);
      61             : #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
      62             : static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
      63             : #endif  /* CONFIG_HAVE_ARCH_HUGE_VMAP */
      64             : 
      65             : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
      66             : static bool __ro_after_init vmap_allow_huge = true;
      67             : 
      68             : static int __init set_nohugevmalloc(char *str)
      69             : {
      70             :         vmap_allow_huge = false;
      71             :         return 0;
      72             : }
      73             : early_param("nohugevmalloc", set_nohugevmalloc);
      74             : #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
      75             : static const bool vmap_allow_huge = false;
      76             : #endif  /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
      77             : 
      78         265 : bool is_vmalloc_addr(const void *x)
      79             : {
      80         270 :         unsigned long addr = (unsigned long)kasan_reset_tag(x);
      81             : 
      82         265 :         return addr >= VMALLOC_START && addr < VMALLOC_END;
      83             : }
      84             : EXPORT_SYMBOL(is_vmalloc_addr);
      85             : 
      86             : struct vfree_deferred {
      87             :         struct llist_head list;
      88             :         struct work_struct wq;
      89             : };
      90             : static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
      91             : 
      92             : /*** Page table manipulation functions ***/
      93           0 : static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
      94             :                         phys_addr_t phys_addr, pgprot_t prot,
      95             :                         unsigned int max_page_shift, pgtbl_mod_mask *mask)
      96             : {
      97             :         pte_t *pte;
      98             :         u64 pfn;
      99           0 :         unsigned long size = PAGE_SIZE;
     100             : 
     101           0 :         pfn = phys_addr >> PAGE_SHIFT;
     102           0 :         pte = pte_alloc_kernel_track(pmd, addr, mask);
     103           0 :         if (!pte)
     104             :                 return -ENOMEM;
     105             :         do {
     106           0 :                 BUG_ON(!pte_none(*pte));
     107             : 
     108             : #ifdef CONFIG_HUGETLB_PAGE
     109             :                 size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
     110             :                 if (size != PAGE_SIZE) {
     111             :                         pte_t entry = pfn_pte(pfn, prot);
     112             : 
     113             :                         entry = arch_make_huge_pte(entry, ilog2(size), 0);
     114             :                         set_huge_pte_at(&init_mm, addr, pte, entry);
     115             :                         pfn += PFN_DOWN(size);
     116             :                         continue;
     117             :                 }
     118             : #endif
     119           0 :                 set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
     120           0 :                 pfn++;
     121           0 :         } while (pte += PFN_DOWN(size), addr += size, addr != end);
     122           0 :         *mask |= PGTBL_PTE_MODIFIED;
     123             :         return 0;
     124             : }
     125             : 
     126             : static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
     127             :                         phys_addr_t phys_addr, pgprot_t prot,
     128             :                         unsigned int max_page_shift)
     129             : {
     130             :         if (max_page_shift < PMD_SHIFT)
     131             :                 return 0;
     132             : 
     133             :         if (!arch_vmap_pmd_supported(prot))
     134             :                 return 0;
     135             : 
     136             :         if ((end - addr) != PMD_SIZE)
     137             :                 return 0;
     138             : 
     139             :         if (!IS_ALIGNED(addr, PMD_SIZE))
     140             :                 return 0;
     141             : 
     142             :         if (!IS_ALIGNED(phys_addr, PMD_SIZE))
     143             :                 return 0;
     144             : 
     145             :         if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
     146             :                 return 0;
     147             : 
     148             :         return pmd_set_huge(pmd, phys_addr, prot);
     149             : }
     150             : 
     151           0 : static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
     152             :                         phys_addr_t phys_addr, pgprot_t prot,
     153             :                         unsigned int max_page_shift, pgtbl_mod_mask *mask)
     154             : {
     155             :         pmd_t *pmd;
     156             :         unsigned long next;
     157             : 
     158           0 :         pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
     159           0 :         if (!pmd)
     160             :                 return -ENOMEM;
     161             :         do {
     162           0 :                 next = pmd_addr_end(addr, end);
     163             : 
     164           0 :                 if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
     165             :                                         max_page_shift)) {
     166             :                         *mask |= PGTBL_PMD_MODIFIED;
     167             :                         continue;
     168             :                 }
     169             : 
     170           0 :                 if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
     171             :                         return -ENOMEM;
     172           0 :         } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
     173             :         return 0;
     174             : }
     175             : 
     176             : static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
     177             :                         phys_addr_t phys_addr, pgprot_t prot,
     178             :                         unsigned int max_page_shift)
     179             : {
     180             :         if (max_page_shift < PUD_SHIFT)
     181             :                 return 0;
     182             : 
     183             :         if (!arch_vmap_pud_supported(prot))
     184             :                 return 0;
     185             : 
     186             :         if ((end - addr) != PUD_SIZE)
     187             :                 return 0;
     188             : 
     189             :         if (!IS_ALIGNED(addr, PUD_SIZE))
     190             :                 return 0;
     191             : 
     192             :         if (!IS_ALIGNED(phys_addr, PUD_SIZE))
     193             :                 return 0;
     194             : 
     195             :         if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
     196             :                 return 0;
     197             : 
     198             :         return pud_set_huge(pud, phys_addr, prot);
     199             : }
     200             : 
     201             : static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
     202             :                         phys_addr_t phys_addr, pgprot_t prot,
     203             :                         unsigned int max_page_shift, pgtbl_mod_mask *mask)
     204             : {
     205             :         pud_t *pud;
     206             :         unsigned long next;
     207             : 
     208           0 :         pud = pud_alloc_track(&init_mm, p4d, addr, mask);
     209             :         if (!pud)
     210             :                 return -ENOMEM;
     211             :         do {
     212           0 :                 next = pud_addr_end(addr, end);
     213             : 
     214           0 :                 if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
     215             :                                         max_page_shift)) {
     216             :                         *mask |= PGTBL_PUD_MODIFIED;
     217             :                         continue;
     218             :                 }
     219             : 
     220           0 :                 if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
     221             :                                         max_page_shift, mask))
     222             :                         return -ENOMEM;
     223           0 :         } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
     224             :         return 0;
     225             : }
     226             : 
     227             : static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
     228             :                         phys_addr_t phys_addr, pgprot_t prot,
     229             :                         unsigned int max_page_shift)
     230             : {
     231             :         if (max_page_shift < P4D_SHIFT)
     232             :                 return 0;
     233             : 
     234             :         if (!arch_vmap_p4d_supported(prot))
     235             :                 return 0;
     236             : 
     237             :         if ((end - addr) != P4D_SIZE)
     238             :                 return 0;
     239             : 
     240             :         if (!IS_ALIGNED(addr, P4D_SIZE))
     241             :                 return 0;
     242             : 
     243             :         if (!IS_ALIGNED(phys_addr, P4D_SIZE))
     244             :                 return 0;
     245             : 
     246             :         if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
     247             :                 return 0;
     248             : 
     249             :         return p4d_set_huge(p4d, phys_addr, prot);
     250             : }
     251             : 
     252           0 : static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
     253             :                         phys_addr_t phys_addr, pgprot_t prot,
     254             :                         unsigned int max_page_shift, pgtbl_mod_mask *mask)
     255             : {
     256             :         p4d_t *p4d;
     257             :         unsigned long next;
     258             : 
     259           0 :         p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
     260           0 :         if (!p4d)
     261             :                 return -ENOMEM;
     262             :         do {
     263           0 :                 next = p4d_addr_end(addr, end);
     264             : 
     265           0 :                 if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
     266             :                                         max_page_shift)) {
     267             :                         *mask |= PGTBL_P4D_MODIFIED;
     268             :                         continue;
     269             :                 }
     270             : 
     271           0 :                 if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
     272             :                                         max_page_shift, mask))
     273             :                         return -ENOMEM;
     274           0 :         } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
     275             :         return 0;
     276             : }
     277             : 
     278           0 : static int vmap_range_noflush(unsigned long addr, unsigned long end,
     279             :                         phys_addr_t phys_addr, pgprot_t prot,
     280             :                         unsigned int max_page_shift)
     281             : {
     282             :         pgd_t *pgd;
     283             :         unsigned long start;
     284             :         unsigned long next;
     285             :         int err;
     286           0 :         pgtbl_mod_mask mask = 0;
     287             : 
     288             :         might_sleep();
     289           0 :         BUG_ON(addr >= end);
     290             : 
     291           0 :         start = addr;
     292           0 :         pgd = pgd_offset_k(addr);
     293             :         do {
     294           0 :                 next = pgd_addr_end(addr, end);
     295           0 :                 err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
     296             :                                         max_page_shift, &mask);
     297           0 :                 if (err)
     298             :                         break;
     299           0 :         } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
     300             : 
     301             :         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
     302             :                 arch_sync_kernel_mappings(start, end);
     303             : 
     304           0 :         return err;
     305             : }
     306             : 
     307           0 : int ioremap_page_range(unsigned long addr, unsigned long end,
     308             :                 phys_addr_t phys_addr, pgprot_t prot)
     309             : {
     310             :         int err;
     311             : 
     312           0 :         err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
     313             :                                  ioremap_max_page_shift);
     314           0 :         flush_cache_vmap(addr, end);
     315           0 :         if (!err)
     316           0 :                 err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
     317             :                                                ioremap_max_page_shift);
     318           0 :         return err;
     319             : }
     320             : 
     321         337 : static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
     322             :                              pgtbl_mod_mask *mask)
     323             : {
     324             :         pte_t *pte;
     325             : 
     326         337 :         pte = pte_offset_kernel(pmd, addr);
     327             :         do {
     328       85468 :                 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
     329       42734 :                 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
     330       42734 :         } while (pte++, addr += PAGE_SIZE, addr != end);
     331         337 :         *mask |= PGTBL_PTE_MODIFIED;
     332         337 : }
     333             : 
     334         258 : static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
     335             :                              pgtbl_mod_mask *mask)
     336             : {
     337             :         pmd_t *pmd;
     338             :         unsigned long next;
     339             :         int cleared;
     340             : 
     341         258 :         pmd = pmd_offset(pud, addr);
     342             :         do {
     343         338 :                 next = pmd_addr_end(addr, end);
     344             : 
     345         338 :                 cleared = pmd_clear_huge(pmd);
     346         338 :                 if (cleared || pmd_bad(*pmd))
     347           1 :                         *mask |= PGTBL_PMD_MODIFIED;
     348             : 
     349             :                 if (cleared)
     350             :                         continue;
     351         338 :                 if (pmd_none_or_clear_bad(pmd))
     352           1 :                         continue;
     353         337 :                 vunmap_pte_range(pmd, addr, next, mask);
     354             : 
     355         337 :                 cond_resched();
     356         338 :         } while (pmd++, addr = next, addr != end);
     357         258 : }
     358             : 
     359         258 : static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
     360             :                              pgtbl_mod_mask *mask)
     361             : {
     362             :         pud_t *pud;
     363             :         unsigned long next;
     364             :         int cleared;
     365             : 
     366         258 :         pud = pud_offset(p4d, addr);
     367             :         do {
     368         258 :                 next = pud_addr_end(addr, end);
     369             : 
     370         258 :                 cleared = pud_clear_huge(pud);
     371         258 :                 if (cleared || pud_bad(*pud))
     372           0 :                         *mask |= PGTBL_PUD_MODIFIED;
     373             : 
     374             :                 if (cleared)
     375             :                         continue;
     376         258 :                 if (pud_none_or_clear_bad(pud))
     377           0 :                         continue;
     378         258 :                 vunmap_pmd_range(pud, addr, next, mask);
     379         258 :         } while (pud++, addr = next, addr != end);
     380         258 : }
     381             : 
     382             : static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
     383             :                              pgtbl_mod_mask *mask)
     384             : {
     385             :         p4d_t *p4d;
     386             :         unsigned long next;
     387             : 
     388         258 :         p4d = p4d_offset(pgd, addr);
     389             :         do {
     390         258 :                 next = p4d_addr_end(addr, end);
     391             : 
     392         258 :                 p4d_clear_huge(p4d);
     393         258 :                 if (p4d_bad(*p4d))
     394             :                         *mask |= PGTBL_P4D_MODIFIED;
     395             : 
     396         258 :                 if (p4d_none_or_clear_bad(p4d))
     397             :                         continue;
     398         258 :                 vunmap_pud_range(p4d, addr, next, mask);
     399         258 :         } while (p4d++, addr = next, addr != end);
     400             : }
     401             : 
     402             : /*
     403             :  * vunmap_range_noflush is similar to vunmap_range, but does not
     404             :  * flush caches or TLBs.
     405             :  *
     406             :  * The caller is responsible for calling flush_cache_vmap() before calling
     407             :  * this function, and flush_tlb_kernel_range after it has returned
     408             :  * successfully (and before the addresses are expected to cause a page fault
     409             :  * or be re-mapped for something else, if TLB flushes are being delayed or
     410             :  * coalesced).
     411             :  *
     412             :  * This is an internal function only. Do not use outside mm/.
     413             :  */
     414         258 : void __vunmap_range_noflush(unsigned long start, unsigned long end)
     415             : {
     416             :         unsigned long next;
     417             :         pgd_t *pgd;
     418         258 :         unsigned long addr = start;
     419         258 :         pgtbl_mod_mask mask = 0;
     420             : 
     421         258 :         BUG_ON(addr >= end);
     422         516 :         pgd = pgd_offset_k(addr);
     423             :         do {
     424         258 :                 next = pgd_addr_end(addr, end);
     425         258 :                 if (pgd_bad(*pgd))
     426             :                         mask |= PGTBL_PGD_MODIFIED;
     427         258 :                 if (pgd_none_or_clear_bad(pgd))
     428             :                         continue;
     429             :                 vunmap_p4d_range(pgd, addr, next, &mask);
     430         258 :         } while (pgd++, addr = next, addr != end);
     431             : 
     432             :         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
     433             :                 arch_sync_kernel_mappings(start, end);
     434         258 : }
     435             : 
     436           0 : void vunmap_range_noflush(unsigned long start, unsigned long end)
     437             : {
     438         258 :         kmsan_vunmap_range_noflush(start, end);
     439         258 :         __vunmap_range_noflush(start, end);
     440           0 : }
     441             : 
     442             : /**
     443             :  * vunmap_range - unmap kernel virtual addresses
     444             :  * @addr: start of the VM area to unmap
     445             :  * @end: end of the VM area to unmap (non-inclusive)
     446             :  *
     447             :  * Clears any present PTEs in the virtual address range, flushes TLBs and
     448             :  * caches. Any subsequent access to the address before it has been re-mapped
     449             :  * is a kernel bug.
     450             :  */
     451           0 : void vunmap_range(unsigned long addr, unsigned long end)
     452             : {
     453           0 :         flush_cache_vunmap(addr, end);
     454           0 :         vunmap_range_noflush(addr, end);
     455           0 :         flush_tlb_kernel_range(addr, end);
     456           0 : }
     457             : 
     458         353 : static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
     459             :                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
     460             :                 pgtbl_mod_mask *mask)
     461             : {
     462             :         pte_t *pte;
     463             : 
     464             :         /*
     465             :          * nr is a running index into the array which helps higher level
     466             :          * callers keep track of where we're up to.
     467             :          */
     468             : 
     469         706 :         pte = pte_alloc_kernel_track(pmd, addr, mask);
     470         353 :         if (!pte)
     471             :                 return -ENOMEM;
     472             :         do {
     473       42541 :                 struct page *page = pages[*nr];
     474             : 
     475       42541 :                 if (WARN_ON(!pte_none(*pte)))
     476             :                         return -EBUSY;
     477       42541 :                 if (WARN_ON(!page))
     478             :                         return -ENOMEM;
     479       85082 :                 if (WARN_ON(!pfn_valid(page_to_pfn(page))))
     480             :                         return -EINVAL;
     481             : 
     482       85082 :                 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
     483       42541 :                 (*nr)++;
     484       42541 :         } while (pte++, addr += PAGE_SIZE, addr != end);
     485         353 :         *mask |= PGTBL_PTE_MODIFIED;
     486         353 :         return 0;
     487             : }
     488             : 
     489         274 : static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
     490             :                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
     491             :                 pgtbl_mod_mask *mask)
     492             : {
     493             :         pmd_t *pmd;
     494             :         unsigned long next;
     495             : 
     496         274 :         pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
     497         274 :         if (!pmd)
     498             :                 return -ENOMEM;
     499             :         do {
     500         353 :                 next = pmd_addr_end(addr, end);
     501         353 :                 if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
     502             :                         return -ENOMEM;
     503         353 :         } while (pmd++, addr = next, addr != end);
     504             :         return 0;
     505             : }
     506             : 
     507             : static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
     508             :                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
     509             :                 pgtbl_mod_mask *mask)
     510             : {
     511             :         pud_t *pud;
     512             :         unsigned long next;
     513             : 
     514         548 :         pud = pud_alloc_track(&init_mm, p4d, addr, mask);
     515             :         if (!pud)
     516             :                 return -ENOMEM;
     517             :         do {
     518         274 :                 next = pud_addr_end(addr, end);
     519         274 :                 if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
     520             :                         return -ENOMEM;
     521         274 :         } while (pud++, addr = next, addr != end);
     522             :         return 0;
     523             : }
     524             : 
     525         274 : static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
     526             :                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
     527             :                 pgtbl_mod_mask *mask)
     528             : {
     529             :         p4d_t *p4d;
     530             :         unsigned long next;
     531             : 
     532         548 :         p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
     533         274 :         if (!p4d)
     534             :                 return -ENOMEM;
     535             :         do {
     536         274 :                 next = p4d_addr_end(addr, end);
     537         274 :                 if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
     538             :                         return -ENOMEM;
     539         274 :         } while (p4d++, addr = next, addr != end);
     540         274 :         return 0;
     541             : }
     542             : 
     543         274 : static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
     544             :                 pgprot_t prot, struct page **pages)
     545             : {
     546         274 :         unsigned long start = addr;
     547             :         pgd_t *pgd;
     548             :         unsigned long next;
     549         274 :         int err = 0;
     550         274 :         int nr = 0;
     551         274 :         pgtbl_mod_mask mask = 0;
     552             : 
     553         274 :         BUG_ON(addr >= end);
     554         548 :         pgd = pgd_offset_k(addr);
     555             :         do {
     556         274 :                 next = pgd_addr_end(addr, end);
     557         274 :                 if (pgd_bad(*pgd))
     558             :                         mask |= PGTBL_PGD_MODIFIED;
     559         274 :                 err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
     560         274 :                 if (err)
     561             :                         return err;
     562         274 :         } while (pgd++, addr = next, addr != end);
     563             : 
     564             :         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
     565             :                 arch_sync_kernel_mappings(start, end);
     566             : 
     567             :         return 0;
     568             : }
     569             : 
     570             : /*
     571             :  * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
     572             :  * flush caches.
     573             :  *
     574             :  * The caller is responsible for calling flush_cache_vmap() after this
     575             :  * function returns successfully and before the addresses are accessed.
     576             :  *
     577             :  * This is an internal function only. Do not use outside mm/.
     578             :  */
     579         274 : int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
     580             :                 pgprot_t prot, struct page **pages, unsigned int page_shift)
     581             : {
     582         274 :         unsigned int i, nr = (end - addr) >> PAGE_SHIFT; /* nr: PAGE_SIZE pages in [addr, end) */
     583             :         /* Maps pages[] into [addr, end) with prot; returns 0 or the first error from a mapping helper. */
     584         274 :         WARN_ON(page_shift < PAGE_SHIFT); /* a shift below the base page shift is unexpected */
     585             :         /* Without huge-vmalloc support, or for the base page shift, take the small-page path. */
     586             :         if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
     587             :                         page_shift == PAGE_SHIFT)
     588         274 :                 return vmap_small_pages_range_noflush(addr, end, prot, pages);
     589             :         /* Otherwise map one (1UL << page_shift)-byte block per iteration. */
     590             :         for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { /* step: pages[] entries covered by one block */
     591             :                 int err;
     592             : 
     593             :                 err = vmap_range_noflush(addr, addr + (1UL << page_shift),
     594             :                                         page_to_phys(pages[i]), prot,
     595             :                                         page_shift);
     596             :                 if (err)
     597             :                         return err; /* returns at once; blocks mapped so far are not undone here */
     598             : 
     599             :                 addr += 1UL << page_shift;
     600             :         }
     601             : 
     602             :         return 0;
     603             : }
     604             : 
     605           0 : int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
     606             :                 pgprot_t prot, struct page **pages, unsigned int page_shift)
     607             : { /* Wrapper: run the KMSAN hook, then __vmap_pages_range_noflush(); no flush is issued here. */
     608         274 :         int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
     609             :                                                  page_shift);
     610             :         /* A non-zero result from the hook is returned as-is and the real mapping is skipped. */
     611             :         if (ret)
     612             :                 return ret;
     613         274 :         return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
     614             : }
     615             : 
     616             : /**
     617             :  * vmap_pages_range - map pages to a kernel virtual address
     618             :  * @addr: start of the VM area to map
     619             :  * @end: end of the VM area to map (non-inclusive)
     620             :  * @prot: page protection flags to use
     621             :  * @pages: pages to map (always PAGE_SIZE pages)
     622             :  * @page_shift: maximum shift that the pages may be mapped with, @pages must
     623             :  * be aligned and contiguous up to at least this shift.
     624             :  *
     625             :  * RETURNS:
     626             :  * 0 on success, -errno on failure.
     627             :  */
     628         274 : static int vmap_pages_range(unsigned long addr, unsigned long end,
     629             :                 pgprot_t prot, struct page **pages, unsigned int page_shift)
     630             : {
     631             :         int err;
     632             :         /* Map via the noflush variant, then flush the cache for the whole range. */
     633         274 :         err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
     634         274 :         flush_cache_vmap(addr, end); /* issued whether or not the mapping succeeded */
     635         274 :         return err;
     636             : }
     637             : 
     638           0 : int is_vmalloc_or_module_addr(const void *x)
     639             : { /* Returns 1 for an address in the module range (when configured), else is_vmalloc_addr(x). */
     640             :         /*
     641             :          * ARM, x86-64 and sparc64 put modules in a special place,
     642             :          * and fall back on vmalloc() if that fails. Others
     643             :          * just put it in the vmalloc space.
     644             :          */
     645             : #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
     646             :         unsigned long addr = (unsigned long)kasan_reset_tag(x); /* range check uses the kasan_reset_tag() result, not raw x */
     647             :         if (addr >= MODULES_VADDR && addr < MODULES_END)
     648             :                 return 1; /* inside [MODULES_VADDR, MODULES_END) */
     649             : #endif
     650           5 :         return is_vmalloc_addr(x); /* not a module address: defer to is_vmalloc_addr() */
     651             : }
     652             : EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
     653             : 
     654             : /*
     655             :  * Walk a vmap address to the struct page it maps. Huge vmap mappings will
     656             :  * return the tail page that corresponds to the base page address, which
     657             :  * matches small vmap mappings.
     658             :  */
     659           0 : struct page *vmalloc_to_page(const void *vmalloc_addr)
     660             : {
     661           0 :         unsigned long addr = (unsigned long) vmalloc_addr;
     662           0 :         struct page *page = NULL; /* stays NULL unless a present pte is found at the last level */
     663           0 :         pgd_t *pgd = pgd_offset_k(addr);
     664             :         p4d_t *p4d;
     665             :         pud_t *pud;
     666             :         pmd_t *pmd;
     667             :         pte_t *ptep, pte;
     668             : 
     669             :         /*
     670             :          * XXX we might need to change this if we add VIRTUAL_BUG_ON for
     671             :          * architectures that do not vmalloc module space
     672             :          */
     673             :         VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
     674             :         /* Walk pgd -> p4d -> pud -> pmd -> pte; an empty entry at any level yields NULL. */
     675             :         if (pgd_none(*pgd))
     676             :                 return NULL;
     677           0 :         if (WARN_ON_ONCE(pgd_leaf(*pgd)))
     678             :                 return NULL; /* XXX: no allowance for huge pgd */
     679           0 :         if (WARN_ON_ONCE(pgd_bad(*pgd)))
     680             :                 return NULL;
     681             : 
     682           0 :         p4d = p4d_offset(pgd, addr);
     683             :         if (p4d_none(*p4d))
     684             :                 return NULL;
     685             :         if (p4d_leaf(*p4d)) /* leaf entry: first page of the mapping plus addr's page offset within it */
     686             :                 return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
     687           0 :         if (WARN_ON_ONCE(p4d_bad(*p4d)))
     688             :                 return NULL;
     689             : 
     690           0 :         pud = pud_offset(p4d, addr);
     691           0 :         if (pud_none(*pud))
     692             :                 return NULL;
     693             :         if (pud_leaf(*pud)) /* same leaf handling at pud level */
     694             :                 return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
     695           0 :         if (WARN_ON_ONCE(pud_bad(*pud)))
     696             :                 return NULL;
     697             : 
     698           0 :         pmd = pmd_offset(pud, addr);
     699           0 :         if (pmd_none(*pmd))
     700             :                 return NULL;
     701             :         if (pmd_leaf(*pmd)) /* same leaf handling at pmd level */
     702             :                 return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
     703           0 :         if (WARN_ON_ONCE(pmd_bad(*pmd)))
     704             :                 return NULL;
     705             : 
     706           0 :         ptep = pte_offset_map(pmd, addr);
     707           0 :         pte = *ptep; /* work on a local copy of the entry */
     708           0 :         if (pte_present(pte))
     709           0 :                 page = pte_page(pte);
     710             :         pte_unmap(ptep); /* pairs with pte_offset_map() above */
     711             : 
     712             :         return page;
     713             : }
     714             : EXPORT_SYMBOL(vmalloc_to_page);
     715             : 
     716             : /*
     717             :  * Map a vmalloc()-space virtual address to the physical page frame number.
     718             :  */
     719           0 : unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
     720             : { /* NOTE(review): vmalloc_to_page() can return NULL; that value reaches page_to_pfn() unchecked - confirm callers only pass mapped addresses */
     721           0 :         return page_to_pfn(vmalloc_to_page(vmalloc_addr));
     722             : }
     723             : EXPORT_SYMBOL(vmalloc_to_pfn);
     724             : 
     725             : 
     726             : /*** Global kva allocator ***/
     727             : 
     728             : #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 /* non-zero: augment_tree_propagate_from() also runs augment_tree_propagate_check() */
     729             : #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 /* NOTE(review): user of this switch is not visible in this part of the file */
     730             : 
     731             : 
     732             : static DEFINE_SPINLOCK(vmap_area_lock);
     733             : static DEFINE_SPINLOCK(free_vmap_area_lock);
     734             : /* Export for kexec only */
     735             : LIST_HEAD(vmap_area_list);
     736             : static struct rb_root vmap_area_root = RB_ROOT; /* tree searched by find_vmap_area_exceed_addr() */
     737             : static bool vmap_initialized __read_mostly;
     738             : 
     739             : static struct rb_root purge_vmap_area_root = RB_ROOT;
     740             : static LIST_HEAD(purge_vmap_area_list);
     741             : static DEFINE_SPINLOCK(purge_vmap_area_lock);
     742             : 
     743             : /*
     744             :  * This kmem_cache is used for vmap_area objects. Instead of
     745             :  * allocating from slab we reuse an object from this cache to
     746             :  * make things faster, especially for the "no edge" split of
     747             :  * a free block.
     748             :  */
     749             : static struct kmem_cache *vmap_area_cachep;
     750             : 
     751             : /*
     752             :  * This linked list is used in pair with free_vmap_area_root.
     753             :  * It gives O(1) access to prev/next to perform fast coalescing.
     754             :  */
     755             : static LIST_HEAD(free_vmap_area_list);
     756             : 
     757             : /*
     758             :  * This augmented red-black tree represents the free vmap space.
     759             :  * All vmap_area objects in this tree are sorted by va->va_start
     760             :  * address. It is used for allocation and merging when a vmap
     761             :  * object is released.
     762             :  *
     763             :  * Each vmap_area node contains the maximum available free block
     764             :  * of its sub-tree, right or left. Therefore it is possible to
     765             :  * find the lowest match of a free area.
     766             :  */
     767             : static struct rb_root free_vmap_area_root = RB_ROOT;
     768             : 
     769             : /*
     770             :  * Preload a CPU with one object for the "no edge" split case. The
     771             :  * aim is to get rid of allocations from atomic context, and thus
     772             :  * to be able to use more permissive allocation masks.
     773             :  */
     774             : static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
     775             : 
     776             : static __always_inline unsigned long
     777             : va_size(struct vmap_area *va)
     778             : { /* Size of the area: distance from va_start to va_end. */
     779        1686 :         return (va->va_end - va->va_start);
     780             : }
     781             : 
     782             : static __always_inline unsigned long
     783             : get_subtree_max_size(struct rb_node *node)
     784             : { /* Returns the subtree_max_size stored in node's vmap_area, or 0 when there is none. */
     785             :         struct vmap_area *va;
     786             : 
     787        4016 :         va = rb_entry_safe(node, struct vmap_area, rb_node);
     788        4016 :         return va ? va->subtree_max_size : 0; /* a missing child contributes 0 */
     789             : }
     790             : 
     791        2001 : RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
     792             :         struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) /* augment callbacks: subtree_max_size is derived from va_size() */
     793             : 
     794             : static void purge_vmap_area_lazy(void);
     795             : static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
     796             : static void drain_vmap_area_work(struct work_struct *work);
     797             : static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); /* work item bound to drain_vmap_area_work() */
     798             : 
     799             : static atomic_long_t nr_vmalloc_pages; /* value reported by vmalloc_nr_pages() */
     800             : 
     801           0 : unsigned long vmalloc_nr_pages(void)
     802             : { /* Returns the current value of the nr_vmalloc_pages counter. */
     803           0 :         return atomic_long_read(&nr_vmalloc_pages);
     804             : }
     805             : 
     806             : /* Look up the first VA which satisfies addr < va_end, NULL if none. */
     807             : static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
     808             : {
     809           0 :         struct vmap_area *va = NULL; /* best candidate seen so far */
     810           0 :         struct rb_node *n = vmap_area_root.rb_node;
     811             : 
     812           0 :         addr = (unsigned long)kasan_reset_tag((void *)addr);
     813             : 
     814           0 :         while (n) {
     815             :                 struct vmap_area *tmp;
     816             : 
     817           0 :                 tmp = rb_entry(n, struct vmap_area, rb_node);
     818           0 :                 if (tmp->va_end > addr) {
     819           0 :                         va = tmp; /* tmp qualifies; remember it */
     820           0 :                         if (tmp->va_start <= addr)
     821             :                                 break; /* addr lies inside tmp: done */
     822             : 
     823           0 :                         n = n->rb_left; /* look for a qualifying area at a lower address */
     824             :                 } else
     825           0 :                         n = n->rb_right; /* tmp ends at or below addr */
     826             :         }
     827             : 
     828             :         return va;
     829             : }
     830             : 
     831             : static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
     832             : { /* Returns the area in root with va_start <= addr < va_end, or NULL if there is none. */
     833         274 :         struct rb_node *n = root->rb_node;
     834             : 
     835         274 :         addr = (unsigned long)kasan_reset_tag((void *)addr);
     836             : 
     837        1604 :         while (n) {
     838             :                 struct vmap_area *va;
     839             : 
     840        1604 :                 va = rb_entry(n, struct vmap_area, rb_node);
     841        1604 :                 if (addr < va->va_start)
     842           0 :                         n = n->rb_left;
     843        1604 :                 else if (addr >= va->va_end) /* va_end itself is outside the area */
     844        1330 :                         n = n->rb_right;
     845             :                 else
     846             :                         return va;
     847             :         }
     848             : 
     849             :         return NULL;
     850             : }
     851             : 
     852             : /*
     853             :  * Find where va should be attached: return the address of the
     854             :  * left or right link to use and store the parent node in *parent.
     855             :  *
     856             :  * If va overlaps an existing node, NULL is returned. In that case
     857             :  * all further steps to insert the conflicting range have to be
     858             :  * declined; the overlap is reported via WARN() and is a bug.
     859             :  */
     860             : static __always_inline struct rb_node **
     861             : find_va_links(struct vmap_area *va,
     862             :         struct rb_root *root, struct rb_node *from,
     863             :         struct rb_node **parent)
     864             : {
     865             :         struct vmap_area *tmp_va;
     866             :         struct rb_node **link;
     867             : 
     868         275 :         if (root) {
     869         538 :                 link = &root->rb_node;
     870         538 :                 if (unlikely(!*link)) { /* empty tree: attach at the root, no parent */
     871             :                         *parent = NULL;
     872             :                         return link;
     873             :                 }
     874             :         } else {
     875             :                 link = &from; /* no root given: start the descent at 'from' */
     876             :         }
     877             : 
     878             :         /*
     879             :          * Go to the bottom of the tree. When we reach the last node
     880             :          * we have the parent rb_node and the correct direction (the
     881             :          * link) where the new va->rb_node will be attached.
     882             :          */
     883             :         do {
     884        1650 :                 tmp_va = rb_entry(*link, struct vmap_area, rb_node);
     885             : 
     886             :                 /*
     887             :                  * During the traversal we also do a sanity check:
     888             :                  * emit a WARN() and return NULL if there is a partial
     889             :                  * (left/right) or full overlap.
     890             :                  */
     891        1650 :                 if (va->va_end <= tmp_va->va_start)
     892          21 :                         link = &(*link)->rb_left;
     893        1629 :                 else if (va->va_start >= tmp_va->va_end)
     894        1629 :                         link = &(*link)->rb_right;
     895             :                 else {
     896           0 :                         WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
     897             :                                 va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
     898             : 
     899             :                         return NULL;
     900             :                 }
     901        1650 :         } while (*link);
     902             : 
     903         546 :         *parent = &tmp_va->rb_node; /* last node visited becomes the parent */
     904             :         return link;
     905             : }
     906             : 
     907             : static __always_inline struct list_head *
     908             : get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
     909             : { /* Returns the list node that would follow a VA attached at 'link' under 'parent', or NULL without a parent. */
     910             :         struct list_head *list;
     911             : 
     912         263 :         if (unlikely(!parent))
     913             :                 /*
     914             :                  * The red-black tree where we try to find VA neighbors
     915             :                  * before merging or inserting is empty, i.e. it means
     916             :                  * there is no free vmap space. Normally it does not
     917             :                  * happen but we handle this case anyway.
     918             :                  */
     919             :                 return NULL;
     920             : 
     921         257 :         list = &rb_entry(parent, struct vmap_area, rb_node)->list;
     922         257 :         return (&parent->rb_right == link ? list->next : list); /* right link: parent's list successor; left link: the parent itself */
     923             : }
     924             : 
     925             : static __always_inline void
     926             : __link_va(struct vmap_area *va, struct rb_root *root,
     927             :         struct rb_node *parent, struct rb_node **link,
     928             :         struct list_head *head, bool augment)
     929             : { /* Attach va to the rb-tree at (parent, link) and add it to the list after the chosen list node. */
     930             :         /*
     931             :          * VA is still not in the list, but we can
     932             :          * identify its future previous list_head node.
     933             :          */
     934         297 :         if (likely(parent)) { /* without a parent, the caller's head is used unchanged */
     935         289 :                 head = &rb_entry(parent, struct vmap_area, rb_node)->list;
     936         289 :                 if (&parent->rb_right != link)
     937           8 :                         head = head->prev; /* left link: va goes before the parent */
     938             :         }
     939             : 
     940             :         /* Insert to the rb-tree */
     941         594 :         rb_link_node(&va->rb_node, parent, link);
     942             :         if (augment) {
     943             :                 /*
     944             :                  * Some explanation here. Just perform simple insertion
     945             :                  * to the tree. We do not set va->subtree_max_size to
     946             :                  * its current size before calling rb_insert_augmented().
     947             :                  * It is because we populate the tree from the bottom
     948             :                  * to parent levels when the node _is_ in the tree.
     949             :                  *
     950             :                  * Therefore we set subtree_max_size to zero after insertion,
     951             :                  * to let augment_tree_propagate_from() put everything into
     952             :                  * the correct order later on.
     953             :                  */
     954          17 :                 rb_insert_augmented(&va->rb_node,
     955             :                         root, &free_vmap_area_rb_augment_cb);
     956          17 :                 va->subtree_max_size = 0;
     957             :         } else {
     958         280 :                 rb_insert_color(&va->rb_node, root);
     959             :         }
     960             : 
     961             :         /* Address-sort this list */
     962         314 :         list_add(&va->list, head);
     963             : }
     964             : 
     965             : static __always_inline void
     966             : link_va(struct vmap_area *va, struct rb_root *root,
     967             :         struct rb_node *parent, struct rb_node **link,
     968             :         struct list_head *head)
     969             : { /* __link_va() with augment == false. */
     970             :         __link_va(va, root, parent, link, head, false);
     971             : }
     972             : 
     973             : static __always_inline void
     974             : link_va_augment(struct vmap_area *va, struct rb_root *root,
     975             :         struct rb_node *parent, struct rb_node **link,
     976             :         struct list_head *head)
     977             : { /* __link_va() with augment == true. */
     978          17 :         __link_va(va, root, parent, link, head, true);
     979             : }
     980             : 
     981             : static __always_inline void
     982             : __unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
     983             : { /* Remove va from the rb-tree 'root' and from its list. */
     984         258 :         if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
     985             :                 return; /* RB_EMPTY_NODE() is true: warn and leave va untouched */
     986             : 
     987             :         if (augment)
     988           0 :                 rb_erase_augmented(&va->rb_node,
     989             :                         root, &free_vmap_area_rb_augment_cb);
     990             :         else
     991         258 :                 rb_erase(&va->rb_node, root);
     992             : 
     993         516 :         list_del_init(&va->list);
     994         258 :         RB_CLEAR_NODE(&va->rb_node); /* so the RB_EMPTY_NODE() check above catches a repeated unlink */
     995             : }
     996             : 
     997             : static __always_inline void
     998             : unlink_va(struct vmap_area *va, struct rb_root *root)
     999             : { /* __unlink_va() with augment == false. */
    1000           0 :         __unlink_va(va, root, false);
    1001             : }
    1002             : 
    1003             : static __always_inline void
    1004             : unlink_va_augment(struct vmap_area *va, struct rb_root *root)
    1005             : { /* __unlink_va() with augment == true. */
    1006           0 :         __unlink_va(va, root, true);
    1007             : }
    1008             : 
    1009             : #if DEBUG_AUGMENT_PROPAGATE_CHECK
    1010             : /*
    1011             :  * Expected subtree_max_size of va: max of its own size and both children's.
    1012             :  */
    1013             : static __always_inline unsigned long
    1014             : compute_subtree_max_size(struct vmap_area *va)
    1015             : {
    1016             :         return max3(va_size(va),
    1017             :                 get_subtree_max_size(va->rb_node.rb_left),
    1018             :                 get_subtree_max_size(va->rb_node.rb_right));
    1019             : }
    1020             : 
    1021             : static void
    1022             : augment_tree_propagate_check(void)
    1023             : { /* Debug check: compare each free area's stored subtree_max_size with a recomputed value. */
    1024             :         struct vmap_area *va;
    1025             :         unsigned long computed_size;
    1026             : 
    1027             :         list_for_each_entry(va, &free_vmap_area_list, list) {
    1028             :                 computed_size = compute_subtree_max_size(va);
    1029             :                 if (computed_size != va->subtree_max_size) /* mismatch is only reported, not repaired */
    1030             :                         pr_emerg("tree is corrupted: %lu, %lu\n",
    1031             :                                 va_size(va), va->subtree_max_size);
    1032             :         }
    1033             : }
    1034             : #endif
    1035             : 
    1036             : /*
    1037             :  * This function populates subtree_max_size from bottom to upper
    1038             :  * levels starting from VA point. The propagation must be done
    1039             :  * when VA size is modified by changing its va_start/va_end, or
    1040             :  * when a VA is newly inserted into the tree.
    1041             :  *
    1042             :  * It means that augment_tree_propagate_from() must be called:
    1043             :  * - After VA has been inserted to the tree(free path);
    1044             :  * - After VA has been shrunk(allocation path);
    1045             :  * - After VA has been increased(merging path).
    1046             :  *
    1047             :  * Please note that it does not mean that upper parent nodes
    1048             :  * and their subtree_max_size are recalculated all the time up
    1049             :  * to the root node.
    1050             :  *
    1051             :  *       4--8
    1052             :  *        /\
    1053             :  *       /  \
    1054             :  *      /    \
    1055             :  *    2--2  8--8
    1056             :  *
    1057             :  * For example if we modify the node 4, shrinking it to 2, then
    1058             :  * no modification is required. If we shrink the node 2 to 1
    1059             :  * its subtree_max_size is updated only, and set to 1. If we shrink
    1060             :  * the node 8 to 6, then its subtree_max_size is set to 6 and parent
    1061             :  * node becomes 4--6.
    1062             :  */
    1063             : static __always_inline void
    1064             : augment_tree_propagate_from(struct vmap_area *va)
    1065             : {
    1066             :         /*
    1067             :          * Populate the tree from bottom towards the root until
    1068             :          * the calculated maximum available size of checked node
    1069             :          * is equal to its current one.
    1070             :          */
    1071         296 :         free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
    1072             : 
    1073             : #if DEBUG_AUGMENT_PROPAGATE_CHECK
    1074             :         augment_tree_propagate_check(); /* debug builds only: verify every free area after the update */
    1075             : #endif
    1076             : }
    1077             : 
    1078             : static void
    1079         274 : insert_vmap_area(struct vmap_area *va,
    1080             :         struct rb_root *root, struct list_head *head)
    1081             : { /* Insert va into the (non-augmented) tree 'root' and the address-sorted list 'head'. */
    1082             :         struct rb_node **link;
    1083             :         struct rb_node *parent;
    1084             : 
    1085         274 :         link = find_va_links(va, root, NULL, &parent);
    1086         274 :         if (link) /* NULL means find_va_links() found an overlap; va is then not inserted */
    1087         274 :                 link_va(va, root, parent, link, head);
    1088         274 : }
    1089             : 
    1090             : static void
    1091          17 : insert_vmap_area_augment(struct vmap_area *va,
    1092             :         struct rb_node *from, struct rb_root *root,
    1093             :         struct list_head *head)
    1094             : { /* Augmented-tree insert; when 'from' is set the search for the slot starts there instead of at the root. */
    1095             :         struct rb_node **link;
    1096             :         struct rb_node *parent;
    1097             : 
    1098          17 :         if (from)
    1099             :                 link = find_va_links(va, NULL, from, &parent);
    1100             :         else
    1101             :                 link = find_va_links(va, root, NULL, &parent);
    1102             : 
    1103          17 :         if (link) { /* NULL means an overlap was found; va is then not inserted */
    1104          34 :                 link_va_augment(va, root, parent, link, head);
    1105             :                 augment_tree_propagate_from(va); /* link_va_augment() left subtree_max_size at 0; fix it up */
    1106             :         }
    1107          17 : }
    1108             : 
    1109             : /*
    1110             :  * Merge a de-allocated chunk of VA memory with the previous
    1111             :  * and next free blocks. If no coalescing is possible, a new
    1112             :  * free area is inserted. If VA has been merged, it is
    1113             :  * freed.
    1114             :  *
    1115             :  * Please note, it can return NULL in case of overlapping
    1116             :  * ranges, which is also reported via WARN(). Although that
    1117             :  * is buggy behaviour, the system can stay alive and keep
    1118             :  * going.
    1119             :  */
    1120             : static __always_inline struct vmap_area *
    1121             : __merge_or_add_vmap_area(struct vmap_area *va,
    1122             :         struct rb_root *root, struct list_head *head, bool augment)
    1123             : {
    1124             :         struct vmap_area *sibling;
    1125             :         struct list_head *next;
    1126             :         struct rb_node **link;
    1127             :         struct rb_node *parent;
    1128         263 :         bool merged = false;
    1129             : 
    1130             :         /*
    1131             :          * Find a place in the tree where VA potentially will be
    1132             :          * inserted, unless it is merged with its sibling/siblings.
    1133             :          */
    1134         263 :         link = find_va_links(va, root, NULL, &parent);
    1135         263 :         if (!link)
    1136             :                 return NULL;
    1137             : 
    1138             :         /*
    1139             :          * Get next node of VA to check if merging can be done.
    1140             :          */
    1141         526 :         next = get_va_next_sibling(parent, link);
    1142         263 :         if (unlikely(next == NULL))
    1143             :                 goto insert;
    1144             : 
    1145             :         /*
    1146             :          * start            end
    1147             :          * |                |
    1148             :          * |<------VA------>|<-----Next----->|
    1149             :          *                  |                |
    1150             :          *                  start            end
    1151             :          */
    1152         257 :         if (next != head) {
    1153           5 :                 sibling = list_entry(next, struct vmap_area, list);
    1154           5 :                 if (sibling->va_start == va->va_end) {
    1155           5 :                         sibling->va_start = va->va_start;
    1156             : 
    1157             :                         /* Free vmap_area object. */
    1158           5 :                         kmem_cache_free(vmap_area_cachep, va);
    1159             : 
    1160             :                         /* Point to the new merged area. */
    1161           5 :                         va = sibling;
    1162           5 :                         merged = true;
    1163             :                 }
    1164             :         }
    1165             : 
    1166             :         /*
    1167             :          * start            end
    1168             :          * |                |
    1169             :          * |<-----Prev----->|<------VA------>|
    1170             :          *                  |                |
    1171             :          *                  start            end
    1172             :          */
    1173         257 :         if (next->prev != head) {
    1174         257 :                 sibling = list_entry(next->prev, struct vmap_area, list);
    1175         257 :                 if (sibling->va_end == va->va_start) {
    1176             :                         /*
    1177             :                          * If both neighbors are coalesced, it is important
    1178             :                          * to unlink the "next" node first, followed by merging
    1179             :                          * with "previous" one. Otherwise the tree might not be
    1180             :                          * fully populated if a sibling's augmented value is
    1181             :                          * "normalized" because of rotation operations.
    1182             :                          */
    1183         252 :                         if (merged)
    1184           0 :                                 __unlink_va(va, root, augment);
    1185             : 
    1186         252 :                         sibling->va_end = va->va_end;
    1187             : 
    1188             :                         /* Free vmap_area object. */
    1189         252 :                         kmem_cache_free(vmap_area_cachep, va);
    1190             : 
    1191             :                         /* Point to the new merged area. */
    1192         252 :                         va = sibling;
    1193         252 :                         merged = true;
    1194             :                 }
    1195             :         }
    1196             : 
    1197             : insert:
    1198         263 :         if (!merged)
    1199           6 :                 __link_va(va, root, parent, link, head, augment);
    1200             : 
    1201             :         return va;
    1202             : }
    1203             : /*
                     :  * Convenience wrapper around __merge_or_add_vmap_area(): forwards
                     :  * @va, @root and @head and passes false as the last argument (the
                     :  * flag that function hands on to __unlink_va()/__link_va() as
                     :  * "augment"). The callee's return value is passed through as is.
                     :  */
    1204             : static __always_inline struct vmap_area *
    1205             : merge_or_add_vmap_area(struct vmap_area *va,
    1206             :         struct rb_root *root, struct list_head *head)
    1207             : {
    1208         258 :         return __merge_or_add_vmap_area(va, root, head, false);
    1209             : }
    1210             : /*
                     :  * Same as merge_or_add_vmap_area(), but passes true as the last
                     :  * argument of __merge_or_add_vmap_area() and, when that call
                     :  * returns a non-NULL area, runs augment_tree_propagate_from() on it.
                     :  *
                     :  * Returns the area produced by __merge_or_add_vmap_area(), or NULL
                     :  * if that call returned NULL.
                     :  */
    1211             : static __always_inline struct vmap_area *
    1212             : merge_or_add_vmap_area_augment(struct vmap_area *va,
    1213             :         struct rb_root *root, struct list_head *head)
    1214             : {
    1215           5 :         va = __merge_or_add_vmap_area(va, root, head, true);
    1216           5 :         if (va)
    1217             :                 augment_tree_propagate_from(va);
    1218             : 
    1219             :         return va;
    1220             : }
    1221             : /*
                     :  * Check whether a request of @size bytes, aligned to @align and
                     :  * placed no lower than @vstart, fits inside the area @va.
                     :  *
                     :  * The candidate start is the larger of va->va_start and @vstart,
                     :  * rounded up with ALIGN(). Returns false if that rounding or the
                     :  * addition of @size wraps around; otherwise returns true when the
                     :  * candidate range ends at or before va->va_end.
                     :  */
    1222             : static __always_inline bool
    1223             : is_within_this_va(struct vmap_area *va, unsigned long size,
    1224             :         unsigned long align, unsigned long vstart)
    1225             : {
    1226             :         unsigned long nva_start_addr;
    1227             : 
    1228        2145 :         if (va->va_start > vstart)
    1229        1871 :                 nva_start_addr = ALIGN(va->va_start, align);
    1230             :         else
    1231         274 :                 nva_start_addr = ALIGN(vstart, align);
    1232             : 
    1233             :         /* ALIGN() or the addition can wrap around for a big size or alignment. */
    1234        2145 :         if (nva_start_addr + size < nva_start_addr ||
    1235             :                         nva_start_addr < vstart)
    1236             :                 return false;
    1237             : 
    1238        2145 :         return (nva_start_addr + size <= va->va_end);
    1239             : }
    1240             : 
    1241             : /*
    1242             :  * Find the first free block (lowest start address) in the tree
    1243             :  * that satisfies the request described by the passed
    1244             :  * parameters. Please note, with an alignment bigger than PAGE_SIZE,
    1245             :  * a search length is adjusted to account for worst case alignment
    1246             :  * overhead.
                     :  *
                     :  * @root:  root of the rb-tree of struct vmap_area nodes to search
                     :  * @size:  requested size
                     :  * @align: requested alignment
                     :  * @vstart: lowest acceptable start address
                     :  * @adjust_search_size: when true, sub-trees are pruned using
                     :  *         @size + @align - 1 instead of @size
                     :  *
                     :  * Sub-trees are pruned with get_subtree_max_size(); the final
                     :  * decision for a node is made by is_within_this_va().
                     :  *
                     :  * Returns the matching area, or NULL if the walk runs out of nodes.
    1247             :  */
    1248             : static __always_inline struct vmap_area *
    1249             : find_vmap_lowest_match(struct rb_root *root, unsigned long size,
    1250             :         unsigned long align, unsigned long vstart, bool adjust_search_size)
    1251             : {
    1252             :         struct vmap_area *va;
    1253             :         struct rb_node *node;
    1254             :         unsigned long length;
    1255             : 
    1256             :         /* Start from the root. */
    1257         274 :         node = root->rb_node;
    1258             : 
    1259             :         /* Adjust the search size for alignment overhead. */
    1260         274 :         length = adjust_search_size ? size + align - 1 : size;
    1261             : 
    1262        2145 :         while (node) {
    1263        2145 :                 va = rb_entry(node, struct vmap_area, rb_node);
    1264             :                 /* Try lower addresses first if the left sub-tree can hold the request. */
    1265        4830 :                 if (get_subtree_max_size(node->rb_left) >= length &&
    1266         540 :                                 vstart < va->va_start) {
    1267             :                         node = node->rb_left;
    1268             :                 } else {
    1269        1605 :                         if (is_within_this_va(va, size, align, vstart))
    1270             :                                 return va;
    1271             : 
    1272             :                         /*
    1273             :                          * Does not make sense to go deeper towards the right
    1274             :                          * sub-tree if it does not have a free block that is
    1275             :                          * equal or bigger to the requested search length.
    1276             :                          */
    1277        2664 :                         if (get_subtree_max_size(node->rb_right) >= length) {
    1278        1059 :                                 node = node->rb_right;
    1279        1059 :                                 continue;
    1280             :                         }
    1281             : 
    1282             :                         /*
    1283             :                          * OK. We roll back and find the first right sub-tree,
    1284             :                          * that will satisfy the search criteria. It can happen
    1285             :                          * due to "vstart" restriction or an alignment overhead
    1286             :                          * that is bigger than PAGE_SIZE.
    1287             :                          */
    1288         540 :                         while ((node = rb_parent(node))) {
    1289         540 :                                 va = rb_entry(node, struct vmap_area, rb_node);
    1290         540 :                                 if (is_within_this_va(va, size, align, vstart))
    1291             :                                         return va;
    1292             : 
    1293        1078 :                                 if (get_subtree_max_size(node->rb_right) >= length &&
    1294             :                                                 vstart <= va->va_start) {
    1295             :                                         /*
    1296             :                                          * Shift the vstart forward. Please note, we update it with
    1297             :                                          * parent's start address adding "1" because we do not want
    1298             :                                          * to enter same sub-tree after it has already been checked
    1299             :                                          * and no suitable free block found there.
    1300             :                                          */
    1301         272 :                                         vstart = va->va_start + 1;
    1302         272 :                                         node = node->rb_right;
    1303             :                                         break;
    1304             :                                 }
    1305             :                         }
    1306             :                 }
    1307             :         }
    1308             : 
    1309             :         return NULL;
    1310             : }
    1311             : /*
                     :  * Debug-only self check, built when DEBUG_AUGMENT_LOWEST_MATCH_CHECK
                     :  * is non-zero: the tree search is compared with a plain list walk.
                     :  */
    1312             : #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
    1313             : #include <linux/random.h>
    1314             : /*
                     :  * Walk @head in list order and return the first area for which
                     :  * is_within_this_va() succeeds, or NULL if there is none.
                     :  */
    1315             : static struct vmap_area *
    1316             : find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
    1317             :         unsigned long align, unsigned long vstart)
    1318             : {
    1319             :         struct vmap_area *va;
    1320             : 
    1321             :         list_for_each_entry(va, head, list) {
    1322             :                 if (!is_within_this_va(va, size, align, vstart))
    1323             :                         continue;
    1324             : 
    1325             :                 return va;
    1326             :         }
    1327             : 
    1328             :         return NULL;
    1329             : }
    1330             : /*
                     :  * Pick vstart = VMALLOC_START plus a random unsigned int, run both
                     :  * find_vmap_lowest_match() (without search-size adjustment) and
                     :  * find_vmap_lowest_linear_match() for @size/@align, and log with
                     :  * pr_emerg() if the two searches return different areas.
                     :  */
    1331             : static void
    1332             : find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
    1333             :                              unsigned long size, unsigned long align)
    1334             : {
    1335             :         struct vmap_area *va_1, *va_2;
    1336             :         unsigned long vstart;
    1337             :         unsigned int rnd;
    1338             : 
    1339             :         get_random_bytes(&rnd, sizeof(rnd));
    1340             :         vstart = VMALLOC_START + rnd;
    1341             : 
    1342             :         va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
    1343             :         va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
    1344             : 
    1345             :         if (va_1 != va_2)
    1346             :                 pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
    1347             :                         va_1, va_2, vstart);
    1348             : }
    1349             : #endif
    1350             : /*
                     :  * How a requested range lies inside a free area, as computed by
                     :  * classify_va_fit_type() and acted upon by adjust_va_to_fit_type().
                     :  */
    1351             : enum fit_type {
    1352             :         NOTHING_FIT = 0,        /* request is not inside the area */
    1353             :         FL_FIT_TYPE = 1,        /* full fit */
    1354             :         LE_FIT_TYPE = 2,        /* left edge fit */
    1355             :         RE_FIT_TYPE = 3,        /* right edge fit */
    1356             :         NE_FIT_TYPE = 4         /* no edge fit */
    1357             : };
    1358             : /*
                     :  * Classify how the range [@nva_start_addr, @nva_start_addr + @size)
                     :  * sits inside @va:
                     :  *   NOTHING_FIT - the range is not fully inside @va
                     :  *   FL_FIT_TYPE - both edges coincide with those of @va
                     :  *   LE_FIT_TYPE - only the start coincides
                     :  *   RE_FIT_TYPE - only the end coincides
                     :  *   NE_FIT_TYPE - neither edge coincides
                     :  */
    1359             : static __always_inline enum fit_type
    1360             : classify_va_fit_type(struct vmap_area *va,
    1361             :         unsigned long nva_start_addr, unsigned long size)
    1362             : {
    1363             :         enum fit_type type;
    1364             : 
    1365             :         /* Check if it is within VA. */
    1366         548 :         if (nva_start_addr < va->va_start ||
    1367         274 :                         nva_start_addr + size > va->va_end)
    1368             :                 return NOTHING_FIT;
    1369             : 
    1370             :         /* Now classify. */
    1371         274 :         if (va->va_start == nva_start_addr) {
    1372         258 :                 if (va->va_end == nva_start_addr + size)
    1373             :                         type = FL_FIT_TYPE;
    1374             :                 else
    1375         258 :                         type = LE_FIT_TYPE;
    1376          16 :         } else if (va->va_end == nva_start_addr + size) {
    1377             :                 type = RE_FIT_TYPE;
    1378             :         } else {
    1379          16 :                 type = NE_FIT_TYPE;
    1380             :         }
    1381             : 
    1382             :         return type;
    1383             : }
    1384             : /*
                     :  * Carve [@nva_start_addr, @nva_start_addr + @size) out of the area
                     :  * @va, which is linked in @root/@head, according to its fit type:
                     :  *   FL_FIT_TYPE - @va is unlinked and freed;
                     :  *   LE_FIT_TYPE - @va's start is moved up by @size;
                     :  *   RE_FIT_TYPE - @va's end is moved down to @nva_start_addr;
                     :  *   NE_FIT_TYPE - @va keeps the right remainder and a second
                     :  *                 vmap_area is inserted for the left remainder.
                     :  *
                     :  * Returns 0 on success, or -1 if the range does not fit in @va or
                     :  * the extra vmap_area needed for an NE_FIT_TYPE split cannot be
                     :  * obtained.
                     :  */
    1385             : static __always_inline int
    1386             : adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
    1387             :                       struct vmap_area *va, unsigned long nva_start_addr,
    1388             :                       unsigned long size)
    1389             : {
    1390         274 :         struct vmap_area *lva = NULL;
    1391         274 :         enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
    1392             : 
    1393         274 :         if (type == FL_FIT_TYPE) {
    1394             :                 /*
    1395             :                  * No need to split VA, it fully fits.
    1396             :                  *
    1397             :                  * |               |
    1398             :                  * V      NVA      V
    1399             :                  * |---------------|
    1400             :                  */
    1401           0 :                 unlink_va_augment(va, root);
    1402           0 :                 kmem_cache_free(vmap_area_cachep, va);
    1403         274 :         } else if (type == LE_FIT_TYPE) {
    1404             :                 /*
    1405             :                  * Split left edge of fit VA.
    1406             :                  *
    1407             :                  * |       |
    1408             :                  * V  NVA  V   R
    1409             :                  * |-------|-------|
    1410             :                  */
    1411         258 :                 va->va_start += size;
    1412          16 :         } else if (type == RE_FIT_TYPE) {
    1413             :                 /*
    1414             :                  * Split right edge of fit VA.
    1415             :                  *
    1416             :                  *         |       |
    1417             :                  *     L   V  NVA  V
    1418             :                  * |-------|-------|
    1419             :                  */
    1420           0 :                 va->va_end = nva_start_addr;
    1421          16 :         } else if (type == NE_FIT_TYPE) {
    1422             :                 /*
    1423             :                  * Split no edge of fit VA.
    1424             :                  *
    1425             :                  *     |       |
    1426             :                  *   L V  NVA  V R
    1427             :                  * |---|-------|---|
    1428             :                  */
    1429          16 :                 lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
    1430          16 :                 if (unlikely(!lva)) {
    1431             :                         /*
    1432             :                          * For percpu allocator we do not do any pre-allocation
    1433             :                          * and leave it as it is. The reason is it most likely
    1434             :                          * never ends up with NE_FIT_TYPE splitting. In case of
    1435             :                          * percpu allocations offsets and sizes are aligned to
    1436             :                          * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
    1437             :                          * are its main fitting cases.
    1438             :                          *
    1439             :                          * There are a few exceptions though, as an example it is
    1440             :                          * a first allocation (early boot up) when we have "one"
    1441             :                          * big free space that has to be split.
    1442             :                          *
    1443             :                          * Also we can hit this path in case of regular "vmap"
    1444             :                          * allocations, if "this" current CPU was not preloaded.
    1445             :                          * See the comment in alloc_vmap_area() why. If so, then
    1446             :                          * GFP_NOWAIT is used instead to get an extra object for
    1447             :                          * split purpose. That is rare and most of the time does
    1448             :                          * not occur.
    1449             :                          *
    1450             :                          * What happens if an allocation fails? Basically,
    1451             :                          * an "overflow" path is triggered to purge lazily freed
    1452             :                          * areas to free some memory, then, the "retry" path is
    1453             :                          * triggered to repeat one more time. See more details
    1454             :                          * in alloc_vmap_area() function.
    1455             :                          */
    1456           0 :                         lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
    1457           0 :                         if (!lva)
    1458             :                                 return -1;
    1459             :                 }
    1460             : 
    1461             :                 /*
    1462             :                  * Build the remainder.
    1463             :                  */
    1464          16 :                 lva->va_start = va->va_start;
    1465          16 :                 lva->va_end = nva_start_addr;
    1466             : 
    1467             :                 /*
    1468             :                  * Shrink this VA to remaining size.
    1469             :                  */
    1470          16 :                 va->va_start = nva_start_addr + size;
    1471             :         } else {
    1472             :                 return -1;
    1473             :         }
    1474             :         /* @va was resized (not freed): refresh the augmented data from it. */
    1475         274 :         if (type != FL_FIT_TYPE) {
    1476         274 :                 augment_tree_propagate_from(va);
    1477             : 
    1478         274 :                 if (lva)        /* type == NE_FIT_TYPE */
    1479          16 :                         insert_vmap_area_augment(lva, &va->rb_node, root, head);
    1480             :         }
    1481             : 
    1482             :         return 0;
    1483             : }
    1484             : 
    1485             : /*
    1486             :  * Returns a start address of the newly allocated area, if success.
    1487             :  * Otherwise a vend is returned that indicates failure.
                     :  *
                     :  * @root/@head: tree and list of free areas to allocate from
                     :  * @size/@align: requested size and alignment
                     :  * @vstart/@vend: bounds the allocated range must stay within
                     :  *
                     :  * Failure (@vend) is reported when no free area matches, when the
                     :  * aligned start plus @size would exceed @vend, or when
                     :  * adjust_va_to_fit_type() returns non-zero (which also triggers
                     :  * WARN_ON_ONCE()).
                     :  *
                     :  * The visible caller, alloc_vmap_area(), invokes this with
                     :  * free_vmap_area_lock held.
    1488             :  */
    1489             : static __always_inline unsigned long
    1490             : __alloc_vmap_area(struct rb_root *root, struct list_head *head,
    1491             :         unsigned long size, unsigned long align,
    1492             :         unsigned long vstart, unsigned long vend)
    1493             : {
    1494         274 :         bool adjust_search_size = true;
    1495             :         unsigned long nva_start_addr;
    1496             :         struct vmap_area *va;
    1497             :         int ret;
    1498             : 
    1499             :         /*
    1500             :          * Do not adjust when:
    1501             :          *   a) align <= PAGE_SIZE, because it does not make any sense.
    1502             :          *      All blocks(their start addresses) are at least PAGE_SIZE
    1503             :          *      aligned anyway;
    1504             :          *   b) a short range where a requested size corresponds to exactly
    1505             :          *      specified [vstart:vend] interval and an alignment > PAGE_SIZE.
    1506             :          *      With adjusted search length an allocation would not succeed.
    1507             :          */
    1508         274 :         if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
    1509         258 :                 adjust_search_size = false;
    1510             : 
    1511         548 :         va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
    1512         274 :         if (unlikely(!va))
    1513             :                 return vend;
    1514             :         /* Same start computation as in is_within_this_va(). */
    1515         274 :         if (va->va_start > vstart)
    1516         273 :                 nva_start_addr = ALIGN(va->va_start, align);
    1517             :         else
    1518           1 :                 nva_start_addr = ALIGN(vstart, align);
    1519             : 
    1520             :         /* Check the "vend" restriction. */
    1521         274 :         if (nva_start_addr + size > vend)
    1522             :                 return vend;
    1523             : 
    1524             :         /* Update the free vmap_area. */
    1525         274 :         ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
    1526         274 :         if (WARN_ON_ONCE(ret))
    1527             :                 return vend;
    1528             : 
    1529             : #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
    1530             :         find_vmap_lowest_match_check(root, head, size, align);
    1531             : #endif
    1532             : 
    1533             :         return nva_start_addr;
    1534             : }
    1535             : 
    1536             : /*
    1537             :  * Free a region of KVA allocated by alloc_vmap_area
                     :  *
                     :  * @va is first unlinked from vmap_area_root under vmap_area_lock and
                     :  * then merged into, or inserted in, the free tree/list under
                     :  * free_vmap_area_lock. The two locks are taken one after the other,
                     :  * never nested. The result of the merge is not checked here.
                     :  *
                     :  * The merge step may release the vmap_area object itself (see the
                     :  * kmem_cache_free() calls in __merge_or_add_vmap_area()), so the
                     :  * caller must not touch @va afterwards.
    1538             :  */
    1539           0 : static void free_vmap_area(struct vmap_area *va)
    1540             : {
    1541             :         /*
    1542             :          * Remove from the busy tree/list.
    1543             :          */
    1544           0 :         spin_lock(&vmap_area_lock);
    1545           0 :         unlink_va(va, &vmap_area_root);
    1546           0 :         spin_unlock(&vmap_area_lock);
    1547             : 
    1548             :         /*
    1549             :          * Insert/Merge it back to the free tree/list.
    1550             :          */
    1551           0 :         spin_lock(&free_vmap_area_lock);
    1552           0 :         merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
    1553           0 :         spin_unlock(&free_vmap_area_lock);
    1554           0 : }
    1555             : /*
                     :  * Take @lock, first making a best-effort attempt to fill this
                     :  * CPU's ne_fit_preload_node slot. Returns with @lock held; the
                     :  * caller is responsible for unlocking it.
                     :  *
                     :  * @gfp_mask and @node are passed to kmem_cache_alloc_node() for the
                     :  * preload object. A failed preload allocation is not reported.
                     :  */
    1556             : static inline void
    1557         274 : preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
    1558             : {
    1559         274 :         struct vmap_area *va = NULL;
    1560             : 
    1561             :         /*
    1562             :          * Preload this CPU with one extra vmap_area object. It is used
    1563             :          * when fit type of free area is NE_FIT_TYPE. It guarantees that
    1564             :          * a CPU that does an allocation is preloaded.
    1565             :          *
    1566             :          * We do it in non-atomic context, thus it allows us to use more
    1567             :          * permissive allocation masks to be more stable under low memory
    1568             :          * condition and high memory pressure.
    1569             :          */
    1570         274 :         if (!this_cpu_read(ne_fit_preload_node))
    1571          17 :                 va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
    1572             : 
    1573         274 :         spin_lock(lock);
    1574             :         /* Install the spare; free it if the exchange reports the slot was no longer empty. */
    1575         274 :         if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
    1576           0 :                 kmem_cache_free(vmap_area_cachep, va);
    1577         274 : }
    1578             : 
    1579             : /*
    1580             :  * Allocate a region of KVA of the specified size and alignment, within the
    1581             :  * vstart and vend.
                     :  *
                     :  * @size:     must be non-zero and a multiple of the page size
                     :  * @align:    must be a power of two
                     :  * @node:     node hint passed to kmem_cache_alloc_node()
                     :  * @gfp_mask: allocation flags; only the GFP_RECLAIM_MASK bits are kept
                     :  * @va_flags: stored in the new area's ->flags
                     :  *
                     :  * May sleep (see might_sleep()). On success the new area has been
                     :  * inserted into vmap_area_root/vmap_area_list and its ->vm is NULL.
                     :  *
                     :  * Returns the new vmap_area, or an ERR_PTR():
                     :  *   -EINVAL  bad @size or @align;
                     :  *   -EBUSY   vmap is not initialized yet, or no address range was
                     :  *            found even after purging and notifying;
                     :  *   -ENOMEM  the vmap_area object could not be allocated;
                     :  *   the error returned by kasan_populate_vmalloc(), in which case
                     :  *   the area is released again with free_vmap_area().
    1582             :  */
    1583         274 : static struct vmap_area *alloc_vmap_area(unsigned long size,
    1584             :                                 unsigned long align,
    1585             :                                 unsigned long vstart, unsigned long vend,
    1586             :                                 int node, gfp_t gfp_mask,
    1587             :                                 unsigned long va_flags)
    1588             : {
    1589             :         struct vmap_area *va;
    1590             :         unsigned long freed;
    1591             :         unsigned long addr;
    1592         274 :         int purged = 0;
    1593             :         int ret;
    1594             : 
    1595         548 :         if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
    1596             :                 return ERR_PTR(-EINVAL);
    1597             : 
    1598         274 :         if (unlikely(!vmap_initialized))
    1599             :                 return ERR_PTR(-EBUSY);
    1600             : 
    1601             :         might_sleep();
    1602         274 :         gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
    1603             : 
    1604         274 :         va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
    1605         274 :         if (unlikely(!va))
    1606             :                 return ERR_PTR(-ENOMEM);
    1607             : 
    1608             :         /*
    1609             :          * Only scan the relevant parts containing pointers to other objects
    1610             :          * to avoid false negatives.
    1611             :          */
    1612             :         kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
    1613             : 
    1614             : retry:
    1615         274 :         preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
    1616         274 :         addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
    1617             :                 size, align, vstart, vend);
    1618         274 :         spin_unlock(&free_vmap_area_lock);
    1619             : 
    1620         274 :         trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
    1621             : 
    1622             :         /*
    1623             :          * If an allocation fails, the "vend" address is
    1624             :          * returned. Therefore trigger the overflow path.
    1625             :          */
    1626         274 :         if (unlikely(addr == vend))
    1627             :                 goto overflow;
    1628             : 
    1629         274 :         va->va_start = addr;
    1630         274 :         va->va_end = addr + size;
    1631         274 :         va->vm = NULL;
    1632         274 :         va->flags = va_flags;
    1633             : 
    1634         274 :         spin_lock(&vmap_area_lock);
    1635         274 :         insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
    1636         274 :         spin_unlock(&vmap_area_lock);
    1637             : 
    1638         274 :         BUG_ON(!IS_ALIGNED(va->va_start, align));
    1639         274 :         BUG_ON(va->va_start < vstart);
    1640         274 :         BUG_ON(va->va_end > vend);
    1641             : 
    1642             :         ret = kasan_populate_vmalloc(addr, size);
    1643             :         if (ret) {
    1644             :                 free_vmap_area(va);
    1645             :                 return ERR_PTR(ret);
    1646             :         }
    1647             : 
    1648             :         return va;
    1649             :         /* Failure path: purge once, then retry for as long as the notifier chain frees something. */
    1650             : overflow:
    1651           0 :         if (!purged) {
    1652           0 :                 purge_vmap_area_lazy();
    1653           0 :                 purged = 1;
    1654           0 :                 goto retry;
    1655             :         }
    1656             : 
    1657           0 :         freed = 0;
    1658           0 :         blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
    1659             : 
    1660           0 :         if (freed > 0) {
    1661             :                 purged = 0;
    1662             :                 goto retry;
    1663             :         }
    1664             : 
    1665           0 :         if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
    1666           0 :                 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
    1667             :                         size);
    1668             : 
    1669           0 :         kmem_cache_free(vmap_area_cachep, va);
    1670           0 :         return ERR_PTR(-EBUSY);
    1671             : }
    1672             : /*
                     :  * Add @nb to vmap_notify_list, the blocking notifier chain that
                     :  * alloc_vmap_area() calls when it cannot find address space.
                     :  * Returns the result of blocking_notifier_chain_register().
                     :  */
    1673           0 : int register_vmap_purge_notifier(struct notifier_block *nb)
    1674             : {
    1675           0 :         return blocking_notifier_chain_register(&vmap_notify_list, nb);
    1676             : }
    1677             : EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
    1678             : /*
                     :  * Remove @nb from the vmap_notify_list blocking notifier chain.
                     :  * Returns the result of blocking_notifier_chain_unregister().
                     :  */
    1679           0 : int unregister_vmap_purge_notifier(struct notifier_block *nb)
    1680             : {
    1681           0 :         return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
    1682             : }
    1683             : EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
    1684             : 
    1685             : /*
    1686             :  * lazy_max_pages is the maximum amount of virtual address space we gather up
    1687             :  * before attempting to purge with a TLB flush.
    1688             :  *
    1689             :  * There is a tradeoff here: a larger number will cover more kernel page tables
    1690             :  * and take slightly longer to purge, but it will linearly reduce the number of
    1691             :  * global TLB flushes that must be performed. It would seem natural to scale
    1692             :  * this number up linearly with the number of CPUs (because vmapping activity
    1693             :  * could also scale linearly with the number of CPUs), however it is likely
    1694             :  * that in practice, workloads might be constrained in other ways that mean
    1695             :  * vmap activity will not scale linearly with CPUs. Also, I want to be
    1696             :  * conservative and not introduce a big latency on huge systems, so go with
    1697             :  * a less aggressive log scale. It will still be an improvement over the old
    1698             :  * code, and it will be simple to change the scale factor if we find that it
    1699             :  * becomes a problem on bigger systems.
                     :  *
                     :  * Returns fls(num_online_cpus()) multiplied by the number of pages
                     :  * in 32MB.
    1700             :  */
    1701             : static unsigned long lazy_max_pages(void)
    1702             : {
    1703             :         unsigned int log;
    1704             : 
    1705         268 :         log = fls(num_online_cpus());
    1706             : 
    1707         268 :         return log * (32UL * 1024 * 1024 / PAGE_SIZE);
    1708             : }
    1709             : /*
                     :  * Atomic counter, initially zero.
                     :  * NOTE(review): presumably the number of pages awaiting lazy purge;
                     :  * its users are outside this chunk - confirm.
                     :  */
    1710             : static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
    1711             : 
    1712             : /*
    1713             :  * Serialize vmap purging.  There is no actual critical section protected
    1714             :  * by this lock, but we want to avoid concurrent calls for performance
    1715             :  * reasons and to make the pcpu_get_vm_areas more deterministic.
    1716             :  */
    1717             : static DEFINE_MUTEX(vmap_purge_lock);
    1718             : 
    1719             : /* Forward declaration; for per-CPU blocks. */
    1720             : static void purge_fragmented_blocks_allcpus(void);
    1721             : 
    1722             : /* Purge all lazily-freed vmap areas; the caller must hold vmap_purge_lock.
    1723             :  * The TLB is flushed over start..end widened to cover the queued areas.
    1724             :  * Returns true if at least one area was moved to the free tree. */
    1725           5 : static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
    1726             : {
    1727             :         unsigned long resched_threshold;
    1728           5 :         unsigned int num_purged_areas = 0;
    1729             :         struct list_head local_purge_list;
    1730             :         struct vmap_area *va, *n_va;
    1731             : 
    1732             :         lockdep_assert_held(&vmap_purge_lock);
    1733             :         /* Detach the whole purge list so it is processed without purge_vmap_area_lock. */
    1734           5 :         spin_lock(&purge_vmap_area_lock);
    1735           5 :         purge_vmap_area_root = RB_ROOT;
    1736           5 :         list_replace_init(&purge_vmap_area_list, &local_purge_list);
    1737           5 :         spin_unlock(&purge_vmap_area_lock);
    1738             : 
    1739           5 :         if (unlikely(list_empty(&local_purge_list)))
    1740             :                 goto out;
    1741             :         /* Widen the flush range using the first and last queued areas. */
    1742           5 :         start = min(start,
    1743             :                 list_first_entry(&local_purge_list,
    1744             :                         struct vmap_area, list)->va_start);
    1745             : 
    1746           5 :         end = max(end,
    1747             :                 list_last_entry(&local_purge_list,
    1748             :                         struct vmap_area, list)->va_end);
    1749             : 
    1750           5 :         flush_tlb_kernel_range(start, end);
    1751           5 :         resched_threshold = lazy_max_pages() << 1;
    1752             : 
    1753           5 :         spin_lock(&free_vmap_area_lock);
    1754          10 :         list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
    1755           5 :                 unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
    1756           5 :                 unsigned long orig_start = va->va_start;
    1757           5 :                 unsigned long orig_end = va->va_end;
    1758             :                 /* Snapshot the range first: va is reassigned by the merge below. */
    1759             :                 /*
    1760             :                  * Finally insert or merge lazily-freed area. It is
    1761             :                  * detached and there is no need to "unlink" it from
    1762             :                  * anything.
    1763             :                  */
    1764           5 :                 va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
    1765             :                                 &free_vmap_area_list);
    1766             : 
    1767           5 :                 if (!va)
    1768           0 :                         continue;
    1769             : 
    1770          10 :                 if (is_vmalloc_or_module_addr((void *)orig_start))
    1771             :                         kasan_release_vmalloc(orig_start, orig_end,
    1772             :                                               va->va_start, va->va_end);
    1773             : 
    1774          10 :                 atomic_long_sub(nr, &vmap_lazy_nr);
    1775           5 :                 num_purged_areas++;
    1776             :                 /* Backlog is below twice lazy_max_pages(): allow a reschedule point. */
    1777           5 :                 if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
    1778           5 :                         cond_resched_lock(&free_vmap_area_lock);
    1779             :         }
    1780             :         spin_unlock(&free_vmap_area_lock);
    1781             : 
    1782             : out:
    1783           5 :         trace_purge_vmap_area_lazy(start, end, num_purged_areas);
    1784           5 :         return num_purged_areas > 0;
    1785             : }
    1786             : 
    1787             : /*
    1788             :  * Under vmap_purge_lock: purge fragmented per-CPU blocks, then all lazy areas.
    1789             :  */
    1790           0 : static void purge_vmap_area_lazy(void)
    1791             : {
    1792           0 :         mutex_lock(&vmap_purge_lock);
    1793           0 :         purge_fragmented_blocks_allcpus();
    1794           0 :         __purge_vmap_area_lazy(ULONG_MAX, 0);
    1795           0 :         mutex_unlock(&vmap_purge_lock);
    1796           0 : }
    1797             : /* Purge until vmap_lazy_nr is no longer above lazy_max_pages(); presumably the drain_vmap_work handler. */
    1798           5 : static void drain_vmap_area_work(struct work_struct *work)
    1799             : {
    1800             :         unsigned long nr_lazy;
    1801             : 
    1802             :         do {
    1803           5 :                 mutex_lock(&vmap_purge_lock);
    1804           5 :                 __purge_vmap_area_lazy(ULONG_MAX, 0);
    1805           5 :                 mutex_unlock(&vmap_purge_lock);
    1806             : 
    1807             :                 /* Recheck if further work is required. */
    1808           5 :                 nr_lazy = atomic_long_read(&vmap_lazy_nr);
    1809           5 :         } while (nr_lazy > lazy_max_pages());
    1810           5 : }
    1811             : 
    1812             : /*
    1813             :  * Free a vmap area, caller ensuring that the area has been unmapped,
    1814             :  * unlinked and flush_cache_vunmap had been called for the correct
    1815             :  * range previously. The area is only queued here; a later purge frees it.
    1816             :  */
    1817         258 : static void free_vmap_area_noflush(struct vmap_area *va)
    1818             : {
    1819         258 :         unsigned long nr_lazy_max = lazy_max_pages();
    1820         258 :         unsigned long va_start = va->va_start;
    1821             :         unsigned long nr_lazy;
    1822             : 
    1823         516 :         if (WARN_ON_ONCE(!list_empty(&va->list)))
    1824             :                 return;
    1825             :         /* Account the area's pages in the lazy counter before queueing it. */
    1826         516 :         nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
    1827             :                                 PAGE_SHIFT, &vmap_lazy_nr);
    1828             : 
    1829             :         /*
    1830             :          * Merge or place it to the purge tree/list.
    1831             :          */
    1832         258 :         spin_lock(&purge_vmap_area_lock);
    1833         258 :         merge_or_add_vmap_area(va,
    1834             :                 &purge_vmap_area_root, &purge_vmap_area_list);
    1835         258 :         spin_unlock(&purge_vmap_area_lock);
    1836             : 
    1837         258 :         trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
    1838             : 
    1839             :         /* After this point, we may free va at any time */
    1840         258 :         if (unlikely(nr_lazy > nr_lazy_max))
    1841             :                 schedule_work(&drain_vmap_work);
    1842             : }
    1843             : 
    1844             : /*
    1845             :  * Unmap a vmap area (cache flush, then page-table teardown) and lazily free it.
    1846             :  * With debug_pagealloc enabled its TLB range is also flushed right away. */
    1847         258 : static void free_unmap_vmap_area(struct vmap_area *va)
    1848             : {
    1849         258 :         flush_cache_vunmap(va->va_start, va->va_end);
    1850         516 :         vunmap_range_noflush(va->va_start, va->va_end);
    1851             :         if (debug_pagealloc_enabled_static())
    1852             :                 flush_tlb_kernel_range(va->va_start, va->va_end);
    1853             : 
    1854         258 :         free_vmap_area_noflush(va);
    1855         258 : }
    1856             : /* Look up the vmap_area for addr in vmap_area_root, taking vmap_area_lock around the search. */
    1857           0 : struct vmap_area *find_vmap_area(unsigned long addr)
    1858             : {
    1859             :         struct vmap_area *va;
    1860             : 
    1861          16 :         spin_lock(&vmap_area_lock);
    1862          16 :         va = __find_vmap_area(addr, &vmap_area_root);
    1863          16 :         spin_unlock(&vmap_area_lock);
    1864             : 
    1865           0 :         return va;
    1866             : }
    1867             : /* Look up the vmap_area for addr and, if found, unlink it before vmap_area_lock is dropped. */
    1868         258 : static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
    1869             : {
    1870             :         struct vmap_area *va;
    1871             : 
    1872         258 :         spin_lock(&vmap_area_lock);
    1873         258 :         va = __find_vmap_area(addr, &vmap_area_root);
    1874         258 :         if (va)
    1875             :                 unlink_va(va, &vmap_area_root);
    1876         258 :         spin_unlock(&vmap_area_lock);
    1877             : 
    1878         258 :         return va;
    1879             : }
    1880             : 
    1881             : /*** Per cpu kva allocator ***/
    1882             : 
    1883             : /*
    1884             :  * vmap space is limited especially on 32 bit architectures. Ensure there is
    1885             :  * room for at least 16 percpu vmap blocks per CPU.
    1886             :  */
    1887             : /*
    1888             :  * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
    1889             :  * to #define VMALLOC_SPACE             (VMALLOC_END-VMALLOC_START). Guess
    1890             :  * instead (we just need a rough idea)
    1891             :  */
    1892             : #if BITS_PER_LONG == 32
    1893             : #define VMALLOC_SPACE           (128UL*1024*1024)
    1894             : #else
    1895             : #define VMALLOC_SPACE           (128UL*1024*1024*1024)
    1896             : #endif
    1897             : 
    1898             : #define VMALLOC_PAGES           (VMALLOC_SPACE / PAGE_SIZE)
    1899             : #define VMAP_MAX_ALLOC          BITS_PER_LONG   /* in pages; 256K with 4K pages when BITS_PER_LONG is 64 */
    1900             : #define VMAP_BBMAP_BITS_MAX     1024    /* 4MB with 4K pages */
    1901             : #define VMAP_BBMAP_BITS_MIN     (VMAP_MAX_ALLOC*2)
    1902             : #define VMAP_MIN(x, y)          ((x) < (y) ? (x) : (y)) /* can't use min() */
    1903             : #define VMAP_MAX(x, y)          ((x) > (y) ? (x) : (y)) /* can't use max() */
    1904             : #define VMAP_BBMAP_BITS         \
    1905             :                 VMAP_MIN(VMAP_BBMAP_BITS_MAX,   \
    1906             :                 VMAP_MAX(VMAP_BBMAP_BITS_MIN,   \
    1907             :                         VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
    1908             : /* Bytes of address space covered by one vmap block. */
    1909             : #define VMAP_BLOCK_SIZE         (VMAP_BBMAP_BITS * PAGE_SIZE)
    1910             : /* Area flags; new_vmap_block() passes VMAP_RAM|VMAP_BLOCK to alloc_vmap_area(). */
    1911             : #define VMAP_RAM                0x1 /* indicates vm_map_ram area */
    1912             : #define VMAP_BLOCK              0x2 /* mark out the vmap_block sub-type */
    1913             : #define VMAP_FLAGS_MASK         0x3
    1914             : 
    1915             : struct vmap_block_queue {
    1916             :         spinlock_t lock;
    1917             :         struct list_head free;
    1918             : 
    1919             :         /*
    1920             :          * Index of vmap_blocks, selected through addr_to_vb_xa(). An xarray
    1921             :          * requires extra memory to be allocated dynamically. If that is an
    1922             :          * issue, we can use an rb-tree instead.
    1923             :          */
    1924             :         struct xarray vmap_blocks;
    1925             : };
    1926             : /* One VMAP_BLOCK_SIZE area of vmap space; free and dirty are counted in pages. */
    1927             : struct vmap_block {
    1928             :         spinlock_t lock;
    1929             :         struct vmap_area *va;
    1930             :         unsigned long free, dirty;
    1931             :         DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
    1932             :         unsigned long dirty_min, dirty_max; /*< dirty range, as page offsets in the block */
    1933             :         struct list_head free_list;
    1934             :         struct rcu_head rcu_head;
    1935             :         struct list_head purge;
    1936             : };
    1937             : 
    1938             : /* Per-CPU queue of free and dirty vmap blocks, for allocation and flushing purposes */
    1939             : static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
    1940             : 
    1941             : /*
    1942             :  * To get fast access to any "vmap_block" associated with a
    1943             :  * specific address, we use a hash.
    1944             :  *
    1945             :  * A per-cpu vmap_block_queue is used in two ways: it serializes
    1946             :  * access to the free block chains among CPUs (alloc path) and it
    1947             :  * also acts as a vmap_block hash (alloc/free paths). It means we
    1948             :  * overload it, since we already have the per-cpu array which is
    1949             :  * used as a hash table. When used as a hash, the 'cpu' passed to
    1950             :  * per_cpu() is not actually a CPU but rather a hash index.
    1951             :  *
    1952             :  * The hash function is addr_to_vb_xa(), which hashes any address
    1953             :  * to the index (in the hash) it belongs to. It then uses the
    1954             :  * per_cpu() macro to access the array at the generated index.
    1955             :  *
    1956             :  * An example:
    1957             :  *
    1958             :  *  CPU_1  CPU_2  CPU_0
    1959             :  *    |      |      |
    1960             :  *    V      V      V
    1961             :  * 0     10     20     30     40     50     60
    1962             :  * |------|------|------|------|------|------|...<vmap address space>
    1963             :  *   CPU0   CPU1   CPU2   CPU0   CPU1   CPU2
    1964             :  *
    1965             :  * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
    1966             :  *   it accesses: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
    1967             :  *
    1968             :  * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
    1969             :  *   it accesses: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
    1970             :  *
    1971             :  * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
    1972             :  *   it accesses: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
    1973             :  *
    1974             :  * This technique almost always avoids lock contention on insert/remove,
    1975             :  * however xarray spinlocks protect against any contention that remains.
    1976             :  */
    1977             : static struct xarray *
    1978             : addr_to_vb_xa(unsigned long addr)
    1979             : {
    1980           0 :         int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
    1981             :         /* NOTE(review): index is used as a CPU id; assumes possible ids are 0..num_possible_cpus()-1 - confirm. */
    1982           0 :         return &per_cpu(vmap_block_queue, index).vmap_blocks;
    1983             : }
    1984             : 
    1985             : /*
    1986             :  * We should probably have a fallback mechanism to allocate virtual memory
    1987             :  * out of partially filled vmap blocks. However vmap block sizing should be
    1988             :  * fairly reasonable according to the vmalloc size, so it shouldn't be a
    1989             :  * big problem.
    1990             :  */
    1991             : /* Block number of addr: its offset from VMALLOC_START (masked down by VMAP_BLOCK_SIZE-1) in VMAP_BLOCK_SIZE units. */
    1992             : static unsigned long addr_to_vb_idx(unsigned long addr)
    1993             : {
    1994           0 :         addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
    1995           0 :         addr /= VMAP_BLOCK_SIZE;
    1996             :         return addr;
    1997             : }
    1998             : /* Address pages_off pages past va_start; BUGs if that address has a different block index than va_start. */
    1999           0 : static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
    2000             : {
    2001             :         unsigned long addr;
    2002             : 
    2003           0 :         addr = va_start + (pages_off << PAGE_SHIFT);
    2004           0 :         BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
    2005           0 :         return (void *)addr;
    2006             : }
    2007             : 
    2008             : /**
    2009             :  * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
    2010             :  *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
    2011             :  * @order:    log2 of the number of pages to occupy in the newly allocated block
    2012             :  * @gfp_mask: flags for the page level allocator
    2013             :  *
    2014             :  * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
    2015             :  */
    2016           0 : static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
    2017             : {
    2018             :         struct vmap_block_queue *vbq;
    2019             :         struct vmap_block *vb;
    2020             :         struct vmap_area *va;
    2021             :         struct xarray *xa;
    2022             :         unsigned long vb_idx;
    2023             :         int node, err;
    2024             :         void *vaddr;
    2025             : 
    2026           0 :         node = numa_node_id();
    2027             : 
    2028           0 :         vb = kmalloc_node(sizeof(struct vmap_block),
    2029             :                         gfp_mask & GFP_RECLAIM_MASK, node);
    2030           0 :         if (unlikely(!vb))
    2031             :                 return ERR_PTR(-ENOMEM);
    2032             :         /* Reserve one block-sized, block-aligned area of vmalloc space. */
    2033           0 :         va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
    2034           0 :                                         VMALLOC_START, VMALLOC_END,
    2035             :                                         node, gfp_mask,
    2036             :                                         VMAP_RAM|VMAP_BLOCK);
    2037           0 :         if (IS_ERR(va)) {
    2038           0 :                 kfree(vb);
    2039           0 :                 return ERR_CAST(va);
    2040             :         }
    2041             :         /* The caller's 2^order pages are taken from the very start of the block. */
    2042           0 :         vaddr = vmap_block_vaddr(va->va_start, 0);
    2043           0 :         spin_lock_init(&vb->lock);
    2044           0 :         vb->va = va;
    2045             :         /* At least something should be left free */
    2046           0 :         BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
    2047           0 :         bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
    2048           0 :         vb->free = VMAP_BBMAP_BITS - (1UL << order);
    2049           0 :         vb->dirty = 0;
    2050           0 :         vb->dirty_min = VMAP_BBMAP_BITS;
    2051           0 :         vb->dirty_max = 0;
    2052           0 :         bitmap_set(vb->used_map, 0, (1UL << order));
    2053           0 :         INIT_LIST_HEAD(&vb->free_list);
    2054             :         /* Insert the block into the xarray selected by hashing its start address. */
    2055           0 :         xa = addr_to_vb_xa(va->va_start);
    2056           0 :         vb_idx = addr_to_vb_idx(va->va_start);
    2057           0 :         err = xa_insert(xa, vb_idx, vb, gfp_mask);
    2058           0 :         if (err) {
    2059           0 :                 kfree(vb);
    2060           0 :                 free_vmap_area(va);
    2061           0 :                 return ERR_PTR(err);
    2062             :         }
    2063             :         /* Add it to the tail of the current CPU's free list, which vb_alloc() walks. */
    2064           0 :         vbq = raw_cpu_ptr(&vmap_block_queue);
    2065           0 :         spin_lock(&vbq->lock);
    2066           0 :         list_add_tail_rcu(&vb->free_list, &vbq->free);
    2067           0 :         spin_unlock(&vbq->lock);
    2068             : 
    2069           0 :         return vaddr;
    2070             : }
    2071             : /* Tear down a vmap_block: erase it from its xarray, unlink and lazily free its area, kfree_rcu() it. */
    2072           0 : static void free_vmap_block(struct vmap_block *vb)
    2073             : {
    2074             :         struct vmap_block *tmp;
    2075             :         struct xarray *xa;
    2076             : 
    2077           0 :         xa = addr_to_vb_xa(vb->va->va_start);
    2078           0 :         tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
    2079           0 :         BUG_ON(tmp != vb);
    2080             : 
    2081           0 :         spin_lock(&vmap_area_lock);
    2082           0 :         unlink_va(vb->va, &vmap_area_root);
    2083           0 :         spin_unlock(&vmap_area_lock);
    2084             : 
    2085           0 :         free_vmap_area_noflush(vb->va);
    2086           0 :         kfree_rcu(vb, rcu_head);
    2087           0 : }
    2088             : /* Retire blocks on cpu's free list that hold only free and dirty pages (but are not fully dirty). */
    2089           0 : static void purge_fragmented_blocks(int cpu)
    2090             : {
    2091           0 :         LIST_HEAD(purge);
    2092             :         struct vmap_block *vb;
    2093             :         struct vmap_block *n_vb;
    2094           0 :         struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
    2095             : 
    2096             :         rcu_read_lock();
    2097           0 :         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
    2098             :                 /* Unlocked pre-check; the same test is repeated below under vb->lock. */
    2099           0 :                 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
    2100           0 :                         continue;
    2101             : 
    2102           0 :                 spin_lock(&vb->lock);
    2103           0 :                 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
    2104           0 :                         vb->free = 0; /* prevent further allocs after releasing lock */
    2105           0 :                         vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
    2106           0 :                         vb->dirty_min = 0;
    2107           0 :                         vb->dirty_max = VMAP_BBMAP_BITS;
    2108           0 :                         spin_lock(&vbq->lock);
    2109           0 :                         list_del_rcu(&vb->free_list);
    2110           0 :                         spin_unlock(&vbq->lock);
    2111           0 :                         spin_unlock(&vb->lock);
    2112           0 :                         list_add_tail(&vb->purge, &purge);
    2113             :                 } else
    2114           0 :                         spin_unlock(&vb->lock);
    2115             :         }
    2116             :         rcu_read_unlock();
    2117             :         /* Free the collected blocks after leaving the RCU read-side section. */
    2118           0 :         list_for_each_entry_safe(vb, n_vb, &purge, purge) {
    2119           0 :                 list_del(&vb->purge);
    2120           0 :                 free_vmap_block(vb);
    2121             :         }
    2122           0 : }
    2123             : /* Run purge_fragmented_blocks() for every possible CPU. */
    2124             : static void purge_fragmented_blocks_allcpus(void)
    2125             : {
    2126             :         int cpu;
    2127             : 
    2128           0 :         for_each_possible_cpu(cpu)
    2129           0 :                 purge_fragmented_blocks(cpu);
    2130             : }
    2131             : /* Allocate size bytes (whole pages, at most VMAP_MAX_ALLOC of them) from a vmap block; NULL if size is 0. */
    2132           0 : static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
    2133             : {
    2134             :         struct vmap_block_queue *vbq;
    2135             :         struct vmap_block *vb;
    2136           0 :         void *vaddr = NULL;
    2137             :         unsigned int order;
    2138             : 
    2139           0 :         BUG_ON(offset_in_page(size));
    2140           0 :         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
    2141           0 :         if (WARN_ON(size == 0)) {
    2142             :                 /*
    2143             :                  * Allocating 0 bytes isn't what caller wants since
    2144             :                  * get_order(0) returns funny result. Just warn and terminate
    2145             :                  * early.
    2146             :                  */
    2147             :                 return NULL;
    2148             :         }
    2149           0 :         order = get_order(size);
    2150             :         /* First try the blocks already on this CPU's free list. */
    2151             :         rcu_read_lock();
    2152           0 :         vbq = raw_cpu_ptr(&vmap_block_queue);
    2153           0 :         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
    2154             :                 unsigned long pages_off;
    2155             : 
    2156           0 :                 spin_lock(&vb->lock);
    2157           0 :                 if (vb->free < (1UL << order)) {
    2158           0 :                         spin_unlock(&vb->lock);
    2159           0 :                         continue;
    2160             :                 }
    2161             :                 /* Take 2^order pages from the start of the remaining free space. */
    2162           0 :                 pages_off = VMAP_BBMAP_BITS - vb->free;
    2163           0 :                 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
    2164           0 :                 vb->free -= 1UL << order;
    2165           0 :                 bitmap_set(vb->used_map, pages_off, (1UL << order));
    2166           0 :                 if (vb->free == 0) {
    2167           0 :                         spin_lock(&vbq->lock);
    2168           0 :                         list_del_rcu(&vb->free_list);
    2169           0 :                         spin_unlock(&vbq->lock);
    2170             :                 }
    2171             : 
    2172           0 :                 spin_unlock(&vb->lock);
    2173             :                 break;
    2174             :         }
    2175             : 
    2176             :         rcu_read_unlock();
    2177             : 
    2178             :         /* Allocate new block if nothing was found */
    2179           0 :         if (!vaddr)
    2180           0 :                 vaddr = new_vmap_block(order, gfp_mask);
    2181             : 
    2182             :         return vaddr;
    2183             : }
    2184             : /* Unmap size bytes at addr and mark them dirty in their vmap block; the block is freed once fully dirty. */
    2185           0 : static void vb_free(unsigned long addr, unsigned long size)
    2186             : {
    2187             :         unsigned long offset;
    2188             :         unsigned int order;
    2189             :         struct vmap_block *vb;
    2190             :         struct xarray *xa;
    2191             : 
    2192           0 :         BUG_ON(offset_in_page(size));
    2193           0 :         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
    2194             : 
    2195           0 :         flush_cache_vunmap(addr, addr + size);
    2196             : 
    2197           0 :         order = get_order(size);
    2198           0 :         offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
    2199             :         /* NOTE(review): the xa_load() result is dereferenced without a NULL check - confirm a block always exists. */
    2200           0 :         xa = addr_to_vb_xa(addr);
    2201           0 :         vb = xa_load(xa, addr_to_vb_idx(addr));
    2202             : 
    2203           0 :         spin_lock(&vb->lock);
    2204           0 :         bitmap_clear(vb->used_map, offset, (1UL << order));
    2205           0 :         spin_unlock(&vb->lock);
    2206             :         /* vb->lock is not held while the range is unmapped. */
    2207           0 :         vunmap_range_noflush(addr, addr + size);
    2208             : 
    2209             :         if (debug_pagealloc_enabled_static())
    2210             :                 flush_tlb_kernel_range(addr, addr + size);
    2211             : 
    2212           0 :         spin_lock(&vb->lock);
    2213             : 
    2214             :         /* Expand dirty range */
    2215           0 :         vb->dirty_min = min(vb->dirty_min, offset);
    2216           0 :         vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
    2217             : 
    2218           0 :         vb->dirty += 1UL << order;
    2219           0 :         if (vb->dirty == VMAP_BBMAP_BITS) {
    2220           0 :                 BUG_ON(vb->free);
    2221           0 :                 spin_unlock(&vb->lock);
    2222           0 :                 free_vmap_block(vb);
    2223             :         } else
    2224           0 :                 spin_unlock(&vb->lock);
    2225           0 : }
    2226             : /* Widen start..end by the dirty ranges of partly dirty per-CPU blocks, then purge and flush the TLB. */
    2227           0 : static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
    2228             : {
    2229             :         int cpu;
    2230             : 
    2231           0 :         if (unlikely(!vmap_initialized))
    2232             :                 return;
    2233             : 
    2234             :         might_sleep();
    2235             : 
    2236           0 :         for_each_possible_cpu(cpu) {
    2237           0 :                 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
    2238             :                 struct vmap_block *vb;
    2239             :                 /* NOTE(review): only vbq->free is walked; blocks vb_alloc() unlinked when full are not scanned - confirm. */
    2240             :                 rcu_read_lock();
    2241           0 :                 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
    2242           0 :                         spin_lock(&vb->lock);
    2243           0 :                         if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
    2244           0 :                                 unsigned long va_start = vb->va->va_start;
    2245             :                                 unsigned long s, e;
    2246             : 
    2247           0 :                                 s = va_start + (vb->dirty_min << PAGE_SHIFT);
    2248           0 :                                 e = va_start + (vb->dirty_max << PAGE_SHIFT);
    2249             : 
    2250           0 :                                 start = min(s, start);
    2251           0 :                                 end   = max(e, end);
    2252             : 
    2253           0 :                                 flush = 1;
    2254             :                         }
    2255           0 :                         spin_unlock(&vb->lock);
    2256             :                 }
    2257             :                 rcu_read_unlock();
    2258             :         }
    2259             :         /* If the purge reports nothing purged, do the requested flush here instead. */
    2260           0 :         mutex_lock(&vmap_purge_lock);
    2261           0 :         purge_fragmented_blocks_allcpus();
    2262           0 :         if (!__purge_vmap_area_lazy(start, end) && flush)
    2263           0 :                 flush_tlb_kernel_range(start, end);
    2264           0 :         mutex_unlock(&vmap_purge_lock);
    2265             : }
    2266             : 
    2267             : /**
    2268             :  * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
    2269             :  *
    2270             :  * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
    2271             :  * to amortize TLB flushing overheads. What this means is that any page you
    2272             :  * have now, may, in a former life, have been mapped into kernel virtual
    2273             :  * address by the vmap layer and so there might be some CPUs with TLB entries
    2274             :  * still referencing that page (additional to the regular 1:1 kernel mapping).
    2275             :  *
    2276             :  * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
    2277             :  * be sure that none of the pages we have control over will have any aliases
    2278             :  * from the vmap layer.
    2279             :  */
    2280           0 : void vm_unmap_aliases(void)
    2281             : {
    2282           0 :         unsigned long start = ULONG_MAX, end = 0;
    2283           0 :         int flush = 0;
    2284             :         /* Start from an empty range with no forced flush; the helper widens it as needed. */
    2285           0 :         _vm_unmap_aliases(start, end, flush);
    2286           0 : }
    2287             : EXPORT_SYMBOL_GPL(vm_unmap_aliases);
    2288             : 
    2289             : /**
    2290             :  * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
    2291             :  * @mem: the pointer returned by vm_map_ram
    2292             :  * @count: the count passed to that vm_map_ram call (partial unmaps are not supported)
    2293             :  */
    2294           0 : void vm_unmap_ram(const void *mem, unsigned int count)
    2295             : {
    2296           0 :         unsigned long size = (unsigned long)count << PAGE_SHIFT;
    2297           0 :         unsigned long addr = (unsigned long)kasan_reset_tag(mem);
    2298             :         struct vmap_area *va;
    2299             : 
    2300             :         might_sleep();
    2301           0 :         BUG_ON(!addr);
    2302           0 :         BUG_ON(addr < VMALLOC_START);
    2303           0 :         BUG_ON(addr > VMALLOC_END);
    2304           0 :         BUG_ON(!PAGE_ALIGNED(addr));
    2305             : 
    2306           0 :         kasan_poison_vmalloc(mem, size);
    2307             : 
    2308           0 :         if (likely(count <= VMAP_MAX_ALLOC)) {
    2309           0 :                 debug_check_no_locks_freed(mem, size);
    2310           0 :                 vb_free(addr, size);
    2311           0 :                 return;
    2312             :         }
    2313             : 
    2314           0 :         va = find_unlink_vmap_area(addr);
    2315           0 :         if (WARN_ON_ONCE(!va))
    2316             :                 return;
    2317             : 
    2318           0 :         debug_check_no_locks_freed((void *)va->va_start,
    2319           0 :                                     (va->va_end - va->va_start));
    2320           0 :         free_unmap_vmap_area(va);
    2321             : }
    2322             : EXPORT_SYMBOL(vm_unmap_ram);
    2323             : 
    2324             : /**
    2325             :  * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
    2326             :  * @pages: an array of pointers to the pages to be mapped
    2327             :  * @count: number of pages
    2328             :  * @node: prefer to allocate data structures on this node
    2329             :  *
    2330             :  * If you use this function for no more than VMAP_MAX_ALLOC pages, it can be
    2331             :  * faster than vmap(), which makes it a good choice.  But if you mix long-lived
    2332             :  * and short-lived objects with vm_map_ram(), it can consume lots of address
    2333             :  * space through fragmentation (especially on a 32-bit machine), and you may
    2334             :  * eventually see failures.  Please use this function for short-lived objects.
    2335             :  *
    2336             :  * Returns: a pointer to the address that has been mapped, or %NULL on failure
    2337             :  */
    2338           0 : void *vm_map_ram(struct page **pages, unsigned int count, int node)
    2339             : {
    2340           0 :         unsigned long size = (unsigned long)count << PAGE_SHIFT;
    2341             :         unsigned long addr;
    2342             :         void *mem;
    2343             : 
    2344           0 :         if (likely(count <= VMAP_MAX_ALLOC)) {
    2345           0 :                 mem = vb_alloc(size, GFP_KERNEL);
    2346           0 :                 if (IS_ERR(mem))
    2347             :                         return NULL;
    2348             :                 addr = (unsigned long)mem;
    2349             :         } else {
    2350             :                 struct vmap_area *va;
    2351           0 :                 va = alloc_vmap_area(size, PAGE_SIZE,
    2352           0 :                                 VMALLOC_START, VMALLOC_END,
    2353             :                                 node, GFP_KERNEL, VMAP_RAM);
    2354           0 :                 if (IS_ERR(va))
    2355             :                         return NULL;
    2356             : 
    2357           0 :                 addr = va->va_start;
    2358           0 :                 mem = (void *)addr;
    2359             :         }
    2360             : 
    2361           0 :         if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
    2362             :                                 pages, PAGE_SHIFT) < 0) {
    2363           0 :                 vm_unmap_ram(mem, count);
    2364           0 :                 return NULL;
    2365             :         }
    2366             : 
    2367             :         /*
    2368             :          * Mark the pages as accessible, now that they are mapped.
    2369             :          * With hardware tag-based KASAN, marking is skipped for
    2370             :          * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
    2371             :          */
    2372             :         mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
    2373             : 
    2374             :         return mem;
    2375             : }
    2376             : EXPORT_SYMBOL(vm_map_ram);
    2377             : 
    2378             : static struct vm_struct *vmlist __initdata;
    2379             : 
    2380             : static inline unsigned int vm_area_page_order(struct vm_struct *vm)
    2381             : {
    2382             : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
    2383             :         return vm->page_order;
    2384             : #else
    2385             :         return 0;
    2386             : #endif
    2387             : }
    2388             : 
    2389         274 : static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
    2390             : {
    2391             : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
    2392             :         vm->page_order = order;
    2393             : #else
    2394         274 :         BUG_ON(order != 0);
    2395             : #endif
    2396         274 : }
    2397             : 
    2398             : /**
    2399             :  * vm_area_add_early - add vmap area early during boot
    2400             :  * @vm: vm_struct to add
    2401             :  *
    2402             :  * This function is used to add a fixed kernel vm area to vmlist before
    2403             :  * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
    2404             :  * should contain proper values and the other fields should be zero.
    2405             :  *
    2406             :  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
    2407             :  */
    2408           0 : void __init vm_area_add_early(struct vm_struct *vm)
    2409             : {
    2410             :         struct vm_struct *tmp, **p;
    2411             : 
    2412           0 :         BUG_ON(vmap_initialized);
    2413           0 :         for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
    2414           0 :                 if (tmp->addr >= vm->addr) {
    2415           0 :                         BUG_ON(tmp->addr < vm->addr + vm->size);
    2416             :                         break;
    2417             :                 } else
    2418           0 :                         BUG_ON(tmp->addr + tmp->size > vm->addr);
    2419             :         }
    2420           0 :         vm->next = *p;
    2421           0 :         *p = vm;
    2422           0 : }
    2423             : 
    2424             : /**
    2425             :  * vm_area_register_early - register vmap area early during boot
    2426             :  * @vm: vm_struct to register
    2427             :  * @align: requested alignment
    2428             :  *
    2429             :  * This function is used to register a kernel vm area before
    2430             :  * vmalloc_init() is called.  @vm->size and @vm->flags should contain
    2431             :  * proper values on entry and other fields should be zero.  On return,
    2432             :  * vm->addr contains the allocated address.
    2433             :  *
    2434             :  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
    2435             :  */
    2436           0 : void __init vm_area_register_early(struct vm_struct *vm, size_t align)
    2437             : {
    2438           0 :         unsigned long addr = ALIGN(VMALLOC_START, align);
    2439             :         struct vm_struct *cur, **p;
    2440             : 
    2441           0 :         BUG_ON(vmap_initialized);
    2442             : 
    2443           0 :         for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
    2444           0 :                 if ((unsigned long)cur->addr - addr >= vm->size)
    2445             :                         break;
    2446           0 :                 addr = ALIGN((unsigned long)cur->addr + cur->size, align);
    2447             :         }
    2448             : 
    2449           0 :         BUG_ON(addr > VMALLOC_END - vm->size);
    2450           0 :         vm->addr = (void *)addr;
    2451           0 :         vm->next = *p;
    2452           0 :         *p = vm;
    2453           0 :         kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
    2454           0 : }
    2455             : 
    2456           1 : static void vmap_init_free_space(void)
    2457             : {
    2458           1 :         unsigned long vmap_start = 1;
    2459           1 :         const unsigned long vmap_end = ULONG_MAX;
    2460             :         struct vmap_area *busy, *free;
    2461             : 
    2462             :         /*
    2463             :          *     B     F     B     B     B     F
    2464             :          * -|-----|.....|-----|-----|-----|.....|-
    2465             :          *  |           The KVA space           |
    2466             :          *  |<--------------------------------->|
    2467             :          */
    2468           1 :         list_for_each_entry(busy, &vmap_area_list, list) {
    2469           0 :                 if (busy->va_start - vmap_start > 0) {
    2470           0 :                         free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
    2471           0 :                         if (!WARN_ON_ONCE(!free)) {
    2472           0 :                                 free->va_start = vmap_start;
    2473           0 :                                 free->va_end = busy->va_start;
    2474             : 
    2475           0 :                                 insert_vmap_area_augment(free, NULL,
    2476             :                                         &free_vmap_area_root,
    2477             :                                                 &free_vmap_area_list);
    2478             :                         }
    2479             :                 }
    2480             : 
    2481           0 :                 vmap_start = busy->va_end;
    2482             :         }
    2483             : 
    2484           1 :         if (vmap_end - vmap_start > 0) {
    2485           2 :                 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
    2486           1 :                 if (!WARN_ON_ONCE(!free)) {
    2487           1 :                         free->va_start = vmap_start;
    2488           1 :                         free->va_end = vmap_end;
    2489             : 
    2490           1 :                         insert_vmap_area_augment(free, NULL,
    2491             :                                 &free_vmap_area_root,
    2492             :                                         &free_vmap_area_list);
    2493             :                 }
    2494             :         }
    2495           1 : }
    2496             : 
    2497             : static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
    2498             :         struct vmap_area *va, unsigned long flags, const void *caller)
    2499             : {
    2500         274 :         vm->flags = flags;
    2501         274 :         vm->addr = (void *)va->va_start;
    2502         274 :         vm->size = va->va_end - va->va_start;
    2503         274 :         vm->caller = caller;
    2504         274 :         va->vm = vm;
    2505             : }
    2506             : 
    2507             : static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
    2508             :                               unsigned long flags, const void *caller)
    2509             : {
    2510         274 :         spin_lock(&vmap_area_lock);
    2511         274 :         setup_vmalloc_vm_locked(vm, va, flags, caller);
    2512         274 :         spin_unlock(&vmap_area_lock);
    2513             : }
    2514             : 
    2515             : static void clear_vm_uninitialized_flag(struct vm_struct *vm)
    2516             : {
    2517             :         /*
    2518             :          * Before removing VM_UNINITIALIZED,
    2519             :          * we should make sure that vm has proper values.
    2520             :          * Pair with smp_rmb() in show_numa_info().
    2521             :          */
    2522         274 :         smp_wmb();
    2523         274 :         vm->flags &= ~VM_UNINITIALIZED;
    2524             : }
    2525             : 
    2526         274 : static struct vm_struct *__get_vm_area_node(unsigned long size,
    2527             :                 unsigned long align, unsigned long shift, unsigned long flags,
    2528             :                 unsigned long start, unsigned long end, int node,
    2529             :                 gfp_t gfp_mask, const void *caller)
    2530             : {
    2531             :         struct vmap_area *va;
    2532             :         struct vm_struct *area;
    2533         274 :         unsigned long requested_size = size;
    2534             : 
    2535         274 :         BUG_ON(in_interrupt());
    2536         274 :         size = ALIGN(size, 1ul << shift);
    2537         274 :         if (unlikely(!size))
    2538             :                 return NULL;
    2539             : 
    2540         274 :         if (flags & VM_IOREMAP)
    2541           0 :                 align = 1ul << clamp_t(int, get_count_order_long(size),
    2542             :                                        PAGE_SHIFT, IOREMAP_MAX_ORDER);
    2543             : 
    2544         274 :         area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
    2545         274 :         if (unlikely(!area))
    2546             :                 return NULL;
    2547             : 
    2548         274 :         if (!(flags & VM_NO_GUARD))
    2549         274 :                 size += PAGE_SIZE;
    2550             : 
    2551         274 :         va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
    2552         274 :         if (IS_ERR(va)) {
    2553           0 :                 kfree(area);
    2554           0 :                 return NULL;
    2555             :         }
    2556             : 
    2557         274 :         setup_vmalloc_vm(area, va, flags, caller);
    2558             : 
    2559             :         /*
    2560             :          * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
    2561             :          * best-effort approach, as they can be mapped outside of vmalloc code.
    2562             :          * For VM_ALLOC mappings, the pages are marked as accessible after
    2563             :          * getting mapped in __vmalloc_node_range().
    2564             :          * With hardware tag-based KASAN, marking is skipped for
    2565             :          * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
    2566             :          */
    2567         274 :         if (!(flags & VM_ALLOC))
    2568             :                 area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
    2569             :                                                     KASAN_VMALLOC_PROT_NORMAL);
    2570             : 
    2571             :         return area;
    2572             : }
    2573             : 
    2574           0 : struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
    2575             :                                        unsigned long start, unsigned long end,
    2576             :                                        const void *caller)
    2577             : {
    2578           0 :         return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
    2579             :                                   NUMA_NO_NODE, GFP_KERNEL, caller);
    2580             : }
    2581             : 
    2582             : /**
    2583             :  * get_vm_area - reserve a contiguous kernel virtual area
    2584             :  * @size:        size of the area
    2585             :  * @flags:       %VM_IOREMAP for I/O mappings or VM_ALLOC
    2586             :  *
    2587             :  * Search for an area of @size in the kernel virtual mapping area,
    2588             :  * and reserve it for our purposes.  Returns the area descriptor
    2589             :  * on success or %NULL on failure.
    2590             :  *
    2591             :  * Return: the area descriptor on success or %NULL on failure.
    2592             :  */
    2593           0 : struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
    2594             : {
    2595           0 :         return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
    2596           0 :                                   VMALLOC_START, VMALLOC_END,
    2597             :                                   NUMA_NO_NODE, GFP_KERNEL,
    2598           0 :                                   __builtin_return_address(0));
    2599             : }
    2600             : 
    2601           0 : struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
    2602             :                                 const void *caller)
    2603             : {
    2604           0 :         return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
    2605           0 :                                   VMALLOC_START, VMALLOC_END,
    2606             :                                   NUMA_NO_NODE, GFP_KERNEL, caller);
    2607             : }
    2608             : 
    2609             : /**
    2610             :  * find_vm_area - find a contiguous kernel virtual area
    2611             :  * @addr:         base address
    2612             :  *
    2613             :  * Search for the kernel VM area starting at @addr, and return it.
    2614             :  * It is up to the caller to do all required locking to keep the returned
    2615             :  * pointer valid.
    2616             :  *
    2617             :  * Return: the area descriptor on success or %NULL on failure.
    2618             :  */
    2619          16 : struct vm_struct *find_vm_area(const void *addr)
    2620             : {
    2621             :         struct vmap_area *va;
    2622             : 
    2623          32 :         va = find_vmap_area((unsigned long)addr);
    2624          16 :         if (!va)
    2625             :                 return NULL;
    2626             : 
    2627          16 :         return va->vm;
    2628             : }
    2629             : 
    2630             : /**
    2631             :  * remove_vm_area - find and remove a contiguous kernel virtual area
    2632             :  * @addr:           base address
    2633             :  *
    2634             :  * Search for the kernel VM area starting at @addr, and remove it.
    2635             :  * This function returns the found VM area, but using it is NOT safe
    2636             :  * on SMP machines, except for its size or flags.
    2637             :  *
    2638             :  * Return: the area descriptor on success or %NULL on failure.
    2639             :  */
    2640         258 : struct vm_struct *remove_vm_area(const void *addr)
    2641             : {
    2642             :         struct vmap_area *va;
    2643             :         struct vm_struct *vm;
    2644             : 
    2645             :         might_sleep();
    2646             : 
    2647         258 :         if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
    2648             :                         addr))
    2649             :                 return NULL;
    2650             : 
    2651         258 :         va = find_unlink_vmap_area((unsigned long)addr);
    2652         258 :         if (!va || !va->vm)
    2653             :                 return NULL;
    2654         258 :         vm = va->vm;
    2655             : 
    2656         258 :         debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
    2657         258 :         debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
    2658         258 :         kasan_free_module_shadow(vm);
    2659         258 :         kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));
    2660             : 
    2661         258 :         free_unmap_vmap_area(va);
    2662         258 :         return vm;
    2663             : }
    2664             : 
    2665           0 : static inline void set_area_direct_map(const struct vm_struct *area,
    2666             :                                        int (*set_direct_map)(struct page *page))
    2667             : {
    2668             :         int i;
    2669             : 
    2670             :         /* HUGE_VMALLOC passes small pages to set_direct_map */
    2671           0 :         for (i = 0; i < area->nr_pages; i++)
    2672           0 :                 if (page_address(area->pages[i]))
    2673           0 :                         set_direct_map(area->pages[i]);
    2674           0 : }
    2675             : 
    2676             : /*
    2677             :  * Flush the vm mapping and reset the direct map.
    2678             :  */
    2679           0 : static void vm_reset_perms(struct vm_struct *area)
    2680             : {
    2681           0 :         unsigned long start = ULONG_MAX, end = 0;
    2682           0 :         unsigned int page_order = vm_area_page_order(area);
    2683           0 :         int flush_dmap = 0;
    2684             :         int i;
    2685             : 
    2686             :         /*
    2687             :          * Find the start and end range of the direct mappings to make sure that
    2688             :          * the vm_unmap_aliases() flush includes the direct map.
    2689             :          */
    2690           0 :         for (i = 0; i < area->nr_pages; i += 1U << page_order) {
    2691           0 :                 unsigned long addr = (unsigned long)page_address(area->pages[i]);
    2692             : 
    2693           0 :                 if (addr) {
    2694             :                         unsigned long page_size;
    2695             : 
    2696           0 :                         page_size = PAGE_SIZE << page_order;
    2697           0 :                         start = min(addr, start);
    2698           0 :                         end = max(addr + page_size, end);
    2699           0 :                         flush_dmap = 1;
    2700             :                 }
    2701             :         }
    2702             : 
    2703             :         /*
    2704             :          * Set direct map to something invalid so that it won't be cached if
    2705             :          * there are any accesses after the TLB flush, then flush the TLB and
    2706             :          * reset the direct map permissions to the default.
    2707             :          */
    2708           0 :         set_area_direct_map(area, set_direct_map_invalid_noflush);
    2709           0 :         _vm_unmap_aliases(start, end, flush_dmap);
    2710           0 :         set_area_direct_map(area, set_direct_map_default_noflush);
    2711           0 : }
    2712             : 
    2713           0 : static void delayed_vfree_work(struct work_struct *w)
    2714             : {
    2715           0 :         struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
    2716             :         struct llist_node *t, *llnode;
    2717             : 
    2718           0 :         llist_for_each_safe(llnode, t, llist_del_all(&p->list))
    2719           0 :                 vfree(llnode);
    2720           0 : }
    2721             : 
    2722             : /**
    2723             :  * vfree_atomic - release memory allocated by vmalloc()
    2724             :  * @addr:         memory base address
    2725             :  *
    2726             :  * This one is just like vfree() but can be called in any atomic context
    2727             :  * except NMIs.
    2728             :  */
    2729           0 : void vfree_atomic(const void *addr)
    2730             : {
    2731           0 :         struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
    2732             : 
    2733           0 :         BUG_ON(in_nmi());
    2734           0 :         kmemleak_free(addr);
    2735             : 
    2736             :         /*
    2737             :          * Use raw_cpu_ptr() because this can be called from preemptible
    2738             :          * context. Preemption is absolutely fine here, because the llist_add()
    2739             :          * implementation is lockless, so it works even if we are adding to
    2740             :          * another cpu's list. schedule_work() should be fine with this too.
    2741             :          */
    2742           0 :         if (addr && llist_add((struct llist_node *)addr, &p->list))
    2743           0 :                 schedule_work(&p->wq);
    2744           0 : }
    2745             : 
    2746             : /**
    2747             :  * vfree - Release memory allocated by vmalloc()
    2748             :  * @addr:  Memory base address
    2749             :  *
    2750             :  * Free the virtually contiguous memory area starting at @addr, as obtained
    2751             :  * from one of the vmalloc() family of APIs.  This will usually also free the
    2752             :  * physical memory underlying the virtual allocation, but that memory is
    2753             :  * reference counted, so it will not be freed until the last user goes away.
    2754             :  *
    2755             :  * If @addr is NULL, no operation is performed.
    2756             :  *
    2757             :  * Context:
    2758             :  * May sleep if called *not* from interrupt context.
    2759             :  * Must not be called in NMI context (strictly speaking, it could be
    2760             :  * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
    2761             :  * conventions for vfree() arch-dependent would be a really bad idea).
    2762             :  */
    2763         258 : void vfree(const void *addr)
    2764             : {
    2765             :         struct vm_struct *vm;
    2766             :         int i;
    2767             : 
    2768         258 :         if (unlikely(in_interrupt())) {
    2769           0 :                 vfree_atomic(addr);
    2770           0 :                 return;
    2771             :         }
    2772             : 
    2773         258 :         BUG_ON(in_nmi());
    2774         258 :         kmemleak_free(addr);
    2775             :         might_sleep();
    2776             : 
    2777         258 :         if (!addr)
    2778             :                 return;
    2779             : 
    2780         258 :         vm = remove_vm_area(addr);
    2781         258 :         if (unlikely(!vm)) {
    2782           0 :                 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
    2783             :                                 addr);
    2784           0 :                 return;
    2785             :         }
    2786             : 
    2787         258 :         if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
    2788           0 :                 vm_reset_perms(vm);
    2789       42477 :         for (i = 0; i < vm->nr_pages; i++) {
    2790       42477 :                 struct page *page = vm->pages[i];
    2791             : 
    2792       42477 :                 BUG_ON(!page);
    2793       42477 :                 mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
    2794             :                 /*
    2795             :                  * High-order allocs for huge vmallocs are split, so
    2796             :                  * can be freed as an array of order-0 allocations
    2797             :                  */
    2798       42477 :                 __free_page(page);
    2799       42477 :                 cond_resched();
    2800             :         }
    2801         516 :         atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
    2802         258 :         kvfree(vm->pages);
    2803         258 :         kfree(vm);
    2804             : }
    2805             : EXPORT_SYMBOL(vfree);
    2806             : 
    2807             : /**
    2808             :  * vunmap - release virtual mapping obtained by vmap()
    2809             :  * @addr:   memory base address
    2810             :  *
    2811             :  * Free the virtually contiguous memory area starting at @addr,
    2812             :  * which was created from the page array passed to vmap().
    2813             :  *
    2814             :  * Must not be called in interrupt context.
    2815             :  */
    2816           0 : void vunmap(const void *addr)
    2817             : {
    2818             :         struct vm_struct *vm;
    2819             : 
    2820           0 :         BUG_ON(in_interrupt());
    2821             :         might_sleep();
    2822             : 
    2823           0 :         if (!addr)
    2824             :                 return;
    2825           0 :         vm = remove_vm_area(addr);
    2826           0 :         if (unlikely(!vm)) {
    2827           0 :                 WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
    2828             :                                 addr);
    2829           0 :                 return;
    2830             :         }
    2831           0 :         kfree(vm);
    2832             : }
    2833             : EXPORT_SYMBOL(vunmap);
    2834             : 
    2835             : /**
    2836             :  * vmap - map an array of pages into virtually contiguous space
    2837             :  * @pages: array of page pointers
    2838             :  * @count: number of pages to map
    2839             :  * @flags: vm_area->flags
    2840             :  * @prot: page protection for the mapping
    2841             :  *
    2842             :  * Maps @count pages from @pages into contiguous kernel virtual space.
    2843             :  * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
    2844             :  * (which must be kmalloc or vmalloc memory) and one reference per page in it
    2845             :  * are transferred from the caller to vmap(), and will be freed / dropped when
    2846             :  * vfree() is called on the return value.
    2847             :  *
    2848             :  * Return: the address of the area or %NULL on failure
    2849             :  */
    2850           0 : void *vmap(struct page **pages, unsigned int count,
    2851             :            unsigned long flags, pgprot_t prot)
    2852             : {
    2853             :         struct vm_struct *area;
    2854             :         unsigned long addr;
    2855             :         unsigned long size;             /* In bytes */
    2856             : 
    2857             :         might_sleep();
    2858             : 
    2859           0 :         if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
    2860             :                 return NULL;
    2861             : 
    2862             :         /*
    2863             :          * Your top guard is someone else's bottom guard. Not having a top
    2864             :          * guard compromises someone else's mappings too.
    2865             :          */
    2866           0 :         if (WARN_ON_ONCE(flags & VM_NO_GUARD))
    2867           0 :                 flags &= ~VM_NO_GUARD;
    2868             : 
    2869           0 :         if (count > totalram_pages())
    2870             :                 return NULL;
    2871             : 
    2872           0 :         size = (unsigned long)count << PAGE_SHIFT;
    2873           0 :         area = get_vm_area_caller(size, flags, __builtin_return_address(0));
    2874           0 :         if (!area)
    2875             :                 return NULL;
    2876             : 
    2877           0 :         addr = (unsigned long)area->addr;
    2878           0 :         if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
    2879             :                                 pages, PAGE_SHIFT) < 0) {
    2880           0 :                 vunmap(area->addr);
    2881           0 :                 return NULL;
    2882             :         }
    2883             : 
    2884           0 :         if (flags & VM_MAP_PUT_PAGES) {
    2885           0 :                 area->pages = pages;
    2886           0 :                 area->nr_pages = count;
    2887             :         }
    2888           0 :         return area->addr;
    2889             : }
    2890             : EXPORT_SYMBOL(vmap);
    2891             : 
    2892             : #ifdef CONFIG_VMAP_PFN
    2893             : struct vmap_pfn_data {   /* state handed to vmap_pfn_apply() through its @private argument */
    2894             :         unsigned long   *pfns;  /* array of PFNs to install */
    2895             :         pgprot_t        prot;   /* protection used for every PTE */
    2896             :         unsigned int    idx;    /* index of the next @pfns entry to consume */
    2897             : };
    2898             : /*
                     :  * Callback passed to apply_to_page_range() by vmap_pfn(). @private points
                     :  * to a struct vmap_pfn_data; the PFN at its current index is written to
                     :  * @pte as a special PTE with the stored protection, and the index is
                     :  * advanced. @addr is not used.
                     :  *
                     :  * Returns 0 on success, or -EINVAL (after a one-time warning) when
                     :  * pfn_valid() is true for the PFN, in which case nothing is written and
                     :  * the index is left unchanged.
                     :  */
    2899             : static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
    2900             : {
    2901             :         struct vmap_pfn_data *data = private;
    2902             : 
    2903             :         if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
    2904             :                 return -EINVAL;
    2905             :         *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
    2906             :         return 0;
    2907             : }
    2908             : 
    2909             : /**
    2910             :  * vmap_pfn - map an array of PFNs into virtually contiguous space
    2911             :  * @pfns: array of PFNs
    2912             :  * @count: number of pages to map
    2913             :  * @prot: page protection for the mapping
    2914             :  *
    2915             :  * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
    2916             :  * the start address of the mapping.
                     :  *
                     :  * The area is obtained with %VM_IOREMAP and @prot is passed through
                     :  * pgprot_nx(). The per-PTE callback rejects any PFN for which pfn_valid()
                     :  * is true, which makes the whole call fail.
                     :  *
                     :  * Return: the start address of the mapping, or %NULL if the area could not
                     :  * be obtained or apply_to_page_range() reported an error.
    2917             :  */
    2918             : void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
    2919             : {
    2920             :         struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
    2921             :         struct vm_struct *area;
    2922             : 
    2923             :         area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
    2924             :                         __builtin_return_address(0));
    2925             :         if (!area)
    2926             :                 return NULL;
    2927             :         if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
    2928             :                         count * PAGE_SIZE, vmap_pfn_apply, &data)) {
    2929             :                 free_vm_area(area);     /* populating the PTEs failed: drop the area again */
    2930             :                 return NULL;
    2931             :         }
    2932             :         return area->addr;
    2933             : }
    2934             : EXPORT_SYMBOL_GPL(vmap_pfn);
    2935             : #endif /* CONFIG_VMAP_PFN */
    2936             : /*
                     :  * Fill @pages with up to @nr_pages freshly allocated pages and return the
                     :  * number of entries actually filled, which may be less than @nr_pages.
                     :  *
                     :  * Order-0 requests first go through the bulk allocator, with __GFP_NOFAIL
                     :  * masked off, in batches of at most 100 pages. Whatever is still missing
                     :  * afterwards, and every request with @order > 0, is served by the loop at
                     :  * the end, one order-@order allocation at a time. For @order > 0 with
                     :  * __GFP_NOFAIL the flag is dropped for the high-order attempts and only
                     :  * re-applied once the loop has fallen back to order 0.
                     :  */
    2937             : static inline unsigned int
    2938         274 : vm_area_alloc_pages(gfp_t gfp, int nid,
    2939             :                 unsigned int order, unsigned int nr_pages, struct page **pages)
    2940             : {
    2941         274 :         unsigned int nr_allocated = 0;
    2942         274 :         gfp_t alloc_gfp = gfp;
    2943         274 :         bool nofail = false;    /* only set for @order > 0 requests carrying __GFP_NOFAIL */
    2944             :         struct page *page;
    2945             :         int i;
    2946             : 
    2947             :         /*
    2948             :          * For order-0 pages we make use of the bulk allocator. If
    2949             :          * the page array is only partly populated, or not at all,
    2950             :          * because it failed, fall back to the single page allocator,
    2951             :          * which is more permissive.
    2952             :          */
    2953         274 :         if (!order) {
    2954             :                 /* bulk allocator doesn't support nofail req. officially */
    2955         274 :                 gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
    2956             : 
    2957        1144 :                 while (nr_allocated < nr_pages) {
    2958             :                         unsigned int nr, nr_pages_request;
    2959             : 
    2960             :                         /*
    2961             :                          * A maximum allowed request is hard-coded and is 100
    2962             :                          * pages per call. That is done in order to prevent a
    2963             :                          * long preemption off scenario in the bulk-allocator
    2964             :                          * so the range is [1:100].
    2965             :                          */
    2966         596 :                         nr_pages_request = min(100U, nr_pages - nr_allocated);
    2967             : 
    2968             :                         /* memory allocation should consider mempolicy, we can't
    2969             :                          * wrongly use nearest node when nid == NUMA_NO_NODE,
    2970             :                          * otherwise memory may be allocated in only one node,
    2971             :                          * but mempolicy wants to alloc memory by interleaving.
    2972             :                          */
    2973             :                         if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
    2974             :                                 nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
    2975             :                                                         nr_pages_request,
    2976             :                                                         pages + nr_allocated);
    2977             : 
    2978             :                         else
    2979        1192 :                                 nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
    2980             :                                                         nr_pages_request,
    2981         596 :                                                         pages + nr_allocated);
    2982             : 
    2983         596 :                         nr_allocated += nr;
    2984         596 :                         cond_resched();
    2985             : 
    2986             :                         /*
    2987             :                          * If zero or pages were obtained partly,
    2988             :                          * fallback to a single page allocator.
    2989             :                          */
    2990         596 :                         if (nr != nr_pages_request)
    2991             :                                 break;
    2992             :                 }
    2993           0 :         } else if (gfp & __GFP_NOFAIL) {
    2994             :                 /*
    2995             :                  * Higher order nofail allocations are really expensive and
    2996             :                  * potentially dangerous (pre-mature OOM, disruptive reclaim
    2997             :                  * and compaction etc.), so try without __GFP_NOFAIL first.
    2998             :                  */
    2999           0 :                 alloc_gfp &= ~__GFP_NOFAIL;
    3000           0 :                 nofail = true;
    3001             :         }
    3002             : 
    3003             :         /* High-order pages or fallback path if "bulk" fails. */
    3004         274 :         while (nr_allocated < nr_pages) {
    3005           0 :                 if (fatal_signal_pending(current))
    3006             :                         break;  /* NOTE(review): taken regardless of __GFP_NOFAIL, so
                     :                                  * such a caller can see a short count here;
                     :                                  * confirm this is intended. */
    3007             : 
    3008           0 :                 if (nid == NUMA_NO_NODE)
    3009           0 :                         page = alloc_pages(alloc_gfp, order);
    3010             :                 else
    3011           0 :                         page = alloc_pages_node(nid, alloc_gfp, order);
    3012           0 :                 if (unlikely(!page)) {
    3013           0 :                         if (!nofail)
    3014             :                                 break;
    3015             : 
    3016             :                         /* fall back to the zero order allocations */
    3017           0 :                         alloc_gfp |= __GFP_NOFAIL;
    3018           0 :                         order = 0;
    3019           0 :                         continue;
    3020             :                 }
    3021             : 
    3022             :                 /*
    3023             :                  * Higher order allocations must be able to be treated as
    3024             :                  * independent small pages by callers (as they can with
    3025             :                  * small-page vmallocs). Some drivers do their own refcounting
    3026             :                  * on vmalloc_to_page() pages, some use page->mapping,
    3027             :                  * page->lru, etc.
    3028             :                  */
    3029           0 :                 if (order)
    3030           0 :                         split_page(page, order);
    3031             : 
    3032             :                 /*
    3033             :                  * Careful, we allocate and map page-order pages, but
    3034             :                  * tracking is done per PAGE_SIZE page so as to keep the
    3035             :                  * vm_struct APIs independent of the physical/mapped size.
    3036             :                  */
    3037           0 :                 for (i = 0; i < (1U << order); i++)
    3038           0 :                         pages[nr_allocated + i] = page + i;
    3039             : 
    3040           0 :                 cond_resched();
    3041           0 :                 nr_allocated += 1U << order;
    3042             :         }
    3043             : 
    3044         274 :         return nr_allocated;
    3045             : }
    3046             : /*
                     :  * Populate an existing @area: allocate its page array, allocate the
                     :  * backing pages and map them at area->addr with @prot and @page_shift.
                     :  *
                     :  * The page array comes from kmalloc_node() when it fits in PAGE_SIZE and
                     :  * from a nested __vmalloc_node() call otherwise. With __GFP_NOFAIL in
                     :  * @gfp_mask a failed mapping attempt is retried until it succeeds.
                     :  *
                     :  * Returns area->addr on success and NULL on failure. On failure @area is
                     :  * passed to free_vm_area() (page array allocation failed) or area->addr
                     :  * is passed to vfree() (short page allocation or mapping failure).
                     :  */
    3047         274 : static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
    3048             :                                  pgprot_t prot, unsigned int page_shift,
    3049             :                                  int node)
    3050             : {
    3051         274 :         const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;    /* gfp for the page array */
    3052         274 :         bool nofail = gfp_mask & __GFP_NOFAIL;
    3053         274 :         unsigned long addr = (unsigned long)area->addr;
    3054         548 :         unsigned long size = get_vm_area_size(area);
    3055             :         unsigned long array_size;
    3056         274 :         unsigned int nr_small_pages = size >> PAGE_SHIFT;
    3057             :         unsigned int page_order;
    3058             :         unsigned int flags;     /* written and read only under the __GFP_FS/__GFP_IO checks below */
    3059             :         int ret;
    3060             : 
    3061         274 :         array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
    3062             : 
    3063         274 :         if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))        /* no DMA zone requested: add __GFP_HIGHMEM */
    3064         274 :                 gfp_mask |= __GFP_HIGHMEM;
    3065             : 
    3066             :         /* Please note that the recursion is strictly bounded. */
    3067         274 :         if (array_size > PAGE_SIZE) {
    3068           1 :                 area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
    3069             :                                         area->caller);
    3070             :         } else {
    3071         273 :                 area->pages = kmalloc_node(array_size, nested_gfp, node);
    3072             :         }
    3073             : 
    3074         274 :         if (!area->pages) {
    3075           0 :                 warn_alloc(gfp_mask, NULL,
    3076             :                         "vmalloc error: size %lu, failed to allocated page array size %lu",
    3077             :                         nr_small_pages * PAGE_SIZE, array_size);
    3078           0 :                 free_vm_area(area);
    3079           0 :                 return NULL;
    3080             :         }
    3081             : 
    3082         274 :         set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
    3083         274 :         page_order = vm_area_page_order(area);
    3084             : 
    3085         274 :         area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
    3086             :                 node, page_order, nr_small_pages, area->pages);
    3087             : 
    3088         548 :         atomic_long_add(area->nr_pages, &nr_vmalloc_pages);     /* counted before the short-allocation check */
    3089         274 :         if (gfp_mask & __GFP_ACCOUNT) {
    3090             :                 int i;
    3091             : 
    3092           0 :                 for (i = 0; i < area->nr_pages; i++)
    3093           0 :                         mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
    3094             :         }
    3095             : 
    3096             :         /*
    3097             :          * If not enough pages were obtained to accomplish an
    3098             :          * allocation request, free them via vfree() if any.
    3099             :          */
    3100         274 :         if (area->nr_pages != nr_small_pages) {
    3101             :                 /* vm_area_alloc_pages() can also fail due to a fatal signal */
    3102           0 :                 if (!fatal_signal_pending(current))
    3103           0 :                         warn_alloc(gfp_mask, NULL,
    3104             :                                 "vmalloc error: size %lu, page order %u, failed to allocate pages",
    3105           0 :                                 area->nr_pages * PAGE_SIZE, page_order);
    3106             :                 goto fail;
    3107             :         }
    3108             : 
    3109             :         /*
    3110             :          * page tables allocations ignore external gfp mask, enforce it
    3111             :          * by the scope API
    3112             :          */
    3113         274 :         if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
    3114           0 :                 flags = memalloc_nofs_save();
    3115         274 :         else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
    3116           0 :                 flags = memalloc_noio_save();
    3117             : 
    3118             :         do {    /* single pass unless __GFP_NOFAIL: then retry after schedule_timeout_uninterruptible(1) */
    3119         274 :                 ret = vmap_pages_range(addr, addr + size, prot, area->pages,
    3120             :                         page_shift);
    3121         274 :                 if (nofail && (ret < 0))
    3122           0 :                         schedule_timeout_uninterruptible(1);
    3123         274 :         } while (nofail && (ret < 0));
    3124             : 
    3125         274 :         if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)     /* same conditions as the save above */
    3126             :                 memalloc_nofs_restore(flags);
    3127         274 :         else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
    3128             :                 memalloc_noio_restore(flags);
    3129             : 
    3130         274 :         if (ret < 0) {
    3131           0 :                 warn_alloc(gfp_mask, NULL,
    3132             :                         "vmalloc error: size %lu, failed to map pages",
    3133           0 :                         area->nr_pages * PAGE_SIZE);
    3134           0 :                 goto fail;
    3135             :         }
    3136             : 
    3137         274 :         return area->addr;
    3138             : 
    3139             : fail:
    3140           0 :         vfree(area->addr);
    3141           0 :         return NULL;
    3142             : }
    3143             : 
    3144             : /**
    3145             :  * __vmalloc_node_range - allocate virtually contiguous memory
    3146             :  * @size:                 allocation size
    3147             :  * @align:                desired alignment
    3148             :  * @start:                vm area range start
    3149             :  * @end:                  vm area range end
    3150             :  * @gfp_mask:             flags for the page level allocator
    3151             :  * @prot:                 protection mask for the allocated pages
    3152             :  * @vm_flags:             additional vm area flags (e.g. %VM_NO_GUARD)
    3153             :  * @node:                 node to use for allocation or NUMA_NO_NODE
    3154             :  * @caller:               caller's return address
    3155             :  *
    3156             :  * Allocate enough pages to cover @size from the page level
    3157             :  * allocator with @gfp_mask flags. Please note that the full set of gfp
    3158             :  * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
    3159             :  * supported.
    3160             :  * Zone modifiers are not supported. From the reclaim modifiers
    3161             :  * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
    3162             :  * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
    3163             :  * __GFP_RETRY_MAYFAIL are not supported).
    3164             :  *
    3165             :  * __GFP_NOWARN can be used to suppress failures messages.
    3166             :  *
    3167             :  * Map them into contiguous kernel virtual space, using a pagetable
    3168             :  * protection of @prot.
    3169             :  *
                     :  * A zero @size triggers a one-time warning and fails, as does a @size of
                     :  * more pages than totalram_pages(). If vmap_allow_huge is set and
                     :  * @vm_flags contains %VM_ALLOW_HUGE_VMAP, a mapping shift larger than
                     :  * PAGE_SHIFT may be chosen; if anything then fails, the allocation is
                     :  * retried once more with PAGE_SHIFT and the original @align and @size.
                     :  * With __GFP_NOFAIL, a failed vm area reservation is retried after
                     :  * schedule_timeout_uninterruptible(1) instead of failing.
                     :  *
    3170             :  * Return: the address of the area or %NULL on failure
    3171             :  */
    3172         274 : void *__vmalloc_node_range(unsigned long size, unsigned long align,
    3173             :                         unsigned long start, unsigned long end, gfp_t gfp_mask,
    3174             :                         pgprot_t prot, unsigned long vm_flags, int node,
    3175             :                         const void *caller)
    3176             : {
    3177             :         struct vm_struct *area;
    3178             :         void *ret;
    3179         274 :         kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
    3180         274 :         unsigned long real_size = size;         /* caller's values, restored on the PAGE_SHIFT retry */
    3181         274 :         unsigned long real_align = align;
    3182         274 :         unsigned int shift = PAGE_SHIFT;
    3183             : 
    3184         274 :         if (WARN_ON_ONCE(!size))
    3185             :                 return NULL;
    3186             : 
    3187         548 :         if ((size >> PAGE_SHIFT) > totalram_pages()) {
    3188           0 :                 warn_alloc(gfp_mask, NULL,
    3189             :                         "vmalloc error: size %lu, exceeds total pages",
    3190             :                         real_size);
    3191           0 :                 return NULL;
    3192             :         }
    3193             : 
    3194             :         if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
    3195             :                 unsigned long size_per_node;
    3196             : 
    3197             :                 /*
    3198             :                  * Try huge pages. Only try for PAGE_KERNEL allocations,
    3199             :                  * others like modules don't yet expect huge pages in
    3200             :                  * their allocations due to apply_to_page_range not
    3201             :                  * supporting them.
                     :                  *
                     :                  * NOTE(review): the condition above tests only
                     :                  * vmap_allow_huge and VM_ALLOW_HUGE_VMAP, not @prot;
                     :                  * presumably callers set the flag only for PAGE_KERNEL
                     :                  * mappings. Confirm against the callers.
    3202             :                  */
    3203             : 
    3204             :                 size_per_node = size;
    3205             :                 if (node == NUMA_NO_NODE)
    3206             :                         size_per_node /= num_online_nodes();
    3207             :                 if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
    3208             :                         shift = PMD_SHIFT;
    3209             :                 else
    3210             :                         shift = arch_vmap_pte_supported_shift(size_per_node);
    3211             : 
    3212             :                 align = max(real_align, 1UL << shift);
    3213             :                 size = ALIGN(real_size, 1UL << shift);
    3214             :         }
    3215             : 
    3216             : again:
    3217         274 :         area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
    3218             :                                   VM_UNINITIALIZED | vm_flags, start, end, node,
    3219             :                                   gfp_mask, caller);
    3220         274 :         if (!area) {
    3221           0 :                 bool nofail = gfp_mask & __GFP_NOFAIL;
    3222           0 :                 warn_alloc(gfp_mask, NULL,
    3223             :                         "vmalloc error: size %lu, vm_struct allocation failed%s",
    3224             :                         real_size, (nofail) ? ". Retrying." : "");
    3225           0 :                 if (nofail) {
    3226           0 :                         schedule_timeout_uninterruptible(1);
    3227           0 :                         goto again;
    3228             :                 }
    3229             :                 goto fail;
    3230             :         }
    3231             : 
    3232             :         /*
    3233             :          * Prepare arguments for __vmalloc_area_node() and
    3234             :          * kasan_unpoison_vmalloc().
    3235             :          */
    3236         274 :         if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
    3237             :                 if (kasan_hw_tags_enabled()) {
    3238             :                         /*
    3239             :                          * Modify protection bits to allow tagging.
    3240             :                          * This must be done before mapping.
    3241             :                          */
    3242             :                         prot = arch_vmap_pgprot_tagged(prot);
    3243             : 
    3244             :                         /*
    3245             :                          * Skip page_alloc poisoning and zeroing for physical
    3246             :                          * pages backing VM_ALLOC mapping. Memory is instead
    3247             :                          * poisoned and zeroed by kasan_unpoison_vmalloc().
    3248             :                          */
    3249             :                         gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
    3250             :                 }
    3251             : 
    3252             :                 /* Take note that the mapping is PAGE_KERNEL. */
    3253             :                 kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
    3254             :         }
    3255             : 
    3256             :         /* Allocate physical pages and map them into vmalloc space. */
    3257         274 :         ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
    3258         274 :         if (!ret)
    3259             :                 goto fail;
    3260             : 
    3261             :         /*
    3262             :          * Mark the pages as accessible, now that they are mapped.
    3263             :          * The condition for setting KASAN_VMALLOC_INIT should complement the
    3264             :          * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
    3265             :          * to make sure that memory is initialized under the same conditions.
    3266             :          * Tag-based KASAN modes only assign tags to normal non-executable
    3267             :          * allocations, see __kasan_unpoison_vmalloc().
    3268             :          */
    3269         274 :         kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
    3270         548 :         if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
    3271             :             (gfp_mask & __GFP_SKIP_ZERO))
    3272             :                 kasan_flags |= KASAN_VMALLOC_INIT;
    3273             :         /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
    3274         274 :         area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
    3275             : 
    3276             :         /*
    3277             :          * In this function, newly allocated vm_struct has VM_UNINITIALIZED
    3278             :          * flag. It means that vm_struct is not fully initialized.
    3279             :          * Now, it is fully initialized, so remove this flag here.
    3280             :          */
    3281         274 :         clear_vm_uninitialized_flag(area);
    3282             : 
    3283         274 :         size = PAGE_ALIGN(size);        /* kmemleak is given the page-aligned size */
    3284             :         if (!(vm_flags & VM_DEFER_KMEMLEAK))
    3285         274 :                 kmemleak_vmalloc(area, size, gfp_mask);
    3286             : 
    3287         274 :         return area->addr;
    3288             : 
    3289             : fail:
    3290             :         if (shift > PAGE_SHIFT) {       /* a larger shift was tried: retry once with base pages */
    3291             :                 shift = PAGE_SHIFT;
    3292             :                 align = real_align;
    3293             :                 size = real_size;
    3294             :                 goto again;
    3295             :         }
    3296             : 
    3297             :         return NULL;
    3298             : }
    3299             : 
    3300             : /**
    3301             :  * __vmalloc_node - allocate virtually contiguous memory
    3302             :  * @size:           allocation size
    3303             :  * @align:          desired alignment
    3304             :  * @gfp_mask:       flags for the page level allocator
    3305             :  * @node:           node to use for allocation or NUMA_NO_NODE
    3306             :  * @caller:         caller's return address
    3307             :  *
    3308             :  * Allocate enough pages to cover @size from the page level allocator with
    3309             :  * @gfp_mask flags.  Map them into contiguous kernel virtual space.
    3310             :  *
                     :  * This forwards to __vmalloc_node_range() with the range
                     :  * VMALLOC_START..VMALLOC_END, PAGE_KERNEL protection and no additional
                     :  * vm flags.
                     :  *
    3311             :  * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
    3312             :  * and __GFP_NOFAIL are not supported
                     :  * NOTE(review): @gfp_mask is passed unchanged to __vmalloc_node_range(),
                     :  * whose own documentation lists __GFP_NOFAIL as supported; confirm which
                     :  * statement is current.
    3313             :  *
    3314             :  * Any use of gfp flags outside of GFP_KERNEL should be consulted
    3315             :  * with mm people.
    3316             :  *
    3317             :  * Return: pointer to the allocated memory or %NULL on error
    3318             :  */
    3319         258 : void *__vmalloc_node(unsigned long size, unsigned long align,
    3320             :                             gfp_t gfp_mask, int node, const void *caller)
    3321             : {
    3322         258 :         return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
    3323         258 :                                 gfp_mask, PAGE_KERNEL, 0, node, caller);
    3324             : }
    3325             : /*
    3326             :  * The export below exists only for performance analysis and stress
    3327             :  * testing of vmalloc. It is required by the vmalloc test module; do not
    3328             :  * use it for anything else.
    3329             :  */
    3330             : #ifdef CONFIG_TEST_VMALLOC_MODULE
    3331             : EXPORT_SYMBOL_GPL(__vmalloc_node);
    3332             : #endif
    3333             : /*
                     :  * __vmalloc - allocate virtually contiguous memory with caller-chosen
                     :  * gfp flags. Calls __vmalloc_node() with an alignment of 1, NUMA_NO_NODE
                     :  * and this function's return address as the caller, and returns its
                     :  * result unchanged.
                     :  */
    3334           0 : void *__vmalloc(unsigned long size, gfp_t gfp_mask)
    3335             : {
    3336           0 :         return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
    3337           0 :                                 __builtin_return_address(0));
    3338             : }
    3339             : EXPORT_SYMBOL(__vmalloc);
    3340             : 
    3341             : /**
    3342             :  * vmalloc - allocate virtually contiguous memory
    3343             :  * @size:    allocation size
    3344             :  *
    3345             :  * Allocate enough pages to cover @size from the page level
    3346             :  * allocator and map them into contiguous kernel virtual space.
                     :  * The request is made with GFP_KERNEL, an alignment of 1 and
                     :  * NUMA_NO_NODE.
    3347             :  *
    3348             :  * For tight control over page level allocator and protection flags
    3349             :  * use __vmalloc() instead.
    3350             :  *
    3351             :  * Return: pointer to the allocated memory or %NULL on error
    3352             :  */
    3353         102 : void *vmalloc(unsigned long size)
    3354             : {
    3355         102 :         return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
    3356         102 :                                 __builtin_return_address(0));
    3357             : }
    3358             : EXPORT_SYMBOL(vmalloc);
    3359             : 
    3360             : /**
    3361             :  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
    3362             :  * @size:      allocation size
    3363             :  * @gfp_mask:  flags for the page level allocator
    3364             :  *
    3365             :  * Allocate enough pages to cover @size from the page level
    3366             :  * allocator and map them into contiguous kernel virtual space.
    3367             :  * If @size is greater than or equal to PMD_SIZE, allow using
    3368             :  * huge pages for the memory.
                     :  *
                     :  * This calls __vmalloc_node_range() over VMALLOC_START..VMALLOC_END with
                     :  * PAGE_KERNEL protection, %VM_ALLOW_HUGE_VMAP and NUMA_NO_NODE.
    3369             :  *
    3370             :  * Return: pointer to the allocated memory or %NULL on error
    3371             :  */
    3372           0 : void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
    3373             : {
    3374           0 :         return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
    3375           0 :                                     gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
    3376           0 :                                     NUMA_NO_NODE, __builtin_return_address(0));
    3377             : }
    3378             : EXPORT_SYMBOL_GPL(vmalloc_huge);
    3379             : 
    3380             : /**
    3381             :  * vzalloc - allocate virtually contiguous memory with zero fill
    3382             :  * @size:    allocation size
    3383             :  *
    3384             :  * Allocate enough pages to cover @size from the page level
    3385             :  * allocator and map them into contiguous kernel virtual space.
    3386             :  * The memory allocated is set to zero (the request is made with
                     :  * GFP_KERNEL | __GFP_ZERO, an alignment of 1 and NUMA_NO_NODE).
    3387             :  *
    3388             :  * For tight control over page level allocator and protection flags
    3389             :  * use __vmalloc() instead.
    3390             :  *
    3391             :  * Return: pointer to the allocated memory or %NULL on error
    3392             :  */
    3393         155 : void *vzalloc(unsigned long size)
    3394             : {
    3395         155 :         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
    3396         155 :                                 __builtin_return_address(0));
    3397             : }
    3398             : EXPORT_SYMBOL(vzalloc);
    3399             : 
    3400             : /**
    3401             :  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
    3402             :  * @size: allocation size
    3403             :  *
    3404             :  * The resulting memory area is zeroed so it can be mapped to userspace
    3405             :  * without leaking data. It is requested SHMLBA-aligned with VM_USERMAP.
    3406             :  *
    3407             :  * Return: pointer to the allocated memory or %NULL on error
    3408             :  */
    3409           0 : void *vmalloc_user(unsigned long size)
    3410             : {
    3411           0 :         return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
    3412           0 :                                     GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
    3413             :                                     VM_USERMAP, NUMA_NO_NODE,
    3414           0 :                                     __builtin_return_address(0));
    3415             : }
    3416             : EXPORT_SYMBOL(vmalloc_user);
    3417             : 
    3418             : /**
    3419             :  * vmalloc_node - allocate memory on a specific node
    3420             :  * @size:         allocation size
    3421             :  * @node:         NUMA node, passed through to __vmalloc_node()
    3422             :  *
    3423             :  * Allocate enough pages to cover @size from the page level
    3424             :  * allocator and map them into contiguous kernel virtual space.
    3425             :  *
    3426             :  * For tighter control over the page level allocator and protection
    3427             :  * flags, use __vmalloc() instead.
    3428             :  *
    3429             :  * Return: pointer to the allocated memory or %NULL on error
    3430             :  */
    3431           0 : void *vmalloc_node(unsigned long size, int node)
    3432             : {
    3433           0 :         return __vmalloc_node(size, 1, GFP_KERNEL, node,
    3434           0 :                         __builtin_return_address(0));
    3435             : }
    3436             : EXPORT_SYMBOL(vmalloc_node);
    3437             : 
    3438             : /**
    3439             :  * vzalloc_node - allocate memory on a specific node with zero fill
    3440             :  * @size:       allocation size
    3441             :  * @node:       NUMA node, passed through to __vmalloc_node()
    3442             :  *
    3443             :  * Allocate enough pages to cover @size from the page level
    3444             :  * allocator and map them into contiguous kernel virtual space.
    3445             :  * The allocated memory is zeroed (GFP_KERNEL | __GFP_ZERO is used).
    3446             :  *
    3447             :  * Return: pointer to the allocated memory or %NULL on error
    3448             :  */
    3449           0 : void *vzalloc_node(unsigned long size, int node)
    3450             : {
    3451           0 :         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
    3452           0 :                                 __builtin_return_address(0));
    3453             : }
    3454             : EXPORT_SYMBOL(vzalloc_node);
    3455             : 
    3456             : #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
    3457             : #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
    3458             : #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
    3459             : #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
    3460             : #else
    3461             : /*
    3462             :  * 64-bit systems should always have either a DMA or a DMA32 zone. For the
    3463             :  * others, GFP_DMA32 should do the right thing and use the normal zone.
    3464             :  */
    3465             : #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
    3466             : #endif
    3467             : 
    3468             : /**
    3469             :  * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
    3470             :  * @size:       allocation size
    3471             :  *
    3472             :  * Using GFP_VMALLOC32, allocate enough 32-bit PA addressable pages to
    3473             :  * cover @size and map them into contiguous kernel virtual space.
    3474             :  *
    3475             :  * Return: pointer to the allocated memory or %NULL on error
    3476             :  */
    3477           0 : void *vmalloc_32(unsigned long size)
    3478             : {
    3479           0 :         return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
    3480           0 :                         __builtin_return_address(0));
    3481             : }
    3482             : EXPORT_SYMBOL(vmalloc_32);
    3483             : 
    3484             : /**
    3485             :  * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
    3486             :  * @size:            allocation size
    3487             :  *
    3488             :  * The resulting memory area is 32-bit addressable and zeroed so it can be
    3489             :  * mapped to userspace without leaking data; it is flagged VM_USERMAP.
    3490             :  *
    3491             :  * Return: pointer to the allocated memory or %NULL on error
    3492             :  */
    3493           0 : void *vmalloc_32_user(unsigned long size)
    3494             : {
    3495           0 :         return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
    3496           0 :                                     GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
    3497             :                                     VM_USERMAP, NUMA_NO_NODE,
    3498           0 :                                     __builtin_return_address(0));
    3499             : }
    3500             : EXPORT_SYMBOL(vmalloc_32_user);
    3501             : 
    3502             : /*
    3503             :  * Atomically zero @count bytes in the iterator, at most PAGE_SIZE at a
    3504             :  * time, by copying from ZERO_PAGE(0); stops at the first short copy.
    3505             :  * Returns the number of zeroed bytes.
    3506             :  */
    3507           0 : static size_t zero_iter(struct iov_iter *iter, size_t count)
    3508             : {
    3509           0 :         size_t remains = count;
    3510             : 
    3511           0 :         while (remains > 0) {
    3512             :                 size_t num, copied;
    3513             : 
    3514           0 :                 num = remains < PAGE_SIZE ? remains : PAGE_SIZE;
    3515           0 :                 copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
    3516           0 :                 remains -= copied;
    3517             : 
    3518           0 :                 if (copied < num)
    3519             :                         break;
    3520             :         }
    3521             : 
    3522           0 :         return count - remains;
    3523             : }
    3524             : 
    3525             : /*
    3526             :  * Small helper routine: copy @count bytes starting at @addr to @iter, one
    3527             :  * page (or page remainder) at a time. A page that vmalloc_to_page() does
    3528             :  * not find is zero-filled. Stops at the first short copy.
    3529             :  * Returns the number of copied bytes.
    3530             :  */
    3531           0 : static size_t aligned_vread_iter(struct iov_iter *iter,
    3532             :                                  const char *addr, size_t count)
    3533             : {
    3534           0 :         size_t remains = count;
    3535             :         struct page *page;
    3536             : 
    3537           0 :         while (remains > 0) {
    3538             :                 unsigned long offset, length;
    3539           0 :                 size_t copied = 0;
    3540             : 
    3541           0 :                 offset = offset_in_page(addr);
    3542           0 :                 length = PAGE_SIZE - offset;
    3543           0 :                 if (length > remains)
    3544           0 :                         length = remains;
    3545           0 :                 page = vmalloc_to_page(addr);
    3546             :                 /*
    3547             :                  * To access this _mapped_ area safely we would need a lock,
    3548             :                  * but taking one here would add overhead to vmalloc()/vfree()
    3549             :                  * calls for the sake of this rarely used _debug_ interface.
    3550             :                  * Instead, we use a local mapping via
    3551             :                  * copy_page_to_iter_nofault() and accept a small overhead in
    3552             :                  * this access function.
    3553             :                  */
    3554           0 :                 if (page)
    3555           0 :                         copied = copy_page_to_iter_nofault(page, offset,
    3556             :                                                            length, iter);
    3557             :                 else
    3558           0 :                         copied = zero_iter(iter, length);
    3559             : 
    3560           0 :                 addr += copied;
    3561           0 :                 remains -= copied;
    3562             : 
    3563           0 :                 if (copied != length)
    3564             :                         break;
    3565             :         }
    3566             : 
    3567           0 :         return count - remains;
    3568             : }
    3569             : 
    3570             : /*
    3571             :  * Read @count bytes at @addr from a vm_map_ram region into @iter; @flags
    3572             :  * are the vmap_area flags (VMAP_BLOCK selects the per-block bitmap walk).
    3573             :  * Returns the number of copied bytes.
    3574             :  */
    3575           0 : static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
    3576             :                                   size_t count, unsigned long flags)
    3577             : {
    3578             :         char *start;
    3579             :         struct vmap_block *vb;
    3580             :         struct xarray *xa;
    3581             :         unsigned long offset;
    3582             :         unsigned int rs, re;
    3583             :         size_t remains, n;
    3584             : 
    3585             :         /*
    3586             :          * If the area was created by the vm_map_ram() interface directly,
    3587             :          * without being subdivided and having its management delegated to
    3588             :          * a vmap_block, handle it here.
    3589             :          */
    3590           0 :         if (!(flags & VMAP_BLOCK))
    3591           0 :                 return aligned_vread_iter(iter, addr, count);
    3592             : 
    3593           0 :         remains = count;
    3594             : 
    3595             :         /*
    3596             :          * The area is split into regions tracked by a vmap_block: read out
    3597             :          * each used region and zero-fill the holes between regions.
    3598             :          */
    3599           0 :         xa = addr_to_vb_xa((unsigned long) addr);
    3600           0 :         vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
    3601           0 :         if (!vb)
    3602             :                 goto finished_zero;
    3603             : 
    3604           0 :         spin_lock(&vb->lock);
    3605           0 :         if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
    3606           0 :                 spin_unlock(&vb->lock);
    3607             :                 goto finished_zero;
    3608             :         }
    3609             : 
    3610           0 :         for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
    3611             :                 size_t copied;
    3612             : 
    3613           0 :                 if (remains == 0)
    3614             :                         goto finished;
    3615             : 
    3616           0 :                 start = vmap_block_vaddr(vb->va->va_start, rs);
    3617             : 
    3618           0 :                 if (addr < start) {
    3619           0 :                         size_t to_zero = min_t(size_t, start - addr, remains);
    3620           0 :                         size_t zeroed = zero_iter(iter, to_zero);
    3621             : 
    3622           0 :                         addr += zeroed;
    3623           0 :                         remains -= zeroed;
    3624             : 
    3625           0 :                         if (remains == 0 || zeroed != to_zero)
    3626             :                                 goto finished;
    3627             :                 }
    3628             : 
    3629             :                 /* reading may start in the middle of a used region */
    3630           0 :                 offset = offset_in_page(addr);
    3631           0 :                 n = ((re - rs + 1) << PAGE_SHIFT) - offset;
    3632           0 :                 if (n > remains)
    3633           0 :                         n = remains;
    3634             : 
    3635           0 :                 copied = aligned_vread_iter(iter, start + offset, n);
    3636             : 
    3637           0 :                 addr += copied;
    3638           0 :                 remains -= copied;
    3639             : 
    3640           0 :                 if (copied != n)
    3641             :                         goto finished;
    3642             :         }
    3643             : 
    3644           0 :         spin_unlock(&vb->lock);
    3645             : 
    3646             : finished_zero:
    3647             :         /* zero-fill the remaining dirty or free regions */
    3648           0 :         return count - remains + zero_iter(iter, remains);
    3649             : finished:
    3650             :         /* Nothing remains, or we couldn't copy/zero everything */
    3651           0 :         spin_unlock(&vb->lock);
    3652           0 :         return count - remains;
    3653             : }
    3654             : 
    3655             : /**
    3656             :  * vread_iter() - read vmalloc area in a safe way to an iterator.
    3657             :  * @iter:         the iterator to which data should be written.
    3658             :  * @addr:         vm address.
    3659             :  * @count:        number of bytes to be read.
    3660             :  *
    3661             :  * This function walks vmap_area_list, starting at the area returned by
    3662             :  * find_vmap_area_exceed_addr(@addr), and copies data from the areas that
    3663             :  * the range [addr...addr+count) covers to the matching position
    3664             :  * in @iter. If there are memory holes, they'll be zero-filled.
    3665             :  * IOREMAP area is treated as memory hole and no copy is done.
    3666             :  *
    3667             :  * If [addr...addr+count) doesn't intersect any alive vmap_area,
    3668             :  * the whole range is zero-filled into @iter like any other hole.
    3669             :  *
    3670             :  * Note: In usual ops, vread_iter() is never necessary because the caller
    3671             :  * should know vmalloc() area is valid and can use memcpy().
    3672             :  * This is for routines which have to access vmalloc area without
    3673             :  * any information, as /proc/kcore.
    3674             :  *
    3675             :  * Return: number of bytes copied or zeroed into @iter. This is less
    3676             :  * than @count (after @count is clamped so addr+count cannot overflow)
    3677             :  * only when a copy or zero-fill into @iter came up short.
    3678             :  */
    3679           0 : long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
    3680             : {
    3681             :         struct vmap_area *va;
    3682             :         struct vm_struct *vm;
    3683             :         char *vaddr;
    3684             :         size_t n, size, flags, remains;
    3685             : 
    3686           0 :         addr = kasan_reset_tag(addr);
    3687             : 
    3688             :         /* Don't allow overflow */
    3689           0 :         if ((unsigned long) addr + count < count)
    3690           0 :                 count = -(unsigned long) addr;
    3691             : 
    3692           0 :         remains = count;
    3693             : 
    3694           0 :         spin_lock(&vmap_area_lock);
    3695           0 :         va = find_vmap_area_exceed_addr((unsigned long)addr);
    3696           0 :         if (!va)
    3697             :                 goto finished_zero;
    3698             : 
    3699             :         /* no intersects with alive vmap_area */
    3700           0 :         if ((unsigned long)addr + remains <= va->va_start)
    3701             :                 goto finished_zero;
    3702             : 
    3703           0 :         list_for_each_entry_from(va, &vmap_area_list, list) {
    3704             :                 size_t copied;
    3705             : 
    3706           0 :                 if (remains == 0)
    3707             :                         goto finished;
    3708             : 
    3709           0 :                 vm = va->vm;
    3710           0 :                 flags = va->flags & VMAP_FLAGS_MASK;
    3711             :                 /*
    3712             :                  * VMAP_BLOCK indicates a sub-type of vm_map_ram area and
    3713             :                  * needs to be set together with VMAP_RAM.
    3714             :                  */
    3715           0 :                 WARN_ON(flags == VMAP_BLOCK);
    3716             : 
    3717           0 :                 if (!vm && !flags)
    3718           0 :                         continue;
    3719             : 
    3720           0 :                 if (vm && (vm->flags & VM_UNINITIALIZED))
    3721           0 :                         continue;
    3722             : 
    3723             :                 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
    3724           0 :                 smp_rmb();
    3725             : 
    3726           0 :                 vaddr = (char *) va->va_start;
    3727           0 :                 size = vm ? get_vm_area_size(vm) : va_size(va);
    3728             : 
    3729           0 :                 if (addr >= vaddr + size)
    3730           0 :                         continue;
    3731             : 
    3732           0 :                 if (addr < vaddr) {
    3733           0 :                         size_t to_zero = min_t(size_t, vaddr - addr, remains);
    3734           0 :                         size_t zeroed = zero_iter(iter, to_zero);
    3735             : 
    3736           0 :                         addr += zeroed;
    3737           0 :                         remains -= zeroed;
    3738             : 
    3739           0 :                         if (remains == 0 || zeroed != to_zero)
    3740             :                                 goto finished;
    3741             :                 }
    3742             : 
    3743           0 :                 n = vaddr + size - addr;
    3744           0 :                 if (n > remains)
    3745           0 :                         n = remains;
    3746             :                 /* vm can be NULL if flags lacks VMAP_RAM (e.g. the WARN_ON case) */
    3747           0 :                 if (flags & VMAP_RAM)
    3748           0 :                         copied = vmap_ram_vread_iter(iter, addr, n, flags);
    3749           0 :                 else if (!(vm && (vm->flags & VM_IOREMAP)))
    3750           0 :                         copied = aligned_vread_iter(iter, addr, n);
    3751             :                 else /* IOREMAP area is treated as memory hole */
    3752           0 :                         copied = zero_iter(iter, n);
    3753             : 
    3754           0 :                 addr += copied;
    3755           0 :                 remains -= copied;
    3756             : 
    3757           0 :                 if (copied != n)
    3758             :                         goto finished;
    3759             :         }
    3760             : 
    3761             : finished_zero:
    3762           0 :         spin_unlock(&vmap_area_lock);
    3763             :         /* zero-fill memory holes */
    3764           0 :         return count - remains + zero_iter(iter, remains);
    3765             : finished:
    3766             :         /* Nothing remains, or We couldn't copy/zero everything. */
    3767           0 :         spin_unlock(&vmap_area_lock);
    3768             : 
    3769           0 :         return count - remains;
    3770             : }
    3771             : 
    3772             : /**
    3773             :  * remap_vmalloc_range_partial - map vmalloc pages to userspace
    3774             :  * @vma:                vma to cover
    3775             :  * @uaddr:              target user address to start at (page aligned)
    3776             :  * @kaddr:              virtual address of vmalloc kernel memory (page aligned)
    3777             :  * @pgoff:              offset from @kaddr to start at, in pages
    3778             :  * @size:               size of map area (rounded up with PAGE_ALIGN())
    3779             :  *
    3780             :  * Returns:     0 for success, -Exxx on failure
    3781             :  *
    3782             :  * This function checks that @kaddr is a valid vmalloc'ed area flagged
    3783             :  * VM_USERMAP or VM_DMA_COHERENT and big enough to cover @size bytes
    3784             :  * starting @pgoff pages in; it fails with -EINVAL if those criteria
    3785             :  * aren't met. NOTE(review): the do/while maps one page before testing
    3786             :  * @size, so a zero @size looks unsupported - confirm with callers.
    3787             :  * Similar to remap_pfn_range() (see mm/memory.c)
    3788             :  */
    3789           0 : int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
    3790             :                                 void *kaddr, unsigned long pgoff,
    3791             :                                 unsigned long size)
    3792             : {
    3793             :         struct vm_struct *area;
    3794             :         unsigned long off;
    3795             :         unsigned long end_index;
    3796             : 
    3797           0 :         if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
    3798             :                 return -EINVAL;
    3799             : 
    3800           0 :         size = PAGE_ALIGN(size);
    3801             : 
    3802           0 :         if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
    3803             :                 return -EINVAL;
    3804             : 
    3805           0 :         area = find_vm_area(kaddr);
    3806           0 :         if (!area)
    3807             :                 return -EINVAL;
    3808             : 
    3809           0 :         if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
    3810             :                 return -EINVAL;
    3811             : 
    3812           0 :         if (check_add_overflow(size, off, &end_index) ||
    3813             :             end_index > get_vm_area_size(area))
    3814             :                 return -EINVAL;
    3815           0 :         kaddr += off;
    3816             : 
    3817             :         do {
    3818           0 :                 struct page *page = vmalloc_to_page(kaddr);
    3819             :                 int ret;
    3820             : 
    3821           0 :                 ret = vm_insert_page(vma, uaddr, page);
    3822           0 :                 if (ret)
    3823             :                         return ret;
    3824             : 
    3825           0 :                 uaddr += PAGE_SIZE;
    3826           0 :                 kaddr += PAGE_SIZE;
    3827           0 :                 size -= PAGE_SIZE;
    3828           0 :         } while (size > 0);
    3829             : 
    3830           0 :         vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
    3831             : 
    3832           0 :         return 0;
    3833             : }
    3834             : 
    3835             : /**
    3836             :  * remap_vmalloc_range - map vmalloc pages to userspace
    3837             :  * @vma:                vma to cover (map full range of vma)
    3838             :  * @addr:               vmalloc memory
    3839             :  * @pgoff:              number of pages into @addr before first page to map
    3840             :  *
    3841             :  * Returns:     0 for success, -Exxx on failure
    3842             :  *
    3843             :  * This function checks that @addr is a valid vmalloc'ed area, and
    3844             :  * that it is big enough to cover the vma. It returns failure if
    3845             :  * those criteria aren't met.
    3846             :  *
    3847             :  * Similar to remap_pfn_range() (see mm/memory.c)
    3848             :  */
    3849           0 : int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
    3850             :                                                 unsigned long pgoff)
    3851             : {
    3852           0 :         return remap_vmalloc_range_partial(vma, vma->vm_start,
    3853             :                                            addr, pgoff,
    3854           0 :                                            vma->vm_end - vma->vm_start);
    3855             : }
    3856             : EXPORT_SYMBOL(remap_vmalloc_range);
    3857             : /* Remove @area via remove_vm_area() (BUG if another area comes back), then kfree() it. */
    3858           0 : void free_vm_area(struct vm_struct *area)
    3859             : {
    3860             :         struct vm_struct *ret;
    3861           0 :         ret = remove_vm_area(area->addr);
    3862           0 :         BUG_ON(ret != area);
    3863           0 :         kfree(area);
    3864           0 : }
    3865             : EXPORT_SYMBOL_GPL(free_vm_area);
    3866             : 
    3867             : #ifdef CONFIG_SMP
    3868             : static struct vmap_area *node_to_va(struct rb_node *n)
    3869             : { /* Map rb-tree node @n to its struct vmap_area (rb_node member) via rb_entry_safe(). */
    3870             :         return rb_entry_safe(n, struct vmap_area, rb_node);
    3871             : }
    3872             : 
    3873             : /**
    3874             :  * pvm_find_va_enclose_addr - find the free vmap_area @addr belongs to
    3875             :  * @addr: target address
    3876             :  *
    3877             :  * Returns: the vmap_area in free_vmap_area_root enclosing @addr, if any.
    3878             :  *   Otherwise the last area visited that starts at or below @addr,
    3879             :  *   i.e. va->va_start <= addr && va->va_end < addr, or NULL
    3880             :  *   if no area starting at or below @addr was visited.
    3881             :  */
    3882             : static struct vmap_area *
    3883             : pvm_find_va_enclose_addr(unsigned long addr)
    3884             : {
    3885             :         struct vmap_area *va, *tmp;
    3886             :         struct rb_node *n;
    3887             : 
    3888             :         n = free_vmap_area_root.rb_node;
    3889             :         va = NULL;
    3890             : 
    3891             :         while (n) {
    3892             :                 tmp = rb_entry(n, struct vmap_area, rb_node);
    3893             :                 if (tmp->va_start <= addr) {
    3894             :                         va = tmp;
    3895             :                         if (tmp->va_end >= addr)
    3896             :                                 break;
    3897             : 
    3898             :                         n = n->rb_right;
    3899             :                 } else {
    3900             :                         n = n->rb_left;
    3901             :                 }
    3902             :         }
    3903             : 
    3904             :         return va;
    3905             : }
    3906             : 
    3907             : /**
    3908             :  * pvm_determine_end_from_reverse - find the highest aligned address
    3909             :  * of a free block below VMALLOC_END
    3910             :  * @va:
    3911             :  *   in - the VA to start the search from (reverse order);
    3912             :  *   out - the VA with the highest aligned end address.
    3913             :  * @align: alignment for the required highest address
    3914             :  *
    3915             :  * Returns: determined end address within vmap_area, or 0 if none is found
    3916             :  */
    3917             : static unsigned long
    3918             : pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
    3919             : {
    3920             :         unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
    3921             :         unsigned long addr;
    3922             : 
    3923             :         if (likely(*va)) {
    3924             :                 list_for_each_entry_from_reverse((*va),
    3925             :                                 &free_vmap_area_list, list) {
    3926             :                         addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
    3927             :                         if ((*va)->va_start < addr)
    3928             :                                 return addr;
    3929             :                 }
    3930             :         }
    3931             : 
    3932             :         return 0;
    3933             : }
    3934             : 
    3935             : /**
    3936             :  * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
    3937             :  * @offsets: array containing offset of each area
    3938             :  * @sizes: array containing size of each area
    3939             :  * @nr_vms: the number of areas to allocate
    3940             :  * @align: alignment, all entries in @offsets and @sizes must be aligned to this
    3941             :  *
    3942             :  * Returns: kmalloc'd vm_struct pointer array pointing to allocated
    3943             :  *          vm_structs on success, %NULL on failure
    3944             :  *
    3945             :  * Percpu allocator wants to use congruent vm areas so that it can
    3946             :  * maintain the offsets among percpu areas.  This function allocates
    3947             :  * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
    3948             :  * be scattered pretty far, distance between two areas easily going up
    3949             :  * to gigabytes.  To avoid interacting with regular vmallocs, these
    3950             :  * areas are allocated from top.
    3951             :  *
    3952             :  * Despite its complicated look, this allocator is rather simple. It
    3953             :  * does everything top-down and scans free blocks from the end looking
    3954             :  * for matching base. While scanning, if any of the areas do not fit the
    3955             :  * base address is pulled down to fit the area. Scanning is repeated till
    3956             :  * all the areas fit and then all necessary data structures are inserted
    3957             :  * and the result is returned.
    3958             :  */
    3959             : struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
    3960             :                                      const size_t *sizes, int nr_vms,
    3961             :                                      size_t align)
    3962             : {
    3963             :         const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
    3964             :         const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
    3965             :         struct vmap_area **vas, *va;
    3966             :         struct vm_struct **vms;
    3967             :         int area, area2, last_area, term_area;
    3968             :         unsigned long base, start, size, end, last_end, orig_start, orig_end;
    3969             :         bool purged = false; /* set once purge_vmap_area_lazy() has been tried */
    3970             : 
    3971             :         /* verify parameters and allocate data structures */
    3972             :         BUG_ON(offset_in_page(align) || !is_power_of_2(align));
    3973             :         for (last_area = 0, area = 0; area < nr_vms; area++) {
    3974             :                 start = offsets[area];
    3975             :                 end = start + sizes[area];
    3976             : 
    3977             :                 /* is everything aligned properly? */
    3978             :                 BUG_ON(!IS_ALIGNED(offsets[area], align));
    3979             :                 BUG_ON(!IS_ALIGNED(sizes[area], align));
    3980             : 
    3981             :                 /* detect the area with the highest address */
    3982             :                 if (start > offsets[last_area])
    3983             :                         last_area = area;
    3984             : 
    3985             :                 for (area2 = area + 1; area2 < nr_vms; area2++) {
    3986             :                         unsigned long start2 = offsets[area2];
    3987             :                         unsigned long end2 = start2 + sizes[area2];
    3988             : 
    3989             :                         BUG_ON(start2 < end && start < end2); /* requested areas must not overlap */
    3990             :                 }
    3991             :         }
    3992             :         last_end = offsets[last_area] + sizes[last_area];
    3993             : 
    3994             :         if (vmalloc_end - vmalloc_start < last_end) { /* aligned vmalloc range cannot hold the highest area */
    3995             :                 WARN_ON(true);
    3996             :                 return NULL;
    3997             :         }
    3998             : 
    3999             :         vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
    4000             :         vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
    4001             :         if (!vas || !vms)
    4002             :                 goto err_free2;
    4003             : 
    4004             :         for (area = 0; area < nr_vms; area++) { /* all per-area objects are allocated before the lock is taken */
    4005             :                 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
    4006             :                 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
    4007             :                 if (!vas[area] || !vms[area])
    4008             :                         goto err_free;
    4009             :         }
    4010             : retry:
    4011             :         spin_lock(&free_vmap_area_lock);
    4012             : 
    4013             :         /* start scanning - we scan from the top, begin with the last area */
    4014             :         area = term_area = last_area;
    4015             :         start = offsets[area];
    4016             :         end = start + sizes[area];
    4017             : 
    4018             :         va = pvm_find_va_enclose_addr(vmalloc_end);
    4019             :         base = pvm_determine_end_from_reverse(&va, align) - end;
    4020             : 
    4021             :         while (true) {
    4022             :                 /*
    4023             :                  * base might have underflowed, add last_end before
    4024             :                  * comparing.
    4025             :                  */
    4026             :                 if (base + last_end < vmalloc_start + last_end)
    4027             :                         goto overflow;
    4028             : 
    4029             :                 /*
    4030             :                  * Fitting base has not been found.
    4031             :                  */
    4032             :                 if (va == NULL)
    4033             :                         goto overflow;
    4034             : 
    4035             :                 /*
    4036             :                  * If required width exceeds current VA block, move
    4037             :                  * base downwards and then recheck.
    4038             :                  */
    4039             :                 if (base + end > va->va_end) {
    4040             :                         base = pvm_determine_end_from_reverse(&va, align) - end;
    4041             :                         term_area = area; /* base moved: every area must be checked again */
    4042             :                         continue;
    4043             :                 }
    4044             : 
    4045             :                 /*
    4046             :                  * If this VA does not fit, move base downwards and recheck.
    4047             :                  */
    4048             :                 if (base + start < va->va_start) {
    4049             :                         va = node_to_va(rb_prev(&va->rb_node));
    4050             :                         base = pvm_determine_end_from_reverse(&va, align) - end;
    4051             :                         term_area = area;
    4052             :                         continue;
    4053             :                 }
    4054             : 
    4055             :                 /*
    4056             :                  * This area fits, move on to the previous one.  If
    4057             :                  * the previous one is the terminal one, we're done.
    4058             :                  */
    4059             :                 area = (area + nr_vms - 1) % nr_vms; /* previous index, wrapping from 0 to nr_vms - 1 */
    4060             :                 if (area == term_area)
    4061             :                         break;
    4062             : 
    4063             :                 start = offsets[area];
    4064             :                 end = start + sizes[area];
    4065             :                 va = pvm_find_va_enclose_addr(base + end);
    4066             :         }
    4067             : 
    4068             :         /* we've found a fitting base, insert all va's */
    4069             :         for (area = 0; area < nr_vms; area++) {
    4070             :                 int ret;
    4071             : 
    4072             :                 start = base + offsets[area];
    4073             :                 size = sizes[area];
    4074             : 
    4075             :                 va = pvm_find_va_enclose_addr(start);
    4076             :                 if (WARN_ON_ONCE(va == NULL))
    4077             :                         /* It is a BUG(), but trigger recovery instead. */
    4078             :                         goto recovery;
    4079             : 
    4080             :                 ret = adjust_va_to_fit_type(&free_vmap_area_root,
    4081             :                                             &free_vmap_area_list,
    4082             :                                             va, start, size);
    4083             :                 if (WARN_ON_ONCE(unlikely(ret)))
    4084             :                         /* It is a BUG(), but trigger recovery instead. */
    4085             :                         goto recovery;
    4086             : 
    4087             :                 /* Allocated area. */
    4088             :                 va = vas[area];
    4089             :                 va->va_start = start;
    4090             :                 va->va_end = start + size;
    4091             :         }
    4092             : 
    4093             :         spin_unlock(&free_vmap_area_lock);
    4094             : 
    4095             :         /* populate the kasan shadow space */
    4096             :         for (area = 0; area < nr_vms; area++) {
    4097             :                 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
    4098             :                         goto err_free_shadow;
    4099             :         }
    4100             : 
    4101             :         /* insert all vm's */
    4102             :         spin_lock(&vmap_area_lock);
    4103             :         for (area = 0; area < nr_vms; area++) {
    4104             :                 insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
    4105             : 
    4106             :                 setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
    4107             :                                  pcpu_get_vm_areas);
    4108             :         }
    4109             :         spin_unlock(&vmap_area_lock);
    4110             : 
    4111             :         /*
    4112             :          * Mark allocated areas as accessible. Do it now as a best-effort
    4113             :          * approach, as they can be mapped outside of vmalloc code.
    4114             :          * With hardware tag-based KASAN, marking is skipped for
    4115             :          * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
    4116             :          */
    4117             :         for (area = 0; area < nr_vms; area++)
    4118             :                 vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
    4119             :                                 vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
    4120             : 
    4121             :         kfree(vas); /* only the pointer array is freed; its entries were inserted above */
    4122             :         return vms;
    4123             : 
    4124             : recovery:
    4125             :         /*
    4126             :          * Remove previously allocated areas. There is no
    4127             :          * need to remove these areas from the busy tree,
    4128             :          * because they are inserted only in the final step,
    4129             :          * and only when pcpu_get_vm_areas() succeeds.
    4130             :          */
    4131             :         while (area--) { /* walk back over indices [0, area) handled before the failure */
    4132             :                 orig_start = vas[area]->va_start;
    4133             :                 orig_end = vas[area]->va_end;
    4134             :                 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
    4135             :                                 &free_vmap_area_list);
    4136             :                 if (va)
    4137             :                         kasan_release_vmalloc(orig_start, orig_end,
    4138             :                                 va->va_start, va->va_end);
    4139             :                 vas[area] = NULL; /* handed to the free tree; replaced before "retry", skipped in err_free */
    4140             :         }
    4141             : 
    4142             : overflow:
    4143             :         spin_unlock(&free_vmap_area_lock);
    4144             :         if (!purged) { /* call purge_vmap_area_lazy() and retry, at most once */
    4145             :                 purge_vmap_area_lazy();
    4146             :                 purged = true;
    4147             : 
    4148             :                 /* Before "retry", check if we recover. */
    4149             :                 for (area = 0; area < nr_vms; area++) {
    4150             :                         if (vas[area])
    4151             :                                 continue;
    4152             : 
    4153             :                         vas[area] = kmem_cache_zalloc(
    4154             :                                 vmap_area_cachep, GFP_KERNEL);
    4155             :                         if (!vas[area])
    4156             :                                 goto err_free;
    4157             :                 }
    4158             : 
    4159             :                 goto retry;
    4160             :         }
    4161             : 
    4162             : err_free:
    4163             :         for (area = 0; area < nr_vms; area++) {
    4164             :                 if (vas[area])
    4165             :                         kmem_cache_free(vmap_area_cachep, vas[area]);
    4166             : 
    4167             :                 kfree(vms[area]);
    4168             :         }
    4169             : err_free2:
    4170             :         kfree(vas);
    4171             :         kfree(vms);
    4172             :         return NULL;
    4173             : 
    4174             : err_free_shadow:
    4175             :         spin_lock(&free_vmap_area_lock);
    4176             :         /*
    4177             :          * We release all the vmalloc shadows, even the ones for regions that
    4178             :          * hadn't been successfully added. This relies on kasan_release_vmalloc
    4179             :          * being able to tolerate this case.
    4180             :          */
    4181             :         for (area = 0; area < nr_vms; area++) {
    4182             :                 orig_start = vas[area]->va_start;
    4183             :                 orig_end = vas[area]->va_end;
    4184             :                 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
    4185             :                                 &free_vmap_area_list);
    4186             :                 if (va)
    4187             :                         kasan_release_vmalloc(orig_start, orig_end,
    4188             :                                 va->va_start, va->va_end);
    4189             :                 vas[area] = NULL;
    4190             :                 kfree(vms[area]);
    4191             :         }
    4192             :         spin_unlock(&free_vmap_area_lock);
    4193             :         kfree(vas);
    4194             :         kfree(vms);
    4195             :         return NULL;
    4196             : }
    4197             : 
    4198             : /**
    4199             :  * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
    4200             :  * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
    4201             :  * @nr_vms: the number of allocated areas
    4202             :  *
    4203             :  * Call free_vm_area() on each of the @nr_vms entries, then kfree() @vms itself.
    4204             :  */
    4205             : void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
    4206             : {
    4207             :         int i;
    4208             : 
    4209             :         for (i = 0; i < nr_vms; i++)
    4210             :                 free_vm_area(vms[i]);
    4211             :         kfree(vms); /* the pointer array is released last, after every entry */
    4212             : }
    4213             : #endif  /* CONFIG_SMP */
    4214             : 
    4215             : #ifdef CONFIG_PRINTK
    4216           0 : bool vmalloc_dump_obj(void *object) /* pr_cont() a description of the vm area found for @object */
    4217             : {
    4218             :         struct vm_struct *vm;
    4219           0 :         void *objp = (void *)PAGE_ALIGN((unsigned long)object); /* the lookup key is the PAGE_ALIGN()ed address */
    4220             : 
    4221           0 :         vm = find_vm_area(objp);
    4222           0 :         if (!vm)
    4223             :                 return false; /* no vm_struct found: nothing is printed */
    4224           0 :         pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
    4225             :                 vm->nr_pages, (unsigned long)vm->addr, vm->caller);
    4226           0 :         return true;
    4227             : }
    4228             : #endif
    4229             : 
    4230             : #ifdef CONFIG_PROC_FS
    4231           0 : static void *s_start(struct seq_file *m, loff_t *pos) /* .start callback of vmalloc_op */
    4232             :         __acquires(&vmap_purge_lock)
    4233             :         __acquires(&vmap_area_lock)
    4234             : {
    4235           0 :         mutex_lock(&vmap_purge_lock); /* mutex first, then the spinlock; s_stop() unlocks in reverse */
    4236           0 :         spin_lock(&vmap_area_lock);
    4237             : 
    4238           0 :         return seq_list_start(&vmap_area_list, *pos);
    4239             : }
    4240             : 
    4241           0 : static void *s_next(struct seq_file *m, void *p, loff_t *pos) /* .next callback of vmalloc_op */
    4242             : {
    4243           0 :         return seq_list_next(p, &vmap_area_list, pos); /* @m is unused here */
    4244             : }
    4245             : 
    4246           0 : static void s_stop(struct seq_file *m, void *p) /* .stop callback of vmalloc_op */
    4247             :         __releases(&vmap_area_lock)
    4248             :         __releases(&vmap_purge_lock)
    4249             : {
    4250           0 :         spin_unlock(&vmap_area_lock); /* reverse of the acquisition order in s_start() */
    4251           0 :         mutex_unlock(&vmap_purge_lock);
    4252           0 : }
    4253             : 
    4254             : static void show_numa_info(struct seq_file *m, struct vm_struct *v) /* emit " N<node>=<count>" per node for @v */
    4255             : {
    4256             :         if (IS_ENABLED(CONFIG_NUMA)) {
    4257             :                 unsigned int nr, *counters = m->private;
    4258             :                 unsigned int step = 1U << vm_area_page_order(v); /* stride through v->pages[], also the per-hit increment */
    4259             : 
    4260             :                 if (!counters) /* no counter buffer in m->private: nothing to report */
    4261             :                         return;
    4262             : 
    4263             :                 if (v->flags & VM_UNINITIALIZED) /* areas still flagged VM_UNINITIALIZED are skipped */
    4264             :                         return;
    4265             :                 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
    4266             :                 smp_rmb();
    4267             : 
    4268             :                 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); /* start from zero for this area */
    4269             : 
    4270             :                 for (nr = 0; nr < v->nr_pages; nr += step)
    4271             :                         counters[page_to_nid(v->pages[nr])] += step;
    4272             :                 for_each_node_state(nr, N_HIGH_MEMORY)
    4273             :                         if (counters[nr]) /* nodes with a zero count are not printed */
    4274             :                                 seq_printf(m, " N%u=%u", nr, counters[nr]);
    4275             :         }
    4276             : }
    4277             : 
    4278           0 : static void show_purge_info(struct seq_file *m) /* print every purge_vmap_area_list entry as "unpurged vm_area" */
    4279             : {
    4280             :         struct vmap_area *va;
    4281             : 
    4282           0 :         spin_lock(&purge_vmap_area_lock); /* held for the whole list walk */
    4283           0 :         list_for_each_entry(va, &purge_vmap_area_list, list) {
    4284           0 :                 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
    4285             :                         (void *)va->va_start, (void *)va->va_end,
    4286           0 :                         va->va_end - va->va_start);
    4287             :         }
    4288           0 :         spin_unlock(&purge_vmap_area_lock);
    4289           0 : }
    4290             : 
    4291           0 : static int s_show(struct seq_file *m, void *p) /* .show callback of vmalloc_op: one line per vmap_area */
    4292             : {
    4293             :         struct vmap_area *va;
    4294             :         struct vm_struct *v;
    4295             : 
    4296           0 :         va = list_entry(p, struct vmap_area, list);
    4297             : 
    4298           0 :         if (!va->vm) { /* no vm_struct attached: only VMAP_RAM areas get a line */
    4299           0 :                 if (va->flags & VMAP_RAM)
    4300           0 :                         seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
    4301             :                                 (void *)va->va_start, (void *)va->va_end,
    4302           0 :                                 va->va_end - va->va_start);
    4303             : 
    4304             :                 goto final;
    4305             :         }
    4306             : 
    4307           0 :         v = va->vm;
    4308             : 
    4309           0 :         seq_printf(m, "0x%pK-0x%pK %7ld",
    4310           0 :                 v->addr, v->addr + v->size, v->size);
    4311             : 
    4312           0 :         if (v->caller)
    4313           0 :                 seq_printf(m, " %pS", v->caller);
    4314             : 
    4315           0 :         if (v->nr_pages)
    4316           0 :                 seq_printf(m, " pages=%d", v->nr_pages);
    4317             : 
    4318           0 :         if (v->phys_addr)
    4319           0 :                 seq_printf(m, " phys=%pa", &v->phys_addr);
    4320             : 
    4321           0 :         if (v->flags & VM_IOREMAP) /* each flag set in v->flags appends its own keyword */
    4322           0 :                 seq_puts(m, " ioremap");
    4323             : 
    4324           0 :         if (v->flags & VM_ALLOC)
    4325           0 :                 seq_puts(m, " vmalloc");
    4326             : 
    4327           0 :         if (v->flags & VM_MAP)
    4328           0 :                 seq_puts(m, " vmap");
    4329             : 
    4330           0 :         if (v->flags & VM_USERMAP)
    4331           0 :                 seq_puts(m, " user");
    4332             : 
    4333           0 :         if (v->flags & VM_DMA_COHERENT)
    4334           0 :                 seq_puts(m, " dma-coherent");
    4335             : 
    4336           0 :         if (is_vmalloc_addr(v->pages)) /* the pages[] array itself is at a vmalloc address */
    4337           0 :                 seq_puts(m, " vpages");
    4338             : 
    4339           0 :         show_numa_info(m, v);
    4340           0 :         seq_putc(m, '\n');
    4341             : 
    4342             :         /*
    4343             :          * After the last vmap_area_list entry, also dump the "unpurged" areas.
    4344             :          */
    4345             : final:
    4346           0 :         if (list_is_last(&va->list, &vmap_area_list))
    4347           0 :                 show_purge_info(m);
    4348             : 
    4349           0 :         return 0;
    4350             : }
    4351             : 
    4352             : static const struct seq_operations vmalloc_op = { /* iterator passed to the "vmallocinfo" proc entry */
    4353             :         .start = s_start,
    4354             :         .next = s_next,
    4355             :         .stop = s_stop,
    4356             :         .show = s_show,
    4357             : };
    4358             : 
    4359           1 : static int __init proc_vmalloc_init(void) /* create the "vmallocinfo" proc entry with mode 0400 */
    4360             : {
    4361             :         if (IS_ENABLED(CONFIG_NUMA))
    4362             :                 proc_create_seq_private("vmallocinfo", 0400, NULL,
    4363             :                                 &vmalloc_op,
    4364             :                                 nr_node_ids * sizeof(unsigned int), NULL); /* presumably sizes the m->private buffer show_numa_info() uses - TODO confirm */
    4365             :         else
    4366           1 :                 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
    4367           1 :         return 0; /* always 0; the proc_create_* results are not checked */
    4368             : }
    4369             : module_init(proc_vmalloc_init);
    4370             : 
    4371             : #endif
    4372             : 
    4373           1 : void __init vmalloc_init(void) /* set up the vmap_area cache, per-CPU state and the free vmap space */
    4374             : {
    4375             :         struct vmap_area *va;
    4376             :         struct vm_struct *tmp;
    4377             :         int i;
    4378             : 
    4379             :         /*
    4380             :          * Create the cache for vmap_area objects.
    4381             :          */
    4382           1 :         vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
    4383             : 
    4384           2 :         for_each_possible_cpu(i) { /* initialise each CPU's vmap_block_queue and vfree_deferred */
    4385             :                 struct vmap_block_queue *vbq;
    4386             :                 struct vfree_deferred *p;
    4387             : 
    4388           1 :                 vbq = &per_cpu(vmap_block_queue, i);
    4389           1 :                 spin_lock_init(&vbq->lock);
    4390           2 :                 INIT_LIST_HEAD(&vbq->free);
    4391           1 :                 p = &per_cpu(vfree_deferred, i);
    4392           2 :                 init_llist_head(&p->list);
    4393           2 :                 INIT_WORK(&p->wq, delayed_vfree_work);
    4394           2 :                 xa_init(&vbq->vmap_blocks);
    4395             :         }
    4396             : 
    4397             :         /* Import existing vmlist entries. */
    4398           1 :         for (tmp = vmlist; tmp; tmp = tmp->next) {
    4399           0 :                 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
    4400           0 :                 if (WARN_ON_ONCE(!va)) /* allocation failed: warn once and skip this vmlist entry */
    4401           0 :                         continue;
    4402             : 
    4403           0 :                 va->va_start = (unsigned long)tmp->addr;
    4404           0 :                 va->va_end = va->va_start + tmp->size;
    4405           0 :                 va->vm = tmp;
    4406           0 :                 insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
    4407             :         }
    4408             : 
    4409             :         /*
    4410             :          * Now we can initialize a free vmap space.
    4411             :          */
    4412           1 :         vmap_init_free_space();
    4413           1 :         vmap_initialized = true; /* set last, after the free space has been initialised */
    4414           1 : }

Generated by: LCOV version 1.14