// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>

#include "internal.h"
#include "pgalloc-track.h"

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;

static int __init set_nohugeiomap(char *str)
{
	ioremap_max_page_shift = PAGE_SHIFT;
	return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;

static int __init set_nohugevmalloc(char *str)
{
	vmap_allow_huge = false;
	return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
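
/*
 * For illustration: on a kernel built with both options above, huge
 * ioremap and huge vmalloc mappings can be disabled on the boot command
 * line with "nohugeiomap nohugevmalloc", which clamps
 * ioremap_max_page_shift down to PAGE_SHIFT and clears vmap_allow_huge,
 * forcing all such mappings back to base pages.
 */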

bool is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)kasan_reset_tag(x);

	return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

/*** Page table manipulation functions ***/
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pte_t *pte;
	u64 pfn;
	unsigned long size = PAGE_SIZE;

	pfn = phys_addr >> PAGE_SHIFT;
	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		BUG_ON(!pte_none(*pte));

#ifdef CONFIG_HUGETLB_PAGE
		size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
		if (size != PAGE_SIZE) {
			pte_t entry = pfn_pte(pfn, prot);

			entry = arch_make_huge_pte(entry, ilog2(size), 0);
			set_huge_pte_at(&init_mm, addr, pte, entry);
			pfn += PFN_DOWN(size);
			continue;
		}
#endif
		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte += PFN_DOWN(size), addr += size, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PMD_SHIFT)
		return 0;

	if (!arch_vmap_pmd_supported(prot))
		return 0;

	if ((end - addr) != PMD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PMD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
		return 0;

	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
		return 0;

	return pmd_set_huge(pmd, phys_addr, prot);
}
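
/*
 * Example: on an architecture with 2MiB PMDs, vmap_try_huge_pmd() above
 * installs a huge mapping only when all of the following hold:
 *
 *   - max_page_shift >= PMD_SHIFT,
 *   - end - addr is exactly PMD_SIZE (one PMD entry),
 *   - both addr and phys_addr are PMD_SIZE-aligned,
 *   - arch_vmap_pmd_supported(prot) is true, and
 *   - any PTE page already under the PMD could be freed.
 *
 * Otherwise it returns 0 and the caller falls back to base pages.
 */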

static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);

		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PMD_MODIFIED;
			continue;
		}

		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
			return -ENOMEM;
	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PUD_SHIFT)
		return 0;

	if (!arch_vmap_pud_supported(prot))
		return 0;

	if ((end - addr) != PUD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PUD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
		return 0;

	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
		return 0;

	return pud_set_huge(pud, phys_addr, prot);
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);

		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PUD_MODIFIED;
			continue;
		}

		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < P4D_SHIFT)
		return 0;

	if (!arch_vmap_p4d_supported(prot))
		return 0;

	if ((end - addr) != P4D_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, P4D_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
		return 0;

	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
		return 0;

	return p4d_set_huge(p4d, phys_addr, prot);
}

static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);

		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_P4D_MODIFIED;
			continue;
		}

		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_range_noflush(unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	pgd_t *pgd;
	unsigned long start;
	unsigned long next;
	int err;
	pgtbl_mod_mask mask = 0;

	might_sleep();
	BUG_ON(addr >= end);

	start = addr;
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
					max_page_shift, &mask);
		if (err)
			break;
	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return err;
}

int ioremap_page_range(unsigned long addr, unsigned long end,
		phys_addr_t phys_addr, pgprot_t prot)
{
	int err;

	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
				 ioremap_max_page_shift);
	flush_cache_vmap(addr, end);
	if (!err)
		kmsan_ioremap_page_range(addr, end, phys_addr, prot,
					 ioremap_max_page_shift);
	return err;
}
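
/*
 * A minimal usage sketch (hypothetical driver code, not from this file):
 * callers normally go through ioremap() rather than calling this directly,
 * but the effect is
 *
 *	err = ioremap_page_range(vaddr, vaddr + size, bar_phys, prot);
 *
 * where [vaddr, vaddr + size) is KVA already reserved by the caller,
 * bar_phys is the device physical address, and prot is an arch-suitable
 * I/O pgprot. Note that prot is filtered through pgprot_nx() above, so
 * the resulting mapping is never executable.
 */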

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			pgtbl_mod_mask *mask)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int cleared;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);

		cleared = pmd_clear_huge(pmd);
		if (cleared || pmd_bad(*pmd))
			*mask |= PGTBL_PMD_MODIFIED;

		if (cleared)
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next, mask);

		cond_resched();
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int cleared;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);

		cleared = pud_clear_huge(pud);
		if (cleared || pud_bad(*pud))
			*mask |= PGTBL_PUD_MODIFIED;

		if (cleared)
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next, mask);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		p4d_clear_huge(p4d);
		if (p4d_bad(*p4d))
			*mask |= PGTBL_P4D_MODIFIED;

		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next, mask);
	} while (p4d++, addr = next, addr != end);
}

/*
 * vunmap_range_noflush is similar to vunmap_range, but does not
 * flush caches or TLBs.
 *
 * The caller is responsible for calling flush_cache_vunmap() before calling
 * this function, and flush_tlb_kernel_range() after it has returned
 * successfully (and before the addresses are expected to cause a page fault
 * or be re-mapped for something else, if TLB flushes are being delayed or
 * coalesced).
 *
 * This is an internal function only. Do not use outside mm/.
 */
void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;
	unsigned long addr = start;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next, &mask);
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}

void vunmap_range_noflush(unsigned long start, unsigned long end)
{
	kmsan_vunmap_range_noflush(start, end);
	__vunmap_range_noflush(start, end);
}

/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
	flush_cache_vunmap(addr, end);
	vunmap_range_noflush(addr, end);
	flush_tlb_kernel_range(addr, end);
}

static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		if (WARN_ON(!pfn_valid(page_to_pfn(page))))
			return -EINVAL;

		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages)
{
	unsigned long start = addr;
	pgd_t *pgd;
	unsigned long next;
	int err = 0;
	int nr = 0;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return 0;
}

/*
 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
 * flush caches.
 *
 * The caller is responsible for calling flush_cache_vmap() after this
 * function returns successfully and before the addresses are accessed.
 *
 * This is an internal function only. Do not use outside mm/.
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

	WARN_ON(page_shift < PAGE_SHIFT);

	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
			page_shift == PAGE_SHIFT)
		return vmap_small_pages_range_noflush(addr, end, prot, pages);

	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		int err;

		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
					page_to_phys(pages[i]), prot,
					page_shift);
		if (err)
			return err;

		addr += 1UL << page_shift;
	}

	return 0;
}
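
/*
 * Example of the huge path above: with PAGE_SHIFT == 12 and
 * page_shift == 21 (2MiB blocks), the loop steps i by
 * 1U << (21 - 12) == 512 base pages per iteration and maps
 * 1UL << 21 bytes of KVA at a time, using page_to_phys(pages[i])
 * of every 512th page as the physical start of each block.
 */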

int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
}

/**
 * vmap_pages_range - map pages to a kernel virtual address
 * @addr: start of the VM area to map
 * @end: end of the VM area to map (non-inclusive)
 * @prot: page protection flags to use
 * @pages: pages to map (always PAGE_SIZE pages)
 * @page_shift: maximum shift that the pages may be mapped with, @pages must
 * be aligned and contiguous up to at least this shift.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int vmap_pages_range(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	int err;

	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
	flush_cache_vmap(addr, end);
	return err;
}

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put modules in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)kasan_reset_tag(x);
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);

/*
 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
 * return the tail page that corresponds to the base page address, which
 * matches small vmap mappings.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
		return NULL; /* XXX: no allowance for huge pgd */
	if (WARN_ON_ONCE(pgd_bad(*pgd)))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	if (p4d_leaf(*p4d))
		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(p4d_bad(*p4d)))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return NULL;
	if (pud_leaf(*pud))
		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pud_bad(*pud)))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;
	if (pmd_leaf(*pmd))
		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pmd_bad(*pmd)))
		return NULL;

	ptep = pte_offset_map(pmd, addr);
	pte = *ptep;
	if (pte_present(pte))
		page = pte_page(pte);
	pte_unmap(ptep);

	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

static struct rb_root purge_vmap_area_root = RB_ROOT;
static LIST_HEAD(purge_vmap_area_list);
static DEFINE_SPINLOCK(purge_vmap_area_lock);

/*
 * This kmem_cache is used for vmap_area objects. Reusing an object
 * from this dedicated cache instead of allocating from the generic
 * slab makes things faster, especially for the "no edge" split of a
 * free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in tandem with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node stores the maximum free-block size available
 * in its sub-tree, left or right. Therefore it is possible to find
 * the lowest-address free area that matches a given size.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;
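
/*
 * Illustration of the augmented metadata: subtree_max_size is the largest
 * va_size() anywhere at or below a node, the node itself included. For
 * three free blocks of 2M, 8M and 4M with the 8M block at the root, the
 * root's subtree_max_size is 8M, and a search for, say, a 3M area can skip
 * any subtree whose subtree_max_size is below 3M without descending into it.
 */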

/*
 * Preload a CPU with one object for the "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * allowing more permissive allocation masks to be used.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
	return (va->va_end - va->va_start);
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va;

	va = rb_entry_safe(node, struct vmap_area, rb_node);
	return va ? va->subtree_max_size : 0;
}

RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);

static atomic_long_t nr_vmalloc_pages;

unsigned long vmalloc_nr_pages(void)
{
	return atomic_long_read(&nr_vmalloc_pages);
}

/* Look up the first VA which satisfies addr < va_end, NULL if none. */
static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
{
	struct vmap_area *va = NULL;
	struct rb_node *n = vmap_area_root.rb_node;

	addr = (unsigned long)kasan_reset_tag((void *)addr);

	while (n) {
		struct vmap_area *tmp;

		tmp = rb_entry(n, struct vmap_area, rb_node);
		if (tmp->va_end > addr) {
			va = tmp;
			if (tmp->va_start <= addr)
				break;

			n = n->rb_left;
		} else
			n = n->rb_right;
	}

	return va;
}

static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
	struct rb_node *n = root->rb_node;

	addr = (unsigned long)kasan_reset_tag((void *)addr);

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/*
 * This function returns the address of the parent node and of the
 * left or right link the new node should be attached to, for further
 * processing.
 *
 * Otherwise NULL is returned. In that case the new range overlaps an
 * existing one, so all further insertion steps have to be declined,
 * and the overlap is actually considered a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Walk to the bottom of the tree. When we hit the last point
	 * we end up with the parent rb_node and the correct direction
	 * ("link") where the new va->rb_node will be attached.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity checking.
		 * Trigger a warning if the new range overlaps an existing
		 * one, partially or fully, on either side.
		 */
		if (va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else {
			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);

			return NULL;
		}
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
	struct list_head *list;

	if (unlikely(!parent))
		/*
		 * The red-black tree where we try to find VA neighbors
		 * before merging or inserting is empty, i.e. it means
		 * there is no free vmap space. Normally it does not
		 * happen but we handle this case anyway.
		 */
		return NULL;

	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	return (&parent->rb_right == link ? list->next : list);
}
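
/*
 * Why the rb_right check above works: the tree and the list are kept in
 * the same address order. If the new node will hang off parent->rb_right,
 * the parent immediately precedes it, so the VA following the new one is
 * the parent's list successor (list->next). If it hangs off rb_left, the
 * parent itself is the next VA in address order, hence "list".
 */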

static __always_inline void
__link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link,
	struct list_head *head, bool augment)
{
	/*
	 * VA is still not in the list, but we can
	 * identify its future previous list_head node.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Insert to the rb-tree */
	rb_link_node(&va->rb_node, parent, link);
	if (augment) {
		/*
		 * Some explanation here. Just perform simple insertion
		 * to the tree. We do not set va->subtree_max_size to
		 * its current size before calling rb_insert_augmented().
		 * It is because we populate the tree from the bottom
		 * to parent levels when the node _is_ in the tree.
		 *
		 * Therefore we set subtree_max_size to zero after insertion,
		 * to let augment_tree_propagate_from() put everything in
		 * the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Address-sort this list */
	list_add(&va->list, head);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link,
	struct list_head *head)
{
	__link_va(va, root, parent, link, head, false);
}

static __always_inline void
link_va_augment(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link,
	struct list_head *head)
{
	__link_va(va, root, parent, link, head, true);
}

static __always_inline void
__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
		return;

	if (augment)
		rb_erase_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
	else
		rb_erase(&va->rb_node, root);

	list_del_init(&va->list);
	RB_CLEAR_NODE(&va->rb_node);
}

static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
	__unlink_va(va, root, false);
}

static __always_inline void
unlink_va_augment(struct vmap_area *va, struct rb_root *root)
{
	__unlink_va(va, root, true);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Gets called when a node is removed or the tree is rotated.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;
	unsigned long computed_size;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		computed_size = compute_subtree_max_size(va);
		if (computed_size != va->subtree_max_size)
			pr_emerg("tree is corrupted: %lu, %lu\n",
				va_size(va), va->subtree_max_size);
	}
}
#endif

/*
 * This function populates subtree_max_size from the bottom to the upper
 * levels, starting from the VA point. The propagation must be done when
 * a VA's size is modified by changing its va_start/va_end, or when a new
 * VA is inserted into the tree.
 *
 * It means that augment_tree_propagate_from() must be called:
 * - after a VA has been inserted into the tree (free path);
 * - after a VA has been shrunk (allocation path);
 * - after a VA has been grown (merging path).
 *
 * Please note that this does not mean that the upper parent nodes
 * and their subtree_max_size are recalculated all the way up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example, if we modify the node 4, shrinking it to 2, then no
 * modification is required. If we shrink the node 2 to 1, only its
 * subtree_max_size is updated, to 1. If we shrink the node 8 to 6,
 * its subtree_max_size is set to 6 and the parent node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
	/*
	 * Populate the tree from the bottom towards the root until
	 * the calculated maximum available size of a checked node
	 * is equal to its current one.
	 */
	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
	augment_tree_propagate_check();
#endif
}

static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = find_va_links(va, root, NULL, &parent);
	if (link)
		link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	if (from)
		link = find_va_links(va, NULL, from, &parent);
	else
		link = find_va_links(va, root, NULL, &parent);

	if (link) {
		link_va_augment(va, root, parent, link, head);
		augment_tree_propagate_from(va);
	}
}

/*
 * Merge a de-allocated chunk of VA memory with the previous and next
 * free blocks. If no coalescing is done, a new free area is inserted
 * instead. If the VA has been merged, the original object is freed.
 *
 * Please note, it can return NULL for overlapping ranges, following a
 * WARN() report. Although that is buggy behaviour, the system can stay
 * alive and keep going.
 */
static __always_inline struct vmap_area *
__merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head, bool augment)
{
	struct vmap_area *sibling;
	struct list_head *next;
	struct rb_node **link;
	struct rb_node *parent;
	bool merged = false;

	/*
	 * Find a place in the tree where VA potentially will be
	 * inserted, unless it is merged with its sibling/siblings.
	 */
	link = find_va_links(va, root, NULL, &parent);
	if (!link)
		return NULL;

	/*
	 * Get next node of VA to check if merging can be done.
	 */
	next = get_va_next_sibling(parent, link);
	if (unlikely(next == NULL))
		goto insert;

	/*
	 * start            end
	 * |                |
	 * |<------VA------>|<-----Next----->|
	 *                  |                |
	 *                  start            end
	 */
	if (next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			sibling->va_start = va->va_start;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

	/*
	 * start            end
	 * |                |
	 * |<-----Prev----->|<------VA------>|
	 *                  |                |
	 *                  start            end
	 */
	if (next->prev != head) {
		sibling = list_entry(next->prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			/*
			 * If both neighbors are coalesced, it is important
			 * to unlink the "next" node first, followed by merging
			 * with "previous" one. Otherwise the tree might not be
			 * fully populated if a sibling's augmented value is
			 * "normalized" because of rotation operations.
			 */
			if (merged)
				__unlink_va(va, root, augment);

			sibling->va_end = va->va_end;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

insert:
	if (!merged)
		__link_va(va, root, parent, link, head, augment);

	return va;
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	return __merge_or_add_vmap_area(va, root, head, false);
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	va = __merge_or_add_vmap_area(va, root, head, true);
	if (va)
		augment_tree_propagate_from(va);

	return va;
}

static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long nva_start_addr;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Can be overflowed due to big size or alignment. */
	if (nva_start_addr + size < nva_start_addr ||
			nva_start_addr < vstart)
		return false;

	return (nva_start_addr + size <= va->va_end);
}

/*
 * Find the first free block (the one with the lowest start address)
 * in the tree that can satisfy the request with the passed parameters.
 * Please note, with an alignment bigger than PAGE_SIZE, the search
 * length is adjusted to account for worst-case alignment overhead.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(struct rb_root *root, unsigned long size,
	unsigned long align, unsigned long vstart, bool adjust_search_size)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long length;

	/* Start from the root. */
	node = root->rb_node;

	/* Adjust the search size for alignment overhead. */
	length = adjust_search_size ? size + align - 1 : size;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) >= length &&
				vstart < va->va_start) {
			node = node->rb_left;
		} else {
			if (is_within_this_va(va, size, align, vstart))
				return va;

			/*
			 * It does not make sense to go deeper into the right
			 * sub-tree if it does not have a free block that is
			 * equal to or bigger than the requested search length.
			 */
			if (get_subtree_max_size(node->rb_right) >= length) {
				node = node->rb_right;
				continue;
			}

			/*
			 * OK. We roll back and find the first right sub-tree
			 * that satisfies the search criteria. This can happen
			 * due to a "vstart" restriction or an alignment overhead
			 * that is bigger than PAGE_SIZE.
			 */
			while ((node = rb_parent(node))) {
				va = rb_entry(node, struct vmap_area, rb_node);
				if (is_within_this_va(va, size, align, vstart))
					return va;

				if (get_subtree_max_size(node->rb_right) >= length &&
						vstart <= va->va_start) {
					/*
					 * Shift vstart forward. Please note, we update it with
					 * the parent's start address adding "1" because we do
					 * not want to enter the same sub-tree after it has
					 * already been checked and no suitable free block was
					 * found there.
					 */
					vstart = va->va_start + 1;
					node = node->rb_right;
					break;
				}
			}
		}
	}

	return NULL;
}
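
/*
 * Example of the adjusted search length: for size == 8K and align == 16K,
 * length becomes 8K + 16K - 1. Any free block at least that long is
 * guaranteed to contain a 16K-aligned 8K window wherever it starts, so
 * blocks that merely fit the raw size but cannot honour the alignment
 * are skipped up front.
 */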

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

static struct vmap_area *
find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, head, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

static void
find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
	unsigned long size, unsigned long align)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
	va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif

enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};
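
/*
 * Concrete fit examples for a free VA spanning [0x1000, 0x9000):
 *
 *   FL_FIT_TYPE: a request for [0x1000, 0x9000) consumes the VA entirely;
 *   LE_FIT_TYPE: a request for [0x1000, 0x5000) leaves [0x5000, 0x9000);
 *   RE_FIT_TYPE: a request for [0x5000, 0x9000) leaves [0x1000, 0x5000);
 *   NE_FIT_TYPE: a request for [0x3000, 0x7000) splits the VA into the
 *                two remainders [0x1000, 0x3000) and [0x7000, 0x9000).
 */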

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size)
{
	enum fit_type type;

	/* Check if it is within VA. */
	if (nva_start_addr < va->va_start ||
			nva_start_addr + size > va->va_end)
		return NOTHING_FIT;

	/* Now classify. */
	if (va->va_start == nva_start_addr) {
		if (va->va_end == nva_start_addr + size)
			type = FL_FIT_TYPE;
		else
			type = LE_FIT_TYPE;
	} else if (va->va_end == nva_start_addr + size) {
		type = RE_FIT_TYPE;
	} else {
		type = NE_FIT_TYPE;
	}

	return type;
}

static __always_inline int
adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
	struct vmap_area *va, unsigned long nva_start_addr,
	unsigned long size)
{
	struct vmap_area *lva = NULL;
	enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);

	if (type == FL_FIT_TYPE) {
		/*
		 * No need to split VA, it fully fits.
		 *
		 * |               |
		 * V      NVA      V
		 * |---------------|
		 */
		unlink_va_augment(va, root);
		kmem_cache_free(vmap_area_cachep, va);
	} else if (type == LE_FIT_TYPE) {
		/*
		 * Split left edge of fit VA.
		 *
		 * |       |
		 * V  NVA  V   R
		 * |-------|-------|
		 */
		va->va_start += size;
	} else if (type == RE_FIT_TYPE) {
		/*
		 * Split right edge of fit VA.
		 *
		 *         |       |
		 *     L   V  NVA  V
		 * |-------|-------|
		 */
		va->va_end = nva_start_addr;
	} else if (type == NE_FIT_TYPE) {
		/*
		 * Split no edge of fit VA.
		 *
		 *     |       |
		 *   L V  NVA  V R
		 * |---|-------|---|
		 */
		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
		if (unlikely(!lva)) {
			/*
			 * For the percpu allocator we do not do any
			 * pre-allocation and leave it as is, because it
			 * almost never ends up with an NE_FIT_TYPE split.
			 * For percpu allocations, offsets and sizes are
			 * aligned to a fixed align request, i.e. RE_FIT_TYPE
			 * and FL_FIT_TYPE are its main fitting cases.
			 *
			 * There are a few exceptions though: for example the
			 * first allocation (early boot), when we have "one"
			 * big free space that has to be split.
			 *
			 * We can also hit this path for regular "vmap"
			 * allocations, if "this" current CPU was not
			 * preloaded. See the comment in alloc_vmap_area()
			 * for why. If so, GFP_NOWAIT is used instead to get
			 * an extra object for the split. That is rare and
			 * most of the time does not occur.
			 *
			 * If the allocation fails here, an "overflow" path
			 * is triggered to purge lazily freed areas in order
			 * to free some memory, and then the "retry" path
			 * repeats the attempt one more time. See more details
			 * in the alloc_vmap_area() function.
			 */
			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
			if (!lva)
				return -1;
		}

		/*
		 * Build the remainder.
		 */
		lva->va_start = va->va_start;
		lva->va_end = nva_start_addr;

		/*
		 * Shrink this VA to remaining size.
		 */
		va->va_start = nva_start_addr + size;
	} else {
		return -1;
	}

	if (type != FL_FIT_TYPE) {
		augment_tree_propagate_from(va);

		if (lva)	/* type == NE_FIT_TYPE */
			insert_vmap_area_augment(lva, &va->rb_node, root, head);
	}

	return 0;
}

/*
 * Returns the start address of the newly allocated area on success.
 * Otherwise "vend" is returned to indicate failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
	unsigned long size, unsigned long align,
	unsigned long vstart, unsigned long vend)
{
	bool adjust_search_size = true;
	unsigned long nva_start_addr;
	struct vmap_area *va;
	int ret;

	/*
	 * Do not adjust when:
	 *   a) align <= PAGE_SIZE, because it does not make any sense.
	 *      All blocks (their start addresses) are at least PAGE_SIZE
	 *      aligned anyway;
	 *   b) a short range where the requested size corresponds exactly
	 *      to the specified [vstart:vend] interval and the alignment
	 *      is > PAGE_SIZE. With an adjusted search length the
	 *      allocation would not succeed.
	 */
	if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
		adjust_search_size = false;

	va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
	if (unlikely(!va))
		return vend;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Check the "vend" restriction. */
	if (nva_start_addr + size > vend)
		return vend;

	/* Update the free vmap_area. */
	ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
	if (WARN_ON_ONCE(ret))
		return vend;

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	find_vmap_lowest_match_check(root, head, size, align);
#endif

	return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}

static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
	struct vmap_area *va = NULL;

	/*
	 * Preload this CPU with one extra vmap_area object. It is used
	 * when the fit type of a free area is NE_FIT_TYPE. It guarantees
	 * that a CPU that does an allocation is preloaded.
	 *
	 * We do it in non-atomic context, which allows us to use more
	 * permissive allocation masks and be more stable under low-memory
	 * conditions and high memory pressure.
	 */
	if (!this_cpu_read(ne_fit_preload_node))
		va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

	spin_lock(lock);

	if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
		kmem_cache_free(vmap_area_cachep, va);
}
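
/*
 * Note on the cmpxchg above: if another task preloaded this CPU between
 * the this_cpu_read() check and taking the lock, the cmpxchg returns the
 * existing non-NULL object and the freshly allocated spare is simply given
 * back to the cache, so at most one preloaded object is kept per CPU.
 */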

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask,
				unsigned long va_flags)
{
	struct vmap_area *va;
	unsigned long freed;
	unsigned long addr;
	int purged = 0;
	int ret;

	if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
		return ERR_PTR(-EINVAL);

	if (unlikely(!vmap_initialized))
		return ERR_PTR(-EBUSY);

	might_sleep();
	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
	preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
	addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
		size, align, vstart, vend);
	spin_unlock(&free_vmap_area_lock);

	trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);

	/*
	 * If an allocation fails, the "vend" address is
	 * returned. Therefore trigger the overflow path.
	 */
	if (unlikely(addr == vend))
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->vm = NULL;
	va->flags = va_flags;

	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	ret = kasan_populate_vmalloc(addr, size);
	if (ret) {
		free_vmap_area(va);
		return ERR_PTR(ret);
	}

	return va;

overflow:
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	freed = 0;
	blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);

	if (freed > 0) {
		purged = 0;
		goto retry;
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);

	kmem_cache_free(vmap_area_cachep, va);
	return ERR_PTR(-EBUSY);
}

int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log;

	log = fls(num_online_cpus());

	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
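
/*
 * Worked example: with 4K pages, 32UL * 1024 * 1024 / PAGE_SIZE is 8192
 * pages (32MiB) per log step. On a 16-CPU machine, fls(16) == 5, so up to
 * 5 * 8192 == 40960 lazily freed pages (160MiB of KVA) may accumulate
 * before a purge is kicked off.
 */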
1705 :
1706 : static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
1707 :
1708 : /*
1709 : * Serialize vmap purging. There is no actual critical section protected
1710 : * by this lock, but we want to avoid concurrent calls for performance
1711 : * reasons and to make the pcpu_get_vm_areas more deterministic.
1712 : */
1713 : static DEFINE_MUTEX(vmap_purge_lock);
1714 :
1715 : /* for per-CPU blocks */
1716 : static void purge_fragmented_blocks_allcpus(void);
1717 :
1718 : /*
1719 : * Purges all lazily-freed vmap areas.
1720 : */
1721 5 : static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
1722 : {
1723 : unsigned long resched_threshold;
1724 5 : unsigned int num_purged_areas = 0;
1725 : struct list_head local_purge_list;
1726 : struct vmap_area *va, *n_va;
1727 :
1728 : lockdep_assert_held(&vmap_purge_lock);
1729 :
1730 5 : spin_lock(&purge_vmap_area_lock);
1731 5 : purge_vmap_area_root = RB_ROOT;
1732 5 : list_replace_init(&purge_vmap_area_list, &local_purge_list);
1733 5 : spin_unlock(&purge_vmap_area_lock);
1734 :
1735 5 : if (unlikely(list_empty(&local_purge_list)))
1736 : goto out;
1737 :
1738 5 : start = min(start,
1739 : list_first_entry(&local_purge_list,
1740 : struct vmap_area, list)->va_start);
1741 :
1742 5 : end = max(end,
1743 : list_last_entry(&local_purge_list,
1744 : struct vmap_area, list)->va_end);
1745 :
1746 5 : flush_tlb_kernel_range(start, end);
1747 5 : resched_threshold = lazy_max_pages() << 1;
1748 :
1749 5 : spin_lock(&free_vmap_area_lock);
1750 10 : list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
1751 5 : unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1752 5 : unsigned long orig_start = va->va_start;
1753 5 : unsigned long orig_end = va->va_end;
1754 :
1755 : /*
1756 : * Finally insert or merge lazily-freed area. It is
1757 : * detached and there is no need to "unlink" it from
1758 : * anything.
1759 : */
1760 5 : va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
1761 : &free_vmap_area_list);
1762 :
1763 5 : if (!va)
1764 0 : continue;
1765 :
1766 10 : if (is_vmalloc_or_module_addr((void *)orig_start))
1767 : kasan_release_vmalloc(orig_start, orig_end,
1768 : va->va_start, va->va_end);
1769 :
1770 10 : atomic_long_sub(nr, &vmap_lazy_nr);
1771 5 : num_purged_areas++;
1772 :
1773 5 : if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
1774 5 : cond_resched_lock(&free_vmap_area_lock);
1775 : }
1776 : spin_unlock(&free_vmap_area_lock);
1777 :
1778 : out:
1779 5 : trace_purge_vmap_area_lazy(start, end, num_purged_areas);
1780 5 : return num_purged_areas > 0;
1781 : }
1782 :
1783 : /*
1784 : * Kick off a purge of the outstanding lazy areas.
1785 : */
1786 0 : static void purge_vmap_area_lazy(void)
1787 : {
1788 0 : mutex_lock(&vmap_purge_lock);
1789 0 : purge_fragmented_blocks_allcpus();
1790 0 : __purge_vmap_area_lazy(ULONG_MAX, 0);
1791 0 : mutex_unlock(&vmap_purge_lock);
1792 0 : }
1793 :
1794 5 : static void drain_vmap_area_work(struct work_struct *work)
1795 : {
1796 : unsigned long nr_lazy;
1797 :
1798 : do {
1799 5 : mutex_lock(&vmap_purge_lock);
1800 5 : __purge_vmap_area_lazy(ULONG_MAX, 0);
1801 5 : mutex_unlock(&vmap_purge_lock);
1802 :
1803 : /* Recheck if further work is required. */
1804 5 : nr_lazy = atomic_long_read(&vmap_lazy_nr);
1805 5 : } while (nr_lazy > lazy_max_pages());
1806 5 : }
1807 :
1808 : /*
 1809 : * Free a vmap area; the caller must ensure that the area has been
 1810 : * unmapped and unlinked, and that flush_cache_vunmap() has been
 1811 : * called for the correct range previously.
1812 : */
1813 258 : static void free_vmap_area_noflush(struct vmap_area *va)
1814 : {
1815 258 : unsigned long nr_lazy_max = lazy_max_pages();
1816 258 : unsigned long va_start = va->va_start;
1817 : unsigned long nr_lazy;
1818 :
1819 516 : if (WARN_ON_ONCE(!list_empty(&va->list)))
1820 : return;
1821 :
1822 516 : nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
1823 : PAGE_SHIFT, &vmap_lazy_nr);
1824 :
1825 : /*
1826 : * Merge or place it to the purge tree/list.
1827 : */
1828 258 : spin_lock(&purge_vmap_area_lock);
1829 258 : merge_or_add_vmap_area(va,
1830 : &purge_vmap_area_root, &purge_vmap_area_list);
1831 258 : spin_unlock(&purge_vmap_area_lock);
1832 :
1833 258 : trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
1834 :
1835 : /* After this point, we may free va at any time */
1836 258 : if (unlikely(nr_lazy > nr_lazy_max))
1837 : schedule_work(&drain_vmap_work);
1838 : }
1839 :
1840 : /*
1841 : * Free and unmap a vmap area
1842 : */
1843 258 : static void free_unmap_vmap_area(struct vmap_area *va)
1844 : {
1845 258 : flush_cache_vunmap(va->va_start, va->va_end);
1846 516 : vunmap_range_noflush(va->va_start, va->va_end);
1847 : if (debug_pagealloc_enabled_static())
1848 : flush_tlb_kernel_range(va->va_start, va->va_end);
1849 :
1850 258 : free_vmap_area_noflush(va);
1851 258 : }
1852 :
1853 0 : struct vmap_area *find_vmap_area(unsigned long addr)
1854 : {
1855 : struct vmap_area *va;
1856 :
1857 16 : spin_lock(&vmap_area_lock);
1858 16 : va = __find_vmap_area(addr, &vmap_area_root);
1859 16 : spin_unlock(&vmap_area_lock);
1860 :
1861 0 : return va;
1862 : }
1863 :
1864 258 : static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
1865 : {
1866 : struct vmap_area *va;
1867 :
1868 258 : spin_lock(&vmap_area_lock);
1869 258 : va = __find_vmap_area(addr, &vmap_area_root);
1870 258 : if (va)
1871 : unlink_va(va, &vmap_area_root);
1872 258 : spin_unlock(&vmap_area_lock);
1873 :
1874 258 : return va;
1875 : }
1876 :
1877 : /*** Per cpu kva allocator ***/
1878 :
1879 : /*
1880 : * vmap space is limited especially on 32 bit architectures. Ensure there is
1881 : * room for at least 16 percpu vmap blocks per CPU.
1882 : */
1883 : /*
1884 : * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
1885 : * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
1886 : * instead (we just need a rough idea)
1887 : */
1888 : #if BITS_PER_LONG == 32
1889 : #define VMALLOC_SPACE (128UL*1024*1024)
1890 : #else
1891 : #define VMALLOC_SPACE (128UL*1024*1024*1024)
1892 : #endif
1893 :
1894 : #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
1895 : #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
1896 : #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
1897 : #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
1898 : #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
1899 : #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
1900 : #define VMAP_BBMAP_BITS \
1901 : VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
1902 : VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
1903 : VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
1904 :
1905 : #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
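 :
 : /*
 :  * Worked example (illustrative): on a 64-bit kernel with 4 KiB pages
 :  * and NR_CPUS == 64, VMALLOC_PAGES is 128 GiB / 4 KiB == 32M pages, so
 :  * VMALLOC_PAGES / 64 / 16 == 32768, which the VMAP_MIN/VMAP_MAX clamp
 :  * reduces to VMAP_BBMAP_BITS == 1024, giving a 4 MiB VMAP_BLOCK_SIZE.
 :  */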
1906 :
 1907 : #define VMAP_RAM 0x1 /* indicates a vm_map_ram area */
 1908 : #define VMAP_BLOCK 0x2 /* marks the vmap_block sub-type */
1909 : #define VMAP_FLAGS_MASK 0x3
1910 :
1911 : struct vmap_block_queue {
1912 : spinlock_t lock;
1913 : struct list_head free;
1914 : };
1915 :
1916 : struct vmap_block {
1917 : spinlock_t lock;
1918 : struct vmap_area *va;
1919 : unsigned long free, dirty;
1920 : DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
 1921 : unsigned long dirty_min, dirty_max; /* dirty range */
1922 : struct list_head free_list;
1923 : struct rcu_head rcu_head;
1924 : struct list_head purge;
1925 : };
1926 :
1927 : /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
1928 : static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
1929 :
1930 : /*
1931 : * XArray of vmap blocks, indexed by address, to quickly find a vmap block
1932 : * in the free path. Could get rid of this if we change the API to return a
1933 : * "cookie" from alloc, to be passed to free. But no big deal yet.
1934 : */
1935 : static DEFINE_XARRAY(vmap_blocks);
1936 :
1937 : /*
1938 : * We should probably have a fallback mechanism to allocate virtual memory
1939 : * out of partially filled vmap blocks. However vmap block sizing should be
1940 : * fairly reasonable according to the vmalloc size, so it shouldn't be a
1941 : * big problem.
1942 : */
1943 :
1944 : static unsigned long addr_to_vb_idx(unsigned long addr)
1945 : {
1946 0 : addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
1947 0 : addr /= VMAP_BLOCK_SIZE;
1948 : return addr;
1949 : }
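 :
 : /*
 :  * Since vmap blocks are allocated VMAP_BLOCK_SIZE-aligned (see
 :  * new_vmap_block() below), every address inside a block maps to the
 :  * same index, so vb_free() can find the owning block from any address
 :  * within it via xa_load(&vmap_blocks, addr_to_vb_idx(addr)).
 :  */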
1950 :
1951 0 : static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
1952 : {
1953 : unsigned long addr;
1954 :
1955 0 : addr = va_start + (pages_off << PAGE_SHIFT);
1956 0 : BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
1957 0 : return (void *)addr;
1958 : }
1959 :
1960 : /**
 1961 : * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in it.
 1962 : * The number of pages, of course, cannot exceed VMAP_BBMAP_BITS.
 1963 : * @order: how many 2^order pages should be occupied in the newly allocated block
1964 : * @gfp_mask: flags for the page level allocator
1965 : *
1966 : * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
1967 : */
1968 0 : static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
1969 : {
1970 : struct vmap_block_queue *vbq;
1971 : struct vmap_block *vb;
1972 : struct vmap_area *va;
1973 : unsigned long vb_idx;
1974 : int node, err;
1975 : void *vaddr;
1976 :
1977 0 : node = numa_node_id();
1978 :
1979 0 : vb = kmalloc_node(sizeof(struct vmap_block),
1980 : gfp_mask & GFP_RECLAIM_MASK, node);
1981 0 : if (unlikely(!vb))
1982 : return ERR_PTR(-ENOMEM);
1983 :
1984 0 : va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
1985 0 : VMALLOC_START, VMALLOC_END,
1986 : node, gfp_mask,
1987 : VMAP_RAM|VMAP_BLOCK);
1988 0 : if (IS_ERR(va)) {
1989 0 : kfree(vb);
1990 0 : return ERR_CAST(va);
1991 : }
1992 :
1993 0 : vaddr = vmap_block_vaddr(va->va_start, 0);
1994 0 : spin_lock_init(&vb->lock);
1995 0 : vb->va = va;
1996 : /* At least something should be left free */
1997 0 : BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
1998 0 : bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
1999 0 : vb->free = VMAP_BBMAP_BITS - (1UL << order);
2000 0 : vb->dirty = 0;
2001 0 : vb->dirty_min = VMAP_BBMAP_BITS;
2002 0 : vb->dirty_max = 0;
2003 0 : bitmap_set(vb->used_map, 0, (1UL << order));
2004 0 : INIT_LIST_HEAD(&vb->free_list);
2005 :
2006 0 : vb_idx = addr_to_vb_idx(va->va_start);
2007 0 : err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
2008 0 : if (err) {
2009 0 : kfree(vb);
2010 0 : free_vmap_area(va);
2011 0 : return ERR_PTR(err);
2012 : }
2013 :
2014 0 : vbq = raw_cpu_ptr(&vmap_block_queue);
2015 0 : spin_lock(&vbq->lock);
2016 0 : list_add_tail_rcu(&vb->free_list, &vbq->free);
2017 0 : spin_unlock(&vbq->lock);
2018 :
2019 0 : return vaddr;
2020 : }
2021 :
2022 0 : static void free_vmap_block(struct vmap_block *vb)
2023 : {
2024 : struct vmap_block *tmp;
2025 :
2026 0 : tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
2027 0 : BUG_ON(tmp != vb);
2028 :
2029 0 : spin_lock(&vmap_area_lock);
2030 0 : unlink_va(vb->va, &vmap_area_root);
2031 0 : spin_unlock(&vmap_area_lock);
2032 :
2033 0 : free_vmap_area_noflush(vb->va);
2034 0 : kfree_rcu(vb, rcu_head);
2035 0 : }
2036 :
2037 0 : static void purge_fragmented_blocks(int cpu)
2038 : {
2039 0 : LIST_HEAD(purge);
2040 : struct vmap_block *vb;
2041 : struct vmap_block *n_vb;
2042 0 : struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2043 :
2044 : rcu_read_lock();
2045 0 : list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2046 :
2047 0 : if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
2048 0 : continue;
2049 :
2050 0 : spin_lock(&vb->lock);
2051 0 : if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
2052 0 : vb->free = 0; /* prevent further allocs after releasing lock */
2053 0 : vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
2054 0 : vb->dirty_min = 0;
2055 0 : vb->dirty_max = VMAP_BBMAP_BITS;
2056 0 : spin_lock(&vbq->lock);
2057 0 : list_del_rcu(&vb->free_list);
2058 0 : spin_unlock(&vbq->lock);
2059 0 : spin_unlock(&vb->lock);
2060 0 : list_add_tail(&vb->purge, &purge);
2061 : } else
2062 0 : spin_unlock(&vb->lock);
2063 : }
2064 : rcu_read_unlock();
2065 :
2066 0 : list_for_each_entry_safe(vb, n_vb, &purge, purge) {
2067 0 : list_del(&vb->purge);
2068 0 : free_vmap_block(vb);
2069 : }
2070 0 : }
2071 :
2072 : static void purge_fragmented_blocks_allcpus(void)
2073 : {
2074 : int cpu;
2075 :
2076 0 : for_each_possible_cpu(cpu)
2077 0 : purge_fragmented_blocks(cpu);
2078 : }
2079 :
2080 0 : static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
2081 : {
2082 : struct vmap_block_queue *vbq;
2083 : struct vmap_block *vb;
2084 0 : void *vaddr = NULL;
2085 : unsigned int order;
2086 :
2087 0 : BUG_ON(offset_in_page(size));
2088 0 : BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
2089 0 : if (WARN_ON(size == 0)) {
2090 : /*
 2091 : * Allocating 0 bytes isn't what the caller wants, since
 2092 : * get_order(0) returns a funny result. Just warn and return
 2093 : * early.
2094 : */
2095 : return NULL;
2096 : }
2097 0 : order = get_order(size);
2098 :
2099 : rcu_read_lock();
2100 0 : vbq = raw_cpu_ptr(&vmap_block_queue);
2101 0 : list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2102 : unsigned long pages_off;
2103 :
2104 0 : spin_lock(&vb->lock);
2105 0 : if (vb->free < (1UL << order)) {
2106 0 : spin_unlock(&vb->lock);
2107 0 : continue;
2108 : }
2109 :
2110 0 : pages_off = VMAP_BBMAP_BITS - vb->free;
2111 0 : vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
2112 0 : vb->free -= 1UL << order;
2113 0 : bitmap_set(vb->used_map, pages_off, (1UL << order));
2114 0 : if (vb->free == 0) {
2115 0 : spin_lock(&vbq->lock);
2116 0 : list_del_rcu(&vb->free_list);
2117 0 : spin_unlock(&vbq->lock);
2118 : }
2119 :
2120 0 : spin_unlock(&vb->lock);
2121 : break;
2122 : }
2123 :
2124 : rcu_read_unlock();
2125 :
2126 : /* Allocate new block if nothing was found */
2127 0 : if (!vaddr)
2128 0 : vaddr = new_vmap_block(order, gfp_mask);
2129 :
2130 : return vaddr;
2131 : }
2132 :
2133 0 : static void vb_free(unsigned long addr, unsigned long size)
2134 : {
2135 : unsigned long offset;
2136 : unsigned int order;
2137 : struct vmap_block *vb;
2138 :
2139 0 : BUG_ON(offset_in_page(size));
2140 0 : BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
2141 :
2142 0 : flush_cache_vunmap(addr, addr + size);
2143 :
2144 0 : order = get_order(size);
2145 0 : offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
2146 0 : vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
2147 0 : spin_lock(&vb->lock);
2148 0 : bitmap_clear(vb->used_map, offset, (1UL << order));
2149 0 : spin_unlock(&vb->lock);
2150 :
2151 0 : vunmap_range_noflush(addr, addr + size);
2152 :
2153 : if (debug_pagealloc_enabled_static())
2154 : flush_tlb_kernel_range(addr, addr + size);
2155 :
2156 0 : spin_lock(&vb->lock);
2157 :
2158 : /* Expand dirty range */
2159 0 : vb->dirty_min = min(vb->dirty_min, offset);
2160 0 : vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
2161 :
2162 0 : vb->dirty += 1UL << order;
2163 0 : if (vb->dirty == VMAP_BBMAP_BITS) {
2164 0 : BUG_ON(vb->free);
2165 0 : spin_unlock(&vb->lock);
2166 0 : free_vmap_block(vb);
2167 : } else
2168 0 : spin_unlock(&vb->lock);
2169 0 : }
2170 :
2171 0 : static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
2172 : {
2173 : int cpu;
2174 :
2175 0 : if (unlikely(!vmap_initialized))
2176 : return;
2177 :
2178 : might_sleep();
2179 :
2180 0 : for_each_possible_cpu(cpu) {
2181 0 : struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2182 : struct vmap_block *vb;
2183 :
2184 : rcu_read_lock();
2185 0 : list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2186 0 : spin_lock(&vb->lock);
2187 0 : if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
2188 0 : unsigned long va_start = vb->va->va_start;
2189 : unsigned long s, e;
2190 :
2191 0 : s = va_start + (vb->dirty_min << PAGE_SHIFT);
2192 0 : e = va_start + (vb->dirty_max << PAGE_SHIFT);
2193 :
2194 0 : start = min(s, start);
2195 0 : end = max(e, end);
2196 :
2197 0 : flush = 1;
2198 : }
2199 0 : spin_unlock(&vb->lock);
2200 : }
2201 : rcu_read_unlock();
2202 : }
2203 :
2204 0 : mutex_lock(&vmap_purge_lock);
2205 0 : purge_fragmented_blocks_allcpus();
2206 0 : if (!__purge_vmap_area_lazy(start, end) && flush)
2207 0 : flush_tlb_kernel_range(start, end);
2208 0 : mutex_unlock(&vmap_purge_lock);
2209 : }
2210 :
2211 : /**
2212 : * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
2213 : *
2214 : * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
2215 : * to amortize TLB flushing overheads. What this means is that any page you
 2216 : * have now may, in a former life, have been mapped into a kernel virtual
 2217 : * address by the vmap layer, and so there might be some CPUs with TLB entries
 2218 : * still referencing that page (in addition to the regular 1:1 kernel mapping).
2219 : *
2220 : * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
2221 : * be sure that none of the pages we have control over will have any aliases
2222 : * from the vmap layer.
2223 : */
2224 0 : void vm_unmap_aliases(void)
2225 : {
2226 0 : unsigned long start = ULONG_MAX, end = 0;
2227 0 : int flush = 0;
2228 :
2229 0 : _vm_unmap_aliases(start, end, flush);
2230 0 : }
2231 : EXPORT_SYMBOL_GPL(vm_unmap_aliases);
2232 :
2233 : /**
2234 : * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
2235 : * @mem: the pointer returned by vm_map_ram
2236 : * @count: the count passed to that vm_map_ram call (cannot unmap partial)
2237 : */
2238 0 : void vm_unmap_ram(const void *mem, unsigned int count)
2239 : {
2240 0 : unsigned long size = (unsigned long)count << PAGE_SHIFT;
2241 0 : unsigned long addr = (unsigned long)kasan_reset_tag(mem);
2242 : struct vmap_area *va;
2243 :
2244 : might_sleep();
2245 0 : BUG_ON(!addr);
2246 0 : BUG_ON(addr < VMALLOC_START);
2247 0 : BUG_ON(addr > VMALLOC_END);
2248 0 : BUG_ON(!PAGE_ALIGNED(addr));
2249 :
2250 0 : kasan_poison_vmalloc(mem, size);
2251 :
2252 0 : if (likely(count <= VMAP_MAX_ALLOC)) {
2253 0 : debug_check_no_locks_freed(mem, size);
2254 0 : vb_free(addr, size);
2255 0 : return;
2256 : }
2257 :
2258 0 : va = find_unlink_vmap_area(addr);
2259 0 : if (WARN_ON_ONCE(!va))
2260 : return;
2261 :
2262 0 : debug_check_no_locks_freed((void *)va->va_start,
2263 0 : (va->va_end - va->va_start));
2264 0 : free_unmap_vmap_area(va);
2265 : }
2266 : EXPORT_SYMBOL(vm_unmap_ram);
2267 :
2268 : /**
2269 : * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
2270 : * @pages: an array of pointers to the pages to be mapped
2271 : * @count: number of pages
2272 : * @node: prefer to allocate data structures on this node
2273 : *
 2274 : * If you use this function for fewer than VMAP_MAX_ALLOC pages, it can be
 2275 : * faster than vmap(). But if you mix long-lived and short-lived objects
 2276 : * with vm_map_ram(), it can consume a lot of address space through
 2277 : * fragmentation (especially on a 32-bit machine), eventually leading to
 2278 : * allocation failures. Please use this function only for short-lived objects.
2279 : *
2280 : * Returns: a pointer to the address that has been mapped, or %NULL on failure
2281 : */
2282 0 : void *vm_map_ram(struct page **pages, unsigned int count, int node)
2283 : {
2284 0 : unsigned long size = (unsigned long)count << PAGE_SHIFT;
2285 : unsigned long addr;
2286 : void *mem;
2287 :
2288 0 : if (likely(count <= VMAP_MAX_ALLOC)) {
2289 0 : mem = vb_alloc(size, GFP_KERNEL);
2290 0 : if (IS_ERR(mem))
2291 : return NULL;
2292 : addr = (unsigned long)mem;
2293 : } else {
2294 : struct vmap_area *va;
2295 0 : va = alloc_vmap_area(size, PAGE_SIZE,
2296 0 : VMALLOC_START, VMALLOC_END,
2297 : node, GFP_KERNEL, VMAP_RAM);
2298 0 : if (IS_ERR(va))
2299 : return NULL;
2300 :
2301 0 : addr = va->va_start;
2302 0 : mem = (void *)addr;
2303 : }
2304 :
2305 0 : if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
2306 : pages, PAGE_SHIFT) < 0) {
2307 0 : vm_unmap_ram(mem, count);
2308 0 : return NULL;
2309 : }
2310 :
2311 : /*
2312 : * Mark the pages as accessible, now that they are mapped.
2313 : * With hardware tag-based KASAN, marking is skipped for
2314 : * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
2315 : */
2316 : mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
2317 :
2318 : return mem;
2319 : }
2320 : EXPORT_SYMBOL(vm_map_ram);
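 :
 : /*
 :  * Example pairing (an illustrative sketch; "pages", "nr" and "fill()"
 :  * are hypothetical caller state, not part of this file):
 :  *
 :  *	void *va = vm_map_ram(pages, nr, NUMA_NO_NODE);
 :  *
 :  *	if (!va)
 :  *		return -ENOMEM;
 :  *	fill(va, (unsigned long)nr << PAGE_SHIFT);
 :  *	vm_unmap_ram(va, nr);
 :  *
 :  * The same @count must be passed to vm_unmap_ram(); partial unmaps are
 :  * not supported.
 :  */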
2321 :
2322 : static struct vm_struct *vmlist __initdata;
2323 :
2324 : static inline unsigned int vm_area_page_order(struct vm_struct *vm)
2325 : {
2326 : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
2327 : return vm->page_order;
2328 : #else
2329 : return 0;
2330 : #endif
2331 : }
2332 :
2333 274 : static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
2334 : {
2335 : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
2336 : vm->page_order = order;
2337 : #else
2338 274 : BUG_ON(order != 0);
2339 : #endif
2340 274 : }
2341 :
2342 : /**
2343 : * vm_area_add_early - add vmap area early during boot
2344 : * @vm: vm_struct to add
2345 : *
 2346 : * This function is used to add a fixed kernel vm area to vmlist before
2347 : * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
2348 : * should contain proper values and the other fields should be zero.
2349 : *
2350 : * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
2351 : */
2352 0 : void __init vm_area_add_early(struct vm_struct *vm)
2353 : {
2354 : struct vm_struct *tmp, **p;
2355 :
2356 0 : BUG_ON(vmap_initialized);
2357 0 : for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
2358 0 : if (tmp->addr >= vm->addr) {
2359 0 : BUG_ON(tmp->addr < vm->addr + vm->size);
2360 : break;
2361 : } else
2362 0 : BUG_ON(tmp->addr + tmp->size > vm->addr);
2363 : }
2364 0 : vm->next = *p;
2365 0 : *p = vm;
2366 0 : }
2367 :
2368 : /**
2369 : * vm_area_register_early - register vmap area early during boot
2370 : * @vm: vm_struct to register
2371 : * @align: requested alignment
2372 : *
2373 : * This function is used to register kernel vm area before
2374 : * vmalloc_init() is called. @vm->size and @vm->flags should contain
2375 : * proper values on entry and other fields should be zero. On return,
2376 : * vm->addr contains the allocated address.
2377 : *
2378 : * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
2379 : */
2380 0 : void __init vm_area_register_early(struct vm_struct *vm, size_t align)
2381 : {
2382 0 : unsigned long addr = ALIGN(VMALLOC_START, align);
2383 : struct vm_struct *cur, **p;
2384 :
2385 0 : BUG_ON(vmap_initialized);
2386 :
2387 0 : for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
2388 0 : if ((unsigned long)cur->addr - addr >= vm->size)
2389 : break;
2390 0 : addr = ALIGN((unsigned long)cur->addr + cur->size, align);
2391 : }
2392 :
2393 0 : BUG_ON(addr > VMALLOC_END - vm->size);
2394 0 : vm->addr = (void *)addr;
2395 0 : vm->next = *p;
2396 0 : *p = vm;
2397 0 : kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
2398 0 : }
2399 :
2400 1 : static void vmap_init_free_space(void)
2401 : {
2402 1 : unsigned long vmap_start = 1;
2403 1 : const unsigned long vmap_end = ULONG_MAX;
2404 : struct vmap_area *busy, *free;
2405 :
2406 : /*
2407 : * B F B B B F
2408 : * -|-----|.....|-----|-----|-----|.....|-
2409 : * | The KVA space |
2410 : * |<--------------------------------->|
2411 : */
2412 1 : list_for_each_entry(busy, &vmap_area_list, list) {
2413 0 : if (busy->va_start - vmap_start > 0) {
2414 0 : free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2415 0 : if (!WARN_ON_ONCE(!free)) {
2416 0 : free->va_start = vmap_start;
2417 0 : free->va_end = busy->va_start;
2418 :
2419 0 : insert_vmap_area_augment(free, NULL,
2420 : &free_vmap_area_root,
2421 : &free_vmap_area_list);
2422 : }
2423 : }
2424 :
2425 0 : vmap_start = busy->va_end;
2426 : }
2427 :
2428 1 : if (vmap_end - vmap_start > 0) {
2429 2 : free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2430 1 : if (!WARN_ON_ONCE(!free)) {
2431 1 : free->va_start = vmap_start;
2432 1 : free->va_end = vmap_end;
2433 :
2434 1 : insert_vmap_area_augment(free, NULL,
2435 : &free_vmap_area_root,
2436 : &free_vmap_area_list);
2437 : }
2438 : }
2439 1 : }
2440 :
2441 : static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
2442 : struct vmap_area *va, unsigned long flags, const void *caller)
2443 : {
2444 274 : vm->flags = flags;
2445 274 : vm->addr = (void *)va->va_start;
2446 274 : vm->size = va->va_end - va->va_start;
2447 274 : vm->caller = caller;
2448 274 : va->vm = vm;
2449 : }
2450 :
2451 : static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
2452 : unsigned long flags, const void *caller)
2453 : {
2454 274 : spin_lock(&vmap_area_lock);
2455 274 : setup_vmalloc_vm_locked(vm, va, flags, caller);
2456 274 : spin_unlock(&vmap_area_lock);
2457 : }
2458 :
2459 : static void clear_vm_uninitialized_flag(struct vm_struct *vm)
2460 : {
2461 : /*
2462 : * Before removing VM_UNINITIALIZED,
2463 : * we should make sure that vm has proper values.
2464 : * Pair with smp_rmb() in show_numa_info().
2465 : */
2466 274 : smp_wmb();
2467 274 : vm->flags &= ~VM_UNINITIALIZED;
2468 : }
2469 :
2470 274 : static struct vm_struct *__get_vm_area_node(unsigned long size,
2471 : unsigned long align, unsigned long shift, unsigned long flags,
2472 : unsigned long start, unsigned long end, int node,
2473 : gfp_t gfp_mask, const void *caller)
2474 : {
2475 : struct vmap_area *va;
2476 : struct vm_struct *area;
2477 274 : unsigned long requested_size = size;
2478 :
2479 274 : BUG_ON(in_interrupt());
2480 274 : size = ALIGN(size, 1ul << shift);
2481 274 : if (unlikely(!size))
2482 : return NULL;
2483 :
2484 274 : if (flags & VM_IOREMAP)
2485 0 : align = 1ul << clamp_t(int, get_count_order_long(size),
2486 : PAGE_SHIFT, IOREMAP_MAX_ORDER);
2487 :
2488 274 : area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
2489 274 : if (unlikely(!area))
2490 : return NULL;
2491 :
2492 274 : if (!(flags & VM_NO_GUARD))
2493 274 : size += PAGE_SIZE;
2494 :
2495 274 : va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
2496 274 : if (IS_ERR(va)) {
2497 0 : kfree(area);
2498 0 : return NULL;
2499 : }
2500 :
2501 274 : setup_vmalloc_vm(area, va, flags, caller);
2502 :
2503 : /*
2504 : * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
2505 : * best-effort approach, as they can be mapped outside of vmalloc code.
2506 : * For VM_ALLOC mappings, the pages are marked as accessible after
2507 : * getting mapped in __vmalloc_node_range().
2508 : * With hardware tag-based KASAN, marking is skipped for
2509 : * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
2510 : */
2511 274 : if (!(flags & VM_ALLOC))
2512 : area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
2513 : KASAN_VMALLOC_PROT_NORMAL);
2514 :
2515 : return area;
2516 : }
2517 :
2518 0 : struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
2519 : unsigned long start, unsigned long end,
2520 : const void *caller)
2521 : {
2522 0 : return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
2523 : NUMA_NO_NODE, GFP_KERNEL, caller);
2524 : }
2525 :
2526 : /**
2527 : * get_vm_area - reserve a contiguous kernel virtual area
2528 : * @size: size of the area
 2529 : * @flags: %VM_IOREMAP for I/O mappings or %VM_ALLOC
2530 : *
 2531 : * Search for an area of @size in the kernel virtual mapping area
 2532 : * and reserve it for our purposes. Returns the area descriptor
 2533 : * on success or %NULL on failure.
2534 : *
2535 : * Return: the area descriptor on success or %NULL on failure.
2536 : */
2537 0 : struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
2538 : {
2539 0 : return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
2540 0 : VMALLOC_START, VMALLOC_END,
2541 : NUMA_NO_NODE, GFP_KERNEL,
2542 0 : __builtin_return_address(0));
2543 : }
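 :
 : /*
 :  * Sketch (illustrative only): reserve kernel VA without backing it,
 :  * map it by other means, then drop the reservation with free_vm_area():
 :  *
 :  *	struct vm_struct *area = get_vm_area(SZ_1M, VM_IOREMAP);
 :  *
 :  *	if (!area)
 :  *		return -ENOMEM;
 :  *	...
 :  *	free_vm_area(area);
 :  *
 :  * area->addr points at SZ_1M of reserved KVA plus the guard page.
 :  */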
2544 :
2545 0 : struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
2546 : const void *caller)
2547 : {
2548 0 : return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
2549 0 : VMALLOC_START, VMALLOC_END,
2550 : NUMA_NO_NODE, GFP_KERNEL, caller);
2551 : }
2552 :
2553 : /**
 2554 : * find_vm_area - find a contiguous kernel virtual area
2555 : * @addr: base address
2556 : *
2557 : * Search for the kernel VM area starting at @addr, and return it.
2558 : * It is up to the caller to do all required locking to keep the returned
2559 : * pointer valid.
2560 : *
2561 : * Return: the area descriptor on success or %NULL on failure.
2562 : */
2563 16 : struct vm_struct *find_vm_area(const void *addr)
2564 : {
2565 : struct vmap_area *va;
2566 :
2567 32 : va = find_vmap_area((unsigned long)addr);
2568 16 : if (!va)
2569 : return NULL;
2570 :
2571 16 : return va->vm;
2572 : }
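 :
 : /*
 :  * Sketch (illustrative): recover the usable size of a vmalloc'ed
 :  * object from its address:
 :  *
 :  *	struct vm_struct *vm = find_vm_area(ptr);
 :  *	unsigned long size = vm ? get_vm_area_size(vm) : 0;
 :  *
 :  * get_vm_area_size() excludes the guard page, if the area has one.
 :  */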
2573 :
2574 : /**
 2575 : * remove_vm_area - find and remove a contiguous kernel virtual area
2576 : * @addr: base address
2577 : *
2578 : * Search for the kernel VM area starting at @addr, and remove it.
2579 : * This function returns the found VM area, but using it is NOT safe
2580 : * on SMP machines, except for its size or flags.
2581 : *
2582 : * Return: the area descriptor on success or %NULL on failure.
2583 : */
2584 258 : struct vm_struct *remove_vm_area(const void *addr)
2585 : {
2586 : struct vmap_area *va;
2587 : struct vm_struct *vm;
2588 :
2589 : might_sleep();
2590 :
2591 258 : if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
2592 : addr))
2593 : return NULL;
2594 :
2595 258 : va = find_unlink_vmap_area((unsigned long)addr);
2596 258 : if (!va || !va->vm)
2597 : return NULL;
2598 258 : vm = va->vm;
2599 :
2600 258 : debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
2601 258 : debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
2602 258 : kasan_free_module_shadow(vm);
2603 258 : kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));
2604 :
2605 258 : free_unmap_vmap_area(va);
2606 258 : return vm;
2607 : }
2608 :
2609 0 : static inline void set_area_direct_map(const struct vm_struct *area,
2610 : int (*set_direct_map)(struct page *page))
2611 : {
2612 : int i;
2613 :
2614 : /* HUGE_VMALLOC passes small pages to set_direct_map */
2615 0 : for (i = 0; i < area->nr_pages; i++)
2616 0 : if (page_address(area->pages[i]))
2617 0 : set_direct_map(area->pages[i]);
2618 0 : }
2619 :
2620 : /*
2621 : * Flush the vm mapping and reset the direct map.
2622 : */
2623 0 : static void vm_reset_perms(struct vm_struct *area)
2624 : {
2625 0 : unsigned long start = ULONG_MAX, end = 0;
2626 0 : unsigned int page_order = vm_area_page_order(area);
2627 0 : int flush_dmap = 0;
2628 : int i;
2629 :
2630 : /*
2631 : * Find the start and end range of the direct mappings to make sure that
2632 : * the vm_unmap_aliases() flush includes the direct map.
2633 : */
2634 0 : for (i = 0; i < area->nr_pages; i += 1U << page_order) {
2635 0 : unsigned long addr = (unsigned long)page_address(area->pages[i]);
2636 :
2637 0 : if (addr) {
2638 : unsigned long page_size;
2639 :
2640 0 : page_size = PAGE_SIZE << page_order;
2641 0 : start = min(addr, start);
2642 0 : end = max(addr + page_size, end);
2643 0 : flush_dmap = 1;
2644 : }
2645 : }
2646 :
2647 : /*
2648 : * Set direct map to something invalid so that it won't be cached if
2649 : * there are any accesses after the TLB flush, then flush the TLB and
2650 : * reset the direct map permissions to the default.
2651 : */
2652 0 : set_area_direct_map(area, set_direct_map_invalid_noflush);
2653 0 : _vm_unmap_aliases(start, end, flush_dmap);
2654 0 : set_area_direct_map(area, set_direct_map_default_noflush);
2655 0 : }
2656 :
2657 0 : static void delayed_vfree_work(struct work_struct *w)
2658 : {
2659 0 : struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
2660 : struct llist_node *t, *llnode;
2661 :
2662 0 : llist_for_each_safe(llnode, t, llist_del_all(&p->list))
2663 0 : vfree(llnode);
2664 0 : }
2665 :
2666 : /**
2667 : * vfree_atomic - release memory allocated by vmalloc()
2668 : * @addr: memory base address
2669 : *
2670 : * This one is just like vfree() but can be called in any atomic context
2671 : * except NMIs.
2672 : */
2673 0 : void vfree_atomic(const void *addr)
2674 : {
2675 0 : struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
2676 :
2677 0 : BUG_ON(in_nmi());
2678 0 : kmemleak_free(addr);
2679 :
2680 : /*
2681 : * Use raw_cpu_ptr() because this can be called from preemptible
2682 : * context. Preemption is absolutely fine here, because the llist_add()
2683 : * implementation is lockless, so it works even if we are adding to
2684 : * another cpu's list. schedule_work() should be fine with this too.
2685 : */
2686 0 : if (addr && llist_add((struct llist_node *)addr, &p->list))
2687 0 : schedule_work(&p->wq);
2688 0 : }
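 :
 : /*
 :  * Illustrative (the "ctx->scratch" buffer is hypothetical): a timer or
 :  * IRQ handler that has to release a vmalloc'ed buffer can call
 :  *
 :  *	vfree_atomic(ctx->scratch);
 :  *
 :  * and the actual free is deferred to the per-CPU vfree_deferred work.
 :  */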
2689 :
2690 : /**
2691 : * vfree - Release memory allocated by vmalloc()
2692 : * @addr: Memory base address
2693 : *
2694 : * Free the virtually continuous memory area starting at @addr, as obtained
2695 : * from one of the vmalloc() family of APIs. This will usually also free the
2696 : * physical memory underlying the virtual allocation, but that memory is
2697 : * reference counted, so it will not be freed until the last user goes away.
2698 : *
2699 : * If @addr is NULL, no operation is performed.
2700 : *
2701 : * Context:
2702 : * May sleep if called *not* from interrupt context.
2703 : * Must not be called in NMI context (strictly speaking, it could be
2704 : * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
2705 : * conventions for vfree() arch-dependent would be a really bad idea).
2706 : */
2707 258 : void vfree(const void *addr)
2708 : {
2709 : struct vm_struct *vm;
2710 : int i;
2711 :
2712 258 : if (unlikely(in_interrupt())) {
2713 0 : vfree_atomic(addr);
2714 0 : return;
2715 : }
2716 :
2717 258 : BUG_ON(in_nmi());
2718 258 : kmemleak_free(addr);
2719 : might_sleep();
2720 :
2721 258 : if (!addr)
2722 : return;
2723 :
2724 258 : vm = remove_vm_area(addr);
2725 258 : if (unlikely(!vm)) {
2726 0 : WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
2727 : addr);
2728 0 : return;
2729 : }
2730 :
2731 258 : if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
2732 0 : vm_reset_perms(vm);
2733 42477 : for (i = 0; i < vm->nr_pages; i++) {
2734 42477 : struct page *page = vm->pages[i];
2735 :
2736 42477 : BUG_ON(!page);
2737 42477 : mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
2738 : /*
2739 : * High-order allocs for huge vmallocs are split, so
 2740 : * they can be freed as an array of order-0 allocations.
2741 : */
2742 42477 : __free_pages(page, 0);
2743 42477 : cond_resched();
2744 : }
2745 516 : atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
2746 258 : kvfree(vm->pages);
2747 258 : kfree(vm);
2748 : }
2749 : EXPORT_SYMBOL(vfree);
2750 :
2751 : /**
2752 : * vunmap - release virtual mapping obtained by vmap()
2753 : * @addr: memory base address
2754 : *
2755 : * Free the virtually contiguous memory area starting at @addr,
2756 : * which was created from the page array passed to vmap().
2757 : *
2758 : * Must not be called in interrupt context.
2759 : */
2760 0 : void vunmap(const void *addr)
2761 : {
2762 : struct vm_struct *vm;
2763 :
2764 0 : BUG_ON(in_interrupt());
2765 : might_sleep();
2766 :
2767 0 : if (!addr)
2768 : return;
2769 0 : vm = remove_vm_area(addr);
2770 0 : if (unlikely(!vm)) {
2771 0 : WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
2772 : addr);
2773 0 : return;
2774 : }
2775 0 : kfree(vm);
2776 : }
2777 : EXPORT_SYMBOL(vunmap);
2778 :
2779 : /**
2780 : * vmap - map an array of pages into virtually contiguous space
2781 : * @pages: array of page pointers
2782 : * @count: number of pages to map
2783 : * @flags: vm_area->flags
2784 : * @prot: page protection for the mapping
2785 : *
2786 : * Maps @count pages from @pages into contiguous kernel virtual space.
2787 : * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
 2788 : * (which must be kmalloc or vmalloc memory) and one reference per page in it
2789 : * are transferred from the caller to vmap(), and will be freed / dropped when
2790 : * vfree() is called on the return value.
2791 : *
2792 : * Return: the address of the area or %NULL on failure
2793 : */
2794 0 : void *vmap(struct page **pages, unsigned int count,
2795 : unsigned long flags, pgprot_t prot)
2796 : {
2797 : struct vm_struct *area;
2798 : unsigned long addr;
2799 : unsigned long size; /* In bytes */
2800 :
2801 : might_sleep();
2802 :
2803 0 : if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
2804 : return NULL;
2805 :
2806 : /*
2807 : * Your top guard is someone else's bottom guard. Not having a top
2808 : * guard compromises someone else's mappings too.
2809 : */
2810 0 : if (WARN_ON_ONCE(flags & VM_NO_GUARD))
2811 0 : flags &= ~VM_NO_GUARD;
2812 :
2813 0 : if (count > totalram_pages())
2814 : return NULL;
2815 :
2816 0 : size = (unsigned long)count << PAGE_SHIFT;
2817 0 : area = get_vm_area_caller(size, flags, __builtin_return_address(0));
2818 0 : if (!area)
2819 : return NULL;
2820 :
2821 0 : addr = (unsigned long)area->addr;
2822 0 : if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
2823 : pages, PAGE_SHIFT) < 0) {
2824 0 : vunmap(area->addr);
2825 0 : return NULL;
2826 : }
2827 :
2828 0 : if (flags & VM_MAP_PUT_PAGES) {
2829 0 : area->pages = pages;
2830 0 : area->nr_pages = count;
2831 : }
2832 0 : return area->addr;
2833 : }
2834 : EXPORT_SYMBOL(vmap);
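 :
 : /*
 :  * Illustrative use ("pages" and "nr_pages" are hypothetical): map pages
 :  * a driver already owns into one virtually contiguous kernel range:
 :  *
 :  *	void *va = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
 :  *
 :  *	if (!va)
 :  *		return -ENOMEM;
 :  *	...
 :  *	vunmap(va);
 :  */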
2835 :
2836 : #ifdef CONFIG_VMAP_PFN
2837 : struct vmap_pfn_data {
2838 : unsigned long *pfns;
2839 : pgprot_t prot;
2840 : unsigned int idx;
2841 : };
2842 :
2843 : static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
2844 : {
2845 : struct vmap_pfn_data *data = private;
2846 :
2847 : if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
2848 : return -EINVAL;
2849 : *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
2850 : return 0;
2851 : }
2852 :
2853 : /**
2854 : * vmap_pfn - map an array of PFNs into virtually contiguous space
2855 : * @pfns: array of PFNs
2856 : * @count: number of pages to map
2857 : * @prot: page protection for the mapping
2858 : *
2859 : * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
2860 : * the start address of the mapping.
2861 : */
2862 : void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
2863 : {
2864 : struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
2865 : struct vm_struct *area;
2866 :
2867 : area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
2868 : __builtin_return_address(0));
2869 : if (!area)
2870 : return NULL;
2871 : if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2872 : count * PAGE_SIZE, vmap_pfn_apply, &data)) {
2873 : free_vm_area(area);
2874 : return NULL;
2875 : }
2876 : return area->addr;
2877 : }
2878 : EXPORT_SYMBOL_GPL(vmap_pfn);
2879 : #endif /* CONFIG_VMAP_PFN */
2880 :
2881 : static inline unsigned int
2882 274 : vm_area_alloc_pages(gfp_t gfp, int nid,
2883 : unsigned int order, unsigned int nr_pages, struct page **pages)
2884 : {
2885 274 : unsigned int nr_allocated = 0;
2886 : struct page *page;
2887 : int i;
2888 :
2889 : /*
 2890 : * For order-0 pages we make use of the bulk allocator. If
 2891 : * the page array ends up only partly populated (or not at
 2892 : * all) due to failures, fall back to the single page
 2893 : * allocator, which is more permissive.
2894 : */
2895 274 : if (!order) {
2896 274 : gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
2897 :
2898 1144 : while (nr_allocated < nr_pages) {
2899 : unsigned int nr, nr_pages_request;
2900 :
2901 : /*
 2902 : * The maximum allowed request is hard-coded to 100
 2903 : * pages per call. That is done in order to prevent
 2904 : * long preemption-off periods in the bulk allocator,
 2905 : * so the request range is [1:100].
2906 : */
2907 596 : nr_pages_request = min(100U, nr_pages - nr_allocated);
2908 :
 2909 : /* Memory allocation should honour the mempolicy: we must not
 2910 : * blindly use the nearest node when nid == NUMA_NO_NODE,
 2911 : * otherwise memory may be allocated on only one node while
 2912 : * the mempolicy wants memory interleaved across nodes.
2913 : */
2914 : if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
2915 : nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
2916 : nr_pages_request,
2917 : pages + nr_allocated);
2918 :
2919 : else
2920 1192 : nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
2921 : nr_pages_request,
2922 596 : pages + nr_allocated);
2923 :
2924 596 : nr_allocated += nr;
2925 596 : cond_resched();
2926 :
2927 : /*
 2928 : * If no pages or only part of the request were obtained,
 2929 : * fall back to the single page allocator.
2930 : */
2931 596 : if (nr != nr_pages_request)
2932 : break;
2933 : }
2934 : }
2935 :
2936 : /* High-order pages or fallback path if "bulk" fails. */
2937 :
2938 274 : while (nr_allocated < nr_pages) {
2939 0 : if (fatal_signal_pending(current))
2940 : break;
2941 :
2942 0 : if (nid == NUMA_NO_NODE)
2943 0 : page = alloc_pages(gfp, order);
2944 : else
2945 0 : page = alloc_pages_node(nid, gfp, order);
2946 0 : if (unlikely(!page))
2947 : break;
2948 : /*
2949 : * Higher order allocations must be able to be treated as
 2950 : * independent small pages by callers (as they can with
2951 : * small-page vmallocs). Some drivers do their own refcounting
2952 : * on vmalloc_to_page() pages, some use page->mapping,
2953 : * page->lru, etc.
2954 : */
2955 0 : if (order)
2956 0 : split_page(page, order);
2957 :
2958 : /*
2959 : * Careful, we allocate and map page-order pages, but
2960 : * tracking is done per PAGE_SIZE page so as to keep the
2961 : * vm_struct APIs independent of the physical/mapped size.
2962 : */
2963 0 : for (i = 0; i < (1U << order); i++)
2964 0 : pages[nr_allocated + i] = page + i;
2965 :
2966 0 : cond_resched();
2967 0 : nr_allocated += 1U << order;
2968 : }
2969 :
2970 274 : return nr_allocated;
2971 : }
2972 :
2973 274 : static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
2974 : pgprot_t prot, unsigned int page_shift,
2975 : int node)
2976 : {
2977 274 : const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
2978 274 : bool nofail = gfp_mask & __GFP_NOFAIL;
2979 274 : unsigned long addr = (unsigned long)area->addr;
2980 548 : unsigned long size = get_vm_area_size(area);
2981 : unsigned long array_size;
2982 274 : unsigned int nr_small_pages = size >> PAGE_SHIFT;
2983 : unsigned int page_order;
2984 : unsigned int flags;
2985 : int ret;
2986 :
2987 274 : array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
2988 :
2989 274 : if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
2990 274 : gfp_mask |= __GFP_HIGHMEM;
2991 :
2992 : /* Please note that the recursion is strictly bounded. */
2993 274 : if (array_size > PAGE_SIZE) {
2994 1 : area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
2995 : area->caller);
2996 : } else {
2997 273 : area->pages = kmalloc_node(array_size, nested_gfp, node);
2998 : }
2999 :
3000 274 : if (!area->pages) {
3001 0 : warn_alloc(gfp_mask, NULL,
 3002 : "vmalloc error: size %lu, failed to allocate page array size %lu",
3003 : nr_small_pages * PAGE_SIZE, array_size);
3004 0 : free_vm_area(area);
3005 0 : return NULL;
3006 : }
3007 :
3008 274 : set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
3009 274 : page_order = vm_area_page_order(area);
3010 :
3011 274 : area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
3012 : node, page_order, nr_small_pages, area->pages);
3013 :
3014 548 : atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
3015 274 : if (gfp_mask & __GFP_ACCOUNT) {
3016 : int i;
3017 :
3018 0 : for (i = 0; i < area->nr_pages; i++)
3019 0 : mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
3020 : }
3021 :
3022 : /*
 3023 : * If not enough pages were obtained to satisfy the
 3024 : * allocation request, free any that were via vfree().
3025 : */
3026 274 : if (area->nr_pages != nr_small_pages) {
3027 0 : warn_alloc(gfp_mask, NULL,
3028 : "vmalloc error: size %lu, page order %u, failed to allocate pages",
3029 0 : area->nr_pages * PAGE_SIZE, page_order);
3030 0 : goto fail;
3031 : }
3032 :
3033 : /*
 3034 : * Page table allocations ignore the external gfp mask, so
 3035 : * enforce it via the scope API.
3036 : */
3037 274 : if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
3038 0 : flags = memalloc_nofs_save();
3039 274 : else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
3040 0 : flags = memalloc_noio_save();
3041 :
3042 : do {
3043 274 : ret = vmap_pages_range(addr, addr + size, prot, area->pages,
3044 : page_shift);
3045 274 : if (nofail && (ret < 0))
3046 0 : schedule_timeout_uninterruptible(1);
3047 274 : } while (nofail && (ret < 0));
3048 :
3049 274 : if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
3050 : memalloc_nofs_restore(flags);
3051 274 : else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
3052 : memalloc_noio_restore(flags);
3053 :
3054 274 : if (ret < 0) {
3055 0 : warn_alloc(gfp_mask, NULL,
3056 : "vmalloc error: size %lu, failed to map pages",
3057 0 : area->nr_pages * PAGE_SIZE);
3058 0 : goto fail;
3059 : }
3060 :
3061 274 : return area->addr;
3062 :
3063 : fail:
3064 0 : vfree(area->addr);
3065 0 : return NULL;
3066 : }
3067 :
3068 : /**
3069 : * __vmalloc_node_range - allocate virtually contiguous memory
3070 : * @size: allocation size
3071 : * @align: desired alignment
3072 : * @start: vm area range start
3073 : * @end: vm area range end
3074 : * @gfp_mask: flags for the page level allocator
3075 : * @prot: protection mask for the allocated pages
3076 : * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
3077 : * @node: node to use for allocation or NUMA_NO_NODE
3078 : * @caller: caller's return address
3079 : *
3080 : * Allocate enough pages to cover @size from the page level
3081 : * allocator with @gfp_mask flags. Please note that the full set of gfp
3082 : * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
3083 : * supported.
3084 : * Zone modifiers are not supported. From the reclaim modifiers
3085 : * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
3086 : * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
3087 : * __GFP_RETRY_MAYFAIL are not supported).
3088 : *
 3089 : * __GFP_NOWARN can be used to suppress failure messages.
3090 : *
3091 : * Map them into contiguous kernel virtual space, using a pagetable
3092 : * protection of @prot.
3093 : *
3094 : * Return: the address of the area or %NULL on failure
3095 : */
3096 274 : void *__vmalloc_node_range(unsigned long size, unsigned long align,
3097 : unsigned long start, unsigned long end, gfp_t gfp_mask,
3098 : pgprot_t prot, unsigned long vm_flags, int node,
3099 : const void *caller)
3100 : {
3101 : struct vm_struct *area;
3102 : void *ret;
3103 274 : kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
3104 274 : unsigned long real_size = size;
3105 274 : unsigned long real_align = align;
3106 274 : unsigned int shift = PAGE_SHIFT;
3107 :
3108 274 : if (WARN_ON_ONCE(!size))
3109 : return NULL;
3110 :
3111 548 : if ((size >> PAGE_SHIFT) > totalram_pages()) {
3112 0 : warn_alloc(gfp_mask, NULL,
3113 : "vmalloc error: size %lu, exceeds total pages",
3114 : real_size);
3115 0 : return NULL;
3116 : }
3117 :
3118 : if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
3119 : unsigned long size_per_node;
3120 :
3121 : /*
3122 : * Try huge pages. Only try for PAGE_KERNEL allocations,
3123 : * others like modules don't yet expect huge pages in
3124 : * their allocations due to apply_to_page_range not
3125 : * supporting them.
3126 : */
3127 :
3128 : size_per_node = size;
3129 : if (node == NUMA_NO_NODE)
3130 : size_per_node /= num_online_nodes();
3131 : if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
3132 : shift = PMD_SHIFT;
3133 : else
3134 : shift = arch_vmap_pte_supported_shift(size_per_node);
3135 :
3136 : align = max(real_align, 1UL << shift);
3137 : size = ALIGN(real_size, 1UL << shift);
3138 : }
3139 :
3140 : again:
3141 274 : area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
3142 : VM_UNINITIALIZED | vm_flags, start, end, node,
3143 : gfp_mask, caller);
3144 274 : if (!area) {
3145 0 : bool nofail = gfp_mask & __GFP_NOFAIL;
3146 0 : warn_alloc(gfp_mask, NULL,
3147 : "vmalloc error: size %lu, vm_struct allocation failed%s",
3148 : real_size, (nofail) ? ". Retrying." : "");
3149 0 : if (nofail) {
3150 0 : schedule_timeout_uninterruptible(1);
3151 0 : goto again;
3152 : }
3153 : goto fail;
3154 : }
3155 :
3156 : /*
3157 : * Prepare arguments for __vmalloc_area_node() and
3158 : * kasan_unpoison_vmalloc().
3159 : */
3160 274 : if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
3161 : if (kasan_hw_tags_enabled()) {
3162 : /*
3163 : * Modify protection bits to allow tagging.
3164 : * This must be done before mapping.
3165 : */
3166 : prot = arch_vmap_pgprot_tagged(prot);
3167 :
3168 : /*
3169 : * Skip page_alloc poisoning and zeroing for physical
3170 : * pages backing VM_ALLOC mapping. Memory is instead
3171 : * poisoned and zeroed by kasan_unpoison_vmalloc().
3172 : */
3173 : gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
3174 : }
3175 :
3176 : /* Take note that the mapping is PAGE_KERNEL. */
3177 : kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
3178 : }
3179 :
3180 : /* Allocate physical pages and map them into vmalloc space. */
3181 274 : ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
3182 274 : if (!ret)
3183 : goto fail;
3184 :
3185 : /*
3186 : * Mark the pages as accessible, now that they are mapped.
3187 : * The condition for setting KASAN_VMALLOC_INIT should complement the
3188 : * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
3189 : * to make sure that memory is initialized under the same conditions.
3190 : * Tag-based KASAN modes only assign tags to normal non-executable
3191 : * allocations, see __kasan_unpoison_vmalloc().
3192 : */
3193 274 : kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
3194 548 : if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
3195 : (gfp_mask & __GFP_SKIP_ZERO))
3196 : kasan_flags |= KASAN_VMALLOC_INIT;
3197 : /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
3198 274 : area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
3199 :
3200 : /*
3201 : * In this function, newly allocated vm_struct has VM_UNINITIALIZED
3202 : * flag. It means that vm_struct is not fully initialized.
3203 : * Now, it is fully initialized, so remove this flag here.
3204 : */
3205 274 : clear_vm_uninitialized_flag(area);
3206 :
3207 274 : size = PAGE_ALIGN(size);
3208 : if (!(vm_flags & VM_DEFER_KMEMLEAK))
3209 274 : kmemleak_vmalloc(area, size, gfp_mask);
3210 :
3211 274 : return area->addr;
3212 :
3213 : fail:
3214 : if (shift > PAGE_SHIFT) {
3215 : shift = PAGE_SHIFT;
3216 : align = real_align;
3217 : size = real_size;
3218 : goto again;
3219 : }
3220 :
3221 : return NULL;
3222 : }
3223 :
3224 : /**
3225 : * __vmalloc_node - allocate virtually contiguous memory
3226 : * @size: allocation size
3227 : * @align: desired alignment
3228 : * @gfp_mask: flags for the page level allocator
3229 : * @node: node to use for allocation or NUMA_NO_NODE
3230 : * @caller: caller's return address
3231 : *
3232 : * Allocate enough pages to cover @size from the page level allocator with
3233 : * @gfp_mask flags. Map them into contiguous kernel virtual space.
3234 : *
3235 : * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
3236 : * and __GFP_NOFAIL are not supported
3237 : *
 3238 : * Any use of gfp flags outside of GFP_KERNEL should be discussed
 3239 : * with the mm people.
3240 : *
3241 : * Return: pointer to the allocated memory or %NULL on error
3242 : */
3243 258 : void *__vmalloc_node(unsigned long size, unsigned long align,
3244 : gfp_t gfp_mask, int node, const void *caller)
3245 : {
3246 258 : return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
3247 258 : gfp_mask, PAGE_KERNEL, 0, node, caller);
3248 : }
3249 : /*
 3250 : * This is only for performance analysis of vmalloc and stress purposes.
 3251 : * It is required by the vmalloc test module; therefore, do not use it
 3252 : * for anything else.
3253 : */
3254 : #ifdef CONFIG_TEST_VMALLOC_MODULE
3255 : EXPORT_SYMBOL_GPL(__vmalloc_node);
3256 : #endif
3257 :
3258 0 : void *__vmalloc(unsigned long size, gfp_t gfp_mask)
3259 : {
3260 0 : return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
3261 0 : __builtin_return_address(0));
3262 : }
3263 : EXPORT_SYMBOL(__vmalloc);
3264 :
3265 : /**
3266 : * vmalloc - allocate virtually contiguous memory
3267 : * @size: allocation size
3268 : *
3269 : * Allocate enough pages to cover @size from the page level
3270 : * allocator and map them into contiguous kernel virtual space.
3271 : *
3272 : * For tight control over page level allocator and protection flags
3273 : * use __vmalloc() instead.
3274 : *
3275 : * Return: pointer to the allocated memory or %NULL on error
3276 : */
3277 102 : void *vmalloc(unsigned long size)
3278 : {
3279 102 : return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
3280 102 : __builtin_return_address(0));
3281 : }
3282 : EXPORT_SYMBOL(vmalloc);
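 :
 : /*
 :  * Typical pattern (illustrative; "nmemb" and "size" are hypothetical):
 :  *
 :  *	buf = vmalloc(array_size(nmemb, size));
 :  *	if (!buf)
 :  *		return -ENOMEM;
 :  *	...
 :  *	vfree(buf);
 :  */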
3283 :
3284 : /**
3285 : * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
3286 : * @size: allocation size
3287 : * @gfp_mask: flags for the page level allocator
3288 : *
3289 : * Allocate enough pages to cover @size from the page level
3290 : * allocator and map them into contiguous kernel virtual space.
3291 : * If @size is greater than or equal to PMD_SIZE, allow using
3292 : * huge pages for the memory
3293 : *
3294 : * Return: pointer to the allocated memory or %NULL on error
3295 : */
3296 0 : void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
3297 : {
3298 0 : return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
3299 0 : gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
3300 0 : NUMA_NO_NODE, __builtin_return_address(0));
3301 : }
3302 : EXPORT_SYMBOL_GPL(vmalloc_huge);
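 :
 : /*
 :  * Illustrative: a multi-megabyte table that benefits from PMD mappings
 :  * where the architecture supports them:
 :  *
 :  *	table = vmalloc_huge(64UL << 20, GFP_KERNEL);
 :  *
 :  * If a huge mapping cannot be used, the allocation transparently falls
 :  * back to small pages, so callers need no special handling.
 :  */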
3303 :
3304 : /**
3305 : * vzalloc - allocate virtually contiguous memory with zero fill
3306 : * @size: allocation size
3307 : *
3308 : * Allocate enough pages to cover @size from the page level
3309 : * allocator and map them into contiguous kernel virtual space.
3310 : * The memory allocated is set to zero.
3311 : *
3312 : * For tight control over page level allocator and protection flags
3313 : * use __vmalloc() instead.
3314 : *
3315 : * Return: pointer to the allocated memory or %NULL on error
3316 : */
3317 155 : void *vzalloc(unsigned long size)
3318 : {
3319 155 : return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
3320 155 : __builtin_return_address(0));
3321 : }
3322 : EXPORT_SYMBOL(vzalloc);
3323 :
3324 : /**
3325 : * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
3326 : * @size: allocation size
3327 : *
3328 : * The resulting memory area is zeroed so it can be mapped to userspace
3329 : * without leaking data.
3330 : *
3331 : * Return: pointer to the allocated memory or %NULL on error
3332 : */
3333 0 : void *vmalloc_user(unsigned long size)
3334 : {
3335 0 : return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
3336 0 : GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
3337 : VM_USERMAP, NUMA_NO_NODE,
3338 0 : __builtin_return_address(0));
3339 : }
3340 : EXPORT_SYMBOL(vmalloc_user);
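 :
 : /*
 :  * Sketch of the usual consumer (assumes a driver .mmap handler; names
 :  * are hypothetical):
 :  *
 :  *	buf = vmalloc_user(vma->vm_end - vma->vm_start);
 :  *	if (!buf)
 :  *		return -ENOMEM;
 :  *	return remap_vmalloc_range(vma, buf, 0);
 :  *
 :  * remap_vmalloc_range() requires the VM_USERMAP flag that
 :  * vmalloc_user() sets.
 :  */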
3341 :
3342 : /**
3343 : * vmalloc_node - allocate memory on a specific node
3344 : * @size: allocation size
3345 : * @node: numa node
3346 : *
3347 : * Allocate enough pages to cover @size from the page level
3348 : * allocator and map them into contiguous kernel virtual space.
3349 : *
3350 : * For tight control over page level allocator and protection flags
3351 : * use __vmalloc() instead.
3352 : *
3353 : * Return: pointer to the allocated memory or %NULL on error
3354 : */
3355 0 : void *vmalloc_node(unsigned long size, int node)
3356 : {
3357 0 : return __vmalloc_node(size, 1, GFP_KERNEL, node,
3358 0 : __builtin_return_address(0));
3359 : }
3360 : EXPORT_SYMBOL(vmalloc_node);
3361 :
3362 : /**
3363 : * vzalloc_node - allocate memory on a specific node with zero fill
3364 : * @size: allocation size
3365 : * @node: numa node
3366 : *
3367 : * Allocate enough pages to cover @size from the page level
3368 : * allocator and map them into contiguous kernel virtual space.
3369 : * The memory allocated is set to zero.
3370 : *
3371 : * Return: pointer to the allocated memory or %NULL on error
3372 : */
3373 0 : void *vzalloc_node(unsigned long size, int node)
3374 : {
3375 0 : return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
3376 0 : __builtin_return_address(0));
3377 : }
3378 : EXPORT_SYMBOL(vzalloc_node);
3379 :
3380 : #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
3381 : #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
3382 : #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
3383 : #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
3384 : #else
3385 : /*
3386 : * 64b systems should always have either DMA or DMA32 zones. For others
3387 : * GFP_DMA32 should do the right thing and use the normal zone.
3388 : */
3389 : #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
3390 : #endif
3391 :
3392 : /**
3393 : * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
3394 : * @size: allocation size
3395 : *
3396 : * Allocate enough 32bit PA addressable pages to cover @size from the
3397 : * page level allocator and map them into contiguous kernel virtual space.
3398 : *
3399 : * Return: pointer to the allocated memory or %NULL on error
3400 : */
3401 0 : void *vmalloc_32(unsigned long size)
3402 : {
3403 0 : return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
3404 0 : __builtin_return_address(0));
3405 : }
3406 : EXPORT_SYMBOL(vmalloc_32);
3407 :
3408 : /**
3409 : * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
3410 : * @size: allocation size
3411 : *
3412 : * The resulting memory area is 32bit addressable and zeroed so it can be
3413 : * mapped to userspace without leaking data.
3414 : *
3415 : * Return: pointer to the allocated memory or %NULL on error
3416 : */
3417 0 : void *vmalloc_32_user(unsigned long size)
3418 : {
3419 0 : return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
3420 0 : GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
3421 : VM_USERMAP, NUMA_NO_NODE,
3422 0 : __builtin_return_address(0));
3423 : }
3424 : EXPORT_SYMBOL(vmalloc_32_user);
3425 :
3426 : /*
 3427 : * Small helper routine: copy contents from addr to buf.
 3428 : * If a page is not present, fill with zeroes.
3429 : */
3430 :
3431 0 : static int aligned_vread(char *buf, char *addr, unsigned long count)
3432 : {
3433 : struct page *p;
3434 0 : int copied = 0;
3435 :
3436 0 : while (count) {
3437 : unsigned long offset, length;
3438 :
3439 0 : offset = offset_in_page(addr);
3440 0 : length = PAGE_SIZE - offset;
3441 0 : if (length > count)
3442 0 : length = count;
3443 0 : p = vmalloc_to_page(addr);
3444 : /*
 3445 : * Safe access to this _mapped_ area would require a
 3446 : * lock, but taking one here would add overhead to every
 3447 : * vmalloc()/vfree() call just for this rarely used
 3448 : * _debug_ interface. Instead we use kmap() and accept a
 3449 : * small overhead in this access function.
3450 : */
3451 0 : if (p) {
3452 : /* We can expect USER0 is not used -- see vread() */
3453 0 : void *map = kmap_atomic(p);
3454 0 : memcpy(buf, map + offset, length);
3455 : kunmap_atomic(map);
3456 : } else
3457 0 : memset(buf, 0, length);
3458 :
3459 0 : addr += length;
3460 0 : buf += length;
3461 0 : copied += length;
3462 0 : count -= length;
3463 : }
3464 0 : return copied;
3465 : }
3466 :
3467 0 : static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
3468 : {
3469 : char *start;
3470 : struct vmap_block *vb;
3471 : unsigned long offset;
3472 : unsigned int rs, re, n;
3473 :
3474 : /*
 3475 : * If the area was created directly by the vm_map_ram() interface,
 3476 : * without being subdivided and delegated to a vmap_block,
3477 : * handle it here.
3478 : */
3479 0 : if (!(flags & VMAP_BLOCK)) {
3480 0 : aligned_vread(buf, addr, count);
3481 0 : return;
3482 : }
3483 :
3484 : /*
3485 : * The area is split into regions tracked by a vmap_block; read out
3486 : * each region and zero-fill the holes between regions.
3487 : */
3488 0 : vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
3489 0 : if (!vb)
3490 : goto finished;
3491 :
3492 0 : spin_lock(&vb->lock);
3493 0 : if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
3494 0 : spin_unlock(&vb->lock);
3495 : goto finished;
3496 : }
3497 0 : for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
3498 0 : if (!count)
3499 : break;
3500 0 : start = vmap_block_vaddr(vb->va->va_start, rs);
3501 0 : while (addr < start) {
3502 0 : if (count == 0)
3503 : goto unlock;
3504 0 : *buf = '\0';
3505 0 : buf++;
3506 0 : addr++;
3507 0 : count--;
3508 : }
3509 : /* Reading may start from the middle of a used region. */
3510 0 : offset = offset_in_page(addr);
3511 0 : n = ((re - rs + 1) << PAGE_SHIFT) - offset;
3512 0 : if (n > count)
3513 0 : n = count;
3514 0 : aligned_vread(buf, start+offset, n);
3515 :
3516 0 : buf += n;
3517 0 : addr += n;
3518 0 : count -= n;
3519 : }
3520 : unlock:
3521 0 : spin_unlock(&vb->lock);
3522 :
3523 : finished:
3524 : /* zero-fill the remaining dirty or free regions */
3525 0 : if (count)
3526 0 : memset(buf, 0, count);
3527 : }
3528 :
3529 : /**
3530 : * vread() - read vmalloc area in a safe way.
3531 : * @buf: buffer for reading data
3532 : * @addr: vm address.
3533 : * @count: number of bytes to be read.
3534 : *
3535 : * This function checks that addr is a valid vmalloc'ed area and
3536 : * copies data from that area to the given buffer. If the given memory
3537 : * range of [addr...addr+count) includes some valid address, data is
3538 : * copied to the proper area of @buf. Memory holes are zero-filled, and
3539 : * an IOREMAP area is treated as a memory hole: no copy is done.
3540 : *
3541 : * If [addr...addr+count) does not intersect any live vm_struct area,
3542 : * 0 is returned. @buf must be a kernel buffer.
3543 : *
3544 : * Note: In normal operation vread() is never necessary, because the
3545 : * caller knows the vmalloc() area is valid and can use memcpy().
3546 : * This is for routines, such as /proc/kcore, which have to access the
3547 : * vmalloc area without any prior information.
3548 : *
3549 : * Return: number of bytes for which addr and buf should be increased
3550 : * (same number as @count) or %0 if [addr...addr+count) doesn't
3551 : * intersect any valid vmalloc area
3552 : */
3553 0 : long vread(char *buf, char *addr, unsigned long count)
3554 : {
3555 : struct vmap_area *va;
3556 : struct vm_struct *vm;
3557 0 : char *vaddr, *buf_start = buf;
3558 0 : unsigned long buflen = count;
3559 : unsigned long n, size, flags;
3560 :
3561 0 : addr = kasan_reset_tag(addr);
3562 :
3563 : /* Don't allow overflow */
3564 0 : if ((unsigned long) addr + count < count)
3565 0 : count = -(unsigned long) addr;
3566 :
3567 0 : spin_lock(&vmap_area_lock);
3568 0 : va = find_vmap_area_exceed_addr((unsigned long)addr);
3569 0 : if (!va)
3570 : goto finished;
3571 :
3572 : /* no intersects with alive vmap_area */
3573 0 : if ((unsigned long)addr + count <= va->va_start)
3574 : goto finished;
3575 :
3576 0 : list_for_each_entry_from(va, &vmap_area_list, list) {
3577 0 : if (!count)
3578 : break;
3579 :
3580 0 : vm = va->vm;
3581 0 : flags = va->flags & VMAP_FLAGS_MASK;
3582 : /*
3583 : * VMAP_BLOCK indicates a sub-type of vm_map_ram area; it must
3584 : * be set together with VMAP_RAM.
3585 : */
3586 0 : WARN_ON(flags == VMAP_BLOCK);
3587 :
3588 0 : if (!vm && !flags)
3589 0 : continue;
3590 :
3591 0 : if (vm && (vm->flags & VM_UNINITIALIZED))
3592 0 : continue;
3593 : /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
3594 0 : smp_rmb();
3595 :
3596 0 : vaddr = (char *) va->va_start;
3597 0 : size = vm ? get_vm_area_size(vm) : va_size(va);
3598 :
3599 0 : if (addr >= vaddr + size)
3600 0 : continue;
3601 0 : while (addr < vaddr) {
3602 0 : if (count == 0)
3603 : goto finished;
3604 0 : *buf = '\0';
3605 0 : buf++;
3606 0 : addr++;
3607 0 : count--;
3608 : }
3609 0 : n = vaddr + size - addr;
3610 0 : if (n > count)
3611 0 : n = count;
3612 :
3613 0 : if (flags & VMAP_RAM)
3614 0 : vmap_ram_vread(buf, addr, n, flags);
3615 0 : else if (!(vm->flags & VM_IOREMAP))
3616 0 : aligned_vread(buf, addr, n);
3617 : else /* IOREMAP area is treated as memory hole */
3618 0 : memset(buf, 0, n);
3619 0 : buf += n;
3620 0 : addr += n;
3621 0 : count -= n;
3622 : }
3623 : finished:
3624 0 : spin_unlock(&vmap_area_lock);
3625 :
3626 0 : if (buf == buf_start)
3627 : return 0;
3628 : /* zero-fill memory holes */
3629 0 : if (buf != buf_start + buflen)
3630 0 : memset(buf, 0, buflen - (buf - buf_start));
3631 :
3632 0 : return buflen;
3633 : }
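/*
 * Editor's illustrative sketch: how a /proc/kcore-style consumer might
 * call vread(). The wrapper name is hypothetical; @kbuf must be a
 * kernel buffer of at least @len bytes.
 */
static inline long example_dump_vmalloc_range(char *kbuf, void *vaddr,
					      unsigned long len)
{
	/*
	 * Holes and IOREMAP ranges come back zero-filled; 0 means the
	 * range [vaddr, vaddr + len) intersects no live vmalloc area.
	 */
	return vread(kbuf, (char *)vaddr, len);
}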
3634 :
3635 : /**
3636 : * remap_vmalloc_range_partial - map vmalloc pages to userspace
3637 : * @vma: vma to cover
3638 : * @uaddr: target user address to start at
3639 : * @kaddr: virtual address of vmalloc kernel memory
3640 : * @pgoff: offset from @kaddr to start at
3641 : * @size: size of map area
3642 : *
3643 : * Returns: 0 for success, -Exxx on failure
3644 : *
3645 : * This function checks that @kaddr is a valid vmalloc'ed area,
3646 : * and that it is big enough to cover the range starting at
3647 : * @uaddr in @vma. Failure is returned if those criteria
3648 : * aren't met.
3649 : *
3650 : * Similar to remap_pfn_range() (see mm/memory.c)
3651 : */
3652 0 : int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
3653 : void *kaddr, unsigned long pgoff,
3654 : unsigned long size)
3655 : {
3656 : struct vm_struct *area;
3657 : unsigned long off;
3658 : unsigned long end_index;
3659 :
3660 0 : if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
3661 : return -EINVAL;
3662 :
3663 0 : size = PAGE_ALIGN(size);
3664 :
3665 0 : if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
3666 : return -EINVAL;
3667 :
3668 0 : area = find_vm_area(kaddr);
3669 0 : if (!area)
3670 : return -EINVAL;
3671 :
3672 0 : if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
3673 : return -EINVAL;
3674 :
3675 0 : if (check_add_overflow(size, off, &end_index) ||
3676 : end_index > get_vm_area_size(area))
3677 : return -EINVAL;
3678 0 : kaddr += off;
3679 :
3680 : do {
3681 0 : struct page *page = vmalloc_to_page(kaddr);
3682 : int ret;
3683 :
3684 0 : ret = vm_insert_page(vma, uaddr, page);
3685 0 : if (ret)
3686 : return ret;
3687 :
3688 0 : uaddr += PAGE_SIZE;
3689 0 : kaddr += PAGE_SIZE;
3690 0 : size -= PAGE_SIZE;
3691 0 : } while (size > 0);
3692 :
3693 0 : vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
3694 :
3695 0 : return 0;
3696 : }
3697 :
3698 : /**
3699 : * remap_vmalloc_range - map vmalloc pages to userspace
3700 : * @vma: vma to cover (map full range of vma)
3701 : * @addr: vmalloc memory
3702 : * @pgoff: number of pages into addr before first page to map
3703 : *
3704 : * Returns: 0 for success, -Exxx on failure
3705 : *
3706 : * This function checks that addr is a valid vmalloc'ed area, and
3707 : * that it is big enough to cover the vma. Failure is returned if
3708 : * that criterion isn't met.
3709 : *
3710 : * Similar to remap_pfn_range() (see mm/memory.c)
3711 : */
3712 0 : int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
3713 : unsigned long pgoff)
3714 : {
3715 0 : return remap_vmalloc_range_partial(vma, vma->vm_start,
3716 : addr, pgoff,
3717 0 : vma->vm_end - vma->vm_start);
3718 : }
3719 : EXPORT_SYMBOL(remap_vmalloc_range);
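/*
 * Editor's illustrative sketch: a driver ->mmap() handler exposing a
 * buffer previously allocated with vmalloc_32_user() (which sets
 * VM_USERMAP, as checked in remap_vmalloc_range_partial() above). All
 * names are hypothetical; assumes struct file from <linux/fs.h>.
 */
static inline int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	void *kbuf = file->private_data;	/* from vmalloc_32_user() */

	/* Map the entire VMA, starting vm_pgoff pages into the buffer. */
	return remap_vmalloc_range(vma, kbuf, vma->vm_pgoff);
}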
3720 :
3721 0 : void free_vm_area(struct vm_struct *area)
3722 : {
3723 : struct vm_struct *ret;
3724 0 : ret = remove_vm_area(area->addr);
3725 0 : BUG_ON(ret != area);
3726 0 : kfree(area);
3727 0 : }
3728 : EXPORT_SYMBOL_GPL(free_vm_area);
3729 :
3730 : #ifdef CONFIG_SMP
3731 : static struct vmap_area *node_to_va(struct rb_node *n)
3732 : {
3733 : return rb_entry_safe(n, struct vmap_area, rb_node);
3734 : }
3735 :
3736 : /**
3737 : * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
3738 : * @addr: target address
3739 : *
3740 : * Returns: the vmap_area if it is found. If there is no such area,
3741 : * the closest preceding vmap_area (in reverse order) is returned,
3742 : * i.e. one with va->va_start < addr && va->va_end < addr, or NULL
3743 : * if there are no areas before @addr.
3744 : */
3745 : static struct vmap_area *
3746 : pvm_find_va_enclose_addr(unsigned long addr)
3747 : {
3748 : struct vmap_area *va, *tmp;
3749 : struct rb_node *n;
3750 :
3751 : n = free_vmap_area_root.rb_node;
3752 : va = NULL;
3753 :
3754 : while (n) {
3755 : tmp = rb_entry(n, struct vmap_area, rb_node);
3756 : if (tmp->va_start <= addr) {
3757 : va = tmp;
3758 : if (tmp->va_end >= addr)
3759 : break;
3760 :
3761 : n = n->rb_right;
3762 : } else {
3763 : n = n->rb_left;
3764 : }
3765 : }
3766 :
3767 : return va;
3768 : }
3769 :
3770 : /**
3771 : * pvm_determine_end_from_reverse - find the highest aligned address
3772 : * of free block below VMALLOC_END
3773 : * @va:
3774 : * in - the VA from which the search starts (in reverse order);
3775 : * out - the VA with the highest aligned end address.
3776 : * @align: alignment for required highest address
3777 : *
3778 : * Returns: determined end address within vmap_area
3779 : */
3780 : static unsigned long
3781 : pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
3782 : {
3783 : unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3784 : unsigned long addr;
3785 :
3786 : if (likely(*va)) {
3787 : list_for_each_entry_from_reverse((*va),
3788 : &free_vmap_area_list, list) {
3789 : addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
3790 : if ((*va)->va_start < addr)
3791 : return addr;
3792 : }
3793 : }
3794 :
3795 : return 0;
3796 : }
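/*
 * Editor's worked example: with align == 0x10000, a free block whose
 * va_end == 0x123456789 yields the candidate end address
 * 0x123456789 & ~0xffffUL == 0x123450000, which is returned provided
 * the block's va_start lies below that address.
 */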
3797 :
3798 : /**
3799 : * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
3800 : * @offsets: array containing offset of each area
3801 : * @sizes: array containing size of each area
3802 : * @nr_vms: the number of areas to allocate
3803 : * @align: alignment, all entries in @offsets and @sizes must be aligned to this
3804 : *
3805 : * Returns: kmalloc'd vm_struct pointer array pointing to allocated
3806 : * vm_structs on success, %NULL on failure
3807 : *
3808 : * Percpu allocator wants to use congruent vm areas so that it can
3809 : * maintain the offsets among percpu areas. This function allocates
3810 : * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
3811 : * be scattered pretty far apart, with the distance between two areas
3812 : * easily reaching gigabytes. To avoid interacting with regular
3813 : * vmallocs, these areas are allocated from the top.
3814 : *
3815 : * Despite its complicated look, this allocator is rather simple. It
3816 : * does everything top-down and scans free blocks from the end looking
3817 : * for a matching base. While scanning, if any of the areas do not fit,
3818 : * the base address is pulled down to fit that area. Scanning is repeated
3819 : * until all the areas fit, and then all necessary data structures are
3820 : * inserted and the result is returned.
3821 : */
3822 : struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
3823 : const size_t *sizes, int nr_vms,
3824 : size_t align)
3825 : {
3826 : const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
3827 : const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3828 : struct vmap_area **vas, *va;
3829 : struct vm_struct **vms;
3830 : int area, area2, last_area, term_area;
3831 : unsigned long base, start, size, end, last_end, orig_start, orig_end;
3832 : bool purged = false;
3833 :
3834 : /* verify parameters and allocate data structures */
3835 : BUG_ON(offset_in_page(align) || !is_power_of_2(align));
3836 : for (last_area = 0, area = 0; area < nr_vms; area++) {
3837 : start = offsets[area];
3838 : end = start + sizes[area];
3839 :
3840 : /* is everything aligned properly? */
3841 : BUG_ON(!IS_ALIGNED(offsets[area], align));
3842 : BUG_ON(!IS_ALIGNED(sizes[area], align));
3843 :
3844 : /* detect the area with the highest address */
3845 : if (start > offsets[last_area])
3846 : last_area = area;
3847 :
3848 : for (area2 = area + 1; area2 < nr_vms; area2++) {
3849 : unsigned long start2 = offsets[area2];
3850 : unsigned long end2 = start2 + sizes[area2];
3851 :
3852 : BUG_ON(start2 < end && start < end2);
3853 : }
3854 : }
3855 : last_end = offsets[last_area] + sizes[last_area];
3856 :
3857 : if (vmalloc_end - vmalloc_start < last_end) {
3858 : WARN_ON(true);
3859 : return NULL;
3860 : }
3861 :
3862 : vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
3863 : vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
3864 : if (!vas || !vms)
3865 : goto err_free2;
3866 :
3867 : for (area = 0; area < nr_vms; area++) {
3868 : vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
3869 : vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
3870 : if (!vas[area] || !vms[area])
3871 : goto err_free;
3872 : }
3873 : retry:
3874 : spin_lock(&free_vmap_area_lock);
3875 :
3876 : /* start scanning - we scan from the top, begin with the last area */
3877 : area = term_area = last_area;
3878 : start = offsets[area];
3879 : end = start + sizes[area];
3880 :
3881 : va = pvm_find_va_enclose_addr(vmalloc_end);
3882 : base = pvm_determine_end_from_reverse(&va, align) - end;
3883 :
3884 : while (true) {
3885 : /*
3886 : * base might have underflowed, add last_end before
3887 : * comparing.
3888 : */
3889 : if (base + last_end < vmalloc_start + last_end)
3890 : goto overflow;
3891 :
3892 : /*
3893 : * Fitting base has not been found.
3894 : */
3895 : if (va == NULL)
3896 : goto overflow;
3897 :
3898 : /*
3899 : * If required width exceeds current VA block, move
3900 : * base downwards and then recheck.
3901 : */
3902 : if (base + end > va->va_end) {
3903 : base = pvm_determine_end_from_reverse(&va, align) - end;
3904 : term_area = area;
3905 : continue;
3906 : }
3907 :
3908 : /*
3909 : * If this VA does not fit, move base downwards and recheck.
3910 : */
3911 : if (base + start < va->va_start) {
3912 : va = node_to_va(rb_prev(&va->rb_node));
3913 : base = pvm_determine_end_from_reverse(&va, align) - end;
3914 : term_area = area;
3915 : continue;
3916 : }
3917 :
3918 : /*
3919 : * This area fits, move on to the previous one. If
3920 : * the previous one is the terminal one, we're done.
3921 : */
3922 : area = (area + nr_vms - 1) % nr_vms;
3923 : if (area == term_area)
3924 : break;
3925 :
3926 : start = offsets[area];
3927 : end = start + sizes[area];
3928 : va = pvm_find_va_enclose_addr(base + end);
3929 : }
3930 :
3931 : /* we've found a fitting base, insert all va's */
3932 : for (area = 0; area < nr_vms; area++) {
3933 : int ret;
3934 :
3935 : start = base + offsets[area];
3936 : size = sizes[area];
3937 :
3938 : va = pvm_find_va_enclose_addr(start);
3939 : if (WARN_ON_ONCE(va == NULL))
3940 : /* It is a BUG(), but trigger recovery instead. */
3941 : goto recovery;
3942 :
3943 : ret = adjust_va_to_fit_type(&free_vmap_area_root,
3944 : &free_vmap_area_list,
3945 : va, start, size);
3946 : if (WARN_ON_ONCE(unlikely(ret)))
3947 : /* It is a BUG(), but trigger recovery instead. */
3948 : goto recovery;
3949 :
3950 : /* Allocated area. */
3951 : va = vas[area];
3952 : va->va_start = start;
3953 : va->va_end = start + size;
3954 : }
3955 :
3956 : spin_unlock(&free_vmap_area_lock);
3957 :
3958 : /* populate the kasan shadow space */
3959 : for (area = 0; area < nr_vms; area++) {
3960 : if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
3961 : goto err_free_shadow;
3962 : }
3963 :
3964 : /* insert all vm's */
3965 : spin_lock(&vmap_area_lock);
3966 : for (area = 0; area < nr_vms; area++) {
3967 : insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
3968 :
3969 : setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
3970 : pcpu_get_vm_areas);
3971 : }
3972 : spin_unlock(&vmap_area_lock);
3973 :
3974 : /*
3975 : * Mark allocated areas as accessible. Do it now as a best-effort
3976 : * approach, as they can be mapped outside of vmalloc code.
3977 : * With hardware tag-based KASAN, marking is skipped for
3978 : * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
3979 : */
3980 : for (area = 0; area < nr_vms; area++)
3981 : vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
3982 : vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
3983 :
3984 : kfree(vas);
3985 : return vms;
3986 :
3987 : recovery:
3988 : /*
3989 : * Remove previously allocated areas. There is no
3990 : * need to remove these areas from the busy tree,
3991 : * because they are inserted there only in the final
3992 : * step, and only when pcpu_get_vm_areas() succeeds.
3993 : */
3994 : while (area--) {
3995 : orig_start = vas[area]->va_start;
3996 : orig_end = vas[area]->va_end;
3997 : va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
3998 : &free_vmap_area_list);
3999 : if (va)
4000 : kasan_release_vmalloc(orig_start, orig_end,
4001 : va->va_start, va->va_end);
4002 : vas[area] = NULL;
4003 : }
4004 :
4005 : overflow:
4006 : spin_unlock(&free_vmap_area_lock);
4007 : if (!purged) {
4008 : purge_vmap_area_lazy();
4009 : purged = true;
4010 :
4011 : /* Before "retry", check whether we have recovered. */
4012 : for (area = 0; area < nr_vms; area++) {
4013 : if (vas[area])
4014 : continue;
4015 :
4016 : vas[area] = kmem_cache_zalloc(
4017 : vmap_area_cachep, GFP_KERNEL);
4018 : if (!vas[area])
4019 : goto err_free;
4020 : }
4021 :
4022 : goto retry;
4023 : }
4024 :
4025 : err_free:
4026 : for (area = 0; area < nr_vms; area++) {
4027 : if (vas[area])
4028 : kmem_cache_free(vmap_area_cachep, vas[area]);
4029 :
4030 : kfree(vms[area]);
4031 : }
4032 : err_free2:
4033 : kfree(vas);
4034 : kfree(vms);
4035 : return NULL;
4036 :
4037 : err_free_shadow:
4038 : spin_lock(&free_vmap_area_lock);
4039 : /*
4040 : * We release all the vmalloc shadows, even the ones for regions that
4041 : * hadn't been successfully added. This relies on kasan_release_vmalloc
4042 : * being able to tolerate this case.
4043 : */
4044 : for (area = 0; area < nr_vms; area++) {
4045 : orig_start = vas[area]->va_start;
4046 : orig_end = vas[area]->va_end;
4047 : va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
4048 : &free_vmap_area_list);
4049 : if (va)
4050 : kasan_release_vmalloc(orig_start, orig_end,
4051 : va->va_start, va->va_end);
4052 : vas[area] = NULL;
4053 : kfree(vms[area]);
4054 : }
4055 : spin_unlock(&free_vmap_area_lock);
4056 : kfree(vas);
4057 : kfree(vms);
4058 : return NULL;
4059 : }
4060 :
4061 : /**
4062 : * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
4063 : * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
4064 : * @nr_vms: the number of allocated areas
4065 : *
4066 : * Free vm_structs and the array allocated by pcpu_get_vm_areas().
4067 : */
4068 : void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
4069 : {
4070 : int i;
4071 :
4072 : for (i = 0; i < nr_vms; i++)
4073 : free_vm_area(vms[i]);
4074 : kfree(vms);
4075 : }
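/*
 * Editor's illustrative sketch of the contract above: obtain two
 * congruent areas and release them again. The offsets and sizes are
 * hypothetical but honour the alignment and non-overlap rules checked
 * by pcpu_get_vm_areas().
 */
static inline int example_percpu_chunk_map(void)
{
	const unsigned long offsets[] = { 0, 2 * PAGE_SIZE };
	const size_t sizes[] = { PAGE_SIZE, PAGE_SIZE };
	struct vm_struct **vms;

	/* All offsets and sizes must be aligned to the @align argument. */
	vms = pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE);
	if (!vms)
		return -ENOMEM;

	/* ... map percpu pages at vms[0]->addr and vms[1]->addr ... */

	pcpu_free_vm_areas(vms, 2);
	return 0;
}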
4076 : #endif /* CONFIG_SMP */
4077 :
4078 : #ifdef CONFIG_PRINTK
4079 0 : bool vmalloc_dump_obj(void *object)
4080 : {
4081 : struct vm_struct *vm;
4082 0 : void *objp = (void *)PAGE_ALIGN((unsigned long)object);
4083 :
4084 0 : vm = find_vm_area(objp);
4085 0 : if (!vm)
4086 : return false;
4087 0 : pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
4088 : vm->nr_pages, (unsigned long)vm->addr, vm->caller);
4089 0 : return true;
4090 : }
4091 : #endif
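/*
 * Editor's note: a hypothetical console line produced by the pr_cont()
 * format above (all values invented for illustration):
 *
 *   8-page vmalloc region starting at 0xffffc90000035000 allocated at
 *   example_driver_init+0x35/0x70
 */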
4092 :
4093 : #ifdef CONFIG_PROC_FS
4094 0 : static void *s_start(struct seq_file *m, loff_t *pos)
4095 : __acquires(&vmap_purge_lock)
4096 : __acquires(&vmap_area_lock)
4097 : {
4098 0 : mutex_lock(&vmap_purge_lock);
4099 0 : spin_lock(&vmap_area_lock);
4100 :
4101 0 : return seq_list_start(&vmap_area_list, *pos);
4102 : }
4103 :
4104 0 : static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4105 : {
4106 0 : return seq_list_next(p, &vmap_area_list, pos);
4107 : }
4108 :
4109 0 : static void s_stop(struct seq_file *m, void *p)
4110 : __releases(&vmap_area_lock)
4111 : __releases(&vmap_purge_lock)
4112 : {
4113 0 : spin_unlock(&vmap_area_lock);
4114 0 : mutex_unlock(&vmap_purge_lock);
4115 0 : }
4116 :
4117 : static void show_numa_info(struct seq_file *m, struct vm_struct *v)
4118 : {
4119 : if (IS_ENABLED(CONFIG_NUMA)) {
4120 : unsigned int nr, *counters = m->private;
4121 : unsigned int step = 1U << vm_area_page_order(v);
4122 :
4123 : if (!counters)
4124 : return;
4125 :
4126 : if (v->flags & VM_UNINITIALIZED)
4127 : return;
4128 : /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
4129 : smp_rmb();
4130 :
4131 : memset(counters, 0, nr_node_ids * sizeof(unsigned int));
4132 :
4133 : for (nr = 0; nr < v->nr_pages; nr += step)
4134 : counters[page_to_nid(v->pages[nr])] += step;
4135 : for_each_node_state(nr, N_HIGH_MEMORY)
4136 : if (counters[nr])
4137 : seq_printf(m, " N%u=%u", nr, counters[nr]);
4138 : }
4139 : }
4140 :
4141 0 : static void show_purge_info(struct seq_file *m)
4142 : {
4143 : struct vmap_area *va;
4144 :
4145 0 : spin_lock(&purge_vmap_area_lock);
4146 0 : list_for_each_entry(va, &purge_vmap_area_list, list) {
4147 0 : seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
4148 : (void *)va->va_start, (void *)va->va_end,
4149 0 : va->va_end - va->va_start);
4150 : }
4151 0 : spin_unlock(&purge_vmap_area_lock);
4152 0 : }
4153 :
4154 0 : static int s_show(struct seq_file *m, void *p)
4155 : {
4156 : struct vmap_area *va;
4157 : struct vm_struct *v;
4158 :
4159 0 : va = list_entry(p, struct vmap_area, list);
4160 :
4161 0 : if (!va->vm) {
4162 0 : if (va->flags & VMAP_RAM)
4163 0 : seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
4164 : (void *)va->va_start, (void *)va->va_end,
4165 0 : va->va_end - va->va_start);
4166 :
4167 : goto final;
4168 : }
4169 :
4170 0 : v = va->vm;
4171 :
4172 0 : seq_printf(m, "0x%pK-0x%pK %7ld",
4173 0 : v->addr, v->addr + v->size, v->size);
4174 :
4175 0 : if (v->caller)
4176 0 : seq_printf(m, " %pS", v->caller);
4177 :
4178 0 : if (v->nr_pages)
4179 0 : seq_printf(m, " pages=%d", v->nr_pages);
4180 :
4181 0 : if (v->phys_addr)
4182 0 : seq_printf(m, " phys=%pa", &v->phys_addr);
4183 :
4184 0 : if (v->flags & VM_IOREMAP)
4185 0 : seq_puts(m, " ioremap");
4186 :
4187 0 : if (v->flags & VM_ALLOC)
4188 0 : seq_puts(m, " vmalloc");
4189 :
4190 0 : if (v->flags & VM_MAP)
4191 0 : seq_puts(m, " vmap");
4192 :
4193 0 : if (v->flags & VM_USERMAP)
4194 0 : seq_puts(m, " user");
4195 :
4196 0 : if (v->flags & VM_DMA_COHERENT)
4197 0 : seq_puts(m, " dma-coherent");
4198 :
4199 0 : if (is_vmalloc_addr(v->pages))
4200 0 : seq_puts(m, " vpages");
4201 :
4202 0 : show_numa_info(m, v);
4203 0 : seq_putc(m, '\n');
4204 :
4205 : /*
4206 : * As a final step, dump "unpurged" areas.
4207 : */
4208 : final:
4209 0 : if (list_is_last(&va->list, &vmap_area_list))
4210 0 : show_purge_info(m);
4211 :
4212 0 : return 0;
4213 : }
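/*
 * Editor's note: hypothetical /proc/vmallocinfo lines assembled from the
 * seq_printf() formats above (addresses, sizes and symbols invented):
 *
 *   0xffffc90000035000-0xffffc90000038000   12288 example_init+0x35/0x70 pages=2 vmalloc N0=2
 *   0xffffc90000040000-0xffffc90000042000    8192 unpurged vm_area
 */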
4214 :
4215 : static const struct seq_operations vmalloc_op = {
4216 : .start = s_start,
4217 : .next = s_next,
4218 : .stop = s_stop,
4219 : .show = s_show,
4220 : };
4221 :
4222 1 : static int __init proc_vmalloc_init(void)
4223 : {
4224 : if (IS_ENABLED(CONFIG_NUMA))
4225 : proc_create_seq_private("vmallocinfo", 0400, NULL,
4226 : &vmalloc_op,
4227 : nr_node_ids * sizeof(unsigned int), NULL);
4228 : else
4229 1 : proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
4230 1 : return 0;
4231 : }
4232 : module_init(proc_vmalloc_init);
4233 :
4234 : #endif
4235 :
4236 1 : void __init vmalloc_init(void)
4237 : {
4238 : struct vmap_area *va;
4239 : struct vm_struct *tmp;
4240 : int i;
4241 :
4242 : /*
4243 : * Create the cache for vmap_area objects.
4244 : */
4245 1 : vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
4246 :
4247 2 : for_each_possible_cpu(i) {
4248 : struct vmap_block_queue *vbq;
4249 : struct vfree_deferred *p;
4250 :
4251 1 : vbq = &per_cpu(vmap_block_queue, i);
4252 1 : spin_lock_init(&vbq->lock);
4253 2 : INIT_LIST_HEAD(&vbq->free);
4254 1 : p = &per_cpu(vfree_deferred, i);
4255 2 : init_llist_head(&p->list);
4256 2 : INIT_WORK(&p->wq, delayed_vfree_work);
4257 : }
4258 :
4259 : /* Import existing vmlist entries. */
4260 1 : for (tmp = vmlist; tmp; tmp = tmp->next) {
4261 0 : va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
4262 0 : if (WARN_ON_ONCE(!va))
4263 0 : continue;
4264 :
4265 0 : va->va_start = (unsigned long)tmp->addr;
4266 0 : va->va_end = va->va_start + tmp->size;
4267 0 : va->vm = tmp;
4268 0 : insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
4269 : }
4270 :
4271 : /*
4272 : * Now we can initialize a free vmap space.
4273 : */
4274 1 : vmap_init_free_space();
4275 1 : vmap_initialized = true;
4276 1 : }
|