Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * Copyright (C) 1993 Linus Torvalds
4 : * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
5 : * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
6 : * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
7 : * Numa awareness, Christoph Lameter, SGI, June 2005
8 : * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
9 : */
10 :
11 : #include <linux/vmalloc.h>
12 : #include <linux/mm.h>
13 : #include <linux/module.h>
14 : #include <linux/highmem.h>
15 : #include <linux/sched/signal.h>
16 : #include <linux/slab.h>
17 : #include <linux/spinlock.h>
18 : #include <linux/interrupt.h>
19 : #include <linux/proc_fs.h>
20 : #include <linux/seq_file.h>
21 : #include <linux/set_memory.h>
22 : #include <linux/debugobjects.h>
23 : #include <linux/kallsyms.h>
24 : #include <linux/list.h>
25 : #include <linux/notifier.h>
26 : #include <linux/rbtree.h>
27 : #include <linux/xarray.h>
28 : #include <linux/io.h>
29 : #include <linux/rcupdate.h>
30 : #include <linux/pfn.h>
31 : #include <linux/kmemleak.h>
32 : #include <linux/atomic.h>
33 : #include <linux/compiler.h>
34 : #include <linux/memcontrol.h>
35 : #include <linux/llist.h>
36 : #include <linux/uio.h>
37 : #include <linux/bitops.h>
38 : #include <linux/rbtree_augmented.h>
39 : #include <linux/overflow.h>
40 : #include <linux/pgtable.h>
41 : #include <linux/hugetlb.h>
42 : #include <linux/sched/mm.h>
43 : #include <asm/tlbflush.h>
44 : #include <asm/shmparam.h>
45 :
46 : #define CREATE_TRACE_POINTS
47 : #include <trace/events/vmalloc.h>
48 :
49 : #include "internal.h"
50 : #include "pgalloc-track.h"
51 :
52 : #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
53 : static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
54 :
55 : static int __init set_nohugeiomap(char *str)
56 : {
57 : ioremap_max_page_shift = PAGE_SHIFT;
58 : return 0;
59 : }
60 : early_param("nohugeiomap", set_nohugeiomap);
61 : #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
62 : static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
63 : #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
64 :
65 : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
66 : static bool __ro_after_init vmap_allow_huge = true;
67 :
68 : static int __init set_nohugevmalloc(char *str)
69 : {
70 : vmap_allow_huge = false;
71 : return 0;
72 : }
73 : early_param("nohugevmalloc", set_nohugevmalloc);
74 : #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
75 : static const bool vmap_allow_huge = false;
76 : #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
77 :
78 0 : bool is_vmalloc_addr(const void *x)
79 : {
80 0 : unsigned long addr = (unsigned long)kasan_reset_tag(x);
81 :
82 0 : return addr >= VMALLOC_START && addr < VMALLOC_END;
83 : }
84 : EXPORT_SYMBOL(is_vmalloc_addr);
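is_vmalloc_addr() is a pure range check against [VMALLOC_START, VMALLOC_END), after stripping any KASAN tag from the pointer. A minimal module-context sketch of the distinction it draws (demo_addr_check() is a hypothetical caller, not part of this file):

#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>

static void demo_addr_check(void)
{
	void *v = vmalloc(PAGE_SIZE);		/* lands in the vmalloc range */
	void *k = kmalloc(64, GFP_KERNEL);	/* linear-mapped, not vmalloc */

	if (v)
		pr_info("vmalloc ptr: %d\n", is_vmalloc_addr(v));	/* prints 1 */
	if (k)
		pr_info("kmalloc ptr: %d\n", is_vmalloc_addr(k));	/* prints 0 */

	kfree(k);
	vfree(v);
}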
85 :
86 : struct vfree_deferred {
87 : struct llist_head list;
88 : struct work_struct wq;
89 : };
90 : static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
91 :
92 : /*** Page table manipulation functions ***/
93 0 : static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
94 : phys_addr_t phys_addr, pgprot_t prot,
95 : unsigned int max_page_shift, pgtbl_mod_mask *mask)
96 : {
97 : pte_t *pte;
98 : u64 pfn;
99 0 : unsigned long size = PAGE_SIZE;
100 :
101 0 : pfn = phys_addr >> PAGE_SHIFT;
102 0 : pte = pte_alloc_kernel_track(pmd, addr, mask);
103 0 : if (!pte)
104 : return -ENOMEM;
105 : do {
106 0 : BUG_ON(!pte_none(ptep_get(pte)));
107 :
108 : #ifdef CONFIG_HUGETLB_PAGE
109 : size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
110 : if (size != PAGE_SIZE) {
111 : pte_t entry = pfn_pte(pfn, prot);
112 :
113 : entry = arch_make_huge_pte(entry, ilog2(size), 0);
114 : set_huge_pte_at(&init_mm, addr, pte, entry);
115 : pfn += PFN_DOWN(size);
116 : continue;
117 : }
118 : #endif
119 0 : set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
120 0 : pfn++;
121 0 : } while (pte += PFN_DOWN(size), addr += size, addr != end);
122 0 : *mask |= PGTBL_PTE_MODIFIED;
123 : return 0;
124 : }
125 :
126 : static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
127 : phys_addr_t phys_addr, pgprot_t prot,
128 : unsigned int max_page_shift)
129 : {
130 : if (max_page_shift < PMD_SHIFT)
131 : return 0;
132 :
133 : if (!arch_vmap_pmd_supported(prot))
134 : return 0;
135 :
136 : if ((end - addr) != PMD_SIZE)
137 : return 0;
138 :
139 : if (!IS_ALIGNED(addr, PMD_SIZE))
140 : return 0;
141 :
142 : if (!IS_ALIGNED(phys_addr, PMD_SIZE))
143 : return 0;
144 :
145 : if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
146 : return 0;
147 :
148 : return pmd_set_huge(pmd, phys_addr, prot);
149 : }
150 :
151 0 : static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
152 : phys_addr_t phys_addr, pgprot_t prot,
153 : unsigned int max_page_shift, pgtbl_mod_mask *mask)
154 : {
155 : pmd_t *pmd;
156 : unsigned long next;
157 :
158 0 : pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
159 0 : if (!pmd)
160 : return -ENOMEM;
161 : do {
162 0 : next = pmd_addr_end(addr, end);
163 :
164 0 : if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
165 : max_page_shift)) {
166 : *mask |= PGTBL_PMD_MODIFIED;
167 : continue;
168 : }
169 :
170 0 : if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
171 : return -ENOMEM;
172 0 : } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
173 : return 0;
174 : }
175 :
176 : static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
177 : phys_addr_t phys_addr, pgprot_t prot,
178 : unsigned int max_page_shift)
179 : {
180 : if (max_page_shift < PUD_SHIFT)
181 : return 0;
182 :
183 : if (!arch_vmap_pud_supported(prot))
184 : return 0;
185 :
186 : if ((end - addr) != PUD_SIZE)
187 : return 0;
188 :
189 : if (!IS_ALIGNED(addr, PUD_SIZE))
190 : return 0;
191 :
192 : if (!IS_ALIGNED(phys_addr, PUD_SIZE))
193 : return 0;
194 :
195 : if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
196 : return 0;
197 :
198 : return pud_set_huge(pud, phys_addr, prot);
199 : }
200 :
201 : static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
202 : phys_addr_t phys_addr, pgprot_t prot,
203 : unsigned int max_page_shift, pgtbl_mod_mask *mask)
204 : {
205 : pud_t *pud;
206 : unsigned long next;
207 :
208 0 : pud = pud_alloc_track(&init_mm, p4d, addr, mask);
209 : if (!pud)
210 : return -ENOMEM;
211 : do {
212 0 : next = pud_addr_end(addr, end);
213 :
214 0 : if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
215 : max_page_shift)) {
216 : *mask |= PGTBL_PUD_MODIFIED;
217 : continue;
218 : }
219 :
220 0 : if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
221 : max_page_shift, mask))
222 : return -ENOMEM;
223 0 : } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
224 : return 0;
225 : }
226 :
227 : static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
228 : phys_addr_t phys_addr, pgprot_t prot,
229 : unsigned int max_page_shift)
230 : {
231 : if (max_page_shift < P4D_SHIFT)
232 : return 0;
233 :
234 : if (!arch_vmap_p4d_supported(prot))
235 : return 0;
236 :
237 : if ((end - addr) != P4D_SIZE)
238 : return 0;
239 :
240 : if (!IS_ALIGNED(addr, P4D_SIZE))
241 : return 0;
242 :
243 : if (!IS_ALIGNED(phys_addr, P4D_SIZE))
244 : return 0;
245 :
246 : if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
247 : return 0;
248 :
249 : return p4d_set_huge(p4d, phys_addr, prot);
250 : }
251 :
252 0 : static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
253 : phys_addr_t phys_addr, pgprot_t prot,
254 : unsigned int max_page_shift, pgtbl_mod_mask *mask)
255 : {
256 : p4d_t *p4d;
257 : unsigned long next;
258 :
259 0 : p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
260 0 : if (!p4d)
261 : return -ENOMEM;
262 : do {
263 0 : next = p4d_addr_end(addr, end);
264 :
265 0 : if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
266 : max_page_shift)) {
267 : *mask |= PGTBL_P4D_MODIFIED;
268 : continue;
269 : }
270 :
271 0 : if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
272 : max_page_shift, mask))
273 : return -ENOMEM;
274 0 : } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
275 : return 0;
276 : }
277 :
278 0 : static int vmap_range_noflush(unsigned long addr, unsigned long end,
279 : phys_addr_t phys_addr, pgprot_t prot,
280 : unsigned int max_page_shift)
281 : {
282 : pgd_t *pgd;
283 : unsigned long start;
284 : unsigned long next;
285 : int err;
286 0 : pgtbl_mod_mask mask = 0;
287 :
288 : might_sleep();
289 0 : BUG_ON(addr >= end);
290 :
291 0 : start = addr;
292 0 : pgd = pgd_offset_k(addr);
293 : do {
294 0 : next = pgd_addr_end(addr, end);
295 0 : err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
296 : max_page_shift, &mask);
297 0 : if (err)
298 : break;
299 0 : } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
300 :
301 : if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
302 : arch_sync_kernel_mappings(start, end);
303 :
304 0 : return err;
305 : }
306 :
307 0 : int ioremap_page_range(unsigned long addr, unsigned long end,
308 : phys_addr_t phys_addr, pgprot_t prot)
309 : {
310 : int err;
311 :
312 0 : err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
313 : ioremap_max_page_shift);
314 0 : flush_cache_vmap(addr, end);
315 0 : if (!err)
316 0 : err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
317 : ioremap_max_page_shift);
318 0 : return err;
319 : }
320 :
321 0 : static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
322 : pgtbl_mod_mask *mask)
323 : {
324 : pte_t *pte;
325 :
326 0 : pte = pte_offset_kernel(pmd, addr);
327 : do {
328 0 : pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
329 0 : WARN_ON(!pte_none(ptent) && !pte_present(ptent));
330 0 : } while (pte++, addr += PAGE_SIZE, addr != end);
331 0 : *mask |= PGTBL_PTE_MODIFIED;
332 0 : }
333 :
334 0 : static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
335 : pgtbl_mod_mask *mask)
336 : {
337 : pmd_t *pmd;
338 : unsigned long next;
339 : int cleared;
340 :
341 0 : pmd = pmd_offset(pud, addr);
342 : do {
343 0 : next = pmd_addr_end(addr, end);
344 :
345 0 : cleared = pmd_clear_huge(pmd);
346 0 : if (cleared || pmd_bad(*pmd))
347 0 : *mask |= PGTBL_PMD_MODIFIED;
348 :
349 : if (cleared)
350 : continue;
351 0 : if (pmd_none_or_clear_bad(pmd))
352 0 : continue;
353 0 : vunmap_pte_range(pmd, addr, next, mask);
354 :
355 0 : cond_resched();
356 0 : } while (pmd++, addr = next, addr != end);
357 0 : }
358 :
359 0 : static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
360 : pgtbl_mod_mask *mask)
361 : {
362 : pud_t *pud;
363 : unsigned long next;
364 : int cleared;
365 :
366 0 : pud = pud_offset(p4d, addr);
367 : do {
368 0 : next = pud_addr_end(addr, end);
369 :
370 0 : cleared = pud_clear_huge(pud);
371 0 : if (cleared || pud_bad(*pud))
372 0 : *mask |= PGTBL_PUD_MODIFIED;
373 :
374 : if (cleared)
375 : continue;
376 0 : if (pud_none_or_clear_bad(pud))
377 0 : continue;
378 0 : vunmap_pmd_range(pud, addr, next, mask);
379 0 : } while (pud++, addr = next, addr != end);
380 0 : }
381 :
382 : static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
383 : pgtbl_mod_mask *mask)
384 : {
385 : p4d_t *p4d;
386 : unsigned long next;
387 :
388 0 : p4d = p4d_offset(pgd, addr);
389 : do {
390 0 : next = p4d_addr_end(addr, end);
391 :
392 0 : p4d_clear_huge(p4d);
393 0 : if (p4d_bad(*p4d))
394 : *mask |= PGTBL_P4D_MODIFIED;
395 :
396 0 : if (p4d_none_or_clear_bad(p4d))
397 : continue;
398 0 : vunmap_pud_range(p4d, addr, next, mask);
399 0 : } while (p4d++, addr = next, addr != end);
400 : }
401 :
402 : /*
403 : * vunmap_range_noflush is similar to vunmap_range, but does not
404 : * flush caches or TLBs.
405 : *
406 : * The caller is responsible for calling flush_cache_vunmap() before calling
407 : * this function, and flush_tlb_kernel_range after it has returned
408 : * successfully (and before the addresses are expected to cause a page fault
409 : * or be re-mapped for something else, if TLB flushes are being delayed or
410 : * coalesced).
411 : *
412 : * This is an internal function only. Do not use outside mm/.
413 : */
414 0 : void __vunmap_range_noflush(unsigned long start, unsigned long end)
415 : {
416 : unsigned long next;
417 : pgd_t *pgd;
418 0 : unsigned long addr = start;
419 0 : pgtbl_mod_mask mask = 0;
420 :
421 0 : BUG_ON(addr >= end);
422 0 : pgd = pgd_offset_k(addr);
423 : do {
424 0 : next = pgd_addr_end(addr, end);
425 0 : if (pgd_bad(*pgd))
426 : mask |= PGTBL_PGD_MODIFIED;
427 0 : if (pgd_none_or_clear_bad(pgd))
428 : continue;
429 : vunmap_p4d_range(pgd, addr, next, &mask);
430 0 : } while (pgd++, addr = next, addr != end);
431 :
432 : if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
433 : arch_sync_kernel_mappings(start, end);
434 0 : }
435 :
436 0 : void vunmap_range_noflush(unsigned long start, unsigned long end)
437 : {
438 0 : kmsan_vunmap_range_noflush(start, end);
439 0 : __vunmap_range_noflush(start, end);
440 0 : }
441 :
442 : /**
443 : * vunmap_range - unmap kernel virtual addresses
444 : * @addr: start of the VM area to unmap
445 : * @end: end of the VM area to unmap (non-inclusive)
446 : *
447 : * Clears any present PTEs in the virtual address range, flushes TLBs and
448 : * caches. Any subsequent access to the address before it has been re-mapped
449 : * is a kernel bug.
450 : */
451 0 : void vunmap_range(unsigned long addr, unsigned long end)
452 : {
453 0 : flush_cache_vunmap(addr, end);
454 0 : vunmap_range_noflush(addr, end);
455 0 : flush_tlb_kernel_range(addr, end);
456 0 : }
457 :
458 16 : static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
459 : unsigned long end, pgprot_t prot, struct page **pages, int *nr,
460 : pgtbl_mod_mask *mask)
461 : {
462 : pte_t *pte;
463 :
464 : /*
465 : * nr is a running index into the array which helps higher level
466 : * callers keep track of where we're up to.
467 : */
468 :
469 32 : pte = pte_alloc_kernel_track(pmd, addr, mask);
470 16 : if (!pte)
471 : return -ENOMEM;
472 : do {
473 64 : struct page *page = pages[*nr];
474 :
475 128 : if (WARN_ON(!pte_none(ptep_get(pte))))
476 : return -EBUSY;
477 64 : if (WARN_ON(!page))
478 : return -ENOMEM;
479 128 : if (WARN_ON(!pfn_valid(page_to_pfn(page))))
480 : return -EINVAL;
481 :
482 128 : set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
483 64 : (*nr)++;
484 64 : } while (pte++, addr += PAGE_SIZE, addr != end);
485 16 : *mask |= PGTBL_PTE_MODIFIED;
486 16 : return 0;
487 : }
488 :
489 16 : static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
490 : unsigned long end, pgprot_t prot, struct page **pages, int *nr,
491 : pgtbl_mod_mask *mask)
492 : {
493 : pmd_t *pmd;
494 : unsigned long next;
495 :
496 16 : pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
497 16 : if (!pmd)
498 : return -ENOMEM;
499 : do {
500 16 : next = pmd_addr_end(addr, end);
501 16 : if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
502 : return -ENOMEM;
503 16 : } while (pmd++, addr = next, addr != end);
504 : return 0;
505 : }
506 :
507 : static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
508 : unsigned long end, pgprot_t prot, struct page **pages, int *nr,
509 : pgtbl_mod_mask *mask)
510 : {
511 : pud_t *pud;
512 : unsigned long next;
513 :
514 32 : pud = pud_alloc_track(&init_mm, p4d, addr, mask);
515 : if (!pud)
516 : return -ENOMEM;
517 : do {
518 16 : next = pud_addr_end(addr, end);
519 16 : if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
520 : return -ENOMEM;
521 16 : } while (pud++, addr = next, addr != end);
522 : return 0;
523 : }
524 :
525 16 : static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
526 : unsigned long end, pgprot_t prot, struct page **pages, int *nr,
527 : pgtbl_mod_mask *mask)
528 : {
529 : p4d_t *p4d;
530 : unsigned long next;
531 :
532 32 : p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
533 16 : if (!p4d)
534 : return -ENOMEM;
535 : do {
536 16 : next = p4d_addr_end(addr, end);
537 16 : if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
538 : return -ENOMEM;
539 16 : } while (p4d++, addr = next, addr != end);
540 16 : return 0;
541 : }
542 :
543 16 : static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
544 : pgprot_t prot, struct page **pages)
545 : {
546 16 : unsigned long start = addr;
547 : pgd_t *pgd;
548 : unsigned long next;
549 16 : int err = 0;
550 16 : int nr = 0;
551 16 : pgtbl_mod_mask mask = 0;
552 :
553 16 : BUG_ON(addr >= end);
554 32 : pgd = pgd_offset_k(addr);
555 : do {
556 16 : next = pgd_addr_end(addr, end);
557 16 : if (pgd_bad(*pgd))
558 : mask |= PGTBL_PGD_MODIFIED;
559 16 : err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
560 16 : if (err)
561 : return err;
562 16 : } while (pgd++, addr = next, addr != end);
563 :
564 : if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
565 : arch_sync_kernel_mappings(start, end);
566 :
567 : return 0;
568 : }
569 :
570 : /*
571 : * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
572 : * flush caches.
573 : *
574 : * The caller is responsible for calling flush_cache_vmap() after this
575 : * function returns successfully and before the addresses are accessed.
576 : *
577 : * This is an internal function only. Do not use outside mm/.
578 : */
579 16 : int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
580 : pgprot_t prot, struct page **pages, unsigned int page_shift)
581 : {
582 16 : unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
583 :
584 16 : WARN_ON(page_shift < PAGE_SHIFT);
585 :
586 : if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
587 : page_shift == PAGE_SHIFT)
588 16 : return vmap_small_pages_range_noflush(addr, end, prot, pages);
589 :
590 : for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
591 : int err;
592 :
593 : err = vmap_range_noflush(addr, addr + (1UL << page_shift),
594 : page_to_phys(pages[i]), prot,
595 : page_shift);
596 : if (err)
597 : return err;
598 :
599 : addr += 1UL << page_shift;
600 : }
601 :
602 : return 0;
603 : }
604 :
605 0 : int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
606 : pgprot_t prot, struct page **pages, unsigned int page_shift)
607 : {
608 16 : int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
609 : page_shift);
610 :
611 : if (ret)
612 : return ret;
613 16 : return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
614 : }
615 :
616 : /**
617 : * vmap_pages_range - map pages to a kernel virtual address
618 : * @addr: start of the VM area to map
619 : * @end: end of the VM area to map (non-inclusive)
620 : * @prot: page protection flags to use
621 : * @pages: pages to map (always PAGE_SIZE pages)
622 : * @page_shift: maximum shift that the pages may be mapped with, @pages must
623 : * be aligned and contiguous up to at least this shift.
624 : *
625 : * RETURNS:
626 : * 0 on success, -errno on failure.
627 : */
628 16 : static int vmap_pages_range(unsigned long addr, unsigned long end,
629 : pgprot_t prot, struct page **pages, unsigned int page_shift)
630 : {
631 : int err;
632 :
633 16 : err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
634 16 : flush_cache_vmap(addr, end);
635 16 : return err;
636 : }
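The exported vmap()/vunmap() helpers defined later in this file are the usual public entry points into this mapping path: roughly, vmap() reserves a vm area and then calls vmap_pages_range() on the caller's page array. A minimal sketch from module context (demo_vmap() and the fixed count of four pages are illustrative assumptions):

#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>

static void demo_vmap(void)
{
	struct page *pages[4] = { NULL };
	void *va;
	int i;

	for (i = 0; i < 4; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto out;
	}

	/* Map four physically scattered pages virtually contiguously. */
	va = vmap(pages, 4, VM_MAP, PAGE_KERNEL);
	if (va) {
		memset(va, 0, 4 * PAGE_SIZE);
		vunmap(va);	/* clears the PTEs; the pages are still ours to free */
	}
out:
	for (i = 0; i < 4; i++)
		if (pages[i])
			__free_page(pages[i]);
}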
637 :
638 0 : int is_vmalloc_or_module_addr(const void *x)
639 : {
640 : /*
641 : * ARM, x86-64 and sparc64 put modules in a special place,
642 : * and fall back on vmalloc() if that fails. Others
643 : * just put it in the vmalloc space.
644 : */
645 : #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
646 : unsigned long addr = (unsigned long)kasan_reset_tag(x);
647 : if (addr >= MODULES_VADDR && addr < MODULES_END)
648 : return 1;
649 : #endif
650 0 : return is_vmalloc_addr(x);
651 : }
652 : EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
653 :
654 : /*
655 : * Walk a vmap address to the struct page it maps. Huge vmap mappings will
656 : * return the tail page that corresponds to the base page address, which
657 : * matches small vmap mappings.
658 : */
659 0 : struct page *vmalloc_to_page(const void *vmalloc_addr)
660 : {
661 0 : unsigned long addr = (unsigned long) vmalloc_addr;
662 0 : struct page *page = NULL;
663 0 : pgd_t *pgd = pgd_offset_k(addr);
664 : p4d_t *p4d;
665 : pud_t *pud;
666 : pmd_t *pmd;
667 : pte_t *ptep, pte;
668 :
669 : /*
670 : * XXX we might need to change this if we add VIRTUAL_BUG_ON for
671 : * architectures that do not vmalloc module space
672 : */
673 : VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
674 :
675 : if (pgd_none(*pgd))
676 : return NULL;
677 0 : if (WARN_ON_ONCE(pgd_leaf(*pgd)))
678 : return NULL; /* XXX: no allowance for huge pgd */
679 0 : if (WARN_ON_ONCE(pgd_bad(*pgd)))
680 : return NULL;
681 :
682 0 : p4d = p4d_offset(pgd, addr);
683 : if (p4d_none(*p4d))
684 : return NULL;
685 : if (p4d_leaf(*p4d))
686 : return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
687 0 : if (WARN_ON_ONCE(p4d_bad(*p4d)))
688 : return NULL;
689 :
690 0 : pud = pud_offset(p4d, addr);
691 0 : if (pud_none(*pud))
692 : return NULL;
693 : if (pud_leaf(*pud))
694 : return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
695 0 : if (WARN_ON_ONCE(pud_bad(*pud)))
696 : return NULL;
697 :
698 0 : pmd = pmd_offset(pud, addr);
699 0 : if (pmd_none(*pmd))
700 : return NULL;
701 : if (pmd_leaf(*pmd))
702 : return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
703 0 : if (WARN_ON_ONCE(pmd_bad(*pmd)))
704 : return NULL;
705 :
706 0 : ptep = pte_offset_kernel(pmd, addr);
707 0 : pte = ptep_get(ptep);
708 0 : if (pte_present(pte))
709 0 : page = pte_page(pte);
710 :
711 : return page;
712 : }
713 : EXPORT_SYMBOL(vmalloc_to_page);
714 :
715 : /*
716 : * Map a vmalloc()-space virtual address to the physical page frame number.
717 : */
718 0 : unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
719 : {
720 0 : return page_to_pfn(vmalloc_to_page(vmalloc_addr));
721 : }
722 : EXPORT_SYMBOL(vmalloc_to_pfn);
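Because a vmalloc'ed buffer is only virtually contiguous, code that needs the backing struct page or PFN of each PAGE_SIZE chunk (for example to build a scatterlist) walks it with vmalloc_to_page()/vmalloc_to_pfn(). An illustrative sketch (demo_walk_pages() is hypothetical):

#include <linux/vmalloc.h>
#include <linux/mm.h>

static void demo_walk_pages(void *buf, unsigned long size)
{
	unsigned long off;

	for (off = 0; off < size; off += PAGE_SIZE) {
		struct page *page = vmalloc_to_page(buf + off);

		/* Successive PFNs are generally not contiguous. */
		pr_info("offset %lu -> pfn %lu\n", off, page_to_pfn(page));
	}
}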
723 :
724 :
725 : /*** Global kva allocator ***/
726 :
727 : #define DEBUG_AUGMENT_PROPAGATE_CHECK 0
728 : #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
729 :
730 :
731 : static DEFINE_SPINLOCK(vmap_area_lock);
732 : static DEFINE_SPINLOCK(free_vmap_area_lock);
733 : /* Export for kexec only */
734 : LIST_HEAD(vmap_area_list);
735 : static struct rb_root vmap_area_root = RB_ROOT;
736 : static bool vmap_initialized __read_mostly;
737 :
738 : static struct rb_root purge_vmap_area_root = RB_ROOT;
739 : static LIST_HEAD(purge_vmap_area_list);
740 : static DEFINE_SPINLOCK(purge_vmap_area_lock);
741 :
742 : /*
743 : * This kmem_cache is used for vmap_area objects. Instead of
744 : * allocating from slab we reuse an object from this cache to
745 : * make things faster, especially in the "no edge" splitting of
746 : * a free block.
747 : */
748 : static struct kmem_cache *vmap_area_cachep;
749 :
750 : /*
751 : * This linked list is used in pair with free_vmap_area_root.
752 : * It gives O(1) access to prev/next to perform fast coalescing.
753 : */
754 : static LIST_HEAD(free_vmap_area_list);
755 :
756 : /*
757 : * This augmented red-black tree represents the free vmap space.
758 : * All vmap_area objects in this tree are sorted by va->va_start
759 : * address. It is used for allocation and merging when a vmap
760 : * object is released.
761 : *
762 : * Each vmap_area node contains a maximum available free block
763 : * of its sub-tree, right or left. Therefore it is possible to
764 : * find the lowest-address free area that satisfies a request.
765 : */
766 : static struct rb_root free_vmap_area_root = RB_ROOT;
767 :
768 : /*
769 : * Preload a CPU with one object for "no edge" split case. The
770 : * aim is to get rid of allocations from atomic context, so that
771 : * more permissive allocation masks can be used.
772 : */
773 : static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
774 :
775 : static __always_inline unsigned long
776 : va_size(struct vmap_area *va)
777 : {
778 108 : return (va->va_end - va->va_start);
779 : }
780 :
781 : static __always_inline unsigned long
782 : get_subtree_max_size(struct rb_node *node)
783 : {
784 : struct vmap_area *va;
785 :
786 146 : va = rb_entry_safe(node, struct vmap_area, rb_node);
787 146 : return va ? va->subtree_max_size : 0;
788 : }
789 :
790 160 : RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
791 : struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
792 :
793 : static void reclaim_and_purge_vmap_areas(void);
794 : static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
795 : static void drain_vmap_area_work(struct work_struct *work);
796 : static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
797 :
798 : static atomic_long_t nr_vmalloc_pages;
799 :
800 0 : unsigned long vmalloc_nr_pages(void)
801 : {
802 0 : return atomic_long_read(&nr_vmalloc_pages);
803 : }
804 :
805 : /* Look up the first VA which satisfies addr < va_end, NULL if none. */
806 : static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
807 : {
808 0 : struct vmap_area *va = NULL;
809 0 : struct rb_node *n = vmap_area_root.rb_node;
810 :
811 0 : addr = (unsigned long)kasan_reset_tag((void *)addr);
812 :
813 0 : while (n) {
814 : struct vmap_area *tmp;
815 :
816 0 : tmp = rb_entry(n, struct vmap_area, rb_node);
817 0 : if (tmp->va_end > addr) {
818 0 : va = tmp;
819 0 : if (tmp->va_start <= addr)
820 : break;
821 :
822 0 : n = n->rb_left;
823 : } else
824 0 : n = n->rb_right;
825 : }
826 :
827 : return va;
828 : }
829 :
830 : static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
831 : {
832 16 : struct rb_node *n = root->rb_node;
833 :
834 16 : addr = (unsigned long)kasan_reset_tag((void *)addr);
835 :
836 65 : while (n) {
837 : struct vmap_area *va;
838 :
839 65 : va = rb_entry(n, struct vmap_area, rb_node);
840 65 : if (addr < va->va_start)
841 0 : n = n->rb_left;
842 65 : else if (addr >= va->va_end)
843 49 : n = n->rb_right;
844 : else
845 : return va;
846 : }
847 :
848 : return NULL;
849 : }
850 :
851 : /*
852 : * This function returns back addresses of parent node
853 : * and its left or right link for further processing.
854 : *
855 : * Otherwise NULL is returned. In that case all further
856 : * steps regarding inserting of conflicting overlap range
857 : * have to be declined and actually considered as a bug.
858 : */
859 : static __always_inline struct rb_node **
860 : find_va_links(struct vmap_area *va,
861 : struct rb_root *root, struct rb_node *from,
862 : struct rb_node **parent)
863 : {
864 : struct vmap_area *tmp_va;
865 : struct rb_node **link;
866 :
867 17 : if (root) {
868 17 : link = &root->rb_node;
869 17 : if (unlikely(!*link)) {
870 : *parent = NULL;
871 : return link;
872 : }
873 : } else {
874 : link = &from;
875 : }
876 :
877 : /*
878 : * Go to the bottom of the tree. When we hit the last point,
879 : * we end up with a parent rb_node and the correct direction,
880 : * called "link" here, to which the new va->rb_node will be attached.
881 : */
882 : do {
883 83 : tmp_va = rb_entry(*link, struct vmap_area, rb_node);
884 :
885 : /*
886 : * During the traversal we also do some sanity check.
887 : * Trigger the BUG() if there are sides(left/right)
888 : * or full overlaps.
889 : */
890 83 : if (va->va_end <= tmp_va->va_start)
891 16 : link = &(*link)->rb_left;
892 67 : else if (va->va_start >= tmp_va->va_end)
893 67 : link = &(*link)->rb_right;
894 : else {
895 0 : WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
896 : va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
897 :
898 : return NULL;
899 : }
900 83 : } while (*link);
901 :
902 31 : *parent = &tmp_va->rb_node;
903 : return link;
904 : }
905 :
906 : static __always_inline struct list_head *
907 : get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
908 : {
909 : struct list_head *list;
910 :
911 0 : if (unlikely(!parent))
912 : /*
913 : * The red-black tree where we try to find VA neighbors
914 : * before merging or inserting is empty, i.e. it means
915 : * there is no free vmap space. Normally it does not
916 : * happen but we handle this case anyway.
917 : */
918 : return NULL;
919 :
920 0 : list = &rb_entry(parent, struct vmap_area, rb_node)->list;
921 0 : return (&parent->rb_right == link ? list->next : list);
922 : }
923 :
924 : static __always_inline void
925 : __link_va(struct vmap_area *va, struct rb_root *root,
926 : struct rb_node *parent, struct rb_node **link,
927 : struct list_head *head, bool augment)
928 : {
929 : /*
930 : * VA is still not in the list, but we can
931 : * identify its future previous list_head node.
932 : */
933 33 : if (likely(parent)) {
934 31 : head = &rb_entry(parent, struct vmap_area, rb_node)->list;
935 31 : if (&parent->rb_right != link)
936 8 : head = head->prev;
937 : }
938 :
939 : /* Insert to the rb-tree */
940 66 : rb_link_node(&va->rb_node, parent, link);
941 : if (augment) {
942 : /*
943 : * Some explanation here. Just perform simple insertion
944 : * to the tree. We do not set va->subtree_max_size to
945 : * its current size before calling rb_insert_augmented().
946 : * It is because we populate the tree from the bottom
947 : * to parent levels when the node _is_ in the tree.
948 : *
949 : * Therefore we set subtree_max_size to zero after insertion,
950 : * to let __augment_tree_propagate_from() put everything into
951 : * the correct order later on.
952 : */
953 17 : rb_insert_augmented(&va->rb_node,
954 : root, &free_vmap_area_rb_augment_cb);
955 17 : va->subtree_max_size = 0;
956 : } else {
957 16 : rb_insert_color(&va->rb_node, root);
958 : }
959 :
960 : /* Address-sort this list */
961 50 : list_add(&va->list, head);
962 : }
963 :
964 : static __always_inline void
965 : link_va(struct vmap_area *va, struct rb_root *root,
966 : struct rb_node *parent, struct rb_node **link,
967 : struct list_head *head)
968 : {
969 : __link_va(va, root, parent, link, head, false);
970 : }
971 :
972 : static __always_inline void
973 : link_va_augment(struct vmap_area *va, struct rb_root *root,
974 : struct rb_node *parent, struct rb_node **link,
975 : struct list_head *head)
976 : {
977 17 : __link_va(va, root, parent, link, head, true);
978 : }
979 :
980 : static __always_inline void
981 : __unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
982 : {
983 0 : if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
984 : return;
985 :
986 : if (augment)
987 0 : rb_erase_augmented(&va->rb_node,
988 : root, &free_vmap_area_rb_augment_cb);
989 : else
990 0 : rb_erase(&va->rb_node, root);
991 :
992 0 : list_del_init(&va->list);
993 0 : RB_CLEAR_NODE(&va->rb_node);
994 : }
995 :
996 : static __always_inline void
997 : unlink_va(struct vmap_area *va, struct rb_root *root)
998 : {
999 0 : __unlink_va(va, root, false);
1000 : }
1001 :
1002 : static __always_inline void
1003 : unlink_va_augment(struct vmap_area *va, struct rb_root *root)
1004 : {
1005 0 : __unlink_va(va, root, true);
1006 : }
1007 :
1008 : #if DEBUG_AUGMENT_PROPAGATE_CHECK
1009 : /*
1010 : * Gets called when remove the node and rotate.
1011 : */
1012 : static __always_inline unsigned long
1013 : compute_subtree_max_size(struct vmap_area *va)
1014 : {
1015 : return max3(va_size(va),
1016 : get_subtree_max_size(va->rb_node.rb_left),
1017 : get_subtree_max_size(va->rb_node.rb_right));
1018 : }
1019 :
1020 : static void
1021 : augment_tree_propagate_check(void)
1022 : {
1023 : struct vmap_area *va;
1024 : unsigned long computed_size;
1025 :
1026 : list_for_each_entry(va, &free_vmap_area_list, list) {
1027 : computed_size = compute_subtree_max_size(va);
1028 : if (computed_size != va->subtree_max_size)
1029 : pr_emerg("tree is corrupted: %lu, %lu\n",
1030 : va_size(va), va->subtree_max_size);
1031 : }
1032 : }
1033 : #endif
1034 :
1035 : /*
1036 : * This function populates subtree_max_size from bottom to upper
1037 : * levels starting from the VA node. The propagation must be done
1038 : * when the VA size is modified by changing its va_start/va_end, or
1039 : * when a new VA is inserted into the tree.
1040 : *
1041 : * It means that __augment_tree_propagate_from() must be called:
1042 : * - After VA has been inserted into the tree (free path);
1043 : * - After VA has been shrunk (allocation path);
1044 : * - After VA has been increased (merging path).
1045 : *
1046 : * Please note that, it does not mean that upper parent nodes
1047 : * and their subtree_max_size are recalculated all the time up
1048 : * to the root node.
1049 : *
1050 : * 4--8
1051 : * /\
1052 : * / \
1053 : * / \
1054 : * 2--2 8--8
1055 : *
1056 : * For example if we modify the node 4, shrinking it to 2, then
1057 : * no modification is required. If we shrink the node 2 to 1,
1058 : * its subtree_max_size is updated only, and set to 1. If we shrink
1059 : * the node 8 to 6, then its subtree_max_size is set to 6 and parent
1060 : * node becomes 4--6.
1061 : */
1062 : static __always_inline void
1063 : augment_tree_propagate_from(struct vmap_area *va)
1064 : {
1065 : /*
1066 : * Populate the tree from bottom towards the root until
1067 : * the calculated maximum available size of checked node
1068 : * is equal to its current one.
1069 : */
1070 33 : free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
1071 :
1072 : #if DEBUG_AUGMENT_PROPAGATE_CHECK
1073 : augment_tree_propagate_check();
1074 : #endif
1075 : }
1076 :
1077 : static void
1078 16 : insert_vmap_area(struct vmap_area *va,
1079 : struct rb_root *root, struct list_head *head)
1080 : {
1081 : struct rb_node **link;
1082 : struct rb_node *parent;
1083 :
1084 16 : link = find_va_links(va, root, NULL, &parent);
1085 16 : if (link)
1086 16 : link_va(va, root, parent, link, head);
1087 16 : }
1088 :
1089 : static void
1090 17 : insert_vmap_area_augment(struct vmap_area *va,
1091 : struct rb_node *from, struct rb_root *root,
1092 : struct list_head *head)
1093 : {
1094 : struct rb_node **link;
1095 : struct rb_node *parent;
1096 :
1097 17 : if (from)
1098 : link = find_va_links(va, NULL, from, &parent);
1099 : else
1100 : link = find_va_links(va, root, NULL, &parent);
1101 :
1102 17 : if (link) {
1103 34 : link_va_augment(va, root, parent, link, head);
1104 : augment_tree_propagate_from(va);
1105 : }
1106 17 : }
1107 :
1108 : /*
1109 : * Merge a de-allocated chunk of VA memory with the previous
1110 : * and next free blocks. If no coalescing is possible, a new
1111 : * free area is inserted. If the VA has been merged, it is
1112 : * freed.
1113 : *
1114 : * Please note, it can return NULL in case of overlapping
1115 : * ranges, followed by a WARN() report. Although this is
1116 : * buggy behaviour, the system can stay alive and keep
1117 : * going.
1118 : */
1119 : static __always_inline struct vmap_area *
1120 : __merge_or_add_vmap_area(struct vmap_area *va,
1121 : struct rb_root *root, struct list_head *head, bool augment)
1122 : {
1123 : struct vmap_area *sibling;
1124 : struct list_head *next;
1125 : struct rb_node **link;
1126 : struct rb_node *parent;
1127 0 : bool merged = false;
1128 :
1129 : /*
1130 : * Find a place in the tree where VA potentially will be
1131 : * inserted, unless it is merged with its sibling/siblings.
1132 : */
1133 0 : link = find_va_links(va, root, NULL, &parent);
1134 0 : if (!link)
1135 : return NULL;
1136 :
1137 : /*
1138 : * Get next node of VA to check if merging can be done.
1139 : */
1140 0 : next = get_va_next_sibling(parent, link);
1141 0 : if (unlikely(next == NULL))
1142 : goto insert;
1143 :
1144 : /*
1145 : * start end
1146 : * | |
1147 : * |<------VA------>|<-----Next----->|
1148 : * | |
1149 : * start end
1150 : */
1151 0 : if (next != head) {
1152 0 : sibling = list_entry(next, struct vmap_area, list);
1153 0 : if (sibling->va_start == va->va_end) {
1154 0 : sibling->va_start = va->va_start;
1155 :
1156 : /* Free vmap_area object. */
1157 0 : kmem_cache_free(vmap_area_cachep, va);
1158 :
1159 : /* Point to the new merged area. */
1160 0 : va = sibling;
1161 0 : merged = true;
1162 : }
1163 : }
1164 :
1165 : /*
1166 : * start end
1167 : * | |
1168 : * |<-----Prev----->|<------VA------>|
1169 : * | |
1170 : * start end
1171 : */
1172 0 : if (next->prev != head) {
1173 0 : sibling = list_entry(next->prev, struct vmap_area, list);
1174 0 : if (sibling->va_end == va->va_start) {
1175 : /*
1176 : * If both neighbors are coalesced, it is important
1177 : * to unlink the "next" node first, followed by merging
1178 : * with "previous" one. Otherwise the tree might not be
1179 : * fully populated if a sibling's augmented value is
1180 : * "normalized" because of rotation operations.
1181 : */
1182 0 : if (merged)
1183 0 : __unlink_va(va, root, augment);
1184 :
1185 0 : sibling->va_end = va->va_end;
1186 :
1187 : /* Free vmap_area object. */
1188 0 : kmem_cache_free(vmap_area_cachep, va);
1189 :
1190 : /* Point to the new merged area. */
1191 0 : va = sibling;
1192 0 : merged = true;
1193 : }
1194 : }
1195 :
1196 : insert:
1197 0 : if (!merged)
1198 0 : __link_va(va, root, parent, link, head, augment);
1199 :
1200 : return va;
1201 : }
1202 :
1203 : static __always_inline struct vmap_area *
1204 : merge_or_add_vmap_area(struct vmap_area *va,
1205 : struct rb_root *root, struct list_head *head)
1206 : {
1207 0 : return __merge_or_add_vmap_area(va, root, head, false);
1208 : }
1209 :
1210 : static __always_inline struct vmap_area *
1211 : merge_or_add_vmap_area_augment(struct vmap_area *va,
1212 : struct rb_root *root, struct list_head *head)
1213 : {
1214 0 : va = __merge_or_add_vmap_area(va, root, head, true);
1215 0 : if (va)
1216 : augment_tree_propagate_from(va);
1217 :
1218 : return va;
1219 : }
1220 :
1221 : static __always_inline bool
1222 : is_within_this_va(struct vmap_area *va, unsigned long size,
1223 : unsigned long align, unsigned long vstart)
1224 : {
1225 : unsigned long nva_start_addr;
1226 :
1227 81 : if (va->va_start > vstart)
1228 65 : nva_start_addr = ALIGN(va->va_start, align);
1229 : else
1230 16 : nva_start_addr = ALIGN(vstart, align);
1231 :
1232 : /* Can be overflowed due to big size or alignment. */
1233 81 : if (nva_start_addr + size < nva_start_addr ||
1234 : nva_start_addr < vstart)
1235 : return false;
1236 :
1237 81 : return (nva_start_addr + size <= va->va_end);
1238 : }
1239 :
1240 : /*
1241 : * Find the first free block (lowest start address) in the tree
1242 : * that can satisfy the request described by the passed
1243 : * parameters. Please note, with an alignment bigger than PAGE_SIZE,
1244 : * a search length is adjusted to account for worst case alignment
1245 : * overhead.
1246 : */
1247 : static __always_inline struct vmap_area *
1248 : find_vmap_lowest_match(struct rb_root *root, unsigned long size,
1249 : unsigned long align, unsigned long vstart, bool adjust_search_size)
1250 : {
1251 : struct vmap_area *va;
1252 : struct rb_node *node;
1253 : unsigned long length;
1254 :
1255 : /* Start from the root. */
1256 16 : node = root->rb_node;
1257 :
1258 : /* Adjust the search size for alignment overhead. */
1259 16 : length = adjust_search_size ? size + align - 1 : size;
1260 :
1261 81 : while (node) {
1262 81 : va = rb_entry(node, struct vmap_area, rb_node);
1263 :
1264 186 : if (get_subtree_max_size(node->rb_left) >= length &&
1265 24 : vstart < va->va_start) {
1266 : node = node->rb_left;
1267 : } else {
1268 57 : if (is_within_this_va(va, size, align, vstart))
1269 : return va;
1270 :
1271 : /*
1272 : * Does not make sense to go deeper towards the right
1273 : * sub-tree if it does not have a free block that is
1274 : * equal to or bigger than the requested search length.
1275 : */
1276 84 : if (get_subtree_max_size(node->rb_right) >= length) {
1277 27 : node = node->rb_right;
1278 27 : continue;
1279 : }
1280 :
1281 : /*
1282 : * OK. We roll back and find the first right sub-tree
1283 : * that will satisfy the search criteria. It can happen
1284 : * due to the "vstart" restriction or an alignment overhead
1285 : * that is bigger than PAGE_SIZE.
1286 : */
1287 24 : while ((node = rb_parent(node))) {
1288 24 : va = rb_entry(node, struct vmap_area, rb_node);
1289 24 : if (is_within_this_va(va, size, align, vstart))
1290 : return va;
1291 :
1292 46 : if (get_subtree_max_size(node->rb_right) >= length &&
1293 : vstart <= va->va_start) {
1294 : /*
1295 : * Shift the vstart forward. Please note, we update it with
1296 : * parent's start address adding "1" because we do not want
1297 : * to enter same sub-tree after it has already been checked
1298 : * and no suitable free block found there.
1299 : */
1300 14 : vstart = va->va_start + 1;
1301 14 : node = node->rb_right;
1302 : break;
1303 : }
1304 : }
1305 : }
1306 : }
1307 :
1308 : return NULL;
1309 : }
1310 :
1311 : #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1312 : #include <linux/random.h>
1313 :
1314 : static struct vmap_area *
1315 : find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
1316 : unsigned long align, unsigned long vstart)
1317 : {
1318 : struct vmap_area *va;
1319 :
1320 : list_for_each_entry(va, head, list) {
1321 : if (!is_within_this_va(va, size, align, vstart))
1322 : continue;
1323 :
1324 : return va;
1325 : }
1326 :
1327 : return NULL;
1328 : }
1329 :
1330 : static void
1331 : find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
1332 : unsigned long size, unsigned long align)
1333 : {
1334 : struct vmap_area *va_1, *va_2;
1335 : unsigned long vstart;
1336 : unsigned int rnd;
1337 :
1338 : get_random_bytes(&rnd, sizeof(rnd));
1339 : vstart = VMALLOC_START + rnd;
1340 :
1341 : va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
1342 : va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
1343 :
1344 : if (va_1 != va_2)
1345 : pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
1346 : va_1, va_2, vstart);
1347 : }
1348 : #endif
1349 :
1350 : enum fit_type {
1351 : NOTHING_FIT = 0,
1352 : FL_FIT_TYPE = 1, /* full fit */
1353 : LE_FIT_TYPE = 2, /* left edge fit */
1354 : RE_FIT_TYPE = 3, /* right edge fit */
1355 : NE_FIT_TYPE = 4 /* no edge fit */
1356 : };
1357 :
1358 : static __always_inline enum fit_type
1359 : classify_va_fit_type(struct vmap_area *va,
1360 : unsigned long nva_start_addr, unsigned long size)
1361 : {
1362 : enum fit_type type;
1363 :
1364 : /* Check if it is within VA. */
1365 32 : if (nva_start_addr < va->va_start ||
1366 16 : nva_start_addr + size > va->va_end)
1367 : return NOTHING_FIT;
1368 :
1369 : /* Now classify. */
1370 16 : if (va->va_start == nva_start_addr) {
1371 0 : if (va->va_end == nva_start_addr + size)
1372 : type = FL_FIT_TYPE;
1373 : else
1374 0 : type = LE_FIT_TYPE;
1375 16 : } else if (va->va_end == nva_start_addr + size) {
1376 : type = RE_FIT_TYPE;
1377 : } else {
1378 16 : type = NE_FIT_TYPE;
1379 : }
1380 :
1381 : return type;
1382 : }
1383 :
1384 : static __always_inline int
1385 : adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
1386 : struct vmap_area *va, unsigned long nva_start_addr,
1387 : unsigned long size)
1388 : {
1389 16 : struct vmap_area *lva = NULL;
1390 16 : enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
1391 :
1392 16 : if (type == FL_FIT_TYPE) {
1393 : /*
1394 : * No need to split VA, it fully fits.
1395 : *
1396 : * | |
1397 : * V NVA V
1398 : * |---------------|
1399 : */
1400 0 : unlink_va_augment(va, root);
1401 0 : kmem_cache_free(vmap_area_cachep, va);
1402 16 : } else if (type == LE_FIT_TYPE) {
1403 : /*
1404 : * Split left edge of fit VA.
1405 : *
1406 : * | |
1407 : * V NVA V R
1408 : * |-------|-------|
1409 : */
1410 0 : va->va_start += size;
1411 16 : } else if (type == RE_FIT_TYPE) {
1412 : /*
1413 : * Split right edge of fit VA.
1414 : *
1415 : * | |
1416 : * L V NVA V
1417 : * |-------|-------|
1418 : */
1419 0 : va->va_end = nva_start_addr;
1420 16 : } else if (type == NE_FIT_TYPE) {
1421 : /*
1422 : * Split no edge of fit VA.
1423 : *
1424 : * | |
1425 : * L V NVA V R
1426 : * |---|-------|---|
1427 : */
1428 16 : lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
1429 16 : if (unlikely(!lva)) {
1430 : /*
1431 : * For percpu allocator we do not do any pre-allocation
1432 : * and leave it as it is. The reason is it most likely
1433 : * never ends up with NE_FIT_TYPE splitting. In case of
1434 : * percpu allocations offsets and sizes are aligned to
1435 : * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
1436 : * are its main fitting cases.
1437 : *
1438 : * There are a few exceptions though; one example is the
1439 : * first allocation (early boot-up), when we have "one"
1440 : * big free space that has to be split.
1441 : *
1442 : * Also we can hit this path in case of regular "vmap"
1443 : * allocations, if "this" current CPU was not preloaded.
1444 : * See the comment in alloc_vmap_area() why. If so, then
1445 : * GFP_NOWAIT is used instead to get an extra object for
1446 : * split purpose. That is rare and most of the time does
1447 : * not occur.
1448 : *
1449 : * What happens if an allocation fails? Basically,
1450 : * an "overflow" path is triggered to purge lazily freed
1451 : * areas to free some memory, then, the "retry" path is
1452 : * triggered to repeat one more time. See more details
1453 : * in alloc_vmap_area() function.
1454 : */
1455 0 : lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
1456 0 : if (!lva)
1457 : return -1;
1458 : }
1459 :
1460 : /*
1461 : * Build the remainder.
1462 : */
1463 16 : lva->va_start = va->va_start;
1464 16 : lva->va_end = nva_start_addr;
1465 :
1466 : /*
1467 : * Shrink this VA to remaining size.
1468 : */
1469 16 : va->va_start = nva_start_addr + size;
1470 : } else {
1471 : return -1;
1472 : }
1473 :
1474 16 : if (type != FL_FIT_TYPE) {
1475 16 : augment_tree_propagate_from(va);
1476 :
1477 16 : if (lva) /* type == NE_FIT_TYPE */
1478 16 : insert_vmap_area_augment(lva, &va->rb_node, root, head);
1479 : }
1480 :
1481 : return 0;
1482 : }
1483 :
1484 : /*
1485 : * Returns a start address of the newly allocated area, if success.
1486 : * Otherwise a vend is returned that indicates failure.
1487 : */
1488 : static __always_inline unsigned long
1489 : __alloc_vmap_area(struct rb_root *root, struct list_head *head,
1490 : unsigned long size, unsigned long align,
1491 : unsigned long vstart, unsigned long vend)
1492 : {
1493 16 : bool adjust_search_size = true;
1494 : unsigned long nva_start_addr;
1495 : struct vmap_area *va;
1496 : int ret;
1497 :
1498 : /*
1499 : * Do not adjust when:
1500 : * a) align <= PAGE_SIZE, because it does not make any sense.
1501 : * All blocks (their start addresses) are at least PAGE_SIZE
1502 : * aligned anyway;
1503 : * b) a short range where a requested size corresponds to exactly
1504 : * specified [vstart:vend] interval and an alignment > PAGE_SIZE.
1505 : * With adjusted search length an allocation would not succeed.
1506 : */
1507 16 : if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
1508 0 : adjust_search_size = false;
1509 :
1510 32 : va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
1511 16 : if (unlikely(!va))
1512 : return vend;
1513 :
1514 16 : if (va->va_start > vstart)
1515 15 : nva_start_addr = ALIGN(va->va_start, align);
1516 : else
1517 1 : nva_start_addr = ALIGN(vstart, align);
1518 :
1519 : /* Check the "vend" restriction. */
1520 16 : if (nva_start_addr + size > vend)
1521 : return vend;
1522 :
1523 : /* Update the free vmap_area. */
1524 16 : ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
1525 16 : if (WARN_ON_ONCE(ret))
1526 : return vend;
1527 :
1528 : #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1529 : find_vmap_lowest_match_check(root, head, size, align);
1530 : #endif
1531 :
1532 : return nva_start_addr;
1533 : }
1534 :
1535 : /*
1536 : * Free a region of KVA allocated by alloc_vmap_area
1537 : */
1538 0 : static void free_vmap_area(struct vmap_area *va)
1539 : {
1540 : /*
1541 : * Remove from the busy tree/list.
1542 : */
1543 0 : spin_lock(&vmap_area_lock);
1544 0 : unlink_va(va, &vmap_area_root);
1545 0 : spin_unlock(&vmap_area_lock);
1546 :
1547 : /*
1548 : * Insert/Merge it back to the free tree/list.
1549 : */
1550 0 : spin_lock(&free_vmap_area_lock);
1551 0 : merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
1552 0 : spin_unlock(&free_vmap_area_lock);
1553 0 : }
1554 :
1555 : static inline void
1556 16 : preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
1557 : {
1558 16 : struct vmap_area *va = NULL;
1559 :
1560 : /*
1561 : * Preload this CPU with one extra vmap_area object. It is used
1562 : * when fit type of free area is NE_FIT_TYPE. It guarantees that
1563 : * a CPU that does an allocation is preloaded.
1564 : *
1565 : * We do it in a non-atomic context, which allows us to use more
1566 : * permissive allocation masks and thus be more stable under
1567 : * low-memory conditions and high memory pressure.
1568 : */
1569 16 : if (!this_cpu_read(ne_fit_preload_node))
1570 16 : va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
1571 :
1572 16 : spin_lock(lock);
1573 :
1574 16 : if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
1575 0 : kmem_cache_free(vmap_area_cachep, va);
1576 16 : }
1577 :
1578 : /*
1579 : * Allocate a region of KVA of the specified size and alignment, within the
1580 : * vstart and vend.
1581 : */
1582 16 : static struct vmap_area *alloc_vmap_area(unsigned long size,
1583 : unsigned long align,
1584 : unsigned long vstart, unsigned long vend,
1585 : int node, gfp_t gfp_mask,
1586 : unsigned long va_flags)
1587 : {
1588 : struct vmap_area *va;
1589 : unsigned long freed;
1590 : unsigned long addr;
1591 16 : int purged = 0;
1592 : int ret;
1593 :
1594 32 : if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
1595 : return ERR_PTR(-EINVAL);
1596 :
1597 16 : if (unlikely(!vmap_initialized))
1598 : return ERR_PTR(-EBUSY);
1599 :
1600 : might_sleep();
1601 16 : gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
1602 :
1603 16 : va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
1604 16 : if (unlikely(!va))
1605 : return ERR_PTR(-ENOMEM);
1606 :
1607 : /*
1608 : * Only scan the relevant parts containing pointers to other objects
1609 : * to avoid false negatives.
1610 : */
1611 : kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
1612 :
1613 : retry:
1614 16 : preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
1615 16 : addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
1616 : size, align, vstart, vend);
1617 16 : spin_unlock(&free_vmap_area_lock);
1618 :
1619 16 : trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
1620 :
1621 : /*
1622 : * If an allocation fails, the "vend" address is
1623 : * returned. Therefore trigger the overflow path.
1624 : */
1625 16 : if (unlikely(addr == vend))
1626 : goto overflow;
1627 :
1628 16 : va->va_start = addr;
1629 16 : va->va_end = addr + size;
1630 16 : va->vm = NULL;
1631 16 : va->flags = va_flags;
1632 :
1633 16 : spin_lock(&vmap_area_lock);
1634 16 : insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
1635 16 : spin_unlock(&vmap_area_lock);
1636 :
1637 16 : BUG_ON(!IS_ALIGNED(va->va_start, align));
1638 16 : BUG_ON(va->va_start < vstart);
1639 16 : BUG_ON(va->va_end > vend);
1640 :
1641 : ret = kasan_populate_vmalloc(addr, size);
1642 : if (ret) {
1643 : free_vmap_area(va);
1644 : return ERR_PTR(ret);
1645 : }
1646 :
1647 : return va;
1648 :
1649 : overflow:
1650 0 : if (!purged) {
1651 0 : reclaim_and_purge_vmap_areas();
1652 0 : purged = 1;
1653 0 : goto retry;
1654 : }
1655 :
1656 0 : freed = 0;
1657 0 : blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
1658 :
1659 0 : if (freed > 0) {
1660 : purged = 0;
1661 : goto retry;
1662 : }
1663 :
1664 0 : if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
1665 0 : pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
1666 : size);
1667 :
1668 0 : kmem_cache_free(vmap_area_cachep, va);
1669 0 : return ERR_PTR(-EBUSY);
1670 : }
1671 :
1672 0 : int register_vmap_purge_notifier(struct notifier_block *nb)
1673 : {
1674 0 : return blocking_notifier_chain_register(&vmap_notify_list, nb);
1675 : }
1676 : EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
1677 :
1678 0 : int unregister_vmap_purge_notifier(struct notifier_block *nb)
1679 : {
1680 0 : return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
1681 : }
1682 : EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
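A subsystem that caches vmap mappings of its own can hook this chain to release them when alloc_vmap_area() hits its overflow path. The chain passes a pointer to an unsigned long counter; bumping it above zero makes the failed allocation retry. A hedged sketch (the demo_* names are hypothetical):

#include <linux/vmalloc.h>
#include <linux/notifier.h>

static int demo_vmap_purge(struct notifier_block *nb, unsigned long action,
			   void *data)
{
	unsigned long *freed = data;

	/* Drop driver-private vmap caches here and account what was freed. */
	*freed += 0;	/* replace 0 with the amount actually released */

	return NOTIFY_OK;
}

static struct notifier_block demo_vmap_purge_nb = {
	.notifier_call	= demo_vmap_purge,
};

/* register_vmap_purge_notifier(&demo_vmap_purge_nb) at init time,
 * unregister_vmap_purge_notifier(&demo_vmap_purge_nb) on teardown. */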
1683 :
1684 : /*
1685 : * lazy_max_pages is the maximum amount of virtual address space we gather up
1686 : * before attempting to purge with a TLB flush.
1687 : *
1688 : * There is a tradeoff here: a larger number will cover more kernel page tables
1689 : * and take slightly longer to purge, but it will linearly reduce the number of
1690 : * global TLB flushes that must be performed. It would seem natural to scale
1691 : * this number up linearly with the number of CPUs (because vmapping activity
1692 : * could also scale linearly with the number of CPUs), however it is likely
1693 : * that in practice, workloads might be constrained in other ways that mean
1694 : * vmap activity will not scale linearly with CPUs. Also, I want to be
1695 : * conservative and not introduce a big latency on huge systems, so go with
1696 : * a less aggressive log scale. It will still be an improvement over the old
1697 : * code, and it will be simple to change the scale factor if we find that it
1698 : * becomes a problem on bigger systems.
1699 : */
1700 : static unsigned long lazy_max_pages(void)
1701 : {
1702 : unsigned int log;
1703 :
1704 0 : log = fls(num_online_cpus());
1705 :
1706 0 : return log * (32UL * 1024 * 1024 / PAGE_SIZE);
1707 : }
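As a worked example of this log scale: with 8 online CPUs, fls(8) = 4, and with 4 KiB pages 32UL * 1024 * 1024 / PAGE_SIZE = 8192, so lazy_max_pages() returns 4 * 8192 = 32768 pages. In other words, roughly 128 MB of lazily-freed virtual address space is accumulated before free_vmap_area_noflush() below schedules the drain work.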
1708 :
1709 : static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
1710 :
1711 : /*
1712 : * Serialize vmap purging. There is no actual critical section protected
1713 : * by this lock, but we want to avoid concurrent calls for performance
1714 : * reasons and to make pcpu_get_vm_areas() more deterministic.
1715 : */
1716 : static DEFINE_MUTEX(vmap_purge_lock);
1717 :
1718 : /* for per-CPU blocks */
1719 : static void purge_fragmented_blocks_allcpus(void);
1720 :
1721 : /*
1722 : * Purges all lazily-freed vmap areas.
1723 : */
1724 0 : static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
1725 : {
1726 : unsigned long resched_threshold;
1727 0 : unsigned int num_purged_areas = 0;
1728 : struct list_head local_purge_list;
1729 : struct vmap_area *va, *n_va;
1730 :
1731 : lockdep_assert_held(&vmap_purge_lock);
1732 :
1733 0 : spin_lock(&purge_vmap_area_lock);
1734 0 : purge_vmap_area_root = RB_ROOT;
1735 0 : list_replace_init(&purge_vmap_area_list, &local_purge_list);
1736 0 : spin_unlock(&purge_vmap_area_lock);
1737 :
1738 0 : if (unlikely(list_empty(&local_purge_list)))
1739 : goto out;
1740 :
1741 0 : start = min(start,
1742 : list_first_entry(&local_purge_list,
1743 : struct vmap_area, list)->va_start);
1744 :
1745 0 : end = max(end,
1746 : list_last_entry(&local_purge_list,
1747 : struct vmap_area, list)->va_end);
1748 :
1749 0 : flush_tlb_kernel_range(start, end);
1750 0 : resched_threshold = lazy_max_pages() << 1;
1751 :
1752 0 : spin_lock(&free_vmap_area_lock);
1753 0 : list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
1754 0 : unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1755 0 : unsigned long orig_start = va->va_start;
1756 0 : unsigned long orig_end = va->va_end;
1757 :
1758 : /*
1759 : * Finally insert or merge lazily-freed area. It is
1760 : * detached and there is no need to "unlink" it from
1761 : * anything.
1762 : */
1763 0 : va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
1764 : &free_vmap_area_list);
1765 :
1766 0 : if (!va)
1767 0 : continue;
1768 :
1769 0 : if (is_vmalloc_or_module_addr((void *)orig_start))
1770 : kasan_release_vmalloc(orig_start, orig_end,
1771 : va->va_start, va->va_end);
1772 :
1773 0 : atomic_long_sub(nr, &vmap_lazy_nr);
1774 0 : num_purged_areas++;
1775 :
1776 0 : if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
1777 0 : cond_resched_lock(&free_vmap_area_lock);
1778 : }
1779 : spin_unlock(&free_vmap_area_lock);
1780 :
1781 : out:
1782 0 : trace_purge_vmap_area_lazy(start, end, num_purged_areas);
1783 0 : return num_purged_areas > 0;
1784 : }
1785 :
1786 : /*
1787 : * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list.
1788 : */
1789 0 : static void reclaim_and_purge_vmap_areas(void)
1790 :
1791 : {
1792 0 : mutex_lock(&vmap_purge_lock);
1793 0 : purge_fragmented_blocks_allcpus();
1794 0 : __purge_vmap_area_lazy(ULONG_MAX, 0);
1795 0 : mutex_unlock(&vmap_purge_lock);
1796 0 : }
1797 :
1798 0 : static void drain_vmap_area_work(struct work_struct *work)
1799 : {
1800 : unsigned long nr_lazy;
1801 :
1802 : do {
1803 0 : mutex_lock(&vmap_purge_lock);
1804 0 : __purge_vmap_area_lazy(ULONG_MAX, 0);
1805 0 : mutex_unlock(&vmap_purge_lock);
1806 :
1807 : /* Recheck if further work is required. */
1808 0 : nr_lazy = atomic_long_read(&vmap_lazy_nr);
1809 0 : } while (nr_lazy > lazy_max_pages());
1810 0 : }
1811 :
1812 : /*
1813 : * Free a vmap area; the caller must ensure that the area has been
1814 : * unmapped and unlinked, and that flush_cache_vunmap() has been called
1815 : * for the range beforehand.
1816 : */
1817 0 : static void free_vmap_area_noflush(struct vmap_area *va)
1818 : {
1819 0 : unsigned long nr_lazy_max = lazy_max_pages();
1820 0 : unsigned long va_start = va->va_start;
1821 : unsigned long nr_lazy;
1822 :
1823 0 : if (WARN_ON_ONCE(!list_empty(&va->list)))
1824 : return;
1825 :
1826 0 : nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
1827 : PAGE_SHIFT, &vmap_lazy_nr);
1828 :
1829 : /*
1830 : * Merge or place it to the purge tree/list.
1831 : */
1832 0 : spin_lock(&purge_vmap_area_lock);
1833 0 : merge_or_add_vmap_area(va,
1834 : &purge_vmap_area_root, &purge_vmap_area_list);
1835 0 : spin_unlock(&purge_vmap_area_lock);
1836 :
1837 0 : trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
1838 :
1839 : /* After this point, we may free va at any time */
1840 0 : if (unlikely(nr_lazy > nr_lazy_max))
1841 : schedule_work(&drain_vmap_work);
1842 : }
1843 :
1844 : /*
1845 : * Free and unmap a vmap area
1846 : */
1847 0 : static void free_unmap_vmap_area(struct vmap_area *va)
1848 : {
1849 0 : flush_cache_vunmap(va->va_start, va->va_end);
1850 0 : vunmap_range_noflush(va->va_start, va->va_end);
1851 : if (debug_pagealloc_enabled_static())
1852 : flush_tlb_kernel_range(va->va_start, va->va_end);
1853 :
1854 0 : free_vmap_area_noflush(va);
1855 0 : }
1856 :
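/*
 * Look up the busy vmap_area that contains @addr, taking vmap_area_lock
 * around the tree walk. Returns NULL if no area covers @addr.
 */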
1857 0 : struct vmap_area *find_vmap_area(unsigned long addr)
1858 : {
1859 : struct vmap_area *va;
1860 :
1861 16 : spin_lock(&vmap_area_lock);
1862 16 : va = __find_vmap_area(addr, &vmap_area_root);
1863 16 : spin_unlock(&vmap_area_lock);
1864 :
1865 0 : return va;
1866 : }
1867 :
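/*
 * Like find_vmap_area(), but also unlink the area from the busy tree
 * while holding vmap_area_lock, so the caller owns it exclusively.
 */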
1868 0 : static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
1869 : {
1870 : struct vmap_area *va;
1871 :
1872 0 : spin_lock(&vmap_area_lock);
1873 0 : va = __find_vmap_area(addr, &vmap_area_root);
1874 0 : if (va)
1875 : unlink_va(va, &vmap_area_root);
1876 0 : spin_unlock(&vmap_area_lock);
1877 :
1878 0 : return va;
1879 : }
1880 :
1881 : /*** Per cpu kva allocator ***/
1882 :
1883 : /*
1884 : * vmap space is limited, especially on 32-bit architectures. Ensure there is
1885 : * room for at least 16 vmap blocks per CPU.
1886 : */
1887 : /*
1888 : * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
1889 : * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
1890 : * instead (we just need a rough idea)
1891 : */
1892 : #if BITS_PER_LONG == 32
1893 : #define VMALLOC_SPACE (128UL*1024*1024)
1894 : #else
1895 : #define VMALLOC_SPACE (128UL*1024*1024*1024)
1896 : #endif
1897 :
1898 : #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
1899 : #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
1900 : #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
1901 : #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
1902 : #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
1903 : #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
1904 : #define VMAP_BBMAP_BITS \
1905 : VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
1906 : VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
1907 : VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
1908 :
1909 : #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
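/*
 * For example, with the 64-bit guess above (128 GiB of vmalloc space),
 * 4 KiB pages and NR_CPUS = 64 (an illustrative config value):
 * VMALLOC_PAGES / 64 / 16 = 32768, which is clamped by
 * VMAP_BBMAP_BITS_MAX to 1024, so VMAP_BBMAP_BITS = 1024 and
 * VMAP_BLOCK_SIZE = 4 MiB per block.
 */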
1910 :
1911 : /*
1912 : * Purge threshold to prevent overeager purging of fragmented blocks for
1913 : * regular operations: Purge if vb->free is less than 1/4 of the capacity.
1914 : */
1915 : #define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4)
1916 :
1917 : #define VMAP_RAM 0x1 /* indicates a vm_map_ram area */
1918 : #define VMAP_BLOCK 0x2 /* marks the vmap_block sub-type */
1919 : #define VMAP_FLAGS_MASK 0x3
1920 :
1921 : struct vmap_block_queue {
1922 : spinlock_t lock;
1923 : struct list_head free;
1924 :
1925 : /*
1926 : * An xarray requires extra memory to be allocated
1927 : * dynamically. If that becomes an issue, we can switch
1928 : * to an rb-tree instead.
1929 : */
1930 : struct xarray vmap_blocks;
1931 : };
1932 :
1933 : struct vmap_block {
1934 : spinlock_t lock;
1935 : struct vmap_area *va;
1936 : unsigned long free, dirty;
1937 : DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
1938 : unsigned long dirty_min, dirty_max; /*< dirty range */
1939 : struct list_head free_list;
1940 : struct rcu_head rcu_head;
1941 : struct list_head purge;
1942 : };
1943 :
1944 : /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
1945 : static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
1946 :
1947 : /*
1948 : * In order to have fast access to any "vmap_block" associated with a
1949 : * specific address, we use a hash.
1950 : *
1951 : * A per-cpu vmap_block_queue is used in two ways: to serialize
1952 : * access to the free-block chains among CPUs (alloc path), and as
1953 : * a vmap_block hash (alloc/free paths). In other words, we overload
1954 : * it, since we already have the per-cpu array and can use it as a
1955 : * hash table. When used as a hash, the 'cpu' passed to per_cpu()
1956 : * is not actually a CPU but rather a hash index.
1957 : *
1958 : * The hash function is addr_to_vb_xa(), which maps any address to
1959 : * the hash index it belongs to. The per_cpu() macro is then used
1960 : * to access the array entry at that index.
1961 : *
1962 : * An example:
1963 : *
1964 : * CPU_1 CPU_2 CPU_0
1965 : * | | |
1966 : * V V V
1967 : * 0 10 20 30 40 50 60
1968 : * |------|------|------|------|------|------|...<vmap address space>
1969 : * CPU0 CPU1 CPU2 CPU0 CPU1 CPU2
1970 : *
1971 : * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
1972 : * it accesses: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
1973 : *
1974 : * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
1975 : * it accesses: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
1976 : *
1977 : * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
1978 : * it accesses: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
1979 : *
1980 : * This technique almost always avoids lock contention on insert/remove;
1981 : * the xarray spinlocks protect against any contention that remains.
1982 : */
1983 : static struct xarray *
1984 : addr_to_vb_xa(unsigned long addr)
1985 : {
1986 0 : int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
1987 :
1988 0 : return &per_cpu(vmap_block_queue, index).vmap_blocks;
1989 : }
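/*
 * For instance, with 4 possible CPUs, an address whose block number
 * (addr / VMAP_BLOCK_SIZE) is 5 hashes to index 5 % 4 = 1, so the
 * lookup goes through CPU1's vmap_blocks xarray regardless of which
 * CPU the caller is currently running on.
 */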
1990 :
1991 : /*
1992 : * We should probably have a fallback mechanism to allocate virtual memory
1993 : * out of partially filled vmap blocks. However vmap block sizing should be
1994 : * fairly reasonable according to the vmalloc size, so it shouldn't be a
1995 : * big problem.
1996 : */
1997 :
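/*
 * Block index of @addr, relative to VMALLOC_START rounded down to a
 * VMAP_BLOCK_SIZE boundary.
 */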
1998 : static unsigned long addr_to_vb_idx(unsigned long addr)
1999 : {
2000 0 : addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
2001 0 : addr /= VMAP_BLOCK_SIZE;
2002 : return addr;
2003 : }
2004 :
2005 0 : static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
2006 : {
2007 : unsigned long addr;
2008 :
2009 0 : addr = va_start + (pages_off << PAGE_SHIFT);
2010 0 : BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
2011 0 : return (void *)addr;
2012 : }
2013 :
2014 : /**
2015 : * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in
2016 : *                  this block. The number of pages cannot exceed VMAP_BBMAP_BITS.
2017 : * @order: allocate 2^order pages within the newly allocated block
2018 : * @gfp_mask: flags for the page level allocator
2019 : *
2020 : * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
2021 : */
2022 0 : static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
2023 : {
2024 : struct vmap_block_queue *vbq;
2025 : struct vmap_block *vb;
2026 : struct vmap_area *va;
2027 : struct xarray *xa;
2028 : unsigned long vb_idx;
2029 : int node, err;
2030 : void *vaddr;
2031 :
2032 0 : node = numa_node_id();
2033 :
2034 0 : vb = kmalloc_node(sizeof(struct vmap_block),
2035 : gfp_mask & GFP_RECLAIM_MASK, node);
2036 0 : if (unlikely(!vb))
2037 : return ERR_PTR(-ENOMEM);
2038 :
2039 0 : va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
2040 0 : VMALLOC_START, VMALLOC_END,
2041 : node, gfp_mask,
2042 : VMAP_RAM|VMAP_BLOCK);
2043 0 : if (IS_ERR(va)) {
2044 0 : kfree(vb);
2045 0 : return ERR_CAST(va);
2046 : }
2047 :
2048 0 : vaddr = vmap_block_vaddr(va->va_start, 0);
2049 0 : spin_lock_init(&vb->lock);
2050 0 : vb->va = va;
2051 : /* At least something should be left free */
2052 0 : BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
2053 0 : bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
2054 0 : vb->free = VMAP_BBMAP_BITS - (1UL << order);
2055 0 : vb->dirty = 0;
2056 0 : vb->dirty_min = VMAP_BBMAP_BITS;
2057 0 : vb->dirty_max = 0;
2058 0 : bitmap_set(vb->used_map, 0, (1UL << order));
2059 0 : INIT_LIST_HEAD(&vb->free_list);
2060 :
2061 0 : xa = addr_to_vb_xa(va->va_start);
2062 0 : vb_idx = addr_to_vb_idx(va->va_start);
2063 0 : err = xa_insert(xa, vb_idx, vb, gfp_mask);
2064 0 : if (err) {
2065 0 : kfree(vb);
2066 0 : free_vmap_area(va);
2067 0 : return ERR_PTR(err);
2068 : }
2069 :
2070 0 : vbq = raw_cpu_ptr(&vmap_block_queue);
2071 0 : spin_lock(&vbq->lock);
2072 0 : list_add_tail_rcu(&vb->free_list, &vbq->free);
2073 0 : spin_unlock(&vbq->lock);
2074 :
2075 0 : return vaddr;
2076 : }
2077 :
2078 0 : static void free_vmap_block(struct vmap_block *vb)
2079 : {
2080 : struct vmap_block *tmp;
2081 : struct xarray *xa;
2082 :
2083 0 : xa = addr_to_vb_xa(vb->va->va_start);
2084 0 : tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
2085 0 : BUG_ON(tmp != vb);
2086 :
2087 0 : spin_lock(&vmap_area_lock);
2088 0 : unlink_va(vb->va, &vmap_area_root);
2089 0 : spin_unlock(&vmap_area_lock);
2090 :
2091 0 : free_vmap_area_noflush(vb->va);
2092 0 : kfree_rcu(vb, rcu_head);
2093 0 : }
2094 :
2095 : static bool purge_fragmented_block(struct vmap_block *vb,
2096 : struct vmap_block_queue *vbq, struct list_head *purge_list,
2097 : bool force_purge)
2098 : {
2099 0 : if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
2100 : vb->dirty == VMAP_BBMAP_BITS)
2101 : return false;
2102 :
2103 : /* Don't overeagerly purge usable blocks unless requested */
2104 0 : if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD))
2105 : return false;
2106 :
2107 : /* prevent further allocs after releasing lock */
2108 0 : WRITE_ONCE(vb->free, 0);
2109 : /* prevent purging it again */
2110 0 : WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
2111 0 : vb->dirty_min = 0;
2112 0 : vb->dirty_max = VMAP_BBMAP_BITS;
2113 0 : spin_lock(&vbq->lock);
2114 0 : list_del_rcu(&vb->free_list);
2115 0 : spin_unlock(&vbq->lock);
2116 0 : list_add_tail(&vb->purge, purge_list);
2117 : return true;
2118 : }
2119 :
2120 0 : static void free_purged_blocks(struct list_head *purge_list)
2121 : {
2122 : struct vmap_block *vb, *n_vb;
2123 :
2124 0 : list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
2125 0 : list_del(&vb->purge);
2126 0 : free_vmap_block(vb);
2127 : }
2128 0 : }
2129 :
2130 0 : static void purge_fragmented_blocks(int cpu)
2131 : {
2132 0 : LIST_HEAD(purge);
2133 : struct vmap_block *vb;
2134 0 : struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2135 :
2136 : rcu_read_lock();
2137 0 : list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2138 0 : unsigned long free = READ_ONCE(vb->free);
2139 0 : unsigned long dirty = READ_ONCE(vb->dirty);
2140 :
2141 0 : if (free + dirty != VMAP_BBMAP_BITS ||
2142 : dirty == VMAP_BBMAP_BITS)
2143 0 : continue;
2144 :
2145 0 : spin_lock(&vb->lock);
2146 0 : purge_fragmented_block(vb, vbq, &purge, true);
2147 0 : spin_unlock(&vb->lock);
2148 : }
2149 : rcu_read_unlock();
2150 0 : free_purged_blocks(&purge);
2151 0 : }
2152 :
2153 : static void purge_fragmented_blocks_allcpus(void)
2154 : {
2155 : int cpu;
2156 :
2157 0 : for_each_possible_cpu(cpu)
2158 0 : purge_fragmented_blocks(cpu);
2159 : }
2160 :
2161 0 : static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
2162 : {
2163 : struct vmap_block_queue *vbq;
2164 : struct vmap_block *vb;
2165 0 : void *vaddr = NULL;
2166 : unsigned int order;
2167 :
2168 0 : BUG_ON(offset_in_page(size));
2169 0 : BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
2170 0 : if (WARN_ON(size == 0)) {
2171 : /*
2172 : * Allocating 0 bytes isn't what the caller wants, since
2173 : * get_order(0) returns a funny result. Just warn and bail
2174 : * out early.
2175 : */
2176 : return NULL;
2177 : }
2178 0 : order = get_order(size);
2179 :
2180 : rcu_read_lock();
2181 0 : vbq = raw_cpu_ptr(&vmap_block_queue);
2182 0 : list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2183 : unsigned long pages_off;
2184 :
2185 0 : if (READ_ONCE(vb->free) < (1UL << order))
2186 0 : continue;
2187 :
2188 0 : spin_lock(&vb->lock);
2189 0 : if (vb->free < (1UL << order)) {
2190 0 : spin_unlock(&vb->lock);
2191 0 : continue;
2192 : }
2193 :
2194 0 : pages_off = VMAP_BBMAP_BITS - vb->free;
2195 0 : vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
2196 0 : WRITE_ONCE(vb->free, vb->free - (1UL << order));
2197 0 : bitmap_set(vb->used_map, pages_off, (1UL << order));
2198 0 : if (vb->free == 0) {
2199 0 : spin_lock(&vbq->lock);
2200 0 : list_del_rcu(&vb->free_list);
2201 0 : spin_unlock(&vbq->lock);
2202 : }
2203 :
2204 0 : spin_unlock(&vb->lock);
2205 : break;
2206 : }
2207 :
2208 : rcu_read_unlock();
2209 :
2210 : /* Allocate new block if nothing was found */
2211 0 : if (!vaddr)
2212 0 : vaddr = new_vmap_block(order, gfp_mask);
2213 :
2214 : return vaddr;
2215 : }
2216 :
2217 0 : static void vb_free(unsigned long addr, unsigned long size)
2218 : {
2219 : unsigned long offset;
2220 : unsigned int order;
2221 : struct vmap_block *vb;
2222 : struct xarray *xa;
2223 :
2224 0 : BUG_ON(offset_in_page(size));
2225 0 : BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
2226 :
2227 0 : flush_cache_vunmap(addr, addr + size);
2228 :
2229 0 : order = get_order(size);
2230 0 : offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
2231 :
2232 0 : xa = addr_to_vb_xa(addr);
2233 0 : vb = xa_load(xa, addr_to_vb_idx(addr));
2234 :
2235 0 : spin_lock(&vb->lock);
2236 0 : bitmap_clear(vb->used_map, offset, (1UL << order));
2237 0 : spin_unlock(&vb->lock);
2238 :
2239 0 : vunmap_range_noflush(addr, addr + size);
2240 :
2241 : if (debug_pagealloc_enabled_static())
2242 : flush_tlb_kernel_range(addr, addr + size);
2243 :
2244 0 : spin_lock(&vb->lock);
2245 :
2246 : /* Expand the dirty range that has not yet been TLB-flushed */
2247 0 : vb->dirty_min = min(vb->dirty_min, offset);
2248 0 : vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
2249 :
2250 0 : WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
2251 0 : if (vb->dirty == VMAP_BBMAP_BITS) {
2252 0 : BUG_ON(vb->free);
2253 0 : spin_unlock(&vb->lock);
2254 0 : free_vmap_block(vb);
2255 : } else
2256 0 : spin_unlock(&vb->lock);
2257 0 : }
2258 :
2259 0 : static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
2260 : {
2261 0 : LIST_HEAD(purge_list);
2262 : int cpu;
2263 :
2264 0 : if (unlikely(!vmap_initialized))
2265 0 : return;
2266 :
2267 0 : mutex_lock(&vmap_purge_lock);
2268 :
2269 0 : for_each_possible_cpu(cpu) {
2270 0 : struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2271 : struct vmap_block *vb;
2272 : unsigned long idx;
2273 :
2274 : rcu_read_lock();
2275 0 : xa_for_each(&vbq->vmap_blocks, idx, vb) {
2276 0 : spin_lock(&vb->lock);
2277 :
2278 : /*
2279 : * Try to purge a fragmented block first. If it's
2280 : * not purgeable, check whether there is dirty
2281 : * space to be flushed.
2282 : */
2283 0 : if (!purge_fragmented_block(vb, vbq, &purge_list, false) &&
2284 0 : vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
2285 0 : unsigned long va_start = vb->va->va_start;
2286 : unsigned long s, e;
2287 :
2288 0 : s = va_start + (vb->dirty_min << PAGE_SHIFT);
2289 0 : e = va_start + (vb->dirty_max << PAGE_SHIFT);
2290 :
2291 0 : start = min(s, start);
2292 0 : end = max(e, end);
2293 :
2294 : /* Prevent this from being flushed again */
2295 0 : vb->dirty_min = VMAP_BBMAP_BITS;
2296 0 : vb->dirty_max = 0;
2297 :
2298 0 : flush = 1;
2299 : }
2300 0 : spin_unlock(&vb->lock);
2301 : }
2302 : rcu_read_unlock();
2303 : }
2304 0 : free_purged_blocks(&purge_list);
2305 :
2306 0 : if (!__purge_vmap_area_lazy(start, end) && flush)
2307 0 : flush_tlb_kernel_range(start, end);
2308 0 : mutex_unlock(&vmap_purge_lock);
2309 : }
2310 :
2311 : /**
2312 : * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
2313 : *
2314 : * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
2315 : * to amortize TLB flushing overheads. What this means is that any page you
2316 : * have now may, in a former life, have been mapped into a kernel virtual
2317 : * address by the vmap layer, so there might be some CPUs with TLB entries
2318 : * still referencing that page (additional to the regular 1:1 kernel mapping).
2319 : *
2320 : * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
2321 : * be sure that none of the pages we have control over will have any aliases
2322 : * from the vmap layer.
2323 : */
2324 0 : void vm_unmap_aliases(void)
2325 : {
2326 0 : unsigned long start = ULONG_MAX, end = 0;
2327 0 : int flush = 0;
2328 :
2329 0 : _vm_unmap_aliases(start, end, flush);
2330 0 : }
2331 : EXPORT_SYMBOL_GPL(vm_unmap_aliases);
2332 :
2333 : /**
2334 : * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
2335 : * @mem: the pointer returned by vm_map_ram
2336 : * @count: the count passed to that vm_map_ram call (cannot unmap partial)
2337 : */
2338 0 : void vm_unmap_ram(const void *mem, unsigned int count)
2339 : {
2340 0 : unsigned long size = (unsigned long)count << PAGE_SHIFT;
2341 0 : unsigned long addr = (unsigned long)kasan_reset_tag(mem);
2342 : struct vmap_area *va;
2343 :
2344 : might_sleep();
2345 0 : BUG_ON(!addr);
2346 0 : BUG_ON(addr < VMALLOC_START);
2347 0 : BUG_ON(addr > VMALLOC_END);
2348 0 : BUG_ON(!PAGE_ALIGNED(addr));
2349 :
2350 0 : kasan_poison_vmalloc(mem, size);
2351 :
2352 0 : if (likely(count <= VMAP_MAX_ALLOC)) {
2353 0 : debug_check_no_locks_freed(mem, size);
2354 0 : vb_free(addr, size);
2355 0 : return;
2356 : }
2357 :
2358 0 : va = find_unlink_vmap_area(addr);
2359 0 : if (WARN_ON_ONCE(!va))
2360 : return;
2361 :
2362 0 : debug_check_no_locks_freed((void *)va->va_start,
2363 0 : (va->va_end - va->va_start));
2364 0 : free_unmap_vmap_area(va);
2365 : }
2366 : EXPORT_SYMBOL(vm_unmap_ram);
2367 :
2368 : /**
2369 : * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
2370 : * @pages: an array of pointers to the pages to be mapped
2371 : * @count: number of pages
2372 : * @node: prefer to allocate data structures on this node
2373 : *
2374 : * If you use this function for fewer than VMAP_MAX_ALLOC pages, it can be
2375 : * faster than vmap(). But if you mix long-lived and short-lived objects
2376 : * with vm_map_ram(), it can consume a lot of address space through
2377 : * fragmentation (especially on a 32-bit machine), and you may eventually
2378 : * see allocation failures. Please use this function only for short-lived objects.
2379 : *
2380 : * Returns: a pointer to the address that has been mapped, or %NULL on failure
2381 : */
2382 0 : void *vm_map_ram(struct page **pages, unsigned int count, int node)
2383 : {
2384 0 : unsigned long size = (unsigned long)count << PAGE_SHIFT;
2385 : unsigned long addr;
2386 : void *mem;
2387 :
2388 0 : if (likely(count <= VMAP_MAX_ALLOC)) {
2389 0 : mem = vb_alloc(size, GFP_KERNEL);
2390 0 : if (IS_ERR(mem))
2391 : return NULL;
2392 : addr = (unsigned long)mem;
2393 : } else {
2394 : struct vmap_area *va;
2395 0 : va = alloc_vmap_area(size, PAGE_SIZE,
2396 0 : VMALLOC_START, VMALLOC_END,
2397 : node, GFP_KERNEL, VMAP_RAM);
2398 0 : if (IS_ERR(va))
2399 : return NULL;
2400 :
2401 0 : addr = va->va_start;
2402 0 : mem = (void *)addr;
2403 : }
2404 :
2405 0 : if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
2406 : pages, PAGE_SHIFT) < 0) {
2407 0 : vm_unmap_ram(mem, count);
2408 0 : return NULL;
2409 : }
2410 :
2411 : /*
2412 : * Mark the pages as accessible, now that they are mapped.
2413 : * With hardware tag-based KASAN, marking is skipped for
2414 : * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
2415 : */
2416 : mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
2417 :
2418 : return mem;
2419 : }
2420 : EXPORT_SYMBOL(vm_map_ram);
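/*
 * A minimal usage sketch (illustrative only; "pages" and "nr" stand for
 * caller-owned values) showing the expected vm_map_ram()/vm_unmap_ram()
 * pairing for a short-lived mapping:
 *
 *	void *va = vm_map_ram(pages, nr, NUMA_NO_NODE);
 *
 *	if (!va)
 *		return -ENOMEM;
 *	memset(va, 0, (size_t)nr << PAGE_SHIFT);
 *	vm_unmap_ram(va, nr);
 */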
2421 :
2422 : static struct vm_struct *vmlist __initdata;
2423 :
2424 : static inline unsigned int vm_area_page_order(struct vm_struct *vm)
2425 : {
2426 : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
2427 : return vm->page_order;
2428 : #else
2429 : return 0;
2430 : #endif
2431 : }
2432 :
2433 16 : static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
2434 : {
2435 : #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
2436 : vm->page_order = order;
2437 : #else
2438 16 : BUG_ON(order != 0);
2439 : #endif
2440 16 : }
2441 :
2442 : /**
2443 : * vm_area_add_early - add vmap area early during boot
2444 : * @vm: vm_struct to add
2445 : *
2446 : * This function is used to add a fixed kernel vm area to vmlist before
2447 : * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
2448 : * should contain proper values and the other fields should be zero.
2449 : *
2450 : * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
2451 : */
2452 0 : void __init vm_area_add_early(struct vm_struct *vm)
2453 : {
2454 : struct vm_struct *tmp, **p;
2455 :
2456 0 : BUG_ON(vmap_initialized);
2457 0 : for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
2458 0 : if (tmp->addr >= vm->addr) {
2459 0 : BUG_ON(tmp->addr < vm->addr + vm->size);
2460 : break;
2461 : } else
2462 0 : BUG_ON(tmp->addr + tmp->size > vm->addr);
2463 : }
2464 0 : vm->next = *p;
2465 0 : *p = vm;
2466 0 : }
2467 :
2468 : /**
2469 : * vm_area_register_early - register vmap area early during boot
2470 : * @vm: vm_struct to register
2471 : * @align: requested alignment
2472 : *
2473 : * This function is used to register kernel vm area before
2474 : * vmalloc_init() is called. @vm->size and @vm->flags should contain
2475 : * proper values on entry and other fields should be zero. On return,
2476 : * vm->addr contains the allocated address.
2477 : *
2478 : * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
2479 : */
2480 0 : void __init vm_area_register_early(struct vm_struct *vm, size_t align)
2481 : {
2482 0 : unsigned long addr = ALIGN(VMALLOC_START, align);
2483 : struct vm_struct *cur, **p;
2484 :
2485 0 : BUG_ON(vmap_initialized);
2486 :
2487 0 : for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
2488 0 : if ((unsigned long)cur->addr - addr >= vm->size)
2489 : break;
2490 0 : addr = ALIGN((unsigned long)cur->addr + cur->size, align);
2491 : }
2492 :
2493 0 : BUG_ON(addr > VMALLOC_END - vm->size);
2494 0 : vm->addr = (void *)addr;
2495 0 : vm->next = *p;
2496 0 : *p = vm;
2497 0 : kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
2498 0 : }
2499 :
2500 1 : static void vmap_init_free_space(void)
2501 : {
2502 1 : unsigned long vmap_start = 1;
2503 1 : const unsigned long vmap_end = ULONG_MAX;
2504 : struct vmap_area *busy, *free;
2505 :
2506 : /*
2507 : * B F B B B F
2508 : * -|-----|.....|-----|-----|-----|.....|-
2509 : * | The KVA space |
2510 : * |<--------------------------------->|
2511 : */
2512 1 : list_for_each_entry(busy, &vmap_area_list, list) {
2513 0 : if (busy->va_start - vmap_start > 0) {
2514 0 : free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2515 0 : if (!WARN_ON_ONCE(!free)) {
2516 0 : free->va_start = vmap_start;
2517 0 : free->va_end = busy->va_start;
2518 :
2519 0 : insert_vmap_area_augment(free, NULL,
2520 : &free_vmap_area_root,
2521 : &free_vmap_area_list);
2522 : }
2523 : }
2524 :
2525 0 : vmap_start = busy->va_end;
2526 : }
2527 :
2528 1 : if (vmap_end - vmap_start > 0) {
2529 2 : free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2530 1 : if (!WARN_ON_ONCE(!free)) {
2531 1 : free->va_start = vmap_start;
2532 1 : free->va_end = vmap_end;
2533 :
2534 1 : insert_vmap_area_augment(free, NULL,
2535 : &free_vmap_area_root,
2536 : &free_vmap_area_list);
2537 : }
2538 : }
2539 1 : }
2540 :
2541 : static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
2542 : struct vmap_area *va, unsigned long flags, const void *caller)
2543 : {
2544 16 : vm->flags = flags;
2545 16 : vm->addr = (void *)va->va_start;
2546 16 : vm->size = va->va_end - va->va_start;
2547 16 : vm->caller = caller;
2548 16 : va->vm = vm;
2549 : }
2550 :
2551 : static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
2552 : unsigned long flags, const void *caller)
2553 : {
2554 16 : spin_lock(&vmap_area_lock);
2555 16 : setup_vmalloc_vm_locked(vm, va, flags, caller);
2556 16 : spin_unlock(&vmap_area_lock);
2557 : }
2558 :
2559 : static void clear_vm_uninitialized_flag(struct vm_struct *vm)
2560 : {
2561 : /*
2562 : * Before removing VM_UNINITIALIZED,
2563 : * we should make sure that vm has proper values.
2564 : * Pairs with the smp_rmb() in show_numa_info().
2565 : */
2566 16 : smp_wmb();
2567 16 : vm->flags &= ~VM_UNINITIALIZED;
2568 : }
2569 :
2570 16 : static struct vm_struct *__get_vm_area_node(unsigned long size,
2571 : unsigned long align, unsigned long shift, unsigned long flags,
2572 : unsigned long start, unsigned long end, int node,
2573 : gfp_t gfp_mask, const void *caller)
2574 : {
2575 : struct vmap_area *va;
2576 : struct vm_struct *area;
2577 16 : unsigned long requested_size = size;
2578 :
2579 16 : BUG_ON(in_interrupt());
2580 16 : size = ALIGN(size, 1ul << shift);
2581 16 : if (unlikely(!size))
2582 : return NULL;
2583 :
2584 16 : if (flags & VM_IOREMAP)
2585 0 : align = 1ul << clamp_t(int, get_count_order_long(size),
2586 : PAGE_SHIFT, IOREMAP_MAX_ORDER);
2587 :
2588 16 : area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
2589 16 : if (unlikely(!area))
2590 : return NULL;
2591 :
2592 16 : if (!(flags & VM_NO_GUARD))
2593 16 : size += PAGE_SIZE;
2594 :
2595 16 : va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
2596 16 : if (IS_ERR(va)) {
2597 0 : kfree(area);
2598 0 : return NULL;
2599 : }
2600 :
2601 16 : setup_vmalloc_vm(area, va, flags, caller);
2602 :
2603 : /*
2604 : * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
2605 : * best-effort approach, as they can be mapped outside of vmalloc code.
2606 : * For VM_ALLOC mappings, the pages are marked as accessible after
2607 : * getting mapped in __vmalloc_node_range().
2608 : * With hardware tag-based KASAN, marking is skipped for
2609 : * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
2610 : */
2611 16 : if (!(flags & VM_ALLOC))
2612 : area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
2613 : KASAN_VMALLOC_PROT_NORMAL);
2614 :
2615 : return area;
2616 : }
2617 :
2618 0 : struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
2619 : unsigned long start, unsigned long end,
2620 : const void *caller)
2621 : {
2622 0 : return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
2623 : NUMA_NO_NODE, GFP_KERNEL, caller);
2624 : }
2625 :
2626 : /**
2627 : * get_vm_area - reserve a contiguous kernel virtual area
2628 : * @size: size of the area
2629 : * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
2630 : *
2631 : * Search for an area of @size in the kernel virtual mapping area
2632 : * and reserve it for our purposes.
2634 : *
2635 : * Return: the area descriptor on success or %NULL on failure.
2636 : */
2637 0 : struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
2638 : {
2639 0 : return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
2640 0 : VMALLOC_START, VMALLOC_END,
2641 : NUMA_NO_NODE, GFP_KERNEL,
2642 0 : __builtin_return_address(0));
2643 : }
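/*
 * A hedged usage sketch: a driver that only needs a reserved range of
 * kernel virtual addresses (here four pages, picked arbitrarily) could
 * pair get_vm_area() with free_vm_area():
 *
 *	struct vm_struct *area = get_vm_area(4 * PAGE_SIZE, VM_IOREMAP);
 *
 *	if (!area)
 *		return -ENOMEM;
 *	... establish a mapping inside [area->addr, area->addr + area->size) ...
 *	free_vm_area(area);
 */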
2644 :
2645 0 : struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
2646 : const void *caller)
2647 : {
2648 0 : return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
2649 0 : VMALLOC_START, VMALLOC_END,
2650 : NUMA_NO_NODE, GFP_KERNEL, caller);
2651 : }
2652 :
2653 : /**
2654 : * find_vm_area - find a contiguous kernel virtual area
2655 : * @addr: base address
2656 : *
2657 : * Search for the kernel VM area starting at @addr, and return it.
2658 : * It is up to the caller to do all required locking to keep the returned
2659 : * pointer valid.
2660 : *
2661 : * Return: the area descriptor on success or %NULL on failure.
2662 : */
2663 16 : struct vm_struct *find_vm_area(const void *addr)
2664 : {
2665 : struct vmap_area *va;
2666 :
2667 32 : va = find_vmap_area((unsigned long)addr);
2668 16 : if (!va)
2669 : return NULL;
2670 :
2671 16 : return va->vm;
2672 : }
2673 :
2674 : /**
2675 : * remove_vm_area - find and remove a continuous kernel virtual area
2676 : * remove_vm_area - find and remove a contiguous kernel virtual area
2677 : *
2678 : * Search for the kernel VM area starting at @addr, and remove it.
2679 : * This function returns the found VM area, but using it is NOT safe
2680 : * on SMP machines, except for its size or flags.
2681 : *
2682 : * Return: the area descriptor on success or %NULL on failure.
2683 : */
2684 0 : struct vm_struct *remove_vm_area(const void *addr)
2685 : {
2686 : struct vmap_area *va;
2687 : struct vm_struct *vm;
2688 :
2689 : might_sleep();
2690 :
2691 0 : if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
2692 : addr))
2693 : return NULL;
2694 :
2695 0 : va = find_unlink_vmap_area((unsigned long)addr);
2696 0 : if (!va || !va->vm)
2697 : return NULL;
2698 0 : vm = va->vm;
2699 :
2700 0 : debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
2701 0 : debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
2702 0 : kasan_free_module_shadow(vm);
2703 0 : kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));
2704 :
2705 0 : free_unmap_vmap_area(va);
2706 0 : return vm;
2707 : }
2708 :
2709 0 : static inline void set_area_direct_map(const struct vm_struct *area,
2710 : int (*set_direct_map)(struct page *page))
2711 : {
2712 : int i;
2713 :
2714 : /* HUGE_VMALLOC passes small pages to set_direct_map */
2715 0 : for (i = 0; i < area->nr_pages; i++)
2716 0 : if (page_address(area->pages[i]))
2717 0 : set_direct_map(area->pages[i]);
2718 0 : }
2719 :
2720 : /*
2721 : * Flush the vm mapping and reset the direct map.
2722 : */
2723 0 : static void vm_reset_perms(struct vm_struct *area)
2724 : {
2725 0 : unsigned long start = ULONG_MAX, end = 0;
2726 0 : unsigned int page_order = vm_area_page_order(area);
2727 0 : int flush_dmap = 0;
2728 : int i;
2729 :
2730 : /*
2731 : * Find the start and end range of the direct mappings to make sure that
2732 : * the vm_unmap_aliases() flush includes the direct map.
2733 : */
2734 0 : for (i = 0; i < area->nr_pages; i += 1U << page_order) {
2735 0 : unsigned long addr = (unsigned long)page_address(area->pages[i]);
2736 :
2737 0 : if (addr) {
2738 : unsigned long page_size;
2739 :
2740 0 : page_size = PAGE_SIZE << page_order;
2741 0 : start = min(addr, start);
2742 0 : end = max(addr + page_size, end);
2743 0 : flush_dmap = 1;
2744 : }
2745 : }
2746 :
2747 : /*
2748 : * Set direct map to something invalid so that it won't be cached if
2749 : * there are any accesses after the TLB flush, then flush the TLB and
2750 : * reset the direct map permissions to the default.
2751 : */
2752 0 : set_area_direct_map(area, set_direct_map_invalid_noflush);
2753 0 : _vm_unmap_aliases(start, end, flush_dmap);
2754 0 : set_area_direct_map(area, set_direct_map_default_noflush);
2755 0 : }
2756 :
2757 0 : static void delayed_vfree_work(struct work_struct *w)
2758 : {
2759 0 : struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
2760 : struct llist_node *t, *llnode;
2761 :
2762 0 : llist_for_each_safe(llnode, t, llist_del_all(&p->list))
2763 0 : vfree(llnode);
2764 0 : }
2765 :
2766 : /**
2767 : * vfree_atomic - release memory allocated by vmalloc()
2768 : * @addr: memory base address
2769 : *
2770 : * This one is just like vfree() but can be called in any atomic context
2771 : * except NMIs.
2772 : */
2773 0 : void vfree_atomic(const void *addr)
2774 : {
2775 0 : struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
2776 :
2777 0 : BUG_ON(in_nmi());
2778 0 : kmemleak_free(addr);
2779 :
2780 : /*
2781 : * Use raw_cpu_ptr() because this can be called from preemptible
2782 : * context. Preemption is absolutely fine here, because the llist_add()
2783 : * implementation is lockless, so it works even if we are adding to
2784 : * another cpu's list. schedule_work() should be fine with this too.
2785 : */
2786 0 : if (addr && llist_add((struct llist_node *)addr, &p->list))
2787 0 : schedule_work(&p->wq);
2788 0 : }
2789 :
2790 : /**
2791 : * vfree - Release memory allocated by vmalloc()
2792 : * @addr: Memory base address
2793 : *
2794 : * Free the virtually contiguous memory area starting at @addr, as obtained
2795 : * from one of the vmalloc() family of APIs. This will usually also free the
2796 : * physical memory underlying the virtual allocation, but that memory is
2797 : * reference counted, so it will not be freed until the last user goes away.
2798 : *
2799 : * If @addr is NULL, no operation is performed.
2800 : *
2801 : * Context:
2802 : * May sleep if called *not* from interrupt context.
2803 : * Must not be called in NMI context (strictly speaking, it could be
2804 : * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
2805 : * conventions for vfree() arch-dependent would be a really bad idea).
2806 : */
2807 0 : void vfree(const void *addr)
2808 : {
2809 : struct vm_struct *vm;
2810 : int i;
2811 :
2812 0 : if (unlikely(in_interrupt())) {
2813 0 : vfree_atomic(addr);
2814 0 : return;
2815 : }
2816 :
2817 0 : BUG_ON(in_nmi());
2818 0 : kmemleak_free(addr);
2819 : might_sleep();
2820 :
2821 0 : if (!addr)
2822 : return;
2823 :
2824 0 : vm = remove_vm_area(addr);
2825 0 : if (unlikely(!vm)) {
2826 0 : WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
2827 : addr);
2828 0 : return;
2829 : }
2830 :
2831 0 : if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
2832 0 : vm_reset_perms(vm);
2833 0 : for (i = 0; i < vm->nr_pages; i++) {
2834 0 : struct page *page = vm->pages[i];
2835 :
2836 0 : BUG_ON(!page);
2837 0 : mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
2838 : /*
2839 : * High-order allocs for huge vmallocs are split, so they
2840 : * can be freed as an array of order-0 allocations.
2841 : */
2842 0 : __free_page(page);
2843 0 : cond_resched();
2844 : }
2845 0 : atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
2846 0 : kvfree(vm->pages);
2847 0 : kfree(vm);
2848 : }
2849 : EXPORT_SYMBOL(vfree);
2850 :
2851 : /**
2852 : * vunmap - release virtual mapping obtained by vmap()
2853 : * @addr: memory base address
2854 : *
2855 : * Free the virtually contiguous memory area starting at @addr,
2856 : * which was created from the page array passed to vmap().
2857 : *
2858 : * Must not be called in interrupt context.
2859 : */
2860 0 : void vunmap(const void *addr)
2861 : {
2862 : struct vm_struct *vm;
2863 :
2864 0 : BUG_ON(in_interrupt());
2865 : might_sleep();
2866 :
2867 0 : if (!addr)
2868 : return;
2869 0 : vm = remove_vm_area(addr);
2870 0 : if (unlikely(!vm)) {
2871 0 : WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
2872 : addr);
2873 0 : return;
2874 : }
2875 0 : kfree(vm);
2876 : }
2877 : EXPORT_SYMBOL(vunmap);
2878 :
2879 : /**
2880 : * vmap - map an array of pages into virtually contiguous space
2881 : * @pages: array of page pointers
2882 : * @count: number of pages to map
2883 : * @flags: vm_area->flags
2884 : * @prot: page protection for the mapping
2885 : *
2886 : * Maps @count pages from @pages into contiguous kernel virtual space.
2887 : * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
2888 : * (which must be kmalloc or vmalloc memory) and one reference per page in it
2889 : * are transferred from the caller to vmap(), and will be freed / dropped when
2890 : * vfree() is called on the return value.
2891 : *
2892 : * Return: the address of the area or %NULL on failure
2893 : */
2894 0 : void *vmap(struct page **pages, unsigned int count,
2895 : unsigned long flags, pgprot_t prot)
2896 : {
2897 : struct vm_struct *area;
2898 : unsigned long addr;
2899 : unsigned long size; /* In bytes */
2900 :
2901 : might_sleep();
2902 :
2903 0 : if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
2904 : return NULL;
2905 :
2906 : /*
2907 : * Your top guard is someone else's bottom guard. Not having a top
2908 : * guard compromises someone else's mappings too.
2909 : */
2910 0 : if (WARN_ON_ONCE(flags & VM_NO_GUARD))
2911 0 : flags &= ~VM_NO_GUARD;
2912 :
2913 0 : if (count > totalram_pages())
2914 : return NULL;
2915 :
2916 0 : size = (unsigned long)count << PAGE_SHIFT;
2917 0 : area = get_vm_area_caller(size, flags, __builtin_return_address(0));
2918 0 : if (!area)
2919 : return NULL;
2920 :
2921 0 : addr = (unsigned long)area->addr;
2922 0 : if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
2923 : pages, PAGE_SHIFT) < 0) {
2924 0 : vunmap(area->addr);
2925 0 : return NULL;
2926 : }
2927 :
2928 0 : if (flags & VM_MAP_PUT_PAGES) {
2929 0 : area->pages = pages;
2930 0 : area->nr_pages = count;
2931 : }
2932 0 : return area->addr;
2933 : }
2934 : EXPORT_SYMBOL(vmap);
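/*
 * An illustrative sketch of mapping two independently allocated pages
 * into one contiguous kernel virtual range with vmap() and tearing it
 * down again with vunmap():
 *
 *	struct page *pg[2];
 *	void *va;
 *
 *	pg[0] = alloc_page(GFP_KERNEL);
 *	pg[1] = alloc_page(GFP_KERNEL);
 *	if (pg[0] && pg[1]) {
 *		va = vmap(pg, 2, VM_MAP, PAGE_KERNEL);
 *		if (va) {
 *			... use the 2 * PAGE_SIZE bytes at va ...
 *			vunmap(va);
 *		}
 *	}
 *	if (pg[0])
 *		__free_page(pg[0]);
 *	if (pg[1])
 *		__free_page(pg[1]);
 */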
2935 :
2936 : #ifdef CONFIG_VMAP_PFN
2937 : struct vmap_pfn_data {
2938 : unsigned long *pfns;
2939 : pgprot_t prot;
2940 : unsigned int idx;
2941 : };
2942 :
2943 : static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
2944 : {
2945 : struct vmap_pfn_data *data = private;
2946 : unsigned long pfn = data->pfns[data->idx];
2947 : pte_t ptent;
2948 :
2949 : if (WARN_ON_ONCE(pfn_valid(pfn)))
2950 : return -EINVAL;
2951 :
2952 : ptent = pte_mkspecial(pfn_pte(pfn, data->prot));
2953 : set_pte_at(&init_mm, addr, pte, ptent);
2954 :
2955 : data->idx++;
2956 : return 0;
2957 : }
2958 :
2959 : /**
2960 : * vmap_pfn - map an array of PFNs into virtually contiguous space
2961 : * @pfns: array of PFNs
2962 : * @count: number of pages to map
2963 : * @prot: page protection for the mapping
2964 : *
2965 : * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
2966 : * the start address of the mapping.
2967 : */
2968 : void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
2969 : {
2970 : struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
2971 : struct vm_struct *area;
2972 :
2973 : area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
2974 : __builtin_return_address(0));
2975 : if (!area)
2976 : return NULL;
2977 : if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2978 : count * PAGE_SIZE, vmap_pfn_apply, &data)) {
2979 : free_vm_area(area);
2980 : return NULL;
2981 : }
2982 : return area->addr;
2983 : }
2984 : EXPORT_SYMBOL_GPL(vmap_pfn);
2985 : #endif /* CONFIG_VMAP_PFN */
2986 :
2987 : static inline unsigned int
2988 16 : vm_area_alloc_pages(gfp_t gfp, int nid,
2989 : unsigned int order, unsigned int nr_pages, struct page **pages)
2990 : {
2991 16 : unsigned int nr_allocated = 0;
2992 16 : gfp_t alloc_gfp = gfp;
2993 16 : bool nofail = false;
2994 : struct page *page;
2995 : int i;
2996 :
2997 : * For order-0 pages we make use of the bulk allocator. If
2998 : * the page array ends up only partly populated, or not
2999 : * populated at all, due to failures, fall back to the single
3000 : * page allocator, which is more permissive.
3001 : * more permissive.
3002 : */
3003 16 : if (!order) {
3004 : /* bulk allocator doesn't support nofail req. officially */
3005 : /* the bulk allocator doesn't officially support __GFP_NOFAIL requests */
3006 :
3007 48 : while (nr_allocated < nr_pages) {
3008 : unsigned int nr, nr_pages_request;
3009 :
3010 : /*
3011 : * The maximum allowed request is hard-coded to 100
3012 : * pages per call, in order to prevent a long
3013 : * preemption-off section in the bulk allocator,
3014 : * so the range is [1:100].
3015 : */
3016 16 : nr_pages_request = min(100U, nr_pages - nr_allocated);
3017 :
3018 : /* The memory allocation should honour the mempolicy: we must not
3019 : * blindly use the nearest node when nid == NUMA_NO_NODE, otherwise
3020 : * memory may be allocated on only one node even though the
3021 : * mempolicy asks for interleaving.
3022 : */
3023 : if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
3024 : nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
3025 : nr_pages_request,
3026 : pages + nr_allocated);
3027 :
3028 : else
3029 32 : nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
3030 : nr_pages_request,
3031 16 : pages + nr_allocated);
3032 :
3033 16 : nr_allocated += nr;
3034 16 : cond_resched();
3035 :
3036 : /*
3037 : * If no pages, or only some of them, were obtained,
3038 : * fall back to the single page allocator.
3039 : */
3040 16 : if (nr != nr_pages_request)
3041 : break;
3042 : }
3043 0 : } else if (gfp & __GFP_NOFAIL) {
3044 : /*
3045 : * Higher order nofail allocations are really expensive and
3046 : * potentially dangerous (premature OOM, disruptive reclaim,
3047 : * compaction, etc.).
3048 : */
3049 0 : alloc_gfp &= ~__GFP_NOFAIL;
3050 0 : nofail = true;
3051 : }
3052 :
3053 : /* High-order pages or fallback path if "bulk" fails. */
3054 16 : while (nr_allocated < nr_pages) {
3055 0 : if (fatal_signal_pending(current))
3056 : break;
3057 :
3058 0 : if (nid == NUMA_NO_NODE)
3059 0 : page = alloc_pages(alloc_gfp, order);
3060 : else
3061 0 : page = alloc_pages_node(nid, alloc_gfp, order);
3062 0 : if (unlikely(!page)) {
3063 0 : if (!nofail)
3064 : break;
3065 :
3066 : /* fall back to the zero order allocations */
3067 0 : alloc_gfp |= __GFP_NOFAIL;
3068 0 : order = 0;
3069 0 : continue;
3070 : }
3071 :
3072 : /*
3073 : * Higher order allocations must be able to be treated as
3074 : * independent small pages by callers (as they can with
3075 : * small-page vmallocs). Some drivers do their own refcounting
3076 : * on vmalloc_to_page() pages, some use page->mapping,
3077 : * page->lru, etc.
3078 : */
3079 0 : if (order)
3080 0 : split_page(page, order);
3081 :
3082 : /*
3083 : * Careful, we allocate and map page-order pages, but
3084 : * tracking is done per PAGE_SIZE page so as to keep the
3085 : * vm_struct APIs independent of the physical/mapped size.
3086 : */
3087 0 : for (i = 0; i < (1U << order); i++)
3088 0 : pages[nr_allocated + i] = page + i;
3089 :
3090 0 : cond_resched();
3091 0 : nr_allocated += 1U << order;
3092 : }
3093 :
3094 16 : return nr_allocated;
3095 : }
3096 :
3097 16 : static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
3098 : pgprot_t prot, unsigned int page_shift,
3099 : int node)
3100 : {
3101 16 : const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
3102 16 : bool nofail = gfp_mask & __GFP_NOFAIL;
3103 16 : unsigned long addr = (unsigned long)area->addr;
3104 32 : unsigned long size = get_vm_area_size(area);
3105 : unsigned long array_size;
3106 16 : unsigned int nr_small_pages = size >> PAGE_SHIFT;
3107 : unsigned int page_order;
3108 : unsigned int flags;
3109 : int ret;
3110 :
3111 16 : array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
3112 :
3113 16 : if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
3114 16 : gfp_mask |= __GFP_HIGHMEM;
3115 :
3116 : /* Please note that the recursion is strictly bounded. */
3117 16 : if (array_size > PAGE_SIZE) {
3118 0 : area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
3119 : area->caller);
3120 : } else {
3121 16 : area->pages = kmalloc_node(array_size, nested_gfp, node);
3122 : }
3123 :
3124 16 : if (!area->pages) {
3125 0 : warn_alloc(gfp_mask, NULL,
3126 : "vmalloc error: size %lu, failed to allocate page array of size %lu",
3127 : nr_small_pages * PAGE_SIZE, array_size);
3128 0 : free_vm_area(area);
3129 0 : return NULL;
3130 : }
3131 :
3132 16 : set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
3133 16 : page_order = vm_area_page_order(area);
3134 :
3135 16 : area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
3136 : node, page_order, nr_small_pages, area->pages);
3137 :
3138 32 : atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
3139 16 : if (gfp_mask & __GFP_ACCOUNT) {
3140 : int i;
3141 :
3142 0 : for (i = 0; i < area->nr_pages; i++)
3143 0 : mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
3144 : }
3145 :
3146 : /*
3147 : * If not enough pages were obtained to satisfy the
3148 : * allocation request, free those that were obtained (if any) via vfree().
3149 : */
3150 16 : if (area->nr_pages != nr_small_pages) {
3151 : /*
3152 : * vm_area_alloc_pages() can fail due to insufficient memory but
3153 : * also:
3154 : *
3155 : * - a pending fatal signal
3156 : * - insufficient huge page-order pages
3157 : *
3158 : * Since we always retry allocations at order-0 in the huge page
3159 : * case, a warning for either is spurious.
3160 : */
3161 0 : if (!fatal_signal_pending(current) && page_order == 0)
3162 0 : warn_alloc(gfp_mask, NULL,
3163 : "vmalloc error: size %lu, failed to allocate pages",
3164 0 : area->nr_pages * PAGE_SIZE);
3165 : goto fail;
3166 : }
3167 :
3168 : /*
3169 : * Page table allocations ignore the external gfp mask; enforce it
3170 : * via the scope API.
3171 : */
3172 16 : if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
3173 0 : flags = memalloc_nofs_save();
3174 16 : else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
3175 0 : flags = memalloc_noio_save();
3176 :
3177 : do {
3178 16 : ret = vmap_pages_range(addr, addr + size, prot, area->pages,
3179 : page_shift);
3180 16 : if (nofail && (ret < 0))
3181 0 : schedule_timeout_uninterruptible(1);
3182 16 : } while (nofail && (ret < 0));
3183 :
3184 16 : if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
3185 : memalloc_nofs_restore(flags);
3186 16 : else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
3187 : memalloc_noio_restore(flags);
3188 :
3189 16 : if (ret < 0) {
3190 0 : warn_alloc(gfp_mask, NULL,
3191 : "vmalloc error: size %lu, failed to map pages",
3192 0 : area->nr_pages * PAGE_SIZE);
3193 0 : goto fail;
3194 : }
3195 :
3196 16 : return area->addr;
3197 :
3198 : fail:
3199 0 : vfree(area->addr);
3200 0 : return NULL;
3201 : }
3202 :
3203 : /**
3204 : * __vmalloc_node_range - allocate virtually contiguous memory
3205 : * @size: allocation size
3206 : * @align: desired alignment
3207 : * @start: vm area range start
3208 : * @end: vm area range end
3209 : * @gfp_mask: flags for the page level allocator
3210 : * @prot: protection mask for the allocated pages
3211 : * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
3212 : * @node: node to use for allocation or NUMA_NO_NODE
3213 : * @caller: caller's return address
3214 : *
3215 : * Allocate enough pages to cover @size from the page level
3216 : * allocator with @gfp_mask flags. Please note that the full set of gfp
3217 : * flags is not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
3218 : * supported.
3219 : * Zone modifiers are not supported. From the reclaim modifiers
3220 : * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
3221 : * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
3222 : * __GFP_RETRY_MAYFAIL are not supported).
3223 : *
3224 : * __GFP_NOWARN can be used to suppress failure messages.
3225 : *
3226 : * Map them into contiguous kernel virtual space, using a pagetable
3227 : * protection of @prot.
3228 : *
3229 : * Return: the address of the area or %NULL on failure
3230 : */
3231 16 : void *__vmalloc_node_range(unsigned long size, unsigned long align,
3232 : unsigned long start, unsigned long end, gfp_t gfp_mask,
3233 : pgprot_t prot, unsigned long vm_flags, int node,
3234 : const void *caller)
3235 : {
3236 : struct vm_struct *area;
3237 : void *ret;
3238 16 : kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
3239 16 : unsigned long real_size = size;
3240 16 : unsigned long real_align = align;
3241 16 : unsigned int shift = PAGE_SHIFT;
3242 :
3243 16 : if (WARN_ON_ONCE(!size))
3244 : return NULL;
3245 :
3246 32 : if ((size >> PAGE_SHIFT) > totalram_pages()) {
3247 0 : warn_alloc(gfp_mask, NULL,
3248 : "vmalloc error: size %lu, exceeds total pages",
3249 : real_size);
3250 0 : return NULL;
3251 : }
3252 :
3253 : if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
3254 : unsigned long size_per_node;
3255 :
3256 : /*
3257 : * Try huge pages. Only try for PAGE_KERNEL allocations,
3258 : * others like modules don't yet expect huge pages in
3259 : * their allocations due to apply_to_page_range not
3260 : * supporting them.
3261 : */
3262 :
3263 : size_per_node = size;
3264 : if (node == NUMA_NO_NODE)
3265 : size_per_node /= num_online_nodes();
3266 : if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
3267 : shift = PMD_SHIFT;
3268 : else
3269 : shift = arch_vmap_pte_supported_shift(size_per_node);
3270 :
3271 : align = max(real_align, 1UL << shift);
3272 : size = ALIGN(real_size, 1UL << shift);
3273 : }
3274 :
3275 : again:
3276 16 : area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
3277 : VM_UNINITIALIZED | vm_flags, start, end, node,
3278 : gfp_mask, caller);
3279 16 : if (!area) {
3280 0 : bool nofail = gfp_mask & __GFP_NOFAIL;
3281 0 : warn_alloc(gfp_mask, NULL,
3282 : "vmalloc error: size %lu, vm_struct allocation failed%s",
3283 : real_size, (nofail) ? ". Retrying." : "");
3284 0 : if (nofail) {
3285 0 : schedule_timeout_uninterruptible(1);
3286 0 : goto again;
3287 : }
3288 : goto fail;
3289 : }
3290 :
3291 : /*
3292 : * Prepare arguments for __vmalloc_area_node() and
3293 : * kasan_unpoison_vmalloc().
3294 : */
3295 16 : if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
3296 : if (kasan_hw_tags_enabled()) {
3297 : /*
3298 : * Modify protection bits to allow tagging.
3299 : * This must be done before mapping.
3300 : */
3301 : prot = arch_vmap_pgprot_tagged(prot);
3302 :
3303 : /*
3304 : * Skip page_alloc poisoning and zeroing for physical
3305 : * pages backing VM_ALLOC mapping. Memory is instead
3306 : * poisoned and zeroed by kasan_unpoison_vmalloc().
3307 : */
3308 : gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
3309 : }
3310 :
3311 : /* Take note that the mapping is PAGE_KERNEL. */
3312 : kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
3313 : }
3314 :
3315 : /* Allocate physical pages and map them into vmalloc space. */
3316 16 : ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
3317 16 : if (!ret)
3318 : goto fail;
3319 :
3320 : /*
3321 : * Mark the pages as accessible, now that they are mapped.
3322 : * The condition for setting KASAN_VMALLOC_INIT should complement the
3323 : * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
3324 : * to make sure that memory is initialized under the same conditions.
3325 : * Tag-based KASAN modes only assign tags to normal non-executable
3326 : * allocations, see __kasan_unpoison_vmalloc().
3327 : */
3328 16 : kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
3329 32 : if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
3330 : (gfp_mask & __GFP_SKIP_ZERO))
3331 : kasan_flags |= KASAN_VMALLOC_INIT;
3332 : /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
3333 16 : area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
3334 :
3335 : /*
3336 : * In this function, a newly allocated vm_struct has the VM_UNINITIALIZED
3337 : * flag set, meaning that the vm_struct is not fully initialized.
3338 : * Now that it is fully initialized, remove the flag.
3339 : */
3340 16 : clear_vm_uninitialized_flag(area);
3341 :
3342 16 : size = PAGE_ALIGN(size);
3343 : if (!(vm_flags & VM_DEFER_KMEMLEAK))
3344 16 : kmemleak_vmalloc(area, size, gfp_mask);
3345 :
3346 16 : return area->addr;
3347 :
3348 : fail:
3349 : if (shift > PAGE_SHIFT) {
3350 : shift = PAGE_SHIFT;
3351 : align = real_align;
3352 : size = real_size;
3353 : goto again;
3354 : }
3355 :
3356 : return NULL;
3357 : }
3358 :
3359 : /**
3360 : * __vmalloc_node - allocate virtually contiguous memory
3361 : * @size: allocation size
3362 : * @align: desired alignment
3363 : * @gfp_mask: flags for the page level allocator
3364 : * @node: node to use for allocation or NUMA_NO_NODE
3365 : * @caller: caller's return address
3366 : *
3367 : * Allocate enough pages to cover @size from the page level allocator with
3368 : * @gfp_mask flags. Map them into contiguous kernel virtual space.
3369 : *
3370 : * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
3371 : * and __GFP_NOFAIL are not supported.
3372 : *
3373 : * Any use of gfp flags outside of GFP_KERNEL should be discussed
3374 : * with the mm people.
3375 : *
3376 : * Return: pointer to the allocated memory or %NULL on error
3377 : */
3378 0 : void *__vmalloc_node(unsigned long size, unsigned long align,
3379 : gfp_t gfp_mask, int node, const void *caller)
3380 : {
3381 0 : return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
3382 0 : gfp_mask, PAGE_KERNEL, 0, node, caller);
3383 : }
3384 : /*
3385 : * This is only for performance analysis and stress testing of vmalloc.
3386 : * It is required by the vmalloc test module; do not use it for
3387 : * anything else.
3388 : */
3389 : #ifdef CONFIG_TEST_VMALLOC_MODULE
3390 : EXPORT_SYMBOL_GPL(__vmalloc_node);
3391 : #endif
3392 :
3393 0 : void *__vmalloc(unsigned long size, gfp_t gfp_mask)
3394 : {
3395 0 : return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
3396 0 : __builtin_return_address(0));
3397 : }
3398 : EXPORT_SYMBOL(__vmalloc);
3399 :
3400 : /**
3401 : * vmalloc - allocate virtually contiguous memory
3402 : * @size: allocation size
3403 : *
3404 : * Allocate enough pages to cover @size from the page level
3405 : * allocator and map them into contiguous kernel virtual space.
3406 : *
3407 : * For tight control over page level allocator and protection flags
3408 : * use __vmalloc() instead.
3409 : *
3410 : * Return: pointer to the allocated memory or %NULL on error
3411 : */
3412 0 : void *vmalloc(unsigned long size)
3413 : {
3414 0 : return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
3415 0 : __builtin_return_address(0));
3416 : }
3417 : EXPORT_SYMBOL(vmalloc);
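/*
 * A minimal usage sketch (names are illustrative): a large table that
 * only needs to be virtually contiguous can be allocated with vmalloc()
 * and released with vfree():
 *
 *	struct entry *tbl = vmalloc(array_size(nr_entries, sizeof(*tbl)));
 *
 *	if (!tbl)
 *		return -ENOMEM;
 *	...
 *	vfree(tbl);
 */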
3418 :
3419 : /**
3420 : * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
3421 : * @size: allocation size
3422 : * @gfp_mask: flags for the page level allocator
3423 : *
3424 : * Allocate enough pages to cover @size from the page level
3425 : * allocator and map them into contiguous kernel virtual space.
3426 : * If @size is greater than or equal to PMD_SIZE, allow using
3427 : * huge pages for the memory.
3428 : *
3429 : * Return: pointer to the allocated memory or %NULL on error
3430 : */
3431 0 : void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
3432 : {
3433 0 : return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
3434 0 : gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
3435 0 : NUMA_NO_NODE, __builtin_return_address(0));
3436 : }
3437 : EXPORT_SYMBOL_GPL(vmalloc_huge);
3438 :
3439 : /**
3440 : * vzalloc - allocate virtually contiguous memory with zero fill
3441 : * @size: allocation size
3442 : *
3443 : * Allocate enough pages to cover @size from the page level
3444 : * allocator and map them into contiguous kernel virtual space.
3445 : * The memory allocated is set to zero.
3446 : *
3447 : * For tight control over page level allocator and protection flags
3448 : * use __vmalloc() instead.
3449 : *
3450 : * Return: pointer to the allocated memory or %NULL on error
3451 : */
3452 0 : void *vzalloc(unsigned long size)
3453 : {
3454 0 : return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
3455 0 : __builtin_return_address(0));
3456 : }
3457 : EXPORT_SYMBOL(vzalloc);
3458 :
3459 : /**
3460 : * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
3461 : * @size: allocation size
3462 : *
3463 : * The resulting memory area is zeroed so it can be mapped to userspace
3464 : * without leaking data.
3465 : *
3466 : * Return: pointer to the allocated memory or %NULL on error
3467 : */
3468 0 : void *vmalloc_user(unsigned long size)
3469 : {
3470 0 : return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
3471 0 : GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
3472 : VM_USERMAP, NUMA_NO_NODE,
3473 0 : __builtin_return_address(0));
3474 : }
3475 : EXPORT_SYMBOL(vmalloc_user);
3476 :
3477 : /**
3478 : * vmalloc_node - allocate memory on a specific node
3479 : * @size: allocation size
3480 : * @node: numa node
3481 : *
3482 : * Allocate enough pages to cover @size from the page level
3483 : * allocator and map them into contiguous kernel virtual space.
3484 : *
3485 : * For tight control over page level allocator and protection flags
3486 : * use __vmalloc() instead.
3487 : *
3488 : * Return: pointer to the allocated memory or %NULL on error
3489 : */
3490 0 : void *vmalloc_node(unsigned long size, int node)
3491 : {
3492 0 : return __vmalloc_node(size, 1, GFP_KERNEL, node,
3493 0 : __builtin_return_address(0));
3494 : }
3495 : EXPORT_SYMBOL(vmalloc_node);
3496 :
3497 : /**
3498 : * vzalloc_node - allocate memory on a specific node with zero fill
3499 : * @size: allocation size
3500 : * @node: numa node
3501 : *
3502 : * Allocate enough pages to cover @size from the page level
3503 : * allocator and map them into contiguous kernel virtual space.
3504 : * The memory allocated is set to zero.
3505 : *
3506 : * Return: pointer to the allocated memory or %NULL on error
3507 : */
3508 0 : void *vzalloc_node(unsigned long size, int node)
3509 : {
3510 0 : return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
3511 0 : __builtin_return_address(0));
3512 : }
3513 : EXPORT_SYMBOL(vzalloc_node);
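/*
 * Illustrative sketch, not part of this file: placing a zeroed allocation
 * on the NUMA node closest to a device.  The helper name is hypothetical;
 * dev_to_node() (from <linux/device.h>) may return NUMA_NO_NODE, which
 * means "no preference".
 */
static void *example_alloc_for_device(struct device *dev, size_t size)
{
	return vzalloc_node(size, dev_to_node(dev));
}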
3514 :
3515 : #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
3516 : #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
3517 : #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
3518 : #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
3519 : #else
3520 : /*
3521 : * 64-bit systems should always have either DMA or DMA32 zones. For others,
3522 : * GFP_DMA32 should do the right thing and use the normal zone.
3523 : */
3524 : #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
3525 : #endif
3526 :
3527 : /**
3528 : * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
3529 : * @size: allocation size
3530 : *
3531 : * Allocate enough 32bit PA addressable pages to cover @size from the
3532 : * page level allocator and map them into contiguous kernel virtual space.
3533 : *
3534 : * Return: pointer to the allocated memory or %NULL on error
3535 : */
3536 0 : void *vmalloc_32(unsigned long size)
3537 : {
3538 0 : return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
3539 0 : __builtin_return_address(0));
3540 : }
3541 : EXPORT_SYMBOL(vmalloc_32);
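/*
 * Illustrative sketch, not part of this file: a buffer whose backing pages
 * must all have physical addresses below 4 GiB, e.g. for a legacy device
 * that is programmed with per-page 32-bit bus addresses.  The name is
 * hypothetical; note the backing pages are still not physically contiguous.
 */
static void *example_alloc_legacy_pages(size_t size)
{
	return vmalloc_32(size);
}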
3542 :
3543 : /**
3544 : * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
3545 : * @size: allocation size
3546 : *
3547 : * The resulting memory area is 32bit addressable and zeroed so it can be
3548 : * mapped to userspace without leaking data.
3549 : *
3550 : * Return: pointer to the allocated memory or %NULL on error
3551 : */
3552 0 : void *vmalloc_32_user(unsigned long size)
3553 : {
3554 0 : return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
3555 0 : GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
3556 : VM_USERMAP, NUMA_NO_NODE,
3557 0 : __builtin_return_address(0));
3558 : }
3559 : EXPORT_SYMBOL(vmalloc_32_user);
3560 :
3561 : /*
3562 : * Atomically zero bytes in the iterator.
3563 : *
3564 : * Returns the number of zeroed bytes.
3565 : */
3566 0 : static size_t zero_iter(struct iov_iter *iter, size_t count)
3567 : {
3568 0 : size_t remains = count;
3569 :
3570 0 : while (remains > 0) {
3571 : size_t num, copied;
3572 :
3573 0 : num = min_t(size_t, remains, PAGE_SIZE);
3574 0 : copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
3575 0 : remains -= copied;
3576 :
3577 0 : if (copied < num)
3578 : break;
3579 : }
3580 :
3581 0 : return count - remains;
3582 : }
3583 :
3584 : /*
3585 : * Small helper routine: copy contents from addr to the iterator.
3586 : * If a page is not present, fill with zeroes.
3587 : *
3588 : * Returns the number of copied bytes.
3589 : */
3590 0 : static size_t aligned_vread_iter(struct iov_iter *iter,
3591 : const char *addr, size_t count)
3592 : {
3593 0 : size_t remains = count;
3594 : struct page *page;
3595 :
3596 0 : while (remains > 0) {
3597 : unsigned long offset, length;
3598 0 : size_t copied = 0;
3599 :
3600 0 : offset = offset_in_page(addr);
3601 0 : length = PAGE_SIZE - offset;
3602 0 : if (length > remains)
3603 0 : length = remains;
3604 0 : page = vmalloc_to_page(addr);
3605 : /*
3606 : * To do safe access to this _mapped_ area, we need a lock. But
3607 : * taking a lock here would add overhead to the vmalloc()/vfree()
3608 : * calls for this _debug_ interface, which is rarely used.
3609 : * Instead, we use a local mapping via
3610 : * copy_page_to_iter_nofault() and accept a small overhead in
3611 : * this access function.
3612 : */
3613 0 : if (page)
3614 0 : copied = copy_page_to_iter_nofault(page, offset,
3615 : length, iter);
3616 : else
3617 0 : copied = zero_iter(iter, length);
3618 :
3619 0 : addr += copied;
3620 0 : remains -= copied;
3621 :
3622 0 : if (copied != length)
3623 : break;
3624 : }
3625 :
3626 0 : return count - remains;
3627 : }
3628 :
3629 : /*
3630 : * Read from a vm_map_ram region of memory.
3631 : *
3632 : * Returns the number of copied bytes.
3633 : */
3634 0 : static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
3635 : size_t count, unsigned long flags)
3636 : {
3637 : char *start;
3638 : struct vmap_block *vb;
3639 : struct xarray *xa;
3640 : unsigned long offset;
3641 : unsigned int rs, re;
3642 : size_t remains, n;
3643 :
3644 : /*
3645 : * If it's an area created directly by the vm_map_ram() interface,
3646 : * and not further subdivided and delegated to vmap_block management,
3647 : * handle it here.
3648 : */
3649 0 : if (!(flags & VMAP_BLOCK))
3650 0 : return aligned_vread_iter(iter, addr, count);
3651 :
3652 0 : remains = count;
3653 :
3654 : /*
3655 : * The area is split into regions and tracked with vmap_block; read out
3656 : * each region and zero-fill the holes between regions.
3657 : */
3658 0 : xa = addr_to_vb_xa((unsigned long) addr);
3659 0 : vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
3660 0 : if (!vb)
3661 : goto finished_zero;
3662 :
3663 0 : spin_lock(&vb->lock);
3664 0 : if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
3665 0 : spin_unlock(&vb->lock);
3666 : goto finished_zero;
3667 : }
3668 :
3669 0 : for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
3670 : size_t copied;
3671 :
3672 0 : if (remains == 0)
3673 : goto finished;
3674 :
3675 0 : start = vmap_block_vaddr(vb->va->va_start, rs);
3676 :
3677 0 : if (addr < start) {
3678 0 : size_t to_zero = min_t(size_t, start - addr, remains);
3679 0 : size_t zeroed = zero_iter(iter, to_zero);
3680 :
3681 0 : addr += zeroed;
3682 0 : remains -= zeroed;
3683 :
3684 0 : if (remains == 0 || zeroed != to_zero)
3685 : goto finished;
3686 : }
3687 :
3688 : /* it could start reading from the middle of the used region */
3689 0 : offset = offset_in_page(addr);
3690 0 : n = ((re - rs + 1) << PAGE_SHIFT) - offset;
3691 0 : if (n > remains)
3692 0 : n = remains;
3693 :
3694 0 : copied = aligned_vread_iter(iter, start + offset, n);
3695 :
3696 0 : addr += copied;
3697 0 : remains -= copied;
3698 :
3699 0 : if (copied != n)
3700 : goto finished;
3701 : }
3702 :
3703 0 : spin_unlock(&vb->lock);
3704 :
3705 : finished_zero:
3706 : /* zero-fill the remaining dirty or free regions */
3707 0 : return count - remains + zero_iter(iter, remains);
3708 : finished:
3709 : /* We couldn't copy/zero everything */
3710 0 : spin_unlock(&vb->lock);
3711 0 : return count - remains;
3712 : }
3713 :
3714 : /**
3715 : * vread_iter() - read vmalloc area in a safe way to an iterator.
3716 : * @iter: the iterator to which data should be written.
3717 : * @addr: vm address.
3718 : * @count: number of bytes to be read.
3719 : *
3720 : * This function checks that @addr is a valid vmalloc'ed area and
3721 : * copies data from that area to the given iterator. If the memory
3722 : * range [addr...addr+count) includes some valid address, data is
3723 : * copied to @iter. If there are memory holes, they'll be zero-filled.
3724 : * An IOREMAP area is treated as a memory hole and no copy is done.
3725 : *
3726 : * If [addr...addr+count) doesn't include any intersection with an
3727 : * alive vm_struct area, 0 is returned.
3728 : *
3729 : * Note: In usual ops, vread_iter() is never necessary because the
3730 : * caller should know the vmalloc() area is valid and can use memcpy().
3731 : * This is for routines which have to access the vmalloc area without
3732 : * any information, such as /proc/kcore.
3733 : *
3734 : * Return: number of bytes for which @addr and the iterator should be
3735 : * advanced (same number as @count), or %0 if [addr...addr+count)
3736 : * doesn't include any intersection with a valid vmalloc area
3737 : */
3738 0 : long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
3739 : {
3740 : struct vmap_area *va;
3741 : struct vm_struct *vm;
3742 : char *vaddr;
3743 : size_t n, size, flags, remains;
3744 :
3745 0 : addr = kasan_reset_tag(addr);
3746 :
3747 : /* Don't allow overflow */
3748 0 : if ((unsigned long) addr + count < count)
3749 0 : count = -(unsigned long) addr;
3750 :
3751 0 : remains = count;
3752 :
3753 0 : spin_lock(&vmap_area_lock);
3754 0 : va = find_vmap_area_exceed_addr((unsigned long)addr);
3755 0 : if (!va)
3756 : goto finished_zero;
3757 :
3758 : /* no intersection with an alive vmap_area */
3759 0 : if ((unsigned long)addr + remains <= va->va_start)
3760 : goto finished_zero;
3761 :
3762 0 : list_for_each_entry_from(va, &vmap_area_list, list) {
3763 : size_t copied;
3764 :
3765 0 : if (remains == 0)
3766 : goto finished;
3767 :
3768 0 : vm = va->vm;
3769 0 : flags = va->flags & VMAP_FLAGS_MASK;
3770 : /*
3771 : * VMAP_BLOCK indicates a sub-type of vm_map_ram area; it needs
3772 : * to be set together with VMAP_RAM.
3773 : */
3774 0 : WARN_ON(flags == VMAP_BLOCK);
3775 :
3776 0 : if (!vm && !flags)
3777 0 : continue;
3778 :
3779 0 : if (vm && (vm->flags & VM_UNINITIALIZED))
3780 0 : continue;
3781 :
3782 : /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
3783 0 : smp_rmb();
3784 :
3785 0 : vaddr = (char *) va->va_start;
3786 0 : size = vm ? get_vm_area_size(vm) : va_size(va);
3787 :
3788 0 : if (addr >= vaddr + size)
3789 0 : continue;
3790 :
3791 0 : if (addr < vaddr) {
3792 0 : size_t to_zero = min_t(size_t, vaddr - addr, remains);
3793 0 : size_t zeroed = zero_iter(iter, to_zero);
3794 :
3795 0 : addr += zeroed;
3796 0 : remains -= zeroed;
3797 :
3798 0 : if (remains == 0 || zeroed != to_zero)
3799 : goto finished;
3800 : }
3801 :
3802 0 : n = vaddr + size - addr;
3803 0 : if (n > remains)
3804 0 : n = remains;
3805 :
3806 0 : if (flags & VMAP_RAM)
3807 0 : copied = vmap_ram_vread_iter(iter, addr, n, flags);
3808 0 : else if (!(vm->flags & VM_IOREMAP))
3809 0 : copied = aligned_vread_iter(iter, addr, n);
3810 : else /* IOREMAP area is treated as memory hole */
3811 0 : copied = zero_iter(iter, n);
3812 :
3813 0 : addr += copied;
3814 0 : remains -= copied;
3815 :
3816 0 : if (copied != n)
3817 : goto finished;
3818 : }
3819 :
3820 : finished_zero:
3821 0 : spin_unlock(&vmap_area_lock);
3822 : /* zero-fill memory holes */
3823 0 : return count - remains + zero_iter(iter, remains);
3824 : finished:
3825 : /* Nothing remains, or we couldn't copy/zero everything. */
3826 0 : spin_unlock(&vmap_area_lock);
3827 :
3828 0 : return count - remains;
3829 : }
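/*
 * Illustrative sketch, not part of this file: reading a vmalloc range into
 * a kernel buffer through an iov_iter, the way a /proc/kcore-style dumper
 * might.  Names are hypothetical; holes and IOREMAP ranges come back
 * zero-filled.
 */
static long example_dump_vmalloc(void *dst, const void *src, size_t len)
{
	struct kvec kvec = { .iov_base = dst, .iov_len = len };
	struct iov_iter iter;

	iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, len);

	return vread_iter(&iter, src, len);
}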
3830 :
3831 : /**
3832 : * remap_vmalloc_range_partial - map vmalloc pages to userspace
3833 : * @vma: vma to cover
3834 : * @uaddr: target user address to start at
3835 : * @kaddr: virtual address of vmalloc kernel memory
3836 : * @pgoff: offset from @kaddr to start at
3837 : * @size: size of map area
3838 : *
3839 : * Returns: 0 for success, -Exxx on failure
3840 : *
3841 : * This function checks that @kaddr is a valid vmalloc'ed area,
3842 : * and that it is big enough to cover the range starting at
3843 : * @uaddr in @vma. It returns failure if those criteria aren't
3844 : * met.
3845 : *
3846 : * Similar to remap_pfn_range() (see mm/memory.c)
3847 : */
3848 0 : int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
3849 : void *kaddr, unsigned long pgoff,
3850 : unsigned long size)
3851 : {
3852 : struct vm_struct *area;
3853 : unsigned long off;
3854 : unsigned long end_index;
3855 :
3856 0 : if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
3857 : return -EINVAL;
3858 :
3859 0 : size = PAGE_ALIGN(size);
3860 :
3861 0 : if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
3862 : return -EINVAL;
3863 :
3864 0 : area = find_vm_area(kaddr);
3865 0 : if (!area)
3866 : return -EINVAL;
3867 :
3868 0 : if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
3869 : return -EINVAL;
3870 :
3871 0 : if (check_add_overflow(size, off, &end_index) ||
3872 : end_index > get_vm_area_size(area))
3873 : return -EINVAL;
3874 0 : kaddr += off;
3875 :
3876 : do {
3877 0 : struct page *page = vmalloc_to_page(kaddr);
3878 : int ret;
3879 :
3880 0 : ret = vm_insert_page(vma, uaddr, page);
3881 0 : if (ret)
3882 : return ret;
3883 :
3884 0 : uaddr += PAGE_SIZE;
3885 0 : kaddr += PAGE_SIZE;
3886 0 : size -= PAGE_SIZE;
3887 0 : } while (size > 0);
3888 :
3889 0 : vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
3890 :
3891 0 : return 0;
3892 : }
3893 :
3894 : /**
3895 : * remap_vmalloc_range - map vmalloc pages to userspace
3896 : * @vma: vma to cover (map full range of vma)
3897 : * @addr: vmalloc memory
3898 : * @pgoff: number of pages into addr before first page to map
3899 : *
3900 : * Returns: 0 for success, -Exxx on failure
3901 : *
3902 : * This function checks that @addr is a valid vmalloc'ed area, and
3903 : * that it is big enough to cover the vma. It returns failure if
3904 : * those criteria aren't met.
3905 : *
3906 : * Similar to remap_pfn_range() (see mm/memory.c)
3907 : */
3908 0 : int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
3909 : unsigned long pgoff)
3910 : {
3911 0 : return remap_vmalloc_range_partial(vma, vma->vm_start,
3912 : addr, pgoff,
3913 0 : vma->vm_end - vma->vm_start);
3914 : }
3915 : EXPORT_SYMBOL(remap_vmalloc_range);
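/*
 * Illustrative sketch, not part of this file: the usual pairing of
 * vmalloc_user() with remap_vmalloc_range() in a driver's mmap handler.
 * The private_data buffer is hypothetical and assumed to come from a
 * prior vmalloc_user() call, so it is zeroed and has VM_USERMAP set.
 */
static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	void *buf = file->private_data;

	/* maps the whole VMA, starting vm_pgoff pages into the buffer */
	return remap_vmalloc_range(vma, buf, vma->vm_pgoff);
}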
3916 :
3917 0 : void free_vm_area(struct vm_struct *area)
3918 : {
3919 : struct vm_struct *ret;
3920 0 : ret = remove_vm_area(area->addr);
3921 0 : BUG_ON(ret != area);
3922 0 : kfree(area);
3923 0 : }
3924 : EXPORT_SYMBOL_GPL(free_vm_area);
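/*
 * Illustrative sketch, not part of this file: reserving a chunk of kernel
 * virtual address space with get_vm_area() (defined earlier in this file)
 * and releasing it with free_vm_area() once the caller's own mappings are
 * gone.  The 16-page size and the VM_IOREMAP flag are hypothetical.
 */
static struct vm_struct *example_reserve_va(void)
{
	return get_vm_area(16 * PAGE_SIZE, VM_IOREMAP);
}

static void example_release_va(struct vm_struct *area)
{
	free_vm_area(area);	/* removes the area and frees the vm_struct */
}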
3925 :
3926 : #ifdef CONFIG_SMP
3927 : static struct vmap_area *node_to_va(struct rb_node *n)
3928 : {
3929 : return rb_entry_safe(n, struct vmap_area, rb_node);
3930 : }
3931 :
3932 : /**
3933 : * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
3934 : * @addr: target address
3935 : *
3936 : * Returns: the vmap_area if it is found. If there is no such area,
3937 : * the closest preceding vmap_area (in reverse order) is returned,
3938 : * i.e. one with va->va_start < addr && va->va_end < addr, or NULL
3939 : * if there are no areas before @addr.
3940 : */
3941 : static struct vmap_area *
3942 : pvm_find_va_enclose_addr(unsigned long addr)
3943 : {
3944 : struct vmap_area *va, *tmp;
3945 : struct rb_node *n;
3946 :
3947 : n = free_vmap_area_root.rb_node;
3948 : va = NULL;
3949 :
3950 : while (n) {
3951 : tmp = rb_entry(n, struct vmap_area, rb_node);
3952 : if (tmp->va_start <= addr) {
3953 : va = tmp;
3954 : if (tmp->va_end >= addr)
3955 : break;
3956 :
3957 : n = n->rb_right;
3958 : } else {
3959 : n = n->rb_left;
3960 : }
3961 : }
3962 :
3963 : return va;
3964 : }
3965 :
3966 : /**
3967 : * pvm_determine_end_from_reverse - find the highest aligned address
3968 : * of a free block below VMALLOC_END
3969 : * @va:
3970 : * in - the VA we start the search from (reverse order);
3971 : * out - the VA with the highest aligned end address.
3972 : * @align: alignment for required highest address
3973 : *
3974 : * Returns: determined end address within vmap_area
3975 : */
3976 : static unsigned long
3977 : pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
3978 : {
3979 : unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3980 : unsigned long addr;
3981 :
3982 : if (likely(*va)) {
3983 : list_for_each_entry_from_reverse((*va),
3984 : &free_vmap_area_list, list) {
3985 : addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
3986 : if ((*va)->va_start < addr)
3987 : return addr;
3988 : }
3989 : }
3990 :
3991 : return 0;
3992 : }
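/*
 * Illustrative note, not part of this file: "x & ~(align - 1)" rounds x
 * down to a power-of-two alignment.  With hypothetical values:
 *
 *	align  = 0x10000  (64 KiB)
 *	va_end = 0x12345678
 *	va_end & ~(align - 1) == 0x12340000
 *
 * i.e. the highest 64 KiB-aligned address not exceeding va_end.
 */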
3993 :
3994 : /**
3995 : * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
3996 : * @offsets: array containing offset of each area
3997 : * @sizes: array containing size of each area
3998 : * @nr_vms: the number of areas to allocate
3999 : * @align: alignment, all entries in @offsets and @sizes must be aligned to this
4000 : *
4001 : * Returns: kmalloc'd vm_struct pointer array pointing to allocated
4002 : * vm_structs on success, %NULL on failure
4003 : *
4004 : * Percpu allocator wants to use congruent vm areas so that it can
4005 : * maintain the offsets among percpu areas. This function allocates
4006 : * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
4007 : * be scattered pretty far, with the distance between two areas easily
4008 : * going up to gigabytes. To avoid interacting with regular vmallocs,
4009 : * these areas are allocated from the top.
4010 : *
4011 : * Despite its complicated look, this allocator is rather simple. It
4012 : * does everything top-down and scans free blocks from the end looking
4013 : * for a matching base. While scanning, if any of the areas do not fit,
4014 : * the base address is pulled down to fit the area. Scanning is repeated
4015 : * till all the areas fit and then all necessary data structures are
4016 : * inserted and the result is returned.
4017 : */
4018 : struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
4019 : const size_t *sizes, int nr_vms,
4020 : size_t align)
4021 : {
4022 : const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
4023 : const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
4024 : struct vmap_area **vas, *va;
4025 : struct vm_struct **vms;
4026 : int area, area2, last_area, term_area;
4027 : unsigned long base, start, size, end, last_end, orig_start, orig_end;
4028 : bool purged = false;
4029 :
4030 : /* verify parameters and allocate data structures */
4031 : BUG_ON(offset_in_page(align) || !is_power_of_2(align));
4032 : for (last_area = 0, area = 0; area < nr_vms; area++) {
4033 : start = offsets[area];
4034 : end = start + sizes[area];
4035 :
4036 : /* is everything aligned properly? */
4037 : BUG_ON(!IS_ALIGNED(offsets[area], align));
4038 : BUG_ON(!IS_ALIGNED(sizes[area], align));
4039 :
4040 : /* detect the area with the highest address */
4041 : if (start > offsets[last_area])
4042 : last_area = area;
4043 :
4044 : for (area2 = area + 1; area2 < nr_vms; area2++) {
4045 : unsigned long start2 = offsets[area2];
4046 : unsigned long end2 = start2 + sizes[area2];
4047 :
4048 : BUG_ON(start2 < end && start < end2);
4049 : }
4050 : }
4051 : last_end = offsets[last_area] + sizes[last_area];
4052 :
4053 : if (vmalloc_end - vmalloc_start < last_end) {
4054 : WARN_ON(true);
4055 : return NULL;
4056 : }
4057 :
4058 : vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
4059 : vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
4060 : if (!vas || !vms)
4061 : goto err_free2;
4062 :
4063 : for (area = 0; area < nr_vms; area++) {
4064 : vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
4065 : vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
4066 : if (!vas[area] || !vms[area])
4067 : goto err_free;
4068 : }
4069 : retry:
4070 : spin_lock(&free_vmap_area_lock);
4071 :
4072 : /* start scanning - we scan from the top, begin with the last area */
4073 : area = term_area = last_area;
4074 : start = offsets[area];
4075 : end = start + sizes[area];
4076 :
4077 : va = pvm_find_va_enclose_addr(vmalloc_end);
4078 : base = pvm_determine_end_from_reverse(&va, align) - end;
4079 :
4080 : while (true) {
4081 : /*
4082 : * base might have underflowed, add last_end before
4083 : * comparing.
4084 : */
4085 : if (base + last_end < vmalloc_start + last_end)
4086 : goto overflow;
4087 :
4088 : /*
4089 : * Fitting base has not been found.
4090 : */
4091 : if (va == NULL)
4092 : goto overflow;
4093 :
4094 : /*
4095 : * If required width exceeds current VA block, move
4096 : * base downwards and then recheck.
4097 : */
4098 : if (base + end > va->va_end) {
4099 : base = pvm_determine_end_from_reverse(&va, align) - end;
4100 : term_area = area;
4101 : continue;
4102 : }
4103 :
4104 : /*
4105 : * If this VA does not fit, move base downwards and recheck.
4106 : */
4107 : if (base + start < va->va_start) {
4108 : va = node_to_va(rb_prev(&va->rb_node));
4109 : base = pvm_determine_end_from_reverse(&va, align) - end;
4110 : term_area = area;
4111 : continue;
4112 : }
4113 :
4114 : /*
4115 : * This area fits, move on to the previous one. If
4116 : * the previous one is the terminal one, we're done.
4117 : */
4118 : area = (area + nr_vms - 1) % nr_vms;
4119 : if (area == term_area)
4120 : break;
4121 :
4122 : start = offsets[area];
4123 : end = start + sizes[area];
4124 : va = pvm_find_va_enclose_addr(base + end);
4125 : }
4126 :
4127 : /* we've found a fitting base, insert all va's */
4128 : for (area = 0; area < nr_vms; area++) {
4129 : int ret;
4130 :
4131 : start = base + offsets[area];
4132 : size = sizes[area];
4133 :
4134 : va = pvm_find_va_enclose_addr(start);
4135 : if (WARN_ON_ONCE(va == NULL))
4136 : /* It is a BUG(), but trigger recovery instead. */
4137 : goto recovery;
4138 :
4139 : ret = adjust_va_to_fit_type(&free_vmap_area_root,
4140 : &free_vmap_area_list,
4141 : va, start, size);
4142 : if (WARN_ON_ONCE(unlikely(ret)))
4143 : /* It is a BUG(), but trigger recovery instead. */
4144 : goto recovery;
4145 :
4146 : /* Allocated area. */
4147 : va = vas[area];
4148 : va->va_start = start;
4149 : va->va_end = start + size;
4150 : }
4151 :
4152 : spin_unlock(&free_vmap_area_lock);
4153 :
4154 : /* populate the kasan shadow space */
4155 : for (area = 0; area < nr_vms; area++) {
4156 : if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
4157 : goto err_free_shadow;
4158 : }
4159 :
4160 : /* insert all vm's */
4161 : spin_lock(&vmap_area_lock);
4162 : for (area = 0; area < nr_vms; area++) {
4163 : insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
4164 :
4165 : setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
4166 : pcpu_get_vm_areas);
4167 : }
4168 : spin_unlock(&vmap_area_lock);
4169 :
4170 : /*
4171 : * Mark allocated areas as accessible. Do it now as a best-effort
4172 : * approach, as they can be mapped outside of vmalloc code.
4173 : * With hardware tag-based KASAN, marking is skipped for
4174 : * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
4175 : */
4176 : for (area = 0; area < nr_vms; area++)
4177 : vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
4178 : vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
4179 :
4180 : kfree(vas);
4181 : return vms;
4182 :
4183 : recovery:
4184 : /*
4185 : * Remove previously allocated areas. There is no
4186 : * need to remove these areas from the busy tree,
4187 : * because they are inserted only on the final step
4188 : * and only when pcpu_get_vm_areas() succeeds.
4189 : */
4190 : while (area--) {
4191 : orig_start = vas[area]->va_start;
4192 : orig_end = vas[area]->va_end;
4193 : va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
4194 : &free_vmap_area_list);
4195 : if (va)
4196 : kasan_release_vmalloc(orig_start, orig_end,
4197 : va->va_start, va->va_end);
4198 : vas[area] = NULL;
4199 : }
4200 :
4201 : overflow:
4202 : spin_unlock(&free_vmap_area_lock);
4203 : if (!purged) {
4204 : reclaim_and_purge_vmap_areas();
4205 : purged = true;
4206 :
4207 : /* Before "retry", check if we recover. */
4208 : for (area = 0; area < nr_vms; area++) {
4209 : if (vas[area])
4210 : continue;
4211 :
4212 : vas[area] = kmem_cache_zalloc(
4213 : vmap_area_cachep, GFP_KERNEL);
4214 : if (!vas[area])
4215 : goto err_free;
4216 : }
4217 :
4218 : goto retry;
4219 : }
4220 :
4221 : err_free:
4222 : for (area = 0; area < nr_vms; area++) {
4223 : if (vas[area])
4224 : kmem_cache_free(vmap_area_cachep, vas[area]);
4225 :
4226 : kfree(vms[area]);
4227 : }
4228 : err_free2:
4229 : kfree(vas);
4230 : kfree(vms);
4231 : return NULL;
4232 :
4233 : err_free_shadow:
4234 : spin_lock(&free_vmap_area_lock);
4235 : /*
4236 : * We release all the vmalloc shadows, even the ones for regions that
4237 : * hadn't been successfully added. This relies on kasan_release_vmalloc
4238 : * being able to tolerate this case.
4239 : */
4240 : for (area = 0; area < nr_vms; area++) {
4241 : orig_start = vas[area]->va_start;
4242 : orig_end = vas[area]->va_end;
4243 : va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
4244 : &free_vmap_area_list);
4245 : if (va)
4246 : kasan_release_vmalloc(orig_start, orig_end,
4247 : va->va_start, va->va_end);
4248 : vas[area] = NULL;
4249 : kfree(vms[area]);
4250 : }
4251 : spin_unlock(&free_vmap_area_lock);
4252 : kfree(vas);
4253 : kfree(vms);
4254 : return NULL;
4255 : }
4256 :
4257 : /**
4258 : * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
4259 : * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
4260 : * @nr_vms: the number of allocated areas
4261 : *
4262 : * Free vm_structs and the array allocated by pcpu_get_vm_areas().
4263 : */
4264 : void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
4265 : {
4266 : int i;
4267 :
4268 : for (i = 0; i < nr_vms; i++)
4269 : free_vm_area(vms[i]);
4270 : kfree(vms);
4271 : }
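/*
 * Illustrative sketch, not part of this file: how the percpu allocator
 * side might pair pcpu_get_vm_areas() with pcpu_free_vm_areas().  The
 * offsets, sizes and alignment below are hypothetical; the real values
 * come from the percpu chunk layout in mm/percpu.c.
 */
static void example_percpu_va_cycle(void)
{
	const unsigned long offsets[2] = { 0, 4 * PMD_SIZE };
	const size_t sizes[2] = { PMD_SIZE, PMD_SIZE };
	struct vm_struct **vms;

	vms = pcpu_get_vm_areas(offsets, sizes, 2, PMD_SIZE);
	if (!vms)
		return;

	/* ... map pages into vms[0]->addr and vms[1]->addr ... */

	pcpu_free_vm_areas(vms, 2);
}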
4272 : #endif /* CONFIG_SMP */
4273 :
4274 : #ifdef CONFIG_PRINTK
4275 0 : bool vmalloc_dump_obj(void *object)
4276 : {
4277 : struct vm_struct *vm;
4278 0 : void *objp = (void *)PAGE_ALIGN((unsigned long)object);
4279 :
4280 0 : vm = find_vm_area(objp);
4281 0 : if (!vm)
4282 : return false;
4283 0 : pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
4284 : vm->nr_pages, (unsigned long)vm->addr, vm->caller);
4285 0 : return true;
4286 : }
4287 : #endif
4288 :
4289 : #ifdef CONFIG_PROC_FS
4290 0 : static void *s_start(struct seq_file *m, loff_t *pos)
4291 : __acquires(&vmap_purge_lock)
4292 : __acquires(&vmap_area_lock)
4293 : {
4294 0 : mutex_lock(&vmap_purge_lock);
4295 0 : spin_lock(&vmap_area_lock);
4296 :
4297 0 : return seq_list_start(&vmap_area_list, *pos);
4298 : }
4299 :
4300 0 : static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4301 : {
4302 0 : return seq_list_next(p, &vmap_area_list, pos);
4303 : }
4304 :
4305 0 : static void s_stop(struct seq_file *m, void *p)
4306 : __releases(&vmap_area_lock)
4307 : __releases(&vmap_purge_lock)
4308 : {
4309 0 : spin_unlock(&vmap_area_lock);
4310 0 : mutex_unlock(&vmap_purge_lock);
4311 0 : }
4312 :
4313 : static void show_numa_info(struct seq_file *m, struct vm_struct *v)
4314 : {
4315 : if (IS_ENABLED(CONFIG_NUMA)) {
4316 : unsigned int nr, *counters = m->private;
4317 : unsigned int step = 1U << vm_area_page_order(v);
4318 :
4319 : if (!counters)
4320 : return;
4321 :
4322 : if (v->flags & VM_UNINITIALIZED)
4323 : return;
4324 : /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
4325 : smp_rmb();
4326 :
4327 : memset(counters, 0, nr_node_ids * sizeof(unsigned int));
4328 :
4329 : for (nr = 0; nr < v->nr_pages; nr += step)
4330 : counters[page_to_nid(v->pages[nr])] += step;
4331 : for_each_node_state(nr, N_HIGH_MEMORY)
4332 : if (counters[nr])
4333 : seq_printf(m, " N%u=%u", nr, counters[nr]);
4334 : }
4335 : }
4336 :
4337 0 : static void show_purge_info(struct seq_file *m)
4338 : {
4339 : struct vmap_area *va;
4340 :
4341 0 : spin_lock(&purge_vmap_area_lock);
4342 0 : list_for_each_entry(va, &purge_vmap_area_list, list) {
4343 0 : seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
4344 : (void *)va->va_start, (void *)va->va_end,
4345 0 : va->va_end - va->va_start);
4346 : }
4347 0 : spin_unlock(&purge_vmap_area_lock);
4348 0 : }
4349 :
4350 0 : static int s_show(struct seq_file *m, void *p)
4351 : {
4352 : struct vmap_area *va;
4353 : struct vm_struct *v;
4354 :
4355 0 : va = list_entry(p, struct vmap_area, list);
4356 :
4357 0 : if (!va->vm) {
4358 0 : if (va->flags & VMAP_RAM)
4359 0 : seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
4360 : (void *)va->va_start, (void *)va->va_end,
4361 0 : va->va_end - va->va_start);
4362 :
4363 : goto final;
4364 : }
4365 :
4366 0 : v = va->vm;
4367 :
4368 0 : seq_printf(m, "0x%pK-0x%pK %7ld",
4369 0 : v->addr, v->addr + v->size, v->size);
4370 :
4371 0 : if (v->caller)
4372 0 : seq_printf(m, " %pS", v->caller);
4373 :
4374 0 : if (v->nr_pages)
4375 0 : seq_printf(m, " pages=%d", v->nr_pages);
4376 :
4377 0 : if (v->phys_addr)
4378 0 : seq_printf(m, " phys=%pa", &v->phys_addr);
4379 :
4380 0 : if (v->flags & VM_IOREMAP)
4381 0 : seq_puts(m, " ioremap");
4382 :
4383 0 : if (v->flags & VM_ALLOC)
4384 0 : seq_puts(m, " vmalloc");
4385 :
4386 0 : if (v->flags & VM_MAP)
4387 0 : seq_puts(m, " vmap");
4388 :
4389 0 : if (v->flags & VM_USERMAP)
4390 0 : seq_puts(m, " user");
4391 :
4392 0 : if (v->flags & VM_DMA_COHERENT)
4393 0 : seq_puts(m, " dma-coherent");
4394 :
4395 0 : if (is_vmalloc_addr(v->pages))
4396 0 : seq_puts(m, " vpages");
4397 :
4398 0 : show_numa_info(m, v);
4399 0 : seq_putc(m, '\n');
4400 :
4401 : /*
4402 : * As a final step, dump "unpurged" areas.
4403 : */
4404 : final:
4405 0 : if (list_is_last(&va->list, &vmap_area_list))
4406 0 : show_purge_info(m);
4407 :
4408 0 : return 0;
4409 : }
4410 :
4411 : static const struct seq_operations vmalloc_op = {
4412 : .start = s_start,
4413 : .next = s_next,
4414 : .stop = s_stop,
4415 : .show = s_show,
4416 : };
4417 :
4418 1 : static int __init proc_vmalloc_init(void)
4419 : {
4420 : if (IS_ENABLED(CONFIG_NUMA))
4421 : proc_create_seq_private("vmallocinfo", 0400, NULL,
4422 : &vmalloc_op,
4423 : nr_node_ids * sizeof(unsigned int), NULL);
4424 : else
4425 1 : proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
4426 1 : return 0;
4427 : }
4428 : module_init(proc_vmalloc_init);
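/*
 * Illustrative note, not part of this file: given the seq_printf() formats
 * above, a /proc/vmallocinfo line for a vmalloc'ed region looks roughly
 * like the following (addresses and caller are hypothetical, and %pK may
 * print hashed or zeroed pointers depending on kptr_restrict):
 *
 *	0xffffc90000000000-0xffffc90000005000   20480 example_fn+0x24/0x80 pages=4 vmalloc N0=4
 */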
4429 :
4430 : #endif
4431 :
4432 1 : void __init vmalloc_init(void)
4433 : {
4434 : struct vmap_area *va;
4435 : struct vm_struct *tmp;
4436 : int i;
4437 :
4438 : /*
4439 : * Create the cache for vmap_area objects.
4440 : */
4441 1 : vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
4442 :
4443 2 : for_each_possible_cpu(i) {
4444 : struct vmap_block_queue *vbq;
4445 : struct vfree_deferred *p;
4446 :
4447 1 : vbq = &per_cpu(vmap_block_queue, i);
4448 1 : spin_lock_init(&vbq->lock);
4449 2 : INIT_LIST_HEAD(&vbq->free);
4450 1 : p = &per_cpu(vfree_deferred, i);
4451 2 : init_llist_head(&p->list);
4452 2 : INIT_WORK(&p->wq, delayed_vfree_work);
4453 2 : xa_init(&vbq->vmap_blocks);
4454 : }
4455 :
4456 : /* Import existing vmlist entries. */
4457 1 : for (tmp = vmlist; tmp; tmp = tmp->next) {
4458 0 : va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
4459 0 : if (WARN_ON_ONCE(!va))
4460 0 : continue;
4461 :
4462 0 : va->va_start = (unsigned long)tmp->addr;
4463 0 : va->va_end = va->va_start + tmp->size;
4464 0 : va->vm = tmp;
4465 0 : insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
4466 : }
4467 :
4468 : /*
4469 : * Now we can initialize a free vmap space.
4470 : */
4471 1 : vmap_init_free_space();
4472 1 : vmap_initialized = true;
4473 1 : }
|