LCOV - code coverage report
Current view: top level - mm - util.c (source / functions)
Test: coverage.info
Date: 2023-04-06 08:38:28

              Hit    Total    Coverage
Lines:         29      293       9.9 %
Functions:      6       46      13.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : #include <linux/mm.h>
       3             : #include <linux/slab.h>
       4             : #include <linux/string.h>
       5             : #include <linux/compiler.h>
       6             : #include <linux/export.h>
       7             : #include <linux/err.h>
       8             : #include <linux/sched.h>
       9             : #include <linux/sched/mm.h>
      10             : #include <linux/sched/signal.h>
      11             : #include <linux/sched/task_stack.h>
      12             : #include <linux/security.h>
      13             : #include <linux/swap.h>
      14             : #include <linux/swapops.h>
      15             : #include <linux/mman.h>
      16             : #include <linux/hugetlb.h>
      17             : #include <linux/vmalloc.h>
      18             : #include <linux/userfaultfd_k.h>
      19             : #include <linux/elf.h>
      20             : #include <linux/elf-randomize.h>
      21             : #include <linux/personality.h>
      22             : #include <linux/random.h>
      23             : #include <linux/processor.h>
      24             : #include <linux/sizes.h>
      25             : #include <linux/compat.h>
      26             : 
      27             : #include <linux/uaccess.h>
      28             : 
      29             : #include "internal.h"
      30             : #include "swap.h"
      31             : 
      32             : /**
      33             :  * kfree_const - conditionally free memory
      34             :  * @x: pointer to the memory
      35             :  *
       36             :  * This function calls kfree() only if @x is not in the .rodata section.
      37             :  */
      38        1349 : void kfree_const(const void *x)
      39             : {
      40        2698 :         if (!is_kernel_rodata((unsigned long)x))
      41         853 :                 kfree(x);
      42        1349 : }
      43             : EXPORT_SYMBOL(kfree_const);
      44             : 
      45             : /**
      46             :  * kstrdup - allocate space for and copy an existing string
      47             :  * @s: the string to duplicate
      48             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
      49             :  *
      50             :  * Return: newly allocated copy of @s or %NULL in case of error
      51             :  */
      52        2320 : char *kstrdup(const char *s, gfp_t gfp)
      53             : {
      54             :         size_t len;
      55             :         char *buf;
      56             : 
      57        2320 :         if (!s)
      58             :                 return NULL;
      59             : 
      60        2318 :         len = strlen(s) + 1;
      61        2318 :         buf = kmalloc_track_caller(len, gfp);
      62        2318 :         if (buf)
      63        2318 :                 memcpy(buf, s, len);
      64             :         return buf;
      65             : }
      66             : EXPORT_SYMBOL(kstrdup);
      67             : 
      68             : /**
      69             :  * kstrdup_const - conditionally duplicate an existing const string
      70             :  * @s: the string to duplicate
      71             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
      72             :  *
      73             :  * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
      74             :  * must not be passed to krealloc().
      75             :  *
       76             :  * Return: source string if it is in the .rodata section, otherwise
       77             :  * a newly allocated copy from kstrdup().
      78             :  */
      79        9520 : const char *kstrdup_const(const char *s, gfp_t gfp)
      80             : {
      81       19040 :         if (is_kernel_rodata((unsigned long)s))
      82             :                 return s;
      83             : 
      84        2301 :         return kstrdup(s, gfp);
      85             : }
      86             : EXPORT_SYMBOL(kstrdup_const);
      87             : 
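As an illustration, a minimal sketch of a hypothetical caller pairing kstrdup_const() with kfree_const(); struct demo_obj and the demo_ helpers are invented for this example and are not part of mm/util.c:

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>

struct demo_obj {
        const char *name;       /* may point into .rodata or at a heap copy */
};

static int demo_set_name(struct demo_obj *obj, const char *name)
{
        obj->name = kstrdup_const(name, GFP_KERNEL);
        return obj->name ? 0 : -ENOMEM;
}

static void demo_release_name(struct demo_obj *obj)
{
        kfree_const(obj->name); /* no copy was made for .rodata strings, no free either */
        obj->name = NULL;
}
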
      88             : /**
      89             :  * kstrndup - allocate space for and copy an existing string
      90             :  * @s: the string to duplicate
      91             :  * @max: read at most @max chars from @s
      92             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
      93             :  *
      94             :  * Note: Use kmemdup_nul() instead if the size is known exactly.
      95             :  *
      96             :  * Return: newly allocated copy of @s or %NULL in case of error
      97             :  */
      98           0 : char *kstrndup(const char *s, size_t max, gfp_t gfp)
      99             : {
     100             :         size_t len;
     101             :         char *buf;
     102             : 
     103           0 :         if (!s)
     104             :                 return NULL;
     105             : 
     106           0 :         len = strnlen(s, max);
     107           0 :         buf = kmalloc_track_caller(len+1, gfp);
     108           0 :         if (buf) {
     109           0 :                 memcpy(buf, s, len);
     110           0 :                 buf[len] = '\0';
     111             :         }
     112             :         return buf;
     113             : }
     114             : EXPORT_SYMBOL(kstrndup);
     115             : 
     116             : /**
     117             :  * kmemdup - duplicate region of memory
     118             :  *
     119             :  * @src: memory region to duplicate
     120             :  * @len: memory region length
     121             :  * @gfp: GFP mask to use
     122             :  *
     123             :  * Return: newly allocated copy of @src or %NULL in case of error,
     124             :  * result is physically contiguous. Use kfree() to free.
     125             :  */
     126          10 : void *kmemdup(const void *src, size_t len, gfp_t gfp)
     127             : {
     128             :         void *p;
     129             : 
     130          10 :         p = kmalloc_track_caller(len, gfp);
     131          10 :         if (p)
     132          10 :                 memcpy(p, src, len);
     133          10 :         return p;
     134             : }
     135             : EXPORT_SYMBOL(kmemdup);
     136             : 
     137             : /**
     138             :  * kvmemdup - duplicate region of memory
     139             :  *
     140             :  * @src: memory region to duplicate
     141             :  * @len: memory region length
     142             :  * @gfp: GFP mask to use
     143             :  *
     144             :  * Return: newly allocated copy of @src or %NULL in case of error,
      145             :  * result may not be physically contiguous. Use kvfree() to free.
     146             :  */
     147           0 : void *kvmemdup(const void *src, size_t len, gfp_t gfp)
     148             : {
     149             :         void *p;
     150             : 
     151           0 :         p = kvmalloc(len, gfp);
     152           0 :         if (p)
     153           0 :                 memcpy(p, src, len);
     154           0 :         return p;
     155             : }
     156             : EXPORT_SYMBOL(kvmemdup);
     157             : 
     158             : /**
     159             :  * kmemdup_nul - Create a NUL-terminated string from unterminated data
     160             :  * @s: The data to stringify
     161             :  * @len: The size of the data
     162             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
     163             :  *
     164             :  * Return: newly allocated copy of @s with NUL-termination or %NULL in
     165             :  * case of error
     166             :  */
     167          27 : char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
     168             : {
     169             :         char *buf;
     170             : 
     171          27 :         if (!s)
     172             :                 return NULL;
     173             : 
     174          27 :         buf = kmalloc_track_caller(len + 1, gfp);
     175          27 :         if (buf) {
     176          27 :                 memcpy(buf, s, len);
     177          27 :                 buf[len] = '\0';
     178             :         }
     179             :         return buf;
     180             : }
     181             : EXPORT_SYMBOL(kmemdup_nul);
     182             : 
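A minimal sketch of a hypothetical helper that uses kmemdup_nul() to turn a counted, possibly unterminated byte range into a regular C string (demo_range_to_string is invented for this example). Unlike kstrndup(), the length is taken as exact rather than as an upper bound:

#include <linux/slab.h>
#include <linux/string.h>

static char *demo_range_to_string(const char *data, size_t len)
{
        /* Copies exactly len bytes and appends a '\0'; free with kfree(). */
        return kmemdup_nul(data, len, GFP_KERNEL);
}
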
     183             : /**
     184             :  * memdup_user - duplicate memory region from user space
     185             :  *
     186             :  * @src: source address in user space
     187             :  * @len: number of bytes to copy
     188             :  *
     189             :  * Return: an ERR_PTR() on failure.  Result is physically
     190             :  * contiguous, to be freed by kfree().
     191             :  */
     192           0 : void *memdup_user(const void __user *src, size_t len)
     193             : {
     194             :         void *p;
     195             : 
     196           0 :         p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
     197           0 :         if (!p)
     198             :                 return ERR_PTR(-ENOMEM);
     199             : 
     200           0 :         if (copy_from_user(p, src, len)) {
     201           0 :                 kfree(p);
     202           0 :                 return ERR_PTR(-EFAULT);
     203             :         }
     204             : 
     205             :         return p;
     206             : }
     207             : EXPORT_SYMBOL(memdup_user);
     208             : 
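A minimal sketch of a hypothetical ioctl-style path that pulls a fixed-size argument struct from user space with memdup_user(); struct demo_args and demo_ioctl_set are invented for this example:

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

struct demo_args {
        u32 flags;
        u64 addr;
};

static long demo_ioctl_set(void __user *uarg)
{
        struct demo_args *args;

        args = memdup_user(uarg, sizeof(*args));
        if (IS_ERR(args))
                return PTR_ERR(args);   /* -ENOMEM or -EFAULT */

        /* ... act on args->flags and args->addr ... */

        kfree(args);                    /* physically contiguous, so plain kfree() */
        return 0;
}
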
     209             : /**
     210             :  * vmemdup_user - duplicate memory region from user space
     211             :  *
     212             :  * @src: source address in user space
     213             :  * @len: number of bytes to copy
     214             :  *
      215             :  * Return: an ERR_PTR() on failure.  Result may not be
     216             :  * physically contiguous.  Use kvfree() to free.
     217             :  */
     218           0 : void *vmemdup_user(const void __user *src, size_t len)
     219             : {
     220             :         void *p;
     221             : 
     222           0 :         p = kvmalloc(len, GFP_USER);
     223           0 :         if (!p)
     224             :                 return ERR_PTR(-ENOMEM);
     225             : 
     226           0 :         if (copy_from_user(p, src, len)) {
     227           0 :                 kvfree(p);
     228           0 :                 return ERR_PTR(-EFAULT);
     229             :         }
     230             : 
     231             :         return p;
     232             : }
     233             : EXPORT_SYMBOL(vmemdup_user);
     234             : 
     235             : /**
     236             :  * strndup_user - duplicate an existing string from user space
     237             :  * @s: The string to duplicate
     238             :  * @n: Maximum number of bytes to copy, including the trailing NUL.
     239             :  *
     240             :  * Return: newly allocated copy of @s or an ERR_PTR() in case of error
     241             :  */
     242           0 : char *strndup_user(const char __user *s, long n)
     243             : {
     244             :         char *p;
     245             :         long length;
     246             : 
     247           0 :         length = strnlen_user(s, n);
     248             : 
     249           0 :         if (!length)
     250             :                 return ERR_PTR(-EFAULT);
     251             : 
     252           0 :         if (length > n)
     253             :                 return ERR_PTR(-EINVAL);
     254             : 
     255           0 :         p = memdup_user(s, length);
     256             : 
     257           0 :         if (IS_ERR(p))
     258             :                 return p;
     259             : 
     260           0 :         p[length - 1] = '\0';
     261             : 
     262           0 :         return p;
     263             : }
     264             : EXPORT_SYMBOL(strndup_user);
     265             : 
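A minimal sketch of a hypothetical caller bounding a user-supplied string at PATH_MAX via strndup_user(); demo_open_by_name is invented for this example:

#include <linux/err.h>
#include <linux/limits.h>
#include <linux/slab.h>
#include <linux/string.h>

static int demo_open_by_name(const char __user *uname)
{
        char *name = strndup_user(uname, PATH_MAX);

        if (IS_ERR(name))
                return PTR_ERR(name);   /* -EFAULT or -EINVAL */

        /* ... look up the object called 'name' ... */

        kfree(name);
        return 0;
}
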
     266             : /**
     267             :  * memdup_user_nul - duplicate memory region from user space and NUL-terminate
     268             :  *
     269             :  * @src: source address in user space
     270             :  * @len: number of bytes to copy
     271             :  *
     272             :  * Return: an ERR_PTR() on failure.
     273             :  */
     274           0 : void *memdup_user_nul(const void __user *src, size_t len)
     275             : {
     276             :         char *p;
     277             : 
     278             :         /*
     279             :          * Always use GFP_KERNEL, since copy_from_user() can sleep and
      280             :          * cause a page fault, which makes it pointless to use GFP_NOFS
     281             :          * or GFP_ATOMIC.
     282             :          */
     283           0 :         p = kmalloc_track_caller(len + 1, GFP_KERNEL);
     284           0 :         if (!p)
     285             :                 return ERR_PTR(-ENOMEM);
     286             : 
     287           0 :         if (copy_from_user(p, src, len)) {
     288           0 :                 kfree(p);
     289           0 :                 return ERR_PTR(-EFAULT);
     290             :         }
     291           0 :         p[len] = '\0';
     292             : 
     293           0 :         return p;
     294             : }
     295             : EXPORT_SYMBOL(memdup_user_nul);
     296             : 
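A minimal sketch of a hypothetical debugfs/procfs-style write handler, where memdup_user_nul() yields a NUL-terminated kernel copy that can be parsed directly; demo_write and demo_value are invented for this example:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>

static int demo_value;

static ssize_t demo_write(struct file *file, const char __user *ubuf,
                          size_t count, loff_t *ppos)
{
        char *kbuf = memdup_user_nul(ubuf, count);
        int err;

        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        err = kstrtoint(strstrip(kbuf), 0, &demo_value);
        kfree(kbuf);
        return err ? err : count;
}
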
     297             : /* Check if the vma is being used as a stack by this task */
     298           0 : int vma_is_stack_for_current(struct vm_area_struct *vma)
     299             : {
     300           0 :         struct task_struct * __maybe_unused t = current;
     301             : 
     302           0 :         return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
     303             : }
     304             : 
     305             : /*
     306             :  * Change backing file, only valid to use during initial VMA setup.
     307             :  */
     308           0 : void vma_set_file(struct vm_area_struct *vma, struct file *file)
     309             : {
     310             :         /* Changing an anonymous vma with this is illegal */
     311           0 :         get_file(file);
     312           0 :         swap(vma->vm_file, file);
     313           0 :         fput(file);
     314           0 : }
     315             : EXPORT_SYMBOL(vma_set_file);
     316             : 
     317             : #ifndef STACK_RND_MASK
     318             : #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
     319             : #endif
     320             : 
     321           0 : unsigned long randomize_stack_top(unsigned long stack_top)
     322             : {
     323           0 :         unsigned long random_variable = 0;
     324             : 
     325           0 :         if (current->flags & PF_RANDOMIZE) {
     326           0 :                 random_variable = get_random_long();
     327           0 :                 random_variable &= STACK_RND_MASK;
     328           0 :                 random_variable <<= PAGE_SHIFT;
     329             :         }
     330             : #ifdef CONFIG_STACK_GROWSUP
     331             :         return PAGE_ALIGN(stack_top) + random_variable;
     332             : #else
     333           0 :         return PAGE_ALIGN(stack_top) - random_variable;
     334             : #endif
     335             : }
     336             : 
     337             : /**
     338             :  * randomize_page - Generate a random, page aligned address
     339             :  * @start:      The smallest acceptable address the caller will take.
     340             :  * @range:      The size of the area, starting at @start, within which the
     341             :  *              random address must fall.
     342             :  *
     343             :  * If @start + @range would overflow, @range is capped.
     344             :  *
     345             :  * NOTE: Historical use of randomize_range, which this replaces, presumed that
     346             :  * @start was already page aligned.  We now align it regardless.
     347             :  *
     348             :  * Return: A page aligned address within [start, start + range).  On error,
     349             :  * @start is returned.
     350             :  */
     351           0 : unsigned long randomize_page(unsigned long start, unsigned long range)
     352             : {
     353           0 :         if (!PAGE_ALIGNED(start)) {
     354           0 :                 range -= PAGE_ALIGN(start) - start;
     355           0 :                 start = PAGE_ALIGN(start);
     356             :         }
     357             : 
     358           0 :         if (start > ULONG_MAX - range)
     359           0 :                 range = ULONG_MAX - start;
     360             : 
     361           0 :         range >>= PAGE_SHIFT;
     362             : 
     363           0 :         if (range == 0)
     364             :                 return start;
     365             : 
     366           0 :         return start + (get_random_long() % range << PAGE_SHIFT);
     367             : }
     368             : 
     369             : #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
     370             : unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
     371             : {
      372             :         /* Is the current task 32-bit? */
     373             :         if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
     374             :                 return randomize_page(mm->brk, SZ_32M);
     375             : 
     376             :         return randomize_page(mm->brk, SZ_1G);
     377             : }
     378             : 
     379             : unsigned long arch_mmap_rnd(void)
     380             : {
     381             :         unsigned long rnd;
     382             : 
     383             : #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
     384             :         if (is_compat_task())
     385             :                 rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
     386             :         else
     387             : #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
     388             :                 rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
     389             : 
     390             :         return rnd << PAGE_SHIFT;
     391             : }
     392             : 
     393             : static int mmap_is_legacy(struct rlimit *rlim_stack)
     394             : {
     395             :         if (current->personality & ADDR_COMPAT_LAYOUT)
     396             :                 return 1;
     397             : 
     398             :         if (rlim_stack->rlim_cur == RLIM_INFINITY)
     399             :                 return 1;
     400             : 
     401             :         return sysctl_legacy_va_layout;
     402             : }
     403             : 
     404             : /*
     405             :  * Leave enough space between the mmap area and the stack to honour ulimit in
     406             :  * the face of randomisation.
     407             :  */
     408             : #define MIN_GAP         (SZ_128M)
     409             : #define MAX_GAP         (STACK_TOP / 6 * 5)
     410             : 
     411             : static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
     412             : {
     413             :         unsigned long gap = rlim_stack->rlim_cur;
     414             :         unsigned long pad = stack_guard_gap;
     415             : 
     416             :         /* Account for stack randomization if necessary */
     417             :         if (current->flags & PF_RANDOMIZE)
     418             :                 pad += (STACK_RND_MASK << PAGE_SHIFT);
     419             : 
     420             :         /* Values close to RLIM_INFINITY can overflow. */
     421             :         if (gap + pad > gap)
     422             :                 gap += pad;
     423             : 
     424             :         if (gap < MIN_GAP)
     425             :                 gap = MIN_GAP;
     426             :         else if (gap > MAX_GAP)
     427             :                 gap = MAX_GAP;
     428             : 
     429             :         return PAGE_ALIGN(STACK_TOP - gap - rnd);
     430             : }
     431             : 
     432             : void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
     433             : {
     434             :         unsigned long random_factor = 0UL;
     435             : 
     436             :         if (current->flags & PF_RANDOMIZE)
     437             :                 random_factor = arch_mmap_rnd();
     438             : 
     439             :         if (mmap_is_legacy(rlim_stack)) {
     440             :                 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
     441             :                 mm->get_unmapped_area = arch_get_unmapped_area;
     442             :         } else {
     443             :                 mm->mmap_base = mmap_base(random_factor, rlim_stack);
     444             :                 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
     445             :         }
     446             : }
     447             : #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
     448           0 : void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
     449             : {
     450           0 :         mm->mmap_base = TASK_UNMAPPED_BASE;
     451           0 :         mm->get_unmapped_area = arch_get_unmapped_area;
     452           0 : }
     453             : #endif
     454             : 
     455             : /**
     456             :  * __account_locked_vm - account locked pages to an mm's locked_vm
     457             :  * @mm:          mm to account against
     458             :  * @pages:       number of pages to account
     459             :  * @inc:         %true if @pages should be considered positive, %false if not
     460             :  * @task:        task used to check RLIMIT_MEMLOCK
     461             :  * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
     462             :  *
     463             :  * Assumes @task and @mm are valid (i.e. at least one reference on each), and
     464             :  * that mmap_lock is held as writer.
     465             :  *
     466             :  * Return:
     467             :  * * 0       on success
     468             :  * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
     469             :  */
     470           0 : int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
     471             :                         struct task_struct *task, bool bypass_rlim)
     472             : {
     473             :         unsigned long locked_vm, limit;
     474           0 :         int ret = 0;
     475             : 
     476           0 :         mmap_assert_write_locked(mm);
     477             : 
     478           0 :         locked_vm = mm->locked_vm;
     479           0 :         if (inc) {
     480           0 :                 if (!bypass_rlim) {
     481           0 :                         limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
     482           0 :                         if (locked_vm + pages > limit)
     483           0 :                                 ret = -ENOMEM;
     484             :                 }
     485           0 :                 if (!ret)
     486           0 :                         mm->locked_vm = locked_vm + pages;
     487             :         } else {
     488           0 :                 WARN_ON_ONCE(pages > locked_vm);
     489           0 :                 mm->locked_vm = locked_vm - pages;
     490             :         }
     491             : 
     492             :         pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
     493             :                  (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
     494             :                  locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
     495             :                  ret ? " - exceeded" : "");
     496             : 
     497           0 :         return ret;
     498             : }
     499             : EXPORT_SYMBOL_GPL(__account_locked_vm);
     500             : 
     501             : /**
     502             :  * account_locked_vm - account locked pages to an mm's locked_vm
     503             :  * @mm:          mm to account against, may be NULL
     504             :  * @pages:       number of pages to account
     505             :  * @inc:         %true if @pages should be considered positive, %false if not
     506             :  *
     507             :  * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
     508             :  *
     509             :  * Return:
     510             :  * * 0       on success, or if mm is NULL
     511             :  * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
     512             :  */
     513           0 : int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
     514             : {
     515             :         int ret;
     516             : 
     517           0 :         if (pages == 0 || !mm)
     518             :                 return 0;
     519             : 
     520           0 :         mmap_write_lock(mm);
     521           0 :         ret = __account_locked_vm(mm, pages, inc, current,
     522           0 :                                   capable(CAP_IPC_LOCK));
     523           0 :         mmap_write_unlock(mm);
     524             : 
     525           0 :         return ret;
     526             : }
     527             : EXPORT_SYMBOL_GPL(account_locked_vm);
     528             : 
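A minimal sketch of how a hypothetical driver that pins user pages might charge and uncharge them against RLIMIT_MEMLOCK with account_locked_vm(); the demo_ helpers are invented for this example:

#include <linux/mm.h>
#include <linux/sched.h>

static int demo_charge_pinned(unsigned long npages)
{
        /* Returns -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */
        return account_locked_vm(current->mm, npages, true);
}

static void demo_uncharge_pinned(unsigned long npages)
{
        account_locked_vm(current->mm, npages, false);
}
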
     529           0 : unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
     530             :         unsigned long len, unsigned long prot,
     531             :         unsigned long flag, unsigned long pgoff)
     532             : {
     533             :         unsigned long ret;
     534           0 :         struct mm_struct *mm = current->mm;
     535             :         unsigned long populate;
     536           0 :         LIST_HEAD(uf);
     537             : 
     538           0 :         ret = security_mmap_file(file, prot, flag);
     539             :         if (!ret) {
     540           0 :                 if (mmap_write_lock_killable(mm))
     541             :                         return -EINTR;
     542           0 :                 ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
     543             :                               &uf);
     544           0 :                 mmap_write_unlock(mm);
     545           0 :                 userfaultfd_unmap_complete(mm, &uf);
     546           0 :                 if (populate)
     547           0 :                         mm_populate(ret, populate);
     548             :         }
     549             :         return ret;
     550             : }
     551             : 
     552           0 : unsigned long vm_mmap(struct file *file, unsigned long addr,
     553             :         unsigned long len, unsigned long prot,
     554             :         unsigned long flag, unsigned long offset)
     555             : {
     556           0 :         if (unlikely(offset + PAGE_ALIGN(len) < offset))
     557             :                 return -EINVAL;
     558           0 :         if (unlikely(offset_in_page(offset)))
     559             :                 return -EINVAL;
     560             : 
     561           0 :         return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
     562             : }
     563             : EXPORT_SYMBOL(vm_mmap);
     564             : 
     565             : /**
     566             :  * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
     567             :  * failure, fall back to non-contiguous (vmalloc) allocation.
     568             :  * @size: size of the request.
     569             :  * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
     570             :  * @node: numa node to allocate from
     571             :  *
     572             :  * Uses kmalloc to get the memory but if the allocation fails then falls back
     573             :  * to the vmalloc allocator. Use kvfree for freeing the memory.
     574             :  *
      575             :  * GFP_NOWAIT and GFP_ATOMIC are not supported, nor is the __GFP_NORETRY modifier.
     576             :  * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
     577             :  * preferable to the vmalloc fallback, due to visible performance drawbacks.
     578             :  *
      579             :  * Return: pointer to the allocated memory or %NULL in case of failure
     580             :  */
     581           0 : void *kvmalloc_node(size_t size, gfp_t flags, int node)
     582             : {
     583           0 :         gfp_t kmalloc_flags = flags;
     584             :         void *ret;
     585             : 
     586             :         /*
     587             :          * We want to attempt a large physically contiguous block first because
     588             :          * it is less likely to fragment multiple larger blocks and therefore
      589             :          * contributes less to long-term fragmentation than the vmalloc fallback.
      590             :          * However, make sure that larger requests are not too disruptive - no
      591             :          * OOM killer and no allocation failure warnings, as we have a fallback.
     592             :          */
     593           0 :         if (size > PAGE_SIZE) {
     594           0 :                 kmalloc_flags |= __GFP_NOWARN;
     595             : 
     596           0 :                 if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
     597           0 :                         kmalloc_flags |= __GFP_NORETRY;
     598             : 
     599             :                 /* nofail semantic is implemented by the vmalloc fallback */
     600           0 :                 kmalloc_flags &= ~__GFP_NOFAIL;
     601             :         }
     602             : 
     603           0 :         ret = kmalloc_node(size, kmalloc_flags, node);
     604             : 
     605             :         /*
      606             :          * It doesn't really make sense to fall back to vmalloc for sub-page
      607             :          * requests.
     608             :          */
     609           0 :         if (ret || size <= PAGE_SIZE)
     610             :                 return ret;
     611             : 
     612             :         /* non-sleeping allocations are not supported by vmalloc */
     613           0 :         if (!gfpflags_allow_blocking(flags))
     614             :                 return NULL;
     615             : 
     616             :         /* Don't even allow crazy sizes */
     617           0 :         if (unlikely(size > INT_MAX)) {
     618           0 :                 WARN_ON_ONCE(!(flags & __GFP_NOWARN));
     619             :                 return NULL;
     620             :         }
     621             : 
     622             :         /*
     623             :          * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
     624             :          * since the callers already cannot assume anything
     625             :          * about the resulting pointer, and cannot play
     626             :          * protection games.
     627             :          */
     628           0 :         return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
     629           0 :                         flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
     630           0 :                         node, __builtin_return_address(0));
     631             : }
     632             : EXPORT_SYMBOL(kvmalloc_node);
     633             : 
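A minimal sketch of the usual kvmalloc-family pattern for a table whose size may be too large for kmalloc() to satisfy reliably; kvfree() handles both the kmalloc case and the vmalloc fallback. The demo_ helpers are invented for this example:

#include <linux/slab.h>
#include <linux/types.h>

static u32 *demo_alloc_table(size_t nr_entries)
{
        /* Overflow-checked and zeroed; tries kmalloc first, then vmalloc. */
        return kvcalloc(nr_entries, sizeof(u32), GFP_KERNEL);
}

static void demo_free_table(u32 *table)
{
        kvfree(table);
}
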
     634             : /**
     635             :  * kvfree() - Free memory.
     636             :  * @addr: Pointer to allocated memory.
     637             :  *
     638             :  * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
      639             :  * It is slightly more efficient to use kfree() or vfree() if you know
      640             :  * which one to use.
     641             :  *
     642             :  * Context: Either preemptible task context or not-NMI interrupt.
     643             :  */
     644         262 : void kvfree(const void *addr)
     645             : {
     646         262 :         if (is_vmalloc_addr(addr))
     647           1 :                 vfree(addr);
     648             :         else
     649         261 :                 kfree(addr);
     650         262 : }
     651             : EXPORT_SYMBOL(kvfree);
     652             : 
     653             : /**
     654             :  * kvfree_sensitive - Free a data object containing sensitive information.
     655             :  * @addr: address of the data object to be freed.
     656             :  * @len: length of the data object.
     657             :  *
     658             :  * Use the special memzero_explicit() function to clear the content of a
     659             :  * kvmalloc'ed object containing sensitive data to make sure that the
     660             :  * compiler won't optimize out the data clearing.
     661             :  */
     662           0 : void kvfree_sensitive(const void *addr, size_t len)
     663             : {
     664           0 :         if (likely(!ZERO_OR_NULL_PTR(addr))) {
     665           0 :                 memzero_explicit((void *)addr, len);
     666           0 :                 kvfree(addr);
     667             :         }
     668           0 : }
     669             : EXPORT_SYMBOL(kvfree_sensitive);
     670             : 
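A minimal sketch of tearing down a buffer holding key material with kvfree_sensitive(), so the contents are wiped before the memory is returned; struct demo_key and demo_key_destroy are invented for this example:

#include <linux/slab.h>
#include <linux/types.h>

struct demo_key {
        size_t len;
        u8 *data;       /* kvmalloc()ed secret bytes */
};

static void demo_key_destroy(struct demo_key *key)
{
        kvfree_sensitive(key->data, key->len);  /* memzero_explicit(), then kvfree() */
        kfree(key);
}
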
     671           0 : void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
     672             : {
     673             :         void *newp;
     674             : 
     675           0 :         if (oldsize >= newsize)
     676             :                 return (void *)p;
     677           0 :         newp = kvmalloc(newsize, flags);
     678           0 :         if (!newp)
     679             :                 return NULL;
     680           0 :         memcpy(newp, p, oldsize);
     681           0 :         kvfree(p);
     682           0 :         return newp;
     683             : }
     684             : EXPORT_SYMBOL(kvrealloc);
     685             : 
     686             : /**
     687             :  * __vmalloc_array - allocate memory for a virtually contiguous array.
     688             :  * @n: number of elements.
     689             :  * @size: element size.
     690             :  * @flags: the type of memory to allocate (see kmalloc).
     691             :  */
     692           0 : void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
     693             : {
     694             :         size_t bytes;
     695             : 
     696           0 :         if (unlikely(check_mul_overflow(n, size, &bytes)))
     697             :                 return NULL;
     698           0 :         return __vmalloc(bytes, flags);
     699             : }
     700             : EXPORT_SYMBOL(__vmalloc_array);
     701             : 
     702             : /**
     703             :  * vmalloc_array - allocate memory for a virtually contiguous array.
     704             :  * @n: number of elements.
     705             :  * @size: element size.
     706             :  */
     707           0 : void *vmalloc_array(size_t n, size_t size)
     708             : {
     709           0 :         return __vmalloc_array(n, size, GFP_KERNEL);
     710             : }
     711             : EXPORT_SYMBOL(vmalloc_array);
     712             : 
     713             : /**
     714             :  * __vcalloc - allocate and zero memory for a virtually contiguous array.
     715             :  * @n: number of elements.
     716             :  * @size: element size.
     717             :  * @flags: the type of memory to allocate (see kmalloc).
     718             :  */
     719           0 : void *__vcalloc(size_t n, size_t size, gfp_t flags)
     720             : {
     721           0 :         return __vmalloc_array(n, size, flags | __GFP_ZERO);
     722             : }
     723             : EXPORT_SYMBOL(__vcalloc);
     724             : 
     725             : /**
     726             :  * vcalloc - allocate and zero memory for a virtually contiguous array.
     727             :  * @n: number of elements.
     728             :  * @size: element size.
     729             :  */
     730           0 : void *vcalloc(size_t n, size_t size)
     731             : {
     732           0 :         return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
     733             : }
     734             : EXPORT_SYMBOL(vcalloc);
     735             : 
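A minimal sketch of allocating a large, zeroed, virtually contiguous array of counters with vcalloc(); physical contiguity is not needed here and the n * size multiplication is overflow-checked. The demo_ helpers are invented for this example:

#include <linux/types.h>
#include <linux/vmalloc.h>

static u64 *demo_alloc_counters(size_t nr_items)
{
        return vcalloc(nr_items, sizeof(u64));
}

static void demo_free_counters(u64 *counters)
{
        vfree(counters);        /* vmalloc-backed, so vfree(), not kfree() */
}
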
     736             : /* Neutral page->mapping pointer to address_space or anon_vma or other */
     737           0 : void *page_rmapping(struct page *page)
     738             : {
     739           0 :         return folio_raw_mapping(page_folio(page));
     740             : }
     741             : 
     742           0 : struct anon_vma *folio_anon_vma(struct folio *folio)
     743             : {
     744           0 :         unsigned long mapping = (unsigned long)folio->mapping;
     745             : 
     746           0 :         if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     747             :                 return NULL;
     748           0 :         return (void *)(mapping - PAGE_MAPPING_ANON);
     749             : }
     750             : 
     751             : /**
     752             :  * folio_mapping - Find the mapping where this folio is stored.
     753             :  * @folio: The folio.
     754             :  *
     755             :  * For folios which are in the page cache, return the mapping that this
     756             :  * page belongs to.  Folios in the swap cache return the swap mapping
     757             :  * this page is stored in (which is different from the mapping for the
     758             :  * swap file or swap device where the data is stored).
     759             :  *
     760             :  * You can call this for folios which aren't in the swap cache or page
     761             :  * cache and it will return NULL.
     762             :  */
     763           0 : struct address_space *folio_mapping(struct folio *folio)
     764             : {
     765             :         struct address_space *mapping;
     766             : 
     767             :         /* This happens if someone calls flush_dcache_page on slab page */
     768           0 :         if (unlikely(folio_test_slab(folio)))
     769             :                 return NULL;
     770             : 
     771           0 :         if (unlikely(folio_test_swapcache(folio)))
     772           0 :                 return swap_address_space(folio_swap_entry(folio));
     773             : 
     774           0 :         mapping = folio->mapping;
     775           0 :         if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
     776             :                 return NULL;
     777             : 
     778           0 :         return mapping;
     779             : }
     780             : EXPORT_SYMBOL(folio_mapping);
     781             : 
     782             : /**
     783             :  * folio_copy - Copy the contents of one folio to another.
     784             :  * @dst: Folio to copy to.
     785             :  * @src: Folio to copy from.
     786             :  *
     787             :  * The bytes in the folio represented by @src are copied to @dst.
     788             :  * Assumes the caller has validated that @dst is at least as large as @src.
     789             :  * Can be called in atomic context for order-0 folios, but if the folio is
     790             :  * larger, it may sleep.
     791             :  */
     792           0 : void folio_copy(struct folio *dst, struct folio *src)
     793             : {
     794           0 :         long i = 0;
     795           0 :         long nr = folio_nr_pages(src);
     796             : 
     797             :         for (;;) {
     798           0 :                 copy_highpage(folio_page(dst, i), folio_page(src, i));
     799           0 :                 if (++i == nr)
     800             :                         break;
     801           0 :                 cond_resched();
     802             :         }
     803           0 : }
     804             : 
     805             : int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
     806             : int sysctl_overcommit_ratio __read_mostly = 50;
     807             : unsigned long sysctl_overcommit_kbytes __read_mostly;
     808             : int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
     809             : unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
     810             : unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
     811             : 
     812           0 : int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
     813             :                 size_t *lenp, loff_t *ppos)
     814             : {
     815             :         int ret;
     816             : 
     817           0 :         ret = proc_dointvec(table, write, buffer, lenp, ppos);
     818           0 :         if (ret == 0 && write)
     819           0 :                 sysctl_overcommit_kbytes = 0;
     820           0 :         return ret;
     821             : }
     822             : 
     823           0 : static void sync_overcommit_as(struct work_struct *dummy)
     824             : {
     825           0 :         percpu_counter_sync(&vm_committed_as);
     826           0 : }
     827             : 
     828           0 : int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
     829             :                 size_t *lenp, loff_t *ppos)
     830             : {
     831             :         struct ctl_table t;
     832           0 :         int new_policy = -1;
     833             :         int ret;
     834             : 
     835             :         /*
      836             :          * The deviation of sync_overcommit_as can be large with a loose policy
      837             :          * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing the policy to
      838             :          * the strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
      839             :          * with the strict "NEVER", and to avoid a possible race condition (even
      840             :          * though users usually won't switch to OVERCOMMIT_NEVER very often),
      841             :          * the switch is done in the following order:
     842             :          *      1. changing the batch
     843             :          *      2. sync percpu count on each CPU
     844             :          *      3. switch the policy
     845             :          */
     846           0 :         if (write) {
     847           0 :                 t = *table;
     848           0 :                 t.data = &new_policy;
     849           0 :                 ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
     850           0 :                 if (ret || new_policy == -1)
     851             :                         return ret;
     852             : 
     853           0 :                 mm_compute_batch(new_policy);
     854           0 :                 if (new_policy == OVERCOMMIT_NEVER)
     855           0 :                         schedule_on_each_cpu(sync_overcommit_as);
     856           0 :                 sysctl_overcommit_memory = new_policy;
     857             :         } else {
     858           0 :                 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
     859             :         }
     860             : 
     861             :         return ret;
     862             : }
     863             : 
     864           0 : int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
     865             :                 size_t *lenp, loff_t *ppos)
     866             : {
     867             :         int ret;
     868             : 
     869           0 :         ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
     870           0 :         if (ret == 0 && write)
     871           0 :                 sysctl_overcommit_ratio = 0;
     872           0 :         return ret;
     873             : }
     874             : 
     875             : /*
     876             :  * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
     877             :  */
     878           0 : unsigned long vm_commit_limit(void)
     879             : {
     880             :         unsigned long allowed;
     881             : 
     882           0 :         if (sysctl_overcommit_kbytes)
     883           0 :                 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
     884             :         else
     885           0 :                 allowed = ((totalram_pages() - hugetlb_total_pages())
     886           0 :                            * sysctl_overcommit_ratio / 100);
     887           0 :         allowed += total_swap_pages;
     888             : 
     889           0 :         return allowed;
     890             : }
     891             : 
     892             : /*
      893             :  * Make sure vm_committed_as is in its own cacheline and does not share a
      894             :  * cacheline with other variables. It can be updated frequently by several CPUs.
     895             :  */
     896             : struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
     897             : 
     898             : /*
     899             :  * The global memory commitment made in the system can be a metric
     900             :  * that can be used to drive ballooning decisions when Linux is hosted
     901             :  * as a guest. On Hyper-V, the host implements a policy engine for dynamically
     902             :  * balancing memory across competing virtual machines that are hosted.
     903             :  * Several metrics drive this policy engine including the guest reported
     904             :  * memory commitment.
     905             :  *
      906             :  * The time cost of this is very low for small platforms. For a big
      907             :  * platform like a 2S/36C/72T Skylake server, in the worst case where
     908             :  * vm_committed_as's spinlock is under severe contention, the time cost
     909             :  * could be about 30~40 microseconds.
     910             :  */
     911           0 : unsigned long vm_memory_committed(void)
     912             : {
     913           0 :         return percpu_counter_sum_positive(&vm_committed_as);
     914             : }
     915             : EXPORT_SYMBOL_GPL(vm_memory_committed);
     916             : 
     917             : /*
     918             :  * Check that a process has enough memory to allocate a new virtual
     919             :  * mapping. 0 means there is enough memory for the allocation to
     920             :  * succeed and -ENOMEM implies there is not.
     921             :  *
     922             :  * We currently support three overcommit policies, which are set via the
     923             :  * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
     924             :  *
     925             :  * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
     926             :  * Additional code 2002 Jul 20 by Robert Love.
     927             :  *
     928             :  * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
     929             :  *
     930             :  * Note this is a helper function intended to be used by LSMs which
     931             :  * wish to use this logic.
     932             :  */
     933           0 : int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
     934             : {
     935             :         long allowed;
     936             : 
     937           0 :         vm_acct_memory(pages);
     938             : 
     939             :         /*
     940             :          * Sometimes we want to use more memory than we have
     941             :          */
     942           0 :         if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
     943             :                 return 0;
     944             : 
     945           0 :         if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
     946           0 :                 if (pages > totalram_pages() + total_swap_pages)
     947             :                         goto error;
     948             :                 return 0;
     949             :         }
     950             : 
     951           0 :         allowed = vm_commit_limit();
     952             :         /*
     953             :          * Reserve some for root
     954             :          */
     955           0 :         if (!cap_sys_admin)
     956           0 :                 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
     957             : 
     958             :         /*
     959             :          * Don't let a single process grow so big a user can't recover
     960             :          */
     961           0 :         if (mm) {
     962           0 :                 long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
     963             : 
     964           0 :                 allowed -= min_t(long, mm->total_vm / 32, reserve);
     965             :         }
     966             : 
     967           0 :         if (percpu_counter_read_positive(&vm_committed_as) < allowed)
     968             :                 return 0;
     969             : error:
     970           0 :         pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n",
     971             :                             __func__, current->pid, current->comm);
     972           0 :         vm_unacct_memory(pages);
     973             : 
     974           0 :         return -ENOMEM;
     975             : }
     976             : 
     977             : /**
     978             :  * get_cmdline() - copy the cmdline value to a buffer.
     979             :  * @task:     the task whose cmdline value to copy.
     980             :  * @buffer:   the buffer to copy to.
     981             :  * @buflen:   the length of the buffer. Larger cmdline values are truncated
     982             :  *            to this length.
     983             :  *
     984             :  * Return: the size of the cmdline field copied. Note that the copy does
      985             :  * not guarantee a terminating NUL byte.
     986             :  */
     987           0 : int get_cmdline(struct task_struct *task, char *buffer, int buflen)
     988             : {
     989           0 :         int res = 0;
     990             :         unsigned int len;
     991           0 :         struct mm_struct *mm = get_task_mm(task);
     992             :         unsigned long arg_start, arg_end, env_start, env_end;
     993           0 :         if (!mm)
     994             :                 goto out;
     995           0 :         if (!mm->arg_end)
     996             :                 goto out_mm;    /* Shh! No looking before we're done */
     997             : 
     998           0 :         spin_lock(&mm->arg_lock);
     999           0 :         arg_start = mm->arg_start;
    1000           0 :         arg_end = mm->arg_end;
    1001           0 :         env_start = mm->env_start;
    1002           0 :         env_end = mm->env_end;
    1003           0 :         spin_unlock(&mm->arg_lock);
    1004             : 
    1005           0 :         len = arg_end - arg_start;
    1006             : 
    1007           0 :         if (len > buflen)
    1008           0 :                 len = buflen;
    1009             : 
    1010           0 :         res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
    1011             : 
    1012             :         /*
    1013             :          * If the nul at the end of args has been overwritten, then
    1014             :          * assume application is using setproctitle(3).
    1015             :          */
    1016           0 :         if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
    1017           0 :                 len = strnlen(buffer, res);
    1018           0 :                 if (len < res) {
    1019           0 :                         res = len;
    1020             :                 } else {
    1021           0 :                         len = env_end - env_start;
    1022           0 :                         if (len > buflen - res)
    1023           0 :                                 len = buflen - res;
    1024           0 :                         res += access_process_vm(task, env_start,
    1025           0 :                                                  buffer+res, len,
    1026             :                                                  FOLL_FORCE);
    1027           0 :                         res = strnlen(buffer, res);
    1028             :                 }
    1029             :         }
    1030             : out_mm:
    1031           0 :         mmput(mm);
    1032             : out:
    1033           0 :         return res;
    1034             : }
    1035             : 
    1036           0 : int __weak memcmp_pages(struct page *page1, struct page *page2)
    1037             : {
    1038             :         char *addr1, *addr2;
    1039             :         int ret;
    1040             : 
    1041           0 :         addr1 = kmap_atomic(page1);
    1042           0 :         addr2 = kmap_atomic(page2);
    1043           0 :         ret = memcmp(addr1, addr2, PAGE_SIZE);
    1044           0 :         kunmap_atomic(addr2);
    1045           0 :         kunmap_atomic(addr1);
    1046           0 :         return ret;
    1047             : }
    1048             : 
    1049             : #ifdef CONFIG_PRINTK
    1050             : /**
    1051             :  * mem_dump_obj - Print available provenance information
    1052             :  * @object: object for which to find provenance information.
    1053             :  *
    1054             :  * This function uses pr_cont(), so that the caller is expected to have
    1055             :  * printed out whatever preamble is appropriate.  The provenance information
    1056             :  * depends on the type of object and on how much debugging is enabled.
    1057             :  * For example, for a slab-cache object, the slab name is printed, and,
    1058             :  * if available, the return address and stack trace from the allocation
    1059             :  * and last free path of that object.
    1060             :  */
    1061           0 : void mem_dump_obj(void *object)
    1062             : {
    1063             :         const char *type;
    1064             : 
    1065           0 :         if (kmem_valid_obj(object)) {
    1066           0 :                 kmem_dump_obj(object);
    1067           0 :                 return;
    1068             :         }
    1069             : 
    1070           0 :         if (vmalloc_dump_obj(object))
    1071             :                 return;
    1072             : 
    1073           0 :         if (virt_addr_valid(object))
    1074             :                 type = "non-slab/vmalloc memory";
    1075           0 :         else if (object == NULL)
    1076             :                 type = "NULL pointer";
    1077           0 :         else if (object == ZERO_SIZE_PTR)
    1078             :                 type = "zero-size pointer";
    1079             :         else
    1080           0 :                 type = "non-paged memory";
    1081             : 
    1082           0 :         pr_cont(" %s\n", type);
    1083             : }
    1084             : EXPORT_SYMBOL_GPL(mem_dump_obj);
    1085             : #endif
    1086             : 
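A minimal sketch of the calling pattern the kernel-doc above describes: print a preamble without a trailing newline, then let mem_dump_obj() append the provenance via pr_cont(); demo_report_object is invented for this example:

#include <linux/mm.h>
#include <linux/printk.h>

static void demo_report_object(void *obj)
{
        pr_alert("demo: suspicious object %px:", obj);
        mem_dump_obj(obj);      /* appends slab/vmalloc provenance or a type note */
}
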
    1087             : /*
    1088             :  * A driver might set a page logically offline -- PageOffline() -- and
    1089             :  * turn the page inaccessible in the hypervisor; after that, access to page
    1090             :  * content can be fatal.
    1091             :  *
    1092             :  * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
    1093             :  * pages after checking PageOffline(); however, these PFN walkers can race
    1094             :  * with drivers that set PageOffline().
    1095             :  *
    1096             :  * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
    1097             :  * synchronize with such drivers, achieving that a page cannot be set
    1098             :  * PageOffline() while frozen.
    1099             :  *
    1100             :  * page_offline_begin()/page_offline_end() is used by drivers that care about
    1101             :  * such races when setting a page PageOffline().
    1102             :  */
    1103             : static DECLARE_RWSEM(page_offline_rwsem);
    1104             : 
    1105           0 : void page_offline_freeze(void)
    1106             : {
    1107           0 :         down_read(&page_offline_rwsem);
    1108           0 : }
    1109             : 
    1110           0 : void page_offline_thaw(void)
    1111             : {
    1112           0 :         up_read(&page_offline_rwsem);
    1113           0 : }
    1114             : 
    1115           0 : void page_offline_begin(void)
    1116             : {
    1117           0 :         down_write(&page_offline_rwsem);
    1118           0 : }
    1119             : EXPORT_SYMBOL(page_offline_begin);
    1120             : 
    1121           0 : void page_offline_end(void)
    1122             : {
    1123           0 :         up_write(&page_offline_rwsem);
    1124           0 : }
    1125             : EXPORT_SYMBOL(page_offline_end);
    1126             : 
    1127             : #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
    1128             : void flush_dcache_folio(struct folio *folio)
    1129             : {
    1130             :         long i, nr = folio_nr_pages(folio);
    1131             : 
    1132             :         for (i = 0; i < nr; i++)
    1133             :                 flush_dcache_page(folio_page(folio, i));
    1134             : }
    1135             : EXPORT_SYMBOL(flush_dcache_folio);
    1136             : #endif

Generated by: LCOV version 1.14