LCOV - code coverage report
Current view: top level - mm - util.c (source / functions)
Test: coverage.info
Date: 2023-08-24 13:40:31

                  Hit      Total    Coverage
Lines:             24        293       8.2 %
Functions:          5         46      10.9 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : #include <linux/mm.h>
       3             : #include <linux/slab.h>
       4             : #include <linux/string.h>
       5             : #include <linux/compiler.h>
       6             : #include <linux/export.h>
       7             : #include <linux/err.h>
       8             : #include <linux/sched.h>
       9             : #include <linux/sched/mm.h>
      10             : #include <linux/sched/signal.h>
      11             : #include <linux/sched/task_stack.h>
      12             : #include <linux/security.h>
      13             : #include <linux/swap.h>
      14             : #include <linux/swapops.h>
      15             : #include <linux/mman.h>
      16             : #include <linux/hugetlb.h>
      17             : #include <linux/vmalloc.h>
      18             : #include <linux/userfaultfd_k.h>
      19             : #include <linux/elf.h>
      20             : #include <linux/elf-randomize.h>
      21             : #include <linux/personality.h>
      22             : #include <linux/random.h>
      23             : #include <linux/processor.h>
      24             : #include <linux/sizes.h>
      25             : #include <linux/compat.h>
      26             : 
      27             : #include <linux/uaccess.h>
      28             : 
      29             : #include "internal.h"
      30             : #include "swap.h"
      31             : 
      32             : /**
      33             :  * kfree_const - conditionally free memory
      34             :  * @x: pointer to the memory
      35             :  *
      36             :  * Calls kfree() only if @x is not in the .rodata section.
      37             :  */
      38         970 : void kfree_const(const void *x)
      39             : {
      40        1940 :         if (!is_kernel_rodata((unsigned long)x))
      41         742 :                 kfree(x);
      42         970 : }
      43             : EXPORT_SYMBOL(kfree_const);
      44             : 
      45             : /**
      46             :  * kstrdup - allocate space for and copy an existing string
      47             :  * @s: the string to duplicate
      48             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
      49             :  *
      50             :  * Return: newly allocated copy of @s or %NULL in case of error
      51             :  */
      52             : noinline
      53        2255 : char *kstrdup(const char *s, gfp_t gfp)
      54             : {
      55             :         size_t len;
      56             :         char *buf;
      57             : 
      58        2255 :         if (!s)
      59             :                 return NULL;
      60             : 
      61        2253 :         len = strlen(s) + 1;
      62        2253 :         buf = kmalloc_track_caller(len, gfp);
      63        2253 :         if (buf)
      64        4506 :                 memcpy(buf, s, len);
      65             :         return buf;
      66             : }
      67             : EXPORT_SYMBOL(kstrdup);
      68             : 
      69             : /**
      70             :  * kstrdup_const - conditionally duplicate an existing const string
      71             :  * @s: the string to duplicate
      72             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
      73             :  *
      74             :  * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
      75             :  * must not be passed to krealloc().
      76             :  *
      77             :  * Return: the source string if it is in the .rodata section; otherwise
      78             :  * fall back to kstrdup().
      79             :  */
      80        9209 : const char *kstrdup_const(const char *s, gfp_t gfp)
      81             : {
      82       18418 :         if (is_kernel_rodata((unsigned long)s))
      83             :                 return s;
      84             : 
      85        2236 :         return kstrdup(s, gfp);
      86             : }
      87             : EXPORT_SYMBOL(kstrdup_const);
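
Usage sketch for the kstrdup_const()/kfree_const() pairing described above. The struct demo_obj type and the demo_* helpers are hypothetical; only the allocation/free discipline comes from the kerneldoc.

    #include <linux/errno.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    /* Hypothetical object whose name may point into .rodata or at a heap copy. */
    struct demo_obj {
            const char *name;
    };

    static int demo_set_name(struct demo_obj *obj, const char *name)
    {
            /* kstrdup_const() skips the copy when @name lives in .rodata. */
            obj->name = kstrdup_const(name, GFP_KERNEL);
            return obj->name ? 0 : -ENOMEM;
    }

    static void demo_release(struct demo_obj *obj)
    {
            /* Must pair with kfree_const(); never pass this pointer to krealloc(). */
            kfree_const(obj->name);
    }
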
      88             : 
      89             : /**
      90             :  * kstrndup - allocate space for and copy an existing string
      91             :  * @s: the string to duplicate
      92             :  * @max: read at most @max chars from @s
      93             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
      94             :  *
      95             :  * Note: Use kmemdup_nul() instead if the size is known exactly.
      96             :  *
      97             :  * Return: newly allocated copy of @s or %NULL in case of error
      98             :  */
      99           0 : char *kstrndup(const char *s, size_t max, gfp_t gfp)
     100             : {
     101             :         size_t len;
     102             :         char *buf;
     103             : 
     104           0 :         if (!s)
     105             :                 return NULL;
     106             : 
     107           0 :         len = strnlen(s, max);
     108           0 :         buf = kmalloc_track_caller(len+1, gfp);
     109           0 :         if (buf) {
     110           0 :                 memcpy(buf, s, len);
     111           0 :                 buf[len] = '\0';
     112             :         }
     113             :         return buf;
     114             : }
     115             : EXPORT_SYMBOL(kstrndup);
     116             : 
     117             : /**
     118             :  * kmemdup - duplicate region of memory
     119             :  *
     120             :  * @src: memory region to duplicate
     121             :  * @len: memory region length
     122             :  * @gfp: GFP mask to use
     123             :  *
     124             :  * Return: newly allocated copy of @src or %NULL in case of error,
     125             :  * result is physically contiguous. Use kfree() to free.
     126             :  */
     127           2 : void *kmemdup(const void *src, size_t len, gfp_t gfp)
     128             : {
     129             :         void *p;
     130             : 
     131           2 :         p = kmalloc_track_caller(len, gfp);
     132           2 :         if (p)
     133           4 :                 memcpy(p, src, len);
     134           2 :         return p;
     135             : }
     136             : EXPORT_SYMBOL(kmemdup);
     137             : 
     138             : /**
     139             :  * kvmemdup - duplicate region of memory
     140             :  *
     141             :  * @src: memory region to duplicate
     142             :  * @len: memory region length
     143             :  * @gfp: GFP mask to use
     144             :  *
     145             :  * Return: newly allocated copy of @src or %NULL in case of error,
     146             :  * result may not be physically contiguous. Use kvfree() to free.
     147             :  */
     148           0 : void *kvmemdup(const void *src, size_t len, gfp_t gfp)
     149             : {
     150             :         void *p;
     151             : 
     152           0 :         p = kvmalloc(len, gfp);
     153           0 :         if (p)
     154           0 :                 memcpy(p, src, len);
     155           0 :         return p;
     156             : }
     157             : EXPORT_SYMBOL(kvmemdup);
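
Usage sketch for the two duplication helpers above; struct config_blob and the demo_* functions are hypothetical. The key point from the kerneldoc is the matching free routine: kfree() for kmemdup(), kvfree() for kvmemdup().

    #include <linux/mm.h>
    #include <linux/slab.h>
    #include <linux/string.h>
    #include <linux/types.h>

    struct config_blob {
            u32 flags;
            u8 payload[60];
    };

    static struct config_blob *demo_clone_config(const struct config_blob *src)
    {
            /* Small fixed-size object: physically contiguous copy, kfree() later. */
            return kmemdup(src, sizeof(*src), GFP_KERNEL);
    }

    static void *demo_clone_big(const void *src, size_t len)
    {
            /* Possibly large blob: may be vmalloc-backed, so kvfree() later. */
            return kvmemdup(src, len, GFP_KERNEL);
    }
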
     158             : 
     159             : /**
     160             :  * kmemdup_nul - Create a NUL-terminated string from unterminated data
     161             :  * @s: The data to stringify
     162             :  * @len: The size of the data
     163             :  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
     164             :  *
     165             :  * Return: newly allocated copy of @s with NUL-termination or %NULL in
     166             :  * case of error
     167             :  */
     168          15 : char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
     169             : {
     170             :         char *buf;
     171             : 
     172          15 :         if (!s)
     173             :                 return NULL;
     174             : 
     175          15 :         buf = kmalloc_track_caller(len + 1, gfp);
     176          15 :         if (buf) {
     177          30 :                 memcpy(buf, s, len);
     178          15 :                 buf[len] = '\0';
     179             :         }
     180             :         return buf;
     181             : }
     182             : EXPORT_SYMBOL(kmemdup_nul);
     183             : 
     184             : /**
     185             :  * memdup_user - duplicate memory region from user space
     186             :  *
     187             :  * @src: source address in user space
     188             :  * @len: number of bytes to copy
     189             :  *
     190             :  * Return: an ERR_PTR() on failure.  Result is physically
     191             :  * contiguous, to be freed by kfree().
     192             :  */
     193           0 : void *memdup_user(const void __user *src, size_t len)
     194             : {
     195             :         void *p;
     196             : 
     197           0 :         p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
     198           0 :         if (!p)
     199             :                 return ERR_PTR(-ENOMEM);
     200             : 
     201           0 :         if (copy_from_user(p, src, len)) {
     202           0 :                 kfree(p);
     203           0 :                 return ERR_PTR(-EFAULT);
     204             :         }
     205             : 
     206             :         return p;
     207             : }
     208             : EXPORT_SYMBOL(memdup_user);
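
Sketch of the usual calling pattern for memdup_user(); demo_ioctl_copy() is a hypothetical ioctl-style handler. Failure is reported via ERR_PTR(), never NULL, so the caller checks IS_ERR().

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/string.h>
    #include <linux/uaccess.h>

    static long demo_ioctl_copy(void __user *uarg, size_t len)
    {
            void *kbuf = memdup_user(uarg, len);

            if (IS_ERR(kbuf))
                    return PTR_ERR(kbuf);   /* -ENOMEM or -EFAULT */

            /* ... operate on the kernel copy ... */

            kfree(kbuf);
            return 0;
    }
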
     209             : 
     210             : /**
     211             :  * vmemdup_user - duplicate memory region from user space
     212             :  *
     213             :  * @src: source address in user space
     214             :  * @len: number of bytes to copy
     215             :  *
     216             :  * Return: an ERR_PTR() on failure.  Result may not be
     217             :  * physically contiguous.  Use kvfree() to free.
     218             :  */
     219           0 : void *vmemdup_user(const void __user *src, size_t len)
     220             : {
     221             :         void *p;
     222             : 
     223           0 :         p = kvmalloc(len, GFP_USER);
     224           0 :         if (!p)
     225             :                 return ERR_PTR(-ENOMEM);
     226             : 
     227           0 :         if (copy_from_user(p, src, len)) {
     228           0 :                 kvfree(p);
     229           0 :                 return ERR_PTR(-EFAULT);
     230             :         }
     231             : 
     232             :         return p;
     233             : }
     234             : EXPORT_SYMBOL(vmemdup_user);
     235             : 
     236             : /**
     237             :  * strndup_user - duplicate an existing string from user space
     238             :  * @s: The string to duplicate
     239             :  * @n: Maximum number of bytes to copy, including the trailing NUL.
     240             :  *
     241             :  * Return: newly allocated copy of @s or an ERR_PTR() in case of error
     242             :  */
     243           0 : char *strndup_user(const char __user *s, long n)
     244             : {
     245             :         char *p;
     246             :         long length;
     247             : 
     248           0 :         length = strnlen_user(s, n);
     249             : 
     250           0 :         if (!length)
     251             :                 return ERR_PTR(-EFAULT);
     252             : 
     253           0 :         if (length > n)
     254             :                 return ERR_PTR(-EINVAL);
     255             : 
     256           0 :         p = memdup_user(s, length);
     257             : 
     258           0 :         if (IS_ERR(p))
     259             :                 return p;
     260             : 
     261           0 :         p[length - 1] = '\0';
     262             : 
     263           0 :         return p;
     264             : }
     265             : EXPORT_SYMBOL(strndup_user);
     266             : 
     267             : /**
     268             :  * memdup_user_nul - duplicate memory region from user space and NUL-terminate
     269             :  *
     270             :  * @src: source address in user space
     271             :  * @len: number of bytes to copy
     272             :  *
     273             :  * Return: an ERR_PTR() on failure.
     274             :  */
     275           0 : void *memdup_user_nul(const void __user *src, size_t len)
     276             : {
     277             :         char *p;
     278             : 
     279             :         /*
     280             :          * Always use GFP_KERNEL, since copy_from_user() can sleep and
     281             :          * cause a page fault, which makes it pointless to use GFP_NOFS
     282             :          * or GFP_ATOMIC.
     283             :          */
     284           0 :         p = kmalloc_track_caller(len + 1, GFP_KERNEL);
     285           0 :         if (!p)
     286             :                 return ERR_PTR(-ENOMEM);
     287             : 
     288           0 :         if (copy_from_user(p, src, len)) {
     289           0 :                 kfree(p);
     290           0 :                 return ERR_PTR(-EFAULT);
     291             :         }
     292           0 :         p[len] = '\0';
     293             : 
     294           0 :         return p;
     295             : }
     296             : EXPORT_SYMBOL(memdup_user_nul);
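
Sketch of the typical memdup_user_nul() consumer: a write() handler that wants a NUL-terminated command string. The demo_write() handler is hypothetical; the pattern follows the kerneldoc above.

    #include <linux/err.h>
    #include <linux/fs.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static ssize_t demo_write(struct file *file, const char __user *ubuf,
                              size_t count, loff_t *ppos)
    {
            /* User data is not guaranteed to be NUL-terminated. */
            char *cmd = memdup_user_nul(ubuf, count);

            if (IS_ERR(cmd))
                    return PTR_ERR(cmd);

            /* ... cmd can now be parsed with ordinary string functions ... */

            kfree(cmd);
            return count;
    }
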
     297             : 
     298             : /* Check if the vma is being used as a stack by this task */
     299           0 : int vma_is_stack_for_current(struct vm_area_struct *vma)
     300             : {
     301           0 :         struct task_struct * __maybe_unused t = current;
     302             : 
     303           0 :         return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
     304             : }
     305             : 
     306             : /*
     307             :  * Change backing file, only valid to use during initial VMA setup.
     308             :  */
     309           0 : void vma_set_file(struct vm_area_struct *vma, struct file *file)
     310             : {
     311             :         /* Changing an anonymous vma with this is illegal */
     312           0 :         get_file(file);
     313           0 :         swap(vma->vm_file, file);
     314           0 :         fput(file);
     315           0 : }
     316             : EXPORT_SYMBOL(vma_set_file);
     317             : 
     318             : #ifndef STACK_RND_MASK
     319             : #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
     320             : #endif
     321             : 
     322           0 : unsigned long randomize_stack_top(unsigned long stack_top)
     323             : {
     324           0 :         unsigned long random_variable = 0;
     325             : 
     326           0 :         if (current->flags & PF_RANDOMIZE) {
     327           0 :                 random_variable = get_random_long();
     328           0 :                 random_variable &= STACK_RND_MASK;
     329           0 :                 random_variable <<= PAGE_SHIFT;
     330             :         }
     331             : #ifdef CONFIG_STACK_GROWSUP
     332             :         return PAGE_ALIGN(stack_top) + random_variable;
     333             : #else
     334           0 :         return PAGE_ALIGN(stack_top) - random_variable;
     335             : #endif
     336             : }
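
A quick sanity check of the fallback mask above, assuming 4 KiB pages (PAGE_SHIFT == 12): STACK_RND_MASK is 0x7ff, so randomize_stack_top() can move the stack top by at most 0x7ff << 12 = 2047 * 4096 bytes, just under 8 MiB of virtual address space, which matches the "8MB of VA" comment.
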
     337             : 
     338             : /**
     339             :  * randomize_page - Generate a random, page aligned address
     340             :  * @start:      The smallest acceptable address the caller will take.
     341             :  * @range:      The size of the area, starting at @start, within which the
     342             :  *              random address must fall.
     343             :  *
     344             :  * If @start + @range would overflow, @range is capped.
     345             :  *
     346             :  * NOTE: Historical use of randomize_range, which this replaces, presumed that
     347             :  * @start was already page aligned.  We now align it regardless.
     348             :  *
     349             :  * Return: A page aligned address within [start, start + range).  On error,
     350             :  * @start is returned.
     351             :  */
     352           0 : unsigned long randomize_page(unsigned long start, unsigned long range)
     353             : {
     354           0 :         if (!PAGE_ALIGNED(start)) {
     355           0 :                 range -= PAGE_ALIGN(start) - start;
     356           0 :                 start = PAGE_ALIGN(start);
     357             :         }
     358             : 
     359           0 :         if (start > ULONG_MAX - range)
     360           0 :                 range = ULONG_MAX - start;
     361             : 
     362           0 :         range >>= PAGE_SHIFT;
     363             : 
     364           0 :         if (range == 0)
     365             :                 return start;
     366             : 
     367           0 :         return start + (get_random_long() % range << PAGE_SHIFT);
     368             : }
     369             : 
     370             : #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
     371             : unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
     372             : {
     373             :         /* Is the current task 32-bit? */
     374             :         if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
     375             :                 return randomize_page(mm->brk, SZ_32M);
     376             : 
     377             :         return randomize_page(mm->brk, SZ_1G);
     378             : }
     379             : 
     380             : unsigned long arch_mmap_rnd(void)
     381             : {
     382             :         unsigned long rnd;
     383             : 
     384             : #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
     385             :         if (is_compat_task())
     386             :                 rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
     387             :         else
     388             : #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
     389             :                 rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
     390             : 
     391             :         return rnd << PAGE_SHIFT;
     392             : }
     393             : 
     394             : static int mmap_is_legacy(struct rlimit *rlim_stack)
     395             : {
     396             :         if (current->personality & ADDR_COMPAT_LAYOUT)
     397             :                 return 1;
     398             : 
     399             :         if (rlim_stack->rlim_cur == RLIM_INFINITY)
     400             :                 return 1;
     401             : 
     402             :         return sysctl_legacy_va_layout;
     403             : }
     404             : 
     405             : /*
     406             :  * Leave enough space between the mmap area and the stack to honour ulimit in
     407             :  * the face of randomisation.
     408             :  */
     409             : #define MIN_GAP         (SZ_128M)
     410             : #define MAX_GAP         (STACK_TOP / 6 * 5)
     411             : 
     412             : static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
     413             : {
     414             :         unsigned long gap = rlim_stack->rlim_cur;
     415             :         unsigned long pad = stack_guard_gap;
     416             : 
     417             :         /* Account for stack randomization if necessary */
     418             :         if (current->flags & PF_RANDOMIZE)
     419             :                 pad += (STACK_RND_MASK << PAGE_SHIFT);
     420             : 
     421             :         /* Values close to RLIM_INFINITY can overflow. */
     422             :         if (gap + pad > gap)
     423             :                 gap += pad;
     424             : 
     425             :         if (gap < MIN_GAP)
     426             :                 gap = MIN_GAP;
     427             :         else if (gap > MAX_GAP)
     428             :                 gap = MAX_GAP;
     429             : 
     430             :         return PAGE_ALIGN(STACK_TOP - gap - rnd);
     431             : }
     432             : 
     433             : void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
     434             : {
     435             :         unsigned long random_factor = 0UL;
     436             : 
     437             :         if (current->flags & PF_RANDOMIZE)
     438             :                 random_factor = arch_mmap_rnd();
     439             : 
     440             :         if (mmap_is_legacy(rlim_stack)) {
     441             :                 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
     442             :                 mm->get_unmapped_area = arch_get_unmapped_area;
     443             :         } else {
     444             :                 mm->mmap_base = mmap_base(random_factor, rlim_stack);
     445             :                 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
     446             :         }
     447             : }
     448             : #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
     449           0 : void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
     450             : {
     451           0 :         mm->mmap_base = TASK_UNMAPPED_BASE;
     452           0 :         mm->get_unmapped_area = arch_get_unmapped_area;
     453           0 : }
     454             : #endif
     455             : 
     456             : /**
     457             :  * __account_locked_vm - account locked pages to an mm's locked_vm
     458             :  * @mm:          mm to account against
     459             :  * @pages:       number of pages to account
     460             :  * @inc:         %true if @pages should be considered positive, %false if not
     461             :  * @task:        task used to check RLIMIT_MEMLOCK
     462             :  * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
     463             :  *
     464             :  * Assumes @task and @mm are valid (i.e. at least one reference on each), and
     465             :  * that mmap_lock is held as writer.
     466             :  *
     467             :  * Return:
     468             :  * * 0       on success
     469             :  * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
     470             :  */
     471           0 : int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
     472             :                         struct task_struct *task, bool bypass_rlim)
     473             : {
     474             :         unsigned long locked_vm, limit;
     475           0 :         int ret = 0;
     476             : 
     477           0 :         mmap_assert_write_locked(mm);
     478             : 
     479           0 :         locked_vm = mm->locked_vm;
     480           0 :         if (inc) {
     481           0 :                 if (!bypass_rlim) {
     482           0 :                         limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
     483           0 :                         if (locked_vm + pages > limit)
     484           0 :                                 ret = -ENOMEM;
     485             :                 }
     486           0 :                 if (!ret)
     487           0 :                         mm->locked_vm = locked_vm + pages;
     488             :         } else {
     489           0 :                 WARN_ON_ONCE(pages > locked_vm);
     490           0 :                 mm->locked_vm = locked_vm - pages;
     491             :         }
     492             : 
     493             :         pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
     494             :                  (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
     495             :                  locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
     496             :                  ret ? " - exceeded" : "");
     497             : 
     498           0 :         return ret;
     499             : }
     500             : EXPORT_SYMBOL_GPL(__account_locked_vm);
     501             : 
     502             : /**
     503             :  * account_locked_vm - account locked pages to an mm's locked_vm
     504             :  * @mm:          mm to account against, may be NULL
     505             :  * @pages:       number of pages to account
     506             :  * @inc:         %true if @pages should be considered positive, %false if not
     507             :  *
     508             :  * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
     509             :  *
     510             :  * Return:
     511             :  * * 0       on success, or if mm is NULL
     512             :  * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
     513             :  */
     514           0 : int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
     515             : {
     516             :         int ret;
     517             : 
     518           0 :         if (pages == 0 || !mm)
     519             :                 return 0;
     520             : 
     521           0 :         mmap_write_lock(mm);
     522           0 :         ret = __account_locked_vm(mm, pages, inc, current,
     523           0 :                                   capable(CAP_IPC_LOCK));
     524           0 :         mmap_write_unlock(mm);
     525             : 
     526           0 :         return ret;
     527             : }
     528             : EXPORT_SYMBOL_GPL(account_locked_vm);
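
Sketch of the pin/unpin accounting pattern built on account_locked_vm(); the demo_* helpers are hypothetical stand-ins for a driver that pins user pages for DMA.

    #include <linux/mm.h>
    #include <linux/sched/mm.h>

    static int demo_charge_pinned(struct mm_struct *mm, unsigned long npages)
    {
            int ret = account_locked_vm(mm, npages, true);

            if (ret)
                    return ret;     /* -ENOMEM: RLIMIT_MEMLOCK would be exceeded */

            /* ... pin the pages and set up the mappings ... */
            return 0;
    }

    static void demo_uncharge_pinned(struct mm_struct *mm, unsigned long npages)
    {
            /* inc=false subtracts the pages again on teardown. */
            account_locked_vm(mm, npages, false);
    }
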
     529             : 
     530           0 : unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
     531             :         unsigned long len, unsigned long prot,
     532             :         unsigned long flag, unsigned long pgoff)
     533             : {
     534             :         unsigned long ret;
     535           0 :         struct mm_struct *mm = current->mm;
     536             :         unsigned long populate;
     537           0 :         LIST_HEAD(uf);
     538             : 
     539           0 :         ret = security_mmap_file(file, prot, flag);
     540             :         if (!ret) {
     541           0 :                 if (mmap_write_lock_killable(mm))
     542             :                         return -EINTR;
     543           0 :                 ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
     544             :                               &uf);
     545           0 :                 mmap_write_unlock(mm);
     546           0 :                 userfaultfd_unmap_complete(mm, &uf);
     547           0 :                 if (populate)
     548           0 :                         mm_populate(ret, populate);
     549             :         }
     550             :         return ret;
     551             : }
     552             : 
     553           0 : unsigned long vm_mmap(struct file *file, unsigned long addr,
     554             :         unsigned long len, unsigned long prot,
     555             :         unsigned long flag, unsigned long offset)
     556             : {
     557           0 :         if (unlikely(offset + PAGE_ALIGN(len) < offset))
     558             :                 return -EINVAL;
     559           0 :         if (unlikely(offset_in_page(offset)))
     560             :                 return -EINVAL;
     561             : 
     562           0 :         return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
     563             : }
     564             : EXPORT_SYMBOL(vm_mmap);
     565             : 
     566             : /**
     567             :  * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
     568             :  * failure, fall back to non-contiguous (vmalloc) allocation.
     569             :  * @size: size of the request.
     570             :  * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
     571             :  * @node: numa node to allocate from
     572             :  *
     573             :  * Uses kmalloc to get the memory but if the allocation fails then falls back
     574             :  * to the vmalloc allocator. Use kvfree for freeing the memory.
     575             :  *
     576             :  * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
     577             :  * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
     578             :  * preferable to the vmalloc fallback, due to visible performance drawbacks.
     579             :  *
     580             :  * Return: pointer to the allocated memory or %NULL in case of failure
     581             :  */
     582           0 : void *kvmalloc_node(size_t size, gfp_t flags, int node)
     583             : {
     584           0 :         gfp_t kmalloc_flags = flags;
     585             :         void *ret;
     586             : 
     587             :         /*
     588             :          * We want to attempt a large physically contiguous block first because
     589             :          * it is less likely to fragment multiple larger blocks and therefore
     590             :  * contributes less to long-term fragmentation than the vmalloc fallback.
     591             :          * However make sure that larger requests are not too disruptive - no
     592             :          * OOM killer and no allocation failure warnings as we have a fallback.
     593             :          */
     594           0 :         if (size > PAGE_SIZE) {
     595           0 :                 kmalloc_flags |= __GFP_NOWARN;
     596             : 
     597           0 :                 if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
     598           0 :                         kmalloc_flags |= __GFP_NORETRY;
     599             : 
     600             :                 /* nofail semantic is implemented by the vmalloc fallback */
     601           0 :                 kmalloc_flags &= ~__GFP_NOFAIL;
     602             :         }
     603             : 
     604           0 :         ret = kmalloc_node(size, kmalloc_flags, node);
     605             : 
     606             :         /*
     607             :          * It doesn't really make sense to fall back to vmalloc for sub-page
     608             :          * requests
     609             :          */
     610           0 :         if (ret || size <= PAGE_SIZE)
     611             :                 return ret;
     612             : 
     613             :         /* non-sleeping allocations are not supported by vmalloc */
     614           0 :         if (!gfpflags_allow_blocking(flags))
     615             :                 return NULL;
     616             : 
     617             :         /* Don't even allow crazy sizes */
     618           0 :         if (unlikely(size > INT_MAX)) {
     619           0 :                 WARN_ON_ONCE(!(flags & __GFP_NOWARN));
     620             :                 return NULL;
     621             :         }
     622             : 
     623             :         /*
     624             :          * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
     625             :          * since the callers already cannot assume anything
     626             :          * about the resulting pointer, and cannot play
     627             :          * protection games.
     628             :          */
     629           0 :         return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
     630           0 :                         flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
     631           0 :                         node, __builtin_return_address(0));
     632             : }
     633             : EXPORT_SYMBOL(kvmalloc_node);
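
Usage sketch for the kvmalloc() family; the demo_* helpers are hypothetical. kvmalloc_array() is the standard overflow-checked wrapper around this allocator, and kvfree() works no matter which backing store was chosen.

    #include <linux/mm.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    static u64 *demo_alloc_table(size_t nr_entries)
    {
            /* Small tables come from kmalloc; large ones fall back to vmalloc. */
            return kvmalloc_array(nr_entries, sizeof(u64), GFP_KERNEL);
    }

    static void demo_free_table(u64 *table)
    {
            /* kvfree() handles both kmalloc- and vmalloc-backed memory. */
            kvfree(table);
    }
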
     634             : 
     635             : /**
     636             :  * kvfree() - Free memory.
     637             :  * @addr: Pointer to allocated memory.
     638             :  *
     639             :  * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
     640             :  * It is slightly more efficient to use kfree() or vfree() if you are certain
     641             :  * that you know which one to use.
     642             :  *
     643             :  * Context: Either preemptible task context or not-NMI interrupt.
     644             :  */
     645           0 : void kvfree(const void *addr)
     646             : {
     647           0 :         if (is_vmalloc_addr(addr))
     648           0 :                 vfree(addr);
     649             :         else
     650           0 :                 kfree(addr);
     651           0 : }
     652             : EXPORT_SYMBOL(kvfree);
     653             : 
     654             : /**
     655             :  * kvfree_sensitive - Free a data object containing sensitive information.
     656             :  * @addr: address of the data object to be freed.
     657             :  * @len: length of the data object.
     658             :  *
     659             :  * Use the special memzero_explicit() function to clear the content of a
     660             :  * kvmalloc'ed object containing sensitive data to make sure that the
     661             :  * compiler won't optimize out the data clearing.
     662             :  */
     663           0 : void kvfree_sensitive(const void *addr, size_t len)
     664             : {
     665           0 :         if (likely(!ZERO_OR_NULL_PTR(addr))) {
     666           0 :                 memzero_explicit((void *)addr, len);
     667           0 :                 kvfree(addr);
     668             :         }
     669           0 : }
     670             : EXPORT_SYMBOL(kvfree_sensitive);
     671             : 
     672           0 : void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
     673             : {
     674             :         void *newp;
     675             : 
     676           0 :         if (oldsize >= newsize)
     677             :                 return (void *)p;
     678           0 :         newp = kvmalloc(newsize, flags);
     679           0 :         if (!newp)
     680             :                 return NULL;
     681           0 :         memcpy(newp, p, oldsize);
     682           0 :         kvfree(p);
     683           0 :         return newp;
     684             : }
     685             : EXPORT_SYMBOL(kvrealloc);
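
Growth sketch for kvrealloc(); demo_grow() is hypothetical. On failure NULL is returned and the old buffer is left untouched, so the caller must keep the original pointer around until success.

    #include <linux/mm.h>
    #include <linux/slab.h>

    static void *demo_grow(void *buf, size_t oldsize, size_t newsize)
    {
            void *nbuf = kvrealloc(buf, oldsize, newsize, GFP_KERNEL);

            if (!nbuf)
                    return NULL;    /* allocation failed; buf is still valid */

            /*
             * Either buf itself (no growth was needed) or a fresh buffer;
             * in the latter case kvrealloc() already kvfree()d buf.
             */
            return nbuf;
    }
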
     686             : 
     687             : /**
     688             :  * __vmalloc_array - allocate memory for a virtually contiguous array.
     689             :  * @n: number of elements.
     690             :  * @size: element size.
     691             :  * @flags: the type of memory to allocate (see kmalloc).
     692             :  */
     693           0 : void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
     694             : {
     695             :         size_t bytes;
     696             : 
     697           0 :         if (unlikely(check_mul_overflow(n, size, &bytes)))
     698             :                 return NULL;
     699           0 :         return __vmalloc(bytes, flags);
     700             : }
     701             : EXPORT_SYMBOL(__vmalloc_array);
     702             : 
     703             : /**
     704             :  * vmalloc_array - allocate memory for a virtually contiguous array.
     705             :  * @n: number of elements.
     706             :  * @size: element size.
     707             :  */
     708           0 : void *vmalloc_array(size_t n, size_t size)
     709             : {
     710           0 :         return __vmalloc_array(n, size, GFP_KERNEL);
     711             : }
     712             : EXPORT_SYMBOL(vmalloc_array);
     713             : 
     714             : /**
     715             :  * __vcalloc - allocate and zero memory for a virtually contiguous array.
     716             :  * @n: number of elements.
     717             :  * @size: element size.
     718             :  * @flags: the type of memory to allocate (see kmalloc).
     719             :  */
     720           0 : void *__vcalloc(size_t n, size_t size, gfp_t flags)
     721             : {
     722           0 :         return __vmalloc_array(n, size, flags | __GFP_ZERO);
     723             : }
     724             : EXPORT_SYMBOL(__vcalloc);
     725             : 
     726             : /**
     727             :  * vcalloc - allocate and zero memory for a virtually contiguous array.
     728             :  * @n: number of elements.
     729             :  * @size: element size.
     730             :  */
     731           0 : void *vcalloc(size_t n, size_t size)
     732             : {
     733           0 :         return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
     734             : }
     735             : EXPORT_SYMBOL(vcalloc);
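
Array-allocation sketch; struct demo_entry and demo_alloc_entries() are hypothetical. vcalloc() zeroes the memory and __vmalloc_array() rejects n * size overflows, so a user-controlled count is safe to pass.

    #include <linux/types.h>
    #include <linux/vmalloc.h>

    struct demo_entry {
            u64 key;
            u64 value;
    };

    static struct demo_entry *demo_alloc_entries(size_t n)
    {
            /* Zeroed, virtually contiguous; release with vfree() or kvfree(). */
            return vcalloc(n, sizeof(struct demo_entry));
    }
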
     736             : 
     737             : /* Neutral page->mapping pointer to address_space or anon_vma or other */
     738           0 : void *page_rmapping(struct page *page)
     739             : {
     740           0 :         return folio_raw_mapping(page_folio(page));
     741             : }
     742             : 
     743           0 : struct anon_vma *folio_anon_vma(struct folio *folio)
     744             : {
     745           0 :         unsigned long mapping = (unsigned long)folio->mapping;
     746             : 
     747           0 :         if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
     748             :                 return NULL;
     749           0 :         return (void *)(mapping - PAGE_MAPPING_ANON);
     750             : }
     751             : 
     752             : /**
     753             :  * folio_mapping - Find the mapping where this folio is stored.
     754             :  * @folio: The folio.
     755             :  *
     756             :  * For folios which are in the page cache, return the mapping that this
     757             :  * page belongs to.  Folios in the swap cache return the swap mapping
     758             :  * this page is stored in (which is different from the mapping for the
     759             :  * swap file or swap device where the data is stored).
     760             :  *
     761             :  * You can call this for folios which aren't in the swap cache or page
     762             :  * cache and it will return NULL.
     763             :  */
     764           0 : struct address_space *folio_mapping(struct folio *folio)
     765             : {
     766             :         struct address_space *mapping;
     767             : 
     768             :         /* This happens if someone calls flush_dcache_page on slab page */
     769           0 :         if (unlikely(folio_test_slab(folio)))
     770             :                 return NULL;
     771             : 
     772           0 :         if (unlikely(folio_test_swapcache(folio)))
     773           0 :                 return swap_address_space(folio_swap_entry(folio));
     774             : 
     775           0 :         mapping = folio->mapping;
     776           0 :         if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
     777             :                 return NULL;
     778             : 
     779           0 :         return mapping;
     780             : }
     781             : EXPORT_SYMBOL(folio_mapping);
     782             : 
     783             : /**
     784             :  * folio_copy - Copy the contents of one folio to another.
     785             :  * @dst: Folio to copy to.
     786             :  * @src: Folio to copy from.
     787             :  *
     788             :  * The bytes in the folio represented by @src are copied to @dst.
     789             :  * Assumes the caller has validated that @dst is at least as large as @src.
     790             :  * Can be called in atomic context for order-0 folios, but if the folio is
     791             :  * larger, it may sleep.
     792             :  */
     793           0 : void folio_copy(struct folio *dst, struct folio *src)
     794             : {
     795           0 :         long i = 0;
     796           0 :         long nr = folio_nr_pages(src);
     797             : 
     798             :         for (;;) {
     799           0 :                 copy_highpage(folio_page(dst, i), folio_page(src, i));
     800           0 :                 if (++i == nr)
     801             :                         break;
     802           0 :                 cond_resched();
     803             :         }
     804           0 : }
     805             : 
     806             : int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
     807             : int sysctl_overcommit_ratio __read_mostly = 50;
     808             : unsigned long sysctl_overcommit_kbytes __read_mostly;
     809             : int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
     810             : unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
     811             : unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
     812             : 
     813           0 : int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
     814             :                 size_t *lenp, loff_t *ppos)
     815             : {
     816             :         int ret;
     817             : 
     818           0 :         ret = proc_dointvec(table, write, buffer, lenp, ppos);
     819           0 :         if (ret == 0 && write)
     820           0 :                 sysctl_overcommit_kbytes = 0;
     821           0 :         return ret;
     822             : }
     823             : 
     824           0 : static void sync_overcommit_as(struct work_struct *dummy)
     825             : {
     826           0 :         percpu_counter_sync(&vm_committed_as);
     827           0 : }
     828             : 
     829           0 : int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
     830             :                 size_t *lenp, loff_t *ppos)
     831             : {
     832             :         struct ctl_table t;
     833           0 :         int new_policy = -1;
     834             :         int ret;
     835             : 
     836             :         /*
     837             :          * The deviation of sync_overcommit_as could be big with loose policy
     838             :          * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
     839             :          * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
     840             :  * with the strict "NEVER", and to avoid a possible race condition (even
     841             :  * though users usually won't switch to the OVERCOMMIT_NEVER policy very
     842             :  * often), the switch is done in the following order:
     843             :          *      1. changing the batch
     844             :          *      2. sync percpu count on each CPU
     845             :          *      3. switch the policy
     846             :          */
     847           0 :         if (write) {
     848           0 :                 t = *table;
     849           0 :                 t.data = &new_policy;
     850           0 :                 ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
     851           0 :                 if (ret || new_policy == -1)
     852             :                         return ret;
     853             : 
     854           0 :                 mm_compute_batch(new_policy);
     855           0 :                 if (new_policy == OVERCOMMIT_NEVER)
     856           0 :                         schedule_on_each_cpu(sync_overcommit_as);
     857           0 :                 sysctl_overcommit_memory = new_policy;
     858             :         } else {
     859           0 :                 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
     860             :         }
     861             : 
     862             :         return ret;
     863             : }
     864             : 
     865           0 : int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
     866             :                 size_t *lenp, loff_t *ppos)
     867             : {
     868             :         int ret;
     869             : 
     870           0 :         ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
     871           0 :         if (ret == 0 && write)
     872           0 :                 sysctl_overcommit_ratio = 0;
     873           0 :         return ret;
     874             : }
     875             : 
     876             : /*
     877             :  * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
     878             :  */
     879           0 : unsigned long vm_commit_limit(void)
     880             : {
     881             :         unsigned long allowed;
     882             : 
     883           0 :         if (sysctl_overcommit_kbytes)
     884           0 :                 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
     885             :         else
     886           0 :                 allowed = ((totalram_pages() - hugetlb_total_pages())
     887           0 :                            * sysctl_overcommit_ratio / 100);
     888           0 :         allowed += total_swap_pages;
     889             : 
     890           0 :         return allowed;
     891             : }
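
A worked example of the limit above, under OVERCOMMIT_NEVER with the defaults shown earlier (overcommit_kbytes == 0, overcommit_ratio == 50) and hypothetical sizes of 8 GiB of usable RAM (no hugetlb pages) plus 2 GiB of swap: allowed = 8 GiB * 50 / 100 + 2 GiB = 6 GiB, which is what /proc/meminfo reports as CommitLimit.
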
     892             : 
     893             : /*
     894             :  * Make sure vm_committed_as sits in its own cacheline and is not shared with
     895             :  * other variables. It can be updated by several CPUs frequently.
     896             :  */
     897             : struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
     898             : 
     899             : /*
     900             :  * The global memory commitment made in the system can be a metric
     901             :  * that can be used to drive ballooning decisions when Linux is hosted
     902             :  * as a guest. On Hyper-V, the host implements a policy engine for dynamically
     903             :  * balancing memory across competing virtual machines that are hosted.
     904             :  * Several metrics drive this policy engine including the guest reported
     905             :  * memory commitment.
     906             :  *
     907             :  * The time cost of this is very low for small platforms, and for big
     908             :  * platforms like a 2S/36C/72T Skylake server, in the worst case where
     909             :  * vm_committed_as's spinlock is under severe contention, the time cost
     910             :  * could be about 30~40 microseconds.
     911             :  */
     912           0 : unsigned long vm_memory_committed(void)
     913             : {
     914           0 :         return percpu_counter_sum_positive(&vm_committed_as);
     915             : }
     916             : EXPORT_SYMBOL_GPL(vm_memory_committed);
     917             : 
     918             : /*
     919             :  * Check that a process has enough memory to allocate a new virtual
     920             :  * mapping. 0 means there is enough memory for the allocation to
     921             :  * succeed and -ENOMEM implies there is not.
     922             :  *
     923             :  * We currently support three overcommit policies, which are set via the
     924             :  * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
     925             :  *
     926             :  * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
     927             :  * Additional code 2002 Jul 20 by Robert Love.
     928             :  *
     929             :  * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
     930             :  *
     931             :  * Note this is a helper function intended to be used by LSMs which
     932             :  * wish to use this logic.
     933             :  */
     934           0 : int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
     935             : {
     936             :         long allowed;
     937             : 
     938           0 :         vm_acct_memory(pages);
     939             : 
     940             :         /*
     941             :          * Sometimes we want to use more memory than we have
     942             :          */
     943           0 :         if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
     944             :                 return 0;
     945             : 
     946           0 :         if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
     947           0 :                 if (pages > totalram_pages() + total_swap_pages)
     948             :                         goto error;
     949             :                 return 0;
     950             :         }
     951             : 
     952           0 :         allowed = vm_commit_limit();
     953             :         /*
     954             :          * Reserve some for root
     955             :          */
     956           0 :         if (!cap_sys_admin)
     957           0 :                 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
     958             : 
     959             :         /*
     960             :          * Don't let a single process grow so big a user can't recover
     961             :          */
     962           0 :         if (mm) {
     963           0 :                 long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
     964             : 
     965           0 :                 allowed -= min_t(long, mm->total_vm / 32, reserve);
     966             :         }
     967             : 
     968           0 :         if (percpu_counter_read_positive(&vm_committed_as) < allowed)
     969             :                 return 0;
     970             : error:
     971           0 :         pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n",
     972             :                             __func__, current->pid, current->comm);
     973           0 :         vm_unacct_memory(pages);
     974             : 
     975           0 :         return -ENOMEM;
     976             : }
     977             : 
     978             : /**
     979             :  * get_cmdline() - copy the cmdline value to a buffer.
     980             :  * @task:     the task whose cmdline value to copy.
     981             :  * @buffer:   the buffer to copy to.
     982             :  * @buflen:   the length of the buffer. Larger cmdline values are truncated
     983             :  *            to this length.
     984             :  *
     985             :  * Return: the size of the cmdline field copied. Note that the copy does
     986             :  * not guarantee a trailing NUL byte.
     987             :  */
     988           0 : int get_cmdline(struct task_struct *task, char *buffer, int buflen)
     989             : {
     990           0 :         int res = 0;
     991             :         unsigned int len;
     992           0 :         struct mm_struct *mm = get_task_mm(task);
     993             :         unsigned long arg_start, arg_end, env_start, env_end;
     994           0 :         if (!mm)
     995             :                 goto out;
     996           0 :         if (!mm->arg_end)
     997             :                 goto out_mm;    /* Shh! No looking before we're done */
     998             : 
     999           0 :         spin_lock(&mm->arg_lock);
    1000           0 :         arg_start = mm->arg_start;
    1001           0 :         arg_end = mm->arg_end;
    1002           0 :         env_start = mm->env_start;
    1003           0 :         env_end = mm->env_end;
    1004           0 :         spin_unlock(&mm->arg_lock);
    1005             : 
    1006           0 :         len = arg_end - arg_start;
    1007             : 
    1008           0 :         if (len > buflen)
    1009           0 :                 len = buflen;
    1010             : 
    1011           0 :         res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
    1012             : 
    1013             :         /*
    1014             :          * If the nul at the end of args has been overwritten, then
    1015             :  * assume the application is using setproctitle(3).
    1016             :          */
    1017           0 :         if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
    1018           0 :                 len = strnlen(buffer, res);
    1019           0 :                 if (len < res) {
    1020           0 :                         res = len;
    1021             :                 } else {
    1022           0 :                         len = env_end - env_start;
    1023           0 :                         if (len > buflen - res)
    1024           0 :                                 len = buflen - res;
    1025           0 :                         res += access_process_vm(task, env_start,
    1026           0 :                                                  buffer+res, len,
    1027             :                                                  FOLL_FORCE);
    1028           0 :                         res = strnlen(buffer, res);
    1029             :                 }
    1030             :         }
    1031             : out_mm:
    1032           0 :         mmput(mm);
    1033             : out:
    1034           0 :         return res;
    1035             : }
    1036             : 
    1037           0 : int __weak memcmp_pages(struct page *page1, struct page *page2)
    1038             : {
    1039             :         char *addr1, *addr2;
    1040             :         int ret;
    1041             : 
    1042           0 :         addr1 = kmap_atomic(page1);
    1043           0 :         addr2 = kmap_atomic(page2);
    1044           0 :         ret = memcmp(addr1, addr2, PAGE_SIZE);
    1045           0 :         kunmap_atomic(addr2);
    1046           0 :         kunmap_atomic(addr1);
    1047           0 :         return ret;
    1048             : }
    1049             : 
    1050             : #ifdef CONFIG_PRINTK
    1051             : /**
    1052             :  * mem_dump_obj - Print available provenance information
    1053             :  * @object: object for which to find provenance information.
    1054             :  *
    1055             :  * This function uses pr_cont(), so that the caller is expected to have
    1056             :  * printed out whatever preamble is appropriate.  The provenance information
    1057             :  * depends on the type of object and on how much debugging is enabled.
    1058             :  * For example, for a slab-cache object, the slab name is printed, and,
    1059             :  * if available, the return address and stack trace from the allocation
    1060             :  * and last free path of that object.
    1061             :  */
    1062           0 : void mem_dump_obj(void *object)
    1063             : {
    1064             :         const char *type;
    1065             : 
    1066           0 :         if (kmem_valid_obj(object)) {
    1067           0 :                 kmem_dump_obj(object);
    1068           0 :                 return;
    1069             :         }
    1070             : 
    1071           0 :         if (vmalloc_dump_obj(object))
    1072             :                 return;
    1073             : 
    1074           0 :         if (virt_addr_valid(object))
    1075             :                 type = "non-slab/vmalloc memory";
    1076           0 :         else if (object == NULL)
    1077             :                 type = "NULL pointer";
    1078           0 :         else if (object == ZERO_SIZE_PTR)
    1079             :                 type = "zero-size pointer";
    1080             :         else
    1081           0 :                 type = "non-paged memory";
    1082             : 
    1083           0 :         pr_cont(" %s\n", type);
    1084             : }
    1085             : EXPORT_SYMBOL_GPL(mem_dump_obj);
    1086             : #endif
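
Sketch of a debugging path that uses mem_dump_obj(); demo_report_object() is hypothetical. Per the kerneldoc, the caller prints the preamble (here without a trailing newline) and mem_dump_obj() continues the line with pr_cont().

    #include <linux/mm.h>
    #include <linux/printk.h>

    static void demo_report_object(void *obj)
    {
            pr_info("demo: questionable pointer %px", obj);
            mem_dump_obj(obj);
    }
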
    1087             : 
    1088             : /*
    1089             :  * A driver might set a page logically offline -- PageOffline() -- and
    1090             :  * turn the page inaccessible in the hypervisor; after that, access to page
    1091             :  * content can be fatal.
    1092             :  *
    1093             :  * Some special PFN walkers -- e.g., /proc/kcore -- read the content of random
    1094             :  * pages after checking PageOffline(); however, these PFN walkers can race
    1095             :  * with drivers that set PageOffline().
    1096             :  *
    1097             :  * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
    1098             :  * synchronize with such drivers, achieving that a page cannot be set
    1099             :  * PageOffline() while frozen.
    1100             :  *
    1101             :  * page_offline_begin()/page_offline_end() is used by drivers that care about
    1102             :  * such races when setting a page PageOffline().
    1103             :  */
    1104             : static DECLARE_RWSEM(page_offline_rwsem);
    1105             : 
    1106           0 : void page_offline_freeze(void)
    1107             : {
    1108           0 :         down_read(&page_offline_rwsem);
    1109           0 : }
    1110             : 
    1111           0 : void page_offline_thaw(void)
    1112             : {
    1113           0 :         up_read(&page_offline_rwsem);
    1114           0 : }
    1115             : 
    1116           0 : void page_offline_begin(void)
    1117             : {
    1118           0 :         down_write(&page_offline_rwsem);
    1119           0 : }
    1120             : EXPORT_SYMBOL(page_offline_begin);
    1121             : 
    1122           0 : void page_offline_end(void)
    1123             : {
    1124           0 :         up_write(&page_offline_rwsem);
    1125           0 : }
    1126             : EXPORT_SYMBOL(page_offline_end);
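
Sketch of the reader side described in the comment block above; demo_read_pfn_page() is hypothetical, loosely modelled on what a /proc/kcore-style PFN walker does. While the offline state is frozen, a page observed as !PageOffline() cannot become PageOffline().

    #include <linux/mm.h>
    #include <linux/page-flags.h>

    static void demo_read_pfn_page(struct page *page)
    {
            page_offline_freeze();
            if (!PageOffline(page)) {
                    /* ... safe to read the page contents here ... */
            }
            page_offline_thaw();
    }
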
    1127             : 
    1128             : #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
    1129             : void flush_dcache_folio(struct folio *folio)
    1130             : {
    1131             :         long i, nr = folio_nr_pages(folio);
    1132             : 
    1133             :         for (i = 0; i < nr; i++)
    1134             :                 flush_dcache_page(folio_page(folio, i));
    1135             : }
    1136             : EXPORT_SYMBOL(flush_dcache_folio);
    1137             : #endif

Generated by: LCOV version 1.14