Line data Source code
1 : /*
2 : * Resizable virtual memory filesystem for Linux.
3 : *
4 : * Copyright (C) 2000 Linus Torvalds.
5 : * 2000 Transmeta Corp.
6 : * 2000-2001 Christoph Rohland
7 : * 2000-2001 SAP AG
8 : * 2002 Red Hat Inc.
9 : * Copyright (C) 2002-2011 Hugh Dickins.
10 : * Copyright (C) 2011 Google Inc.
11 : * Copyright (C) 2002-2005 VERITAS Software Corporation.
12 : * Copyright (C) 2004 Andi Kleen, SuSE Labs
13 : *
14 : * Extended attribute support for tmpfs:
15 : * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16 : * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17 : *
18 : * tiny-shmem:
19 : * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20 : *
21 : * This file is released under the GPL.
22 : */
23 :
24 : #include <linux/fs.h>
25 : #include <linux/init.h>
26 : #include <linux/vfs.h>
27 : #include <linux/mount.h>
28 : #include <linux/ramfs.h>
29 : #include <linux/pagemap.h>
30 : #include <linux/file.h>
31 : #include <linux/fileattr.h>
32 : #include <linux/mm.h>
33 : #include <linux/random.h>
34 : #include <linux/sched/signal.h>
35 : #include <linux/export.h>
36 : #include <linux/shmem_fs.h>
37 : #include <linux/swap.h>
38 : #include <linux/uio.h>
39 : #include <linux/hugetlb.h>
40 : #include <linux/fs_parser.h>
41 : #include <linux/swapfile.h>
42 : #include <linux/iversion.h>
43 : #include "swap.h"
44 :
45 : static struct vfsmount *shm_mnt;
46 :
47 : #ifdef CONFIG_SHMEM
48 : /*
49 : * This virtual memory filesystem is heavily based on the ramfs. It
50 : * extends ramfs by the ability to use swap and honor resource limits
51 : * which makes it a completely usable filesystem.
52 : */
53 :
54 : #include <linux/xattr.h>
55 : #include <linux/exportfs.h>
56 : #include <linux/posix_acl.h>
57 : #include <linux/posix_acl_xattr.h>
58 : #include <linux/mman.h>
59 : #include <linux/string.h>
60 : #include <linux/slab.h>
61 : #include <linux/backing-dev.h>
62 : #include <linux/writeback.h>
63 : #include <linux/pagevec.h>
64 : #include <linux/percpu_counter.h>
65 : #include <linux/falloc.h>
66 : #include <linux/splice.h>
67 : #include <linux/security.h>
68 : #include <linux/swapops.h>
69 : #include <linux/mempolicy.h>
70 : #include <linux/namei.h>
71 : #include <linux/ctype.h>
72 : #include <linux/migrate.h>
73 : #include <linux/highmem.h>
74 : #include <linux/seq_file.h>
75 : #include <linux/magic.h>
76 : #include <linux/syscalls.h>
77 : #include <linux/fcntl.h>
78 : #include <uapi/linux/memfd.h>
79 : #include <linux/rmap.h>
80 : #include <linux/uuid.h>
81 :
82 : #include <linux/uaccess.h>
83 :
84 : #include "internal.h"
85 :
86 : #define BLOCKS_PER_PAGE (PAGE_SIZE/512)
87 : #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
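As an illustration of the VM_ACCT() arithmetic used throughout the accounting helpers below: it converts a byte count into the number of whole pages charged against overcommit accounting. A minimal userspace sketch, assuming 4 KiB pages; the *_EX names are stand-ins, not kernel API.

    #include <stdio.h>

    #define PAGE_SIZE_EX   4096UL                  /* assumed 4 KiB pages */
    #define PAGE_SHIFT_EX  12
    #define PAGE_ALIGN_EX(x)  (((x) + PAGE_SIZE_EX - 1) & ~(PAGE_SIZE_EX - 1))
    #define VM_ACCT_EX(size)  (PAGE_ALIGN_EX(size) >> PAGE_SHIFT_EX)

    int main(void)
    {
            /* 5000 bytes round up to two 4 KiB pages */
            printf("VM_ACCT(5000) = %lu\n", VM_ACCT_EX(5000UL));   /* 2 */
            /* an exact multiple of the page size needs no rounding */
            printf("VM_ACCT(8192) = %lu\n", VM_ACCT_EX(8192UL));   /* 2 */
            return 0;
    }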
88 :
89 : /* Pretend that each entry is of this size in directory's i_size */
90 : #define BOGO_DIRENT_SIZE 20
91 :
92 : /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
93 : #define SHORT_SYMLINK_LEN 128
94 :
95 : /*
96 : * shmem_fallocate communicates with shmem_fault or shmem_writepage via
97 : * inode->i_private (with i_rwsem making sure that it has only one user at
98 : * a time): we would prefer not to enlarge the shmem inode just for that.
99 : */
100 : struct shmem_falloc {
101 : wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
102 : pgoff_t start; /* start of range currently being fallocated */
103 : pgoff_t next; /* the next page offset to be fallocated */
104 : pgoff_t nr_falloced; /* how many new pages have been fallocated */
105 : pgoff_t nr_unswapped; /* how often writepage refused to swap out */
106 : };
107 :
108 : struct shmem_options {
109 : unsigned long long blocks;
110 : unsigned long long inodes;
111 : struct mempolicy *mpol;
112 : kuid_t uid;
113 : kgid_t gid;
114 : umode_t mode;
115 : bool full_inums;
116 : int huge;
117 : int seen;
118 : bool noswap;
119 : #define SHMEM_SEEN_BLOCKS 1
120 : #define SHMEM_SEEN_INODES 2
121 : #define SHMEM_SEEN_HUGE 4
122 : #define SHMEM_SEEN_INUMS 8
123 : #define SHMEM_SEEN_NOSWAP 16
124 : };
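The SHMEM_SEEN_* values are single bits OR-ed into the 'seen' field as mount parameters are parsed, so later code can tell which options the user actually supplied. A trivial sketch of that pattern (the flag values are copied from above; the scenario is hypothetical):

    #include <stdio.h>

    #define SHMEM_SEEN_BLOCKS 1
    #define SHMEM_SEEN_INODES 2
    #define SHMEM_SEEN_HUGE   4
    #define SHMEM_SEEN_INUMS  8
    #define SHMEM_SEEN_NOSWAP 16

    int main(void)
    {
            int seen = 0;

            /* pretend the user passed size= and inode64 */
            seen |= SHMEM_SEEN_BLOCKS;
            seen |= SHMEM_SEEN_INUMS;

            if (!(seen & SHMEM_SEEN_INODES))
                    printf("nr_inodes not given, keep the default\n");
            if (seen & SHMEM_SEEN_INUMS)
                    printf("inode32/inode64 given explicitly\n");
            return 0;
    }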
125 :
126 : #ifdef CONFIG_TMPFS
127 : static unsigned long shmem_default_max_blocks(void)
128 : {
129 : return totalram_pages() / 2;
130 : }
131 :
132 : static unsigned long shmem_default_max_inodes(void)
133 : {
134 : unsigned long nr_pages = totalram_pages();
135 :
136 : return min(nr_pages - totalhigh_pages(), nr_pages / 2);
137 : }
138 : #endif
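As a worked example of these defaults (the machine is hypothetical): with 4 GiB of RAM, 4 KiB pages and no highmem, a tmpfs mount defaults to half of RAM in blocks and to min(lowmem pages, half of RAM) inodes. The sketch below just redoes that arithmetic in userspace:

    #include <stdio.h>

    int main(void)
    {
            /* hypothetical machine: 4 GiB RAM, 4 KiB pages, no highmem */
            unsigned long totalram_pages_ex  = 1048576UL;
            unsigned long totalhigh_pages_ex = 0;

            unsigned long max_blocks = totalram_pages_ex / 2;
            unsigned long lowmem     = totalram_pages_ex - totalhigh_pages_ex;
            unsigned long max_inodes = lowmem < totalram_pages_ex / 2 ?
                                       lowmem : totalram_pages_ex / 2;

            printf("default blocks = %lu (~2 GiB)\n", max_blocks);   /* 524288 */
            printf("default inodes = %lu\n", max_inodes);            /* 524288 */
            return 0;
    }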
139 :
140 : static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
141 : struct folio **foliop, enum sgp_type sgp,
142 : gfp_t gfp, struct vm_area_struct *vma,
143 : vm_fault_t *fault_type);
144 :
145 : static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
146 : {
147 : return sb->s_fs_info;
148 : }
149 :
150 : /*
151 : * shmem_file_setup pre-accounts the whole fixed size of a VM object,
152 : * for shared memory and for shared anonymous (/dev/zero) mappings
153 : * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
154 : * consistent with the pre-accounting of private mappings ...
155 : */
156 0 : static inline int shmem_acct_size(unsigned long flags, loff_t size)
157 : {
158 0 : return (flags & VM_NORESERVE) ?
159 0 : 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
160 : }
161 :
162 : static inline void shmem_unacct_size(unsigned long flags, loff_t size)
163 : {
164 0 : if (!(flags & VM_NORESERVE))
165 0 : vm_unacct_memory(VM_ACCT(size));
166 : }
167 :
168 0 : static inline int shmem_reacct_size(unsigned long flags,
169 : loff_t oldsize, loff_t newsize)
170 : {
171 0 : if (!(flags & VM_NORESERVE)) {
172 0 : if (VM_ACCT(newsize) > VM_ACCT(oldsize))
173 0 : return security_vm_enough_memory_mm(current->mm,
174 0 : VM_ACCT(newsize) - VM_ACCT(oldsize));
175 0 : else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
176 0 : vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
177 : }
178 : return 0;
179 : }
180 :
181 : /*
182 : * ... whereas tmpfs objects are accounted incrementally as
183 : * pages are allocated, in order to allow large sparse files.
184 : * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
185 : * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
186 : */
187 : static inline int shmem_acct_block(unsigned long flags, long pages)
188 : {
189 0 : if (!(flags & VM_NORESERVE))
190 : return 0;
191 :
192 0 : return security_vm_enough_memory_mm(current->mm,
193 : pages * VM_ACCT(PAGE_SIZE));
194 : }
195 :
196 : static inline void shmem_unacct_blocks(unsigned long flags, long pages)
197 : {
198 0 : if (flags & VM_NORESERVE)
199 : vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
200 : }
201 :
202 0 : static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
203 : {
204 0 : struct shmem_inode_info *info = SHMEM_I(inode);
205 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
206 :
207 0 : if (shmem_acct_block(info->flags, pages))
208 : return false;
209 :
210 0 : if (sbinfo->max_blocks) {
211 0 : if (percpu_counter_compare(&sbinfo->used_blocks,
212 0 : sbinfo->max_blocks - pages) > 0)
213 : goto unacct;
214 0 : percpu_counter_add(&sbinfo->used_blocks, pages);
215 : }
216 :
217 : return true;
218 :
219 : unacct:
220 0 : shmem_unacct_blocks(info->flags, pages);
221 : return false;
222 : }
223 :
224 0 : static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
225 : {
226 0 : struct shmem_inode_info *info = SHMEM_I(inode);
227 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
228 :
229 0 : if (sbinfo->max_blocks)
230 0 : percpu_counter_sub(&sbinfo->used_blocks, pages);
231 0 : shmem_unacct_blocks(info->flags, pages);
232 0 : }
233 :
234 : static const struct super_operations shmem_ops;
235 : const struct address_space_operations shmem_aops;
236 : static const struct file_operations shmem_file_operations;
237 : static const struct inode_operations shmem_inode_operations;
238 : static const struct inode_operations shmem_dir_inode_operations;
239 : static const struct inode_operations shmem_special_inode_operations;
240 : static const struct vm_operations_struct shmem_vm_ops;
241 : static const struct vm_operations_struct shmem_anon_vm_ops;
242 : static struct file_system_type shmem_fs_type;
243 :
244 0 : bool vma_is_anon_shmem(struct vm_area_struct *vma)
245 : {
246 0 : return vma->vm_ops == &shmem_anon_vm_ops;
247 : }
248 :
249 0 : bool vma_is_shmem(struct vm_area_struct *vma)
250 : {
251 0 : return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
252 : }
253 :
254 : static LIST_HEAD(shmem_swaplist);
255 : static DEFINE_MUTEX(shmem_swaplist_mutex);
256 :
257 : /*
258 : * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
259 : * produces a novel ino for the newly allocated inode.
260 : *
261 : * It may also be called when making a hard link to permit the space needed by
262 : * each dentry. However, in that case, no new inode number is needed since that
263 : * internally draws from another pool of inode numbers (currently global
264 : * get_next_ino()). This case is indicated by passing NULL as inop.
265 : */
266 : #define SHMEM_INO_BATCH 1024
267 1 : static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
268 : {
269 1 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
270 : ino_t ino;
271 :
272 1 : if (!(sb->s_flags & SB_KERNMOUNT)) {
273 0 : raw_spin_lock(&sbinfo->stat_lock);
274 0 : if (sbinfo->max_inodes) {
275 0 : if (!sbinfo->free_inodes) {
276 0 : raw_spin_unlock(&sbinfo->stat_lock);
277 0 : return -ENOSPC;
278 : }
279 0 : sbinfo->free_inodes--;
280 : }
281 0 : if (inop) {
282 0 : ino = sbinfo->next_ino++;
283 0 : if (unlikely(is_zero_ino(ino)))
284 0 : ino = sbinfo->next_ino++;
285 0 : if (unlikely(!sbinfo->full_inums &&
286 : ino > UINT_MAX)) {
287 : /*
288 : * Emulate get_next_ino uint wraparound for
289 : * compatibility
290 : */
291 : if (IS_ENABLED(CONFIG_64BIT))
292 0 : pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
293 : __func__, MINOR(sb->s_dev));
294 : sbinfo->next_ino = 1;
295 0 : ino = sbinfo->next_ino++;
296 : }
297 0 : *inop = ino;
298 : }
299 0 : raw_spin_unlock(&sbinfo->stat_lock);
300 1 : } else if (inop) {
301 : /*
302 : * __shmem_file_setup, one of our callers, is lock-free: it
303 : * doesn't hold stat_lock in shmem_reserve_inode since
304 : * max_inodes is always 0, and is called from potentially
305 : * unknown contexts. As such, use a per-cpu batched allocator
306 : * which doesn't require the per-sb stat_lock unless we are at
307 : * the batch boundary.
308 : *
309 : * We don't need to worry about inode{32,64} since SB_KERNMOUNT
310 : * shmem mounts are not exposed to userspace, so we don't need
311 : * to worry about things like glibc compatibility.
312 : */
313 : ino_t *next_ino;
314 :
315 1 : next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
316 1 : ino = *next_ino;
317 1 : if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
318 1 : raw_spin_lock(&sbinfo->stat_lock);
319 1 : ino = sbinfo->next_ino;
320 1 : sbinfo->next_ino += SHMEM_INO_BATCH;
321 1 : raw_spin_unlock(&sbinfo->stat_lock);
322 1 : if (unlikely(is_zero_ino(ino)))
323 1 : ino++;
324 : }
325 1 : *inop = ino;
326 1 : *next_ino = ++ino;
327 1 : put_cpu();
328 : }
329 :
330 : return 0;
331 : }
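The SB_KERNMOUNT path above hands out inode numbers from per-CPU batches of SHMEM_INO_BATCH, so stat_lock is only taken once every 1024 allocations. A standalone sketch of that batching idea, simplified to per-thread caches guarded by a plain mutex (names and structure here are illustrative, not the kernel API):

    #include <pthread.h>
    #include <stdio.h>

    #define INO_BATCH 1024

    static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long next_ino = 1;                 /* shared pool */
    static __thread unsigned long thread_next_ino;     /* per-thread cache */

    static unsigned long reserve_ino(void)
    {
            unsigned long ino = thread_next_ino;

            /* refill from the shared pool only at a batch boundary */
            if (ino % INO_BATCH == 0) {
                    pthread_mutex_lock(&stat_lock);
                    ino = next_ino;
                    next_ino += INO_BATCH;
                    pthread_mutex_unlock(&stat_lock);
                    if (ino == 0)          /* never hand out inode number 0 */
                            ino++;
            }
            thread_next_ino = ino + 1;
            return ino;
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++)
                    printf("ino %lu\n", reserve_ino());   /* 1, 2, 3 */
            return 0;
    }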
332 :
333 : static void shmem_free_inode(struct super_block *sb)
334 : {
335 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
336 0 : if (sbinfo->max_inodes) {
337 0 : raw_spin_lock(&sbinfo->stat_lock);
338 0 : sbinfo->free_inodes++;
339 0 : raw_spin_unlock(&sbinfo->stat_lock);
340 : }
341 : }
342 :
343 : /**
344 : * shmem_recalc_inode - recalculate the block usage of an inode
345 : * @inode: inode to recalc
346 : *
347 : * We have to calculate the free blocks since the mm can drop
348 : * undirtied hole pages behind our back.
349 : *
350 : * But normally info->alloced == inode->i_mapping->nrpages + info->swapped,
351 : * so what the mm has freed is info->alloced - (inode->i_mapping->nrpages + info->swapped).
352 : *
353 : * It has to be called with the spinlock held.
354 : */
355 0 : static void shmem_recalc_inode(struct inode *inode)
356 : {
357 0 : struct shmem_inode_info *info = SHMEM_I(inode);
358 : long freed;
359 :
360 0 : freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
361 0 : if (freed > 0) {
362 0 : info->alloced -= freed;
363 0 : inode->i_blocks -= freed * BLOCKS_PER_PAGE;
364 0 : shmem_inode_unacct_blocks(inode, freed);
365 : }
366 0 : }
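A worked example of this bookkeeping, with invented numbers: if info->alloced is 10, info->swapped is 2 and i_mapping->nrpages is 7, then freed = 10 - 2 - 7 = 1, meaning the mm dropped one undirtied hole page behind our back; alloced is reduced to 9, i_blocks by one page's worth of blocks, and that page is released from block accounting.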
367 :
368 0 : bool shmem_charge(struct inode *inode, long pages)
369 : {
370 0 : struct shmem_inode_info *info = SHMEM_I(inode);
371 : unsigned long flags;
372 :
373 0 : if (!shmem_inode_acct_block(inode, pages))
374 : return false;
375 :
376 : /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
377 0 : inode->i_mapping->nrpages += pages;
378 :
379 0 : spin_lock_irqsave(&info->lock, flags);
380 0 : info->alloced += pages;
381 0 : inode->i_blocks += pages * BLOCKS_PER_PAGE;
382 0 : shmem_recalc_inode(inode);
383 0 : spin_unlock_irqrestore(&info->lock, flags);
384 :
385 0 : return true;
386 : }
387 :
388 0 : void shmem_uncharge(struct inode *inode, long pages)
389 : {
390 0 : struct shmem_inode_info *info = SHMEM_I(inode);
391 : unsigned long flags;
392 :
393 : /* nrpages adjustment done by __filemap_remove_folio() or caller */
394 :
395 0 : spin_lock_irqsave(&info->lock, flags);
396 0 : info->alloced -= pages;
397 0 : inode->i_blocks -= pages * BLOCKS_PER_PAGE;
398 0 : shmem_recalc_inode(inode);
399 0 : spin_unlock_irqrestore(&info->lock, flags);
400 :
401 0 : shmem_inode_unacct_blocks(inode, pages);
402 0 : }
403 :
404 : /*
405 : * Replace item expected in xarray by a new item, while holding xa_lock.
406 : */
407 0 : static int shmem_replace_entry(struct address_space *mapping,
408 : pgoff_t index, void *expected, void *replacement)
409 : {
410 0 : XA_STATE(xas, &mapping->i_pages, index);
411 : void *item;
412 :
413 : VM_BUG_ON(!expected);
414 : VM_BUG_ON(!replacement);
415 0 : item = xas_load(&xas);
416 0 : if (item != expected)
417 : return -ENOENT;
418 0 : xas_store(&xas, replacement);
419 0 : return 0;
420 : }
421 :
422 : /*
423 : * Sometimes, before we decide whether to proceed or to fail, we must check
424 : * that an entry was not already brought back from swap by a racing thread.
425 : *
426 : * Checking page is not enough: by the time a SwapCache page is locked, it
427 : * might be reused, and again be SwapCache, using the same swap as before.
428 : */
429 0 : static bool shmem_confirm_swap(struct address_space *mapping,
430 : pgoff_t index, swp_entry_t swap)
431 : {
432 0 : return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
433 : }
434 :
435 : /*
436 : * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
437 : *
438 : * SHMEM_HUGE_NEVER:
439 : * disables huge pages for the mount;
440 : * SHMEM_HUGE_ALWAYS:
441 : * enables huge pages for the mount;
442 : * SHMEM_HUGE_WITHIN_SIZE:
443 : * only allocate huge pages if the page will be fully within i_size,
444 : * also respect fadvise()/madvise() hints;
445 : * SHMEM_HUGE_ADVISE:
446 : * only allocate huge pages if requested with fadvise()/madvise();
447 : */
448 :
449 : #define SHMEM_HUGE_NEVER 0
450 : #define SHMEM_HUGE_ALWAYS 1
451 : #define SHMEM_HUGE_WITHIN_SIZE 2
452 : #define SHMEM_HUGE_ADVISE 3
453 :
454 : /*
455 : * Special values.
456 : * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
457 : *
458 : * SHMEM_HUGE_DENY:
459 : * disables huge on shm_mnt and all mounts, for emergency use;
460 : * SHMEM_HUGE_FORCE:
461 : * enables huge on shm_mnt and all mounts, w/o needing option, for testing;
462 : *
463 : */
464 : #define SHMEM_HUGE_DENY (-1)
465 : #define SHMEM_HUGE_FORCE (-2)
466 :
467 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
468 : /* ifdef here to avoid bloating shmem.o when not necessary */
469 :
470 : static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
471 :
472 : bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
473 : struct mm_struct *mm, unsigned long vm_flags)
474 : {
475 : loff_t i_size;
476 :
477 : if (!S_ISREG(inode->i_mode))
478 : return false;
479 : if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
480 : return false;
481 : if (shmem_huge == SHMEM_HUGE_DENY)
482 : return false;
483 : if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
484 : return true;
485 :
486 : switch (SHMEM_SB(inode->i_sb)->huge) {
487 : case SHMEM_HUGE_ALWAYS:
488 : return true;
489 : case SHMEM_HUGE_WITHIN_SIZE:
490 : index = round_up(index + 1, HPAGE_PMD_NR);
491 : i_size = round_up(i_size_read(inode), PAGE_SIZE);
492 : if (i_size >> PAGE_SHIFT >= index)
493 : return true;
494 : fallthrough;
495 : case SHMEM_HUGE_ADVISE:
496 : if (mm && (vm_flags & VM_HUGEPAGE))
497 : return true;
498 : fallthrough;
499 : default:
500 : return false;
501 : }
502 : }
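For SHMEM_HUGE_WITHIN_SIZE, the test above asks whether the huge page that would cover this index still ends within i_size after rounding. A small userspace sketch of just that check, assuming 4 KiB pages and 512 pages per PMD-sized huge page (2 MiB); the EX_* names are stand-ins:

    #include <stdbool.h>
    #include <stdio.h>

    #define EX_PAGE_SIZE   4096ULL
    #define EX_PAGE_SHIFT  12
    #define EX_HPAGE_NR    512ULL     /* pages per 2 MiB huge page */

    static unsigned long long round_up_ull(unsigned long long x, unsigned long long to)
    {
            return (x + to - 1) / to * to;
    }

    /* within_size: huge only if the whole huge page would fall inside i_size */
    static bool within_size_ok(unsigned long long index, unsigned long long i_size)
    {
            unsigned long long end_index  = round_up_ull(index + 1, EX_HPAGE_NR);
            unsigned long long size_pages = round_up_ull(i_size, EX_PAGE_SIZE) >> EX_PAGE_SHIFT;

            return size_pages >= end_index;
    }

    int main(void)
    {
            /* 3 MiB file: the first 2 MiB huge page fits, the second does not */
            printf("%d\n", within_size_ok(0,   3ULL << 20));   /* 1 */
            printf("%d\n", within_size_ok(512, 3ULL << 20));   /* 0 */
            return 0;
    }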
503 :
504 : #if defined(CONFIG_SYSFS)
505 : static int shmem_parse_huge(const char *str)
506 : {
507 : if (!strcmp(str, "never"))
508 : return SHMEM_HUGE_NEVER;
509 : if (!strcmp(str, "always"))
510 : return SHMEM_HUGE_ALWAYS;
511 : if (!strcmp(str, "within_size"))
512 : return SHMEM_HUGE_WITHIN_SIZE;
513 : if (!strcmp(str, "advise"))
514 : return SHMEM_HUGE_ADVISE;
515 : if (!strcmp(str, "deny"))
516 : return SHMEM_HUGE_DENY;
517 : if (!strcmp(str, "force"))
518 : return SHMEM_HUGE_FORCE;
519 : return -EINVAL;
520 : }
521 : #endif
522 :
523 : #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
524 : static const char *shmem_format_huge(int huge)
525 : {
526 : switch (huge) {
527 : case SHMEM_HUGE_NEVER:
528 : return "never";
529 : case SHMEM_HUGE_ALWAYS:
530 : return "always";
531 : case SHMEM_HUGE_WITHIN_SIZE:
532 : return "within_size";
533 : case SHMEM_HUGE_ADVISE:
534 : return "advise";
535 : case SHMEM_HUGE_DENY:
536 : return "deny";
537 : case SHMEM_HUGE_FORCE:
538 : return "force";
539 : default:
540 : VM_BUG_ON(1);
541 : return "bad_val";
542 : }
543 : }
544 : #endif
545 :
546 : static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
547 : struct shrink_control *sc, unsigned long nr_to_split)
548 : {
549 : LIST_HEAD(list), *pos, *next;
550 : LIST_HEAD(to_remove);
551 : struct inode *inode;
552 : struct shmem_inode_info *info;
553 : struct folio *folio;
554 : unsigned long batch = sc ? sc->nr_to_scan : 128;
555 : int split = 0;
556 :
557 : if (list_empty(&sbinfo->shrinklist))
558 : return SHRINK_STOP;
559 :
560 : spin_lock(&sbinfo->shrinklist_lock);
561 : list_for_each_safe(pos, next, &sbinfo->shrinklist) {
562 : info = list_entry(pos, struct shmem_inode_info, shrinklist);
563 :
564 : /* pin the inode */
565 : inode = igrab(&info->vfs_inode);
566 :
567 : /* inode is about to be evicted */
568 : if (!inode) {
569 : list_del_init(&info->shrinklist);
570 : goto next;
571 : }
572 :
573 : /* Check if there's anything to gain */
574 : if (round_up(inode->i_size, PAGE_SIZE) ==
575 : round_up(inode->i_size, HPAGE_PMD_SIZE)) {
576 : list_move(&info->shrinklist, &to_remove);
577 : goto next;
578 : }
579 :
580 : list_move(&info->shrinklist, &list);
581 : next:
582 : sbinfo->shrinklist_len--;
583 : if (!--batch)
584 : break;
585 : }
586 : spin_unlock(&sbinfo->shrinklist_lock);
587 :
588 : list_for_each_safe(pos, next, &to_remove) {
589 : info = list_entry(pos, struct shmem_inode_info, shrinklist);
590 : inode = &info->vfs_inode;
591 : list_del_init(&info->shrinklist);
592 : iput(inode);
593 : }
594 :
595 : list_for_each_safe(pos, next, &list) {
596 : int ret;
597 : pgoff_t index;
598 :
599 : info = list_entry(pos, struct shmem_inode_info, shrinklist);
600 : inode = &info->vfs_inode;
601 :
602 : if (nr_to_split && split >= nr_to_split)
603 : goto move_back;
604 :
605 : index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
606 : folio = filemap_get_folio(inode->i_mapping, index);
607 : if (IS_ERR(folio))
608 : goto drop;
609 :
610 : /* No huge page at the end of the file: nothing to split */
611 : if (!folio_test_large(folio)) {
612 : folio_put(folio);
613 : goto drop;
614 : }
615 :
616 : /*
617 : * Move the inode on the list back to shrinklist if we failed
618 : * to lock the page at this time.
619 : *
620 : * Waiting for the lock may lead to deadlock in the
621 : * reclaim path.
622 : */
623 : if (!folio_trylock(folio)) {
624 : folio_put(folio);
625 : goto move_back;
626 : }
627 :
628 : ret = split_folio(folio);
629 : folio_unlock(folio);
630 : folio_put(folio);
631 :
632 : /* If split failed move the inode on the list back to shrinklist */
633 : if (ret)
634 : goto move_back;
635 :
636 : split++;
637 : drop:
638 : list_del_init(&info->shrinklist);
639 : goto put;
640 : move_back:
641 : /*
642 : * Make sure the inode is either on the global list or deleted
643 : * from any local list before iput() since it could be deleted
644 : * in another thread once we put the inode (then the local list
645 : * is corrupted).
646 : */
647 : spin_lock(&sbinfo->shrinklist_lock);
648 : list_move(&info->shrinklist, &sbinfo->shrinklist);
649 : sbinfo->shrinklist_len++;
650 : spin_unlock(&sbinfo->shrinklist_lock);
651 : put:
652 : iput(inode);
653 : }
654 :
655 : return split;
656 : }
657 :
658 : static long shmem_unused_huge_scan(struct super_block *sb,
659 : struct shrink_control *sc)
660 : {
661 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
662 :
663 : if (!READ_ONCE(sbinfo->shrinklist_len))
664 : return SHRINK_STOP;
665 :
666 : return shmem_unused_huge_shrink(sbinfo, sc, 0);
667 : }
668 :
669 : static long shmem_unused_huge_count(struct super_block *sb,
670 : struct shrink_control *sc)
671 : {
672 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
673 : return READ_ONCE(sbinfo->shrinklist_len);
674 : }
675 : #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
676 :
677 : #define shmem_huge SHMEM_HUGE_DENY
678 :
679 0 : bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
680 : struct mm_struct *mm, unsigned long vm_flags)
681 : {
682 0 : return false;
683 : }
684 :
685 : static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
686 : struct shrink_control *sc, unsigned long nr_to_split)
687 : {
688 : return 0;
689 : }
690 : #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
691 :
692 : /*
693 : * Like filemap_add_folio, but error if expected item has gone.
694 : */
695 0 : static int shmem_add_to_page_cache(struct folio *folio,
696 : struct address_space *mapping,
697 : pgoff_t index, void *expected, gfp_t gfp,
698 : struct mm_struct *charge_mm)
699 : {
700 0 : XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
701 0 : long nr = folio_nr_pages(folio);
702 : int error;
703 :
704 : VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
705 : VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
706 : VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
707 : VM_BUG_ON(expected && folio_test_large(folio));
708 :
709 0 : folio_ref_add(folio, nr);
710 0 : folio->mapping = mapping;
711 0 : folio->index = index;
712 :
713 : if (!folio_test_swapcache(folio)) {
714 : error = mem_cgroup_charge(folio, charge_mm, gfp);
715 : if (error) {
716 : if (folio_test_pmd_mappable(folio)) {
717 : count_vm_event(THP_FILE_FALLBACK);
718 : count_vm_event(THP_FILE_FALLBACK_CHARGE);
719 : }
720 : goto error;
721 : }
722 : }
723 : folio_throttle_swaprate(folio, gfp);
724 :
725 : do {
726 0 : xas_lock_irq(&xas);
727 0 : if (expected != xas_find_conflict(&xas)) {
728 0 : xas_set_err(&xas, -EEXIST);
729 : goto unlock;
730 : }
731 0 : if (expected && xas_find_conflict(&xas)) {
732 0 : xas_set_err(&xas, -EEXIST);
733 : goto unlock;
734 : }
735 0 : xas_store(&xas, folio);
736 0 : if (xas_error(&xas))
737 : goto unlock;
738 0 : if (folio_test_pmd_mappable(folio)) {
739 : count_vm_event(THP_FILE_ALLOC);
740 : __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
741 : }
742 0 : mapping->nrpages += nr;
743 0 : __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
744 0 : __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
745 : unlock:
746 0 : xas_unlock_irq(&xas);
747 0 : } while (xas_nomem(&xas, gfp));
748 :
749 0 : if (xas_error(&xas)) {
750 0 : error = xas_error(&xas);
751 : goto error;
752 : }
753 :
754 : return 0;
755 : error:
756 0 : folio->mapping = NULL;
757 0 : folio_ref_sub(folio, nr);
758 : return error;
759 : }
760 :
761 : /*
762 : * Like delete_from_page_cache, but substitutes swap for @folio.
763 : */
764 0 : static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
765 : {
766 0 : struct address_space *mapping = folio->mapping;
767 0 : long nr = folio_nr_pages(folio);
768 : int error;
769 :
770 0 : xa_lock_irq(&mapping->i_pages);
771 0 : error = shmem_replace_entry(mapping, folio->index, folio, radswap);
772 0 : folio->mapping = NULL;
773 0 : mapping->nrpages -= nr;
774 0 : __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
775 0 : __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
776 0 : xa_unlock_irq(&mapping->i_pages);
777 0 : folio_put(folio);
778 0 : BUG_ON(error);
779 0 : }
780 :
781 : /*
782 : * Remove swap entry from page cache, free the swap and its page cache.
783 : */
784 0 : static int shmem_free_swap(struct address_space *mapping,
785 : pgoff_t index, void *radswap)
786 : {
787 : void *old;
788 :
789 0 : old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
790 0 : if (old != radswap)
791 : return -ENOENT;
792 0 : free_swap_and_cache(radix_to_swp_entry(radswap));
793 0 : return 0;
794 : }
795 :
796 : /*
797 : * Determine (in bytes) how many of the shmem object's pages mapped by the
798 : * given offsets are swapped out.
799 : *
800 : * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
801 : * as long as the inode doesn't go away and racy results are not a problem.
802 : */
803 0 : unsigned long shmem_partial_swap_usage(struct address_space *mapping,
804 : pgoff_t start, pgoff_t end)
805 : {
806 0 : XA_STATE(xas, &mapping->i_pages, start);
807 : struct page *page;
808 0 : unsigned long swapped = 0;
809 :
810 : rcu_read_lock();
811 0 : xas_for_each(&xas, page, end - 1) {
812 0 : if (xas_retry(&xas, page))
813 0 : continue;
814 0 : if (xa_is_value(page))
815 0 : swapped++;
816 :
817 0 : if (need_resched()) {
818 0 : xas_pause(&xas);
819 : cond_resched_rcu();
820 : }
821 : }
822 :
823 : rcu_read_unlock();
824 :
825 0 : return swapped << PAGE_SHIFT;
826 : }
827 :
828 : /*
829 : * Determine (in bytes) how many of the shmem object's pages mapped by the
830 : * given vma are swapped out.
831 : *
832 : * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
833 : * as long as the inode doesn't go away and racy results are not a problem.
834 : */
835 0 : unsigned long shmem_swap_usage(struct vm_area_struct *vma)
836 : {
837 0 : struct inode *inode = file_inode(vma->vm_file);
838 0 : struct shmem_inode_info *info = SHMEM_I(inode);
839 0 : struct address_space *mapping = inode->i_mapping;
840 : unsigned long swapped;
841 :
842 : /* Be careful as we don't hold info->lock */
843 0 : swapped = READ_ONCE(info->swapped);
844 :
845 : /*
846 : * The easier cases are when the shmem object has nothing in swap, or
847 : * the vma maps it whole. Then we can simply use the stats that we
848 : * already track.
849 : */
850 0 : if (!swapped)
851 : return 0;
852 :
853 0 : if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
854 0 : return swapped << PAGE_SHIFT;
855 :
856 : /* Here comes the more involved part */
857 0 : return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
858 0 : vma->vm_pgoff + vma_pages(vma));
859 : }
860 :
861 : /*
862 : * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
863 : */
864 0 : void shmem_unlock_mapping(struct address_space *mapping)
865 : {
866 : struct folio_batch fbatch;
867 0 : pgoff_t index = 0;
868 :
869 0 : folio_batch_init(&fbatch);
870 : /*
871 : * Minor point, but we might as well stop if someone else SHM_LOCKs it.
872 : */
873 0 : while (!mapping_unevictable(mapping) &&
874 0 : filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
875 0 : check_move_unevictable_folios(&fbatch);
876 0 : folio_batch_release(&fbatch);
877 0 : cond_resched();
878 : }
879 0 : }
880 :
881 0 : static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
882 : {
883 : struct folio *folio;
884 :
885 : /*
886 : * At first avoid shmem_get_folio(,,,SGP_READ): that fails
887 : * beyond i_size, and reports fallocated folios as holes.
888 : */
889 0 : folio = filemap_get_entry(inode->i_mapping, index);
890 0 : if (!folio)
891 : return folio;
892 0 : if (!xa_is_value(folio)) {
893 0 : folio_lock(folio);
894 0 : if (folio->mapping == inode->i_mapping)
895 : return folio;
896 : /* The folio has been swapped out */
897 0 : folio_unlock(folio);
898 0 : folio_put(folio);
899 : }
900 : /*
901 : * But read a folio back from swap if any of it is within i_size
902 : * (although in some cases this is just a waste of time).
903 : */
904 0 : folio = NULL;
905 0 : shmem_get_folio(inode, index, &folio, SGP_READ);
906 0 : return folio;
907 : }
908 :
909 : /*
910 : * Remove range of pages and swap entries from page cache, and free them.
911 : * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
912 : */
913 0 : static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
914 : bool unfalloc)
915 : {
916 0 : struct address_space *mapping = inode->i_mapping;
917 0 : struct shmem_inode_info *info = SHMEM_I(inode);
918 0 : pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
919 0 : pgoff_t end = (lend + 1) >> PAGE_SHIFT;
920 : struct folio_batch fbatch;
921 : pgoff_t indices[PAGEVEC_SIZE];
922 : struct folio *folio;
923 : bool same_folio;
924 0 : long nr_swaps_freed = 0;
925 : pgoff_t index;
926 : int i;
927 :
928 0 : if (lend == -1)
929 0 : end = -1; /* unsigned, so actually very big */
930 :
931 0 : if (info->fallocend > start && info->fallocend <= end && !unfalloc)
932 0 : info->fallocend = start;
933 :
934 0 : folio_batch_init(&fbatch);
935 0 : index = start;
936 0 : while (index < end && find_lock_entries(mapping, &index, end - 1,
937 : &fbatch, indices)) {
938 0 : for (i = 0; i < folio_batch_count(&fbatch); i++) {
939 0 : folio = fbatch.folios[i];
940 :
941 0 : if (xa_is_value(folio)) {
942 0 : if (unfalloc)
943 0 : continue;
944 0 : nr_swaps_freed += !shmem_free_swap(mapping,
945 : indices[i], folio);
946 0 : continue;
947 : }
948 :
949 0 : if (!unfalloc || !folio_test_uptodate(folio))
950 0 : truncate_inode_folio(mapping, folio);
951 0 : folio_unlock(folio);
952 : }
953 0 : folio_batch_remove_exceptionals(&fbatch);
954 0 : folio_batch_release(&fbatch);
955 0 : cond_resched();
956 : }
957 :
958 : /*
959 : * When undoing a failed fallocate, we want none of the partial folio
960 : * zeroing and splitting below, but shall want to truncate the whole
961 : * folio when !uptodate indicates that it was added by this fallocate,
962 : * even when [lstart, lend] covers only a part of the folio.
963 : */
964 0 : if (unfalloc)
965 : goto whole_folios;
966 :
967 0 : same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
968 0 : folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
969 0 : if (folio) {
970 0 : same_folio = lend < folio_pos(folio) + folio_size(folio);
971 0 : folio_mark_dirty(folio);
972 0 : if (!truncate_inode_partial_folio(folio, lstart, lend)) {
973 0 : start = folio->index + folio_nr_pages(folio);
974 0 : if (same_folio)
975 0 : end = folio->index;
976 : }
977 0 : folio_unlock(folio);
978 : folio_put(folio);
979 : folio = NULL;
980 : }
981 :
982 0 : if (!same_folio)
983 0 : folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
984 0 : if (folio) {
985 0 : folio_mark_dirty(folio);
986 0 : if (!truncate_inode_partial_folio(folio, lstart, lend))
987 0 : end = folio->index;
988 0 : folio_unlock(folio);
989 : folio_put(folio);
990 : }
991 :
992 : whole_folios:
993 :
994 0 : index = start;
995 0 : while (index < end) {
996 0 : cond_resched();
997 :
998 0 : if (!find_get_entries(mapping, &index, end - 1, &fbatch,
999 : indices)) {
1000 : /* If all gone or hole-punch or unfalloc, we're done */
1001 0 : if (index == start || end != -1)
1002 : break;
1003 : /* But if truncating, restart to make sure all gone */
1004 0 : index = start;
1005 0 : continue;
1006 : }
1007 0 : for (i = 0; i < folio_batch_count(&fbatch); i++) {
1008 0 : folio = fbatch.folios[i];
1009 :
1010 0 : if (xa_is_value(folio)) {
1011 0 : if (unfalloc)
1012 0 : continue;
1013 0 : if (shmem_free_swap(mapping, indices[i], folio)) {
1014 : /* Swap was replaced by page: retry */
1015 0 : index = indices[i];
1016 0 : break;
1017 : }
1018 0 : nr_swaps_freed++;
1019 0 : continue;
1020 : }
1021 :
1022 0 : folio_lock(folio);
1023 :
1024 0 : if (!unfalloc || !folio_test_uptodate(folio)) {
1025 0 : if (folio_mapping(folio) != mapping) {
1026 : /* Page was replaced by swap: retry */
1027 0 : folio_unlock(folio);
1028 0 : index = indices[i];
1029 0 : break;
1030 : }
1031 : VM_BUG_ON_FOLIO(folio_test_writeback(folio),
1032 : folio);
1033 0 : truncate_inode_folio(mapping, folio);
1034 : }
1035 0 : folio_unlock(folio);
1036 : }
1037 0 : folio_batch_remove_exceptionals(&fbatch);
1038 : folio_batch_release(&fbatch);
1039 : }
1040 :
1041 0 : spin_lock_irq(&info->lock);
1042 0 : info->swapped -= nr_swaps_freed;
1043 0 : shmem_recalc_inode(inode);
1044 0 : spin_unlock_irq(&info->lock);
1045 0 : }
1046 :
1047 0 : void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
1048 : {
1049 0 : shmem_undo_range(inode, lstart, lend, false);
1050 0 : inode->i_ctime = inode->i_mtime = current_time(inode);
1051 0 : inode_inc_iversion(inode);
1052 0 : }
1053 : EXPORT_SYMBOL_GPL(shmem_truncate_range);
1054 :
1055 0 : static int shmem_getattr(struct mnt_idmap *idmap,
1056 : const struct path *path, struct kstat *stat,
1057 : u32 request_mask, unsigned int query_flags)
1058 : {
1059 0 : struct inode *inode = path->dentry->d_inode;
1060 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1061 :
1062 0 : if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
1063 0 : spin_lock_irq(&info->lock);
1064 0 : shmem_recalc_inode(inode);
1065 0 : spin_unlock_irq(&info->lock);
1066 : }
1067 0 : if (info->fsflags & FS_APPEND_FL)
1068 0 : stat->attributes |= STATX_ATTR_APPEND;
1069 0 : if (info->fsflags & FS_IMMUTABLE_FL)
1070 0 : stat->attributes |= STATX_ATTR_IMMUTABLE;
1071 0 : if (info->fsflags & FS_NODUMP_FL)
1072 0 : stat->attributes |= STATX_ATTR_NODUMP;
1073 0 : stat->attributes_mask |= (STATX_ATTR_APPEND |
1074 : STATX_ATTR_IMMUTABLE |
1075 : STATX_ATTR_NODUMP);
1076 0 : generic_fillattr(idmap, inode, stat);
1077 :
1078 0 : if (shmem_is_huge(inode, 0, false, NULL, 0))
1079 : stat->blksize = HPAGE_PMD_SIZE;
1080 :
1081 0 : if (request_mask & STATX_BTIME) {
1082 0 : stat->result_mask |= STATX_BTIME;
1083 0 : stat->btime.tv_sec = info->i_crtime.tv_sec;
1084 0 : stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1085 : }
1086 :
1087 0 : return 0;
1088 : }
1089 :
1090 0 : static int shmem_setattr(struct mnt_idmap *idmap,
1091 : struct dentry *dentry, struct iattr *attr)
1092 : {
1093 0 : struct inode *inode = d_inode(dentry);
1094 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1095 : int error;
1096 0 : bool update_mtime = false;
1097 0 : bool update_ctime = true;
1098 :
1099 0 : error = setattr_prepare(idmap, dentry, attr);
1100 0 : if (error)
1101 : return error;
1102 :
1103 0 : if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
1104 0 : if ((inode->i_mode ^ attr->ia_mode) & 0111) {
1105 : return -EPERM;
1106 : }
1107 : }
1108 :
1109 0 : if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1110 0 : loff_t oldsize = inode->i_size;
1111 0 : loff_t newsize = attr->ia_size;
1112 :
1113 : /* protected by i_rwsem */
1114 0 : if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1115 0 : (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1116 : return -EPERM;
1117 :
1118 0 : if (newsize != oldsize) {
1119 0 : error = shmem_reacct_size(SHMEM_I(inode)->flags,
1120 : oldsize, newsize);
1121 0 : if (error)
1122 : return error;
1123 0 : i_size_write(inode, newsize);
1124 0 : update_mtime = true;
1125 : } else {
1126 : update_ctime = false;
1127 : }
1128 0 : if (newsize <= oldsize) {
1129 0 : loff_t holebegin = round_up(newsize, PAGE_SIZE);
1130 0 : if (oldsize > holebegin)
1131 0 : unmap_mapping_range(inode->i_mapping,
1132 : holebegin, 0, 1);
1133 0 : if (info->alloced)
1134 0 : shmem_truncate_range(inode,
1135 : newsize, (loff_t)-1);
1136 : /* unmap again to remove racily COWed private pages */
1137 0 : if (oldsize > holebegin)
1138 0 : unmap_mapping_range(inode->i_mapping,
1139 : holebegin, 0, 1);
1140 : }
1141 : }
1142 :
1143 0 : setattr_copy(idmap, inode, attr);
1144 0 : if (attr->ia_valid & ATTR_MODE)
1145 0 : error = posix_acl_chmod(idmap, dentry, inode->i_mode);
1146 0 : if (!error && update_ctime) {
1147 0 : inode->i_ctime = current_time(inode);
1148 0 : if (update_mtime)
1149 0 : inode->i_mtime = inode->i_ctime;
1150 : inode_inc_iversion(inode);
1151 : }
1152 : return error;
1153 : }
1154 :
1155 0 : static void shmem_evict_inode(struct inode *inode)
1156 : {
1157 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1158 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1159 :
1160 0 : if (shmem_mapping(inode->i_mapping)) {
1161 0 : shmem_unacct_size(info->flags, inode->i_size);
1162 0 : inode->i_size = 0;
1163 0 : mapping_set_exiting(inode->i_mapping);
1164 0 : shmem_truncate_range(inode, 0, (loff_t)-1);
1165 0 : if (!list_empty(&info->shrinklist)) {
1166 0 : spin_lock(&sbinfo->shrinklist_lock);
1167 0 : if (!list_empty(&info->shrinklist)) {
1168 0 : list_del_init(&info->shrinklist);
1169 0 : sbinfo->shrinklist_len--;
1170 : }
1171 0 : spin_unlock(&sbinfo->shrinklist_lock);
1172 : }
1173 0 : while (!list_empty(&info->swaplist)) {
1174 : /* Wait while shmem_unuse() is scanning this inode... */
1175 0 : wait_var_event(&info->stop_eviction,
1176 : !atomic_read(&info->stop_eviction));
1177 0 : mutex_lock(&shmem_swaplist_mutex);
1178 : /* ...but beware of the race if we peeked too early */
1179 0 : if (!atomic_read(&info->stop_eviction))
1180 0 : list_del_init(&info->swaplist);
1181 0 : mutex_unlock(&shmem_swaplist_mutex);
1182 : }
1183 : }
1184 :
1185 0 : simple_xattrs_free(&info->xattrs);
1186 0 : WARN_ON(inode->i_blocks);
1187 0 : shmem_free_inode(inode->i_sb);
1188 0 : clear_inode(inode);
1189 0 : }
1190 :
1191 0 : static int shmem_find_swap_entries(struct address_space *mapping,
1192 : pgoff_t start, struct folio_batch *fbatch,
1193 : pgoff_t *indices, unsigned int type)
1194 : {
1195 0 : XA_STATE(xas, &mapping->i_pages, start);
1196 : struct folio *folio;
1197 : swp_entry_t entry;
1198 :
1199 : rcu_read_lock();
1200 0 : xas_for_each(&xas, folio, ULONG_MAX) {
1201 0 : if (xas_retry(&xas, folio))
1202 0 : continue;
1203 :
1204 0 : if (!xa_is_value(folio))
1205 0 : continue;
1206 :
1207 0 : entry = radix_to_swp_entry(folio);
1208 : /*
1209 : * swapin error entries can be found in the mapping. But they're
1210 : * deliberately ignored here as we've done everything we can do.
1211 : */
1212 0 : if (swp_type(entry) != type)
1213 0 : continue;
1214 :
1215 0 : indices[folio_batch_count(fbatch)] = xas.xa_index;
1216 0 : if (!folio_batch_add(fbatch, folio))
1217 : break;
1218 :
1219 0 : if (need_resched()) {
1220 0 : xas_pause(&xas);
1221 : cond_resched_rcu();
1222 : }
1223 : }
1224 : rcu_read_unlock();
1225 :
1226 0 : return xas.xa_index;
1227 : }
1228 :
1229 : /*
1230 : * Move the swapped pages for an inode to page cache. Returns the count
1231 : * of pages swapped in, or the error in case of failure.
1232 : */
1233 0 : static int shmem_unuse_swap_entries(struct inode *inode,
1234 : struct folio_batch *fbatch, pgoff_t *indices)
1235 : {
1236 0 : int i = 0;
1237 0 : int ret = 0;
1238 0 : int error = 0;
1239 0 : struct address_space *mapping = inode->i_mapping;
1240 :
1241 0 : for (i = 0; i < folio_batch_count(fbatch); i++) {
1242 0 : struct folio *folio = fbatch->folios[i];
1243 :
1244 0 : if (!xa_is_value(folio))
1245 0 : continue;
1246 0 : error = shmem_swapin_folio(inode, indices[i],
1247 : &folio, SGP_CACHE,
1248 : mapping_gfp_mask(mapping),
1249 : NULL, NULL);
1250 0 : if (error == 0) {
1251 0 : folio_unlock(folio);
1252 0 : folio_put(folio);
1253 0 : ret++;
1254 : }
1255 0 : if (error == -ENOMEM)
1256 : break;
1257 0 : error = 0;
1258 : }
1259 0 : return error ? error : ret;
1260 : }
1261 :
1262 : /*
1263 : * If swap found in inode, free it and move page from swapcache to filecache.
1264 : */
1265 0 : static int shmem_unuse_inode(struct inode *inode, unsigned int type)
1266 : {
1267 0 : struct address_space *mapping = inode->i_mapping;
1268 0 : pgoff_t start = 0;
1269 : struct folio_batch fbatch;
1270 : pgoff_t indices[PAGEVEC_SIZE];
1271 0 : int ret = 0;
1272 :
1273 : do {
1274 0 : folio_batch_init(&fbatch);
1275 0 : shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
1276 0 : if (folio_batch_count(&fbatch) == 0) {
1277 : ret = 0;
1278 : break;
1279 : }
1280 :
1281 0 : ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
1282 0 : if (ret < 0)
1283 : break;
1284 :
1285 0 : start = indices[folio_batch_count(&fbatch) - 1];
1286 : } while (true);
1287 :
1288 0 : return ret;
1289 : }
1290 :
1291 : /*
1292 : * Read all the shared memory data that resides in the swap
1293 : * device 'type' back into memory, so the swap device can be
1294 : * unused.
1295 : */
1296 0 : int shmem_unuse(unsigned int type)
1297 : {
1298 : struct shmem_inode_info *info, *next;
1299 0 : int error = 0;
1300 :
1301 0 : if (list_empty(&shmem_swaplist))
1302 : return 0;
1303 :
1304 0 : mutex_lock(&shmem_swaplist_mutex);
1305 0 : list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1306 0 : if (!info->swapped) {
1307 0 : list_del_init(&info->swaplist);
1308 0 : continue;
1309 : }
1310 : /*
1311 : * Drop the swaplist mutex while searching the inode for swap;
1312 : * but before doing so, make sure shmem_evict_inode() will not
1313 : * remove placeholder inode from swaplist, nor let it be freed
1314 : * (igrab() would protect from unlink, but not from unmount).
1315 : */
1316 0 : atomic_inc(&info->stop_eviction);
1317 0 : mutex_unlock(&shmem_swaplist_mutex);
1318 :
1319 0 : error = shmem_unuse_inode(&info->vfs_inode, type);
1320 0 : cond_resched();
1321 :
1322 0 : mutex_lock(&shmem_swaplist_mutex);
1323 0 : next = list_next_entry(info, swaplist);
1324 0 : if (!info->swapped)
1325 0 : list_del_init(&info->swaplist);
1326 0 : if (atomic_dec_and_test(&info->stop_eviction))
1327 0 : wake_up_var(&info->stop_eviction);
1328 0 : if (error)
1329 : break;
1330 : }
1331 0 : mutex_unlock(&shmem_swaplist_mutex);
1332 :
1333 0 : return error;
1334 : }
1335 :
1336 : /*
1337 : * Move the page from the page cache to the swap cache.
1338 : */
1339 0 : static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1340 : {
1341 0 : struct folio *folio = page_folio(page);
1342 0 : struct address_space *mapping = folio->mapping;
1343 0 : struct inode *inode = mapping->host;
1344 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1345 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1346 : swp_entry_t swap;
1347 : pgoff_t index;
1348 :
1349 : /*
1350 : * Our capabilities prevent regular writeback or sync from ever calling
1351 : * shmem_writepage; but a stacking filesystem might use ->writepage of
1352 : * its underlying filesystem, in which case tmpfs should write out to
1353 : * swap only in response to memory pressure, and not for the writeback
1354 : * threads or sync.
1355 : */
1356 0 : if (WARN_ON_ONCE(!wbc->for_reclaim))
1357 : goto redirty;
1358 :
1359 0 : if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
1360 : goto redirty;
1361 :
1362 0 : if (!total_swap_pages)
1363 : goto redirty;
1364 :
1365 : /*
1366 : * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
1367 : * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
1368 : * and its shmem_writeback() needs them to be split when swapping.
1369 : */
1370 0 : if (folio_test_large(folio)) {
1371 : /* Ensure the subpages are still dirty */
1372 0 : folio_test_set_dirty(folio);
1373 0 : if (split_huge_page(page) < 0)
1374 : goto redirty;
1375 0 : folio = page_folio(page);
1376 : folio_clear_dirty(folio);
1377 : }
1378 :
1379 0 : index = folio->index;
1380 :
1381 : /*
1382 : * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1383 : * value into swapfile.c, the only way we can correctly account for a
1384 : * fallocated folio arriving here is now to initialize it and write it.
1385 : *
1386 : * That's okay for a folio already fallocated earlier, but if we have
1387 : * not yet completed the fallocation, then (a) we want to keep track
1388 : * of this folio in case we have to undo it, and (b) it may not be a
1389 : * good idea to continue anyway, once we're pushing into swap. So
1390 : * reactivate the folio, and let shmem_fallocate() quit when too many.
1391 : */
1392 0 : if (!folio_test_uptodate(folio)) {
1393 0 : if (inode->i_private) {
1394 : struct shmem_falloc *shmem_falloc;
1395 0 : spin_lock(&inode->i_lock);
1396 0 : shmem_falloc = inode->i_private;
1397 0 : if (shmem_falloc &&
1398 0 : !shmem_falloc->waitq &&
1399 0 : index >= shmem_falloc->start &&
1400 0 : index < shmem_falloc->next)
1401 0 : shmem_falloc->nr_unswapped++;
1402 : else
1403 : shmem_falloc = NULL;
1404 0 : spin_unlock(&inode->i_lock);
1405 0 : if (shmem_falloc)
1406 : goto redirty;
1407 : }
1408 0 : folio_zero_range(folio, 0, folio_size(folio));
1409 0 : flush_dcache_folio(folio);
1410 : folio_mark_uptodate(folio);
1411 : }
1412 :
1413 0 : swap = folio_alloc_swap(folio);
1414 0 : if (!swap.val)
1415 : goto redirty;
1416 :
1417 : /*
1418 : * Add inode to shmem_unuse()'s list of swapped-out inodes,
1419 : * if it's not already there. Do it now before the folio is
1420 : * moved to swap cache, when its pagelock no longer protects
1421 : * the inode from eviction. But don't unlock the mutex until
1422 : * we've incremented swapped, because shmem_unuse_inode() will
1423 : * prune a !swapped inode from the swaplist under this mutex.
1424 : */
1425 0 : mutex_lock(&shmem_swaplist_mutex);
1426 0 : if (list_empty(&info->swaplist))
1427 0 : list_add(&info->swaplist, &shmem_swaplist);
1428 :
1429 0 : if (add_to_swap_cache(folio, swap,
1430 : __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
1431 : NULL) == 0) {
1432 0 : spin_lock_irq(&info->lock);
1433 0 : shmem_recalc_inode(inode);
1434 0 : info->swapped++;
1435 0 : spin_unlock_irq(&info->lock);
1436 :
1437 0 : swap_shmem_alloc(swap);
1438 0 : shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
1439 :
1440 0 : mutex_unlock(&shmem_swaplist_mutex);
1441 0 : BUG_ON(folio_mapped(folio));
1442 0 : swap_writepage(&folio->page, wbc);
1443 0 : return 0;
1444 : }
1445 :
1446 0 : mutex_unlock(&shmem_swaplist_mutex);
1447 0 : put_swap_folio(folio, swap);
1448 : redirty:
1449 0 : folio_mark_dirty(folio);
1450 0 : if (wbc->for_reclaim)
1451 : return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
1452 0 : folio_unlock(folio);
1453 0 : return 0;
1454 : }
1455 :
1456 : #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1457 : static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1458 : {
1459 : char buffer[64];
1460 :
1461 : if (!mpol || mpol->mode == MPOL_DEFAULT)
1462 : return; /* show nothing */
1463 :
1464 : mpol_to_str(buffer, sizeof(buffer), mpol);
1465 :
1466 : seq_printf(seq, ",mpol=%s", buffer);
1467 : }
1468 :
1469 : static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1470 : {
1471 : struct mempolicy *mpol = NULL;
1472 : if (sbinfo->mpol) {
1473 : raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
1474 : mpol = sbinfo->mpol;
1475 : mpol_get(mpol);
1476 : raw_spin_unlock(&sbinfo->stat_lock);
1477 : }
1478 : return mpol;
1479 : }
1480 : #else /* !CONFIG_NUMA || !CONFIG_TMPFS */
1481 : static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1482 : {
1483 : }
1484 : static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1485 : {
1486 : return NULL;
1487 : }
1488 : #endif /* CONFIG_NUMA && CONFIG_TMPFS */
1489 : #ifndef CONFIG_NUMA
1490 : #define vm_policy vm_private_data
1491 : #endif
1492 :
1493 : static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
1494 : struct shmem_inode_info *info, pgoff_t index)
1495 : {
1496 : /* Create a pseudo vma that just contains the policy */
1497 0 : vma_init(vma, NULL);
1498 : /* Bias interleave by inode number to distribute better across nodes */
1499 0 : vma->vm_pgoff = index + info->vfs_inode.i_ino;
1500 0 : vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1501 : }
1502 :
1503 : static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
1504 : {
1505 : /* Drop reference taken by mpol_shared_policy_lookup() */
1506 0 : mpol_cond_put(vma->vm_policy);
1507 : }
1508 :
1509 0 : static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1510 : struct shmem_inode_info *info, pgoff_t index)
1511 : {
1512 : struct vm_area_struct pvma;
1513 : struct page *page;
1514 0 : struct vm_fault vmf = {
1515 : .vma = &pvma,
1516 : };
1517 :
1518 0 : shmem_pseudo_vma_init(&pvma, info, index);
1519 0 : page = swap_cluster_readahead(swap, gfp, &vmf);
1520 0 : shmem_pseudo_vma_destroy(&pvma);
1521 :
1522 0 : if (!page)
1523 : return NULL;
1524 0 : return page_folio(page);
1525 : }
1526 :
1527 : /*
1528 : * Make sure huge_gfp is always more limited than limit_gfp.
1529 : * Some of the flags set permissions, while others set limitations.
1530 : */
1531 : static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1532 : {
1533 : gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1534 : gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1535 : gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1536 : gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1537 :
1538 : /* Allow allocations only from the originally specified zones. */
1539 : result |= zoneflags;
1540 :
1541 : /*
1542 : * Minimize the result gfp by taking the union with the deny flags,
1543 : * and the intersection of the allow flags.
1544 : */
1545 : result |= (limit_gfp & denyflags);
1546 : result |= (huge_gfp & limit_gfp) & allowflags;
1547 :
1548 : return result;
1549 : }
1550 :
1551 : static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
1552 : struct shmem_inode_info *info, pgoff_t index)
1553 : {
1554 : struct vm_area_struct pvma;
1555 : struct address_space *mapping = info->vfs_inode.i_mapping;
1556 : pgoff_t hindex;
1557 : struct folio *folio;
1558 :
1559 : hindex = round_down(index, HPAGE_PMD_NR);
1560 : if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1561 : XA_PRESENT))
1562 : return NULL;
1563 :
1564 : shmem_pseudo_vma_init(&pvma, info, hindex);
1565 : folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
1566 : shmem_pseudo_vma_destroy(&pvma);
1567 : if (!folio)
1568 : count_vm_event(THP_FILE_FALLBACK);
1569 : return folio;
1570 : }
1571 :
1572 0 : static struct folio *shmem_alloc_folio(gfp_t gfp,
1573 : struct shmem_inode_info *info, pgoff_t index)
1574 : {
1575 : struct vm_area_struct pvma;
1576 : struct folio *folio;
1577 :
1578 0 : shmem_pseudo_vma_init(&pvma, info, index);
1579 0 : folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
1580 0 : shmem_pseudo_vma_destroy(&pvma);
1581 :
1582 0 : return folio;
1583 : }
1584 :
1585 0 : static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
1586 : pgoff_t index, bool huge)
1587 : {
1588 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1589 : struct folio *folio;
1590 : int nr;
1591 0 : int err = -ENOSPC;
1592 :
1593 : if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1594 0 : huge = false;
1595 0 : nr = huge ? HPAGE_PMD_NR : 1;
1596 :
1597 0 : if (!shmem_inode_acct_block(inode, nr))
1598 : goto failed;
1599 :
1600 : if (huge)
1601 : folio = shmem_alloc_hugefolio(gfp, info, index);
1602 : else
1603 0 : folio = shmem_alloc_folio(gfp, info, index);
1604 0 : if (folio) {
1605 0 : __folio_set_locked(folio);
1606 0 : __folio_set_swapbacked(folio);
1607 0 : return folio;
1608 : }
1609 :
1610 0 : err = -ENOMEM;
1611 0 : shmem_inode_unacct_blocks(inode, nr);
1612 : failed:
1613 0 : return ERR_PTR(err);
1614 : }
1615 :
1616 : /*
1617 : * When a page is moved from swapcache to shmem filecache (either by the
1618 : * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
1619 : * shmem_unuse_inode()), it may have been read in earlier from swap, in
1620 : * ignorance of the mapping it belongs to. If that mapping has special
1621 : * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1622 : * we may need to copy to a suitable page before moving to filecache.
1623 : *
1624 : * In a future release, this may well be extended to respect cpuset and
1625 : * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1626 : * but for now it is a simple matter of zone.
1627 : */
1628 : static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
1629 : {
1630 0 : return folio_zonenum(folio) > gfp_zone(gfp);
1631 : }
1632 :
1633 0 : static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
1634 : struct shmem_inode_info *info, pgoff_t index)
1635 : {
1636 : struct folio *old, *new;
1637 : struct address_space *swap_mapping;
1638 : swp_entry_t entry;
1639 : pgoff_t swap_index;
1640 : int error;
1641 :
1642 0 : old = *foliop;
1643 0 : entry = folio_swap_entry(old);
1644 0 : swap_index = swp_offset(entry);
1645 0 : swap_mapping = swap_address_space(entry);
1646 :
1647 : /*
1648 : * We have arrived here because our zones are constrained, so don't
1649 : * limit chance of success by further cpuset and node constraints.
1650 : */
1651 0 : gfp &= ~GFP_CONSTRAINT_MASK;
1652 : VM_BUG_ON_FOLIO(folio_test_large(old), old);
1653 0 : new = shmem_alloc_folio(gfp, info, index);
1654 0 : if (!new)
1655 : return -ENOMEM;
1656 :
1657 0 : folio_get(new);
1658 0 : folio_copy(new, old);
1659 0 : flush_dcache_folio(new);
1660 :
1661 0 : __folio_set_locked(new);
1662 0 : __folio_set_swapbacked(new);
1663 0 : folio_mark_uptodate(new);
1664 0 : folio_set_swap_entry(new, entry);
1665 0 : folio_set_swapcache(new);
1666 :
1667 : /*
1668 : * Our caller will very soon move newpage out of swapcache, but it's
1669 : * a nice clean interface for us to replace oldpage by newpage there.
1670 : */
1671 0 : xa_lock_irq(&swap_mapping->i_pages);
1672 0 : error = shmem_replace_entry(swap_mapping, swap_index, old, new);
1673 0 : if (!error) {
1674 0 : mem_cgroup_migrate(old, new);
1675 0 : __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
1676 0 : __lruvec_stat_mod_folio(new, NR_SHMEM, 1);
1677 0 : __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
1678 0 : __lruvec_stat_mod_folio(old, NR_SHMEM, -1);
1679 : }
1680 0 : xa_unlock_irq(&swap_mapping->i_pages);
1681 :
1682 0 : if (unlikely(error)) {
1683 : /*
1684 : * Is this possible? I think not, now that our callers check
1685 : * both PageSwapCache and page_private after getting page lock;
1686 : * but be defensive. Reverse old to newpage for clear and free.
1687 : */
1688 : old = new;
1689 : } else {
1690 0 : folio_add_lru(new);
1691 0 : *foliop = new;
1692 : }
1693 :
1694 0 : folio_clear_swapcache(old);
1695 0 : old->private = NULL;
1696 :
1697 0 : folio_unlock(old);
1698 : folio_put_refs(old, 2);
1699 : return error;
1700 : }
1701 :
1702 0 : static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
1703 : struct folio *folio, swp_entry_t swap)
1704 : {
1705 0 : struct address_space *mapping = inode->i_mapping;
1706 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1707 : swp_entry_t swapin_error;
1708 : void *old;
1709 :
1710 : swapin_error = make_swapin_error_entry();
1711 0 : old = xa_cmpxchg_irq(&mapping->i_pages, index,
1712 : swp_to_radix_entry(swap),
1713 : swp_to_radix_entry(swapin_error), 0);
1714 0 : if (old != swp_to_radix_entry(swap))
1715 : return;
1716 :
1717 0 : folio_wait_writeback(folio);
1718 0 : delete_from_swap_cache(folio);
1719 0 : spin_lock_irq(&info->lock);
1720 : /*
1721 : * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
1722 : * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
1723 : * shmem_evict_inode.
1724 : */
1725 0 : info->alloced--;
1726 0 : info->swapped--;
1727 0 : shmem_recalc_inode(inode);
1728 0 : spin_unlock_irq(&info->lock);
1729 0 : swap_free(swap);
1730 : }
1731 :
1732 : /*
1733 : * Swap in the folio pointed to by *foliop.
1734 : * Caller has to make sure that *foliop contains a valid swapped folio.
1735 : * Returns 0 and the folio in foliop if success. On failure, returns the
1736 : * error code and NULL in *foliop.
1737 : */
1738 0 : static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
1739 : struct folio **foliop, enum sgp_type sgp,
1740 : gfp_t gfp, struct vm_area_struct *vma,
1741 : vm_fault_t *fault_type)
1742 : {
1743 0 : struct address_space *mapping = inode->i_mapping;
1744 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1745 0 : struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
1746 : struct swap_info_struct *si;
1747 0 : struct folio *folio = NULL;
1748 : swp_entry_t swap;
1749 : int error;
1750 :
1751 : VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
1752 0 : swap = radix_to_swp_entry(*foliop);
1753 0 : *foliop = NULL;
1754 :
1755 0 : if (is_swapin_error_entry(swap))
1756 : return -EIO;
1757 :
1758 0 : si = get_swap_device(swap);
1759 0 : if (!si) {
1760 0 : if (!shmem_confirm_swap(mapping, index, swap))
1761 : return -EEXIST;
1762 : else
1763 : return -EINVAL;
1764 : }
1765 :
1766 : /* Look it up and read it in.. */
1767 0 : folio = swap_cache_get_folio(swap, NULL, 0);
1768 0 : if (!folio) {
1769 : /* Or update major stats only when swapin succeeds?? */
1770 0 : if (fault_type) {
1771 0 : *fault_type |= VM_FAULT_MAJOR;
1772 0 : count_vm_event(PGMAJFAULT);
1773 0 : count_memcg_event_mm(charge_mm, PGMAJFAULT);
1774 : }
1775 : /* Here we actually start the io */
1776 0 : folio = shmem_swapin(swap, gfp, info, index);
1777 0 : if (!folio) {
1778 : error = -ENOMEM;
1779 : goto failed;
1780 : }
1781 : }
1782 :
1783 : /* We have to do this with folio locked to prevent races */
1784 0 : folio_lock(folio);
1785 0 : if (!folio_test_swapcache(folio) ||
1786 0 : folio_swap_entry(folio).val != swap.val ||
1787 0 : !shmem_confirm_swap(mapping, index, swap)) {
1788 : error = -EEXIST;
1789 : goto unlock;
1790 : }
1791 0 : if (!folio_test_uptodate(folio)) {
1792 : error = -EIO;
1793 : goto failed;
1794 : }
1795 0 : folio_wait_writeback(folio);
1796 :
1797 : /*
1798 : * Some architectures may have to restore extra metadata to the
1799 : * folio after reading from swap.
1800 : */
1801 0 : arch_swap_restore(swap, folio);
1802 :
1803 0 : if (shmem_should_replace_folio(folio, gfp)) {
1804 0 : error = shmem_replace_folio(&folio, gfp, info, index);
1805 0 : if (error)
1806 : goto failed;
1807 : }
1808 :
1809 0 : error = shmem_add_to_page_cache(folio, mapping, index,
1810 : swp_to_radix_entry(swap), gfp,
1811 : charge_mm);
1812 0 : if (error)
1813 : goto failed;
1814 :
1815 0 : spin_lock_irq(&info->lock);
1816 0 : info->swapped--;
1817 0 : shmem_recalc_inode(inode);
1818 0 : spin_unlock_irq(&info->lock);
1819 :
1820 0 : if (sgp == SGP_WRITE)
1821 0 : folio_mark_accessed(folio);
1822 :
1823 0 : delete_from_swap_cache(folio);
1824 0 : folio_mark_dirty(folio);
1825 0 : swap_free(swap);
1826 0 : put_swap_device(si);
1827 :
1828 0 : *foliop = folio;
1829 : return 0;
1830 : failed:
1831 0 : if (!shmem_confirm_swap(mapping, index, swap))
1832 0 : error = -EEXIST;
1833 0 : if (error == -EIO)
1834 0 : shmem_set_folio_swapin_error(inode, index, folio, swap);
1835 : unlock:
1836 0 : if (folio) {
1837 0 : folio_unlock(folio);
1838 0 : folio_put(folio);
1839 : }
1840 0 : put_swap_device(si);
1841 :
1842 : return error;
1843 : }
1844 :
1845 : /*
1846 : * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
1847 : *
1848 : * If we allocate a new one we do not mark it dirty. That's up to the
1849 : * vm. If we swap it in we mark it dirty, since we also free the swap
1850 : * entry: a page cannot live in both the swap cache and the page cache.
1851 : *
1852 : * vma, vmf, and fault_type are only supplied by shmem_fault:
1853 : * otherwise they are NULL.
1854 : */
1855 0 : static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
1856 : struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
1857 : struct vm_area_struct *vma, struct vm_fault *vmf,
1858 : vm_fault_t *fault_type)
1859 : {
1860 0 : struct address_space *mapping = inode->i_mapping;
1861 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1862 : struct shmem_sb_info *sbinfo;
1863 : struct mm_struct *charge_mm;
1864 : struct folio *folio;
1865 : pgoff_t hindex;
1866 : gfp_t huge_gfp;
1867 : int error;
1868 0 : int once = 0;
1869 0 : int alloced = 0;
1870 :
1871 0 : if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
1872 : return -EFBIG;
1873 : repeat:
1874 0 : if (sgp <= SGP_CACHE &&
1875 0 : ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1876 : return -EINVAL;
1877 : }
1878 :
1879 0 : sbinfo = SHMEM_SB(inode->i_sb);
1880 0 : charge_mm = vma ? vma->vm_mm : NULL;
1881 :
1882 0 : folio = filemap_get_entry(mapping, index);
1883 : if (folio && vma && userfaultfd_minor(vma)) {
1884 : if (!xa_is_value(folio))
1885 : folio_put(folio);
1886 : *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
1887 : return 0;
1888 : }
1889 :
1890 0 : if (xa_is_value(folio)) {
1891 0 : error = shmem_swapin_folio(inode, index, &folio,
1892 : sgp, gfp, vma, fault_type);
1893 0 : if (error == -EEXIST)
1894 : goto repeat;
1895 :
1896 0 : *foliop = folio;
1897 : return error;
1898 : }
1899 :
1900 0 : if (folio) {
1901 0 : folio_lock(folio);
1902 :
1903 : /* Has the folio been truncated or swapped out? */
1904 0 : if (unlikely(folio->mapping != mapping)) {
1905 0 : folio_unlock(folio);
1906 0 : folio_put(folio);
1907 : goto repeat;
1908 : }
1909 0 : if (sgp == SGP_WRITE)
1910 0 : folio_mark_accessed(folio);
1911 0 : if (folio_test_uptodate(folio))
1912 : goto out;
1913 : /* fallocated folio */
1914 0 : if (sgp != SGP_READ)
1915 : goto clear;
1916 0 : folio_unlock(folio);
1917 0 : folio_put(folio);
1918 : }
1919 :
1920 : /*
1921 : * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
1922 : * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
1923 : */
1924 0 : *foliop = NULL;
1925 0 : if (sgp == SGP_READ)
1926 : return 0;
1927 0 : if (sgp == SGP_NOALLOC)
1928 : return -ENOENT;
1929 :
1930 : /*
1931 : * Fast cache lookup and swap lookup did not find it: allocate.
1932 : */
1933 :
1934 : if (vma && userfaultfd_missing(vma)) {
1935 : *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1936 : return 0;
1937 : }
1938 :
1939 : if (!shmem_is_huge(inode, index, false,
1940 : vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
1941 : goto alloc_nohuge;
1942 :
1943 : huge_gfp = vma_thp_gfp_mask(vma);
1944 : huge_gfp = limit_gfp_mask(huge_gfp, gfp);
1945 : folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
1946 : if (IS_ERR(folio)) {
1947 : alloc_nohuge:
1948 0 : folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
1949 : }
1950 0 : if (IS_ERR(folio)) {
1951 0 : int retry = 5;
1952 :
1953 0 : error = PTR_ERR(folio);
1954 0 : folio = NULL;
1955 : if (error != -ENOSPC)
1956 : goto unlock;
1957 : /*
1958 : * Try to reclaim some space by splitting a large folio
1959 : * beyond i_size on the filesystem.
1960 : */
1961 : while (retry--) {
1962 : int ret;
1963 :
1964 : ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1965 : if (ret == SHRINK_STOP)
1966 : break;
1967 : if (ret)
1968 : goto alloc_nohuge;
1969 : }
1970 : goto unlock;
1971 : }
1972 :
1973 0 : hindex = round_down(index, folio_nr_pages(folio));
1974 :
1975 0 : if (sgp == SGP_WRITE)
1976 0 : __folio_set_referenced(folio);
1977 :
1978 0 : error = shmem_add_to_page_cache(folio, mapping, hindex,
1979 : NULL, gfp & GFP_RECLAIM_MASK,
1980 : charge_mm);
1981 0 : if (error)
1982 : goto unacct;
1983 0 : folio_add_lru(folio);
1984 :
1985 0 : spin_lock_irq(&info->lock);
1986 0 : info->alloced += folio_nr_pages(folio);
1987 0 : inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
1988 0 : shmem_recalc_inode(inode);
1989 0 : spin_unlock_irq(&info->lock);
1990 0 : alloced = true;
1991 :
1992 0 : if (folio_test_pmd_mappable(folio) &&
1993 : DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1994 : folio_next_index(folio) - 1) {
1995 : /*
1996 : * Part of the large folio is beyond i_size: subject
1997 : * to shrink under memory pressure.
1998 : */
1999 : spin_lock(&sbinfo->shrinklist_lock);
2000 : /*
2001 : * list_empty_careful() to defend against unlocked access to
2002 : * ->shrinklist in shmem_unused_huge_shrink()
2003 : */
2004 : if (list_empty_careful(&info->shrinklist)) {
2005 : list_add_tail(&info->shrinklist,
2006 : &sbinfo->shrinklist);
2007 : sbinfo->shrinklist_len++;
2008 : }
2009 : spin_unlock(&sbinfo->shrinklist_lock);
2010 : }
2011 :
2012 : /*
2013 : * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
2014 : */
2015 0 : if (sgp == SGP_FALLOC)
2016 0 : sgp = SGP_WRITE;
2017 : clear:
2018 : /*
2019 : * Let SGP_WRITE caller clear ends if write does not fill folio;
2020 : * but SGP_FALLOC on a folio fallocated earlier must initialize
2021 : * it now, lest undo on failure cancel our earlier guarantee.
2022 : */
2023 0 : if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
2024 0 : long i, n = folio_nr_pages(folio);
2025 :
2026 0 : for (i = 0; i < n; i++)
2027 0 : clear_highpage(folio_page(folio, i));
2028 0 : flush_dcache_folio(folio);
2029 0 : folio_mark_uptodate(folio);
2030 : }
2031 :
2032 : /* Perhaps the file has been truncated since we checked */
2033 0 : if (sgp <= SGP_CACHE &&
2034 0 : ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
2035 0 : if (alloced) {
2036 0 : folio_clear_dirty(folio);
2037 0 : filemap_remove_folio(folio);
2038 0 : spin_lock_irq(&info->lock);
2039 0 : shmem_recalc_inode(inode);
2040 0 : spin_unlock_irq(&info->lock);
2041 : }
2042 : error = -EINVAL;
2043 : goto unlock;
2044 : }
2045 : out:
2046 0 : *foliop = folio;
2047 : return 0;
2048 :
2049 : /*
2050 : * Error recovery.
2051 : */
2052 : unacct:
2053 0 : shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
2054 :
2055 0 : if (folio_test_large(folio)) {
2056 0 : folio_unlock(folio);
2057 0 : folio_put(folio);
2058 : goto alloc_nohuge;
2059 : }
2060 : unlock:
2061 0 : if (folio) {
2062 0 : folio_unlock(folio);
2063 0 : folio_put(folio);
2064 : }
2065 0 : if (error == -ENOSPC && !once++) {
2066 0 : spin_lock_irq(&info->lock);
2067 0 : shmem_recalc_inode(inode);
2068 0 : spin_unlock_irq(&info->lock);
2069 : goto repeat;
2070 : }
2071 0 : if (error == -EEXIST)
2072 : goto repeat;
2073 : return error;
2074 : }
2075 :
2076 0 : int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
2077 : enum sgp_type sgp)
2078 : {
2079 0 : return shmem_get_folio_gfp(inode, index, foliop, sgp,
2080 : mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
2081 : }
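          : /*
          :  * Illustrative sketch, not part of the original source: the typical
          :  * caller pattern for shmem_get_folio().  With SGP_READ the folio is
          :  * returned locked, or NULL over a hole; the caller is assumed to
          :  * unlock it before use and drop its reference afterwards, e.g.:
          :  *
          :  *	struct folio *folio = NULL;
          :  *	int err = shmem_get_folio(inode, index, &folio, SGP_READ);
          :  *
          :  *	if (!err && folio) {
          :  *		folio_unlock(folio);
          :  *		... read folio contents ...
          :  *		folio_put(folio);
          :  *	}
          :  */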
2082 :
2083 : /*
2084 : * This is like autoremove_wake_function, but it removes the wait queue
2085 : * entry unconditionally - even if something else had already woken the
2086 : * target.
2087 : */
2088 0 : static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
2089 : {
2090 0 : int ret = default_wake_function(wait, mode, sync, key);
2091 0 : list_del_init(&wait->entry);
2092 0 : return ret;
2093 : }
2094 :
2095 0 : static vm_fault_t shmem_fault(struct vm_fault *vmf)
2096 : {
2097 0 : struct vm_area_struct *vma = vmf->vma;
2098 0 : struct inode *inode = file_inode(vma->vm_file);
2099 0 : gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
2100 0 : struct folio *folio = NULL;
2101 : int err;
2102 0 : vm_fault_t ret = VM_FAULT_LOCKED;
2103 :
2104 : /*
2105 : * Trinity finds that probing a hole which tmpfs is punching can
2106 : * prevent the hole-punch from ever completing: which in turn
2107 : * locks writers out with its hold on i_rwsem. So refrain from
2108 : * faulting pages into the hole while it's being punched. Although
2109 : * shmem_undo_range() does remove the additions, it may be unable to
2110 : * keep up, as each new page needs its own unmap_mapping_range() call,
2111 : * and the i_mmap tree grows ever slower to scan if new vmas are added.
2112 : *
2113 : * It does not matter if we sometimes reach this check just before the
2114 : * hole-punch begins, so that one fault then races with the punch:
2115 : * we just need to make racing faults a rare case.
2116 : *
2117 : * The implementation below would be much simpler if we just used a
2118 : * standard mutex or completion: but we cannot take i_rwsem in fault,
2119 : * and bloating every shmem inode for this unlikely case would be sad.
2120 : */
2121 0 : if (unlikely(inode->i_private)) {
2122 : struct shmem_falloc *shmem_falloc;
2123 :
2124 0 : spin_lock(&inode->i_lock);
2125 0 : shmem_falloc = inode->i_private;
2126 0 : if (shmem_falloc &&
2127 0 : shmem_falloc->waitq &&
2128 0 : vmf->pgoff >= shmem_falloc->start &&
2129 0 : vmf->pgoff < shmem_falloc->next) {
2130 : struct file *fpin;
2131 : wait_queue_head_t *shmem_falloc_waitq;
2132 0 : DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
2133 :
2134 0 : ret = VM_FAULT_NOPAGE;
2135 0 : fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2136 0 : if (fpin)
2137 0 : ret = VM_FAULT_RETRY;
2138 :
2139 0 : shmem_falloc_waitq = shmem_falloc->waitq;
2140 0 : prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
2141 : TASK_UNINTERRUPTIBLE);
2142 0 : spin_unlock(&inode->i_lock);
2143 0 : schedule();
2144 :
2145 : /*
2146 : * shmem_falloc_waitq points into the shmem_fallocate()
2147 : * stack of the hole-punching task: shmem_falloc_waitq
2148 : * is usually invalid by the time we reach here, but
2149 : * finish_wait() does not dereference it in that case;
2150 : * though i_lock is still needed lest we race with wake_up_all().
2151 : */
2152 0 : spin_lock(&inode->i_lock);
2153 0 : finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
2154 0 : spin_unlock(&inode->i_lock);
2155 :
2156 0 : if (fpin)
2157 0 : fput(fpin);
2158 0 : return ret;
2159 : }
2160 0 : spin_unlock(&inode->i_lock);
2161 : }
2162 :
2163 0 : err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
2164 : gfp, vma, vmf, &ret);
2165 0 : if (err)
2166 : return vmf_error(err);
2167 0 : if (folio)
2168 0 : vmf->page = folio_file_page(folio, vmf->pgoff);
2169 0 : return ret;
2170 : }
2171 :
2172 0 : unsigned long shmem_get_unmapped_area(struct file *file,
2173 : unsigned long uaddr, unsigned long len,
2174 : unsigned long pgoff, unsigned long flags)
2175 : {
2176 : unsigned long (*get_area)(struct file *,
2177 : unsigned long, unsigned long, unsigned long, unsigned long);
2178 : unsigned long addr;
2179 : unsigned long offset;
2180 : unsigned long inflated_len;
2181 : unsigned long inflated_addr;
2182 : unsigned long inflated_offset;
2183 :
2184 0 : if (len > TASK_SIZE)
2185 : return -ENOMEM;
2186 :
2187 0 : get_area = current->mm->get_unmapped_area;
2188 0 : addr = get_area(file, uaddr, len, pgoff, flags);
2189 :
2190 : if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
2191 0 : return addr;
2192 : if (IS_ERR_VALUE(addr))
2193 : return addr;
2194 : if (addr & ~PAGE_MASK)
2195 : return addr;
2196 : if (addr > TASK_SIZE - len)
2197 : return addr;
2198 :
2199 : if (shmem_huge == SHMEM_HUGE_DENY)
2200 : return addr;
2201 : if (len < HPAGE_PMD_SIZE)
2202 : return addr;
2203 : if (flags & MAP_FIXED)
2204 : return addr;
2205 : /*
2206 : * Our priority is to support MAP_SHARED mapped hugely;
2207 : * and support MAP_PRIVATE mapped hugely too, until it is COWed.
2208 : * But if caller specified an address hint and we allocated area there
2209 : * successfully, respect that as before.
2210 : */
2211 : if (uaddr == addr)
2212 : return addr;
2213 :
2214 : if (shmem_huge != SHMEM_HUGE_FORCE) {
2215 : struct super_block *sb;
2216 :
2217 : if (file) {
2218 : VM_BUG_ON(file->f_op != &shmem_file_operations);
2219 : sb = file_inode(file)->i_sb;
2220 : } else {
2221 : /*
2222 : * Called directly from mm/mmap.c, or drivers/char/mem.c
2223 : * for "/dev/zero", to create a shared anonymous object.
2224 : */
2225 : if (IS_ERR(shm_mnt))
2226 : return addr;
2227 : sb = shm_mnt->mnt_sb;
2228 : }
2229 : if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
2230 : return addr;
2231 : }
2232 :
2233 : offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
2234 : if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
2235 : return addr;
2236 : if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
2237 : return addr;
2238 :
2239 : inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
2240 : if (inflated_len > TASK_SIZE)
2241 : return addr;
2242 : if (inflated_len < len)
2243 : return addr;
2244 :
2245 : inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
2246 : if (IS_ERR_VALUE(inflated_addr))
2247 : return addr;
2248 : if (inflated_addr & ~PAGE_MASK)
2249 : return addr;
2250 :
2251 : inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
2252 : inflated_addr += offset - inflated_offset;
2253 : if (inflated_offset > offset)
2254 : inflated_addr += HPAGE_PMD_SIZE;
2255 :
2256 : if (inflated_addr > TASK_SIZE - len)
2257 : return addr;
2258 : return inflated_addr;
2259 : }
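          : /*
          :  * Worked example of the alignment arithmetic above (illustrative,
          :  * assuming 4KB pages and HPAGE_PMD_SIZE = 2MB): with pgoff = 0 the
          :  * desired offset within a 2MB block is 0.  If get_area() returns an
          :  * address whose low 21 bits are nonzero, we ask again for
          :  * inflated_len = len + 2MB - PAGE_SIZE, then adjust inflated_addr so
          :  * that (inflated_addr & (HPAGE_PMD_SIZE-1)) == offset, rounding up
          :  * within the inflated area if needed, so the mapping can be backed
          :  * by PMD-sized pages.
          :  */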
2260 :
2261 : #ifdef CONFIG_NUMA
2262 : static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2263 : {
2264 : struct inode *inode = file_inode(vma->vm_file);
2265 : return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2266 : }
2267 :
2268 : static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2269 : unsigned long addr)
2270 : {
2271 : struct inode *inode = file_inode(vma->vm_file);
2272 : pgoff_t index;
2273 :
2274 : index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2275 : return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2276 : }
2277 : #endif
2278 :
2279 0 : int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
2280 : {
2281 0 : struct inode *inode = file_inode(file);
2282 0 : struct shmem_inode_info *info = SHMEM_I(inode);
2283 0 : int retval = -ENOMEM;
2284 :
2285 : /*
2286 : * What serializes the accesses to info->flags?
2287 : * ipc_lock_object() when called from shmctl_do_lock(),
2288 : * no serialization needed when called from shm_destroy().
2289 : */
2290 0 : if (lock && !(info->flags & VM_LOCKED)) {
2291 0 : if (!user_shm_lock(inode->i_size, ucounts))
2292 : goto out_nomem;
2293 0 : info->flags |= VM_LOCKED;
2294 0 : mapping_set_unevictable(file->f_mapping);
2295 : }
2296 0 : if (!lock && (info->flags & VM_LOCKED) && ucounts) {
2297 0 : user_shm_unlock(inode->i_size, ucounts);
2298 0 : info->flags &= ~VM_LOCKED;
2299 0 : mapping_clear_unevictable(file->f_mapping);
2300 : }
2301 : retval = 0;
2302 :
2303 : out_nomem:
2304 0 : return retval;
2305 : }
2306 :
2307 0 : static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2308 : {
2309 0 : struct inode *inode = file_inode(file);
2310 0 : struct shmem_inode_info *info = SHMEM_I(inode);
2311 : int ret;
2312 :
2313 0 : ret = seal_check_future_write(info->seals, vma);
2314 0 : if (ret)
2315 : return ret;
2316 :
2317 : /* arm64 - allow memory tagging on RAM-based files */
2318 0 : vm_flags_set(vma, VM_MTE_ALLOWED);
2319 :
2320 0 : file_accessed(file);
2321 : /* This is anonymous shared memory if it is unlinked at the time of mmap */
2322 0 : if (inode->i_nlink)
2323 0 : vma->vm_ops = &shmem_vm_ops;
2324 : else
2325 0 : vma->vm_ops = &shmem_anon_vm_ops;
2326 : return 0;
2327 : }
2328 :
2329 : #ifdef CONFIG_TMPFS_XATTR
2330 : static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2331 :
2332 : /*
2333 : * chattr's fsflags are unrelated to extended attributes,
2334 : * but tmpfs has chosen to enable them under the same config option.
2335 : */
2336 : static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2337 : {
2338 : unsigned int i_flags = 0;
2339 :
2340 : if (fsflags & FS_NOATIME_FL)
2341 : i_flags |= S_NOATIME;
2342 : if (fsflags & FS_APPEND_FL)
2343 : i_flags |= S_APPEND;
2344 : if (fsflags & FS_IMMUTABLE_FL)
2345 : i_flags |= S_IMMUTABLE;
2346 : /*
2347 : * But FS_NODUMP_FL does not require any action in i_flags.
2348 : */
2349 : inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
2350 : }
2351 : #else
2352 : static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2353 : {
2354 : }
2355 : #define shmem_initxattrs NULL
2356 : #endif
2357 :
2358 1 : static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb,
2359 : struct inode *dir, umode_t mode, dev_t dev,
2360 : unsigned long flags)
2361 : {
2362 : struct inode *inode;
2363 : struct shmem_inode_info *info;
2364 1 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2365 : ino_t ino;
2366 :
2367 1 : if (shmem_reserve_inode(sb, &ino))
2368 : return NULL;
2369 :
2370 1 : inode = new_inode(sb);
2371 1 : if (inode) {
2372 1 : inode->i_ino = ino;
2373 1 : inode_init_owner(idmap, inode, dir, mode);
2374 1 : inode->i_blocks = 0;
2375 1 : inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
2376 1 : inode->i_generation = get_random_u32();
2377 1 : info = SHMEM_I(inode);
2378 1 : memset(info, 0, (char *)inode - (char *)info);
2379 1 : spin_lock_init(&info->lock);
2380 2 : atomic_set(&info->stop_eviction, 0);
2381 1 : info->seals = F_SEAL_SEAL;
2382 1 : info->flags = flags & VM_NORESERVE;
2383 1 : info->i_crtime = inode->i_mtime;
2384 1 : info->fsflags = (dir == NULL) ? 0 :
2385 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
2386 : if (info->fsflags)
2387 : shmem_set_inode_flags(inode, info->fsflags);
2388 2 : INIT_LIST_HEAD(&info->shrinklist);
2389 2 : INIT_LIST_HEAD(&info->swaplist);
2390 1 : if (sbinfo->noswap)
2391 0 : mapping_set_unevictable(inode->i_mapping);
2392 1 : simple_xattrs_init(&info->xattrs);
2393 1 : cache_no_acl(inode);
2394 2 : mapping_set_large_folios(inode->i_mapping);
2395 :
2396 1 : switch (mode & S_IFMT) {
2397 : default:
2398 0 : inode->i_op = &shmem_special_inode_operations;
2399 0 : init_special_inode(inode, mode, dev);
2400 0 : break;
2401 : case S_IFREG:
2402 0 : inode->i_mapping->a_ops = &shmem_aops;
2403 0 : inode->i_op = &shmem_inode_operations;
2404 0 : inode->i_fop = &shmem_file_operations;
2405 0 : mpol_shared_policy_init(&info->policy,
2406 : shmem_get_sbmpol(sbinfo));
2407 : break;
2408 : case S_IFDIR:
2409 1 : inc_nlink(inode);
2410 : /* Some things misbehave if size == 0 on a directory */
2411 1 : inode->i_size = 2 * BOGO_DIRENT_SIZE;
2412 1 : inode->i_op = &shmem_dir_inode_operations;
2413 1 : inode->i_fop = &simple_dir_operations;
2414 1 : break;
2415 : case S_IFLNK:
2416 : /*
2417 : * Must not load anything into the rbtree:
2418 : * mpol_free_shared_policy() will not be called.
2419 : */
2420 : mpol_shared_policy_init(&info->policy, NULL);
2421 : break;
2422 : }
2423 :
2424 : lockdep_annotate_inode_mutex_key(inode);
2425 : } else
2426 : shmem_free_inode(sb);
2427 : return inode;
2428 : }
2429 :
2430 : #ifdef CONFIG_USERFAULTFD
2431 : int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
2432 : struct vm_area_struct *dst_vma,
2433 : unsigned long dst_addr,
2434 : unsigned long src_addr,
2435 : uffd_flags_t flags,
2436 : struct folio **foliop)
2437 : {
2438 : struct inode *inode = file_inode(dst_vma->vm_file);
2439 : struct shmem_inode_info *info = SHMEM_I(inode);
2440 : struct address_space *mapping = inode->i_mapping;
2441 : gfp_t gfp = mapping_gfp_mask(mapping);
2442 : pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2443 : void *page_kaddr;
2444 : struct folio *folio;
2445 : int ret;
2446 : pgoff_t max_off;
2447 :
2448 : if (!shmem_inode_acct_block(inode, 1)) {
2449 : /*
2450 : * We may have got a page, returned -ENOENT triggering a retry,
2451 : * and now we find ourselves with -ENOMEM. Release the page, to
2452 : * avoid a BUG_ON in our caller.
2453 : */
2454 : if (unlikely(*foliop)) {
2455 : folio_put(*foliop);
2456 : *foliop = NULL;
2457 : }
2458 : return -ENOMEM;
2459 : }
2460 :
2461 : if (!*foliop) {
2462 : ret = -ENOMEM;
2463 : folio = shmem_alloc_folio(gfp, info, pgoff);
2464 : if (!folio)
2465 : goto out_unacct_blocks;
2466 :
2467 : if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
2468 : page_kaddr = kmap_local_folio(folio, 0);
2469 : /*
2470 : * The read mmap_lock is held here. Despite the
2471 : * mmap_lock being read recursive a deadlock is still
2472 : * possible if a writer has taken a lock. For example:
2473 : *
2474 : * process A thread 1 takes read lock on own mmap_lock
2475 : * process A thread 2 calls mmap, blocks taking write lock
2476 : * process B thread 1 takes page fault, read lock on own mmap lock
2477 : * process B thread 2 calls mmap, blocks taking write lock
2478 : * process A thread 1 blocks taking read lock on process B
2479 : * process B thread 1 blocks taking read lock on process A
2480 : *
2481 : * Disable page faults to prevent potential deadlock
2482 : * and retry the copy outside the mmap_lock.
2483 : */
2484 : pagefault_disable();
2485 : ret = copy_from_user(page_kaddr,
2486 : (const void __user *)src_addr,
2487 : PAGE_SIZE);
2488 : pagefault_enable();
2489 : kunmap_local(page_kaddr);
2490 :
2491 : /* fallback to copy_from_user outside mmap_lock */
2492 : if (unlikely(ret)) {
2493 : *foliop = folio;
2494 : ret = -ENOENT;
2495 : /* don't free the page */
2496 : goto out_unacct_blocks;
2497 : }
2498 :
2499 : flush_dcache_folio(folio);
2500 : } else { /* ZEROPAGE */
2501 : clear_user_highpage(&folio->page, dst_addr);
2502 : }
2503 : } else {
2504 : folio = *foliop;
2505 : VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
2506 : *foliop = NULL;
2507 : }
2508 :
2509 : VM_BUG_ON(folio_test_locked(folio));
2510 : VM_BUG_ON(folio_test_swapbacked(folio));
2511 : __folio_set_locked(folio);
2512 : __folio_set_swapbacked(folio);
2513 : __folio_mark_uptodate(folio);
2514 :
2515 : ret = -EFAULT;
2516 : max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2517 : if (unlikely(pgoff >= max_off))
2518 : goto out_release;
2519 :
2520 : ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
2521 : gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
2522 : if (ret)
2523 : goto out_release;
2524 :
2525 : ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
2526 : &folio->page, true, flags);
2527 : if (ret)
2528 : goto out_delete_from_cache;
2529 :
2530 : spin_lock_irq(&info->lock);
2531 : info->alloced++;
2532 : inode->i_blocks += BLOCKS_PER_PAGE;
2533 : shmem_recalc_inode(inode);
2534 : spin_unlock_irq(&info->lock);
2535 :
2536 : folio_unlock(folio);
2537 : return 0;
2538 : out_delete_from_cache:
2539 : filemap_remove_folio(folio);
2540 : out_release:
2541 : folio_unlock(folio);
2542 : folio_put(folio);
2543 : out_unacct_blocks:
2544 : shmem_inode_unacct_blocks(inode, 1);
2545 : return ret;
2546 : }
2547 : #endif /* CONFIG_USERFAULTFD */
2548 :
2549 : #ifdef CONFIG_TMPFS
2550 : static const struct inode_operations shmem_symlink_inode_operations;
2551 : static const struct inode_operations shmem_short_symlink_operations;
2552 :
2553 : static int
2554 : shmem_write_begin(struct file *file, struct address_space *mapping,
2555 : loff_t pos, unsigned len,
2556 : struct page **pagep, void **fsdata)
2557 : {
2558 : struct inode *inode = mapping->host;
2559 : struct shmem_inode_info *info = SHMEM_I(inode);
2560 : pgoff_t index = pos >> PAGE_SHIFT;
2561 : struct folio *folio;
2562 : int ret = 0;
2563 :
2564 : /* i_rwsem is held by caller */
2565 : if (unlikely(info->seals & (F_SEAL_GROW |
2566 : F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
2567 : if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
2568 : return -EPERM;
2569 : if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
2570 : return -EPERM;
2571 : }
2572 :
2573 : ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
2574 :
2575 : if (ret)
2576 : return ret;
2577 :
2578 : *pagep = folio_file_page(folio, index);
2579 : if (PageHWPoison(*pagep)) {
2580 : folio_unlock(folio);
2581 : folio_put(folio);
2582 : *pagep = NULL;
2583 : return -EIO;
2584 : }
2585 :
2586 : return 0;
2587 : }
2588 :
2589 : static int
2590 : shmem_write_end(struct file *file, struct address_space *mapping,
2591 : loff_t pos, unsigned len, unsigned copied,
2592 : struct page *page, void *fsdata)
2593 : {
2594 : struct folio *folio = page_folio(page);
2595 : struct inode *inode = mapping->host;
2596 :
2597 : if (pos + copied > inode->i_size)
2598 : i_size_write(inode, pos + copied);
2599 :
2600 : if (!folio_test_uptodate(folio)) {
2601 : if (copied < folio_size(folio)) {
2602 : size_t from = offset_in_folio(folio, pos);
2603 : folio_zero_segments(folio, 0, from,
2604 : from + copied, folio_size(folio));
2605 : }
2606 : folio_mark_uptodate(folio);
2607 : }
2608 : folio_mark_dirty(folio);
2609 : folio_unlock(folio);
2610 : folio_put(folio);
2611 :
2612 : return copied;
2613 : }
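          : /*
          :  * Worked example for the partial-write zeroing above (illustrative):
          :  * with a 4096-byte folio that is not yet uptodate, pos landing at
          :  * byte 100 of the folio and copied = 50, from = 100, so bytes
          :  * [0, 100) and [150, 4096) are zeroed before folio_mark_uptodate(),
          :  * leaving only the user-copied range [100, 150) unzeroed.
          :  */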
2614 :
2615 : static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
2616 : {
2617 : struct file *file = iocb->ki_filp;
2618 : struct inode *inode = file_inode(file);
2619 : struct address_space *mapping = inode->i_mapping;
2620 : pgoff_t index;
2621 : unsigned long offset;
2622 : int error = 0;
2623 : ssize_t retval = 0;
2624 : loff_t *ppos = &iocb->ki_pos;
2625 :
2626 : index = *ppos >> PAGE_SHIFT;
2627 : offset = *ppos & ~PAGE_MASK;
2628 :
2629 : for (;;) {
2630 : struct folio *folio = NULL;
2631 : struct page *page = NULL;
2632 : pgoff_t end_index;
2633 : unsigned long nr, ret;
2634 : loff_t i_size = i_size_read(inode);
2635 :
2636 : end_index = i_size >> PAGE_SHIFT;
2637 : if (index > end_index)
2638 : break;
2639 : if (index == end_index) {
2640 : nr = i_size & ~PAGE_MASK;
2641 : if (nr <= offset)
2642 : break;
2643 : }
2644 :
2645 : error = shmem_get_folio(inode, index, &folio, SGP_READ);
2646 : if (error) {
2647 : if (error == -EINVAL)
2648 : error = 0;
2649 : break;
2650 : }
2651 : if (folio) {
2652 : folio_unlock(folio);
2653 :
2654 : page = folio_file_page(folio, index);
2655 : if (PageHWPoison(page)) {
2656 : folio_put(folio);
2657 : error = -EIO;
2658 : break;
2659 : }
2660 : }
2661 :
2662 : /*
2663 : * We must re-check i_size after getting the folio, since reads
2664 : * (unlike writes) are called without i_rwsem protection against truncate.
2665 : */
2666 : nr = PAGE_SIZE;
2667 : i_size = i_size_read(inode);
2668 : end_index = i_size >> PAGE_SHIFT;
2669 : if (index == end_index) {
2670 : nr = i_size & ~PAGE_MASK;
2671 : if (nr <= offset) {
2672 : if (folio)
2673 : folio_put(folio);
2674 : break;
2675 : }
2676 : }
2677 : nr -= offset;
2678 :
2679 : if (folio) {
2680 : /*
2681 : * If users can be writing to this page using arbitrary
2682 : * virtual addresses, take care about potential aliasing
2683 : * before reading the page on the kernel side.
2684 : */
2685 : if (mapping_writably_mapped(mapping))
2686 : flush_dcache_page(page);
2687 : /*
2688 : * Mark the page accessed if we read the beginning.
2689 : */
2690 : if (!offset)
2691 : folio_mark_accessed(folio);
2692 : /*
2693 : * Ok, we have the page, and it's up-to-date, so
2694 : * now we can copy it to user space...
2695 : */
2696 : ret = copy_page_to_iter(page, offset, nr, to);
2697 : folio_put(folio);
2698 :
2699 : } else if (user_backed_iter(to)) {
2700 : /*
2701 : * copy_to_user() tends to be so well optimized, and
2702 : * clear_user() not so much, that it is noticeably
2703 : * faster to copy the zero page than to clear the buffer directly.
2704 : */
2705 : ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
2706 : } else {
2707 : /*
2708 : * But submitting the same page twice in a row to
2709 : * splice() - or others? - can result in confusion:
2710 : * so don't attempt that optimization on pipes etc.
2711 : */
2712 : ret = iov_iter_zero(nr, to);
2713 : }
2714 :
2715 : retval += ret;
2716 : offset += ret;
2717 : index += offset >> PAGE_SHIFT;
2718 : offset &= ~PAGE_MASK;
2719 :
2720 : if (!iov_iter_count(to))
2721 : break;
2722 : if (ret < nr) {
2723 : error = -EFAULT;
2724 : break;
2725 : }
2726 : cond_resched();
2727 : }
2728 :
2729 : *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
2730 : file_accessed(file);
2731 : return retval ? retval : error;
2732 : }
2733 :
2734 : static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
2735 : {
2736 : struct address_space *mapping = file->f_mapping;
2737 : struct inode *inode = mapping->host;
2738 :
2739 : if (whence != SEEK_DATA && whence != SEEK_HOLE)
2740 : return generic_file_llseek_size(file, offset, whence,
2741 : MAX_LFS_FILESIZE, i_size_read(inode));
2742 : if (offset < 0)
2743 : return -ENXIO;
2744 :
2745 : inode_lock(inode);
2746 : /* We're holding i_rwsem so we can access i_size directly */
2747 : offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
2748 : if (offset >= 0)
2749 : offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
2750 : inode_unlock(inode);
2751 : return offset;
2752 : }
2753 :
2754 : static long shmem_fallocate(struct file *file, int mode, loff_t offset,
2755 : loff_t len)
2756 : {
2757 : struct inode *inode = file_inode(file);
2758 : struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2759 : struct shmem_inode_info *info = SHMEM_I(inode);
2760 : struct shmem_falloc shmem_falloc;
2761 : pgoff_t start, index, end, undo_fallocend;
2762 : int error;
2763 :
2764 : if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2765 : return -EOPNOTSUPP;
2766 :
2767 : inode_lock(inode);
2768 :
2769 : if (mode & FALLOC_FL_PUNCH_HOLE) {
2770 : struct address_space *mapping = file->f_mapping;
2771 : loff_t unmap_start = round_up(offset, PAGE_SIZE);
2772 : loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
2773 : DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
2774 :
2775 : /* protected by i_rwsem */
2776 : if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
2777 : error = -EPERM;
2778 : goto out;
2779 : }
2780 :
2781 : shmem_falloc.waitq = &shmem_falloc_waitq;
2782 : shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
2783 : shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
2784 : spin_lock(&inode->i_lock);
2785 : inode->i_private = &shmem_falloc;
2786 : spin_unlock(&inode->i_lock);
2787 :
2788 : if ((u64)unmap_end > (u64)unmap_start)
2789 : unmap_mapping_range(mapping, unmap_start,
2790 : 1 + unmap_end - unmap_start, 0);
2791 : shmem_truncate_range(inode, offset, offset + len - 1);
2792 : /* No need to unmap again: hole-punching leaves COWed pages */
2793 :
2794 : spin_lock(&inode->i_lock);
2795 : inode->i_private = NULL;
2796 : wake_up_all(&shmem_falloc_waitq);
2797 : WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
2798 : spin_unlock(&inode->i_lock);
2799 : error = 0;
2800 : goto out;
2801 : }
2802 :
2803 : /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE is set */
2804 : error = inode_newsize_ok(inode, offset + len);
2805 : if (error)
2806 : goto out;
2807 :
2808 : if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
2809 : error = -EPERM;
2810 : goto out;
2811 : }
2812 :
2813 : start = offset >> PAGE_SHIFT;
2814 : end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
2815 : /* Try to avoid a swapstorm if len is impossible to satisfy */
2816 : if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
2817 : error = -ENOSPC;
2818 : goto out;
2819 : }
2820 :
2821 : shmem_falloc.waitq = NULL;
2822 : shmem_falloc.start = start;
2823 : shmem_falloc.next = start;
2824 : shmem_falloc.nr_falloced = 0;
2825 : shmem_falloc.nr_unswapped = 0;
2826 : spin_lock(&inode->i_lock);
2827 : inode->i_private = &shmem_falloc;
2828 : spin_unlock(&inode->i_lock);
2829 :
2830 : /*
2831 : * info->fallocend is only relevant when huge pages might be
2832 : * involved: to prevent split_huge_page() freeing fallocated
2833 : * pages when FALLOC_FL_KEEP_SIZE has committed them beyond i_size.
2834 : */
2835 : undo_fallocend = info->fallocend;
2836 : if (info->fallocend < end)
2837 : info->fallocend = end;
2838 :
2839 : for (index = start; index < end; ) {
2840 : struct folio *folio;
2841 :
2842 : /*
2843 : * Good, the fallocate(2) manpage permits EINTR: we may have
2844 : * been interrupted because we are using up too much memory.
2845 : */
2846 : if (signal_pending(current))
2847 : error = -EINTR;
2848 : else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
2849 : error = -ENOMEM;
2850 : else
2851 : error = shmem_get_folio(inode, index, &folio,
2852 : SGP_FALLOC);
2853 : if (error) {
2854 : info->fallocend = undo_fallocend;
2855 : /* Remove the !uptodate folios we added */
2856 : if (index > start) {
2857 : shmem_undo_range(inode,
2858 : (loff_t)start << PAGE_SHIFT,
2859 : ((loff_t)index << PAGE_SHIFT) - 1, true);
2860 : }
2861 : goto undone;
2862 : }
2863 :
2864 : /*
2865 : * Here is a more important optimization than it appears:
2866 : * a second SGP_FALLOC on the same large folio will clear it,
2867 : * making it uptodate and un-undoable if we fail later.
2868 : */
2869 : index = folio_next_index(folio);
2870 : /* Beware 32-bit wraparound */
2871 : if (!index)
2872 : index--;
2873 :
2874 : /*
2875 : * Inform shmem_writepage() how far we have reached.
2876 : * No need for lock or barrier: we have the page lock.
2877 : */
2878 : if (!folio_test_uptodate(folio))
2879 : shmem_falloc.nr_falloced += index - shmem_falloc.next;
2880 : shmem_falloc.next = index;
2881 :
2882 : /*
2883 : * If !uptodate, leave it that way so that freeable folios
2884 : * can be recognized if we need to rollback on error later.
2885 : * But mark it dirty so that memory pressure will swap rather
2886 : * than free the folios we are allocating (and SGP_CACHE folios
2887 : * might still be clean: we now need to mark those dirty too).
2888 : */
2889 : folio_mark_dirty(folio);
2890 : folio_unlock(folio);
2891 : folio_put(folio);
2892 : cond_resched();
2893 : }
2894 :
2895 : if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
2896 : i_size_write(inode, offset + len);
2897 : undone:
2898 : spin_lock(&inode->i_lock);
2899 : inode->i_private = NULL;
2900 : spin_unlock(&inode->i_lock);
2901 : out:
2902 : if (!error)
2903 : file_modified(file);
2904 : inode_unlock(inode);
2905 : return error;
2906 : }
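          : /*
          :  * Illustrative userspace usage of the two modes handled above
          :  * (not part of the original source):
          :  *
          :  *	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
          :  *		preallocates 1MB at offset 0 without growing i_size;
          :  *
          :  *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len);
          :  *		punches a hole - fallocate(2) requires KEEP_SIZE with PUNCH_HOLE.
          :  */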
2907 :
2908 : static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
2909 : {
2910 : struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
2911 :
2912 : buf->f_type = TMPFS_MAGIC;
2913 : buf->f_bsize = PAGE_SIZE;
2914 : buf->f_namelen = NAME_MAX;
2915 : if (sbinfo->max_blocks) {
2916 : buf->f_blocks = sbinfo->max_blocks;
2917 : buf->f_bavail =
2918 : buf->f_bfree = sbinfo->max_blocks -
2919 : percpu_counter_sum(&sbinfo->used_blocks);
2920 : }
2921 : if (sbinfo->max_inodes) {
2922 : buf->f_files = sbinfo->max_inodes;
2923 : buf->f_ffree = sbinfo->free_inodes;
2924 : }
2925 : /* else leave those fields 0 like simple_statfs */
2926 :
2927 : buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
2928 :
2929 : return 0;
2930 : }
2931 :
2932 : /*
2933 : * File creation. Allocate an inode, and we're done.
2934 : */
2935 : static int
2936 : shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
2937 : struct dentry *dentry, umode_t mode, dev_t dev)
2938 : {
2939 : struct inode *inode;
2940 : int error = -ENOSPC;
2941 :
2942 : inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
2943 : if (inode) {
2944 : error = simple_acl_create(dir, inode);
2945 : if (error)
2946 : goto out_iput;
2947 : error = security_inode_init_security(inode, dir,
2948 : &dentry->d_name,
2949 : shmem_initxattrs, NULL);
2950 : if (error && error != -EOPNOTSUPP)
2951 : goto out_iput;
2952 :
2953 : error = 0;
2954 : dir->i_size += BOGO_DIRENT_SIZE;
2955 : dir->i_ctime = dir->i_mtime = current_time(dir);
2956 : inode_inc_iversion(dir);
2957 : d_instantiate(dentry, inode);
2958 : dget(dentry); /* Extra count - pin the dentry in core */
2959 : }
2960 : return error;
2961 : out_iput:
2962 : iput(inode);
2963 : return error;
2964 : }
2965 :
2966 : static int
2967 : shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
2968 : struct file *file, umode_t mode)
2969 : {
2970 : struct inode *inode;
2971 : int error = -ENOSPC;
2972 :
2973 : inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
2974 : if (inode) {
2975 : error = security_inode_init_security(inode, dir,
2976 : NULL,
2977 : shmem_initxattrs, NULL);
2978 : if (error && error != -EOPNOTSUPP)
2979 : goto out_iput;
2980 : error = simple_acl_create(dir, inode);
2981 : if (error)
2982 : goto out_iput;
2983 : d_tmpfile(file, inode);
2984 : }
2985 : return finish_open_simple(file, error);
2986 : out_iput:
2987 : iput(inode);
2988 : return error;
2989 : }
2990 :
2991 : static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
2992 : struct dentry *dentry, umode_t mode)
2993 : {
2994 : int error;
2995 :
2996 : error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
2997 : if (error)
2998 : return error;
2999 : inc_nlink(dir);
3000 : return 0;
3001 : }
3002 :
3003 : static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
3004 : struct dentry *dentry, umode_t mode, bool excl)
3005 : {
3006 : return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
3007 : }
3008 :
3009 : /*
3010 : * Link a file.
3011 : */
3012 : static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
3013 : {
3014 : struct inode *inode = d_inode(old_dentry);
3015 : int ret = 0;
3016 :
3017 : /*
3018 : * No ordinary (disk based) filesystem counts links as inodes;
3019 : * but each new link needs a new dentry, pinning lowmem, and
3020 : * tmpfs dentries cannot be pruned until they are unlinked.
3021 : * But if an O_TMPFILE file is linked into the tmpfs, the
3022 : * first link must skip that, to get the accounting right.
3023 : */
3024 : if (inode->i_nlink) {
3025 : ret = shmem_reserve_inode(inode->i_sb, NULL);
3026 : if (ret)
3027 : goto out;
3028 : }
3029 :
3030 : dir->i_size += BOGO_DIRENT_SIZE;
3031 : inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
3032 : inode_inc_iversion(dir);
3033 : inc_nlink(inode);
3034 : ihold(inode); /* New dentry reference */
3035 : dget(dentry); /* Extra pinning count for the created dentry */
3036 : d_instantiate(dentry, inode);
3037 : out:
3038 : return ret;
3039 : }
3040 :
3041 : static int shmem_unlink(struct inode *dir, struct dentry *dentry)
3042 : {
3043 : struct inode *inode = d_inode(dentry);
3044 :
3045 : if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
3046 : shmem_free_inode(inode->i_sb);
3047 :
3048 : dir->i_size -= BOGO_DIRENT_SIZE;
3049 : inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
3050 : inode_inc_iversion(dir);
3051 : drop_nlink(inode);
3052 : dput(dentry); /* Undo the count from "create" - this does all the work */
3053 : return 0;
3054 : }
3055 :
3056 : static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
3057 : {
3058 : if (!simple_empty(dentry))
3059 : return -ENOTEMPTY;
3060 :
3061 : drop_nlink(d_inode(dentry));
3062 : drop_nlink(dir);
3063 : return shmem_unlink(dir, dentry);
3064 : }
3065 :
3066 : static int shmem_whiteout(struct mnt_idmap *idmap,
3067 : struct inode *old_dir, struct dentry *old_dentry)
3068 : {
3069 : struct dentry *whiteout;
3070 : int error;
3071 :
3072 : whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
3073 : if (!whiteout)
3074 : return -ENOMEM;
3075 :
3076 : error = shmem_mknod(idmap, old_dir, whiteout,
3077 : S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
3078 : dput(whiteout);
3079 : if (error)
3080 : return error;
3081 :
3082 : /*
3083 : * Cheat and hash the whiteout while the old dentry is still in
3084 : * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
3085 : *
3086 : * d_lookup() will consistently find one of them at this point,
3087 : * not sure which one, but that isn't even important.
3088 : */
3089 : d_rehash(whiteout);
3090 : return 0;
3091 : }
3092 :
3093 : /*
3094 : * The VFS layer already does all the dentry stuff for rename;
3095 : * we just have to decrement the usage count for the target if
3096 : * it exists, so that the VFS layer correctly frees it when it
3097 : * gets overwritten.
3098 : */
3099 : static int shmem_rename2(struct mnt_idmap *idmap,
3100 : struct inode *old_dir, struct dentry *old_dentry,
3101 : struct inode *new_dir, struct dentry *new_dentry,
3102 : unsigned int flags)
3103 : {
3104 : struct inode *inode = d_inode(old_dentry);
3105 : int they_are_dirs = S_ISDIR(inode->i_mode);
3106 :
3107 : if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3108 : return -EINVAL;
3109 :
3110 : if (flags & RENAME_EXCHANGE)
3111 : return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
3112 :
3113 : if (!simple_empty(new_dentry))
3114 : return -ENOTEMPTY;
3115 :
3116 : if (flags & RENAME_WHITEOUT) {
3117 : int error;
3118 :
3119 : error = shmem_whiteout(idmap, old_dir, old_dentry);
3120 : if (error)
3121 : return error;
3122 : }
3123 :
3124 : if (d_really_is_positive(new_dentry)) {
3125 : (void) shmem_unlink(new_dir, new_dentry);
3126 : if (they_are_dirs) {
3127 : drop_nlink(d_inode(new_dentry));
3128 : drop_nlink(old_dir);
3129 : }
3130 : } else if (they_are_dirs) {
3131 : drop_nlink(old_dir);
3132 : inc_nlink(new_dir);
3133 : }
3134 :
3135 : old_dir->i_size -= BOGO_DIRENT_SIZE;
3136 : new_dir->i_size += BOGO_DIRENT_SIZE;
3137 : old_dir->i_ctime = old_dir->i_mtime =
3138 : new_dir->i_ctime = new_dir->i_mtime =
3139 : inode->i_ctime = current_time(old_dir);
3140 : inode_inc_iversion(old_dir);
3141 : inode_inc_iversion(new_dir);
3142 : return 0;
3143 : }
3144 :
3145 : static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
3146 : struct dentry *dentry, const char *symname)
3147 : {
3148 : int error;
3149 : int len;
3150 : struct inode *inode;
3151 : struct folio *folio;
3152 :
3153 : len = strlen(symname) + 1;
3154 : if (len > PAGE_SIZE)
3155 : return -ENAMETOOLONG;
3156 :
3157 : inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
3158 : VM_NORESERVE);
3159 : if (!inode)
3160 : return -ENOSPC;
3161 :
3162 : error = security_inode_init_security(inode, dir, &dentry->d_name,
3163 : shmem_initxattrs, NULL);
3164 : if (error && error != -EOPNOTSUPP) {
3165 : iput(inode);
3166 : return error;
3167 : }
3168 :
3169 : inode->i_size = len-1;
3170 : if (len <= SHORT_SYMLINK_LEN) {
3171 : inode->i_link = kmemdup(symname, len, GFP_KERNEL);
3172 : if (!inode->i_link) {
3173 : iput(inode);
3174 : return -ENOMEM;
3175 : }
3176 : inode->i_op = &shmem_short_symlink_operations;
3177 : } else {
3178 : inode_nohighmem(inode);
3179 : error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
3180 : if (error) {
3181 : iput(inode);
3182 : return error;
3183 : }
3184 : inode->i_mapping->a_ops = &shmem_aops;
3185 : inode->i_op = &shmem_symlink_inode_operations;
3186 : memcpy(folio_address(folio), symname, len);
3187 : folio_mark_uptodate(folio);
3188 : folio_mark_dirty(folio);
3189 : folio_unlock(folio);
3190 : folio_put(folio);
3191 : }
3192 : dir->i_size += BOGO_DIRENT_SIZE;
3193 : dir->i_ctime = dir->i_mtime = current_time(dir);
3194 : inode_inc_iversion(dir);
3195 : d_instantiate(dentry, inode);
3196 : dget(dentry);
3197 : return 0;
3198 : }
3199 :
3200 : static void shmem_put_link(void *arg)
3201 : {
3202 : folio_mark_accessed(arg);
3203 : folio_put(arg);
3204 : }
3205 :
3206 : static const char *shmem_get_link(struct dentry *dentry,
3207 : struct inode *inode,
3208 : struct delayed_call *done)
3209 : {
3210 : struct folio *folio = NULL;
3211 : int error;
3212 :
3213 : if (!dentry) {
3214 : folio = filemap_get_folio(inode->i_mapping, 0);
3215 : if (IS_ERR(folio))
3216 : return ERR_PTR(-ECHILD);
3217 : if (PageHWPoison(folio_page(folio, 0)) ||
3218 : !folio_test_uptodate(folio)) {
3219 : folio_put(folio);
3220 : return ERR_PTR(-ECHILD);
3221 : }
3222 : } else {
3223 : error = shmem_get_folio(inode, 0, &folio, SGP_READ);
3224 : if (error)
3225 : return ERR_PTR(error);
3226 : if (!folio)
3227 : return ERR_PTR(-ECHILD);
3228 : if (PageHWPoison(folio_page(folio, 0))) {
3229 : folio_unlock(folio);
3230 : folio_put(folio);
3231 : return ERR_PTR(-ECHILD);
3232 : }
3233 : folio_unlock(folio);
3234 : }
3235 : set_delayed_call(done, shmem_put_link, folio);
3236 : return folio_address(folio);
3237 : }
3238 :
3239 : #ifdef CONFIG_TMPFS_XATTR
3240 :
3241 : static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
3242 : {
3243 : struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3244 :
3245 : fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
3246 :
3247 : return 0;
3248 : }
3249 :
3250 : static int shmem_fileattr_set(struct mnt_idmap *idmap,
3251 : struct dentry *dentry, struct fileattr *fa)
3252 : {
3253 : struct inode *inode = d_inode(dentry);
3254 : struct shmem_inode_info *info = SHMEM_I(inode);
3255 :
3256 : if (fileattr_has_fsx(fa))
3257 : return -EOPNOTSUPP;
3258 : if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
3259 : return -EOPNOTSUPP;
3260 :
3261 : info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
3262 : (fa->flags & SHMEM_FL_USER_MODIFIABLE);
3263 :
3264 : shmem_set_inode_flags(inode, info->fsflags);
3265 : inode->i_ctime = current_time(inode);
3266 : inode_inc_iversion(inode);
3267 : return 0;
3268 : }
3269 :
3270 : /*
3271 : * Superblocks without xattr inode operations may get some security.* xattr
3272 : * support from the LSM "for free". As soon as we have any other xattrs
3273 : * like ACLs, we also need to implement the security.* handlers at
3274 : * filesystem level, though.
3275 : */
3276 :
3277 : /*
3278 : * Callback for security_inode_init_security() for acquiring xattrs.
3279 : */
3280 : static int shmem_initxattrs(struct inode *inode,
3281 : const struct xattr *xattr_array,
3282 : void *fs_info)
3283 : {
3284 : struct shmem_inode_info *info = SHMEM_I(inode);
3285 : const struct xattr *xattr;
3286 : struct simple_xattr *new_xattr;
3287 : size_t len;
3288 :
3289 : for (xattr = xattr_array; xattr->name != NULL; xattr++) {
3290 : new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
3291 : if (!new_xattr)
3292 : return -ENOMEM;
3293 :
3294 : len = strlen(xattr->name) + 1;
3295 : new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
3296 : GFP_KERNEL);
3297 : if (!new_xattr->name) {
3298 : kvfree(new_xattr);
3299 : return -ENOMEM;
3300 : }
3301 :
3302 : memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
3303 : XATTR_SECURITY_PREFIX_LEN);
3304 : memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
3305 : xattr->name, len);
3306 :
3307 : simple_xattr_add(&info->xattrs, new_xattr);
3308 : }
3309 :
3310 : return 0;
3311 : }
3312 :
3313 : static int shmem_xattr_handler_get(const struct xattr_handler *handler,
3314 : struct dentry *unused, struct inode *inode,
3315 : const char *name, void *buffer, size_t size)
3316 : {
3317 : struct shmem_inode_info *info = SHMEM_I(inode);
3318 :
3319 : name = xattr_full_name(handler, name);
3320 : return simple_xattr_get(&info->xattrs, name, buffer, size);
3321 : }
3322 :
3323 : static int shmem_xattr_handler_set(const struct xattr_handler *handler,
3324 : struct mnt_idmap *idmap,
3325 : struct dentry *unused, struct inode *inode,
3326 : const char *name, const void *value,
3327 : size_t size, int flags)
3328 : {
3329 : struct shmem_inode_info *info = SHMEM_I(inode);
3330 : int err;
3331 :
3332 : name = xattr_full_name(handler, name);
3333 : err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
3334 : if (!err) {
3335 : inode->i_ctime = current_time(inode);
3336 : inode_inc_iversion(inode);
3337 : }
3338 : return err;
3339 : }
3340 :
3341 : static const struct xattr_handler shmem_security_xattr_handler = {
3342 : .prefix = XATTR_SECURITY_PREFIX,
3343 : .get = shmem_xattr_handler_get,
3344 : .set = shmem_xattr_handler_set,
3345 : };
3346 :
3347 : static const struct xattr_handler shmem_trusted_xattr_handler = {
3348 : .prefix = XATTR_TRUSTED_PREFIX,
3349 : .get = shmem_xattr_handler_get,
3350 : .set = shmem_xattr_handler_set,
3351 : };
3352 :
3353 : static const struct xattr_handler *shmem_xattr_handlers[] = {
3354 : &shmem_security_xattr_handler,
3355 : &shmem_trusted_xattr_handler,
3356 : NULL
3357 : };
3358 :
3359 : static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
3360 : {
3361 : struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3362 : return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
3363 : }
3364 : #endif /* CONFIG_TMPFS_XATTR */
3365 :
3366 : static const struct inode_operations shmem_short_symlink_operations = {
3367 : .getattr = shmem_getattr,
3368 : .get_link = simple_get_link,
3369 : #ifdef CONFIG_TMPFS_XATTR
3370 : .listxattr = shmem_listxattr,
3371 : #endif
3372 : };
3373 :
3374 : static const struct inode_operations shmem_symlink_inode_operations = {
3375 : .getattr = shmem_getattr,
3376 : .get_link = shmem_get_link,
3377 : #ifdef CONFIG_TMPFS_XATTR
3378 : .listxattr = shmem_listxattr,
3379 : #endif
3380 : };
3381 :
3382 : static struct dentry *shmem_get_parent(struct dentry *child)
3383 : {
3384 : return ERR_PTR(-ESTALE);
3385 : }
3386 :
3387 : static int shmem_match(struct inode *ino, void *vfh)
3388 : {
3389 : __u32 *fh = vfh;
3390 : __u64 inum = fh[2];
3391 : inum = (inum << 32) | fh[1];
3392 : return ino->i_ino == inum && fh[0] == ino->i_generation;
3393 : }
3394 :
3395 : /* Find any alias of inode, but prefer a hashed alias */
3396 : static struct dentry *shmem_find_alias(struct inode *inode)
3397 : {
3398 : struct dentry *alias = d_find_alias(inode);
3399 :
3400 : return alias ?: d_find_any_alias(inode);
3401 : }
3402 :
3403 :
3404 : static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
3405 : struct fid *fid, int fh_len, int fh_type)
3406 : {
3407 : struct inode *inode;
3408 : struct dentry *dentry = NULL;
3409 : u64 inum;
3410 :
3411 : if (fh_len < 3)
3412 : return NULL;
3413 :
3414 : inum = fid->raw[2];
3415 : inum = (inum << 32) | fid->raw[1];
3416 :
3417 : inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
3418 : shmem_match, fid->raw);
3419 : if (inode) {
3420 : dentry = shmem_find_alias(inode);
3421 : iput(inode);
3422 : }
3423 :
3424 : return dentry;
3425 : }
3426 :
3427 : static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
3428 : struct inode *parent)
3429 : {
3430 : if (*len < 3) {
3431 : *len = 3;
3432 : return FILEID_INVALID;
3433 : }
3434 :
3435 : if (inode_unhashed(inode)) {
3436 : /* Unfortunately insert_inode_hash is not idempotent,
3437 : * so as we hash inodes here rather than at creation
3438 : * time, we need a lock to ensure we only try
3439 : * to do it once.
3440 : */
3441 : static DEFINE_SPINLOCK(lock);
3442 : spin_lock(&lock);
3443 : if (inode_unhashed(inode))
3444 : __insert_inode_hash(inode,
3445 : inode->i_ino + inode->i_generation);
3446 : spin_unlock(&lock);
3447 : }
3448 :
3449 : fh[0] = inode->i_generation;
3450 : fh[1] = inode->i_ino;
3451 : fh[2] = ((__u64)inode->i_ino) >> 32;
3452 :
3453 : *len = 3;
3454 : return 1;
3455 : }
3456 :
3457 : static const struct export_operations shmem_export_ops = {
3458 : .get_parent = shmem_get_parent,
3459 : .encode_fh = shmem_encode_fh,
3460 : .fh_to_dentry = shmem_fh_to_dentry,
3461 : };
3462 :
3463 : enum shmem_param {
3464 : Opt_gid,
3465 : Opt_huge,
3466 : Opt_mode,
3467 : Opt_mpol,
3468 : Opt_nr_blocks,
3469 : Opt_nr_inodes,
3470 : Opt_size,
3471 : Opt_uid,
3472 : Opt_inode32,
3473 : Opt_inode64,
3474 : Opt_noswap,
3475 : };
3476 :
3477 : static const struct constant_table shmem_param_enums_huge[] = {
3478 : {"never", SHMEM_HUGE_NEVER },
3479 : {"always", SHMEM_HUGE_ALWAYS },
3480 : {"within_size", SHMEM_HUGE_WITHIN_SIZE },
3481 : {"advise", SHMEM_HUGE_ADVISE },
3482 : {}
3483 : };
3484 :
3485 : const struct fs_parameter_spec shmem_fs_parameters[] = {
3486 : fsparam_u32 ("gid", Opt_gid),
3487 : fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
3488 : fsparam_u32oct("mode", Opt_mode),
3489 : fsparam_string("mpol", Opt_mpol),
3490 : fsparam_string("nr_blocks", Opt_nr_blocks),
3491 : fsparam_string("nr_inodes", Opt_nr_inodes),
3492 : fsparam_string("size", Opt_size),
3493 : fsparam_u32 ("uid", Opt_uid),
3494 : fsparam_flag ("inode32", Opt_inode32),
3495 : fsparam_flag ("inode64", Opt_inode64),
3496 : fsparam_flag ("noswap", Opt_noswap),
3497 : {}
3498 : };
3499 :
3500 : static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
3501 : {
3502 : struct shmem_options *ctx = fc->fs_private;
3503 : struct fs_parse_result result;
3504 : unsigned long long size;
3505 : char *rest;
3506 : int opt;
3507 :
3508 : opt = fs_parse(fc, shmem_fs_parameters, param, &result);
3509 : if (opt < 0)
3510 : return opt;
3511 :
3512 : switch (opt) {
3513 : case Opt_size:
3514 : size = memparse(param->string, &rest);
3515 : if (*rest == '%') {
3516 : size <<= PAGE_SHIFT;
3517 : size *= totalram_pages();
3518 : do_div(size, 100);
3519 : rest++;
3520 : }
3521 : if (*rest)
3522 : goto bad_value;
3523 : ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3524 : ctx->seen |= SHMEM_SEEN_BLOCKS;
3525 : break;
3526 : case Opt_nr_blocks:
3527 : ctx->blocks = memparse(param->string, &rest);
3528 : if (*rest || ctx->blocks > S64_MAX)
3529 : goto bad_value;
3530 : ctx->seen |= SHMEM_SEEN_BLOCKS;
3531 : break;
3532 : case Opt_nr_inodes:
3533 : ctx->inodes = memparse(param->string, &rest);
3534 : if (*rest)
3535 : goto bad_value;
3536 : ctx->seen |= SHMEM_SEEN_INODES;
3537 : break;
3538 : case Opt_mode:
3539 : ctx->mode = result.uint_32 & 07777;
3540 : break;
3541 : case Opt_uid:
3542 : ctx->uid = make_kuid(current_user_ns(), result.uint_32);
3543 : if (!uid_valid(ctx->uid))
3544 : goto bad_value;
3545 : break;
3546 : case Opt_gid:
3547 : ctx->gid = make_kgid(current_user_ns(), result.uint_32);
3548 : if (!gid_valid(ctx->gid))
3549 : goto bad_value;
3550 : break;
3551 : case Opt_huge:
3552 : ctx->huge = result.uint_32;
3553 : if (ctx->huge != SHMEM_HUGE_NEVER &&
3554 : !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3555 : has_transparent_hugepage()))
3556 : goto unsupported_parameter;
3557 : ctx->seen |= SHMEM_SEEN_HUGE;
3558 : break;
3559 : case Opt_mpol:
3560 : if (IS_ENABLED(CONFIG_NUMA)) {
3561 : mpol_put(ctx->mpol);
3562 : ctx->mpol = NULL;
3563 : if (mpol_parse_str(param->string, &ctx->mpol))
3564 : goto bad_value;
3565 : break;
3566 : }
3567 : goto unsupported_parameter;
3568 : case Opt_inode32:
3569 : ctx->full_inums = false;
3570 : ctx->seen |= SHMEM_SEEN_INUMS;
3571 : break;
3572 : case Opt_inode64:
3573 : if (sizeof(ino_t) < 8) {
3574 : return invalfc(fc,
3575 : "Cannot use inode64 with <64bit inums in kernel\n");
3576 : }
3577 : ctx->full_inums = true;
3578 : ctx->seen |= SHMEM_SEEN_INUMS;
3579 : break;
3580 : case Opt_noswap:
3581 : if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
3582 : return invalfc(fc,
3583 : "Turning off swap in unprivileged tmpfs mounts unsupported");
3584 : }
3585 : ctx->noswap = true;
3586 : ctx->seen |= SHMEM_SEEN_NOSWAP;
3587 : break;
3588 : }
3589 : return 0;
3590 :
3591 : unsupported_parameter:
3592 : return invalfc(fc, "Unsupported parameter '%s'", param->key);
3593 : bad_value:
3594 : return invalfc(fc, "Bad value for '%s'", param->key);
3595 : }
3596 :
3597 : static int shmem_parse_options(struct fs_context *fc, void *data)
3598 : {
3599 : char *options = data;
3600 :
3601 : if (options) {
3602 : int err = security_sb_eat_lsm_opts(options, &fc->security);
3603 : if (err)
3604 : return err;
3605 : }
3606 :
3607 : while (options != NULL) {
3608 : char *this_char = options;
3609 : for (;;) {
3610 : /*
3611 : * NUL-terminate this option: unfortunately,
3612 : * mount options form a comma-separated list,
3613 : * but mpol's nodelist may also contain commas.
3614 : */
3615 : options = strchr(options, ',');
3616 : if (options == NULL)
3617 : break;
3618 : options++;
3619 : if (!isdigit(*options)) {
3620 : options[-1] = '\0';
3621 : break;
3622 : }
3623 : }
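/*
 * e.g. "mpol=bind:0,2,size=1g": the comma before "2" is followed by a
 * digit, so it stays part of the nodelist, while the comma before "size"
 * still splits the option list.
 */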
3624 : if (*this_char) {
3625 : char *value = strchr(this_char, '=');
3626 : size_t len = 0;
3627 : int err;
3628 :
3629 : if (value) {
3630 : *value++ = '\0';
3631 : len = strlen(value);
3632 : }
3633 : err = vfs_parse_fs_string(fc, this_char, value, len);
3634 : if (err < 0)
3635 : return err;
3636 : }
3637 : }
3638 : return 0;
3639 : }
3640 :
3641 : /*
3642 : * Reconfigure a shmem filesystem.
3643 : *
3644 : * Note that we disallow change from limited->unlimited blocks/inodes while any
3645 : * are in use; but we must separately disallow unlimited->limited, because in
3646 : * that case we have no record of how much is already in use.
3647 : */
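/*
 * Illustrative example (not part of this file): a remount such as
 * "mount -o remount,size=2g,nr_inodes=500k /tmp" is only accepted when the
 * mount already had finite limits and the new limits cover current usage.
 */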
3648 : static int shmem_reconfigure(struct fs_context *fc)
3649 : {
3650 : struct shmem_options *ctx = fc->fs_private;
3651 : struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
3652 : unsigned long inodes;
3653 : struct mempolicy *mpol = NULL;
3654 : const char *err;
3655 :
3656 : raw_spin_lock(&sbinfo->stat_lock);
3657 : inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3658 :
3659 : if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3660 : if (!sbinfo->max_blocks) {
3661 : err = "Cannot retroactively limit size";
3662 : goto out;
3663 : }
3664 : if (percpu_counter_compare(&sbinfo->used_blocks,
3665 : ctx->blocks) > 0) {
3666 : err = "Too small a size for current use";
3667 : goto out;
3668 : }
3669 : }
3670 : if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3671 : if (!sbinfo->max_inodes) {
3672 : err = "Cannot retroactively limit inodes";
3673 : goto out;
3674 : }
3675 : if (ctx->inodes < inodes) {
3676 : err = "Too few inodes for current use";
3677 : goto out;
3678 : }
3679 : }
3680 :
3681 : if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
3682 : sbinfo->next_ino > UINT_MAX) {
3683 : err = "Current inum too high to switch to 32-bit inums";
3684 : goto out;
3685 : }
3686 : if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
3687 : err = "Cannot disable swap on remount";
3688 : goto out;
3689 : }
3690 : if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
3691 : err = "Cannot enable swap on remount if it was disabled on first mount";
3692 : goto out;
3693 : }
3694 :
3695 : if (ctx->seen & SHMEM_SEEN_HUGE)
3696 : sbinfo->huge = ctx->huge;
3697 : if (ctx->seen & SHMEM_SEEN_INUMS)
3698 : sbinfo->full_inums = ctx->full_inums;
3699 : if (ctx->seen & SHMEM_SEEN_BLOCKS)
3700 : sbinfo->max_blocks = ctx->blocks;
3701 : if (ctx->seen & SHMEM_SEEN_INODES) {
3702 : sbinfo->max_inodes = ctx->inodes;
3703 : sbinfo->free_inodes = ctx->inodes - inodes;
3704 : }
3705 :
3706 : /*
3707 : * Preserve previous mempolicy unless mpol remount option was specified.
3708 : */
3709 : if (ctx->mpol) {
3710 : mpol = sbinfo->mpol;
3711 : sbinfo->mpol = ctx->mpol; /* transfers initial ref */
3712 : ctx->mpol = NULL;
3713 : }
3714 :
3715 : if (ctx->noswap)
3716 : sbinfo->noswap = true;
3717 :
3718 : raw_spin_unlock(&sbinfo->stat_lock);
3719 : mpol_put(mpol);
3720 : return 0;
3721 : out:
3722 : raw_spin_unlock(&sbinfo->stat_lock);
3723 : return invalfc(fc, "%s", err);
3724 : }
3725 :
3726 : static int shmem_show_options(struct seq_file *seq, struct dentry *root)
3727 : {
3728 : struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
3729 :
3730 : if (sbinfo->max_blocks != shmem_default_max_blocks())
3731 : seq_printf(seq, ",size=%luk",
3732 : sbinfo->max_blocks << (PAGE_SHIFT - 10));
3733 : if (sbinfo->max_inodes != shmem_default_max_inodes())
3734 : seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
3735 : if (sbinfo->mode != (0777 | S_ISVTX))
3736 : seq_printf(seq, ",mode=%03ho", sbinfo->mode);
3737 : if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
3738 : seq_printf(seq, ",uid=%u",
3739 : from_kuid_munged(&init_user_ns, sbinfo->uid));
3740 : if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
3741 : seq_printf(seq, ",gid=%u",
3742 : from_kgid_munged(&init_user_ns, sbinfo->gid));
3743 :
3744 : /*
3745 : * Showing inode{64,32} might be useful even if it's the system default,
3746 : * since then people don't have to resort to checking both here and
3747 : * /proc/config.gz (which may not even exist if IKCONFIG_PROC isn't
3748 : * enabled) to confirm 64-bit inums were successfully applied.
3749 : *
3750 : * We hide it when inode64 isn't the default and we are using 32-bit
3751 : * inodes, since that probably just means the feature isn't even under
3752 : * consideration.
3753 : *
3754 : * As such:
3755 : *
3756 : *                     +-----------------+-----------------+
3757 : *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
3758 : *  +------------------+-----------------+-----------------+
3759 : *  | full_inums=true  | show            | show            |
3760 : *  | full_inums=false | show            | hide            |
3761 : *  +------------------+-----------------+-----------------+
3762 : *
3763 : */
3764 : if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
3765 : seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
3766 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3767 : /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
3768 : if (sbinfo->huge)
3769 : seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
3770 : #endif
3771 : shmem_show_mpol(seq, sbinfo->mpol);
3772 : if (sbinfo->noswap)
3773 : seq_printf(seq, ",noswap");
3774 : return 0;
3775 : }
3776 :
3777 : #endif /* CONFIG_TMPFS */
3778 :
3779 0 : static void shmem_put_super(struct super_block *sb)
3780 : {
3781 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3782 :
3783 0 : free_percpu(sbinfo->ino_batch);
3784 0 : percpu_counter_destroy(&sbinfo->used_blocks);
3785 0 : mpol_put(sbinfo->mpol);
3786 0 : kfree(sbinfo);
3787 0 : sb->s_fs_info = NULL;
3788 0 : }
3789 :
3790 1 : static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
3791 : {
3792 1 : struct shmem_options *ctx = fc->fs_private;
3793 : struct inode *inode;
3794 : struct shmem_sb_info *sbinfo;
3795 :
3796 : /* Round up to L1_CACHE_BYTES to resist false sharing */
3797 1 : sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
3798 : L1_CACHE_BYTES), GFP_KERNEL);
3799 1 : if (!sbinfo)
3800 : return -ENOMEM;
3801 :
3802 1 : sb->s_fs_info = sbinfo;
3803 :
3804 : #ifdef CONFIG_TMPFS
3805 : /*
3806 : * By default we only allow half of the physical RAM per
3807 : * tmpfs instance, limiting inodes to one per page of lowmem;
3808 : * but the internal instance is left unlimited.
3809 : */
3810 : if (!(sb->s_flags & SB_KERNMOUNT)) {
3811 : if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3812 : ctx->blocks = shmem_default_max_blocks();
3813 : if (!(ctx->seen & SHMEM_SEEN_INODES))
3814 : ctx->inodes = shmem_default_max_inodes();
3815 : if (!(ctx->seen & SHMEM_SEEN_INUMS))
3816 : ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
3817 : sbinfo->noswap = ctx->noswap;
3818 : } else {
3819 : sb->s_flags |= SB_NOUSER;
3820 : }
3821 : sb->s_export_op = &shmem_export_ops;
3822 : sb->s_flags |= SB_NOSEC | SB_I_VERSION;
3823 : #else
3824 1 : sb->s_flags |= SB_NOUSER;
3825 : #endif
3826 1 : sbinfo->max_blocks = ctx->blocks;
3827 1 : sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
3828 1 : if (sb->s_flags & SB_KERNMOUNT) {
3829 1 : sbinfo->ino_batch = alloc_percpu(ino_t);
3830 1 : if (!sbinfo->ino_batch)
3831 : goto failed;
3832 : }
3833 1 : sbinfo->uid = ctx->uid;
3834 1 : sbinfo->gid = ctx->gid;
3835 1 : sbinfo->full_inums = ctx->full_inums;
3836 1 : sbinfo->mode = ctx->mode;
3837 1 : sbinfo->huge = ctx->huge;
3838 1 : sbinfo->mpol = ctx->mpol;
3839 1 : ctx->mpol = NULL;
3840 :
3841 : raw_spin_lock_init(&sbinfo->stat_lock);
3842 2 : if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
3843 : goto failed;
3844 1 : spin_lock_init(&sbinfo->shrinklist_lock);
3845 2 : INIT_LIST_HEAD(&sbinfo->shrinklist);
3846 :
3847 1 : sb->s_maxbytes = MAX_LFS_FILESIZE;
3848 1 : sb->s_blocksize = PAGE_SIZE;
3849 1 : sb->s_blocksize_bits = PAGE_SHIFT;
3850 1 : sb->s_magic = TMPFS_MAGIC;
3851 1 : sb->s_op = &shmem_ops;
3852 1 : sb->s_time_gran = 1;
3853 : #ifdef CONFIG_TMPFS_XATTR
3854 : sb->s_xattr = shmem_xattr_handlers;
3855 : #endif
3856 : #ifdef CONFIG_TMPFS_POSIX_ACL
3857 : sb->s_flags |= SB_POSIXACL;
3858 : #endif
3859 1 : uuid_gen(&sb->s_uuid);
3860 :
3861 1 : inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
3862 : VM_NORESERVE);
3863 1 : if (!inode)
3864 : goto failed;
3865 1 : inode->i_uid = sbinfo->uid;
3866 1 : inode->i_gid = sbinfo->gid;
3867 1 : sb->s_root = d_make_root(inode);
3868 1 : if (!sb->s_root)
3869 : goto failed;
3870 : return 0;
3871 :
3872 : failed:
3873 0 : shmem_put_super(sb);
3874 0 : return -ENOMEM;
3875 : }
3876 :
3877 1 : static int shmem_get_tree(struct fs_context *fc)
3878 : {
3879 1 : return get_tree_nodev(fc, shmem_fill_super);
3880 : }
3881 :
3882 1 : static void shmem_free_fc(struct fs_context *fc)
3883 : {
3884 1 : struct shmem_options *ctx = fc->fs_private;
3885 :
3886 1 : if (ctx) {
3887 1 : mpol_put(ctx->mpol);
3888 1 : kfree(ctx);
3889 : }
3890 1 : }
3891 :
3892 : static const struct fs_context_operations shmem_fs_context_ops = {
3893 : .free = shmem_free_fc,
3894 : .get_tree = shmem_get_tree,
3895 : #ifdef CONFIG_TMPFS
3896 : .parse_monolithic = shmem_parse_options,
3897 : .parse_param = shmem_parse_one,
3898 : .reconfigure = shmem_reconfigure,
3899 : #endif
3900 : };
3901 :
3902 : static struct kmem_cache *shmem_inode_cachep;
3903 :
3904 1 : static struct inode *shmem_alloc_inode(struct super_block *sb)
3905 : {
3906 : struct shmem_inode_info *info;
3907 2 : info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
3908 1 : if (!info)
3909 : return NULL;
3910 1 : return &info->vfs_inode;
3911 : }
3912 :
3913 0 : static void shmem_free_in_core_inode(struct inode *inode)
3914 : {
3915 0 : if (S_ISLNK(inode->i_mode))
3916 0 : kfree(inode->i_link);
3917 0 : kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
3918 0 : }
3919 :
3920 0 : static void shmem_destroy_inode(struct inode *inode)
3921 : {
3922 : if (S_ISREG(inode->i_mode))
3923 : mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3924 0 : }
3925 :
3926 12 : static void shmem_init_inode(void *foo)
3927 : {
3928 12 : struct shmem_inode_info *info = foo;
3929 12 : inode_init_once(&info->vfs_inode);
3930 12 : }
3931 :
3932 : static void shmem_init_inodecache(void)
3933 : {
3934 1 : shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
3935 : sizeof(struct shmem_inode_info),
3936 : 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
3937 : }
3938 :
3939 : static void shmem_destroy_inodecache(void)
3940 : {
3941 0 : kmem_cache_destroy(shmem_inode_cachep);
3942 : }
3943 :
3944 : /* Keep the page in page cache instead of truncating it */
3945 0 : static int shmem_error_remove_page(struct address_space *mapping,
3946 : struct page *page)
3947 : {
3948 0 : return 0;
3949 : }
3950 :
3951 : const struct address_space_operations shmem_aops = {
3952 : .writepage = shmem_writepage,
3953 : .dirty_folio = noop_dirty_folio,
3954 : #ifdef CONFIG_TMPFS
3955 : .write_begin = shmem_write_begin,
3956 : .write_end = shmem_write_end,
3957 : #endif
3958 : #ifdef CONFIG_MIGRATION
3959 : .migrate_folio = migrate_folio,
3960 : #endif
3961 : .error_remove_page = shmem_error_remove_page,
3962 : };
3963 : EXPORT_SYMBOL(shmem_aops);
3964 :
3965 : static const struct file_operations shmem_file_operations = {
3966 : .mmap = shmem_mmap,
3967 : .open = generic_file_open,
3968 : .get_unmapped_area = shmem_get_unmapped_area,
3969 : #ifdef CONFIG_TMPFS
3970 : .llseek = shmem_file_llseek,
3971 : .read_iter = shmem_file_read_iter,
3972 : .write_iter = generic_file_write_iter,
3973 : .fsync = noop_fsync,
3974 : .splice_read = generic_file_splice_read,
3975 : .splice_write = iter_file_splice_write,
3976 : .fallocate = shmem_fallocate,
3977 : #endif
3978 : };
3979 :
3980 : static const struct inode_operations shmem_inode_operations = {
3981 : .getattr = shmem_getattr,
3982 : .setattr = shmem_setattr,
3983 : #ifdef CONFIG_TMPFS_XATTR
3984 : .listxattr = shmem_listxattr,
3985 : .set_acl = simple_set_acl,
3986 : .fileattr_get = shmem_fileattr_get,
3987 : .fileattr_set = shmem_fileattr_set,
3988 : #endif
3989 : };
3990 :
3991 : static const struct inode_operations shmem_dir_inode_operations = {
3992 : #ifdef CONFIG_TMPFS
3993 : .getattr = shmem_getattr,
3994 : .create = shmem_create,
3995 : .lookup = simple_lookup,
3996 : .link = shmem_link,
3997 : .unlink = shmem_unlink,
3998 : .symlink = shmem_symlink,
3999 : .mkdir = shmem_mkdir,
4000 : .rmdir = shmem_rmdir,
4001 : .mknod = shmem_mknod,
4002 : .rename = shmem_rename2,
4003 : .tmpfile = shmem_tmpfile,
4004 : #endif
4005 : #ifdef CONFIG_TMPFS_XATTR
4006 : .listxattr = shmem_listxattr,
4007 : .fileattr_get = shmem_fileattr_get,
4008 : .fileattr_set = shmem_fileattr_set,
4009 : #endif
4010 : #ifdef CONFIG_TMPFS_POSIX_ACL
4011 : .setattr = shmem_setattr,
4012 : .set_acl = simple_set_acl,
4013 : #endif
4014 : };
4015 :
4016 : static const struct inode_operations shmem_special_inode_operations = {
4017 : .getattr = shmem_getattr,
4018 : #ifdef CONFIG_TMPFS_XATTR
4019 : .listxattr = shmem_listxattr,
4020 : #endif
4021 : #ifdef CONFIG_TMPFS_POSIX_ACL
4022 : .setattr = shmem_setattr,
4023 : .set_acl = simple_set_acl,
4024 : #endif
4025 : };
4026 :
4027 : static const struct super_operations shmem_ops = {
4028 : .alloc_inode = shmem_alloc_inode,
4029 : .free_inode = shmem_free_in_core_inode,
4030 : .destroy_inode = shmem_destroy_inode,
4031 : #ifdef CONFIG_TMPFS
4032 : .statfs = shmem_statfs,
4033 : .show_options = shmem_show_options,
4034 : #endif
4035 : .evict_inode = shmem_evict_inode,
4036 : .drop_inode = generic_delete_inode,
4037 : .put_super = shmem_put_super,
4038 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4039 : .nr_cached_objects = shmem_unused_huge_count,
4040 : .free_cached_objects = shmem_unused_huge_scan,
4041 : #endif
4042 : };
4043 :
4044 : static const struct vm_operations_struct shmem_vm_ops = {
4045 : .fault = shmem_fault,
4046 : .map_pages = filemap_map_pages,
4047 : #ifdef CONFIG_NUMA
4048 : .set_policy = shmem_set_policy,
4049 : .get_policy = shmem_get_policy,
4050 : #endif
4051 : };
4052 :
4053 : static const struct vm_operations_struct shmem_anon_vm_ops = {
4054 : .fault = shmem_fault,
4055 : .map_pages = filemap_map_pages,
4056 : #ifdef CONFIG_NUMA
4057 : .set_policy = shmem_set_policy,
4058 : .get_policy = shmem_get_policy,
4059 : #endif
4060 : };
4061 :
4062 1 : int shmem_init_fs_context(struct fs_context *fc)
4063 : {
4064 : struct shmem_options *ctx;
4065 :
4066 1 : ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
4067 1 : if (!ctx)
4068 : return -ENOMEM;
4069 :
4070 1 : ctx->mode = 0777 | S_ISVTX;
4071 1 : ctx->uid = current_fsuid();
4072 1 : ctx->gid = current_fsgid();
4073 :
4074 1 : fc->fs_private = ctx;
4075 1 : fc->ops = &shmem_fs_context_ops;
4076 1 : return 0;
4077 : }
4078 :
4079 : static struct file_system_type shmem_fs_type = {
4080 : .owner = THIS_MODULE,
4081 : .name = "tmpfs",
4082 : .init_fs_context = shmem_init_fs_context,
4083 : #ifdef CONFIG_TMPFS
4084 : .parameters = shmem_fs_parameters,
4085 : #endif
4086 : .kill_sb = kill_litter_super,
4087 : #ifdef CONFIG_SHMEM
4088 : .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
4089 : #else
4090 : .fs_flags = FS_USERNS_MOUNT,
4091 : #endif
4092 : };
4093 :
4094 1 : void __init shmem_init(void)
4095 : {
4096 : int error;
4097 :
4098 : shmem_init_inodecache();
4099 :
4100 1 : error = register_filesystem(&shmem_fs_type);
4101 1 : if (error) {
4102 0 : pr_err("Could not register tmpfs\n");
4103 0 : goto out2;
4104 : }
4105 :
4106 1 : shm_mnt = kern_mount(&shmem_fs_type);
4107 2 : if (IS_ERR(shm_mnt)) {
4108 0 : error = PTR_ERR(shm_mnt);
4109 0 : pr_err("Could not kern_mount tmpfs\n");
4110 : goto out1;
4111 : }
4112 :
4113 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4114 : if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
4115 : SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
4116 : else
4117 : shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
4118 : #endif
4119 : return;
4120 :
4121 : out1:
4122 0 : unregister_filesystem(&shmem_fs_type);
4123 : out2:
4124 : shmem_destroy_inodecache();
4125 0 : shm_mnt = ERR_PTR(error);
4126 : }
4127 :
4128 : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
4129 : static ssize_t shmem_enabled_show(struct kobject *kobj,
4130 : struct kobj_attribute *attr, char *buf)
4131 : {
4132 : static const int values[] = {
4133 : SHMEM_HUGE_ALWAYS,
4134 : SHMEM_HUGE_WITHIN_SIZE,
4135 : SHMEM_HUGE_ADVISE,
4136 : SHMEM_HUGE_NEVER,
4137 : SHMEM_HUGE_DENY,
4138 : SHMEM_HUGE_FORCE,
4139 : };
4140 : int len = 0;
4141 : int i;
4142 :
4143 : for (i = 0; i < ARRAY_SIZE(values); i++) {
4144 : len += sysfs_emit_at(buf, len,
4145 : shmem_huge == values[i] ? "%s[%s]" : "%s%s",
4146 : i ? " " : "",
4147 : shmem_format_huge(values[i]));
4148 : }
4149 :
4150 : len += sysfs_emit_at(buf, len, "\n");
4151 :
4152 : return len;
4153 : }
4154 :
4155 : static ssize_t shmem_enabled_store(struct kobject *kobj,
4156 : struct kobj_attribute *attr, const char *buf, size_t count)
4157 : {
4158 : char tmp[16];
4159 : int huge;
4160 :
4161 : if (count + 1 > sizeof(tmp))
4162 : return -EINVAL;
4163 : memcpy(tmp, buf, count);
4164 : tmp[count] = '\0';
4165 : if (count && tmp[count - 1] == '\n')
4166 : tmp[count - 1] = '\0';
4167 :
4168 : huge = shmem_parse_huge(tmp);
4169 : if (huge == -EINVAL)
4170 : return -EINVAL;
4171 : if (!has_transparent_hugepage() &&
4172 : huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
4173 : return -EINVAL;
4174 :
4175 : shmem_huge = huge;
4176 : if (shmem_huge > SHMEM_HUGE_DENY)
4177 : SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
4178 : return count;
4179 : }
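/*
 * These handlers back /sys/kernel/mm/transparent_hugepage/shmem_enabled;
 * e.g. "echo within_size > .../shmem_enabled" updates shmem_huge and, for
 * values above SHMEM_HUGE_DENY, the huge policy of the internal shm_mnt
 * mount as well.
 */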
4180 :
4181 : struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
4182 : #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
4183 :
4184 : #else /* !CONFIG_SHMEM */
4185 :
4186 : /*
4187 : * tiny-shmem: simple shmemfs and tmpfs using ramfs code
4188 : *
4189 : * This is intended for small systems where the benefits of the full
4190 : * shmem code (swap-backed and resource-limited) are outweighed by
4191 : * its complexity. On systems without swap this code should be
4192 : * effectively equivalent, but much lighter weight.
4193 : */
4194 :
4195 : static struct file_system_type shmem_fs_type = {
4196 : .name = "tmpfs",
4197 : .init_fs_context = ramfs_init_fs_context,
4198 : .parameters = ramfs_fs_parameters,
4199 : .kill_sb = kill_litter_super,
4200 : .fs_flags = FS_USERNS_MOUNT,
4201 : };
4202 :
4203 : void __init shmem_init(void)
4204 : {
4205 : BUG_ON(register_filesystem(&shmem_fs_type) != 0);
4206 :
4207 : shm_mnt = kern_mount(&shmem_fs_type);
4208 : BUG_ON(IS_ERR(shm_mnt));
4209 : }
4210 :
4211 : int shmem_unuse(unsigned int type)
4212 : {
4213 : return 0;
4214 : }
4215 :
4216 : int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
4217 : {
4218 : return 0;
4219 : }
4220 :
4221 : void shmem_unlock_mapping(struct address_space *mapping)
4222 : {
4223 : }
4224 :
4225 : #ifdef CONFIG_MMU
4226 : unsigned long shmem_get_unmapped_area(struct file *file,
4227 : unsigned long addr, unsigned long len,
4228 : unsigned long pgoff, unsigned long flags)
4229 : {
4230 : return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
4231 : }
4232 : #endif
4233 :
4234 : void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
4235 : {
4236 : truncate_inode_pages_range(inode->i_mapping, lstart, lend);
4237 : }
4238 : EXPORT_SYMBOL_GPL(shmem_truncate_range);
4239 :
4240 : #define shmem_vm_ops generic_file_vm_ops
4241 : #define shmem_anon_vm_ops generic_file_vm_ops
4242 : #define shmem_file_operations ramfs_file_operations
4243 : #define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
4244 : #define shmem_acct_size(flags, size) 0
4245 : #define shmem_unacct_size(flags, size) do {} while (0)
4246 :
4247 : #endif /* CONFIG_SHMEM */
4248 :
4249 : /* common code */
4250 :
4251 0 : static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
4252 : unsigned long flags, unsigned int i_flags)
4253 : {
4254 : struct inode *inode;
4255 : struct file *res;
4256 :
4257 0 : if (IS_ERR(mnt))
4258 : return ERR_CAST(mnt);
4259 :
4260 0 : if (size < 0 || size > MAX_LFS_FILESIZE)
4261 : return ERR_PTR(-EINVAL);
4262 :
4263 0 : if (shmem_acct_size(flags, size))
4264 : return ERR_PTR(-ENOMEM);
4265 :
4266 0 : if (is_idmapped_mnt(mnt))
4267 : return ERR_PTR(-EINVAL);
4268 :
4269 0 : inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
4270 : S_IFREG | S_IRWXUGO, 0, flags);
4271 0 : if (unlikely(!inode)) {
4272 : shmem_unacct_size(flags, size);
4273 : return ERR_PTR(-ENOSPC);
4274 : }
4275 0 : inode->i_flags |= i_flags;
4276 0 : inode->i_size = size;
4277 0 : clear_nlink(inode); /* It is unlinked */
4278 0 : res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
4279 0 : if (!IS_ERR(res))
4280 0 : res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
4281 : &shmem_file_operations);
4282 0 : if (IS_ERR(res))
4283 0 : iput(inode);
4284 : return res;
4285 : }
4286 :
4287 : /**
4288 : * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
4289 : * kernel internal. There will be NO LSM permission checks against the
4290 : * underlying inode. So users of this interface must do LSM checks at a
4291 : * higher layer. The users are the big_key and shm implementations. LSM
4292 : * checks are provided at the key or shm level rather than the inode.
4293 : * @name: name for dentry (to be seen in /proc/<pid>/maps)
4294 : * @size: size to be set for the file
4295 : * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4296 : */
4297 0 : struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
4298 : {
4299 0 : return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
4300 : }
4301 :
4302 : /**
4303 : * shmem_file_setup - get an unlinked file living in tmpfs
4304 : * @name: name for dentry (to be seen in /proc/<pid>/maps)
4305 : * @size: size to be set for the file
4306 : * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4307 : */
4308 0 : struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
4309 : {
4310 0 : return __shmem_file_setup(shm_mnt, name, size, flags, 0);
4311 : }
4312 : EXPORT_SYMBOL_GPL(shmem_file_setup);
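/*
 * Illustrative sketch (not part of this file): a typical in-kernel caller
 * keeps the returned struct file and drops it with fput() when done, e.g.
 *
 *	struct file *filp = shmem_file_setup("my-buf", SZ_1M, VM_NORESERVE);
 *	if (IS_ERR(filp))
 *		return PTR_ERR(filp);
 *	...
 *	fput(filp);
 *
 * "my-buf" and SZ_1M are placeholder values chosen for the example.
 */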
4313 :
4314 : /**
4315 : * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
4316 : * @mnt: the tmpfs mount where the file will be created
4317 : * @name: name for dentry (to be seen in /proc/<pid>/maps)
4318 : * @size: size to be set for the file
4319 : * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4320 : */
4321 0 : struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
4322 : loff_t size, unsigned long flags)
4323 : {
4324 0 : return __shmem_file_setup(mnt, name, size, flags, 0);
4325 : }
4326 : EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
4327 :
4328 : /**
4329 : * shmem_zero_setup - setup a shared anonymous mapping
4330 : * @vma: the vma to be mmapped, as prepared by do_mmap()
4331 : */
4332 0 : int shmem_zero_setup(struct vm_area_struct *vma)
4333 : {
4334 : struct file *file;
4335 0 : loff_t size = vma->vm_end - vma->vm_start;
4336 :
4337 : /*
4338 : * Cloning a new file under mmap_lock leads to a lock ordering conflict
4339 : * between XFS directory reading and selinux: since this file is only
4340 : * accessible to the user through its mapping, use S_PRIVATE flag to
4341 : * bypass file security, in the same way as shmem_kernel_file_setup().
4342 : */
4343 0 : file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
4344 0 : if (IS_ERR(file))
4345 0 : return PTR_ERR(file);
4346 :
4347 0 : if (vma->vm_file)
4348 0 : fput(vma->vm_file);
4349 0 : vma->vm_file = file;
4350 0 : vma->vm_ops = &shmem_anon_vm_ops;
4351 :
4352 0 : return 0;
4353 : }
4354 :
4355 : /**
4356 : * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
4357 : * @mapping: the folio's address_space
4358 : * @index: the folio index
4359 : * @gfp: the page allocator flags to use if allocating
4360 : *
4361 : * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4362 : * with any new page allocations done using the specified allocation flags.
4363 : * But read_cache_page_gfp() uses the ->read_folio() method: which does not
4364 : * suit tmpfs, since it may have pages in swapcache, and needs to find those
4365 : * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
4366 : *
4367 : * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
4368 : * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4369 : */
4370 0 : struct folio *shmem_read_folio_gfp(struct address_space *mapping,
4371 : pgoff_t index, gfp_t gfp)
4372 : {
4373 : #ifdef CONFIG_SHMEM
4374 0 : struct inode *inode = mapping->host;
4375 : struct folio *folio;
4376 : int error;
4377 :
4378 0 : BUG_ON(!shmem_mapping(mapping));
4379 0 : error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
4380 : gfp, NULL, NULL, NULL);
4381 0 : if (error)
4382 0 : return ERR_PTR(error);
4383 :
4384 0 : folio_unlock(folio);
4385 0 : return folio;
4386 : #else
4387 : /*
4388 : * The tiny !SHMEM case uses ramfs without swap
4389 : */
4390 : return mapping_read_folio_gfp(mapping, index, gfp);
4391 : #endif
4392 : }
4393 : EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
4394 :
4395 0 : struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4396 : pgoff_t index, gfp_t gfp)
4397 : {
4398 0 : struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
4399 : struct page *page;
4400 :
4401 0 : if (IS_ERR(folio))
4402 0 : return &folio->page;
4403 :
4404 0 : page = folio_file_page(folio, index);
4405 : if (PageHWPoison(page)) {
4406 : folio_put(folio);
4407 : return ERR_PTR(-EIO);
4408 : }
4409 :
4410 0 : return page;
4411 : }
4412 : EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
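/*
 * Illustrative sketch (not part of this file): GPU drivers typically relax
 * the allocation flags when populating object pages, e.g.
 *
 *	page = shmem_read_mapping_page_gfp(mapping, i,
 *			mapping_gfp_mask(mapping) |
 *			__GFP_NORETRY | __GFP_NOWARN);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *
 * "mapping" and "i" stand for the caller's address_space and page index.
 */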
|