Line data Source code
1 : /*
2 : * Resizable virtual memory filesystem for Linux.
3 : *
4 : * Copyright (C) 2000 Linus Torvalds.
5 : * 2000 Transmeta Corp.
6 : * 2000-2001 Christoph Rohland
7 : * 2000-2001 SAP AG
8 : * 2002 Red Hat Inc.
9 : * Copyright (C) 2002-2011 Hugh Dickins.
10 : * Copyright (C) 2011 Google Inc.
11 : * Copyright (C) 2002-2005 VERITAS Software Corporation.
12 : * Copyright (C) 2004 Andi Kleen, SuSE Labs
13 : *
14 : * Extended attribute support for tmpfs:
15 : * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16 : * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17 : *
18 : * tiny-shmem:
19 : * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20 : *
21 : * This file is released under the GPL.
22 : */
23 :
24 : #include <linux/fs.h>
25 : #include <linux/init.h>
26 : #include <linux/vfs.h>
27 : #include <linux/mount.h>
28 : #include <linux/ramfs.h>
29 : #include <linux/pagemap.h>
30 : #include <linux/file.h>
31 : #include <linux/fileattr.h>
32 : #include <linux/mm.h>
33 : #include <linux/random.h>
34 : #include <linux/sched/signal.h>
35 : #include <linux/export.h>
36 : #include <linux/shmem_fs.h>
37 : #include <linux/swap.h>
38 : #include <linux/uio.h>
39 : #include <linux/hugetlb.h>
40 : #include <linux/fs_parser.h>
41 : #include <linux/swapfile.h>
42 : #include <linux/iversion.h>
43 : #include "swap.h"
44 :
45 : static struct vfsmount *shm_mnt;
46 :
47 : #ifdef CONFIG_SHMEM
48 : /*
49 : * This virtual memory filesystem is heavily based on ramfs. It
50 : * extends ramfs with the ability to use swap and honor resource limits,
51 : * which makes it a completely usable filesystem.
52 : */
53 :
54 : #include <linux/xattr.h>
55 : #include <linux/exportfs.h>
56 : #include <linux/posix_acl.h>
57 : #include <linux/posix_acl_xattr.h>
58 : #include <linux/mman.h>
59 : #include <linux/string.h>
60 : #include <linux/slab.h>
61 : #include <linux/backing-dev.h>
62 : #include <linux/writeback.h>
63 : #include <linux/pagevec.h>
64 : #include <linux/percpu_counter.h>
65 : #include <linux/falloc.h>
66 : #include <linux/splice.h>
67 : #include <linux/security.h>
68 : #include <linux/swapops.h>
69 : #include <linux/mempolicy.h>
70 : #include <linux/namei.h>
71 : #include <linux/ctype.h>
72 : #include <linux/migrate.h>
73 : #include <linux/highmem.h>
74 : #include <linux/seq_file.h>
75 : #include <linux/magic.h>
76 : #include <linux/syscalls.h>
77 : #include <linux/fcntl.h>
78 : #include <uapi/linux/memfd.h>
79 : #include <linux/rmap.h>
80 : #include <linux/uuid.h>
81 :
82 : #include <linux/uaccess.h>
83 :
84 : #include "internal.h"
85 :
86 : #define BLOCKS_PER_PAGE (PAGE_SIZE/512)
87 : #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
88 :
89 : /* Pretend that each entry is of this size in a directory's i_size */
90 : #define BOGO_DIRENT_SIZE 20
91 :
92 : /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
93 : #define SHORT_SYMLINK_LEN 128
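/*
 * Editor's worked example, not part of mm/shmem.c: assuming PAGE_SIZE == 4096
 * and PAGE_SHIFT == 12 (the common x86-64 configuration),
 *
 *   BLOCKS_PER_PAGE == 4096 / 512 == 8     512-byte blocks per page, the unit
 *                                          used when updating inode->i_blocks
 *   VM_ACCT(1)      == 1                   anything up to one page accounts
 *                                          as a whole page
 *   VM_ACCT(8193)   == 3                   8193 bytes round up to three pages
 *
 * BOGO_DIRENT_SIZE and SHORT_SYMLINK_LEN are tuning constants: each directory
 * entry pretends to contribute 20 bytes to i_size, and symlink targets of up
 * to 128 bytes live in kmalloc'ed memory instead of a swappable page.
 */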
94 :
95 : /*
96 : * shmem_fallocate communicates with shmem_fault or shmem_writepage via
97 : * inode->i_private (with i_rwsem making sure that it has only one user at
98 : * a time): we would prefer not to enlarge the shmem inode just for that.
99 : */
100 : struct shmem_falloc {
101 : wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
102 : pgoff_t start; /* start of range currently being fallocated */
103 : pgoff_t next; /* the next page offset to be fallocated */
104 : pgoff_t nr_falloced; /* how many new pages have been fallocated */
105 : pgoff_t nr_unswapped; /* how often writepage refused to swap out */
106 : };
107 :
108 : struct shmem_options {
109 : unsigned long long blocks;
110 : unsigned long long inodes;
111 : struct mempolicy *mpol;
112 : kuid_t uid;
113 : kgid_t gid;
114 : umode_t mode;
115 : bool full_inums;
116 : int huge;
117 : int seen;
118 : bool noswap;
119 : #define SHMEM_SEEN_BLOCKS 1
120 : #define SHMEM_SEEN_INODES 2
121 : #define SHMEM_SEEN_HUGE 4
122 : #define SHMEM_SEEN_INUMS 8
123 : #define SHMEM_SEEN_NOSWAP 16
124 : };
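/*
 * Editor's note, illustrative and not part of mm/shmem.c: the SHMEM_SEEN_*
 * bits record which mount options were actually supplied, so that a remount
 * only re-applies what the user mentioned.  The option parser would typically
 * do something along the lines of
 *
 *	ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
 *	ctx->seen  |= SHMEM_SEEN_BLOCKS;
 *
 * and the code applying the options then tests (ctx->seen & SHMEM_SEEN_BLOCKS)
 * before touching sbinfo->max_blocks.
 */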
125 :
126 : #ifdef CONFIG_TMPFS
127 : static unsigned long shmem_default_max_blocks(void)
128 : {
129 : return totalram_pages() / 2;
130 : }
131 :
132 : static unsigned long shmem_default_max_inodes(void)
133 : {
134 : unsigned long nr_pages = totalram_pages();
135 :
136 : return min(nr_pages - totalhigh_pages(), nr_pages / 2);
137 : }
138 : #endif
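/*
 * Editor's worked example, not part of mm/shmem.c: on a 64-bit machine with
 * 8 GiB of RAM and 4 KiB pages there are 2,097,152 pages and no highmem, so
 * an unconfigured tmpfs mount defaults to
 *
 *   max_blocks = 2,097,152 / 2                 = 1,048,576 pages (size = 4 GiB)
 *   max_inodes = min(2,097,152 - 0, 1,048,576) = 1,048,576 inodes
 *
 * Both defaults can be overridden with the size= and nr_inodes= mount options.
 */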
139 :
140 : static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
141 : struct folio **foliop, enum sgp_type sgp,
142 : gfp_t gfp, struct vm_area_struct *vma,
143 : vm_fault_t *fault_type);
144 :
145 : static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
146 : {
147 : return sb->s_fs_info;
148 : }
149 :
150 : /*
151 : * shmem_file_setup pre-accounts the whole fixed size of a VM object,
152 : * for shared memory and for shared anonymous (/dev/zero) mappings
153 : * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
154 : * consistent with the pre-accounting of private mappings ...
155 : */
156 0 : static inline int shmem_acct_size(unsigned long flags, loff_t size)
157 : {
158 0 : return (flags & VM_NORESERVE) ?
159 0 : 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
160 : }
161 :
162 : static inline void shmem_unacct_size(unsigned long flags, loff_t size)
163 : {
164 0 : if (!(flags & VM_NORESERVE))
165 0 : vm_unacct_memory(VM_ACCT(size));
166 : }
167 :
168 0 : static inline int shmem_reacct_size(unsigned long flags,
169 : loff_t oldsize, loff_t newsize)
170 : {
171 0 : if (!(flags & VM_NORESERVE)) {
172 0 : if (VM_ACCT(newsize) > VM_ACCT(oldsize))
173 0 : return security_vm_enough_memory_mm(current->mm,
174 0 : VM_ACCT(newsize) - VM_ACCT(oldsize));
175 0 : else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
176 0 : vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
177 : }
178 : return 0;
179 : }
180 :
181 : /*
182 : * ... whereas tmpfs objects are accounted incrementally as
183 : * pages are allocated, in order to allow large sparse files.
184 : * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
185 : * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
186 : */
187 : static inline int shmem_acct_block(unsigned long flags, long pages)
188 : {
189 0 : if (!(flags & VM_NORESERVE))
190 : return 0;
191 :
192 0 : return security_vm_enough_memory_mm(current->mm,
193 : pages * VM_ACCT(PAGE_SIZE));
194 : }
195 :
196 : static inline void shmem_unacct_blocks(unsigned long flags, long pages)
197 : {
198 0 : if (flags & VM_NORESERVE)
199 : vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
200 : }
201 :
202 0 : static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
203 : {
204 0 : struct shmem_inode_info *info = SHMEM_I(inode);
205 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
206 :
207 0 : if (shmem_acct_block(info->flags, pages))
208 : return false;
209 :
210 0 : if (sbinfo->max_blocks) {
211 0 : if (percpu_counter_compare(&sbinfo->used_blocks,
212 0 : sbinfo->max_blocks - pages) > 0)
213 : goto unacct;
214 0 : percpu_counter_add(&sbinfo->used_blocks, pages);
215 : }
216 :
217 : return true;
218 :
219 : unacct:
220 0 : shmem_unacct_blocks(info->flags, pages);
221 : return false;
222 : }
223 :
224 0 : static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
225 : {
226 0 : struct shmem_inode_info *info = SHMEM_I(inode);
227 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
228 :
229 0 : if (sbinfo->max_blocks)
230 0 : percpu_counter_sub(&sbinfo->used_blocks, pages);
231 0 : shmem_unacct_blocks(info->flags, pages);
232 0 : }
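/*
 * Editor's sketch, not kernel code: the "charge, then roll back on failure"
 * pattern that shmem_inode_acct_block() implements above, reduced to plain
 * user-space counters.  All names below are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

static long committed_vm;                /* stands in for the overcommit charge */
static long used_blocks, max_blocks = 4; /* stands in for the size= block limit */

static bool inode_acct_block(long pages)
{
	committed_vm += pages;                   /* step 1: per-mm accounting */

	if (max_blocks && used_blocks + pages > max_blocks) {
		committed_vm -= pages;           /* step 2 failed: undo step 1 */
		return false;                    /* shmem reports this as -ENOSPC */
	}
	used_blocks += pages;                    /* step 2: per-superblock limit */
	return true;
}

int main(void)
{
	printf("%d\n", inode_acct_block(3));     /* prints 1: fits under the limit */
	printf("%d\n", inode_acct_block(2));     /* prints 0: would exceed max_blocks */
	printf("committed=%ld used=%ld\n", committed_vm, used_blocks);  /* 3 and 3 */
	return 0;
}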
233 :
234 : static const struct super_operations shmem_ops;
235 : const struct address_space_operations shmem_aops;
236 : static const struct file_operations shmem_file_operations;
237 : static const struct inode_operations shmem_inode_operations;
238 : static const struct inode_operations shmem_dir_inode_operations;
239 : static const struct inode_operations shmem_special_inode_operations;
240 : static const struct vm_operations_struct shmem_vm_ops;
241 : static const struct vm_operations_struct shmem_anon_vm_ops;
242 : static struct file_system_type shmem_fs_type;
243 :
244 0 : bool vma_is_anon_shmem(struct vm_area_struct *vma)
245 : {
246 0 : return vma->vm_ops == &shmem_anon_vm_ops;
247 : }
248 :
249 0 : bool vma_is_shmem(struct vm_area_struct *vma)
250 : {
251 0 : return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
252 : }
253 :
254 : static LIST_HEAD(shmem_swaplist);
255 : static DEFINE_MUTEX(shmem_swaplist_mutex);
256 :
257 : /*
258 : * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
259 : * produces a novel ino for the newly allocated inode.
260 : *
261 : * It may also be called when making a hard link to account for the space needed by
262 : * each dentry. However, in that case, no new inode number is needed since that
263 : * internally draws from another pool of inode numbers (currently global
264 : * get_next_ino()). This case is indicated by passing NULL as inop.
265 : */
266 : #define SHMEM_INO_BATCH 1024
267 1 : static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
268 : {
269 1 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
270 : ino_t ino;
271 :
272 1 : if (!(sb->s_flags & SB_KERNMOUNT)) {
273 0 : raw_spin_lock(&sbinfo->stat_lock);
274 0 : if (sbinfo->max_inodes) {
275 0 : if (!sbinfo->free_inodes) {
276 0 : raw_spin_unlock(&sbinfo->stat_lock);
277 0 : return -ENOSPC;
278 : }
279 0 : sbinfo->free_inodes--;
280 : }
281 0 : if (inop) {
282 0 : ino = sbinfo->next_ino++;
283 0 : if (unlikely(is_zero_ino(ino)))
284 0 : ino = sbinfo->next_ino++;
285 0 : if (unlikely(!sbinfo->full_inums &&
286 : ino > UINT_MAX)) {
287 : /*
288 : * Emulate get_next_ino uint wraparound for
289 : * compatibility
290 : */
291 : if (IS_ENABLED(CONFIG_64BIT))
292 0 : pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
293 : __func__, MINOR(sb->s_dev));
294 : sbinfo->next_ino = 1;
295 0 : ino = sbinfo->next_ino++;
296 : }
297 0 : *inop = ino;
298 : }
299 0 : raw_spin_unlock(&sbinfo->stat_lock);
300 1 : } else if (inop) {
301 : /*
302 : * __shmem_file_setup, one of our callers, is lock-free: it
303 : * doesn't hold stat_lock in shmem_reserve_inode since
304 : * max_inodes is always 0, and is called from potentially
305 : * unknown contexts. As such, use a per-cpu batched allocator
306 : * which doesn't require the per-sb stat_lock unless we are at
307 : * the batch boundary.
308 : *
309 : * We don't need to worry about inode{32,64} since SB_KERNMOUNT
310 : * shmem mounts are not exposed to userspace, so we don't need
311 : * to worry about things like glibc compatibility.
312 : */
313 : ino_t *next_ino;
314 :
315 1 : next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
316 1 : ino = *next_ino;
317 1 : if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
318 1 : raw_spin_lock(&sbinfo->stat_lock);
319 1 : ino = sbinfo->next_ino;
320 1 : sbinfo->next_ino += SHMEM_INO_BATCH;
321 1 : raw_spin_unlock(&sbinfo->stat_lock);
322 1 : if (unlikely(is_zero_ino(ino)))
323 1 : ino++;
324 : }
325 1 : *inop = ino;
326 1 : *next_ino = ++ino;
327 1 : put_cpu();
328 : }
329 :
330 : return 0;
331 : }
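/*
 * Editor's sketch, not kernel code: the SHMEM_INO_BATCH refill logic above,
 * reduced to a single-threaded user-space model so the arithmetic is easy to
 * follow.  The per-CPU slot is modelled by one variable; in the kernel the
 * global counter is protected by stat_lock and the slot by get_cpu().
 */
#include <stdio.h>

#define BATCH 1024UL

static unsigned long global_next_ino;   /* sbinfo->next_ino in the real code */
static unsigned long percpu_next_ino;   /* one such slot per CPU in the real code */

static unsigned long reserve_ino(void)
{
	unsigned long ino = percpu_next_ino;

	if (ino % BATCH == 0) {             /* batch exhausted (or first use) */
		ino = global_next_ino;      /* grab a fresh batch of 1024 inos */
		global_next_ino += BATCH;
		if (ino == 0)               /* ino 0 is reserved; skip it */
			ino++;
	}
	percpu_next_ino = ino + 1;          /* hand out ino, remember its successor */
	return ino;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		printf("ino %lu\n", reserve_ino());   /* prints ino 1, 2, 3 */
	return 0;
}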
332 :
333 : static void shmem_free_inode(struct super_block *sb)
334 : {
335 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
336 0 : if (sbinfo->max_inodes) {
337 0 : raw_spin_lock(&sbinfo->stat_lock);
338 0 : sbinfo->free_inodes++;
339 0 : raw_spin_unlock(&sbinfo->stat_lock);
340 : }
341 : }
342 :
343 : /**
344 : * shmem_recalc_inode - recalculate the block usage of an inode
345 : * @inode: inode to recalc
346 : *
347 : * We have to calculate the free blocks since the mm can drop
348 : * undirtied hole pages behind our back.
349 : *
350 : * But normally info->alloced == inode->i_mapping->nrpages + info->swapped
351 : * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
352 : *
353 : * It has to be called with the spinlock held.
354 : */
355 0 : static void shmem_recalc_inode(struct inode *inode)
356 : {
357 0 : struct shmem_inode_info *info = SHMEM_I(inode);
358 : long freed;
359 :
360 0 : freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
361 0 : if (freed > 0) {
362 0 : info->alloced -= freed;
363 0 : inode->i_blocks -= freed * BLOCKS_PER_PAGE;
364 0 : shmem_inode_unacct_blocks(inode, freed);
365 : }
366 0 : }
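/*
 * Editor's worked example, not part of mm/shmem.c: suppose info->alloced == 10,
 * info->swapped == 2 and i_mapping->nrpages == 6.  Then freed == 10 - 2 - 6 == 2,
 * meaning two clean hole pages were dropped by reclaim behind our back; alloced
 * drops to 8, i_blocks drops by 2 * BLOCKS_PER_PAGE, and the two blocks are
 * released back to the size= limit via shmem_inode_unacct_blocks().
 */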
367 :
368 0 : bool shmem_charge(struct inode *inode, long pages)
369 : {
370 0 : struct shmem_inode_info *info = SHMEM_I(inode);
371 : unsigned long flags;
372 :
373 0 : if (!shmem_inode_acct_block(inode, pages))
374 : return false;
375 :
376 : /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
377 0 : inode->i_mapping->nrpages += pages;
378 :
379 0 : spin_lock_irqsave(&info->lock, flags);
380 0 : info->alloced += pages;
381 0 : inode->i_blocks += pages * BLOCKS_PER_PAGE;
382 0 : shmem_recalc_inode(inode);
383 0 : spin_unlock_irqrestore(&info->lock, flags);
384 :
385 0 : return true;
386 : }
387 :
388 0 : void shmem_uncharge(struct inode *inode, long pages)
389 : {
390 0 : struct shmem_inode_info *info = SHMEM_I(inode);
391 : unsigned long flags;
392 :
393 : /* nrpages adjustment done by __filemap_remove_folio() or caller */
394 :
395 0 : spin_lock_irqsave(&info->lock, flags);
396 0 : info->alloced -= pages;
397 0 : inode->i_blocks -= pages * BLOCKS_PER_PAGE;
398 0 : shmem_recalc_inode(inode);
399 0 : spin_unlock_irqrestore(&info->lock, flags);
400 :
401 0 : shmem_inode_unacct_blocks(inode, pages);
402 0 : }
403 :
404 : /*
405 : * Replace item expected in xarray by a new item, while holding xa_lock.
406 : */
407 0 : static int shmem_replace_entry(struct address_space *mapping,
408 : pgoff_t index, void *expected, void *replacement)
409 : {
410 0 : XA_STATE(xas, &mapping->i_pages, index);
411 : void *item;
412 :
413 : VM_BUG_ON(!expected);
414 : VM_BUG_ON(!replacement);
415 0 : item = xas_load(&xas);
416 0 : if (item != expected)
417 : return -ENOENT;
418 0 : xas_store(&xas, replacement);
419 0 : return 0;
420 : }
421 :
422 : /*
423 : * Sometimes, before we decide whether to proceed or to fail, we must check
424 : * that an entry was not already brought back from swap by a racing thread.
425 : *
426 : * Checking page is not enough: by the time a SwapCache page is locked, it
427 : * might be reused, and again be SwapCache, using the same swap as before.
428 : */
429 0 : static bool shmem_confirm_swap(struct address_space *mapping,
430 : pgoff_t index, swp_entry_t swap)
431 : {
432 0 : return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
433 : }
434 :
435 : /*
436 : * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
437 : *
438 : * SHMEM_HUGE_NEVER:
439 : * disables huge pages for the mount;
440 : * SHMEM_HUGE_ALWAYS:
441 : * enables huge pages for the mount;
442 : * SHMEM_HUGE_WITHIN_SIZE:
443 : * only allocate huge pages if the page will be fully within i_size,
444 : * also respect fadvise()/madvise() hints;
445 : * SHMEM_HUGE_ADVISE:
446 : * only allocate huge pages if requested with fadvise()/madvise();
447 : */
448 :
449 : #define SHMEM_HUGE_NEVER 0
450 : #define SHMEM_HUGE_ALWAYS 1
451 : #define SHMEM_HUGE_WITHIN_SIZE 2
452 : #define SHMEM_HUGE_ADVISE 3
453 :
454 : /*
455 : * Special values.
456 : * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
457 : *
458 : * SHMEM_HUGE_DENY:
459 : * disables huge on shm_mnt and all mounts, for emergency use;
460 : * SHMEM_HUGE_FORCE:
461 : * enables huge on shm_mnt and all mounts, w/o needing option, for testing;
462 : *
463 : */
464 : #define SHMEM_HUGE_DENY (-1)
465 : #define SHMEM_HUGE_FORCE (-2)
466 :
467 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
468 : /* ifdef here to avoid bloating shmem.o when not necessary */
469 :
470 : static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
471 :
472 : bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
473 : struct mm_struct *mm, unsigned long vm_flags)
474 : {
475 : loff_t i_size;
476 :
477 : if (!S_ISREG(inode->i_mode))
478 : return false;
479 : if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
480 : return false;
481 : if (shmem_huge == SHMEM_HUGE_DENY)
482 : return false;
483 : if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
484 : return true;
485 :
486 : switch (SHMEM_SB(inode->i_sb)->huge) {
487 : case SHMEM_HUGE_ALWAYS:
488 : return true;
489 : case SHMEM_HUGE_WITHIN_SIZE:
490 : index = round_up(index + 1, HPAGE_PMD_NR);
491 : i_size = round_up(i_size_read(inode), PAGE_SIZE);
492 : if (i_size >> PAGE_SHIFT >= index)
493 : return true;
494 : fallthrough;
495 : case SHMEM_HUGE_ADVISE:
496 : if (mm && (vm_flags & VM_HUGEPAGE))
497 : return true;
498 : fallthrough;
499 : default:
500 : return false;
501 : }
502 : }
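/*
 * Editor's worked example, not part of mm/shmem.c, for the WITHIN_SIZE case
 * above, assuming 4 KiB pages and HPAGE_PMD_NR == 512 (2 MiB huge pages):
 * a fault at page index 100 rounds the index up to 512; a 3 MiB file rounds
 * i_size up to 768 pages, and since 768 >= 512 the whole 2 MiB page lies
 * within i_size, so a huge page is used.  For a 1 MiB file the same fault
 * sees 256 < 512 and falls through to the ADVISE check, so a huge page is
 * used only if the mapping was madvised with MADV_HUGEPAGE.
 */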
503 :
504 : #if defined(CONFIG_SYSFS)
505 : static int shmem_parse_huge(const char *str)
506 : {
507 : if (!strcmp(str, "never"))
508 : return SHMEM_HUGE_NEVER;
509 : if (!strcmp(str, "always"))
510 : return SHMEM_HUGE_ALWAYS;
511 : if (!strcmp(str, "within_size"))
512 : return SHMEM_HUGE_WITHIN_SIZE;
513 : if (!strcmp(str, "advise"))
514 : return SHMEM_HUGE_ADVISE;
515 : if (!strcmp(str, "deny"))
516 : return SHMEM_HUGE_DENY;
517 : if (!strcmp(str, "force"))
518 : return SHMEM_HUGE_FORCE;
519 : return -EINVAL;
520 : }
521 : #endif
522 :
523 : #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
524 : static const char *shmem_format_huge(int huge)
525 : {
526 : switch (huge) {
527 : case SHMEM_HUGE_NEVER:
528 : return "never";
529 : case SHMEM_HUGE_ALWAYS:
530 : return "always";
531 : case SHMEM_HUGE_WITHIN_SIZE:
532 : return "within_size";
533 : case SHMEM_HUGE_ADVISE:
534 : return "advise";
535 : case SHMEM_HUGE_DENY:
536 : return "deny";
537 : case SHMEM_HUGE_FORCE:
538 : return "force";
539 : default:
540 : VM_BUG_ON(1);
541 : return "bad_val";
542 : }
543 : }
544 : #endif
545 :
546 : static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
547 : struct shrink_control *sc, unsigned long nr_to_split)
548 : {
549 : LIST_HEAD(list), *pos, *next;
550 : LIST_HEAD(to_remove);
551 : struct inode *inode;
552 : struct shmem_inode_info *info;
553 : struct folio *folio;
554 : unsigned long batch = sc ? sc->nr_to_scan : 128;
555 : int split = 0;
556 :
557 : if (list_empty(&sbinfo->shrinklist))
558 : return SHRINK_STOP;
559 :
560 : spin_lock(&sbinfo->shrinklist_lock);
561 : list_for_each_safe(pos, next, &sbinfo->shrinklist) {
562 : info = list_entry(pos, struct shmem_inode_info, shrinklist);
563 :
564 : /* pin the inode */
565 : inode = igrab(&info->vfs_inode);
566 :
567 : /* inode is about to be evicted */
568 : if (!inode) {
569 : list_del_init(&info->shrinklist);
570 : goto next;
571 : }
572 :
573 : /* Check if there's anything to gain */
574 : if (round_up(inode->i_size, PAGE_SIZE) ==
575 : round_up(inode->i_size, HPAGE_PMD_SIZE)) {
576 : list_move(&info->shrinklist, &to_remove);
577 : goto next;
578 : }
579 :
580 : list_move(&info->shrinklist, &list);
581 : next:
582 : sbinfo->shrinklist_len--;
583 : if (!--batch)
584 : break;
585 : }
586 : spin_unlock(&sbinfo->shrinklist_lock);
587 :
588 : list_for_each_safe(pos, next, &to_remove) {
589 : info = list_entry(pos, struct shmem_inode_info, shrinklist);
590 : inode = &info->vfs_inode;
591 : list_del_init(&info->shrinklist);
592 : iput(inode);
593 : }
594 :
595 : list_for_each_safe(pos, next, &list) {
596 : int ret;
597 : pgoff_t index;
598 :
599 : info = list_entry(pos, struct shmem_inode_info, shrinklist);
600 : inode = &info->vfs_inode;
601 :
602 : if (nr_to_split && split >= nr_to_split)
603 : goto move_back;
604 :
605 : index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
606 : folio = filemap_get_folio(inode->i_mapping, index);
607 : if (IS_ERR(folio))
608 : goto drop;
609 :
610 : /* No huge page at the end of the file: nothing to split */
611 : if (!folio_test_large(folio)) {
612 : folio_put(folio);
613 : goto drop;
614 : }
615 :
616 : /*
617 : * Move the inode on the list back to shrinklist if we failed
618 : * to lock the page at this time.
619 : *
620 : * Waiting for the lock may lead to deadlock in the
621 : * reclaim path.
622 : */
623 : if (!folio_trylock(folio)) {
624 : folio_put(folio);
625 : goto move_back;
626 : }
627 :
628 : ret = split_folio(folio);
629 : folio_unlock(folio);
630 : folio_put(folio);
631 :
632 : /* If split failed move the inode on the list back to shrinklist */
633 : if (ret)
634 : goto move_back;
635 :
636 : split++;
637 : drop:
638 : list_del_init(&info->shrinklist);
639 : goto put;
640 : move_back:
641 : /*
642 : * Make sure the inode is either on the global list or deleted
643 : * from any local list before iput() since it could be deleted
644 : * in another thread once we put the inode (then the local list
645 : * is corrupted).
646 : */
647 : spin_lock(&sbinfo->shrinklist_lock);
648 : list_move(&info->shrinklist, &sbinfo->shrinklist);
649 : sbinfo->shrinklist_len++;
650 : spin_unlock(&sbinfo->shrinklist_lock);
651 : put:
652 : iput(inode);
653 : }
654 :
655 : return split;
656 : }
657 :
658 : static long shmem_unused_huge_scan(struct super_block *sb,
659 : struct shrink_control *sc)
660 : {
661 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
662 :
663 : if (!READ_ONCE(sbinfo->shrinklist_len))
664 : return SHRINK_STOP;
665 :
666 : return shmem_unused_huge_shrink(sbinfo, sc, 0);
667 : }
668 :
669 : static long shmem_unused_huge_count(struct super_block *sb,
670 : struct shrink_control *sc)
671 : {
672 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
673 : return READ_ONCE(sbinfo->shrinklist_len);
674 : }
675 : #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
676 :
677 : #define shmem_huge SHMEM_HUGE_DENY
678 :
679 0 : bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
680 : struct mm_struct *mm, unsigned long vm_flags)
681 : {
682 0 : return false;
683 : }
684 :
685 : static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
686 : struct shrink_control *sc, unsigned long nr_to_split)
687 : {
688 : return 0;
689 : }
690 : #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
691 :
692 : /*
693 : * Like filemap_add_folio, but error if expected item has gone.
694 : */
695 0 : static int shmem_add_to_page_cache(struct folio *folio,
696 : struct address_space *mapping,
697 : pgoff_t index, void *expected, gfp_t gfp,
698 : struct mm_struct *charge_mm)
699 : {
700 0 : XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
701 0 : long nr = folio_nr_pages(folio);
702 : int error;
703 :
704 : VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
705 : VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
706 : VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
707 : VM_BUG_ON(expected && folio_test_large(folio));
708 :
709 0 : folio_ref_add(folio, nr);
710 0 : folio->mapping = mapping;
711 0 : folio->index = index;
712 :
713 : if (!folio_test_swapcache(folio)) {
714 : error = mem_cgroup_charge(folio, charge_mm, gfp);
715 : if (error) {
716 : if (folio_test_pmd_mappable(folio)) {
717 : count_vm_event(THP_FILE_FALLBACK);
718 : count_vm_event(THP_FILE_FALLBACK_CHARGE);
719 : }
720 : goto error;
721 : }
722 : }
723 : folio_throttle_swaprate(folio, gfp);
724 :
725 : do {
726 0 : xas_lock_irq(&xas);
727 0 : if (expected != xas_find_conflict(&xas)) {
728 0 : xas_set_err(&xas, -EEXIST);
729 : goto unlock;
730 : }
731 0 : if (expected && xas_find_conflict(&xas)) {
732 0 : xas_set_err(&xas, -EEXIST);
733 : goto unlock;
734 : }
735 0 : xas_store(&xas, folio);
736 0 : if (xas_error(&xas))
737 : goto unlock;
738 0 : if (folio_test_pmd_mappable(folio)) {
739 : count_vm_event(THP_FILE_ALLOC);
740 : __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
741 : }
742 0 : mapping->nrpages += nr;
743 0 : __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
744 0 : __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
745 : unlock:
746 0 : xas_unlock_irq(&xas);
747 0 : } while (xas_nomem(&xas, gfp));
748 :
749 0 : if (xas_error(&xas)) {
750 0 : error = xas_error(&xas);
751 : goto error;
752 : }
753 :
754 : return 0;
755 : error:
756 0 : folio->mapping = NULL;
757 0 : folio_ref_sub(folio, nr);
758 : return error;
759 : }
760 :
761 : /*
762 : * Like delete_from_page_cache, but substitutes swap for @folio.
763 : */
764 0 : static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
765 : {
766 0 : struct address_space *mapping = folio->mapping;
767 0 : long nr = folio_nr_pages(folio);
768 : int error;
769 :
770 0 : xa_lock_irq(&mapping->i_pages);
771 0 : error = shmem_replace_entry(mapping, folio->index, folio, radswap);
772 0 : folio->mapping = NULL;
773 0 : mapping->nrpages -= nr;
774 0 : __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
775 0 : __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
776 0 : xa_unlock_irq(&mapping->i_pages);
777 0 : folio_put(folio);
778 0 : BUG_ON(error);
779 0 : }
780 :
781 : /*
782 : * Remove swap entry from page cache, free the swap and its page cache.
783 : */
784 0 : static int shmem_free_swap(struct address_space *mapping,
785 : pgoff_t index, void *radswap)
786 : {
787 : void *old;
788 :
789 0 : old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
790 0 : if (old != radswap)
791 : return -ENOENT;
792 0 : free_swap_and_cache(radix_to_swp_entry(radswap));
793 0 : return 0;
794 : }
795 :
796 : /*
797 : * Determine (in bytes) how many of the shmem object's pages mapped by the
798 : * given offsets are swapped out.
799 : *
800 : * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
801 : * as long as the inode doesn't go away and racy results are not a problem.
802 : */
803 0 : unsigned long shmem_partial_swap_usage(struct address_space *mapping,
804 : pgoff_t start, pgoff_t end)
805 : {
806 0 : XA_STATE(xas, &mapping->i_pages, start);
807 : struct page *page;
808 0 : unsigned long swapped = 0;
809 :
810 : rcu_read_lock();
811 0 : xas_for_each(&xas, page, end - 1) {
812 0 : if (xas_retry(&xas, page))
813 0 : continue;
814 0 : if (xa_is_value(page))
815 0 : swapped++;
816 :
817 0 : if (need_resched()) {
818 0 : xas_pause(&xas);
819 : cond_resched_rcu();
820 : }
821 : }
822 :
823 : rcu_read_unlock();
824 :
825 0 : return swapped << PAGE_SHIFT;
826 : }
827 :
828 : /*
829 : * Determine (in bytes) how many of the shmem object's pages mapped by the
830 : * given vma are swapped out.
831 : *
832 : * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
833 : * as long as the inode doesn't go away and racy results are not a problem.
834 : */
835 0 : unsigned long shmem_swap_usage(struct vm_area_struct *vma)
836 : {
837 0 : struct inode *inode = file_inode(vma->vm_file);
838 0 : struct shmem_inode_info *info = SHMEM_I(inode);
839 0 : struct address_space *mapping = inode->i_mapping;
840 : unsigned long swapped;
841 :
842 : /* Be careful as we don't hold info->lock */
843 0 : swapped = READ_ONCE(info->swapped);
844 :
845 : /*
846 : * The easier cases are when the shmem object has nothing in swap, or
847 : * the vma maps it whole. Then we can simply use the stats that we
848 : * already track.
849 : */
850 0 : if (!swapped)
851 : return 0;
852 :
853 0 : if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
854 0 : return swapped << PAGE_SHIFT;
855 :
856 : /* Here comes the more involved part */
857 0 : return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
858 0 : vma->vm_pgoff + vma_pages(vma));
859 : }
860 :
861 : /*
862 : * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
863 : */
864 0 : void shmem_unlock_mapping(struct address_space *mapping)
865 : {
866 : struct folio_batch fbatch;
867 0 : pgoff_t index = 0;
868 :
869 0 : folio_batch_init(&fbatch);
870 : /*
871 : * Minor point, but we might as well stop if someone else SHM_LOCKs it.
872 : */
873 0 : while (!mapping_unevictable(mapping) &&
874 0 : filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
875 0 : check_move_unevictable_folios(&fbatch);
876 0 : folio_batch_release(&fbatch);
877 0 : cond_resched();
878 : }
879 0 : }
880 :
881 0 : static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
882 : {
883 : struct folio *folio;
884 :
885 : /*
886 : * At first avoid shmem_get_folio(,,,SGP_READ): that fails
887 : * beyond i_size, and reports fallocated folios as holes.
888 : */
889 0 : folio = filemap_get_entry(inode->i_mapping, index);
890 0 : if (!folio)
891 : return folio;
892 0 : if (!xa_is_value(folio)) {
893 0 : folio_lock(folio);
894 0 : if (folio->mapping == inode->i_mapping)
895 : return folio;
896 : /* The folio has been swapped out */
897 0 : folio_unlock(folio);
898 0 : folio_put(folio);
899 : }
900 : /*
901 : * But read a folio back from swap if any of it is within i_size
902 : * (although in some cases this is just a waste of time).
903 : */
904 0 : folio = NULL;
905 0 : shmem_get_folio(inode, index, &folio, SGP_READ);
906 0 : return folio;
907 : }
908 :
909 : /*
910 : * Remove range of pages and swap entries from page cache, and free them.
911 : * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
912 : */
913 0 : static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
914 : bool unfalloc)
915 : {
916 0 : struct address_space *mapping = inode->i_mapping;
917 0 : struct shmem_inode_info *info = SHMEM_I(inode);
918 0 : pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
919 0 : pgoff_t end = (lend + 1) >> PAGE_SHIFT;
920 : struct folio_batch fbatch;
921 : pgoff_t indices[PAGEVEC_SIZE];
922 : struct folio *folio;
923 : bool same_folio;
924 0 : long nr_swaps_freed = 0;
925 : pgoff_t index;
926 : int i;
927 :
928 0 : if (lend == -1)
929 0 : end = -1; /* unsigned, so actually very big */
930 :
931 0 : if (info->fallocend > start && info->fallocend <= end && !unfalloc)
932 0 : info->fallocend = start;
933 :
934 0 : folio_batch_init(&fbatch);
935 0 : index = start;
936 0 : while (index < end && find_lock_entries(mapping, &index, end - 1,
937 : &fbatch, indices)) {
938 0 : for (i = 0; i < folio_batch_count(&fbatch); i++) {
939 0 : folio = fbatch.folios[i];
940 :
941 0 : if (xa_is_value(folio)) {
942 0 : if (unfalloc)
943 0 : continue;
944 0 : nr_swaps_freed += !shmem_free_swap(mapping,
945 : indices[i], folio);
946 0 : continue;
947 : }
948 :
949 0 : if (!unfalloc || !folio_test_uptodate(folio))
950 0 : truncate_inode_folio(mapping, folio);
951 0 : folio_unlock(folio);
952 : }
953 0 : folio_batch_remove_exceptionals(&fbatch);
954 0 : folio_batch_release(&fbatch);
955 0 : cond_resched();
956 : }
957 :
958 : /*
959 : * When undoing a failed fallocate, we want none of the partial folio
960 : * zeroing and splitting below, but shall want to truncate the whole
961 : * folio when !uptodate indicates that it was added by this fallocate,
962 : * even when [lstart, lend] covers only a part of the folio.
963 : */
964 0 : if (unfalloc)
965 : goto whole_folios;
966 :
967 0 : same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
968 0 : folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
969 0 : if (folio) {
970 0 : same_folio = lend < folio_pos(folio) + folio_size(folio);
971 0 : folio_mark_dirty(folio);
972 0 : if (!truncate_inode_partial_folio(folio, lstart, lend)) {
973 0 : start = folio->index + folio_nr_pages(folio);
974 0 : if (same_folio)
975 0 : end = folio->index;
976 : }
977 0 : folio_unlock(folio);
978 : folio_put(folio);
979 : folio = NULL;
980 : }
981 :
982 0 : if (!same_folio)
983 0 : folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
984 0 : if (folio) {
985 0 : folio_mark_dirty(folio);
986 0 : if (!truncate_inode_partial_folio(folio, lstart, lend))
987 0 : end = folio->index;
988 0 : folio_unlock(folio);
989 : folio_put(folio);
990 : }
991 :
992 : whole_folios:
993 :
994 0 : index = start;
995 0 : while (index < end) {
996 0 : cond_resched();
997 :
998 0 : if (!find_get_entries(mapping, &index, end - 1, &fbatch,
999 : indices)) {
1000 : /* If all gone or hole-punch or unfalloc, we're done */
1001 0 : if (index == start || end != -1)
1002 : break;
1003 : /* But if truncating, restart to make sure all gone */
1004 0 : index = start;
1005 0 : continue;
1006 : }
1007 0 : for (i = 0; i < folio_batch_count(&fbatch); i++) {
1008 0 : folio = fbatch.folios[i];
1009 :
1010 0 : if (xa_is_value(folio)) {
1011 0 : if (unfalloc)
1012 0 : continue;
1013 0 : if (shmem_free_swap(mapping, indices[i], folio)) {
1014 : /* Swap was replaced by page: retry */
1015 0 : index = indices[i];
1016 0 : break;
1017 : }
1018 0 : nr_swaps_freed++;
1019 0 : continue;
1020 : }
1021 :
1022 0 : folio_lock(folio);
1023 :
1024 0 : if (!unfalloc || !folio_test_uptodate(folio)) {
1025 0 : if (folio_mapping(folio) != mapping) {
1026 : /* Page was replaced by swap: retry */
1027 0 : folio_unlock(folio);
1028 0 : index = indices[i];
1029 0 : break;
1030 : }
1031 : VM_BUG_ON_FOLIO(folio_test_writeback(folio),
1032 : folio);
1033 0 : truncate_inode_folio(mapping, folio);
1034 : }
1035 0 : folio_unlock(folio);
1036 : }
1037 0 : folio_batch_remove_exceptionals(&fbatch);
1038 : folio_batch_release(&fbatch);
1039 : }
1040 :
1041 0 : spin_lock_irq(&info->lock);
1042 0 : info->swapped -= nr_swaps_freed;
1043 0 : shmem_recalc_inode(inode);
1044 0 : spin_unlock_irq(&info->lock);
1045 0 : }
1046 :
1047 0 : void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
1048 : {
1049 0 : shmem_undo_range(inode, lstart, lend, false);
1050 0 : inode->i_ctime = inode->i_mtime = current_time(inode);
1051 0 : inode_inc_iversion(inode);
1052 0 : }
1053 : EXPORT_SYMBOL_GPL(shmem_truncate_range);
1054 :
1055 0 : static int shmem_getattr(struct mnt_idmap *idmap,
1056 : const struct path *path, struct kstat *stat,
1057 : u32 request_mask, unsigned int query_flags)
1058 : {
1059 0 : struct inode *inode = path->dentry->d_inode;
1060 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1061 :
1062 0 : if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
1063 0 : spin_lock_irq(&info->lock);
1064 0 : shmem_recalc_inode(inode);
1065 0 : spin_unlock_irq(&info->lock);
1066 : }
1067 0 : if (info->fsflags & FS_APPEND_FL)
1068 0 : stat->attributes |= STATX_ATTR_APPEND;
1069 0 : if (info->fsflags & FS_IMMUTABLE_FL)
1070 0 : stat->attributes |= STATX_ATTR_IMMUTABLE;
1071 0 : if (info->fsflags & FS_NODUMP_FL)
1072 0 : stat->attributes |= STATX_ATTR_NODUMP;
1073 0 : stat->attributes_mask |= (STATX_ATTR_APPEND |
1074 : STATX_ATTR_IMMUTABLE |
1075 : STATX_ATTR_NODUMP);
1076 0 : generic_fillattr(idmap, inode, stat);
1077 :
1078 0 : if (shmem_is_huge(inode, 0, false, NULL, 0))
1079 : stat->blksize = HPAGE_PMD_SIZE;
1080 :
1081 0 : if (request_mask & STATX_BTIME) {
1082 0 : stat->result_mask |= STATX_BTIME;
1083 0 : stat->btime.tv_sec = info->i_crtime.tv_sec;
1084 0 : stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1085 : }
1086 :
1087 0 : return 0;
1088 : }
1089 :
1090 0 : static int shmem_setattr(struct mnt_idmap *idmap,
1091 : struct dentry *dentry, struct iattr *attr)
1092 : {
1093 0 : struct inode *inode = d_inode(dentry);
1094 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1095 : int error;
1096 0 : bool update_mtime = false;
1097 0 : bool update_ctime = true;
1098 :
1099 0 : error = setattr_prepare(idmap, dentry, attr);
1100 0 : if (error)
1101 : return error;
1102 :
1103 0 : if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
1104 0 : if ((inode->i_mode ^ attr->ia_mode) & 0111) {
1105 : return -EPERM;
1106 : }
1107 : }
1108 :
1109 0 : if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1110 0 : loff_t oldsize = inode->i_size;
1111 0 : loff_t newsize = attr->ia_size;
1112 :
1113 : /* protected by i_rwsem */
1114 0 : if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1115 0 : (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1116 : return -EPERM;
1117 :
1118 0 : if (newsize != oldsize) {
1119 0 : error = shmem_reacct_size(SHMEM_I(inode)->flags,
1120 : oldsize, newsize);
1121 0 : if (error)
1122 : return error;
1123 0 : i_size_write(inode, newsize);
1124 0 : update_mtime = true;
1125 : } else {
1126 : update_ctime = false;
1127 : }
1128 0 : if (newsize <= oldsize) {
1129 0 : loff_t holebegin = round_up(newsize, PAGE_SIZE);
1130 0 : if (oldsize > holebegin)
1131 0 : unmap_mapping_range(inode->i_mapping,
1132 : holebegin, 0, 1);
1133 0 : if (info->alloced)
1134 0 : shmem_truncate_range(inode,
1135 : newsize, (loff_t)-1);
1136 : /* unmap again to remove racily COWed private pages */
1137 0 : if (oldsize > holebegin)
1138 0 : unmap_mapping_range(inode->i_mapping,
1139 : holebegin, 0, 1);
1140 : }
1141 : }
1142 :
1143 0 : setattr_copy(idmap, inode, attr);
1144 0 : if (attr->ia_valid & ATTR_MODE)
1145 0 : error = posix_acl_chmod(idmap, dentry, inode->i_mode);
1146 0 : if (!error && update_ctime) {
1147 0 : inode->i_ctime = current_time(inode);
1148 0 : if (update_mtime)
1149 0 : inode->i_mtime = inode->i_ctime;
1150 : inode_inc_iversion(inode);
1151 : }
1152 : return error;
1153 : }
1154 :
1155 0 : static void shmem_evict_inode(struct inode *inode)
1156 : {
1157 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1158 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1159 :
1160 0 : if (shmem_mapping(inode->i_mapping)) {
1161 0 : shmem_unacct_size(info->flags, inode->i_size);
1162 0 : inode->i_size = 0;
1163 0 : mapping_set_exiting(inode->i_mapping);
1164 0 : shmem_truncate_range(inode, 0, (loff_t)-1);
1165 0 : if (!list_empty(&info->shrinklist)) {
1166 0 : spin_lock(&sbinfo->shrinklist_lock);
1167 0 : if (!list_empty(&info->shrinklist)) {
1168 0 : list_del_init(&info->shrinklist);
1169 0 : sbinfo->shrinklist_len--;
1170 : }
1171 0 : spin_unlock(&sbinfo->shrinklist_lock);
1172 : }
1173 0 : while (!list_empty(&info->swaplist)) {
1174 : /* Wait while shmem_unuse() is scanning this inode... */
1175 0 : wait_var_event(&info->stop_eviction,
1176 : !atomic_read(&info->stop_eviction));
1177 0 : mutex_lock(&shmem_swaplist_mutex);
1178 : /* ...but beware of the race if we peeked too early */
1179 0 : if (!atomic_read(&info->stop_eviction))
1180 0 : list_del_init(&info->swaplist);
1181 0 : mutex_unlock(&shmem_swaplist_mutex);
1182 : }
1183 : }
1184 :
1185 0 : simple_xattrs_free(&info->xattrs);
1186 0 : WARN_ON(inode->i_blocks);
1187 0 : shmem_free_inode(inode->i_sb);
1188 0 : clear_inode(inode);
1189 0 : }
1190 :
1191 0 : static int shmem_find_swap_entries(struct address_space *mapping,
1192 : pgoff_t start, struct folio_batch *fbatch,
1193 : pgoff_t *indices, unsigned int type)
1194 : {
1195 0 : XA_STATE(xas, &mapping->i_pages, start);
1196 : struct folio *folio;
1197 : swp_entry_t entry;
1198 :
1199 : rcu_read_lock();
1200 0 : xas_for_each(&xas, folio, ULONG_MAX) {
1201 0 : if (xas_retry(&xas, folio))
1202 0 : continue;
1203 :
1204 0 : if (!xa_is_value(folio))
1205 0 : continue;
1206 :
1207 0 : entry = radix_to_swp_entry(folio);
1208 : /*
1209 : * swapin error entries can be found in the mapping. But they're
1210 : * deliberately ignored here as we've done everything we can do.
1211 : */
1212 0 : if (swp_type(entry) != type)
1213 0 : continue;
1214 :
1215 0 : indices[folio_batch_count(fbatch)] = xas.xa_index;
1216 0 : if (!folio_batch_add(fbatch, folio))
1217 : break;
1218 :
1219 0 : if (need_resched()) {
1220 0 : xas_pause(&xas);
1221 : cond_resched_rcu();
1222 : }
1223 : }
1224 : rcu_read_unlock();
1225 :
1226 0 : return xas.xa_index;
1227 : }
1228 :
1229 : /*
1230 : * Move the swapped pages for an inode to page cache. Returns the count
1231 : * of pages swapped in, or the error in case of failure.
1232 : */
1233 0 : static int shmem_unuse_swap_entries(struct inode *inode,
1234 : struct folio_batch *fbatch, pgoff_t *indices)
1235 : {
1236 0 : int i = 0;
1237 0 : int ret = 0;
1238 0 : int error = 0;
1239 0 : struct address_space *mapping = inode->i_mapping;
1240 :
1241 0 : for (i = 0; i < folio_batch_count(fbatch); i++) {
1242 0 : struct folio *folio = fbatch->folios[i];
1243 :
1244 0 : if (!xa_is_value(folio))
1245 0 : continue;
1246 0 : error = shmem_swapin_folio(inode, indices[i],
1247 : &folio, SGP_CACHE,
1248 : mapping_gfp_mask(mapping),
1249 : NULL, NULL);
1250 0 : if (error == 0) {
1251 0 : folio_unlock(folio);
1252 0 : folio_put(folio);
1253 0 : ret++;
1254 : }
1255 0 : if (error == -ENOMEM)
1256 : break;
1257 0 : error = 0;
1258 : }
1259 0 : return error ? error : ret;
1260 : }
1261 :
1262 : /*
1263 : * If swap found in inode, free it and move page from swapcache to filecache.
1264 : */
1265 0 : static int shmem_unuse_inode(struct inode *inode, unsigned int type)
1266 : {
1267 0 : struct address_space *mapping = inode->i_mapping;
1268 0 : pgoff_t start = 0;
1269 : struct folio_batch fbatch;
1270 : pgoff_t indices[PAGEVEC_SIZE];
1271 0 : int ret = 0;
1272 :
1273 : do {
1274 0 : folio_batch_init(&fbatch);
1275 0 : shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
1276 0 : if (folio_batch_count(&fbatch) == 0) {
1277 : ret = 0;
1278 : break;
1279 : }
1280 :
1281 0 : ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
1282 0 : if (ret < 0)
1283 : break;
1284 :
1285 0 : start = indices[folio_batch_count(&fbatch) - 1];
1286 : } while (true);
1287 :
1288 0 : return ret;
1289 : }
1290 :
1291 : /*
1292 : * Read all the shared memory data that resides in the swap
1293 : * device 'type' back into memory, so the swap device can be
1294 : * unused.
1295 : */
1296 0 : int shmem_unuse(unsigned int type)
1297 : {
1298 : struct shmem_inode_info *info, *next;
1299 0 : int error = 0;
1300 :
1301 0 : if (list_empty(&shmem_swaplist))
1302 : return 0;
1303 :
1304 0 : mutex_lock(&shmem_swaplist_mutex);
1305 0 : list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1306 0 : if (!info->swapped) {
1307 0 : list_del_init(&info->swaplist);
1308 0 : continue;
1309 : }
1310 : /*
1311 : * Drop the swaplist mutex while searching the inode for swap;
1312 : * but before doing so, make sure shmem_evict_inode() will not
1313 : * remove placeholder inode from swaplist, nor let it be freed
1314 : * (igrab() would protect from unlink, but not from unmount).
1315 : */
1316 0 : atomic_inc(&info->stop_eviction);
1317 0 : mutex_unlock(&shmem_swaplist_mutex);
1318 :
1319 0 : error = shmem_unuse_inode(&info->vfs_inode, type);
1320 0 : cond_resched();
1321 :
1322 0 : mutex_lock(&shmem_swaplist_mutex);
1323 0 : next = list_next_entry(info, swaplist);
1324 0 : if (!info->swapped)
1325 0 : list_del_init(&info->swaplist);
1326 0 : if (atomic_dec_and_test(&info->stop_eviction))
1327 0 : wake_up_var(&info->stop_eviction);
1328 0 : if (error)
1329 : break;
1330 : }
1331 0 : mutex_unlock(&shmem_swaplist_mutex);
1332 :
1333 0 : return error;
1334 : }
1335 :
1336 : /*
1337 : * Move the page from the page cache to the swap cache.
1338 : */
1339 0 : static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1340 : {
1341 0 : struct folio *folio = page_folio(page);
1342 0 : struct address_space *mapping = folio->mapping;
1343 0 : struct inode *inode = mapping->host;
1344 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1345 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1346 : swp_entry_t swap;
1347 : pgoff_t index;
1348 :
1349 : /*
1350 : * Our capabilities prevent regular writeback or sync from ever calling
1351 : * shmem_writepage; but a stacking filesystem might use ->writepage of
1352 : * its underlying filesystem, in which case tmpfs should write out to
1353 : * swap only in response to memory pressure, and not for the writeback
1354 : * threads or sync.
1355 : */
1356 0 : if (WARN_ON_ONCE(!wbc->for_reclaim))
1357 : goto redirty;
1358 :
1359 0 : if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
1360 : goto redirty;
1361 :
1362 0 : if (!total_swap_pages)
1363 : goto redirty;
1364 :
1365 : /*
1366 : * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
1367 : * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
1368 : * and its shmem_writeback() needs them to be split when swapping.
1369 : */
1370 0 : if (folio_test_large(folio)) {
1371 : /* Ensure the subpages are still dirty */
1372 0 : folio_test_set_dirty(folio);
1373 0 : if (split_huge_page(page) < 0)
1374 : goto redirty;
1375 0 : folio = page_folio(page);
1376 : folio_clear_dirty(folio);
1377 : }
1378 :
1379 0 : index = folio->index;
1380 :
1381 : /*
1382 : * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1383 : * value into swapfile.c, the only way we can correctly account for a
1384 : * fallocated folio arriving here is now to initialize it and write it.
1385 : *
1386 : * That's okay for a folio already fallocated earlier, but if we have
1387 : * not yet completed the fallocation, then (a) we want to keep track
1388 : * of this folio in case we have to undo it, and (b) it may not be a
1389 : * good idea to continue anyway, once we're pushing into swap. So
1390 : * reactivate the folio, and let shmem_fallocate() quit when too many.
1391 : */
1392 0 : if (!folio_test_uptodate(folio)) {
1393 0 : if (inode->i_private) {
1394 : struct shmem_falloc *shmem_falloc;
1395 0 : spin_lock(&inode->i_lock);
1396 0 : shmem_falloc = inode->i_private;
1397 0 : if (shmem_falloc &&
1398 0 : !shmem_falloc->waitq &&
1399 0 : index >= shmem_falloc->start &&
1400 0 : index < shmem_falloc->next)
1401 0 : shmem_falloc->nr_unswapped++;
1402 : else
1403 : shmem_falloc = NULL;
1404 0 : spin_unlock(&inode->i_lock);
1405 0 : if (shmem_falloc)
1406 : goto redirty;
1407 : }
1408 0 : folio_zero_range(folio, 0, folio_size(folio));
1409 0 : flush_dcache_folio(folio);
1410 : folio_mark_uptodate(folio);
1411 : }
1412 :
1413 0 : swap = folio_alloc_swap(folio);
1414 0 : if (!swap.val)
1415 : goto redirty;
1416 :
1417 : /*
1418 : * Add inode to shmem_unuse()'s list of swapped-out inodes,
1419 : * if it's not already there. Do it now before the folio is
1420 : * moved to swap cache, when its pagelock no longer protects
1421 : * the inode from eviction. But don't unlock the mutex until
1422 : * we've incremented swapped, because shmem_unuse_inode() will
1423 : * prune a !swapped inode from the swaplist under this mutex.
1424 : */
1425 0 : mutex_lock(&shmem_swaplist_mutex);
1426 0 : if (list_empty(&info->swaplist))
1427 0 : list_add(&info->swaplist, &shmem_swaplist);
1428 :
1429 0 : if (add_to_swap_cache(folio, swap,
1430 : __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
1431 : NULL) == 0) {
1432 0 : spin_lock_irq(&info->lock);
1433 0 : shmem_recalc_inode(inode);
1434 0 : info->swapped++;
1435 0 : spin_unlock_irq(&info->lock);
1436 :
1437 0 : swap_shmem_alloc(swap);
1438 0 : shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
1439 :
1440 0 : mutex_unlock(&shmem_swaplist_mutex);
1441 0 : BUG_ON(folio_mapped(folio));
1442 0 : swap_writepage(&folio->page, wbc);
1443 0 : return 0;
1444 : }
1445 :
1446 0 : mutex_unlock(&shmem_swaplist_mutex);
1447 0 : put_swap_folio(folio, swap);
1448 : redirty:
1449 0 : folio_mark_dirty(folio);
1450 0 : if (wbc->for_reclaim)
1451 : return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
1452 0 : folio_unlock(folio);
1453 0 : return 0;
1454 : }
1455 :
1456 : #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1457 : static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1458 : {
1459 : char buffer[64];
1460 :
1461 : if (!mpol || mpol->mode == MPOL_DEFAULT)
1462 : return; /* show nothing */
1463 :
1464 : mpol_to_str(buffer, sizeof(buffer), mpol);
1465 :
1466 : seq_printf(seq, ",mpol=%s", buffer);
1467 : }
1468 :
1469 : static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1470 : {
1471 : struct mempolicy *mpol = NULL;
1472 : if (sbinfo->mpol) {
1473 : raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
1474 : mpol = sbinfo->mpol;
1475 : mpol_get(mpol);
1476 : raw_spin_unlock(&sbinfo->stat_lock);
1477 : }
1478 : return mpol;
1479 : }
1480 : #else /* !CONFIG_NUMA || !CONFIG_TMPFS */
1481 : static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1482 : {
1483 : }
1484 : static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1485 : {
1486 : return NULL;
1487 : }
1488 : #endif /* CONFIG_NUMA && CONFIG_TMPFS */
1489 : #ifndef CONFIG_NUMA
1490 : #define vm_policy vm_private_data
1491 : #endif
1492 :
1493 : static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
1494 : struct shmem_inode_info *info, pgoff_t index)
1495 : {
1496 : /* Create a pseudo vma that just contains the policy */
1497 0 : vma_init(vma, NULL);
1498 : /* Bias interleave by inode number to distribute better across nodes */
1499 0 : vma->vm_pgoff = index + info->vfs_inode.i_ino;
1500 0 : vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1501 : }
1502 :
1503 : static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
1504 : {
1505 : /* Drop reference taken by mpol_shared_policy_lookup() */
1506 0 : mpol_cond_put(vma->vm_policy);
1507 : }
1508 :
1509 0 : static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1510 : struct shmem_inode_info *info, pgoff_t index)
1511 : {
1512 : struct vm_area_struct pvma;
1513 : struct page *page;
1514 0 : struct vm_fault vmf = {
1515 : .vma = &pvma,
1516 : };
1517 :
1518 0 : shmem_pseudo_vma_init(&pvma, info, index);
1519 0 : page = swap_cluster_readahead(swap, gfp, &vmf);
1520 0 : shmem_pseudo_vma_destroy(&pvma);
1521 :
1522 0 : if (!page)
1523 : return NULL;
1524 0 : return page_folio(page);
1525 : }
1526 :
1527 : /*
1528 : * Make sure huge_gfp is always more limited than limit_gfp.
1529 : * Some of the flags set permissions, while others set limitations.
1530 : */
1531 : static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1532 : {
1533 : gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1534 : gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1535 : gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1536 : gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1537 :
1538 : /* Allow allocations only from the originally specified zones. */
1539 : result |= zoneflags;
1540 :
1541 : /*
1542 : * Minimize the result gfp by taking the union with the deny flags,
1543 : * and the intersection of the allow flags.
1544 : */
1545 : result |= (limit_gfp & denyflags);
1546 : result |= (huge_gfp & limit_gfp) & allowflags;
1547 :
1548 : return result;
1549 : }
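/*
 * Editor's example, not part of mm/shmem.c: with allowflags = __GFP_IO |
 * __GFP_FS | __GFP_RECLAIM, a caller limited to GFP_NOFS can never pass
 * __GFP_FS on to the huge-page allocation, because "allow" bits survive only
 * when present in both masks; "deny" bits such as __GFP_NORETRY survive when
 * present in either mask; and the zone modifiers (GFP_ZONEMASK) are taken
 * solely from limit_gfp.
 */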
1550 :
1551 : static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
1552 : struct shmem_inode_info *info, pgoff_t index)
1553 : {
1554 : struct vm_area_struct pvma;
1555 : struct address_space *mapping = info->vfs_inode.i_mapping;
1556 : pgoff_t hindex;
1557 : struct folio *folio;
1558 :
1559 : hindex = round_down(index, HPAGE_PMD_NR);
1560 : if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1561 : XA_PRESENT))
1562 : return NULL;
1563 :
1564 : shmem_pseudo_vma_init(&pvma, info, hindex);
1565 : folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
1566 : shmem_pseudo_vma_destroy(&pvma);
1567 : if (!folio)
1568 : count_vm_event(THP_FILE_FALLBACK);
1569 : return folio;
1570 : }
1571 :
1572 0 : static struct folio *shmem_alloc_folio(gfp_t gfp,
1573 : struct shmem_inode_info *info, pgoff_t index)
1574 : {
1575 : struct vm_area_struct pvma;
1576 : struct folio *folio;
1577 :
1578 0 : shmem_pseudo_vma_init(&pvma, info, index);
1579 0 : folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
1580 0 : shmem_pseudo_vma_destroy(&pvma);
1581 :
1582 0 : return folio;
1583 : }
1584 :
1585 0 : static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
1586 : pgoff_t index, bool huge)
1587 : {
1588 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1589 : struct folio *folio;
1590 : int nr;
1591 0 : int err = -ENOSPC;
1592 :
1593 : if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1594 0 : huge = false;
1595 0 : nr = huge ? HPAGE_PMD_NR : 1;
1596 :
1597 0 : if (!shmem_inode_acct_block(inode, nr))
1598 : goto failed;
1599 :
1600 : if (huge)
1601 : folio = shmem_alloc_hugefolio(gfp, info, index);
1602 : else
1603 0 : folio = shmem_alloc_folio(gfp, info, index);
1604 0 : if (folio) {
1605 0 : __folio_set_locked(folio);
1606 0 : __folio_set_swapbacked(folio);
1607 0 : return folio;
1608 : }
1609 :
1610 0 : err = -ENOMEM;
1611 0 : shmem_inode_unacct_blocks(inode, nr);
1612 : failed:
1613 0 : return ERR_PTR(err);
1614 : }
1615 :
1616 : /*
1617 : * When a page is moved from swapcache to shmem filecache (either by the
1618 : * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
1619 : * shmem_unuse_inode()), it may have been read in earlier from swap, in
1620 : * ignorance of the mapping it belongs to. If that mapping has special
1621 : * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1622 : * we may need to copy to a suitable page before moving to filecache.
1623 : *
1624 : * In a future release, this may well be extended to respect cpuset and
1625 : * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1626 : * but for now it is a simple matter of zone.
1627 : */
1628 : static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
1629 : {
1630 0 : return folio_zonenum(folio) > gfp_zone(gfp);
1631 : }
1632 :
1633 0 : static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
1634 : struct shmem_inode_info *info, pgoff_t index)
1635 : {
1636 : struct folio *old, *new;
1637 : struct address_space *swap_mapping;
1638 : swp_entry_t entry;
1639 : pgoff_t swap_index;
1640 : int error;
1641 :
1642 0 : old = *foliop;
1643 0 : entry = folio_swap_entry(old);
1644 0 : swap_index = swp_offset(entry);
1645 0 : swap_mapping = swap_address_space(entry);
1646 :
1647 : /*
1648 : * We have arrived here because our zones are constrained, so don't
1649 : * limit chance of success by further cpuset and node constraints.
1650 : */
1651 0 : gfp &= ~GFP_CONSTRAINT_MASK;
1652 : VM_BUG_ON_FOLIO(folio_test_large(old), old);
1653 0 : new = shmem_alloc_folio(gfp, info, index);
1654 0 : if (!new)
1655 : return -ENOMEM;
1656 :
1657 0 : folio_get(new);
1658 0 : folio_copy(new, old);
1659 0 : flush_dcache_folio(new);
1660 :
1661 0 : __folio_set_locked(new);
1662 0 : __folio_set_swapbacked(new);
1663 0 : folio_mark_uptodate(new);
1664 0 : folio_set_swap_entry(new, entry);
1665 0 : folio_set_swapcache(new);
1666 :
1667 : /*
1668 : * Our caller will very soon move newpage out of swapcache, but it's
1669 : * a nice clean interface for us to replace oldpage by newpage there.
1670 : */
1671 0 : xa_lock_irq(&swap_mapping->i_pages);
1672 0 : error = shmem_replace_entry(swap_mapping, swap_index, old, new);
1673 0 : if (!error) {
1674 0 : mem_cgroup_migrate(old, new);
1675 0 : __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
1676 0 : __lruvec_stat_mod_folio(new, NR_SHMEM, 1);
1677 0 : __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
1678 0 : __lruvec_stat_mod_folio(old, NR_SHMEM, -1);
1679 : }
1680 0 : xa_unlock_irq(&swap_mapping->i_pages);
1681 :
1682 0 : if (unlikely(error)) {
1683 : /*
1684 : * Is this possible? I think not, now that our callers check
1685 : * both PageSwapCache and page_private after getting page lock;
1686 : * but be defensive. Reverse old to newpage for clear and free.
1687 : */
1688 : old = new;
1689 : } else {
1690 0 : folio_add_lru(new);
1691 0 : *foliop = new;
1692 : }
1693 :
1694 0 : folio_clear_swapcache(old);
1695 0 : old->private = NULL;
1696 :
1697 0 : folio_unlock(old);
1698 : folio_put_refs(old, 2);
1699 : return error;
1700 : }
1701 :
1702 0 : static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
1703 : struct folio *folio, swp_entry_t swap)
1704 : {
1705 0 : struct address_space *mapping = inode->i_mapping;
1706 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1707 : swp_entry_t swapin_error;
1708 : void *old;
1709 :
1710 : swapin_error = make_swapin_error_entry();
1711 0 : old = xa_cmpxchg_irq(&mapping->i_pages, index,
1712 : swp_to_radix_entry(swap),
1713 : swp_to_radix_entry(swapin_error), 0);
1714 0 : if (old != swp_to_radix_entry(swap))
1715 : return;
1716 :
1717 0 : folio_wait_writeback(folio);
1718 0 : delete_from_swap_cache(folio);
1719 0 : spin_lock_irq(&info->lock);
1720 : /*
1721 : * Don't treat a swapin error folio as alloced. Otherwise inode->i_blocks won't
1722 : * be 0 when the inode is released, which would trigger WARN_ON(inode->i_blocks) in
1723 : * shmem_evict_inode.
1724 : */
1725 0 : info->alloced--;
1726 0 : info->swapped--;
1727 0 : shmem_recalc_inode(inode);
1728 0 : spin_unlock_irq(&info->lock);
1729 0 : swap_free(swap);
1730 : }
1731 :
1732 : /*
1733 : * Swap in the folio pointed to by *foliop.
1734 : * Caller has to make sure that *foliop contains a valid swapped folio.
1735 : * Returns 0 and the folio in foliop if success. On failure, returns the
1736 : * error code and NULL in *foliop.
1737 : */
1738 0 : static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
1739 : struct folio **foliop, enum sgp_type sgp,
1740 : gfp_t gfp, struct vm_area_struct *vma,
1741 : vm_fault_t *fault_type)
1742 : {
1743 0 : struct address_space *mapping = inode->i_mapping;
1744 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1745 0 : struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
1746 : struct swap_info_struct *si;
1747 0 : struct folio *folio = NULL;
1748 : swp_entry_t swap;
1749 : int error;
1750 :
1751 : VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
1752 0 : swap = radix_to_swp_entry(*foliop);
1753 0 : *foliop = NULL;
1754 :
1755 0 : if (is_swapin_error_entry(swap))
1756 : return -EIO;
1757 :
1758 0 : si = get_swap_device(swap);
1759 0 : if (!si) {
1760 0 : if (!shmem_confirm_swap(mapping, index, swap))
1761 : return -EEXIST;
1762 : else
1763 : return -EINVAL;
1764 : }
1765 :
1766 : /* Look it up and read it in.. */
1767 0 : folio = swap_cache_get_folio(swap, NULL, 0);
1768 0 : if (!folio) {
1769 : /* Or update major stats only when swapin succeeds?? */
1770 0 : if (fault_type) {
1771 0 : *fault_type |= VM_FAULT_MAJOR;
1772 0 : count_vm_event(PGMAJFAULT);
1773 0 : count_memcg_event_mm(charge_mm, PGMAJFAULT);
1774 : }
1775 :                  /* Here we actually start the I/O */
1776 0 : folio = shmem_swapin(swap, gfp, info, index);
1777 0 : if (!folio) {
1778 : error = -ENOMEM;
1779 : goto failed;
1780 : }
1781 : }
1782 :
1783 : /* We have to do this with folio locked to prevent races */
1784 0 : folio_lock(folio);
1785 0 : if (!folio_test_swapcache(folio) ||
1786 0 : folio_swap_entry(folio).val != swap.val ||
1787 0 : !shmem_confirm_swap(mapping, index, swap)) {
1788 : error = -EEXIST;
1789 : goto unlock;
1790 : }
1791 0 : if (!folio_test_uptodate(folio)) {
1792 : error = -EIO;
1793 : goto failed;
1794 : }
1795 0 : folio_wait_writeback(folio);
1796 :
1797 : /*
1798 : * Some architectures may have to restore extra metadata to the
1799 : * folio after reading from swap.
1800 : */
1801 0 : arch_swap_restore(swap, folio);
1802 :
1803 0 : if (shmem_should_replace_folio(folio, gfp)) {
1804 0 : error = shmem_replace_folio(&folio, gfp, info, index);
1805 0 : if (error)
1806 : goto failed;
1807 : }
1808 :
1809 0 : error = shmem_add_to_page_cache(folio, mapping, index,
1810 : swp_to_radix_entry(swap), gfp,
1811 : charge_mm);
1812 0 : if (error)
1813 : goto failed;
1814 :
1815 0 : spin_lock_irq(&info->lock);
1816 0 : info->swapped--;
1817 0 : shmem_recalc_inode(inode);
1818 0 : spin_unlock_irq(&info->lock);
1819 :
1820 0 : if (sgp == SGP_WRITE)
1821 0 : folio_mark_accessed(folio);
1822 :
1823 0 : delete_from_swap_cache(folio);
1824 0 : folio_mark_dirty(folio);
1825 0 : swap_free(swap);
1826 0 : put_swap_device(si);
1827 :
1828 0 : *foliop = folio;
1829 : return 0;
1830 : failed:
1831 0 : if (!shmem_confirm_swap(mapping, index, swap))
1832 0 : error = -EEXIST;
1833 0 : if (error == -EIO)
1834 0 : shmem_set_folio_swapin_error(inode, index, folio, swap);
1835 : unlock:
1836 0 : if (folio) {
1837 0 : folio_unlock(folio);
1838 0 : folio_put(folio);
1839 : }
1840 0 : put_swap_device(si);
1841 :
1842 : return error;
1843 : }
1844 :
1845 : /*
1846 :  * shmem_get_folio_gfp - find folio in cache, or get from swap, or allocate
1847 :  *
1848 :  * If we allocate a new one we do not mark it dirty. That's up to the
1849 :  * VM. If we swap it in we mark it dirty, since we also free the swap
1850 :  * entry: a folio cannot live in both the swap cache and the page cache.
1851 : *
1852 : * vma, vmf, and fault_type are only supplied by shmem_fault:
1853 : * otherwise they are NULL.
1854 : */
1855 0 : static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
1856 : struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
1857 : struct vm_area_struct *vma, struct vm_fault *vmf,
1858 : vm_fault_t *fault_type)
1859 : {
1860 0 : struct address_space *mapping = inode->i_mapping;
1861 0 : struct shmem_inode_info *info = SHMEM_I(inode);
1862 : struct shmem_sb_info *sbinfo;
1863 : struct mm_struct *charge_mm;
1864 : struct folio *folio;
1865 : pgoff_t hindex;
1866 : gfp_t huge_gfp;
1867 : int error;
1868 0 : int once = 0;
1869 0 : int alloced = 0;
1870 :
1871 0 : if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
1872 : return -EFBIG;
1873 : repeat:
1874 0 : if (sgp <= SGP_CACHE &&
1875 0 : ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1876 : return -EINVAL;
1877 : }
1878 :
1879 0 : sbinfo = SHMEM_SB(inode->i_sb);
1880 0 : charge_mm = vma ? vma->vm_mm : NULL;
1881 :
1882 0 : folio = filemap_get_entry(mapping, index);
1883 : if (folio && vma && userfaultfd_minor(vma)) {
1884 : if (!xa_is_value(folio))
1885 : folio_put(folio);
1886 : *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
1887 : return 0;
1888 : }
1889 :
1890 0 : if (xa_is_value(folio)) {
1891 0 : error = shmem_swapin_folio(inode, index, &folio,
1892 : sgp, gfp, vma, fault_type);
1893 0 : if (error == -EEXIST)
1894 : goto repeat;
1895 :
1896 0 : *foliop = folio;
1897 : return error;
1898 : }
1899 :
1900 0 : if (folio) {
1901 0 : folio_lock(folio);
1902 :
1903 : /* Has the folio been truncated or swapped out? */
1904 0 : if (unlikely(folio->mapping != mapping)) {
1905 0 : folio_unlock(folio);
1906 0 : folio_put(folio);
1907 : goto repeat;
1908 : }
1909 0 : if (sgp == SGP_WRITE)
1910 0 : folio_mark_accessed(folio);
1911 0 : if (folio_test_uptodate(folio))
1912 : goto out;
1913 : /* fallocated folio */
1914 0 : if (sgp != SGP_READ)
1915 : goto clear;
1916 0 : folio_unlock(folio);
1917 0 : folio_put(folio);
1918 : }
1919 :
1920 : /*
1921 : * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
1922 : * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
1923 : */
1924 0 : *foliop = NULL;
1925 0 : if (sgp == SGP_READ)
1926 : return 0;
1927 0 : if (sgp == SGP_NOALLOC)
1928 : return -ENOENT;
1929 :
1930 : /*
1931 : * Fast cache lookup and swap lookup did not find it: allocate.
1932 : */
1933 :
1934 : if (vma && userfaultfd_missing(vma)) {
1935 : *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1936 : return 0;
1937 : }
1938 :
1939 : if (!shmem_is_huge(inode, index, false,
1940 : vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
1941 : goto alloc_nohuge;
1942 :
1943 : huge_gfp = vma_thp_gfp_mask(vma);
1944 : huge_gfp = limit_gfp_mask(huge_gfp, gfp);
1945 : folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
1946 : if (IS_ERR(folio)) {
1947 : alloc_nohuge:
1948 0 : folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
1949 : }
1950 0 : if (IS_ERR(folio)) {
1951 0 : int retry = 5;
1952 :
1953 0 : error = PTR_ERR(folio);
1954 0 : folio = NULL;
1955 : if (error != -ENOSPC)
1956 : goto unlock;
1957 : /*
1958 : * Try to reclaim some space by splitting a large folio
1959 : * beyond i_size on the filesystem.
1960 : */
1961 : while (retry--) {
1962 : int ret;
1963 :
1964 : ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1965 : if (ret == SHRINK_STOP)
1966 : break;
1967 : if (ret)
1968 : goto alloc_nohuge;
1969 : }
1970 : goto unlock;
1971 : }
1972 :
1973 0 : hindex = round_down(index, folio_nr_pages(folio));
1974 :
1975 0 : if (sgp == SGP_WRITE)
1976 0 : __folio_set_referenced(folio);
1977 :
1978 0 : error = shmem_add_to_page_cache(folio, mapping, hindex,
1979 : NULL, gfp & GFP_RECLAIM_MASK,
1980 : charge_mm);
1981 0 : if (error)
1982 : goto unacct;
1983 0 : folio_add_lru(folio);
1984 :
1985 0 : spin_lock_irq(&info->lock);
1986 0 : info->alloced += folio_nr_pages(folio);
1987 0 : inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
1988 0 : shmem_recalc_inode(inode);
1989 0 : spin_unlock_irq(&info->lock);
1990 0 : alloced = true;
1991 :
1992 0 : if (folio_test_pmd_mappable(folio) &&
1993 : DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1994 : folio_next_index(folio) - 1) {
1995 : /*
1996 : * Part of the large folio is beyond i_size: subject
1997 : * to shrink under memory pressure.
1998 : */
1999 : spin_lock(&sbinfo->shrinklist_lock);
2000 : /*
2001 :                          * list_empty_careful() to defend against unlocked access
2002 :                          * to ->shrinklist in shmem_unused_huge_shrink().
2003 : */
2004 : if (list_empty_careful(&info->shrinklist)) {
2005 : list_add_tail(&info->shrinklist,
2006 : &sbinfo->shrinklist);
2007 : sbinfo->shrinklist_len++;
2008 : }
2009 : spin_unlock(&sbinfo->shrinklist_lock);
2010 : }
2011 :
2012 : /*
2013 : * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
2014 : */
2015 0 : if (sgp == SGP_FALLOC)
2016 0 : sgp = SGP_WRITE;
2017 : clear:
2018 : /*
2019 : * Let SGP_WRITE caller clear ends if write does not fill folio;
2020 : * but SGP_FALLOC on a folio fallocated earlier must initialize
2021 : * it now, lest undo on failure cancel our earlier guarantee.
2022 : */
2023 0 : if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
2024 0 : long i, n = folio_nr_pages(folio);
2025 :
2026 0 : for (i = 0; i < n; i++)
2027 0 : clear_highpage(folio_page(folio, i));
2028 0 : flush_dcache_folio(folio);
2029 0 : folio_mark_uptodate(folio);
2030 : }
2031 :
2032 : /* Perhaps the file has been truncated since we checked */
2033 0 : if (sgp <= SGP_CACHE &&
2034 0 : ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
2035 0 : if (alloced) {
2036 0 : folio_clear_dirty(folio);
2037 0 : filemap_remove_folio(folio);
2038 0 : spin_lock_irq(&info->lock);
2039 0 : shmem_recalc_inode(inode);
2040 0 : spin_unlock_irq(&info->lock);
2041 : }
2042 : error = -EINVAL;
2043 : goto unlock;
2044 : }
2045 : out:
2046 0 : *foliop = folio;
2047 : return 0;
2048 :
2049 : /*
2050 : * Error recovery.
2051 : */
2052 : unacct:
2053 0 : shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
2054 :
2055 0 : if (folio_test_large(folio)) {
2056 0 : folio_unlock(folio);
2057 0 : folio_put(folio);
2058 : goto alloc_nohuge;
2059 : }
2060 : unlock:
2061 0 : if (folio) {
2062 0 : folio_unlock(folio);
2063 0 : folio_put(folio);
2064 : }
2065 0 : if (error == -ENOSPC && !once++) {
2066 0 : spin_lock_irq(&info->lock);
2067 0 : shmem_recalc_inode(inode);
2068 0 : spin_unlock_irq(&info->lock);
2069 : goto repeat;
2070 : }
2071 0 : if (error == -EEXIST)
2072 : goto repeat;
2073 : return error;
2074 : }
2075 :
2076 0 : int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
2077 : enum sgp_type sgp)
2078 : {
2079 0 : return shmem_get_folio_gfp(inode, index, foliop, sgp,
2080 : mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
2081 : }
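/*
 * Illustrative caller sketch, not part of shmem.c: how an in-kernel user
 * of shmem_get_folio() is expected to handle its contract.  On success the
 * folio comes back locked and with a reference held; with SGP_READ a hole
 * legitimately comes back as error 0 with a NULL folio.  The function name
 * shmem_peek_page_example() is made up for this sketch.
 */
static int shmem_peek_page_example(struct inode *inode, pgoff_t index)
{
	struct folio *folio;
	int err;

	err = shmem_get_folio(inode, index, &folio, SGP_READ);
	if (err)
		return err;
	if (!folio)
		return 0;		/* hole: caller treats it as zeroes */

	/* ... access the folio contents here ... */

	folio_unlock(folio);
	folio_put(folio);
	return 0;
}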
2082 :
2083 : /*
2084 : * This is like autoremove_wake_function, but it removes the wait queue
2085 : * entry unconditionally - even if something else had already woken the
2086 : * target.
2087 : */
2088 0 : static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
2089 : {
2090 0 : int ret = default_wake_function(wait, mode, sync, key);
2091 0 : list_del_init(&wait->entry);
2092 0 : return ret;
2093 : }
2094 :
2095 0 : static vm_fault_t shmem_fault(struct vm_fault *vmf)
2096 : {
2097 0 : struct vm_area_struct *vma = vmf->vma;
2098 0 : struct inode *inode = file_inode(vma->vm_file);
2099 0 : gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
2100 0 : struct folio *folio = NULL;
2101 : int err;
2102 0 : vm_fault_t ret = VM_FAULT_LOCKED;
2103 :
2104 : /*
2105 : * Trinity finds that probing a hole which tmpfs is punching can
2106 : * prevent the hole-punch from ever completing: which in turn
2107 : * locks writers out with its hold on i_rwsem. So refrain from
2108 : * faulting pages into the hole while it's being punched. Although
2109 : * shmem_undo_range() does remove the additions, it may be unable to
2110 : * keep up, as each new page needs its own unmap_mapping_range() call,
2111 : * and the i_mmap tree grows ever slower to scan if new vmas are added.
2112 : *
2113 : * It does not matter if we sometimes reach this check just before the
2114 : * hole-punch begins, so that one fault then races with the punch:
2115 : * we just need to make racing faults a rare case.
2116 : *
2117 : * The implementation below would be much simpler if we just used a
2118 : * standard mutex or completion: but we cannot take i_rwsem in fault,
2119 : * and bloating every shmem inode for this unlikely case would be sad.
2120 : */
2121 0 : if (unlikely(inode->i_private)) {
2122 : struct shmem_falloc *shmem_falloc;
2123 :
2124 0 : spin_lock(&inode->i_lock);
2125 0 : shmem_falloc = inode->i_private;
2126 0 : if (shmem_falloc &&
2127 0 : shmem_falloc->waitq &&
2128 0 : vmf->pgoff >= shmem_falloc->start &&
2129 0 : vmf->pgoff < shmem_falloc->next) {
2130 : struct file *fpin;
2131 : wait_queue_head_t *shmem_falloc_waitq;
2132 0 : DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
2133 :
2134 0 : ret = VM_FAULT_NOPAGE;
2135 0 : fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2136 0 : if (fpin)
2137 0 : ret = VM_FAULT_RETRY;
2138 :
2139 0 : shmem_falloc_waitq = shmem_falloc->waitq;
2140 0 : prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
2141 : TASK_UNINTERRUPTIBLE);
2142 0 : spin_unlock(&inode->i_lock);
2143 0 : schedule();
2144 :
2145 : /*
2146 : * shmem_falloc_waitq points into the shmem_fallocate()
2147 : * stack of the hole-punching task: shmem_falloc_waitq
2148 : * is usually invalid by the time we reach here, but
2149 : * finish_wait() does not dereference it in that case;
2150 :                          * but i_lock is still needed to avoid racing with wake_up_all().
2151 : */
2152 0 : spin_lock(&inode->i_lock);
2153 0 : finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
2154 0 : spin_unlock(&inode->i_lock);
2155 :
2156 0 : if (fpin)
2157 0 : fput(fpin);
2158 0 : return ret;
2159 : }
2160 0 : spin_unlock(&inode->i_lock);
2161 : }
2162 :
2163 0 : err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
2164 : gfp, vma, vmf, &ret);
2165 0 : if (err)
2166 : return vmf_error(err);
2167 0 : if (folio)
2168 0 : vmf->page = folio_file_page(folio, vmf->pgoff);
2169 0 : return ret;
2170 : }
2171 :
2172 0 : unsigned long shmem_get_unmapped_area(struct file *file,
2173 : unsigned long uaddr, unsigned long len,
2174 : unsigned long pgoff, unsigned long flags)
2175 : {
2176 : unsigned long (*get_area)(struct file *,
2177 : unsigned long, unsigned long, unsigned long, unsigned long);
2178 : unsigned long addr;
2179 : unsigned long offset;
2180 : unsigned long inflated_len;
2181 : unsigned long inflated_addr;
2182 : unsigned long inflated_offset;
2183 :
2184 0 : if (len > TASK_SIZE)
2185 : return -ENOMEM;
2186 :
2187 0 : get_area = current->mm->get_unmapped_area;
2188 0 : addr = get_area(file, uaddr, len, pgoff, flags);
2189 :
2190 : if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
2191 0 : return addr;
2192 : if (IS_ERR_VALUE(addr))
2193 : return addr;
2194 : if (addr & ~PAGE_MASK)
2195 : return addr;
2196 : if (addr > TASK_SIZE - len)
2197 : return addr;
2198 :
2199 : if (shmem_huge == SHMEM_HUGE_DENY)
2200 : return addr;
2201 : if (len < HPAGE_PMD_SIZE)
2202 : return addr;
2203 : if (flags & MAP_FIXED)
2204 : return addr;
2205 : /*
2206 : * Our priority is to support MAP_SHARED mapped hugely;
2207 : * and support MAP_PRIVATE mapped hugely too, until it is COWed.
2208 : * But if caller specified an address hint and we allocated area there
2209 : * successfully, respect that as before.
2210 : */
2211 : if (uaddr == addr)
2212 : return addr;
2213 :
2214 : if (shmem_huge != SHMEM_HUGE_FORCE) {
2215 : struct super_block *sb;
2216 :
2217 : if (file) {
2218 : VM_BUG_ON(file->f_op != &shmem_file_operations);
2219 : sb = file_inode(file)->i_sb;
2220 : } else {
2221 : /*
2222 : * Called directly from mm/mmap.c, or drivers/char/mem.c
2223 : * for "/dev/zero", to create a shared anonymous object.
2224 : */
2225 : if (IS_ERR(shm_mnt))
2226 : return addr;
2227 : sb = shm_mnt->mnt_sb;
2228 : }
2229 : if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
2230 : return addr;
2231 : }
2232 :
2233 : offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
2234 : if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
2235 : return addr;
2236 : if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
2237 : return addr;
2238 :
2239 : inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
2240 : if (inflated_len > TASK_SIZE)
2241 : return addr;
2242 : if (inflated_len < len)
2243 : return addr;
2244 :
2245 : inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
2246 : if (IS_ERR_VALUE(inflated_addr))
2247 : return addr;
2248 : if (inflated_addr & ~PAGE_MASK)
2249 : return addr;
2250 :
2251 : inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
2252 : inflated_addr += offset - inflated_offset;
2253 : if (inflated_offset > offset)
2254 : inflated_addr += HPAGE_PMD_SIZE;
2255 :
2256 : if (inflated_addr > TASK_SIZE - len)
2257 : return addr;
2258 : return inflated_addr;
2259 : }
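/*
 * Illustrative arithmetic only, not kernel code: given the inflated area
 * chosen above, pick the start whose offset within a PMD-sized unit matches
 * "offset", mirroring the adjustment at the end of shmem_get_unmapped_area().
 * align_for_pmd_example() and the 2MB EX_HPAGE_PMD_SIZE constant are
 * assumptions made for this sketch.
 */
#define EX_HPAGE_PMD_SIZE	(2UL << 20)

static unsigned long align_for_pmd_example(unsigned long inflated_addr,
					    unsigned long offset)
{
	unsigned long inflated_offset = inflated_addr & (EX_HPAGE_PMD_SIZE - 1);

	inflated_addr += offset - inflated_offset;
	if (inflated_offset > offset)
		inflated_addr += EX_HPAGE_PMD_SIZE;
	return inflated_addr;
}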
2260 :
2261 : #ifdef CONFIG_NUMA
2262 : static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2263 : {
2264 : struct inode *inode = file_inode(vma->vm_file);
2265 : return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2266 : }
2267 :
2268 : static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2269 : unsigned long addr)
2270 : {
2271 : struct inode *inode = file_inode(vma->vm_file);
2272 : pgoff_t index;
2273 :
2274 : index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2275 : return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2276 : }
2277 : #endif
2278 :
2279 0 : int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
2280 : {
2281 0 : struct inode *inode = file_inode(file);
2282 0 : struct shmem_inode_info *info = SHMEM_I(inode);
2283 0 : int retval = -ENOMEM;
2284 :
2285 : /*
2286 : * What serializes the accesses to info->flags?
2287 : * ipc_lock_object() when called from shmctl_do_lock(),
2288 : * no serialization needed when called from shm_destroy().
2289 : */
2290 0 : if (lock && !(info->flags & VM_LOCKED)) {
2291 0 : if (!user_shm_lock(inode->i_size, ucounts))
2292 : goto out_nomem;
2293 0 : info->flags |= VM_LOCKED;
2294 0 : mapping_set_unevictable(file->f_mapping);
2295 : }
2296 0 : if (!lock && (info->flags & VM_LOCKED) && ucounts) {
2297 0 : user_shm_unlock(inode->i_size, ucounts);
2298 0 : info->flags &= ~VM_LOCKED;
2299 0 : mapping_clear_unevictable(file->f_mapping);
2300 : }
2301 : retval = 0;
2302 :
2303 : out_nomem:
2304 0 : return retval;
2305 : }
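/*
 * Illustrative userspace trigger for shmem_lock(), not part of shmem.c:
 * SysV shmctl(SHM_LOCK) pins the whole segment in memory (subject to
 * RLIMIT_MEMLOCK accounting via user_shm_lock()); SHM_UNLOCK releases it.
 */
#include <sys/shm.h>

static int lock_unlock_segment_example(int shmid)
{
	if (shmctl(shmid, SHM_LOCK, NULL))
		return -1;
	return shmctl(shmid, SHM_UNLOCK, NULL);
}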
2306 :
2307 0 : static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2308 : {
2309 0 : struct inode *inode = file_inode(file);
2310 0 : struct shmem_inode_info *info = SHMEM_I(inode);
2311 : int ret;
2312 :
2313 0 : ret = seal_check_future_write(info->seals, vma);
2314 0 : if (ret)
2315 : return ret;
2316 :
2317 : /* arm64 - allow memory tagging on RAM-based files */
2318 0 : vm_flags_set(vma, VM_MTE_ALLOWED);
2319 :
2320 0 : file_accessed(file);
2321 : /* This is anonymous shared memory if it is unlinked at the time of mmap */
2322 0 : if (inode->i_nlink)
2323 0 : vma->vm_ops = &shmem_vm_ops;
2324 : else
2325 0 : vma->vm_ops = &shmem_anon_vm_ops;
2326 : return 0;
2327 : }
2328 :
2329 : #ifdef CONFIG_TMPFS_XATTR
2330 : static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2331 :
2332 : /*
2333 : * chattr's fsflags are unrelated to extended attributes,
2334 : * but tmpfs has chosen to enable them under the same config option.
2335 : */
2336 : static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2337 : {
2338 : unsigned int i_flags = 0;
2339 :
2340 : if (fsflags & FS_NOATIME_FL)
2341 : i_flags |= S_NOATIME;
2342 : if (fsflags & FS_APPEND_FL)
2343 : i_flags |= S_APPEND;
2344 : if (fsflags & FS_IMMUTABLE_FL)
2345 : i_flags |= S_IMMUTABLE;
2346 : /*
2347 : * But FS_NODUMP_FL does not require any action in i_flags.
2348 : */
2349 : inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
2350 : }
2351 : #else
2352 : static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2353 : {
2354 : }
2355 : #define shmem_initxattrs NULL
2356 : #endif
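/*
 * Illustrative userspace counterpart of the flag mapping above, not part of
 * shmem.c: the generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls reach tmpfs
 * through its fileattr handlers, so e.g. FS_NOATIME_FL can be toggled like
 * this.  set_noatime_example() is a made-up name.
 */
#include <sys/ioctl.h>
#include <linux/fs.h>

static int set_noatime_example(int fd)
{
	int flags;

	if (ioctl(fd, FS_IOC_GETFLAGS, &flags))
		return -1;
	flags |= FS_NOATIME_FL;
	return ioctl(fd, FS_IOC_SETFLAGS, &flags);
}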
2357 :
2358 1 : static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb,
2359 : struct inode *dir, umode_t mode, dev_t dev,
2360 : unsigned long flags)
2361 : {
2362 : struct inode *inode;
2363 : struct shmem_inode_info *info;
2364 1 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2365 : ino_t ino;
2366 :
2367 1 : if (shmem_reserve_inode(sb, &ino))
2368 : return NULL;
2369 :
2370 1 : inode = new_inode(sb);
2371 1 : if (inode) {
2372 1 : inode->i_ino = ino;
2373 1 : inode_init_owner(idmap, inode, dir, mode);
2374 1 : inode->i_blocks = 0;
2375 1 : inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
2376 1 : inode->i_generation = get_random_u32();
2377 1 : info = SHMEM_I(inode);
2378 2 : memset(info, 0, (char *)inode - (char *)info);
2379 1 : spin_lock_init(&info->lock);
2380 2 : atomic_set(&info->stop_eviction, 0);
2381 1 : info->seals = F_SEAL_SEAL;
2382 1 : info->flags = flags & VM_NORESERVE;
2383 1 : info->i_crtime = inode->i_mtime;
2384 1 : info->fsflags = (dir == NULL) ? 0 :
2385 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
2386 : if (info->fsflags)
2387 : shmem_set_inode_flags(inode, info->fsflags);
2388 2 : INIT_LIST_HEAD(&info->shrinklist);
2389 2 : INIT_LIST_HEAD(&info->swaplist);
2390 1 : if (sbinfo->noswap)
2391 0 : mapping_set_unevictable(inode->i_mapping);
2392 1 : simple_xattrs_init(&info->xattrs);
2393 1 : cache_no_acl(inode);
2394 2 : mapping_set_large_folios(inode->i_mapping);
2395 :
2396 1 : switch (mode & S_IFMT) {
2397 : default:
2398 0 : inode->i_op = &shmem_special_inode_operations;
2399 0 : init_special_inode(inode, mode, dev);
2400 0 : break;
2401 : case S_IFREG:
2402 0 : inode->i_mapping->a_ops = &shmem_aops;
2403 0 : inode->i_op = &shmem_inode_operations;
2404 0 : inode->i_fop = &shmem_file_operations;
2405 0 : mpol_shared_policy_init(&info->policy,
2406 : shmem_get_sbmpol(sbinfo));
2407 : break;
2408 : case S_IFDIR:
2409 1 : inc_nlink(inode);
2410 : /* Some things misbehave if size == 0 on a directory */
2411 1 : inode->i_size = 2 * BOGO_DIRENT_SIZE;
2412 1 : inode->i_op = &shmem_dir_inode_operations;
2413 1 : inode->i_fop = &simple_dir_operations;
2414 1 : break;
2415 : case S_IFLNK:
2416 : /*
2417 :                          * Must not load anything into the policy rbtree:
2418 :                          * mpol_free_shared_policy() will not be called for this inode.
2419 : */
2420 : mpol_shared_policy_init(&info->policy, NULL);
2421 : break;
2422 : }
2423 :
2424 : lockdep_annotate_inode_mutex_key(inode);
2425 : } else
2426 : shmem_free_inode(sb);
2427 : return inode;
2428 : }
2429 :
2430 : #ifdef CONFIG_USERFAULTFD
2431 : int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
2432 : struct vm_area_struct *dst_vma,
2433 : unsigned long dst_addr,
2434 : unsigned long src_addr,
2435 : uffd_flags_t flags,
2436 : struct folio **foliop)
2437 : {
2438 : struct inode *inode = file_inode(dst_vma->vm_file);
2439 : struct shmem_inode_info *info = SHMEM_I(inode);
2440 : struct address_space *mapping = inode->i_mapping;
2441 : gfp_t gfp = mapping_gfp_mask(mapping);
2442 : pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2443 : void *page_kaddr;
2444 : struct folio *folio;
2445 : int ret;
2446 : pgoff_t max_off;
2447 :
2448 : if (!shmem_inode_acct_block(inode, 1)) {
2449 : /*
2450 :                  * We may have got a folio, returned -ENOENT triggering a retry,
2451 :                  * and now we find ourselves with -ENOMEM. Release the folio, to
2452 : * avoid a BUG_ON in our caller.
2453 : */
2454 : if (unlikely(*foliop)) {
2455 : folio_put(*foliop);
2456 : *foliop = NULL;
2457 : }
2458 : return -ENOMEM;
2459 : }
2460 :
2461 : if (!*foliop) {
2462 : ret = -ENOMEM;
2463 : folio = shmem_alloc_folio(gfp, info, pgoff);
2464 : if (!folio)
2465 : goto out_unacct_blocks;
2466 :
2467 : if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
2468 : page_kaddr = kmap_local_folio(folio, 0);
2469 : /*
2470 : * The read mmap_lock is held here. Despite the
2471 :                          * mmap_lock being read-recursive, a deadlock is still
2472 :                          * possible if a writer has taken the lock. For example:
2473 : *
2474 : * process A thread 1 takes read lock on own mmap_lock
2475 : * process A thread 2 calls mmap, blocks taking write lock
2476 : * process B thread 1 takes page fault, read lock on own mmap lock
2477 : * process B thread 2 calls mmap, blocks taking write lock
2478 : * process A thread 1 blocks taking read lock on process B
2479 : * process B thread 1 blocks taking read lock on process A
2480 : *
2481 : * Disable page faults to prevent potential deadlock
2482 : * and retry the copy outside the mmap_lock.
2483 : */
2484 : pagefault_disable();
2485 : ret = copy_from_user(page_kaddr,
2486 : (const void __user *)src_addr,
2487 : PAGE_SIZE);
2488 : pagefault_enable();
2489 : kunmap_local(page_kaddr);
2490 :
2491 : /* fallback to copy_from_user outside mmap_lock */
2492 : if (unlikely(ret)) {
2493 : *foliop = folio;
2494 : ret = -ENOENT;
2495 : /* don't free the page */
2496 : goto out_unacct_blocks;
2497 : }
2498 :
2499 : flush_dcache_folio(folio);
2500 : } else { /* ZEROPAGE */
2501 : clear_user_highpage(&folio->page, dst_addr);
2502 : }
2503 : } else {
2504 : folio = *foliop;
2505 : VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
2506 : *foliop = NULL;
2507 : }
2508 :
2509 : VM_BUG_ON(folio_test_locked(folio));
2510 : VM_BUG_ON(folio_test_swapbacked(folio));
2511 : __folio_set_locked(folio);
2512 : __folio_set_swapbacked(folio);
2513 : __folio_mark_uptodate(folio);
2514 :
2515 : ret = -EFAULT;
2516 : max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2517 : if (unlikely(pgoff >= max_off))
2518 : goto out_release;
2519 :
2520 : ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
2521 : gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
2522 : if (ret)
2523 : goto out_release;
2524 :
2525 : ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
2526 : &folio->page, true, flags);
2527 : if (ret)
2528 : goto out_delete_from_cache;
2529 :
2530 : spin_lock_irq(&info->lock);
2531 : info->alloced++;
2532 : inode->i_blocks += BLOCKS_PER_PAGE;
2533 : shmem_recalc_inode(inode);
2534 : spin_unlock_irq(&info->lock);
2535 :
2536 : folio_unlock(folio);
2537 : return 0;
2538 : out_delete_from_cache:
2539 : filemap_remove_folio(folio);
2540 : out_release:
2541 : folio_unlock(folio);
2542 : folio_put(folio);
2543 : out_unacct_blocks:
2544 : shmem_inode_unacct_blocks(inode, 1);
2545 : return ret;
2546 : }
2547 : #endif /* CONFIG_USERFAULTFD */
2548 :
2549 : #ifdef CONFIG_TMPFS
2550 : static const struct inode_operations shmem_symlink_inode_operations;
2551 : static const struct inode_operations shmem_short_symlink_operations;
2552 :
2553 : static int
2554 : shmem_write_begin(struct file *file, struct address_space *mapping,
2555 : loff_t pos, unsigned len,
2556 : struct page **pagep, void **fsdata)
2557 : {
2558 : struct inode *inode = mapping->host;
2559 : struct shmem_inode_info *info = SHMEM_I(inode);
2560 : pgoff_t index = pos >> PAGE_SHIFT;
2561 : struct folio *folio;
2562 : int ret = 0;
2563 :
2564 : /* i_rwsem is held by caller */
2565 : if (unlikely(info->seals & (F_SEAL_GROW |
2566 : F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
2567 : if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
2568 : return -EPERM;
2569 : if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
2570 : return -EPERM;
2571 : }
2572 :
2573 : ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
2574 :
2575 : if (ret)
2576 : return ret;
2577 :
2578 : *pagep = folio_file_page(folio, index);
2579 : if (PageHWPoison(*pagep)) {
2580 : folio_unlock(folio);
2581 : folio_put(folio);
2582 : *pagep = NULL;
2583 : return -EIO;
2584 : }
2585 :
2586 : return 0;
2587 : }
2588 :
2589 : static int
2590 : shmem_write_end(struct file *file, struct address_space *mapping,
2591 : loff_t pos, unsigned len, unsigned copied,
2592 : struct page *page, void *fsdata)
2593 : {
2594 : struct folio *folio = page_folio(page);
2595 : struct inode *inode = mapping->host;
2596 :
2597 : if (pos + copied > inode->i_size)
2598 : i_size_write(inode, pos + copied);
2599 :
2600 : if (!folio_test_uptodate(folio)) {
2601 : if (copied < folio_size(folio)) {
2602 : size_t from = offset_in_folio(folio, pos);
2603 : folio_zero_segments(folio, 0, from,
2604 : from + copied, folio_size(folio));
2605 : }
2606 : folio_mark_uptodate(folio);
2607 : }
2608 : folio_mark_dirty(folio);
2609 : folio_unlock(folio);
2610 : folio_put(folio);
2611 :
2612 : return copied;
2613 : }
2614 :
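/*
 * Illustrative sketch of how the ->write_begin/->write_end pair above is
 * driven, not part of shmem.c and much simplified compared with the real
 * generic_perform_write() (which copies from an iov_iter and handles short
 * copies).  buffered_write_chunk_example() is a made-up name; "src" is
 * assumed to be a kernel buffer and pos/len to stay within one page.
 */
static ssize_t buffered_write_chunk_example(struct file *file,
					    const char *src, loff_t pos,
					    unsigned int len)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	void *fsdata = NULL;
	char *kaddr;
	int err;

	err = mapping->a_ops->write_begin(file, mapping, pos, len,
					  &page, &fsdata);
	if (err)
		return err;

	kaddr = kmap_local_page(page);
	memcpy(kaddr + offset_in_page(pos), src, len);
	kunmap_local(kaddr);
	flush_dcache_page(page);

	return mapping->a_ops->write_end(file, mapping, pos, len, len,
					 page, fsdata);
}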
2615 : static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
2616 : {
2617 : struct file *file = iocb->ki_filp;
2618 : struct inode *inode = file_inode(file);
2619 : struct address_space *mapping = inode->i_mapping;
2620 : pgoff_t index;
2621 : unsigned long offset;
2622 : int error = 0;
2623 : ssize_t retval = 0;
2624 : loff_t *ppos = &iocb->ki_pos;
2625 :
2626 : index = *ppos >> PAGE_SHIFT;
2627 : offset = *ppos & ~PAGE_MASK;
2628 :
2629 : for (;;) {
2630 : struct folio *folio = NULL;
2631 : struct page *page = NULL;
2632 : pgoff_t end_index;
2633 : unsigned long nr, ret;
2634 : loff_t i_size = i_size_read(inode);
2635 :
2636 : end_index = i_size >> PAGE_SHIFT;
2637 : if (index > end_index)
2638 : break;
2639 : if (index == end_index) {
2640 : nr = i_size & ~PAGE_MASK;
2641 : if (nr <= offset)
2642 : break;
2643 : }
2644 :
2645 : error = shmem_get_folio(inode, index, &folio, SGP_READ);
2646 : if (error) {
2647 : if (error == -EINVAL)
2648 : error = 0;
2649 : break;
2650 : }
2651 : if (folio) {
2652 : folio_unlock(folio);
2653 :
2654 : page = folio_file_page(folio, index);
2655 : if (PageHWPoison(page)) {
2656 : folio_put(folio);
2657 : error = -EIO;
2658 : break;
2659 : }
2660 : }
2661 :
2662 : /*
2663 :           * We must re-evaluate i_size after getting the folio, since reads
2664 :           * (unlike writes) are called without i_rwsem protection against truncate.
2665 : */
2666 : nr = PAGE_SIZE;
2667 : i_size = i_size_read(inode);
2668 : end_index = i_size >> PAGE_SHIFT;
2669 : if (index == end_index) {
2670 : nr = i_size & ~PAGE_MASK;
2671 : if (nr <= offset) {
2672 : if (folio)
2673 : folio_put(folio);
2674 : break;
2675 : }
2676 : }
2677 : nr -= offset;
2678 :
2679 : if (folio) {
2680 : /*
2681 : * If users can be writing to this page using arbitrary
2682 : * virtual addresses, take care about potential aliasing
2683 : * before reading the page on the kernel side.
2684 : */
2685 : if (mapping_writably_mapped(mapping))
2686 : flush_dcache_page(page);
2687 : /*
2688 : * Mark the page accessed if we read the beginning.
2689 : */
2690 : if (!offset)
2691 : folio_mark_accessed(folio);
2692 : /*
2693 : * Ok, we have the page, and it's up-to-date, so
2694 : * now we can copy it to user space...
2695 : */
2696 : ret = copy_page_to_iter(page, offset, nr, to);
2697 : folio_put(folio);
2698 :
2699 : } else if (user_backed_iter(to)) {
2700 : /*
2701 :                   * copy_to_user() tends to be so well optimized, and
2702 :                   * clear_user() not so much, that it is noticeably
2703 :                   * faster to copy the zero page than to clear the buffer.
2704 : */
2705 : ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
2706 : } else {
2707 : /*
2708 : * But submitting the same page twice in a row to
2709 : * splice() - or others? - can result in confusion:
2710 : * so don't attempt that optimization on pipes etc.
2711 : */
2712 : ret = iov_iter_zero(nr, to);
2713 : }
2714 :
2715 : retval += ret;
2716 : offset += ret;
2717 : index += offset >> PAGE_SHIFT;
2718 : offset &= ~PAGE_MASK;
2719 :
2720 : if (!iov_iter_count(to))
2721 : break;
2722 : if (ret < nr) {
2723 : error = -EFAULT;
2724 : break;
2725 : }
2726 : cond_resched();
2727 : }
2728 :
2729 : *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
2730 : file_accessed(file);
2731 : return retval ? retval : error;
2732 : }
2733 :
2734 : static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
2735 : struct pipe_buffer *buf)
2736 : {
2737 : return true;
2738 : }
2739 :
2740 : static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
2741 : struct pipe_buffer *buf)
2742 : {
2743 : }
2744 :
2745 : static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
2746 : struct pipe_buffer *buf)
2747 : {
2748 : return false;
2749 : }
2750 :
2751 : static const struct pipe_buf_operations zero_pipe_buf_ops = {
2752 : .release = zero_pipe_buf_release,
2753 : .try_steal = zero_pipe_buf_try_steal,
2754 : .get = zero_pipe_buf_get,
2755 : };
2756 :
2757 : static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
2758 : loff_t fpos, size_t size)
2759 : {
2760 : size_t offset = fpos & ~PAGE_MASK;
2761 :
2762 : size = min_t(size_t, size, PAGE_SIZE - offset);
2763 :
2764 : if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
2765 : struct pipe_buffer *buf = pipe_head_buf(pipe);
2766 :
2767 : *buf = (struct pipe_buffer) {
2768 : .ops = &zero_pipe_buf_ops,
2769 : .page = ZERO_PAGE(0),
2770 : .offset = offset,
2771 : .len = size,
2772 : };
2773 : pipe->head++;
2774 : }
2775 :
2776 : return size;
2777 : }
2778 :
2779 : static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
2780 : struct pipe_inode_info *pipe,
2781 : size_t len, unsigned int flags)
2782 : {
2783 : struct inode *inode = file_inode(in);
2784 : struct address_space *mapping = inode->i_mapping;
2785 : struct folio *folio = NULL;
2786 : size_t total_spliced = 0, used, npages, n, part;
2787 : loff_t isize;
2788 : int error = 0;
2789 :
2790 : /* Work out how much data we can actually add into the pipe */
2791 : used = pipe_occupancy(pipe->head, pipe->tail);
2792 : npages = max_t(ssize_t, pipe->max_usage - used, 0);
2793 : len = min_t(size_t, len, npages * PAGE_SIZE);
2794 :
2795 : do {
2796 : if (*ppos >= i_size_read(inode))
2797 : break;
2798 :
2799 : error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ);
2800 : if (error) {
2801 : if (error == -EINVAL)
2802 : error = 0;
2803 : break;
2804 : }
2805 : if (folio) {
2806 : folio_unlock(folio);
2807 :
2808 : if (folio_test_hwpoison(folio)) {
2809 : error = -EIO;
2810 : break;
2811 : }
2812 : }
2813 :
2814 : /*
2815 :           * i_size must be checked after we know the folio is Uptodate.
2816 :           *
2817 :           * Checking i_size only then lets us calculate the correct value
2818 :           * for "part", so that the zero-filled tail of the folio is not
2819 :           * copied back to userspace (unless another truncate extends the
2820 :           * file - but that is desired).
2821 : */
2822 : isize = i_size_read(inode);
2823 : if (unlikely(*ppos >= isize))
2824 : break;
2825 : part = min_t(loff_t, isize - *ppos, len);
2826 :
2827 : if (folio) {
2828 : /*
2829 : * If users can be writing to this page using arbitrary
2830 : * virtual addresses, take care about potential aliasing
2831 : * before reading the page on the kernel side.
2832 : */
2833 : if (mapping_writably_mapped(mapping))
2834 : flush_dcache_folio(folio);
2835 : folio_mark_accessed(folio);
2836 : /*
2837 : * Ok, we have the page, and it's up-to-date, so we can
2838 : * now splice it into the pipe.
2839 : */
2840 : n = splice_folio_into_pipe(pipe, folio, *ppos, part);
2841 : folio_put(folio);
2842 : folio = NULL;
2843 : } else {
2844 : n = splice_zeropage_into_pipe(pipe, *ppos, len);
2845 : }
2846 :
2847 : if (!n)
2848 : break;
2849 : len -= n;
2850 : total_spliced += n;
2851 : *ppos += n;
2852 : in->f_ra.prev_pos = *ppos;
2853 : if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
2854 : break;
2855 :
2856 : cond_resched();
2857 : } while (len);
2858 :
2859 : if (folio)
2860 : folio_put(folio);
2861 :
2862 : file_accessed(in);
2863 : return total_spliced ? total_spliced : error;
2864 : }
2865 :
2866 : static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
2867 : {
2868 : struct address_space *mapping = file->f_mapping;
2869 : struct inode *inode = mapping->host;
2870 :
2871 : if (whence != SEEK_DATA && whence != SEEK_HOLE)
2872 : return generic_file_llseek_size(file, offset, whence,
2873 : MAX_LFS_FILESIZE, i_size_read(inode));
2874 : if (offset < 0)
2875 : return -ENXIO;
2876 :
2877 : inode_lock(inode);
2878 : /* We're holding i_rwsem so we can access i_size directly */
2879 : offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
2880 : if (offset >= 0)
2881 : offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
2882 : inode_unlock(inode);
2883 : return offset;
2884 : }
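/*
 * Illustrative userspace use of the SEEK_DATA/SEEK_HOLE support above, not
 * part of shmem.c: lseek() returns the next data offset at or after pos, or
 * fails with ENXIO when there is no further data before end of file.
 */
#define _GNU_SOURCE
#include <unistd.h>

static off_t next_data_example(int fd, off_t pos)
{
	return lseek(fd, pos, SEEK_DATA);
}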
2885 :
2886 : static long shmem_fallocate(struct file *file, int mode, loff_t offset,
2887 : loff_t len)
2888 : {
2889 : struct inode *inode = file_inode(file);
2890 : struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2891 : struct shmem_inode_info *info = SHMEM_I(inode);
2892 : struct shmem_falloc shmem_falloc;
2893 : pgoff_t start, index, end, undo_fallocend;
2894 : int error;
2895 :
2896 : if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2897 : return -EOPNOTSUPP;
2898 :
2899 : inode_lock(inode);
2900 :
2901 : if (mode & FALLOC_FL_PUNCH_HOLE) {
2902 : struct address_space *mapping = file->f_mapping;
2903 : loff_t unmap_start = round_up(offset, PAGE_SIZE);
2904 : loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
2905 : DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
2906 :
2907 : /* protected by i_rwsem */
2908 : if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
2909 : error = -EPERM;
2910 : goto out;
2911 : }
2912 :
2913 : shmem_falloc.waitq = &shmem_falloc_waitq;
2914 : shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
2915 : shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
2916 : spin_lock(&inode->i_lock);
2917 : inode->i_private = &shmem_falloc;
2918 : spin_unlock(&inode->i_lock);
2919 :
2920 : if ((u64)unmap_end > (u64)unmap_start)
2921 : unmap_mapping_range(mapping, unmap_start,
2922 : 1 + unmap_end - unmap_start, 0);
2923 : shmem_truncate_range(inode, offset, offset + len - 1);
2924 : /* No need to unmap again: hole-punching leaves COWed pages */
2925 :
2926 : spin_lock(&inode->i_lock);
2927 : inode->i_private = NULL;
2928 : wake_up_all(&shmem_falloc_waitq);
2929 : WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
2930 : spin_unlock(&inode->i_lock);
2931 : error = 0;
2932 : goto out;
2933 : }
2934 :
2935 : /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
2936 : error = inode_newsize_ok(inode, offset + len);
2937 : if (error)
2938 : goto out;
2939 :
2940 : if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
2941 : error = -EPERM;
2942 : goto out;
2943 : }
2944 :
2945 : start = offset >> PAGE_SHIFT;
2946 : end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
2947 : /* Try to avoid a swapstorm if len is impossible to satisfy */
2948 : if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
2949 : error = -ENOSPC;
2950 : goto out;
2951 : }
2952 :
2953 : shmem_falloc.waitq = NULL;
2954 : shmem_falloc.start = start;
2955 : shmem_falloc.next = start;
2956 : shmem_falloc.nr_falloced = 0;
2957 : shmem_falloc.nr_unswapped = 0;
2958 : spin_lock(&inode->i_lock);
2959 : inode->i_private = &shmem_falloc;
2960 : spin_unlock(&inode->i_lock);
2961 :
2962 : /*
2963 : * info->fallocend is only relevant when huge pages might be
2964 :           * involved: it prevents split_huge_page() from freeing fallocated
2965 :           * pages that FALLOC_FL_KEEP_SIZE has committed beyond i_size.
2966 : */
2967 : undo_fallocend = info->fallocend;
2968 : if (info->fallocend < end)
2969 : info->fallocend = end;
2970 :
2971 : for (index = start; index < end; ) {
2972 : struct folio *folio;
2973 :
2974 : /*
2975 : * Good, the fallocate(2) manpage permits EINTR: we may have
2976 : * been interrupted because we are using up too much memory.
2977 : */
2978 : if (signal_pending(current))
2979 : error = -EINTR;
2980 : else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
2981 : error = -ENOMEM;
2982 : else
2983 : error = shmem_get_folio(inode, index, &folio,
2984 : SGP_FALLOC);
2985 : if (error) {
2986 : info->fallocend = undo_fallocend;
2987 : /* Remove the !uptodate folios we added */
2988 : if (index > start) {
2989 : shmem_undo_range(inode,
2990 : (loff_t)start << PAGE_SHIFT,
2991 : ((loff_t)index << PAGE_SHIFT) - 1, true);
2992 : }
2993 : goto undone;
2994 : }
2995 :
2996 : /*
2997 : * Here is a more important optimization than it appears:
2998 : * a second SGP_FALLOC on the same large folio will clear it,
2999 : * making it uptodate and un-undoable if we fail later.
3000 : */
3001 : index = folio_next_index(folio);
3002 : /* Beware 32-bit wraparound */
3003 : if (!index)
3004 : index--;
3005 :
3006 : /*
3007 : * Inform shmem_writepage() how far we have reached.
3008 : * No need for lock or barrier: we have the page lock.
3009 : */
3010 : if (!folio_test_uptodate(folio))
3011 : shmem_falloc.nr_falloced += index - shmem_falloc.next;
3012 : shmem_falloc.next = index;
3013 :
3014 : /*
3015 : * If !uptodate, leave it that way so that freeable folios
3016 : * can be recognized if we need to rollback on error later.
3017 : * But mark it dirty so that memory pressure will swap rather
3018 : * than free the folios we are allocating (and SGP_CACHE folios
3019 : * might still be clean: we now need to mark those dirty too).
3020 : */
3021 : folio_mark_dirty(folio);
3022 : folio_unlock(folio);
3023 : folio_put(folio);
3024 : cond_resched();
3025 : }
3026 :
3027 : if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
3028 : i_size_write(inode, offset + len);
3029 : undone:
3030 : spin_lock(&inode->i_lock);
3031 : inode->i_private = NULL;
3032 : spin_unlock(&inode->i_lock);
3033 : out:
3034 : if (!error)
3035 : file_modified(file);
3036 : inode_unlock(inode);
3037 : return error;
3038 : }
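/*
 * Illustrative userspace use of the two fallocate(2) modes handled above,
 * not part of shmem.c.  The offsets and lengths are arbitrary examples.
 */
#define _GNU_SOURCE
#include <fcntl.h>

static int punch_and_preallocate_example(int fd)
{
	/* Punch a 1MiB hole at offset 4096; i_size is left unchanged. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 1 << 20))
		return -1;

	/* Preallocate the first 8MiB, extending i_size if needed. */
	return fallocate(fd, 0, 0, 8 << 20);
}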
3039 :
3040 : static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
3041 : {
3042 : struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
3043 :
3044 : buf->f_type = TMPFS_MAGIC;
3045 : buf->f_bsize = PAGE_SIZE;
3046 : buf->f_namelen = NAME_MAX;
3047 : if (sbinfo->max_blocks) {
3048 : buf->f_blocks = sbinfo->max_blocks;
3049 : buf->f_bavail =
3050 : buf->f_bfree = sbinfo->max_blocks -
3051 : percpu_counter_sum(&sbinfo->used_blocks);
3052 : }
3053 : if (sbinfo->max_inodes) {
3054 : buf->f_files = sbinfo->max_inodes;
3055 : buf->f_ffree = sbinfo->free_inodes;
3056 : }
3057 : /* else leave those fields 0 like simple_statfs */
3058 :
3059 : buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
3060 :
3061 : return 0;
3062 : }
3063 :
3064 : /*
3065 :   * File creation. Allocate an inode, and we're done.
3066 : */
3067 : static int
3068 : shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
3069 : struct dentry *dentry, umode_t mode, dev_t dev)
3070 : {
3071 : struct inode *inode;
3072 : int error = -ENOSPC;
3073 :
3074 : inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
3075 : if (inode) {
3076 : error = simple_acl_create(dir, inode);
3077 : if (error)
3078 : goto out_iput;
3079 : error = security_inode_init_security(inode, dir,
3080 : &dentry->d_name,
3081 : shmem_initxattrs, NULL);
3082 : if (error && error != -EOPNOTSUPP)
3083 : goto out_iput;
3084 :
3085 : error = 0;
3086 : dir->i_size += BOGO_DIRENT_SIZE;
3087 : dir->i_ctime = dir->i_mtime = current_time(dir);
3088 : inode_inc_iversion(dir);
3089 : d_instantiate(dentry, inode);
3090 : dget(dentry); /* Extra count - pin the dentry in core */
3091 : }
3092 : return error;
3093 : out_iput:
3094 : iput(inode);
3095 : return error;
3096 : }
3097 :
3098 : static int
3099 : shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
3100 : struct file *file, umode_t mode)
3101 : {
3102 : struct inode *inode;
3103 : int error = -ENOSPC;
3104 :
3105 : inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
3106 : if (inode) {
3107 : error = security_inode_init_security(inode, dir,
3108 : NULL,
3109 : shmem_initxattrs, NULL);
3110 : if (error && error != -EOPNOTSUPP)
3111 : goto out_iput;
3112 : error = simple_acl_create(dir, inode);
3113 : if (error)
3114 : goto out_iput;
3115 : d_tmpfile(file, inode);
3116 : }
3117 : return finish_open_simple(file, error);
3118 : out_iput:
3119 : iput(inode);
3120 : return error;
3121 : }
3122 :
3123 : static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
3124 : struct dentry *dentry, umode_t mode)
3125 : {
3126 : int error;
3127 :
3128 : error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
3129 : if (error)
3130 : return error;
3131 : inc_nlink(dir);
3132 : return 0;
3133 : }
3134 :
3135 : static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
3136 : struct dentry *dentry, umode_t mode, bool excl)
3137 : {
3138 : return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
3139 : }
3140 :
3141 : /*
3142 :   * Link a file.
3143 : */
3144 : static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
3145 : {
3146 : struct inode *inode = d_inode(old_dentry);
3147 : int ret = 0;
3148 :
3149 : /*
3150 : * No ordinary (disk based) filesystem counts links as inodes;
3151 : * but each new link needs a new dentry, pinning lowmem, and
3152 : * tmpfs dentries cannot be pruned until they are unlinked.
3153 : * But if an O_TMPFILE file is linked into the tmpfs, the
3154 : * first link must skip that, to get the accounting right.
3155 : */
3156 : if (inode->i_nlink) {
3157 : ret = shmem_reserve_inode(inode->i_sb, NULL);
3158 : if (ret)
3159 : goto out;
3160 : }
3161 :
3162 : dir->i_size += BOGO_DIRENT_SIZE;
3163 : inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
3164 : inode_inc_iversion(dir);
3165 : inc_nlink(inode);
3166 : ihold(inode); /* New dentry reference */
3167 : dget(dentry); /* Extra pinning count for the created dentry */
3168 : d_instantiate(dentry, inode);
3169 : out:
3170 : return ret;
3171 : }
3172 :
3173 : static int shmem_unlink(struct inode *dir, struct dentry *dentry)
3174 : {
3175 : struct inode *inode = d_inode(dentry);
3176 :
3177 : if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
3178 : shmem_free_inode(inode->i_sb);
3179 :
3180 : dir->i_size -= BOGO_DIRENT_SIZE;
3181 : inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
3182 : inode_inc_iversion(dir);
3183 : drop_nlink(inode);
3184 : dput(dentry); /* Undo the count from "create" - this does all the work */
3185 : return 0;
3186 : }
3187 :
3188 : static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
3189 : {
3190 : if (!simple_empty(dentry))
3191 : return -ENOTEMPTY;
3192 :
3193 : drop_nlink(d_inode(dentry));
3194 : drop_nlink(dir);
3195 : return shmem_unlink(dir, dentry);
3196 : }
3197 :
3198 : static int shmem_whiteout(struct mnt_idmap *idmap,
3199 : struct inode *old_dir, struct dentry *old_dentry)
3200 : {
3201 : struct dentry *whiteout;
3202 : int error;
3203 :
3204 : whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
3205 : if (!whiteout)
3206 : return -ENOMEM;
3207 :
3208 : error = shmem_mknod(idmap, old_dir, whiteout,
3209 : S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
3210 : dput(whiteout);
3211 : if (error)
3212 : return error;
3213 :
3214 : /*
3215 : * Cheat and hash the whiteout while the old dentry is still in
3216 : * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
3217 : *
3218 :   * d_lookup() will consistently find one of them at this point;
3219 :   * which one it finds does not actually matter.
3220 : */
3221 : d_rehash(whiteout);
3222 : return 0;
3223 : }
3224 :
3225 : /*
3226 :   * The VFS layer already does all the dentry stuff for rename;
3227 :   * we just have to decrement the usage count for the target if
3228 :   * it exists, so that the VFS layer correctly frees it when it
3229 :   * gets overwritten.
3230 : */
3231 : static int shmem_rename2(struct mnt_idmap *idmap,
3232 : struct inode *old_dir, struct dentry *old_dentry,
3233 : struct inode *new_dir, struct dentry *new_dentry,
3234 : unsigned int flags)
3235 : {
3236 : struct inode *inode = d_inode(old_dentry);
3237 : int they_are_dirs = S_ISDIR(inode->i_mode);
3238 :
3239 : if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3240 : return -EINVAL;
3241 :
3242 : if (flags & RENAME_EXCHANGE)
3243 : return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
3244 :
3245 : if (!simple_empty(new_dentry))
3246 : return -ENOTEMPTY;
3247 :
3248 : if (flags & RENAME_WHITEOUT) {
3249 : int error;
3250 :
3251 : error = shmem_whiteout(idmap, old_dir, old_dentry);
3252 : if (error)
3253 : return error;
3254 : }
3255 :
3256 : if (d_really_is_positive(new_dentry)) {
3257 : (void) shmem_unlink(new_dir, new_dentry);
3258 : if (they_are_dirs) {
3259 : drop_nlink(d_inode(new_dentry));
3260 : drop_nlink(old_dir);
3261 : }
3262 : } else if (they_are_dirs) {
3263 : drop_nlink(old_dir);
3264 : inc_nlink(new_dir);
3265 : }
3266 :
3267 : old_dir->i_size -= BOGO_DIRENT_SIZE;
3268 : new_dir->i_size += BOGO_DIRENT_SIZE;
3269 : old_dir->i_ctime = old_dir->i_mtime =
3270 : new_dir->i_ctime = new_dir->i_mtime =
3271 : inode->i_ctime = current_time(old_dir);
3272 : inode_inc_iversion(old_dir);
3273 : inode_inc_iversion(new_dir);
3274 : return 0;
3275 : }
3276 :
3277 : static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
3278 : struct dentry *dentry, const char *symname)
3279 : {
3280 : int error;
3281 : int len;
3282 : struct inode *inode;
3283 : struct folio *folio;
3284 :
3285 : len = strlen(symname) + 1;
3286 : if (len > PAGE_SIZE)
3287 : return -ENAMETOOLONG;
3288 :
3289 : inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
3290 : VM_NORESERVE);
3291 : if (!inode)
3292 : return -ENOSPC;
3293 :
3294 : error = security_inode_init_security(inode, dir, &dentry->d_name,
3295 : shmem_initxattrs, NULL);
3296 : if (error && error != -EOPNOTSUPP) {
3297 : iput(inode);
3298 : return error;
3299 : }
3300 :
3301 : inode->i_size = len-1;
3302 : if (len <= SHORT_SYMLINK_LEN) {
3303 : inode->i_link = kmemdup(symname, len, GFP_KERNEL);
3304 : if (!inode->i_link) {
3305 : iput(inode);
3306 : return -ENOMEM;
3307 : }
3308 : inode->i_op = &shmem_short_symlink_operations;
3309 : } else {
3310 : inode_nohighmem(inode);
3311 : error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
3312 : if (error) {
3313 : iput(inode);
3314 : return error;
3315 : }
3316 : inode->i_mapping->a_ops = &shmem_aops;
3317 : inode->i_op = &shmem_symlink_inode_operations;
3318 : memcpy(folio_address(folio), symname, len);
3319 : folio_mark_uptodate(folio);
3320 : folio_mark_dirty(folio);
3321 : folio_unlock(folio);
3322 : folio_put(folio);
3323 : }
3324 : dir->i_size += BOGO_DIRENT_SIZE;
3325 : dir->i_ctime = dir->i_mtime = current_time(dir);
3326 : inode_inc_iversion(dir);
3327 : d_instantiate(dentry, inode);
3328 : dget(dentry);
3329 : return 0;
3330 : }
3331 :
3332 : static void shmem_put_link(void *arg)
3333 : {
3334 : folio_mark_accessed(arg);
3335 : folio_put(arg);
3336 : }
3337 :
3338 : static const char *shmem_get_link(struct dentry *dentry,
3339 : struct inode *inode,
3340 : struct delayed_call *done)
3341 : {
3342 : struct folio *folio = NULL;
3343 : int error;
3344 :
3345 : if (!dentry) {
3346 : folio = filemap_get_folio(inode->i_mapping, 0);
3347 : if (IS_ERR(folio))
3348 : return ERR_PTR(-ECHILD);
3349 : if (PageHWPoison(folio_page(folio, 0)) ||
3350 : !folio_test_uptodate(folio)) {
3351 : folio_put(folio);
3352 : return ERR_PTR(-ECHILD);
3353 : }
3354 : } else {
3355 : error = shmem_get_folio(inode, 0, &folio, SGP_READ);
3356 : if (error)
3357 : return ERR_PTR(error);
3358 : if (!folio)
3359 : return ERR_PTR(-ECHILD);
3360 : if (PageHWPoison(folio_page(folio, 0))) {
3361 : folio_unlock(folio);
3362 : folio_put(folio);
3363 : return ERR_PTR(-ECHILD);
3364 : }
3365 : folio_unlock(folio);
3366 : }
3367 : set_delayed_call(done, shmem_put_link, folio);
3368 : return folio_address(folio);
3369 : }
3370 :
3371 : #ifdef CONFIG_TMPFS_XATTR
3372 :
3373 : static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
3374 : {
3375 : struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3376 :
3377 : fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
3378 :
3379 : return 0;
3380 : }
3381 :
3382 : static int shmem_fileattr_set(struct mnt_idmap *idmap,
3383 : struct dentry *dentry, struct fileattr *fa)
3384 : {
3385 : struct inode *inode = d_inode(dentry);
3386 : struct shmem_inode_info *info = SHMEM_I(inode);
3387 :
3388 : if (fileattr_has_fsx(fa))
3389 : return -EOPNOTSUPP;
3390 : if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
3391 : return -EOPNOTSUPP;
3392 :
3393 : info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
3394 : (fa->flags & SHMEM_FL_USER_MODIFIABLE);
3395 :
3396 : shmem_set_inode_flags(inode, info->fsflags);
3397 : inode->i_ctime = current_time(inode);
3398 : inode_inc_iversion(inode);
3399 : return 0;
3400 : }
3401 :
3402 : /*
3403 : * Superblocks without xattr inode operations may get some security.* xattr
3404 : * support from the LSM "for free". As soon as we have any other xattrs
3405 : * like ACLs, we also need to implement the security.* handlers at
3406 : * filesystem level, though.
3407 : */
3408 :
3409 : /*
3410 : * Callback for security_inode_init_security() for acquiring xattrs.
3411 : */
3412 : static int shmem_initxattrs(struct inode *inode,
3413 : const struct xattr *xattr_array,
3414 : void *fs_info)
3415 : {
3416 : struct shmem_inode_info *info = SHMEM_I(inode);
3417 : const struct xattr *xattr;
3418 : struct simple_xattr *new_xattr;
3419 : size_t len;
3420 :
3421 : for (xattr = xattr_array; xattr->name != NULL; xattr++) {
3422 : new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
3423 : if (!new_xattr)
3424 : return -ENOMEM;
3425 :
3426 : len = strlen(xattr->name) + 1;
3427 : new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
3428 : GFP_KERNEL);
3429 : if (!new_xattr->name) {
3430 : kvfree(new_xattr);
3431 : return -ENOMEM;
3432 : }
3433 :
3434 : memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
3435 : XATTR_SECURITY_PREFIX_LEN);
3436 : memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
3437 : xattr->name, len);
3438 :
3439 : simple_xattr_add(&info->xattrs, new_xattr);
3440 : }
3441 :
3442 : return 0;
3443 : }
3444 :
3445 : static int shmem_xattr_handler_get(const struct xattr_handler *handler,
3446 : struct dentry *unused, struct inode *inode,
3447 : const char *name, void *buffer, size_t size)
3448 : {
3449 : struct shmem_inode_info *info = SHMEM_I(inode);
3450 :
3451 : name = xattr_full_name(handler, name);
3452 : return simple_xattr_get(&info->xattrs, name, buffer, size);
3453 : }
3454 :
3455 : static int shmem_xattr_handler_set(const struct xattr_handler *handler,
3456 : struct mnt_idmap *idmap,
3457 : struct dentry *unused, struct inode *inode,
3458 : const char *name, const void *value,
3459 : size_t size, int flags)
3460 : {
3461 : struct shmem_inode_info *info = SHMEM_I(inode);
3462 : int err;
3463 :
3464 : name = xattr_full_name(handler, name);
3465 : err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
3466 : if (!err) {
3467 : inode->i_ctime = current_time(inode);
3468 : inode_inc_iversion(inode);
3469 : }
3470 : return err;
3471 : }
3472 :
3473 : static const struct xattr_handler shmem_security_xattr_handler = {
3474 : .prefix = XATTR_SECURITY_PREFIX,
3475 : .get = shmem_xattr_handler_get,
3476 : .set = shmem_xattr_handler_set,
3477 : };
3478 :
3479 : static const struct xattr_handler shmem_trusted_xattr_handler = {
3480 : .prefix = XATTR_TRUSTED_PREFIX,
3481 : .get = shmem_xattr_handler_get,
3482 : .set = shmem_xattr_handler_set,
3483 : };
3484 :
3485 : static const struct xattr_handler *shmem_xattr_handlers[] = {
3486 : &shmem_security_xattr_handler,
3487 : &shmem_trusted_xattr_handler,
3488 : NULL
3489 : };
3490 :
3491 : static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
3492 : {
3493 : struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3494 : return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
3495 : }
3496 : #endif /* CONFIG_TMPFS_XATTR */
3497 :
3498 : static const struct inode_operations shmem_short_symlink_operations = {
3499 : .getattr = shmem_getattr,
3500 : .get_link = simple_get_link,
3501 : #ifdef CONFIG_TMPFS_XATTR
3502 : .listxattr = shmem_listxattr,
3503 : #endif
3504 : };
3505 :
3506 : static const struct inode_operations shmem_symlink_inode_operations = {
3507 : .getattr = shmem_getattr,
3508 : .get_link = shmem_get_link,
3509 : #ifdef CONFIG_TMPFS_XATTR
3510 : .listxattr = shmem_listxattr,
3511 : #endif
3512 : };
3513 :
3514 : static struct dentry *shmem_get_parent(struct dentry *child)
3515 : {
3516 : return ERR_PTR(-ESTALE);
3517 : }
3518 :
3519 : static int shmem_match(struct inode *ino, void *vfh)
3520 : {
3521 : __u32 *fh = vfh;
3522 : __u64 inum = fh[2];
3523 : inum = (inum << 32) | fh[1];
3524 : return ino->i_ino == inum && fh[0] == ino->i_generation;
3525 : }
3526 :
3527 : /* Find any alias of inode, but prefer a hashed alias */
3528 : static struct dentry *shmem_find_alias(struct inode *inode)
3529 : {
3530 : struct dentry *alias = d_find_alias(inode);
3531 :
3532 : return alias ?: d_find_any_alias(inode);
3533 : }
3534 :
3535 :
3536 : static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
3537 : struct fid *fid, int fh_len, int fh_type)
3538 : {
3539 : struct inode *inode;
3540 : struct dentry *dentry = NULL;
3541 : u64 inum;
3542 :
3543 : if (fh_len < 3)
3544 : return NULL;
3545 :
3546 : inum = fid->raw[2];
3547 : inum = (inum << 32) | fid->raw[1];
3548 :
3549 : inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
3550 : shmem_match, fid->raw);
3551 : if (inode) {
3552 : dentry = shmem_find_alias(inode);
3553 : iput(inode);
3554 : }
3555 :
3556 : return dentry;
3557 : }
3558 :
3559 : static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
3560 : struct inode *parent)
3561 : {
3562 : if (*len < 3) {
3563 : *len = 3;
3564 : return FILEID_INVALID;
3565 : }
3566 :
3567 : if (inode_unhashed(inode)) {
3568 : /* Unfortunately insert_inode_hash is not idempotent,
3569 : * so as we hash inodes here rather than at creation
3570 : * time, we need a lock to ensure we only try
3571 : * to do it once
3572 : */
3573 : static DEFINE_SPINLOCK(lock);
3574 : spin_lock(&lock);
3575 : if (inode_unhashed(inode))
3576 : __insert_inode_hash(inode,
3577 : inode->i_ino + inode->i_generation);
3578 : spin_unlock(&lock);
3579 : }
3580 :
3581 : fh[0] = inode->i_generation;
3582 : fh[1] = inode->i_ino;
3583 : fh[2] = ((__u64)inode->i_ino) >> 32;
3584 :
3585 : *len = 3;
3586 : return 1;
3587 : }
3588 :
3589 : static const struct export_operations shmem_export_ops = {
3590 : .get_parent = shmem_get_parent,
3591 : .encode_fh = shmem_encode_fh,
3592 : .fh_to_dentry = shmem_fh_to_dentry,
3593 : };
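/*
 * For illustration only: the 3-word NFS file handle produced by
 * shmem_encode_fh() and consumed by shmem_fh_to_dentry()/shmem_match():
 *
 *   fh[0] = i_generation
 *   fh[1] = low 32 bits of i_ino
 *   fh[2] = high 32 bits of i_ino
 *
 * The inode is hashed under (i_ino + i_generation), which is also the key
 * ilookup5() uses when the handle is decoded.
 */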
3594 :
3595 : enum shmem_param {
3596 : Opt_gid,
3597 : Opt_huge,
3598 : Opt_mode,
3599 : Opt_mpol,
3600 : Opt_nr_blocks,
3601 : Opt_nr_inodes,
3602 : Opt_size,
3603 : Opt_uid,
3604 : Opt_inode32,
3605 : Opt_inode64,
3606 : Opt_noswap,
3607 : };
3608 :
3609 : static const struct constant_table shmem_param_enums_huge[] = {
3610 : {"never", SHMEM_HUGE_NEVER },
3611 : {"always", SHMEM_HUGE_ALWAYS },
3612 : {"within_size", SHMEM_HUGE_WITHIN_SIZE },
3613 : {"advise", SHMEM_HUGE_ADVISE },
3614 : {}
3615 : };
3616 :
3617 : const struct fs_parameter_spec shmem_fs_parameters[] = {
3618 : fsparam_u32 ("gid", Opt_gid),
3619 : fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
3620 : fsparam_u32oct("mode", Opt_mode),
3621 : fsparam_string("mpol", Opt_mpol),
3622 : fsparam_string("nr_blocks", Opt_nr_blocks),
3623 : fsparam_string("nr_inodes", Opt_nr_inodes),
3624 : fsparam_string("size", Opt_size),
3625 : fsparam_u32 ("uid", Opt_uid),
3626 : fsparam_flag ("inode32", Opt_inode32),
3627 : fsparam_flag ("inode64", Opt_inode64),
3628 : fsparam_flag ("noswap", Opt_noswap),
3629 : {}
3630 : };
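/*
 * For illustration only (not part of this file): the parameter table above
 * corresponds to mount invocations such as
 *
 *   mount -t tmpfs -o size=50%,nr_inodes=1m,mode=1777,huge=within_size tmpfs /mnt/tmp
 *
 * where "size" takes k/m/g suffixes or a trailing '%' of total RAM, as
 * handled by shmem_parse_one() below.
 */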
3631 :
3632 : static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
3633 : {
3634 : struct shmem_options *ctx = fc->fs_private;
3635 : struct fs_parse_result result;
3636 : unsigned long long size;
3637 : char *rest;
3638 : int opt;
3639 :
3640 : opt = fs_parse(fc, shmem_fs_parameters, param, &result);
3641 : if (opt < 0)
3642 : return opt;
3643 :
3644 : switch (opt) {
3645 : case Opt_size:
3646 : size = memparse(param->string, &rest);
3647 : if (*rest == '%') {
3648 : size <<= PAGE_SHIFT;
3649 : size *= totalram_pages();
3650 : do_div(size, 100);
3651 : rest++;
3652 : }
3653 : if (*rest)
3654 : goto bad_value;
3655 : ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3656 : ctx->seen |= SHMEM_SEEN_BLOCKS;
3657 : break;
3658 : case Opt_nr_blocks:
3659 : ctx->blocks = memparse(param->string, &rest);
3660 : if (*rest || ctx->blocks > S64_MAX)
3661 : goto bad_value;
3662 : ctx->seen |= SHMEM_SEEN_BLOCKS;
3663 : break;
3664 : case Opt_nr_inodes:
3665 : ctx->inodes = memparse(param->string, &rest);
3666 : if (*rest)
3667 : goto bad_value;
3668 : ctx->seen |= SHMEM_SEEN_INODES;
3669 : break;
3670 : case Opt_mode:
3671 : ctx->mode = result.uint_32 & 07777;
3672 : break;
3673 : case Opt_uid:
3674 : ctx->uid = make_kuid(current_user_ns(), result.uint_32);
3675 : if (!uid_valid(ctx->uid))
3676 : goto bad_value;
3677 : break;
3678 : case Opt_gid:
3679 : ctx->gid = make_kgid(current_user_ns(), result.uint_32);
3680 : if (!gid_valid(ctx->gid))
3681 : goto bad_value;
3682 : break;
3683 : case Opt_huge:
3684 : ctx->huge = result.uint_32;
3685 : if (ctx->huge != SHMEM_HUGE_NEVER &&
3686 : !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3687 : has_transparent_hugepage()))
3688 : goto unsupported_parameter;
3689 : ctx->seen |= SHMEM_SEEN_HUGE;
3690 : break;
3691 : case Opt_mpol:
3692 : if (IS_ENABLED(CONFIG_NUMA)) {
3693 : mpol_put(ctx->mpol);
3694 : ctx->mpol = NULL;
3695 : if (mpol_parse_str(param->string, &ctx->mpol))
3696 : goto bad_value;
3697 : break;
3698 : }
3699 : goto unsupported_parameter;
3700 : case Opt_inode32:
3701 : ctx->full_inums = false;
3702 : ctx->seen |= SHMEM_SEEN_INUMS;
3703 : break;
3704 : case Opt_inode64:
3705 : if (sizeof(ino_t) < 8) {
3706 : return invalfc(fc,
3707 : "Cannot use inode64 with <64bit inums in kernel\n");
3708 : }
3709 : ctx->full_inums = true;
3710 : ctx->seen |= SHMEM_SEEN_INUMS;
3711 : break;
3712 : case Opt_noswap:
3713 : if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
3714 : return invalfc(fc,
3715 : "Turning off swap in unprivileged tmpfs mounts unsupported");
3716 : }
3717 : ctx->noswap = true;
3718 : ctx->seen |= SHMEM_SEEN_NOSWAP;
3719 : break;
3720 : }
3721 : return 0;
3722 :
3723 : unsupported_parameter:
3724 : return invalfc(fc, "Unsupported parameter '%s'", param->key);
3725 : bad_value:
3726 : return invalfc(fc, "Bad value for '%s'", param->key);
3727 : }
3728 :
3729 : static int shmem_parse_options(struct fs_context *fc, void *data)
3730 : {
3731 : char *options = data;
3732 :
3733 : if (options) {
3734 : int err = security_sb_eat_lsm_opts(options, &fc->security);
3735 : if (err)
3736 : return err;
3737 : }
3738 :
3739 : while (options != NULL) {
3740 : char *this_char = options;
3741 : for (;;) {
3742 : /*
3743 : * NUL-terminate this option: unfortunately,
3744 : * mount options form a comma-separated list,
3745 : * but mpol's nodelist may also contain commas.
3746 : */
3747 : options = strchr(options, ',');
3748 : if (options == NULL)
3749 : break;
3750 : options++;
3751 : if (!isdigit(*options)) {
3752 : options[-1] = '\0';
3753 : break;
3754 : }
3755 : }
3756 : if (*this_char) {
3757 : char *value = strchr(this_char, '=');
3758 : size_t len = 0;
3759 : int err;
3760 :
3761 : if (value) {
3762 : *value++ = '\0';
3763 : len = strlen(value);
3764 : }
3765 : err = vfs_parse_fs_string(fc, this_char, value, len);
3766 : if (err < 0)
3767 : return err;
3768 : }
3769 : }
3770 : return 0;
3771 : }
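/*
 * For illustration only: the splitting loop above keeps commas that belong
 * to an mpol nodelist together, so a monolithic option string such as
 *
 *   "mpol=bind:0-3,5,size=1g"
 *
 * is split into "mpol=bind:0-3,5" and "size=1g" rather than at every comma,
 * because a comma followed by a digit is taken to continue the nodelist.
 */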
3772 :
3773 : /*
3774 : * Reconfigure a shmem filesystem.
3775 : *
3776 : * Note that we disallow change from limited->unlimited blocks/inodes while any
3777 : * are in use; but we must separately disallow unlimited->limited, because in
3778 : * that case we have no record of how much is already in use.
3779 : */
3780 : static int shmem_reconfigure(struct fs_context *fc)
3781 : {
3782 : struct shmem_options *ctx = fc->fs_private;
3783 : struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
3784 : unsigned long inodes;
3785 : struct mempolicy *mpol = NULL;
3786 : const char *err;
3787 :
3788 : raw_spin_lock(&sbinfo->stat_lock);
3789 : inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3790 :
3791 : if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3792 : if (!sbinfo->max_blocks) {
3793 : err = "Cannot retroactively limit size";
3794 : goto out;
3795 : }
3796 : if (percpu_counter_compare(&sbinfo->used_blocks,
3797 : ctx->blocks) > 0) {
3798 : err = "Too small a size for current use";
3799 : goto out;
3800 : }
3801 : }
3802 : if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3803 : if (!sbinfo->max_inodes) {
3804 : err = "Cannot retroactively limit inodes";
3805 : goto out;
3806 : }
3807 : if (ctx->inodes < inodes) {
3808 : err = "Too few inodes for current use";
3809 : goto out;
3810 : }
3811 : }
3812 :
3813 : if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
3814 : sbinfo->next_ino > UINT_MAX) {
3815 : err = "Current inum too high to switch to 32-bit inums";
3816 : goto out;
3817 : }
3818 : if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
3819 : err = "Cannot disable swap on remount";
3820 : goto out;
3821 : }
3822 : if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
3823 : err = "Cannot enable swap on remount if it was disabled on first mount";
3824 : goto out;
3825 : }
3826 :
3827 : if (ctx->seen & SHMEM_SEEN_HUGE)
3828 : sbinfo->huge = ctx->huge;
3829 : if (ctx->seen & SHMEM_SEEN_INUMS)
3830 : sbinfo->full_inums = ctx->full_inums;
3831 : if (ctx->seen & SHMEM_SEEN_BLOCKS)
3832 : sbinfo->max_blocks = ctx->blocks;
3833 : if (ctx->seen & SHMEM_SEEN_INODES) {
3834 : sbinfo->max_inodes = ctx->inodes;
3835 : sbinfo->free_inodes = ctx->inodes - inodes;
3836 : }
3837 :
3838 : /*
3839 : * Preserve previous mempolicy unless mpol remount option was specified.
3840 : */
3841 : if (ctx->mpol) {
3842 : mpol = sbinfo->mpol;
3843 : sbinfo->mpol = ctx->mpol; /* transfers initial ref */
3844 : ctx->mpol = NULL;
3845 : }
3846 :
3847 : if (ctx->noswap)
3848 : sbinfo->noswap = true;
3849 :
3850 : raw_spin_unlock(&sbinfo->stat_lock);
3851 : mpol_put(mpol);
3852 : return 0;
3853 : out:
3854 : raw_spin_unlock(&sbinfo->stat_lock);
3855 : return invalfc(fc, "%s", err);
3856 : }
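/*
 * For illustration only: this path is reached on remount, e.g.
 *
 *   mount -o remount,size=2g,nr_inodes=400k /dev/shm
 *
 * and the checks above mean a mount created without limits cannot later be
 * given one, nor can a limit be set below what is currently in use.
 */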
3857 :
3858 : static int shmem_show_options(struct seq_file *seq, struct dentry *root)
3859 : {
3860 : struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
3861 : struct mempolicy *mpol;
3862 :
3863 : if (sbinfo->max_blocks != shmem_default_max_blocks())
3864 : seq_printf(seq, ",size=%luk",
3865 : sbinfo->max_blocks << (PAGE_SHIFT - 10));
3866 : if (sbinfo->max_inodes != shmem_default_max_inodes())
3867 : seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
3868 : if (sbinfo->mode != (0777 | S_ISVTX))
3869 : seq_printf(seq, ",mode=%03ho", sbinfo->mode);
3870 : if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
3871 : seq_printf(seq, ",uid=%u",
3872 : from_kuid_munged(&init_user_ns, sbinfo->uid));
3873 : if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
3874 : seq_printf(seq, ",gid=%u",
3875 : from_kgid_munged(&init_user_ns, sbinfo->gid));
3876 :
3877 : /*
3878 : * Showing inode{64,32} might be useful even if it's the system default,
3879 : * since then people don't have to resort to checking both here and
3880 : * /proc/config.gz to confirm 64-bit inums were successfully applied
3881 : * (which may not even exist if IKCONFIG_PROC isn't enabled).
3882 : *
3883 : * We hide it when inode64 isn't the default and we are using 32-bit
3884 : * inodes, since that probably just means the feature isn't even under
3885 : * consideration.
3886 : *
3887 : * As such:
3888 : *
3889 : *                    +-----------------+-----------------+
3890 : *                    | TMPFS_INODE64=y | TMPFS_INODE64=n |
3891 : * +------------------+-----------------+-----------------+
3892 : * | full_inums=true  | show            | show            |
3893 : * | full_inums=false | show            | hide            |
3894 : * +------------------+-----------------+-----------------+
3895 : *
3896 : */
3897 : if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
3898 : seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
3899 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3900 : /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
3901 : if (sbinfo->huge)
3902 : seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
3903 : #endif
3904 : mpol = shmem_get_sbmpol(sbinfo);
3905 : shmem_show_mpol(seq, mpol);
3906 : mpol_put(mpol);
3907 : if (sbinfo->noswap)
3908 : seq_printf(seq, ",noswap");
3909 : return 0;
3910 : }
3911 :
3912 : #endif /* CONFIG_TMPFS */
3913 :
3914 0 : static void shmem_put_super(struct super_block *sb)
3915 : {
3916 0 : struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3917 :
3918 0 : free_percpu(sbinfo->ino_batch);
3919 0 : percpu_counter_destroy(&sbinfo->used_blocks);
3920 0 : mpol_put(sbinfo->mpol);
3921 0 : kfree(sbinfo);
3922 0 : sb->s_fs_info = NULL;
3923 0 : }
3924 :
3925 1 : static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
3926 : {
3927 1 : struct shmem_options *ctx = fc->fs_private;
3928 : struct inode *inode;
3929 : struct shmem_sb_info *sbinfo;
3930 :
3931 : /* Round up to L1_CACHE_BYTES to resist false sharing */
3932 1 : sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
3933 : L1_CACHE_BYTES), GFP_KERNEL);
3934 1 : if (!sbinfo)
3935 : return -ENOMEM;
3936 :
3937 1 : sb->s_fs_info = sbinfo;
3938 :
3939 : #ifdef CONFIG_TMPFS
3940 : /*
3941 : * By default we only allow half of the physical RAM per
3942 : * tmpfs instance, limiting inodes to one per page of lowmem;
3943 : * but the internal instance is left unlimited.
3944 : */
3945 : if (!(sb->s_flags & SB_KERNMOUNT)) {
3946 : if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3947 : ctx->blocks = shmem_default_max_blocks();
3948 : if (!(ctx->seen & SHMEM_SEEN_INODES))
3949 : ctx->inodes = shmem_default_max_inodes();
3950 : if (!(ctx->seen & SHMEM_SEEN_INUMS))
3951 : ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
3952 : sbinfo->noswap = ctx->noswap;
3953 : } else {
3954 : sb->s_flags |= SB_NOUSER;
3955 : }
3956 : sb->s_export_op = &shmem_export_ops;
3957 : sb->s_flags |= SB_NOSEC | SB_I_VERSION;
3958 : #else
3959 1 : sb->s_flags |= SB_NOUSER;
3960 : #endif
3961 1 : sbinfo->max_blocks = ctx->blocks;
3962 1 : sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
3963 1 : if (sb->s_flags & SB_KERNMOUNT) {
3964 1 : sbinfo->ino_batch = alloc_percpu(ino_t);
3965 1 : if (!sbinfo->ino_batch)
3966 : goto failed;
3967 : }
3968 1 : sbinfo->uid = ctx->uid;
3969 1 : sbinfo->gid = ctx->gid;
3970 1 : sbinfo->full_inums = ctx->full_inums;
3971 1 : sbinfo->mode = ctx->mode;
3972 1 : sbinfo->huge = ctx->huge;
3973 1 : sbinfo->mpol = ctx->mpol;
3974 1 : ctx->mpol = NULL;
3975 :
3976 : raw_spin_lock_init(&sbinfo->stat_lock);
3977 2 : if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
3978 : goto failed;
3979 1 : spin_lock_init(&sbinfo->shrinklist_lock);
3980 2 : INIT_LIST_HEAD(&sbinfo->shrinklist);
3981 :
3982 1 : sb->s_maxbytes = MAX_LFS_FILESIZE;
3983 1 : sb->s_blocksize = PAGE_SIZE;
3984 1 : sb->s_blocksize_bits = PAGE_SHIFT;
3985 1 : sb->s_magic = TMPFS_MAGIC;
3986 1 : sb->s_op = &shmem_ops;
3987 1 : sb->s_time_gran = 1;
3988 : #ifdef CONFIG_TMPFS_XATTR
3989 : sb->s_xattr = shmem_xattr_handlers;
3990 : #endif
3991 : #ifdef CONFIG_TMPFS_POSIX_ACL
3992 : sb->s_flags |= SB_POSIXACL;
3993 : #endif
3994 1 : uuid_gen(&sb->s_uuid);
3995 :
3996 1 : inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
3997 : VM_NORESERVE);
3998 1 : if (!inode)
3999 : goto failed;
4000 1 : inode->i_uid = sbinfo->uid;
4001 1 : inode->i_gid = sbinfo->gid;
4002 1 : sb->s_root = d_make_root(inode);
4003 1 : if (!sb->s_root)
4004 : goto failed;
4005 : return 0;
4006 :
4007 : failed:
4008 0 : shmem_put_super(sb);
4009 0 : return -ENOMEM;
4010 : }
4011 :
4012 1 : static int shmem_get_tree(struct fs_context *fc)
4013 : {
4014 1 : return get_tree_nodev(fc, shmem_fill_super);
4015 : }
4016 :
4017 1 : static void shmem_free_fc(struct fs_context *fc)
4018 : {
4019 1 : struct shmem_options *ctx = fc->fs_private;
4020 :
4021 1 : if (ctx) {
4022 1 : mpol_put(ctx->mpol);
4023 1 : kfree(ctx);
4024 : }
4025 1 : }
4026 :
4027 : static const struct fs_context_operations shmem_fs_context_ops = {
4028 : .free = shmem_free_fc,
4029 : .get_tree = shmem_get_tree,
4030 : #ifdef CONFIG_TMPFS
4031 : .parse_monolithic = shmem_parse_options,
4032 : .parse_param = shmem_parse_one,
4033 : .reconfigure = shmem_reconfigure,
4034 : #endif
4035 : };
4036 :
4037 : static struct kmem_cache *shmem_inode_cachep;
4038 :
4039 1 : static struct inode *shmem_alloc_inode(struct super_block *sb)
4040 : {
4041 : struct shmem_inode_info *info;
4042 2 : info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
4043 1 : if (!info)
4044 : return NULL;
4045 1 : return &info->vfs_inode;
4046 : }
4047 :
4048 0 : static void shmem_free_in_core_inode(struct inode *inode)
4049 : {
4050 0 : if (S_ISLNK(inode->i_mode))
4051 0 : kfree(inode->i_link);
4052 0 : kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
4053 0 : }
4054 :
4055 0 : static void shmem_destroy_inode(struct inode *inode)
4056 : {
4057 : if (S_ISREG(inode->i_mode))
4058 : mpol_free_shared_policy(&SHMEM_I(inode)->policy);
4059 0 : }
4060 :
4061 12 : static void shmem_init_inode(void *foo)
4062 : {
4063 12 : struct shmem_inode_info *info = foo;
4064 12 : inode_init_once(&info->vfs_inode);
4065 12 : }
4066 :
4067 : static void shmem_init_inodecache(void)
4068 : {
4069 1 : shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
4070 : sizeof(struct shmem_inode_info),
4071 : 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
4072 : }
4073 :
4074 : static void shmem_destroy_inodecache(void)
4075 : {
4076 0 : kmem_cache_destroy(shmem_inode_cachep);
4077 : }
4078 :
4079 : /* Keep the page in page cache instead of truncating it */
4080 0 : static int shmem_error_remove_page(struct address_space *mapping,
4081 : struct page *page)
4082 : {
4083 0 : return 0;
4084 : }
4085 :
4086 : const struct address_space_operations shmem_aops = {
4087 : .writepage = shmem_writepage,
4088 : .dirty_folio = noop_dirty_folio,
4089 : #ifdef CONFIG_TMPFS
4090 : .write_begin = shmem_write_begin,
4091 : .write_end = shmem_write_end,
4092 : #endif
4093 : #ifdef CONFIG_MIGRATION
4094 : .migrate_folio = migrate_folio,
4095 : #endif
4096 : .error_remove_page = shmem_error_remove_page,
4097 : };
4098 : EXPORT_SYMBOL(shmem_aops);
4099 :
4100 : static const struct file_operations shmem_file_operations = {
4101 : .mmap = shmem_mmap,
4102 : .open = generic_file_open,
4103 : .get_unmapped_area = shmem_get_unmapped_area,
4104 : #ifdef CONFIG_TMPFS
4105 : .llseek = shmem_file_llseek,
4106 : .read_iter = shmem_file_read_iter,
4107 : .write_iter = generic_file_write_iter,
4108 : .fsync = noop_fsync,
4109 : .splice_read = shmem_file_splice_read,
4110 : .splice_write = iter_file_splice_write,
4111 : .fallocate = shmem_fallocate,
4112 : #endif
4113 : };
4114 :
4115 : static const struct inode_operations shmem_inode_operations = {
4116 : .getattr = shmem_getattr,
4117 : .setattr = shmem_setattr,
4118 : #ifdef CONFIG_TMPFS_XATTR
4119 : .listxattr = shmem_listxattr,
4120 : .set_acl = simple_set_acl,
4121 : .fileattr_get = shmem_fileattr_get,
4122 : .fileattr_set = shmem_fileattr_set,
4123 : #endif
4124 : };
4125 :
4126 : static const struct inode_operations shmem_dir_inode_operations = {
4127 : #ifdef CONFIG_TMPFS
4128 : .getattr = shmem_getattr,
4129 : .create = shmem_create,
4130 : .lookup = simple_lookup,
4131 : .link = shmem_link,
4132 : .unlink = shmem_unlink,
4133 : .symlink = shmem_symlink,
4134 : .mkdir = shmem_mkdir,
4135 : .rmdir = shmem_rmdir,
4136 : .mknod = shmem_mknod,
4137 : .rename = shmem_rename2,
4138 : .tmpfile = shmem_tmpfile,
4139 : #endif
4140 : #ifdef CONFIG_TMPFS_XATTR
4141 : .listxattr = shmem_listxattr,
4142 : .fileattr_get = shmem_fileattr_get,
4143 : .fileattr_set = shmem_fileattr_set,
4144 : #endif
4145 : #ifdef CONFIG_TMPFS_POSIX_ACL
4146 : .setattr = shmem_setattr,
4147 : .set_acl = simple_set_acl,
4148 : #endif
4149 : };
4150 :
4151 : static const struct inode_operations shmem_special_inode_operations = {
4152 : .getattr = shmem_getattr,
4153 : #ifdef CONFIG_TMPFS_XATTR
4154 : .listxattr = shmem_listxattr,
4155 : #endif
4156 : #ifdef CONFIG_TMPFS_POSIX_ACL
4157 : .setattr = shmem_setattr,
4158 : .set_acl = simple_set_acl,
4159 : #endif
4160 : };
4161 :
4162 : static const struct super_operations shmem_ops = {
4163 : .alloc_inode = shmem_alloc_inode,
4164 : .free_inode = shmem_free_in_core_inode,
4165 : .destroy_inode = shmem_destroy_inode,
4166 : #ifdef CONFIG_TMPFS
4167 : .statfs = shmem_statfs,
4168 : .show_options = shmem_show_options,
4169 : #endif
4170 : .evict_inode = shmem_evict_inode,
4171 : .drop_inode = generic_delete_inode,
4172 : .put_super = shmem_put_super,
4173 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4174 : .nr_cached_objects = shmem_unused_huge_count,
4175 : .free_cached_objects = shmem_unused_huge_scan,
4176 : #endif
4177 : };
4178 :
4179 : static const struct vm_operations_struct shmem_vm_ops = {
4180 : .fault = shmem_fault,
4181 : .map_pages = filemap_map_pages,
4182 : #ifdef CONFIG_NUMA
4183 : .set_policy = shmem_set_policy,
4184 : .get_policy = shmem_get_policy,
4185 : #endif
4186 : };
4187 :
4188 : static const struct vm_operations_struct shmem_anon_vm_ops = {
4189 : .fault = shmem_fault,
4190 : .map_pages = filemap_map_pages,
4191 : #ifdef CONFIG_NUMA
4192 : .set_policy = shmem_set_policy,
4193 : .get_policy = shmem_get_policy,
4194 : #endif
4195 : };
4196 :
4197 1 : int shmem_init_fs_context(struct fs_context *fc)
4198 : {
4199 : struct shmem_options *ctx;
4200 :
4201 1 : ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
4202 1 : if (!ctx)
4203 : return -ENOMEM;
4204 :
4205 1 : ctx->mode = 0777 | S_ISVTX;
4206 1 : ctx->uid = current_fsuid();
4207 1 : ctx->gid = current_fsgid();
4208 :
4209 1 : fc->fs_private = ctx;
4210 1 : fc->ops = &shmem_fs_context_ops;
4211 1 : return 0;
4212 : }
4213 :
4214 : static struct file_system_type shmem_fs_type = {
4215 : .owner = THIS_MODULE,
4216 : .name = "tmpfs",
4217 : .init_fs_context = shmem_init_fs_context,
4218 : #ifdef CONFIG_TMPFS
4219 : .parameters = shmem_fs_parameters,
4220 : #endif
4221 : .kill_sb = kill_litter_super,
4222 : #ifdef CONFIG_SHMEM
4223 : .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
4224 : #else
4225 : .fs_flags = FS_USERNS_MOUNT,
4226 : #endif
4227 : };
4228 :
4229 1 : void __init shmem_init(void)
4230 : {
4231 : int error;
4232 :
4233 : shmem_init_inodecache();
4234 :
4235 1 : error = register_filesystem(&shmem_fs_type);
4236 1 : if (error) {
4237 0 : pr_err("Could not register tmpfs\n");
4238 0 : goto out2;
4239 : }
4240 :
4241 1 : shm_mnt = kern_mount(&shmem_fs_type);
4242 2 : if (IS_ERR(shm_mnt)) {
4243 0 : error = PTR_ERR(shm_mnt);
4244 0 : pr_err("Could not kern_mount tmpfs\n");
4245 : goto out1;
4246 : }
4247 :
4248 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4249 : if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
4250 : SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
4251 : else
4252 : shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
4253 : #endif
4254 : return;
4255 :
4256 : out1:
4257 0 : unregister_filesystem(&shmem_fs_type);
4258 : out2:
4259 : shmem_destroy_inodecache();
4260 0 : shm_mnt = ERR_PTR(error);
4261 : }
4262 :
4263 : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
4264 : static ssize_t shmem_enabled_show(struct kobject *kobj,
4265 : struct kobj_attribute *attr, char *buf)
4266 : {
4267 : static const int values[] = {
4268 : SHMEM_HUGE_ALWAYS,
4269 : SHMEM_HUGE_WITHIN_SIZE,
4270 : SHMEM_HUGE_ADVISE,
4271 : SHMEM_HUGE_NEVER,
4272 : SHMEM_HUGE_DENY,
4273 : SHMEM_HUGE_FORCE,
4274 : };
4275 : int len = 0;
4276 : int i;
4277 :
4278 : for (i = 0; i < ARRAY_SIZE(values); i++) {
4279 : len += sysfs_emit_at(buf, len,
4280 : shmem_huge == values[i] ? "%s[%s]" : "%s%s",
4281 : i ? " " : "",
4282 : shmem_format_huge(values[i]));
4283 : }
4284 :
4285 : len += sysfs_emit_at(buf, len, "\n");
4286 :
4287 : return len;
4288 : }
4289 :
4290 : static ssize_t shmem_enabled_store(struct kobject *kobj,
4291 : struct kobj_attribute *attr, const char *buf, size_t count)
4292 : {
4293 : char tmp[16];
4294 : int huge;
4295 :
4296 : if (count + 1 > sizeof(tmp))
4297 : return -EINVAL;
4298 : memcpy(tmp, buf, count);
4299 : tmp[count] = '\0';
4300 : if (count && tmp[count - 1] == '\n')
4301 : tmp[count - 1] = '\0';
4302 :
4303 : huge = shmem_parse_huge(tmp);
4304 : if (huge == -EINVAL)
4305 : return -EINVAL;
4306 : if (!has_transparent_hugepage() &&
4307 : huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
4308 : return -EINVAL;
4309 :
4310 : shmem_huge = huge;
4311 : if (shmem_huge > SHMEM_HUGE_DENY)
4312 : SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
4313 : return count;
4314 : }
4315 :
4316 : struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
4317 : #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
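/*
 * For illustration only: the attribute above appears as
 *
 *   /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *
 * so, for example,
 *
 *   echo within_size > /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *
 * updates shmem_huge and, via the store handler, the huge policy of the
 * kernel-internal tmpfs mount.
 */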
4318 :
4319 : #else /* !CONFIG_SHMEM */
4320 :
4321 : /*
4322 : * tiny-shmem: simple shmemfs and tmpfs using ramfs code
4323 : *
4324 : * This is intended for small systems where the benefits of the full
4325 : * shmem code (swap-backed and resource-limited) are outweighed by
4326 : * its complexity. On systems without swap this code should be
4327 : * effectively equivalent, but much lighter weight.
4328 : */
4329 :
4330 : static struct file_system_type shmem_fs_type = {
4331 : .name = "tmpfs",
4332 : .init_fs_context = ramfs_init_fs_context,
4333 : .parameters = ramfs_fs_parameters,
4334 : .kill_sb = ramfs_kill_sb,
4335 : .fs_flags = FS_USERNS_MOUNT,
4336 : };
4337 :
4338 : void __init shmem_init(void)
4339 : {
4340 : BUG_ON(register_filesystem(&shmem_fs_type) != 0);
4341 :
4342 : shm_mnt = kern_mount(&shmem_fs_type);
4343 : BUG_ON(IS_ERR(shm_mnt));
4344 : }
4345 :
4346 : int shmem_unuse(unsigned int type)
4347 : {
4348 : return 0;
4349 : }
4350 :
4351 : int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
4352 : {
4353 : return 0;
4354 : }
4355 :
4356 : void shmem_unlock_mapping(struct address_space *mapping)
4357 : {
4358 : }
4359 :
4360 : #ifdef CONFIG_MMU
4361 : unsigned long shmem_get_unmapped_area(struct file *file,
4362 : unsigned long addr, unsigned long len,
4363 : unsigned long pgoff, unsigned long flags)
4364 : {
4365 : return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
4366 : }
4367 : #endif
4368 :
4369 : void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
4370 : {
4371 : truncate_inode_pages_range(inode->i_mapping, lstart, lend);
4372 : }
4373 : EXPORT_SYMBOL_GPL(shmem_truncate_range);
4374 :
4375 : #define shmem_vm_ops generic_file_vm_ops
4376 : #define shmem_anon_vm_ops generic_file_vm_ops
4377 : #define shmem_file_operations ramfs_file_operations
4378 : #define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
4379 : #define shmem_acct_size(flags, size) 0
4380 : #define shmem_unacct_size(flags, size) do {} while (0)
4381 :
4382 : #endif /* CONFIG_SHMEM */
4383 :
4384 : /* common code */
4385 :
4386 0 : static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
4387 : unsigned long flags, unsigned int i_flags)
4388 : {
4389 : struct inode *inode;
4390 : struct file *res;
4391 :
4392 0 : if (IS_ERR(mnt))
4393 : return ERR_CAST(mnt);
4394 :
4395 0 : if (size < 0 || size > MAX_LFS_FILESIZE)
4396 : return ERR_PTR(-EINVAL);
4397 :
4398 0 : if (shmem_acct_size(flags, size))
4399 : return ERR_PTR(-ENOMEM);
4400 :
4401 0 : if (is_idmapped_mnt(mnt))
4402 : return ERR_PTR(-EINVAL);
4403 :
4404 0 : inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
4405 : S_IFREG | S_IRWXUGO, 0, flags);
4406 0 : if (unlikely(!inode)) {
4407 : shmem_unacct_size(flags, size);
4408 : return ERR_PTR(-ENOSPC);
4409 : }
4410 0 : inode->i_flags |= i_flags;
4411 0 : inode->i_size = size;
4412 0 : clear_nlink(inode); /* It is unlinked */
4413 0 : res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
4414 0 : if (!IS_ERR(res))
4415 0 : res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
4416 : &shmem_file_operations);
4417 0 : if (IS_ERR(res))
4418 0 : iput(inode);
4419 : return res;
4420 : }
4421 :
4422 : /**
4423 : * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
4424 : * kernel internal. There will be NO LSM permission checks against the
4425 : * underlying inode. So users of this interface must do LSM checks at a
4426 : * higher layer. The users are the big_key and shm implementations. LSM
4427 : * checks are provided at the key or shm level rather than the inode.
4428 : * @name: name for dentry (to be seen in /proc/<pid>/maps)
4429 : * @size: size to be set for the file
4430 : * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4431 : */
4432 0 : struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
4433 : {
4434 0 : return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
4435 : }
4436 :
4437 : /**
4438 : * shmem_file_setup - get an unlinked file living in tmpfs
4439 : * @name: name for dentry (to be seen in /proc/<pid>/maps)
4440 : * @size: size to be set for the file
4441 : * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4442 : */
4443 0 : struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
4444 : {
4445 0 : return __shmem_file_setup(shm_mnt, name, size, flags, 0);
4446 : }
4447 : EXPORT_SYMBOL_GPL(shmem_file_setup);
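/*
 * For illustration only, a minimal sketch of typical in-kernel usage (the
 * function and object name are made up):
 */
#if 0
static struct file *example_backing_file(loff_t size)
{
	/* VM_NORESERVE: do not pre-account the whole size up front */
	return shmem_file_setup("example-object", size, VM_NORESERVE);
}
#endif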
4448 :
4449 : /**
4450 : * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
4451 : * @mnt: the tmpfs mount where the file will be created
4452 : * @name: name for dentry (to be seen in /proc/<pid>/maps)
4453 : * @size: size to be set for the file
4454 : * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4455 : */
4456 0 : struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
4457 : loff_t size, unsigned long flags)
4458 : {
4459 0 : return __shmem_file_setup(mnt, name, size, flags, 0);
4460 : }
4461 : EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
4462 :
4463 : /**
4464 : * shmem_zero_setup - setup a shared anonymous mapping
4465 : * @vma: the vma to be mmapped is prepared by do_mmap
4466 : */
4467 0 : int shmem_zero_setup(struct vm_area_struct *vma)
4468 : {
4469 : struct file *file;
4470 0 : loff_t size = vma->vm_end - vma->vm_start;
4471 :
4472 : /*
4473 : * Cloning a new file under mmap_lock leads to a lock ordering conflict
4474 : * between XFS directory reading and selinux: since this file is only
4475 : * accessible to the user through its mapping, use S_PRIVATE flag to
4476 : * bypass file security, in the same way as shmem_kernel_file_setup().
4477 : */
4478 0 : file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
4479 0 : if (IS_ERR(file))
4480 0 : return PTR_ERR(file);
4481 :
4482 0 : if (vma->vm_file)
4483 0 : fput(vma->vm_file);
4484 0 : vma->vm_file = file;
4485 0 : vma->vm_ops = &shmem_anon_vm_ops;
4486 :
4487 0 : return 0;
4488 : }
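/*
 * For illustration only: this is the path taken by shared anonymous
 * mappings, e.g. a userspace call like
 *
 *   p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *            MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 *
 * ends up here, with the vma backed by an unlinked "dev/zero" tmpfs file.
 */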
4489 :
4490 : /**
4491 : * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
4492 : * @mapping: the folio's address_space
4493 : * @index: the folio index
4494 : * @gfp: the page allocator flags to use if allocating
4495 : *
4496 : * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4497 : * with any new page allocations done using the specified allocation flags.
4498 : * But read_cache_page_gfp() uses the ->read_folio() method: which does not
4499 : * suit tmpfs, since it may have pages in swapcache, and needs to find those
4500 : * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
4501 : *
4502 : * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
4503 : * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4504 : */
4505 0 : struct folio *shmem_read_folio_gfp(struct address_space *mapping,
4506 : pgoff_t index, gfp_t gfp)
4507 : {
4508 : #ifdef CONFIG_SHMEM
4509 0 : struct inode *inode = mapping->host;
4510 : struct folio *folio;
4511 : int error;
4512 :
4513 0 : BUG_ON(!shmem_mapping(mapping));
4514 0 : error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
4515 : gfp, NULL, NULL, NULL);
4516 0 : if (error)
4517 0 : return ERR_PTR(error);
4518 :
4519 0 : folio_unlock(folio);
4520 0 : return folio;
4521 : #else
4522 : /*
4523 : * The tiny !SHMEM case uses ramfs without swap
4524 : */
4525 : return mapping_read_folio_gfp(mapping, index, gfp);
4526 : #endif
4527 : }
4528 : EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
4529 :
4530 0 : struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4531 : pgoff_t index, gfp_t gfp)
4532 : {
4533 0 : struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
4534 : struct page *page;
4535 :
4536 0 : if (IS_ERR(folio))
4537 0 : return &folio->page;
4538 :
4539 0 : page = folio_file_page(folio, index);
4540 : if (PageHWPoison(page)) {
4541 : folio_put(folio);
4542 : return ERR_PTR(-EIO);
4543 : }
4544 :
4545 0 : return page;
4546 : }
4547 : EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
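/*
 * For illustration only, a minimal sketch of the usage pattern the comment
 * above attributes to i915/ttm (the function name is made up):
 */
#if 0
static struct page *example_get_object_page(struct address_space *mapping,
					    pgoff_t index)
{
	/* relax the mapping's GFP mask so a failed allocation does not OOM */
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}
#endif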