Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
4 : *
5 : * Swap reorganised 29.12.95, Stephen Tweedie.
6 : * kswapd added: 7.1.96 sct
7 : * Removed kswapd_ctl limits, and swap out as many pages as needed
8 : * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
9 : * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
10 : * Multiqueue VM started 5.8.00, Rik van Riel.
11 : */
12 :
13 : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 :
15 : #include <linux/mm.h>
16 : #include <linux/sched/mm.h>
17 : #include <linux/module.h>
18 : #include <linux/gfp.h>
19 : #include <linux/kernel_stat.h>
20 : #include <linux/swap.h>
21 : #include <linux/pagemap.h>
22 : #include <linux/init.h>
23 : #include <linux/highmem.h>
24 : #include <linux/vmpressure.h>
25 : #include <linux/vmstat.h>
26 : #include <linux/file.h>
27 : #include <linux/writeback.h>
28 : #include <linux/blkdev.h>
29 : #include <linux/buffer_head.h> /* for buffer_heads_over_limit */
30 : #include <linux/mm_inline.h>
31 : #include <linux/backing-dev.h>
32 : #include <linux/rmap.h>
33 : #include <linux/topology.h>
34 : #include <linux/cpu.h>
35 : #include <linux/cpuset.h>
36 : #include <linux/compaction.h>
37 : #include <linux/notifier.h>
38 : #include <linux/rwsem.h>
39 : #include <linux/delay.h>
40 : #include <linux/kthread.h>
41 : #include <linux/freezer.h>
42 : #include <linux/memcontrol.h>
43 : #include <linux/migrate.h>
44 : #include <linux/delayacct.h>
45 : #include <linux/sysctl.h>
46 : #include <linux/memory-tiers.h>
47 : #include <linux/oom.h>
48 : #include <linux/pagevec.h>
49 : #include <linux/prefetch.h>
50 : #include <linux/printk.h>
51 : #include <linux/dax.h>
52 : #include <linux/psi.h>
53 : #include <linux/pagewalk.h>
54 : #include <linux/shmem_fs.h>
55 : #include <linux/ctype.h>
56 : #include <linux/debugfs.h>
57 : #include <linux/khugepaged.h>
58 : #include <linux/rculist_nulls.h>
59 : #include <linux/random.h>
60 :
61 : #include <asm/tlbflush.h>
62 : #include <asm/div64.h>
63 :
64 : #include <linux/swapops.h>
65 : #include <linux/balloon_compaction.h>
66 : #include <linux/sched/sysctl.h>
67 :
68 : #include "internal.h"
69 : #include "swap.h"
70 :
71 : #define CREATE_TRACE_POINTS
72 : #include <trace/events/vmscan.h>
73 :
74 : struct scan_control {
75 : /* How many pages shrink_list() should reclaim */
76 : unsigned long nr_to_reclaim;
77 :
78 : /*
79 : * Nodemask of nodes allowed by the caller. If NULL, all nodes
80 : * are scanned.
81 : */
82 : nodemask_t *nodemask;
83 :
84 : /*
85 : * The memory cgroup that hit its limit and as a result is the
86 : * primary target of this reclaim invocation.
87 : */
88 : struct mem_cgroup *target_mem_cgroup;
89 :
90 : /*
91 : * Scan pressure balancing between anon and file LRUs
92 : */
93 : unsigned long anon_cost;
94 : unsigned long file_cost;
95 :
96 : /* Can active folios be deactivated as part of reclaim? */
97 : #define DEACTIVATE_ANON 1
98 : #define DEACTIVATE_FILE 2
99 : unsigned int may_deactivate:2;
100 : unsigned int force_deactivate:1;
101 : unsigned int skipped_deactivate:1;
102 :
103 : /* Writepage batching in laptop mode; RECLAIM_WRITE */
104 : unsigned int may_writepage:1;
105 :
106 : /* Can mapped folios be reclaimed? */
107 : unsigned int may_unmap:1;
108 :
109 : /* Can folios be swapped as part of reclaim? */
110 : unsigned int may_swap:1;
111 :
112 : /* Proactive reclaim invoked by userspace through memory.reclaim */
113 : unsigned int proactive:1;
114 :
115 : /*
116 : * Cgroup memory below memory.low is protected as long as we
117 : * don't threaten to OOM. If any cgroup is reclaimed at
118 : * reduced force or passed over entirely due to its memory.low
119 : * setting (memcg_low_skipped), and nothing is reclaimed as a
120 : * result, then go back for one more cycle that reclaims the protected
121 : * memory (memcg_low_reclaim) to avert OOM.
122 : */
123 : unsigned int memcg_low_reclaim:1;
124 : unsigned int memcg_low_skipped:1;
125 :
126 : unsigned int hibernation_mode:1;
127 :
128 : /* One of the zones is ready for compaction */
129 : unsigned int compaction_ready:1;
130 :
131 : /* There is easily reclaimable cold cache in the current node */
132 : unsigned int cache_trim_mode:1;
133 :
134 : /* The file folios on the current node are dangerously low */
135 : unsigned int file_is_tiny:1;
136 :
137 : /* Always discard instead of demoting to lower tier memory */
138 : unsigned int no_demotion:1;
139 :
140 : /* Allocation order */
141 : s8 order;
142 :
143 : /* Scan (total_size >> priority) pages at once */
144 : s8 priority;
145 :
146 : /* The highest zone to isolate folios for reclaim from */
147 : s8 reclaim_idx;
148 :
149 : /* This context's GFP mask */
150 : gfp_t gfp_mask;
151 :
152 : /* Incremented by the number of inactive pages that were scanned */
153 : unsigned long nr_scanned;
154 :
155 : /* Number of pages freed so far during a call to shrink_zones() */
156 : unsigned long nr_reclaimed;
157 :
158 : struct {
159 : unsigned int dirty;
160 : unsigned int unqueued_dirty;
161 : unsigned int congested;
162 : unsigned int writeback;
163 : unsigned int immediate;
164 : unsigned int file_taken;
165 : unsigned int taken;
166 : } nr;
167 :
168 : /* for recording the reclaimed slab by now */
169 : struct reclaim_state reclaim_state;
170 : };
171 :
172 : #ifdef ARCH_HAS_PREFETCHW
173 : #define prefetchw_prev_lru_folio(_folio, _base, _field) \
174 : do { \
175 : if ((_folio)->lru.prev != _base) { \
176 : struct folio *prev; \
177 : \
178 : prev = lru_to_folio(&(_folio->lru)); \
179 : prefetchw(&prev->_field); \
180 : } \
181 : } while (0)
182 : #else
183 : #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
184 : #endif
185 :
186 : /*
187 : * From 0 .. 200. Higher means more swappy.
188 : */
189 : int vm_swappiness = 60;
190 :
191 0 : static void set_task_reclaim_state(struct task_struct *task,
192 : struct reclaim_state *rs)
193 : {
194 : /* Check for an overwrite */
195 0 : WARN_ON_ONCE(rs && task->reclaim_state);
196 :
197 : /* Check for the nulling of an already-nulled member */
198 0 : WARN_ON_ONCE(!rs && !task->reclaim_state);
199 :
200 0 : task->reclaim_state = rs;
201 0 : }
202 :
203 : LIST_HEAD(shrinker_list);
204 : DECLARE_RWSEM(shrinker_rwsem);
205 :
206 : #ifdef CONFIG_MEMCG
207 : static int shrinker_nr_max;
208 :
209 : /* The shrinker_info is expanded in a batch of BITS_PER_LONG */
210 : static inline int shrinker_map_size(int nr_items)
211 : {
212 : return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
213 : }
214 :
215 : static inline int shrinker_defer_size(int nr_items)
216 : {
217 : return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
218 : }
219 :
220 : static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
221 : int nid)
222 : {
223 : return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
224 : lockdep_is_held(&shrinker_rwsem));
225 : }
226 :
227 : static int expand_one_shrinker_info(struct mem_cgroup *memcg,
228 : int map_size, int defer_size,
229 : int old_map_size, int old_defer_size)
230 : {
231 : struct shrinker_info *new, *old;
232 : struct mem_cgroup_per_node *pn;
233 : int nid;
234 : int size = map_size + defer_size;
235 :
236 : for_each_node(nid) {
237 : pn = memcg->nodeinfo[nid];
238 : old = shrinker_info_protected(memcg, nid);
239 : /* Not yet online memcg */
240 : if (!old)
241 : return 0;
242 :
243 : new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
244 : if (!new)
245 : return -ENOMEM;
246 :
247 : new->nr_deferred = (atomic_long_t *)(new + 1);
248 : new->map = (void *)new->nr_deferred + defer_size;
249 :
250 : /* map: set all old bits, clear all new bits */
251 : memset(new->map, (int)0xff, old_map_size);
252 : memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
253 : /* nr_deferred: copy old values, clear all new values */
254 : memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
255 : memset((void *)new->nr_deferred + old_defer_size, 0,
256 : defer_size - old_defer_size);
257 :
258 : rcu_assign_pointer(pn->shrinker_info, new);
259 : kvfree_rcu(old, rcu);
260 : }
261 :
262 : return 0;
263 : }
264 :
265 : void free_shrinker_info(struct mem_cgroup *memcg)
266 : {
267 : struct mem_cgroup_per_node *pn;
268 : struct shrinker_info *info;
269 : int nid;
270 :
271 : for_each_node(nid) {
272 : pn = memcg->nodeinfo[nid];
273 : info = rcu_dereference_protected(pn->shrinker_info, true);
274 : kvfree(info);
275 : rcu_assign_pointer(pn->shrinker_info, NULL);
276 : }
277 : }
278 :
279 : int alloc_shrinker_info(struct mem_cgroup *memcg)
280 : {
281 : struct shrinker_info *info;
282 : int nid, size, ret = 0;
283 : int map_size, defer_size = 0;
284 :
285 : down_write(&shrinker_rwsem);
286 : map_size = shrinker_map_size(shrinker_nr_max);
287 : defer_size = shrinker_defer_size(shrinker_nr_max);
288 : size = map_size + defer_size;
289 : for_each_node(nid) {
290 : info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
291 : if (!info) {
292 : free_shrinker_info(memcg);
293 : ret = -ENOMEM;
294 : break;
295 : }
296 : info->nr_deferred = (atomic_long_t *)(info + 1);
297 : info->map = (void *)info->nr_deferred + defer_size;
298 : rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
299 : }
300 : up_write(&shrinker_rwsem);
301 :
302 : return ret;
303 : }
304 :
305 : static inline bool need_expand(int nr_max)
306 : {
307 : return round_up(nr_max, BITS_PER_LONG) >
308 : round_up(shrinker_nr_max, BITS_PER_LONG);
309 : }
310 :
311 : static int expand_shrinker_info(int new_id)
312 : {
313 : int ret = 0;
314 : int new_nr_max = new_id + 1;
315 : int map_size, defer_size = 0;
316 : int old_map_size, old_defer_size = 0;
317 : struct mem_cgroup *memcg;
318 :
319 : if (!need_expand(new_nr_max))
320 : goto out;
321 :
322 : if (!root_mem_cgroup)
323 : goto out;
324 :
325 : lockdep_assert_held(&shrinker_rwsem);
326 :
327 : map_size = shrinker_map_size(new_nr_max);
328 : defer_size = shrinker_defer_size(new_nr_max);
329 : old_map_size = shrinker_map_size(shrinker_nr_max);
330 : old_defer_size = shrinker_defer_size(shrinker_nr_max);
331 :
332 : memcg = mem_cgroup_iter(NULL, NULL, NULL);
333 : do {
334 : ret = expand_one_shrinker_info(memcg, map_size, defer_size,
335 : old_map_size, old_defer_size);
336 : if (ret) {
337 : mem_cgroup_iter_break(NULL, memcg);
338 : goto out;
339 : }
340 : } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
341 : out:
342 : if (!ret)
343 : shrinker_nr_max = new_nr_max;
344 :
345 : return ret;
346 : }
347 :
348 : void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
349 : {
350 : if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
351 : struct shrinker_info *info;
352 :
353 : rcu_read_lock();
354 : info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
355 : /* Pairs with smp mb in shrink_slab() */
356 : smp_mb__before_atomic();
357 : set_bit(shrinker_id, info->map);
358 : rcu_read_unlock();
359 : }
360 : }
361 :
362 : static DEFINE_IDR(shrinker_idr);
363 :
364 : static int prealloc_memcg_shrinker(struct shrinker *shrinker)
365 : {
366 : int id, ret = -ENOMEM;
367 :
368 : if (mem_cgroup_disabled())
369 : return -ENOSYS;
370 :
371 : down_write(&shrinker_rwsem);
372 : /* This may call shrinker, so it must use down_read_trylock() */
373 : id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
374 : if (id < 0)
375 : goto unlock;
376 :
377 : if (id >= shrinker_nr_max) {
378 : if (expand_shrinker_info(id)) {
379 : idr_remove(&shrinker_idr, id);
380 : goto unlock;
381 : }
382 : }
383 : shrinker->id = id;
384 : ret = 0;
385 : unlock:
386 : up_write(&shrinker_rwsem);
387 : return ret;
388 : }
389 :
390 : static void unregister_memcg_shrinker(struct shrinker *shrinker)
391 : {
392 : int id = shrinker->id;
393 :
394 : BUG_ON(id < 0);
395 :
396 : lockdep_assert_held(&shrinker_rwsem);
397 :
398 : idr_remove(&shrinker_idr, id);
399 : }
400 :
401 : static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
402 : struct mem_cgroup *memcg)
403 : {
404 : struct shrinker_info *info;
405 :
406 : info = shrinker_info_protected(memcg, nid);
407 : return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
408 : }
409 :
410 : static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
411 : struct mem_cgroup *memcg)
412 : {
413 : struct shrinker_info *info;
414 :
415 : info = shrinker_info_protected(memcg, nid);
416 : return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
417 : }
418 :
419 : void reparent_shrinker_deferred(struct mem_cgroup *memcg)
420 : {
421 : int i, nid;
422 : long nr;
423 : struct mem_cgroup *parent;
424 : struct shrinker_info *child_info, *parent_info;
425 :
426 : parent = parent_mem_cgroup(memcg);
427 : if (!parent)
428 : parent = root_mem_cgroup;
429 :
430 : /* Prevent from concurrent shrinker_info expand */
431 : down_read(&shrinker_rwsem);
432 : for_each_node(nid) {
433 : child_info = shrinker_info_protected(memcg, nid);
434 : parent_info = shrinker_info_protected(parent, nid);
435 : for (i = 0; i < shrinker_nr_max; i++) {
436 : nr = atomic_long_read(&child_info->nr_deferred[i]);
437 : atomic_long_add(nr, &parent_info->nr_deferred[i]);
438 : }
439 : }
440 : up_read(&shrinker_rwsem);
441 : }
442 :
443 : static bool cgroup_reclaim(struct scan_control *sc)
444 : {
445 : return sc->target_mem_cgroup;
446 : }
447 :
448 : static bool global_reclaim(struct scan_control *sc)
449 : {
450 : return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
451 : }
452 :
453 : /**
454 : * writeback_throttling_sane - is the usual dirty throttling mechanism available?
455 : * @sc: scan_control in question
456 : *
457 : * The normal page dirty throttling mechanism in balance_dirty_pages() is
458 : * completely broken with the legacy memcg and direct stalling in
459 : * shrink_folio_list() is used for throttling instead, which lacks all the
460 : * niceties such as fairness, adaptive pausing, bandwidth proportional
461 : * allocation and configurability.
462 : *
463 : * This function tests whether the vmscan currently in progress can assume
464 : * that the normal dirty throttling mechanism is operational.
465 : */
466 : static bool writeback_throttling_sane(struct scan_control *sc)
467 : {
468 : if (!cgroup_reclaim(sc))
469 : return true;
470 : #ifdef CONFIG_CGROUP_WRITEBACK
471 : if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
472 : return true;
473 : #endif
474 : return false;
475 : }
476 : #else
477 : static int prealloc_memcg_shrinker(struct shrinker *shrinker)
478 : {
479 : return -ENOSYS;
480 : }
481 :
482 : static void unregister_memcg_shrinker(struct shrinker *shrinker)
483 : {
484 : }
485 :
486 : static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
487 : struct mem_cgroup *memcg)
488 : {
489 : return 0;
490 : }
491 :
492 : static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
493 : struct mem_cgroup *memcg)
494 : {
495 : return 0;
496 : }
497 :
498 : static bool cgroup_reclaim(struct scan_control *sc)
499 : {
500 : return false;
501 : }
502 :
503 : static bool global_reclaim(struct scan_control *sc)
504 : {
505 : return true;
506 : }
507 :
508 : static bool writeback_throttling_sane(struct scan_control *sc)
509 : {
510 : return true;
511 : }
512 : #endif
513 :
514 : static long xchg_nr_deferred(struct shrinker *shrinker,
515 : struct shrink_control *sc)
516 : {
517 0 : int nid = sc->nid;
518 :
519 0 : if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
520 0 : nid = 0;
521 :
522 0 : if (sc->memcg &&
523 0 : (shrinker->flags & SHRINKER_MEMCG_AWARE))
524 : return xchg_nr_deferred_memcg(nid, shrinker,
525 : sc->memcg);
526 :
527 0 : return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
528 : }
529 :
530 :
531 : static long add_nr_deferred(long nr, struct shrinker *shrinker,
532 : struct shrink_control *sc)
533 : {
534 0 : int nid = sc->nid;
535 :
536 0 : if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
537 0 : nid = 0;
538 :
539 0 : if (sc->memcg &&
540 0 : (shrinker->flags & SHRINKER_MEMCG_AWARE))
541 : return add_nr_deferred_memcg(nr, nid, shrinker,
542 : sc->memcg);
543 :
544 0 : return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
545 : }
546 :
547 : static bool can_demote(int nid, struct scan_control *sc)
548 : {
549 : if (!numa_demotion_enabled)
550 : return false;
551 : if (sc && sc->no_demotion)
552 : return false;
553 : if (next_demotion_node(nid) == NUMA_NO_NODE)
554 : return false;
555 :
556 : return true;
557 : }
558 :
559 : static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
560 : int nid,
561 : struct scan_control *sc)
562 : {
563 : if (memcg == NULL) {
564 : /*
565 : * For non-memcg reclaim, is there
566 : * space in any swap device?
567 : */
568 0 : if (get_nr_swap_pages() > 0)
569 : return true;
570 : } else {
571 : /* Is the memcg below its swap limit? */
572 : if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
573 : return true;
574 : }
575 :
576 : /*
577 : * The page can not be swapped.
578 : *
579 : * Can it be reclaimed from this node via demotion?
580 : */
581 : return can_demote(nid, sc);
582 : }
583 :
584 : /*
585 : * This misses isolated folios which are not accounted for to save counters.
586 : * As the data only determines if reclaim or compaction continues, it is
587 : * not expected that isolated folios will be a dominating factor.
588 : */
589 0 : unsigned long zone_reclaimable_pages(struct zone *zone)
590 : {
591 : unsigned long nr;
592 :
593 0 : nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
594 0 : zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
595 0 : if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
596 0 : nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
597 0 : zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
598 :
599 0 : return nr;
600 : }
601 :
602 : /**
603 : * lruvec_lru_size - Returns the number of pages on the given LRU list.
604 : * @lruvec: lru vector
605 : * @lru: lru to use
606 : * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
607 : */
608 : static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
609 : int zone_idx)
610 : {
611 : unsigned long size = 0;
612 : int zid;
613 :
614 0 : for (zid = 0; zid <= zone_idx; zid++) {
615 0 : struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
616 :
617 0 : if (!managed_zone(zone))
618 0 : continue;
619 :
620 : if (!mem_cgroup_disabled())
621 : size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
622 : else
623 0 : size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
624 : }
625 : return size;
626 : }
627 :
628 : /*
629 : * Add a shrinker callback to be called from the vm.
630 : */
631 28 : static int __prealloc_shrinker(struct shrinker *shrinker)
632 : {
633 : unsigned int size;
634 : int err;
635 :
636 28 : if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
637 28 : err = prealloc_memcg_shrinker(shrinker);
638 : if (err != -ENOSYS)
639 : return err;
640 :
641 28 : shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
642 : }
643 :
644 28 : size = sizeof(*shrinker->nr_deferred);
645 : if (shrinker->flags & SHRINKER_NUMA_AWARE)
646 : size *= nr_node_ids;
647 :
648 28 : shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
649 28 : if (!shrinker->nr_deferred)
650 : return -ENOMEM;
651 :
652 : return 0;
653 : }
654 :
655 : #ifdef CONFIG_SHRINKER_DEBUG
656 : int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
657 : {
658 : va_list ap;
659 : int err;
660 :
661 : va_start(ap, fmt);
662 : shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
663 : va_end(ap);
664 : if (!shrinker->name)
665 : return -ENOMEM;
666 :
667 : err = __prealloc_shrinker(shrinker);
668 : if (err) {
669 : kfree_const(shrinker->name);
670 : shrinker->name = NULL;
671 : }
672 :
673 : return err;
674 : }
675 : #else
676 28 : int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
677 : {
678 28 : return __prealloc_shrinker(shrinker);
679 : }
680 : #endif
681 :
682 0 : void free_prealloced_shrinker(struct shrinker *shrinker)
683 : {
684 : #ifdef CONFIG_SHRINKER_DEBUG
685 : kfree_const(shrinker->name);
686 : shrinker->name = NULL;
687 : #endif
688 0 : if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
689 0 : down_write(&shrinker_rwsem);
690 0 : unregister_memcg_shrinker(shrinker);
691 0 : up_write(&shrinker_rwsem);
692 0 : return;
693 : }
694 :
695 0 : kfree(shrinker->nr_deferred);
696 0 : shrinker->nr_deferred = NULL;
697 : }
698 :
699 28 : void register_shrinker_prepared(struct shrinker *shrinker)
700 : {
701 28 : down_write(&shrinker_rwsem);
702 56 : list_add_tail(&shrinker->list, &shrinker_list);
703 28 : shrinker->flags |= SHRINKER_REGISTERED;
704 28 : shrinker_debugfs_add(shrinker);
705 28 : up_write(&shrinker_rwsem);
706 28 : }
707 :
708 0 : static int __register_shrinker(struct shrinker *shrinker)
709 : {
710 0 : int err = __prealloc_shrinker(shrinker);
711 :
712 0 : if (err)
713 : return err;
714 0 : register_shrinker_prepared(shrinker);
715 0 : return 0;
716 : }
717 :
718 : #ifdef CONFIG_SHRINKER_DEBUG
719 : int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
720 : {
721 : va_list ap;
722 : int err;
723 :
724 : va_start(ap, fmt);
725 : shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
726 : va_end(ap);
727 : if (!shrinker->name)
728 : return -ENOMEM;
729 :
730 : err = __register_shrinker(shrinker);
731 : if (err) {
732 : kfree_const(shrinker->name);
733 : shrinker->name = NULL;
734 : }
735 : return err;
736 : }
737 : #else
738 0 : int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
739 : {
740 0 : return __register_shrinker(shrinker);
741 : }
742 : #endif
743 : EXPORT_SYMBOL(register_shrinker);
744 :
745 : /*
746 : * Remove one
747 : */
748 17 : void unregister_shrinker(struct shrinker *shrinker)
749 : {
750 : struct dentry *debugfs_entry;
751 :
752 17 : if (!(shrinker->flags & SHRINKER_REGISTERED))
753 : return;
754 :
755 17 : down_write(&shrinker_rwsem);
756 34 : list_del(&shrinker->list);
757 17 : shrinker->flags &= ~SHRINKER_REGISTERED;
758 : if (shrinker->flags & SHRINKER_MEMCG_AWARE)
759 : unregister_memcg_shrinker(shrinker);
760 17 : debugfs_entry = shrinker_debugfs_remove(shrinker);
761 17 : up_write(&shrinker_rwsem);
762 :
763 : debugfs_remove_recursive(debugfs_entry);
764 :
765 17 : kfree(shrinker->nr_deferred);
766 17 : shrinker->nr_deferred = NULL;
767 : }
768 : EXPORT_SYMBOL(unregister_shrinker);
769 :
770 : /**
771 : * synchronize_shrinkers - Wait for all running shrinkers to complete.
772 : *
773 : * This is equivalent to calling unregister_shrink() and register_shrinker(),
774 : * but atomically and with less overhead. This is useful to guarantee that all
775 : * shrinker invocations have seen an update, before freeing memory, similar to
776 : * rcu.
777 : */
778 0 : void synchronize_shrinkers(void)
779 : {
780 0 : down_write(&shrinker_rwsem);
781 0 : up_write(&shrinker_rwsem);
782 0 : }
783 : EXPORT_SYMBOL(synchronize_shrinkers);
784 :
785 : #define SHRINK_BATCH 128
786 :
787 0 : static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
788 : struct shrinker *shrinker, int priority)
789 : {
790 0 : unsigned long freed = 0;
791 : unsigned long long delta;
792 : long total_scan;
793 : long freeable;
794 : long nr;
795 : long new_nr;
796 0 : long batch_size = shrinker->batch ? shrinker->batch
797 0 : : SHRINK_BATCH;
798 0 : long scanned = 0, next_deferred;
799 :
800 0 : freeable = shrinker->count_objects(shrinker, shrinkctl);
801 0 : if (freeable == 0 || freeable == SHRINK_EMPTY)
802 : return freeable;
803 :
804 : /*
805 : * copy the current shrinker scan count into a local variable
806 : * and zero it so that other concurrent shrinker invocations
807 : * don't also do this scanning work.
808 : */
809 0 : nr = xchg_nr_deferred(shrinker, shrinkctl);
810 :
811 0 : if (shrinker->seeks) {
812 0 : delta = freeable >> priority;
813 0 : delta *= 4;
814 0 : do_div(delta, shrinker->seeks);
815 : } else {
816 : /*
817 : * These objects don't require any IO to create. Trim
818 : * them aggressively under memory pressure to keep
819 : * them from causing refetches in the IO caches.
820 : */
821 0 : delta = freeable / 2;
822 : }
823 :
824 0 : total_scan = nr >> priority;
825 0 : total_scan += delta;
826 0 : total_scan = min(total_scan, (2 * freeable));
827 :
828 0 : trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
829 : freeable, delta, total_scan, priority);
830 :
831 : /*
832 : * Normally, we should not scan less than batch_size objects in one
833 : * pass to avoid too frequent shrinker calls, but if the slab has less
834 : * than batch_size objects in total and we are really tight on memory,
835 : * we will try to reclaim all available objects, otherwise we can end
836 : * up failing allocations although there are plenty of reclaimable
837 : * objects spread over several slabs with usage less than the
838 : * batch_size.
839 : *
840 : * We detect the "tight on memory" situations by looking at the total
841 : * number of objects we want to scan (total_scan). If it is greater
842 : * than the total number of objects on slab (freeable), we must be
843 : * scanning at high prio and therefore should try to reclaim as much as
844 : * possible.
845 : */
846 0 : while (total_scan >= batch_size ||
847 0 : total_scan >= freeable) {
848 : unsigned long ret;
849 0 : unsigned long nr_to_scan = min(batch_size, total_scan);
850 :
851 0 : shrinkctl->nr_to_scan = nr_to_scan;
852 0 : shrinkctl->nr_scanned = nr_to_scan;
853 0 : ret = shrinker->scan_objects(shrinker, shrinkctl);
854 0 : if (ret == SHRINK_STOP)
855 : break;
856 0 : freed += ret;
857 :
858 0 : count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
859 0 : total_scan -= shrinkctl->nr_scanned;
860 0 : scanned += shrinkctl->nr_scanned;
861 :
862 0 : cond_resched();
863 : }
864 :
865 : /*
866 : * The deferred work is increased by any new work (delta) that wasn't
867 : * done, decreased by old deferred work that was done now.
868 : *
869 : * And it is capped to two times of the freeable items.
870 : */
871 0 : next_deferred = max_t(long, (nr + delta - scanned), 0);
872 0 : next_deferred = min(next_deferred, (2 * freeable));
873 :
874 : /*
875 : * move the unused scan count back into the shrinker in a
876 : * manner that handles concurrent updates.
877 : */
878 0 : new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
879 :
880 0 : trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
881 0 : return freed;
882 : }
883 :
884 : #ifdef CONFIG_MEMCG
885 : static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
886 : struct mem_cgroup *memcg, int priority)
887 : {
888 : struct shrinker_info *info;
889 : unsigned long ret, freed = 0;
890 : int i;
891 :
892 : if (!mem_cgroup_online(memcg))
893 : return 0;
894 :
895 : if (!down_read_trylock(&shrinker_rwsem))
896 : return 0;
897 :
898 : info = shrinker_info_protected(memcg, nid);
899 : if (unlikely(!info))
900 : goto unlock;
901 :
902 : for_each_set_bit(i, info->map, shrinker_nr_max) {
903 : struct shrink_control sc = {
904 : .gfp_mask = gfp_mask,
905 : .nid = nid,
906 : .memcg = memcg,
907 : };
908 : struct shrinker *shrinker;
909 :
910 : shrinker = idr_find(&shrinker_idr, i);
911 : if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
912 : if (!shrinker)
913 : clear_bit(i, info->map);
914 : continue;
915 : }
916 :
917 : /* Call non-slab shrinkers even though kmem is disabled */
918 : if (!memcg_kmem_online() &&
919 : !(shrinker->flags & SHRINKER_NONSLAB))
920 : continue;
921 :
922 : ret = do_shrink_slab(&sc, shrinker, priority);
923 : if (ret == SHRINK_EMPTY) {
924 : clear_bit(i, info->map);
925 : /*
926 : * After the shrinker reported that it had no objects to
927 : * free, but before we cleared the corresponding bit in
928 : * the memcg shrinker map, a new object might have been
929 : * added. To make sure, we have the bit set in this
930 : * case, we invoke the shrinker one more time and reset
931 : * the bit if it reports that it is not empty anymore.
932 : * The memory barrier here pairs with the barrier in
933 : * set_shrinker_bit():
934 : *
935 : * list_lru_add() shrink_slab_memcg()
936 : * list_add_tail() clear_bit()
937 : * <MB> <MB>
938 : * set_bit() do_shrink_slab()
939 : */
940 : smp_mb__after_atomic();
941 : ret = do_shrink_slab(&sc, shrinker, priority);
942 : if (ret == SHRINK_EMPTY)
943 : ret = 0;
944 : else
945 : set_shrinker_bit(memcg, nid, i);
946 : }
947 : freed += ret;
948 :
949 : if (rwsem_is_contended(&shrinker_rwsem)) {
950 : freed = freed ? : 1;
951 : break;
952 : }
953 : }
954 : unlock:
955 : up_read(&shrinker_rwsem);
956 : return freed;
957 : }
958 : #else /* CONFIG_MEMCG */
959 : static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
960 : struct mem_cgroup *memcg, int priority)
961 : {
962 : return 0;
963 : }
964 : #endif /* CONFIG_MEMCG */
965 :
966 : /**
967 : * shrink_slab - shrink slab caches
968 : * @gfp_mask: allocation context
969 : * @nid: node whose slab caches to target
970 : * @memcg: memory cgroup whose slab caches to target
971 : * @priority: the reclaim priority
972 : *
973 : * Call the shrink functions to age shrinkable caches.
974 : *
975 : * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
976 : * unaware shrinkers will receive a node id of 0 instead.
977 : *
978 : * @memcg specifies the memory cgroup to target. Unaware shrinkers
979 : * are called only if it is the root cgroup.
980 : *
981 : * @priority is sc->priority, we take the number of objects and >> by priority
982 : * in order to get the scan target.
983 : *
984 : * Returns the number of reclaimed slab objects.
985 : */
986 0 : static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
987 : struct mem_cgroup *memcg,
988 : int priority)
989 : {
990 0 : unsigned long ret, freed = 0;
991 : struct shrinker *shrinker;
992 :
993 : /*
994 : * The root memcg might be allocated even though memcg is disabled
995 : * via "cgroup_disable=memory" boot parameter. This could make
996 : * mem_cgroup_is_root() return false, then just run memcg slab
997 : * shrink, but skip global shrink. This may result in premature
998 : * oom.
999 : */
1000 : if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
1001 : return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
1002 :
1003 0 : if (!down_read_trylock(&shrinker_rwsem))
1004 : goto out;
1005 :
1006 0 : list_for_each_entry(shrinker, &shrinker_list, list) {
1007 0 : struct shrink_control sc = {
1008 : .gfp_mask = gfp_mask,
1009 : .nid = nid,
1010 : .memcg = memcg,
1011 : };
1012 :
1013 0 : ret = do_shrink_slab(&sc, shrinker, priority);
1014 0 : if (ret == SHRINK_EMPTY)
1015 0 : ret = 0;
1016 0 : freed += ret;
1017 : /*
1018 : * Bail out if someone want to register a new shrinker to
1019 : * prevent the registration from being stalled for long periods
1020 : * by parallel ongoing shrinking.
1021 : */
1022 0 : if (rwsem_is_contended(&shrinker_rwsem)) {
1023 0 : freed = freed ? : 1;
1024 0 : break;
1025 : }
1026 : }
1027 :
1028 0 : up_read(&shrinker_rwsem);
1029 : out:
1030 0 : cond_resched();
1031 : return freed;
1032 : }
1033 :
1034 : static unsigned long drop_slab_node(int nid)
1035 : {
1036 0 : unsigned long freed = 0;
1037 0 : struct mem_cgroup *memcg = NULL;
1038 :
1039 0 : memcg = mem_cgroup_iter(NULL, NULL, NULL);
1040 : do {
1041 0 : freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
1042 0 : } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
1043 :
1044 : return freed;
1045 : }
1046 :
1047 0 : void drop_slab(void)
1048 : {
1049 : int nid;
1050 0 : int shift = 0;
1051 : unsigned long freed;
1052 :
1053 : do {
1054 0 : freed = 0;
1055 0 : for_each_online_node(nid) {
1056 0 : if (fatal_signal_pending(current))
1057 : return;
1058 :
1059 0 : freed += drop_slab_node(nid);
1060 : }
1061 0 : } while ((freed >> shift++) > 1);
1062 : }
1063 :
1064 : static int reclaimer_offset(void)
1065 : {
1066 : BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
1067 : PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
1068 : BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
1069 : PGSCAN_DIRECT - PGSCAN_KSWAPD);
1070 : BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
1071 : PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
1072 : BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
1073 : PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
1074 :
1075 0 : if (current_is_kswapd())
1076 : return 0;
1077 : if (current_is_khugepaged())
1078 : return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
1079 : return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
1080 : }
1081 :
1082 : static inline int is_page_cache_freeable(struct folio *folio)
1083 : {
1084 : /*
1085 : * A freeable page cache folio is referenced only by the caller
1086 : * that isolated the folio, the page cache and optional filesystem
1087 : * private data at folio->private.
1088 : */
1089 0 : return folio_ref_count(folio) - folio_test_private(folio) ==
1090 0 : 1 + folio_nr_pages(folio);
1091 : }
1092 :
1093 : /*
1094 : * We detected a synchronous write error writing a folio out. Probably
1095 : * -ENOSPC. We need to propagate that into the address_space for a subsequent
1096 : * fsync(), msync() or close().
1097 : *
1098 : * The tricky part is that after writepage we cannot touch the mapping: nothing
1099 : * prevents it from being freed up. But we have a ref on the folio and once
1100 : * that folio is locked, the mapping is pinned.
1101 : *
1102 : * We're allowed to run sleeping folio_lock() here because we know the caller has
1103 : * __GFP_FS.
1104 : */
1105 0 : static void handle_write_error(struct address_space *mapping,
1106 : struct folio *folio, int error)
1107 : {
1108 0 : folio_lock(folio);
1109 0 : if (folio_mapping(folio) == mapping)
1110 0 : mapping_set_error(mapping, error);
1111 0 : folio_unlock(folio);
1112 0 : }
1113 :
1114 0 : static bool skip_throttle_noprogress(pg_data_t *pgdat)
1115 : {
1116 0 : int reclaimable = 0, write_pending = 0;
1117 : int i;
1118 :
1119 : /*
1120 : * If kswapd is disabled, reschedule if necessary but do not
1121 : * throttle as the system is likely near OOM.
1122 : */
1123 0 : if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
1124 : return true;
1125 :
1126 : /*
1127 : * If there are a lot of dirty/writeback folios then do not
1128 : * throttle as throttling will occur when the folios cycle
1129 : * towards the end of the LRU if still under writeback.
1130 : */
1131 0 : for (i = 0; i < MAX_NR_ZONES; i++) {
1132 0 : struct zone *zone = pgdat->node_zones + i;
1133 :
1134 0 : if (!managed_zone(zone))
1135 0 : continue;
1136 :
1137 0 : reclaimable += zone_reclaimable_pages(zone);
1138 0 : write_pending += zone_page_state_snapshot(zone,
1139 : NR_ZONE_WRITE_PENDING);
1140 : }
1141 0 : if (2 * write_pending <= reclaimable)
1142 : return true;
1143 :
1144 0 : return false;
1145 : }
1146 :
1147 0 : void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
1148 : {
1149 0 : wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
1150 : long timeout, ret;
1151 0 : DEFINE_WAIT(wait);
1152 :
1153 : /*
1154 : * Do not throttle IO workers, kthreads other than kswapd or
1155 : * workqueues. They may be required for reclaim to make
1156 : * forward progress (e.g. journalling workqueues or kthreads).
1157 : */
1158 0 : if (!current_is_kswapd() &&
1159 0 : current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
1160 0 : cond_resched();
1161 0 : return;
1162 : }
1163 :
1164 : /*
1165 : * These figures are pulled out of thin air.
1166 : * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
1167 : * parallel reclaimers which is a short-lived event so the timeout is
1168 : * short. Failing to make progress or waiting on writeback are
1169 : * potentially long-lived events so use a longer timeout. This is shaky
1170 : * logic as a failure to make progress could be due to anything from
1171 : * writeback to a slow device to excessive referenced folios at the tail
1172 : * of the inactive LRU.
1173 : */
1174 0 : switch(reason) {
1175 : case VMSCAN_THROTTLE_WRITEBACK:
1176 0 : timeout = HZ/10;
1177 :
1178 0 : if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
1179 0 : WRITE_ONCE(pgdat->nr_reclaim_start,
1180 : node_page_state(pgdat, NR_THROTTLED_WRITTEN));
1181 : }
1182 :
1183 : break;
1184 : case VMSCAN_THROTTLE_CONGESTED:
1185 : fallthrough;
1186 : case VMSCAN_THROTTLE_NOPROGRESS:
1187 0 : if (skip_throttle_noprogress(pgdat)) {
1188 0 : cond_resched();
1189 0 : return;
1190 : }
1191 :
1192 : timeout = 1;
1193 :
1194 : break;
1195 : case VMSCAN_THROTTLE_ISOLATED:
1196 : timeout = HZ/50;
1197 : break;
1198 : default:
1199 0 : WARN_ON_ONCE(1);
1200 : timeout = HZ;
1201 : break;
1202 : }
1203 :
1204 0 : prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1205 0 : ret = schedule_timeout(timeout);
1206 0 : finish_wait(wqh, &wait);
1207 :
1208 0 : if (reason == VMSCAN_THROTTLE_WRITEBACK)
1209 0 : atomic_dec(&pgdat->nr_writeback_throttled);
1210 :
1211 0 : trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
1212 0 : jiffies_to_usecs(timeout - ret),
1213 : reason);
1214 : }
1215 :
1216 : /*
1217 : * Account for folios written if tasks are throttled waiting on dirty
1218 : * folios to clean. If enough folios have been cleaned since throttling
1219 : * started then wakeup the throttled tasks.
1220 : */
1221 0 : void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
1222 : int nr_throttled)
1223 : {
1224 : unsigned long nr_written;
1225 :
1226 0 : node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
1227 :
1228 : /*
1229 : * This is an inaccurate read as the per-cpu deltas may not
1230 : * be synchronised. However, given that the system is
1231 : * writeback throttled, it is not worth taking the penalty
1232 : * of getting an accurate count. At worst, the throttle
1233 : * timeout guarantees forward progress.
1234 : */
1235 0 : nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
1236 0 : READ_ONCE(pgdat->nr_reclaim_start);
1237 :
1238 0 : if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
1239 0 : wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
1240 0 : }
1241 :
1242 : /* possible outcome of pageout() */
1243 : typedef enum {
1244 : /* failed to write folio out, folio is locked */
1245 : PAGE_KEEP,
1246 : /* move folio to the active list, folio is locked */
1247 : PAGE_ACTIVATE,
1248 : /* folio has been sent to the disk successfully, folio is unlocked */
1249 : PAGE_SUCCESS,
1250 : /* folio is clean and locked */
1251 : PAGE_CLEAN,
1252 : } pageout_t;
1253 :
1254 : /*
1255 : * pageout is called by shrink_folio_list() for each dirty folio.
1256 : * Calls ->writepage().
1257 : */
1258 0 : static pageout_t pageout(struct folio *folio, struct address_space *mapping,
1259 : struct swap_iocb **plug)
1260 : {
1261 : /*
1262 : * If the folio is dirty, only perform writeback if that write
1263 : * will be non-blocking. To prevent this allocation from being
1264 : * stalled by pagecache activity. But note that there may be
1265 : * stalls if we need to run get_block(). We could test
1266 : * PagePrivate for that.
1267 : *
1268 : * If this process is currently in __generic_file_write_iter() against
1269 : * this folio's queue, we can perform writeback even if that
1270 : * will block.
1271 : *
1272 : * If the folio is swapcache, write it back even if that would
1273 : * block, for some throttling. This happens by accident, because
1274 : * swap_backing_dev_info is bust: it doesn't reflect the
1275 : * congestion state of the swapdevs. Easy to fix, if needed.
1276 : */
1277 0 : if (!is_page_cache_freeable(folio))
1278 : return PAGE_KEEP;
1279 0 : if (!mapping) {
1280 : /*
1281 : * Some data journaling orphaned folios can have
1282 : * folio->mapping == NULL while being dirty with clean buffers.
1283 : */
1284 0 : if (folio_test_private(folio)) {
1285 0 : if (try_to_free_buffers(folio)) {
1286 0 : folio_clear_dirty(folio);
1287 0 : pr_info("%s: orphaned folio\n", __func__);
1288 0 : return PAGE_CLEAN;
1289 : }
1290 : }
1291 : return PAGE_KEEP;
1292 : }
1293 0 : if (mapping->a_ops->writepage == NULL)
1294 : return PAGE_ACTIVATE;
1295 :
1296 0 : if (folio_clear_dirty_for_io(folio)) {
1297 : int res;
1298 0 : struct writeback_control wbc = {
1299 : .sync_mode = WB_SYNC_NONE,
1300 : .nr_to_write = SWAP_CLUSTER_MAX,
1301 : .range_start = 0,
1302 : .range_end = LLONG_MAX,
1303 : .for_reclaim = 1,
1304 : .swap_plug = plug,
1305 : };
1306 :
1307 0 : folio_set_reclaim(folio);
1308 0 : res = mapping->a_ops->writepage(&folio->page, &wbc);
1309 0 : if (res < 0)
1310 0 : handle_write_error(mapping, folio, res);
1311 0 : if (res == AOP_WRITEPAGE_ACTIVATE) {
1312 0 : folio_clear_reclaim(folio);
1313 0 : return PAGE_ACTIVATE;
1314 : }
1315 :
1316 0 : if (!folio_test_writeback(folio)) {
1317 : /* synchronous write or broken a_ops? */
1318 : folio_clear_reclaim(folio);
1319 : }
1320 0 : trace_mm_vmscan_write_folio(folio);
1321 0 : node_stat_add_folio(folio, NR_VMSCAN_WRITE);
1322 0 : return PAGE_SUCCESS;
1323 : }
1324 :
1325 : return PAGE_CLEAN;
1326 : }
1327 :
1328 : /*
1329 : * Same as remove_mapping, but if the folio is removed from the mapping, it
1330 : * gets returned with a refcount of 0.
1331 : */
1332 0 : static int __remove_mapping(struct address_space *mapping, struct folio *folio,
1333 : bool reclaimed, struct mem_cgroup *target_memcg)
1334 : {
1335 : int refcount;
1336 0 : void *shadow = NULL;
1337 :
1338 0 : BUG_ON(!folio_test_locked(folio));
1339 0 : BUG_ON(mapping != folio_mapping(folio));
1340 :
1341 0 : if (!folio_test_swapcache(folio))
1342 0 : spin_lock(&mapping->host->i_lock);
1343 0 : xa_lock_irq(&mapping->i_pages);
1344 : /*
1345 : * The non racy check for a busy folio.
1346 : *
1347 : * Must be careful with the order of the tests. When someone has
1348 : * a ref to the folio, it may be possible that they dirty it then
1349 : * drop the reference. So if the dirty flag is tested before the
1350 : * refcount here, then the following race may occur:
1351 : *
1352 : * get_user_pages(&page);
1353 : * [user mapping goes away]
1354 : * write_to(page);
1355 : * !folio_test_dirty(folio) [good]
1356 : * folio_set_dirty(folio);
1357 : * folio_put(folio);
1358 : * !refcount(folio) [good, discard it]
1359 : *
1360 : * [oops, our write_to data is lost]
1361 : *
1362 : * Reversing the order of the tests ensures such a situation cannot
1363 : * escape unnoticed. The smp_rmb is needed to ensure the folio->flags
1364 : * load is not satisfied before that of folio->_refcount.
1365 : *
1366 : * Note that if the dirty flag is always set via folio_mark_dirty,
1367 : * and thus under the i_pages lock, then this ordering is not required.
1368 : */
1369 0 : refcount = 1 + folio_nr_pages(folio);
1370 0 : if (!folio_ref_freeze(folio, refcount))
1371 : goto cannot_free;
1372 : /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */
1373 0 : if (unlikely(folio_test_dirty(folio))) {
1374 : folio_ref_unfreeze(folio, refcount);
1375 : goto cannot_free;
1376 : }
1377 :
1378 0 : if (folio_test_swapcache(folio)) {
1379 0 : swp_entry_t swap = folio_swap_entry(folio);
1380 :
1381 0 : if (reclaimed && !mapping_exiting(mapping))
1382 0 : shadow = workingset_eviction(folio, target_memcg);
1383 0 : __delete_from_swap_cache(folio, swap, shadow);
1384 0 : mem_cgroup_swapout(folio, swap);
1385 0 : xa_unlock_irq(&mapping->i_pages);
1386 0 : put_swap_folio(folio, swap);
1387 : } else {
1388 : void (*free_folio)(struct folio *);
1389 :
1390 0 : free_folio = mapping->a_ops->free_folio;
1391 : /*
1392 : * Remember a shadow entry for reclaimed file cache in
1393 : * order to detect refaults, thus thrashing, later on.
1394 : *
1395 : * But don't store shadows in an address space that is
1396 : * already exiting. This is not just an optimization,
1397 : * inode reclaim needs to empty out the radix tree or
1398 : * the nodes are lost. Don't plant shadows behind its
1399 : * back.
1400 : *
1401 : * We also don't store shadows for DAX mappings because the
1402 : * only page cache folios found in these are zero pages
1403 : * covering holes, and because we don't want to mix DAX
1404 : * exceptional entries and shadow exceptional entries in the
1405 : * same address_space.
1406 : */
1407 0 : if (reclaimed && folio_is_file_lru(folio) &&
1408 0 : !mapping_exiting(mapping) && !dax_mapping(mapping))
1409 0 : shadow = workingset_eviction(folio, target_memcg);
1410 0 : __filemap_remove_folio(folio, shadow);
1411 0 : xa_unlock_irq(&mapping->i_pages);
1412 0 : if (mapping_shrinkable(mapping))
1413 0 : inode_add_lru(mapping->host);
1414 0 : spin_unlock(&mapping->host->i_lock);
1415 :
1416 0 : if (free_folio)
1417 0 : free_folio(folio);
1418 : }
1419 :
1420 : return 1;
1421 :
1422 : cannot_free:
1423 0 : xa_unlock_irq(&mapping->i_pages);
1424 0 : if (!folio_test_swapcache(folio))
1425 0 : spin_unlock(&mapping->host->i_lock);
1426 : return 0;
1427 : }
1428 :
1429 : /**
1430 : * remove_mapping() - Attempt to remove a folio from its mapping.
1431 : * @mapping: The address space.
1432 : * @folio: The folio to remove.
1433 : *
1434 : * If the folio is dirty, under writeback or if someone else has a ref
1435 : * on it, removal will fail.
1436 : * Return: The number of pages removed from the mapping. 0 if the folio
1437 : * could not be removed.
1438 : * Context: The caller should have a single refcount on the folio and
1439 : * hold its lock.
1440 : */
1441 0 : long remove_mapping(struct address_space *mapping, struct folio *folio)
1442 : {
1443 0 : if (__remove_mapping(mapping, folio, false, NULL)) {
1444 : /*
1445 : * Unfreezing the refcount with 1 effectively
1446 : * drops the pagecache ref for us without requiring another
1447 : * atomic operation.
1448 : */
1449 0 : folio_ref_unfreeze(folio, 1);
1450 : return folio_nr_pages(folio);
1451 : }
1452 : return 0;
1453 : }
1454 :
1455 : /**
1456 : * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
1457 : * @folio: Folio to be returned to an LRU list.
1458 : *
1459 : * Add previously isolated @folio to appropriate LRU list.
1460 : * The folio may still be unevictable for other reasons.
1461 : *
1462 : * Context: lru_lock must not be held, interrupts must be enabled.
1463 : */
1464 0 : void folio_putback_lru(struct folio *folio)
1465 : {
1466 0 : folio_add_lru(folio);
1467 0 : folio_put(folio); /* drop ref from isolate */
1468 0 : }
1469 :
1470 : enum folio_references {
1471 : FOLIOREF_RECLAIM,
1472 : FOLIOREF_RECLAIM_CLEAN,
1473 : FOLIOREF_KEEP,
1474 : FOLIOREF_ACTIVATE,
1475 : };
1476 :
1477 0 : static enum folio_references folio_check_references(struct folio *folio,
1478 : struct scan_control *sc)
1479 : {
1480 : int referenced_ptes, referenced_folio;
1481 : unsigned long vm_flags;
1482 :
1483 0 : referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
1484 : &vm_flags);
1485 0 : referenced_folio = folio_test_clear_referenced(folio);
1486 :
1487 : /*
1488 : * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
1489 : * Let the folio, now marked Mlocked, be moved to the unevictable list.
1490 : */
1491 0 : if (vm_flags & VM_LOCKED)
1492 : return FOLIOREF_ACTIVATE;
1493 :
1494 : /* rmap lock contention: rotate */
1495 0 : if (referenced_ptes == -1)
1496 : return FOLIOREF_KEEP;
1497 :
1498 0 : if (referenced_ptes) {
1499 : /*
1500 : * All mapped folios start out with page table
1501 : * references from the instantiating fault, so we need
1502 : * to look twice if a mapped file/anon folio is used more
1503 : * than once.
1504 : *
1505 : * Mark it and spare it for another trip around the
1506 : * inactive list. Another page table reference will
1507 : * lead to its activation.
1508 : *
1509 : * Note: the mark is set for activated folios as well
1510 : * so that recently deactivated but used folios are
1511 : * quickly recovered.
1512 : */
1513 0 : folio_set_referenced(folio);
1514 :
1515 0 : if (referenced_folio || referenced_ptes > 1)
1516 : return FOLIOREF_ACTIVATE;
1517 :
1518 : /*
1519 : * Activate file-backed executable folios after first usage.
1520 : */
1521 0 : if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
1522 : return FOLIOREF_ACTIVATE;
1523 :
1524 : return FOLIOREF_KEEP;
1525 : }
1526 :
1527 : /* Reclaim if clean, defer dirty folios to writeback */
1528 0 : if (referenced_folio && folio_is_file_lru(folio))
1529 : return FOLIOREF_RECLAIM_CLEAN;
1530 :
1531 : return FOLIOREF_RECLAIM;
1532 : }
1533 :
1534 : /* Check if a folio is dirty or under writeback */
1535 0 : static void folio_check_dirty_writeback(struct folio *folio,
1536 : bool *dirty, bool *writeback)
1537 : {
1538 : struct address_space *mapping;
1539 :
1540 : /*
1541 : * Anonymous folios are not handled by flushers and must be written
1542 : * from reclaim context. Do not stall reclaim based on them.
1543 : * MADV_FREE anonymous folios are put into inactive file list too.
1544 : * They could be mistakenly treated as file lru. So further anon
1545 : * test is needed.
1546 : */
1547 0 : if (!folio_is_file_lru(folio) ||
1548 0 : (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
1549 0 : *dirty = false;
1550 0 : *writeback = false;
1551 0 : return;
1552 : }
1553 :
1554 : /* By default assume that the folio flags are accurate */
1555 0 : *dirty = folio_test_dirty(folio);
1556 0 : *writeback = folio_test_writeback(folio);
1557 :
1558 : /* Verify dirty/writeback state if the filesystem supports it */
1559 0 : if (!folio_test_private(folio))
1560 : return;
1561 :
1562 0 : mapping = folio_mapping(folio);
1563 0 : if (mapping && mapping->a_ops->is_dirty_writeback)
1564 0 : mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
1565 : }
1566 :
1567 : static struct page *alloc_demote_page(struct page *page, unsigned long private)
1568 : {
1569 : struct page *target_page;
1570 : nodemask_t *allowed_mask;
1571 : struct migration_target_control *mtc;
1572 :
1573 : mtc = (struct migration_target_control *)private;
1574 :
1575 : allowed_mask = mtc->nmask;
1576 : /*
1577 : * make sure we allocate from the target node first also trying to
1578 : * demote or reclaim pages from the target node via kswapd if we are
1579 : * low on free memory on target node. If we don't do this and if
1580 : * we have free memory on the slower(lower) memtier, we would start
1581 : * allocating pages from slower(lower) memory tiers without even forcing
1582 : * a demotion of cold pages from the target memtier. This can result
1583 : * in the kernel placing hot pages in slower(lower) memory tiers.
1584 : */
1585 : mtc->nmask = NULL;
1586 : mtc->gfp_mask |= __GFP_THISNODE;
1587 : target_page = alloc_migration_target(page, (unsigned long)mtc);
1588 : if (target_page)
1589 : return target_page;
1590 :
1591 : mtc->gfp_mask &= ~__GFP_THISNODE;
1592 : mtc->nmask = allowed_mask;
1593 :
1594 : return alloc_migration_target(page, (unsigned long)mtc);
1595 : }
1596 :
1597 : /*
1598 : * Take folios on @demote_folios and attempt to demote them to another node.
1599 : * Folios which are not demoted are left on @demote_folios.
1600 : */
1601 : static unsigned int demote_folio_list(struct list_head *demote_folios,
1602 : struct pglist_data *pgdat)
1603 : {
1604 0 : int target_nid = next_demotion_node(pgdat->node_id);
1605 : unsigned int nr_succeeded;
1606 : nodemask_t allowed_mask;
1607 :
1608 0 : struct migration_target_control mtc = {
1609 : /*
1610 : * Allocate from 'node', or fail quickly and quietly.
1611 : * When this happens, 'page' will likely just be discarded
1612 : * instead of migrated.
1613 : */
1614 : .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
1615 : __GFP_NOMEMALLOC | GFP_NOWAIT,
1616 : .nid = target_nid,
1617 : .nmask = &allowed_mask
1618 : };
1619 :
1620 0 : if (list_empty(demote_folios))
1621 : return 0;
1622 :
1623 : if (target_nid == NUMA_NO_NODE)
1624 : return 0;
1625 :
1626 : node_get_allowed_targets(pgdat, &allowed_mask);
1627 :
1628 : /* Demotion ignores all cpuset and mempolicy settings */
1629 : migrate_pages(demote_folios, alloc_demote_page, NULL,
1630 : (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
1631 : &nr_succeeded);
1632 :
1633 : __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
1634 :
1635 : return nr_succeeded;
1636 : }
1637 :
1638 0 : static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
1639 : {
1640 0 : if (gfp_mask & __GFP_FS)
1641 : return true;
1642 0 : if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
1643 : return false;
1644 : /*
1645 : * We can "enter_fs" for swap-cache with only __GFP_IO
1646 : * providing this isn't SWP_FS_OPS.
1647 : * ->flags can be updated non-atomicially (scan_swap_map_slots),
1648 : * but that will never affect SWP_FS_OPS, so the data_race
1649 : * is safe.
1650 : */
1651 0 : return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
1652 : }
1653 :
1654 : /*
1655 : * shrink_folio_list() returns the number of reclaimed pages
1656 : */
1657 0 : static unsigned int shrink_folio_list(struct list_head *folio_list,
1658 : struct pglist_data *pgdat, struct scan_control *sc,
1659 : struct reclaim_stat *stat, bool ignore_references)
1660 : {
1661 0 : LIST_HEAD(ret_folios);
1662 0 : LIST_HEAD(free_folios);
1663 0 : LIST_HEAD(demote_folios);
1664 0 : unsigned int nr_reclaimed = 0;
1665 0 : unsigned int pgactivate = 0;
1666 : bool do_demote_pass;
1667 0 : struct swap_iocb *plug = NULL;
1668 :
1669 0 : memset(stat, 0, sizeof(*stat));
1670 0 : cond_resched();
1671 0 : do_demote_pass = can_demote(pgdat->node_id, sc);
1672 :
1673 : retry:
1674 0 : while (!list_empty(folio_list)) {
1675 : struct address_space *mapping;
1676 : struct folio *folio;
1677 0 : enum folio_references references = FOLIOREF_RECLAIM;
1678 : bool dirty, writeback;
1679 : unsigned int nr_pages;
1680 :
1681 0 : cond_resched();
1682 :
1683 0 : folio = lru_to_folio(folio_list);
1684 0 : list_del(&folio->lru);
1685 :
1686 0 : if (!folio_trylock(folio))
1687 : goto keep;
1688 :
1689 : VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1690 :
1691 0 : nr_pages = folio_nr_pages(folio);
1692 :
1693 : /* Account the number of base pages */
1694 0 : sc->nr_scanned += nr_pages;
1695 :
1696 0 : if (unlikely(!folio_evictable(folio)))
1697 : goto activate_locked;
1698 :
1699 0 : if (!sc->may_unmap && folio_mapped(folio))
1700 : goto keep_locked;
1701 :
1702 : /* folio_update_gen() tried to promote this page? */
1703 : if (lru_gen_enabled() && !ignore_references &&
1704 : folio_mapped(folio) && folio_test_referenced(folio))
1705 : goto keep_locked;
1706 :
1707 : /*
1708 : * The number of dirty pages determines if a node is marked
1709 : * reclaim_congested. kswapd will stall and start writing
1710 : * folios if the tail of the LRU is all dirty unqueued folios.
1711 : */
1712 0 : folio_check_dirty_writeback(folio, &dirty, &writeback);
1713 0 : if (dirty || writeback)
1714 0 : stat->nr_dirty += nr_pages;
1715 :
1716 0 : if (dirty && !writeback)
1717 0 : stat->nr_unqueued_dirty += nr_pages;
1718 :
1719 : /*
1720 : * Treat this folio as congested if folios are cycling
1721 : * through the LRU so quickly that the folios marked
1722 : * for immediate reclaim are making it to the end of
1723 : * the LRU a second time.
1724 : */
1725 0 : if (writeback && folio_test_reclaim(folio))
1726 0 : stat->nr_congested += nr_pages;
1727 :
1728 : /*
1729 : * If a folio at the tail of the LRU is under writeback, there
1730 : * are three cases to consider.
1731 : *
1732 : * 1) If reclaim is encountering an excessive number
1733 : * of folios under writeback and this folio has both
1734 : * the writeback and reclaim flags set, then it
1735 : * indicates that folios are being queued for I/O but
1736 : * are being recycled through the LRU before the I/O
1737 : * can complete. Waiting on the folio itself risks an
1738 : * indefinite stall if it is impossible to writeback
1739 : * the folio due to I/O error or disconnected storage
1740 : * so instead note that the LRU is being scanned too
1741 : * quickly and the caller can stall after the folio
1742 : * list has been processed.
1743 : *
1744 : * 2) Global or new memcg reclaim encounters a folio that is
1745 : * not marked for immediate reclaim, or the caller does not
1746 : * have __GFP_FS (or __GFP_IO if it's simply going to swap,
1747 : * not to fs). In this case mark the folio for immediate
1748 : * reclaim and continue scanning.
1749 : *
1750 : * Require may_enter_fs() because we would wait on fs, which
1751 : * may not have submitted I/O yet. And the loop driver might
1752 : * enter reclaim, and deadlock if it waits on a folio for
1753 : * which it is needed to do the write (loop masks off
1754 : * __GFP_IO|__GFP_FS for this reason); but more thought
1755 : * would probably show more reasons.
1756 : *
1757 : * 3) Legacy memcg encounters a folio that already has the
1758 : * reclaim flag set. memcg does not have any dirty folio
1759 : * throttling so we could easily OOM just because too many
1760 : * folios are in writeback and there is nothing else to
1761 : * reclaim. Wait for the writeback to complete.
1762 : *
1763 : * In cases 1) and 2) we activate the folios to get them out of
1764 : * the way while we continue scanning for clean folios on the
1765 : * inactive list and refilling from the active list. The
1766 : * observation here is that waiting for disk writes is more
1767 : * expensive than potentially causing reloads down the line.
1768 : * Since they're marked for immediate reclaim, they won't put
1769 : * memory pressure on the cache working set any longer than it
1770 : * takes to write them to disk.
1771 : */
1772 0 : if (folio_test_writeback(folio)) {
1773 : /* Case 1 above */
1774 0 : if (current_is_kswapd() &&
1775 0 : folio_test_reclaim(folio) &&
1776 0 : test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1777 0 : stat->nr_immediate += nr_pages;
1778 0 : goto activate_locked;
1779 :
1780 : /* Case 2 above */
1781 0 : } else if (writeback_throttling_sane(sc) ||
1782 : !folio_test_reclaim(folio) ||
1783 : !may_enter_fs(folio, sc->gfp_mask)) {
1784 : /*
1785 : * This is slightly racy -
1786 : * folio_end_writeback() might have
1787 : * just cleared the reclaim flag, then
1788 : * setting the reclaim flag here ends up
1789 : * interpreted as the readahead flag - but
1790 : * that does not matter enough to care.
1791 : * What we do want is for this folio to
1792 : * have the reclaim flag set next time
1793 : * memcg reclaim reaches the tests above,
1794 : * so it will then wait for writeback to
1795 : * avoid OOM; and it's also appropriate
1796 : * in global reclaim.
1797 : */
1798 0 : folio_set_reclaim(folio);
1799 0 : stat->nr_writeback += nr_pages;
1800 0 : goto activate_locked;
1801 :
1802 : /* Case 3 above */
1803 : } else {
1804 : folio_unlock(folio);
1805 : folio_wait_writeback(folio);
1806 : /* then go back and try same folio again */
1807 : list_add_tail(&folio->lru, folio_list);
1808 0 : continue;
1809 : }
1810 : }
1811 :
1812 0 : if (!ignore_references)
1813 0 : references = folio_check_references(folio, sc);
1814 :
1815 0 : switch (references) {
1816 : case FOLIOREF_ACTIVATE:
1817 : goto activate_locked;
1818 : case FOLIOREF_KEEP:
1819 0 : stat->nr_ref_keep += nr_pages;
1820 0 : goto keep_locked;
1821 : case FOLIOREF_RECLAIM:
1822 : case FOLIOREF_RECLAIM_CLEAN:
1823 : ; /* try to reclaim the folio below */
1824 : }
1825 :
1826 : /*
1827 : * Before reclaiming the folio, try to relocate
1828 : * its contents to another node.
1829 : */
1830 : if (do_demote_pass &&
1831 : (thp_migration_supported() || !folio_test_large(folio))) {
1832 : list_add(&folio->lru, &demote_folios);
1833 : folio_unlock(folio);
1834 : continue;
1835 : }
1836 :
1837 : /*
1838 : * Anonymous process memory has backing store?
1839 : * Try to allocate it some swap space here.
1840 : * Lazyfree folio could be freed directly
1841 : */
1842 0 : if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
1843 0 : if (!folio_test_swapcache(folio)) {
1844 0 : if (!(sc->gfp_mask & __GFP_IO))
1845 : goto keep_locked;
1846 0 : if (folio_maybe_dma_pinned(folio))
1847 : goto keep_locked;
1848 0 : if (folio_test_large(folio)) {
1849 : /* cannot split folio, skip it */
1850 : if (!can_split_folio(folio, NULL))
1851 : goto activate_locked;
1852 : /*
1853 : * Split folios without a PMD map right
1854 : * away. Chances are some or all of the
1855 : * tail pages can be freed without IO.
1856 : */
1857 : if (!folio_entire_mapcount(folio) &&
1858 : split_folio_to_list(folio,
1859 : folio_list))
1860 : goto activate_locked;
1861 : }
1862 0 : if (!add_to_swap(folio)) {
1863 0 : if (!folio_test_large(folio))
1864 : goto activate_locked_split;
1865 : /* Fallback to swap normal pages */
1866 0 : if (split_folio_to_list(folio,
1867 : folio_list))
1868 : goto activate_locked;
1869 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1870 : count_vm_event(THP_SWPOUT_FALLBACK);
1871 : #endif
1872 0 : if (!add_to_swap(folio))
1873 : goto activate_locked_split;
1874 : }
1875 : }
1876 0 : } else if (folio_test_swapbacked(folio) &&
1877 0 : folio_test_large(folio)) {
1878 : /* Split shmem folio */
1879 : if (split_folio_to_list(folio, folio_list))
1880 : goto keep_locked;
1881 : }
1882 :
1883 : /*
1884 : * If the folio was split above, the tail pages will make
1885 : * their own pass through this function and be accounted
1886 : * then.
1887 : */
1888 0 : if ((nr_pages > 1) && !folio_test_large(folio)) {
1889 0 : sc->nr_scanned -= (nr_pages - 1);
1890 0 : nr_pages = 1;
1891 : }
1892 :
1893 : /*
1894 : * The folio is mapped into the page tables of one or more
1895 : * processes. Try to unmap it here.
1896 : */
1897 0 : if (folio_mapped(folio)) {
1898 0 : enum ttu_flags flags = TTU_BATCH_FLUSH;
1899 0 : bool was_swapbacked = folio_test_swapbacked(folio);
1900 :
1901 : if (folio_test_pmd_mappable(folio))
1902 : flags |= TTU_SPLIT_HUGE_PMD;
1903 :
1904 0 : try_to_unmap(folio, flags);
1905 0 : if (folio_mapped(folio)) {
1906 0 : stat->nr_unmap_fail += nr_pages;
1907 0 : if (!was_swapbacked &&
1908 0 : folio_test_swapbacked(folio))
1909 0 : stat->nr_lazyfree_fail += nr_pages;
1910 : goto activate_locked;
1911 : }
1912 : }
1913 :
1914 0 : mapping = folio_mapping(folio);
1915 0 : if (folio_test_dirty(folio)) {
1916 : /*
1917 : * Only kswapd can writeback filesystem folios
1918 : * to avoid risk of stack overflow. But avoid
1919 : * injecting inefficient single-folio I/O into
1920 : * flusher writeback as much as possible: only
1921 : * write folios when we've encountered many
1922 : * dirty folios, and when we've already scanned
1923 : * the rest of the LRU for clean folios and see
1924 : * the same dirty folios again (with the reclaim
1925 : * flag set).
1926 : */
1927 0 : if (folio_is_file_lru(folio) &&
1928 0 : (!current_is_kswapd() ||
1929 0 : !folio_test_reclaim(folio) ||
1930 0 : !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1931 : /*
1932 : * Immediately reclaim when written back.
1933 : * Similar in principle to folio_deactivate()
1934 : * except we already have the folio isolated
1935 : * and know it's dirty
1936 : */
1937 0 : node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
1938 : nr_pages);
1939 : folio_set_reclaim(folio);
1940 :
1941 : goto activate_locked;
1942 : }
1943 :
1944 0 : if (references == FOLIOREF_RECLAIM_CLEAN)
1945 : goto keep_locked;
1946 0 : if (!may_enter_fs(folio, sc->gfp_mask))
1947 : goto keep_locked;
1948 0 : if (!sc->may_writepage)
1949 : goto keep_locked;
1950 :
1951 : /*
1952 : * Folio is dirty. Flush the TLB if a writable entry
1953 : * potentially exists to avoid CPU writes after I/O
1954 : * starts and then write it out here.
1955 : */
1956 : try_to_unmap_flush_dirty();
1957 0 : switch (pageout(folio, mapping, &plug)) {
1958 : case PAGE_KEEP:
1959 : goto keep_locked;
1960 : case PAGE_ACTIVATE:
1961 : goto activate_locked;
1962 : case PAGE_SUCCESS:
1963 0 : stat->nr_pageout += nr_pages;
1964 :
1965 0 : if (folio_test_writeback(folio))
1966 : goto keep;
1967 0 : if (folio_test_dirty(folio))
1968 : goto keep;
1969 :
1970 : /*
1971 : * A synchronous write - probably a ramdisk. Go
1972 : * ahead and try to reclaim the folio.
1973 : */
1974 0 : if (!folio_trylock(folio))
1975 : goto keep;
1976 0 : if (folio_test_dirty(folio) ||
1977 0 : folio_test_writeback(folio))
1978 : goto keep_locked;
1979 0 : mapping = folio_mapping(folio);
1980 : fallthrough;
1981 : case PAGE_CLEAN:
1982 : ; /* try to free the folio below */
1983 : }
1984 : }
1985 :
1986 : /*
1987 : * If the folio has buffers, try to free the buffer
1988 : * mappings associated with this folio. If we succeed
1989 : * we try to free the folio as well.
1990 : *
1991 : * We do this even if the folio is dirty.
1992 : * filemap_release_folio() does not perform I/O, but it
1993 : * is possible for a folio to have the dirty flag set,
1994 : * but it is actually clean (all its buffers are clean).
1995 : * This happens if the buffers were written out directly,
1996 : * with submit_bh(). ext3 will do this, as well as
1997 : * the blockdev mapping. filemap_release_folio() will
1998 : * discover that cleanness and will drop the buffers
1999 : * and mark the folio clean - it can be freed.
2000 : *
2001 : * Rarely, folios can have buffers and no ->mapping.
2002 : * These are the folios which were not successfully
2003 : * invalidated in truncate_cleanup_folio(). We try to
2004 : * drop those buffers here and if that worked, and the
2005 : * folio is no longer mapped into process address space
2006 : * (refcount == 1) it can be freed. Otherwise, leave
2007 : * the folio on the LRU so it is swappable.
2008 : */
2009 0 : if (folio_has_private(folio)) {
2010 0 : if (!filemap_release_folio(folio, sc->gfp_mask))
2011 : goto activate_locked;
2012 0 : if (!mapping && folio_ref_count(folio) == 1) {
2013 0 : folio_unlock(folio);
2014 0 : if (folio_put_testzero(folio))
2015 : goto free_it;
2016 : else {
2017 : /*
2018 : * rare race with speculative reference.
2019 : * the speculative reference will free
2020 : * this folio shortly, so we may
2021 : * increment nr_reclaimed here (and
2022 : * leave it off the LRU).
2023 : */
2024 0 : nr_reclaimed += nr_pages;
2025 0 : continue;
2026 : }
2027 : }
2028 : }
2029 :
2030 0 : if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
2031 : /* follow __remove_mapping for reference */
2032 0 : if (!folio_ref_freeze(folio, 1))
2033 : goto keep_locked;
2034 : /*
2035 : * The folio has only one reference left, which is
2036 : * from the isolation. After the caller puts the
2037 : * folio back on the lru and drops the reference, the
2038 : * folio will be freed anyway. It doesn't matter
2039 : * which lru it goes on. So we don't bother checking
2040 : * the dirty flag here.
2041 : */
2042 0 : count_vm_events(PGLAZYFREED, nr_pages);
2043 0 : count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
2044 0 : } else if (!mapping || !__remove_mapping(mapping, folio, true,
2045 : sc->target_mem_cgroup))
2046 : goto keep_locked;
2047 :
2048 0 : folio_unlock(folio);
2049 : free_it:
2050 : /*
2051 : * Folio may get swapped out as a whole, need to account
2052 : * all pages in it.
2053 : */
2054 0 : nr_reclaimed += nr_pages;
2055 :
2056 : /*
2057 : * Is there need to periodically free_folio_list? It would
2058 : * appear not as the counts should be low
2059 : */
2060 0 : if (unlikely(folio_test_large(folio)))
2061 0 : destroy_large_folio(folio);
2062 : else
2063 0 : list_add(&folio->lru, &free_folios);
2064 0 : continue;
2065 :
2066 : activate_locked_split:
2067 : /*
2068 : * The tail pages that are failed to add into swap cache
2069 : * reach here. Fixup nr_scanned and nr_pages.
2070 : */
2071 0 : if (nr_pages > 1) {
2072 0 : sc->nr_scanned -= (nr_pages - 1);
2073 0 : nr_pages = 1;
2074 : }
2075 : activate_locked:
2076 : /* Not a candidate for swapping, so reclaim swap space. */
2077 0 : if (folio_test_swapcache(folio) &&
2078 0 : (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
2079 0 : folio_free_swap(folio);
2080 : VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
2081 0 : if (!folio_test_mlocked(folio)) {
2082 0 : int type = folio_is_file_lru(folio);
2083 0 : folio_set_active(folio);
2084 0 : stat->nr_activate[type] += nr_pages;
2085 0 : count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
2086 : }
2087 : keep_locked:
2088 0 : folio_unlock(folio);
2089 : keep:
2090 0 : list_add(&folio->lru, &ret_folios);
2091 : VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
2092 : folio_test_unevictable(folio), folio);
2093 : }
2094 : /* 'folio_list' is always empty here */
2095 :
2096 : /* Migrate folios selected for demotion */
2097 0 : nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
2098 : /* Folios that could not be demoted are still in @demote_folios */
2099 0 : if (!list_empty(&demote_folios)) {
2100 : /* Folios which weren't demoted go back on @folio_list */
2101 0 : list_splice_init(&demote_folios, folio_list);
2102 :
2103 : /*
2104 : * goto retry to reclaim the undemoted folios in folio_list if
2105 : * desired.
2106 : *
2107 : * Reclaiming directly from top tier nodes is not often desired
2108 : * due to it breaking the LRU ordering: in general memory
2109 : * should be reclaimed from lower tier nodes and demoted from
2110 : * top tier nodes.
2111 : *
2112 : * However, disabling reclaim from top tier nodes entirely
2113 : * would cause ooms in edge scenarios where lower tier memory
2114 : * is unreclaimable for whatever reason, eg memory being
2115 : * mlocked or too hot to reclaim. We can disable reclaim
2116 : * from top tier nodes in proactive reclaim though as that is
2117 : * not real memory pressure.
2118 : */
2119 0 : if (!sc->proactive) {
2120 : do_demote_pass = false;
2121 : goto retry;
2122 : }
2123 : }
2124 :
2125 0 : pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
2126 :
2127 0 : mem_cgroup_uncharge_list(&free_folios);
2128 : try_to_unmap_flush();
2129 0 : free_unref_page_list(&free_folios);
2130 :
2131 0 : list_splice(&ret_folios, folio_list);
2132 0 : count_vm_events(PGACTIVATE, pgactivate);
2133 :
2134 0 : if (plug)
2135 0 : swap_write_unplug(plug);
2136 0 : return nr_reclaimed;
2137 : }
2138 :
2139 0 : unsigned int reclaim_clean_pages_from_list(struct zone *zone,
2140 : struct list_head *folio_list)
2141 : {
2142 0 : struct scan_control sc = {
2143 : .gfp_mask = GFP_KERNEL,
2144 : .may_unmap = 1,
2145 : };
2146 : struct reclaim_stat stat;
2147 : unsigned int nr_reclaimed;
2148 : struct folio *folio, *next;
2149 0 : LIST_HEAD(clean_folios);
2150 : unsigned int noreclaim_flag;
2151 :
2152 0 : list_for_each_entry_safe(folio, next, folio_list, lru) {
2153 0 : if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
2154 0 : !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
2155 0 : !folio_test_unevictable(folio)) {
2156 0 : folio_clear_active(folio);
2157 0 : list_move(&folio->lru, &clean_folios);
2158 : }
2159 : }
2160 :
2161 : /*
2162 : * We should be safe here since we are only dealing with file pages and
2163 : * we are not kswapd and therefore cannot write dirty file pages. But
2164 : * call memalloc_noreclaim_save() anyway, just in case these conditions
2165 : * change in the future.
2166 : */
2167 0 : noreclaim_flag = memalloc_noreclaim_save();
2168 0 : nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
2169 : &stat, true);
2170 0 : memalloc_noreclaim_restore(noreclaim_flag);
2171 :
2172 0 : list_splice(&clean_folios, folio_list);
2173 0 : mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
2174 : -(long)nr_reclaimed);
2175 : /*
2176 : * Since lazyfree pages are isolated from file LRU from the beginning,
2177 : * they will rotate back to anonymous LRU in the end if it failed to
2178 : * discard so isolated count will be mismatched.
2179 : * Compensate the isolated count for both LRU lists.
2180 : */
2181 0 : mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
2182 0 : stat.nr_lazyfree_fail);
2183 0 : mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
2184 0 : -(long)stat.nr_lazyfree_fail);
2185 0 : return nr_reclaimed;
2186 : }
2187 :
2188 : /*
2189 : * Update LRU sizes after isolating pages. The LRU size updates must
2190 : * be complete before mem_cgroup_update_lru_size due to a sanity check.
2191 : */
2192 : static __always_inline void update_lru_sizes(struct lruvec *lruvec,
2193 : enum lru_list lru, unsigned long *nr_zone_taken)
2194 : {
2195 : int zid;
2196 :
2197 0 : for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2198 0 : if (!nr_zone_taken[zid])
2199 0 : continue;
2200 :
2201 0 : update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
2202 : }
2203 :
2204 : }
2205 :
2206 : /*
2207 : * Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
2208 : *
2209 : * lruvec->lru_lock is heavily contended. Some of the functions that
2210 : * shrink the lists perform better by taking out a batch of pages
2211 : * and working on them outside the LRU lock.
2212 : *
2213 : * For pagecache intensive workloads, this function is the hottest
2214 : * spot in the kernel (apart from copy_*_user functions).
2215 : *
2216 : * Lru_lock must be held before calling this function.
2217 : *
2218 : * @nr_to_scan: The number of eligible pages to look through on the list.
2219 : * @lruvec: The LRU vector to pull pages from.
2220 : * @dst: The temp list to put pages on to.
2221 : * @nr_scanned: The number of pages that were scanned.
2222 : * @sc: The scan_control struct for this reclaim session
2223 : * @lru: LRU list id for isolating
2224 : *
2225 : * returns how many pages were moved onto *@dst.
2226 : */
2227 0 : static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
2228 : struct lruvec *lruvec, struct list_head *dst,
2229 : unsigned long *nr_scanned, struct scan_control *sc,
2230 : enum lru_list lru)
2231 : {
2232 0 : struct list_head *src = &lruvec->lists[lru];
2233 0 : unsigned long nr_taken = 0;
2234 0 : unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
2235 0 : unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
2236 0 : unsigned long skipped = 0;
2237 : unsigned long scan, total_scan, nr_pages;
2238 0 : LIST_HEAD(folios_skipped);
2239 :
2240 0 : total_scan = 0;
2241 0 : scan = 0;
2242 0 : while (scan < nr_to_scan && !list_empty(src)) {
2243 0 : struct list_head *move_to = src;
2244 : struct folio *folio;
2245 :
2246 0 : folio = lru_to_folio(src);
2247 : prefetchw_prev_lru_folio(folio, src, flags);
2248 :
2249 0 : nr_pages = folio_nr_pages(folio);
2250 0 : total_scan += nr_pages;
2251 :
2252 0 : if (folio_zonenum(folio) > sc->reclaim_idx) {
2253 0 : nr_skipped[folio_zonenum(folio)] += nr_pages;
2254 0 : move_to = &folios_skipped;
2255 0 : goto move;
2256 : }
2257 :
2258 : /*
2259 : * Do not count skipped folios because that makes the function
2260 : * return with no isolated folios if the LRU mostly contains
2261 : * ineligible folios. This causes the VM to not reclaim any
2262 : * folios, triggering a premature OOM.
2263 : * Account all pages in a folio.
2264 : */
2265 0 : scan += nr_pages;
2266 :
2267 0 : if (!folio_test_lru(folio))
2268 : goto move;
2269 0 : if (!sc->may_unmap && folio_mapped(folio))
2270 : goto move;
2271 :
2272 : /*
2273 : * Be careful not to clear the lru flag until after we're
2274 : * sure the folio is not being freed elsewhere -- the
2275 : * folio release code relies on it.
2276 : */
2277 0 : if (unlikely(!folio_try_get(folio)))
2278 : goto move;
2279 :
2280 0 : if (!folio_test_clear_lru(folio)) {
2281 : /* Another thread is already isolating this folio */
2282 : folio_put(folio);
2283 : goto move;
2284 : }
2285 :
2286 0 : nr_taken += nr_pages;
2287 0 : nr_zone_taken[folio_zonenum(folio)] += nr_pages;
2288 0 : move_to = dst;
2289 : move:
2290 0 : list_move(&folio->lru, move_to);
2291 : }
2292 :
2293 : /*
2294 : * Splice any skipped folios to the start of the LRU list. Note that
2295 : * this disrupts the LRU order when reclaiming for lower zones but
2296 : * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
2297 : * scanning would soon rescan the same folios to skip and waste lots
2298 : * of cpu cycles.
2299 : */
2300 0 : if (!list_empty(&folios_skipped)) {
2301 : int zid;
2302 :
2303 : list_splice(&folios_skipped, src);
2304 0 : for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2305 0 : if (!nr_skipped[zid])
2306 0 : continue;
2307 :
2308 0 : __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
2309 0 : skipped += nr_skipped[zid];
2310 : }
2311 : }
2312 0 : *nr_scanned = total_scan;
2313 0 : trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
2314 : total_scan, skipped, nr_taken,
2315 : sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
2316 0 : update_lru_sizes(lruvec, lru, nr_zone_taken);
2317 0 : return nr_taken;
2318 : }
2319 :
2320 : /**
2321 : * folio_isolate_lru() - Try to isolate a folio from its LRU list.
2322 : * @folio: Folio to isolate from its LRU list.
2323 : *
2324 : * Isolate a @folio from an LRU list and adjust the vmstat statistic
2325 : * corresponding to whatever LRU list the folio was on.
2326 : *
2327 : * The folio will have its LRU flag cleared. If it was found on the
2328 : * active list, it will have the Active flag set. If it was found on the
2329 : * unevictable list, it will have the Unevictable flag set. These flags
2330 : * may need to be cleared by the caller before letting the page go.
2331 : *
2332 : * Context:
2333 : *
2334 : * (1) Must be called with an elevated refcount on the folio. This is a
2335 : * fundamental difference from isolate_lru_folios() (which is called
2336 : * without a stable reference).
2337 : * (2) The lru_lock must not be held.
2338 : * (3) Interrupts must be enabled.
2339 : *
2340 : * Return: true if the folio was removed from an LRU list.
2341 : * false if the folio was not on an LRU list.
2342 : */
2343 0 : bool folio_isolate_lru(struct folio *folio)
2344 : {
2345 0 : bool ret = false;
2346 :
2347 : VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
2348 :
2349 0 : if (folio_test_clear_lru(folio)) {
2350 : struct lruvec *lruvec;
2351 :
2352 0 : folio_get(folio);
2353 0 : lruvec = folio_lruvec_lock_irq(folio);
2354 0 : lruvec_del_folio(lruvec, folio);
2355 : unlock_page_lruvec_irq(lruvec);
2356 0 : ret = true;
2357 : }
2358 :
2359 0 : return ret;
2360 : }
2361 :
2362 : /*
2363 : * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
2364 : * then get rescheduled. When there are massive number of tasks doing page
2365 : * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
2366 : * the LRU list will go small and be scanned faster than necessary, leading to
2367 : * unnecessary swapping, thrashing and OOM.
2368 : */
2369 0 : static int too_many_isolated(struct pglist_data *pgdat, int file,
2370 : struct scan_control *sc)
2371 : {
2372 : unsigned long inactive, isolated;
2373 : bool too_many;
2374 :
2375 0 : if (current_is_kswapd())
2376 : return 0;
2377 :
2378 0 : if (!writeback_throttling_sane(sc))
2379 : return 0;
2380 :
2381 0 : if (file) {
2382 0 : inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
2383 0 : isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
2384 : } else {
2385 0 : inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
2386 0 : isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
2387 : }
2388 :
2389 : /*
2390 : * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
2391 : * won't get blocked by normal direct-reclaimers, forming a circular
2392 : * deadlock.
2393 : */
2394 0 : if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
2395 0 : inactive >>= 3;
2396 :
2397 0 : too_many = isolated > inactive;
2398 :
2399 : /* Wake up tasks throttled due to too_many_isolated. */
2400 0 : if (!too_many)
2401 : wake_throttle_isolated(pgdat);
2402 :
2403 0 : return too_many;
2404 : }
2405 :
2406 : /*
2407 : * move_folios_to_lru() moves folios from private @list to appropriate LRU list.
2408 : * On return, @list is reused as a list of folios to be freed by the caller.
2409 : *
2410 : * Returns the number of pages moved to the given lruvec.
2411 : */
2412 0 : static unsigned int move_folios_to_lru(struct lruvec *lruvec,
2413 : struct list_head *list)
2414 : {
2415 0 : int nr_pages, nr_moved = 0;
2416 0 : LIST_HEAD(folios_to_free);
2417 :
2418 0 : while (!list_empty(list)) {
2419 0 : struct folio *folio = lru_to_folio(list);
2420 :
2421 : VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
2422 0 : list_del(&folio->lru);
2423 0 : if (unlikely(!folio_evictable(folio))) {
2424 0 : spin_unlock_irq(&lruvec->lru_lock);
2425 0 : folio_putback_lru(folio);
2426 0 : spin_lock_irq(&lruvec->lru_lock);
2427 0 : continue;
2428 : }
2429 :
2430 : /*
2431 : * The folio_set_lru needs to be kept here for list integrity.
2432 : * Otherwise:
2433 : * #0 move_folios_to_lru #1 release_pages
2434 : * if (!folio_put_testzero())
2435 : * if (folio_put_testzero())
2436 : * !lru //skip lru_lock
2437 : * folio_set_lru()
2438 : * list_add(&folio->lru,)
2439 : * list_add(&folio->lru,)
2440 : */
2441 0 : folio_set_lru(folio);
2442 :
2443 0 : if (unlikely(folio_put_testzero(folio))) {
2444 0 : __folio_clear_lru_flags(folio);
2445 :
2446 0 : if (unlikely(folio_test_large(folio))) {
2447 0 : spin_unlock_irq(&lruvec->lru_lock);
2448 0 : destroy_large_folio(folio);
2449 0 : spin_lock_irq(&lruvec->lru_lock);
2450 : } else
2451 0 : list_add(&folio->lru, &folios_to_free);
2452 :
2453 0 : continue;
2454 : }
2455 :
2456 : /*
2457 : * All pages were isolated from the same lruvec (and isolation
2458 : * inhibits memcg migration).
2459 : */
2460 : VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
2461 0 : lruvec_add_folio(lruvec, folio);
2462 0 : nr_pages = folio_nr_pages(folio);
2463 0 : nr_moved += nr_pages;
2464 0 : if (folio_test_active(folio))
2465 0 : workingset_age_nonresident(lruvec, nr_pages);
2466 : }
2467 :
2468 : /*
2469 : * To save our caller's stack, now use input list for pages to free.
2470 : */
2471 0 : list_splice(&folios_to_free, list);
2472 :
2473 0 : return nr_moved;
2474 : }
2475 :
2476 : /*
2477 : * If a kernel thread (such as nfsd for loop-back mounts) services a backing
2478 : * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
2479 : * we should not throttle. Otherwise it is safe to do so.
2480 : */
2481 : static int current_may_throttle(void)
2482 : {
2483 0 : return !(current->flags & PF_LOCAL_THROTTLE);
2484 : }
2485 :
2486 : /*
2487 : * shrink_inactive_list() is a helper for shrink_node(). It returns the number
2488 : * of reclaimed pages
2489 : */
2490 0 : static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
2491 : struct lruvec *lruvec, struct scan_control *sc,
2492 : enum lru_list lru)
2493 : {
2494 0 : LIST_HEAD(folio_list);
2495 : unsigned long nr_scanned;
2496 0 : unsigned int nr_reclaimed = 0;
2497 : unsigned long nr_taken;
2498 : struct reclaim_stat stat;
2499 0 : bool file = is_file_lru(lru);
2500 : enum vm_event_item item;
2501 0 : struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2502 0 : bool stalled = false;
2503 :
2504 0 : while (unlikely(too_many_isolated(pgdat, file, sc))) {
2505 0 : if (stalled)
2506 : return 0;
2507 :
2508 : /* wait a bit for the reclaimer. */
2509 0 : stalled = true;
2510 0 : reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
2511 :
2512 : /* We are about to die and free our memory. Return now. */
2513 0 : if (fatal_signal_pending(current))
2514 : return SWAP_CLUSTER_MAX;
2515 : }
2516 :
2517 0 : lru_add_drain();
2518 :
2519 0 : spin_lock_irq(&lruvec->lru_lock);
2520 :
2521 0 : nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
2522 : &nr_scanned, sc, lru);
2523 :
2524 0 : __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2525 0 : item = PGSCAN_KSWAPD + reclaimer_offset();
2526 0 : if (!cgroup_reclaim(sc))
2527 0 : __count_vm_events(item, nr_scanned);
2528 0 : __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
2529 0 : __count_vm_events(PGSCAN_ANON + file, nr_scanned);
2530 :
2531 0 : spin_unlock_irq(&lruvec->lru_lock);
2532 :
2533 0 : if (nr_taken == 0)
2534 : return 0;
2535 :
2536 0 : nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
2537 :
2538 0 : spin_lock_irq(&lruvec->lru_lock);
2539 0 : move_folios_to_lru(lruvec, &folio_list);
2540 :
2541 0 : __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2542 0 : item = PGSTEAL_KSWAPD + reclaimer_offset();
2543 0 : if (!cgroup_reclaim(sc))
2544 0 : __count_vm_events(item, nr_reclaimed);
2545 0 : __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
2546 0 : __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
2547 0 : spin_unlock_irq(&lruvec->lru_lock);
2548 :
2549 0 : lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
2550 0 : mem_cgroup_uncharge_list(&folio_list);
2551 0 : free_unref_page_list(&folio_list);
2552 :
2553 : /*
2554 : * If dirty folios are scanned that are not queued for IO, it
2555 : * implies that flushers are not doing their job. This can
2556 : * happen when memory pressure pushes dirty folios to the end of
2557 : * the LRU before the dirty limits are breached and the dirty
2558 : * data has expired. It can also happen when the proportion of
2559 : * dirty folios grows not through writes but through memory
2560 : * pressure reclaiming all the clean cache. And in some cases,
2561 : * the flushers simply cannot keep up with the allocation
2562 : * rate. Nudge the flusher threads in case they are asleep.
2563 : */
2564 0 : if (stat.nr_unqueued_dirty == nr_taken) {
2565 0 : wakeup_flusher_threads(WB_REASON_VMSCAN);
2566 : /*
2567 : * For cgroupv1 dirty throttling is achieved by waking up
2568 : * the kernel flusher here and later waiting on folios
2569 : * which are in writeback to finish (see shrink_folio_list()).
2570 : *
2571 : * Flusher may not be able to issue writeback quickly
2572 : * enough for cgroupv1 writeback throttling to work
2573 : * on a large system.
2574 : */
2575 0 : if (!writeback_throttling_sane(sc))
2576 : reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
2577 : }
2578 :
2579 0 : sc->nr.dirty += stat.nr_dirty;
2580 0 : sc->nr.congested += stat.nr_congested;
2581 0 : sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
2582 0 : sc->nr.writeback += stat.nr_writeback;
2583 0 : sc->nr.immediate += stat.nr_immediate;
2584 0 : sc->nr.taken += nr_taken;
2585 0 : if (file)
2586 0 : sc->nr.file_taken += nr_taken;
2587 :
2588 0 : trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2589 0 : nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2590 0 : return nr_reclaimed;
2591 : }
2592 :
2593 : /*
2594 : * shrink_active_list() moves folios from the active LRU to the inactive LRU.
2595 : *
2596 : * We move them the other way if the folio is referenced by one or more
2597 : * processes.
2598 : *
2599 : * If the folios are mostly unmapped, the processing is fast and it is
2600 : * appropriate to hold lru_lock across the whole operation. But if
2601 : * the folios are mapped, the processing is slow (folio_referenced()), so
2602 : * we should drop lru_lock around each folio. It's impossible to balance
2603 : * this, so instead we remove the folios from the LRU while processing them.
2604 : * It is safe to rely on the active flag against the non-LRU folios in here
2605 : * because nobody will play with that bit on a non-LRU folio.
2606 : *
2607 : * The downside is that we have to touch folio->_refcount against each folio.
2608 : * But we had to alter folio->flags anyway.
2609 : */
2610 0 : static void shrink_active_list(unsigned long nr_to_scan,
2611 : struct lruvec *lruvec,
2612 : struct scan_control *sc,
2613 : enum lru_list lru)
2614 : {
2615 : unsigned long nr_taken;
2616 : unsigned long nr_scanned;
2617 : unsigned long vm_flags;
2618 0 : LIST_HEAD(l_hold); /* The folios which were snipped off */
2619 0 : LIST_HEAD(l_active);
2620 0 : LIST_HEAD(l_inactive);
2621 : unsigned nr_deactivate, nr_activate;
2622 0 : unsigned nr_rotated = 0;
2623 0 : int file = is_file_lru(lru);
2624 0 : struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2625 :
2626 0 : lru_add_drain();
2627 :
2628 0 : spin_lock_irq(&lruvec->lru_lock);
2629 :
2630 0 : nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
2631 : &nr_scanned, sc, lru);
2632 :
2633 0 : __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2634 :
2635 0 : if (!cgroup_reclaim(sc))
2636 0 : __count_vm_events(PGREFILL, nr_scanned);
2637 0 : __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2638 :
2639 0 : spin_unlock_irq(&lruvec->lru_lock);
2640 :
2641 0 : while (!list_empty(&l_hold)) {
2642 : struct folio *folio;
2643 :
2644 0 : cond_resched();
2645 0 : folio = lru_to_folio(&l_hold);
2646 0 : list_del(&folio->lru);
2647 :
2648 0 : if (unlikely(!folio_evictable(folio))) {
2649 0 : folio_putback_lru(folio);
2650 0 : continue;
2651 : }
2652 :
2653 0 : if (unlikely(buffer_heads_over_limit)) {
2654 0 : if (folio_test_private(folio) && folio_trylock(folio)) {
2655 0 : if (folio_test_private(folio))
2656 0 : filemap_release_folio(folio, 0);
2657 0 : folio_unlock(folio);
2658 : }
2659 : }
2660 :
2661 : /* Referenced or rmap lock contention: rotate */
2662 0 : if (folio_referenced(folio, 0, sc->target_mem_cgroup,
2663 : &vm_flags) != 0) {
2664 : /*
2665 : * Identify referenced, file-backed active folios and
2666 : * give them one more trip around the active list. So
2667 : * that executable code get better chances to stay in
2668 : * memory under moderate memory pressure. Anon folios
2669 : * are not likely to be evicted by use-once streaming
2670 : * IO, plus JVM can create lots of anon VM_EXEC folios,
2671 : * so we ignore them here.
2672 : */
2673 0 : if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
2674 0 : nr_rotated += folio_nr_pages(folio);
2675 0 : list_add(&folio->lru, &l_active);
2676 0 : continue;
2677 : }
2678 : }
2679 :
2680 0 : folio_clear_active(folio); /* we are de-activating */
2681 0 : folio_set_workingset(folio);
2682 0 : list_add(&folio->lru, &l_inactive);
2683 : }
2684 :
2685 : /*
2686 : * Move folios back to the lru list.
2687 : */
2688 0 : spin_lock_irq(&lruvec->lru_lock);
2689 :
2690 0 : nr_activate = move_folios_to_lru(lruvec, &l_active);
2691 0 : nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
2692 : /* Keep all free folios in l_active list */
2693 0 : list_splice(&l_inactive, &l_active);
2694 :
2695 0 : __count_vm_events(PGDEACTIVATE, nr_deactivate);
2696 0 : __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2697 :
2698 0 : __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2699 0 : spin_unlock_irq(&lruvec->lru_lock);
2700 :
2701 0 : if (nr_rotated)
2702 0 : lru_note_cost(lruvec, file, 0, nr_rotated);
2703 0 : mem_cgroup_uncharge_list(&l_active);
2704 0 : free_unref_page_list(&l_active);
2705 0 : trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2706 0 : nr_deactivate, nr_rotated, sc->priority, file);
2707 0 : }
2708 :
2709 0 : static unsigned int reclaim_folio_list(struct list_head *folio_list,
2710 : struct pglist_data *pgdat)
2711 : {
2712 : struct reclaim_stat dummy_stat;
2713 : unsigned int nr_reclaimed;
2714 : struct folio *folio;
2715 0 : struct scan_control sc = {
2716 : .gfp_mask = GFP_KERNEL,
2717 : .may_writepage = 1,
2718 : .may_unmap = 1,
2719 : .may_swap = 1,
2720 : .no_demotion = 1,
2721 : };
2722 :
2723 0 : nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
2724 0 : while (!list_empty(folio_list)) {
2725 0 : folio = lru_to_folio(folio_list);
2726 0 : list_del(&folio->lru);
2727 0 : folio_putback_lru(folio);
2728 : }
2729 :
2730 0 : return nr_reclaimed;
2731 : }
2732 :
2733 0 : unsigned long reclaim_pages(struct list_head *folio_list)
2734 : {
2735 : int nid;
2736 0 : unsigned int nr_reclaimed = 0;
2737 0 : LIST_HEAD(node_folio_list);
2738 : unsigned int noreclaim_flag;
2739 :
2740 0 : if (list_empty(folio_list))
2741 : return nr_reclaimed;
2742 :
2743 0 : noreclaim_flag = memalloc_noreclaim_save();
2744 :
2745 0 : nid = folio_nid(lru_to_folio(folio_list));
2746 : do {
2747 0 : struct folio *folio = lru_to_folio(folio_list);
2748 :
2749 : if (nid == folio_nid(folio)) {
2750 0 : folio_clear_active(folio);
2751 0 : list_move(&folio->lru, &node_folio_list);
2752 0 : continue;
2753 : }
2754 :
2755 : nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
2756 : nid = folio_nid(lru_to_folio(folio_list));
2757 0 : } while (!list_empty(folio_list));
2758 :
2759 0 : nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
2760 :
2761 0 : memalloc_noreclaim_restore(noreclaim_flag);
2762 :
2763 0 : return nr_reclaimed;
2764 : }
2765 :
2766 0 : static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2767 : struct lruvec *lruvec, struct scan_control *sc)
2768 : {
2769 0 : if (is_active_lru(lru)) {
2770 0 : if (sc->may_deactivate & (1 << is_file_lru(lru)))
2771 0 : shrink_active_list(nr_to_scan, lruvec, sc, lru);
2772 : else
2773 0 : sc->skipped_deactivate = 1;
2774 : return 0;
2775 : }
2776 :
2777 0 : return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2778 : }
2779 :
2780 : /*
2781 : * The inactive anon list should be small enough that the VM never has
2782 : * to do too much work.
2783 : *
2784 : * The inactive file list should be small enough to leave most memory
2785 : * to the established workingset on the scan-resistant active list,
2786 : * but large enough to avoid thrashing the aggregate readahead window.
2787 : *
2788 : * Both inactive lists should also be large enough that each inactive
2789 : * folio has a chance to be referenced again before it is reclaimed.
2790 : *
2791 : * If that fails and refaulting is observed, the inactive list grows.
2792 : *
2793 : * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
2794 : * on this LRU, maintained by the pageout code. An inactive_ratio
2795 : * of 3 means 3:1 or 25% of the folios are kept on the inactive list.
2796 : *
2797 : * total target max
2798 : * memory ratio inactive
2799 : * -------------------------------------
2800 : * 10MB 1 5MB
2801 : * 100MB 1 50MB
2802 : * 1GB 3 250MB
2803 : * 10GB 10 0.9GB
2804 : * 100GB 31 3GB
2805 : * 1TB 101 10GB
2806 : * 10TB 320 32GB
2807 : */
2808 0 : static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
2809 : {
2810 0 : enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2811 : unsigned long inactive, active;
2812 : unsigned long inactive_ratio;
2813 : unsigned long gb;
2814 :
2815 0 : inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
2816 0 : active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
2817 :
2818 0 : gb = (inactive + active) >> (30 - PAGE_SHIFT);
2819 0 : if (gb)
2820 0 : inactive_ratio = int_sqrt(10 * gb);
2821 : else
2822 : inactive_ratio = 1;
2823 :
2824 0 : return inactive * inactive_ratio < active;
2825 : }
2826 :
2827 : enum scan_balance {
2828 : SCAN_EQUAL,
2829 : SCAN_FRACT,
2830 : SCAN_ANON,
2831 : SCAN_FILE,
2832 : };
2833 :
2834 0 : static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
2835 : {
2836 : unsigned long file;
2837 : struct lruvec *target_lruvec;
2838 :
2839 : if (lru_gen_enabled())
2840 : return;
2841 :
2842 0 : target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
2843 :
2844 : /*
2845 : * Flush the memory cgroup stats, so that we read accurate per-memcg
2846 : * lruvec stats for heuristics.
2847 : */
2848 : mem_cgroup_flush_stats();
2849 :
2850 : /*
2851 : * Determine the scan balance between anon and file LRUs.
2852 : */
2853 0 : spin_lock_irq(&target_lruvec->lru_lock);
2854 0 : sc->anon_cost = target_lruvec->anon_cost;
2855 0 : sc->file_cost = target_lruvec->file_cost;
2856 0 : spin_unlock_irq(&target_lruvec->lru_lock);
2857 :
2858 : /*
2859 : * Target desirable inactive:active list ratios for the anon
2860 : * and file LRU lists.
2861 : */
2862 0 : if (!sc->force_deactivate) {
2863 : unsigned long refaults;
2864 :
2865 : /*
2866 : * When refaults are being observed, it means a new
2867 : * workingset is being established. Deactivate to get
2868 : * rid of any stale active pages quickly.
2869 : */
2870 0 : refaults = lruvec_page_state(target_lruvec,
2871 : WORKINGSET_ACTIVATE_ANON);
2872 0 : if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
2873 0 : inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
2874 0 : sc->may_deactivate |= DEACTIVATE_ANON;
2875 : else
2876 0 : sc->may_deactivate &= ~DEACTIVATE_ANON;
2877 :
2878 0 : refaults = lruvec_page_state(target_lruvec,
2879 : WORKINGSET_ACTIVATE_FILE);
2880 0 : if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
2881 0 : inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
2882 0 : sc->may_deactivate |= DEACTIVATE_FILE;
2883 : else
2884 0 : sc->may_deactivate &= ~DEACTIVATE_FILE;
2885 : } else
2886 0 : sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
2887 :
2888 : /*
2889 : * If we have plenty of inactive file pages that aren't
2890 : * thrashing, try to reclaim those first before touching
2891 : * anonymous pages.
2892 : */
2893 0 : file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
2894 0 : if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
2895 0 : sc->cache_trim_mode = 1;
2896 : else
2897 0 : sc->cache_trim_mode = 0;
2898 :
2899 : /*
2900 : * Prevent the reclaimer from falling into the cache trap: as
2901 : * cache pages start out inactive, every cache fault will tip
2902 : * the scan balance towards the file LRU. And as the file LRU
2903 : * shrinks, so does the window for rotation from references.
2904 : * This means we have a runaway feedback loop where a tiny
2905 : * thrashing file LRU becomes infinitely more attractive than
2906 : * anon pages. Try to detect this based on file LRU size.
2907 : */
2908 0 : if (!cgroup_reclaim(sc)) {
2909 0 : unsigned long total_high_wmark = 0;
2910 : unsigned long free, anon;
2911 : int z;
2912 :
2913 0 : free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2914 0 : file = node_page_state(pgdat, NR_ACTIVE_FILE) +
2915 0 : node_page_state(pgdat, NR_INACTIVE_FILE);
2916 :
2917 0 : for (z = 0; z < MAX_NR_ZONES; z++) {
2918 0 : struct zone *zone = &pgdat->node_zones[z];
2919 :
2920 0 : if (!managed_zone(zone))
2921 0 : continue;
2922 :
2923 0 : total_high_wmark += high_wmark_pages(zone);
2924 : }
2925 :
2926 : /*
2927 : * Consider anon: if that's low too, this isn't a
2928 : * runaway file reclaim problem, but rather just
2929 : * extreme pressure. Reclaim as per usual then.
2930 : */
2931 0 : anon = node_page_state(pgdat, NR_INACTIVE_ANON);
2932 :
2933 0 : sc->file_is_tiny =
2934 0 : file + free <= total_high_wmark &&
2935 0 : !(sc->may_deactivate & DEACTIVATE_ANON) &&
2936 0 : anon >> sc->priority;
2937 : }
2938 : }
2939 :
2940 : /*
2941 : * Determine how aggressively the anon and file LRU lists should be
2942 : * scanned.
2943 : *
2944 : * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
2945 : * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
2946 : */
2947 0 : static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
2948 : unsigned long *nr)
2949 : {
2950 0 : struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2951 0 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2952 : unsigned long anon_cost, file_cost, total_cost;
2953 0 : int swappiness = mem_cgroup_swappiness(memcg);
2954 : u64 fraction[ANON_AND_FILE];
2955 0 : u64 denominator = 0; /* gcc */
2956 : enum scan_balance scan_balance;
2957 : unsigned long ap, fp;
2958 : enum lru_list lru;
2959 :
2960 : /* If we have no swap space, do not bother scanning anon folios. */
2961 0 : if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
2962 : scan_balance = SCAN_FILE;
2963 : goto out;
2964 : }
2965 :
2966 : /*
2967 : * Global reclaim will swap to prevent OOM even with no
2968 : * swappiness, but memcg users want to use this knob to
2969 : * disable swapping for individual groups completely when
2970 : * using the memory controller's swap limit feature would be
2971 : * too expensive.
2972 : */
2973 0 : if (cgroup_reclaim(sc) && !swappiness) {
2974 : scan_balance = SCAN_FILE;
2975 : goto out;
2976 : }
2977 :
2978 : /*
2979 : * Do not apply any pressure balancing cleverness when the
2980 : * system is close to OOM, scan both anon and file equally
2981 : * (unless the swappiness setting disagrees with swapping).
2982 : */
2983 0 : if (!sc->priority && swappiness) {
2984 : scan_balance = SCAN_EQUAL;
2985 : goto out;
2986 : }
2987 :
2988 : /*
2989 : * If the system is almost out of file pages, force-scan anon.
2990 : */
2991 0 : if (sc->file_is_tiny) {
2992 : scan_balance = SCAN_ANON;
2993 : goto out;
2994 : }
2995 :
2996 : /*
2997 : * If there is enough inactive page cache, we do not reclaim
2998 : * anything from the anonymous working right now.
2999 : */
3000 0 : if (sc->cache_trim_mode) {
3001 : scan_balance = SCAN_FILE;
3002 : goto out;
3003 : }
3004 :
3005 0 : scan_balance = SCAN_FRACT;
3006 : /*
3007 : * Calculate the pressure balance between anon and file pages.
3008 : *
3009 : * The amount of pressure we put on each LRU is inversely
3010 : * proportional to the cost of reclaiming each list, as
3011 : * determined by the share of pages that are refaulting, times
3012 : * the relative IO cost of bringing back a swapped out
3013 : * anonymous page vs reloading a filesystem page (swappiness).
3014 : *
3015 : * Although we limit that influence to ensure no list gets
3016 : * left behind completely: at least a third of the pressure is
3017 : * applied, before swappiness.
3018 : *
3019 : * With swappiness at 100, anon and file have equal IO cost.
3020 : */
3021 0 : total_cost = sc->anon_cost + sc->file_cost;
3022 0 : anon_cost = total_cost + sc->anon_cost;
3023 0 : file_cost = total_cost + sc->file_cost;
3024 0 : total_cost = anon_cost + file_cost;
3025 :
3026 0 : ap = swappiness * (total_cost + 1);
3027 0 : ap /= anon_cost + 1;
3028 :
3029 0 : fp = (200 - swappiness) * (total_cost + 1);
3030 0 : fp /= file_cost + 1;
3031 :
3032 0 : fraction[0] = ap;
3033 0 : fraction[1] = fp;
3034 0 : denominator = ap + fp;
3035 : out:
3036 0 : for_each_evictable_lru(lru) {
3037 0 : int file = is_file_lru(lru);
3038 : unsigned long lruvec_size;
3039 : unsigned long low, min;
3040 : unsigned long scan;
3041 :
3042 0 : lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
3043 0 : mem_cgroup_protection(sc->target_mem_cgroup, memcg,
3044 : &min, &low);
3045 :
3046 : if (min || low) {
3047 : /*
3048 : * Scale a cgroup's reclaim pressure by proportioning
3049 : * its current usage to its memory.low or memory.min
3050 : * setting.
3051 : *
3052 : * This is important, as otherwise scanning aggression
3053 : * becomes extremely binary -- from nothing as we
3054 : * approach the memory protection threshold, to totally
3055 : * nominal as we exceed it. This results in requiring
3056 : * setting extremely liberal protection thresholds. It
3057 : * also means we simply get no protection at all if we
3058 : * set it too low, which is not ideal.
3059 : *
3060 : * If there is any protection in place, we reduce scan
3061 : * pressure by how much of the total memory used is
3062 : * within protection thresholds.
3063 : *
3064 : * There is one special case: in the first reclaim pass,
3065 : * we skip over all groups that are within their low
3066 : * protection. If that fails to reclaim enough pages to
3067 : * satisfy the reclaim goal, we come back and override
3068 : * the best-effort low protection. However, we still
3069 : * ideally want to honor how well-behaved groups are in
3070 : * that case instead of simply punishing them all
3071 : * equally. As such, we reclaim them based on how much
3072 : * memory they are using, reducing the scan pressure
3073 : * again by how much of the total memory used is under
3074 : * hard protection.
3075 : */
3076 : unsigned long cgroup_size = mem_cgroup_size(memcg);
3077 : unsigned long protection;
3078 :
3079 : /* memory.low scaling, make sure we retry before OOM */
3080 : if (!sc->memcg_low_reclaim && low > min) {
3081 : protection = low;
3082 : sc->memcg_low_skipped = 1;
3083 : } else {
3084 : protection = min;
3085 : }
3086 :
3087 : /* Avoid TOCTOU with earlier protection check */
3088 : cgroup_size = max(cgroup_size, protection);
3089 :
3090 : scan = lruvec_size - lruvec_size * protection /
3091 : (cgroup_size + 1);
3092 :
3093 : /*
3094 : * Minimally target SWAP_CLUSTER_MAX pages to keep
3095 : * reclaim moving forwards, avoiding decrementing
3096 : * sc->priority further than desirable.
3097 : */
3098 : scan = max(scan, SWAP_CLUSTER_MAX);
3099 : } else {
3100 0 : scan = lruvec_size;
3101 : }
3102 :
3103 0 : scan >>= sc->priority;
3104 :
3105 : /*
3106 : * If the cgroup's already been deleted, make sure to
3107 : * scrape out the remaining cache.
3108 : */
3109 : if (!scan && !mem_cgroup_online(memcg))
3110 : scan = min(lruvec_size, SWAP_CLUSTER_MAX);
3111 :
3112 0 : switch (scan_balance) {
3113 : case SCAN_EQUAL:
3114 : /* Scan lists relative to size */
3115 : break;
3116 : case SCAN_FRACT:
3117 : /*
3118 : * Scan types proportional to swappiness and
3119 : * their relative recent reclaim efficiency.
3120 : * Make sure we don't miss the last page on
3121 : * the offlined memory cgroups because of a
3122 : * round-off error.
3123 : */
3124 0 : scan = mem_cgroup_online(memcg) ?
3125 0 : div64_u64(scan * fraction[file], denominator) :
3126 : DIV64_U64_ROUND_UP(scan * fraction[file],
3127 : denominator);
3128 0 : break;
3129 : case SCAN_FILE:
3130 : case SCAN_ANON:
3131 : /* Scan one type exclusively */
3132 0 : if ((scan_balance == SCAN_FILE) != file)
3133 0 : scan = 0;
3134 : break;
3135 : default:
3136 : /* Look ma, no brain */
3137 0 : BUG();
3138 : }
3139 :
3140 0 : nr[lru] = scan;
3141 : }
3142 0 : }
3143 :
3144 : /*
3145 : * Anonymous LRU management is a waste if there is
3146 : * ultimately no way to reclaim the memory.
3147 : */
3148 : static bool can_age_anon_pages(struct pglist_data *pgdat,
3149 : struct scan_control *sc)
3150 : {
3151 : /* Aging the anon LRU is valuable if swap is present: */
3152 0 : if (total_swap_pages > 0)
3153 : return true;
3154 :
3155 : /* Also valuable if anon pages can be demoted: */
3156 0 : return can_demote(pgdat->node_id, sc);
3157 : }
3158 :
3159 : #ifdef CONFIG_LRU_GEN
3160 :
3161 : #ifdef CONFIG_LRU_GEN_ENABLED
3162 : DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
3163 : #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
3164 : #else
3165 : DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
3166 : #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
3167 : #endif
3168 :
3169 : /******************************************************************************
3170 : * shorthand helpers
3171 : ******************************************************************************/
3172 :
3173 : #define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
3174 :
3175 : #define DEFINE_MAX_SEQ(lruvec) \
3176 : unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
3177 :
3178 : #define DEFINE_MIN_SEQ(lruvec) \
3179 : unsigned long min_seq[ANON_AND_FILE] = { \
3180 : READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
3181 : READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
3182 : }
3183 :
3184 : #define for_each_gen_type_zone(gen, type, zone) \
3185 : for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
3186 : for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
3187 : for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
3188 :
3189 : #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
3190 : #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
3191 :
3192 : static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
3193 : {
3194 : struct pglist_data *pgdat = NODE_DATA(nid);
3195 :
3196 : #ifdef CONFIG_MEMCG
3197 : if (memcg) {
3198 : struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
3199 :
3200 : /* see the comment in mem_cgroup_lruvec() */
3201 : if (!lruvec->pgdat)
3202 : lruvec->pgdat = pgdat;
3203 :
3204 : return lruvec;
3205 : }
3206 : #endif
3207 : VM_WARN_ON_ONCE(!mem_cgroup_disabled());
3208 :
3209 : return &pgdat->__lruvec;
3210 : }
3211 :
3212 : static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
3213 : {
3214 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3215 : struct pglist_data *pgdat = lruvec_pgdat(lruvec);
3216 :
3217 : if (!sc->may_swap)
3218 : return 0;
3219 :
3220 : if (!can_demote(pgdat->node_id, sc) &&
3221 : mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
3222 : return 0;
3223 :
3224 : return mem_cgroup_swappiness(memcg);
3225 : }
3226 :
3227 : static int get_nr_gens(struct lruvec *lruvec, int type)
3228 : {
3229 : return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
3230 : }
3231 :
3232 : static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
3233 : {
3234 : /* see the comment on lru_gen_folio */
3235 : return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
3236 : get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
3237 : get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
3238 : }
3239 :
3240 : /******************************************************************************
3241 : * Bloom filters
3242 : ******************************************************************************/
3243 :
3244 : /*
3245 : * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
3246 : * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
3247 : * bits in a bitmap, k is the number of hash functions and n is the number of
3248 : * inserted items.
3249 : *
3250 : * Page table walkers use one of the two filters to reduce their search space.
3251 : * To get rid of non-leaf entries that no longer have enough leaf entries, the
3252 : * aging uses the double-buffering technique to flip to the other filter each
3253 : * time it produces a new generation. For non-leaf entries that have enough
3254 : * leaf entries, the aging carries them over to the next generation in
3255 : * walk_pmd_range(); the eviction also report them when walking the rmap
3256 : * in lru_gen_look_around().
3257 : *
3258 : * For future optimizations:
3259 : * 1. It's not necessary to keep both filters all the time. The spare one can be
3260 : * freed after the RCU grace period and reallocated if needed again.
3261 : * 2. And when reallocating, it's worth scaling its size according to the number
3262 : * of inserted entries in the other filter, to reduce the memory overhead on
3263 : * small systems and false positives on large systems.
3264 : * 3. Jenkins' hash function is an alternative to Knuth's.
3265 : */
3266 : #define BLOOM_FILTER_SHIFT 15
3267 :
3268 : static inline int filter_gen_from_seq(unsigned long seq)
3269 : {
3270 : return seq % NR_BLOOM_FILTERS;
3271 : }
3272 :
3273 : static void get_item_key(void *item, int *key)
3274 : {
3275 : u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
3276 :
3277 : BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
3278 :
3279 : key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
3280 : key[1] = hash >> BLOOM_FILTER_SHIFT;
3281 : }
3282 :
3283 : static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
3284 : {
3285 : int key[2];
3286 : unsigned long *filter;
3287 : int gen = filter_gen_from_seq(seq);
3288 :
3289 : filter = READ_ONCE(lruvec->mm_state.filters[gen]);
3290 : if (!filter)
3291 : return true;
3292 :
3293 : get_item_key(item, key);
3294 :
3295 : return test_bit(key[0], filter) && test_bit(key[1], filter);
3296 : }
3297 :
3298 : static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
3299 : {
3300 : int key[2];
3301 : unsigned long *filter;
3302 : int gen = filter_gen_from_seq(seq);
3303 :
3304 : filter = READ_ONCE(lruvec->mm_state.filters[gen]);
3305 : if (!filter)
3306 : return;
3307 :
3308 : get_item_key(item, key);
3309 :
3310 : if (!test_bit(key[0], filter))
3311 : set_bit(key[0], filter);
3312 : if (!test_bit(key[1], filter))
3313 : set_bit(key[1], filter);
3314 : }
3315 :
3316 : static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
3317 : {
3318 : unsigned long *filter;
3319 : int gen = filter_gen_from_seq(seq);
3320 :
3321 : filter = lruvec->mm_state.filters[gen];
3322 : if (filter) {
3323 : bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
3324 : return;
3325 : }
3326 :
3327 : filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
3328 : __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
3329 : WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
3330 : }
3331 :
3332 : /******************************************************************************
3333 : * mm_struct list
3334 : ******************************************************************************/
3335 :
3336 : static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
3337 : {
3338 : static struct lru_gen_mm_list mm_list = {
3339 : .fifo = LIST_HEAD_INIT(mm_list.fifo),
3340 : .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
3341 : };
3342 :
3343 : #ifdef CONFIG_MEMCG
3344 : if (memcg)
3345 : return &memcg->mm_list;
3346 : #endif
3347 : VM_WARN_ON_ONCE(!mem_cgroup_disabled());
3348 :
3349 : return &mm_list;
3350 : }
3351 :
3352 : void lru_gen_add_mm(struct mm_struct *mm)
3353 : {
3354 : int nid;
3355 : struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
3356 : struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3357 :
3358 : VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
3359 : #ifdef CONFIG_MEMCG
3360 : VM_WARN_ON_ONCE(mm->lru_gen.memcg);
3361 : mm->lru_gen.memcg = memcg;
3362 : #endif
3363 : spin_lock(&mm_list->lock);
3364 :
3365 : for_each_node_state(nid, N_MEMORY) {
3366 : struct lruvec *lruvec = get_lruvec(memcg, nid);
3367 :
3368 : /* the first addition since the last iteration */
3369 : if (lruvec->mm_state.tail == &mm_list->fifo)
3370 : lruvec->mm_state.tail = &mm->lru_gen.list;
3371 : }
3372 :
3373 : list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
3374 :
3375 : spin_unlock(&mm_list->lock);
3376 : }
3377 :
3378 : void lru_gen_del_mm(struct mm_struct *mm)
3379 : {
3380 : int nid;
3381 : struct lru_gen_mm_list *mm_list;
3382 : struct mem_cgroup *memcg = NULL;
3383 :
3384 : if (list_empty(&mm->lru_gen.list))
3385 : return;
3386 :
3387 : #ifdef CONFIG_MEMCG
3388 : memcg = mm->lru_gen.memcg;
3389 : #endif
3390 : mm_list = get_mm_list(memcg);
3391 :
3392 : spin_lock(&mm_list->lock);
3393 :
3394 : for_each_node(nid) {
3395 : struct lruvec *lruvec = get_lruvec(memcg, nid);
3396 :
3397 : /* where the last iteration ended (exclusive) */
3398 : if (lruvec->mm_state.tail == &mm->lru_gen.list)
3399 : lruvec->mm_state.tail = lruvec->mm_state.tail->next;
3400 :
3401 : /* where the current iteration continues (inclusive) */
3402 : if (lruvec->mm_state.head != &mm->lru_gen.list)
3403 : continue;
3404 :
3405 : lruvec->mm_state.head = lruvec->mm_state.head->next;
3406 : /* the deletion ends the current iteration */
3407 : if (lruvec->mm_state.head == &mm_list->fifo)
3408 : WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
3409 : }
3410 :
3411 : list_del_init(&mm->lru_gen.list);
3412 :
3413 : spin_unlock(&mm_list->lock);
3414 :
3415 : #ifdef CONFIG_MEMCG
3416 : mem_cgroup_put(mm->lru_gen.memcg);
3417 : mm->lru_gen.memcg = NULL;
3418 : #endif
3419 : }
3420 :
3421 : #ifdef CONFIG_MEMCG
3422 : void lru_gen_migrate_mm(struct mm_struct *mm)
3423 : {
3424 : struct mem_cgroup *memcg;
3425 : struct task_struct *task = rcu_dereference_protected(mm->owner, true);
3426 :
3427 : VM_WARN_ON_ONCE(task->mm != mm);
3428 : lockdep_assert_held(&task->alloc_lock);
3429 :
3430 : /* for mm_update_next_owner() */
3431 : if (mem_cgroup_disabled())
3432 : return;
3433 :
3434 : /* migration can happen before addition */
3435 : if (!mm->lru_gen.memcg)
3436 : return;
3437 :
3438 : rcu_read_lock();
3439 : memcg = mem_cgroup_from_task(task);
3440 : rcu_read_unlock();
3441 : if (memcg == mm->lru_gen.memcg)
3442 : return;
3443 :
3444 : VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
3445 :
3446 : lru_gen_del_mm(mm);
3447 : lru_gen_add_mm(mm);
3448 : }
3449 : #endif
3450 :
3451 : static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
3452 : {
3453 : int i;
3454 : int hist;
3455 :
3456 : lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
3457 :
3458 : if (walk) {
3459 : hist = lru_hist_from_seq(walk->max_seq);
3460 :
3461 : for (i = 0; i < NR_MM_STATS; i++) {
3462 : WRITE_ONCE(lruvec->mm_state.stats[hist][i],
3463 : lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
3464 : walk->mm_stats[i] = 0;
3465 : }
3466 : }
3467 :
3468 : if (NR_HIST_GENS > 1 && last) {
3469 : hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
3470 :
3471 : for (i = 0; i < NR_MM_STATS; i++)
3472 : WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
3473 : }
3474 : }
3475 :
3476 : static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
3477 : {
3478 : int type;
3479 : unsigned long size = 0;
3480 : struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3481 : int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
3482 :
3483 : if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
3484 : return true;
3485 :
3486 : clear_bit(key, &mm->lru_gen.bitmap);
3487 :
3488 : for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
3489 : size += type ? get_mm_counter(mm, MM_FILEPAGES) :
3490 : get_mm_counter(mm, MM_ANONPAGES) +
3491 : get_mm_counter(mm, MM_SHMEMPAGES);
3492 : }
3493 :
3494 : if (size < MIN_LRU_BATCH)
3495 : return true;
3496 :
3497 : return !mmget_not_zero(mm);
3498 : }
3499 :
3500 : static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
3501 : struct mm_struct **iter)
3502 : {
3503 : bool first = false;
3504 : bool last = true;
3505 : struct mm_struct *mm = NULL;
3506 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3507 : struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3508 : struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
3509 :
3510 : /*
3511 : * There are four interesting cases for this page table walker:
3512 : * 1. It tries to start a new iteration of mm_list with a stale max_seq;
3513 : * there is nothing left to do.
3514 : * 2. It's the first of the current generation, and it needs to reset
3515 : * the Bloom filter for the next generation.
3516 : * 3. It reaches the end of mm_list, and it needs to increment
3517 : * mm_state->seq; the iteration is done.
3518 : * 4. It's the last of the current generation, and it needs to reset the
3519 : * mm stats counters for the next generation.
3520 : */
3521 : spin_lock(&mm_list->lock);
3522 :
3523 : VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
3524 : VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
3525 : VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
3526 :
3527 : if (walk->max_seq <= mm_state->seq) {
3528 : if (!*iter)
3529 : last = false;
3530 : goto done;
3531 : }
3532 :
3533 : if (!mm_state->nr_walkers) {
3534 : VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
3535 :
3536 : mm_state->head = mm_list->fifo.next;
3537 : first = true;
3538 : }
3539 :
3540 : while (!mm && mm_state->head != &mm_list->fifo) {
3541 : mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
3542 :
3543 : mm_state->head = mm_state->head->next;
3544 :
3545 : /* force scan for those added after the last iteration */
3546 : if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
3547 : mm_state->tail = mm_state->head;
3548 : walk->force_scan = true;
3549 : }
3550 :
3551 : if (should_skip_mm(mm, walk))
3552 : mm = NULL;
3553 : }
3554 :
3555 : if (mm_state->head == &mm_list->fifo)
3556 : WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
3557 : done:
3558 : if (*iter && !mm)
3559 : mm_state->nr_walkers--;
3560 : if (!*iter && mm)
3561 : mm_state->nr_walkers++;
3562 :
3563 : if (mm_state->nr_walkers)
3564 : last = false;
3565 :
3566 : if (*iter || last)
3567 : reset_mm_stats(lruvec, walk, last);
3568 :
3569 : spin_unlock(&mm_list->lock);
3570 :
3571 : if (mm && first)
3572 : reset_bloom_filter(lruvec, walk->max_seq + 1);
3573 :
3574 : if (*iter)
3575 : mmput_async(*iter);
3576 :
3577 : *iter = mm;
3578 :
3579 : return last;
3580 : }
3581 :
3582 : static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
3583 : {
3584 : bool success = false;
3585 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3586 : struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3587 : struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
3588 :
3589 : spin_lock(&mm_list->lock);
3590 :
3591 : VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
3592 :
3593 : if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
3594 : VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
3595 :
3596 : WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
3597 : reset_mm_stats(lruvec, NULL, true);
3598 : success = true;
3599 : }
3600 :
3601 : spin_unlock(&mm_list->lock);
3602 :
3603 : return success;
3604 : }
3605 :
3606 : /******************************************************************************
3607 : * refault feedback loop
3608 : ******************************************************************************/
3609 :
3610 : /*
3611 : * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
3612 : *
3613 : * The P term is refaulted/(evicted+protected) from a tier in the generation
3614 : * currently being evicted; the I term is the exponential moving average of the
3615 : * P term over the generations previously evicted, using the smoothing factor
3616 : * 1/2; the D term isn't supported.
3617 : *
3618 : * The setpoint (SP) is always the first tier of one type; the process variable
3619 : * (PV) is either any tier of the other type or any other tier of the same
3620 : * type.
3621 : *
3622 : * The error is the difference between the SP and the PV; the correction is to
3623 : * turn off protection when SP>PV or turn on protection when SP<PV.
3624 : *
3625 : * For future optimizations:
3626 : * 1. The D term may discount the other two terms over time so that long-lived
3627 : * generations can resist stale information.
3628 : */
3629 : struct ctrl_pos {
3630 : unsigned long refaulted;
3631 : unsigned long total;
3632 : int gain;
3633 : };
3634 :
3635 : static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
3636 : struct ctrl_pos *pos)
3637 : {
3638 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
3639 : int hist = lru_hist_from_seq(lrugen->min_seq[type]);
3640 :
3641 : pos->refaulted = lrugen->avg_refaulted[type][tier] +
3642 : atomic_long_read(&lrugen->refaulted[hist][type][tier]);
3643 : pos->total = lrugen->avg_total[type][tier] +
3644 : atomic_long_read(&lrugen->evicted[hist][type][tier]);
3645 : if (tier)
3646 : pos->total += lrugen->protected[hist][type][tier - 1];
3647 : pos->gain = gain;
3648 : }
3649 :
3650 : static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
3651 : {
3652 : int hist, tier;
3653 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
3654 : bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
3655 : unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
3656 :
3657 : lockdep_assert_held(&lruvec->lru_lock);
3658 :
3659 : if (!carryover && !clear)
3660 : return;
3661 :
3662 : hist = lru_hist_from_seq(seq);
3663 :
3664 : for (tier = 0; tier < MAX_NR_TIERS; tier++) {
3665 : if (carryover) {
3666 : unsigned long sum;
3667 :
3668 : sum = lrugen->avg_refaulted[type][tier] +
3669 : atomic_long_read(&lrugen->refaulted[hist][type][tier]);
3670 : WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
3671 :
3672 : sum = lrugen->avg_total[type][tier] +
3673 : atomic_long_read(&lrugen->evicted[hist][type][tier]);
3674 : if (tier)
3675 : sum += lrugen->protected[hist][type][tier - 1];
3676 : WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
3677 : }
3678 :
3679 : if (clear) {
3680 : atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
3681 : atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
3682 : if (tier)
3683 : WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
3684 : }
3685 : }
3686 : }
3687 :
3688 : static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
3689 : {
3690 : /*
3691 : * Return true if the PV has a limited number of refaults or a lower
3692 : * refaulted/total than the SP.
3693 : */
3694 : return pv->refaulted < MIN_LRU_BATCH ||
3695 : pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
3696 : (sp->refaulted + 1) * pv->total * pv->gain;
3697 : }
3698 :
3699 : /******************************************************************************
3700 : * the aging
3701 : ******************************************************************************/
3702 :
3703 : /* promote pages accessed through page tables */
3704 : static int folio_update_gen(struct folio *folio, int gen)
3705 : {
3706 : unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
3707 :
3708 : VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
3709 : VM_WARN_ON_ONCE(!rcu_read_lock_held());
3710 :
3711 : do {
3712 : /* lru_gen_del_folio() has isolated this page? */
3713 : if (!(old_flags & LRU_GEN_MASK)) {
3714 : /* for shrink_folio_list() */
3715 : new_flags = old_flags | BIT(PG_referenced);
3716 : continue;
3717 : }
3718 :
3719 : new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
3720 : new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
3721 : } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
3722 :
3723 : return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
3724 : }
3725 :
3726 : /* protect pages accessed multiple times through file descriptors */
3727 : static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
3728 : {
3729 : int type = folio_is_file_lru(folio);
3730 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
3731 : int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
3732 : unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
3733 :
3734 : VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
3735 :
3736 : do {
3737 : new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
3738 : /* folio_update_gen() has promoted this page? */
3739 : if (new_gen >= 0 && new_gen != old_gen)
3740 : return new_gen;
3741 :
3742 : new_gen = (old_gen + 1) % MAX_NR_GENS;
3743 :
3744 : new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
3745 : new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
3746 : /* for folio_end_writeback() */
3747 : if (reclaiming)
3748 : new_flags |= BIT(PG_reclaim);
3749 : } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
3750 :
3751 : lru_gen_update_size(lruvec, folio, old_gen, new_gen);
3752 :
3753 : return new_gen;
3754 : }
3755 :
3756 : static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
3757 : int old_gen, int new_gen)
3758 : {
3759 : int type = folio_is_file_lru(folio);
3760 : int zone = folio_zonenum(folio);
3761 : int delta = folio_nr_pages(folio);
3762 :
3763 : VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
3764 : VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
3765 :
3766 : walk->batched++;
3767 :
3768 : walk->nr_pages[old_gen][type][zone] -= delta;
3769 : walk->nr_pages[new_gen][type][zone] += delta;
3770 : }
3771 :
3772 : static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
3773 : {
3774 : int gen, type, zone;
3775 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
3776 :
3777 : walk->batched = 0;
3778 :
3779 : for_each_gen_type_zone(gen, type, zone) {
3780 : enum lru_list lru = type * LRU_INACTIVE_FILE;
3781 : int delta = walk->nr_pages[gen][type][zone];
3782 :
3783 : if (!delta)
3784 : continue;
3785 :
3786 : walk->nr_pages[gen][type][zone] = 0;
3787 : WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
3788 : lrugen->nr_pages[gen][type][zone] + delta);
3789 :
3790 : if (lru_gen_is_active(lruvec, gen))
3791 : lru += LRU_ACTIVE;
3792 : __update_lru_size(lruvec, lru, zone, delta);
3793 : }
3794 : }
3795 :
3796 : static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
3797 : {
3798 : struct address_space *mapping;
3799 : struct vm_area_struct *vma = args->vma;
3800 : struct lru_gen_mm_walk *walk = args->private;
3801 :
3802 : if (!vma_is_accessible(vma))
3803 : return true;
3804 :
3805 : if (is_vm_hugetlb_page(vma))
3806 : return true;
3807 :
3808 : if (!vma_has_recency(vma))
3809 : return true;
3810 :
3811 : if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
3812 : return true;
3813 :
3814 : if (vma == get_gate_vma(vma->vm_mm))
3815 : return true;
3816 :
3817 : if (vma_is_anonymous(vma))
3818 : return !walk->can_swap;
3819 :
3820 : if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
3821 : return true;
3822 :
3823 : mapping = vma->vm_file->f_mapping;
3824 : if (mapping_unevictable(mapping))
3825 : return true;
3826 :
3827 : if (shmem_mapping(mapping))
3828 : return !walk->can_swap;
3829 :
3830 : /* to exclude special mappings like dax, etc. */
3831 : return !mapping->a_ops->read_folio;
3832 : }
3833 :
3834 : /*
3835 : * Some userspace memory allocators map many single-page VMAs. Instead of
3836 : * returning back to the PGD table for each of such VMAs, finish an entire PMD
3837 : * table to reduce zigzags and improve cache performance.
3838 : */
3839 : static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
3840 : unsigned long *vm_start, unsigned long *vm_end)
3841 : {
3842 : unsigned long start = round_up(*vm_end, size);
3843 : unsigned long end = (start | ~mask) + 1;
3844 : VMA_ITERATOR(vmi, args->mm, start);
3845 :
3846 : VM_WARN_ON_ONCE(mask & size);
3847 : VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
3848 :
3849 : for_each_vma(vmi, args->vma) {
3850 : if (end && end <= args->vma->vm_start)
3851 : return false;
3852 :
3853 : if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args))
3854 : continue;
3855 :
3856 : *vm_start = max(start, args->vma->vm_start);
3857 : *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
3858 :
3859 : return true;
3860 : }
3861 :
3862 : return false;
3863 : }
3864 :
3865 : static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
3866 : {
3867 : unsigned long pfn = pte_pfn(pte);
3868 :
3869 : VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
3870 :
3871 : if (!pte_present(pte) || is_zero_pfn(pfn))
3872 : return -1;
3873 :
3874 : if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
3875 : return -1;
3876 :
3877 : if (WARN_ON_ONCE(!pfn_valid(pfn)))
3878 : return -1;
3879 :
3880 : return pfn;
3881 : }
3882 :
3883 : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
3884 : static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
3885 : {
3886 : unsigned long pfn = pmd_pfn(pmd);
3887 :
3888 : VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
3889 :
3890 : if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
3891 : return -1;
3892 :
3893 : if (WARN_ON_ONCE(pmd_devmap(pmd)))
3894 : return -1;
3895 :
3896 : if (WARN_ON_ONCE(!pfn_valid(pfn)))
3897 : return -1;
3898 :
3899 : return pfn;
3900 : }
3901 : #endif
3902 :
3903 : static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
3904 : struct pglist_data *pgdat, bool can_swap)
3905 : {
3906 : struct folio *folio;
3907 :
3908 : /* try to avoid unnecessary memory loads */
3909 : if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3910 : return NULL;
3911 :
3912 : folio = pfn_folio(pfn);
3913 : if (folio_nid(folio) != pgdat->node_id)
3914 : return NULL;
3915 :
3916 : if (folio_memcg_rcu(folio) != memcg)
3917 : return NULL;
3918 :
3919 : /* file VMAs can contain anon pages from COW */
3920 : if (!folio_is_file_lru(folio) && !can_swap)
3921 : return NULL;
3922 :
3923 : return folio;
3924 : }
3925 :
3926 : static bool suitable_to_scan(int total, int young)
3927 : {
3928 : int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
3929 :
3930 : /* suitable if the average number of young PTEs per cacheline is >=1 */
3931 : return young * n >= total;
3932 : }
3933 :
3934 : static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
3935 : struct mm_walk *args)
3936 : {
3937 : int i;
3938 : pte_t *pte;
3939 : spinlock_t *ptl;
3940 : unsigned long addr;
3941 : int total = 0;
3942 : int young = 0;
3943 : struct lru_gen_mm_walk *walk = args->private;
3944 : struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
3945 : struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3946 : int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
3947 :
3948 : VM_WARN_ON_ONCE(pmd_leaf(*pmd));
3949 :
3950 : ptl = pte_lockptr(args->mm, pmd);
3951 : if (!spin_trylock(ptl))
3952 : return false;
3953 :
3954 : arch_enter_lazy_mmu_mode();
3955 :
3956 : pte = pte_offset_map(pmd, start & PMD_MASK);
3957 : restart:
3958 : for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
3959 : unsigned long pfn;
3960 : struct folio *folio;
3961 :
3962 : total++;
3963 : walk->mm_stats[MM_LEAF_TOTAL]++;
3964 :
3965 : pfn = get_pte_pfn(pte[i], args->vma, addr);
3966 : if (pfn == -1)
3967 : continue;
3968 :
3969 : if (!pte_young(pte[i])) {
3970 : walk->mm_stats[MM_LEAF_OLD]++;
3971 : continue;
3972 : }
3973 :
3974 : folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
3975 : if (!folio)
3976 : continue;
3977 :
3978 : if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
3979 : VM_WARN_ON_ONCE(true);
3980 :
3981 : young++;
3982 : walk->mm_stats[MM_LEAF_YOUNG]++;
3983 :
3984 : if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
3985 : !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
3986 : !folio_test_swapcache(folio)))
3987 : folio_mark_dirty(folio);
3988 :
3989 : old_gen = folio_update_gen(folio, new_gen);
3990 : if (old_gen >= 0 && old_gen != new_gen)
3991 : update_batch_size(walk, folio, old_gen, new_gen);
3992 : }
3993 :
3994 : if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
3995 : goto restart;
3996 :
3997 : pte_unmap(pte);
3998 :
3999 : arch_leave_lazy_mmu_mode();
4000 : spin_unlock(ptl);
4001 :
4002 : return suitable_to_scan(total, young);
4003 : }
4004 :
4005 : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
4006 : static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
4007 : struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
4008 : {
4009 : int i;
4010 : pmd_t *pmd;
4011 : spinlock_t *ptl;
4012 : struct lru_gen_mm_walk *walk = args->private;
4013 : struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
4014 : struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
4015 : int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
4016 :
4017 : VM_WARN_ON_ONCE(pud_leaf(*pud));
4018 :
4019 : /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
4020 : if (*first == -1) {
4021 : *first = addr;
4022 : bitmap_zero(bitmap, MIN_LRU_BATCH);
4023 : return;
4024 : }
4025 :
4026 : i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first);
4027 : if (i && i <= MIN_LRU_BATCH) {
4028 : __set_bit(i - 1, bitmap);
4029 : return;
4030 : }
4031 :
4032 : pmd = pmd_offset(pud, *first);
4033 :
4034 : ptl = pmd_lockptr(args->mm, pmd);
4035 : if (!spin_trylock(ptl))
4036 : goto done;
4037 :
4038 : arch_enter_lazy_mmu_mode();
4039 :
4040 : do {
4041 : unsigned long pfn;
4042 : struct folio *folio;
4043 :
4044 : /* don't round down the first address */
4045 : addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
4046 :
4047 : pfn = get_pmd_pfn(pmd[i], vma, addr);
4048 : if (pfn == -1)
4049 : goto next;
4050 :
4051 : if (!pmd_trans_huge(pmd[i])) {
4052 : if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
4053 : pmdp_test_and_clear_young(vma, addr, pmd + i);
4054 : goto next;
4055 : }
4056 :
4057 : folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
4058 : if (!folio)
4059 : goto next;
4060 :
4061 : if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
4062 : goto next;
4063 :
4064 : walk->mm_stats[MM_LEAF_YOUNG]++;
4065 :
4066 : if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
4067 : !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
4068 : !folio_test_swapcache(folio)))
4069 : folio_mark_dirty(folio);
4070 :
4071 : old_gen = folio_update_gen(folio, new_gen);
4072 : if (old_gen >= 0 && old_gen != new_gen)
4073 : update_batch_size(walk, folio, old_gen, new_gen);
4074 : next:
4075 : i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
4076 : } while (i <= MIN_LRU_BATCH);
4077 :
4078 : arch_leave_lazy_mmu_mode();
4079 : spin_unlock(ptl);
4080 : done:
4081 : *first = -1;
4082 : }
4083 : #else
4084 : static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
4085 : struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
4086 : {
4087 : }
4088 : #endif
4089 :
4090 : static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
4091 : struct mm_walk *args)
4092 : {
4093 : int i;
4094 : pmd_t *pmd;
4095 : unsigned long next;
4096 : unsigned long addr;
4097 : struct vm_area_struct *vma;
4098 : unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
4099 : unsigned long first = -1;
4100 : struct lru_gen_mm_walk *walk = args->private;
4101 :
4102 : VM_WARN_ON_ONCE(pud_leaf(*pud));
4103 :
4104 : /*
4105 : * Finish an entire PMD in two passes: the first only reaches to PTE
4106 : * tables to avoid taking the PMD lock; the second, if necessary, takes
4107 : * the PMD lock to clear the accessed bit in PMD entries.
4108 : */
4109 : pmd = pmd_offset(pud, start & PUD_MASK);
4110 : restart:
4111 : /* walk_pte_range() may call get_next_vma() */
4112 : vma = args->vma;
4113 : for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
4114 : pmd_t val = pmdp_get_lockless(pmd + i);
4115 :
4116 : next = pmd_addr_end(addr, end);
4117 :
4118 : if (!pmd_present(val) || is_huge_zero_pmd(val)) {
4119 : walk->mm_stats[MM_LEAF_TOTAL]++;
4120 : continue;
4121 : }
4122 :
4123 : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4124 : if (pmd_trans_huge(val)) {
4125 : unsigned long pfn = pmd_pfn(val);
4126 : struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
4127 :
4128 : walk->mm_stats[MM_LEAF_TOTAL]++;
4129 :
4130 : if (!pmd_young(val)) {
4131 : walk->mm_stats[MM_LEAF_OLD]++;
4132 : continue;
4133 : }
4134 :
4135 : /* try to avoid unnecessary memory loads */
4136 : if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
4137 : continue;
4138 :
4139 : walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
4140 : continue;
4141 : }
4142 : #endif
4143 : walk->mm_stats[MM_NONLEAF_TOTAL]++;
4144 :
4145 : if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) {
4146 : if (!pmd_young(val))
4147 : continue;
4148 :
4149 : walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
4150 : }
4151 :
4152 : if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
4153 : continue;
4154 :
4155 : walk->mm_stats[MM_NONLEAF_FOUND]++;
4156 :
4157 : if (!walk_pte_range(&val, addr, next, args))
4158 : continue;
4159 :
4160 : walk->mm_stats[MM_NONLEAF_ADDED]++;
4161 :
4162 : /* carry over to the next generation */
4163 : update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
4164 : }
4165 :
4166 : walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
4167 :
4168 : if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
4169 : goto restart;
4170 : }
4171 :
4172 : static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
4173 : struct mm_walk *args)
4174 : {
4175 : int i;
4176 : pud_t *pud;
4177 : unsigned long addr;
4178 : unsigned long next;
4179 : struct lru_gen_mm_walk *walk = args->private;
4180 :
4181 : VM_WARN_ON_ONCE(p4d_leaf(*p4d));
4182 :
4183 : pud = pud_offset(p4d, start & P4D_MASK);
4184 : restart:
4185 : for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
4186 : pud_t val = READ_ONCE(pud[i]);
4187 :
4188 : next = pud_addr_end(addr, end);
4189 :
4190 : if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
4191 : continue;
4192 :
4193 : walk_pmd_range(&val, addr, next, args);
4194 :
4195 : /* a racy check to curtail the waiting time */
4196 : if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
4197 : return 1;
4198 :
4199 : if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
4200 : end = (addr | ~PUD_MASK) + 1;
4201 : goto done;
4202 : }
4203 : }
4204 :
4205 : if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
4206 : goto restart;
4207 :
4208 : end = round_up(end, P4D_SIZE);
4209 : done:
4210 : if (!end || !args->vma)
4211 : return 1;
4212 :
4213 : walk->next_addr = max(end, args->vma->vm_start);
4214 :
4215 : return -EAGAIN;
4216 : }
4217 :
4218 : static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
4219 : {
4220 : static const struct mm_walk_ops mm_walk_ops = {
4221 : .test_walk = should_skip_vma,
4222 : .p4d_entry = walk_pud_range,
4223 : };
4224 :
4225 : int err;
4226 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4227 :
4228 : walk->next_addr = FIRST_USER_ADDRESS;
4229 :
4230 : do {
4231 : err = -EBUSY;
4232 :
4233 : /* folio_update_gen() requires stable folio_memcg() */
4234 : if (!mem_cgroup_trylock_pages(memcg))
4235 : break;
4236 :
4237 : /* the caller might be holding the lock for write */
4238 : if (mmap_read_trylock(mm)) {
4239 : err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
4240 :
4241 : mmap_read_unlock(mm);
4242 : }
4243 :
4244 : mem_cgroup_unlock_pages();
4245 :
4246 : if (walk->batched) {
4247 : spin_lock_irq(&lruvec->lru_lock);
4248 : reset_batch_size(lruvec, walk);
4249 : spin_unlock_irq(&lruvec->lru_lock);
4250 : }
4251 :
4252 : cond_resched();
4253 : } while (err == -EAGAIN);
4254 : }
4255 :
4256 : static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
4257 : {
4258 : struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
4259 :
4260 : if (pgdat && current_is_kswapd()) {
4261 : VM_WARN_ON_ONCE(walk);
4262 :
4263 : walk = &pgdat->mm_walk;
4264 : } else if (!walk && force_alloc) {
4265 : VM_WARN_ON_ONCE(current_is_kswapd());
4266 :
4267 : walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
4268 : }
4269 :
4270 : current->reclaim_state->mm_walk = walk;
4271 :
4272 : return walk;
4273 : }
4274 :
4275 : static void clear_mm_walk(void)
4276 : {
4277 : struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
4278 :
4279 : VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
4280 : VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
4281 :
4282 : current->reclaim_state->mm_walk = NULL;
4283 :
4284 : if (!current_is_kswapd())
4285 : kfree(walk);
4286 : }
4287 :
4288 : static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
4289 : {
4290 : int zone;
4291 : int remaining = MAX_LRU_BATCH;
4292 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
4293 : int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
4294 :
4295 : if (type == LRU_GEN_ANON && !can_swap)
4296 : goto done;
4297 :
4298 : /* prevent cold/hot inversion if force_scan is true */
4299 : for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4300 : struct list_head *head = &lrugen->folios[old_gen][type][zone];
4301 :
4302 : while (!list_empty(head)) {
4303 : struct folio *folio = lru_to_folio(head);
4304 :
4305 : VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
4306 : VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
4307 : VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
4308 : VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
4309 :
4310 : new_gen = folio_inc_gen(lruvec, folio, false);
4311 : list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
4312 :
4313 : if (!--remaining)
4314 : return false;
4315 : }
4316 : }
4317 : done:
4318 : reset_ctrl_pos(lruvec, type, true);
4319 : WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
4320 :
4321 : return true;
4322 : }
4323 :
4324 : static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
4325 : {
4326 : int gen, type, zone;
4327 : bool success = false;
4328 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
4329 : DEFINE_MIN_SEQ(lruvec);
4330 :
4331 : VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
4332 :
4333 : /* find the oldest populated generation */
4334 : for (type = !can_swap; type < ANON_AND_FILE; type++) {
4335 : while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
4336 : gen = lru_gen_from_seq(min_seq[type]);
4337 :
4338 : for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4339 : if (!list_empty(&lrugen->folios[gen][type][zone]))
4340 : goto next;
4341 : }
4342 :
4343 : min_seq[type]++;
4344 : }
4345 : next:
4346 : ;
4347 : }
4348 :
4349 : /* see the comment on lru_gen_folio */
4350 : if (can_swap) {
4351 : min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
4352 : min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
4353 : }
4354 :
4355 : for (type = !can_swap; type < ANON_AND_FILE; type++) {
4356 : if (min_seq[type] == lrugen->min_seq[type])
4357 : continue;
4358 :
4359 : reset_ctrl_pos(lruvec, type, true);
4360 : WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
4361 : success = true;
4362 : }
4363 :
4364 : return success;
4365 : }
4366 :
4367 : static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
4368 : {
4369 : int prev, next;
4370 : int type, zone;
4371 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
4372 :
4373 : spin_lock_irq(&lruvec->lru_lock);
4374 :
4375 : VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
4376 :
4377 : for (type = ANON_AND_FILE - 1; type >= 0; type--) {
4378 : if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
4379 : continue;
4380 :
4381 : VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
4382 :
4383 : while (!inc_min_seq(lruvec, type, can_swap)) {
4384 : spin_unlock_irq(&lruvec->lru_lock);
4385 : cond_resched();
4386 : spin_lock_irq(&lruvec->lru_lock);
4387 : }
4388 : }
4389 :
4390 : /*
4391 : * Update the active/inactive LRU sizes for compatibility. Both sides of
4392 : * the current max_seq need to be covered, since max_seq+1 can overlap
4393 : * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
4394 : * overlap, cold/hot inversion happens.
4395 : */
4396 : prev = lru_gen_from_seq(lrugen->max_seq - 1);
4397 : next = lru_gen_from_seq(lrugen->max_seq + 1);
4398 :
4399 : for (type = 0; type < ANON_AND_FILE; type++) {
4400 : for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4401 : enum lru_list lru = type * LRU_INACTIVE_FILE;
4402 : long delta = lrugen->nr_pages[prev][type][zone] -
4403 : lrugen->nr_pages[next][type][zone];
4404 :
4405 : if (!delta)
4406 : continue;
4407 :
4408 : __update_lru_size(lruvec, lru, zone, delta);
4409 : __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
4410 : }
4411 : }
4412 :
4413 : for (type = 0; type < ANON_AND_FILE; type++)
4414 : reset_ctrl_pos(lruvec, type, false);
4415 :
4416 : WRITE_ONCE(lrugen->timestamps[next], jiffies);
4417 : /* make sure preceding modifications appear */
4418 : smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
4419 :
4420 : spin_unlock_irq(&lruvec->lru_lock);
4421 : }
4422 :
4423 : static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
4424 : struct scan_control *sc, bool can_swap, bool force_scan)
4425 : {
4426 : bool success;
4427 : struct lru_gen_mm_walk *walk;
4428 : struct mm_struct *mm = NULL;
4429 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
4430 :
4431 : VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
4432 :
4433 : /* see the comment in iterate_mm_list() */
4434 : if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
4435 : success = false;
4436 : goto done;
4437 : }
4438 :
4439 : /*
4440 : * If the hardware doesn't automatically set the accessed bit, fallback
4441 : * to lru_gen_look_around(), which only clears the accessed bit in a
4442 : * handful of PTEs. Spreading the work out over a period of time usually
4443 : * is less efficient, but it avoids bursty page faults.
4444 : */
4445 : if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
4446 : success = iterate_mm_list_nowalk(lruvec, max_seq);
4447 : goto done;
4448 : }
4449 :
4450 : walk = set_mm_walk(NULL, true);
4451 : if (!walk) {
4452 : success = iterate_mm_list_nowalk(lruvec, max_seq);
4453 : goto done;
4454 : }
4455 :
4456 : walk->lruvec = lruvec;
4457 : walk->max_seq = max_seq;
4458 : walk->can_swap = can_swap;
4459 : walk->force_scan = force_scan;
4460 :
4461 : do {
4462 : success = iterate_mm_list(lruvec, walk, &mm);
4463 : if (mm)
4464 : walk_mm(lruvec, mm, walk);
4465 :
4466 : cond_resched();
4467 : } while (mm);
4468 : done:
4469 : if (!success) {
4470 : if (sc->priority <= DEF_PRIORITY - 2)
4471 : wait_event_killable(lruvec->mm_state.wait,
4472 : max_seq < READ_ONCE(lrugen->max_seq));
4473 : return false;
4474 : }
4475 :
4476 : VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
4477 :
4478 : inc_max_seq(lruvec, can_swap, force_scan);
4479 : /* either this sees any waiters or they will see updated max_seq */
4480 : if (wq_has_sleeper(&lruvec->mm_state.wait))
4481 : wake_up_all(&lruvec->mm_state.wait);
4482 :
4483 : return true;
4484 : }
4485 :
4486 : /******************************************************************************
4487 : * working set protection
4488 : ******************************************************************************/
4489 :
4490 : static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
4491 : {
4492 : int gen, type, zone;
4493 : unsigned long total = 0;
4494 : bool can_swap = get_swappiness(lruvec, sc);
4495 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
4496 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4497 : DEFINE_MAX_SEQ(lruvec);
4498 : DEFINE_MIN_SEQ(lruvec);
4499 :
4500 : for (type = !can_swap; type < ANON_AND_FILE; type++) {
4501 : unsigned long seq;
4502 :
4503 : for (seq = min_seq[type]; seq <= max_seq; seq++) {
4504 : gen = lru_gen_from_seq(seq);
4505 :
4506 : for (zone = 0; zone < MAX_NR_ZONES; zone++)
4507 : total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
4508 : }
4509 : }
4510 :
4511 : /* whether the size is big enough to be helpful */
4512 : return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
4513 : }
4514 :
4515 : static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
4516 : unsigned long min_ttl)
4517 : {
4518 : int gen;
4519 : unsigned long birth;
4520 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4521 : DEFINE_MIN_SEQ(lruvec);
4522 :
4523 : /* see the comment on lru_gen_folio */
4524 : gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
4525 : birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
4526 :
4527 : if (time_is_after_jiffies(birth + min_ttl))
4528 : return false;
4529 :
4530 : if (!lruvec_is_sizable(lruvec, sc))
4531 : return false;
4532 :
4533 : mem_cgroup_calculate_protection(NULL, memcg);
4534 :
4535 : return !mem_cgroup_below_min(NULL, memcg);
4536 : }
4537 :
4538 : /* to protect the working set of the last N jiffies */
4539 : static unsigned long lru_gen_min_ttl __read_mostly;
4540 :
4541 : static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
4542 : {
4543 : struct mem_cgroup *memcg;
4544 : unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
4545 :
4546 : VM_WARN_ON_ONCE(!current_is_kswapd());
4547 :
4548 : /* check the order to exclude compaction-induced reclaim */
4549 : if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
4550 : return;
4551 :
4552 : memcg = mem_cgroup_iter(NULL, NULL, NULL);
4553 : do {
4554 : struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4555 :
4556 : if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
4557 : mem_cgroup_iter_break(NULL, memcg);
4558 : return;
4559 : }
4560 :
4561 : cond_resched();
4562 : } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
4563 :
4564 : /*
4565 : * The main goal is to OOM kill if every generation from all memcgs is
4566 : * younger than min_ttl. However, another possibility is all memcgs are
4567 : * either too small or below min.
4568 : */
4569 : if (mutex_trylock(&oom_lock)) {
4570 : struct oom_control oc = {
4571 : .gfp_mask = sc->gfp_mask,
4572 : };
4573 :
4574 : out_of_memory(&oc);
4575 :
4576 : mutex_unlock(&oom_lock);
4577 : }
4578 : }
4579 :
4580 : /******************************************************************************
4581 : * rmap/PT walk feedback
4582 : ******************************************************************************/
4583 :
4584 : /*
4585 : * This function exploits spatial locality when shrink_folio_list() walks the
4586 : * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
4587 : * the scan was done cacheline efficiently, it adds the PMD entry pointing to
4588 : * the PTE table to the Bloom filter. This forms a feedback loop between the
4589 : * eviction and the aging.
4590 : */
4591 : void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
4592 : {
4593 : int i;
4594 : unsigned long start;
4595 : unsigned long end;
4596 : struct lru_gen_mm_walk *walk;
4597 : int young = 0;
4598 : pte_t *pte = pvmw->pte;
4599 : unsigned long addr = pvmw->address;
4600 : struct folio *folio = pfn_folio(pvmw->pfn);
4601 : struct mem_cgroup *memcg = folio_memcg(folio);
4602 : struct pglist_data *pgdat = folio_pgdat(folio);
4603 : struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4604 : DEFINE_MAX_SEQ(lruvec);
4605 : int old_gen, new_gen = lru_gen_from_seq(max_seq);
4606 :
4607 : lockdep_assert_held(pvmw->ptl);
4608 : VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
4609 :
4610 : if (spin_is_contended(pvmw->ptl))
4611 : return;
4612 :
4613 : /* avoid taking the LRU lock under the PTL when possible */
4614 : walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
4615 :
4616 : start = max(addr & PMD_MASK, pvmw->vma->vm_start);
4617 : end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
4618 :
4619 : if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
4620 : if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
4621 : end = start + MIN_LRU_BATCH * PAGE_SIZE;
4622 : else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
4623 : start = end - MIN_LRU_BATCH * PAGE_SIZE;
4624 : else {
4625 : start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
4626 : end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
4627 : }
4628 : }
4629 :
4630 : /* folio_update_gen() requires stable folio_memcg() */
4631 : if (!mem_cgroup_trylock_pages(memcg))
4632 : return;
4633 :
4634 : arch_enter_lazy_mmu_mode();
4635 :
4636 : pte -= (addr - start) / PAGE_SIZE;
4637 :
4638 : for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
4639 : unsigned long pfn;
4640 :
4641 : pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
4642 : if (pfn == -1)
4643 : continue;
4644 :
4645 : if (!pte_young(pte[i]))
4646 : continue;
4647 :
4648 : folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
4649 : if (!folio)
4650 : continue;
4651 :
4652 : if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
4653 : VM_WARN_ON_ONCE(true);
4654 :
4655 : young++;
4656 :
4657 : if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
4658 : !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
4659 : !folio_test_swapcache(folio)))
4660 : folio_mark_dirty(folio);
4661 :
4662 : if (walk) {
4663 : old_gen = folio_update_gen(folio, new_gen);
4664 : if (old_gen >= 0 && old_gen != new_gen)
4665 : update_batch_size(walk, folio, old_gen, new_gen);
4666 :
4667 : continue;
4668 : }
4669 :
4670 : old_gen = folio_lru_gen(folio);
4671 : if (old_gen < 0)
4672 : folio_set_referenced(folio);
4673 : else if (old_gen != new_gen)
4674 : folio_activate(folio);
4675 : }
4676 :
4677 : arch_leave_lazy_mmu_mode();
4678 : mem_cgroup_unlock_pages();
4679 :
4680 : /* feedback from rmap walkers to page table walkers */
4681 : if (suitable_to_scan(i, young))
4682 : update_bloom_filter(lruvec, max_seq, pvmw->pmd);
4683 : }
4684 :
4685 : /******************************************************************************
4686 : * memcg LRU
4687 : ******************************************************************************/
4688 :
4689 : /* see the comment on MEMCG_NR_GENS */
4690 : enum {
4691 : MEMCG_LRU_NOP,
4692 : MEMCG_LRU_HEAD,
4693 : MEMCG_LRU_TAIL,
4694 : MEMCG_LRU_OLD,
4695 : MEMCG_LRU_YOUNG,
4696 : };
4697 :
4698 : #ifdef CONFIG_MEMCG
4699 :
4700 : static int lru_gen_memcg_seg(struct lruvec *lruvec)
4701 : {
4702 : return READ_ONCE(lruvec->lrugen.seg);
4703 : }
4704 :
4705 : static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
4706 : {
4707 : int seg;
4708 : int old, new;
4709 : int bin = get_random_u32_below(MEMCG_NR_BINS);
4710 : struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4711 :
4712 : spin_lock(&pgdat->memcg_lru.lock);
4713 :
4714 : VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
4715 :
4716 : seg = 0;
4717 : new = old = lruvec->lrugen.gen;
4718 :
4719 : /* see the comment on MEMCG_NR_GENS */
4720 : if (op == MEMCG_LRU_HEAD)
4721 : seg = MEMCG_LRU_HEAD;
4722 : else if (op == MEMCG_LRU_TAIL)
4723 : seg = MEMCG_LRU_TAIL;
4724 : else if (op == MEMCG_LRU_OLD)
4725 : new = get_memcg_gen(pgdat->memcg_lru.seq);
4726 : else if (op == MEMCG_LRU_YOUNG)
4727 : new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
4728 : else
4729 : VM_WARN_ON_ONCE(true);
4730 :
4731 : hlist_nulls_del_rcu(&lruvec->lrugen.list);
4732 :
4733 : if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
4734 : hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
4735 : else
4736 : hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
4737 :
4738 : pgdat->memcg_lru.nr_memcgs[old]--;
4739 : pgdat->memcg_lru.nr_memcgs[new]++;
4740 :
4741 : lruvec->lrugen.gen = new;
4742 : WRITE_ONCE(lruvec->lrugen.seg, seg);
4743 :
4744 : if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
4745 : WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
4746 :
4747 : spin_unlock(&pgdat->memcg_lru.lock);
4748 : }
4749 :
4750 : void lru_gen_online_memcg(struct mem_cgroup *memcg)
4751 : {
4752 : int gen;
4753 : int nid;
4754 : int bin = get_random_u32_below(MEMCG_NR_BINS);
4755 :
4756 : for_each_node(nid) {
4757 : struct pglist_data *pgdat = NODE_DATA(nid);
4758 : struct lruvec *lruvec = get_lruvec(memcg, nid);
4759 :
4760 : spin_lock(&pgdat->memcg_lru.lock);
4761 :
4762 : VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
4763 :
4764 : gen = get_memcg_gen(pgdat->memcg_lru.seq);
4765 :
4766 : hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
4767 : pgdat->memcg_lru.nr_memcgs[gen]++;
4768 :
4769 : lruvec->lrugen.gen = gen;
4770 :
4771 : spin_unlock(&pgdat->memcg_lru.lock);
4772 : }
4773 : }
4774 :
4775 : void lru_gen_offline_memcg(struct mem_cgroup *memcg)
4776 : {
4777 : int nid;
4778 :
4779 : for_each_node(nid) {
4780 : struct lruvec *lruvec = get_lruvec(memcg, nid);
4781 :
4782 : lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
4783 : }
4784 : }
4785 :
4786 : void lru_gen_release_memcg(struct mem_cgroup *memcg)
4787 : {
4788 : int gen;
4789 : int nid;
4790 :
4791 : for_each_node(nid) {
4792 : struct pglist_data *pgdat = NODE_DATA(nid);
4793 : struct lruvec *lruvec = get_lruvec(memcg, nid);
4794 :
4795 : spin_lock(&pgdat->memcg_lru.lock);
4796 :
4797 : VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
4798 :
4799 : gen = lruvec->lrugen.gen;
4800 :
4801 : hlist_nulls_del_rcu(&lruvec->lrugen.list);
4802 : pgdat->memcg_lru.nr_memcgs[gen]--;
4803 :
4804 : if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
4805 : WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
4806 :
4807 : spin_unlock(&pgdat->memcg_lru.lock);
4808 : }
4809 : }
4810 :
4811 : void lru_gen_soft_reclaim(struct lruvec *lruvec)
4812 : {
4813 : /* see the comment on MEMCG_NR_GENS */
4814 : if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
4815 : lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
4816 : }
4817 :
4818 : #else /* !CONFIG_MEMCG */
4819 :
4820 : static int lru_gen_memcg_seg(struct lruvec *lruvec)
4821 : {
4822 : return 0;
4823 : }
4824 :
4825 : #endif
4826 :
4827 : /******************************************************************************
4828 : * the eviction
4829 : ******************************************************************************/
4830 :
4831 : static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
4832 : {
4833 : bool success;
4834 : int gen = folio_lru_gen(folio);
4835 : int type = folio_is_file_lru(folio);
4836 : int zone = folio_zonenum(folio);
4837 : int delta = folio_nr_pages(folio);
4838 : int refs = folio_lru_refs(folio);
4839 : int tier = lru_tier_from_refs(refs);
4840 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
4841 :
4842 : VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
4843 :
4844 : /* unevictable */
4845 : if (!folio_evictable(folio)) {
4846 : success = lru_gen_del_folio(lruvec, folio, true);
4847 : VM_WARN_ON_ONCE_FOLIO(!success, folio);
4848 : folio_set_unevictable(folio);
4849 : lruvec_add_folio(lruvec, folio);
4850 : __count_vm_events(UNEVICTABLE_PGCULLED, delta);
4851 : return true;
4852 : }
4853 :
4854 : /* dirty lazyfree */
4855 : if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
4856 : success = lru_gen_del_folio(lruvec, folio, true);
4857 : VM_WARN_ON_ONCE_FOLIO(!success, folio);
4858 : folio_set_swapbacked(folio);
4859 : lruvec_add_folio_tail(lruvec, folio);
4860 : return true;
4861 : }
4862 :
4863 : /* promoted */
4864 : if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
4865 : list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
4866 : return true;
4867 : }
4868 :
4869 : /* protected */
4870 : if (tier > tier_idx) {
4871 : int hist = lru_hist_from_seq(lrugen->min_seq[type]);
4872 :
4873 : gen = folio_inc_gen(lruvec, folio, false);
4874 : list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
4875 :
4876 : WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
4877 : lrugen->protected[hist][type][tier - 1] + delta);
4878 : __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
4879 : return true;
4880 : }
4881 :
4882 : /* waiting for writeback */
4883 : if (folio_test_locked(folio) || folio_test_writeback(folio) ||
4884 : (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
4885 : gen = folio_inc_gen(lruvec, folio, true);
4886 : list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
4887 : return true;
4888 : }
4889 :
4890 : return false;
4891 : }
4892 :
4893 : static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc)
4894 : {
4895 : bool success;
4896 :
4897 : /* swapping inhibited */
4898 : if (!(sc->gfp_mask & __GFP_IO) &&
4899 : (folio_test_dirty(folio) ||
4900 : (folio_test_anon(folio) && !folio_test_swapcache(folio))))
4901 : return false;
4902 :
4903 : /* raced with release_pages() */
4904 : if (!folio_try_get(folio))
4905 : return false;
4906 :
4907 : /* raced with another isolation */
4908 : if (!folio_test_clear_lru(folio)) {
4909 : folio_put(folio);
4910 : return false;
4911 : }
4912 :
4913 : /* see the comment on MAX_NR_TIERS */
4914 : if (!folio_test_referenced(folio))
4915 : set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
4916 :
4917 : /* for shrink_folio_list() */
4918 : folio_clear_reclaim(folio);
4919 : folio_clear_referenced(folio);
4920 :
4921 : success = lru_gen_del_folio(lruvec, folio, true);
4922 : VM_WARN_ON_ONCE_FOLIO(!success, folio);
4923 :
4924 : return true;
4925 : }
4926 :
4927 : static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
4928 : int type, int tier, struct list_head *list)
4929 : {
4930 : int gen, zone;
4931 : enum vm_event_item item;
4932 : int sorted = 0;
4933 : int scanned = 0;
4934 : int isolated = 0;
4935 : int remaining = MAX_LRU_BATCH;
4936 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
4937 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4938 :
4939 : VM_WARN_ON_ONCE(!list_empty(list));
4940 :
4941 : if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
4942 : return 0;
4943 :
4944 : gen = lru_gen_from_seq(lrugen->min_seq[type]);
4945 :
4946 : for (zone = sc->reclaim_idx; zone >= 0; zone--) {
4947 : LIST_HEAD(moved);
4948 : int skipped = 0;
4949 : struct list_head *head = &lrugen->folios[gen][type][zone];
4950 :
4951 : while (!list_empty(head)) {
4952 : struct folio *folio = lru_to_folio(head);
4953 : int delta = folio_nr_pages(folio);
4954 :
4955 : VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
4956 : VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
4957 : VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
4958 : VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
4959 :
4960 : scanned += delta;
4961 :
4962 : if (sort_folio(lruvec, folio, tier))
4963 : sorted += delta;
4964 : else if (isolate_folio(lruvec, folio, sc)) {
4965 : list_add(&folio->lru, list);
4966 : isolated += delta;
4967 : } else {
4968 : list_move(&folio->lru, &moved);
4969 : skipped += delta;
4970 : }
4971 :
4972 : if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
4973 : break;
4974 : }
4975 :
4976 : if (skipped) {
4977 : list_splice(&moved, head);
4978 : __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
4979 : }
4980 :
4981 : if (!remaining || isolated >= MIN_LRU_BATCH)
4982 : break;
4983 : }
4984 :
4985 : item = PGSCAN_KSWAPD + reclaimer_offset();
4986 : if (!cgroup_reclaim(sc)) {
4987 : __count_vm_events(item, isolated);
4988 : __count_vm_events(PGREFILL, sorted);
4989 : }
4990 : __count_memcg_events(memcg, item, isolated);
4991 : __count_memcg_events(memcg, PGREFILL, sorted);
4992 : __count_vm_events(PGSCAN_ANON + type, isolated);
4993 :
4994 : /*
4995 : * There might not be eligible folios due to reclaim_idx. Check the
4996 : * remaining to prevent livelock if it's not making progress.
4997 : */
4998 : return isolated || !remaining ? scanned : 0;
4999 : }
5000 :
5001 : static int get_tier_idx(struct lruvec *lruvec, int type)
5002 : {
5003 : int tier;
5004 : struct ctrl_pos sp, pv;
5005 :
5006 : /*
5007 : * To leave a margin for fluctuations, use a larger gain factor (1:2).
5008 : * This value is chosen because any other tier would have at least twice
5009 : * as many refaults as the first tier.
5010 : */
5011 : read_ctrl_pos(lruvec, type, 0, 1, &sp);
5012 : for (tier = 1; tier < MAX_NR_TIERS; tier++) {
5013 : read_ctrl_pos(lruvec, type, tier, 2, &pv);
5014 : if (!positive_ctrl_err(&sp, &pv))
5015 : break;
5016 : }
5017 :
5018 : return tier - 1;
5019 : }
5020 :
5021 : static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
5022 : {
5023 : int type, tier;
5024 : struct ctrl_pos sp, pv;
5025 : int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
5026 :
5027 : /*
5028 : * Compare the first tier of anon with that of file to determine which
5029 : * type to scan. Also need to compare other tiers of the selected type
5030 : * with the first tier of the other type to determine the last tier (of
5031 : * the selected type) to evict.
5032 : */
5033 : read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
5034 : read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
5035 : type = positive_ctrl_err(&sp, &pv);
5036 :
5037 : read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
5038 : for (tier = 1; tier < MAX_NR_TIERS; tier++) {
5039 : read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
5040 : if (!positive_ctrl_err(&sp, &pv))
5041 : break;
5042 : }
5043 :
5044 : *tier_idx = tier - 1;
5045 :
5046 : return type;
5047 : }
5048 :
5049 : static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
5050 : int *type_scanned, struct list_head *list)
5051 : {
5052 : int i;
5053 : int type;
5054 : int scanned;
5055 : int tier = -1;
5056 : DEFINE_MIN_SEQ(lruvec);
5057 :
5058 : /*
5059 : * Try to make the obvious choice first. When anon and file are both
5060 : * available from the same generation, interpret swappiness 1 as file
5061 : * first and 200 as anon first.
5062 : */
5063 : if (!swappiness)
5064 : type = LRU_GEN_FILE;
5065 : else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
5066 : type = LRU_GEN_ANON;
5067 : else if (swappiness == 1)
5068 : type = LRU_GEN_FILE;
5069 : else if (swappiness == 200)
5070 : type = LRU_GEN_ANON;
5071 : else
5072 : type = get_type_to_scan(lruvec, swappiness, &tier);
5073 :
5074 : for (i = !swappiness; i < ANON_AND_FILE; i++) {
5075 : if (tier < 0)
5076 : tier = get_tier_idx(lruvec, type);
5077 :
5078 : scanned = scan_folios(lruvec, sc, type, tier, list);
5079 : if (scanned)
5080 : break;
5081 :
5082 : type = !type;
5083 : tier = -1;
5084 : }
5085 :
5086 : *type_scanned = type;
5087 :
5088 : return scanned;
5089 : }
5090 :
5091 : static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
5092 : {
5093 : int type;
5094 : int scanned;
5095 : int reclaimed;
5096 : LIST_HEAD(list);
5097 : LIST_HEAD(clean);
5098 : struct folio *folio;
5099 : struct folio *next;
5100 : enum vm_event_item item;
5101 : struct reclaim_stat stat;
5102 : struct lru_gen_mm_walk *walk;
5103 : bool skip_retry = false;
5104 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
5105 : struct pglist_data *pgdat = lruvec_pgdat(lruvec);
5106 :
5107 : spin_lock_irq(&lruvec->lru_lock);
5108 :
5109 : scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
5110 :
5111 : scanned += try_to_inc_min_seq(lruvec, swappiness);
5112 :
5113 : if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
5114 : scanned = 0;
5115 :
5116 : spin_unlock_irq(&lruvec->lru_lock);
5117 :
5118 : if (list_empty(&list))
5119 : return scanned;
5120 : retry:
5121 : reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
5122 : sc->nr_reclaimed += reclaimed;
5123 :
5124 : list_for_each_entry_safe_reverse(folio, next, &list, lru) {
5125 : if (!folio_evictable(folio)) {
5126 : list_del(&folio->lru);
5127 : folio_putback_lru(folio);
5128 : continue;
5129 : }
5130 :
5131 : if (folio_test_reclaim(folio) &&
5132 : (folio_test_dirty(folio) || folio_test_writeback(folio))) {
5133 : /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
5134 : if (folio_test_workingset(folio))
5135 : folio_set_referenced(folio);
5136 : continue;
5137 : }
5138 :
5139 : if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
5140 : folio_mapped(folio) || folio_test_locked(folio) ||
5141 : folio_test_dirty(folio) || folio_test_writeback(folio)) {
5142 : /* don't add rejected folios to the oldest generation */
5143 : set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
5144 : BIT(PG_active));
5145 : continue;
5146 : }
5147 :
5148 : /* retry folios that may have missed folio_rotate_reclaimable() */
5149 : list_move(&folio->lru, &clean);
5150 : sc->nr_scanned -= folio_nr_pages(folio);
5151 : }
5152 :
5153 : spin_lock_irq(&lruvec->lru_lock);
5154 :
5155 : move_folios_to_lru(lruvec, &list);
5156 :
5157 : walk = current->reclaim_state->mm_walk;
5158 : if (walk && walk->batched)
5159 : reset_batch_size(lruvec, walk);
5160 :
5161 : item = PGSTEAL_KSWAPD + reclaimer_offset();
5162 : if (!cgroup_reclaim(sc))
5163 : __count_vm_events(item, reclaimed);
5164 : __count_memcg_events(memcg, item, reclaimed);
5165 : __count_vm_events(PGSTEAL_ANON + type, reclaimed);
5166 :
5167 : spin_unlock_irq(&lruvec->lru_lock);
5168 :
5169 : mem_cgroup_uncharge_list(&list);
5170 : free_unref_page_list(&list);
5171 :
5172 : INIT_LIST_HEAD(&list);
5173 : list_splice_init(&clean, &list);
5174 :
5175 : if (!list_empty(&list)) {
5176 : skip_retry = true;
5177 : goto retry;
5178 : }
5179 :
5180 : return scanned;
5181 : }
5182 :
5183 : static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
5184 : struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
5185 : {
5186 : int gen, type, zone;
5187 : unsigned long old = 0;
5188 : unsigned long young = 0;
5189 : unsigned long total = 0;
5190 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
5191 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
5192 : DEFINE_MIN_SEQ(lruvec);
5193 :
5194 : /* whether this lruvec is completely out of cold folios */
5195 : if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
5196 : *nr_to_scan = 0;
5197 : return true;
5198 : }
5199 :
5200 : for (type = !can_swap; type < ANON_AND_FILE; type++) {
5201 : unsigned long seq;
5202 :
5203 : for (seq = min_seq[type]; seq <= max_seq; seq++) {
5204 : unsigned long size = 0;
5205 :
5206 : gen = lru_gen_from_seq(seq);
5207 :
5208 : for (zone = 0; zone < MAX_NR_ZONES; zone++)
5209 : size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
5210 :
5211 : total += size;
5212 : if (seq == max_seq)
5213 : young += size;
5214 : else if (seq + MIN_NR_GENS == max_seq)
5215 : old += size;
5216 : }
5217 : }
5218 :
5219 : /* try to scrape all its memory if this memcg was deleted */
5220 : *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
5221 :
5222 : /*
5223 : * The aging tries to be lazy to reduce the overhead, while the eviction
5224 : * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
5225 : * ideal number of generations is MIN_NR_GENS+1.
5226 : */
5227 : if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
5228 : return false;
5229 :
5230 : /*
5231 : * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
5232 : * of the total number of pages for each generation. A reasonable range
5233 : * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
5234 : * aging cares about the upper bound of hot pages, while the eviction
5235 : * cares about the lower bound of cold pages.
5236 : */
5237 : if (young * MIN_NR_GENS > total)
5238 : return true;
5239 : if (old * (MIN_NR_GENS + 2) < total)
5240 : return true;
5241 :
5242 : return false;
5243 : }
5244 :
5245 : /*
5246 : * For future optimizations:
5247 : * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
5248 : * reclaim.
5249 : */
5250 : static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
5251 : {
5252 : unsigned long nr_to_scan;
5253 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
5254 : DEFINE_MAX_SEQ(lruvec);
5255 :
5256 : if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
5257 : return 0;
5258 :
5259 : if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
5260 : return nr_to_scan;
5261 :
5262 : /* skip the aging path at the default priority */
5263 : if (sc->priority == DEF_PRIORITY)
5264 : return nr_to_scan;
5265 :
5266 : /* skip this lruvec as it's low on cold folios */
5267 : return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
5268 : }
5269 :
5270 : static unsigned long get_nr_to_reclaim(struct scan_control *sc)
5271 : {
5272 : /* don't abort memcg reclaim to ensure fairness */
5273 : if (!global_reclaim(sc))
5274 : return -1;
5275 :
5276 : return max(sc->nr_to_reclaim, compact_gap(sc->order));
5277 : }
5278 :
5279 : static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
5280 : {
5281 : long nr_to_scan;
5282 : unsigned long scanned = 0;
5283 : unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
5284 : int swappiness = get_swappiness(lruvec, sc);
5285 :
5286 : /* clean file folios are more likely to exist */
5287 : if (swappiness && !(sc->gfp_mask & __GFP_IO))
5288 : swappiness = 1;
5289 :
5290 : while (true) {
5291 : int delta;
5292 :
5293 : nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
5294 : if (nr_to_scan <= 0)
5295 : break;
5296 :
5297 : delta = evict_folios(lruvec, sc, swappiness);
5298 : if (!delta)
5299 : break;
5300 :
5301 : scanned += delta;
5302 : if (scanned >= nr_to_scan)
5303 : break;
5304 :
5305 : if (sc->nr_reclaimed >= nr_to_reclaim)
5306 : break;
5307 :
5308 : cond_resched();
5309 : }
5310 :
5311 : /* whether try_to_inc_max_seq() was successful */
5312 : return nr_to_scan < 0;
5313 : }
5314 :
5315 : static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
5316 : {
5317 : bool success;
5318 : unsigned long scanned = sc->nr_scanned;
5319 : unsigned long reclaimed = sc->nr_reclaimed;
5320 : int seg = lru_gen_memcg_seg(lruvec);
5321 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
5322 : struct pglist_data *pgdat = lruvec_pgdat(lruvec);
5323 :
5324 : /* see the comment on MEMCG_NR_GENS */
5325 : if (!lruvec_is_sizable(lruvec, sc))
5326 : return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
5327 :
5328 : mem_cgroup_calculate_protection(NULL, memcg);
5329 :
5330 : if (mem_cgroup_below_min(NULL, memcg))
5331 : return MEMCG_LRU_YOUNG;
5332 :
5333 : if (mem_cgroup_below_low(NULL, memcg)) {
5334 : /* see the comment on MEMCG_NR_GENS */
5335 : if (seg != MEMCG_LRU_TAIL)
5336 : return MEMCG_LRU_TAIL;
5337 :
5338 : memcg_memory_event(memcg, MEMCG_LOW);
5339 : }
5340 :
5341 : success = try_to_shrink_lruvec(lruvec, sc);
5342 :
5343 : shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
5344 :
5345 : if (!sc->proactive)
5346 : vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
5347 : sc->nr_reclaimed - reclaimed);
5348 :
5349 : sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
5350 : current->reclaim_state->reclaimed_slab = 0;
5351 :
5352 : return success ? MEMCG_LRU_YOUNG : 0;
5353 : }
5354 :
5355 : #ifdef CONFIG_MEMCG
5356 :
5357 : static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
5358 : {
5359 : int op;
5360 : int gen;
5361 : int bin;
5362 : int first_bin;
5363 : struct lruvec *lruvec;
5364 : struct lru_gen_folio *lrugen;
5365 : struct mem_cgroup *memcg;
5366 : const struct hlist_nulls_node *pos;
5367 : unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
5368 :
5369 : bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
5370 : restart:
5371 : op = 0;
5372 : memcg = NULL;
5373 : gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
5374 :
5375 : rcu_read_lock();
5376 :
5377 : hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
5378 : if (op)
5379 : lru_gen_rotate_memcg(lruvec, op);
5380 :
5381 : mem_cgroup_put(memcg);
5382 :
5383 : lruvec = container_of(lrugen, struct lruvec, lrugen);
5384 : memcg = lruvec_memcg(lruvec);
5385 :
5386 : if (!mem_cgroup_tryget(memcg)) {
5387 : op = 0;
5388 : memcg = NULL;
5389 : continue;
5390 : }
5391 :
5392 : rcu_read_unlock();
5393 :
5394 : op = shrink_one(lruvec, sc);
5395 :
5396 : rcu_read_lock();
5397 :
5398 : if (sc->nr_reclaimed >= nr_to_reclaim)
5399 : break;
5400 : }
5401 :
5402 : rcu_read_unlock();
5403 :
5404 : if (op)
5405 : lru_gen_rotate_memcg(lruvec, op);
5406 :
5407 : mem_cgroup_put(memcg);
5408 :
5409 : if (sc->nr_reclaimed >= nr_to_reclaim)
5410 : return;
5411 :
5412 : /* restart if raced with lru_gen_rotate_memcg() */
5413 : if (gen != get_nulls_value(pos))
5414 : goto restart;
5415 :
5416 : /* try the rest of the bins of the current generation */
5417 : bin = get_memcg_bin(bin + 1);
5418 : if (bin != first_bin)
5419 : goto restart;
5420 : }
5421 :
5422 : static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
5423 : {
5424 : struct blk_plug plug;
5425 :
5426 : VM_WARN_ON_ONCE(global_reclaim(sc));
5427 : VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
5428 :
5429 : lru_add_drain();
5430 :
5431 : blk_start_plug(&plug);
5432 :
5433 : set_mm_walk(NULL, sc->proactive);
5434 :
5435 : if (try_to_shrink_lruvec(lruvec, sc))
5436 : lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
5437 :
5438 : clear_mm_walk();
5439 :
5440 : blk_finish_plug(&plug);
5441 : }
5442 :
5443 : #else /* !CONFIG_MEMCG */
5444 :
5445 : static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
5446 : {
5447 : BUILD_BUG();
5448 : }
5449 :
5450 : static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
5451 : {
5452 : BUILD_BUG();
5453 : }
5454 :
5455 : #endif
5456 :
5457 : static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
5458 : {
5459 : int priority;
5460 : unsigned long reclaimable;
5461 : struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
5462 :
5463 : if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
5464 : return;
5465 : /*
5466 : * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
5467 : * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
5468 : * estimated reclaimed_to_scanned_ratio = inactive / total.
5469 : */
5470 : reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
5471 : if (get_swappiness(lruvec, sc))
5472 : reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
5473 :
5474 : reclaimable /= MEMCG_NR_GENS;
5475 :
5476 : /* round down reclaimable and round up sc->nr_to_reclaim */
5477 : priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
5478 :
5479 : sc->priority = clamp(priority, 0, DEF_PRIORITY);
5480 : }
5481 :
5482 : static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
5483 : {
5484 : struct blk_plug plug;
5485 : unsigned long reclaimed = sc->nr_reclaimed;
5486 :
5487 : VM_WARN_ON_ONCE(!global_reclaim(sc));
5488 :
5489 : /*
5490 : * Unmapped clean folios are already prioritized. Scanning for more of
5491 : * them is likely futile and can cause high reclaim latency when there
5492 : * is a large number of memcgs.
5493 : */
5494 : if (!sc->may_writepage || !sc->may_unmap)
5495 : goto done;
5496 :
5497 : lru_add_drain();
5498 :
5499 : blk_start_plug(&plug);
5500 :
5501 : set_mm_walk(pgdat, sc->proactive);
5502 :
5503 : set_initial_priority(pgdat, sc);
5504 :
5505 : if (current_is_kswapd())
5506 : sc->nr_reclaimed = 0;
5507 :
5508 : if (mem_cgroup_disabled())
5509 : shrink_one(&pgdat->__lruvec, sc);
5510 : else
5511 : shrink_many(pgdat, sc);
5512 :
5513 : if (current_is_kswapd())
5514 : sc->nr_reclaimed += reclaimed;
5515 :
5516 : clear_mm_walk();
5517 :
5518 : blk_finish_plug(&plug);
5519 : done:
5520 : /* kswapd should never fail */
5521 : pgdat->kswapd_failures = 0;
5522 : }
5523 :
5524 : /******************************************************************************
5525 : * state change
5526 : ******************************************************************************/
5527 :
5528 : static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
5529 : {
5530 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
5531 :
5532 : if (lrugen->enabled) {
5533 : enum lru_list lru;
5534 :
5535 : for_each_evictable_lru(lru) {
5536 : if (!list_empty(&lruvec->lists[lru]))
5537 : return false;
5538 : }
5539 : } else {
5540 : int gen, type, zone;
5541 :
5542 : for_each_gen_type_zone(gen, type, zone) {
5543 : if (!list_empty(&lrugen->folios[gen][type][zone]))
5544 : return false;
5545 : }
5546 : }
5547 :
5548 : return true;
5549 : }
5550 :
5551 : static bool fill_evictable(struct lruvec *lruvec)
5552 : {
5553 : enum lru_list lru;
5554 : int remaining = MAX_LRU_BATCH;
5555 :
5556 : for_each_evictable_lru(lru) {
5557 : int type = is_file_lru(lru);
5558 : bool active = is_active_lru(lru);
5559 : struct list_head *head = &lruvec->lists[lru];
5560 :
5561 : while (!list_empty(head)) {
5562 : bool success;
5563 : struct folio *folio = lru_to_folio(head);
5564 :
5565 : VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
5566 : VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
5567 : VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
5568 : VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
5569 :
5570 : lruvec_del_folio(lruvec, folio);
5571 : success = lru_gen_add_folio(lruvec, folio, false);
5572 : VM_WARN_ON_ONCE(!success);
5573 :
5574 : if (!--remaining)
5575 : return false;
5576 : }
5577 : }
5578 :
5579 : return true;
5580 : }
5581 :
5582 : static bool drain_evictable(struct lruvec *lruvec)
5583 : {
5584 : int gen, type, zone;
5585 : int remaining = MAX_LRU_BATCH;
5586 :
5587 : for_each_gen_type_zone(gen, type, zone) {
5588 : struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];
5589 :
5590 : while (!list_empty(head)) {
5591 : bool success;
5592 : struct folio *folio = lru_to_folio(head);
5593 :
5594 : VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
5595 : VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
5596 : VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
5597 : VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
5598 :
5599 : success = lru_gen_del_folio(lruvec, folio, false);
5600 : VM_WARN_ON_ONCE(!success);
5601 : lruvec_add_folio(lruvec, folio);
5602 :
5603 : if (!--remaining)
5604 : return false;
5605 : }
5606 : }
5607 :
5608 : return true;
5609 : }
5610 :
5611 : static void lru_gen_change_state(bool enabled)
5612 : {
5613 : static DEFINE_MUTEX(state_mutex);
5614 :
5615 : struct mem_cgroup *memcg;
5616 :
5617 : cgroup_lock();
5618 : cpus_read_lock();
5619 : get_online_mems();
5620 : mutex_lock(&state_mutex);
5621 :
5622 : if (enabled == lru_gen_enabled())
5623 : goto unlock;
5624 :
5625 : if (enabled)
5626 : static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
5627 : else
5628 : static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
5629 :
5630 : memcg = mem_cgroup_iter(NULL, NULL, NULL);
5631 : do {
5632 : int nid;
5633 :
5634 : for_each_node(nid) {
5635 : struct lruvec *lruvec = get_lruvec(memcg, nid);
5636 :
5637 : spin_lock_irq(&lruvec->lru_lock);
5638 :
5639 : VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
5640 : VM_WARN_ON_ONCE(!state_is_valid(lruvec));
5641 :
5642 : lruvec->lrugen.enabled = enabled;
5643 :
5644 : while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
5645 : spin_unlock_irq(&lruvec->lru_lock);
5646 : cond_resched();
5647 : spin_lock_irq(&lruvec->lru_lock);
5648 : }
5649 :
5650 : spin_unlock_irq(&lruvec->lru_lock);
5651 : }
5652 :
5653 : cond_resched();
5654 : } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
5655 : unlock:
5656 : mutex_unlock(&state_mutex);
5657 : put_online_mems();
5658 : cpus_read_unlock();
5659 : cgroup_unlock();
5660 : }
5661 :
5662 : /******************************************************************************
5663 : * sysfs interface
5664 : ******************************************************************************/
5665 :
5666 : static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
5667 : {
5668 : return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
5669 : }
5670 :
5671 : /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5672 : static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
5673 : const char *buf, size_t len)
5674 : {
5675 : unsigned int msecs;
5676 :
5677 : if (kstrtouint(buf, 0, &msecs))
5678 : return -EINVAL;
5679 :
5680 : WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
5681 :
5682 : return len;
5683 : }
5684 :
5685 : static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
5686 : min_ttl_ms, 0644, show_min_ttl, store_min_ttl
5687 : );
5688 :
5689 : static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
5690 : {
5691 : unsigned int caps = 0;
5692 :
5693 : if (get_cap(LRU_GEN_CORE))
5694 : caps |= BIT(LRU_GEN_CORE);
5695 :
5696 : if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
5697 : caps |= BIT(LRU_GEN_MM_WALK);
5698 :
5699 : if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
5700 : caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
5701 :
5702 : return sysfs_emit(buf, "0x%04x\n", caps);
5703 : }
5704 :
5705 : /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5706 : static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
5707 : const char *buf, size_t len)
5708 : {
5709 : int i;
5710 : unsigned int caps;
5711 :
5712 : if (tolower(*buf) == 'n')
5713 : caps = 0;
5714 : else if (tolower(*buf) == 'y')
5715 : caps = -1;
5716 : else if (kstrtouint(buf, 0, &caps))
5717 : return -EINVAL;
5718 :
5719 : for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
5720 : bool enabled = caps & BIT(i);
5721 :
5722 : if (i == LRU_GEN_CORE)
5723 : lru_gen_change_state(enabled);
5724 : else if (enabled)
5725 : static_branch_enable(&lru_gen_caps[i]);
5726 : else
5727 : static_branch_disable(&lru_gen_caps[i]);
5728 : }
5729 :
5730 : return len;
5731 : }
5732 :
5733 : static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
5734 : enabled, 0644, show_enabled, store_enabled
5735 : );
5736 :
5737 : static struct attribute *lru_gen_attrs[] = {
5738 : &lru_gen_min_ttl_attr.attr,
5739 : &lru_gen_enabled_attr.attr,
5740 : NULL
5741 : };
5742 :
5743 : static struct attribute_group lru_gen_attr_group = {
5744 : .name = "lru_gen",
5745 : .attrs = lru_gen_attrs,
5746 : };
5747 :
5748 : /******************************************************************************
5749 : * debugfs interface
5750 : ******************************************************************************/
5751 :
5752 : static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
5753 : {
5754 : struct mem_cgroup *memcg;
5755 : loff_t nr_to_skip = *pos;
5756 :
5757 : m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
5758 : if (!m->private)
5759 : return ERR_PTR(-ENOMEM);
5760 :
5761 : memcg = mem_cgroup_iter(NULL, NULL, NULL);
5762 : do {
5763 : int nid;
5764 :
5765 : for_each_node_state(nid, N_MEMORY) {
5766 : if (!nr_to_skip--)
5767 : return get_lruvec(memcg, nid);
5768 : }
5769 : } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
5770 :
5771 : return NULL;
5772 : }
5773 :
5774 : static void lru_gen_seq_stop(struct seq_file *m, void *v)
5775 : {
5776 : if (!IS_ERR_OR_NULL(v))
5777 : mem_cgroup_iter_break(NULL, lruvec_memcg(v));
5778 :
5779 : kvfree(m->private);
5780 : m->private = NULL;
5781 : }
5782 :
5783 : static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
5784 : {
5785 : int nid = lruvec_pgdat(v)->node_id;
5786 : struct mem_cgroup *memcg = lruvec_memcg(v);
5787 :
5788 : ++*pos;
5789 :
5790 : nid = next_memory_node(nid);
5791 : if (nid == MAX_NUMNODES) {
5792 : memcg = mem_cgroup_iter(NULL, memcg, NULL);
5793 : if (!memcg)
5794 : return NULL;
5795 :
5796 : nid = first_memory_node;
5797 : }
5798 :
5799 : return get_lruvec(memcg, nid);
5800 : }
5801 :
5802 : static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
5803 : unsigned long max_seq, unsigned long *min_seq,
5804 : unsigned long seq)
5805 : {
5806 : int i;
5807 : int type, tier;
5808 : int hist = lru_hist_from_seq(seq);
5809 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
5810 :
5811 : for (tier = 0; tier < MAX_NR_TIERS; tier++) {
5812 : seq_printf(m, " %10d", tier);
5813 : for (type = 0; type < ANON_AND_FILE; type++) {
5814 : const char *s = " ";
5815 : unsigned long n[3] = {};
5816 :
5817 : if (seq == max_seq) {
5818 : s = "RT ";
5819 : n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
5820 : n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
5821 : } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
5822 : s = "rep";
5823 : n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
5824 : n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
5825 : if (tier)
5826 : n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
5827 : }
5828 :
5829 : for (i = 0; i < 3; i++)
5830 : seq_printf(m, " %10lu%c", n[i], s[i]);
5831 : }
5832 : seq_putc(m, '\n');
5833 : }
5834 :
5835 : seq_puts(m, " ");
5836 : for (i = 0; i < NR_MM_STATS; i++) {
5837 : const char *s = " ";
5838 : unsigned long n = 0;
5839 :
5840 : if (seq == max_seq && NR_HIST_GENS == 1) {
5841 : s = "LOYNFA";
5842 : n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
5843 : } else if (seq != max_seq && NR_HIST_GENS > 1) {
5844 : s = "loynfa";
5845 : n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
5846 : }
5847 :
5848 : seq_printf(m, " %10lu%c", n, s[i]);
5849 : }
5850 : seq_putc(m, '\n');
5851 : }
5852 :
5853 : /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5854 : static int lru_gen_seq_show(struct seq_file *m, void *v)
5855 : {
5856 : unsigned long seq;
5857 : bool full = !debugfs_real_fops(m->file)->write;
5858 : struct lruvec *lruvec = v;
5859 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
5860 : int nid = lruvec_pgdat(lruvec)->node_id;
5861 : struct mem_cgroup *memcg = lruvec_memcg(lruvec);
5862 : DEFINE_MAX_SEQ(lruvec);
5863 : DEFINE_MIN_SEQ(lruvec);
5864 :
5865 : if (nid == first_memory_node) {
5866 : const char *path = memcg ? m->private : "";
5867 :
5868 : #ifdef CONFIG_MEMCG
5869 : if (memcg)
5870 : cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
5871 : #endif
5872 : seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
5873 : }
5874 :
5875 : seq_printf(m, " node %5d\n", nid);
5876 :
5877 : if (!full)
5878 : seq = min_seq[LRU_GEN_ANON];
5879 : else if (max_seq >= MAX_NR_GENS)
5880 : seq = max_seq - MAX_NR_GENS + 1;
5881 : else
5882 : seq = 0;
5883 :
5884 : for (; seq <= max_seq; seq++) {
5885 : int type, zone;
5886 : int gen = lru_gen_from_seq(seq);
5887 : unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
5888 :
5889 : seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
5890 :
5891 : for (type = 0; type < ANON_AND_FILE; type++) {
5892 : unsigned long size = 0;
5893 : char mark = full && seq < min_seq[type] ? 'x' : ' ';
5894 :
5895 : for (zone = 0; zone < MAX_NR_ZONES; zone++)
5896 : size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
5897 :
5898 : seq_printf(m, " %10lu%c", size, mark);
5899 : }
5900 :
5901 : seq_putc(m, '\n');
5902 :
5903 : if (full)
5904 : lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
5905 : }
5906 :
5907 : return 0;
5908 : }
5909 :
5910 : static const struct seq_operations lru_gen_seq_ops = {
5911 : .start = lru_gen_seq_start,
5912 : .stop = lru_gen_seq_stop,
5913 : .next = lru_gen_seq_next,
5914 : .show = lru_gen_seq_show,
5915 : };
5916 :
5917 : static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
5918 : bool can_swap, bool force_scan)
5919 : {
5920 : DEFINE_MAX_SEQ(lruvec);
5921 : DEFINE_MIN_SEQ(lruvec);
5922 :
5923 : if (seq < max_seq)
5924 : return 0;
5925 :
5926 : if (seq > max_seq)
5927 : return -EINVAL;
5928 :
5929 : if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
5930 : return -ERANGE;
5931 :
5932 : try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
5933 :
5934 : return 0;
5935 : }
5936 :
5937 : static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
5938 : int swappiness, unsigned long nr_to_reclaim)
5939 : {
5940 : DEFINE_MAX_SEQ(lruvec);
5941 :
5942 : if (seq + MIN_NR_GENS > max_seq)
5943 : return -EINVAL;
5944 :
5945 : sc->nr_reclaimed = 0;
5946 :
5947 : while (!signal_pending(current)) {
5948 : DEFINE_MIN_SEQ(lruvec);
5949 :
5950 : if (seq < min_seq[!swappiness])
5951 : return 0;
5952 :
5953 : if (sc->nr_reclaimed >= nr_to_reclaim)
5954 : return 0;
5955 :
5956 : if (!evict_folios(lruvec, sc, swappiness))
5957 : return 0;
5958 :
5959 : cond_resched();
5960 : }
5961 :
5962 : return -EINTR;
5963 : }
5964 :
5965 : static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
5966 : struct scan_control *sc, int swappiness, unsigned long opt)
5967 : {
5968 : struct lruvec *lruvec;
5969 : int err = -EINVAL;
5970 : struct mem_cgroup *memcg = NULL;
5971 :
5972 : if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
5973 : return -EINVAL;
5974 :
5975 : if (!mem_cgroup_disabled()) {
5976 : rcu_read_lock();
5977 :
5978 : memcg = mem_cgroup_from_id(memcg_id);
5979 : if (!mem_cgroup_tryget(memcg))
5980 : memcg = NULL;
5981 :
5982 : rcu_read_unlock();
5983 :
5984 : if (!memcg)
5985 : return -EINVAL;
5986 : }
5987 :
5988 : if (memcg_id != mem_cgroup_id(memcg))
5989 : goto done;
5990 :
5991 : lruvec = get_lruvec(memcg, nid);
5992 :
5993 : if (swappiness < 0)
5994 : swappiness = get_swappiness(lruvec, sc);
5995 : else if (swappiness > 200)
5996 : goto done;
5997 :
5998 : switch (cmd) {
5999 : case '+':
6000 : err = run_aging(lruvec, seq, sc, swappiness, opt);
6001 : break;
6002 : case '-':
6003 : err = run_eviction(lruvec, seq, sc, swappiness, opt);
6004 : break;
6005 : }
6006 : done:
6007 : mem_cgroup_put(memcg);
6008 :
6009 : return err;
6010 : }
6011 :
6012 : /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
6013 : static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
6014 : size_t len, loff_t *pos)
6015 : {
6016 : void *buf;
6017 : char *cur, *next;
6018 : unsigned int flags;
6019 : struct blk_plug plug;
6020 : int err = -EINVAL;
6021 : struct scan_control sc = {
6022 : .may_writepage = true,
6023 : .may_unmap = true,
6024 : .may_swap = true,
6025 : .reclaim_idx = MAX_NR_ZONES - 1,
6026 : .gfp_mask = GFP_KERNEL,
6027 : };
6028 :
6029 : buf = kvmalloc(len + 1, GFP_KERNEL);
6030 : if (!buf)
6031 : return -ENOMEM;
6032 :
6033 : if (copy_from_user(buf, src, len)) {
6034 : kvfree(buf);
6035 : return -EFAULT;
6036 : }
6037 :
6038 : set_task_reclaim_state(current, &sc.reclaim_state);
6039 : flags = memalloc_noreclaim_save();
6040 : blk_start_plug(&plug);
6041 : if (!set_mm_walk(NULL, true)) {
6042 : err = -ENOMEM;
6043 : goto done;
6044 : }
6045 :
6046 : next = buf;
6047 : next[len] = '\0';
6048 :
6049 : while ((cur = strsep(&next, ",;\n"))) {
6050 : int n;
6051 : int end;
6052 : char cmd;
6053 : unsigned int memcg_id;
6054 : unsigned int nid;
6055 : unsigned long seq;
6056 : unsigned int swappiness = -1;
6057 : unsigned long opt = -1;
6058 :
6059 : cur = skip_spaces(cur);
6060 : if (!*cur)
6061 : continue;
6062 :
6063 : n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
6064 : &seq, &end, &swappiness, &end, &opt, &end);
6065 : if (n < 4 || cur[end]) {
6066 : err = -EINVAL;
6067 : break;
6068 : }
6069 :
6070 : err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
6071 : if (err)
6072 : break;
6073 : }
6074 : done:
6075 : clear_mm_walk();
6076 : blk_finish_plug(&plug);
6077 : memalloc_noreclaim_restore(flags);
6078 : set_task_reclaim_state(current, NULL);
6079 :
6080 : kvfree(buf);
6081 :
6082 : return err ? : len;
6083 : }
6084 :
6085 : static int lru_gen_seq_open(struct inode *inode, struct file *file)
6086 : {
6087 : return seq_open(file, &lru_gen_seq_ops);
6088 : }
6089 :
6090 : static const struct file_operations lru_gen_rw_fops = {
6091 : .open = lru_gen_seq_open,
6092 : .read = seq_read,
6093 : .write = lru_gen_seq_write,
6094 : .llseek = seq_lseek,
6095 : .release = seq_release,
6096 : };
6097 :
6098 : static const struct file_operations lru_gen_ro_fops = {
6099 : .open = lru_gen_seq_open,
6100 : .read = seq_read,
6101 : .llseek = seq_lseek,
6102 : .release = seq_release,
6103 : };
6104 :
6105 : /******************************************************************************
6106 : * initialization
6107 : ******************************************************************************/
6108 :
6109 : void lru_gen_init_lruvec(struct lruvec *lruvec)
6110 : {
6111 : int i;
6112 : int gen, type, zone;
6113 : struct lru_gen_folio *lrugen = &lruvec->lrugen;
6114 :
6115 : lrugen->max_seq = MIN_NR_GENS + 1;
6116 : lrugen->enabled = lru_gen_enabled();
6117 :
6118 : for (i = 0; i <= MIN_NR_GENS + 1; i++)
6119 : lrugen->timestamps[i] = jiffies;
6120 :
6121 : for_each_gen_type_zone(gen, type, zone)
6122 : INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
6123 :
6124 : lruvec->mm_state.seq = MIN_NR_GENS;
6125 : init_waitqueue_head(&lruvec->mm_state.wait);
6126 : }
6127 :
6128 : #ifdef CONFIG_MEMCG
6129 :
6130 : void lru_gen_init_pgdat(struct pglist_data *pgdat)
6131 : {
6132 : int i, j;
6133 :
6134 : spin_lock_init(&pgdat->memcg_lru.lock);
6135 :
6136 : for (i = 0; i < MEMCG_NR_GENS; i++) {
6137 : for (j = 0; j < MEMCG_NR_BINS; j++)
6138 : INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
6139 : }
6140 : }
6141 :
6142 : void lru_gen_init_memcg(struct mem_cgroup *memcg)
6143 : {
6144 : INIT_LIST_HEAD(&memcg->mm_list.fifo);
6145 : spin_lock_init(&memcg->mm_list.lock);
6146 : }
6147 :
6148 : void lru_gen_exit_memcg(struct mem_cgroup *memcg)
6149 : {
6150 : int i;
6151 : int nid;
6152 :
6153 : VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));
6154 :
6155 : for_each_node(nid) {
6156 : struct lruvec *lruvec = get_lruvec(memcg, nid);
6157 :
6158 : VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers);
6159 : VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
6160 : sizeof(lruvec->lrugen.nr_pages)));
6161 :
6162 : lruvec->lrugen.list.next = LIST_POISON1;
6163 :
6164 : for (i = 0; i < NR_BLOOM_FILTERS; i++) {
6165 : bitmap_free(lruvec->mm_state.filters[i]);
6166 : lruvec->mm_state.filters[i] = NULL;
6167 : }
6168 : }
6169 : }
6170 :
6171 : #endif /* CONFIG_MEMCG */
6172 :
6173 : static int __init init_lru_gen(void)
6174 : {
6175 : BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
6176 : BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
6177 :
6178 : if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
6179 : pr_err("lru_gen: failed to create sysfs group\n");
6180 :
6181 : debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
6182 : debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
6183 :
6184 : return 0;
6185 : };
6186 : late_initcall(init_lru_gen);
6187 :
6188 : #else /* !CONFIG_LRU_GEN */
6189 :
6190 : static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
6191 : {
6192 : }
6193 :
6194 : static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
6195 : {
6196 : }
6197 :
6198 : static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
6199 : {
6200 : }
6201 :
6202 : #endif /* CONFIG_LRU_GEN */
6203 :
6204 0 : static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
6205 : {
6206 : unsigned long nr[NR_LRU_LISTS];
6207 : unsigned long targets[NR_LRU_LISTS];
6208 : unsigned long nr_to_scan;
6209 : enum lru_list lru;
6210 0 : unsigned long nr_reclaimed = 0;
6211 0 : unsigned long nr_to_reclaim = sc->nr_to_reclaim;
6212 : bool proportional_reclaim;
6213 : struct blk_plug plug;
6214 :
6215 : if (lru_gen_enabled() && !global_reclaim(sc)) {
6216 : lru_gen_shrink_lruvec(lruvec, sc);
6217 : return;
6218 : }
6219 :
6220 0 : get_scan_count(lruvec, sc, nr);
6221 :
6222 : /* Record the original scan target for proportional adjustments later */
6223 0 : memcpy(targets, nr, sizeof(nr));
6224 :
6225 : /*
6226 : * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
6227 : * event that can occur when there is little memory pressure e.g.
6228 : * multiple streaming readers/writers. Hence, we do not abort scanning
6229 : * when the requested number of pages are reclaimed when scanning at
6230 : * DEF_PRIORITY on the assumption that the fact we are direct
6231 : * reclaiming implies that kswapd is not keeping up and it is best to
6232 : * do a batch of work at once. For memcg reclaim one check is made to
6233 : * abort proportional reclaim if either the file or anon lru has already
6234 : * dropped to zero at the first pass.
6235 : */
6236 0 : proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
6237 0 : sc->priority == DEF_PRIORITY);
6238 :
6239 0 : blk_start_plug(&plug);
6240 0 : while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
6241 0 : nr[LRU_INACTIVE_FILE]) {
6242 : unsigned long nr_anon, nr_file, percentage;
6243 : unsigned long nr_scanned;
6244 :
6245 0 : for_each_evictable_lru(lru) {
6246 0 : if (nr[lru]) {
6247 0 : nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
6248 0 : nr[lru] -= nr_to_scan;
6249 :
6250 0 : nr_reclaimed += shrink_list(lru, nr_to_scan,
6251 : lruvec, sc);
6252 : }
6253 : }
6254 :
6255 0 : cond_resched();
6256 :
6257 0 : if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
6258 0 : continue;
6259 :
6260 : /*
6261 : * For kswapd and memcg, reclaim at least the number of pages
6262 : * requested. Ensure that the anon and file LRUs are scanned
6263 : * proportionally what was requested by get_scan_count(). We
6264 : * stop reclaiming one LRU and reduce the amount scanning
6265 : * proportional to the original scan target.
6266 : */
6267 0 : nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
6268 0 : nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
6269 :
6270 : /*
6271 : * It's just vindictive to attack the larger once the smaller
6272 : * has gone to zero. And given the way we stop scanning the
6273 : * smaller below, this makes sure that we only make one nudge
6274 : * towards proportionality once we've got nr_to_reclaim.
6275 : */
6276 0 : if (!nr_file || !nr_anon)
6277 : break;
6278 :
6279 0 : if (nr_file > nr_anon) {
6280 0 : unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
6281 0 : targets[LRU_ACTIVE_ANON] + 1;
6282 0 : lru = LRU_BASE;
6283 0 : percentage = nr_anon * 100 / scan_target;
6284 : } else {
6285 0 : unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
6286 0 : targets[LRU_ACTIVE_FILE] + 1;
6287 0 : lru = LRU_FILE;
6288 0 : percentage = nr_file * 100 / scan_target;
6289 : }
6290 :
6291 : /* Stop scanning the smaller of the LRU */
6292 0 : nr[lru] = 0;
6293 0 : nr[lru + LRU_ACTIVE] = 0;
6294 :
6295 : /*
6296 : * Recalculate the other LRU scan count based on its original
6297 : * scan target and the percentage scanning already complete
6298 : */
6299 0 : lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
6300 0 : nr_scanned = targets[lru] - nr[lru];
6301 0 : nr[lru] = targets[lru] * (100 - percentage) / 100;
6302 0 : nr[lru] -= min(nr[lru], nr_scanned);
6303 :
6304 0 : lru += LRU_ACTIVE;
6305 0 : nr_scanned = targets[lru] - nr[lru];
6306 0 : nr[lru] = targets[lru] * (100 - percentage) / 100;
6307 0 : nr[lru] -= min(nr[lru], nr_scanned);
6308 : }
6309 0 : blk_finish_plug(&plug);
6310 0 : sc->nr_reclaimed += nr_reclaimed;
6311 :
6312 : /*
6313 : * Even if we did not try to evict anon pages at all, we want to
6314 : * rebalance the anon lru active/inactive ratio.
6315 : */
6316 0 : if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
6317 0 : inactive_is_low(lruvec, LRU_INACTIVE_ANON))
6318 0 : shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
6319 : sc, LRU_ACTIVE_ANON);
6320 : }
6321 :
6322 : /* Use reclaim/compaction for costly allocs or under memory pressure */
6323 : static bool in_reclaim_compaction(struct scan_control *sc)
6324 : {
6325 0 : if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
6326 0 : (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
6327 0 : sc->priority < DEF_PRIORITY - 2))
6328 : return true;
6329 :
6330 : return false;
6331 : }
6332 :
6333 : /*
6334 : * Reclaim/compaction is used for high-order allocation requests. It reclaims
6335 : * order-0 pages before compacting the zone. should_continue_reclaim() returns
6336 : * true if more pages should be reclaimed such that when the page allocator
6337 : * calls try_to_compact_pages() that it will have enough free pages to succeed.
6338 : * It will give up earlier than that if there is difficulty reclaiming pages.
6339 : */
6340 0 : static inline bool should_continue_reclaim(struct pglist_data *pgdat,
6341 : unsigned long nr_reclaimed,
6342 : struct scan_control *sc)
6343 : {
6344 : unsigned long pages_for_compaction;
6345 : unsigned long inactive_lru_pages;
6346 : int z;
6347 :
6348 : /* If not in reclaim/compaction mode, stop */
6349 0 : if (!in_reclaim_compaction(sc))
6350 : return false;
6351 :
6352 : /*
6353 : * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
6354 : * number of pages that were scanned. This will return to the caller
6355 : * with the risk reclaim/compaction and the resulting allocation attempt
6356 : * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
6357 : * allocations through requiring that the full LRU list has been scanned
6358 : * first, by assuming that zero delta of sc->nr_scanned means full LRU
6359 : * scan, but that approximation was wrong, and there were corner cases
6360 : * where always a non-zero amount of pages were scanned.
6361 : */
6362 0 : if (!nr_reclaimed)
6363 : return false;
6364 :
6365 : /* If compaction would go ahead or the allocation would succeed, stop */
6366 0 : for (z = 0; z <= sc->reclaim_idx; z++) {
6367 0 : struct zone *zone = &pgdat->node_zones[z];
6368 0 : if (!managed_zone(zone))
6369 0 : continue;
6370 :
6371 0 : switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
6372 : case COMPACT_SUCCESS:
6373 : case COMPACT_CONTINUE:
6374 : return false;
6375 : default:
6376 : /* check next zone */
6377 : ;
6378 : }
6379 : }
6380 :
6381 : /*
6382 : * If we have not reclaimed enough pages for compaction and the
6383 : * inactive lists are large enough, continue reclaiming
6384 : */
6385 0 : pages_for_compaction = compact_gap(sc->order);
6386 0 : inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
6387 0 : if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
6388 0 : inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
6389 :
6390 0 : return inactive_lru_pages > pages_for_compaction;
6391 : }
6392 :
6393 0 : static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
6394 : {
6395 0 : struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
6396 : struct mem_cgroup *memcg;
6397 :
6398 0 : memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
6399 : do {
6400 0 : struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
6401 : unsigned long reclaimed;
6402 : unsigned long scanned;
6403 :
6404 : /*
6405 : * This loop can become CPU-bound when target memcgs
6406 : * aren't eligible for reclaim - either because they
6407 : * don't have any reclaimable pages, or because their
6408 : * memory is explicitly protected. Avoid soft lockups.
6409 : */
6410 0 : cond_resched();
6411 :
6412 0 : mem_cgroup_calculate_protection(target_memcg, memcg);
6413 :
6414 0 : if (mem_cgroup_below_min(target_memcg, memcg)) {
6415 : /*
6416 : * Hard protection.
6417 : * If there is no reclaimable memory, OOM.
6418 : */
6419 : continue;
6420 0 : } else if (mem_cgroup_below_low(target_memcg, memcg)) {
6421 : /*
6422 : * Soft protection.
6423 : * Respect the protection only as long as
6424 : * there is an unprotected supply
6425 : * of reclaimable memory from other cgroups.
6426 : */
6427 : if (!sc->memcg_low_reclaim) {
6428 : sc->memcg_low_skipped = 1;
6429 : continue;
6430 : }
6431 : memcg_memory_event(memcg, MEMCG_LOW);
6432 : }
6433 :
6434 0 : reclaimed = sc->nr_reclaimed;
6435 0 : scanned = sc->nr_scanned;
6436 :
6437 0 : shrink_lruvec(lruvec, sc);
6438 :
6439 0 : shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
6440 0 : sc->priority);
6441 :
6442 : /* Record the group's reclaim efficiency */
6443 : if (!sc->proactive)
6444 : vmpressure(sc->gfp_mask, memcg, false,
6445 : sc->nr_scanned - scanned,
6446 : sc->nr_reclaimed - reclaimed);
6447 :
6448 0 : } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
6449 0 : }
6450 :
6451 0 : static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
6452 : {
6453 0 : struct reclaim_state *reclaim_state = current->reclaim_state;
6454 : unsigned long nr_reclaimed, nr_scanned;
6455 : struct lruvec *target_lruvec;
6456 0 : bool reclaimable = false;
6457 :
6458 : if (lru_gen_enabled() && global_reclaim(sc)) {
6459 : lru_gen_shrink_node(pgdat, sc);
6460 : return;
6461 : }
6462 :
6463 0 : target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
6464 :
6465 : again:
6466 0 : memset(&sc->nr, 0, sizeof(sc->nr));
6467 :
6468 0 : nr_reclaimed = sc->nr_reclaimed;
6469 0 : nr_scanned = sc->nr_scanned;
6470 :
6471 0 : prepare_scan_count(pgdat, sc);
6472 :
6473 0 : shrink_node_memcgs(pgdat, sc);
6474 :
6475 0 : if (reclaim_state) {
6476 0 : sc->nr_reclaimed += reclaim_state->reclaimed_slab;
6477 0 : reclaim_state->reclaimed_slab = 0;
6478 : }
6479 :
6480 : /* Record the subtree's reclaim efficiency */
6481 : if (!sc->proactive)
6482 : vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
6483 : sc->nr_scanned - nr_scanned,
6484 : sc->nr_reclaimed - nr_reclaimed);
6485 :
6486 0 : if (sc->nr_reclaimed - nr_reclaimed)
6487 0 : reclaimable = true;
6488 :
6489 0 : if (current_is_kswapd()) {
6490 : /*
6491 : * If reclaim is isolating dirty pages under writeback,
6492 : * it implies that the long-lived page allocation rate
6493 : * is exceeding the page laundering rate. Either the
6494 : * global limits are not being effective at throttling
6495 : * processes due to the page distribution throughout
6496 : * zones or there is heavy usage of a slow backing
6497 : * device. The only option is to throttle from reclaim
6498 : * context which is not ideal as there is no guarantee
6499 : * the dirtying process is throttled in the same way
6500 : * balance_dirty_pages() manages.
6501 : *
6502 : * Once a node is flagged PGDAT_WRITEBACK, kswapd will
6503 : * count the number of pages under pages flagged for
6504 : * immediate reclaim and stall if any are encountered
6505 : * in the nr_immediate check below.
6506 : */
6507 0 : if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
6508 0 : set_bit(PGDAT_WRITEBACK, &pgdat->flags);
6509 :
6510 : /* Allow kswapd to start writing pages during reclaim.*/
6511 0 : if (sc->nr.unqueued_dirty == sc->nr.file_taken)
6512 0 : set_bit(PGDAT_DIRTY, &pgdat->flags);
6513 :
6514 : /*
6515 : * If kswapd scans pages marked for immediate
6516 : * reclaim and under writeback (nr_immediate), it
6517 : * implies that pages are cycling through the LRU
6518 : * faster than they are written so forcibly stall
6519 : * until some pages complete writeback.
6520 : */
6521 0 : if (sc->nr.immediate)
6522 0 : reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
6523 : }
6524 :
6525 : /*
6526 : * Tag a node/memcg as congested if all the dirty pages were marked
6527 : * for writeback and immediate reclaim (counted in nr.congested).
6528 : *
6529 : * Legacy memcg will stall in page writeback so avoid forcibly
6530 : * stalling in reclaim_throttle().
6531 : */
6532 0 : if ((current_is_kswapd() ||
6533 0 : (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
6534 0 : sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
6535 0 : set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
6536 :
6537 : /*
6538 : * Stall direct reclaim for IO completions if the lruvec is
6539 : * node is congested. Allow kswapd to continue until it
6540 : * starts encountering unqueued dirty pages or cycling through
6541 : * the LRU too quickly.
6542 : */
6543 0 : if (!current_is_kswapd() && current_may_throttle() &&
6544 0 : !sc->hibernation_mode &&
6545 0 : test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
6546 0 : reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
6547 :
6548 0 : if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
6549 : sc))
6550 : goto again;
6551 :
6552 : /*
6553 : * Kswapd gives up on balancing particular nodes after too
6554 : * many failures to reclaim anything from them and goes to
6555 : * sleep. On reclaim progress, reset the failure counter. A
6556 : * successful direct reclaim run will revive a dormant kswapd.
6557 : */
6558 0 : if (reclaimable)
6559 0 : pgdat->kswapd_failures = 0;
6560 : }
6561 :
6562 : /*
6563 : * Returns true if compaction should go ahead for a costly-order request, or
6564 : * the allocation would already succeed without compaction. Return false if we
6565 : * should reclaim first.
6566 : */
6567 0 : static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
6568 : {
6569 : unsigned long watermark;
6570 : enum compact_result suitable;
6571 :
6572 0 : suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
6573 0 : if (suitable == COMPACT_SUCCESS)
6574 : /* Allocation should succeed already. Don't reclaim. */
6575 : return true;
6576 0 : if (suitable == COMPACT_SKIPPED)
6577 : /* Compaction cannot yet proceed. Do reclaim. */
6578 : return false;
6579 :
6580 : /*
6581 : * Compaction is already possible, but it takes time to run and there
6582 : * are potentially other callers using the pages just freed. So proceed
6583 : * with reclaim to make a buffer of free pages available to give
6584 : * compaction a reasonable chance of completing and allocating the page.
6585 : * Note that we won't actually reclaim the whole buffer in one attempt
6586 : * as the target watermark in should_continue_reclaim() is lower. But if
6587 : * we are already above the high+gap watermark, don't reclaim at all.
6588 : */
6589 0 : watermark = high_wmark_pages(zone) + compact_gap(sc->order);
6590 :
6591 0 : return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
6592 : }
6593 :
6594 0 : static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
6595 : {
6596 : /*
6597 : * If reclaim is making progress greater than 12% efficiency then
6598 : * wake all the NOPROGRESS throttled tasks.
6599 : */
6600 0 : if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
6601 : wait_queue_head_t *wqh;
6602 :
6603 0 : wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
6604 0 : if (waitqueue_active(wqh))
6605 0 : wake_up(wqh);
6606 :
6607 : return;
6608 : }
6609 :
6610 : /*
6611 : * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
6612 : * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
6613 : * under writeback and marked for immediate reclaim at the tail of the
6614 : * LRU.
6615 : */
6616 0 : if (current_is_kswapd() || cgroup_reclaim(sc))
6617 : return;
6618 :
6619 : /* Throttle if making no progress at high prioities. */
6620 0 : if (sc->priority == 1 && !sc->nr_reclaimed)
6621 0 : reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
6622 : }
6623 :
6624 : /*
6625 : * This is the direct reclaim path, for page-allocating processes. We only
6626 : * try to reclaim pages from zones which will satisfy the caller's allocation
6627 : * request.
6628 : *
6629 : * If a zone is deemed to be full of pinned pages then just give it a light
6630 : * scan then give up on it.
6631 : */
6632 0 : static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
6633 : {
6634 : struct zoneref *z;
6635 : struct zone *zone;
6636 : unsigned long nr_soft_reclaimed;
6637 : unsigned long nr_soft_scanned;
6638 : gfp_t orig_mask;
6639 0 : pg_data_t *last_pgdat = NULL;
6640 0 : pg_data_t *first_pgdat = NULL;
6641 :
6642 : /*
6643 : * If the number of buffer_heads in the machine exceeds the maximum
6644 : * allowed level, force direct reclaim to scan the highmem zone as
6645 : * highmem pages could be pinning lowmem pages storing buffer_heads
6646 : */
6647 0 : orig_mask = sc->gfp_mask;
6648 0 : if (buffer_heads_over_limit) {
6649 0 : sc->gfp_mask |= __GFP_HIGHMEM;
6650 0 : sc->reclaim_idx = gfp_zone(sc->gfp_mask);
6651 : }
6652 :
6653 0 : for_each_zone_zonelist_nodemask(zone, z, zonelist,
6654 : sc->reclaim_idx, sc->nodemask) {
6655 : /*
6656 : * Take care memory controller reclaiming has small influence
6657 : * to global LRU.
6658 : */
6659 0 : if (!cgroup_reclaim(sc)) {
6660 0 : if (!cpuset_zone_allowed(zone,
6661 : GFP_KERNEL | __GFP_HARDWALL))
6662 : continue;
6663 :
6664 : /*
6665 : * If we already have plenty of memory free for
6666 : * compaction in this zone, don't free any more.
6667 : * Even though compaction is invoked for any
6668 : * non-zero order, only frequent costly order
6669 : * reclamation is disruptive enough to become a
6670 : * noticeable problem, like transparent huge
6671 : * page allocations.
6672 : */
6673 0 : if (IS_ENABLED(CONFIG_COMPACTION) &&
6674 0 : sc->order > PAGE_ALLOC_COSTLY_ORDER &&
6675 0 : compaction_ready(zone, sc)) {
6676 0 : sc->compaction_ready = true;
6677 0 : continue;
6678 : }
6679 :
6680 : /*
6681 : * Shrink each node in the zonelist once. If the
6682 : * zonelist is ordered by zone (not the default) then a
6683 : * node may be shrunk multiple times but in that case
6684 : * the user prefers lower zones being preserved.
6685 : */
6686 0 : if (zone->zone_pgdat == last_pgdat)
6687 0 : continue;
6688 :
6689 : /*
6690 : * This steals pages from memory cgroups over softlimit
6691 : * and returns the number of reclaimed pages and
6692 : * scanned pages. This works for global memory pressure
6693 : * and balancing, not for a memcg's limit.
6694 : */
6695 0 : nr_soft_scanned = 0;
6696 0 : nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
6697 0 : sc->order, sc->gfp_mask,
6698 : &nr_soft_scanned);
6699 : sc->nr_reclaimed += nr_soft_reclaimed;
6700 : sc->nr_scanned += nr_soft_scanned;
6701 : /* need some check for avoid more shrink_zone() */
6702 : }
6703 :
6704 0 : if (!first_pgdat)
6705 0 : first_pgdat = zone->zone_pgdat;
6706 :
6707 : /* See comment about same check for global reclaim above */
6708 : if (zone->zone_pgdat == last_pgdat)
6709 : continue;
6710 0 : last_pgdat = zone->zone_pgdat;
6711 0 : shrink_node(zone->zone_pgdat, sc);
6712 : }
6713 :
6714 0 : if (first_pgdat)
6715 0 : consider_reclaim_throttle(first_pgdat, sc);
6716 :
6717 : /*
6718 : * Restore to original mask to avoid the impact on the caller if we
6719 : * promoted it to __GFP_HIGHMEM.
6720 : */
6721 0 : sc->gfp_mask = orig_mask;
6722 0 : }
6723 :
6724 : static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
6725 : {
6726 : struct lruvec *target_lruvec;
6727 : unsigned long refaults;
6728 :
6729 : if (lru_gen_enabled())
6730 : return;
6731 :
6732 0 : target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
6733 0 : refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
6734 0 : target_lruvec->refaults[WORKINGSET_ANON] = refaults;
6735 0 : refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
6736 0 : target_lruvec->refaults[WORKINGSET_FILE] = refaults;
6737 : }
6738 :
6739 : /*
6740 : * This is the main entry point to direct page reclaim.
6741 : *
6742 : * If a full scan of the inactive list fails to free enough memory then we
6743 : * are "out of memory" and something needs to be killed.
6744 : *
6745 : * If the caller is !__GFP_FS then the probability of a failure is reasonably
6746 : * high - the zone may be full of dirty or under-writeback pages, which this
6747 : * caller can't do much about. We kick the writeback threads and take explicit
6748 : * naps in the hope that some of these pages can be written. But if the
6749 : * allocating task holds filesystem locks which prevent writeout this might not
6750 : * work, and the allocation attempt will fail.
6751 : *
6752 : * returns: 0, if no pages reclaimed
6753 : * else, the number of pages reclaimed
6754 : */
6755 0 : static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
6756 : struct scan_control *sc)
6757 : {
6758 0 : int initial_priority = sc->priority;
6759 : pg_data_t *last_pgdat;
6760 : struct zoneref *z;
6761 : struct zone *zone;
6762 : retry:
6763 : delayacct_freepages_start();
6764 :
6765 0 : if (!cgroup_reclaim(sc))
6766 0 : __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
6767 :
6768 : do {
6769 : if (!sc->proactive)
6770 : vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
6771 : sc->priority);
6772 0 : sc->nr_scanned = 0;
6773 0 : shrink_zones(zonelist, sc);
6774 :
6775 0 : if (sc->nr_reclaimed >= sc->nr_to_reclaim)
6776 : break;
6777 :
6778 0 : if (sc->compaction_ready)
6779 : break;
6780 :
6781 : /*
6782 : * If we're getting trouble reclaiming, start doing
6783 : * writepage even in laptop mode.
6784 : */
6785 0 : if (sc->priority < DEF_PRIORITY - 2)
6786 0 : sc->may_writepage = 1;
6787 0 : } while (--sc->priority >= 0);
6788 :
6789 0 : last_pgdat = NULL;
6790 0 : for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
6791 : sc->nodemask) {
6792 0 : if (zone->zone_pgdat == last_pgdat)
6793 0 : continue;
6794 0 : last_pgdat = zone->zone_pgdat;
6795 :
6796 0 : snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
6797 :
6798 0 : if (cgroup_reclaim(sc)) {
6799 : struct lruvec *lruvec;
6800 :
6801 : lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
6802 : zone->zone_pgdat);
6803 : clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
6804 : }
6805 : }
6806 :
6807 : delayacct_freepages_end();
6808 :
6809 0 : if (sc->nr_reclaimed)
6810 : return sc->nr_reclaimed;
6811 :
6812 : /* Aborted reclaim to try compaction? don't OOM, then */
6813 0 : if (sc->compaction_ready)
6814 : return 1;
6815 :
6816 : /*
6817 : * We make inactive:active ratio decisions based on the node's
6818 : * composition of memory, but a restrictive reclaim_idx or a
6819 : * memory.low cgroup setting can exempt large amounts of
6820 : * memory from reclaim. Neither of which are very common, so
6821 : * instead of doing costly eligibility calculations of the
6822 : * entire cgroup subtree up front, we assume the estimates are
6823 : * good, and retry with forcible deactivation if that fails.
6824 : */
6825 0 : if (sc->skipped_deactivate) {
6826 0 : sc->priority = initial_priority;
6827 0 : sc->force_deactivate = 1;
6828 0 : sc->skipped_deactivate = 0;
6829 0 : goto retry;
6830 : }
6831 :
6832 : /* Untapped cgroup reserves? Don't OOM, retry. */
6833 0 : if (sc->memcg_low_skipped) {
6834 0 : sc->priority = initial_priority;
6835 0 : sc->force_deactivate = 0;
6836 0 : sc->memcg_low_reclaim = 1;
6837 0 : sc->memcg_low_skipped = 0;
6838 0 : goto retry;
6839 : }
6840 :
6841 : return 0;
6842 : }
6843 :
6844 0 : static bool allow_direct_reclaim(pg_data_t *pgdat)
6845 : {
6846 : struct zone *zone;
6847 0 : unsigned long pfmemalloc_reserve = 0;
6848 0 : unsigned long free_pages = 0;
6849 : int i;
6850 : bool wmark_ok;
6851 :
6852 0 : if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
6853 : return true;
6854 :
6855 0 : for (i = 0; i <= ZONE_NORMAL; i++) {
6856 0 : zone = &pgdat->node_zones[i];
6857 0 : if (!managed_zone(zone))
6858 0 : continue;
6859 :
6860 0 : if (!zone_reclaimable_pages(zone))
6861 0 : continue;
6862 :
6863 0 : pfmemalloc_reserve += min_wmark_pages(zone);
6864 0 : free_pages += zone_page_state(zone, NR_FREE_PAGES);
6865 : }
6866 :
6867 : /* If there are no reserves (unexpected config) then do not throttle */
6868 0 : if (!pfmemalloc_reserve)
6869 : return true;
6870 :
6871 0 : wmark_ok = free_pages > pfmemalloc_reserve / 2;
6872 :
6873 : /* kswapd must be awake if processes are being throttled */
6874 0 : if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
6875 0 : if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
6876 0 : WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
6877 :
6878 0 : wake_up_interruptible(&pgdat->kswapd_wait);
6879 : }
6880 :
6881 : return wmark_ok;
6882 : }
6883 :
6884 : /*
6885 : * Throttle direct reclaimers if backing storage is backed by the network
6886 : * and the PFMEMALLOC reserve for the preferred node is getting dangerously
6887 : * depleted. kswapd will continue to make progress and wake the processes
6888 : * when the low watermark is reached.
6889 : *
6890 : * Returns true if a fatal signal was delivered during throttling. If this
6891 : * happens, the page allocator should not consider triggering the OOM killer.
6892 : */
6893 0 : static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
6894 : nodemask_t *nodemask)
6895 : {
6896 : struct zoneref *z;
6897 : struct zone *zone;
6898 0 : pg_data_t *pgdat = NULL;
6899 :
6900 : /*
6901 : * Kernel threads should not be throttled as they may be indirectly
6902 : * responsible for cleaning pages necessary for reclaim to make forward
6903 : * progress. kjournald for example may enter direct reclaim while
6904 : * committing a transaction where throttling it could forcing other
6905 : * processes to block on log_wait_commit().
6906 : */
6907 0 : if (current->flags & PF_KTHREAD)
6908 : goto out;
6909 :
6910 : /*
6911 : * If a fatal signal is pending, this process should not throttle.
6912 : * It should return quickly so it can exit and free its memory
6913 : */
6914 0 : if (fatal_signal_pending(current))
6915 : goto out;
6916 :
6917 : /*
6918 : * Check if the pfmemalloc reserves are ok by finding the first node
6919 : * with a usable ZONE_NORMAL or lower zone. The expectation is that
6920 : * GFP_KERNEL will be required for allocating network buffers when
6921 : * swapping over the network so ZONE_HIGHMEM is unusable.
6922 : *
6923 : * Throttling is based on the first usable node and throttled processes
6924 : * wait on a queue until kswapd makes progress and wakes them. There
6925 : * is an affinity then between processes waking up and where reclaim
6926 : * progress has been made assuming the process wakes on the same node.
6927 : * More importantly, processes running on remote nodes will not compete
6928 : * for remote pfmemalloc reserves and processes on different nodes
6929 : * should make reasonable progress.
6930 : */
6931 0 : for_each_zone_zonelist_nodemask(zone, z, zonelist,
6932 : gfp_zone(gfp_mask), nodemask) {
6933 0 : if (zone_idx(zone) > ZONE_NORMAL)
6934 0 : continue;
6935 :
6936 : /* Throttle based on the first usable node */
6937 0 : pgdat = zone->zone_pgdat;
6938 0 : if (allow_direct_reclaim(pgdat))
6939 : goto out;
6940 : break;
6941 : }
6942 :
6943 : /* If no zone was usable by the allocation flags then do not throttle */
6944 0 : if (!pgdat)
6945 : goto out;
6946 :
6947 : /* Account for the throttling */
6948 0 : count_vm_event(PGSCAN_DIRECT_THROTTLE);
6949 :
6950 : /*
6951 : * If the caller cannot enter the filesystem, it's possible that it
6952 : * is due to the caller holding an FS lock or performing a journal
6953 : * transaction in the case of a filesystem like ext[3|4]. In this case,
6954 : * it is not safe to block on pfmemalloc_wait as kswapd could be
6955 : * blocked waiting on the same lock. Instead, throttle for up to a
6956 : * second before continuing.
6957 : */
6958 0 : if (!(gfp_mask & __GFP_FS))
6959 0 : wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
6960 : allow_direct_reclaim(pgdat), HZ);
6961 : else
6962 : /* Throttle until kswapd wakes the process */
6963 0 : wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
6964 : allow_direct_reclaim(pgdat));
6965 :
6966 0 : if (fatal_signal_pending(current))
6967 : return true;
6968 :
6969 : out:
6970 : return false;
6971 : }
6972 :
6973 0 : unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
6974 : gfp_t gfp_mask, nodemask_t *nodemask)
6975 : {
6976 : unsigned long nr_reclaimed;
6977 0 : struct scan_control sc = {
6978 : .nr_to_reclaim = SWAP_CLUSTER_MAX,
6979 0 : .gfp_mask = current_gfp_context(gfp_mask),
6980 0 : .reclaim_idx = gfp_zone(gfp_mask),
6981 : .order = order,
6982 : .nodemask = nodemask,
6983 : .priority = DEF_PRIORITY,
6984 0 : .may_writepage = !laptop_mode,
6985 : .may_unmap = 1,
6986 : .may_swap = 1,
6987 : };
6988 :
6989 : /*
6990 : * scan_control uses s8 fields for order, priority, and reclaim_idx.
6991 : * Confirm they are large enough for max values.
6992 : */
6993 : BUILD_BUG_ON(MAX_ORDER > S8_MAX);
6994 : BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
6995 : BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
6996 :
6997 : /*
6998 : * Do not enter reclaim if fatal signal was delivered while throttled.
6999 : * 1 is returned so that the page allocator does not OOM kill at this
7000 : * point.
7001 : */
7002 0 : if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
7003 : return 1;
7004 :
7005 0 : set_task_reclaim_state(current, &sc.reclaim_state);
7006 0 : trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
7007 :
7008 0 : nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
7009 :
7010 0 : trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
7011 0 : set_task_reclaim_state(current, NULL);
7012 :
7013 0 : return nr_reclaimed;
7014 : }
7015 :
7016 : #ifdef CONFIG_MEMCG
7017 :
7018 : /* Only used by soft limit reclaim. Do not reuse for anything else. */
7019 : unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
7020 : gfp_t gfp_mask, bool noswap,
7021 : pg_data_t *pgdat,
7022 : unsigned long *nr_scanned)
7023 : {
7024 : struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
7025 : struct scan_control sc = {
7026 : .nr_to_reclaim = SWAP_CLUSTER_MAX,
7027 : .target_mem_cgroup = memcg,
7028 : .may_writepage = !laptop_mode,
7029 : .may_unmap = 1,
7030 : .reclaim_idx = MAX_NR_ZONES - 1,
7031 : .may_swap = !noswap,
7032 : };
7033 :
7034 : WARN_ON_ONCE(!current->reclaim_state);
7035 :
7036 : sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
7037 : (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
7038 :
7039 : trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
7040 : sc.gfp_mask);
7041 :
7042 : /*
7043 : * NOTE: Although we can get the priority field, using it
7044 : * here is not a good idea, since it limits the pages we can scan.
7045 : * if we don't reclaim here, the shrink_node from balance_pgdat
7046 : * will pick up pages from other mem cgroup's as well. We hack
7047 : * the priority and make it zero.
7048 : */
7049 : shrink_lruvec(lruvec, &sc);
7050 :
7051 : trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
7052 :
7053 : *nr_scanned = sc.nr_scanned;
7054 :
7055 : return sc.nr_reclaimed;
7056 : }
7057 :
7058 : unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
7059 : unsigned long nr_pages,
7060 : gfp_t gfp_mask,
7061 : unsigned int reclaim_options)
7062 : {
7063 : unsigned long nr_reclaimed;
7064 : unsigned int noreclaim_flag;
7065 : struct scan_control sc = {
7066 : .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
7067 : .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
7068 : (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
7069 : .reclaim_idx = MAX_NR_ZONES - 1,
7070 : .target_mem_cgroup = memcg,
7071 : .priority = DEF_PRIORITY,
7072 : .may_writepage = !laptop_mode,
7073 : .may_unmap = 1,
7074 : .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
7075 : .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
7076 : };
7077 : /*
7078 : * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
7079 : * equal pressure on all the nodes. This is based on the assumption that
7080 : * the reclaim does not bail out early.
7081 : */
7082 : struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
7083 :
7084 : set_task_reclaim_state(current, &sc.reclaim_state);
7085 : trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
7086 : noreclaim_flag = memalloc_noreclaim_save();
7087 :
7088 : nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
7089 :
7090 : memalloc_noreclaim_restore(noreclaim_flag);
7091 : trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
7092 : set_task_reclaim_state(current, NULL);
7093 :
7094 : return nr_reclaimed;
7095 : }
7096 : #endif
7097 :
7098 0 : static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
7099 : {
7100 : struct mem_cgroup *memcg;
7101 : struct lruvec *lruvec;
7102 :
7103 : if (lru_gen_enabled()) {
7104 : lru_gen_age_node(pgdat, sc);
7105 : return;
7106 : }
7107 :
7108 0 : if (!can_age_anon_pages(pgdat, sc))
7109 : return;
7110 :
7111 0 : lruvec = mem_cgroup_lruvec(NULL, pgdat);
7112 0 : if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
7113 : return;
7114 :
7115 0 : memcg = mem_cgroup_iter(NULL, NULL, NULL);
7116 : do {
7117 0 : lruvec = mem_cgroup_lruvec(memcg, pgdat);
7118 0 : shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
7119 : sc, LRU_ACTIVE_ANON);
7120 0 : memcg = mem_cgroup_iter(NULL, memcg, NULL);
7121 : } while (memcg);
7122 : }
7123 :
7124 : static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
7125 : {
7126 : int i;
7127 : struct zone *zone;
7128 :
7129 : /*
7130 : * Check for watermark boosts top-down as the higher zones
7131 : * are more likely to be boosted. Both watermarks and boosts
7132 : * should not be checked at the same time as reclaim would
7133 : * start prematurely when there is no boosting and a lower
7134 : * zone is balanced.
7135 : */
7136 0 : for (i = highest_zoneidx; i >= 0; i--) {
7137 0 : zone = pgdat->node_zones + i;
7138 0 : if (!managed_zone(zone))
7139 0 : continue;
7140 :
7141 0 : if (zone->watermark_boost)
7142 : return true;
7143 : }
7144 :
7145 : return false;
7146 : }
7147 :
7148 : /*
7149 : * Returns true if there is an eligible zone balanced for the request order
7150 : * and highest_zoneidx
7151 : */
7152 2 : static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
7153 : {
7154 : int i;
7155 2 : unsigned long mark = -1;
7156 : struct zone *zone;
7157 :
7158 : /*
7159 : * Check watermarks bottom-up as lower zones are more likely to
7160 : * meet watermarks.
7161 : */
7162 2 : for (i = 0; i <= highest_zoneidx; i++) {
7163 2 : zone = pgdat->node_zones + i;
7164 :
7165 2 : if (!managed_zone(zone))
7166 0 : continue;
7167 :
7168 : if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
7169 : mark = wmark_pages(zone, WMARK_PROMO);
7170 : else
7171 2 : mark = high_wmark_pages(zone);
7172 2 : if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
7173 : return true;
7174 : }
7175 :
7176 : /*
7177 : * If a node has no managed zone within highest_zoneidx, it does not
7178 : * need balancing by definition. This can happen if a zone-restricted
7179 : * allocation tries to wake a remote kswapd.
7180 : */
7181 0 : if (mark == -1)
7182 : return true;
7183 :
7184 0 : return false;
7185 : }
7186 :
7187 : /* Clear pgdat state for congested, dirty or under writeback. */
7188 : static void clear_pgdat_congested(pg_data_t *pgdat)
7189 : {
7190 2 : struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
7191 :
7192 4 : clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
7193 4 : clear_bit(PGDAT_DIRTY, &pgdat->flags);
7194 4 : clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
7195 : }
7196 :
7197 : /*
7198 : * Prepare kswapd for sleeping. This verifies that there are no processes
7199 : * waiting in throttle_direct_reclaim() and that watermarks have been met.
7200 : *
7201 : * Returns true if kswapd is ready to sleep
7202 : */
7203 2 : static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
7204 : int highest_zoneidx)
7205 : {
7206 : /*
7207 : * The throttled processes are normally woken up in balance_pgdat() as
7208 : * soon as allow_direct_reclaim() is true. But there is a potential
7209 : * race between when kswapd checks the watermarks and a process gets
7210 : * throttled. There is also a potential race if processes get
7211 : * throttled, kswapd wakes, a large process exits thereby balancing the
7212 : * zones, which causes kswapd to exit balance_pgdat() before reaching
7213 : * the wake up checks. If kswapd is going to sleep, no process should
7214 : * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
7215 : * the wake up is premature, processes will wake kswapd and get
7216 : * throttled again. The difference from wake ups in balance_pgdat() is
7217 : * that here we are under prepare_to_wait().
7218 : */
7219 4 : if (waitqueue_active(&pgdat->pfmemalloc_wait))
7220 0 : wake_up_all(&pgdat->pfmemalloc_wait);
7221 :
7222 : /* Hopeless node, leave it to direct reclaim */
7223 2 : if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
7224 : return true;
7225 :
7226 2 : if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
7227 2 : clear_pgdat_congested(pgdat);
7228 2 : return true;
7229 : }
7230 :
7231 : return false;
7232 : }
7233 :
7234 : /*
7235 : * kswapd shrinks a node of pages that are at or below the highest usable
7236 : * zone that is currently unbalanced.
7237 : *
7238 : * Returns true if kswapd scanned at least the requested number of pages to
7239 : * reclaim or if the lack of progress was due to pages under writeback.
7240 : * This is used to determine if the scanning priority needs to be raised.
7241 : */
7242 0 : static bool kswapd_shrink_node(pg_data_t *pgdat,
7243 : struct scan_control *sc)
7244 : {
7245 : struct zone *zone;
7246 : int z;
7247 :
7248 : /* Reclaim a number of pages proportional to the number of zones */
7249 0 : sc->nr_to_reclaim = 0;
7250 0 : for (z = 0; z <= sc->reclaim_idx; z++) {
7251 0 : zone = pgdat->node_zones + z;
7252 0 : if (!managed_zone(zone))
7253 0 : continue;
7254 :
7255 0 : sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
7256 : }
7257 :
7258 : /*
7259 : * Historically care was taken to put equal pressure on all zones but
7260 : * now pressure is applied based on node LRU order.
7261 : */
7262 0 : shrink_node(pgdat, sc);
7263 :
7264 : /*
7265 : * Fragmentation may mean that the system cannot be rebalanced for
7266 : * high-order allocations. If twice the allocation size has been
7267 : * reclaimed then recheck watermarks only at order-0 to prevent
7268 : * excessive reclaim. Assume that a process requested a high-order
7269 : * can direct reclaim/compact.
7270 : */
7271 0 : if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
7272 0 : sc->order = 0;
7273 :
7274 0 : return sc->nr_scanned >= sc->nr_to_reclaim;
7275 : }
7276 :
7277 : /* Page allocator PCP high watermark is lowered if reclaim is active. */
7278 : static inline void
7279 : update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
7280 : {
7281 : int i;
7282 : struct zone *zone;
7283 :
7284 0 : for (i = 0; i <= highest_zoneidx; i++) {
7285 0 : zone = pgdat->node_zones + i;
7286 :
7287 0 : if (!managed_zone(zone))
7288 0 : continue;
7289 :
7290 : if (active)
7291 0 : set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
7292 : else
7293 0 : clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
7294 : }
7295 : }
7296 :
7297 : static inline void
7298 : set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
7299 : {
7300 0 : update_reclaim_active(pgdat, highest_zoneidx, true);
7301 : }
7302 :
7303 : static inline void
7304 : clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
7305 : {
7306 0 : update_reclaim_active(pgdat, highest_zoneidx, false);
7307 : }
7308 :
7309 : /*
7310 : * For kswapd, balance_pgdat() will reclaim pages across a node from zones
7311 : * that are eligible for use by the caller until at least one zone is
7312 : * balanced.
7313 : *
7314 : * Returns the order kswapd finished reclaiming at.
7315 : *
7316 : * kswapd scans the zones in the highmem->normal->dma direction. It skips
7317 : * zones which have free_pages > high_wmark_pages(zone), but once a zone is
7318 : * found to have free_pages <= high_wmark_pages(zone), any page in that zone
7319 : * or lower is eligible for reclaim until at least one usable zone is
7320 : * balanced.
7321 : */
7322 0 : static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
7323 : {
7324 : int i;
7325 : unsigned long nr_soft_reclaimed;
7326 : unsigned long nr_soft_scanned;
7327 : unsigned long pflags;
7328 : unsigned long nr_boost_reclaim;
7329 0 : unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
7330 : bool boosted;
7331 : struct zone *zone;
7332 0 : struct scan_control sc = {
7333 : .gfp_mask = GFP_KERNEL,
7334 : .order = order,
7335 : .may_unmap = 1,
7336 : };
7337 :
7338 0 : set_task_reclaim_state(current, &sc.reclaim_state);
7339 0 : psi_memstall_enter(&pflags);
7340 0 : __fs_reclaim_acquire(_THIS_IP_);
7341 :
7342 0 : count_vm_event(PAGEOUTRUN);
7343 :
7344 : /*
7345 : * Account for the reclaim boost. Note that the zone boost is left in
7346 : * place so that parallel allocations that are near the watermark will
7347 : * stall or direct reclaim until kswapd is finished.
7348 : */
7349 0 : nr_boost_reclaim = 0;
7350 0 : for (i = 0; i <= highest_zoneidx; i++) {
7351 0 : zone = pgdat->node_zones + i;
7352 0 : if (!managed_zone(zone))
7353 0 : continue;
7354 :
7355 0 : nr_boost_reclaim += zone->watermark_boost;
7356 0 : zone_boosts[i] = zone->watermark_boost;
7357 : }
7358 : boosted = nr_boost_reclaim;
7359 :
7360 : restart:
7361 0 : set_reclaim_active(pgdat, highest_zoneidx);
7362 0 : sc.priority = DEF_PRIORITY;
7363 : do {
7364 0 : unsigned long nr_reclaimed = sc.nr_reclaimed;
7365 0 : bool raise_priority = true;
7366 : bool balanced;
7367 : bool ret;
7368 :
7369 0 : sc.reclaim_idx = highest_zoneidx;
7370 :
7371 : /*
7372 : * If the number of buffer_heads exceeds the maximum allowed
7373 : * then consider reclaiming from all zones. This has a dual
7374 : * purpose -- on 64-bit systems it is expected that
7375 : * buffer_heads are stripped during active rotation. On 32-bit
7376 : * systems, highmem pages can pin lowmem memory and shrinking
7377 : * buffers can relieve lowmem pressure. Reclaim may still not
7378 : * go ahead if all eligible zones for the original allocation
7379 : * request are balanced to avoid excessive reclaim from kswapd.
7380 : */
7381 0 : if (buffer_heads_over_limit) {
7382 0 : for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
7383 0 : zone = pgdat->node_zones + i;
7384 0 : if (!managed_zone(zone))
7385 0 : continue;
7386 :
7387 0 : sc.reclaim_idx = i;
7388 0 : break;
7389 : }
7390 : }
7391 :
7392 : /*
7393 : * If the pgdat is imbalanced then ignore boosting and preserve
7394 : * the watermarks for a later time and restart. Note that the
7395 : * zone watermarks will be still reset at the end of balancing
7396 : * on the grounds that the normal reclaim should be enough to
7397 : * re-evaluate if boosting is required when kswapd next wakes.
7398 : */
7399 0 : balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
7400 0 : if (!balanced && nr_boost_reclaim) {
7401 : nr_boost_reclaim = 0;
7402 : goto restart;
7403 : }
7404 :
7405 : /*
7406 : * If boosting is not active then only reclaim if there are no
7407 : * eligible zones. Note that sc.reclaim_idx is not used as
7408 : * buffer_heads_over_limit may have adjusted it.
7409 : */
7410 0 : if (!nr_boost_reclaim && balanced)
7411 : goto out;
7412 :
7413 : /* Limit the priority of boosting to avoid reclaim writeback */
7414 0 : if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
7415 0 : raise_priority = false;
7416 :
7417 : /*
7418 : * Do not writeback or swap pages for boosted reclaim. The
7419 : * intent is to relieve pressure not issue sub-optimal IO
7420 : * from reclaim context. If no pages are reclaimed, the
7421 : * reclaim will be aborted.
7422 : */
7423 0 : sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
7424 0 : sc.may_swap = !nr_boost_reclaim;
7425 :
7426 : /*
7427 : * Do some background aging, to give pages a chance to be
7428 : * referenced before reclaiming. All pages are rotated
7429 : * regardless of classzone as this is about consistent aging.
7430 : */
7431 0 : kswapd_age_node(pgdat, &sc);
7432 :
7433 : /*
7434 : * If we're getting trouble reclaiming, start doing writepage
7435 : * even in laptop mode.
7436 : */
7437 0 : if (sc.priority < DEF_PRIORITY - 2)
7438 0 : sc.may_writepage = 1;
7439 :
7440 : /* Call soft limit reclaim before calling shrink_node. */
7441 0 : sc.nr_scanned = 0;
7442 0 : nr_soft_scanned = 0;
7443 0 : nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
7444 : sc.gfp_mask, &nr_soft_scanned);
7445 : sc.nr_reclaimed += nr_soft_reclaimed;
7446 :
7447 : /*
7448 : * There should be no need to raise the scanning priority if
7449 : * enough pages are already being scanned that that high
7450 : * watermark would be met at 100% efficiency.
7451 : */
7452 0 : if (kswapd_shrink_node(pgdat, &sc))
7453 0 : raise_priority = false;
7454 :
7455 : /*
7456 : * If the low watermark is met there is no need for processes
7457 : * to be throttled on pfmemalloc_wait as they should not be
7458 : * able to safely make forward progress. Wake them
7459 : */
7460 0 : if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
7461 0 : allow_direct_reclaim(pgdat))
7462 0 : wake_up_all(&pgdat->pfmemalloc_wait);
7463 :
7464 : /* Check if kswapd should be suspending */
7465 0 : __fs_reclaim_release(_THIS_IP_);
7466 0 : ret = try_to_freeze();
7467 0 : __fs_reclaim_acquire(_THIS_IP_);
7468 0 : if (ret || kthread_should_stop())
7469 : break;
7470 :
7471 : /*
7472 : * Raise priority if scanning rate is too low or there was no
7473 : * progress in reclaiming pages
7474 : */
7475 0 : nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
7476 0 : nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
7477 :
7478 : /*
7479 : * If reclaim made no progress for a boost, stop reclaim as
7480 : * IO cannot be queued and it could be an infinite loop in
7481 : * extreme circumstances.
7482 : */
7483 0 : if (nr_boost_reclaim && !nr_reclaimed)
7484 : break;
7485 :
7486 0 : if (raise_priority || !nr_reclaimed)
7487 0 : sc.priority--;
7488 0 : } while (sc.priority >= 1);
7489 :
7490 0 : if (!sc.nr_reclaimed)
7491 0 : pgdat->kswapd_failures++;
7492 :
7493 : out:
7494 0 : clear_reclaim_active(pgdat, highest_zoneidx);
7495 :
7496 : /* If reclaim was boosted, account for the reclaim done in this pass */
7497 0 : if (boosted) {
7498 : unsigned long flags;
7499 :
7500 0 : for (i = 0; i <= highest_zoneidx; i++) {
7501 0 : if (!zone_boosts[i])
7502 0 : continue;
7503 :
7504 : /* Increments are under the zone lock */
7505 0 : zone = pgdat->node_zones + i;
7506 0 : spin_lock_irqsave(&zone->lock, flags);
7507 0 : zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
7508 0 : spin_unlock_irqrestore(&zone->lock, flags);
7509 : }
7510 :
7511 : /*
7512 : * As there is now likely space, wakeup kcompact to defragment
7513 : * pageblocks.
7514 : */
7515 0 : wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
7516 : }
7517 :
7518 0 : snapshot_refaults(NULL, pgdat);
7519 0 : __fs_reclaim_release(_THIS_IP_);
7520 0 : psi_memstall_leave(&pflags);
7521 0 : set_task_reclaim_state(current, NULL);
7522 :
7523 : /*
7524 : * Return the order kswapd stopped reclaiming at as
7525 : * prepare_kswapd_sleep() takes it into account. If another caller
7526 : * entered the allocator slow path while kswapd was awake, order will
7527 : * remain at the higher level.
7528 : */
7529 0 : return sc.order;
7530 : }
7531 :
7532 : /*
7533 : * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
7534 : * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
7535 : * not a valid index then either kswapd runs for first time or kswapd couldn't
7536 : * sleep after previous reclaim attempt (node is still unbalanced). In that
7537 : * case return the zone index of the previous kswapd reclaim cycle.
7538 : */
7539 : static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
7540 : enum zone_type prev_highest_zoneidx)
7541 : {
7542 1 : enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
7543 :
7544 1 : return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
7545 : }
7546 :
7547 1 : static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
7548 : unsigned int highest_zoneidx)
7549 : {
7550 1 : long remaining = 0;
7551 2 : DEFINE_WAIT(wait);
7552 :
7553 2 : if (freezing(current) || kthread_should_stop())
7554 0 : return;
7555 :
7556 1 : prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
7557 :
7558 : /*
7559 : * Try to sleep for a short interval. Note that kcompactd will only be
7560 : * woken if it is possible to sleep for a short interval. This is
7561 : * deliberate on the assumption that if reclaim cannot keep an
7562 : * eligible zone balanced that it's also unlikely that compaction will
7563 : * succeed.
7564 : */
7565 1 : if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
7566 : /*
7567 : * Compaction records what page blocks it recently failed to
7568 : * isolate pages from and skips them in the future scanning.
7569 : * When kswapd is going to sleep, it is reasonable to assume
7570 : * that pages and compaction may succeed so reset the cache.
7571 : */
7572 1 : reset_isolation_suitable(pgdat);
7573 :
7574 : /*
7575 : * We have freed the memory, now we should compact it to make
7576 : * allocation of the requested order possible.
7577 : */
7578 1 : wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
7579 :
7580 1 : remaining = schedule_timeout(HZ/10);
7581 :
7582 : /*
7583 : * If woken prematurely then reset kswapd_highest_zoneidx and
7584 : * order. The values will either be from a wakeup request or
7585 : * the previous request that slept prematurely.
7586 : */
7587 1 : if (remaining) {
7588 0 : WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
7589 : kswapd_highest_zoneidx(pgdat,
7590 : highest_zoneidx));
7591 :
7592 0 : if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
7593 0 : WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
7594 : }
7595 :
7596 1 : finish_wait(&pgdat->kswapd_wait, &wait);
7597 1 : prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
7598 : }
7599 :
7600 : /*
7601 : * After a short sleep, check if it was a premature sleep. If not, then
7602 : * go fully to sleep until explicitly woken up.
7603 : */
7604 2 : if (!remaining &&
7605 1 : prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
7606 1 : trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
7607 :
7608 : /*
7609 : * vmstat counters are not perfectly accurate and the estimated
7610 : * value for counters such as NR_FREE_PAGES can deviate from the
7611 : * true value by nr_online_cpus * threshold. To avoid the zone
7612 : * watermarks being breached while under pressure, we reduce the
7613 : * per-cpu vmstat threshold while kswapd is awake and restore
7614 : * them before going back to sleep.
7615 : */
7616 : set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
7617 :
7618 1 : if (!kthread_should_stop())
7619 1 : schedule();
7620 :
7621 : set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
7622 : } else {
7623 0 : if (remaining)
7624 0 : count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
7625 : else
7626 0 : count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
7627 : }
7628 0 : finish_wait(&pgdat->kswapd_wait, &wait);
7629 : }
7630 :
7631 : /*
7632 : * The background pageout daemon, started as a kernel thread
7633 : * from the init process.
7634 : *
7635 : * This basically trickles out pages so that we have _some_
7636 : * free memory available even if there is no other activity
7637 : * that frees anything up. This is needed for things like routing
7638 : * etc, where we otherwise might have all activity going on in
7639 : * asynchronous contexts that cannot page things out.
7640 : *
7641 : * If there are applications that are active memory-allocators
7642 : * (most normal use), this basically shouldn't matter.
7643 : */
7644 1 : static int kswapd(void *p)
7645 : {
7646 : unsigned int alloc_order, reclaim_order;
7647 1 : unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
7648 1 : pg_data_t *pgdat = (pg_data_t *)p;
7649 1 : struct task_struct *tsk = current;
7650 1 : const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
7651 :
7652 1 : if (!cpumask_empty(cpumask))
7653 1 : set_cpus_allowed_ptr(tsk, cpumask);
7654 :
7655 : /*
7656 : * Tell the memory management that we're a "memory allocator",
7657 : * and that if we need more memory we should get access to it
7658 : * regardless (see "__alloc_pages()"). "kswapd" should
7659 : * never get caught in the normal page freeing logic.
7660 : *
7661 : * (Kswapd normally doesn't need memory anyway, but sometimes
7662 : * you need a small amount of memory in order to be able to
7663 : * page out something else, and this flag essentially protects
7664 : * us from recursively trying to free more memory as we're
7665 : * trying to free the first piece of memory in the first place).
7666 : */
7667 1 : tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
7668 1 : set_freezable();
7669 :
7670 1 : WRITE_ONCE(pgdat->kswapd_order, 0);
7671 1 : WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
7672 1 : atomic_set(&pgdat->nr_writeback_throttled, 0);
7673 : for ( ; ; ) {
7674 : bool ret;
7675 :
7676 1 : alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
7677 : highest_zoneidx = kswapd_highest_zoneidx(pgdat,
7678 : highest_zoneidx);
7679 :
7680 : kswapd_try_sleep:
7681 1 : kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
7682 : highest_zoneidx);
7683 :
7684 : /* Read the new order and highest_zoneidx */
7685 0 : alloc_order = READ_ONCE(pgdat->kswapd_order);
7686 0 : highest_zoneidx = kswapd_highest_zoneidx(pgdat,
7687 : highest_zoneidx);
7688 0 : WRITE_ONCE(pgdat->kswapd_order, 0);
7689 0 : WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
7690 :
7691 0 : ret = try_to_freeze();
7692 0 : if (kthread_should_stop())
7693 : break;
7694 :
7695 : /*
7696 : * We can speed up thawing tasks if we don't call balance_pgdat
7697 : * after returning from the refrigerator
7698 : */
7699 0 : if (ret)
7700 0 : continue;
7701 :
7702 : /*
7703 : * Reclaim begins at the requested order but if a high-order
7704 : * reclaim fails then kswapd falls back to reclaiming for
7705 : * order-0. If that happens, kswapd will consider sleeping
7706 : * for the order it finished reclaiming at (reclaim_order)
7707 : * but kcompactd is woken to compact for the original
7708 : * request (alloc_order).
7709 : */
7710 0 : trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
7711 : alloc_order);
7712 0 : reclaim_order = balance_pgdat(pgdat, alloc_order,
7713 : highest_zoneidx);
7714 0 : if (reclaim_order < alloc_order)
7715 : goto kswapd_try_sleep;
7716 : }
7717 :
7718 0 : tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
7719 :
7720 0 : return 0;
7721 : }
7722 :
7723 : /*
7724 : * A zone is low on free memory or too fragmented for high-order memory. If
7725 : * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
7726 : * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
7727 : * has failed or is not needed, still wake up kcompactd if only compaction is
7728 : * needed.
7729 : */
7730 0 : void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
7731 : enum zone_type highest_zoneidx)
7732 : {
7733 : pg_data_t *pgdat;
7734 : enum zone_type curr_idx;
7735 :
7736 0 : if (!managed_zone(zone))
7737 : return;
7738 :
7739 0 : if (!cpuset_zone_allowed(zone, gfp_flags))
7740 : return;
7741 :
7742 0 : pgdat = zone->zone_pgdat;
7743 0 : curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
7744 :
7745 0 : if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
7746 0 : WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
7747 :
7748 0 : if (READ_ONCE(pgdat->kswapd_order) < order)
7749 0 : WRITE_ONCE(pgdat->kswapd_order, order);
7750 :
7751 0 : if (!waitqueue_active(&pgdat->kswapd_wait))
7752 : return;
7753 :
7754 : /* Hopeless node, leave it to direct reclaim if possible */
7755 0 : if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
7756 0 : (pgdat_balanced(pgdat, order, highest_zoneidx) &&
7757 0 : !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
7758 : /*
7759 : * There may be plenty of free memory available, but it's too
7760 : * fragmented for high-order allocations. Wake up kcompactd
7761 : * and rely on compaction_suitable() to determine if it's
7762 : * needed. If it fails, it will defer subsequent attempts to
7763 : * ratelimit its work.
7764 : */
7765 0 : if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
7766 0 : wakeup_kcompactd(pgdat, order, highest_zoneidx);
7767 : return;
7768 : }
7769 :
7770 0 : trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
7771 : gfp_flags);
7772 0 : wake_up_interruptible(&pgdat->kswapd_wait);
7773 : }
7774 :
7775 : #ifdef CONFIG_HIBERNATION
7776 : /*
7777 : * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
7778 : * freed pages.
7779 : *
7780 : * Rather than trying to age LRUs the aim is to preserve the overall
7781 : * LRU order by reclaiming preferentially
7782 : * inactive > active > active referenced > active mapped
7783 : */
7784 : unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
7785 : {
7786 : struct scan_control sc = {
7787 : .nr_to_reclaim = nr_to_reclaim,
7788 : .gfp_mask = GFP_HIGHUSER_MOVABLE,
7789 : .reclaim_idx = MAX_NR_ZONES - 1,
7790 : .priority = DEF_PRIORITY,
7791 : .may_writepage = 1,
7792 : .may_unmap = 1,
7793 : .may_swap = 1,
7794 : .hibernation_mode = 1,
7795 : };
7796 : struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
7797 : unsigned long nr_reclaimed;
7798 : unsigned int noreclaim_flag;
7799 :
7800 : fs_reclaim_acquire(sc.gfp_mask);
7801 : noreclaim_flag = memalloc_noreclaim_save();
7802 : set_task_reclaim_state(current, &sc.reclaim_state);
7803 :
7804 : nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
7805 :
7806 : set_task_reclaim_state(current, NULL);
7807 : memalloc_noreclaim_restore(noreclaim_flag);
7808 : fs_reclaim_release(sc.gfp_mask);
7809 :
7810 : return nr_reclaimed;
7811 : }
7812 : #endif /* CONFIG_HIBERNATION */
7813 :
7814 : /*
7815 : * This kswapd start function will be called by init and node-hot-add.
7816 : */
7817 1 : void kswapd_run(int nid)
7818 : {
7819 1 : pg_data_t *pgdat = NODE_DATA(nid);
7820 :
7821 : pgdat_kswapd_lock(pgdat);
7822 1 : if (!pgdat->kswapd) {
7823 2 : pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
7824 1 : if (IS_ERR(pgdat->kswapd)) {
7825 : /* failure at boot is fatal */
7826 0 : BUG_ON(system_state < SYSTEM_RUNNING);
7827 0 : pr_err("Failed to start kswapd on node %d\n", nid);
7828 0 : pgdat->kswapd = NULL;
7829 : }
7830 : }
7831 : pgdat_kswapd_unlock(pgdat);
7832 1 : }
7833 :
7834 : /*
7835 : * Called by memory hotplug when all memory in a node is offlined. Caller must
7836 : * be holding mem_hotplug_begin/done().
7837 : */
7838 0 : void kswapd_stop(int nid)
7839 : {
7840 0 : pg_data_t *pgdat = NODE_DATA(nid);
7841 : struct task_struct *kswapd;
7842 :
7843 : pgdat_kswapd_lock(pgdat);
7844 0 : kswapd = pgdat->kswapd;
7845 0 : if (kswapd) {
7846 0 : kthread_stop(kswapd);
7847 0 : pgdat->kswapd = NULL;
7848 : }
7849 : pgdat_kswapd_unlock(pgdat);
7850 0 : }
7851 :
7852 1 : static int __init kswapd_init(void)
7853 : {
7854 : int nid;
7855 :
7856 1 : swap_setup();
7857 2 : for_each_node_state(nid, N_MEMORY)
7858 1 : kswapd_run(nid);
7859 1 : return 0;
7860 : }
7861 :
7862 : module_init(kswapd_init)
7863 :
7864 : #ifdef CONFIG_NUMA
7865 : /*
7866 : * Node reclaim mode
7867 : *
7868 : * If non-zero call node_reclaim when the number of free pages falls below
7869 : * the watermarks.
7870 : */
7871 : int node_reclaim_mode __read_mostly;
7872 :
7873 : /*
7874 : * Priority for NODE_RECLAIM. This determines the fraction of pages
7875 : * of a node considered for each zone_reclaim. 4 scans 1/16th of
7876 : * a zone.
7877 : */
7878 : #define NODE_RECLAIM_PRIORITY 4
7879 :
7880 : /*
7881 : * Percentage of pages in a zone that must be unmapped for node_reclaim to
7882 : * occur.
7883 : */
7884 : int sysctl_min_unmapped_ratio = 1;
7885 :
7886 : /*
7887 : * If the number of slab pages in a zone grows beyond this percentage then
7888 : * slab reclaim needs to occur.
7889 : */
7890 : int sysctl_min_slab_ratio = 5;
7891 :
7892 : static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
7893 : {
7894 : unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
7895 : unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
7896 : node_page_state(pgdat, NR_ACTIVE_FILE);
7897 :
7898 : /*
7899 : * It's possible for there to be more file mapped pages than
7900 : * accounted for by the pages on the file LRU lists because
7901 : * tmpfs pages accounted for as ANON can also be FILE_MAPPED
7902 : */
7903 : return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
7904 : }
7905 :
7906 : /* Work out how many page cache pages we can reclaim in this reclaim_mode */
7907 : static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
7908 : {
7909 : unsigned long nr_pagecache_reclaimable;
7910 : unsigned long delta = 0;
7911 :
7912 : /*
7913 : * If RECLAIM_UNMAP is set, then all file pages are considered
7914 : * potentially reclaimable. Otherwise, we have to worry about
7915 : * pages like swapcache and node_unmapped_file_pages() provides
7916 : * a better estimate
7917 : */
7918 : if (node_reclaim_mode & RECLAIM_UNMAP)
7919 : nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
7920 : else
7921 : nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
7922 :
7923 : /* If we can't clean pages, remove dirty pages from consideration */
7924 : if (!(node_reclaim_mode & RECLAIM_WRITE))
7925 : delta += node_page_state(pgdat, NR_FILE_DIRTY);
7926 :
7927 : /* Watch for any possible underflows due to delta */
7928 : if (unlikely(delta > nr_pagecache_reclaimable))
7929 : delta = nr_pagecache_reclaimable;
7930 :
7931 : return nr_pagecache_reclaimable - delta;
7932 : }
7933 :
7934 : /*
7935 : * Try to free up some pages from this node through reclaim.
7936 : */
7937 : static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
7938 : {
7939 : /* Minimum pages needed in order to stay on node */
7940 : const unsigned long nr_pages = 1 << order;
7941 : struct task_struct *p = current;
7942 : unsigned int noreclaim_flag;
7943 : struct scan_control sc = {
7944 : .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
7945 : .gfp_mask = current_gfp_context(gfp_mask),
7946 : .order = order,
7947 : .priority = NODE_RECLAIM_PRIORITY,
7948 : .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
7949 : .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
7950 : .may_swap = 1,
7951 : .reclaim_idx = gfp_zone(gfp_mask),
7952 : };
7953 : unsigned long pflags;
7954 :
7955 : trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
7956 : sc.gfp_mask);
7957 :
7958 : cond_resched();
7959 : psi_memstall_enter(&pflags);
7960 : fs_reclaim_acquire(sc.gfp_mask);
7961 : /*
7962 : * We need to be able to allocate from the reserves for RECLAIM_UNMAP
7963 : */
7964 : noreclaim_flag = memalloc_noreclaim_save();
7965 : set_task_reclaim_state(p, &sc.reclaim_state);
7966 :
7967 : if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
7968 : node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
7969 : /*
7970 : * Free memory by calling shrink node with increasing
7971 : * priorities until we have enough memory freed.
7972 : */
7973 : do {
7974 : shrink_node(pgdat, &sc);
7975 : } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
7976 : }
7977 :
7978 : set_task_reclaim_state(p, NULL);
7979 : memalloc_noreclaim_restore(noreclaim_flag);
7980 : fs_reclaim_release(sc.gfp_mask);
7981 : psi_memstall_leave(&pflags);
7982 :
7983 : trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
7984 :
7985 : return sc.nr_reclaimed >= nr_pages;
7986 : }
7987 :
7988 : int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
7989 : {
7990 : int ret;
7991 :
7992 : /*
7993 : * Node reclaim reclaims unmapped file backed pages and
7994 : * slab pages if we are over the defined limits.
7995 : *
7996 : * A small portion of unmapped file backed pages is needed for
7997 : * file I/O otherwise pages read by file I/O will be immediately
7998 : * thrown out if the node is overallocated. So we do not reclaim
7999 : * if less than a specified percentage of the node is used by
8000 : * unmapped file backed pages.
8001 : */
8002 : if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
8003 : node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
8004 : pgdat->min_slab_pages)
8005 : return NODE_RECLAIM_FULL;
8006 :
8007 : /*
8008 : * Do not scan if the allocation should not be delayed.
8009 : */
8010 : if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
8011 : return NODE_RECLAIM_NOSCAN;
8012 :
8013 : /*
8014 : * Only run node reclaim on the local node or on nodes that do not
8015 : * have associated processors. This will favor the local processor
8016 : * over remote processors and spread off node memory allocations
8017 : * as wide as possible.
8018 : */
8019 : if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
8020 : return NODE_RECLAIM_NOSCAN;
8021 :
8022 : if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
8023 : return NODE_RECLAIM_NOSCAN;
8024 :
8025 : ret = __node_reclaim(pgdat, gfp_mask, order);
8026 : clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
8027 :
8028 : if (!ret)
8029 : count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
8030 :
8031 : return ret;
8032 : }
8033 : #endif
8034 :
8035 0 : void check_move_unevictable_pages(struct pagevec *pvec)
8036 : {
8037 : struct folio_batch fbatch;
8038 : unsigned i;
8039 :
8040 0 : folio_batch_init(&fbatch);
8041 0 : for (i = 0; i < pvec->nr; i++) {
8042 0 : struct page *page = pvec->pages[i];
8043 :
8044 0 : if (PageTransTail(page))
8045 : continue;
8046 0 : folio_batch_add(&fbatch, page_folio(page));
8047 : }
8048 0 : check_move_unevictable_folios(&fbatch);
8049 0 : }
8050 : EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
8051 :
8052 : /**
8053 : * check_move_unevictable_folios - Move evictable folios to appropriate zone
8054 : * lru list
8055 : * @fbatch: Batch of lru folios to check.
8056 : *
8057 : * Checks folios for evictability, if an evictable folio is in the unevictable
8058 : * lru list, moves it to the appropriate evictable lru list. This function
8059 : * should be only used for lru folios.
8060 : */
8061 0 : void check_move_unevictable_folios(struct folio_batch *fbatch)
8062 : {
8063 0 : struct lruvec *lruvec = NULL;
8064 0 : int pgscanned = 0;
8065 0 : int pgrescued = 0;
8066 : int i;
8067 :
8068 0 : for (i = 0; i < fbatch->nr; i++) {
8069 0 : struct folio *folio = fbatch->folios[i];
8070 0 : int nr_pages = folio_nr_pages(folio);
8071 :
8072 0 : pgscanned += nr_pages;
8073 :
8074 : /* block memcg migration while the folio moves between lrus */
8075 0 : if (!folio_test_clear_lru(folio))
8076 0 : continue;
8077 :
8078 0 : lruvec = folio_lruvec_relock_irq(folio, lruvec);
8079 0 : if (folio_evictable(folio) && folio_test_unevictable(folio)) {
8080 0 : lruvec_del_folio(lruvec, folio);
8081 0 : folio_clear_unevictable(folio);
8082 0 : lruvec_add_folio(lruvec, folio);
8083 0 : pgrescued += nr_pages;
8084 : }
8085 : folio_set_lru(folio);
8086 : }
8087 :
8088 0 : if (lruvec) {
8089 0 : __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
8090 0 : __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
8091 0 : unlock_page_lruvec_irq(lruvec);
8092 0 : } else if (pgscanned) {
8093 0 : count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
8094 : }
8095 0 : }
8096 : EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
|