LCOV - code coverage report
Current view:  top level - mm - vmstat.c (source / functions)
Test:          coverage.info
Date:          2023-04-06 08:38:28

                     Hit      Total    Coverage
Lines:                22        209      10.5 %
Functions:             2         24       8.3 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  *  linux/mm/vmstat.c
       4             :  *
       5             :  *  Manages VM statistics
       6             :  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
       7             :  *
       8             :  *  zoned VM statistics
       9             :  *  Copyright (C) 2006 Silicon Graphics, Inc.,
      10             :  *              Christoph Lameter <christoph@lameter.com>
      11             :  *  Copyright (C) 2008-2014 Christoph Lameter
      12             :  */
      13             : #include <linux/fs.h>
      14             : #include <linux/mm.h>
      15             : #include <linux/err.h>
      16             : #include <linux/module.h>
      17             : #include <linux/slab.h>
      18             : #include <linux/cpu.h>
      19             : #include <linux/cpumask.h>
      20             : #include <linux/vmstat.h>
      21             : #include <linux/proc_fs.h>
      22             : #include <linux/seq_file.h>
      23             : #include <linux/debugfs.h>
      24             : #include <linux/sched.h>
      25             : #include <linux/math64.h>
      26             : #include <linux/writeback.h>
      27             : #include <linux/compaction.h>
      28             : #include <linux/mm_inline.h>
      29             : #include <linux/page_ext.h>
      30             : #include <linux/page_owner.h>
      31             : 
      32             : #include "internal.h"
      33             : 
      34             : #ifdef CONFIG_NUMA
      35             : int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
      36             : 
      37             : /* zero numa counters within a zone */
      38             : static void zero_zone_numa_counters(struct zone *zone)
      39             : {
      40             :         int item, cpu;
      41             : 
      42             :         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
      43             :                 atomic_long_set(&zone->vm_numa_event[item], 0);
      44             :                 for_each_online_cpu(cpu) {
      45             :                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
      46             :                                                 = 0;
      47             :                 }
      48             :         }
      49             : }
      50             : 
      51             : /* zero numa counters of all the populated zones */
      52             : static void zero_zones_numa_counters(void)
      53             : {
      54             :         struct zone *zone;
      55             : 
      56             :         for_each_populated_zone(zone)
      57             :                 zero_zone_numa_counters(zone);
      58             : }
      59             : 
      60             : /* zero global numa counters */
      61             : static void zero_global_numa_counters(void)
      62             : {
      63             :         int item;
      64             : 
      65             :         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
      66             :                 atomic_long_set(&vm_numa_event[item], 0);
      67             : }
      68             : 
      69             : static void invalid_numa_statistics(void)
      70             : {
      71             :         zero_zones_numa_counters();
      72             :         zero_global_numa_counters();
      73             : }
      74             : 
      75             : static DEFINE_MUTEX(vm_numa_stat_lock);
      76             : 
      77             : int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
      78             :                 void *buffer, size_t *length, loff_t *ppos)
      79             : {
      80             :         int ret, oldval;
      81             : 
      82             :         mutex_lock(&vm_numa_stat_lock);
      83             :         if (write)
      84             :                 oldval = sysctl_vm_numa_stat;
      85             :         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
      86             :         if (ret || !write)
      87             :                 goto out;
      88             : 
      89             :         if (oldval == sysctl_vm_numa_stat)
      90             :                 goto out;
      91             :         else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
      92             :                 static_branch_enable(&vm_numa_stat_key);
      93             :                 pr_info("enable numa statistics\n");
      94             :         } else {
      95             :                 static_branch_disable(&vm_numa_stat_key);
      96             :                 invalid_numa_statistics();
      97             :                 pr_info("disable numa statistics, and clear numa counters\n");
      98             :         }
      99             : 
     100             : out:
     101             :         mutex_unlock(&vm_numa_stat_lock);
     102             :         return ret;
     103             : }
     104             : #endif
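
The handler above backs the vm.numa_stat sysctl: writing 1 re-enables NUMA event
counting, and writing 0 disables it and clears the accumulated counters. As a rough
userspace illustration (a sketch only; it assumes the sysctl is exposed at
/proc/sys/vm/numa_stat and that the caller has permission to write it):

    #include <stdio.h>

    /* Sketch: write "0" or "1" to the assumed /proc/sys/vm/numa_stat path.
     * Returns 0 on success, -1 if the file could not be opened. */
    static int set_numa_stat(int enable)
    {
            FILE *f = fopen("/proc/sys/vm/numa_stat", "w");

            if (!f)
                    return -1;
            fprintf(f, "%d\n", enable ? 1 : 0);
            return fclose(f);
    }

    int main(void)
    {
            /* Disabling skips the NUMA hit/miss accounting and zeroes the
             * counters, mirroring invalid_numa_statistics() above. */
            if (set_numa_stat(0))
                    perror("numa_stat");
            return 0;
    }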
     105             : 
     106             : #ifdef CONFIG_VM_EVENT_COUNTERS
     107             : DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
     108             : EXPORT_PER_CPU_SYMBOL(vm_event_states);
     109             : 
     110           0 : static void sum_vm_events(unsigned long *ret)
     111             : {
     112             :         int cpu;
     113             :         int i;
     114             : 
     115           0 :         memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
     116             : 
     117           0 :         for_each_online_cpu(cpu) {
     118             :                 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
     119             : 
     120           0 :                 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
     121           0 :                         ret[i] += this->event[i];
     122             :         }
     123           0 : }
     124             : 
     125             : /*
     126             :  * Accumulate the vm event counters across all CPUs.
     127             :  * The result is unavoidably approximate - it can change
     128             :  * during and after execution of this function.
      129             :  */
     130           0 : void all_vm_events(unsigned long *ret)
     131             : {
     132             :         cpus_read_lock();
     133           0 :         sum_vm_events(ret);
     134             :         cpus_read_unlock();
     135           0 : }
     136             : EXPORT_SYMBOL_GPL(all_vm_events);
     137             : 
     138             : /*
     139             :  * Fold the foreign cpu events into our own.
     140             :  *
      141             :  * This adds to the events on one processor
     142             :  * but keeps the global counts constant.
     143             :  */
     144           0 : void vm_events_fold_cpu(int cpu)
     145             : {
     146           0 :         struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
     147             :         int i;
     148             : 
     149           0 :         for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
     150           0 :                 count_vm_events(i, fold_state->event[i]);
     151           0 :                 fold_state->event[i] = 0;
     152             :         }
     153           0 : }
     154             : 
     155             : #endif /* CONFIG_VM_EVENT_COUNTERS */
     156             : 
     157             : /*
     158             :  * Manage combined zone based / global counters
     159             :  *
     160             :  * vm_stat contains the global counters
     161             :  */
     162             : atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
     163             : atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
     164             : atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
     165             : EXPORT_SYMBOL(vm_zone_stat);
     166             : EXPORT_SYMBOL(vm_node_stat);
     167             : 
     168             : #ifdef CONFIG_NUMA
     169             : static void fold_vm_zone_numa_events(struct zone *zone)
     170             : {
     171             :         unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
     172             :         int cpu;
     173             :         enum numa_stat_item item;
     174             : 
     175             :         for_each_online_cpu(cpu) {
     176             :                 struct per_cpu_zonestat *pzstats;
     177             : 
     178             :                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
     179             :                 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
     180             :                         zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
     181             :         }
     182             : 
     183             :         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
     184             :                 zone_numa_event_add(zone_numa_events[item], zone, item);
     185             : }
     186             : 
     187             : void fold_vm_numa_events(void)
     188             : {
     189             :         struct zone *zone;
     190             : 
     191             :         for_each_populated_zone(zone)
     192             :                 fold_vm_zone_numa_events(zone);
     193             : }
     194             : #endif
     195             : 
     196             : #ifdef CONFIG_SMP
     197             : 
     198             : int calculate_pressure_threshold(struct zone *zone)
     199             : {
     200             :         int threshold;
     201             :         int watermark_distance;
     202             : 
     203             :         /*
     204             :          * As vmstats are not up to date, there is drift between the estimated
     205             :          * and real values. For high thresholds and a high number of CPUs, it
     206             :          * is possible for the min watermark to be breached while the estimated
     207             :          * value looks fine. The pressure threshold is a reduced value such
     208             :          * that even the maximum amount of drift will not accidentally breach
     209             :          * the min watermark
     210             :          */
     211             :         watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
     212             :         threshold = max(1, (int)(watermark_distance / num_online_cpus()));
     213             : 
     214             :         /*
     215             :          * Maximum threshold is 125
     216             :          */
     217             :         threshold = min(125, threshold);
     218             : 
     219             :         return threshold;
     220             : }
     221             : 
     222             : int calculate_normal_threshold(struct zone *zone)
     223             : {
     224             :         int threshold;
     225             :         int mem;        /* memory in 128 MB units */
     226             : 
     227             :         /*
     228             :          * The threshold scales with the number of processors and the amount
     229             :          * of memory per zone. More memory means that we can defer updates for
     230             :          * longer, more processors could lead to more contention.
     231             :          * fls() is used to have a cheap way of logarithmic scaling.
     232             :          *
     233             :          * Some sample thresholds:
     234             :          *
     235             :          * Threshold    Processors      (fls)   Zonesize        fls(mem)+1
     236             :          * ------------------------------------------------------------------
     237             :          * 8            1               1       0.9-1 GB        4
     238             :          * 16           2               2       0.9-1 GB        4
     239             :          * 20           2               2       1-2 GB          5
     240             :          * 24           2               2       2-4 GB          6
     241             :          * 28           2               2       4-8 GB          7
     242             :          * 32           2               2       8-16 GB         8
      243             :          * 4            2               2       <128M           1
     244             :          * 30           4               3       2-4 GB          5
     245             :          * 48           4               3       8-16 GB         8
     246             :          * 32           8               4       1-2 GB          4
     247             :          * 32           8               4       0.9-1GB         4
      248             :          * 10           16              5       <128M           1
     249             :          * 40           16              5       900M            4
     250             :          * 70           64              7       2-4 GB          5
     251             :          * 84           64              7       4-8 GB          6
     252             :          * 108          512             9       4-8 GB          6
     253             :          * 125          1024            10      8-16 GB         8
     254             :          * 125          1024            10      16-32 GB        9
     255             :          */
     256             : 
     257             :         mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
     258             : 
     259             :         threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
     260             : 
     261             :         /*
     262             :          * Maximum threshold is 125
     263             :          */
     264             :         threshold = min(125, threshold);
     265             : 
     266             :         return threshold;
     267             : }
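
To make the scaling concrete, a small standalone sketch (using __builtin_clzl as a
stand-in for the kernel's fls(), with a made-up zone size) reproduces one row of the
table above: 2 online CPUs (fls = 2) and a zone of roughly 1.2 GB (9 units of 128 MB,
so fls(mem) = 4) give 2 * 2 * (1 + 4) = 20, clamped to 125 at the top end just as in
the code:

    #include <stdio.h>

    /* fls() equivalent: index of the highest set bit, with fls(0) == 0. */
    static int fls_ul(unsigned long x)
    {
            return x ? (int)(sizeof(long) * 8) - __builtin_clzl(x) : 0;
    }

    int main(void)
    {
            /* Sample values, not read from a real system. */
            unsigned long managed_pages = 300000;   /* ~1.2 GB with 4 KiB pages */
            int online_cpus = 2;
            int page_shift = 12;

            int mem = managed_pages >> (27 - page_shift);   /* 128 MB units */
            int threshold = 2 * fls_ul(online_cpus) * (1 + fls_ul(mem));

            if (threshold > 125)    /* same clamp as calculate_normal_threshold() */
                    threshold = 125;

            printf("mem=%d units, threshold=%d\n", mem, threshold);
            return 0;
    }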
     268             : 
     269             : /*
     270             :  * Refresh the thresholds for each zone.
     271             :  */
     272             : void refresh_zone_stat_thresholds(void)
     273             : {
     274             :         struct pglist_data *pgdat;
     275             :         struct zone *zone;
     276             :         int cpu;
     277             :         int threshold;
     278             : 
     279             :         /* Zero current pgdat thresholds */
     280             :         for_each_online_pgdat(pgdat) {
     281             :                 for_each_online_cpu(cpu) {
     282             :                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
     283             :                 }
     284             :         }
     285             : 
     286             :         for_each_populated_zone(zone) {
     287             :                 struct pglist_data *pgdat = zone->zone_pgdat;
     288             :                 unsigned long max_drift, tolerate_drift;
     289             : 
     290             :                 threshold = calculate_normal_threshold(zone);
     291             : 
     292             :                 for_each_online_cpu(cpu) {
     293             :                         int pgdat_threshold;
     294             : 
     295             :                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
     296             :                                                         = threshold;
     297             : 
     298             :                         /* Base nodestat threshold on the largest populated zone. */
     299             :                         pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
     300             :                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
     301             :                                 = max(threshold, pgdat_threshold);
     302             :                 }
     303             : 
     304             :                 /*
     305             :                  * Only set percpu_drift_mark if there is a danger that
     306             :                  * NR_FREE_PAGES reports the low watermark is ok when in fact
     307             :                  * the min watermark could be breached by an allocation
     308             :                  */
     309             :                 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
     310             :                 max_drift = num_online_cpus() * threshold;
     311             :                 if (max_drift > tolerate_drift)
     312             :                         zone->percpu_drift_mark = high_wmark_pages(zone) +
     313             :                                         max_drift;
     314             :         }
     315             : }
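
The drift check is plain arithmetic: with a per-CPU threshold of 32 and 16 online
CPUs, up to 32 * 16 = 512 pages worth of NR_FREE_PAGES updates can be parked in
per-CPU diffs, so if the low-to-min watermark gap is smaller than that,
percpu_drift_mark is set and allocations near the watermarks re-read the precise
counter. A throwaway calculation with sample numbers (not taken from a real zone):

    #include <stdio.h>

    int main(void)
    {
            /* Sample watermarks in pages. */
            unsigned long min_wmark = 1000, low_wmark = 1250, high_wmark = 1500;
            int threshold = 32, online_cpus = 16;

            unsigned long tolerate_drift = low_wmark - min_wmark;             /* 250 */
            unsigned long max_drift = (unsigned long)online_cpus * threshold; /* 512 */

            if (max_drift > tolerate_drift)
                    printf("percpu_drift_mark = %lu\n", high_wmark + max_drift);
            else
                    printf("no drift mark needed\n");
            return 0;
    }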
     316             : 
     317             : void set_pgdat_percpu_threshold(pg_data_t *pgdat,
     318             :                                 int (*calculate_pressure)(struct zone *))
     319             : {
     320             :         struct zone *zone;
     321             :         int cpu;
     322             :         int threshold;
     323             :         int i;
     324             : 
     325             :         for (i = 0; i < pgdat->nr_zones; i++) {
     326             :                 zone = &pgdat->node_zones[i];
     327             :                 if (!zone->percpu_drift_mark)
     328             :                         continue;
     329             : 
     330             :                 threshold = (*calculate_pressure)(zone);
     331             :                 for_each_online_cpu(cpu)
     332             :                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
     333             :                                                         = threshold;
     334             :         }
     335             : }
     336             : 
     337             : /*
     338             :  * For use when we know that interrupts are disabled,
     339             :  * or when we know that preemption is disabled and that
     340             :  * particular counter cannot be updated from interrupt context.
     341             :  */
     342             : void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
     343             :                            long delta)
     344             : {
     345             :         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
     346             :         s8 __percpu *p = pcp->vm_stat_diff + item;
     347             :         long x;
     348             :         long t;
     349             : 
     350             :         /*
     351             :          * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
     352             :          * atomicity is provided by IRQs being disabled -- either explicitly
     353             :          * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
     354             :          * CPU migrations and preemption potentially corrupts a counter so
     355             :          * disable preemption.
     356             :          */
     357             :         preempt_disable_nested();
     358             : 
     359             :         x = delta + __this_cpu_read(*p);
     360             : 
     361             :         t = __this_cpu_read(pcp->stat_threshold);
     362             : 
     363             :         if (unlikely(abs(x) > t)) {
     364             :                 zone_page_state_add(x, zone, item);
     365             :                 x = 0;
     366             :         }
     367             :         __this_cpu_write(*p, x);
     368             : 
     369             :         preempt_enable_nested();
     370             : }
     371             : EXPORT_SYMBOL(__mod_zone_page_state);
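
The pattern here is a per-CPU differential that is folded into the shared atomic
counter only once its absolute value exceeds the per-CPU threshold. A single-threaded
toy model (a sketch only; "global" stands in for the zone's atomic_long_t and "diff"
for the per-CPU s8 slot) shows the folding behaviour:

    #include <stdio.h>
    #include <stdlib.h>

    static long global;             /* stands in for zone->vm_stat[item] */
    static long diff;               /* stands in for the per-CPU vm_stat_diff entry */
    static const long threshold = 32;

    /* Same shape as __mod_zone_page_state(): accumulate locally and fold into
     * the global counter only when the local delta grows past the threshold. */
    static void mod_state(long delta)
    {
            long x = diff + delta;

            if (labs(x) > threshold) {
                    global += x;
                    x = 0;
            }
            diff = x;
    }

    int main(void)
    {
            for (int i = 0; i < 100; i++)
                    mod_state(1);
            /* Most updates reached the global counter; at most "threshold"
             * remain parked in the local diff. */
            printf("global=%ld diff=%ld\n", global, diff);
            return 0;
    }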
     372             : 
     373             : void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
     374             :                                 long delta)
     375             : {
     376             :         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
     377             :         s8 __percpu *p = pcp->vm_node_stat_diff + item;
     378             :         long x;
     379             :         long t;
     380             : 
     381             :         if (vmstat_item_in_bytes(item)) {
     382             :                 /*
     383             :                  * Only cgroups use subpage accounting right now; at
     384             :                  * the global level, these items still change in
     385             :                  * multiples of whole pages. Store them as pages
     386             :                  * internally to keep the per-cpu counters compact.
     387             :                  */
     388             :                 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
     389             :                 delta >>= PAGE_SHIFT;
     390             :         }
     391             : 
      392             :         /* See __mod_zone_page_state */
     393             :         preempt_disable_nested();
     394             : 
     395             :         x = delta + __this_cpu_read(*p);
     396             : 
     397             :         t = __this_cpu_read(pcp->stat_threshold);
     398             : 
     399             :         if (unlikely(abs(x) > t)) {
     400             :                 node_page_state_add(x, pgdat, item);
     401             :                 x = 0;
     402             :         }
     403             :         __this_cpu_write(*p, x);
     404             : 
     405             :         preempt_enable_nested();
     406             : }
     407             : EXPORT_SYMBOL(__mod_node_page_state);
     408             : 
     409             : /*
     410             :  * Optimized increment and decrement functions.
     411             :  *
     412             :  * These are only for a single page and therefore can take a struct page *
     413             :  * argument instead of struct zone *. This allows the inclusion of the code
     414             :  * generated for page_zone(page) into the optimized functions.
     415             :  *
     416             :  * No overflow check is necessary and therefore the differential can be
     417             :  * incremented or decremented in place which may allow the compilers to
     418             :  * generate better code.
     419             :  * The increment or decrement is known and therefore one boundary check can
     420             :  * be omitted.
     421             :  *
     422             :  * NOTE: These functions are very performance sensitive. Change only
     423             :  * with care.
     424             :  *
     425             :  * Some processors have inc/dec instructions that are atomic vs an interrupt.
     426             :  * However, the code must first determine the differential location in a zone
     427             :  * based on the processor number and then inc/dec the counter. There is no
     428             :  * guarantee without disabling preemption that the processor will not change
     429             :  * in between and therefore the atomicity vs. interrupt cannot be exploited
     430             :  * in a useful way here.
     431             :  */
     432             : void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
     433             : {
     434             :         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
     435             :         s8 __percpu *p = pcp->vm_stat_diff + item;
     436             :         s8 v, t;
     437             : 
     438             :         /* See __mod_node_page_state */
     439             :         preempt_disable_nested();
     440             : 
     441             :         v = __this_cpu_inc_return(*p);
     442             :         t = __this_cpu_read(pcp->stat_threshold);
     443             :         if (unlikely(v > t)) {
     444             :                 s8 overstep = t >> 1;
     445             : 
     446             :                 zone_page_state_add(v + overstep, zone, item);
     447             :                 __this_cpu_write(*p, -overstep);
     448             :         }
     449             : 
     450             :         preempt_enable_nested();
     451             : }
     452             : 
     453             : void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
     454             : {
     455             :         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
     456             :         s8 __percpu *p = pcp->vm_node_stat_diff + item;
     457             :         s8 v, t;
     458             : 
     459             :         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
     460             : 
     461             :         /* See __mod_node_page_state */
     462             :         preempt_disable_nested();
     463             : 
     464             :         v = __this_cpu_inc_return(*p);
     465             :         t = __this_cpu_read(pcp->stat_threshold);
     466             :         if (unlikely(v > t)) {
     467             :                 s8 overstep = t >> 1;
     468             : 
     469             :                 node_page_state_add(v + overstep, pgdat, item);
     470             :                 __this_cpu_write(*p, -overstep);
     471             :         }
     472             : 
     473             :         preempt_enable_nested();
     474             : }
     475             : 
     476             : void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
     477             : {
     478             :         __inc_zone_state(page_zone(page), item);
     479             : }
     480             : EXPORT_SYMBOL(__inc_zone_page_state);
     481             : 
     482             : void __inc_node_page_state(struct page *page, enum node_stat_item item)
     483             : {
     484             :         __inc_node_state(page_pgdat(page), item);
     485             : }
     486             : EXPORT_SYMBOL(__inc_node_page_state);
     487             : 
     488             : void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
     489             : {
     490             :         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
     491             :         s8 __percpu *p = pcp->vm_stat_diff + item;
     492             :         s8 v, t;
     493             : 
     494             :         /* See __mod_node_page_state */
     495             :         preempt_disable_nested();
     496             : 
     497             :         v = __this_cpu_dec_return(*p);
     498             :         t = __this_cpu_read(pcp->stat_threshold);
     499             :         if (unlikely(v < - t)) {
     500             :                 s8 overstep = t >> 1;
     501             : 
     502             :                 zone_page_state_add(v - overstep, zone, item);
     503             :                 __this_cpu_write(*p, overstep);
     504             :         }
     505             : 
     506             :         preempt_enable_nested();
     507             : }
     508             : 
     509             : void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
     510             : {
     511             :         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
     512             :         s8 __percpu *p = pcp->vm_node_stat_diff + item;
     513             :         s8 v, t;
     514             : 
     515             :         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
     516             : 
     517             :         /* See __mod_node_page_state */
     518             :         preempt_disable_nested();
     519             : 
     520             :         v = __this_cpu_dec_return(*p);
     521             :         t = __this_cpu_read(pcp->stat_threshold);
     522             :         if (unlikely(v < - t)) {
     523             :                 s8 overstep = t >> 1;
     524             : 
     525             :                 node_page_state_add(v - overstep, pgdat, item);
     526             :                 __this_cpu_write(*p, overstep);
     527             :         }
     528             : 
     529             :         preempt_enable_nested();
     530             : }
     531             : 
     532             : void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
     533             : {
     534             :         __dec_zone_state(page_zone(page), item);
     535             : }
     536             : EXPORT_SYMBOL(__dec_zone_page_state);
     537             : 
     538             : void __dec_node_page_state(struct page *page, enum node_stat_item item)
     539             : {
     540             :         __dec_node_state(page_pgdat(page), item);
     541             : }
     542             : EXPORT_SYMBOL(__dec_node_page_state);
     543             : 
     544             : #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
     545             : /*
     546             :  * If we have cmpxchg_local support then we do not need to incur the overhead
     547             :  * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
     548             :  *
     549             :  * mod_state() modifies the zone counter state through atomic per cpu
     550             :  * operations.
     551             :  *
      552             :  * Overstep mode specifies how overstep should be handled:
     553             :  *     0       No overstepping
     554             :  *     1       Overstepping half of threshold
     555             :  *     -1      Overstepping minus half of threshold
      556             :  */
     557             : static inline void mod_zone_state(struct zone *zone,
     558             :        enum zone_stat_item item, long delta, int overstep_mode)
     559             : {
     560             :         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
     561             :         s8 __percpu *p = pcp->vm_stat_diff + item;
     562             :         long o, n, t, z;
     563             : 
     564             :         do {
     565             :                 z = 0;  /* overflow to zone counters */
     566             : 
     567             :                 /*
     568             :                  * The fetching of the stat_threshold is racy. We may apply
      569             :                  * a counter threshold to the wrong cpu if we get
     570             :                  * rescheduled while executing here. However, the next
     571             :                  * counter update will apply the threshold again and
     572             :                  * therefore bring the counter under the threshold again.
     573             :                  *
      574             :                  * Most of the time the thresholds are the same anyway
     575             :                  * for all cpus in a zone.
     576             :                  */
     577             :                 t = this_cpu_read(pcp->stat_threshold);
     578             : 
     579             :                 o = this_cpu_read(*p);
     580             :                 n = delta + o;
     581             : 
     582             :                 if (abs(n) > t) {
      583             :                         int os = overstep_mode * (t >> 1);
     584             : 
     585             :                         /* Overflow must be added to zone counters */
     586             :                         z = n + os;
     587             :                         n = -os;
     588             :                 }
     589             :         } while (this_cpu_cmpxchg(*p, o, n) != o);
     590             : 
     591             :         if (z)
     592             :                 zone_page_state_add(z, zone, item);
     593             : }
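
With a local cmpxchg available, the loop above simply retries until the per-CPU slot
still holds the value that was read, instead of disabling interrupts around the
update. A userspace analogue of the same compare-and-swap retry structure (a sketch
using C11 atomics on one shared slot rather than a true per-CPU variable):

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdatomic.h>

    static _Atomic long slot;       /* stands in for the per-CPU diff */
    static _Atomic long global;     /* stands in for the zone counter */
    static const long threshold = 32;

    static void mod_state(long delta, int overstep_mode)
    {
            long o, n, z;

            do {
                    z = 0;                  /* amount to push to the global counter */
                    o = atomic_load(&slot);
                    n = o + delta;
                    if (labs(n) > threshold) {
                            long os = overstep_mode * (threshold >> 1);

                            z = n + os;     /* overflow goes global */
                            n = -os;
                    }
                    /* retry if another thread changed the slot in the meantime */
            } while (!atomic_compare_exchange_weak(&slot, &o, n));

            if (z)
                    atomic_fetch_add(&global, z);
    }

    int main(void)
    {
            for (int i = 0; i < 100; i++)
                    mod_state(1, 1);
            printf("global=%ld slot=%ld\n", atomic_load(&global), atomic_load(&slot));
            return 0;
    }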
     594             : 
     595             : void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
     596             :                          long delta)
     597             : {
     598             :         mod_zone_state(zone, item, delta, 0);
     599             : }
     600             : EXPORT_SYMBOL(mod_zone_page_state);
     601             : 
     602             : void inc_zone_page_state(struct page *page, enum zone_stat_item item)
     603             : {
     604             :         mod_zone_state(page_zone(page), item, 1, 1);
     605             : }
     606             : EXPORT_SYMBOL(inc_zone_page_state);
     607             : 
     608             : void dec_zone_page_state(struct page *page, enum zone_stat_item item)
     609             : {
     610             :         mod_zone_state(page_zone(page), item, -1, -1);
     611             : }
     612             : EXPORT_SYMBOL(dec_zone_page_state);
     613             : 
     614             : static inline void mod_node_state(struct pglist_data *pgdat,
     615             :        enum node_stat_item item, int delta, int overstep_mode)
     616             : {
     617             :         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
     618             :         s8 __percpu *p = pcp->vm_node_stat_diff + item;
     619             :         long o, n, t, z;
     620             : 
     621             :         if (vmstat_item_in_bytes(item)) {
     622             :                 /*
     623             :                  * Only cgroups use subpage accounting right now; at
     624             :                  * the global level, these items still change in
     625             :                  * multiples of whole pages. Store them as pages
     626             :                  * internally to keep the per-cpu counters compact.
     627             :                  */
     628             :                 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
     629             :                 delta >>= PAGE_SHIFT;
     630             :         }
     631             : 
     632             :         do {
     633             :                 z = 0;  /* overflow to node counters */
     634             : 
     635             :                 /*
     636             :                  * The fetching of the stat_threshold is racy. We may apply
      637             :                  * a counter threshold to the wrong cpu if we get
     638             :                  * rescheduled while executing here. However, the next
     639             :                  * counter update will apply the threshold again and
     640             :                  * therefore bring the counter under the threshold again.
     641             :                  *
      642             :                  * Most of the time the thresholds are the same anyway
     643             :                  * for all cpus in a node.
     644             :                  */
     645             :                 t = this_cpu_read(pcp->stat_threshold);
     646             : 
     647             :                 o = this_cpu_read(*p);
     648             :                 n = delta + o;
     649             : 
     650             :                 if (abs(n) > t) {
      651             :                         int os = overstep_mode * (t >> 1);
     652             : 
     653             :                         /* Overflow must be added to node counters */
     654             :                         z = n + os;
     655             :                         n = -os;
     656             :                 }
     657             :         } while (this_cpu_cmpxchg(*p, o, n) != o);
     658             : 
     659             :         if (z)
     660             :                 node_page_state_add(z, pgdat, item);
     661             : }
     662             : 
     663             : void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
     664             :                                         long delta)
     665             : {
     666             :         mod_node_state(pgdat, item, delta, 0);
     667             : }
     668             : EXPORT_SYMBOL(mod_node_page_state);
     669             : 
     670             : void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
     671             : {
     672             :         mod_node_state(pgdat, item, 1, 1);
     673             : }
     674             : 
     675             : void inc_node_page_state(struct page *page, enum node_stat_item item)
     676             : {
     677             :         mod_node_state(page_pgdat(page), item, 1, 1);
     678             : }
     679             : EXPORT_SYMBOL(inc_node_page_state);
     680             : 
     681             : void dec_node_page_state(struct page *page, enum node_stat_item item)
     682             : {
     683             :         mod_node_state(page_pgdat(page), item, -1, -1);
     684             : }
     685             : EXPORT_SYMBOL(dec_node_page_state);
     686             : #else
     687             : /*
     688             :  * Use interrupt disable to serialize counter updates
     689             :  */
     690             : void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
     691             :                          long delta)
     692             : {
     693             :         unsigned long flags;
     694             : 
     695             :         local_irq_save(flags);
     696             :         __mod_zone_page_state(zone, item, delta);
     697             :         local_irq_restore(flags);
     698             : }
     699             : EXPORT_SYMBOL(mod_zone_page_state);
     700             : 
     701             : void inc_zone_page_state(struct page *page, enum zone_stat_item item)
     702             : {
     703             :         unsigned long flags;
     704             :         struct zone *zone;
     705             : 
     706             :         zone = page_zone(page);
     707             :         local_irq_save(flags);
     708             :         __inc_zone_state(zone, item);
     709             :         local_irq_restore(flags);
     710             : }
     711             : EXPORT_SYMBOL(inc_zone_page_state);
     712             : 
     713             : void dec_zone_page_state(struct page *page, enum zone_stat_item item)
     714             : {
     715             :         unsigned long flags;
     716             : 
     717             :         local_irq_save(flags);
     718             :         __dec_zone_page_state(page, item);
     719             :         local_irq_restore(flags);
     720             : }
     721             : EXPORT_SYMBOL(dec_zone_page_state);
     722             : 
     723             : void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
     724             : {
     725             :         unsigned long flags;
     726             : 
     727             :         local_irq_save(flags);
     728             :         __inc_node_state(pgdat, item);
     729             :         local_irq_restore(flags);
     730             : }
     731             : EXPORT_SYMBOL(inc_node_state);
     732             : 
     733             : void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
     734             :                                         long delta)
     735             : {
     736             :         unsigned long flags;
     737             : 
     738             :         local_irq_save(flags);
     739             :         __mod_node_page_state(pgdat, item, delta);
     740             :         local_irq_restore(flags);
     741             : }
     742             : EXPORT_SYMBOL(mod_node_page_state);
     743             : 
     744             : void inc_node_page_state(struct page *page, enum node_stat_item item)
     745             : {
     746             :         unsigned long flags;
     747             :         struct pglist_data *pgdat;
     748             : 
     749             :         pgdat = page_pgdat(page);
     750             :         local_irq_save(flags);
     751             :         __inc_node_state(pgdat, item);
     752             :         local_irq_restore(flags);
     753             : }
     754             : EXPORT_SYMBOL(inc_node_page_state);
     755             : 
     756             : void dec_node_page_state(struct page *page, enum node_stat_item item)
     757             : {
     758             :         unsigned long flags;
     759             : 
     760             :         local_irq_save(flags);
     761             :         __dec_node_page_state(page, item);
     762             :         local_irq_restore(flags);
     763             : }
     764             : EXPORT_SYMBOL(dec_node_page_state);
     765             : #endif
     766             : 
     767             : /*
     768             :  * Fold a differential into the global counters.
     769             :  * Returns the number of counters updated.
     770             :  */
     771             : static int fold_diff(int *zone_diff, int *node_diff)
     772             : {
     773             :         int i;
     774             :         int changes = 0;
     775             : 
     776             :         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
     777             :                 if (zone_diff[i]) {
     778             :                         atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
     779             :                         changes++;
     780             :         }
     781             : 
     782             :         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
     783             :                 if (node_diff[i]) {
     784             :                         atomic_long_add(node_diff[i], &vm_node_stat[i]);
     785             :                         changes++;
     786             :         }
     787             :         return changes;
     788             : }
     789             : 
     790             : /*
     791             :  * Update the zone counters for the current cpu.
     792             :  *
     793             :  * Note that refresh_cpu_vm_stats strives to only access
     794             :  * node local memory. The per cpu pagesets on remote zones are placed
     795             :  * in the memory local to the processor using that pageset. So the
     796             :  * loop over all zones will access a series of cachelines local to
     797             :  * the processor.
     798             :  *
     799             :  * The call to zone_page_state_add updates the cachelines with the
     800             :  * statistics in the remote zone struct as well as the global cachelines
     801             :  * with the global counters. These could cause remote node cache line
     802             :  * bouncing and will have to be only done when necessary.
     803             :  *
     804             :  * The function returns the number of global counters updated.
     805             :  */
     806             : static int refresh_cpu_vm_stats(bool do_pagesets)
     807             : {
     808             :         struct pglist_data *pgdat;
     809             :         struct zone *zone;
     810             :         int i;
     811             :         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
     812             :         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
     813             :         int changes = 0;
     814             : 
     815             :         for_each_populated_zone(zone) {
     816             :                 struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
     817             : #ifdef CONFIG_NUMA
     818             :                 struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
     819             : #endif
     820             : 
     821             :                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
     822             :                         int v;
     823             : 
     824             :                         v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
     825             :                         if (v) {
     826             : 
     827             :                                 atomic_long_add(v, &zone->vm_stat[i]);
     828             :                                 global_zone_diff[i] += v;
     829             : #ifdef CONFIG_NUMA
     830             :                                 /* 3 seconds idle till flush */
     831             :                                 __this_cpu_write(pcp->expire, 3);
     832             : #endif
     833             :                         }
     834             :                 }
     835             : #ifdef CONFIG_NUMA
     836             : 
     837             :                 if (do_pagesets) {
     838             :                         cond_resched();
     839             :                         /*
     840             :                          * Deal with draining the remote pageset of this
     841             :                          * processor
     842             :                          *
      843             :                          * Check if there are pages remaining in this pageset;
      844             :                          * if not, there is nothing to expire.
     845             :                          */
     846             :                         if (!__this_cpu_read(pcp->expire) ||
     847             :                                !__this_cpu_read(pcp->count))
     848             :                                 continue;
     849             : 
     850             :                         /*
     851             :                          * We never drain zones local to this processor.
     852             :                          */
     853             :                         if (zone_to_nid(zone) == numa_node_id()) {
     854             :                                 __this_cpu_write(pcp->expire, 0);
     855             :                                 continue;
     856             :                         }
     857             : 
     858             :                         if (__this_cpu_dec_return(pcp->expire))
     859             :                                 continue;
     860             : 
     861             :                         if (__this_cpu_read(pcp->count)) {
     862             :                                 drain_zone_pages(zone, this_cpu_ptr(pcp));
     863             :                                 changes++;
     864             :                         }
     865             :                 }
     866             : #endif
     867             :         }
     868             : 
     869             :         for_each_online_pgdat(pgdat) {
     870             :                 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
     871             : 
     872             :                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
     873             :                         int v;
     874             : 
     875             :                         v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
     876             :                         if (v) {
     877             :                                 atomic_long_add(v, &pgdat->vm_stat[i]);
     878             :                                 global_node_diff[i] += v;
     879             :                         }
     880             :                 }
     881             :         }
     882             : 
     883             :         changes += fold_diff(global_zone_diff, global_node_diff);
     884             :         return changes;
     885             : }
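
The pageset expiry above is a small countdown: whenever this CPU touches stats for a
remote zone, expire is re-armed to 3; each refresh pass decrements it, and only when
it reaches zero while pages are still cached does the pageset get drained. A toy model
of that countdown (a sketch only; single remote pageset, made-up numbers):

    #include <stdio.h>
    #include <stdbool.h>

    struct pcp { int expire; int count; };

    /* Toy version of the expire handling in refresh_cpu_vm_stats(): returns
     * true when the cached pages should be drained back to the remote zone. */
    static bool refresh_once(struct pcp *pcp, bool stats_changed)
    {
            if (stats_changed)
                    pcp->expire = 3;        /* re-arm: 3 idle passes until flush */

            if (!pcp->expire || !pcp->count)
                    return false;

            if (--pcp->expire)
                    return false;           /* still counting down */

            return pcp->count != 0;         /* idle long enough: drain now */
    }

    int main(void)
    {
            struct pcp pcp = { .expire = 0, .count = 42 };
            bool activity[] = { true, false, false, false, false };

            /* One pass with activity, then idle passes until the drain fires. */
            for (unsigned i = 0; i < sizeof(activity) / sizeof(activity[0]); i++)
                    printf("pass %u: drain=%d\n", i, refresh_once(&pcp, activity[i]));
            return 0;
    }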
     886             : 
     887             : /*
     888             :  * Fold the data for an offline cpu into the global array.
     889             :  * There cannot be any access by the offline cpu and therefore
     890             :  * synchronization is simplified.
     891             :  */
     892             : void cpu_vm_stats_fold(int cpu)
     893             : {
     894             :         struct pglist_data *pgdat;
     895             :         struct zone *zone;
     896             :         int i;
     897             :         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
     898             :         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
     899             : 
     900             :         for_each_populated_zone(zone) {
     901             :                 struct per_cpu_zonestat *pzstats;
     902             : 
     903             :                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
     904             : 
     905             :                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
     906             :                         if (pzstats->vm_stat_diff[i]) {
     907             :                                 int v;
     908             : 
     909             :                                 v = pzstats->vm_stat_diff[i];
     910             :                                 pzstats->vm_stat_diff[i] = 0;
     911             :                                 atomic_long_add(v, &zone->vm_stat[i]);
     912             :                                 global_zone_diff[i] += v;
     913             :                         }
     914             :                 }
     915             : #ifdef CONFIG_NUMA
     916             :                 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
     917             :                         if (pzstats->vm_numa_event[i]) {
     918             :                                 unsigned long v;
     919             : 
     920             :                                 v = pzstats->vm_numa_event[i];
     921             :                                 pzstats->vm_numa_event[i] = 0;
     922             :                                 zone_numa_event_add(v, zone, i);
     923             :                         }
     924             :                 }
     925             : #endif
     926             :         }
     927             : 
     928             :         for_each_online_pgdat(pgdat) {
     929             :                 struct per_cpu_nodestat *p;
     930             : 
     931             :                 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
     932             : 
     933             :                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
     934             :                         if (p->vm_node_stat_diff[i]) {
     935             :                                 int v;
     936             : 
     937             :                                 v = p->vm_node_stat_diff[i];
     938             :                                 p->vm_node_stat_diff[i] = 0;
     939             :                                 atomic_long_add(v, &pgdat->vm_stat[i]);
     940             :                                 global_node_diff[i] += v;
     941             :                         }
     942             :         }
     943             : 
     944             :         fold_diff(global_zone_diff, global_node_diff);
     945             : }
     946             : 
     947             : /*
      948             :  * This is only called if !populated_zone(zone), which implies no other users of
      949             :  * pzstats->vm_stat_diff[] exist.
     950             :  */
     951             : void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
     952             : {
     953             :         unsigned long v;
     954             :         int i;
     955             : 
     956             :         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
     957             :                 if (pzstats->vm_stat_diff[i]) {
     958             :                         v = pzstats->vm_stat_diff[i];
     959             :                         pzstats->vm_stat_diff[i] = 0;
     960             :                         zone_page_state_add(v, zone, i);
     961             :                 }
     962             :         }
     963             : 
     964             : #ifdef CONFIG_NUMA
     965             :         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
     966             :                 if (pzstats->vm_numa_event[i]) {
     967             :                         v = pzstats->vm_numa_event[i];
     968             :                         pzstats->vm_numa_event[i] = 0;
     969             :                         zone_numa_event_add(v, zone, i);
     970             :                 }
     971             :         }
     972             : #endif
     973             : }
     974             : #endif
     975             : 
     976             : #ifdef CONFIG_NUMA
     977             : /*
     978             :  * Determine the per node value of a stat item. This function
     979             :  * is called frequently in a NUMA machine, so try to be as
     980             :  * frugal as possible.
     981             :  */
     982             : unsigned long sum_zone_node_page_state(int node,
     983             :                                  enum zone_stat_item item)
     984             : {
     985             :         struct zone *zones = NODE_DATA(node)->node_zones;
     986             :         int i;
     987             :         unsigned long count = 0;
     988             : 
     989             :         for (i = 0; i < MAX_NR_ZONES; i++)
     990             :                 count += zone_page_state(zones + i, item);
     991             : 
     992             :         return count;
     993             : }
     994             : 
     995             : /* Determine the per node value of a numa stat item. */
     996             : unsigned long sum_zone_numa_event_state(int node,
     997             :                                  enum numa_stat_item item)
     998             : {
     999             :         struct zone *zones = NODE_DATA(node)->node_zones;
    1000             :         unsigned long count = 0;
    1001             :         int i;
    1002             : 
    1003             :         for (i = 0; i < MAX_NR_ZONES; i++)
    1004             :                 count += zone_numa_event_state(zones + i, item);
    1005             : 
    1006             :         return count;
    1007             : }
    1008             : 
    1009             : /*
    1010             :  * Determine the per node value of a stat item.
    1011             :  */
    1012             : unsigned long node_page_state_pages(struct pglist_data *pgdat,
    1013             :                                     enum node_stat_item item)
    1014             : {
    1015             :         long x = atomic_long_read(&pgdat->vm_stat[item]);
    1016             : #ifdef CONFIG_SMP
    1017             :         if (x < 0)
    1018             :                 x = 0;
    1019             : #endif
    1020             :         return x;
    1021             : }
    1022             : 
    1023             : unsigned long node_page_state(struct pglist_data *pgdat,
    1024             :                               enum node_stat_item item)
    1025             : {
    1026             :         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
    1027             : 
    1028             :         return node_page_state_pages(pgdat, item);
    1029             : }
    1030             : #endif
    1031             : 
    1032             : #ifdef CONFIG_COMPACTION
    1033             : 
    1034             : struct contig_page_info {
    1035             :         unsigned long free_pages;
    1036             :         unsigned long free_blocks_total;
    1037             :         unsigned long free_blocks_suitable;
    1038             : };
    1039             : 
    1040             : /*
    1041             :  * Calculate the number of free pages in a zone, how many contiguous
    1042             :  * pages are free and how many are large enough to satisfy an allocation of
    1043             :  * the target size. Note that this function makes no attempt to estimate
    1044             :  * how many suitable free blocks there *might* be if MOVABLE pages were
    1045             :  * migrated. Calculating that is possible, but expensive and can be
      1046             :  * figured out from userspace.
    1047             :  */
    1048             : static void fill_contig_page_info(struct zone *zone,
    1049             :                                 unsigned int suitable_order,
    1050             :                                 struct contig_page_info *info)
    1051             : {
    1052             :         unsigned int order;
    1053             : 
    1054          51 :         info->free_pages = 0;
    1055          51 :         info->free_blocks_total = 0;
    1056          51 :         info->free_blocks_suitable = 0;
    1057             : 
    1058         612 :         for (order = 0; order < MAX_ORDER; order++) {
    1059             :                 unsigned long blocks;
    1060             : 
    1061             :                 /*
    1062             :                  * Count number of free blocks.
    1063             :                  *
    1064             :                  * Access to nr_free is lockless as nr_free is used only for
    1065             :                  * diagnostic purposes. Use data_race to avoid KCSAN warning.
    1066             :                  */
    1067         561 :                 blocks = data_race(zone->free_area[order].nr_free);
    1068         561 :                 info->free_blocks_total += blocks;
    1069             : 
    1070             :                 /* Count free base pages */
    1071         561 :                 info->free_pages += blocks << order;
    1072             : 
    1073             :                 /* Count the suitable free blocks */
    1074         561 :                 if (order >= suitable_order)
    1075         102 :                         info->free_blocks_suitable += blocks <<
    1076         102 :                                                 (order - suitable_order);
    1077             :         }
    1078             : }
    1079             : 
    1080             : /*
    1081             :  * A fragmentation index only makes sense if an allocation of a requested
    1082             :  * size would fail. If that is true, the fragmentation index indicates
    1083             :  * whether external fragmentation or a lack of memory was the problem.
    1084             :  * The value can be used to determine if page reclaim or compaction
      1085             :  * should be used.
    1086             :  */
    1087           0 : static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
    1088             : {
    1089           0 :         unsigned long requested = 1UL << order;
    1090             : 
    1091           0 :         if (WARN_ON_ONCE(order >= MAX_ORDER))
    1092             :                 return 0;
    1093             : 
    1094           0 :         if (!info->free_blocks_total)
    1095             :                 return 0;
    1096             : 
    1097             :         /* Fragmentation index only makes sense when a request would fail */
    1098           0 :         if (info->free_blocks_suitable)
    1099             :                 return -1000;
    1100             : 
    1101             :         /*
     1102             :          * The index is between 0 and 1, so return it scaled to 3 decimal places.
    1103             :          *
    1104             :          * 0 => allocation would fail due to lack of memory
    1105             :          * 1 => allocation would fail due to fragmentation
    1106             :          */
     1107           0 :         return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
    1108             : }
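                      : 
                      : /*
                      :  * Illustrative example (not part of the original source): for a failed
                      :  * order-3 request (requested = 8) with free_pages = 800 spread across
                      :  * free_blocks_total = 400 small blocks and no suitable block, the index
                      :  * is 1000 - (1000 + 800 * 1000 / 8) / 400 = 748, i.e. ~0.75, pointing
                      :  * at external fragmentation rather than a lack of memory.
                      :  */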
    1109             : 
    1110             : /*
     1111             :  * Calculates external fragmentation within a zone with respect to the given order.
    1112             :  * It is defined as the percentage of pages found in blocks of size
    1113             :  * less than 1 << order. It returns values in range [0, 100].
    1114             :  */
    1115          51 : unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
    1116             : {
    1117             :         struct contig_page_info info;
    1118             : 
    1119          51 :         fill_contig_page_info(zone, order, &info);
    1120          51 :         if (info.free_pages == 0)
    1121             :                 return 0;
    1122             : 
    1123         153 :         return div_u64((info.free_pages -
    1124         102 :                         (info.free_blocks_suitable << order)) * 100,
    1125             :                         info.free_pages);
    1126             : }
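                      : 
                      : /*
                      :  * Illustrative example (hypothetical numbers): with order = 3,
                      :  * free_pages = 1000 and free_blocks_suitable = 100, the suitable blocks
                      :  * cover 100 << 3 = 800 pages, so extfrag_for_order() returns
                      :  * (1000 - 800) * 100 / 1000 = 20, i.e. 20% of free memory sits in
                      :  * blocks smaller than the requested order.
                      :  */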
    1127             : 
    1128             : /* Same as __fragmentation index but allocs contig_page_info on stack */
    1129           0 : int fragmentation_index(struct zone *zone, unsigned int order)
    1130             : {
    1131             :         struct contig_page_info info;
    1132             : 
    1133           0 :         fill_contig_page_info(zone, order, &info);
    1134           0 :         return __fragmentation_index(order, &info);
    1135             : }
    1136             : #endif
    1137             : 
    1138             : #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
    1139             :     defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
    1140             : #ifdef CONFIG_ZONE_DMA
    1141             : #define TEXT_FOR_DMA(xx) xx "_dma",
    1142             : #else
    1143             : #define TEXT_FOR_DMA(xx)
    1144             : #endif
    1145             : 
    1146             : #ifdef CONFIG_ZONE_DMA32
    1147             : #define TEXT_FOR_DMA32(xx) xx "_dma32",
    1148             : #else
    1149             : #define TEXT_FOR_DMA32(xx)
    1150             : #endif
    1151             : 
    1152             : #ifdef CONFIG_HIGHMEM
    1153             : #define TEXT_FOR_HIGHMEM(xx) xx "_high",
    1154             : #else
    1155             : #define TEXT_FOR_HIGHMEM(xx)
    1156             : #endif
    1157             : 
    1158             : #ifdef CONFIG_ZONE_DEVICE
    1159             : #define TEXT_FOR_DEVICE(xx) xx "_device",
    1160             : #else
    1161             : #define TEXT_FOR_DEVICE(xx)
    1162             : #endif
    1163             : 
    1164             : #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
    1165             :                                         TEXT_FOR_HIGHMEM(xx) xx "_movable", \
    1166             :                                         TEXT_FOR_DEVICE(xx)
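                      : 
                      : /*
                      :  * For illustration only: on a config with ZONE_DMA, ZONE_DMA32 and
                      :  * ZONE_DEVICE but no HIGHMEM, TEXTS_FOR_ZONES("pgalloc") expands to
                      :  * "pgalloc_dma", "pgalloc_dma32", "pgalloc_normal", "pgalloc_movable",
                      :  * "pgalloc_device", matching the per-zone event counter names below.
                      :  */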
    1167             : 
    1168             : const char * const vmstat_text[] = {
    1169             :         /* enum zone_stat_item counters */
    1170             :         "nr_free_pages",
    1171             :         "nr_zone_inactive_anon",
    1172             :         "nr_zone_active_anon",
    1173             :         "nr_zone_inactive_file",
    1174             :         "nr_zone_active_file",
    1175             :         "nr_zone_unevictable",
    1176             :         "nr_zone_write_pending",
    1177             :         "nr_mlock",
    1178             :         "nr_bounce",
    1179             : #if IS_ENABLED(CONFIG_ZSMALLOC)
    1180             :         "nr_zspages",
    1181             : #endif
    1182             :         "nr_free_cma",
    1183             : 
    1184             :         /* enum numa_stat_item counters */
    1185             : #ifdef CONFIG_NUMA
    1186             :         "numa_hit",
    1187             :         "numa_miss",
    1188             :         "numa_foreign",
    1189             :         "numa_interleave",
    1190             :         "numa_local",
    1191             :         "numa_other",
    1192             : #endif
    1193             : 
    1194             :         /* enum node_stat_item counters */
    1195             :         "nr_inactive_anon",
    1196             :         "nr_active_anon",
    1197             :         "nr_inactive_file",
    1198             :         "nr_active_file",
    1199             :         "nr_unevictable",
    1200             :         "nr_slab_reclaimable",
    1201             :         "nr_slab_unreclaimable",
    1202             :         "nr_isolated_anon",
    1203             :         "nr_isolated_file",
    1204             :         "workingset_nodes",
    1205             :         "workingset_refault_anon",
    1206             :         "workingset_refault_file",
    1207             :         "workingset_activate_anon",
    1208             :         "workingset_activate_file",
    1209             :         "workingset_restore_anon",
    1210             :         "workingset_restore_file",
    1211             :         "workingset_nodereclaim",
    1212             :         "nr_anon_pages",
    1213             :         "nr_mapped",
    1214             :         "nr_file_pages",
    1215             :         "nr_dirty",
    1216             :         "nr_writeback",
    1217             :         "nr_writeback_temp",
    1218             :         "nr_shmem",
    1219             :         "nr_shmem_hugepages",
    1220             :         "nr_shmem_pmdmapped",
    1221             :         "nr_file_hugepages",
    1222             :         "nr_file_pmdmapped",
    1223             :         "nr_anon_transparent_hugepages",
    1224             :         "nr_vmscan_write",
    1225             :         "nr_vmscan_immediate_reclaim",
    1226             :         "nr_dirtied",
    1227             :         "nr_written",
    1228             :         "nr_throttled_written",
    1229             :         "nr_kernel_misc_reclaimable",
    1230             :         "nr_foll_pin_acquired",
    1231             :         "nr_foll_pin_released",
    1232             :         "nr_kernel_stack",
    1233             : #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
    1234             :         "nr_shadow_call_stack",
    1235             : #endif
    1236             :         "nr_page_table_pages",
    1237             :         "nr_sec_page_table_pages",
    1238             : #ifdef CONFIG_SWAP
    1239             :         "nr_swapcached",
    1240             : #endif
    1241             : #ifdef CONFIG_NUMA_BALANCING
    1242             :         "pgpromote_success",
    1243             :         "pgpromote_candidate",
    1244             : #endif
    1245             : 
    1246             :         /* enum writeback_stat_item counters */
    1247             :         "nr_dirty_threshold",
    1248             :         "nr_dirty_background_threshold",
    1249             : 
    1250             : #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
    1251             :         /* enum vm_event_item counters */
    1252             :         "pgpgin",
    1253             :         "pgpgout",
    1254             :         "pswpin",
    1255             :         "pswpout",
    1256             : 
    1257             :         TEXTS_FOR_ZONES("pgalloc")
    1258             :         TEXTS_FOR_ZONES("allocstall")
    1259             :         TEXTS_FOR_ZONES("pgskip")
    1260             : 
    1261             :         "pgfree",
    1262             :         "pgactivate",
    1263             :         "pgdeactivate",
    1264             :         "pglazyfree",
    1265             : 
    1266             :         "pgfault",
    1267             :         "pgmajfault",
    1268             :         "pglazyfreed",
    1269             : 
    1270             :         "pgrefill",
    1271             :         "pgreuse",
    1272             :         "pgsteal_kswapd",
    1273             :         "pgsteal_direct",
    1274             :         "pgsteal_khugepaged",
    1275             :         "pgdemote_kswapd",
    1276             :         "pgdemote_direct",
    1277             :         "pgdemote_khugepaged",
    1278             :         "pgscan_kswapd",
    1279             :         "pgscan_direct",
    1280             :         "pgscan_khugepaged",
    1281             :         "pgscan_direct_throttle",
    1282             :         "pgscan_anon",
    1283             :         "pgscan_file",
    1284             :         "pgsteal_anon",
    1285             :         "pgsteal_file",
    1286             : 
    1287             : #ifdef CONFIG_NUMA
    1288             :         "zone_reclaim_failed",
    1289             : #endif
    1290             :         "pginodesteal",
    1291             :         "slabs_scanned",
    1292             :         "kswapd_inodesteal",
    1293             :         "kswapd_low_wmark_hit_quickly",
    1294             :         "kswapd_high_wmark_hit_quickly",
    1295             :         "pageoutrun",
    1296             : 
    1297             :         "pgrotated",
    1298             : 
    1299             :         "drop_pagecache",
    1300             :         "drop_slab",
    1301             :         "oom_kill",
    1302             : 
    1303             : #ifdef CONFIG_NUMA_BALANCING
    1304             :         "numa_pte_updates",
    1305             :         "numa_huge_pte_updates",
    1306             :         "numa_hint_faults",
    1307             :         "numa_hint_faults_local",
    1308             :         "numa_pages_migrated",
    1309             : #endif
    1310             : #ifdef CONFIG_MIGRATION
    1311             :         "pgmigrate_success",
    1312             :         "pgmigrate_fail",
    1313             :         "thp_migration_success",
    1314             :         "thp_migration_fail",
    1315             :         "thp_migration_split",
    1316             : #endif
    1317             : #ifdef CONFIG_COMPACTION
    1318             :         "compact_migrate_scanned",
    1319             :         "compact_free_scanned",
    1320             :         "compact_isolated",
    1321             :         "compact_stall",
    1322             :         "compact_fail",
    1323             :         "compact_success",
    1324             :         "compact_daemon_wake",
    1325             :         "compact_daemon_migrate_scanned",
    1326             :         "compact_daemon_free_scanned",
    1327             : #endif
    1328             : 
    1329             : #ifdef CONFIG_HUGETLB_PAGE
    1330             :         "htlb_buddy_alloc_success",
    1331             :         "htlb_buddy_alloc_fail",
    1332             : #endif
    1333             : #ifdef CONFIG_CMA
    1334             :         "cma_alloc_success",
    1335             :         "cma_alloc_fail",
    1336             : #endif
    1337             :         "unevictable_pgs_culled",
    1338             :         "unevictable_pgs_scanned",
    1339             :         "unevictable_pgs_rescued",
    1340             :         "unevictable_pgs_mlocked",
    1341             :         "unevictable_pgs_munlocked",
    1342             :         "unevictable_pgs_cleared",
    1343             :         "unevictable_pgs_stranded",
    1344             : 
    1345             : #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    1346             :         "thp_fault_alloc",
    1347             :         "thp_fault_fallback",
    1348             :         "thp_fault_fallback_charge",
    1349             :         "thp_collapse_alloc",
    1350             :         "thp_collapse_alloc_failed",
    1351             :         "thp_file_alloc",
    1352             :         "thp_file_fallback",
    1353             :         "thp_file_fallback_charge",
    1354             :         "thp_file_mapped",
    1355             :         "thp_split_page",
    1356             :         "thp_split_page_failed",
    1357             :         "thp_deferred_split_page",
    1358             :         "thp_split_pmd",
    1359             :         "thp_scan_exceed_none_pte",
    1360             :         "thp_scan_exceed_swap_pte",
    1361             :         "thp_scan_exceed_share_pte",
    1362             : #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
    1363             :         "thp_split_pud",
    1364             : #endif
    1365             :         "thp_zero_page_alloc",
    1366             :         "thp_zero_page_alloc_failed",
    1367             :         "thp_swpout",
    1368             :         "thp_swpout_fallback",
    1369             : #endif
    1370             : #ifdef CONFIG_MEMORY_BALLOON
    1371             :         "balloon_inflate",
    1372             :         "balloon_deflate",
    1373             : #ifdef CONFIG_BALLOON_COMPACTION
    1374             :         "balloon_migrate",
    1375             : #endif
    1376             : #endif /* CONFIG_MEMORY_BALLOON */
    1377             : #ifdef CONFIG_DEBUG_TLBFLUSH
    1378             :         "nr_tlb_remote_flush",
    1379             :         "nr_tlb_remote_flush_received",
    1380             :         "nr_tlb_local_flush_all",
    1381             :         "nr_tlb_local_flush_one",
    1382             : #endif /* CONFIG_DEBUG_TLBFLUSH */
    1383             : 
    1384             : #ifdef CONFIG_SWAP
    1385             :         "swap_ra",
    1386             :         "swap_ra_hit",
    1387             : #ifdef CONFIG_KSM
    1388             :         "ksm_swpin_copy",
    1389             : #endif
    1390             : #endif
    1391             : #ifdef CONFIG_KSM
    1392             :         "cow_ksm",
    1393             : #endif
    1394             : #ifdef CONFIG_ZSWAP
    1395             :         "zswpin",
    1396             :         "zswpout",
    1397             : #endif
    1398             : #ifdef CONFIG_X86
    1399             :         "direct_map_level2_splits",
    1400             :         "direct_map_level3_splits",
    1401             : #endif
    1402             : #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
    1403             : };
    1404             : #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
    1405             : 
    1406             : #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
    1407             :      defined(CONFIG_PROC_FS)
    1408           0 : static void *frag_start(struct seq_file *m, loff_t *pos)
    1409             : {
    1410             :         pg_data_t *pgdat;
    1411           0 :         loff_t node = *pos;
    1412             : 
    1413           0 :         for (pgdat = first_online_pgdat();
    1414           0 :              pgdat && node;
    1415           0 :              pgdat = next_online_pgdat(pgdat))
    1416           0 :                 --node;
    1417             : 
    1418           0 :         return pgdat;
    1419             : }
    1420             : 
    1421           0 : static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
    1422             : {
    1423           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1424             : 
    1425           0 :         (*pos)++;
    1426           0 :         return next_online_pgdat(pgdat);
    1427             : }
    1428             : 
    1429           0 : static void frag_stop(struct seq_file *m, void *arg)
    1430             : {
    1431           0 : }
    1432             : 
    1433             : /*
    1434             :  * Walk zones in a node and print using a callback.
    1435             :  * If @assert_populated is true, only use callback for zones that are populated.
    1436             :  */
    1437           0 : static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
    1438             :                 bool assert_populated, bool nolock,
    1439             :                 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
    1440             : {
    1441             :         struct zone *zone;
    1442           0 :         struct zone *node_zones = pgdat->node_zones;
    1443             :         unsigned long flags;
    1444             : 
    1445           0 :         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
    1446           0 :                 if (assert_populated && !populated_zone(zone))
    1447           0 :                         continue;
    1448             : 
    1449           0 :                 if (!nolock)
    1450           0 :                         spin_lock_irqsave(&zone->lock, flags);
    1451           0 :                 print(m, pgdat, zone);
    1452           0 :                 if (!nolock)
    1453           0 :                         spin_unlock_irqrestore(&zone->lock, flags);
    1454             :         }
    1455           0 : }
    1456             : #endif
    1457             : 
    1458             : #ifdef CONFIG_PROC_FS
    1459           0 : static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
    1460             :                                                 struct zone *zone)
    1461             : {
    1462             :         int order;
    1463             : 
    1464           0 :         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
    1465           0 :         for (order = 0; order < MAX_ORDER; ++order)
    1466             :                 /*
    1467             :                  * Access to nr_free is lockless as nr_free is used only for
    1468             :                  * printing purposes. Use data_race to avoid KCSAN warning.
    1469             :                  */
    1470           0 :                 seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
    1471           0 :         seq_putc(m, '\n');
    1472           0 : }
    1473             : 
    1474             : /*
    1475             :  * This walks the free areas for each zone.
    1476             :  */
    1477           0 : static int frag_show(struct seq_file *m, void *arg)
    1478             : {
    1479           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1480           0 :         walk_zones_in_node(m, pgdat, true, false, frag_show_print);
    1481           0 :         return 0;
    1482             : }
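                      : 
                      : /*
                      :  * A hypothetical /proc/buddyinfo line as produced by frag_show_print(),
                      :  * one free-block count per order up to MAX_ORDER (values invented for
                      :  * illustration):
                      :  *
                      :  *   Node 0, zone   Normal    145     62     21     10      5      3      1      1      0      1      2
                      :  */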
    1483             : 
    1484           0 : static void pagetypeinfo_showfree_print(struct seq_file *m,
    1485             :                                         pg_data_t *pgdat, struct zone *zone)
    1486             : {
    1487             :         int order, mtype;
    1488             : 
    1489           0 :         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
    1490           0 :                 seq_printf(m, "Node %4d, zone %8s, type %12s ",
    1491             :                                         pgdat->node_id,
    1492             :                                         zone->name,
    1493             :                                         migratetype_names[mtype]);
    1494           0 :                 for (order = 0; order < MAX_ORDER; ++order) {
    1495           0 :                         unsigned long freecount = 0;
    1496             :                         struct free_area *area;
    1497             :                         struct list_head *curr;
    1498           0 :                         bool overflow = false;
    1499             : 
    1500           0 :                         area = &(zone->free_area[order]);
    1501             : 
    1502           0 :                         list_for_each(curr, &area->free_list[mtype]) {
    1503             :                                 /*
    1504             :                                  * Cap the free_list iteration because it might
    1505             :                                  * be really large and we are under a spinlock
    1506             :                                  * so a long time spent here could trigger a
    1507             :                                  * hard lockup detector. Anyway this is a
    1508             :                                  * debugging tool so knowing there is a handful
    1509             :                                  * of pages of this order should be more than
    1510             :                                  * sufficient.
    1511             :                                  */
    1512           0 :                                 if (++freecount >= 100000) {
    1513             :                                         overflow = true;
    1514             :                                         break;
    1515             :                                 }
    1516             :                         }
    1517           0 :                         seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
    1518           0 :                         spin_unlock_irq(&zone->lock);
    1519           0 :                         cond_resched();
    1520           0 :                         spin_lock_irq(&zone->lock);
    1521             :                 }
    1522           0 :                 seq_putc(m, '\n');
    1523             :         }
    1524           0 : }
    1525             : 
     1526             : /* Print out the free pages at each order for each migratetype */
    1527           0 : static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
    1528             : {
    1529             :         int order;
    1530           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1531             : 
    1532             :         /* Print header */
    1533           0 :         seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
    1534           0 :         for (order = 0; order < MAX_ORDER; ++order)
    1535           0 :                 seq_printf(m, "%6d ", order);
    1536           0 :         seq_putc(m, '\n');
    1537             : 
    1538           0 :         walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
    1539           0 : }
    1540             : 
    1541           0 : static void pagetypeinfo_showblockcount_print(struct seq_file *m,
    1542             :                                         pg_data_t *pgdat, struct zone *zone)
    1543             : {
    1544             :         int mtype;
    1545             :         unsigned long pfn;
    1546           0 :         unsigned long start_pfn = zone->zone_start_pfn;
    1547           0 :         unsigned long end_pfn = zone_end_pfn(zone);
    1548           0 :         unsigned long count[MIGRATE_TYPES] = { 0, };
    1549             : 
    1550           0 :         for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
    1551             :                 struct page *page;
    1552             : 
    1553           0 :                 page = pfn_to_online_page(pfn);
    1554           0 :                 if (!page)
    1555           0 :                         continue;
    1556             : 
    1557           0 :                 if (page_zone(page) != zone)
    1558           0 :                         continue;
    1559             : 
    1560           0 :                 mtype = get_pageblock_migratetype(page);
    1561             : 
    1562           0 :                 if (mtype < MIGRATE_TYPES)
    1563           0 :                         count[mtype]++;
    1564             :         }
    1565             : 
    1566             :         /* Print counts */
    1567           0 :         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
    1568           0 :         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
    1569           0 :                 seq_printf(m, "%12lu ", count[mtype]);
    1570           0 :         seq_putc(m, '\n');
    1571           0 : }
    1572             : 
    1573             : /* Print out the number of pageblocks for each migratetype */
    1574           0 : static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
    1575             : {
    1576             :         int mtype;
    1577           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1578             : 
    1579           0 :         seq_printf(m, "\n%-23s", "Number of blocks type ");
    1580           0 :         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
    1581           0 :                 seq_printf(m, "%12s ", migratetype_names[mtype]);
    1582           0 :         seq_putc(m, '\n');
    1583           0 :         walk_zones_in_node(m, pgdat, true, false,
    1584             :                 pagetypeinfo_showblockcount_print);
    1585           0 : }
    1586             : 
    1587             : /*
    1588             :  * Print out the number of pageblocks for each migratetype that contain pages
    1589             :  * of other types. This gives an indication of how well fallbacks are being
    1590             :  * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
     1591             :  * to determine what is going on.
    1592             :  */
    1593             : static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
    1594             : {
    1595             : #ifdef CONFIG_PAGE_OWNER
    1596             :         int mtype;
    1597             : 
    1598             :         if (!static_branch_unlikely(&page_owner_inited))
    1599             :                 return;
    1600             : 
    1601             :         drain_all_pages(NULL);
    1602             : 
    1603             :         seq_printf(m, "\n%-23s", "Number of mixed blocks ");
    1604             :         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
    1605             :                 seq_printf(m, "%12s ", migratetype_names[mtype]);
    1606             :         seq_putc(m, '\n');
    1607             : 
    1608             :         walk_zones_in_node(m, pgdat, true, true,
    1609             :                 pagetypeinfo_showmixedcount_print);
    1610             : #endif /* CONFIG_PAGE_OWNER */
    1611             : }
    1612             : 
    1613             : /*
    1614             :  * This prints out statistics in relation to grouping pages by mobility.
    1615             :  * It is expensive to collect so do not constantly read the file.
    1616             :  */
    1617           0 : static int pagetypeinfo_show(struct seq_file *m, void *arg)
    1618             : {
    1619           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1620             : 
    1621             :         /* check memoryless node */
    1622           0 :         if (!node_state(pgdat->node_id, N_MEMORY))
    1623             :                 return 0;
    1624             : 
    1625           0 :         seq_printf(m, "Page block order: %d\n", pageblock_order);
    1626           0 :         seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
    1627           0 :         seq_putc(m, '\n');
    1628           0 :         pagetypeinfo_showfree(m, pgdat);
    1629           0 :         pagetypeinfo_showblockcount(m, pgdat);
    1630           0 :         pagetypeinfo_showmixedcount(m, pgdat);
    1631             : 
    1632           0 :         return 0;
    1633             : }
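                      : 
                      : /*
                      :  * On a typical x86_64 build with 2 MiB pageblocks, the header printed
                      :  * above would read (values shown for illustration only):
                      :  *
                      :  *   Page block order: 9
                      :  *   Pages per block:  512
                      :  */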
    1634             : 
    1635             : static const struct seq_operations fragmentation_op = {
    1636             :         .start  = frag_start,
    1637             :         .next   = frag_next,
    1638             :         .stop   = frag_stop,
    1639             :         .show   = frag_show,
    1640             : };
    1641             : 
    1642             : static const struct seq_operations pagetypeinfo_op = {
    1643             :         .start  = frag_start,
    1644             :         .next   = frag_next,
    1645             :         .stop   = frag_stop,
    1646             :         .show   = pagetypeinfo_show,
    1647             : };
    1648             : 
    1649             : static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
    1650             : {
    1651             :         int zid;
    1652             : 
    1653           0 :         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
    1654           0 :                 struct zone *compare = &pgdat->node_zones[zid];
    1655             : 
    1656           0 :                 if (populated_zone(compare))
    1657           0 :                         return zone == compare;
    1658             :         }
    1659             : 
    1660             :         return false;
    1661             : }
    1662             : 
    1663           0 : static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
    1664             :                                                         struct zone *zone)
    1665             : {
    1666             :         int i;
    1667           0 :         seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
    1668           0 :         if (is_zone_first_populated(pgdat, zone)) {
    1669           0 :                 seq_printf(m, "\n  per-node stats");
    1670           0 :                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
    1671           0 :                         unsigned long pages = node_page_state_pages(pgdat, i);
    1672             : 
    1673           0 :                         if (vmstat_item_print_in_thp(i))
    1674             :                                 pages /= HPAGE_PMD_NR;
    1675           0 :                         seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
    1676             :                                    pages);
    1677             :                 }
    1678             :         }
    1679           0 :         seq_printf(m,
    1680             :                    "\n  pages free     %lu"
    1681             :                    "\n        boost    %lu"
    1682             :                    "\n        min      %lu"
    1683             :                    "\n        low      %lu"
    1684             :                    "\n        high     %lu"
    1685             :                    "\n        spanned  %lu"
    1686             :                    "\n        present  %lu"
    1687             :                    "\n        managed  %lu"
    1688             :                    "\n        cma      %lu",
    1689             :                    zone_page_state(zone, NR_FREE_PAGES),
    1690             :                    zone->watermark_boost,
    1691           0 :                    min_wmark_pages(zone),
    1692           0 :                    low_wmark_pages(zone),
    1693           0 :                    high_wmark_pages(zone),
    1694             :                    zone->spanned_pages,
    1695             :                    zone->present_pages,
    1696             :                    zone_managed_pages(zone),
    1697             :                    zone_cma_pages(zone));
    1698             : 
    1699           0 :         seq_printf(m,
    1700             :                    "\n        protection: (%ld",
    1701             :                    zone->lowmem_reserve[0]);
    1702           0 :         for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
    1703           0 :                 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
    1704           0 :         seq_putc(m, ')');
    1705             : 
    1706             :         /* If unpopulated, no other information is useful */
    1707           0 :         if (!populated_zone(zone)) {
    1708           0 :                 seq_putc(m, '\n');
    1709           0 :                 return;
    1710             :         }
    1711             : 
    1712           0 :         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
    1713           0 :                 seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
    1714             :                            zone_page_state(zone, i));
    1715             : 
    1716             : #ifdef CONFIG_NUMA
    1717             :         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
    1718             :                 seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
    1719             :                            zone_numa_event_state(zone, i));
    1720             : #endif
    1721             : 
    1722           0 :         seq_printf(m, "\n  pagesets");
    1723           0 :         for_each_online_cpu(i) {
    1724             :                 struct per_cpu_pages *pcp;
    1725             :                 struct per_cpu_zonestat __maybe_unused *pzstats;
    1726             : 
    1727           0 :                 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
    1728           0 :                 seq_printf(m,
    1729             :                            "\n    cpu: %i"
    1730             :                            "\n              count: %i"
    1731             :                            "\n              high:  %i"
    1732             :                            "\n              batch: %i",
    1733             :                            i,
    1734             :                            pcp->count,
    1735             :                            pcp->high,
    1736             :                            pcp->batch);
    1737             : #ifdef CONFIG_SMP
    1738             :                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
    1739             :                 seq_printf(m, "\n  vm stats threshold: %d",
    1740             :                                 pzstats->stat_threshold);
    1741             : #endif
    1742             :         }
    1743           0 :         seq_printf(m,
    1744             :                    "\n  node_unreclaimable:  %u"
    1745             :                    "\n  start_pfn:           %lu",
    1746           0 :                    pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
    1747             :                    zone->zone_start_pfn);
    1748           0 :         seq_putc(m, '\n');
    1749             : }
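                      : 
                      : /*
                      :  * A shortened, hypothetical /proc/zoneinfo excerpt as emitted by
                      :  * zoneinfo_show_print() (numbers invented for illustration):
                      :  *
                      :  *   Node 0, zone   Normal
                      :  *     per-node stats
                      :  *         nr_inactive_anon 12345
                      :  *         ...
                      :  *     pages free     67890
                      :  *           min      1024
                      :  *           ...
                      :  */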
    1750             : 
    1751             : /*
    1752             :  * Output information about zones in @pgdat.  All zones are printed regardless
    1753             :  * of whether they are populated or not: lowmem_reserve_ratio operates on the
    1754             :  * set of all zones and userspace would not be aware of such zones if they are
    1755             :  * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
    1756             :  */
    1757           0 : static int zoneinfo_show(struct seq_file *m, void *arg)
    1758             : {
    1759           0 :         pg_data_t *pgdat = (pg_data_t *)arg;
    1760           0 :         walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
    1761           0 :         return 0;
    1762             : }
    1763             : 
    1764             : static const struct seq_operations zoneinfo_op = {
     1765             :         .start  = frag_start, /* iterate over all zones, with the same
     1766             :                                * iterator as fragmentation_op. */
    1767             :         .next   = frag_next,
    1768             :         .stop   = frag_stop,
    1769             :         .show   = zoneinfo_show,
    1770             : };
    1771             : 
    1772             : #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
    1773             :                          NR_VM_NUMA_EVENT_ITEMS + \
    1774             :                          NR_VM_NODE_STAT_ITEMS + \
    1775             :                          NR_VM_WRITEBACK_STAT_ITEMS + \
    1776             :                          (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
    1777             :                           NR_VM_EVENT_ITEMS : 0))
    1778             : 
    1779           0 : static void *vmstat_start(struct seq_file *m, loff_t *pos)
    1780             : {
    1781             :         unsigned long *v;
    1782             :         int i;
    1783             : 
    1784           0 :         if (*pos >= NR_VMSTAT_ITEMS)
    1785             :                 return NULL;
    1786             : 
    1787             :         BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
    1788             :         fold_vm_numa_events();
    1789           0 :         v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
    1790           0 :         m->private = v;
    1791           0 :         if (!v)
    1792             :                 return ERR_PTR(-ENOMEM);
    1793           0 :         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
    1794           0 :                 v[i] = global_zone_page_state(i);
    1795             :         v += NR_VM_ZONE_STAT_ITEMS;
    1796             : 
    1797             : #ifdef CONFIG_NUMA
    1798             :         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
    1799             :                 v[i] = global_numa_event_state(i);
    1800             :         v += NR_VM_NUMA_EVENT_ITEMS;
    1801             : #endif
    1802             : 
    1803           0 :         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
    1804           0 :                 v[i] = global_node_page_state_pages(i);
    1805           0 :                 if (vmstat_item_print_in_thp(i))
    1806             :                         v[i] /= HPAGE_PMD_NR;
    1807             :         }
    1808           0 :         v += NR_VM_NODE_STAT_ITEMS;
    1809             : 
    1810           0 :         global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
    1811             :                             v + NR_DIRTY_THRESHOLD);
    1812           0 :         v += NR_VM_WRITEBACK_STAT_ITEMS;
    1813             : 
    1814             : #ifdef CONFIG_VM_EVENT_COUNTERS
    1815           0 :         all_vm_events(v);
    1816           0 :         v[PGPGIN] /= 2;         /* sectors -> kbytes */
    1817           0 :         v[PGPGOUT] /= 2;
    1818             : #endif
    1819           0 :         return (unsigned long *)m->private + *pos;
    1820             : }
    1821             : 
    1822           0 : static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
    1823             : {
    1824           0 :         (*pos)++;
    1825           0 :         if (*pos >= NR_VMSTAT_ITEMS)
    1826             :                 return NULL;
    1827           0 :         return (unsigned long *)m->private + *pos;
    1828             : }
    1829             : 
    1830           0 : static int vmstat_show(struct seq_file *m, void *arg)
    1831             : {
    1832           0 :         unsigned long *l = arg;
    1833           0 :         unsigned long off = l - (unsigned long *)m->private;
    1834             : 
    1835           0 :         seq_puts(m, vmstat_text[off]);
    1836           0 :         seq_put_decimal_ull(m, " ", *l);
    1837           0 :         seq_putc(m, '\n');
    1838             : 
    1839           0 :         if (off == NR_VMSTAT_ITEMS - 1) {
    1840             :                 /*
    1841             :                  * We've come to the end - add any deprecated counters to avoid
    1842             :                  * breaking userspace which might depend on them being present.
    1843             :                  */
    1844           0 :                 seq_puts(m, "nr_unstable 0\n");
    1845             :         }
    1846           0 :         return 0;
    1847             : }
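                      : 
                      : /*
                      :  * Sample /proc/vmstat output as produced by vmstat_show(), with
                      :  * invented values for illustration:
                      :  *
                      :  *   nr_free_pages 123456
                      :  *   nr_zone_inactive_anon 7890
                      :  *   ...
                      :  *   nr_unstable 0        (appended compat entry, always zero)
                      :  */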
    1848             : 
    1849           0 : static void vmstat_stop(struct seq_file *m, void *arg)
    1850             : {
    1851           0 :         kfree(m->private);
    1852           0 :         m->private = NULL;
    1853           0 : }
    1854             : 
    1855             : static const struct seq_operations vmstat_op = {
    1856             :         .start  = vmstat_start,
    1857             :         .next   = vmstat_next,
    1858             :         .stop   = vmstat_stop,
    1859             :         .show   = vmstat_show,
    1860             : };
    1861             : #endif /* CONFIG_PROC_FS */
    1862             : 
    1863             : #ifdef CONFIG_SMP
    1864             : static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
    1865             : int sysctl_stat_interval __read_mostly = HZ;
    1866             : 
    1867             : #ifdef CONFIG_PROC_FS
    1868             : static void refresh_vm_stats(struct work_struct *work)
    1869             : {
    1870             :         refresh_cpu_vm_stats(true);
    1871             : }
    1872             : 
    1873             : int vmstat_refresh(struct ctl_table *table, int write,
    1874             :                    void *buffer, size_t *lenp, loff_t *ppos)
    1875             : {
    1876             :         long val;
    1877             :         int err;
    1878             :         int i;
    1879             : 
    1880             :         /*
    1881             :          * The regular update, every sysctl_stat_interval, may come later
     1882             :          * than expected, leaving a significant amount in per_cpu buckets.
    1883             :          * This is particularly misleading when checking a quantity of HUGE
    1884             :          * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
    1885             :          * which can equally be echo'ed to or cat'ted from (by root),
    1886             :          * can be used to update the stats just before reading them.
    1887             :          *
    1888             :          * Oh, and since global_zone_page_state() etc. are so careful to hide
    1889             :          * transiently negative values, report an error here if any of
    1890             :          * the stats is negative, so we know to go looking for imbalance.
    1891             :          */
    1892             :         err = schedule_on_each_cpu(refresh_vm_stats);
    1893             :         if (err)
    1894             :                 return err;
    1895             :         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
    1896             :                 /*
    1897             :                  * Skip checking stats known to go negative occasionally.
    1898             :                  */
    1899             :                 switch (i) {
    1900             :                 case NR_ZONE_WRITE_PENDING:
    1901             :                 case NR_FREE_CMA_PAGES:
    1902             :                         continue;
    1903             :                 }
    1904             :                 val = atomic_long_read(&vm_zone_stat[i]);
    1905             :                 if (val < 0) {
    1906             :                         pr_warn("%s: %s %ld\n",
    1907             :                                 __func__, zone_stat_name(i), val);
    1908             :                 }
    1909             :         }
    1910             :         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
    1911             :                 /*
    1912             :                  * Skip checking stats known to go negative occasionally.
    1913             :                  */
    1914             :                 switch (i) {
    1915             :                 case NR_WRITEBACK:
    1916             :                         continue;
    1917             :                 }
    1918             :                 val = atomic_long_read(&vm_node_stat[i]);
    1919             :                 if (val < 0) {
    1920             :                         pr_warn("%s: %s %ld\n",
    1921             :                                 __func__, node_stat_name(i), val);
    1922             :                 }
    1923             :         }
    1924             :         if (write)
    1925             :                 *ppos += *lenp;
    1926             :         else
    1927             :                 *lenp = 0;
    1928             :         return 0;
    1929             : }
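                      : 
                      : /*
                      :  * Typical usage from userspace (root), forcing the per-cpu deltas to be
                      :  * folded in before reading the counters:
                      :  *
                      :  *   echo 1 > /proc/sys/vm/stat_refresh
                      :  *   cat /proc/vmstat
                      :  */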
    1930             : #endif /* CONFIG_PROC_FS */
    1931             : 
    1932             : static void vmstat_update(struct work_struct *w)
    1933             : {
    1934             :         if (refresh_cpu_vm_stats(true)) {
    1935             :                 /*
    1936             :                  * Counters were updated so we expect more updates
    1937             :                  * to occur in the future. Keep on running the
    1938             :                  * update worker thread.
    1939             :                  */
    1940             :                 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
    1941             :                                 this_cpu_ptr(&vmstat_work),
    1942             :                                 round_jiffies_relative(sysctl_stat_interval));
    1943             :         }
    1944             : }
    1945             : 
    1946             : /*
    1947             :  * Check if the diffs for a certain cpu indicate that
    1948             :  * an update is needed.
    1949             :  */
    1950             : static bool need_update(int cpu)
    1951             : {
    1952             :         pg_data_t *last_pgdat = NULL;
    1953             :         struct zone *zone;
    1954             : 
    1955             :         for_each_populated_zone(zone) {
    1956             :                 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
    1957             :                 struct per_cpu_nodestat *n;
    1958             : 
    1959             :                 /*
    1960             :                  * The fast way of checking if there are any vmstat diffs.
    1961             :                  */
    1962             :                 if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
    1963             :                         return true;
    1964             : 
    1965             :                 if (last_pgdat == zone->zone_pgdat)
    1966             :                         continue;
    1967             :                 last_pgdat = zone->zone_pgdat;
    1968             :                 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
    1969             :                 if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
    1970             :                         return true;
    1971             :         }
    1972             :         return false;
    1973             : }
    1974             : 
    1975             : /*
    1976             :  * Switch off vmstat processing and then fold all the remaining differentials
    1977             :  * until the diffs stay at zero. The function is used by NOHZ and can only be
    1978             :  * invoked when tick processing is not active.
    1979             :  */
    1980             : void quiet_vmstat(void)
    1981             : {
    1982             :         if (system_state != SYSTEM_RUNNING)
    1983             :                 return;
    1984             : 
    1985             :         if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
    1986             :                 return;
    1987             : 
    1988             :         if (!need_update(smp_processor_id()))
    1989             :                 return;
    1990             : 
    1991             :         /*
    1992             :          * Just refresh counters and do not care about the pending delayed
     1993             :  * vmstat_update. It doesn't fire often enough to matter, and canceling
    1994             :          * it would be too expensive from this path.
    1995             :          * vmstat_shepherd will take care about that for us.
    1996             :          */
    1997             :         refresh_cpu_vm_stats(false);
    1998             : }
    1999             : 
    2000             : /*
     2001             :  * Shepherd worker that checks the per-cpu differentials of processors
     2002             :  * whose vmstat update workers have been disabled because of inactivity.
    2005             :  */
    2006             : static void vmstat_shepherd(struct work_struct *w);
    2007             : 
    2008             : static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
    2009             : 
    2010             : static void vmstat_shepherd(struct work_struct *w)
    2011             : {
    2012             :         int cpu;
    2013             : 
    2014             :         cpus_read_lock();
    2015             :         /* Check processors whose vmstat worker threads have been disabled */
    2016             :         for_each_online_cpu(cpu) {
    2017             :                 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
    2018             : 
    2019             :                 if (!delayed_work_pending(dw) && need_update(cpu))
    2020             :                         queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
    2021             : 
    2022             :                 cond_resched();
    2023             :         }
    2024             :         cpus_read_unlock();
    2025             : 
    2026             :         schedule_delayed_work(&shepherd,
    2027             :                 round_jiffies_relative(sysctl_stat_interval));
    2028             : }
    2029             : 
    2030             : static void __init start_shepherd_timer(void)
    2031             : {
    2032             :         int cpu;
    2033             : 
    2034             :         for_each_possible_cpu(cpu)
    2035             :                 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
    2036             :                         vmstat_update);
    2037             : 
    2038             :         schedule_delayed_work(&shepherd,
    2039             :                 round_jiffies_relative(sysctl_stat_interval));
    2040             : }
    2041             : 
    2042             : static void __init init_cpu_node_state(void)
    2043             : {
    2044             :         int node;
    2045             : 
    2046             :         for_each_online_node(node) {
    2047             :                 if (!cpumask_empty(cpumask_of_node(node)))
    2048             :                         node_set_state(node, N_CPU);
    2049             :         }
    2050             : }
    2051             : 
    2052             : static int vmstat_cpu_online(unsigned int cpu)
    2053             : {
    2054             :         refresh_zone_stat_thresholds();
    2055             : 
    2056             :         if (!node_state(cpu_to_node(cpu), N_CPU)) {
    2057             :                 node_set_state(cpu_to_node(cpu), N_CPU);
    2058             :         }
    2059             : 
    2060             :         return 0;
    2061             : }
    2062             : 
    2063             : static int vmstat_cpu_down_prep(unsigned int cpu)
    2064             : {
    2065             :         cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
    2066             :         return 0;
    2067             : }
    2068             : 
    2069             : static int vmstat_cpu_dead(unsigned int cpu)
    2070             : {
    2071             :         const struct cpumask *node_cpus;
    2072             :         int node;
    2073             : 
    2074             :         node = cpu_to_node(cpu);
    2075             : 
    2076             :         refresh_zone_stat_thresholds();
    2077             :         node_cpus = cpumask_of_node(node);
    2078             :         if (!cpumask_empty(node_cpus))
    2079             :                 return 0;
    2080             : 
    2081             :         node_clear_state(node, N_CPU);
    2082             : 
    2083             :         return 0;
    2084             : }
    2085             : 
    2086             : #endif
    2087             : 
    2088             : struct workqueue_struct *mm_percpu_wq;
    2089             : 
    2090           1 : void __init init_mm_internals(void)
    2091             : {
    2092             :         int ret __maybe_unused;
    2093             : 
    2094           1 :         mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
    2095             : 
    2096             : #ifdef CONFIG_SMP
    2097             :         ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
    2098             :                                         NULL, vmstat_cpu_dead);
    2099             :         if (ret < 0)
    2100             :                 pr_err("vmstat: failed to register 'dead' hotplug state\n");
    2101             : 
    2102             :         ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
    2103             :                                         vmstat_cpu_online,
    2104             :                                         vmstat_cpu_down_prep);
    2105             :         if (ret < 0)
    2106             :                 pr_err("vmstat: failed to register 'online' hotplug state\n");
    2107             : 
    2108             :         cpus_read_lock();
    2109             :         init_cpu_node_state();
    2110             :         cpus_read_unlock();
    2111             : 
    2112             :         start_shepherd_timer();
    2113             : #endif
    2114             : #ifdef CONFIG_PROC_FS
    2115           1 :         proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
    2116           1 :         proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
    2117           1 :         proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
    2118           1 :         proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
    2119             : #endif
    2120           1 : }
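Right above, init_mm_internals() hooks vmstat into the CPU hotplug state machine: a "dead" state whose callback runs after a CPU has gone away, and a dynamically allocated "online" state with paired bring-up/tear-down callbacks. A small, hypothetical sketch of that registration pattern follows; the demo_* names and the "demo:online" string are illustrative only.

        #include <linux/module.h>
        #include <linux/cpuhotplug.h>
        #include <linux/printk.h>

        static int demo_cpu_online(unsigned int cpu)
        {
                pr_info("demo: cpu %u coming up\n", cpu);
                return 0;
        }

        static int demo_cpu_down_prep(unsigned int cpu)
        {
                pr_info("demo: cpu %u going down\n", cpu);
                return 0;
        }

        static int __init demo_hotplug_init(void)
        {
                int ret;

                /*
                 * CPUHP_AP_ONLINE_DYN hands out a dynamic state slot; the
                 * return value is that slot number on success or a negative
                 * errno on failure.
                 */
                ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "demo:online",
                                                demo_cpu_online, demo_cpu_down_prep);
                if (ret < 0)
                        pr_err("demo: failed to register hotplug state\n");

                return ret < 0 ? ret : 0;
        }

        module_init(demo_hotplug_init);
        MODULE_LICENSE("GPL");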
    2121             : 
    2122             : #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
    2123             : 
    2124             : /*
    2125             :  * Return an index indicating how much of the available free memory is
    2126             :  * unusable for an allocation of the requested size.
    2127             :  */
    2128             : static int unusable_free_index(unsigned int order,
    2129             :                                 struct contig_page_info *info)
    2130             : {
    2131             :         /* No free memory is interpreted as all free memory is unusable */
    2132             :         if (info->free_pages == 0)
    2133             :                 return 1000;
    2134             : 
    2135             :         /*
    2136             :          * Index should be a value between 0 and 1. Return a value to 3
    2137             :          * decimal places.
    2138             :          *
    2139             :          * 0 => no fragmentation
    2140             :          * 1 => high fragmentation
    2141             :          */
    2142             :         return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
    2143             : 
    2144             : }
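The return statement above computes the unusable fraction, (free_pages - (free_blocks_suitable << order)) / free_pages, scaled by 1000 so callers get three implied decimal places. Here is a self-contained userspace sketch of the same arithmetic with made-up numbers (1000 free pages, 200 free order-2 blocks; none of these values come from a real zone):

        #include <stdio.h>

        int main(void)
        {
                unsigned long long free_pages = 1000;      /* hypothetical total free pages */
                unsigned long long suitable_blocks = 200;  /* hypothetical free order-2 blocks */
                unsigned int order = 2;

                /* (free - usable) / free, scaled by 1000 as in unusable_free_index() */
                unsigned long long index =
                        (free_pages - (suitable_blocks << order)) * 1000ULL / free_pages;

                /* Prints "0.200": 20% of the free memory cannot back an order-2 request. */
                printf("%llu.%03llu\n", index / 1000, index % 1000);
                return 0;
        }

With these numbers the 200 suitable blocks cover 800 of the 1000 free pages, so one fifth of the free memory is unusable for order-2 allocations, matching the "0.200" that unusable_show_print() below would emit for an index of 200.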
    2145             : 
    2146             : static void unusable_show_print(struct seq_file *m,
    2147             :                                         pg_data_t *pgdat, struct zone *zone)
    2148             : {
    2149             :         unsigned int order;
    2150             :         int index;
    2151             :         struct contig_page_info info;
    2152             : 
    2153             :         seq_printf(m, "Node %d, zone %8s ",
    2154             :                                 pgdat->node_id,
    2155             :                                 zone->name);
    2156             :         for (order = 0; order < MAX_ORDER; ++order) {
    2157             :                 fill_contig_page_info(zone, order, &info);
    2158             :                 index = unusable_free_index(order, &info);
    2159             :                 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
    2160             :         }
    2161             : 
    2162             :         seq_putc(m, '\n');
    2163             : }
    2164             : 
    2165             : /*
    2166             :  * Display unusable free space index
    2167             :  *
    2168             :  * The unusable free space index measures how much of the available free
    2169             :  * memory cannot be used to satisfy an allocation of a given size and is a
    2170             :  * value between 0 and 1. The higher the value, the more of the free memory is
    2171             :  * unusable and, by implication, the worse the external fragmentation is. This
    2172             :  * can be expressed as a percentage by multiplying by 100.
    2173             :  */
    2174             : static int unusable_show(struct seq_file *m, void *arg)
    2175             : {
    2176             :         pg_data_t *pgdat = (pg_data_t *)arg;
    2177             : 
    2178             :         /* check memoryless node */
    2179             :         if (!node_state(pgdat->node_id, N_MEMORY))
    2180             :                 return 0;
    2181             : 
    2182             :         walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
    2183             : 
    2184             :         return 0;
    2185             : }
    2186             : 
    2187             : static const struct seq_operations unusable_sops = {
    2188             :         .start  = frag_start,
    2189             :         .next   = frag_next,
    2190             :         .stop   = frag_stop,
    2191             :         .show   = unusable_show,
    2192             : };
    2193             : 
    2194             : DEFINE_SEQ_ATTRIBUTE(unusable);
    2195             : 
    2196             : static void extfrag_show_print(struct seq_file *m,
    2197             :                                         pg_data_t *pgdat, struct zone *zone)
    2198             : {
    2199             :         unsigned int order;
    2200             :         int index;
    2201             : 
    2202             :         /* Alloc on stack as interrupts are disabled for zone walk */
    2203             :         struct contig_page_info info;
    2204             : 
    2205             :         seq_printf(m, "Node %d, zone %8s ",
    2206             :                                 pgdat->node_id,
    2207             :                                 zone->name);
    2208             :         for (order = 0; order < MAX_ORDER; ++order) {
    2209             :                 fill_contig_page_info(zone, order, &info);
    2210             :                 index = __fragmentation_index(order, &info);
    2211             :                 seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
    2212             :         }
    2213             : 
    2214             :         seq_putc(m, '\n');
    2215             : }
    2216             : 
    2217             : /*
    2218             :  * Display the fragmentation index for orders at which allocations would fail
    2219             :  */
    2220             : static int extfrag_show(struct seq_file *m, void *arg)
    2221             : {
    2222             :         pg_data_t *pgdat = (pg_data_t *)arg;
    2223             : 
    2224             :         walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
    2225             : 
    2226             :         return 0;
    2227             : }
    2228             : 
    2229             : static const struct seq_operations extfrag_sops = {
    2230             :         .start  = frag_start,
    2231             :         .next   = frag_next,
    2232             :         .stop   = frag_stop,
    2233             :         .show   = extfrag_show,
    2234             : };
    2235             : 
    2236             : DEFINE_SEQ_ATTRIBUTE(extfrag);
    2237             : 
    2238             : static int __init extfrag_debug_init(void)
    2239             : {
    2240             :         struct dentry *extfrag_debug_root;
    2241             : 
    2242             :         extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
    2243             : 
    2244             :         debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
    2245             :                             &unusable_fops);
    2246             : 
    2247             :         debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
    2248             :                             &extfrag_fops);
    2249             : 
    2250             :         return 0;
    2251             : }
    2252             : 
    2253             : module_init(extfrag_debug_init);
    2254             : #endif
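Both debugfs files follow the standard seq_file recipe visible above: a seq_operations table, DEFINE_SEQ_ATTRIBUTE() to generate the matching *_fops, and debugfs_create_file() to expose it. The following stripped-down, hypothetical module shows the same wiring for a trivial single-record file; every demo_* name and the "seq_demo" directory are placeholders, not part of vmstat.c.

        #include <linux/module.h>
        #include <linux/debugfs.h>
        #include <linux/seq_file.h>

        /* A one-record iterator: position 0 is the only item. */
        static void *demo_start(struct seq_file *m, loff_t *pos)
        {
                return *pos == 0 ? SEQ_START_TOKEN : NULL;
        }

        static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
        {
                (*pos)++;
                return NULL;
        }

        static void demo_stop(struct seq_file *m, void *v)
        {
        }

        static int demo_show(struct seq_file *m, void *v)
        {
                seq_puts(m, "hello from debugfs\n");
                return 0;
        }

        static const struct seq_operations demo_sops = {
                .start  = demo_start,
                .next   = demo_next,
                .stop   = demo_stop,
                .show   = demo_show,
        };

        /* Generates demo_open() and demo_fops from demo_sops. */
        DEFINE_SEQ_ATTRIBUTE(demo);

        static struct dentry *demo_dir;

        static int __init demo_init(void)
        {
                demo_dir = debugfs_create_dir("seq_demo", NULL);
                debugfs_create_file("demo", 0444, demo_dir, NULL, &demo_fops);
                return 0;
        }

        static void __exit demo_exit(void)
        {
                debugfs_remove_recursive(demo_dir);
        }

        module_init(demo_init);
        module_exit(demo_exit);
        MODULE_LICENSE("GPL");

Reading /sys/kernel/debug/seq_demo/demo would then return the single line produced by demo_show(), much as reading extfrag_index or unusable_index walks the zones through frag_start()/frag_next().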

Generated by: LCOV version 1.14