LCOV - coverage.info - kernel/sched/cputime.c

LCOV - code coverage report

Current view:	top level - kernel/sched - cputime.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	17	107	15.9 %
Date:	2023-08-24 13:40:31	Functions:	1	12	8.3 %

          Line data    Source code

       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  * Simple CPU accounting cgroup controller
       4             :  */
       5             : 
       6             : #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
       7             :  #include <asm/cputime.h>
       8             : #endif
       9             : 
      10             : #ifdef CONFIG_IRQ_TIME_ACCOUNTING
      11             : 
      12             : /*
      13             :  * There are no locks covering percpu hardirq/softirq time.
      14             :  * They are only modified in vtime_account, on corresponding CPU
      15             :  * with interrupts disabled. So, writes are safe.
      16             :  * They are read and saved off onto struct rq in update_rq_clock().
      17             :  * This may result in other CPU reading this CPU's irq time and can
      18             :  * race with irq/vtime_account on this CPU. We would either get old
      19             :  * or new value with a side effect of accounting a slice of irq time to wrong
      20             :  * task when irq is in progress while we read rq->clock. That is a worthy
      21             :  * compromise in place of having locks on each irq in account_system_time.
      22             :  */
      23             : DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
      24             : 
      25             : static int sched_clock_irqtime;
      26             : 
      27             : void enable_sched_clock_irqtime(void)
      28             : {
      29             :         sched_clock_irqtime = 1;
      30             : }
      31             : 
      32             : void disable_sched_clock_irqtime(void)
      33             : {
      34             :         sched_clock_irqtime = 0;
      35             : }
      36             : 
      37             : static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
      38             :                                   enum cpu_usage_stat idx)
      39             : {
      40             :         u64 *cpustat = kcpustat_this_cpu->cpustat;
      41             : 
      42             :         u64_stats_update_begin(&irqtime->sync);
      43             :         cpustat[idx] += delta;
      44             :         irqtime->total += delta;
      45             :         irqtime->tick_delta += delta;
      46             :         u64_stats_update_end(&irqtime->sync);
      47             : }
      48             : 
      49             : /*
      50             :  * Called after incrementing preempt_count on {soft,}irq_enter
      51             :  * and before decrementing preempt_count on {soft,}irq_exit.
      52             :  */
      53             : void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
      54             : {
      55             :         struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
      56             :         unsigned int pc;
      57             :         s64 delta;
      58             :         int cpu;
      59             : 
      60             :         if (!sched_clock_irqtime)
      61             :                 return;
      62             : 
      63             :         cpu = smp_processor_id();
      64             :         delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
      65             :         irqtime->irq_start_time += delta;
      66             :         pc = irq_count() - offset;
      67             : 
      68             :         /*
      69             :          * We do not account for softirq time from ksoftirqd here.
      70             :          * We want to continue accounting softirq time to ksoftirqd thread
      71             :          * in that case, so as not to confuse scheduler with a special task
      72             :          * that does not consume any time, but still wants to run.
      73             :          */
      74             :         if (pc & HARDIRQ_MASK)
      75             :                 irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
      76             :         else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
      77             :                 irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
      78             : }
      79             : 
      80             : static u64 irqtime_tick_accounted(u64 maxtime)
      81             : {
      82             :         struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
      83             :         u64 delta;
      84             : 
      85             :         delta = min(irqtime->tick_delta, maxtime);
      86             :         irqtime->tick_delta -= delta;
      87             : 
      88             :         return delta;
      89             : }
      90             : 
      91             : #else /* CONFIG_IRQ_TIME_ACCOUNTING */
      92             : 
      93             : #define sched_clock_irqtime     (0)
      94             : 
      95             : static u64 irqtime_tick_accounted(u64 dummy)
      96             : {
      97             :         return 0;
      98             : }
      99             : 
     100             : #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
     101             : 
     102             : static inline void task_group_account_field(struct task_struct *p, int index,
     103             :                                             u64 tmp)
     104             : {
     105             :         /*
     106             :          * Since all updates are sure to touch the root cgroup, we
     107             :          * get ourselves ahead and touch it first. If the root cgroup
     108             :          * is the only cgroup, then nothing else should be necessary.
     109             :          *
     110             :          */
     111           4 :         __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
     112             : 
     113           4 :         cgroup_account_cputime_field(p, index, tmp);
     114             : }
     115             : 
     116             : /*
     117             :  * Account user CPU time to a process.
     118             :  * @p: the process that the CPU time gets accounted to
     119             :  * @cputime: the CPU time spent in user space since the last update
     120             :  */
     121           0 : void account_user_time(struct task_struct *p, u64 cputime)
     122             : {
     123             :         int index;
     124             : 
     125             :         /* Add user time to process. */
     126           4 :         p->utime += cputime;
     127           4 :         account_group_user_time(p, cputime);
     128             : 
     129           8 :         index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
     130             : 
     131             :         /* Add user time to cpustat. */
     132           8 :         task_group_account_field(p, index, cputime);
     133             : 
     134             :         /* Account for user time used */
     135           4 :         acct_account_cputime(p);
     136           0 : }
     137             : 
     138             : /*
     139             :  * Account guest CPU time to a process.
     140             :  * @p: the process that the CPU time gets accounted to
     141             :  * @cputime: the CPU time spent in virtual machine since the last update
     142             :  */
     143           0 : void account_guest_time(struct task_struct *p, u64 cputime)
     144             : {
     145           0 :         u64 *cpustat = kcpustat_this_cpu->cpustat;
     146             : 
     147             :         /* Add guest time to process. */
     148           0 :         p->utime += cputime;
     149           0 :         account_group_user_time(p, cputime);
     150           0 :         p->gtime += cputime;
     151             : 
     152             :         /* Add guest time to cpustat. */
     153           0 :         if (task_nice(p) > 0) {
     154           0 :                 task_group_account_field(p, CPUTIME_NICE, cputime);
     155           0 :                 cpustat[CPUTIME_GUEST_NICE] += cputime;
     156             :         } else {
     157           0 :                 task_group_account_field(p, CPUTIME_USER, cputime);
     158           0 :                 cpustat[CPUTIME_GUEST] += cputime;
     159             :         }
     160           0 : }
     161             : 
     162             : /*
     163             :  * Account system CPU time to a process and desired cpustat field
     164             :  * @p: the process that the CPU time gets accounted to
     165             :  * @cputime: the CPU time spent in kernel space since the last update
     166             :  * @index: index of the cpustat field that has to be updated
     167             :  */
     168           0 : void account_system_index_time(struct task_struct *p,
     169             :                                u64 cputime, enum cpu_usage_stat index)
     170             : {
     171             :         /* Add system time to process. */
     172           0 :         p->stime += cputime;
     173           0 :         account_group_system_time(p, cputime);
     174             : 
     175             :         /* Add system time to cpustat. */
     176           0 :         task_group_account_field(p, index, cputime);
     177             : 
     178             :         /* Account for system time used */
     179           0 :         acct_account_cputime(p);
     180           0 : }
     181             : 
     182             : /*
     183             :  * Account system CPU time to a process.
     184             :  * @p: the process that the CPU time gets accounted to
     185             :  * @hardirq_offset: the offset to subtract from hardirq_count()
     186             :  * @cputime: the CPU time spent in kernel space since the last update
     187             :  */
     188           0 : void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
     189             : {
     190             :         int index;
     191             : 
     192           0 :         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
     193           0 :                 account_guest_time(p, cputime);
     194           0 :                 return;
     195             :         }
     196             : 
     197           0 :         if (hardirq_count() - hardirq_offset)
     198             :                 index = CPUTIME_IRQ;
     199           0 :         else if (in_serving_softirq())
     200             :                 index = CPUTIME_SOFTIRQ;
     201             :         else
     202           0 :                 index = CPUTIME_SYSTEM;
     203             : 
     204           0 :         account_system_index_time(p, cputime, index);
     205             : }
     206             : 
     207             : /*
     208             :  * Account for involuntary wait time.
     209             :  * @cputime: the CPU time spent in involuntary wait
     210             :  */
     211           0 : void account_steal_time(u64 cputime)
     212             : {
     213           0 :         u64 *cpustat = kcpustat_this_cpu->cpustat;
     214             : 
     215           0 :         cpustat[CPUTIME_STEAL] += cputime;
     216           0 : }
     217             : 
     218             : /*
     219             :  * Account for idle time.
     220             :  * @cputime: the CPU time spent in idle wait
     221             :  */
     222           0 : void account_idle_time(u64 cputime)
     223             : {
     224           1 :         u64 *cpustat = kcpustat_this_cpu->cpustat;
     225           1 :         struct rq *rq = this_rq();
     226             : 
     227           2 :         if (atomic_read(&rq->nr_iowait) > 0)
     228           0 :                 cpustat[CPUTIME_IOWAIT] += cputime;
     229             :         else
     230           1 :                 cpustat[CPUTIME_IDLE] += cputime;
     231           0 : }
     232             : 
     233             : 
     234             : #ifdef CONFIG_SCHED_CORE
     235             : /*
     236             :  * Account for forceidle time due to core scheduling.
     237             :  *
     238             :  * REQUIRES: schedstat is enabled.
     239             :  */
     240             : void __account_forceidle_time(struct task_struct *p, u64 delta)
     241             : {
     242             :         __schedstat_add(p->stats.core_forceidle_sum, delta);
     243             : 
     244             :         task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
     245             : }
     246             : #endif
     247             : 
     248             : /*
     249             :  * When a guest is interrupted for a longer amount of time, missed clock
     250             :  * ticks are not redelivered later. Due to that, this function may on
     251             :  * occasion account more time than the calling functions think elapsed.
     252             :  */
     253             : static __always_inline u64 steal_account_process_time(u64 maxtime)
     254             : {
     255             : #ifdef CONFIG_PARAVIRT
     256             :         if (static_key_false(&paravirt_steal_enabled)) {
     257             :                 u64 steal;
     258             : 
     259             :                 steal = paravirt_steal_clock(smp_processor_id());
     260             :                 steal -= this_rq()->prev_steal_time;
     261             :                 steal = min(steal, maxtime);
     262             :                 account_steal_time(steal);
     263             :                 this_rq()->prev_steal_time += steal;
     264             : 
     265             :                 return steal;
     266             :         }
     267             : #endif
     268             :         return 0;
     269             : }
     270             : 
     271             : /*
     272             :  * Account how much elapsed time was spent in steal, irq, or softirq time.
     273             :  */
     274             : static inline u64 account_other_time(u64 max)
     275             : {
     276             :         u64 accounted;
     277             : 
     278             :         lockdep_assert_irqs_disabled();
     279             : 
     280             :         accounted = steal_account_process_time(max);
     281             : 
     282             :         if (accounted < max)
     283             :                 accounted += irqtime_tick_accounted(max - accounted);
     284             : 
     285             :         return accounted;
     286             : }
     287             : 
     288             : #ifdef CONFIG_64BIT
     289             : static inline u64 read_sum_exec_runtime(struct task_struct *t)
     290             : {
     291             :         return t->se.sum_exec_runtime;
     292             : }
     293             : #else
     294             : static u64 read_sum_exec_runtime(struct task_struct *t)
     295             : {
     296             :         u64 ns;
     297             :         struct rq_flags rf;
     298             :         struct rq *rq;
     299             : 
     300             :         rq = task_rq_lock(t, &rf);
     301             :         ns = t->se.sum_exec_runtime;
     302             :         task_rq_unlock(rq, t, &rf);
     303             : 
     304             :         return ns;
     305             : }
     306             : #endif
     307             : 
     308             : /*
     309             :  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
     310             :  * tasks (sum on group iteration) belonging to @tsk's group.
     311             :  */
     312           0 : void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
     313             : {
     314           0 :         struct signal_struct *sig = tsk->signal;
     315             :         u64 utime, stime;
     316             :         struct task_struct *t;
     317             :         unsigned int seq, nextseq;
     318             :         unsigned long flags;
     319             : 
     320             :         /*
     321             :          * Update current task runtime to account pending time since last
     322             :          * scheduler action or thread_group_cputime() call. This thread group
     323             :          * might have other running tasks on different CPUs, but updating
     324             :          * their runtime can affect syscall performance, so we skip accounting
     325             :          * those pending times and rely only on values updated on tick or
     326             :          * other scheduler action.
     327             :          */
     328           0 :         if (same_thread_group(current, tsk))
     329           0 :                 (void) task_sched_runtime(current);
     330             : 
     331             :         rcu_read_lock();
     332             :         /* Attempt a lockless read on the first round. */
     333           0 :         nextseq = 0;
     334             :         do {
     335           0 :                 seq = nextseq;
     336           0 :                 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
     337           0 :                 times->utime = sig->utime;
     338           0 :                 times->stime = sig->stime;
     339           0 :                 times->sum_exec_runtime = sig->sum_sched_runtime;
     340             : 
     341           0 :                 for_each_thread(tsk, t) {
     342           0 :                         task_cputime(t, &utime, &stime);
     343           0 :                         times->utime += utime;
     344           0 :                         times->stime += stime;
     345           0 :                         times->sum_exec_runtime += read_sum_exec_runtime(t);
     346             :                 }
     347             :                 /* If lockless access failed, take the lock. */
     348           0 :                 nextseq = 1;
     349           0 :         } while (need_seqretry(&sig->stats_lock, seq));
     350           0 :         done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
     351             :         rcu_read_unlock();
     352           0 : }
     353             : 
     354             : #ifdef CONFIG_IRQ_TIME_ACCOUNTING
     355             : /*
     356             :  * Account a tick to a process and cpustat
     357             :  * @p: the process that the CPU time gets accounted to
     358             :  * @user_tick: is the tick from userspace
     359             :  * @ticks: the number of ticks to account
     360             :  *
     361             :  * Tick demultiplexing follows the order
     362             :  * - pending hardirq update
     363             :  * - pending softirq update
     364             :  * - user_time
     365             :  * - idle_time
     366             :  * - system time
     367             :  *   - check for guest_time
     368             :  *   - else account as system_time
     369             :  *
     370             :  * Check for hardirq is done both for system and user time as there is
     371             :  * no timer going off while we are on hardirq and hence we may never get an
     372             :  * opportunity to update it solely in system time.
     373             :  * p->stime and friends are only updated on system time and not on irq or
     374             :  * softirq time, as those do not count in task exec_runtime any more.
     375             :  */
     376             : static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
     377             :                                          int ticks)
     378             : {
     379             :         u64 other, cputime = TICK_NSEC * ticks;
     380             : 
     381             :         /*
     382             :          * When returning from idle, many ticks can get accounted at
     383             :          * once, including some ticks of steal, irq, and softirq time.
     384             :          * Subtract those ticks from the amount of time accounted to
     385             :          * idle, or potentially user or system time. Due to rounding,
     386             :          * other time can exceed ticks occasionally.
     387             :          */
     388             :         other = account_other_time(ULONG_MAX);
     389             :         if (other >= cputime)
     390             :                 return;
     391             : 
     392             :         cputime -= other;
     393             : 
     394             :         if (this_cpu_ksoftirqd() == p) {
     395             :                 /*
     396             :                  * ksoftirqd time does not get accounted in cpu_softirq_time.
     397             :                  * So, we have to handle it separately here.
     398             :                  * Also, p->stime needs to be updated for ksoftirqd.
     399             :                  */
     400             :                 account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
     401             :         } else if (user_tick) {
     402             :                 account_user_time(p, cputime);
     403             :         } else if (p == this_rq()->idle) {
     404             :                 account_idle_time(cputime);
     405             :         } else if (p->flags & PF_VCPU) { /* System time or guest time */
     406             :                 account_guest_time(p, cputime);
     407             :         } else {
     408             :                 account_system_index_time(p, cputime, CPUTIME_SYSTEM);
     409             :         }
     410             : }
     411             : 
     412             : static void irqtime_account_idle_ticks(int ticks)
     413             : {
     414             :         irqtime_account_process_tick(current, 0, ticks);
     415             : }
     416             : #else /* CONFIG_IRQ_TIME_ACCOUNTING */
     417             : static inline void irqtime_account_idle_ticks(int ticks) { }
     418             : static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
     419             :                                                 int nr_ticks) { }
     420             : #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
     421             : 
     422             : /*
     423             :  * Use precise platform statistics if available:
     424             :  */
     425             : #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
     426             : 
     427             : # ifndef __ARCH_HAS_VTIME_TASK_SWITCH
     428             : void vtime_task_switch(struct task_struct *prev)
     429             : {
     430             :         if (is_idle_task(prev))
     431             :                 vtime_account_idle(prev);
     432             :         else
     433             :                 vtime_account_kernel(prev);
     434             : 
     435             :         vtime_flush(prev);
     436             :         arch_vtime_task_switch(prev);
     437             : }
     438             : # endif
     439             : 
     440             : void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
     441             : {
     442             :         unsigned int pc = irq_count() - offset;
     443             : 
     444             :         if (pc & HARDIRQ_OFFSET) {
     445             :                 vtime_account_hardirq(tsk);
     446             :         } else if (pc & SOFTIRQ_OFFSET) {
     447             :                 vtime_account_softirq(tsk);
     448             :         } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
     449             :                    is_idle_task(tsk)) {
     450             :                 vtime_account_idle(tsk);
     451             :         } else {
     452             :                 vtime_account_kernel(tsk);
     453             :         }
     454             : }
     455             : 
     456             : void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
     457             :                     u64 *ut, u64 *st)
     458             : {
     459             :         *ut = curr->utime;
     460             :         *st = curr->stime;
     461             : }
     462             : 
     463             : void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     464             : {
     465             :         *ut = p->utime;
     466             :         *st = p->stime;
     467             : }
     468             : EXPORT_SYMBOL_GPL(task_cputime_adjusted);
     469             : 
     470             : void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     471             : {
     472             :         struct task_cputime cputime;
     473             : 
     474             :         thread_group_cputime(p, &cputime);
     475             : 
     476             :         *ut = cputime.utime;
     477             :         *st = cputime.stime;
     478             : }
     479             : 
     480             : #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
     481             : 
     482             : /*
     483             :  * Account a single tick of CPU time.
     484             :  * @p: the process that the CPU time gets accounted to
     485             :  * @user_tick: indicates if the tick is a user or a system tick
     486             :  */
     487           5 : void account_process_tick(struct task_struct *p, int user_tick)
     488             : {
     489             :         u64 cputime, steal;
     490             : 
     491             :         if (vtime_accounting_enabled_this_cpu())
     492             :                 return;
     493             : 
     494             :         if (sched_clock_irqtime) {
     495             :                 irqtime_account_process_tick(p, user_tick, 1);
     496             :                 return;
     497             :         }
     498             : 
     499           5 :         cputime = TICK_NSEC;
     500           5 :         steal = steal_account_process_time(ULONG_MAX);
     501             : 
     502             :         if (steal >= cputime)
     503             :                 return;
     504             : 
     505           5 :         cputime -= steal;
     506             : 
     507           5 :         if (user_tick)
     508             :                 account_user_time(p, cputime);
     509           2 :         else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
     510           0 :                 account_system_time(p, HARDIRQ_OFFSET, cputime);
     511             :         else
     512             :                 account_idle_time(cputime);
     513             : }
     514             : 
     515             : /*
     516             :  * Account multiple ticks of idle time.
     517             :  * @ticks: number of idle ticks to account
     518             :  */
     519           0 : void account_idle_ticks(unsigned long ticks)
     520             : {
     521             :         u64 cputime, steal;
     522             : 
     523             :         if (sched_clock_irqtime) {
     524             :                 irqtime_account_idle_ticks(ticks);
     525             :                 return;
     526             :         }
     527             : 
     528           0 :         cputime = ticks * TICK_NSEC;
     529           0 :         steal = steal_account_process_time(ULONG_MAX);
     530             : 
     531           0 :         if (steal >= cputime)
     532             :                 return;
     533             : 
     534           0 :         cputime -= steal;
     535             :         account_idle_time(cputime);
     536             : }
     537             : 
     538             : /*
     539             :  * Adjust tick based cputime random precision against scheduler runtime
     540             :  * accounting.
     541             :  *
     542             :  * Tick based cputime accounting depends on random scheduling timeslices of a
     543             :  * task to be interrupted or not by the timer.  Depending on these
     544             :  * circumstances, the number of these interrupts may be over or
     545             :  * under-optimistic, matching the real user and system cputime with a variable
     546             :  * precision.
     547             :  *
     548             :  * Fix this by scaling these tick based values against the total runtime
     549             :  * accounted by the CFS scheduler.
     550             :  *
     551             :  * This code provides the following guarantees:
     552             :  *
     553             :  *   stime + utime == rtime
     554             :  *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
     555             :  *
     556             :  * Assuming that rtime_i+1 >= rtime_i.
     557             :  */
     558           0 : void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
     559             :                     u64 *ut, u64 *st)
     560             : {
     561             :         u64 rtime, stime, utime;
     562             :         unsigned long flags;
     563             : 
     564             :         /* Serialize concurrent callers such that we can honour our guarantees */
     565           0 :         raw_spin_lock_irqsave(&prev->lock, flags);
     566           0 :         rtime = curr->sum_exec_runtime;
     567             : 
     568             :         /*
     569             :          * This is possible under two circumstances:
     570             :          *  - rtime isn't monotonic after all (a bug);
     571             :          *  - we got reordered by the lock.
     572             :          *
     573             :          * In both cases this acts as a filter such that the rest of the code
     574             :          * can assume it is monotonic regardless of anything else.
     575             :          */
     576           0 :         if (prev->stime + prev->utime >= rtime)
     577             :                 goto out;
     578             : 
     579           0 :         stime = curr->stime;
     580           0 :         utime = curr->utime;
     581             : 
     582             :         /*
     583             :          * If either stime or utime is 0, assume all runtime belongs to the other.
     584             :          * Once a task gets some ticks, the monotonicity code at 'update:'
     585             :          * will ensure things converge to the observed ratio.
     586             :          */
     587           0 :         if (stime == 0) {
     588             :                 utime = rtime;
     589             :                 goto update;
     590             :         }
     591             : 
     592           0 :         if (utime == 0) {
     593             :                 stime = rtime;
     594             :                 goto update;
     595             :         }
     596             : 
     597           0 :         stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
     598             : 
     599             : update:
     600             :         /*
     601             :          * Make sure stime doesn't go backwards; this preserves monotonicity
     602             :          * for utime because rtime is monotonic.
     603             :          *
     604             :          *  utime_i+1 = rtime_i+1 - stime_i
     605             :          *            = rtime_i+1 - (rtime_i - utime_i)
     606             :          *            = (rtime_i+1 - rtime_i) + utime_i
     607             :          *            >= utime_i
     608             :          */
     609           0 :         if (stime < prev->stime)
     610           0 :                 stime = prev->stime;
     611           0 :         utime = rtime - stime;
     612             : 
     613             :         /*
     614             :          * Make sure utime doesn't go backwards; this still preserves
     615             :          * monotonicity for stime, analogous argument to above.
     616             :          */
     617           0 :         if (utime < prev->utime) {
     618           0 :                 utime = prev->utime;
     619           0 :                 stime = rtime - utime;
     620             :         }
     621             : 
     622           0 :         prev->stime = stime;
     623           0 :         prev->utime = utime;
     624             : out:
     625           0 :         *ut = prev->utime;
     626           0 :         *st = prev->stime;
     627           0 :         raw_spin_unlock_irqrestore(&prev->lock, flags);
     628           0 : }
     629             : 
     /*
      * task_cputime_adjusted() - report the user/system time of task @p.
      *
      * Builds a struct task_cputime snapshot: utime/stime are filled in by
      * task_cputime() and the runtime starts out as p->se.sum_exec_runtime.
      * The snapshot is then handed, together with p->prev_cputime, to
      * cputime_adjust(), which stores the results in *@ut and *@st.
      */
     630           0 : void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     631             : {
     632           0 :         struct task_cputime cputime = {
     633           0 :                 .sum_exec_runtime = p->se.sum_exec_runtime,
     634             :         };
     635             : 
     636           0 :         if (task_cputime(p, &cputime.utime, &cputime.stime))
     637             :                 cputime.sum_exec_runtime = task_sched_runtime(p); /* task_cputime() returned true: re-read the runtime */
     638           0 :         cputime_adjust(&cputime, &p->prev_cputime, ut, st);
     639           0 : }
     640             : EXPORT_SYMBOL_GPL(task_cputime_adjusted);
     641             : 
     /*
      * thread_group_cputime_adjusted() - report user/system time for the
      * thread group of @p.
      *
      * Collects the group totals with thread_group_cputime() and passes them,
      * together with p->signal->prev_cputime, to cputime_adjust(), which stores
      * the results in *@ut and *@st.
      */
     642           0 : void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
     643             : {
     644             :         struct task_cputime cputime;
     645             : 
     646           0 :         thread_group_cputime(p, &cputime);
     647           0 :         cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
     648           0 : }
     649             : #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
     650             : 
     651             : #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
     /*
      * vtime_delta() - time elapsed since @vtime->starttime.
      *
      * Returns sched_clock() minus vtime->starttime, or 0 when the clock reads
      * earlier than starttime. Does not modify @vtime.
      */
     652             : static u64 vtime_delta(struct vtime *vtime)
     653             : {
     654             :         unsigned long long clock;
     655             : 
     656             :         clock = sched_clock();
     657             :         if (clock < vtime->starttime) /* guard: the unsigned subtraction below would wrap */
     658             :                 return 0;
     659             : 
     660             :         return clock - vtime->starttime;
     661             : }
     662             : 
     /*
      * get_vtime_delta() - consume the pending delta of @vtime.
      *
      * Reads the elapsed time with vtime_delta(), advances vtime->starttime by
      * that amount and returns the delta minus whatever account_other_time()
      * reported for it. Warns once if called while the state is VTIME_INACTIVE.
      *
      * NOTE(review): the subtraction assumes account_other_time(delta) never
      * returns more than delta (the comment below says it is limited); that
      * helper is not visible here - confirm.
      */
     663             : static u64 get_vtime_delta(struct vtime *vtime)
     664             : {
     665             :         u64 delta = vtime_delta(vtime);
     666             :         u64 other;
     667             : 
     668             :         /*
     669             :          * Unlike tick based timing, vtime based timing never has lost
     670             :          * ticks, and no need for steal time accounting to make up for
     671             :          * lost ticks. Vtime accounts a rounded version of actual
     672             :          * elapsed time. Limit account_other_time to prevent rounding
     673             :          * errors from causing elapsed vtime to go negative.
     674             :          */
     675             :         other = account_other_time(delta);
     676             :         WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
     677             :         vtime->starttime += delta; /* the next vtime_delta() starts after this delta */
     678             : 
     679             :         return delta - other;
     680             : }
     681             : 
     /*
      * vtime_account_system() - account the pending delta as system time.
      *
      * Adds get_vtime_delta() to the vtime->stime accumulator. Once the
      * accumulator reaches TICK_NSEC it is passed to account_system_time()
      * for @tsk and reset to 0.
      */
     682             : static void vtime_account_system(struct task_struct *tsk,
     683             :                                  struct vtime *vtime)
     684             : {
     685             :         vtime->stime += get_vtime_delta(vtime);
     686             :         if (vtime->stime >= TICK_NSEC) {
     687             :                 account_system_time(tsk, irq_count(), vtime->stime);
     688             :                 vtime->stime = 0;
     689             :         }
     690             : }
     691             : 
     /*
      * vtime_account_guest() - account the pending delta as guest time.
      *
      * Adds get_vtime_delta() to the vtime->gtime accumulator. Once the
      * accumulator reaches TICK_NSEC it is passed to account_guest_time()
      * for @tsk and reset to 0.
      */
     692             : static void vtime_account_guest(struct task_struct *tsk,
     693             :                                 struct vtime *vtime)
     694             : {
     695             :         vtime->gtime += get_vtime_delta(vtime);
     696             :         if (vtime->gtime >= TICK_NSEC) {
     697             :                 account_guest_time(tsk, vtime->gtime);
     698             :                 vtime->gtime = 0;
     699             :         }
     700             : }
     701             : 
     /*
      * __vtime_account_kernel() - account pending time as guest or system time.
      *
      * Dispatches on vtime->state: VTIME_GUEST goes to vtime_account_guest(),
      * every other state to vtime_account_system(). It takes no seqcount of
      * its own; both callers in this file call it inside a
      * write_seqcount_begin()/write_seqcount_end() section.
      */
     702             : static void __vtime_account_kernel(struct task_struct *tsk,
     703             :                                    struct vtime *vtime)
     704             : {
     705             :         /* We might have scheduled out from guest path */
     706             :         if (vtime->state == VTIME_GUEST)
     707             :                 vtime_account_guest(tsk, vtime);
     708             :         else
     709             :                 vtime_account_system(tsk, vtime);
     710             : }
     711             : 
     /*
      * vtime_account_kernel() - flush the pending kernel-side time of @tsk.
      *
      * Returns early, without entering the seqcount write section, when
      * vtime_delta() reports that no time has elapsed. Otherwise accounts the
      * pending time through __vtime_account_kernel() inside the write section
      * of tsk->vtime.seqcount.
      */
     712             : void vtime_account_kernel(struct task_struct *tsk)
     713             : {
     714             :         struct vtime *vtime = &tsk->vtime;
     715             : 
     716             :         if (!vtime_delta(vtime))
     717             :                 return;
     718             : 
     719             :         write_seqcount_begin(&vtime->seqcount);
     720             :         __vtime_account_kernel(tsk, vtime);
     721             :         write_seqcount_end(&vtime->seqcount);
     722             : }
     723             : 
     /*
      * vtime_user_enter() - switch the vtime of @tsk to VTIME_USER.
      *
      * Inside the seqcount write section, the time pending so far is accounted
      * as system time and the state is then set to VTIME_USER.
      */
     724             : void vtime_user_enter(struct task_struct *tsk)
     725             : {
     726             :         struct vtime *vtime = &tsk->vtime;
     727             : 
     728             :         write_seqcount_begin(&vtime->seqcount);
     729             :         vtime_account_system(tsk, vtime);
     730             :         vtime->state = VTIME_USER;
     731             :         write_seqcount_end(&vtime->seqcount);
     732             : }
     733             : 
     /*
      * vtime_user_exit() - switch the vtime of @tsk from user to VTIME_SYS.
      *
      * Inside the seqcount write section, the pending delta is added to the
      * vtime->utime accumulator. Once the accumulator reaches TICK_NSEC it is
      * passed to account_user_time() and reset to 0. The state is then set to
      * VTIME_SYS.
      */
     734             : void vtime_user_exit(struct task_struct *tsk)
     735             : {
     736             :         struct vtime *vtime = &tsk->vtime;
     737             : 
     738             :         write_seqcount_begin(&vtime->seqcount);
     739             :         vtime->utime += get_vtime_delta(vtime);
     740             :         if (vtime->utime >= TICK_NSEC) {
     741             :                 account_user_time(tsk, vtime->utime);
     742             :                 vtime->utime = 0;
     743             :         }
     744             :         vtime->state = VTIME_SYS;
     745             :         write_seqcount_end(&vtime->seqcount);
     746             : }
     747             : 
     /*
      * vtime_guest_enter() - switch the vtime of @tsk to VTIME_GUEST.
      *
      * Inside the seqcount write section, the time pending so far is accounted
      * as system time, PF_VCPU is set in tsk->flags and the state becomes
      * VTIME_GUEST.
      */
     748             : void vtime_guest_enter(struct task_struct *tsk)
     749             : {
     750             :         struct vtime *vtime = &tsk->vtime;
     751             :         /*
     752             :          * The flags must be updated under the lock with
     753             :          * the vtime_starttime flush and update.
     754             :          * That enforces a right ordering and update sequence
     755             :          * synchronization against the reader (task_gtime())
     756             :          * that can thus safely catch up with a tickless delta.
     757             :          */
     758             :         write_seqcount_begin(&vtime->seqcount);
     759             :         vtime_account_system(tsk, vtime);
     760             :         tsk->flags |= PF_VCPU;
     761             :         vtime->state = VTIME_GUEST;
     762             :         write_seqcount_end(&vtime->seqcount);
     763             : }
     764             : EXPORT_SYMBOL_GPL(vtime_guest_enter);
     765             : 
     /*
      * vtime_guest_exit() - switch the vtime of @tsk from guest to VTIME_SYS.
      *
      * Inside the seqcount write section, the pending time is accounted as
      * guest time, PF_VCPU is cleared from tsk->flags and the state becomes
      * VTIME_SYS. This is the counterpart of vtime_guest_enter().
      */
     766             : void vtime_guest_exit(struct task_struct *tsk)
     767             : {
     768             :         struct vtime *vtime = &tsk->vtime;
     769             : 
     770             :         write_seqcount_begin(&vtime->seqcount);
     771             :         vtime_account_guest(tsk, vtime);
     772             :         tsk->flags &= ~PF_VCPU;
     773             :         vtime->state = VTIME_SYS;
     774             :         write_seqcount_end(&vtime->seqcount);
     775             : }
     776             : EXPORT_SYMBOL_GPL(vtime_guest_exit);
     777             : 
     /*
      * vtime_account_idle() - account the pending delta of @tsk as idle time.
      *
      * Passes get_vtime_delta(&tsk->vtime) straight to account_idle_time().
      * It takes no seqcount itself; the caller in this file,
      * vtime_task_switch_generic(), calls it inside the seqcount write section.
      */
     778             : void vtime_account_idle(struct task_struct *tsk)
     779             : {
     780             :         account_idle_time(get_vtime_delta(&tsk->vtime));
     781             : }
     782             : 
     /*
      * vtime_task_switch_generic() - hand vtime accounting from @prev to current.
      *
      * Step 1 (vtime of @prev): flush the pending time as idle time if the
      * state is VTIME_IDLE, otherwise through __vtime_account_kernel(). Then
      * mark the vtime VTIME_INACTIVE with cpu = -1.
      *
      * Step 2 (vtime of current): pick the initial state (VTIME_IDLE for the
      * idle task, VTIME_GUEST if PF_VCPU is set, else VTIME_SYS), restart
      * starttime from sched_clock() and record smp_processor_id().
      *
      * Each step runs in the write section of its own vtime seqcount.
      */
     783             : void vtime_task_switch_generic(struct task_struct *prev)
     784             : {
     785             :         struct vtime *vtime = &prev->vtime;
     786             : 
     787             :         write_seqcount_begin(&vtime->seqcount);
     788             :         if (vtime->state == VTIME_IDLE)
     789             :                 vtime_account_idle(prev);
     790             :         else
     791             :                 __vtime_account_kernel(prev, vtime);
     792             :         vtime->state = VTIME_INACTIVE;
     793             :         vtime->cpu = -1; /* vtime_state_fetch() accepts -1 for any requested cpu */
     794             :         write_seqcount_end(&vtime->seqcount);
     795             : 
     796             :         vtime = &current->vtime; /* from here on: the incoming task */
     797             : 
     798             :         write_seqcount_begin(&vtime->seqcount);
     799             :         if (is_idle_task(current))
     800             :                 vtime->state = VTIME_IDLE;
     801             :         else if (current->flags & PF_VCPU)
     802             :                 vtime->state = VTIME_GUEST;
     803             :         else
     804             :                 vtime->state = VTIME_SYS;
     805             :         vtime->starttime = sched_clock();
     806             :         vtime->cpu = smp_processor_id();
     807             :         write_seqcount_end(&vtime->seqcount);
     808             : }
     809             : 
     /*
      * vtime_init_idle() - initialise the vtime of idle task @t for @cpu.
      *
      * Sets the state to VTIME_IDLE, starttime to the current sched_clock()
      * and cpu to @cpu. The update runs inside the seqcount write section with
      * local interrupts disabled (saved and restored via @flags).
      */
     810             : void vtime_init_idle(struct task_struct *t, int cpu)
     811             : {
     812             :         struct vtime *vtime = &t->vtime;
     813             :         unsigned long flags;
     814             : 
     815             :         local_irq_save(flags);
     816             :         write_seqcount_begin(&vtime->seqcount);
     817             :         vtime->state = VTIME_IDLE;
     818             :         vtime->starttime = sched_clock();
     819             :         vtime->cpu = cpu;
     820             :         write_seqcount_end(&vtime->seqcount);
     821             :         local_irq_restore(flags);
     822             : }
     823             : 
     /*
      * task_gtime() - guest time of task @t, including time not yet flushed.
      *
      * Returns t->gtime directly when vtime accounting is disabled. Otherwise
      * the value is sampled in a seqcount read loop: t->gtime plus, while the
      * state is VTIME_GUEST, the unflushed vtime->gtime accumulator and the
      * delta elapsed since starttime. The loop repeats if a writer interfered.
      */
     824             : u64 task_gtime(struct task_struct *t)
     825             : {
     826             :         struct vtime *vtime = &t->vtime;
     827             :         unsigned int seq;
     828             :         u64 gtime;
     829             : 
     830             :         if (!vtime_accounting_enabled())
     831             :                 return t->gtime;
     832             : 
     833             :         do {
     834             :                 seq = read_seqcount_begin(&vtime->seqcount);
     835             : 
     836             :                 gtime = t->gtime;
     837             :                 if (vtime->state == VTIME_GUEST)
     838             :                         gtime += vtime->gtime + vtime_delta(vtime);
     839             : 
     840             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
     841             : 
     842             :         return gtime;
     843             : }
     844             : 
     845             : /*
     846             :  * Fetch cputime raw values from fields of task_struct and
     847             :  * add up the pending nohz execution time since the last
     848             :  * cputime snapshot.
     849             :  */
     /*
      * Results are stored in *@utime and *@stime.
      *
      * Returns false when vtime accounting is disabled or the sampled vtime
      * state is below VTIME_SYS (nothing pending was added). Returns true when
      * a pending delta was added to *@stime (state VTIME_SYS) or to *@utime
      * (any other state at or above VTIME_SYS).
      */
     850             : bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
     851             : {
     852             :         struct vtime *vtime = &t->vtime;
     853             :         unsigned int seq;
     854             :         u64 delta;
     855             :         int ret;
     856             : 
     857             :         if (!vtime_accounting_enabled()) {
     858             :                 *utime = t->utime;
     859             :                 *stime = t->stime;
     860             :                 return false;
     861             :         }
     862             : 
     863             :         do {
     864             :                 ret = false; /* reset on every retry so a stale 'true' cannot leak out */
     865             :                 seq = read_seqcount_begin(&vtime->seqcount);
     866             : 
     867             :                 *utime = t->utime;
     868             :                 *stime = t->stime;
     869             : 
     870             :                 /* Task is sleeping or idle, nothing to add */
     871             :                 if (vtime->state < VTIME_SYS)
     872             :                         continue; /* jumps to the while() condition, so the snapshot is still validated */
     873             : 
     874             :                 ret = true;
     875             :                 delta = vtime_delta(vtime);
     876             : 
     877             :                 /*
     878             :                  * Task runs either in user (including guest) or kernel space,
     879             :                  * add pending nohz time to the right place.
     880             :                  */
     881             :                 if (vtime->state == VTIME_SYS)
     882             :                         *stime += vtime->stime + delta;
     883             :                 else
     884             :                         *utime += vtime->utime + delta;
     885             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
     886             : 
     887             :         return ret;
     888             : }
     889             : 
     /*
      * vtime_state_fetch() - read a vtime state that is safe to use for @cpu.
      *
      * Returns the state (read once with READ_ONCE()) on success. Returns
      * -EAGAIN when vtime->cpu is neither @cpu nor -1, or when the state is
      * VTIME_INACTIVE; the callers in this file then retry.
      */
     890             : static int vtime_state_fetch(struct vtime *vtime, int cpu)
     891             : {
     892             :         int state = READ_ONCE(vtime->state);
     893             : 
     894             :         /*
     895             :          * We raced against a context switch, fetch the
     896             :          * kcpustat task again.
     897             :          */
     898             :         if (vtime->cpu != cpu && vtime->cpu != -1)
     899             :                 return -EAGAIN;
     900             : 
     901             :         /*
     902             :          * Two possible things here:
     903             :          * 1) We are seeing the scheduling out task (prev) or any past one.
     904             :          * 2) We are seeing the scheduling in task (next) but it hasn't
     905             :          *    passed through vtime_task_switch() yet so the pending
     906             :          *    cputime of the prev task may not be flushed yet.
     907             :          *
     908             :          * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
     909             :          */
     910             :         if (state == VTIME_INACTIVE)
     911             :                 return -EAGAIN;
     912             : 
     913             :         return state;
     914             : }
     915             : 
     /*
      * kcpustat_user_vtime() - pending user-side time held in @vtime.
      *
      * Returns the unflushed utime accumulator plus the elapsed delta in state
      * VTIME_USER, the unflushed gtime accumulator plus the elapsed delta in
      * state VTIME_GUEST, and 0 in any other state.
      */
     916             : static u64 kcpustat_user_vtime(struct vtime *vtime)
     917             : {
     918             :         if (vtime->state == VTIME_USER)
     919             :                 return vtime->utime + vtime_delta(vtime);
     920             :         else if (vtime->state == VTIME_GUEST)
     921             :                 return vtime->gtime + vtime_delta(vtime);
     922             :         return 0;
     923             : }
     924             : 
     /*
      * kcpustat_field_vtime() - one cpustat counter plus the pending vtime of @tsk.
      *
      * Inside a seqcount read loop on tsk->vtime, stores cpustat[@usage] in
      * *@val and adds the part of the pending vtime that belongs to @usage.
      * task_nice(@tsk) decides between the nice and non-nice counters.
      *
      * Returns 0 on success. If vtime_state_fetch() fails, its negative value
      * is returned immediately, without the seqcount retry check.
      */
     925             : static int kcpustat_field_vtime(u64 *cpustat,
     926             :                                 struct task_struct *tsk,
     927             :                                 enum cpu_usage_stat usage,
     928             :                                 int cpu, u64 *val)
     929             : {
     930             :         struct vtime *vtime = &tsk->vtime;
     931             :         unsigned int seq;
     932             : 
     933             :         do {
     934             :                 int state;
     935             : 
     936             :                 seq = read_seqcount_begin(&vtime->seqcount);
     937             : 
     938             :                 state = vtime_state_fetch(vtime, cpu);
     939             :                 if (state < 0)
     940             :                         return state; /* -EAGAIN: kcpustat_field() re-reads rq->curr and retries */
     941             : 
     942             :                 *val = cpustat[usage];
     943             : 
     944             :                 /*
     945             :                  * Nice VS unnice cputime accounting may be inaccurate if
     946             :                  * the nice value has changed since the last vtime update.
     947             :                  * But proper fix would involve interrupting target on nice
     948             :                  * updates which is a no go on nohz_full (although the scheduler
     949             :                  * may still interrupt the target if rescheduling is needed...)
     950             :                  */
     951             :                 switch (usage) {
     952             :                 case CPUTIME_SYSTEM:
     953             :                         if (state == VTIME_SYS)
     954             :                                 *val += vtime->stime + vtime_delta(vtime);
     955             :                         break;
     956             :                 case CPUTIME_USER:
     957             :                         if (task_nice(tsk) <= 0)
     958             :                                 *val += kcpustat_user_vtime(vtime);
     959             :                         break;
     960             :                 case CPUTIME_NICE:
     961             :                         if (task_nice(tsk) > 0)
     962             :                                 *val += kcpustat_user_vtime(vtime);
     963             :                         break;
     964             :                 case CPUTIME_GUEST:
     965             :                         if (state == VTIME_GUEST && task_nice(tsk) <= 0)
     966             :                                 *val += vtime->gtime + vtime_delta(vtime);
     967             :                         break;
     968             :                 case CPUTIME_GUEST_NICE:
     969             :                         if (state == VTIME_GUEST && task_nice(tsk) > 0)
     970             :                                 *val += vtime->gtime + vtime_delta(vtime);
     971             :                         break;
     972             :                 default: /* other counters get no pending vtime added */
     973             :                         break;
     974             :                 }
     975             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
     976             : 
     977             :         return 0;
     978             : }
     979             : 
     /*
      * kcpustat_field() - value of counter @usage in @kcpustat for @cpu.
      *
      * Returns the stored counter as is when vtime accounting is not enabled
      * for @cpu. Otherwise the pending vtime of the task currently running on
      * @cpu (rq->curr, dereferenced under rcu_read_lock()) is added through
      * kcpustat_field_vtime(). The lookup repeats, with cpu_relax() between
      * attempts, until that helper returns 0.
      */
     980             : u64 kcpustat_field(struct kernel_cpustat *kcpustat,
     981             :                    enum cpu_usage_stat usage, int cpu)
     982             : {
     983             :         u64 *cpustat = kcpustat->cpustat;
     984             :         u64 val = cpustat[usage];
     985             :         struct rq *rq;
     986             :         int err;
     987             : 
     988             :         if (!vtime_accounting_enabled_cpu(cpu))
     989             :                 return val;
     990             : 
     991             :         rq = cpu_rq(cpu);
     992             : 
     993             :         for (;;) {
     994             :                 struct task_struct *curr;
     995             : 
     996             :                 rcu_read_lock();
     997             :                 curr = rcu_dereference(rq->curr);
     998             :                 if (WARN_ON_ONCE(!curr)) {
     999             :                         rcu_read_unlock();
    1000             :                         return cpustat[usage]; /* no current task: fall back to the raw counter */
    1001             :                 }
    1002             : 
    1003             :                 err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
    1004             :                 rcu_read_unlock();
    1005             : 
    1006             :                 if (!err)
    1007             :                         return val;
    1008             : 
    1009             :                 cpu_relax();
    1010             :         }
    1011             : }
    1012             : EXPORT_SYMBOL_GPL(kcpustat_field);
    1013             : 
    /*
     * kcpustat_cpu_fetch_vtime() - copy @src to @dst and add the pending vtime
     * of @tsk.
     *
     * Inside a seqcount read loop on tsk->vtime, *@dst is overwritten with
     * *@src and the pending time is added to the counters matching the
     * sampled state. Guest time is added to both the guest counter and the
     * corresponding user/nice counter. task_nice(@tsk) selects the nice
     * variants.
     *
     * Returns 0 on success. If vtime_state_fetch() fails, its negative value
     * is returned immediately, without the seqcount retry check.
     */
    1014             : static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
    1015             :                                     const struct kernel_cpustat *src,
    1016             :                                     struct task_struct *tsk, int cpu)
    1017             : {
    1018             :         struct vtime *vtime = &tsk->vtime;
    1019             :         unsigned int seq;
    1020             : 
    1021             :         do {
    1022             :                 u64 *cpustat;
    1023             :                 u64 delta;
    1024             :                 int state;
    1025             : 
    1026             :                 seq = read_seqcount_begin(&vtime->seqcount);
    1027             : 
    1028             :                 state = vtime_state_fetch(vtime, cpu);
    1029             :                 if (state < 0)
    1030             :                         return state; /* -EAGAIN: kcpustat_cpu_fetch() re-reads rq->curr and retries */
    1031             : 
    1032             :                 *dst = *src;
    1033             :                 cpustat = dst->cpustat;
    1034             : 
    1035             :                 /* Task is sleeping, dead or idle, nothing to add */
    1036             :                 if (state < VTIME_SYS)
    1037             :                         continue; /* jumps to the while() condition, so the copy is still validated */
    1038             : 
    1039             :                 delta = vtime_delta(vtime);
    1040             : 
    1041             :                 /*
    1042             :                  * Task runs either in user (including guest) or kernel space,
    1043             :                  * add pending nohz time to the right place.
    1044             :                  */
    1045             :                 if (state == VTIME_SYS) {
    1046             :                         cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
    1047             :                 } else if (state == VTIME_USER) {
    1048             :                         if (task_nice(tsk) > 0)
    1049             :                                 cpustat[CPUTIME_NICE] += vtime->utime + delta;
    1050             :                         else
    1051             :                                 cpustat[CPUTIME_USER] += vtime->utime + delta;
    1052             :                 } else {
    1053             :                         WARN_ON_ONCE(state != VTIME_GUEST);
    1054             :                         if (task_nice(tsk) > 0) {
    1055             :                                 cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
    1056             :                                 cpustat[CPUTIME_NICE] += vtime->gtime + delta;
    1057             :                         } else {
    1058             :                                 cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
    1059             :                                 cpustat[CPUTIME_USER] += vtime->gtime + delta;
    1060             :                         }
    1061             :                 }
    1062             :         } while (read_seqcount_retry(&vtime->seqcount, seq));
    1063             : 
    1064             :         return 0;
    1065             : }
    1066             : 
    /*
     * kcpustat_cpu_fetch() - snapshot all cpustat counters of @cpu into @dst.
     *
     * Copies kcpustat_cpu(@cpu) as is when vtime accounting is not enabled for
     * @cpu. Otherwise the pending vtime of the task currently running on @cpu
     * (rq->curr, dereferenced under rcu_read_lock()) is added through
     * kcpustat_cpu_fetch_vtime(). The lookup repeats, with cpu_relax() between
     * attempts, until that helper returns 0.
     */
    1067             : void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
    1068             : {
    1069             :         const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
    1070             :         struct rq *rq;
    1071             :         int err;
    1072             : 
    1073             :         if (!vtime_accounting_enabled_cpu(cpu)) {
    1074             :                 *dst = *src;
    1075             :                 return;
    1076             :         }
    1077             : 
    1078             :         rq = cpu_rq(cpu);
    1079             : 
    1080             :         for (;;) {
    1081             :                 struct task_struct *curr;
    1082             : 
    1083             :                 rcu_read_lock();
    1084             :                 curr = rcu_dereference(rq->curr);
    1085             :                 if (WARN_ON_ONCE(!curr)) {
    1086             :                         rcu_read_unlock();
    1087             :                         *dst = *src; /* no current task: fall back to the raw counters */
    1088             :                         return;
    1089             :                 }
    1090             : 
    1091             :                 err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
    1092             :                 rcu_read_unlock();
    1093             : 
    1094             :                 if (!err)
    1095             :                         return;
    1096             : 
    1097             :                 cpu_relax();
    1098             :         }
    1099             : }
    1100             : EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
    1101             : 
    1102             : #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

Generated by: LCOV version 1.14