LCOV - code coverage report
Current view: top level - kernel/sched - clock.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 8 10 80.0 %
Date: 2023-07-19 18:55:55 Functions: 3 4 75.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  * sched_clock() for unstable CPU clocks
       4             :  *
       5             :  *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
       6             :  *
       7             :  *  Updates and enhancements:
       8             :  *    Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
       9             :  *
      10             :  * Based on code by:
      11             :  *   Ingo Molnar <mingo@redhat.com>
      12             :  *   Guillaume Chazarain <guichaz@gmail.com>
      13             :  *
      14             :  *
      15             :  * What this file implements:
      16             :  *
      17             :  * cpu_clock(i) provides a fast (execution time) high resolution
      18             :  * clock with bounded drift between CPUs. The value of cpu_clock(i)
      19             :  * is monotonic for constant i. The timestamp returned is in nanoseconds.
      20             :  *
      21             :  * ######################### BIG FAT WARNING ##########################
      22             :  * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
      23             :  * # go backwards !!                                                  #
      24             :  * ####################################################################
      25             :  *
      26             :  * There is no strict promise about the base, although it tends to start
      27             :  * at 0 on boot (but people really shouldn't rely on that).
      28             :  *
      29             :  * cpu_clock(i)       -- can be used from any context, including NMI.
      30             :  * local_clock()      -- is cpu_clock() on the current CPU.
      31             :  *
      32             :  * sched_clock_cpu(i) -- similar to cpu_clock(i), but requires local IRQs to be disabled.
      33             :  *
      34             :  * How it is implemented:
      35             :  *
      36             :  * The implementation either uses sched_clock() when
      37             :  * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
      38             :  * sched_clock() is assumed to provide these properties (mostly it means
      39             :  * the architecture provides a globally synchronized highres time source).
      40             :  *
      41             :  * Otherwise it tries to create a semi stable clock from a mixture of other
      42             :  * clocks, including:
      43             :  *
      44             :  *  - GTOD (clock monotonic)
      45             :  *  - sched_clock()
      46             :  *  - explicit idle events
      47             :  *
      48             :  * We use GTOD as base and use sched_clock() deltas to improve resolution. The
      49             :  * deltas are filtered to provide monotonicity and to keep the clock within an
      50             :  * expected window.
      51             :  *
      52             :  * Furthermore, explicit sleep and wakeup hooks allow us to account for time
      53             :  * that is otherwise invisible (TSC gets stopped).
      54             :  *
      55             :  */
      56             : 
      57             : /*
      58             :  * Scheduler clock - returns current time in nanosec units.
      59             :  * This is the default implementation.
      60             :  * Architectures and sub-architectures can override this.
      61             :  */
      62        8792 : notrace unsigned long long __weak sched_clock(void)
      63             : {
      64             :         return (unsigned long long)(jiffies - INITIAL_JIFFIES) /* jiffies counted since INITIAL_JIFFIES */
      65        8792 :                                         * (NSEC_PER_SEC / HZ); /* times nanoseconds per jiffy, so the result advances in whole-jiffy steps */
      66             : }
      67             : EXPORT_SYMBOL_GPL(sched_clock);
      68             : 
      69             : static DEFINE_STATIC_KEY_FALSE(sched_clock_running); /* incremented by sched_clock_init() and, in the CONFIG_HAVE_UNSTABLE_SCHED_CLOCK build, again by sched_clock_init_late() */
      70             : 
      71             : #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
      72             : /*
      73             :  * We must start with !__sched_clock_stable because the unstable -> stable
      74             :  * transition is accurate, while the stable -> unstable transition is not.
      75             :  *
      76             :  * Similarly we start with __sched_clock_stable_early, thereby assuming we
      77             :  * will become stable, such that there's only a single 1 -> 0 transition.
      78             :  */
      79             : static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); /* enabled by __set_sched_clock_stable(), disabled by __sched_clock_work() */
      80             : static int __sched_clock_stable_early = 1; /* zeroed by clear_sched_clock_stable(), tested by sched_clock_init_late() */
      81             : 
      82             : /*
      83             :  * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
      84             :  */
      85             : __read_mostly u64 __sched_clock_offset; /* added to sched_clock() on the stable paths; computed in __set_sched_clock_stable() */
      86             : static __read_mostly u64 __gtod_offset; /* added to tick_gtod on the unstable paths; computed in __sched_clock_gtod_offset() */
      87             : 
      88             : struct sched_clock_data { /* per-CPU state used by the unstable-clock filter below */
      89             :         u64                     tick_raw; /* sched_clock() sample taken by __scd_stamp() */
      90             :         u64                     tick_gtod; /* ktime_get_ns() sample taken by __scd_stamp() */
      91             :         u64                     clock; /* filtered clock; updated by cmpxchg in sched_clock_local()/sched_clock_remote(), re-seeded in __sched_clock_work() */
      92             : };
      93             : 
      94             : static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); /* the per-CPU instances, accessed through this_scd() and cpu_sdc() */
      95             : 
      96             : static __always_inline struct sched_clock_data *this_scd(void) /* returns the executing CPU's sched_clock_data */
      97             : {
      98             :         return this_cpu_ptr(&sched_clock_data); /* callers in this file have preemption or IRQs disabled when they call this */
      99             : }
     100             : 
     101             : notrace static inline struct sched_clock_data *cpu_sdc(int cpu) /* returns the sched_clock_data of CPU @cpu (note the spelling "sdc", not "scd") */
     102             : {
     103             :         return &per_cpu(sched_clock_data, cpu);
     104             : }
     105             : 
     106             : notrace int sched_clock_stable(void) /* reports whether the __sched_clock_stable static key is currently enabled */
     107             : {
     108             :         return static_branch_likely(&__sched_clock_stable);
     109             : }
     110             : 
     111             : notrace static void __scd_stamp(struct sched_clock_data *scd) /* records a paired GTOD / raw-clock sample in @scd */
     112             : {
     113             :         scd->tick_gtod = ktime_get_ns(); /* GTOD side of the pair */
     114             :         scd->tick_raw = sched_clock(); /* raw side of the pair; read right after, not atomically with the line above */
     115             : }
     116             : 
     117             : notrace static void __set_sched_clock_stable(void) /* computes __sched_clock_offset from this CPU's last stamp, then switches to the stable path */
     118             : {
     119             :         struct sched_clock_data *scd;
     120             : 
     121             :         /*
     122             :          * Since we're still unstable and the tick is already running, we have
     123             :          * to disable IRQs in order to get a consistent scd->tick* reading.
     124             :          */
     125             :         local_irq_disable();
     126             :         scd = this_scd();
     127             :         /*
     128             :          * Attempt to make the (initial) unstable->stable transition continuous.
     129             :          */
     130             :         __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); /* chosen so that tick_raw + __sched_clock_offset == tick_gtod + __gtod_offset */
     131             :         local_irq_enable();
     132             : 
     133             :         printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", /* NOTE(review): all four arguments are u64 but are printed with %lld */
     134             :                         scd->tick_gtod, __gtod_offset, /* NOTE(review): scd fields are re-read here after IRQs were re-enabled; used for logging only */
     135             :                         scd->tick_raw,  __sched_clock_offset);
     136             : 
     137             :         static_branch_enable(&__sched_clock_stable); /* sched_clock_stable() reports true from here on */
     138             :         tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); /* clears the same bit that __clear_sched_clock_stable() sets */
     139             : }
     140             : 
     141             : /*
     142             :  * If we ever get here, we're screwed, because we found out -- typically after
     143             :  * the fact -- that TSC wasn't good. This means all our clocksources (including
     144             :  * ktime) could have reported wrong values.
     145             :  *
     146             :  * What we do here is an attempt to fix up and continue sort of where we left
     147             :  * off in a coherent manner.
     148             :  *
     149             :  * The only way to fully avoid random clock jumps is to boot with:
     150             :  * "tsc=unstable".
     151             :  */
     152             : notrace static void __sched_clock_work(struct work_struct *work) /* work handler: re-seeds every CPU's clock data, then switches to the unstable path; @work is not used */
     153             : {
     154             :         struct sched_clock_data *scd;
     155             :         int cpu;
     156             : 
     157             :         /* take a current timestamp and set 'now' */
     158             :         preempt_disable();
     159             :         scd = this_scd();
     160             :         __scd_stamp(scd);
     161             :         scd->clock = scd->tick_gtod + __gtod_offset; /* restart the filtered clock at the offset-adjusted GTOD value */
     162             :         preempt_enable();
     163             : 
     164             :         /* clone to all CPUs */
     165             :         for_each_possible_cpu(cpu)
     166             :                 per_cpu(sched_clock_data, cpu) = *scd; /* NOTE(review): scd is dereferenced here after preempt_enable() above */
     167             : 
     168             :         printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
     169             :         printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", /* NOTE(review): u64 arguments printed with %lld */
     170             :                         scd->tick_gtod, __gtod_offset,
     171             :                         scd->tick_raw,  __sched_clock_offset);
     172             : 
     173             :         static_branch_disable(&__sched_clock_stable); /* sched_clock_stable() reports false from here on */
     174             : }
     175             : 
     176             : static DECLARE_WORK(sched_clock_work, __sched_clock_work); /* queued by __clear_sched_clock_stable() via schedule_work() */
     177             : 
     178             : notrace static void __clear_sched_clock_stable(void) /* starts the stable -> unstable switch; does nothing if already unstable */
     179             : {
     180             :         if (!sched_clock_stable())
     181             :                 return;
     182             : 
     183             :         tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); /* cleared again in __set_sched_clock_stable() */
     184             :         schedule_work(&sched_clock_work); /* the actual switch happens later in __sched_clock_work() */
     185             : }
     186             : 
     187             : notrace void clear_sched_clock_stable(void) /* records that the clock is unstable and, if late init already ran, starts the switch */
     188             : {
     189             :         __sched_clock_stable_early = 0; /* seen by sched_clock_init_late() if it has not run yet */
     190             : 
     191             :         smp_mb(); /* matches sched_clock_init_late() */
     192             : 
     193             :         if (static_key_count(&sched_clock_running.key) == 2) /* 2 == incremented by both sched_clock_init() and sched_clock_init_late() */
     194             :                 __clear_sched_clock_stable();
     195             : }
     196             : 
     197             : notrace static void __sched_clock_gtod_offset(void) /* re-stamps this CPU and recomputes __gtod_offset; both callers in this file disable IRQs around it */
     198             : {
     199             :         struct sched_clock_data *scd = this_scd();
     200             : 
     201             :         __scd_stamp(scd);
     202             :         __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod; /* chosen so that tick_gtod + __gtod_offset == tick_raw + __sched_clock_offset */
     203             : }
     204             : 
     205             : void __init sched_clock_init(void) /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK variant: seeds __gtod_offset, then marks the clock as running */
     206             : {
     207             :         /*
     208             :          * Set __gtod_offset such that once we mark sched_clock_running,
     209             :          * sched_clock_tick() continues where sched_clock() left off.
     210             :          *
     211             :          * Even if TSC is buggered, we're still UP at this point so it
     212             :          * can't really be out of sync.
     213             :          */
     214             :         local_irq_disable();
     215             :         __sched_clock_gtod_offset();
     216             :         local_irq_enable();
     217             : 
     218             :         static_branch_inc(&sched_clock_running); /* first of two increments; the second is in sched_clock_init_late() */
     219             : }
     220             : /*
     221             :  * We run this as late_initcall() such that it runs after all built-in drivers,
     222             :  * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
     223             :  */
     224             : static int __init sched_clock_init_late(void) /* late initcall: marks the clock stable unless clear_sched_clock_stable() already ran; always returns 0 */
     225             : {
     226             :         static_branch_inc(&sched_clock_running); /* second increment; clear_sched_clock_stable() checks for a count of 2 */
     227             :         /*
     228             :          * Ensure that it is impossible to not do a static_key update.
     229             :          *
     230             :          * Either {set,clear}_sched_clock_stable() must see sched_clock_running
     231             :          * and do the update, or we must see their __sched_clock_stable_early
     232             :          * and do the update, or both.
     233             :          */
     234             :         smp_mb(); /* matches {set,clear}_sched_clock_stable() */
     235             : 
     236             :         if (__sched_clock_stable_early) /* still 1 unless clear_sched_clock_stable() zeroed it */
     237             :                 __set_sched_clock_stable();
     238             : 
     239             :         return 0;
     240             : }
     241             : late_initcall(sched_clock_init_late);
     242             : 
     243             : /*
     244             :  * min, max except they take wrapping into account
     245             :  */
     246             : 
     247             : static __always_inline u64 wrap_min(u64 x, u64 y) /* returns the earlier of @x and @y, judged by the sign of the wrapped difference */
     248             : {
     249             :         return (s64)(x - y) < 0 ? x : y; /* x - y wraps as u64; a negative s64 view means x is before y */
     250             : }
     251             : 
     252             : static __always_inline u64 wrap_max(u64 x, u64 y) /* returns the later of @x and @y, judged by the sign of the wrapped difference */
     253             : {
     254             :         return (s64)(x - y) > 0 ? x : y; /* x - y wraps as u64; a positive s64 view means x is after y */
     255             : }
     256             : 
     257             : /*
     258             :  * update the percpu scd from a fresh raw sched_clock() reading
     259             :  *
     260             :  *  - filter out backward motion
     261             :  *  - use the GTOD tick value to create a window to filter crazy TSC values
     262             :  */
     263             : static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) /* runs the filter on @scd with a fresh sched_clock() sample; returns the value it stored in scd->clock */
     264             : {
     265             :         u64 now, clock, old_clock, min_clock, max_clock, gtod;
     266             :         s64 delta;
     267             : 
     268             : again: /* retry target when the cmpxchg at the bottom fails */
     269             :         now = sched_clock();
     270             :         delta = now - scd->tick_raw; /* raw time elapsed since the last __scd_stamp() */
     271             :         if (unlikely(delta < 0))
     272             :                 delta = 0; /* filter out backward motion of the raw clock */
     273             : 
     274             :         old_clock = scd->clock;
     275             : 
     276             :         /*
     277             :          * scd->clock = clamp(scd->tick_gtod + delta,
     278             :          *                    max(scd->tick_gtod, scd->clock),
     279             :          *                    scd->tick_gtod + TICK_NSEC);
     280             :          */
     281             : 
     282             :         gtod = scd->tick_gtod + __gtod_offset; /* the code uses this offset-adjusted value where the sketch above says tick_gtod */
     283             :         clock = gtod + delta;
     284             :         min_clock = wrap_max(gtod, old_clock); /* lower bound: never before the GTOD base or the previous clock */
     285             :         max_clock = wrap_max(old_clock, gtod + TICK_NSEC); /* upper bound: TICK_NSEC past the GTOD base, but never before the previous clock */
     286             : 
     287             :         clock = wrap_max(clock, min_clock);
     288             :         clock = wrap_min(clock, max_clock);
     289             : 
     290             :         if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock)) /* store only if scd->clock still holds old_clock */
     291             :                 goto again;
     292             : 
     293             :         return clock;
     294             : }
     295             : 
     296             : noinstr u64 local_clock(void) /* clock value for the executing CPU; per the file header, cpu_clock() on the current CPU */
     297             : {
     298             :         u64 clock;
     299             : 
     300             :         if (static_branch_likely(&__sched_clock_stable))
     301             :                 return sched_clock() + __sched_clock_offset; /* stable path: raw clock plus the fixed offset */
     302             : 
     303             :         if (!static_branch_likely(&sched_clock_running))
     304             :                 return sched_clock(); /* sched_clock_running not yet incremented: raw clock, no offset */
     305             : 
     306             :         preempt_disable_notrace(); /* keeps this_scd() and the update on the same CPU */
     307             :         clock = sched_clock_local(this_scd());
     308             :         preempt_enable_notrace();
     309             : 
     310             :         return clock;
     311             : }
     312             : EXPORT_SYMBOL_GPL(local_clock);
     313             : 
     314             : static notrace u64 sched_clock_remote(struct sched_clock_data *scd) /* couples this CPU's clock with the remote @scd: the side that is behind is set to the larger value, which is returned */
     315             : {
     316             :         struct sched_clock_data *my_scd = this_scd();
     317             :         u64 this_clock, remote_clock;
     318             :         u64 *ptr, old_val, val;
     319             : 
     320             : #if BITS_PER_LONG != 64
     321             : again:
     322             :         /*
     323             :          * Careful here: The local and the remote clock values need to
     324             :          * be read out atomically as we need to compare the values and
     325             :          * then update either the local or the remote side. So the
     326             :          * cmpxchg64 below only protects one readout.
     327             :          *
     328             :          * We must reread via sched_clock_local() in the retry case on
     329             :          * 32-bit kernels as an NMI could use sched_clock_local() via the
     330             :          * tracer and hit between the readout of
     331             :          * the low 32-bit and the high 32-bit portion.
     332             :          */
     333             :         this_clock = sched_clock_local(my_scd);
     334             :         /*
     335             :          * We must enforce atomic readout on 32-bit, otherwise the
     336             :          * update on the remote CPU can hit in between the readout of
     337             :          * the low 32-bit and the high 32-bit portion.
     338             :          */
     339             :         remote_clock = cmpxchg64(&scd->clock, 0, 0); /* used only as an atomic 64-bit read, per the comment above */
     340             : #else
     341             :         /*
     342             :          * On 64-bit kernels the read of [my]scd->clock is atomic versus the
     343             :          * update, so we can avoid the above 32-bit dance.
     344             :          */
     345             :         sched_clock_local(my_scd); /* refresh the local clock once; retries only re-read the two values */
     346             : again:
     347             :         this_clock = my_scd->clock;
     348             :         remote_clock = scd->clock;
     349             : #endif
     350             : 
     351             :         /*
     352             :          * Use the opportunity that we have both clock
     353             :          * values to couple the two clocks: we take the
     354             :          * larger time as the latest time for both
     355             :          * CPUs. (this creates monotonic movement)
     356             :          */
     357             :         if (likely((s64)(remote_clock - this_clock) < 0)) { /* remote is behind: pull the remote clock forward to ours */
     358             :                 ptr = &scd->clock;
     359             :                 old_val = remote_clock;
     360             :                 val = this_clock;
     361             :         } else {
     362             :                 /*
     363             :                  * Should be rare, but possible:
     364             :                  */
     365             :                 ptr = &my_scd->clock; /* we are behind (or equal): pull our clock forward to the remote value */
     366             :                 old_val = this_clock;
     367             :                 val = remote_clock;
     368             :         }
     369             : 
     370             :         if (!try_cmpxchg64(ptr, &old_val, val)) /* *ptr changed since it was read: sample again and redo the comparison */
     371             :                 goto again;
     372             : 
     373             :         return val;
     374             : }
     375             : 
     376             : /*
     377             :  * Similar to cpu_clock(), but requires local IRQs to be disabled.
     378             :  *
     379             :  * See cpu_clock().
     380             :  */
     381             : notrace u64 sched_clock_cpu(int cpu) /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK variant: returns the clock value for CPU @cpu */
     382             : {
     383             :         struct sched_clock_data *scd;
     384             :         u64 clock;
     385             : 
     386             :         if (sched_clock_stable())
     387             :                 return sched_clock() + __sched_clock_offset; /* stable path: @cpu is not consulted */
     388             : 
     389             :         if (!static_branch_likely(&sched_clock_running))
     390             :                 return sched_clock(); /* sched_clock_running not yet incremented: raw clock, no offset */
     391             : 
     392             :         preempt_disable_notrace(); /* keeps smp_processor_id() below stable while the clocks are updated */
     393             :         scd = cpu_sdc(cpu);
     394             : 
     395             :         if (cpu != smp_processor_id())
     396             :                 clock = sched_clock_remote(scd); /* another CPU: couple its clock with ours */
     397             :         else
     398             :                 clock = sched_clock_local(scd); /* our own CPU: just run the filter */
     399             :         preempt_enable_notrace();
     400             : 
     401             :         return clock;
     402             : }
     403             : EXPORT_SYMBOL_GPL(sched_clock_cpu);
     404             : 
     405             : notrace void sched_clock_tick(void) /* re-stamps this CPU's clock data and runs the filter; returns early when stable or not yet running */
     406             : {
     407             :         struct sched_clock_data *scd;
     408             : 
     409             :         if (sched_clock_stable())
     410             :                 return;
     411             : 
     412             :         if (!static_branch_likely(&sched_clock_running))
     413             :                 return;
     414             : 
     415             :         lockdep_assert_irqs_disabled(); /* the caller must already have IRQs off */
     416             : 
     417             :         scd = this_scd();
     418             :         __scd_stamp(scd);
     419             :         sched_clock_local(scd); /* return value unused; called for its update of scd->clock */
     420             : }
     421             : 
     422             : notrace void sched_clock_tick_stable(void) /* refreshes __gtod_offset while the clock is still marked stable; no-op otherwise */
     423             : {
     424             :         if (!sched_clock_stable())
     425             :                 return;
     426             : 
     427             :         /*
     428             :          * Called under watchdog_lock.
     429             :          *
     430             :          * The watchdog just found this TSC to (still) be stable, so now is a
     431             :          * good moment to update our __gtod_offset. Because once we find the
     432             :          * TSC to be unstable, any computation will be computing crap.
     433             :          */
     434             :         local_irq_disable();
     435             :         __sched_clock_gtod_offset(); /* re-stamps this CPU and recomputes __gtod_offset */
     436             :         local_irq_enable();
     437             : }
     438             : 
     439             : /*
     440             :  * We are going deep-idle (irqs are disabled):
     441             :  */
     442             : notrace void sched_clock_idle_sleep_event(void) /* brings this CPU's clock up to date before idling */
     443             : {
     444             :         sched_clock_cpu(smp_processor_id()); /* return value discarded; called for its update of the per-CPU clock */
     445             : }
     446             : EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
     447             : 
     448             : /*
     449             :  * We just idled; resync with ktime.
     450             :  */
     451             : notrace void sched_clock_idle_wakeup_event(void) /* re-stamps this CPU's clock data after idle via sched_clock_tick() */
     452             : {
     453             :         unsigned long flags;
     454             : 
     455             :         if (sched_clock_stable())
     456             :                 return;
     457             : 
     458             :         if (unlikely(timekeeping_suspended)) /* skip the resync while timekeeping is flagged as suspended */
     459             :                 return;
     460             : 
     461             :         local_irq_save(flags); /* sched_clock_tick() asserts that IRQs are disabled */
     462             :         sched_clock_tick();
     463             :         local_irq_restore(flags);
     464             : }
     465             : EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
     466             : 
     467             : #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
     468             : 
     469           1 : void __init sched_clock_init(void) /* variant built without CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
     470             : {
     471           1 :         static_branch_inc(&sched_clock_running); /* from here on sched_clock_cpu() below returns sched_clock() instead of 0 */
     472             :         local_irq_disable();
     473             :         generic_sched_clock_init(); /* not defined in this listing; called with IRQs disabled */
     474             :         local_irq_enable();
     475           1 : }
     476             : 
     477        7826 : notrace u64 sched_clock_cpu(int cpu) /* variant built without CONFIG_HAVE_UNSTABLE_SCHED_CLOCK; @cpu is ignored */
     478             : {
     479        7826 :         if (!static_branch_likely(&sched_clock_running))
     480             :                 return 0; /* sched_clock_init() has not incremented the key yet */
     481             : 
     482        7826 :         return sched_clock();
     483             : }
     484             : 
     485             : #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
     486             : 
     487             : /*
     488             :  * Running clock - returns the time that has elapsed while a guest has been
     489             :  * running.
     490             :  * On a guest this value should be local_clock minus the time the guest was
     491             :  * suspended by the hypervisor (for any reason).
     492             :  * On bare metal this function should return the same as local_clock.
     493             :  * Architectures and sub-architectures can override this.
     494             :  */
     495           0 : notrace u64 __weak running_clock(void) /* default (__weak) implementation: no guest-suspend accounting */
     496             : {
     497           0 :         return local_clock(); /* matches the "bare metal" case described in the comment above */
     498             : }

Generated by: LCOV version 1.14