Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Kernel internal timers
4 : *
5 : * Copyright (C) 1991, 1992 Linus Torvalds
6 : *
7 : * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
8 : *
9 : * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
10 : * "A Kernel Model for Precision Timekeeping" by Dave Mills
11 : * 1998-12-24 Fixed an xtime SMP race (we need the xtime_lock rw spinlock to
12 : * serialize accesses to xtime/lost_ticks).
13 : * Copyright (C) 1998 Andrea Arcangeli
14 : * 1999-03-10 Improved NTP compatibility by Ulrich Windl
15 : * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
16 : * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
17 : * Copyright (C) 2000, 2001, 2002 Ingo Molnar
18 : * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
19 : */
20 :
21 : #include <linux/kernel_stat.h>
22 : #include <linux/export.h>
23 : #include <linux/interrupt.h>
24 : #include <linux/percpu.h>
25 : #include <linux/init.h>
26 : #include <linux/mm.h>
27 : #include <linux/swap.h>
28 : #include <linux/pid_namespace.h>
29 : #include <linux/notifier.h>
30 : #include <linux/thread_info.h>
31 : #include <linux/time.h>
32 : #include <linux/jiffies.h>
33 : #include <linux/posix-timers.h>
34 : #include <linux/cpu.h>
35 : #include <linux/syscalls.h>
36 : #include <linux/delay.h>
37 : #include <linux/tick.h>
38 : #include <linux/kallsyms.h>
39 : #include <linux/irq_work.h>
40 : #include <linux/sched/signal.h>
41 : #include <linux/sched/sysctl.h>
42 : #include <linux/sched/nohz.h>
43 : #include <linux/sched/debug.h>
44 : #include <linux/slab.h>
45 : #include <linux/compat.h>
46 : #include <linux/random.h>
47 : #include <linux/sysctl.h>
48 :
49 : #include <linux/uaccess.h>
50 : #include <asm/unistd.h>
51 : #include <asm/div64.h>
52 : #include <asm/timex.h>
53 : #include <asm/io.h>
54 :
55 : #include "tick-internal.h"
56 :
57 : #define CREATE_TRACE_POINTS
58 : #include <trace/events/timer.h>
59 :
60 : __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
61 :
62 : EXPORT_SYMBOL(jiffies_64);
63 :
64 : /*
65 : * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
66 : * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
67 : * level has a different granularity.
68 : *
69 : * The level granularity is: LVL_CLK_DIV ^ lvl
70 : * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
71 : *
72 : * The array level of a newly armed timer depends on the relative expiry
73 : * time. The farther away the expiry time is, the higher the array level and
74 : * therefore the coarser the granularity becomes.
75 : *
76 : * Contrary to the original timer wheel implementation, which aims for 'exact'
77 : * expiry of the timers, this implementation removes the need for recascading
78 : * the timers into the lower array levels. The previous 'classic' timer wheel
79 : * implementation of the kernel already violated the 'exact' expiry by adding
80 : * slack to the expiry time to provide batched expiration. The granularity
81 : * levels provide implicit batching.
82 : *
83 : * This is an optimization of the original timer wheel implementation for the
84 : * majority of the timer wheel use cases: timeouts. The vast majority of
85 : * timeout timers (networking, disk I/O ...) are canceled before expiry. If
86 : * the timeout expires it indicates that normal operation is disturbed, so it
87 : * does not matter much whether the timeout comes with a slight delay.
88 : *
89 : * The only exceptions to this are networking timers with a small expiry
90 : * time. They rely on the granularity. Those fit into the first wheel level,
91 : * which has HZ granularity.
92 : *
93 : * We don't have cascading anymore. Timers with an expiry time above the
94 : * capacity of the last wheel level are force expired at the maximum timeout
95 : * value of the last wheel level. From data sampling we know that the maximum
96 : * value observed is 5 days (network connection tracking), so this should not
97 : * be an issue.
98 : *
99 : * The currently chosen array constants are a good compromise between
100 : * array size and granularity.
101 : *
102 : * This results in the following granularity and range levels:
103 : *
104 : * HZ 1000 steps
105 : * Level Offset Granularity Range
106 : * 0 0 1 ms 0 ms - 63 ms
107 : * 1 64 8 ms 64 ms - 511 ms
108 : * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
109 : * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
110 : * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
111 : * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
112 : * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
113 : * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
114 : * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
115 : *
116 : * HZ 300
117 : * Level Offset Granularity Range
118 : * 0 0 3 ms 0 ms - 210 ms
119 : * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
120 : * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
121 : * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
122 : * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
123 : * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
124 : * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
125 : * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
126 : * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
127 : *
128 : * HZ 250
129 : * Level Offset Granularity Range
130 : * 0 0 4 ms 0 ms - 255 ms
131 : * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
132 : * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
133 : * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
134 : * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
135 : * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
136 : * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
137 : * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
138 : * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
139 : *
140 : * HZ 100
141 : * Level Offset Granularity Range
142 : * 0 0 10 ms 0 ms - 630 ms
143 : * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
144 : * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
145 : * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
146 : * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
147 : * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
148 : * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
149 : * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
150 : */
151 :
152 : /* Clock divisor for the next level */
153 : #define LVL_CLK_SHIFT 3
154 : #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT)
155 : #define LVL_CLK_MASK (LVL_CLK_DIV - 1)
156 : #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT)
157 : #define LVL_GRAN(n) (1UL << LVL_SHIFT(n))
158 :
159 : /*
160 : * The time start value for each level to select the bucket at enqueue
161 : * time. We start from the last possible delta of the previous level
162 : * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
163 : */
164 : #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
165 :
166 : /* Size of each clock level */
167 : #define LVL_BITS 6
168 : #define LVL_SIZE (1UL << LVL_BITS)
169 : #define LVL_MASK (LVL_SIZE - 1)
170 : #define LVL_OFFS(n) ((n) * LVL_SIZE)
171 :
172 : /* Level depth */
173 : #if HZ > 100
174 : # define LVL_DEPTH 9
175 : # else
176 : # define LVL_DEPTH 8
177 : #endif
178 :
179 : /* The cutoff (max. capacity of the wheel) */
180 : #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH))
181 : #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
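
As a quick sanity check of the level arithmetic above, the following standalone userspace sketch (an editorial illustration, not part of timer.c) re-declares the constants for an assumed HZ=1000 / LVL_DEPTH=9 configuration and prints each level's granularity together with the delta cutoff at which calc_wheel_index() moves on to the next level:

	/* Standalone sketch: mirrors LVL_GRAN()/LVL_START() for HZ=1000. */
	#include <stdio.h>

	#define LVL_CLK_SHIFT	3
	#define LVL_BITS	6
	#define LVL_SIZE	(1UL << LVL_BITS)
	#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
	#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))
	#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
	#define LVL_DEPTH	9

	int main(void)
	{
		for (int lvl = 0; lvl < LVL_DEPTH; lvl++) {
			/* With HZ=1000 one jiffy is 1 ms. Deltas >= LVL_START(LVL_DEPTH)
			 * are clamped into the last level, see calc_wheel_index(). */
			printf("level %d: granularity %7lu jiffies, selected for deltas < %lu jiffies\n",
			       lvl, LVL_GRAN(lvl), LVL_START(lvl + 1));
		}
		return 0;
	}

Level 0 covers deltas below 63 jiffies at 1-jiffy granularity; each further level is LVL_CLK_DIV (8) times coarser, which matches the HZ 1000 table in the comment above.
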
182 :
183 : /*
184 : * The resulting wheel size. If NOHZ is configured, we allocate two
185 : * wheels so we have separate storage for the deferrable timers.
186 : */
187 : #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
188 :
189 : #ifdef CONFIG_NO_HZ_COMMON
190 : # define NR_BASES 2
191 : # define BASE_STD 0
192 : # define BASE_DEF 1
193 : #else
194 : # define NR_BASES 1
195 : # define BASE_STD 0
196 : # define BASE_DEF 0
197 : #endif
198 :
199 : struct timer_base {
200 : raw_spinlock_t lock;
201 : struct timer_list *running_timer;
202 : #ifdef CONFIG_PREEMPT_RT
203 : spinlock_t expiry_lock;
204 : atomic_t timer_waiters;
205 : #endif
206 : unsigned long clk;
207 : unsigned long next_expiry;
208 : unsigned int cpu;
209 : bool next_expiry_recalc;
210 : bool is_idle;
211 : bool timers_pending;
212 : DECLARE_BITMAP(pending_map, WHEEL_SIZE);
213 : struct hlist_head vectors[WHEEL_SIZE];
214 : } ____cacheline_aligned;
215 :
216 : static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
217 :
218 : #ifdef CONFIG_NO_HZ_COMMON
219 :
220 : static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
221 : static DEFINE_MUTEX(timer_keys_mutex);
222 :
223 : static void timer_update_keys(struct work_struct *work);
224 : static DECLARE_WORK(timer_update_work, timer_update_keys);
225 :
226 : #ifdef CONFIG_SMP
227 : static unsigned int sysctl_timer_migration = 1;
228 :
229 : DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
230 :
231 : static void timers_update_migration(void)
232 : {
233 : if (sysctl_timer_migration && tick_nohz_active)
234 : static_branch_enable(&timers_migration_enabled);
235 : else
236 : static_branch_disable(&timers_migration_enabled);
237 : }
238 :
239 : #ifdef CONFIG_SYSCTL
240 : static int timer_migration_handler(struct ctl_table *table, int write,
241 : void *buffer, size_t *lenp, loff_t *ppos)
242 : {
243 : int ret;
244 :
245 : mutex_lock(&timer_keys_mutex);
246 : ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
247 : if (!ret && write)
248 : timers_update_migration();
249 : mutex_unlock(&timer_keys_mutex);
250 : return ret;
251 : }
252 :
253 : static struct ctl_table timer_sysctl[] = {
254 : {
255 : .procname = "timer_migration",
256 : .data = &sysctl_timer_migration,
257 : .maxlen = sizeof(unsigned int),
258 : .mode = 0644,
259 : .proc_handler = timer_migration_handler,
260 : .extra1 = SYSCTL_ZERO,
261 : .extra2 = SYSCTL_ONE,
262 : },
263 : {}
264 : };
265 :
266 : static int __init timer_sysctl_init(void)
267 : {
268 : register_sysctl("kernel", timer_sysctl);
269 : return 0;
270 : }
271 : device_initcall(timer_sysctl_init);
272 : #endif /* CONFIG_SYSCTL */
273 : #else /* CONFIG_SMP */
274 : static inline void timers_update_migration(void) { }
275 : #endif /* !CONFIG_SMP */
276 :
277 : static void timer_update_keys(struct work_struct *work)
278 : {
279 : mutex_lock(&timer_keys_mutex);
280 : timers_update_migration();
281 : static_branch_enable(&timers_nohz_active);
282 : mutex_unlock(&timer_keys_mutex);
283 : }
284 :
285 : void timers_update_nohz(void)
286 : {
287 : schedule_work(&timer_update_work);
288 : }
289 :
290 : static inline bool is_timers_nohz_active(void)
291 : {
292 : return static_branch_unlikely(&timers_nohz_active);
293 : }
294 : #else
295 : static inline bool is_timers_nohz_active(void) { return false; }
296 : #endif /* NO_HZ_COMMON */
297 :
298 : static unsigned long round_jiffies_common(unsigned long j, int cpu,
299 : bool force_up)
300 : {
301 : int rem;
302 0 : unsigned long original = j;
303 :
304 : /*
305 : * We don't want all cpus firing their timers at once hitting the
306 : * same lock or cachelines, so we skew each extra cpu with an extra
307 : * 3 jiffies. The 3 jiffies value originally came from the mm/ code, which
308 : * already did this.
309 : * The skew is done by adding 3*cpunr, then rounding, then subtracting this
310 : * extra offset again.
311 : */
312 0 : j += cpu * 3;
313 :
314 0 : rem = j % HZ;
315 :
316 : /*
317 : * If the target jiffy is just after a whole second (which can happen
318 : * due to delays of the timer irq, long irq off times, etc.) then
319 : * we should round down to the whole second, not up. Use 1/4th second
320 : * as the cutoff for this rounding, as an extreme upper bound.
321 : * But never round down if @force_up is set.
322 : */
323 0 : if (rem < HZ/4 && !force_up) /* round down */
324 0 : j = j - rem;
325 : else /* round up */
326 0 : j = j - rem + HZ;
327 :
328 : /* now that we have rounded, subtract the extra skew again */
329 0 : j -= cpu * 3;
330 :
331 : /*
332 : * Make sure j is still in the future. Otherwise return the
333 : * unmodified value.
334 : */
335 0 : return time_is_after_jiffies(j) ? j : original;
336 : }
337 :
338 : /**
339 : * __round_jiffies - function to round jiffies to a full second
340 : * @j: the time in (absolute) jiffies that should be rounded
341 : * @cpu: the processor number on which the timeout will happen
342 : *
343 : * __round_jiffies() rounds an absolute time in the future (in jiffies)
344 : * up or down to (approximately) full seconds. This is useful for timers
345 : * for which the exact time they fire does not matter too much, as long as
346 : * they fire approximately every X seconds.
347 : *
348 : * By rounding these timers to whole seconds, all such timers will fire
349 : * at the same time, rather than at various times spread out. The goal
350 : * of this is to have the CPU wake up less, which saves power.
351 : *
352 : * The exact rounding is skewed for each processor to avoid all
353 : * processors firing at the exact same time, which could lead
354 : * to lock contention or spurious cache line bouncing.
355 : *
356 : * The return value is the rounded version of the @j parameter.
357 : */
358 0 : unsigned long __round_jiffies(unsigned long j, int cpu)
359 : {
360 0 : return round_jiffies_common(j, cpu, false);
361 : }
362 : EXPORT_SYMBOL_GPL(__round_jiffies);
363 :
364 : /**
365 : * __round_jiffies_relative - function to round jiffies to a full second
366 : * @j: the time in (relative) jiffies that should be rounded
367 : * @cpu: the processor number on which the timeout will happen
368 : *
369 : * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
370 : * up or down to (approximately) full seconds. This is useful for timers
371 : * for which the exact time they fire does not matter too much, as long as
372 : * they fire approximately every X seconds.
373 : *
374 : * By rounding these timers to whole seconds, all such timers will fire
375 : * at the same time, rather than at various times spread out. The goal
376 : * of this is to have the CPU wake up less, which saves power.
377 : *
378 : * The exact rounding is skewed for each processor to avoid all
379 : * processors firing at the exact same time, which could lead
380 : * to lock contention or spurious cache line bouncing.
381 : *
382 : * The return value is the rounded version of the @j parameter.
383 : */
384 0 : unsigned long __round_jiffies_relative(unsigned long j, int cpu)
385 : {
386 0 : unsigned long j0 = jiffies;
387 :
388 : /* Use j0 because jiffies might change while we run */
389 0 : return round_jiffies_common(j + j0, cpu, false) - j0;
390 : }
391 : EXPORT_SYMBOL_GPL(__round_jiffies_relative);
392 :
393 : /**
394 : * round_jiffies - function to round jiffies to a full second
395 : * @j: the time in (absolute) jiffies that should be rounded
396 : *
397 : * round_jiffies() rounds an absolute time in the future (in jiffies)
398 : * up or down to (approximately) full seconds. This is useful for timers
399 : * for which the exact time they fire does not matter too much, as long as
400 : * they fire approximately every X seconds.
401 : *
402 : * By rounding these timers to whole seconds, all such timers will fire
403 : * at the same time, rather than at various times spread out. The goal
404 : * of this is to have the CPU wake up less, which saves power.
405 : *
406 : * The return value is the rounded version of the @j parameter.
407 : */
408 0 : unsigned long round_jiffies(unsigned long j)
409 : {
410 0 : return round_jiffies_common(j, raw_smp_processor_id(), false);
411 : }
412 : EXPORT_SYMBOL_GPL(round_jiffies);
413 :
414 : /**
415 : * round_jiffies_relative - function to round jiffies to a full second
416 : * @j: the time in (relative) jiffies that should be rounded
417 : *
418 : * round_jiffies_relative() rounds a time delta in the future (in jiffies)
419 : * up or down to (approximately) full seconds. This is useful for timers
420 : * for which the exact time they fire does not matter too much, as long as
421 : * they fire approximately every X seconds.
422 : *
423 : * By rounding these timers to whole seconds, all such timers will fire
424 : * at the same time, rather than at various times spread out. The goal
425 : * of this is to have the CPU wake up less, which saves power.
426 : *
427 : * The return value is the rounded version of the @j parameter.
428 : */
429 0 : unsigned long round_jiffies_relative(unsigned long j)
430 : {
431 0 : return __round_jiffies_relative(j, raw_smp_processor_id());
432 : }
433 : EXPORT_SYMBOL_GPL(round_jiffies_relative);
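
As a usage illustration (the mydrv_* names below are hypothetical, not taken from an in-tree driver), a typical consumer of this family is a roughly once-per-second housekeeping timer whose exact firing time does not matter, so it is rounded to batch with other per-second timers:

	/* Hypothetical driver sketch: a periodic ~1s housekeeping timer. */
	#include <linux/timer.h>
	#include <linux/jiffies.h>

	static struct timer_list mydrv_timer;

	static void mydrv_housekeeping(struct timer_list *t)
	{
		/* ... do the periodic work ... */

		/* Re-arm roughly one second from now, rounded so that many such
		 * timers across the system fire together and the CPU wakes less. */
		mod_timer(t, jiffies + round_jiffies_relative(HZ));
	}

	static void mydrv_start(void)
	{
		timer_setup(&mydrv_timer, mydrv_housekeeping, 0);
		mod_timer(&mydrv_timer, round_jiffies(jiffies + HZ));
	}

Either the absolute or the relative form works; the important property is that the final expiry lands on (approximately) a whole second, offset by the small per-CPU skew applied in round_jiffies_common().
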
434 :
435 : /**
436 : * __round_jiffies_up - function to round jiffies up to a full second
437 : * @j: the time in (absolute) jiffies that should be rounded
438 : * @cpu: the processor number on which the timeout will happen
439 : *
440 : * This is the same as __round_jiffies() except that it will never
441 : * round down. This is useful for timeouts for which the exact time
442 : * of firing does not matter too much, as long as they don't fire too
443 : * early.
444 : */
445 0 : unsigned long __round_jiffies_up(unsigned long j, int cpu)
446 : {
447 0 : return round_jiffies_common(j, cpu, true);
448 : }
449 : EXPORT_SYMBOL_GPL(__round_jiffies_up);
450 :
451 : /**
452 : * __round_jiffies_up_relative - function to round jiffies up to a full second
453 : * @j: the time in (relative) jiffies that should be rounded
454 : * @cpu: the processor number on which the timeout will happen
455 : *
456 : * This is the same as __round_jiffies_relative() except that it will never
457 : * round down. This is useful for timeouts for which the exact time
458 : * of firing does not matter too much, as long as they don't fire too
459 : * early.
460 : */
461 0 : unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
462 : {
463 0 : unsigned long j0 = jiffies;
464 :
465 : /* Use j0 because jiffies might change while we run */
466 0 : return round_jiffies_common(j + j0, cpu, true) - j0;
467 : }
468 : EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
469 :
470 : /**
471 : * round_jiffies_up - function to round jiffies up to a full second
472 : * @j: the time in (absolute) jiffies that should be rounded
473 : *
474 : * This is the same as round_jiffies() except that it will never
475 : * round down. This is useful for timeouts for which the exact time
476 : * of firing does not matter too much, as long as they don't fire too
477 : * early.
478 : */
479 0 : unsigned long round_jiffies_up(unsigned long j)
480 : {
481 0 : return round_jiffies_common(j, raw_smp_processor_id(), true);
482 : }
483 : EXPORT_SYMBOL_GPL(round_jiffies_up);
484 :
485 : /**
486 : * round_jiffies_up_relative - function to round jiffies up to a full second
487 : * @j: the time in (relative) jiffies that should be rounded
488 : *
489 : * This is the same as round_jiffies_relative() except that it will never
490 : * round down. This is useful for timeouts for which the exact time
491 : * of firing does not matter too much, as long as they don't fire too
492 : * early.
493 : */
494 0 : unsigned long round_jiffies_up_relative(unsigned long j)
495 : {
496 0 : return __round_jiffies_up_relative(j, raw_smp_processor_id());
497 : }
498 : EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
499 :
500 :
501 : static inline unsigned int timer_get_idx(struct timer_list *timer)
502 : {
503 858 : return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
504 : }
505 :
506 : static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
507 : {
508 928 : timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
509 464 : idx << TIMER_ARRAYSHIFT;
510 : }
511 :
512 : /*
513 : * Helper function to calculate the array index for a given expiry
514 : * time.
515 : */
516 : static inline unsigned calc_index(unsigned long expires, unsigned lvl,
517 : unsigned long *bucket_expiry)
518 : {
519 :
520 : /*
521 : * The timer wheel has to guarantee that a timer does not fire
522 : * early. Early expiry can happen due to:
523 : * - Timer is armed at the edge of a tick
524 : * - Truncation of the expiry time in the outer wheel levels
525 : *
526 : * Round up with level granularity to prevent this.
527 : */
528 464 : expires = (expires >> LVL_SHIFT(lvl)) + 1;
529 464 : *bucket_expiry = expires << LVL_SHIFT(lvl);
530 464 : return LVL_OFFS(lvl) + (expires & LVL_MASK);
531 : }
532 :
533 464 : static int calc_wheel_index(unsigned long expires, unsigned long clk,
534 : unsigned long *bucket_expiry)
535 : {
536 464 : unsigned long delta = expires - clk;
537 : unsigned int idx;
538 :
539 464 : if (delta < LVL_START(1)) {
540 86 : idx = calc_index(expires, 0, bucket_expiry);
541 378 : } else if (delta < LVL_START(2)) {
542 7 : idx = calc_index(expires, 1, bucket_expiry);
543 371 : } else if (delta < LVL_START(3)) {
544 3 : idx = calc_index(expires, 2, bucket_expiry);
545 368 : } else if (delta < LVL_START(4)) {
546 367 : idx = calc_index(expires, 3, bucket_expiry);
547 1 : } else if (delta < LVL_START(5)) {
548 0 : idx = calc_index(expires, 4, bucket_expiry);
549 1 : } else if (delta < LVL_START(6)) {
550 0 : idx = calc_index(expires, 5, bucket_expiry);
551 1 : } else if (delta < LVL_START(7)) {
552 1 : idx = calc_index(expires, 6, bucket_expiry);
553 : } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
554 : idx = calc_index(expires, 7, bucket_expiry);
555 0 : } else if ((long) delta < 0) {
556 0 : idx = clk & LVL_MASK;
557 0 : *bucket_expiry = clk;
558 : } else {
559 : /*
560 : * Force obscenely large timeouts to expire at the
561 : * capacity limit of the wheel.
562 : */
563 0 : if (delta >= WHEEL_TIMEOUT_CUTOFF)
564 0 : expires = clk + WHEEL_TIMEOUT_MAX;
565 :
566 0 : idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
567 : }
568 464 : return idx;
569 : }
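
To make the rounding in calc_index()/calc_wheel_index() concrete, here is a standalone worked example (an editorial sketch with made-up numbers, assuming the constants above): a timer armed 200 jiffies ahead falls into level 1 because LVL_START(1) = 63 <= 200 < LVL_START(2) = 504, and its expiry is rounded up to the next level-1 granule:

	/* Standalone sketch of the calc_index() arithmetic for one example. */
	#include <assert.h>
	#include <stdio.h>

	#define LVL_CLK_SHIFT	3
	#define LVL_BITS	6
	#define LVL_SIZE	(1UL << LVL_BITS)
	#define LVL_MASK	(LVL_SIZE - 1)
	#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
	#define LVL_OFFS(n)	((n) * LVL_SIZE)

	int main(void)
	{
		unsigned long clk = 1000, expires = 1200;	/* delta = 200 -> level 1 */
		unsigned int lvl = 1;

		/* Round up with level granularity so the timer never fires early. */
		unsigned long shifted = (expires >> LVL_SHIFT(lvl)) + 1;
		unsigned long bucket_expiry = shifted << LVL_SHIFT(lvl);
		unsigned int idx = LVL_OFFS(lvl) + (shifted & LVL_MASK);

		printf("idx=%u bucket_expiry=%lu (clk=%lu)\n", idx, bucket_expiry, clk);
		assert(idx == 87 && bucket_expiry == 1208);
		return 0;
	}

The resulting bucket_expiry (1208) is never earlier than the requested expiry (1200) and at most one level-1 granule (8 jiffies) later, which is exactly the guarantee described in the calc_index() comment.
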
570 :
571 : static void
572 : trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
573 : {
574 : if (!is_timers_nohz_active())
575 : return;
576 :
577 : /*
578 : * TODO: This wants some optimizing similar to the code below, but we
579 : * will do that when we switch from push to pull for deferrable timers.
580 : */
581 : if (timer->flags & TIMER_DEFERRABLE) {
582 : if (tick_nohz_full_cpu(base->cpu))
583 : wake_up_nohz_cpu(base->cpu);
584 : return;
585 : }
586 :
587 : /*
588 : * We might have to IPI the remote CPU if the base is idle and the
589 : * timer is not deferrable. If the other CPU is on the way to idle
590 : * then it can't set base->is_idle as we hold the base lock:
591 : */
592 : if (base->is_idle)
593 : wake_up_nohz_cpu(base->cpu);
594 : }
595 :
596 : /*
597 : * Enqueue the timer into the hash bucket, mark it pending in
598 : * the bitmap, store the index in the timer flags then wake up
599 : * the target CPU if needed.
600 : */
601 464 : static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
602 : unsigned int idx, unsigned long bucket_expiry)
603 : {
604 :
605 928 : hlist_add_head(&timer->entry, base->vectors + idx);
606 928 : __set_bit(idx, base->pending_map);
607 928 : timer_set_idx(timer, idx);
608 :
609 464 : trace_timer_start(timer, timer->expires, timer->flags);
610 :
611 : /*
612 : * Check whether this is the new first expiring timer. The
613 : * effective expiry time of the timer is required here
614 : * (bucket_expiry) instead of timer->expires.
615 : */
616 464 : if (time_before(bucket_expiry, base->next_expiry)) {
617 : /*
618 : * Set the next expiry time and kick the CPU so it
619 : * can reevaluate the wheel:
620 : */
621 76 : base->next_expiry = bucket_expiry;
622 76 : base->timers_pending = true;
623 76 : base->next_expiry_recalc = false;
624 76 : trigger_dyntick_cpu(base, timer);
625 : }
626 464 : }
627 :
628 464 : static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
629 : {
630 : unsigned long bucket_expiry;
631 : unsigned int idx;
632 :
633 464 : idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
634 464 : enqueue_timer(base, timer, idx, bucket_expiry);
635 464 : }
636 :
637 : #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
638 :
639 : static const struct debug_obj_descr timer_debug_descr;
640 :
641 : struct timer_hint {
642 : void (*function)(struct timer_list *t);
643 : long offset;
644 : };
645 :
646 : #define TIMER_HINT(fn, container, timr, hintfn) \
647 : { \
648 : .function = fn, \
649 : .offset = offsetof(container, hintfn) - \
650 : offsetof(container, timr) \
651 : }
652 :
653 : static const struct timer_hint timer_hints[] = {
654 : TIMER_HINT(delayed_work_timer_fn,
655 : struct delayed_work, timer, work.func),
656 : TIMER_HINT(kthread_delayed_work_timer_fn,
657 : struct kthread_delayed_work, timer, work.func),
658 : };
659 :
660 : static void *timer_debug_hint(void *addr)
661 : {
662 : struct timer_list *timer = addr;
663 : int i;
664 :
665 : for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
666 : if (timer_hints[i].function == timer->function) {
667 : void (**fn)(void) = addr + timer_hints[i].offset;
668 :
669 : return *fn;
670 : }
671 : }
672 :
673 : return timer->function;
674 : }
675 :
676 : static bool timer_is_static_object(void *addr)
677 : {
678 : struct timer_list *timer = addr;
679 :
680 : return (timer->entry.pprev == NULL &&
681 : timer->entry.next == TIMER_ENTRY_STATIC);
682 : }
683 :
684 : /*
685 : * fixup_init is called when:
686 : * - an active object is initialized
687 : */
688 : static bool timer_fixup_init(void *addr, enum debug_obj_state state)
689 : {
690 : struct timer_list *timer = addr;
691 :
692 : switch (state) {
693 : case ODEBUG_STATE_ACTIVE:
694 : del_timer_sync(timer);
695 : debug_object_init(timer, &timer_debug_descr);
696 : return true;
697 : default:
698 : return false;
699 : }
700 : }
701 :
702 : /* Stub timer callback for improperly used timers. */
703 : static void stub_timer(struct timer_list *unused)
704 : {
705 : WARN_ON(1);
706 : }
707 :
708 : /*
709 : * fixup_activate is called when:
710 : * - an active object is activated
711 : * - an unknown non-static object is activated
712 : */
713 : static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
714 : {
715 : struct timer_list *timer = addr;
716 :
717 : switch (state) {
718 : case ODEBUG_STATE_NOTAVAILABLE:
719 : timer_setup(timer, stub_timer, 0);
720 : return true;
721 :
722 : case ODEBUG_STATE_ACTIVE:
723 : WARN_ON(1);
724 : fallthrough;
725 : default:
726 : return false;
727 : }
728 : }
729 :
730 : /*
731 : * fixup_free is called when:
732 : * - an active object is freed
733 : */
734 : static bool timer_fixup_free(void *addr, enum debug_obj_state state)
735 : {
736 : struct timer_list *timer = addr;
737 :
738 : switch (state) {
739 : case ODEBUG_STATE_ACTIVE:
740 : del_timer_sync(timer);
741 : debug_object_free(timer, &timer_debug_descr);
742 : return true;
743 : default:
744 : return false;
745 : }
746 : }
747 :
748 : /*
749 : * fixup_assert_init is called when:
750 : * - an untracked/uninit-ed object is found
751 : */
752 : static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
753 : {
754 : struct timer_list *timer = addr;
755 :
756 : switch (state) {
757 : case ODEBUG_STATE_NOTAVAILABLE:
758 : timer_setup(timer, stub_timer, 0);
759 : return true;
760 : default:
761 : return false;
762 : }
763 : }
764 :
765 : static const struct debug_obj_descr timer_debug_descr = {
766 : .name = "timer_list",
767 : .debug_hint = timer_debug_hint,
768 : .is_static_object = timer_is_static_object,
769 : .fixup_init = timer_fixup_init,
770 : .fixup_activate = timer_fixup_activate,
771 : .fixup_free = timer_fixup_free,
772 : .fixup_assert_init = timer_fixup_assert_init,
773 : };
774 :
775 : static inline void debug_timer_init(struct timer_list *timer)
776 : {
777 : debug_object_init(timer, &timer_debug_descr);
778 : }
779 :
780 : static inline void debug_timer_activate(struct timer_list *timer)
781 : {
782 : debug_object_activate(timer, &timer_debug_descr);
783 : }
784 :
785 : static inline void debug_timer_deactivate(struct timer_list *timer)
786 : {
787 : debug_object_deactivate(timer, &timer_debug_descr);
788 : }
789 :
790 : static inline void debug_timer_assert_init(struct timer_list *timer)
791 : {
792 : debug_object_assert_init(timer, &timer_debug_descr);
793 : }
794 :
795 : static void do_init_timer(struct timer_list *timer,
796 : void (*func)(struct timer_list *),
797 : unsigned int flags,
798 : const char *name, struct lock_class_key *key);
799 :
800 : void init_timer_on_stack_key(struct timer_list *timer,
801 : void (*func)(struct timer_list *),
802 : unsigned int flags,
803 : const char *name, struct lock_class_key *key)
804 : {
805 : debug_object_init_on_stack(timer, &timer_debug_descr);
806 : do_init_timer(timer, func, flags, name, key);
807 : }
808 : EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
809 :
810 : void destroy_timer_on_stack(struct timer_list *timer)
811 : {
812 : debug_object_free(timer, &timer_debug_descr);
813 : }
814 : EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
815 :
816 : #else
817 : static inline void debug_timer_init(struct timer_list *timer) { }
818 : static inline void debug_timer_activate(struct timer_list *timer) { }
819 : static inline void debug_timer_deactivate(struct timer_list *timer) { }
820 : static inline void debug_timer_assert_init(struct timer_list *timer) { }
821 : #endif
822 :
823 : static inline void debug_init(struct timer_list *timer)
824 : {
825 432 : debug_timer_init(timer);
826 432 : trace_timer_init(timer);
827 : }
828 :
829 : static inline void debug_deactivate(struct timer_list *timer)
830 : {
831 461 : debug_timer_deactivate(timer);
832 461 : trace_timer_cancel(timer);
833 : }
834 :
835 : static inline void debug_assert_init(struct timer_list *timer)
836 : {
837 925 : debug_timer_assert_init(timer);
838 : }
839 :
840 432 : static void do_init_timer(struct timer_list *timer,
841 : void (*func)(struct timer_list *),
842 : unsigned int flags,
843 : const char *name, struct lock_class_key *key)
844 : {
845 432 : timer->entry.pprev = NULL;
846 432 : timer->function = func;
847 432 : if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
848 0 : flags &= TIMER_INIT_FLAGS;
849 432 : timer->flags = flags | raw_smp_processor_id();
850 : lockdep_init_map(&timer->lockdep_map, name, key, 0);
851 432 : }
852 :
853 : /**
854 : * init_timer_key - initialize a timer
855 : * @timer: the timer to be initialized
856 : * @func: timer callback function
857 : * @flags: timer flags
858 : * @name: name of the timer
859 : * @key: lockdep class key of the fake lock used for tracking timer
860 : * sync lock dependencies
861 : *
862 : * init_timer_key() must be done to a timer prior calling *any* of the
863 : * other timer functions.
864 : */
865 10 : void init_timer_key(struct timer_list *timer,
866 : void (*func)(struct timer_list *), unsigned int flags,
867 : const char *name, struct lock_class_key *key)
868 : {
869 432 : debug_init(timer);
870 432 : do_init_timer(timer, func, flags, name, key);
871 10 : }
872 : EXPORT_SYMBOL(init_timer_key);
873 :
874 : static inline void detach_timer(struct timer_list *timer, bool clear_pending)
875 : {
876 461 : struct hlist_node *entry = &timer->entry;
877 :
878 922 : debug_deactivate(timer);
879 :
880 461 : __hlist_del(entry);
881 369 : if (clear_pending)
882 461 : entry->pprev = NULL;
883 461 : entry->next = LIST_POISON2;
884 : }
885 :
886 858 : static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
887 : bool clear_pending)
888 : {
889 1716 : unsigned idx = timer_get_idx(timer);
890 :
891 858 : if (!timer_pending(timer))
892 : return 0;
893 :
894 738 : if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
895 736 : __clear_bit(idx, base->pending_map);
896 368 : base->next_expiry_recalc = true;
897 : }
898 :
899 738 : detach_timer(timer, clear_pending);
900 369 : return 1;
901 : }
902 :
903 : static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
904 : {
905 916 : struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
906 :
907 : /*
908 : * If the timer is deferrable and NO_HZ_COMMON is set then we need
909 : * to use the deferrable base.
910 : */
911 : if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
912 : base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
913 : return base;
914 : }
915 :
916 : static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
917 : {
918 435 : struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
919 :
920 : /*
921 : * If the timer is deferrable and NO_HZ_COMMON is set then we need
922 : * to use the deferrable base.
923 : */
924 : if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
925 : base = this_cpu_ptr(&timer_bases[BASE_DEF]);
926 : return base;
927 : }
928 :
929 : static inline struct timer_base *get_timer_base(u32 tflags)
930 : {
931 887 : return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
932 : }
933 :
934 : static inline struct timer_base *
935 : get_target_base(struct timer_base *base, unsigned tflags)
936 : {
937 : #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
938 : if (static_branch_likely(&timers_migration_enabled) &&
939 : !(tflags & TIMER_PINNED))
940 : return get_timer_cpu_base(tflags, get_nohz_timer_target());
941 : #endif
942 435 : return get_timer_this_cpu_base(tflags);
943 : }
944 :
945 464 : static inline void forward_timer_base(struct timer_base *base)
946 : {
947 464 : unsigned long jnow = READ_ONCE(jiffies);
948 :
949 : /*
950 : * No need to forward if we are close enough below jiffies.
951 : * Also while executing timers, base->clk is 1 offset ahead
952 : * of jiffies to avoid endless requeuing to current jiffies.
953 : */
954 464 : if ((long)(jnow - base->clk) < 1)
955 : return;
956 :
957 : /*
958 : * If the next expiry value is > jiffies, then we fast forward to
959 : * jiffies otherwise we forward to the next expiry value.
960 : */
961 80 : if (time_after(base->next_expiry, jnow)) {
962 80 : base->clk = jnow;
963 : } else {
964 0 : if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
965 : return;
966 0 : base->clk = base->next_expiry;
967 : }
968 : }
969 :
970 :
971 : /*
972 : * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
973 : * that all timers which are tied to this base are locked, and the base itself
974 : * is locked too.
975 : *
976 : * So __run_timers/migrate_timers can safely modify all timers which could
977 : * be found in the base->vectors array.
978 : *
979 : * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
980 : * to wait until the migration is done.
981 : */
982 887 : static struct timer_base *lock_timer_base(struct timer_list *timer,
983 : unsigned long *flags)
984 : __acquires(timer->base->lock)
985 : {
986 : for (;;) {
987 : struct timer_base *base;
988 : u32 tf;
989 :
990 : /*
991 : * We need to use READ_ONCE() here, otherwise the compiler
992 : * might re-read @tf between the check for TIMER_MIGRATING
993 : * and spin_lock().
994 : */
995 887 : tf = READ_ONCE(timer->flags);
996 :
997 887 : if (!(tf & TIMER_MIGRATING)) {
998 1774 : base = get_timer_base(tf);
999 887 : raw_spin_lock_irqsave(&base->lock, *flags);
1000 887 : if (timer->flags == tf)
1001 887 : return base;
1002 0 : raw_spin_unlock_irqrestore(&base->lock, *flags);
1003 : }
1004 : cpu_relax();
1005 : }
1006 : }
1007 :
1008 : #define MOD_TIMER_PENDING_ONLY 0x01
1009 : #define MOD_TIMER_REDUCE 0x02
1010 : #define MOD_TIMER_NOTPENDING 0x04
1011 :
1012 : static inline int
1013 435 : __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
1014 : {
1015 435 : unsigned long clk = 0, flags, bucket_expiry;
1016 : struct timer_base *base, *new_base;
1017 435 : unsigned int idx = UINT_MAX;
1018 435 : int ret = 0;
1019 :
1020 435 : debug_assert_init(timer);
1021 :
1022 : /*
1023 : * This is a common optimization triggered by the networking code - if
1024 : * the timer is re-modified to have the same timeout or ends up in the
1025 : * same array bucket then just return:
1026 : */
1027 437 : if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
1028 : /*
1029 : * The downside of this optimization is that it can result in
1030 : * larger granularity than you would get from adding a new
1031 : * timer with this expiry.
1032 : */
1033 0 : long diff = timer->expires - expires;
1034 :
1035 0 : if (!diff)
1036 : return 1;
1037 0 : if (options & MOD_TIMER_REDUCE && diff <= 0)
1038 : return 1;
1039 :
1040 : /*
1041 : * We lock timer base and calculate the bucket index right
1042 : * here. If the timer ends up in the same bucket, then we
1043 : * just update the expiry time and avoid the whole
1044 : * dequeue/enqueue dance.
1045 : */
1046 0 : base = lock_timer_base(timer, &flags);
1047 : /*
1048 : * Has @timer been shutdown? This needs to be evaluated
1049 : * while holding base lock to prevent a race against the
1050 : * shutdown code.
1051 : */
1052 0 : if (!timer->function)
1053 : goto out_unlock;
1054 :
1055 0 : forward_timer_base(base);
1056 :
1057 0 : if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
1058 0 : time_before_eq(timer->expires, expires)) {
1059 : ret = 1;
1060 : goto out_unlock;
1061 : }
1062 :
1063 0 : clk = base->clk;
1064 0 : idx = calc_wheel_index(expires, clk, &bucket_expiry);
1065 :
1066 : /*
1067 : * Retrieve and compare the array index of the pending
1068 : * timer. If it matches set the expiry to the new value so a
1069 : * subsequent call will exit in the expires check above.
1070 : */
1071 0 : if (idx == timer_get_idx(timer)) {
1072 0 : if (!(options & MOD_TIMER_REDUCE))
1073 0 : timer->expires = expires;
1074 0 : else if (time_after(timer->expires, expires))
1075 0 : timer->expires = expires;
1076 : ret = 1;
1077 : goto out_unlock;
1078 : }
1079 : } else {
1080 435 : base = lock_timer_base(timer, &flags);
1081 : /*
1082 : * Has @timer been shutdown? This needs to be evaluated
1083 : * while holding base lock to prevent a race against the
1084 : * shutdown code.
1085 : */
1086 435 : if (!timer->function)
1087 : goto out_unlock;
1088 :
1089 435 : forward_timer_base(base);
1090 : }
1091 :
1092 435 : ret = detach_if_pending(timer, base, false);
1093 435 : if (!ret && (options & MOD_TIMER_PENDING_ONLY))
1094 : goto out_unlock;
1095 :
1096 870 : new_base = get_target_base(base, timer->flags);
1097 :
1098 435 : if (base != new_base) {
1099 : /*
1100 : * We are trying to schedule the timer on the new base.
1101 : * However, we can't change the timer's base while it is running,
1102 : * otherwise timer_delete_sync() can't detect that the timer's
1103 : * handler has not yet finished. This also guarantees that the
1104 : * timer is serialized wrt itself.
1105 : */
1106 0 : if (likely(base->running_timer != timer)) {
1107 : /* See the comment in lock_timer_base() */
1108 0 : timer->flags |= TIMER_MIGRATING;
1109 :
1110 0 : raw_spin_unlock(&base->lock);
1111 0 : base = new_base;
1112 0 : raw_spin_lock(&base->lock);
1113 0 : WRITE_ONCE(timer->flags,
1114 : (timer->flags & ~TIMER_BASEMASK) | base->cpu);
1115 0 : forward_timer_base(base);
1116 : }
1117 : }
1118 :
1119 435 : debug_timer_activate(timer);
1120 :
1121 435 : timer->expires = expires;
1122 : /*
1123 : * If 'idx' was calculated above and the base time did not advance
1124 : * between calculating 'idx' and possibly switching the base, only
1125 : * enqueue_timer() is required. Otherwise we need to (re)calculate
1126 : * the wheel index via internal_add_timer().
1127 : */
1128 435 : if (idx != UINT_MAX && clk == base->clk)
1129 0 : enqueue_timer(base, timer, idx, bucket_expiry);
1130 : else
1131 435 : internal_add_timer(base, timer);
1132 :
1133 : out_unlock:
1134 870 : raw_spin_unlock_irqrestore(&base->lock, flags);
1135 :
1136 435 : return ret;
1137 : }
1138 :
1139 : /**
1140 : * mod_timer_pending - Modify a pending timer's timeout
1141 : * @timer: The pending timer to be modified
1142 : * @expires: New absolute timeout in jiffies
1143 : *
1144 : * mod_timer_pending() is the same for pending timers as mod_timer(), but
1145 : * will not activate inactive timers.
1146 : *
1147 : * If @timer->function == NULL then the start operation is silently
1148 : * discarded.
1149 : *
1150 : * Return:
1151 : * * %0 - The timer was inactive and not modified or was in
1152 : * shutdown state and the operation was discarded
1153 : * * %1 - The timer was active and requeued to expire at @expires
1154 : */
1155 0 : int mod_timer_pending(struct timer_list *timer, unsigned long expires)
1156 : {
1157 0 : return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
1158 : }
1159 : EXPORT_SYMBOL(mod_timer_pending);
1160 :
1161 : /**
1162 : * mod_timer - Modify a timer's timeout
1163 : * @timer: The timer to be modified
1164 : * @expires: New absolute timeout in jiffies
1165 : *
1166 : * mod_timer(timer, expires) is equivalent to:
1167 : *
1168 : * del_timer(timer); timer->expires = expires; add_timer(timer);
1169 : *
1170 : * mod_timer() is more efficient than the above open coded sequence. In
1171 : * case that the timer is inactive, the del_timer() part is a NOP. The
1172 : * timer is in any case activated with the new expiry time @expires.
1173 : *
1174 : * Note that if there are multiple unserialized concurrent users of the
1175 : * same timer, then mod_timer() is the only safe way to modify the timeout,
1176 : * since add_timer() cannot modify an already running timer.
1177 : *
1178 : * If @timer->function == NULL then the start operation is silently
1179 : * discarded. In this case the return value is 0 and meaningless.
1180 : *
1181 : * Return:
1182 : * * %0 - The timer was inactive and started or was in shutdown
1183 : * state and the operation was discarded
1184 : * * %1 - The timer was active and requeued to expire at @expires or
1185 : * the timer was active and not modified because @expires did
1186 : * not change the effective expiry time
1187 : */
1188 2 : int mod_timer(struct timer_list *timer, unsigned long expires)
1189 : {
1190 2 : return __mod_timer(timer, expires, 0);
1191 : }
1192 : EXPORT_SYMBOL(mod_timer);
1193 :
1194 : /**
1195 : * timer_reduce - Modify a timer's timeout if it would reduce the timeout
1196 : * @timer: The timer to be modified
1197 : * @expires: New absolute timeout in jiffies
1198 : *
1199 : * timer_reduce() is very similar to mod_timer(), except that it will only
1200 : * modify an enqueued timer if that would reduce the expiration time. If
1201 : * @timer is not enqueued it starts the timer.
1202 : *
1203 : * If @timer->function == NULL then the start operation is silently
1204 : * discarded.
1205 : *
1206 : * Return:
1207 : * * %0 - The timer was inactive and started or was in shutdown
1208 : * state and the operation was discarded
1209 : * * %1 - The timer was active and requeued to expire at @expires or
1210 : * the timer was active and not modified because @expires
1211 : * did not change the effective expiry time such that the
1212 : * timer would expire earlier than already scheduled
1213 : */
1214 0 : int timer_reduce(struct timer_list *timer, unsigned long expires)
1215 : {
1216 0 : return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
1217 : }
1218 : EXPORT_SYMBOL(timer_reduce);
1219 :
1220 : /**
1221 : * add_timer - Start a timer
1222 : * @timer: The timer to be started
1223 : *
1224 : * Start @timer to expire at @timer->expires in the future. @timer->expires
1225 : * is the absolute expiry time measured in 'jiffies'. When the timer expires
1226 : * timer->function(timer) will be invoked from soft interrupt context.
1227 : *
1228 : * The @timer->expires and @timer->function fields must be set prior
1229 : * to calling this function.
1230 : *
1231 : * If @timer->function == NULL then the start operation is silently
1232 : * discarded.
1233 : *
1234 : * If @timer->expires is already in the past @timer will be queued to
1235 : * expire at the next timer tick.
1236 : *
1237 : * This can only operate on an inactive timer. Attempts to invoke this on
1238 : * an active timer are rejected with a warning.
1239 : */
1240 11 : void add_timer(struct timer_list *timer)
1241 : {
1242 11 : if (WARN_ON_ONCE(timer_pending(timer)))
1243 : return;
1244 11 : __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
1245 : }
1246 : EXPORT_SYMBOL(add_timer);
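
A common pattern built on add_timer()/timer_delete_sync(), matching the "most timeouts are canceled before expiry" observation in the top-of-file comment, looks roughly like the following sketch (the mydev_* names are hypothetical):

	/* Hypothetical one-shot timeout sketch: armed before an operation is
	 * issued and, in the common case, deleted again before it ever fires. */
	#include <linux/timer.h>
	#include <linux/jiffies.h>
	#include <linux/printk.h>

	static struct timer_list mydev_timeout;

	static void mydev_timeout_fn(struct timer_list *t)
	{
		/* Only reached when the operation did not complete in time. */
		pr_warn("mydev: operation timed out\n");
	}

	static void mydev_issue_op(void)
	{
		timer_setup(&mydev_timeout, mydev_timeout_fn, 0);
		mydev_timeout.expires = jiffies + 5 * HZ;	/* 5 second timeout */
		add_timer(&mydev_timeout);
		/* ... start the operation ... */
	}

	static void mydev_op_done(void)
	{
		/* Completion path: the timeout is usually still pending here. */
		timer_delete_sync(&mydev_timeout);
	}
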
1247 :
1248 : /**
1249 : * add_timer_on - Start a timer on a particular CPU
1250 : * @timer: The timer to be started
1251 : * @cpu: The CPU to start it on
1252 : *
1253 : * Same as add_timer() except that it starts the timer on the given CPU.
1254 : *
1255 : * See add_timer() for further details.
1256 : */
1257 29 : void add_timer_on(struct timer_list *timer, int cpu)
1258 : {
1259 : struct timer_base *new_base, *base;
1260 : unsigned long flags;
1261 :
1262 58 : debug_assert_init(timer);
1263 :
1264 29 : if (WARN_ON_ONCE(timer_pending(timer)))
1265 0 : return;
1266 :
1267 58 : new_base = get_timer_cpu_base(timer->flags, cpu);
1268 :
1269 : /*
1270 : * If @timer was on a different CPU, it should be migrated with the
1271 : * old base locked to prevent other operations proceeding with the
1272 : * wrong base locked. See lock_timer_base().
1273 : */
1274 29 : base = lock_timer_base(timer, &flags);
1275 : /*
1276 : * Has @timer been shutdown? This needs to be evaluated while
1277 : * holding base lock to prevent a race against the shutdown code.
1278 : */
1279 29 : if (!timer->function)
1280 : goto out_unlock;
1281 :
1282 29 : if (base != new_base) {
1283 0 : timer->flags |= TIMER_MIGRATING;
1284 :
1285 0 : raw_spin_unlock(&base->lock);
1286 0 : base = new_base;
1287 0 : raw_spin_lock(&base->lock);
1288 0 : WRITE_ONCE(timer->flags,
1289 : (timer->flags & ~TIMER_BASEMASK) | cpu);
1290 : }
1291 29 : forward_timer_base(base);
1292 :
1293 29 : debug_timer_activate(timer);
1294 29 : internal_add_timer(base, timer);
1295 : out_unlock:
1296 58 : raw_spin_unlock_irqrestore(&base->lock, flags);
1297 : }
1298 : EXPORT_SYMBOL_GPL(add_timer_on);
1299 :
1300 : /**
1301 : * __timer_delete - Internal function: Deactivate a timer
1302 : * @timer: The timer to be deactivated
1303 : * @shutdown: If true, this indicates that the timer is about to be
1304 : * shutdown permanently.
1305 : *
1306 : * If @shutdown is true then @timer->function is set to NULL under the
1307 : * timer base lock which prevents further rearming of the timer. In that
1308 : * case any attempt to rearm @timer after this function returns will be
1309 : * silently ignored.
1310 : *
1311 : * Return:
1312 : * * %0 - The timer was not pending
1313 : * * %1 - The timer was pending and deactivated
1314 : */
1315 38 : static int __timer_delete(struct timer_list *timer, bool shutdown)
1316 : {
1317 : struct timer_base *base;
1318 : unsigned long flags;
1319 38 : int ret = 0;
1320 :
1321 76 : debug_assert_init(timer);
1322 :
1323 : /*
1324 : * If @shutdown is set then the lock has to be taken whether the
1325 : * timer is pending or not to protect against a concurrent rearm
1326 : * which might hit between the lockless pending check and the lock
1327 : * acquisition. By taking the lock it is ensured that such a newly
1328 : * enqueued timer is dequeued and cannot end up with
1329 : * timer->function == NULL in the expiry code.
1330 : *
1331 : * If timer->function is currently executed, then this makes sure
1332 : * that the callback cannot requeue the timer.
1333 : */
1334 38 : if (timer_pending(timer) || shutdown) {
1335 0 : base = lock_timer_base(timer, &flags);
1336 0 : ret = detach_if_pending(timer, base, true);
1337 0 : if (shutdown)
1338 0 : timer->function = NULL;
1339 0 : raw_spin_unlock_irqrestore(&base->lock, flags);
1340 : }
1341 :
1342 38 : return ret;
1343 : }
1344 :
1345 : /**
1346 : * timer_delete - Deactivate a timer
1347 : * @timer: The timer to be deactivated
1348 : *
1349 : * The function only deactivates a pending timer, but contrary to
1350 : * timer_delete_sync() it does not take into account whether the timer's
1351 : * callback function is concurrently executing on a different CPU or not.
1352 : * Nor does it prevent rearming of the timer. If @timer can be rearmed
1353 : * concurrently then the return value of this function is meaningless.
1354 : *
1355 : * Return:
1356 : * * %0 - The timer was not pending
1357 : * * %1 - The timer was pending and deactivated
1358 : */
1359 38 : int timer_delete(struct timer_list *timer)
1360 : {
1361 38 : return __timer_delete(timer, false);
1362 : }
1363 : EXPORT_SYMBOL(timer_delete);
1364 :
1365 : /**
1366 : * timer_shutdown - Deactivate a timer and prevent rearming
1367 : * @timer: The timer to be deactivated
1368 : *
1369 : * The function does not wait for a possibly running timer callback on a
1370 : * different CPU, but it prevents rearming of the timer. Any attempt to arm
1371 : * @timer after this function returns will be silently ignored.
1372 : *
1373 : * This function is useful for teardown code and should only be used when
1374 : * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
1375 : *
1376 : * Return:
1377 : * * %0 - The timer was not pending
1378 : * * %1 - The timer was pending
1379 : */
1380 0 : int timer_shutdown(struct timer_list *timer)
1381 : {
1382 0 : return __timer_delete(timer, true);
1383 : }
1384 : EXPORT_SYMBOL_GPL(timer_shutdown);
1385 :
1386 : /**
1387 : * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
1388 : * @timer: Timer to deactivate
1389 : * @shutdown: If true, this indicates that the timer is about to be
1390 : * shutdown permanently.
1391 : *
1392 : * If @shutdown is true then @timer->function is set to NULL under the
1393 : * timer base lock which prevents further rearming of the timer. Any
1394 : * attempt to rearm @timer after this function returns will be silently
1395 : * ignored.
1396 : *
1397 : * This function cannot guarantee that the timer cannot be rearmed
1398 : * right after dropping the base lock if @shutdown is false. That
1399 : * needs to be prevented by the calling code if necessary.
1400 : *
1401 : * Return:
1402 : * * %0 - The timer was not pending
1403 : * * %1 - The timer was pending and deactivated
1404 : * * %-1 - The timer callback function is running on a different CPU
1405 : */
1406 423 : static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
1407 : {
1408 : struct timer_base *base;
1409 : unsigned long flags;
1410 423 : int ret = -1;
1411 :
1412 423 : debug_assert_init(timer);
1413 :
1414 423 : base = lock_timer_base(timer, &flags);
1415 :
1416 423 : if (base->running_timer != timer)
1417 423 : ret = detach_if_pending(timer, base, true);
1418 423 : if (shutdown)
1419 0 : timer->function = NULL;
1420 :
1421 846 : raw_spin_unlock_irqrestore(&base->lock, flags);
1422 :
1423 423 : return ret;
1424 : }
1425 :
1426 : /**
1427 : * try_to_del_timer_sync - Try to deactivate a timer
1428 : * @timer: Timer to deactivate
1429 : *
1430 : * This function tries to deactivate a timer. On success the timer is not
1431 : * queued and the timer callback function is not running on any CPU.
1432 : *
1433 : * This function does not guarantee that the timer cannot be rearmed right
1434 : * after dropping the base lock. That needs to be prevented by the calling
1435 : * code if necessary.
1436 : *
1437 : * Return:
1438 : * * %0 - The timer was not pending
1439 : * * %1 - The timer was pending and deactivated
1440 : * * %-1 - The timer callback function is running on a different CPU
1441 : */
1442 0 : int try_to_del_timer_sync(struct timer_list *timer)
1443 : {
1444 0 : return __try_to_del_timer_sync(timer, false);
1445 : }
1446 : EXPORT_SYMBOL(try_to_del_timer_sync);
1447 :
1448 : #ifdef CONFIG_PREEMPT_RT
1449 : static __init void timer_base_init_expiry_lock(struct timer_base *base)
1450 : {
1451 : spin_lock_init(&base->expiry_lock);
1452 : }
1453 :
1454 : static inline void timer_base_lock_expiry(struct timer_base *base)
1455 : {
1456 : spin_lock(&base->expiry_lock);
1457 : }
1458 :
1459 : static inline void timer_base_unlock_expiry(struct timer_base *base)
1460 : {
1461 : spin_unlock(&base->expiry_lock);
1462 : }
1463 :
1464 : /*
1465 : * The counterpart to del_timer_wait_running().
1466 : *
1467 : * If there is a waiter for base->expiry_lock, then it was waiting for the
1468 : * timer callback to finish. Drop expiry_lock and reacquire it. That allows
1469 : * the waiter to acquire the lock and make progress.
1470 : */
1471 : static void timer_sync_wait_running(struct timer_base *base)
1472 : {
1473 : if (atomic_read(&base->timer_waiters)) {
1474 : raw_spin_unlock_irq(&base->lock);
1475 : spin_unlock(&base->expiry_lock);
1476 : spin_lock(&base->expiry_lock);
1477 : raw_spin_lock_irq(&base->lock);
1478 : }
1479 : }
1480 :
1481 : /*
1482 : * This function is called on PREEMPT_RT kernels when the fast path
1483 : * deletion of a timer failed because the timer callback function was
1484 : * running.
1485 : *
1486 : * This prevents priority inversion if the softirq thread on a remote CPU
1487 : * got preempted, and it prevents a livelock when the task which tries to
1488 : * delete a timer has preempted the softirq thread running the timer callback
1489 : * function.
1490 : */
1491 : static void del_timer_wait_running(struct timer_list *timer)
1492 : {
1493 : u32 tf;
1494 :
1495 : tf = READ_ONCE(timer->flags);
1496 : if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
1497 : struct timer_base *base = get_timer_base(tf);
1498 :
1499 : /*
1500 : * Mark the base as contended and grab the expiry lock,
1501 : * which is held by the softirq across the timer
1502 : * callback. Drop the lock immediately so the softirq can
1503 : * expire the next timer. In theory the timer could already
1504 : * be running again, but that's more than unlikely and just
1505 : * causes another wait loop.
1506 : */
1507 : atomic_inc(&base->timer_waiters);
1508 : spin_lock_bh(&base->expiry_lock);
1509 : atomic_dec(&base->timer_waiters);
1510 : spin_unlock_bh(&base->expiry_lock);
1511 : }
1512 : }
1513 : #else
1514 : static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
1515 : static inline void timer_base_lock_expiry(struct timer_base *base) { }
1516 : static inline void timer_base_unlock_expiry(struct timer_base *base) { }
1517 : static inline void timer_sync_wait_running(struct timer_base *base) { }
1518 : static inline void del_timer_wait_running(struct timer_list *timer) { }
1519 : #endif
1520 :
1521 : /**
1522 : * __timer_delete_sync - Internal function: Deactivate a timer and wait
1523 : * for the handler to finish.
1524 : * @timer: The timer to be deactivated
1525 : * @shutdown: If true, @timer->function will be set to NULL under the
1526 : * timer base lock which prevents rearming of @timer
1527 : *
1528 : * If @shutdown is not set the timer can be rearmed later. If the timer can
1529 : * be rearmed concurrently, i.e. after dropping the base lock, then the
1530 : * return value is meaningless.
1531 : *
1532 : * If @shutdown is set then @timer->function is set to NULL under timer
1533 : * base lock which prevents rearming of the timer. Any attempt to rearm
1534 : * a shutdown timer is silently ignored.
1535 : *
1536 : * If the timer should be reused after shutdown it has to be initialized
1537 : * again.
1538 : *
1539 : * Return:
1540 : * * %0 - The timer was not pending
1541 : * * %1 - The timer was pending and deactivated
1542 : */
1543 423 : static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
1544 : {
1545 : int ret;
1546 :
1547 : #ifdef CONFIG_LOCKDEP
1548 : unsigned long flags;
1549 :
1550 : /*
1551 : * If lockdep gives a backtrace here, please reference
1552 : * the synchronization rules above.
1553 : */
1554 : local_irq_save(flags);
1555 : lock_map_acquire(&timer->lockdep_map);
1556 : lock_map_release(&timer->lockdep_map);
1557 : local_irq_restore(flags);
1558 : #endif
1559 : /*
1560 : * Don't use this function in hardirq context, because it
1561 : * could lead to deadlock.
1562 : */
1563 423 : WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));
1564 :
1565 : /*
1566 : * Must be able to sleep on PREEMPT_RT because of the slowpath in
1567 : * del_timer_wait_running().
1568 : */
1569 : if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
1570 : lockdep_assert_preemption_enabled();
1571 :
1572 : do {
1573 423 : ret = __try_to_del_timer_sync(timer, shutdown);
1574 :
1575 423 : if (unlikely(ret < 0)) {
1576 0 : del_timer_wait_running(timer);
1577 : cpu_relax();
1578 : }
1579 423 : } while (ret < 0);
1580 :
1581 423 : return ret;
1582 : }
1583 :
1584 : /**
1585 : * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
1586 : * @timer: The timer to be deactivated
1587 : *
1588 : * Synchronization rules: Callers must prevent restarting of the timer,
1589 : * otherwise this function is meaningless. It must not be called from
1590 : * interrupt contexts unless the timer is an irqsafe one. The caller must
1591 : * not hold locks which would prevent completion of the timer's callback
1592 : * function. The timer's handler must not call add_timer_on(). Upon exit
1593 : * the timer is not queued and the handler is not running on any CPU.
1594 : *
1595 : * For !irqsafe timers, the caller must not hold locks that are held in
1596 : * interrupt context, even if the lock has nothing to do with the timer in
1597 : * question. Here's why::
1598 : *
1599 : * CPU0 CPU1
1600 : * ---- ----
1601 : * <SOFTIRQ>
1602 : * call_timer_fn();
1603 : * base->running_timer = mytimer;
1604 : * spin_lock_irq(somelock);
1605 : * <IRQ>
1606 : * spin_lock(somelock);
1607 : * timer_delete_sync(mytimer);
1608 : * while (base->running_timer == mytimer);
1609 : *
1610 : * Now timer_delete_sync() will never return and never release somelock.
1611 : * The interrupt on the other CPU is waiting to grab somelock but it has
1612 : * interrupted the softirq that CPU0 is waiting to finish.
1613 : *
1614 : * This function cannot guarantee that the timer is not rearmed again by
1615 : * some concurrent or preempting code, right after it dropped the base
1616 : * lock. If there is the possibility of a concurrent rearm then the return
1617 : * value of the function is meaningless.
1618 : *
1619 : * If such a guarantee is needed, e.g. for teardown situations then use
1620 : * timer_shutdown_sync() instead.
1621 : *
1622 : * Return:
1623 : * * %0 - The timer was not pending
1624 : * * %1 - The timer was pending and deactivated
1625 : */
1626 2 : int timer_delete_sync(struct timer_list *timer)
1627 : {
1628 423 : return __timer_delete_sync(timer, false);
1629 : }
1630 : EXPORT_SYMBOL(timer_delete_sync);
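
Editor's illustration (a minimal sketch, not part of this file): how a driver might pair timer_setup()/mod_timer() with timer_delete_sync() in its stop path. The structure 'my_dev', its fields and the helper names are hypothetical.

#include <linux/timer.h>
#include <linux/jiffies.h>

struct my_dev {
        struct timer_list timeout;
        bool expired;
};

static void my_dev_timeout(struct timer_list *t)
{
        struct my_dev *dev = from_timer(dev, t, timeout);

        dev->expired = true;
}

static void my_dev_start(struct my_dev *dev)
{
        dev->expired = false;
        timer_setup(&dev->timeout, my_dev_timeout, 0);
        /* Fire once, roughly 100ms from now. */
        mod_timer(&dev->timeout, jiffies + msecs_to_jiffies(100));
}

static void my_dev_stop(struct my_dev *dev)
{
        /*
         * The caller guarantees nothing rearms the timer; when this
         * returns the callback is not queued and not running on any CPU.
         */
        timer_delete_sync(&dev->timeout);
}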
1631 :
1632 : /**
1633 : * timer_shutdown_sync - Shutdown a timer and prevent rearming
1634 : * @timer: The timer to be shutdown
1635 : *
1636 : * When the function returns it is guaranteed that:
1637 : * - @timer is not queued
1638 : * - The callback function of @timer is not running
1639 : * - @timer cannot be enqueued again. Any attempt to rearm
1640 : * @timer is silently ignored.
1641 : *
1642 : * See timer_delete_sync() for synchronization rules.
1643 : *
1644 : * This function is useful for final teardown of an infrastructure where
1645 : * the timer is subject to a circular dependency problem.
1646 : *
1647 : * A common pattern for this is a timer and a workqueue where the timer can
1648 : * schedule work and work can arm the timer. On shutdown the workqueue must
1649 : * be destroyed and the timer must be prevented from rearming. Unless the
1650 : * code has conditionals like 'if (mything->in_shutdown)' to prevent that,
1651 : * there is no way to get this correct with timer_delete_sync().
1652 : *
1653 : * timer_shutdown_sync() solves this problem. The correct ordering of
1654 : * calls in this case is:
1655 : *
1656 : * timer_shutdown_sync(&mything->timer);
1657 : * workqueue_destroy(&mything->workqueue);
1658 : *
1659 : * After this 'mything' can be safely freed.
1660 : *
1661 : * This obviously implies that the timer is not required to be functional
1662 : * for the rest of the shutdown operation.
1663 : *
1664 : * Return:
1665 : * * %0 - The timer was not pending
1666 : * * %1 - The timer was pending
1667 : */
1668 0 : int timer_shutdown_sync(struct timer_list *timer)
1669 : {
1670 0 : return __timer_delete_sync(timer, true);
1671 : }
1672 : EXPORT_SYMBOL_GPL(timer_shutdown_sync);
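
Editor's illustration of the timer/work pattern described above (a hedged sketch, not taken from the source): 'mything' and the helpers are made-up names, and cancel_work_sync() stands in for whatever flushes the work in a real driver.

#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/slab.h>

struct mything {
        struct timer_list timer;
        struct work_struct work;
};

static void mything_work(struct work_struct *work)
{
        struct mything *m = container_of(work, struct mything, work);

        /* May rearm the timer while the object is alive. */
        mod_timer(&m->timer, jiffies + HZ);
}

static void mything_timer(struct timer_list *t)
{
        struct mything *m = from_timer(m, t, timer);

        schedule_work(&m->work);
}

static struct mything *mything_create(void)
{
        struct mything *m = kzalloc(sizeof(*m), GFP_KERNEL);

        if (!m)
                return NULL;
        timer_setup(&m->timer, mything_timer, 0);
        INIT_WORK(&m->work, mything_work);
        return m;
}

static void mything_destroy(struct mything *m)
{
        /* After this, any attempt to rearm m->timer is silently ignored... */
        timer_shutdown_sync(&m->timer);
        /* ...so the work item can be flushed without a re-queueing race. */
        cancel_work_sync(&m->work);
        kfree(m);
}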
1673 :
1674 92 : static void call_timer_fn(struct timer_list *timer,
1675 : void (*fn)(struct timer_list *),
1676 : unsigned long baseclk)
1677 : {
1678 92 : int count = preempt_count();
1679 :
1680 : #ifdef CONFIG_LOCKDEP
1681 : /*
1682 : * It is permissible to free the timer from inside the
1683 : * function that is called from it; we need to take that into
1684 : * account for lockdep too. To avoid bogus "held lock freed"
1685 : * warnings as well as problems when looking into
1686 : * timer->lockdep_map, make a copy and use that here.
1687 : */
1688 : struct lockdep_map lockdep_map;
1689 :
1690 : lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
1691 : #endif
1692 : /*
1693 : * Couple the lock chain with the lock chain at
1694 : * timer_delete_sync() by acquiring the lock_map around the fn()
1695 : * call here and in timer_delete_sync().
1696 : */
1697 : lock_map_acquire(&lockdep_map);
1698 :
1699 92 : trace_timer_expire_entry(timer, baseclk);
1700 92 : fn(timer);
1701 92 : trace_timer_expire_exit(timer);
1702 :
1703 : lock_map_release(&lockdep_map);
1704 :
1705 92 : if (count != preempt_count()) {
1706 0 : WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
1707 : fn, count, preempt_count());
1708 : /*
1709 : * Restore the preempt count. That gives us a decent
1710 : * chance to survive and extract information. If the
1711 : * callback kept a lock held, bad luck, but not worse
1712 : * than the BUG() we had.
1713 : */
1714 : preempt_count_set(count);
1715 : }
1716 92 : }
1717 :
1718 92 : static void expire_timers(struct timer_base *base, struct hlist_head *head)
1719 : {
1720 : /*
1721 : * This value is required only for tracing. base->clk was
1722 : * incremented directly before expire_timers was called. But expiry
1723 : * is related to the old base->clk value.
1724 : */
1725 : unsigned long baseclk = base->clk - 1;
1726 :
1727 184 : while (!hlist_empty(head)) {
1728 : struct timer_list *timer;
1729 : void (*fn)(struct timer_list *);
1730 :
1731 92 : timer = hlist_entry(head->first, struct timer_list, entry);
1732 :
1733 92 : base->running_timer = timer;
1734 92 : detach_timer(timer, true);
1735 :
1736 92 : fn = timer->function;
1737 :
1738 92 : if (WARN_ON_ONCE(!fn)) {
1739 : /* Should never happen. Emphasis on should! */
1740 0 : base->running_timer = NULL;
1741 0 : continue;
1742 : }
1743 :
1744 92 : if (timer->flags & TIMER_IRQSAFE) {
1745 9 : raw_spin_unlock(&base->lock);
1746 9 : call_timer_fn(timer, fn, baseclk);
1747 9 : raw_spin_lock(&base->lock);
1748 9 : base->running_timer = NULL;
1749 : } else {
1750 83 : raw_spin_unlock_irq(&base->lock);
1751 83 : call_timer_fn(timer, fn, baseclk);
1752 83 : raw_spin_lock_irq(&base->lock);
1753 83 : base->running_timer = NULL;
1754 83 : timer_sync_wait_running(base);
1755 : }
1756 : }
1757 92 : }
1758 :
1759 92 : static int collect_expired_timers(struct timer_base *base,
1760 : struct hlist_head *heads)
1761 : {
1762 92 : unsigned long clk = base->clk = base->next_expiry;
1763 : struct hlist_head *vec;
1764 92 : int i, levels = 0;
1765 : unsigned int idx;
1766 :
1767 126 : for (i = 0; i < LVL_DEPTH; i++) {
1768 126 : idx = (clk & LVL_MASK) + i * LVL_SIZE;
1769 :
1770 252 : if (__test_and_clear_bit(idx, base->pending_map)) {
1771 92 : vec = base->vectors + idx;
1772 184 : hlist_move_list(vec, heads++);
1773 92 : levels++;
1774 : }
1775 : /* Is it time to look at the next level? */
1776 126 : if (clk & LVL_CLK_MASK)
1777 : break;
1778 : /* Shift clock for the next level granularity */
1779 34 : clk >>= LVL_CLK_SHIFT;
1780 : }
1781 92 : return levels;
1782 : }
1783 :
1784 : /*
1785 : * Find the next pending bucket of a level. Search from level start (@offset)
1786 : * + @clk upwards, and if nothing is found there, search from the start of
1787 : * the level (@offset) up to @offset + @clk.
1788 : */
1789 623 : static int next_pending_bucket(struct timer_base *base, unsigned offset,
1790 : unsigned clk)
1791 : {
1792 623 : unsigned pos, start = offset + clk;
1793 623 : unsigned end = offset + LVL_SIZE;
1794 :
1795 623 : pos = find_next_bit(base->pending_map, end, start);
1796 623 : if (pos < end)
1797 157 : return pos - start;
1798 :
1799 466 : pos = find_next_bit(base->pending_map, start, offset);
1800 466 : return pos < start ? pos + LVL_SIZE - start : -1;
1801 : }
1802 :
1803 : /*
1804 : * Search the first expiring timer in the various clock levels. Caller must
1805 : * hold base->lock.
1806 : */
1807 92 : static unsigned long __next_timer_interrupt(struct timer_base *base)
1808 : {
1809 : unsigned long clk, next, adj;
1810 92 : unsigned lvl, offset = 0;
1811 :
1812 92 : next = base->clk + NEXT_TIMER_MAX_DELTA;
1813 92 : clk = base->clk;
1814 696 : for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
1815 623 : int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
1816 623 : unsigned long lvl_clk = clk & LVL_CLK_MASK;
1817 :
1818 623 : if (pos >= 0) {
1819 258 : unsigned long tmp = clk + (unsigned long) pos;
1820 :
1821 258 : tmp <<= LVL_SHIFT(lvl);
1822 258 : if (time_before(tmp, next))
1823 98 : next = tmp;
1824 :
1825 : /*
1826 : * If the next expiration happens before we reach
1827 : * the next level, no need to check further.
1828 : */
1829 258 : if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
1830 : break;
1831 : }
1832 : /*
1833 : * Clock for the next level. If the current level clock lower
1834 : * bits are zero, we look at the next level as is. If not, we
1835 : * need to advance it by one because that's going to be the
1836 : * next expiring bucket in that level. base->clk is the next
1837 : * expiring jiffie. So in case of:
1838 : *
1839 : * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1840 : * 0 0 0 0 0 0
1841 : *
1842 : * we have to look at all levels @index 0. With
1843 : *
1844 : * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1845 : * 0 0 0 0 0 2
1846 : *
1847 : * LVL0 has the next expiring bucket @index 2. The upper
1848 : * levels have the next expiring bucket @index 1.
1849 : *
1850 : * In case that the propagation wraps the next level the same
1851 : * rules apply:
1852 : *
1853 : * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
1854 : * 0 0 0 0 F 2
1855 : *
1856 : * So after looking at LVL0 we get:
1857 : *
1858 : * LVL5 LVL4 LVL3 LVL2 LVL1
1859 : * 0 0 0 1 0
1860 : *
1861 : * So no propagation from LVL1 to LVL2 because that happened
1862 : * with the add already, but then we need to propagate further
1863 : * from LVL2 to LVL3.
1864 : *
1865 : * So the simple check whether the lower bits of the current
1866 : * level are 0 or not is sufficient for all cases.
1867 : */
1868 604 : adj = lvl_clk ? 1 : 0;
1869 604 : clk >>= LVL_CLK_SHIFT;
1870 604 : clk += adj;
1871 : }
1872 :
1873 92 : base->next_expiry_recalc = false;
1874 92 : base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
1875 :
1876 92 : return next;
1877 : }
1878 :
1879 : #ifdef CONFIG_NO_HZ_COMMON
1880 : /*
1881 : * Check, if the next hrtimer event is before the next timer wheel
1882 : * event:
1883 : */
1884 : static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
1885 : {
1886 : u64 nextevt = hrtimer_get_next_event();
1887 :
1888 : /*
1889 : * If high resolution timers are enabled
1890 : * hrtimer_get_next_event() returns KTIME_MAX.
1891 : */
1892 : if (expires <= nextevt)
1893 : return expires;
1894 :
1895 : /*
1896 : * If the next timer is already expired, return the tick base
1897 : * time so the tick is fired immediately.
1898 : */
1899 : if (nextevt <= basem)
1900 : return basem;
1901 :
1902 : /*
1903 : * Round up to the next jiffie. High resolution timers are
1904 : * off, so the hrtimers are expired in the tick and we need to
1905 : * make sure that this tick really expires the timer to avoid
1906 : * a ping pong of the nohz stop code.
1907 : *
1908 : * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
1909 : */
1910 : return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
1911 : }
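
Editor's worked example (HZ value assumed, not taken from the source): with HZ = 250, TICK_NSEC is 4,000,000 ns. An hrtimer event at nextevt = 9,000,000 ns that precedes the wheel expiry is rounded up to DIV_ROUND_UP_ULL(9000000, 4000000) * 4000000 = 12,000,000 ns, i.e. the next full jiffy boundary, so the tick fired there also expires the hrtimer.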
1912 :
1913 : /**
1914 : * get_next_timer_interrupt - return the time (clock mono) of the next timer
1915 : * @basej: base time jiffies
1916 : * @basem: base time clock monotonic
1917 : *
1918 : * Returns the tick aligned clock monotonic time of the next pending
1919 : * timer or KTIME_MAX if no timer is pending.
1920 : */
1921 : u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
1922 : {
1923 : struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1924 : u64 expires = KTIME_MAX;
1925 : unsigned long nextevt;
1926 :
1927 : /*
1928 : * Pretend that there is no timer pending if the cpu is offline.
1929 : * Possible pending timers will be migrated later to an active cpu.
1930 : */
1931 : if (cpu_is_offline(smp_processor_id()))
1932 : return expires;
1933 :
1934 : raw_spin_lock(&base->lock);
1935 : if (base->next_expiry_recalc)
1936 : base->next_expiry = __next_timer_interrupt(base);
1937 : nextevt = base->next_expiry;
1938 :
1939 : /*
1940 : * We have a fresh next event. Check whether we can forward the
1941 : * base. We can only do that when @basej is past base->clk
1942 : * otherwise we might rewind base->clk.
1943 : */
1944 : if (time_after(basej, base->clk)) {
1945 : if (time_after(nextevt, basej))
1946 : base->clk = basej;
1947 : else if (time_after(nextevt, base->clk))
1948 : base->clk = nextevt;
1949 : }
1950 :
1951 : if (time_before_eq(nextevt, basej)) {
1952 : expires = basem;
1953 : base->is_idle = false;
1954 : } else {
1955 : if (base->timers_pending)
1956 : expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
1957 : /*
1958 : * If we expect to sleep more than a tick, mark the base idle.
1959 : * Also the tick is stopped so any added timer must forward
1960 : * the base clk itself to keep granularity small. This idle
1961 : * logic is only maintained for the BASE_STD base; deferrable
1962 : * timers may still see large granularity skew (by design).
1963 : */
1964 : if ((expires - basem) > TICK_NSEC)
1965 : base->is_idle = true;
1966 : }
1967 : raw_spin_unlock(&base->lock);
1968 :
1969 : return cmp_next_hrtimer_event(basem, expires);
1970 : }
1971 :
1972 : /**
1973 : * timer_clear_idle - Clear the idle state of the timer base
1974 : *
1975 : * Called with interrupts disabled
1976 : */
1977 : void timer_clear_idle(void)
1978 : {
1979 : struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1980 :
1981 : /*
1982 : * We do this unlocked. The worst outcome is a remote enqueue sending
1983 : * a pointless IPI, but taking the lock would just make the window for
1984 : * sending the IPI a few instructions smaller for the cost of taking
1985 : * the lock in the exit from idle path.
1986 : */
1987 : base->is_idle = false;
1988 : }
1989 : #endif
1990 :
1991 : /**
1992 : * __run_timers - run all expired timers (if any) on this CPU.
1993 : * @base: the timer vector to be processed.
1994 : */
1995 92 : static inline void __run_timers(struct timer_base *base)
1996 : {
1997 : struct hlist_head heads[LVL_DEPTH];
1998 : int levels;
1999 :
2000 92 : if (time_before(jiffies, base->next_expiry))
2001 0 : return;
2002 :
2003 92 : timer_base_lock_expiry(base);
2004 92 : raw_spin_lock_irq(&base->lock);
2005 :
2006 276 : while (time_after_eq(jiffies, base->clk) &&
2007 92 : time_after_eq(jiffies, base->next_expiry)) {
2008 92 : levels = collect_expired_timers(base, heads);
2009 : /*
2010 : * The two possible reasons for not finding any expired
2011 : * timer at this clk are that all matching timers have been
2012 : * dequeued or no timer has been queued since
2013 : * base::next_expiry was set to base::clk +
2014 : * NEXT_TIMER_MAX_DELTA.
2015 : */
2016 92 : WARN_ON_ONCE(!levels && !base->next_expiry_recalc
2017 : && base->timers_pending);
2018 92 : base->clk++;
2019 92 : base->next_expiry = __next_timer_interrupt(base);
2020 :
2021 276 : while (levels--)
2022 92 : expire_timers(base, heads + levels);
2023 : }
2024 92 : raw_spin_unlock_irq(&base->lock);
2025 92 : timer_base_unlock_expiry(base);
2026 : }
2027 :
2028 : /*
2029 : * This function runs expired timers in bottom half (softirq) context.
2030 : */
2031 92 : static __latent_entropy void run_timer_softirq(struct softirq_action *h)
2032 : {
2033 92 : struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
2034 :
2035 92 : __run_timers(base);
2036 : if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
2037 : __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
2038 92 : }
2039 :
2040 : /*
2041 : * Called by the local, per-CPU timer interrupt on SMP.
2042 : */
2043 2943 : static void run_local_timers(void)
2044 : {
2045 2943 : struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
2046 :
2047 2943 : hrtimer_run_queues();
2048 : /* Raise the softirq only if required. */
2049 2943 : if (time_before(jiffies, base->next_expiry)) {
2050 : if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
2051 : return;
2052 : /* CPU is awake, so check the deferrable base. */
2053 : base++;
2054 : if (time_before(jiffies, base->next_expiry))
2055 : return;
2056 : }
2057 92 : raise_softirq(TIMER_SOFTIRQ);
2058 : }
2059 :
2060 : /*
2061 : * Called from the timer interrupt handler to charge one tick to the current
2062 : * process. user_tick is 1 if the tick is user time, 0 for system.
2063 : */
2064 2943 : void update_process_times(int user_tick)
2065 : {
2066 2943 : struct task_struct *p = current;
2067 :
2068 : /* Note: this timer irq context must be accounted for as well. */
2069 2943 : account_process_tick(p, user_tick);
2070 2943 : run_local_timers();
2071 2943 : rcu_sched_clock_irq(user_tick);
2072 : #ifdef CONFIG_IRQ_WORK
2073 2943 : if (in_irq())
2074 2943 : irq_work_tick();
2075 : #endif
2076 2943 : scheduler_tick();
2077 : if (IS_ENABLED(CONFIG_POSIX_TIMERS))
2078 2943 : run_posix_cpu_timers();
2079 2943 : }
2080 :
2081 : /*
2082 : * Since schedule_timeout()'s timer is defined on the stack, it must store
2083 : * the target task on the stack as well.
2084 : */
2085 : struct process_timer {
2086 : struct timer_list timer;
2087 : struct task_struct *task;
2088 : };
2089 :
2090 54 : static void process_timeout(struct timer_list *t)
2091 : {
2092 54 : struct process_timer *timeout = from_timer(timeout, t, timer);
2093 :
2094 54 : wake_up_process(timeout->task);
2095 54 : }
2096 :
2097 : /**
2098 : * schedule_timeout - sleep until timeout
2099 : * @timeout: timeout value in jiffies
2100 : *
2101 : * Make the current task sleep until @timeout jiffies have elapsed.
2102 : * The function behavior depends on the current task state
2103 : * (see also set_current_state() description):
2104 : *
2105 : * %TASK_RUNNING - the scheduler is called, but the task does not sleep
2106 : * at all. That happens because sched_submit_work() does nothing for
2107 : * tasks in %TASK_RUNNING state.
2108 : *
2109 : * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
2110 : * pass before the routine returns unless the current task is explicitly
2111 : * woken up (e.g. by wake_up_process()).
2112 : *
2113 : * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
2114 : * delivered to the current task or the current task is explicitly woken
2115 : * up.
2116 : *
2117 : * The current task state is guaranteed to be %TASK_RUNNING when this
2118 : * routine returns.
2119 : *
2120 : * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
2121 : * the CPU away without a bound on the timeout. In this case the return
2122 : * value will be %MAX_SCHEDULE_TIMEOUT.
2123 : *
2124 : * Returns 0 when the timer has expired otherwise the remaining time in
2125 : * jiffies will be returned. In all cases the return value is guaranteed
2126 : * to be non-negative.
2127 : */
2128 828 : signed long __sched schedule_timeout(signed long timeout)
2129 : {
2130 : struct process_timer timer;
2131 : unsigned long expire;
2132 :
2133 828 : switch (timeout)
2134 : {
2135 : case MAX_SCHEDULE_TIMEOUT:
2136 : /*
2137 : * These two special cases are purely for the caller's
2138 : * convenience. Nothing more. We could take MAX_SCHEDULE_TIMEOUT
2139 : * from one of the negative values, but I'd like to return a
2140 : * valid offset (>=0) to allow the caller to do whatever it
2141 : * wants with the retval.
2142 : */
2143 406 : schedule();
2144 406 : goto out;
2145 : default:
2146 : /*
2147 : * Another bit of paranoia. Note that the retval will be
2148 : * 0, since no piece of kernel code is supposed to check
2149 : * for a negative retval of schedule_timeout() (it should
2150 : * never happen anyway). You just have the printk() that
2151 : * will tell you if something has gone wrong and where.
2152 : */
2153 422 : if (timeout < 0) {
2154 0 : printk(KERN_ERR "schedule_timeout: wrong timeout "
2155 : "value %lx\n", timeout);
2156 0 : dump_stack();
2157 0 : __set_current_state(TASK_RUNNING);
2158 0 : goto out;
2159 : }
2160 : }
2161 :
2162 422 : expire = timeout + jiffies;
2163 :
2164 422 : timer.task = current;
2165 844 : timer_setup_on_stack(&timer.timer, process_timeout, 0);
2166 422 : __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
2167 422 : schedule();
2168 421 : del_timer_sync(&timer.timer);
2169 :
2170 : /* Remove the timer from the object tracker */
2171 421 : destroy_timer_on_stack(&timer.timer);
2172 :
2173 421 : timeout = expire - jiffies;
2174 :
2175 : out:
2176 827 : return timeout < 0 ? 0 : timeout;
2177 : }
2178 : EXPORT_SYMBOL(schedule_timeout);
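
Editor's sketch of the calling convention described above (not from the source; the 'done' flag is hypothetical and the waker is assumed to set it and call wake_up_process() on this task): set the task state before each schedule_timeout() call and restore TASK_RUNNING on exit.

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>

/* Wait up to two seconds for *done, interruptibly. */
static long wait_for_done(bool *done)
{
        long left = 2 * HZ;

        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (READ_ONCE(*done) || !left)
                        break;
                if (signal_pending(current)) {
                        left = -ERESTARTSYS;
                        break;
                }
                left = schedule_timeout(left);
        }
        __set_current_state(TASK_RUNNING);
        return left;    /* 0: timed out, >0: jiffies left, <0: signal */
}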
2179 :
2180 : /*
2181 : * We can use __set_current_state() here because schedule_timeout() calls
2182 : * schedule() unconditionally.
2183 : */
2184 0 : signed long __sched schedule_timeout_interruptible(signed long timeout)
2185 : {
2186 0 : __set_current_state(TASK_INTERRUPTIBLE);
2187 0 : return schedule_timeout(timeout);
2188 : }
2189 : EXPORT_SYMBOL(schedule_timeout_interruptible);
2190 :
2191 0 : signed long __sched schedule_timeout_killable(signed long timeout)
2192 : {
2193 0 : __set_current_state(TASK_KILLABLE);
2194 0 : return schedule_timeout(timeout);
2195 : }
2196 : EXPORT_SYMBOL(schedule_timeout_killable);
2197 :
2198 0 : signed long __sched schedule_timeout_uninterruptible(signed long timeout)
2199 : {
2200 0 : __set_current_state(TASK_UNINTERRUPTIBLE);
2201 0 : return schedule_timeout(timeout);
2202 : }
2203 : EXPORT_SYMBOL(schedule_timeout_uninterruptible);
2204 :
2205 : /*
2206 : * Like schedule_timeout_uninterruptible(), except this task will not contribute
2207 : * to load average.
2208 : */
2209 0 : signed long __sched schedule_timeout_idle(signed long timeout)
2210 : {
2211 0 : __set_current_state(TASK_IDLE);
2212 0 : return schedule_timeout(timeout);
2213 : }
2214 : EXPORT_SYMBOL(schedule_timeout_idle);
2215 :
2216 : #ifdef CONFIG_HOTPLUG_CPU
2217 : static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
2218 : {
2219 : struct timer_list *timer;
2220 : int cpu = new_base->cpu;
2221 :
2222 : while (!hlist_empty(head)) {
2223 : timer = hlist_entry(head->first, struct timer_list, entry);
2224 : detach_timer(timer, false);
2225 : timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
2226 : internal_add_timer(new_base, timer);
2227 : }
2228 : }
2229 :
2230 : int timers_prepare_cpu(unsigned int cpu)
2231 : {
2232 : struct timer_base *base;
2233 : int b;
2234 :
2235 : for (b = 0; b < NR_BASES; b++) {
2236 : base = per_cpu_ptr(&timer_bases[b], cpu);
2237 : base->clk = jiffies;
2238 : base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
2239 : base->next_expiry_recalc = false;
2240 : base->timers_pending = false;
2241 : base->is_idle = false;
2242 : }
2243 : return 0;
2244 : }
2245 :
2246 : int timers_dead_cpu(unsigned int cpu)
2247 : {
2248 : struct timer_base *old_base;
2249 : struct timer_base *new_base;
2250 : int b, i;
2251 :
2252 : for (b = 0; b < NR_BASES; b++) {
2253 : old_base = per_cpu_ptr(&timer_bases[b], cpu);
2254 : new_base = get_cpu_ptr(&timer_bases[b]);
2255 : /*
2256 : * The caller is globally serialized and nobody else
2257 : * takes two locks at once, deadlock is not possible.
2258 : */
2259 : raw_spin_lock_irq(&new_base->lock);
2260 : raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
2261 :
2262 : /*
2263 : * The current CPUs base clock might be stale. Update it
2264 : * before moving the timers over.
2265 : */
2266 : forward_timer_base(new_base);
2267 :
2268 : WARN_ON_ONCE(old_base->running_timer);
2269 : old_base->running_timer = NULL;
2270 :
2271 : for (i = 0; i < WHEEL_SIZE; i++)
2272 : migrate_timer_list(new_base, old_base->vectors + i);
2273 :
2274 : raw_spin_unlock(&old_base->lock);
2275 : raw_spin_unlock_irq(&new_base->lock);
2276 : put_cpu_ptr(&timer_bases);
2277 : }
2278 : return 0;
2279 : }
2280 :
2281 : #endif /* CONFIG_HOTPLUG_CPU */
2282 :
2283 1 : static void __init init_timer_cpu(int cpu)
2284 : {
2285 : struct timer_base *base;
2286 : int i;
2287 :
2288 2 : for (i = 0; i < NR_BASES; i++) {
2289 1 : base = per_cpu_ptr(&timer_bases[i], cpu);
2290 1 : base->cpu = cpu;
2291 : raw_spin_lock_init(&base->lock);
2292 1 : base->clk = jiffies;
2293 1 : base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
2294 1 : timer_base_init_expiry_lock(base);
2295 : }
2296 1 : }
2297 :
2298 1 : static void __init init_timer_cpus(void)
2299 : {
2300 : int cpu;
2301 :
2302 2 : for_each_possible_cpu(cpu)
2303 1 : init_timer_cpu(cpu);
2304 1 : }
2305 :
2306 1 : void __init init_timers(void)
2307 : {
2308 1 : init_timer_cpus();
2309 : posix_cputimers_init_work();
2310 1 : open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
2311 1 : }
2312 :
2313 : /**
2314 : * msleep - sleep safely even with waitqueue interruptions
2315 : * @msecs: Time in milliseconds to sleep for
2316 : */
2317 0 : void msleep(unsigned int msecs)
2318 : {
2319 0 : unsigned long timeout = msecs_to_jiffies(msecs) + 1;
2320 :
2321 0 : while (timeout)
2322 0 : timeout = schedule_timeout_uninterruptible(timeout);
2323 0 : }
2324 :
2325 : EXPORT_SYMBOL(msleep);
2326 :
2327 : /**
2328 : * msleep_interruptible - sleep waiting for signals
2329 : * @msecs: Time in milliseconds to sleep for
2330 : */
2331 0 : unsigned long msleep_interruptible(unsigned int msecs)
2332 : {
2333 0 : unsigned long timeout = msecs_to_jiffies(msecs) + 1;
2334 :
2335 0 : while (timeout && !signal_pending(current))
2336 0 : timeout = schedule_timeout_interruptible(timeout);
2337 0 : return jiffies_to_msecs(timeout);
2338 : }
2339 :
2340 : EXPORT_SYMBOL(msleep_interruptible);
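
Editor's sketch contrasting the two helpers above (the settle-then-poll scenario and the pr_debug message are invented):

#include <linux/delay.h>
#include <linux/printk.h>

static void settle_then_poll(void)
{
        /* Always sleeps the full period; signals are ignored. */
        msleep(20);

        /*
         * Returns the remaining time in milliseconds when a signal cut
         * the sleep short, 0 otherwise.
         */
        if (msleep_interruptible(20))
                pr_debug("sleep cut short by a signal\n");
}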
2341 :
2342 : /**
2343 : * usleep_range_state - Sleep for an approximate time in a given state
2344 : * @min: Minimum time in usecs to sleep
2345 : * @max: Maximum time in usecs to sleep
2346 : * @state: State of the current task that will be while sleeping
2347 : *
2348 : * In non-atomic context where the exact wakeup time is flexible, use
2349 : * usleep_range_state() instead of udelay(). The sleep improves responsiveness
2350 : * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
2351 : * power usage by allowing hrtimers to take advantage of an already-
2352 : * scheduled interrupt instead of scheduling a new one just for this sleep.
2353 : */
2354 0 : void __sched usleep_range_state(unsigned long min, unsigned long max,
2355 : unsigned int state)
2356 : {
2357 0 : ktime_t exp = ktime_add_us(ktime_get(), min);
2358 0 : u64 delta = (u64)(max - min) * NSEC_PER_USEC;
2359 :
2360 : for (;;) {
2361 0 : __set_current_state(state);
2362 : /* Do not return before the requested sleep time has elapsed */
2363 0 : if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
2364 : break;
2365 : }
2366 0 : }
2367 : EXPORT_SYMBOL(usleep_range_state);
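
Editor's sketch of a typical caller (the device-settle scenario is invented): most code goes through the usleep_range() wrapper, which sleeps in TASK_UNINTERRUPTIBLE state.

#include <linux/delay.h>

/*
 * Hypothetical: give a device 100-200us to settle after reset. The slack
 * lets the wakeup piggyback on an already-scheduled hrtimer interrupt
 * instead of programming a new one.
 */
static void my_chip_reset_settle(void)
{
        usleep_range(100, 200);
}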
|