Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 : * policies)
5 : */
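/*
 * For illustration: a task normally enters this class from userspace via
 * sched_setscheduler(). A minimal standalone sketch (not part of this file,
 * usually needs CAP_SYS_NICE or an appropriate RLIMIT_RTPRIO):
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		struct sched_param param = { .sched_priority = 10 };
 *
 *		if (sched_setscheduler(0, SCHED_FIFO, &param))
 *			perror("sched_setscheduler");
 *		else
 *			puts("running as SCHED_FIFO, priority 10");
 *		return 0;
 *	}
 */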
6 :
7 : int sched_rr_timeslice = RR_TIMESLICE;
8 : /* More than 4 hours if BW_SHIFT equals 20. */
9 : static const u64 max_rt_runtime = MAX_BW;
10 :
11 : static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
12 :
13 : struct rt_bandwidth def_rt_bandwidth;
14 :
15 : /*
16 : * period over which we measure -rt task CPU usage in us.
17 : * default: 1s
18 : */
19 : unsigned int sysctl_sched_rt_period = 1000000;
20 :
21 : /*
22 : * part of the period that we allow rt tasks to run in us.
23 : * default: 0.95s
24 : */
25 : int sysctl_sched_rt_runtime = 950000;
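/*
 * With the two defaults above the global RT budget is 950000 us of runtime
 * per 1000000 us period: RT tasks may consume at most 95% of each CPU, and
 * the remaining ~50 ms of every second is left for non-RT tasks. Writing -1
 * to sched_rt_runtime_us maps to RUNTIME_INF and disables RT throttling
 * altogether.
 */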
26 :
27 : #ifdef CONFIG_SYSCTL
28 : static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
29 : static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
30 : size_t *lenp, loff_t *ppos);
31 : static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
32 : size_t *lenp, loff_t *ppos);
33 : static struct ctl_table sched_rt_sysctls[] = {
34 : {
35 : .procname = "sched_rt_period_us",
36 : .data = &sysctl_sched_rt_period,
37 : .maxlen = sizeof(unsigned int),
38 : .mode = 0644,
39 : .proc_handler = sched_rt_handler,
40 : },
41 : {
42 : .procname = "sched_rt_runtime_us",
43 : .data = &sysctl_sched_rt_runtime,
44 : .maxlen = sizeof(int),
45 : .mode = 0644,
46 : .proc_handler = sched_rt_handler,
47 : },
48 : {
49 : .procname = "sched_rr_timeslice_ms",
50 : .data = &sysctl_sched_rr_timeslice,
51 : .maxlen = sizeof(int),
52 : .mode = 0644,
53 : .proc_handler = sched_rr_handler,
54 : },
55 : {}
56 : };
57 :
58 1 : static int __init sched_rt_sysctl_init(void)
59 : {
60 1 : register_sysctl_init("kernel", sched_rt_sysctls);
61 1 : return 0;
62 : }
63 : late_initcall(sched_rt_sysctl_init);
64 : #endif
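/*
 * The table above exposes the knobs as /proc/sys/kernel/sched_rt_period_us,
 * /proc/sys/kernel/sched_rt_runtime_us and
 * /proc/sys/kernel/sched_rr_timeslice_ms. A small standalone sketch (for
 * illustration only, minimal error handling) that prints the current values:
 *
 *	#include <stdio.h>
 *
 *	static long read_knob(const char *path)
 *	{
 *		FILE *f = fopen(path, "r");
 *		long val = -1;
 *
 *		if (f) {
 *			if (fscanf(f, "%ld", &val) != 1)
 *				val = -1;
 *			fclose(f);
 *		}
 *		return val;
 *	}
 *
 *	int main(void)
 *	{
 *		printf("rt period:    %ld us\n", read_knob("/proc/sys/kernel/sched_rt_period_us"));
 *		printf("rt runtime:   %ld us\n", read_knob("/proc/sys/kernel/sched_rt_runtime_us"));
 *		printf("rr timeslice: %ld ms\n", read_knob("/proc/sys/kernel/sched_rr_timeslice_ms"));
 *		return 0;
 *	}
 */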
65 :
66 0 : static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
67 : {
68 0 : struct rt_bandwidth *rt_b =
69 0 : container_of(timer, struct rt_bandwidth, rt_period_timer);
70 0 : int idle = 0;
71 : int overrun;
72 :
73 0 : raw_spin_lock(&rt_b->rt_runtime_lock);
74 : for (;;) {
75 0 : overrun = hrtimer_forward_now(timer, rt_b->rt_period);
76 0 : if (!overrun)
77 : break;
78 :
79 0 : raw_spin_unlock(&rt_b->rt_runtime_lock);
80 0 : idle = do_sched_rt_period_timer(rt_b, overrun);
81 0 : raw_spin_lock(&rt_b->rt_runtime_lock);
82 : }
83 0 : if (idle)
84 0 : rt_b->rt_period_active = 0;
85 0 : raw_spin_unlock(&rt_b->rt_runtime_lock);
86 :
87 0 : return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
88 : }
89 :
90 1 : void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
91 : {
92 1 : rt_b->rt_period = ns_to_ktime(period);
93 1 : rt_b->rt_runtime = runtime;
94 :
95 : raw_spin_lock_init(&rt_b->rt_runtime_lock);
96 :
97 1 : hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
98 : HRTIMER_MODE_REL_HARD);
99 1 : rt_b->rt_period_timer.function = sched_rt_period_timer;
100 1 : }
101 :
102 0 : static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
103 : {
104 0 : raw_spin_lock(&rt_b->rt_runtime_lock);
105 0 : if (!rt_b->rt_period_active) {
106 0 : rt_b->rt_period_active = 1;
107 : /*
108 : * SCHED_DEADLINE updates the bandwidth, as a runaway
109 : * RT task with a DL task could hog a CPU. But DL does
110 : * not reset the period. If a deadline task was running
111 : * without an RT task running, it can cause RT tasks to
112 : * throttle when they start up. Kick the timer right away
113 : * to update the period.
114 : */
115 0 : hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
116 0 : hrtimer_start_expires(&rt_b->rt_period_timer,
117 : HRTIMER_MODE_ABS_PINNED_HARD);
118 : }
119 0 : raw_spin_unlock(&rt_b->rt_runtime_lock);
120 0 : }
121 :
122 : static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
123 : {
124 0 : if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
125 : return;
126 :
127 0 : do_start_rt_bandwidth(rt_b);
128 : }
129 :
130 1 : void init_rt_rq(struct rt_rq *rt_rq)
131 : {
132 : struct rt_prio_array *array;
133 : int i;
134 :
135 1 : array = &rt_rq->active;
136 101 : for (i = 0; i < MAX_RT_PRIO; i++) {
137 200 : INIT_LIST_HEAD(array->queue + i);
138 200 : __clear_bit(i, array->bitmap);
139 : }
140 : /* delimiter for bitsearch: */
141 2 : __set_bit(MAX_RT_PRIO, array->bitmap);
142 :
143 : #if defined CONFIG_SMP
144 : rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
145 : rt_rq->highest_prio.next = MAX_RT_PRIO-1;
146 : rt_rq->rt_nr_migratory = 0;
147 : rt_rq->overloaded = 0;
148 : plist_head_init(&rt_rq->pushable_tasks);
149 : #endif /* CONFIG_SMP */
150 : /* We start in dequeued state, because no RT tasks are queued */
151 1 : rt_rq->rt_queued = 0;
152 :
153 1 : rt_rq->rt_time = 0;
154 1 : rt_rq->rt_throttled = 0;
155 1 : rt_rq->rt_runtime = 0;
156 : raw_spin_lock_init(&rt_rq->rt_runtime_lock);
157 1 : }
158 :
159 : #ifdef CONFIG_RT_GROUP_SCHED
160 : static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
161 : {
162 : hrtimer_cancel(&rt_b->rt_period_timer);
163 : }
164 :
165 : #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
166 :
167 : static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
168 : {
169 : #ifdef CONFIG_SCHED_DEBUG
170 : WARN_ON_ONCE(!rt_entity_is_task(rt_se));
171 : #endif
172 : return container_of(rt_se, struct task_struct, rt);
173 : }
174 :
175 : static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
176 : {
177 : return rt_rq->rq;
178 : }
179 :
180 : static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
181 : {
182 : return rt_se->rt_rq;
183 : }
184 :
185 : static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
186 : {
187 : struct rt_rq *rt_rq = rt_se->rt_rq;
188 :
189 : return rt_rq->rq;
190 : }
191 :
192 : void unregister_rt_sched_group(struct task_group *tg)
193 : {
194 : if (tg->rt_se)
195 : destroy_rt_bandwidth(&tg->rt_bandwidth);
196 :
197 : }
198 :
199 : void free_rt_sched_group(struct task_group *tg)
200 : {
201 : int i;
202 :
203 : for_each_possible_cpu(i) {
204 : if (tg->rt_rq)
205 : kfree(tg->rt_rq[i]);
206 : if (tg->rt_se)
207 : kfree(tg->rt_se[i]);
208 : }
209 :
210 : kfree(tg->rt_rq);
211 : kfree(tg->rt_se);
212 : }
213 :
214 : void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
215 : struct sched_rt_entity *rt_se, int cpu,
216 : struct sched_rt_entity *parent)
217 : {
218 : struct rq *rq = cpu_rq(cpu);
219 :
220 : rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
221 : rt_rq->rt_nr_boosted = 0;
222 : rt_rq->rq = rq;
223 : rt_rq->tg = tg;
224 :
225 : tg->rt_rq[cpu] = rt_rq;
226 : tg->rt_se[cpu] = rt_se;
227 :
228 : if (!rt_se)
229 : return;
230 :
231 : if (!parent)
232 : rt_se->rt_rq = &rq->rt;
233 : else
234 : rt_se->rt_rq = parent->my_q;
235 :
236 : rt_se->my_q = rt_rq;
237 : rt_se->parent = parent;
238 : INIT_LIST_HEAD(&rt_se->run_list);
239 : }
240 :
241 : int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
242 : {
243 : struct rt_rq *rt_rq;
244 : struct sched_rt_entity *rt_se;
245 : int i;
246 :
247 : tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
248 : if (!tg->rt_rq)
249 : goto err;
250 : tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
251 : if (!tg->rt_se)
252 : goto err;
253 :
254 : init_rt_bandwidth(&tg->rt_bandwidth,
255 : ktime_to_ns(def_rt_bandwidth.rt_period), 0);
256 :
257 : for_each_possible_cpu(i) {
258 : rt_rq = kzalloc_node(sizeof(struct rt_rq),
259 : GFP_KERNEL, cpu_to_node(i));
260 : if (!rt_rq)
261 : goto err;
262 :
263 : rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
264 : GFP_KERNEL, cpu_to_node(i));
265 : if (!rt_se)
266 : goto err_free_rq;
267 :
268 : init_rt_rq(rt_rq);
269 : rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
270 : init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
271 : }
272 :
273 : return 1;
274 :
275 : err_free_rq:
276 : kfree(rt_rq);
277 : err:
278 : return 0;
279 : }
280 :
281 : #else /* CONFIG_RT_GROUP_SCHED */
282 :
283 : #define rt_entity_is_task(rt_se) (1)
284 :
285 : static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
286 : {
287 0 : return container_of(rt_se, struct task_struct, rt);
288 : }
289 :
290 : static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
291 : {
292 0 : return container_of(rt_rq, struct rq, rt);
293 : }
294 :
295 : static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
296 : {
297 0 : struct task_struct *p = rt_task_of(rt_se);
298 :
299 0 : return task_rq(p);
300 : }
301 :
302 : static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
303 : {
304 0 : struct rq *rq = rq_of_rt_se(rt_se);
305 :
306 : return &rq->rt;
307 : }
308 :
309 0 : void unregister_rt_sched_group(struct task_group *tg) { }
310 :
311 0 : void free_rt_sched_group(struct task_group *tg) { }
312 :
313 0 : int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
314 : {
315 0 : return 1;
316 : }
317 : #endif /* CONFIG_RT_GROUP_SCHED */
318 :
319 : #ifdef CONFIG_SMP
320 :
321 : static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
322 : {
323 : /* Try to pull RT tasks here if we lower this rq's prio */
324 : return rq->online && rq->rt.highest_prio.curr > prev->prio;
325 : }
326 :
327 : static inline int rt_overloaded(struct rq *rq)
328 : {
329 : return atomic_read(&rq->rd->rto_count);
330 : }
331 :
332 : static inline void rt_set_overload(struct rq *rq)
333 : {
334 : if (!rq->online)
335 : return;
336 :
337 : cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
338 : /*
339 : * Make sure the mask is visible before we set
340 : * the overload count. That is checked to determine
341 : * if we should look at the mask. It would be a shame
342 : * if we looked at the mask, but the mask was not
343 : * updated yet.
344 : *
345 : * Matched by the barrier in pull_rt_task().
346 : */
347 : smp_wmb();
348 : atomic_inc(&rq->rd->rto_count);
349 : }
350 :
351 : static inline void rt_clear_overload(struct rq *rq)
352 : {
353 : if (!rq->online)
354 : return;
355 :
356 : /* the order here really doesn't matter */
357 : atomic_dec(&rq->rd->rto_count);
358 : cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
359 : }
360 :
361 : static void update_rt_migration(struct rt_rq *rt_rq)
362 : {
363 : if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
364 : if (!rt_rq->overloaded) {
365 : rt_set_overload(rq_of_rt_rq(rt_rq));
366 : rt_rq->overloaded = 1;
367 : }
368 : } else if (rt_rq->overloaded) {
369 : rt_clear_overload(rq_of_rt_rq(rt_rq));
370 : rt_rq->overloaded = 0;
371 : }
372 : }
373 :
374 : static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
375 : {
376 : struct task_struct *p;
377 :
378 : if (!rt_entity_is_task(rt_se))
379 : return;
380 :
381 : p = rt_task_of(rt_se);
382 : rt_rq = &rq_of_rt_rq(rt_rq)->rt;
383 :
384 : rt_rq->rt_nr_total++;
385 : if (p->nr_cpus_allowed > 1)
386 : rt_rq->rt_nr_migratory++;
387 :
388 : update_rt_migration(rt_rq);
389 : }
390 :
391 : static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
392 : {
393 : struct task_struct *p;
394 :
395 : if (!rt_entity_is_task(rt_se))
396 : return;
397 :
398 : p = rt_task_of(rt_se);
399 : rt_rq = &rq_of_rt_rq(rt_rq)->rt;
400 :
401 : rt_rq->rt_nr_total--;
402 : if (p->nr_cpus_allowed > 1)
403 : rt_rq->rt_nr_migratory--;
404 :
405 : update_rt_migration(rt_rq);
406 : }
407 :
408 : static inline int has_pushable_tasks(struct rq *rq)
409 : {
410 : return !plist_head_empty(&rq->rt.pushable_tasks);
411 : }
412 :
413 : static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
414 : static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);
415 :
416 : static void push_rt_tasks(struct rq *);
417 : static void pull_rt_task(struct rq *);
418 :
419 : static inline void rt_queue_push_tasks(struct rq *rq)
420 : {
421 : if (!has_pushable_tasks(rq))
422 : return;
423 :
424 : queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
425 : }
426 :
427 : static inline void rt_queue_pull_task(struct rq *rq)
428 : {
429 : queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
430 : }
431 :
432 : static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
433 : {
434 : plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
435 : plist_node_init(&p->pushable_tasks, p->prio);
436 : plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
437 :
438 : /* Update the highest prio pushable task */
439 : if (p->prio < rq->rt.highest_prio.next)
440 : rq->rt.highest_prio.next = p->prio;
441 : }
442 :
443 : static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
444 : {
445 : plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
446 :
447 : /* Update the new highest prio pushable task */
448 : if (has_pushable_tasks(rq)) {
449 : p = plist_first_entry(&rq->rt.pushable_tasks,
450 : struct task_struct, pushable_tasks);
451 : rq->rt.highest_prio.next = p->prio;
452 : } else {
453 : rq->rt.highest_prio.next = MAX_RT_PRIO-1;
454 : }
455 : }
456 :
457 : #else
458 :
459 : static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
460 : {
461 : }
462 :
463 : static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
464 : {
465 : }
466 :
467 : static inline
468 : void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
469 : {
470 : }
471 :
472 : static inline
473 : void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
474 : {
475 : }
476 :
477 : static inline void rt_queue_push_tasks(struct rq *rq)
478 : {
479 : }
480 : #endif /* CONFIG_SMP */
481 :
482 : static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
483 : static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
484 :
485 : static inline int on_rt_rq(struct sched_rt_entity *rt_se)
486 : {
487 0 : return rt_se->on_rq;
488 : }
489 :
490 : #ifdef CONFIG_UCLAMP_TASK
491 : /*
492 : * Verify the fitness of task @p to run on @cpu taking into account the uclamp
493 : * settings.
494 : *
495 : * This check is only important for heterogeneous systems where uclamp_min value
496 : * is higher than the capacity of a @cpu. For non-heterogeneous systems this
497 : * function will always return true.
498 : *
499 : * The function will return true if the capacity of the @cpu is >= the
500 : * uclamp_min and false otherwise.
501 : *
502 : * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
503 : * > uclamp_max.
504 : */
505 : static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
506 : {
507 : unsigned int min_cap;
508 : unsigned int max_cap;
509 : unsigned int cpu_cap;
510 :
511 : /* Only heterogeneous systems can benefit from this check */
512 : if (!sched_asym_cpucap_active())
513 : return true;
514 :
515 : min_cap = uclamp_eff_value(p, UCLAMP_MIN);
516 : max_cap = uclamp_eff_value(p, UCLAMP_MAX);
517 :
518 : cpu_cap = capacity_orig_of(cpu);
519 :
520 : return cpu_cap >= min(min_cap, max_cap);
521 : }
522 : #else
523 : static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
524 : {
525 : return true;
526 : }
527 : #endif
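/*
 * Worked example for the fitness check above (illustrative numbers): on a
 * big.LITTLE system a little CPU with capacity 512 and a task with
 * uclamp_min = 768, uclamp_max = 1024 gives min(768, 1024) = 768 > 512, so
 * rt_task_fits_capacity() returns false and the wakeup path will try to find
 * a bigger CPU for the task instead.
 */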
528 :
529 : #ifdef CONFIG_RT_GROUP_SCHED
530 :
531 : static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
532 : {
533 : if (!rt_rq->tg)
534 : return RUNTIME_INF;
535 :
536 : return rt_rq->rt_runtime;
537 : }
538 :
539 : static inline u64 sched_rt_period(struct rt_rq *rt_rq)
540 : {
541 : return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
542 : }
543 :
544 : typedef struct task_group *rt_rq_iter_t;
545 :
546 : static inline struct task_group *next_task_group(struct task_group *tg)
547 : {
548 : do {
549 : tg = list_entry_rcu(tg->list.next,
550 : typeof(struct task_group), list);
551 : } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
552 :
553 : if (&tg->list == &task_groups)
554 : tg = NULL;
555 :
556 : return tg;
557 : }
558 :
559 : #define for_each_rt_rq(rt_rq, iter, rq) \
560 : for (iter = container_of(&task_groups, typeof(*iter), list); \
561 : (iter = next_task_group(iter)) && \
562 : (rt_rq = iter->rt_rq[cpu_of(rq)]);)
563 :
564 : #define for_each_sched_rt_entity(rt_se) \
565 : for (; rt_se; rt_se = rt_se->parent)
566 :
567 : static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
568 : {
569 : return rt_se->my_q;
570 : }
571 :
572 : static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
573 : static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
574 :
575 : static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
576 : {
577 : struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
578 : struct rq *rq = rq_of_rt_rq(rt_rq);
579 : struct sched_rt_entity *rt_se;
580 :
581 : int cpu = cpu_of(rq);
582 :
583 : rt_se = rt_rq->tg->rt_se[cpu];
584 :
585 : if (rt_rq->rt_nr_running) {
586 : if (!rt_se)
587 : enqueue_top_rt_rq(rt_rq);
588 : else if (!on_rt_rq(rt_se))
589 : enqueue_rt_entity(rt_se, 0);
590 :
591 : if (rt_rq->highest_prio.curr < curr->prio)
592 : resched_curr(rq);
593 : }
594 : }
595 :
596 : static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
597 : {
598 : struct sched_rt_entity *rt_se;
599 : int cpu = cpu_of(rq_of_rt_rq(rt_rq));
600 :
601 : rt_se = rt_rq->tg->rt_se[cpu];
602 :
603 : if (!rt_se) {
604 : dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
605 : /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
606 : cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
607 : }
608 : else if (on_rt_rq(rt_se))
609 : dequeue_rt_entity(rt_se, 0);
610 : }
611 :
612 : static inline int rt_rq_throttled(struct rt_rq *rt_rq)
613 : {
614 : return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
615 : }
616 :
617 : static int rt_se_boosted(struct sched_rt_entity *rt_se)
618 : {
619 : struct rt_rq *rt_rq = group_rt_rq(rt_se);
620 : struct task_struct *p;
621 :
622 : if (rt_rq)
623 : return !!rt_rq->rt_nr_boosted;
624 :
625 : p = rt_task_of(rt_se);
626 : return p->prio != p->normal_prio;
627 : }
628 :
629 : #ifdef CONFIG_SMP
630 : static inline const struct cpumask *sched_rt_period_mask(void)
631 : {
632 : return this_rq()->rd->span;
633 : }
634 : #else
635 : static inline const struct cpumask *sched_rt_period_mask(void)
636 : {
637 : return cpu_online_mask;
638 : }
639 : #endif
640 :
641 : static inline
642 : struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
643 : {
644 : return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
645 : }
646 :
647 : static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
648 : {
649 : return &rt_rq->tg->rt_bandwidth;
650 : }
651 :
652 : #else /* !CONFIG_RT_GROUP_SCHED */
653 :
654 : static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
655 : {
656 : return rt_rq->rt_runtime;
657 : }
658 :
659 : static inline u64 sched_rt_period(struct rt_rq *rt_rq)
660 : {
661 0 : return ktime_to_ns(def_rt_bandwidth.rt_period);
662 : }
663 :
664 : typedef struct rt_rq *rt_rq_iter_t;
665 :
666 : #define for_each_rt_rq(rt_rq, iter, rq) \
667 : for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
668 :
669 : #define for_each_sched_rt_entity(rt_se) \
670 : for (; rt_se; rt_se = NULL)
671 :
672 : static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
673 : {
674 : return NULL;
675 : }
676 :
677 0 : static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
678 : {
679 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
680 :
681 0 : if (!rt_rq->rt_nr_running)
682 : return;
683 :
684 0 : enqueue_top_rt_rq(rt_rq);
685 0 : resched_curr(rq);
686 : }
687 :
688 : static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
689 : {
690 0 : dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
691 : }
692 :
693 : static inline int rt_rq_throttled(struct rt_rq *rt_rq)
694 : {
695 : return rt_rq->rt_throttled;
696 : }
697 :
698 : static inline const struct cpumask *sched_rt_period_mask(void)
699 : {
700 : return cpu_online_mask;
701 : }
702 :
703 : static inline
704 : struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
705 : {
706 0 : return &cpu_rq(cpu)->rt;
707 : }
708 :
709 : static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
710 : {
711 : return &def_rt_bandwidth;
712 : }
713 :
714 : #endif /* CONFIG_RT_GROUP_SCHED */
715 :
716 0 : bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
717 : {
718 0 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
719 :
720 0 : return (hrtimer_active(&rt_b->rt_period_timer) ||
721 0 : rt_rq->rt_time < rt_b->rt_runtime);
722 : }
723 :
724 : #ifdef CONFIG_SMP
725 : /*
726 : * We ran out of runtime, see if we can borrow some from our neighbours.
727 : */
728 : static void do_balance_runtime(struct rt_rq *rt_rq)
729 : {
730 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
731 : struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
732 : int i, weight;
733 : u64 rt_period;
734 :
735 : weight = cpumask_weight(rd->span);
736 :
737 : raw_spin_lock(&rt_b->rt_runtime_lock);
738 : rt_period = ktime_to_ns(rt_b->rt_period);
739 : for_each_cpu(i, rd->span) {
740 : struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
741 : s64 diff;
742 :
743 : if (iter == rt_rq)
744 : continue;
745 :
746 : raw_spin_lock(&iter->rt_runtime_lock);
747 : /*
748 : * Either all rqs have inf runtime and there's nothing to steal
749 : * or __disable_runtime() below sets a specific rq to inf to
750 : * indicate it's been disabled and disallow stealing.
751 : */
752 : if (iter->rt_runtime == RUNTIME_INF)
753 : goto next;
754 :
755 : /*
756 : * From runqueues with spare time, take 1/n part of their
757 : * spare time, but no more than our period.
758 : */
759 : diff = iter->rt_runtime - iter->rt_time;
760 : if (diff > 0) {
761 : diff = div_u64((u64)diff, weight);
762 : if (rt_rq->rt_runtime + diff > rt_period)
763 : diff = rt_period - rt_rq->rt_runtime;
764 : iter->rt_runtime -= diff;
765 : rt_rq->rt_runtime += diff;
766 : if (rt_rq->rt_runtime == rt_period) {
767 : raw_spin_unlock(&iter->rt_runtime_lock);
768 : break;
769 : }
770 : }
771 : next:
772 : raw_spin_unlock(&iter->rt_runtime_lock);
773 : }
774 : raw_spin_unlock(&rt_b->rt_runtime_lock);
775 : }
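/*
 * Worked example for do_balance_runtime() (illustrative numbers): on a
 * 4-CPU root domain (weight == 4) with a 1000 ms period, a neighbour that
 * still has 400 ms of unused runtime donates diff = 400 / 4 = 100 ms to the
 * depleted rt_rq, clamped so that the receiver never exceeds a full period;
 * the loop stops early once rt_runtime reaches rt_period.
 */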
776 :
777 : /*
778 : * Ensure this RQ takes back all the runtime it lent to its neighbours.
779 : */
780 : static void __disable_runtime(struct rq *rq)
781 : {
782 : struct root_domain *rd = rq->rd;
783 : rt_rq_iter_t iter;
784 : struct rt_rq *rt_rq;
785 :
786 : if (unlikely(!scheduler_running))
787 : return;
788 :
789 : for_each_rt_rq(rt_rq, iter, rq) {
790 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
791 : s64 want;
792 : int i;
793 :
794 : raw_spin_lock(&rt_b->rt_runtime_lock);
795 : raw_spin_lock(&rt_rq->rt_runtime_lock);
796 : /*
797 : * Either we're all inf and nobody needs to borrow, or we're
798 : * already disabled and thus have nothing to do, or we have
799 : * exactly the right amount of runtime to take out.
800 : */
801 : if (rt_rq->rt_runtime == RUNTIME_INF ||
802 : rt_rq->rt_runtime == rt_b->rt_runtime)
803 : goto balanced;
804 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
805 :
806 : /*
807 : * Calculate the difference between what we started out with
808 : * and what we currently have; that's the amount of runtime
809 : * we lent out and now have to reclaim.
810 : */
811 : want = rt_b->rt_runtime - rt_rq->rt_runtime;
812 :
813 : /*
814 : * Greedy reclaim, take back as much as we can.
815 : */
816 : for_each_cpu(i, rd->span) {
817 : struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
818 : s64 diff;
819 :
820 : /*
821 : * Can't reclaim from ourselves or disabled runqueues.
822 : */
823 : if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
824 : continue;
825 :
826 : raw_spin_lock(&iter->rt_runtime_lock);
827 : if (want > 0) {
828 : diff = min_t(s64, iter->rt_runtime, want);
829 : iter->rt_runtime -= diff;
830 : want -= diff;
831 : } else {
832 : iter->rt_runtime -= want;
833 : want -= want;
834 : }
835 : raw_spin_unlock(&iter->rt_runtime_lock);
836 :
837 : if (!want)
838 : break;
839 : }
840 :
841 : raw_spin_lock(&rt_rq->rt_runtime_lock);
842 : /*
843 : * We cannot be left wanting - that would mean some runtime
844 : * leaked out of the system.
845 : */
846 : WARN_ON_ONCE(want);
847 : balanced:
848 : /*
849 : * Disable all the borrow logic by pretending we have inf
850 : * runtime - in which case borrowing doesn't make sense.
851 : */
852 : rt_rq->rt_runtime = RUNTIME_INF;
853 : rt_rq->rt_throttled = 0;
854 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
855 : raw_spin_unlock(&rt_b->rt_runtime_lock);
856 :
857 : /* Make rt_rq available for pick_next_task() */
858 : sched_rt_rq_enqueue(rt_rq);
859 : }
860 : }
861 :
862 : static void __enable_runtime(struct rq *rq)
863 : {
864 : rt_rq_iter_t iter;
865 : struct rt_rq *rt_rq;
866 :
867 : if (unlikely(!scheduler_running))
868 : return;
869 :
870 : /*
871 : * Reset each runqueue's bandwidth settings
872 : */
873 : for_each_rt_rq(rt_rq, iter, rq) {
874 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
875 :
876 : raw_spin_lock(&rt_b->rt_runtime_lock);
877 : raw_spin_lock(&rt_rq->rt_runtime_lock);
878 : rt_rq->rt_runtime = rt_b->rt_runtime;
879 : rt_rq->rt_time = 0;
880 : rt_rq->rt_throttled = 0;
881 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
882 : raw_spin_unlock(&rt_b->rt_runtime_lock);
883 : }
884 : }
885 :
886 : static void balance_runtime(struct rt_rq *rt_rq)
887 : {
888 : if (!sched_feat(RT_RUNTIME_SHARE))
889 : return;
890 :
891 : if (rt_rq->rt_time > rt_rq->rt_runtime) {
892 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
893 : do_balance_runtime(rt_rq);
894 : raw_spin_lock(&rt_rq->rt_runtime_lock);
895 : }
896 : }
897 : #else /* !CONFIG_SMP */
898 : static inline void balance_runtime(struct rt_rq *rt_rq) {}
899 : #endif /* CONFIG_SMP */
900 :
901 0 : static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
902 : {
903 0 : int i, idle = 1, throttled = 0;
904 : const struct cpumask *span;
905 :
906 0 : span = sched_rt_period_mask();
907 : #ifdef CONFIG_RT_GROUP_SCHED
908 : /*
909 : * FIXME: isolated CPUs should really leave the root task group,
910 : * whether they are isolcpus or were isolated via cpusets, lest
911 : * the timer run on a CPU which does not service all runqueues,
912 : * potentially leaving other CPUs indefinitely throttled. If
913 : * isolation is really required, the user will turn the throttle
914 : * off to kill the perturbations it causes anyway. Meanwhile,
915 : * this maintains functionality for boot and/or troubleshooting.
916 : */
917 : if (rt_b == &root_task_group.rt_bandwidth)
918 : span = cpu_online_mask;
919 : #endif
920 0 : for_each_cpu(i, span) {
921 0 : int enqueue = 0;
922 0 : struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
923 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
924 : struct rq_flags rf;
925 : int skip;
926 :
927 : /*
928 : * When span == cpu_online_mask, taking each rq->lock
929 : * can be time-consuming. Try to avoid it when possible.
930 : */
931 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
932 0 : if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
933 0 : rt_rq->rt_runtime = rt_b->rt_runtime;
934 0 : skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
935 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
936 0 : if (skip)
937 0 : continue;
938 :
939 0 : rq_lock(rq, &rf);
940 0 : update_rq_clock(rq);
941 :
942 0 : if (rt_rq->rt_time) {
943 : u64 runtime;
944 :
945 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
946 0 : if (rt_rq->rt_throttled)
947 : balance_runtime(rt_rq);
948 0 : runtime = rt_rq->rt_runtime;
949 0 : rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
950 0 : if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
951 0 : rt_rq->rt_throttled = 0;
952 0 : enqueue = 1;
953 :
954 : /*
955 : * When we're idle and a woken (rt) task is
956 : * throttled, check_preempt_curr() will set
957 : * skip_update and the time between the wakeup
958 : * and this unthrottle will get accounted as
959 : * 'runtime'.
960 : */
961 0 : if (rt_rq->rt_nr_running && rq->curr == rq->idle)
962 : rq_clock_cancel_skipupdate(rq);
963 : }
964 0 : if (rt_rq->rt_time || rt_rq->rt_nr_running)
965 0 : idle = 0;
966 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
967 0 : } else if (rt_rq->rt_nr_running) {
968 0 : idle = 0;
969 0 : if (!rt_rq_throttled(rt_rq))
970 0 : enqueue = 1;
971 : }
972 0 : if (rt_rq->rt_throttled)
973 0 : throttled = 1;
974 :
975 0 : if (enqueue)
976 0 : sched_rt_rq_enqueue(rt_rq);
977 0 : rq_unlock(rq, &rf);
978 : }
979 :
980 0 : if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
981 : return 1;
982 :
983 : return idle;
984 : }
985 :
986 : static inline int rt_se_prio(struct sched_rt_entity *rt_se)
987 : {
988 : #ifdef CONFIG_RT_GROUP_SCHED
989 : struct rt_rq *rt_rq = group_rt_rq(rt_se);
990 :
991 : if (rt_rq)
992 : return rt_rq->highest_prio.curr;
993 : #endif
994 :
995 0 : return rt_task_of(rt_se)->prio;
996 : }
997 :
998 0 : static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
999 : {
1000 0 : u64 runtime = sched_rt_runtime(rt_rq);
1001 :
1002 0 : if (rt_rq->rt_throttled)
1003 : return rt_rq_throttled(rt_rq);
1004 :
1005 0 : if (runtime >= sched_rt_period(rt_rq))
1006 : return 0;
1007 :
1008 0 : balance_runtime(rt_rq);
1009 0 : runtime = sched_rt_runtime(rt_rq);
1010 0 : if (runtime == RUNTIME_INF)
1011 : return 0;
1012 :
1013 0 : if (rt_rq->rt_time > runtime) {
1014 0 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
1015 :
1016 : /*
1017 : * Don't actually throttle groups that have no runtime assigned
1018 : * but accrue some time due to boosting.
1019 : */
1020 0 : if (likely(rt_b->rt_runtime)) {
1021 0 : rt_rq->rt_throttled = 1;
1022 0 : printk_deferred_once("sched: RT throttling activated\n");
1023 : } else {
1024 : /*
1025 : * In case we did anyway, make it go away,
1026 : * replenishment is a joke, since it will replenish us
1027 : * with exactly 0 ns.
1028 : */
1029 0 : rt_rq->rt_time = 0;
1030 : }
1031 :
1032 0 : if (rt_rq_throttled(rt_rq)) {
1033 0 : sched_rt_rq_dequeue(rt_rq);
1034 0 : return 1;
1035 : }
1036 : }
1037 :
1038 : return 0;
1039 : }
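/*
 * Worked example for the throttle above (default settings, RT_RUNTIME_SHARE
 * off): a SCHED_FIFO task that spins for a full second pushes rt_time past
 * the 950 ms budget, the rt_rq is marked throttled and dequeued, and
 * "sched: RT throttling activated" is printed once. do_sched_rt_period_timer()
 * clears the throttle at the next period boundary, leaving non-RT tasks
 * roughly 50 ms out of every second.
 */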
1040 :
1041 : /*
1042 : * Update the current task's runtime statistics. Skip current tasks that
1043 : * are not in our scheduling class.
1044 : */
1045 0 : static void update_curr_rt(struct rq *rq)
1046 : {
1047 0 : struct task_struct *curr = rq->curr;
1048 0 : struct sched_rt_entity *rt_se = &curr->rt;
1049 : u64 delta_exec;
1050 : u64 now;
1051 :
1052 0 : if (curr->sched_class != &rt_sched_class)
1053 : return;
1054 :
1055 0 : now = rq_clock_task(rq);
1056 0 : delta_exec = now - curr->se.exec_start;
1057 0 : if (unlikely((s64)delta_exec <= 0))
1058 : return;
1059 :
1060 : schedstat_set(curr->stats.exec_max,
1061 : max(curr->stats.exec_max, delta_exec));
1062 :
1063 0 : trace_sched_stat_runtime(curr, delta_exec, 0);
1064 :
1065 0 : update_current_exec_runtime(curr, now, delta_exec);
1066 :
1067 0 : if (!rt_bandwidth_enabled())
1068 : return;
1069 :
1070 0 : for_each_sched_rt_entity(rt_se) {
1071 0 : struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1072 : int exceeded;
1073 :
1074 0 : if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1075 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
1076 0 : rt_rq->rt_time += delta_exec;
1077 0 : exceeded = sched_rt_runtime_exceeded(rt_rq);
1078 0 : if (exceeded)
1079 0 : resched_curr(rq);
1080 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
1081 0 : if (exceeded)
1082 0 : do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
1083 : }
1084 : }
1085 : }
1086 :
1087 : static void
1088 0 : dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
1089 : {
1090 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
1091 :
1092 0 : BUG_ON(&rq->rt != rt_rq);
1093 :
1094 0 : if (!rt_rq->rt_queued)
1095 : return;
1096 :
1097 0 : BUG_ON(!rq->nr_running);
1098 :
1099 0 : sub_nr_running(rq, count);
1100 0 : rt_rq->rt_queued = 0;
1101 :
1102 : }
1103 :
1104 : static void
1105 0 : enqueue_top_rt_rq(struct rt_rq *rt_rq)
1106 : {
1107 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
1108 :
1109 0 : BUG_ON(&rq->rt != rt_rq);
1110 :
1111 0 : if (rt_rq->rt_queued)
1112 : return;
1113 :
1114 0 : if (rt_rq_throttled(rt_rq))
1115 : return;
1116 :
1117 0 : if (rt_rq->rt_nr_running) {
1118 0 : add_nr_running(rq, rt_rq->rt_nr_running);
1119 0 : rt_rq->rt_queued = 1;
1120 : }
1121 :
1122 : /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1123 : cpufreq_update_util(rq, 0);
1124 : }
1125 :
1126 : #if defined CONFIG_SMP
1127 :
1128 : static void
1129 : inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1130 : {
1131 : struct rq *rq = rq_of_rt_rq(rt_rq);
1132 :
1133 : #ifdef CONFIG_RT_GROUP_SCHED
1134 : /*
1135 : * Change rq's cpupri only if rt_rq is the top queue.
1136 : */
1137 : if (&rq->rt != rt_rq)
1138 : return;
1139 : #endif
1140 : if (rq->online && prio < prev_prio)
1141 : cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1142 : }
1143 :
1144 : static void
1145 : dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1146 : {
1147 : struct rq *rq = rq_of_rt_rq(rt_rq);
1148 :
1149 : #ifdef CONFIG_RT_GROUP_SCHED
1150 : /*
1151 : * Change rq's cpupri only if rt_rq is the top queue.
1152 : */
1153 : if (&rq->rt != rt_rq)
1154 : return;
1155 : #endif
1156 : if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1157 : cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1158 : }
1159 :
1160 : #else /* CONFIG_SMP */
1161 :
1162 : static inline
1163 : void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1164 : static inline
1165 : void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1166 :
1167 : #endif /* CONFIG_SMP */
1168 :
1169 : #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1170 : static void
1171 : inc_rt_prio(struct rt_rq *rt_rq, int prio)
1172 : {
1173 : int prev_prio = rt_rq->highest_prio.curr;
1174 :
1175 : if (prio < prev_prio)
1176 : rt_rq->highest_prio.curr = prio;
1177 :
1178 : inc_rt_prio_smp(rt_rq, prio, prev_prio);
1179 : }
1180 :
1181 : static void
1182 : dec_rt_prio(struct rt_rq *rt_rq, int prio)
1183 : {
1184 : int prev_prio = rt_rq->highest_prio.curr;
1185 :
1186 : if (rt_rq->rt_nr_running) {
1187 :
1188 : WARN_ON(prio < prev_prio);
1189 :
1190 : /*
1191 : * This may have been our highest task, and therefore
1192 : * we may have some recomputation to do
1193 : */
1194 : if (prio == prev_prio) {
1195 : struct rt_prio_array *array = &rt_rq->active;
1196 :
1197 : rt_rq->highest_prio.curr =
1198 : sched_find_first_bit(array->bitmap);
1199 : }
1200 :
1201 : } else {
1202 : rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
1203 : }
1204 :
1205 : dec_rt_prio_smp(rt_rq, prio, prev_prio);
1206 : }
1207 :
1208 : #else
1209 :
1210 : static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1211 : static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1212 :
1213 : #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1214 :
1215 : #ifdef CONFIG_RT_GROUP_SCHED
1216 :
1217 : static void
1218 : inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1219 : {
1220 : if (rt_se_boosted(rt_se))
1221 : rt_rq->rt_nr_boosted++;
1222 :
1223 : if (rt_rq->tg)
1224 : start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1225 : }
1226 :
1227 : static void
1228 : dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1229 : {
1230 : if (rt_se_boosted(rt_se))
1231 : rt_rq->rt_nr_boosted--;
1232 :
1233 : WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1234 : }
1235 :
1236 : #else /* CONFIG_RT_GROUP_SCHED */
1237 :
1238 : static void
1239 : inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1240 : {
1241 0 : start_rt_bandwidth(&def_rt_bandwidth);
1242 : }
1243 :
1244 : static inline
1245 : void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1246 :
1247 : #endif /* CONFIG_RT_GROUP_SCHED */
1248 :
1249 : static inline
1250 : unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1251 : {
1252 0 : struct rt_rq *group_rq = group_rt_rq(rt_se);
1253 :
1254 : if (group_rq)
1255 : return group_rq->rt_nr_running;
1256 : else
1257 : return 1;
1258 : }
1259 :
1260 : static inline
1261 : unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1262 : {
1263 0 : struct rt_rq *group_rq = group_rt_rq(rt_se);
1264 : struct task_struct *tsk;
1265 :
1266 : if (group_rq)
1267 : return group_rq->rr_nr_running;
1268 :
1269 0 : tsk = rt_task_of(rt_se);
1270 :
1271 0 : return (tsk->policy == SCHED_RR) ? 1 : 0;
1272 : }
1273 :
1274 : static inline
1275 0 : void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1276 : {
1277 0 : int prio = rt_se_prio(rt_se);
1278 :
1279 0 : WARN_ON(!rt_prio(prio));
1280 0 : rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1281 0 : rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1282 :
1283 0 : inc_rt_prio(rt_rq, prio);
1284 0 : inc_rt_migration(rt_se, rt_rq);
1285 0 : inc_rt_group(rt_se, rt_rq);
1286 0 : }
1287 :
1288 : static inline
1289 0 : void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1290 : {
1291 0 : WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1292 0 : WARN_ON(!rt_rq->rt_nr_running);
1293 0 : rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1294 0 : rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1295 :
1296 0 : dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1297 0 : dec_rt_migration(rt_se, rt_rq);
1298 0 : dec_rt_group(rt_se, rt_rq);
1299 0 : }
1300 :
1301 : /*
1302 : * Change rt_se->run_list location unless SAVE && !MOVE
1303 : *
1304 : * assumes ENQUEUE/DEQUEUE flags match
1305 : */
1306 : static inline bool move_entity(unsigned int flags)
1307 : {
1308 0 : if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1309 : return false;
1310 :
1311 : return true;
1312 : }
1313 :
1314 : static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1315 : {
1316 0 : list_del_init(&rt_se->run_list);
1317 :
1318 0 : if (list_empty(array->queue + rt_se_prio(rt_se)))
1319 0 : __clear_bit(rt_se_prio(rt_se), array->bitmap);
1320 :
1321 0 : rt_se->on_list = 0;
1322 : }
1323 :
1324 : static inline struct sched_statistics *
1325 : __schedstats_from_rt_se(struct sched_rt_entity *rt_se)
1326 : {
1327 : #ifdef CONFIG_RT_GROUP_SCHED
1328 : /* schedstats is not supported for rt group. */
1329 : if (!rt_entity_is_task(rt_se))
1330 : return NULL;
1331 : #endif
1332 :
1333 : return &rt_task_of(rt_se)->stats;
1334 : }
1335 :
1336 : static inline void
1337 : update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1338 : {
1339 : struct sched_statistics *stats;
1340 0 : struct task_struct *p = NULL;
1341 :
1342 : if (!schedstat_enabled())
1343 : return;
1344 :
1345 : if (rt_entity_is_task(rt_se))
1346 : p = rt_task_of(rt_se);
1347 :
1348 : stats = __schedstats_from_rt_se(rt_se);
1349 : if (!stats)
1350 : return;
1351 :
1352 : __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
1353 : }
1354 :
1355 : static inline void
1356 : update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1357 : {
1358 : struct sched_statistics *stats;
1359 : struct task_struct *p = NULL;
1360 :
1361 : if (!schedstat_enabled())
1362 : return;
1363 :
1364 : if (rt_entity_is_task(rt_se))
1365 : p = rt_task_of(rt_se);
1366 :
1367 : stats = __schedstats_from_rt_se(rt_se);
1368 : if (!stats)
1369 : return;
1370 :
1371 : __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
1372 : }
1373 :
1374 : static inline void
1375 : update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
1376 : int flags)
1377 : {
1378 : if (!schedstat_enabled())
1379 : return;
1380 :
1381 : if (flags & ENQUEUE_WAKEUP)
1382 : update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
1383 : }
1384 :
1385 : static inline void
1386 : update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1387 : {
1388 : struct sched_statistics *stats;
1389 : struct task_struct *p = NULL;
1390 :
1391 : if (!schedstat_enabled())
1392 : return;
1393 :
1394 : if (rt_entity_is_task(rt_se))
1395 : p = rt_task_of(rt_se);
1396 :
1397 : stats = __schedstats_from_rt_se(rt_se);
1398 : if (!stats)
1399 : return;
1400 :
1401 : __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
1402 : }
1403 :
1404 : static inline void
1405 : update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
1406 : int flags)
1407 : {
1408 0 : struct task_struct *p = NULL;
1409 :
1410 : if (!schedstat_enabled())
1411 : return;
1412 :
1413 : if (rt_entity_is_task(rt_se))
1414 : p = rt_task_of(rt_se);
1415 :
1416 : if ((flags & DEQUEUE_SLEEP) && p) {
1417 : unsigned int state;
1418 :
1419 : state = READ_ONCE(p->__state);
1420 : if (state & TASK_INTERRUPTIBLE)
1421 : __schedstat_set(p->stats.sleep_start,
1422 : rq_clock(rq_of_rt_rq(rt_rq)));
1423 :
1424 : if (state & TASK_UNINTERRUPTIBLE)
1425 : __schedstat_set(p->stats.block_start,
1426 : rq_clock(rq_of_rt_rq(rt_rq)));
1427 : }
1428 : }
1429 :
1430 0 : static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1431 : {
1432 0 : struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1433 0 : struct rt_prio_array *array = &rt_rq->active;
1434 0 : struct rt_rq *group_rq = group_rt_rq(rt_se);
1435 0 : struct list_head *queue = array->queue + rt_se_prio(rt_se);
1436 :
1437 : /*
1438 : * Don't enqueue the group if it's throttled, or when empty.
1439 : * The latter is a consequence of the former when a child group
1440 : * gets throttled and the current group doesn't have any other
1441 : * active members.
1442 : */
1443 : if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1444 : if (rt_se->on_list)
1445 : __delist_rt_entity(rt_se, array);
1446 : return;
1447 : }
1448 :
1449 0 : if (move_entity(flags)) {
1450 0 : WARN_ON_ONCE(rt_se->on_list);
1451 0 : if (flags & ENQUEUE_HEAD)
1452 0 : list_add(&rt_se->run_list, queue);
1453 : else
1454 0 : list_add_tail(&rt_se->run_list, queue);
1455 :
1456 0 : __set_bit(rt_se_prio(rt_se), array->bitmap);
1457 0 : rt_se->on_list = 1;
1458 : }
1459 0 : rt_se->on_rq = 1;
1460 :
1461 0 : inc_rt_tasks(rt_se, rt_rq);
1462 : }
1463 :
1464 0 : static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1465 : {
1466 0 : struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1467 0 : struct rt_prio_array *array = &rt_rq->active;
1468 :
1469 0 : if (move_entity(flags)) {
1470 0 : WARN_ON_ONCE(!rt_se->on_list);
1471 : __delist_rt_entity(rt_se, array);
1472 : }
1473 0 : rt_se->on_rq = 0;
1474 :
1475 0 : dec_rt_tasks(rt_se, rt_rq);
1476 0 : }
1477 :
1478 : /*
1479 : * Because the prio of an upper entry depends on the lower
1480 : * entries, we must remove entries top-down.
1481 : */
1482 0 : static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1483 : {
1484 0 : struct sched_rt_entity *back = NULL;
1485 : unsigned int rt_nr_running;
1486 :
1487 0 : for_each_sched_rt_entity(rt_se) {
1488 0 : rt_se->back = back;
1489 0 : back = rt_se;
1490 : }
1491 :
1492 0 : rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
1493 :
1494 0 : for (rt_se = back; rt_se; rt_se = rt_se->back) {
1495 0 : if (on_rt_rq(rt_se))
1496 0 : __dequeue_rt_entity(rt_se, flags);
1497 : }
1498 :
1499 0 : dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
1500 0 : }
1501 :
1502 0 : static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1503 : {
1504 0 : struct rq *rq = rq_of_rt_se(rt_se);
1505 :
1506 0 : update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
1507 :
1508 0 : dequeue_rt_stack(rt_se, flags);
1509 0 : for_each_sched_rt_entity(rt_se)
1510 0 : __enqueue_rt_entity(rt_se, flags);
1511 0 : enqueue_top_rt_rq(&rq->rt);
1512 0 : }
1513 :
1514 : static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1515 : {
1516 0 : struct rq *rq = rq_of_rt_se(rt_se);
1517 :
1518 0 : update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
1519 :
1520 0 : dequeue_rt_stack(rt_se, flags);
1521 :
1522 0 : for_each_sched_rt_entity(rt_se) {
1523 : struct rt_rq *rt_rq = group_rt_rq(rt_se);
1524 :
1525 : if (rt_rq && rt_rq->rt_nr_running)
1526 : __enqueue_rt_entity(rt_se, flags);
1527 : }
1528 0 : enqueue_top_rt_rq(&rq->rt);
1529 : }
1530 :
1531 : /*
1532 : * Adding/removing a task to/from a priority array:
1533 : */
1534 : static void
1535 0 : enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1536 : {
1537 0 : struct sched_rt_entity *rt_se = &p->rt;
1538 :
1539 0 : if (flags & ENQUEUE_WAKEUP)
1540 0 : rt_se->timeout = 0;
1541 :
1542 : check_schedstat_required();
1543 0 : update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
1544 :
1545 0 : enqueue_rt_entity(rt_se, flags);
1546 :
1547 0 : if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1548 : enqueue_pushable_task(rq, p);
1549 0 : }
1550 :
1551 0 : static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1552 : {
1553 0 : struct sched_rt_entity *rt_se = &p->rt;
1554 :
1555 0 : update_curr_rt(rq);
1556 0 : dequeue_rt_entity(rt_se, flags);
1557 :
1558 0 : dequeue_pushable_task(rq, p);
1559 0 : }
1560 :
1561 : /*
1562 : * Put task to the head or the end of the run list without the overhead of
1563 : * dequeue followed by enqueue.
1564 : */
1565 : static void
1566 0 : requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1567 : {
1568 0 : if (on_rt_rq(rt_se)) {
1569 0 : struct rt_prio_array *array = &rt_rq->active;
1570 0 : struct list_head *queue = array->queue + rt_se_prio(rt_se);
1571 :
1572 0 : if (head)
1573 0 : list_move(&rt_se->run_list, queue);
1574 : else
1575 0 : list_move_tail(&rt_se->run_list, queue);
1576 : }
1577 0 : }
1578 :
1579 : static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1580 : {
1581 0 : struct sched_rt_entity *rt_se = &p->rt;
1582 : struct rt_rq *rt_rq;
1583 :
1584 0 : for_each_sched_rt_entity(rt_se) {
1585 0 : rt_rq = rt_rq_of_se(rt_se);
1586 0 : requeue_rt_entity(rt_rq, rt_se, head);
1587 : }
1588 : }
1589 :
1590 0 : static void yield_task_rt(struct rq *rq)
1591 : {
1592 0 : requeue_task_rt(rq, rq->curr, 0);
1593 0 : }
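/*
 * Note on the semantics above: sched_yield() from an RT task lands here and
 * requeues current at the tail of its own priority list (head == 0), so it
 * only yields the CPU to runnable tasks of the same priority; higher-priority
 * tasks would already have preempted it and lower-priority ones still will
 * not run.
 */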
1594 :
1595 : #ifdef CONFIG_SMP
1596 : static int find_lowest_rq(struct task_struct *task);
1597 :
1598 : static int
1599 : select_task_rq_rt(struct task_struct *p, int cpu, int flags)
1600 : {
1601 : struct task_struct *curr;
1602 : struct rq *rq;
1603 : bool test;
1604 :
1605 : /* For anything but wake ups, just return the task_cpu */
1606 : if (!(flags & (WF_TTWU | WF_FORK)))
1607 : goto out;
1608 :
1609 : rq = cpu_rq(cpu);
1610 :
1611 : rcu_read_lock();
1612 : curr = READ_ONCE(rq->curr); /* unlocked access */
1613 :
1614 : /*
1615 : * If the current task on @p's runqueue is an RT task, then
1616 : * try to see if we can wake this RT task up on another
1617 : * runqueue. Otherwise simply start this RT task
1618 : * on its current runqueue.
1619 : *
1620 : * We want to avoid overloading runqueues. If the woken
1621 : * task is a higher priority, then it will stay on this CPU
1622 : * and the lower prio task should be moved to another CPU.
1623 : * Even though this will probably make the lower prio task
1624 : * lose its cache, we do not want to bounce a higher task
1625 : * around just because it gave up its CPU, perhaps for a
1626 : * lock?
1627 : *
1628 : * For equal prio tasks, we just let the scheduler sort it out.
1629 : *
1630 : * Otherwise, just let it ride on the affined RQ and the
1631 : * post-schedule router will push the preempted task away
1632 : *
1633 : * This test is optimistic, if we get it wrong the load-balancer
1634 : * will have to sort it out.
1635 : *
1636 : * We take into account the capacity of the CPU to ensure it fits the
1637 : * requirement of the task - which is only important on heterogeneous
1638 : * systems like big.LITTLE.
1639 : */
1640 : test = curr &&
1641 : unlikely(rt_task(curr)) &&
1642 : (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
1643 :
1644 : if (test || !rt_task_fits_capacity(p, cpu)) {
1645 : int target = find_lowest_rq(p);
1646 :
1647 : /*
1648 : * Bail out if we were forcing a migration to find a better
1649 : * fitting CPU but our search failed.
1650 : */
1651 : if (!test && target != -1 && !rt_task_fits_capacity(p, target))
1652 : goto out_unlock;
1653 :
1654 : /*
1655 : * Don't bother moving it if the destination CPU is
1656 : * not running a lower priority task.
1657 : */
1658 : if (target != -1 &&
1659 : p->prio < cpu_rq(target)->rt.highest_prio.curr)
1660 : cpu = target;
1661 : }
1662 :
1663 : out_unlock:
1664 : rcu_read_unlock();
1665 :
1666 : out:
1667 : return cpu;
1668 : }
1669 :
1670 : static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1671 : {
1672 : /*
1673 : * Current can't be migrated, useless to reschedule,
1674 : * let's hope p can move out.
1675 : */
1676 : if (rq->curr->nr_cpus_allowed == 1 ||
1677 : !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1678 : return;
1679 :
1680 : /*
1681 : * p is migratable, so let's not schedule it and
1682 : * see if it is pushed or pulled somewhere else.
1683 : */
1684 : if (p->nr_cpus_allowed != 1 &&
1685 : cpupri_find(&rq->rd->cpupri, p, NULL))
1686 : return;
1687 :
1688 : /*
1689 : * There appear to be other CPUs that can accept
1690 : * the current task but none can run 'p', so let's reschedule
1691 : * to try and push the current task away:
1692 : */
1693 : requeue_task_rt(rq, p, 1);
1694 : resched_curr(rq);
1695 : }
1696 :
1697 : static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1698 : {
1699 : if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1700 : /*
1701 : * This is OK, because current is on_cpu, which avoids it being
1702 : * picked for load-balance and preemption/IRQs are still
1703 : * disabled avoiding further scheduler activity on it and we've
1704 : * not yet started the picking loop.
1705 : */
1706 : rq_unpin_lock(rq, rf);
1707 : pull_rt_task(rq);
1708 : rq_repin_lock(rq, rf);
1709 : }
1710 :
1711 : return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
1712 : }
1713 : #endif /* CONFIG_SMP */
1714 :
1715 : /*
1716 : * Preempt the current task with a newly woken task if needed:
1717 : */
1718 0 : static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1719 : {
1720 0 : if (p->prio < rq->curr->prio) {
1721 0 : resched_curr(rq);
1722 0 : return;
1723 : }
1724 :
1725 : #ifdef CONFIG_SMP
1726 : /*
1727 : * If:
1728 : *
1729 : * - the newly woken task is of equal priority to the current task
1730 : * - the newly woken task is non-migratable while current is migratable
1731 : * - current will be preempted on the next reschedule
1732 : *
1733 : * we should check to see if current can readily move to a different
1734 : * cpu. If so, we will reschedule to allow the push logic to try
1735 : * to move current somewhere else, making room for our non-migratable
1736 : * task.
1737 : */
1738 : if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1739 : check_preempt_equal_prio(rq, p);
1740 : #endif
1741 : }
1742 :
1743 0 : static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
1744 : {
1745 0 : struct sched_rt_entity *rt_se = &p->rt;
1746 0 : struct rt_rq *rt_rq = &rq->rt;
1747 :
1748 0 : p->se.exec_start = rq_clock_task(rq);
1749 0 : if (on_rt_rq(&p->rt))
1750 : update_stats_wait_end_rt(rt_rq, rt_se);
1751 :
1752 : /* The running task is never eligible for pushing */
1753 0 : dequeue_pushable_task(rq, p);
1754 :
1755 0 : if (!first)
1756 : return;
1757 :
1758 : /*
1759 : * If prev task was rt, put_prev_task() has already updated the
1760 : * utilization. We only care about the case where we start to schedule an
1761 : * rt task
1762 : */
1763 0 : if (rq->curr->sched_class != &rt_sched_class)
1764 0 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1765 :
1766 : rt_queue_push_tasks(rq);
1767 : }
1768 :
1769 0 : static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
1770 : {
1771 0 : struct rt_prio_array *array = &rt_rq->active;
1772 0 : struct sched_rt_entity *next = NULL;
1773 : struct list_head *queue;
1774 : int idx;
1775 :
1776 0 : idx = sched_find_first_bit(array->bitmap);
1777 0 : BUG_ON(idx >= MAX_RT_PRIO);
1778 :
1779 0 : queue = array->queue + idx;
1780 0 : if (SCHED_WARN_ON(list_empty(queue)))
1781 : return NULL;
1782 0 : next = list_entry(queue->next, struct sched_rt_entity, run_list);
1783 :
1784 0 : return next;
1785 : }
1786 :
1787 : static struct task_struct *_pick_next_task_rt(struct rq *rq)
1788 : {
1789 : struct sched_rt_entity *rt_se;
1790 0 : struct rt_rq *rt_rq = &rq->rt;
1791 :
1792 : do {
1793 0 : rt_se = pick_next_rt_entity(rt_rq);
1794 0 : if (unlikely(!rt_se))
1795 : return NULL;
1796 0 : rt_rq = group_rt_rq(rt_se);
1797 : } while (rt_rq);
1798 :
1799 0 : return rt_task_of(rt_se);
1800 : }
1801 :
1802 : static struct task_struct *pick_task_rt(struct rq *rq)
1803 : {
1804 : struct task_struct *p;
1805 :
1806 0 : if (!sched_rt_runnable(rq))
1807 : return NULL;
1808 :
1809 : p = _pick_next_task_rt(rq);
1810 :
1811 : return p;
1812 : }
1813 :
1814 0 : static struct task_struct *pick_next_task_rt(struct rq *rq)
1815 : {
1816 0 : struct task_struct *p = pick_task_rt(rq);
1817 :
1818 0 : if (p)
1819 0 : set_next_task_rt(rq, p, true);
1820 :
1821 0 : return p;
1822 : }
1823 :
1824 0 : static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1825 : {
1826 0 : struct sched_rt_entity *rt_se = &p->rt;
1827 0 : struct rt_rq *rt_rq = &rq->rt;
1828 :
1829 0 : if (on_rt_rq(&p->rt))
1830 : update_stats_wait_start_rt(rt_rq, rt_se);
1831 :
1832 0 : update_curr_rt(rq);
1833 :
1834 0 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1835 :
1836 : /*
1837 : * The previous task needs to be made eligible for pushing
1838 : * if it is still active
1839 : */
1840 0 : if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1841 : enqueue_pushable_task(rq, p);
1842 0 : }
1843 :
1844 : #ifdef CONFIG_SMP
1845 :
1846 : /* Only try algorithms three times */
1847 : #define RT_MAX_TRIES 3
1848 :
1849 : static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1850 : {
1851 : if (!task_on_cpu(rq, p) &&
1852 : cpumask_test_cpu(cpu, &p->cpus_mask))
1853 : return 1;
1854 :
1855 : return 0;
1856 : }
1857 :
1858 : /*
1859 : * Return the highest-priority pushable task on this rq that is suitable to
1860 : * be executed on the given CPU, or NULL otherwise
1861 : */
1862 : static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1863 : {
1864 : struct plist_head *head = &rq->rt.pushable_tasks;
1865 : struct task_struct *p;
1866 :
1867 : if (!has_pushable_tasks(rq))
1868 : return NULL;
1869 :
1870 : plist_for_each_entry(p, head, pushable_tasks) {
1871 : if (pick_rt_task(rq, p, cpu))
1872 : return p;
1873 : }
1874 :
1875 : return NULL;
1876 : }
1877 :
1878 : static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1879 :
1880 : static int find_lowest_rq(struct task_struct *task)
1881 : {
1882 : struct sched_domain *sd;
1883 : struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1884 : int this_cpu = smp_processor_id();
1885 : int cpu = task_cpu(task);
1886 : int ret;
1887 :
1888 : /* Make sure the mask is initialized first */
1889 : if (unlikely(!lowest_mask))
1890 : return -1;
1891 :
1892 : if (task->nr_cpus_allowed == 1)
1893 : return -1; /* No other targets possible */
1894 :
1895 : /*
1896 : * If we're on asym system ensure we consider the different capacities
1897 : * of the CPUs when searching for the lowest_mask.
1898 : */
1899 : if (sched_asym_cpucap_active()) {
1900 :
1901 : ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
1902 : task, lowest_mask,
1903 : rt_task_fits_capacity);
1904 : } else {
1905 :
1906 : ret = cpupri_find(&task_rq(task)->rd->cpupri,
1907 : task, lowest_mask);
1908 : }
1909 :
1910 : if (!ret)
1911 : return -1; /* No targets found */
1912 :
1913 : /*
1914 : * At this point we have built a mask of CPUs representing the
1915 : * lowest priority tasks in the system. Now we want to elect
1916 : * the best one based on our affinity and topology.
1917 : *
1918 : * We prioritize the last CPU that the task executed on since
1919 : * it is most likely cache-hot in that location.
1920 : */
1921 : if (cpumask_test_cpu(cpu, lowest_mask))
1922 : return cpu;
1923 :
1924 : /*
1925 : * Otherwise, we consult the sched_domains span maps to figure
1926 : * out which CPU is logically closest to our hot cache data.
1927 : */
1928 : if (!cpumask_test_cpu(this_cpu, lowest_mask))
1929 : this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1930 :
1931 : rcu_read_lock();
1932 : for_each_domain(cpu, sd) {
1933 : if (sd->flags & SD_WAKE_AFFINE) {
1934 : int best_cpu;
1935 :
1936 : /*
1937 : * "this_cpu" is cheaper to preempt than a
1938 : * remote processor.
1939 : */
1940 : if (this_cpu != -1 &&
1941 : cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1942 : rcu_read_unlock();
1943 : return this_cpu;
1944 : }
1945 :
1946 : best_cpu = cpumask_any_and_distribute(lowest_mask,
1947 : sched_domain_span(sd));
1948 : if (best_cpu < nr_cpu_ids) {
1949 : rcu_read_unlock();
1950 : return best_cpu;
1951 : }
1952 : }
1953 : }
1954 : rcu_read_unlock();
1955 :
1956 : /*
1957 : * And finally, if there were no matches within the domains
1958 : * just give the caller *something* to work with from the compatible
1959 : * locations.
1960 : */
1961 : if (this_cpu != -1)
1962 : return this_cpu;
1963 :
1964 : cpu = cpumask_any_distribute(lowest_mask);
1965 : if (cpu < nr_cpu_ids)
1966 : return cpu;
1967 :
1968 : return -1;
1969 : }
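/*
 * Illustrative sketch, not kernel code: a stand-alone model of the fallback
 * order implemented by find_lowest_rq() above -- prefer the task's previous
 * CPU when it is in the lowest-priority mask, then this_cpu or another CPU
 * sharing a wake-affine domain with the previous CPU, then any CPU left in
 * the mask.  The ex_* names, the boolean arrays standing in for cpumasks and
 * the domain matrix are all made up for the example; the real code uses
 * struct cpumask, cpupri and sched_domain spans.
 */
#include <stdbool.h>

#define EX_NR_CPUS 8

static bool ex_in_mask(const bool *mask, int cpu)
{
	return cpu >= 0 && cpu < EX_NR_CPUS && mask[cpu];
}

/* domain[a][b] is true when CPUs a and b share a wake-affine domain. */
static int ex_pick_lowest_cpu(const bool lowest_mask[EX_NR_CPUS],
			      const bool domain[EX_NR_CPUS][EX_NR_CPUS],
			      int prev_cpu, int this_cpu)
{
	int cpu;

	/* 1) The CPU the task last ran on is most likely cache-hot. */
	if (ex_in_mask(lowest_mask, prev_cpu))
		return prev_cpu;

	/* 2) Preempting locally is cheaper than disturbing a remote CPU. */
	if (ex_in_mask(lowest_mask, this_cpu) && domain[prev_cpu][this_cpu])
		return this_cpu;

	/* 3) Otherwise any eligible CPU topologically close to prev_cpu. */
	for (cpu = 0; cpu < EX_NR_CPUS; cpu++)
		if (lowest_mask[cpu] && domain[prev_cpu][cpu])
			return cpu;

	/* 4) Last resort: any compatible CPU at all, or -1 for none. */
	for (cpu = 0; cpu < EX_NR_CPUS; cpu++)
		if (lowest_mask[cpu])
			return cpu;

	return -1;
}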
1970 :
1971 : /* Will lock the rq it finds */
1972 : static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1973 : {
1974 : struct rq *lowest_rq = NULL;
1975 : int tries;
1976 : int cpu;
1977 :
1978 : for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1979 : cpu = find_lowest_rq(task);
1980 :
1981 : if ((cpu == -1) || (cpu == rq->cpu))
1982 : break;
1983 :
1984 : lowest_rq = cpu_rq(cpu);
1985 :
1986 : if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1987 : /*
1988 : * Target rq has tasks of equal or higher priority;
1989 : * retrying does not release any lock and is unlikely
1990 : * to yield a different result.
1991 : */
1992 : lowest_rq = NULL;
1993 : break;
1994 : }
1995 :
1996 : /* if the prio of this runqueue changed, try again */
1997 : if (double_lock_balance(rq, lowest_rq)) {
1998 : /*
1999 : * We had to unlock the run queue. In the
2000 : * meantime, the task could have migrated already
2001 : * or had its affinity changed.
2002 : * Also make sure that it wasn't scheduled on its rq.
2003 : */
2004 : if (unlikely(task_rq(task) != rq ||
2005 : !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
2006 : task_on_cpu(rq, task) ||
2007 : !rt_task(task) ||
2008 : !task_on_rq_queued(task))) {
2009 :
2010 : double_unlock_balance(rq, lowest_rq);
2011 : lowest_rq = NULL;
2012 : break;
2013 : }
2014 : }
2015 :
2016 : /* If this rq is still suitable use it. */
2017 : if (lowest_rq->rt.highest_prio.curr > task->prio)
2018 : break;
2019 :
2020 : /* try again */
2021 : double_unlock_balance(rq, lowest_rq);
2022 : lowest_rq = NULL;
2023 : }
2024 :
2025 : return lowest_rq;
2026 : }
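/*
 * Illustrative sketch, not kernel code: the drop-and-revalidate pattern that
 * find_lock_lowest_rq() depends on.  double_lock_balance() may have to drop
 * the lock we already hold so that both runqueue locks can be taken in a
 * deadlock-safe order; once it has been dropped, every assumption made
 * before the call must be re-checked.  The pthread mutexes and the ex_*
 * names below are made up for the example.
 */
#include <pthread.h>
#include <stdbool.h>

struct ex_rq {
	pthread_mutex_t lock;
};

/*
 * Caller holds this_rq->lock on entry.  Returns true when that lock had to
 * be dropped on the way, in which case the caller must revalidate its state
 * (task still queued here, affinity unchanged, ...) before proceeding.
 */
static bool ex_double_lock(struct ex_rq *this_rq, struct ex_rq *target)
{
	if (pthread_mutex_trylock(&target->lock) == 0)
		return false;			/* got both without dropping */

	/* Would invert the lock order: drop ours, retake both in order. */
	pthread_mutex_unlock(&this_rq->lock);
	if (this_rq < target) {
		pthread_mutex_lock(&this_rq->lock);
		pthread_mutex_lock(&target->lock);
	} else {
		pthread_mutex_lock(&target->lock);
		pthread_mutex_lock(&this_rq->lock);
	}
	return true;
}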
2027 :
2028 : static struct task_struct *pick_next_pushable_task(struct rq *rq)
2029 : {
2030 : struct task_struct *p;
2031 :
2032 : if (!has_pushable_tasks(rq))
2033 : return NULL;
2034 :
2035 : p = plist_first_entry(&rq->rt.pushable_tasks,
2036 : struct task_struct, pushable_tasks);
2037 :
2038 : BUG_ON(rq->cpu != task_cpu(p));
2039 : BUG_ON(task_current(rq, p));
2040 : BUG_ON(p->nr_cpus_allowed <= 1);
2041 :
2042 : BUG_ON(!task_on_rq_queued(p));
2043 : BUG_ON(!rt_task(p));
2044 :
2045 : return p;
2046 : }
2047 :
2048 : /*
2049 : * If the current CPU has more than one RT task, see if the non-running
2050 : * task can migrate over to a CPU that is running a task
2051 : * of lesser priority.
2052 : */
2053 : static int push_rt_task(struct rq *rq, bool pull)
2054 : {
2055 : struct task_struct *next_task;
2056 : struct rq *lowest_rq;
2057 : int ret = 0;
2058 :
2059 : if (!rq->rt.overloaded)
2060 : return 0;
2061 :
2062 : next_task = pick_next_pushable_task(rq);
2063 : if (!next_task)
2064 : return 0;
2065 :
2066 : retry:
2067 : /*
2068 : * It's possible that next_task slipped in with a
2069 : * higher priority than current. If that's the case,
2070 : * just reschedule current.
2071 : */
2072 : if (unlikely(next_task->prio < rq->curr->prio)) {
2073 : resched_curr(rq);
2074 : return 0;
2075 : }
2076 :
2077 : if (is_migration_disabled(next_task)) {
2078 : struct task_struct *push_task = NULL;
2079 : int cpu;
2080 :
2081 : if (!pull || rq->push_busy)
2082 : return 0;
2083 :
2084 : /*
2085 : * Invoking find_lowest_rq() on anything but an RT task doesn't
2086 : * make sense. Per the above priority check, curr has to
2087 : * be of higher priority than next_task, so no need to
2088 : * reschedule when bailing out.
2089 : *
2090 : * Note that the stoppers are masqueraded as SCHED_FIFO
2091 : * (cf. sched_set_stop_task()), so we can't rely on rt_task().
2092 : */
2093 : if (rq->curr->sched_class != &rt_sched_class)
2094 : return 0;
2095 :
2096 : cpu = find_lowest_rq(rq->curr);
2097 : if (cpu == -1 || cpu == rq->cpu)
2098 : return 0;
2099 :
2100 : /*
2101 : * Given that we found a CPU running at a lower priority than
2102 : * @next_task, @next_task ought to be running. However, we cannot
2103 : * migrate it to that other CPU; instead, attempt to push away the
2104 : * task currently running on this CPU.
2105 : */
2106 : push_task = get_push_task(rq);
2107 : if (push_task) {
2108 : raw_spin_rq_unlock(rq);
2109 : stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2110 : push_task, &rq->push_work);
2111 : raw_spin_rq_lock(rq);
2112 : }
2113 :
2114 : return 0;
2115 : }
2116 :
2117 : if (WARN_ON(next_task == rq->curr))
2118 : return 0;
2119 :
2120 : /* We might release rq lock */
2121 : get_task_struct(next_task);
2122 :
2123 : /* find_lock_lowest_rq locks the rq if found */
2124 : lowest_rq = find_lock_lowest_rq(next_task, rq);
2125 : if (!lowest_rq) {
2126 : struct task_struct *task;
2127 : /*
2128 : * find_lock_lowest_rq releases rq->lock
2129 : * so it is possible that next_task has migrated.
2130 : *
2131 : * We need to make sure that the task is still on the same
2132 : * run-queue and is also still the next task eligible for
2133 : * pushing.
2134 : */
2135 : task = pick_next_pushable_task(rq);
2136 : if (task == next_task) {
2137 : /*
2138 : * The task hasn't migrated, and is still the next
2139 : * eligible task, but we failed to find a run-queue
2140 : * to push it to. Do not retry in this case, since
2141 : * other CPUs will pull from us when ready.
2142 : */
2143 : goto out;
2144 : }
2145 :
2146 : if (!task)
2147 : /* No more tasks, just exit */
2148 : goto out;
2149 :
2150 : /*
2151 : * Something has shifted, try again.
2152 : */
2153 : put_task_struct(next_task);
2154 : next_task = task;
2155 : goto retry;
2156 : }
2157 :
2158 : deactivate_task(rq, next_task, 0);
2159 : set_task_cpu(next_task, lowest_rq->cpu);
2160 : activate_task(lowest_rq, next_task, 0);
2161 : resched_curr(lowest_rq);
2162 : ret = 1;
2163 :
2164 : double_unlock_balance(rq, lowest_rq);
2165 : out:
2166 : put_task_struct(next_task);
2167 :
2168 : return ret;
2169 : }
2170 :
2171 : static void push_rt_tasks(struct rq *rq)
2172 : {
2173 : /* push_rt_task will return true if it moved an RT */
2174 : while (push_rt_task(rq, false))
2175 : ;
2176 : }
2177 :
2178 : #ifdef HAVE_RT_PUSH_IPI
2179 :
2180 : /*
2181 : * When a high priority task schedules out from a CPU and a lower priority
2182 : * task is scheduled in, a check is made to see if there's any RT tasks
2183 : * on other CPUs that are waiting to run because a higher priority RT task
2184 : * is currently running on its CPU. In this case, the CPU with multiple RT
2185 : * tasks queued on it (overloaded) needs to be notified that a CPU has opened
2186 : * up that may be able to run one of its non-running queued RT tasks.
2187 : *
2188 : * All CPUs with overloaded RT tasks need to be notified as there is currently
2189 : * no way to know which of these CPUs have the highest priority task waiting
2190 : * to run. Instead of trying to take a spinlock on each of these CPUs,
2191 : * which has been shown to cause large latencies when done on machines with
2192 : * many CPUs, an IPI is sent to the CPUs to have them push off the
2193 : * overloaded RT tasks waiting to run.
2194 : *
2195 : * Just sending an IPI to each of the CPUs is also an issue, as on
2196 : * large-CPU-count machines this can cause an IPI storm on a CPU, especially
2197 : * if it's the only CPU with multiple RT tasks queued and a large number
2198 : * of CPUs are scheduling a lower priority task at the same time.
2199 : *
2200 : * Each root domain has its own irq work function that can iterate over
2201 : * all CPUs with RT overloaded tasks. Since every RT overloaded CPU
2202 : * must be checked, regardless of whether one or many CPUs are lowering
2203 : * their priority, there's a single irq work iterator that will try to
2204 : * push off the RT tasks that are waiting to run.
2205 : *
2206 : * When a CPU schedules a lower priority task, it will kick off the
2207 : * irq work iterator that will jump to each CPU with overloaded RT tasks.
2208 : * As it only takes the first CPU that schedules a lower priority task
2209 : * to start the process, the rto_start variable is incremented and if
2210 : * the atomic result is one, then that CPU will try to take the rto_lock.
2211 : * This prevents high contention on the lock as the process handles all
2212 : * CPUs scheduling lower priority tasks.
2213 : *
2214 : * All CPUs that are scheduling a lower priority task will increment the
2215 : * rto_loop_next variable. This will make sure that the irq work iterator
2216 : * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2217 : * priority task, even if the iterator is in the middle of a scan. Incrementing
2218 : * rto_loop_next will cause the iterator to perform another scan. (A
2219 : * stand-alone sketch of this handshake follows the end of this section.)
2219 : *
2220 : */
2221 : static int rto_next_cpu(struct root_domain *rd)
2222 : {
2223 : int next;
2224 : int cpu;
2225 :
2226 : /*
2227 : * When starting the IPI RT pushing, the rto_cpu is set to -1,
2228 : * rto_next_cpu() will simply return the first CPU found in
2229 : * the rto_mask.
2230 : *
2231 : * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
2232 : * will return the next CPU found in the rto_mask.
2233 : *
2234 : * If there are no more CPUs left in the rto_mask, then a check is made
2235 : * against rto_loop and rto_loop_next. rto_loop is only updated with
2236 : * the rto_lock held, but any CPU may increment the rto_loop_next
2237 : * without any locking.
2238 : */
2239 : for (;;) {
2240 :
2241 : /* When rto_cpu is -1 this acts like cpumask_first() */
2242 : cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2243 :
2244 : rd->rto_cpu = cpu;
2245 :
2246 : if (cpu < nr_cpu_ids)
2247 : return cpu;
2248 :
2249 : rd->rto_cpu = -1;
2250 :
2251 : /*
2252 : * ACQUIRE ensures we see the @rto_mask changes
2253 : * made prior to the @next value observed.
2254 : *
2255 : * Matches WMB in rt_set_overload().
2256 : */
2257 : next = atomic_read_acquire(&rd->rto_loop_next);
2258 :
2259 : if (rd->rto_loop == next)
2260 : break;
2261 :
2262 : rd->rto_loop = next;
2263 : }
2264 :
2265 : return -1;
2266 : }
2267 :
2268 : static inline bool rto_start_trylock(atomic_t *v)
2269 : {
2270 : return !atomic_cmpxchg_acquire(v, 0, 1);
2271 : }
2272 :
2273 : static inline void rto_start_unlock(atomic_t *v)
2274 : {
2275 : atomic_set_release(v, 0);
2276 : }
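/*
 * Illustrative sketch, not kernel code: a C11-atomics analog of the
 * rto_start_trylock()/rto_start_unlock() pair above.  Exactly one caller
 * wins the 0 -> 1 transition and becomes responsible for kicking off the
 * IPI loop; the acquire/release ordering pairs the winner's later reads
 * with whatever the previous owner published before unlocking.
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool ex_start_trylock(atomic_int *v)
{
	int expected = 0;

	return atomic_compare_exchange_strong_explicit(v, &expected, 1,
						       memory_order_acquire,
						       memory_order_relaxed);
}

static void ex_start_unlock(atomic_int *v)
{
	atomic_store_explicit(v, 0, memory_order_release);
}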
2277 :
2278 : static void tell_cpu_to_push(struct rq *rq)
2279 : {
2280 : int cpu = -1;
2281 :
2282 : /* Keep the loop going if the IPI is currently active */
2283 : atomic_inc(&rq->rd->rto_loop_next);
2284 :
2285 : /* Only one CPU can initiate a loop at a time */
2286 : if (!rto_start_trylock(&rq->rd->rto_loop_start))
2287 : return;
2288 :
2289 : raw_spin_lock(&rq->rd->rto_lock);
2290 :
2291 : /*
2292 : * The rto_cpu is updated under the lock; if it holds a valid CPU
2293 : * then the IPI is still running and will continue due to the
2294 : * update to loop_next, and nothing needs to be done here.
2295 : * Otherwise it is finishing up and an IPI needs to be sent.
2296 : */
2297 : if (rq->rd->rto_cpu < 0)
2298 : cpu = rto_next_cpu(rq->rd);
2299 :
2300 : raw_spin_unlock(&rq->rd->rto_lock);
2301 :
2302 : rto_start_unlock(&rq->rd->rto_loop_start);
2303 :
2304 : if (cpu >= 0) {
2305 : /* Make sure the rd does not get freed while pushing */
2306 : sched_get_rd(rq->rd);
2307 : irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2308 : }
2309 : }
2310 :
2311 : /* Called from hardirq context */
2312 : void rto_push_irq_work_func(struct irq_work *work)
2313 : {
2314 : struct root_domain *rd =
2315 : container_of(work, struct root_domain, rto_push_work);
2316 : struct rq *rq;
2317 : int cpu;
2318 :
2319 : rq = this_rq();
2320 :
2321 : /*
2322 : * We do not need to grab the lock to check for has_pushable_tasks.
2323 : * When it gets updated, a check is made to see whether a push is possible.
2324 : */
2325 : if (has_pushable_tasks(rq)) {
2326 : raw_spin_rq_lock(rq);
2327 : while (push_rt_task(rq, true))
2328 : ;
2329 : raw_spin_rq_unlock(rq);
2330 : }
2331 :
2332 : raw_spin_lock(&rd->rto_lock);
2333 :
2334 : /* Pass the IPI to the next rt overloaded queue */
2335 : cpu = rto_next_cpu(rd);
2336 :
2337 : raw_spin_unlock(&rd->rto_lock);
2338 :
2339 : if (cpu < 0) {
2340 : sched_put_rd(rd);
2341 : return;
2342 : }
2343 :
2344 : /* Try the next RT overloaded CPU */
2345 : irq_work_queue_on(&rd->rto_push_work, cpu);
2346 : }
2347 : #endif /* HAVE_RT_PUSH_IPI */
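/*
 * Illustrative sketch, not kernel code: the generation-counter handshake
 * used by the rto_loop/rto_loop_next pair above.  Any CPU that opens up a
 * scheduling opportunity bumps the shared "next" counter; the single
 * iterator compares it with the generation it last completed and starts
 * another sweep over the overloaded CPUs when they differ.  The ex_* names
 * are made up for the example.
 */
#include <stdatomic.h>

struct ex_rto {
	atomic_int loop_next;	/* bumped by any CPU that creates new work */
	int loop;		/* owned by the single active iterator */
};

/* Producer side: record that another full sweep will be needed. */
static void ex_note_new_work(struct ex_rto *r)
{
	atomic_fetch_add_explicit(&r->loop_next, 1, memory_order_release);
}

/* Iterator side: returns 1 when it must go around the CPUs once more. */
static int ex_sweep_again(struct ex_rto *r)
{
	int next = atomic_load_explicit(&r->loop_next, memory_order_acquire);

	if (r->loop == next)
		return 0;	/* nothing new arrived during this sweep */

	r->loop = next;		/* catch up and scan the overloaded CPUs again */
	return 1;
}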
2348 :
2349 : static void pull_rt_task(struct rq *this_rq)
2350 : {
2351 : int this_cpu = this_rq->cpu, cpu;
2352 : bool resched = false;
2353 : struct task_struct *p, *push_task;
2354 : struct rq *src_rq;
2355 : int rt_overload_count = rt_overloaded(this_rq);
2356 :
2357 : if (likely(!rt_overload_count))
2358 : return;
2359 :
2360 : /*
2361 : * Match the barrier from rt_set_overload(); this guarantees that if we
2362 : * see overloaded we must also see the rto_mask bit.
2363 : */
2364 : smp_rmb();
2365 :
2366 : /* If we are the only overloaded CPU do nothing */
2367 : if (rt_overload_count == 1 &&
2368 : cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2369 : return;
2370 :
2371 : #ifdef HAVE_RT_PUSH_IPI
2372 : if (sched_feat(RT_PUSH_IPI)) {
2373 : tell_cpu_to_push(this_rq);
2374 : return;
2375 : }
2376 : #endif
2377 :
2378 : for_each_cpu(cpu, this_rq->rd->rto_mask) {
2379 : if (this_cpu == cpu)
2380 : continue;
2381 :
2382 : src_rq = cpu_rq(cpu);
2383 :
2384 : /*
2385 : * Don't bother taking the src_rq->lock if the next highest
2386 : * task is known to be lower-priority than our current task.
2387 : * This may look racy, but if this value is about to go
2388 : * logically higher, the src_rq will push this task away.
2389 : * And if it's going logically lower, we do not care.
2390 : */
2391 : if (src_rq->rt.highest_prio.next >=
2392 : this_rq->rt.highest_prio.curr)
2393 : continue;
2394 :
2395 : /*
2396 : * We can potentially drop this_rq's lock in
2397 : * double_lock_balance, and another CPU could
2398 : * alter this_rq
2399 : */
2400 : push_task = NULL;
2401 : double_lock_balance(this_rq, src_rq);
2402 :
2403 : /*
2404 : * We can only pull a task that is pushable
2405 : * on its rq, and no others.
2406 : */
2407 : p = pick_highest_pushable_task(src_rq, this_cpu);
2408 :
2409 : /*
2410 : * Do we have an RT task that preempts
2411 : * the to-be-scheduled task?
2412 : */
2413 : if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2414 : WARN_ON(p == src_rq->curr);
2415 : WARN_ON(!task_on_rq_queued(p));
2416 :
2417 : /*
2418 : * There's a chance that p is higher in priority
2419 : * than what's currently running on its CPU.
2420 : * This happens when p is waking up and hasn't
2421 : * had a chance to schedule yet. We only pull
2422 : * p if it is lower in priority than the
2423 : * current task on its run queue.
2424 : */
2425 : if (p->prio < src_rq->curr->prio)
2426 : goto skip;
2427 :
2428 : if (is_migration_disabled(p)) {
2429 : push_task = get_push_task(src_rq);
2430 : } else {
2431 : deactivate_task(src_rq, p, 0);
2432 : set_task_cpu(p, this_cpu);
2433 : activate_task(this_rq, p, 0);
2434 : resched = true;
2435 : }
2436 : /*
2437 : * We continue with the search, just in
2438 : * case there's an even higher prio task
2439 : * in another runqueue. (low likelihood
2440 : * but possible)
2441 : */
2442 : }
2443 : skip:
2444 : double_unlock_balance(this_rq, src_rq);
2445 :
2446 : if (push_task) {
2447 : raw_spin_rq_unlock(this_rq);
2448 : stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
2449 : push_task, &src_rq->push_work);
2450 : raw_spin_rq_lock(this_rq);
2451 : }
2452 : }
2453 :
2454 : if (resched)
2455 : resched_curr(this_rq);
2456 : }
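/*
 * Illustrative sketch, not kernel code: the priority comparisons above
 * (p->prio < curr->prio and friends) use the kernel's internal scale, where
 * a *lower* number means a *higher* priority.  Assuming MAX_RT_PRIO == 100
 * as in mainline, a user-visible SCHED_FIFO/SCHED_RR priority of 1..99 maps
 * to an internal prio of 98..0, with deadline tasks below 0 and normal
 * tasks above 99.  The ex_* helper is made up for the example.
 */
#define EX_MAX_RT_PRIO 100

static int ex_rt_user_prio_to_kernel(int rt_priority)
{
	/* e.g. rt_priority 99 -> 0 (highest), rt_priority 1 -> 98 */
	return EX_MAX_RT_PRIO - 1 - rt_priority;
}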
2457 :
2458 : /*
2459 : * If we are not running and we are not going to reschedule soon, we should
2460 : * try to push tasks away now
2461 : */
2462 : static void task_woken_rt(struct rq *rq, struct task_struct *p)
2463 : {
2464 : bool need_to_push = !task_on_cpu(rq, p) &&
2465 : !test_tsk_need_resched(rq->curr) &&
2466 : p->nr_cpus_allowed > 1 &&
2467 : (dl_task(rq->curr) || rt_task(rq->curr)) &&
2468 : (rq->curr->nr_cpus_allowed < 2 ||
2469 : rq->curr->prio <= p->prio);
2470 :
2471 : if (need_to_push)
2472 : push_rt_tasks(rq);
2473 : }
2474 :
2475 : /* Assumes rq->lock is held */
2476 : static void rq_online_rt(struct rq *rq)
2477 : {
2478 : if (rq->rt.overloaded)
2479 : rt_set_overload(rq);
2480 :
2481 : __enable_runtime(rq);
2482 :
2483 : cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2484 : }
2485 :
2486 : /* Assumes rq->lock is held */
2487 : static void rq_offline_rt(struct rq *rq)
2488 : {
2489 : if (rq->rt.overloaded)
2490 : rt_clear_overload(rq);
2491 :
2492 : __disable_runtime(rq);
2493 :
2494 : cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2495 : }
2496 :
2497 : /*
2498 : * When switching away from the rt queue, we bring ourselves to a position
2499 : * where we might want to pull RT tasks from other runqueues.
2500 : */
2501 : static void switched_from_rt(struct rq *rq, struct task_struct *p)
2502 : {
2503 : /*
2504 : * If there are other RT tasks then we will reschedule
2505 : * and the scheduling of the other RT tasks will handle
2506 : * the balancing. But if we are the last RT task
2507 : * we may need to handle the pulling of RT tasks
2508 : * now.
2509 : */
2510 : if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2511 : return;
2512 :
2513 : rt_queue_pull_task(rq);
2514 : }
2515 :
2516 : void __init init_sched_rt_class(void)
2517 : {
2518 : unsigned int i;
2519 :
2520 : for_each_possible_cpu(i) {
2521 : zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2522 : GFP_KERNEL, cpu_to_node(i));
2523 : }
2524 : }
2525 : #endif /* CONFIG_SMP */
2526 :
2527 : /*
2528 : * When switching a task to RT, we may overload the runqueue
2529 : * with RT tasks. In this case we try to push them off to
2530 : * other runqueues.
2531 : */
2532 0 : static void switched_to_rt(struct rq *rq, struct task_struct *p)
2533 : {
2534 : /*
2535 : * If we are running, update the avg_rt tracking, as the running time
2536 : * will from now on be accounted into the latter.
2537 : */
2538 0 : if (task_current(rq, p)) {
2539 0 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2540 : return;
2541 : }
2542 :
2543 : /*
2544 : * If we are not running, we may need to preempt the currently
2545 : * running task. If that currently running task is also an RT task,
2546 : * then see if we can move to another run queue.
2547 : */
2548 0 : if (task_on_rq_queued(p)) {
2549 : #ifdef CONFIG_SMP
2550 : if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2551 : rt_queue_push_tasks(rq);
2552 : #endif /* CONFIG_SMP */
2553 0 : if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2554 0 : resched_curr(rq);
2555 : }
2556 : }
2557 :
2558 : /*
2559 : * Priority of the task has changed. This may cause
2560 : * us to initiate a push or pull.
2561 : */
2562 : static void
2563 0 : prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2564 : {
2565 0 : if (!task_on_rq_queued(p))
2566 : return;
2567 :
2568 0 : if (task_current(rq, p)) {
2569 : #ifdef CONFIG_SMP
2570 : /*
2571 : * If our priority decreases while running, we
2572 : * may need to pull tasks to this runqueue.
2573 : */
2574 : if (oldprio < p->prio)
2575 : rt_queue_pull_task(rq);
2576 :
2577 : /*
2578 : * If there's a higher priority task waiting to run
2579 : * then reschedule.
2580 : */
2581 : if (p->prio > rq->rt.highest_prio.curr)
2582 : resched_curr(rq);
2583 : #else
2584 : /* For UP simply resched on drop of prio */
2585 0 : if (oldprio < p->prio)
2586 0 : resched_curr(rq);
2587 : #endif /* CONFIG_SMP */
2588 : } else {
2589 : /*
2590 : * This task is not running, but if its priority
2591 : * is higher than that of the currently running task,
2592 : * then reschedule.
2593 : */
2594 0 : if (p->prio < rq->curr->prio)
2595 0 : resched_curr(rq);
2596 : }
2597 : }
2598 :
2599 : #ifdef CONFIG_POSIX_TIMERS
2600 0 : static void watchdog(struct rq *rq, struct task_struct *p)
2601 : {
2602 : unsigned long soft, hard;
2603 :
2604 : /* max may change after cur was read; this will be fixed next tick */
2605 0 : soft = task_rlimit(p, RLIMIT_RTTIME);
2606 0 : hard = task_rlimit_max(p, RLIMIT_RTTIME);
2607 :
2608 0 : if (soft != RLIM_INFINITY) {
2609 : unsigned long next;
2610 :
2611 0 : if (p->rt.watchdog_stamp != jiffies) {
2612 0 : p->rt.timeout++;
2613 0 : p->rt.watchdog_stamp = jiffies;
2614 : }
2615 :
2616 0 : next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2617 0 : if (p->rt.timeout > next) {
2618 0 : posix_cputimers_rt_watchdog(&p->posix_cputimers,
2619 : p->se.sum_exec_runtime);
2620 : }
2621 : }
2622 0 : }
2623 : #else
2624 : static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2625 : #endif
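/*
 * Illustrative user-space sketch: the watchdog above enforces RLIMIT_RTTIME,
 * the per-task cap on CPU time an RT task may consume without blocking.  A
 * stand-alone program (compiled separately, not part of this file) could arm
 * it as below; the 0.5 s / 1 s limits and priority 10 are arbitrary example
 * values, and sched_setscheduler() needs the appropriate privileges.
 */
#include <signal.h>
#include <stdio.h>
#include <sched.h>
#include <sys/resource.h>

static void ex_on_sigxcpu(int sig)
{
	(void)sig;	/* soft limit hit: the RT loop ran too long */
}

int main(void)
{
	struct rlimit rl = { .rlim_cur = 500000, .rlim_max = 1000000 }; /* us */
	struct sched_param sp = { .sched_priority = 10 };

	signal(SIGXCPU, ex_on_sigxcpu);

	if (setrlimit(RLIMIT_RTTIME, &rl) ||
	    sched_setscheduler(0, SCHED_FIFO, &sp)) {
		perror("rt setup");
		return 1;
	}

	/* ... RT work; exceeding rlim_cur without sleeping raises SIGXCPU ... */
	return 0;
}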
2626 :
2627 : /*
2628 : * scheduler tick hitting a task of our scheduling class.
2629 : *
2630 : * NOTE: This function can be called remotely by the tick offload that
2631 : * goes along full dynticks. Therefore no local assumption can be made
2632 : * and everything must be accessed through the @rq and @curr passed in
2633 : * parameters.
2634 : */
2635 0 : static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2636 : {
2637 0 : struct sched_rt_entity *rt_se = &p->rt;
2638 :
2639 0 : update_curr_rt(rq);
2640 0 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2641 :
2642 0 : watchdog(rq, p);
2643 :
2644 : /*
2645 : * RR tasks need a special form of timeslice management.
2646 : * FIFO tasks have no timeslices.
2647 : */
2648 0 : if (p->policy != SCHED_RR)
2649 : return;
2650 :
2651 0 : if (--p->rt.time_slice)
2652 : return;
2653 :
2654 0 : p->rt.time_slice = sched_rr_timeslice;
2655 :
2656 : /*
2657 : * Requeue to the end of the queue if we (and all of our ancestors) are not
2658 : * the only element on the queue.
2659 : */
2660 0 : for_each_sched_rt_entity(rt_se) {
2661 0 : if (rt_se->run_list.prev != rt_se->run_list.next) {
2662 0 : requeue_task_rt(rq, p, 0);
2663 0 : resched_curr(rq);
2664 0 : return;
2665 : }
2666 : }
2667 : }
2668 :
2669 0 : static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2670 : {
2671 : /*
2672 : * Time slice is 0 for SCHED_FIFO tasks
2673 : */
2674 0 : if (task->policy == SCHED_RR)
2675 0 : return sched_rr_timeslice;
2676 : else
2677 : return 0;
2678 : }
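/*
 * Illustrative user-space sketch: get_rr_interval_rt() is what backs the
 * sched_rr_get_interval(2) system call, so a SCHED_RR task sees its
 * timeslice while a SCHED_FIFO task sees zero.  Stand-alone example program,
 * compiled separately from this file.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts)) {	/* 0 means the calling task */
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("RR timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}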
2679 :
2680 : DEFINE_SCHED_CLASS(rt) = {
2681 :
2682 : .enqueue_task = enqueue_task_rt,
2683 : .dequeue_task = dequeue_task_rt,
2684 : .yield_task = yield_task_rt,
2685 :
2686 : .check_preempt_curr = check_preempt_curr_rt,
2687 :
2688 : .pick_next_task = pick_next_task_rt,
2689 : .put_prev_task = put_prev_task_rt,
2690 : .set_next_task = set_next_task_rt,
2691 :
2692 : #ifdef CONFIG_SMP
2693 : .balance = balance_rt,
2694 : .pick_task = pick_task_rt,
2695 : .select_task_rq = select_task_rq_rt,
2696 : .set_cpus_allowed = set_cpus_allowed_common,
2697 : .rq_online = rq_online_rt,
2698 : .rq_offline = rq_offline_rt,
2699 : .task_woken = task_woken_rt,
2700 : .switched_from = switched_from_rt,
2701 : .find_lock_rq = find_lock_lowest_rq,
2702 : #endif
2703 :
2704 : .task_tick = task_tick_rt,
2705 :
2706 : .get_rr_interval = get_rr_interval_rt,
2707 :
2708 : .prio_changed = prio_changed_rt,
2709 : .switched_to = switched_to_rt,
2710 :
2711 : .update_curr = update_curr_rt,
2712 :
2713 : #ifdef CONFIG_UCLAMP_TASK
2714 : .uclamp_enabled = 1,
2715 : #endif
2716 : };
2717 :
2718 : #ifdef CONFIG_RT_GROUP_SCHED
2719 : /*
2720 : * Ensure that the real time constraints are schedulable.
2721 : */
2722 : static DEFINE_MUTEX(rt_constraints_mutex);
2723 :
2724 : static inline int tg_has_rt_tasks(struct task_group *tg)
2725 : {
2726 : struct task_struct *task;
2727 : struct css_task_iter it;
2728 : int ret = 0;
2729 :
2730 : /*
2731 : * Autogroups do not have RT tasks; see autogroup_create().
2732 : */
2733 : if (task_group_is_autogroup(tg))
2734 : return 0;
2735 :
2736 : css_task_iter_start(&tg->css, 0, &it);
2737 : while (!ret && (task = css_task_iter_next(&it)))
2738 : ret |= rt_task(task);
2739 : css_task_iter_end(&it);
2740 :
2741 : return ret;
2742 : }
2743 :
2744 : struct rt_schedulable_data {
2745 : struct task_group *tg;
2746 : u64 rt_period;
2747 : u64 rt_runtime;
2748 : };
2749 :
2750 : static int tg_rt_schedulable(struct task_group *tg, void *data)
2751 : {
2752 : struct rt_schedulable_data *d = data;
2753 : struct task_group *child;
2754 : unsigned long total, sum = 0;
2755 : u64 period, runtime;
2756 :
2757 : period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2758 : runtime = tg->rt_bandwidth.rt_runtime;
2759 :
2760 : if (tg == d->tg) {
2761 : period = d->rt_period;
2762 : runtime = d->rt_runtime;
2763 : }
2764 :
2765 : /*
2766 : * Cannot have more runtime than the period.
2767 : */
2768 : if (runtime > period && runtime != RUNTIME_INF)
2769 : return -EINVAL;
2770 :
2771 : /*
2772 : * Ensure we don't starve existing RT tasks if runtime turns zero.
2773 : */
2774 : if (rt_bandwidth_enabled() && !runtime &&
2775 : tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
2776 : return -EBUSY;
2777 :
2778 : total = to_ratio(period, runtime);
2779 :
2780 : /*
2781 : * Nobody can have more than the global setting allows.
2782 : */
2783 : if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2784 : return -EINVAL;
2785 :
2786 : /*
2787 : * The sum of our children's runtime should not exceed our own.
2788 : */
2789 : list_for_each_entry_rcu(child, &tg->children, siblings) {
2790 : period = ktime_to_ns(child->rt_bandwidth.rt_period);
2791 : runtime = child->rt_bandwidth.rt_runtime;
2792 :
2793 : if (child == d->tg) {
2794 : period = d->rt_period;
2795 : runtime = d->rt_runtime;
2796 : }
2797 :
2798 : sum += to_ratio(period, runtime);
2799 : }
2800 :
2801 : if (sum > total)
2802 : return -EINVAL;
2803 :
2804 : return 0;
2805 : }
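/*
 * Illustrative sketch, not kernel code: the fixed-point ratio behind the
 * checks above.  A (period, runtime) pair becomes runtime/period scaled by
 * 2^BW_SHIFT (20 in mainline, per the max_rt_runtime comment at the top of
 * this file), and a group is schedulable only when the sum of its children's
 * ratios stays within its own.  The ex_* names are made up; the real kernel
 * helper is to_ratio(), and the callers above additionally cap runtime and
 * reject a zero period, which this sketch assumes.
 */
#include <stdint.h>

#define EX_BW_SHIFT 20

static uint64_t ex_to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
	return (runtime_ns << EX_BW_SHIFT) / period_ns;
}

/* Returns 1 when the children's combined budget fits within the parent's. */
static int ex_children_fit(uint64_t parent_period, uint64_t parent_runtime,
			   const uint64_t child_period[],
			   const uint64_t child_runtime[], int nr_children)
{
	uint64_t total = ex_to_ratio(parent_period, parent_runtime);
	uint64_t sum = 0;
	int i;

	for (i = 0; i < nr_children; i++)
		sum += ex_to_ratio(child_period[i], child_runtime[i]);

	return sum <= total;
}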
2806 :
2807 : static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2808 : {
2809 : int ret;
2810 :
2811 : struct rt_schedulable_data data = {
2812 : .tg = tg,
2813 : .rt_period = period,
2814 : .rt_runtime = runtime,
2815 : };
2816 :
2817 : rcu_read_lock();
2818 : ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2819 : rcu_read_unlock();
2820 :
2821 : return ret;
2822 : }
2823 :
2824 : static int tg_set_rt_bandwidth(struct task_group *tg,
2825 : u64 rt_period, u64 rt_runtime)
2826 : {
2827 : int i, err = 0;
2828 :
2829 : /*
2830 : * Disallowing the root group RT runtime is BAD; it would prevent the
2831 : * kernel from creating (and/or operating) RT threads.
2832 : */
2833 : if (tg == &root_task_group && rt_runtime == 0)
2834 : return -EINVAL;
2835 :
2836 : /* A period of zero doesn't make any sense. */
2837 : if (rt_period == 0)
2838 : return -EINVAL;
2839 :
2840 : /*
2841 : * Bound the quota to defend against overflow during the bandwidth shift.
2842 : */
2843 : if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2844 : return -EINVAL;
2845 :
2846 : mutex_lock(&rt_constraints_mutex);
2847 : err = __rt_schedulable(tg, rt_period, rt_runtime);
2848 : if (err)
2849 : goto unlock;
2850 :
2851 : raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2852 : tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2853 : tg->rt_bandwidth.rt_runtime = rt_runtime;
2854 :
2855 : for_each_possible_cpu(i) {
2856 : struct rt_rq *rt_rq = tg->rt_rq[i];
2857 :
2858 : raw_spin_lock(&rt_rq->rt_runtime_lock);
2859 : rt_rq->rt_runtime = rt_runtime;
2860 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
2861 : }
2862 : raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2863 : unlock:
2864 : mutex_unlock(&rt_constraints_mutex);
2865 :
2866 : return err;
2867 : }
2868 :
2869 : int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2870 : {
2871 : u64 rt_runtime, rt_period;
2872 :
2873 : rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2874 : rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2875 : if (rt_runtime_us < 0)
2876 : rt_runtime = RUNTIME_INF;
2877 : else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2878 : return -EINVAL;
2879 :
2880 : return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2881 : }
2882 :
2883 : long sched_group_rt_runtime(struct task_group *tg)
2884 : {
2885 : u64 rt_runtime_us;
2886 :
2887 : if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2888 : return -1;
2889 :
2890 : rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2891 : do_div(rt_runtime_us, NSEC_PER_USEC);
2892 : return rt_runtime_us;
2893 : }
2894 :
2895 : int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2896 : {
2897 : u64 rt_runtime, rt_period;
2898 :
2899 : if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2900 : return -EINVAL;
2901 :
2902 : rt_period = rt_period_us * NSEC_PER_USEC;
2903 : rt_runtime = tg->rt_bandwidth.rt_runtime;
2904 :
2905 : return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2906 : }
2907 :
2908 : long sched_group_rt_period(struct task_group *tg)
2909 : {
2910 : u64 rt_period_us;
2911 :
2912 : rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2913 : do_div(rt_period_us, NSEC_PER_USEC);
2914 : return rt_period_us;
2915 : }
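/*
 * Illustrative user-space sketch: the setters above are reached by writing
 * the cgroup v1 cpu controller files cpu.rt_period_us and cpu.rt_runtime_us.
 * Stand-alone example, compiled separately; the mount point and the
 * "rtgroup" directory are assumptions that depend on the system setup, and
 * the writes only succeed if the schedulability checks above pass.
 */
#include <stdio.h>

static int ex_write_long(const char *path, long value)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%ld\n", value);
	return fclose(f);
}

int main(void)
{
	/* Give the hypothetical "rtgroup" 0.3 s of RT time per 1 s period. */
	ex_write_long("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_period_us", 1000000);
	ex_write_long("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_runtime_us", 300000);
	return 0;
}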
2916 :
2917 : #ifdef CONFIG_SYSCTL
2918 : static int sched_rt_global_constraints(void)
2919 : {
2920 : int ret = 0;
2921 :
2922 : mutex_lock(&rt_constraints_mutex);
2923 : ret = __rt_schedulable(NULL, 0, 0);
2924 : mutex_unlock(&rt_constraints_mutex);
2925 :
2926 : return ret;
2927 : }
2928 : #endif /* CONFIG_SYSCTL */
2929 :
2930 : int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2931 : {
2932 : /* Don't accept realtime tasks when there is no way for them to run */
2933 : if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2934 : return 0;
2935 :
2936 : return 1;
2937 : }
2938 :
2939 : #else /* !CONFIG_RT_GROUP_SCHED */
2940 :
2941 : #ifdef CONFIG_SYSCTL
2942 0 : static int sched_rt_global_constraints(void)
2943 : {
2944 : unsigned long flags;
2945 : int i;
2946 :
2947 0 : raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2948 0 : for_each_possible_cpu(i) {
2949 0 : struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2950 :
2951 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
2952 0 : rt_rq->rt_runtime = global_rt_runtime();
2953 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
2954 : }
2955 0 : raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2956 :
2957 0 : return 0;
2958 : }
2959 : #endif /* CONFIG_SYSCTL */
2960 : #endif /* CONFIG_RT_GROUP_SCHED */
2961 :
2962 : #ifdef CONFIG_SYSCTL
2963 : static int sched_rt_global_validate(void)
2964 : {
2965 0 : if (sysctl_sched_rt_period <= 0)
2966 : return -EINVAL;
2967 :
2968 0 : if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2969 0 : ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
2970 0 : ((u64)sysctl_sched_rt_runtime *
2971 : NSEC_PER_USEC > max_rt_runtime)))
2972 : return -EINVAL;
2973 :
2974 : return 0;
2975 : }
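/*
 * Illustrative sketch, not kernel code: the same global sanity rules as
 * sched_rt_global_validate() above, written as a stand-alone check.  The -1
 * sentinel plays the role of RUNTIME_INF, and the runtime cap mirrors the
 * max_rt_runtime bound described at the top of this file; both constants
 * are restated here only for the example.
 */
#include <stdint.h>

#define EX_RUNTIME_INF		(-1LL)
#define EX_NSEC_PER_USEC	1000LL
#define EX_MAX_RT_RUNTIME_NS	((1ULL << (64 - 20)) - 1)	/* ~4.8 hours */

static int ex_rt_global_valid(int64_t period_us, int64_t runtime_us)
{
	if (period_us <= 0)
		return 0;
	if (runtime_us == EX_RUNTIME_INF)
		return 1;
	if (runtime_us > period_us)
		return 0;
	if ((uint64_t)runtime_us * EX_NSEC_PER_USEC > EX_MAX_RT_RUNTIME_NS)
		return 0;
	return 1;
}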
2976 :
2977 0 : static void sched_rt_do_global(void)
2978 : {
2979 : unsigned long flags;
2980 :
2981 0 : raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2982 0 : def_rt_bandwidth.rt_runtime = global_rt_runtime();
2983 0 : def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2984 0 : raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2985 0 : }
2986 :
2987 0 : static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
2988 : size_t *lenp, loff_t *ppos)
2989 : {
2990 : int old_period, old_runtime;
2991 : static DEFINE_MUTEX(mutex);
2992 : int ret;
2993 :
2994 0 : mutex_lock(&mutex);
2995 0 : old_period = sysctl_sched_rt_period;
2996 0 : old_runtime = sysctl_sched_rt_runtime;
2997 :
2998 0 : ret = proc_dointvec(table, write, buffer, lenp, ppos);
2999 :
3000 0 : if (!ret && write) {
3001 0 : ret = sched_rt_global_validate();
3002 0 : if (ret)
3003 : goto undo;
3004 :
3005 0 : ret = sched_dl_global_validate();
3006 0 : if (ret)
3007 : goto undo;
3008 :
3009 0 : ret = sched_rt_global_constraints();
3010 0 : if (ret)
3011 : goto undo;
3012 :
3013 0 : sched_rt_do_global();
3014 0 : sched_dl_do_global();
3015 : }
3016 : if (0) {
3017 : undo:
3018 0 : sysctl_sched_rt_period = old_period;
3019 0 : sysctl_sched_rt_runtime = old_runtime;
3020 : }
3021 0 : mutex_unlock(&mutex);
3022 :
3023 0 : return ret;
3024 : }
3025 :
3026 0 : static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
3027 : size_t *lenp, loff_t *ppos)
3028 : {
3029 : int ret;
3030 : static DEFINE_MUTEX(mutex);
3031 :
3032 0 : mutex_lock(&mutex);
3033 0 : ret = proc_dointvec(table, write, buffer, lenp, ppos);
3034 : /*
3035 : * Make sure that internally we keep jiffies.
3036 : * Also, writing a zero or negative value resets the timeslice to the default.
3037 : */
3038 0 : if (!ret && write) {
3039 0 : sched_rr_timeslice =
3040 0 : sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
3041 0 : msecs_to_jiffies(sysctl_sched_rr_timeslice);
3042 : }
3043 0 : mutex_unlock(&mutex);
3044 :
3045 0 : return ret;
3046 : }
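/*
 * Illustrative user-space sketch: the handlers above are reached through
 * /proc/sys/kernel/sched_rt_period_us, sched_rt_runtime_us and
 * sched_rr_timeslice_ms.  Stand-alone example (compiled separately) that
 * prints the current global RT budget; writing the files follows the same
 * pattern with mode "w", requires root, and must pass the validation above.
 */
#include <stdio.h>

static long ex_read_long(const char *path)
{
	long val = -1;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	long period = ex_read_long("/proc/sys/kernel/sched_rt_period_us");
	long runtime = ex_read_long("/proc/sys/kernel/sched_rt_runtime_us");

	printf("RT tasks may use %ld of every %ld microseconds\n",
	       runtime, period);
	return 0;
}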
3047 : #endif /* CONFIG_SYSCTL */
3048 :
3049 : #ifdef CONFIG_SCHED_DEBUG
3050 0 : void print_rt_stats(struct seq_file *m, int cpu)
3051 : {
3052 : rt_rq_iter_t iter;
3053 : struct rt_rq *rt_rq;
3054 :
3055 : rcu_read_lock();
3056 0 : for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
3057 0 : print_rt_rq(m, cpu, rt_rq);
3058 : rcu_read_unlock();
3059 0 : }
3060 : #endif /* CONFIG_SCHED_DEBUG */