Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 : * policies)
5 : */
6 :
7 : int sched_rr_timeslice = RR_TIMESLICE;
8 : /* More than 4 hours if BW_SHIFT equals 20. */
9 : static const u64 max_rt_runtime = MAX_BW;
10 :
11 : static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
12 :
13 : struct rt_bandwidth def_rt_bandwidth;
14 :
15 : /*
16 : * period over which we measure -rt task CPU usage in us.
17 : * default: 1s
18 : */
19 : unsigned int sysctl_sched_rt_period = 1000000;
20 :
21 : /*
22 : * part of the period that we allow rt tasks to run in us.
23 : * default: 0.95s
24 : */
25 : int sysctl_sched_rt_runtime = 950000;
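/*
 * Illustrative sketch (hypothetical helper, not in the upstream file): with
 * the defaults above, RT tasks may consume at most 950000 us of every
 * 1000000 us period on each CPU; the remaining 50000 us is left for
 * non-RT tasks. Assumes throttling is enabled (runtime >= 0).
 */
static inline int rt_period_slack_us(void)
{
        return (int)sysctl_sched_rt_period - sysctl_sched_rt_runtime;
}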
26 :
27 : #ifdef CONFIG_SYSCTL
28 : static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
29 : static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
30 : size_t *lenp, loff_t *ppos);
31 : static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
32 : size_t *lenp, loff_t *ppos);
33 : static struct ctl_table sched_rt_sysctls[] = {
34 : {
35 : .procname = "sched_rt_period_us",
36 : .data = &sysctl_sched_rt_period,
37 : .maxlen = sizeof(unsigned int),
38 : .mode = 0644,
39 : .proc_handler = sched_rt_handler,
40 : },
41 : {
42 : .procname = "sched_rt_runtime_us",
43 : .data = &sysctl_sched_rt_runtime,
44 : .maxlen = sizeof(int),
45 : .mode = 0644,
46 : .proc_handler = sched_rt_handler,
47 : },
48 : {
49 : .procname = "sched_rr_timeslice_ms",
50 : .data = &sysctl_sched_rr_timeslice,
51 : .maxlen = sizeof(int),
52 : .mode = 0644,
53 : .proc_handler = sched_rr_handler,
54 : },
55 : {}
56 : };
57 :
58 1 : static int __init sched_rt_sysctl_init(void)
59 : {
60 1 : register_sysctl_init("kernel", sched_rt_sysctls);
61 1 : return 0;
62 : }
63 : late_initcall(sched_rt_sysctl_init);
64 : #endif
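/*
 * Note: the table above exposes these knobs under /proc/sys/kernel/
 * (sched_rt_period_us, sched_rt_runtime_us, sched_rr_timeslice_ms).
 * Writing -1 to sched_rt_runtime_us disables RT throttling, i.e. the
 * runtime becomes RUNTIME_INF.
 */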
65 :
66 0 : static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
67 : {
68 0 : struct rt_bandwidth *rt_b =
69 0 : container_of(timer, struct rt_bandwidth, rt_period_timer);
70 0 : int idle = 0;
71 : int overrun;
72 :
73 0 : raw_spin_lock(&rt_b->rt_runtime_lock);
74 : for (;;) {
75 0 : overrun = hrtimer_forward_now(timer, rt_b->rt_period);
76 0 : if (!overrun)
77 : break;
78 :
79 0 : raw_spin_unlock(&rt_b->rt_runtime_lock);
80 0 : idle = do_sched_rt_period_timer(rt_b, overrun);
81 0 : raw_spin_lock(&rt_b->rt_runtime_lock);
82 : }
83 0 : if (idle)
84 0 : rt_b->rt_period_active = 0;
85 0 : raw_spin_unlock(&rt_b->rt_runtime_lock);
86 :
87 0 : return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
88 : }
89 :
90 1 : void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
91 : {
92 1 : rt_b->rt_period = ns_to_ktime(period);
93 1 : rt_b->rt_runtime = runtime;
94 :
95 : raw_spin_lock_init(&rt_b->rt_runtime_lock);
96 :
97 1 : hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
98 : HRTIMER_MODE_REL_HARD);
99 1 : rt_b->rt_period_timer.function = sched_rt_period_timer;
100 1 : }
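/*
 * Usage sketch (hypothetical wrapper, not in the upstream file; the
 * equivalent call is made during scheduler init in core.c): how
 * def_rt_bandwidth would be set up from the sysctls above, converting
 * microseconds to nanoseconds. Upstream additionally maps a negative
 * runtime to RUNTIME_INF, which is omitted here.
 */
static inline void sketch_init_def_rt_bandwidth(void)
{
        init_rt_bandwidth(&def_rt_bandwidth,
                          (u64)sysctl_sched_rt_period * NSEC_PER_USEC,
                          (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC);
}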
101 :
102 0 : static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
103 : {
104 0 : raw_spin_lock(&rt_b->rt_runtime_lock);
105 0 : if (!rt_b->rt_period_active) {
106 0 : rt_b->rt_period_active = 1;
107 : /*
108 : * SCHED_DEADLINE updates the bandwidth, as a run away
109 : * RT task with a DL task could hog a CPU. But DL does
110 : * not reset the period. If a deadline task was running
111 : * without an RT task running, it can cause RT tasks to
112 : * throttle when they start up. Kick the timer right away
113 : * to update the period.
114 : */
115 0 : hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
116 0 : hrtimer_start_expires(&rt_b->rt_period_timer,
117 : HRTIMER_MODE_ABS_PINNED_HARD);
118 : }
119 0 : raw_spin_unlock(&rt_b->rt_runtime_lock);
120 0 : }
121 :
122 : static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
123 : {
124 0 : if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
125 : return;
126 :
127 0 : do_start_rt_bandwidth(rt_b);
128 : }
129 :
130 1 : void init_rt_rq(struct rt_rq *rt_rq)
131 : {
132 : struct rt_prio_array *array;
133 : int i;
134 :
135 1 : array = &rt_rq->active;
136 101 : for (i = 0; i < MAX_RT_PRIO; i++) {
137 200 : INIT_LIST_HEAD(array->queue + i);
138 200 : __clear_bit(i, array->bitmap);
139 : }
140 : /* delimiter for bitsearch: */
141 2 : __set_bit(MAX_RT_PRIO, array->bitmap);
142 :
143 : #if defined CONFIG_SMP
144 : rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
145 : rt_rq->highest_prio.next = MAX_RT_PRIO-1;
146 : rt_rq->rt_nr_migratory = 0;
147 : rt_rq->overloaded = 0;
148 : plist_head_init(&rt_rq->pushable_tasks);
149 : #endif /* CONFIG_SMP */
150 : /* We start in a dequeued state, because no RT tasks are queued */
151 1 : rt_rq->rt_queued = 0;
152 :
153 1 : rt_rq->rt_time = 0;
154 1 : rt_rq->rt_throttled = 0;
155 1 : rt_rq->rt_runtime = 0;
156 : raw_spin_lock_init(&rt_rq->rt_runtime_lock);
157 1 : }
158 :
159 : #ifdef CONFIG_RT_GROUP_SCHED
160 : static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
161 : {
162 : hrtimer_cancel(&rt_b->rt_period_timer);
163 : }
164 :
165 : #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
166 :
167 : static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
168 : {
169 : #ifdef CONFIG_SCHED_DEBUG
170 : WARN_ON_ONCE(!rt_entity_is_task(rt_se));
171 : #endif
172 : return container_of(rt_se, struct task_struct, rt);
173 : }
174 :
175 : static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
176 : {
177 : return rt_rq->rq;
178 : }
179 :
180 : static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
181 : {
182 : return rt_se->rt_rq;
183 : }
184 :
185 : static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
186 : {
187 : struct rt_rq *rt_rq = rt_se->rt_rq;
188 :
189 : return rt_rq->rq;
190 : }
191 :
192 : void unregister_rt_sched_group(struct task_group *tg)
193 : {
194 : if (tg->rt_se)
195 : destroy_rt_bandwidth(&tg->rt_bandwidth);
196 :
197 : }
198 :
199 : void free_rt_sched_group(struct task_group *tg)
200 : {
201 : int i;
202 :
203 : for_each_possible_cpu(i) {
204 : if (tg->rt_rq)
205 : kfree(tg->rt_rq[i]);
206 : if (tg->rt_se)
207 : kfree(tg->rt_se[i]);
208 : }
209 :
210 : kfree(tg->rt_rq);
211 : kfree(tg->rt_se);
212 : }
213 :
214 : void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
215 : struct sched_rt_entity *rt_se, int cpu,
216 : struct sched_rt_entity *parent)
217 : {
218 : struct rq *rq = cpu_rq(cpu);
219 :
220 : rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
221 : rt_rq->rt_nr_boosted = 0;
222 : rt_rq->rq = rq;
223 : rt_rq->tg = tg;
224 :
225 : tg->rt_rq[cpu] = rt_rq;
226 : tg->rt_se[cpu] = rt_se;
227 :
228 : if (!rt_se)
229 : return;
230 :
231 : if (!parent)
232 : rt_se->rt_rq = &rq->rt;
233 : else
234 : rt_se->rt_rq = parent->my_q;
235 :
236 : rt_se->my_q = rt_rq;
237 : rt_se->parent = parent;
238 : INIT_LIST_HEAD(&rt_se->run_list);
239 : }
240 :
241 : int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
242 : {
243 : struct rt_rq *rt_rq;
244 : struct sched_rt_entity *rt_se;
245 : int i;
246 :
247 : tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
248 : if (!tg->rt_rq)
249 : goto err;
250 : tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
251 : if (!tg->rt_se)
252 : goto err;
253 :
254 : init_rt_bandwidth(&tg->rt_bandwidth,
255 : ktime_to_ns(def_rt_bandwidth.rt_period), 0);
256 :
257 : for_each_possible_cpu(i) {
258 : rt_rq = kzalloc_node(sizeof(struct rt_rq),
259 : GFP_KERNEL, cpu_to_node(i));
260 : if (!rt_rq)
261 : goto err;
262 :
263 : rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
264 : GFP_KERNEL, cpu_to_node(i));
265 : if (!rt_se)
266 : goto err_free_rq;
267 :
268 : init_rt_rq(rt_rq);
269 : rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
270 : init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
271 : }
272 :
273 : return 1;
274 :
275 : err_free_rq:
276 : kfree(rt_rq);
277 : err:
278 : return 0;
279 : }
280 :
281 : #else /* CONFIG_RT_GROUP_SCHED */
282 :
283 : #define rt_entity_is_task(rt_se) (1)
284 :
285 : static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
286 : {
287 0 : return container_of(rt_se, struct task_struct, rt);
288 : }
289 :
290 : static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
291 : {
292 0 : return container_of(rt_rq, struct rq, rt);
293 : }
294 :
295 : static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
296 : {
297 0 : struct task_struct *p = rt_task_of(rt_se);
298 :
299 0 : return task_rq(p);
300 : }
301 :
302 : static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
303 : {
304 0 : struct rq *rq = rq_of_rt_se(rt_se);
305 :
306 : return &rq->rt;
307 : }
308 :
309 0 : void unregister_rt_sched_group(struct task_group *tg) { }
310 :
311 0 : void free_rt_sched_group(struct task_group *tg) { }
312 :
313 0 : int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
314 : {
315 0 : return 1;
316 : }
317 : #endif /* CONFIG_RT_GROUP_SCHED */
318 :
319 : #ifdef CONFIG_SMP
320 :
321 : static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
322 : {
323 : /* Try to pull RT tasks here if we lower this rq's prio */
324 : return rq->online && rq->rt.highest_prio.curr > prev->prio;
325 : }
326 :
327 : static inline int rt_overloaded(struct rq *rq)
328 : {
329 : return atomic_read(&rq->rd->rto_count);
330 : }
331 :
332 : static inline void rt_set_overload(struct rq *rq)
333 : {
334 : if (!rq->online)
335 : return;
336 :
337 : cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
338 : /*
339 : * Make sure the mask is visible before we set
340 : * the overload count. That is checked to determine
341 : * if we should look at the mask. It would be a shame
342 : * if we looked at the mask, but the mask was not
343 : * updated yet.
344 : *
345 : * Matched by the barrier in pull_rt_task().
346 : */
347 : smp_wmb();
348 : atomic_inc(&rq->rd->rto_count);
349 : }
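/*
 * Illustrative sketch (hypothetical reader, mirroring the check done by
 * pull_rt_task(); not in the upstream file): the consumer must read
 * rto_count before the mask, with a read barrier in between, so that a
 * non-zero count guarantees the mask update above is visible.
 */
static inline bool rt_overload_visible(struct rq *this_rq)
{
        if (!atomic_read(&this_rq->rd->rto_count))
                return false;
        smp_rmb();      /* pairs with smp_wmb() in rt_set_overload() */
        return true;
}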
350 :
351 : static inline void rt_clear_overload(struct rq *rq)
352 : {
353 : if (!rq->online)
354 : return;
355 :
356 : /* the order here really doesn't matter */
357 : atomic_dec(&rq->rd->rto_count);
358 : cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
359 : }
360 :
361 : static void update_rt_migration(struct rt_rq *rt_rq)
362 : {
363 : if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
364 : if (!rt_rq->overloaded) {
365 : rt_set_overload(rq_of_rt_rq(rt_rq));
366 : rt_rq->overloaded = 1;
367 : }
368 : } else if (rt_rq->overloaded) {
369 : rt_clear_overload(rq_of_rt_rq(rt_rq));
370 : rt_rq->overloaded = 0;
371 : }
372 : }
373 :
374 : static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
375 : {
376 : struct task_struct *p;
377 :
378 : if (!rt_entity_is_task(rt_se))
379 : return;
380 :
381 : p = rt_task_of(rt_se);
382 : rt_rq = &rq_of_rt_rq(rt_rq)->rt;
383 :
384 : rt_rq->rt_nr_total++;
385 : if (p->nr_cpus_allowed > 1)
386 : rt_rq->rt_nr_migratory++;
387 :
388 : update_rt_migration(rt_rq);
389 : }
390 :
391 : static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
392 : {
393 : struct task_struct *p;
394 :
395 : if (!rt_entity_is_task(rt_se))
396 : return;
397 :
398 : p = rt_task_of(rt_se);
399 : rt_rq = &rq_of_rt_rq(rt_rq)->rt;
400 :
401 : rt_rq->rt_nr_total--;
402 : if (p->nr_cpus_allowed > 1)
403 : rt_rq->rt_nr_migratory--;
404 :
405 : update_rt_migration(rt_rq);
406 : }
407 :
408 : static inline int has_pushable_tasks(struct rq *rq)
409 : {
410 : return !plist_head_empty(&rq->rt.pushable_tasks);
411 : }
412 :
413 : static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
414 : static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);
415 :
416 : static void push_rt_tasks(struct rq *);
417 : static void pull_rt_task(struct rq *);
418 :
419 : static inline void rt_queue_push_tasks(struct rq *rq)
420 : {
421 : if (!has_pushable_tasks(rq))
422 : return;
423 :
424 : queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
425 : }
426 :
427 : static inline void rt_queue_pull_task(struct rq *rq)
428 : {
429 : queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
430 : }
431 :
432 : static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
433 : {
434 : plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
435 : plist_node_init(&p->pushable_tasks, p->prio);
436 : plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
437 :
438 : /* Update the highest prio pushable task */
439 : if (p->prio < rq->rt.highest_prio.next)
440 : rq->rt.highest_prio.next = p->prio;
441 : }
442 :
443 : static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
444 : {
445 : plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
446 :
447 : /* Update the new highest prio pushable task */
448 : if (has_pushable_tasks(rq)) {
449 : p = plist_first_entry(&rq->rt.pushable_tasks,
450 : struct task_struct, pushable_tasks);
451 : rq->rt.highest_prio.next = p->prio;
452 : } else {
453 : rq->rt.highest_prio.next = MAX_RT_PRIO-1;
454 : }
455 : }
456 :
457 : #else
458 :
459 : static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
460 : {
461 : }
462 :
463 : static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
464 : {
465 : }
466 :
467 : static inline
468 : void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
469 : {
470 : }
471 :
472 : static inline
473 : void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
474 : {
475 : }
476 :
477 : static inline void rt_queue_push_tasks(struct rq *rq)
478 : {
479 : }
480 : #endif /* CONFIG_SMP */
481 :
482 : static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
483 : static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
484 :
485 : static inline int on_rt_rq(struct sched_rt_entity *rt_se)
486 : {
487 0 : return rt_se->on_rq;
488 : }
489 :
490 : #ifdef CONFIG_UCLAMP_TASK
491 : /*
492 : * Verify the fitness of task @p to run on @cpu taking into account the uclamp
493 : * settings.
494 : *
495 : * This check is only important for heterogeneous systems where the uclamp_min
496 : * value is higher than the capacity of a @cpu. For non-heterogeneous systems this
497 : * function will always return true.
498 : *
499 : * The function will return true if the capacity of the @cpu is >= the
500 : * uclamp_min and false otherwise.
501 : *
502 : * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
503 : * > uclamp_max.
504 : */
505 : static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
506 : {
507 : unsigned int min_cap;
508 : unsigned int max_cap;
509 : unsigned int cpu_cap;
510 :
511 : /* Only heterogeneous systems can benefit from this check */
512 : if (!sched_asym_cpucap_active())
513 : return true;
514 :
515 : min_cap = uclamp_eff_value(p, UCLAMP_MIN);
516 : max_cap = uclamp_eff_value(p, UCLAMP_MAX);
517 :
518 : cpu_cap = capacity_orig_of(cpu);
519 :
520 : return cpu_cap >= min(min_cap, max_cap);
521 : }
522 : #else
523 : static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
524 : {
525 : return true;
526 : }
527 : #endif
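/*
 * Usage sketch (hypothetical helper, not in the upstream file): callers
 * such as the wakeup CPU selection below combine the capacity check with
 * the task's affinity mask, roughly as follows.
 */
static inline bool rt_cpu_is_suitable(struct task_struct *p, int cpu)
{
        return cpumask_test_cpu(cpu, &p->cpus_mask) &&
               rt_task_fits_capacity(p, cpu);
}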
528 :
529 : #ifdef CONFIG_RT_GROUP_SCHED
530 :
531 : static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
532 : {
533 : if (!rt_rq->tg)
534 : return RUNTIME_INF;
535 :
536 : return rt_rq->rt_runtime;
537 : }
538 :
539 : static inline u64 sched_rt_period(struct rt_rq *rt_rq)
540 : {
541 : return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
542 : }
543 :
544 : typedef struct task_group *rt_rq_iter_t;
545 :
546 : static inline struct task_group *next_task_group(struct task_group *tg)
547 : {
548 : do {
549 : tg = list_entry_rcu(tg->list.next,
550 : typeof(struct task_group), list);
551 : } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
552 :
553 : if (&tg->list == &task_groups)
554 : tg = NULL;
555 :
556 : return tg;
557 : }
558 :
559 : #define for_each_rt_rq(rt_rq, iter, rq) \
560 : for (iter = container_of(&task_groups, typeof(*iter), list); \
561 : (iter = next_task_group(iter)) && \
562 : (rt_rq = iter->rt_rq[cpu_of(rq)]);)
563 :
564 : #define for_each_sched_rt_entity(rt_se) \
565 : for (; rt_se; rt_se = rt_se->parent)
566 :
567 : static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
568 : {
569 : return rt_se->my_q;
570 : }
571 :
572 : static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
573 : static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
574 :
575 : static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
576 : {
577 : struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
578 : struct rq *rq = rq_of_rt_rq(rt_rq);
579 : struct sched_rt_entity *rt_se;
580 :
581 : int cpu = cpu_of(rq);
582 :
583 : rt_se = rt_rq->tg->rt_se[cpu];
584 :
585 : if (rt_rq->rt_nr_running) {
586 : if (!rt_se)
587 : enqueue_top_rt_rq(rt_rq);
588 : else if (!on_rt_rq(rt_se))
589 : enqueue_rt_entity(rt_se, 0);
590 :
591 : if (rt_rq->highest_prio.curr < curr->prio)
592 : resched_curr(rq);
593 : }
594 : }
595 :
596 : static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
597 : {
598 : struct sched_rt_entity *rt_se;
599 : int cpu = cpu_of(rq_of_rt_rq(rt_rq));
600 :
601 : rt_se = rt_rq->tg->rt_se[cpu];
602 :
603 : if (!rt_se) {
604 : dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
605 : /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
606 : cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
607 : }
608 : else if (on_rt_rq(rt_se))
609 : dequeue_rt_entity(rt_se, 0);
610 : }
611 :
612 : static inline int rt_rq_throttled(struct rt_rq *rt_rq)
613 : {
614 : return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
615 : }
616 :
617 : static int rt_se_boosted(struct sched_rt_entity *rt_se)
618 : {
619 : struct rt_rq *rt_rq = group_rt_rq(rt_se);
620 : struct task_struct *p;
621 :
622 : if (rt_rq)
623 : return !!rt_rq->rt_nr_boosted;
624 :
625 : p = rt_task_of(rt_se);
626 : return p->prio != p->normal_prio;
627 : }
628 :
629 : #ifdef CONFIG_SMP
630 : static inline const struct cpumask *sched_rt_period_mask(void)
631 : {
632 : return this_rq()->rd->span;
633 : }
634 : #else
635 : static inline const struct cpumask *sched_rt_period_mask(void)
636 : {
637 : return cpu_online_mask;
638 : }
639 : #endif
640 :
641 : static inline
642 : struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
643 : {
644 : return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
645 : }
646 :
647 : static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
648 : {
649 : return &rt_rq->tg->rt_bandwidth;
650 : }
651 :
652 : #else /* !CONFIG_RT_GROUP_SCHED */
653 :
654 : static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
655 : {
656 : return rt_rq->rt_runtime;
657 : }
658 :
659 : static inline u64 sched_rt_period(struct rt_rq *rt_rq)
660 : {
661 0 : return ktime_to_ns(def_rt_bandwidth.rt_period);
662 : }
663 :
664 : typedef struct rt_rq *rt_rq_iter_t;
665 :
666 : #define for_each_rt_rq(rt_rq, iter, rq) \
667 : for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
668 :
669 : #define for_each_sched_rt_entity(rt_se) \
670 : for (; rt_se; rt_se = NULL)
671 :
672 : static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
673 : {
674 : return NULL;
675 : }
676 :
677 0 : static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
678 : {
679 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
680 :
681 0 : if (!rt_rq->rt_nr_running)
682 : return;
683 :
684 0 : enqueue_top_rt_rq(rt_rq);
685 0 : resched_curr(rq);
686 : }
687 :
688 : static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
689 : {
690 0 : dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
691 : }
692 :
693 : static inline int rt_rq_throttled(struct rt_rq *rt_rq)
694 : {
695 : return rt_rq->rt_throttled;
696 : }
697 :
698 : static inline const struct cpumask *sched_rt_period_mask(void)
699 : {
700 : return cpu_online_mask;
701 : }
702 :
703 : static inline
704 : struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
705 : {
706 0 : return &cpu_rq(cpu)->rt;
707 : }
708 :
709 : static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
710 : {
711 : return &def_rt_bandwidth;
712 : }
713 :
714 : #endif /* CONFIG_RT_GROUP_SCHED */
715 :
716 0 : bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
717 : {
718 0 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
719 :
720 0 : return (hrtimer_active(&rt_b->rt_period_timer) ||
721 0 : rt_rq->rt_time < rt_b->rt_runtime);
722 : }
723 :
724 : #ifdef CONFIG_SMP
725 : /*
726 : * We ran out of runtime, see if we can borrow some from our neighbours.
727 : */
728 : static void do_balance_runtime(struct rt_rq *rt_rq)
729 : {
730 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
731 : struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
732 : int i, weight;
733 : u64 rt_period;
734 :
735 : weight = cpumask_weight(rd->span);
736 :
737 : raw_spin_lock(&rt_b->rt_runtime_lock);
738 : rt_period = ktime_to_ns(rt_b->rt_period);
739 : for_each_cpu(i, rd->span) {
740 : struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
741 : s64 diff;
742 :
743 : if (iter == rt_rq)
744 : continue;
745 :
746 : raw_spin_lock(&iter->rt_runtime_lock);
747 : /*
748 : * Either all rqs have inf runtime and there's nothing to steal
749 : * or __disable_runtime() below sets a specific rq to inf to
750 : * indicate it's been disabled and disallow stealing.
751 : */
752 : if (iter->rt_runtime == RUNTIME_INF)
753 : goto next;
754 :
755 : /*
756 : * From runqueues with spare time, take 1/n part of their
757 : * spare time, but no more than our period.
758 : */
759 : diff = iter->rt_runtime - iter->rt_time;
760 : if (diff > 0) {
761 : diff = div_u64((u64)diff, weight);
762 : if (rt_rq->rt_runtime + diff > rt_period)
763 : diff = rt_period - rt_rq->rt_runtime;
764 : iter->rt_runtime -= diff;
765 : rt_rq->rt_runtime += diff;
766 : if (rt_rq->rt_runtime == rt_period) {
767 : raw_spin_unlock(&iter->rt_runtime_lock);
768 : break;
769 : }
770 : }
771 : next:
772 : raw_spin_unlock(&iter->rt_runtime_lock);
773 : }
774 : raw_spin_unlock(&rt_b->rt_runtime_lock);
775 : }
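/*
 * Illustrative sketch (hypothetical, not in the upstream file): the 1/n
 * borrowing rule above in isolation. With weight == 4 CPUs and a neighbour
 * holding 40 ms of spare runtime, we take 40/4 = 10 ms, clamped so that our
 * own rt_runtime never exceeds the period (assumes have <= period).
 */
static inline s64 rt_borrow_share(s64 spare, int weight, u64 have, u64 period)
{
        s64 diff = div_u64((u64)spare, weight); /* take 1/n of the slack */

        if (have + diff > period)               /* never exceed one period */
                diff = period - have;
        return diff;
}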
776 :
777 : /*
778 : * Ensure this RQ takes back all the runtime it lent to its neighbours.
779 : */
780 : static void __disable_runtime(struct rq *rq)
781 : {
782 : struct root_domain *rd = rq->rd;
783 : rt_rq_iter_t iter;
784 : struct rt_rq *rt_rq;
785 :
786 : if (unlikely(!scheduler_running))
787 : return;
788 :
789 : for_each_rt_rq(rt_rq, iter, rq) {
790 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
791 : s64 want;
792 : int i;
793 :
794 : raw_spin_lock(&rt_b->rt_runtime_lock);
795 : raw_spin_lock(&rt_rq->rt_runtime_lock);
796 : /*
797 : * Either we're all inf and nobody needs to borrow, or we're
798 : * already disabled and thus have nothing to do, or we have
799 : * exactly the right amount of runtime to take out.
800 : */
801 : if (rt_rq->rt_runtime == RUNTIME_INF ||
802 : rt_rq->rt_runtime == rt_b->rt_runtime)
803 : goto balanced;
804 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
805 :
806 : /*
807 : * Calculate the difference between what we started out with
808 : * and what we currently have; that's the amount of runtime
809 : * we lent out and now have to reclaim.
810 : */
811 : want = rt_b->rt_runtime - rt_rq->rt_runtime;
812 :
813 : /*
814 : * Greedy reclaim, take back as much as we can.
815 : */
816 : for_each_cpu(i, rd->span) {
817 : struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
818 : s64 diff;
819 :
820 : /*
821 : * Can't reclaim from ourselves or disabled runqueues.
822 : */
823 : if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
824 : continue;
825 :
826 : raw_spin_lock(&iter->rt_runtime_lock);
827 : if (want > 0) {
828 : diff = min_t(s64, iter->rt_runtime, want);
829 : iter->rt_runtime -= diff;
830 : want -= diff;
831 : } else {
832 : iter->rt_runtime -= want;
833 : want -= want;
834 : }
835 : raw_spin_unlock(&iter->rt_runtime_lock);
836 :
837 : if (!want)
838 : break;
839 : }
840 :
841 : raw_spin_lock(&rt_rq->rt_runtime_lock);
842 : /*
843 : * We cannot be left wanting - that would mean some runtime
844 : * leaked out of the system.
845 : */
846 : WARN_ON_ONCE(want);
847 : balanced:
848 : /*
849 : * Disable all the borrow logic by pretending we have inf
850 : * runtime - in which case borrowing doesn't make sense.
851 : */
852 : rt_rq->rt_runtime = RUNTIME_INF;
853 : rt_rq->rt_throttled = 0;
854 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
855 : raw_spin_unlock(&rt_b->rt_runtime_lock);
856 :
857 : /* Make rt_rq available for pick_next_task() */
858 : sched_rt_rq_enqueue(rt_rq);
859 : }
860 : }
861 :
862 : static void __enable_runtime(struct rq *rq)
863 : {
864 : rt_rq_iter_t iter;
865 : struct rt_rq *rt_rq;
866 :
867 : if (unlikely(!scheduler_running))
868 : return;
869 :
870 : /*
871 : * Reset each runqueue's bandwidth settings
872 : */
873 : for_each_rt_rq(rt_rq, iter, rq) {
874 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
875 :
876 : raw_spin_lock(&rt_b->rt_runtime_lock);
877 : raw_spin_lock(&rt_rq->rt_runtime_lock);
878 : rt_rq->rt_runtime = rt_b->rt_runtime;
879 : rt_rq->rt_time = 0;
880 : rt_rq->rt_throttled = 0;
881 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
882 : raw_spin_unlock(&rt_b->rt_runtime_lock);
883 : }
884 : }
885 :
886 : static void balance_runtime(struct rt_rq *rt_rq)
887 : {
888 : if (!sched_feat(RT_RUNTIME_SHARE))
889 : return;
890 :
891 : if (rt_rq->rt_time > rt_rq->rt_runtime) {
892 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
893 : do_balance_runtime(rt_rq);
894 : raw_spin_lock(&rt_rq->rt_runtime_lock);
895 : }
896 : }
897 : #else /* !CONFIG_SMP */
898 : static inline void balance_runtime(struct rt_rq *rt_rq) {}
899 : #endif /* CONFIG_SMP */
900 :
901 0 : static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
902 : {
903 0 : int i, idle = 1, throttled = 0;
904 : const struct cpumask *span;
905 :
906 0 : span = sched_rt_period_mask();
907 : #ifdef CONFIG_RT_GROUP_SCHED
908 : /*
909 : * FIXME: isolated CPUs should really leave the root task group,
910 : * whether they are isolcpus or were isolated via cpusets, lest
911 : * the timer run on a CPU which does not service all runqueues,
912 : * potentially leaving other CPUs indefinitely throttled. If
913 : * isolation is really required, the user will turn the throttle
914 : * off to kill the perturbations it causes anyway. Meanwhile,
915 : * this maintains functionality for boot and/or troubleshooting.
916 : */
917 : if (rt_b == &root_task_group.rt_bandwidth)
918 : span = cpu_online_mask;
919 : #endif
920 0 : for_each_cpu(i, span) {
921 0 : int enqueue = 0;
922 0 : struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
923 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
924 : struct rq_flags rf;
925 : int skip;
926 :
927 : /*
928 : * When span == cpu_online_mask, taking each rq->lock
929 : * can be time-consuming. Try to avoid it when possible.
930 : */
931 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
932 0 : if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
933 0 : rt_rq->rt_runtime = rt_b->rt_runtime;
934 0 : skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
935 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
936 0 : if (skip)
937 0 : continue;
938 :
939 0 : rq_lock(rq, &rf);
940 0 : update_rq_clock(rq);
941 :
942 0 : if (rt_rq->rt_time) {
943 : u64 runtime;
944 :
945 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
946 0 : if (rt_rq->rt_throttled)
947 : balance_runtime(rt_rq);
948 0 : runtime = rt_rq->rt_runtime;
949 0 : rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
950 0 : if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
951 0 : rt_rq->rt_throttled = 0;
952 0 : enqueue = 1;
953 :
954 : /*
955 : * When we're idle and a woken (rt) task is
954 : * throttled, check_preempt_curr() will set
957 : * skip_update and the time between the wakeup
958 : * and this unthrottle will get accounted as
959 : * 'runtime'.
960 : */
961 0 : if (rt_rq->rt_nr_running && rq->curr == rq->idle)
962 : rq_clock_cancel_skipupdate(rq);
963 : }
964 0 : if (rt_rq->rt_time || rt_rq->rt_nr_running)
965 0 : idle = 0;
966 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
967 0 : } else if (rt_rq->rt_nr_running) {
968 0 : idle = 0;
969 0 : if (!rt_rq_throttled(rt_rq))
970 0 : enqueue = 1;
971 : }
972 0 : if (rt_rq->rt_throttled)
973 0 : throttled = 1;
974 :
975 0 : if (enqueue)
976 0 : sched_rt_rq_enqueue(rt_rq);
977 0 : rq_unlock(rq, &rf);
978 : }
979 :
980 0 : if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
981 : return 1;
982 :
983 : return idle;
984 : }
985 :
986 : static inline int rt_se_prio(struct sched_rt_entity *rt_se)
987 : {
988 : #ifdef CONFIG_RT_GROUP_SCHED
989 : struct rt_rq *rt_rq = group_rt_rq(rt_se);
990 :
991 : if (rt_rq)
992 : return rt_rq->highest_prio.curr;
993 : #endif
994 :
995 0 : return rt_task_of(rt_se)->prio;
996 : }
997 :
998 0 : static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
999 : {
1000 0 : u64 runtime = sched_rt_runtime(rt_rq);
1001 :
1002 0 : if (rt_rq->rt_throttled)
1003 : return rt_rq_throttled(rt_rq);
1004 :
1005 0 : if (runtime >= sched_rt_period(rt_rq))
1006 : return 0;
1007 :
1008 0 : balance_runtime(rt_rq);
1009 0 : runtime = sched_rt_runtime(rt_rq);
1010 0 : if (runtime == RUNTIME_INF)
1011 : return 0;
1012 :
1013 0 : if (rt_rq->rt_time > runtime) {
1014 0 : struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
1015 :
1016 : /*
1017 : * Don't actually throttle groups that have no runtime assigned
1018 : * but accrue some time due to boosting.
1019 : */
1020 0 : if (likely(rt_b->rt_runtime)) {
1021 0 : rt_rq->rt_throttled = 1;
1022 0 : printk_deferred_once("sched: RT throttling activated\n");
1023 : } else {
1024 : /*
1025 : * In case we did anyway, make it go away,
1026 : * replenishment is a joke, since it will replenish us
1027 : * with exactly 0 ns.
1028 : */
1029 0 : rt_rq->rt_time = 0;
1030 : }
1031 :
1032 0 : if (rt_rq_throttled(rt_rq)) {
1033 0 : sched_rt_rq_dequeue(rt_rq);
1034 0 : return 1;
1035 : }
1036 : }
1037 :
1038 : return 0;
1039 : }
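/*
 * Worked example (using the default 950000/1000000 us settings above, a
 * single CPU-bound SCHED_FIFO task, and RT_RUNTIME_SHARE off): rt_time
 * reaches rt_runtime roughly 950 ms into the period, the check above marks
 * the rt_rq throttled and dequeues it, non-RT tasks run for the remaining
 * ~50 ms, and do_sched_rt_period_timer() clears the throttle at the next
 * period boundary.
 */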
1040 :
1041 : /*
1042 : * Update the current task's runtime statistics. Skip current tasks that
1043 : * are not in our scheduling class.
1044 : */
1045 0 : static void update_curr_rt(struct rq *rq)
1046 : {
1047 0 : struct task_struct *curr = rq->curr;
1048 0 : struct sched_rt_entity *rt_se = &curr->rt;
1049 : u64 delta_exec;
1050 : u64 now;
1051 :
1052 0 : if (curr->sched_class != &rt_sched_class)
1053 : return;
1054 :
1055 0 : now = rq_clock_task(rq);
1056 0 : delta_exec = now - curr->se.exec_start;
1057 0 : if (unlikely((s64)delta_exec <= 0))
1058 : return;
1059 :
1060 : schedstat_set(curr->stats.exec_max,
1061 : max(curr->stats.exec_max, delta_exec));
1062 :
1063 0 : trace_sched_stat_runtime(curr, delta_exec, 0);
1064 :
1065 0 : update_current_exec_runtime(curr, now, delta_exec);
1066 :
1067 0 : if (!rt_bandwidth_enabled())
1068 : return;
1069 :
1070 0 : for_each_sched_rt_entity(rt_se) {
1071 0 : struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1072 : int exceeded;
1073 :
1074 0 : if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1075 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
1076 0 : rt_rq->rt_time += delta_exec;
1077 0 : exceeded = sched_rt_runtime_exceeded(rt_rq);
1078 0 : if (exceeded)
1079 0 : resched_curr(rq);
1080 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
1081 0 : if (exceeded)
1082 0 : do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
1083 : }
1084 : }
1085 : }
1086 :
1087 : static void
1088 0 : dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
1089 : {
1090 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
1091 :
1092 0 : BUG_ON(&rq->rt != rt_rq);
1093 :
1094 0 : if (!rt_rq->rt_queued)
1095 : return;
1096 :
1097 0 : BUG_ON(!rq->nr_running);
1098 :
1099 0 : sub_nr_running(rq, count);
1100 0 : rt_rq->rt_queued = 0;
1101 :
1102 : }
1103 :
1104 : static void
1105 0 : enqueue_top_rt_rq(struct rt_rq *rt_rq)
1106 : {
1107 0 : struct rq *rq = rq_of_rt_rq(rt_rq);
1108 :
1109 0 : BUG_ON(&rq->rt != rt_rq);
1110 :
1111 0 : if (rt_rq->rt_queued)
1112 : return;
1113 :
1114 0 : if (rt_rq_throttled(rt_rq))
1115 : return;
1116 :
1117 0 : if (rt_rq->rt_nr_running) {
1118 0 : add_nr_running(rq, rt_rq->rt_nr_running);
1119 0 : rt_rq->rt_queued = 1;
1120 : }
1121 :
1122 : /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1123 : cpufreq_update_util(rq, 0);
1124 : }
1125 :
1126 : #if defined CONFIG_SMP
1127 :
1128 : static void
1129 : inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1130 : {
1131 : struct rq *rq = rq_of_rt_rq(rt_rq);
1132 :
1133 : #ifdef CONFIG_RT_GROUP_SCHED
1134 : /*
1135 : * Change rq's cpupri only if rt_rq is the top queue.
1136 : */
1137 : if (&rq->rt != rt_rq)
1138 : return;
1139 : #endif
1140 : if (rq->online && prio < prev_prio)
1141 : cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1142 : }
1143 :
1144 : static void
1145 : dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1146 : {
1147 : struct rq *rq = rq_of_rt_rq(rt_rq);
1148 :
1149 : #ifdef CONFIG_RT_GROUP_SCHED
1150 : /*
1151 : * Change rq's cpupri only if rt_rq is the top queue.
1152 : */
1153 : if (&rq->rt != rt_rq)
1154 : return;
1155 : #endif
1156 : if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1157 : cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1158 : }
1159 :
1160 : #else /* CONFIG_SMP */
1161 :
1162 : static inline
1163 : void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1164 : static inline
1165 : void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1166 :
1167 : #endif /* CONFIG_SMP */
1168 :
1169 : #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1170 : static void
1171 : inc_rt_prio(struct rt_rq *rt_rq, int prio)
1172 : {
1173 : int prev_prio = rt_rq->highest_prio.curr;
1174 :
1175 : if (prio < prev_prio)
1176 : rt_rq->highest_prio.curr = prio;
1177 :
1178 : inc_rt_prio_smp(rt_rq, prio, prev_prio);
1179 : }
1180 :
1181 : static void
1182 : dec_rt_prio(struct rt_rq *rt_rq, int prio)
1183 : {
1184 : int prev_prio = rt_rq->highest_prio.curr;
1185 :
1186 : if (rt_rq->rt_nr_running) {
1187 :
1188 : WARN_ON(prio < prev_prio);
1189 :
1190 : /*
1191 : * This may have been our highest task, and therefore
1192 : * we may have some recomputation to do
1193 : */
1194 : if (prio == prev_prio) {
1195 : struct rt_prio_array *array = &rt_rq->active;
1196 :
1197 : rt_rq->highest_prio.curr =
1198 : sched_find_first_bit(array->bitmap);
1199 : }
1200 :
1201 : } else {
1202 : rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
1203 : }
1204 :
1205 : dec_rt_prio_smp(rt_rq, prio, prev_prio);
1206 : }
1207 :
1208 : #else
1209 :
1210 : static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1211 : static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1212 :
1213 : #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1214 :
1215 : #ifdef CONFIG_RT_GROUP_SCHED
1216 :
1217 : static void
1218 : inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1219 : {
1220 : if (rt_se_boosted(rt_se))
1221 : rt_rq->rt_nr_boosted++;
1222 :
1223 : if (rt_rq->tg)
1224 : start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1225 : }
1226 :
1227 : static void
1228 : dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1229 : {
1230 : if (rt_se_boosted(rt_se))
1231 : rt_rq->rt_nr_boosted--;
1232 :
1233 : WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1234 : }
1235 :
1236 : #else /* CONFIG_RT_GROUP_SCHED */
1237 :
1238 : static void
1239 : inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1240 : {
1241 0 : start_rt_bandwidth(&def_rt_bandwidth);
1242 : }
1243 :
1244 : static inline
1245 : void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1246 :
1247 : #endif /* CONFIG_RT_GROUP_SCHED */
1248 :
1249 : static inline
1250 : unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1251 : {
1252 0 : struct rt_rq *group_rq = group_rt_rq(rt_se);
1253 :
1254 : if (group_rq)
1255 : return group_rq->rt_nr_running;
1256 : else
1257 : return 1;
1258 : }
1259 :
1260 : static inline
1261 : unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1262 : {
1263 0 : struct rt_rq *group_rq = group_rt_rq(rt_se);
1264 : struct task_struct *tsk;
1265 :
1266 : if (group_rq)
1267 : return group_rq->rr_nr_running;
1268 :
1269 0 : tsk = rt_task_of(rt_se);
1270 :
1271 0 : return (tsk->policy == SCHED_RR) ? 1 : 0;
1272 : }
1273 :
1274 : static inline
1275 0 : void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1276 : {
1277 0 : int prio = rt_se_prio(rt_se);
1278 :
1279 0 : WARN_ON(!rt_prio(prio));
1280 0 : rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1281 0 : rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1282 :
1283 0 : inc_rt_prio(rt_rq, prio);
1284 0 : inc_rt_migration(rt_se, rt_rq);
1285 0 : inc_rt_group(rt_se, rt_rq);
1286 0 : }
1287 :
1288 : static inline
1289 0 : void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1290 : {
1291 0 : WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1292 0 : WARN_ON(!rt_rq->rt_nr_running);
1293 0 : rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1294 0 : rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1295 :
1296 0 : dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1297 0 : dec_rt_migration(rt_se, rt_rq);
1298 0 : dec_rt_group(rt_se, rt_rq);
1299 0 : }
1300 :
1301 : /*
1302 : * Change rt_se->run_list location unless SAVE && !MOVE
1303 : *
1304 : * assumes ENQUEUE/DEQUEUE flags match
1305 : */
1306 : static inline bool move_entity(unsigned int flags)
1307 : {
1308 0 : if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1309 : return false;
1310 :
1311 : return true;
1312 : }
1313 :
1314 : static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1315 : {
1316 0 : list_del_init(&rt_se->run_list);
1317 :
1318 0 : if (list_empty(array->queue + rt_se_prio(rt_se)))
1319 0 : __clear_bit(rt_se_prio(rt_se), array->bitmap);
1320 :
1321 0 : rt_se->on_list = 0;
1322 : }
1323 :
1324 : static inline struct sched_statistics *
1325 : __schedstats_from_rt_se(struct sched_rt_entity *rt_se)
1326 : {
1327 : #ifdef CONFIG_RT_GROUP_SCHED
1328 : /* schedstats is not supported for rt group. */
1329 : if (!rt_entity_is_task(rt_se))
1330 : return NULL;
1331 : #endif
1332 :
1333 : return &rt_task_of(rt_se)->stats;
1334 : }
1335 :
1336 : static inline void
1337 : update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1338 : {
1339 : struct sched_statistics *stats;
1340 0 : struct task_struct *p = NULL;
1341 :
1342 : if (!schedstat_enabled())
1343 : return;
1344 :
1345 : if (rt_entity_is_task(rt_se))
1346 : p = rt_task_of(rt_se);
1347 :
1348 : stats = __schedstats_from_rt_se(rt_se);
1349 : if (!stats)
1350 : return;
1351 :
1352 : __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
1353 : }
1354 :
1355 : static inline void
1356 : update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1357 : {
1358 : struct sched_statistics *stats;
1359 : struct task_struct *p = NULL;
1360 :
1361 : if (!schedstat_enabled())
1362 : return;
1363 :
1364 : if (rt_entity_is_task(rt_se))
1365 : p = rt_task_of(rt_se);
1366 :
1367 : stats = __schedstats_from_rt_se(rt_se);
1368 : if (!stats)
1369 : return;
1370 :
1371 : __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
1372 : }
1373 :
1374 : static inline void
1375 : update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
1376 : int flags)
1377 : {
1378 : if (!schedstat_enabled())
1379 : return;
1380 :
1381 : if (flags & ENQUEUE_WAKEUP)
1382 : update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
1383 : }
1384 :
1385 : static inline void
1386 : update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1387 : {
1388 : struct sched_statistics *stats;
1389 : struct task_struct *p = NULL;
1390 :
1391 : if (!schedstat_enabled())
1392 : return;
1393 :
1394 : if (rt_entity_is_task(rt_se))
1395 : p = rt_task_of(rt_se);
1396 :
1397 : stats = __schedstats_from_rt_se(rt_se);
1398 : if (!stats)
1399 : return;
1400 :
1401 : __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
1402 : }
1403 :
1404 : static inline void
1405 : update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
1406 : int flags)
1407 : {
1408 0 : struct task_struct *p = NULL;
1409 :
1410 : if (!schedstat_enabled())
1411 : return;
1412 :
1413 : if (rt_entity_is_task(rt_se))
1414 : p = rt_task_of(rt_se);
1415 :
1416 : if ((flags & DEQUEUE_SLEEP) && p) {
1417 : unsigned int state;
1418 :
1419 : state = READ_ONCE(p->__state);
1420 : if (state & TASK_INTERRUPTIBLE)
1421 : __schedstat_set(p->stats.sleep_start,
1422 : rq_clock(rq_of_rt_rq(rt_rq)));
1423 :
1424 : if (state & TASK_UNINTERRUPTIBLE)
1425 : __schedstat_set(p->stats.block_start,
1426 : rq_clock(rq_of_rt_rq(rt_rq)));
1427 : }
1428 : }
1429 :
1430 0 : static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1431 : {
1432 0 : struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1433 0 : struct rt_prio_array *array = &rt_rq->active;
1434 0 : struct rt_rq *group_rq = group_rt_rq(rt_se);
1435 0 : struct list_head *queue = array->queue + rt_se_prio(rt_se);
1436 :
1437 : /*
1438 : * Don't enqueue the group if it's throttled, or when empty.
1439 : * The latter is a consequence of the former when a child group
1440 : * gets throttled and the current group doesn't have any other
1441 : * active members.
1442 : */
1443 : if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1444 : if (rt_se->on_list)
1445 : __delist_rt_entity(rt_se, array);
1446 : return;
1447 : }
1448 :
1449 0 : if (move_entity(flags)) {
1450 0 : WARN_ON_ONCE(rt_se->on_list);
1451 0 : if (flags & ENQUEUE_HEAD)
1452 0 : list_add(&rt_se->run_list, queue);
1453 : else
1454 0 : list_add_tail(&rt_se->run_list, queue);
1455 :
1456 0 : __set_bit(rt_se_prio(rt_se), array->bitmap);
1457 0 : rt_se->on_list = 1;
1458 : }
1459 0 : rt_se->on_rq = 1;
1460 :
1461 0 : inc_rt_tasks(rt_se, rt_rq);
1462 : }
1463 :
1464 0 : static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1465 : {
1466 0 : struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1467 0 : struct rt_prio_array *array = &rt_rq->active;
1468 :
1469 0 : if (move_entity(flags)) {
1470 0 : WARN_ON_ONCE(!rt_se->on_list);
1471 : __delist_rt_entity(rt_se, array);
1472 : }
1473 0 : rt_se->on_rq = 0;
1474 :
1475 0 : dec_rt_tasks(rt_se, rt_rq);
1476 0 : }
1477 :
1478 : /*
1479 : * Because the prio of an upper entry depends on the lower
1480 : * entries, we must remove entries top-down.
1481 : */
1482 0 : static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1483 : {
1484 0 : struct sched_rt_entity *back = NULL;
1485 : unsigned int rt_nr_running;
1486 :
1487 0 : for_each_sched_rt_entity(rt_se) {
1488 0 : rt_se->back = back;
1489 0 : back = rt_se;
1490 : }
1491 :
1492 0 : rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
1493 :
1494 0 : for (rt_se = back; rt_se; rt_se = rt_se->back) {
1495 0 : if (on_rt_rq(rt_se))
1496 0 : __dequeue_rt_entity(rt_se, flags);
1497 : }
1498 :
1499 0 : dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
1500 0 : }
1501 :
1502 0 : static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1503 : {
1504 0 : struct rq *rq = rq_of_rt_se(rt_se);
1505 :
1506 0 : update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
1507 :
1508 0 : dequeue_rt_stack(rt_se, flags);
1509 0 : for_each_sched_rt_entity(rt_se)
1510 0 : __enqueue_rt_entity(rt_se, flags);
1511 0 : enqueue_top_rt_rq(&rq->rt);
1512 0 : }
1513 :
1514 : static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1515 : {
1516 0 : struct rq *rq = rq_of_rt_se(rt_se);
1517 :
1518 0 : update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
1519 :
1520 0 : dequeue_rt_stack(rt_se, flags);
1521 :
1522 0 : for_each_sched_rt_entity(rt_se) {
1523 : struct rt_rq *rt_rq = group_rt_rq(rt_se);
1524 :
1525 : if (rt_rq && rt_rq->rt_nr_running)
1526 : __enqueue_rt_entity(rt_se, flags);
1527 : }
1528 0 : enqueue_top_rt_rq(&rq->rt);
1529 : }
1530 :
1531 : /*
1532 : * Adding/removing a task to/from a priority array:
1533 : */
1534 : static void
1535 0 : enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1536 : {
1537 0 : struct sched_rt_entity *rt_se = &p->rt;
1538 :
1539 0 : if (flags & ENQUEUE_WAKEUP)
1540 0 : rt_se->timeout = 0;
1541 :
1542 : check_schedstat_required();
1543 0 : update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
1544 :
1545 0 : enqueue_rt_entity(rt_se, flags);
1546 :
1547 0 : if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1548 : enqueue_pushable_task(rq, p);
1549 0 : }
1550 :
1551 0 : static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1552 : {
1553 0 : struct sched_rt_entity *rt_se = &p->rt;
1554 :
1555 0 : update_curr_rt(rq);
1556 0 : dequeue_rt_entity(rt_se, flags);
1557 :
1558 0 : dequeue_pushable_task(rq, p);
1559 0 : }
1560 :
1561 : /*
1562 : * Put task to the head or the end of the run list without the overhead of
1563 : * dequeue followed by enqueue.
1564 : */
1565 : static void
1566 0 : requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1567 : {
1568 0 : if (on_rt_rq(rt_se)) {
1569 0 : struct rt_prio_array *array = &rt_rq->active;
1570 0 : struct list_head *queue = array->queue + rt_se_prio(rt_se);
1571 :
1572 0 : if (head)
1573 0 : list_move(&rt_se->run_list, queue);
1574 : else
1575 0 : list_move_tail(&rt_se->run_list, queue);
1576 : }
1577 0 : }
1578 :
1579 : static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1580 : {
1581 0 : struct sched_rt_entity *rt_se = &p->rt;
1582 : struct rt_rq *rt_rq;
1583 :
1584 0 : for_each_sched_rt_entity(rt_se) {
1585 0 : rt_rq = rt_rq_of_se(rt_se);
1586 0 : requeue_rt_entity(rt_rq, rt_se, head);
1587 : }
1588 : }
1589 :
1590 0 : static void yield_task_rt(struct rq *rq)
1591 : {
1592 0 : requeue_task_rt(rq, rq->curr, 0);
1593 0 : }
1594 :
1595 : #ifdef CONFIG_SMP
1596 : static int find_lowest_rq(struct task_struct *task);
1597 :
1598 : static int
1599 : select_task_rq_rt(struct task_struct *p, int cpu, int flags)
1600 : {
1601 : struct task_struct *curr;
1602 : struct rq *rq;
1603 : bool test;
1604 :
1605 : /* For anything but wake ups, just return the task_cpu */
1606 : if (!(flags & (WF_TTWU | WF_FORK)))
1607 : goto out;
1608 :
1609 : rq = cpu_rq(cpu);
1610 :
1611 : rcu_read_lock();
1612 : curr = READ_ONCE(rq->curr); /* unlocked access */
1613 :
1614 : /*
1615 : * If the current task on @p's runqueue is an RT task, then
1616 : * try to see if we can wake this RT task up on another
1617 : * runqueue. Otherwise simply start this RT task
1618 : * on its current runqueue.
1619 : *
1620 : * We want to avoid overloading runqueues. If the woken
1621 : * task is a higher priority, then it will stay on this CPU
1622 : * and the lower prio task should be moved to another CPU.
1623 : * Even though this will probably make the lower prio task
1624 : * lose its cache, we do not want to bounce a higher priority task
1625 : * around just because it gave up its CPU, perhaps for a
1626 : * lock?
1627 : *
1628 : * For equal prio tasks, we just let the scheduler sort it out.
1629 : *
1630 : * Otherwise, just let it ride on the affined RQ and the
1631 : * post-schedule router will push the preempted task away
1632 : *
1633 : * This test is optimistic, if we get it wrong the load-balancer
1634 : * will have to sort it out.
1635 : *
1636 : * We take into account the capacity of the CPU to ensure it fits the
1637 : * requirement of the task - which is only important on heterogeneous
1638 : * systems like big.LITTLE.
1639 : */
1640 : test = curr &&
1641 : unlikely(rt_task(curr)) &&
1642 : (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
1643 :
1644 : if (test || !rt_task_fits_capacity(p, cpu)) {
1645 : int target = find_lowest_rq(p);
1646 :
1647 : /*
1648 : * Bail out if we were forcing a migration to find a better
1649 : * fitting CPU but our search failed.
1650 : */
1651 : if (!test && target != -1 && !rt_task_fits_capacity(p, target))
1652 : goto out_unlock;
1653 :
1654 : /*
1655 : * Don't bother moving it if the destination CPU is
1656 : * not running a lower priority task.
1657 : */
1658 : if (target != -1 &&
1659 : p->prio < cpu_rq(target)->rt.highest_prio.curr)
1660 : cpu = target;
1661 : }
1662 :
1663 : out_unlock:
1664 : rcu_read_unlock();
1665 :
1666 : out:
1667 : return cpu;
1668 : }
1669 :
1670 : static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1671 : {
1672 : /*
1673 : * Current can't be migrated, useless to reschedule,
1674 : * let's hope p can move out.
1675 : */
1676 : if (rq->curr->nr_cpus_allowed == 1 ||
1677 : !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1678 : return;
1679 :
1680 : /*
1681 : * p is migratable, so let's not schedule it and
1682 : * see if it is pushed or pulled somewhere else.
1683 : */
1684 : if (p->nr_cpus_allowed != 1 &&
1685 : cpupri_find(&rq->rd->cpupri, p, NULL))
1686 : return;
1687 :
1688 : /*
1689 : * There appear to be other CPUs that can accept
1690 : * the current task but none can run 'p', so let's reschedule
1691 : * to try and push the current task away:
1692 : */
1693 : requeue_task_rt(rq, p, 1);
1694 : resched_curr(rq);
1695 : }
1696 :
1697 : static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1698 : {
1699 : if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1700 : /*
1701 : * This is OK, because current is on_cpu, which avoids it being
1702 : * picked for load-balance and preemption/IRQs are still
1703 : * disabled avoiding further scheduler activity on it and we've
1704 : * not yet started the picking loop.
1705 : */
1706 : rq_unpin_lock(rq, rf);
1707 : pull_rt_task(rq);
1708 : rq_repin_lock(rq, rf);
1709 : }
1710 :
1711 : return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
1712 : }
1713 : #endif /* CONFIG_SMP */
1714 :
1715 : /*
1716 : * Preempt the current task with a newly woken task if needed:
1717 : */
1718 0 : static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1719 : {
1720 0 : if (p->prio < rq->curr->prio) {
1721 0 : resched_curr(rq);
1722 0 : return;
1723 : }
1724 :
1725 : #ifdef CONFIG_SMP
1726 : /*
1727 : * If:
1728 : *
1729 : * - the newly woken task is of equal priority to the current task
1730 : * - the newly woken task is non-migratable while current is migratable
1731 : * - current will be preempted on the next reschedule
1732 : *
1733 : * we should check to see if current can readily move to a different
1734 : * cpu. If so, we will reschedule to allow the push logic to try
1735 : * to move current somewhere else, making room for our non-migratable
1736 : * task.
1737 : */
1738 : if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1739 : check_preempt_equal_prio(rq, p);
1740 : #endif
1741 : }
1742 :
1743 0 : static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
1744 : {
1745 0 : struct sched_rt_entity *rt_se = &p->rt;
1746 0 : struct rt_rq *rt_rq = &rq->rt;
1747 :
1748 0 : p->se.exec_start = rq_clock_task(rq);
1749 0 : if (on_rt_rq(&p->rt))
1750 : update_stats_wait_end_rt(rt_rq, rt_se);
1751 :
1752 : /* The running task is never eligible for pushing */
1753 0 : dequeue_pushable_task(rq, p);
1754 :
1755 : if (!first)
1756 : return;
1757 :
1758 : /*
1759 : * If prev task was rt, put_prev_task() has already updated the
1760 : * utilization. We only care about the case where we start to schedule an
1761 : * rt task
1762 : */
1763 : if (rq->curr->sched_class != &rt_sched_class)
1764 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1765 :
1766 : rt_queue_push_tasks(rq);
1767 : }
1768 :
1769 0 : static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
1770 : {
1771 0 : struct rt_prio_array *array = &rt_rq->active;
1772 0 : struct sched_rt_entity *next = NULL;
1773 : struct list_head *queue;
1774 : int idx;
1775 :
1776 0 : idx = sched_find_first_bit(array->bitmap);
1777 0 : BUG_ON(idx >= MAX_RT_PRIO);
1778 :
1779 0 : queue = array->queue + idx;
1780 0 : if (SCHED_WARN_ON(list_empty(queue)))
1781 : return NULL;
1782 0 : next = list_entry(queue->next, struct sched_rt_entity, run_list);
1783 :
1784 : return next;
1785 : }
1786 :
1787 : static struct task_struct *_pick_next_task_rt(struct rq *rq)
1788 : {
1789 : struct sched_rt_entity *rt_se;
1790 0 : struct rt_rq *rt_rq = &rq->rt;
1791 :
1792 : do {
1793 0 : rt_se = pick_next_rt_entity(rt_rq);
1794 0 : if (unlikely(!rt_se))
1795 : return NULL;
1796 0 : rt_rq = group_rt_rq(rt_se);
1797 : } while (rt_rq);
1798 :
1799 0 : return rt_task_of(rt_se);
1800 : }
1801 :
1802 : static struct task_struct *pick_task_rt(struct rq *rq)
1803 : {
1804 : struct task_struct *p;
1805 :
1806 0 : if (!sched_rt_runnable(rq))
1807 : return NULL;
1808 :
1809 : p = _pick_next_task_rt(rq);
1810 :
1811 : return p;
1812 : }
1813 :
1814 0 : static struct task_struct *pick_next_task_rt(struct rq *rq)
1815 : {
1816 0 : struct task_struct *p = pick_task_rt(rq);
1817 :
1818 0 : if (p)
1819 : set_next_task_rt(rq, p, true);
1820 :
1821 0 : return p;
1822 : }
1823 :
1824 0 : static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1825 : {
1826 0 : struct sched_rt_entity *rt_se = &p->rt;
1827 0 : struct rt_rq *rt_rq = &rq->rt;
1828 :
1829 0 : if (on_rt_rq(&p->rt))
1830 : update_stats_wait_start_rt(rt_rq, rt_se);
1831 :
1832 0 : update_curr_rt(rq);
1833 :
1834 0 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1835 :
1836 : /*
1837 : * The previous task needs to be made eligible for pushing
1838 : * if it is still active
1839 : */
1840 0 : if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1841 : enqueue_pushable_task(rq, p);
1842 0 : }
1843 :
1844 : #ifdef CONFIG_SMP
1845 :
1846 : /* Only try algorithms three times */
1847 : #define RT_MAX_TRIES 3
1848 :
1849 : static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1850 : {
1851 : if (!task_on_cpu(rq, p) &&
1852 : cpumask_test_cpu(cpu, &p->cpus_mask))
1853 : return 1;
1854 :
1855 : return 0;
1856 : }
1857 :
1858 : /*
1859 : * Return the highest-priority pushable task on this rq that is suitable to
1860 : * be executed on the given CPU, or NULL otherwise
1861 : */
1862 : static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1863 : {
1864 : struct plist_head *head = &rq->rt.pushable_tasks;
1865 : struct task_struct *p;
1866 :
1867 : if (!has_pushable_tasks(rq))
1868 : return NULL;
1869 :
1870 : plist_for_each_entry(p, head, pushable_tasks) {
1871 : if (pick_rt_task(rq, p, cpu))
1872 : return p;
1873 : }
1874 :
1875 : return NULL;
1876 : }
1877 :
1878 : static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1879 :
1880 : static int find_lowest_rq(struct task_struct *task)
1881 : {
1882 : struct sched_domain *sd;
1883 : struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1884 : int this_cpu = smp_processor_id();
1885 : int cpu = task_cpu(task);
1886 : int ret;
1887 :
1888 : /* Make sure the mask is initialized first */
1889 : if (unlikely(!lowest_mask))
1890 : return -1;
1891 :
1892 : if (task->nr_cpus_allowed == 1)
1893 : return -1; /* No other targets possible */
1894 :
1895 : /*
1896 : * If we're on asym system ensure we consider the different capacities
1897 : * of the CPUs when searching for the lowest_mask.
1898 : */
1899 : if (sched_asym_cpucap_active()) {
1900 :
1901 : ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
1902 : task, lowest_mask,
1903 : rt_task_fits_capacity);
1904 : } else {
1905 :
1906 : ret = cpupri_find(&task_rq(task)->rd->cpupri,
1907 : task, lowest_mask);
1908 : }
1909 :
1910 : if (!ret)
1911 : return -1; /* No targets found */
1912 :
1913 : /*
1914 : * At this point we have built a mask of CPUs representing the
1915 : * lowest priority tasks in the system. Now we want to elect
1916 : * the best one based on our affinity and topology.
1917 : *
1918 : * We prioritize the last CPU that the task executed on since
1919 : * it is most likely cache-hot in that location.
1920 : */
1921 : if (cpumask_test_cpu(cpu, lowest_mask))
1922 : return cpu;
1923 :
1924 : /*
1925 : * Otherwise, we consult the sched_domains span maps to figure
1926 : * out which CPU is logically closest to our hot cache data.
1927 : */
1928 : if (!cpumask_test_cpu(this_cpu, lowest_mask))
1929 : this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1930 :
1931 : rcu_read_lock();
1932 : for_each_domain(cpu, sd) {
1933 : if (sd->flags & SD_WAKE_AFFINE) {
1934 : int best_cpu;
1935 :
1936 : /*
1937 : * "this_cpu" is cheaper to preempt than a
1938 : * remote processor.
1939 : */
1940 : if (this_cpu != -1 &&
1941 : cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1942 : rcu_read_unlock();
1943 : return this_cpu;
1944 : }
1945 :
1946 : best_cpu = cpumask_any_and_distribute(lowest_mask,
1947 : sched_domain_span(sd));
1948 : if (best_cpu < nr_cpu_ids) {
1949 : rcu_read_unlock();
1950 : return best_cpu;
1951 : }
1952 : }
1953 : }
1954 : rcu_read_unlock();
1955 :
1956 : /*
1957 : * And finally, if there were no matches within the domains
1958 : * just give the caller *something* to work with from the compatible
1959 : * locations.
1960 : */
1961 : if (this_cpu != -1)
1962 : return this_cpu;
1963 :
1964 : cpu = cpumask_any_distribute(lowest_mask);
1965 : if (cpu < nr_cpu_ids)
1966 : return cpu;
1967 :
1968 : return -1;
1969 : }
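
/*
 * Illustrative userspace sketch (not kernel code): find_lowest_rq() only
 * considers CPUs in the task's cpus_mask, so the affinity configured from
 * userspace bounds where the push logic may place an RT task.  The priority
 * (10) and the CPUs (1 and 2) below are arbitrary example values.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };
	cpu_set_t set;

	/* Restrict this task to CPUs 1 and 2; pushes can only target these. */
	CPU_ZERO(&set);
	CPU_SET(1, &set);
	CPU_SET(2, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");

	/* Become SCHED_FIFO; needs CAP_SYS_NICE or an RLIMIT_RTPRIO allowance. */
	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("sched_setscheduler");

	return 0;
}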
1970 :
1971 : /* Will lock the rq it finds */
1972 : static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1973 : {
1974 : struct rq *lowest_rq = NULL;
1975 : int tries;
1976 : int cpu;
1977 :
1978 : for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1979 : cpu = find_lowest_rq(task);
1980 :
1981 : if ((cpu == -1) || (cpu == rq->cpu))
1982 : break;
1983 :
1984 : lowest_rq = cpu_rq(cpu);
1985 :
1986 : if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1987 : /*
1988 : * Target rq has tasks of equal or higher priority,
1989 : * retrying does not release any lock and is unlikely
1990 : * to yield a different result.
1991 : */
1992 : lowest_rq = NULL;
1993 : break;
1994 : }
1995 :
1996 : /* if the prio of this runqueue changed, try again */
1997 : if (double_lock_balance(rq, lowest_rq)) {
1998 : /*
1999 : * We had to unlock the run queue. In
2000 : * the meantime, the task could have
2001 : * migrated already or had its affinity changed.
2002 : * Also make sure that it wasn't scheduled on its rq.
2003 : * It is possible the task was scheduled, set
2004 : * "migrate_disabled" and then got preempted, so we must
2005 : * check the task migration disable flag here too.
2006 : */
2007 : if (unlikely(task_rq(task) != rq ||
2008 : !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
2009 : task_on_cpu(rq, task) ||
2010 : !rt_task(task) ||
2011 : is_migration_disabled(task) ||
2012 : !task_on_rq_queued(task))) {
2013 :
2014 : double_unlock_balance(rq, lowest_rq);
2015 : lowest_rq = NULL;
2016 : break;
2017 : }
2018 : }
2019 :
2020 : /* If this rq is still suitable use it. */
2021 : if (lowest_rq->rt.highest_prio.curr > task->prio)
2022 : break;
2023 :
2024 : /* try again */
2025 : double_unlock_balance(rq, lowest_rq);
2026 : lowest_rq = NULL;
2027 : }
2028 :
2029 : return lowest_rq;
2030 : }
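
/*
 * Hedged userspace sketch of the pattern find_lock_lowest_rq() relies on,
 * not the kernel's double_lock_balance() itself: to take a second lock,
 * possibly drop the one already held, acquire both in a global order (here,
 * by address), and then re-validate whatever may have changed while no lock
 * was held.  Plain pthread code with made-up types for illustration.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_rq {
	pthread_mutex_t lock;
	int highest_prio;	/* lower value == higher priority, as above */
};

/* Caller holds src->lock; returns with src->lock and dst->lock both held. */
void toy_double_lock(struct toy_rq *src, struct toy_rq *dst)
{
	if ((uintptr_t)dst < (uintptr_t)src) {
		/* Wrong order: drop src, then take both lowest-address first. */
		pthread_mutex_unlock(&src->lock);
		pthread_mutex_lock(&dst->lock);
		pthread_mutex_lock(&src->lock);
	} else {
		pthread_mutex_lock(&dst->lock);
	}
}

/* Must be re-checked under both locks, since src was briefly unlocked. */
bool toy_still_suitable(struct toy_rq *dst, int task_prio)
{
	return dst->highest_prio > task_prio;
}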
2031 :
2032 : static struct task_struct *pick_next_pushable_task(struct rq *rq)
2033 : {
2034 : struct task_struct *p;
2035 :
2036 : if (!has_pushable_tasks(rq))
2037 : return NULL;
2038 :
2039 : p = plist_first_entry(&rq->rt.pushable_tasks,
2040 : struct task_struct, pushable_tasks);
2041 :
2042 : BUG_ON(rq->cpu != task_cpu(p));
2043 : BUG_ON(task_current(rq, p));
2044 : BUG_ON(p->nr_cpus_allowed <= 1);
2045 :
2046 : BUG_ON(!task_on_rq_queued(p));
2047 : BUG_ON(!rt_task(p));
2048 :
2049 : return p;
2050 : }
2051 :
2052 : /*
2053 : * If the current CPU has more than one RT task, see if the non-running
2054 : * task can migrate over to a CPU that is running a task
2055 : * of lesser priority.
2056 : */
2057 : static int push_rt_task(struct rq *rq, bool pull)
2058 : {
2059 : struct task_struct *next_task;
2060 : struct rq *lowest_rq;
2061 : int ret = 0;
2062 :
2063 : if (!rq->rt.overloaded)
2064 : return 0;
2065 :
2066 : next_task = pick_next_pushable_task(rq);
2067 : if (!next_task)
2068 : return 0;
2069 :
2070 : retry:
2071 : /*
2072 : * It's possible that next_task slipped in with a
2073 : * higher priority than current. If that's the case,
2074 : * just reschedule current.
2075 : */
2076 : if (unlikely(next_task->prio < rq->curr->prio)) {
2077 : resched_curr(rq);
2078 : return 0;
2079 : }
2080 :
2081 : if (is_migration_disabled(next_task)) {
2082 : struct task_struct *push_task = NULL;
2083 : int cpu;
2084 :
2085 : if (!pull || rq->push_busy)
2086 : return 0;
2087 :
2088 : /*
2089 : * Invoking find_lowest_rq() on anything but an RT task doesn't
2090 : * make sense. Per the above priority check, curr has to
2091 : * be of higher priority than next_task, so no need to
2092 : * reschedule when bailing out.
2093 : *
2094 : * Note that the stoppers are masqueraded as SCHED_FIFO
2095 : * (cf. sched_set_stop_task()), so we can't rely on rt_task().
2096 : */
2097 : if (rq->curr->sched_class != &rt_sched_class)
2098 : return 0;
2099 :
2100 : cpu = find_lowest_rq(rq->curr);
2101 : if (cpu == -1 || cpu == rq->cpu)
2102 : return 0;
2103 :
2104 : /*
2105 : * We found a CPU with lower priority than @next_task, so @next_task
2106 : * should be able to run somewhere. However, we cannot migrate it to
2107 : * that other CPU while it is migration-disabled; instead, attempt to
2108 : * push away the task currently running on this CPU.
2109 : */
2110 : push_task = get_push_task(rq);
2111 : if (push_task) {
2112 : raw_spin_rq_unlock(rq);
2113 : stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2114 : push_task, &rq->push_work);
2115 : raw_spin_rq_lock(rq);
2116 : }
2117 :
2118 : return 0;
2119 : }
2120 :
2121 : if (WARN_ON(next_task == rq->curr))
2122 : return 0;
2123 :
2124 : /* We might release rq lock */
2125 : get_task_struct(next_task);
2126 :
2127 : /* find_lock_lowest_rq locks the rq if found */
2128 : lowest_rq = find_lock_lowest_rq(next_task, rq);
2129 : if (!lowest_rq) {
2130 : struct task_struct *task;
2131 : /*
2132 : * find_lock_lowest_rq releases rq->lock
2133 : * so it is possible that next_task has migrated.
2134 : *
2135 : * We need to make sure that the task is still on the same
2136 : * run-queue and is also still the next task eligible for
2137 : * pushing.
2138 : */
2139 : task = pick_next_pushable_task(rq);
2140 : if (task == next_task) {
2141 : /*
2142 : * The task hasn't migrated, and is still the next
2143 : * eligible task, but we failed to find a run-queue
2144 : * to push it to. Do not retry in this case, since
2145 : * other CPUs will pull from us when ready.
2146 : */
2147 : goto out;
2148 : }
2149 :
2150 : if (!task)
2151 : /* No more tasks, just exit */
2152 : goto out;
2153 :
2154 : /*
2155 : * Something has shifted, try again.
2156 : */
2157 : put_task_struct(next_task);
2158 : next_task = task;
2159 : goto retry;
2160 : }
2161 :
2162 : deactivate_task(rq, next_task, 0);
2163 : set_task_cpu(next_task, lowest_rq->cpu);
2164 : activate_task(lowest_rq, next_task, 0);
2165 : resched_curr(lowest_rq);
2166 : ret = 1;
2167 :
2168 : double_unlock_balance(rq, lowest_rq);
2169 : out:
2170 : put_task_struct(next_task);
2171 :
2172 : return ret;
2173 : }
2174 :
2175 : static void push_rt_tasks(struct rq *rq)
2176 : {
2177 : /* push_rt_task() will return true if it moved an RT task */
2178 : while (push_rt_task(rq, false))
2179 : ;
2180 : }
2181 :
2182 : #ifdef HAVE_RT_PUSH_IPI
2183 :
2184 : /*
2185 : * When a high priority task schedules out from a CPU and a lower priority
2186 : * task is scheduled in, a check is made to see if there's any RT tasks
2187 : * on other CPUs that are waiting to run because a higher priority RT task
2188 : * is currently running on its CPU. In this case, the CPU with multiple RT
2189 : * tasks queued on it (overloaded) needs to be notified that a CPU has opened
2190 : * up that may be able to run one of its non-running queued RT tasks.
2191 : *
2192 : * All CPUs with overloaded RT tasks need to be notified as there is currently
2193 : * no way to know which of these CPUs have the highest priority task waiting
2194 : * to run. Instead of trying to take a spinlock on each of these CPUs,
2195 : * which has been shown to cause large latencies when done on machines with
2196 : * many CPUs, an IPI is sent to the CPUs to have them push off the overloaded
2197 : * RT tasks waiting to run.
2198 : *
2199 : * Just sending an IPI to each of the CPUs is also an issue: on machines
2200 : * with a large CPU count, this can cause an IPI storm on a CPU, especially
2201 : * if it's the only CPU with multiple RT tasks queued and a large number
2202 : * of CPUs schedule a lower priority task at the same time.
2203 : *
2204 : * Each root domain has its own irq work function that can iterate over
2205 : * all CPUs with RT overloaded tasks. Since every RT-overloaded CPU must
2206 : * be checked regardless of whether one or many CPUs are lowering their
2207 : * priority, there's a single irq work iterator that will try to
2208 : * push off RT tasks that are waiting to run.
2209 : *
2210 : * When a CPU schedules a lower priority task, it will kick off the
2211 : * irq work iterator that will jump to each CPU with overloaded RT tasks.
2212 : * As it only takes the first CPU that schedules a lower priority task
2213 : * to start the process, the rto_loop_start variable is claimed atomically,
2214 : * and only the CPU that wins that claim will try to take the rto_lock.
2215 : * This prevents high contention on the lock as the process handles all
2216 : * CPUs scheduling lower priority tasks.
2217 : *
2218 : * All CPUs that are scheduling a lower priority task will increment the
2219 : * rto_loop_next variable. This will make sure that the irq work iterator
2220 : * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2221 : * priority task, even if the iterator is in the middle of a scan. Incrementing
2222 : * the rto_loop_next will cause the iterator to perform another scan.
2223 : *
2224 : */
2225 : static int rto_next_cpu(struct root_domain *rd)
2226 : {
2227 : int next;
2228 : int cpu;
2229 :
2230 : /*
2231 : * When the IPI RT push is started, rto_cpu is set to -1, and
2232 : * rto_next_cpu() will simply return the first CPU found in
2233 : * the rto_mask.
2234 : *
2235 : * If rto_next_cpu() is called while rto_cpu is a valid CPU, it
2236 : * will return the next CPU found in the rto_mask.
2237 : *
2238 : * If there are no more CPUs left in the rto_mask, then a check is made
2239 : * against rto_loop and rto_loop_next. rto_loop is only updated with
2240 : * the rto_lock held, but any CPU may increment the rto_loop_next
2241 : * without any locking.
2242 : */
2243 : for (;;) {
2244 :
2245 : /* When rto_cpu is -1 this acts like cpumask_first() */
2246 : cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2247 :
2248 : rd->rto_cpu = cpu;
2249 :
2250 : if (cpu < nr_cpu_ids)
2251 : return cpu;
2252 :
2253 : rd->rto_cpu = -1;
2254 :
2255 : /*
2256 : * ACQUIRE ensures we see the @rto_mask changes
2257 : * made prior to the @next value observed.
2258 : *
2259 : * Matches WMB in rt_set_overload().
2260 : */
2261 : next = atomic_read_acquire(&rd->rto_loop_next);
2262 :
2263 : if (rd->rto_loop == next)
2264 : break;
2265 :
2266 : rd->rto_loop = next;
2267 : }
2268 :
2269 : return -1;
2270 : }
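
/*
 * Hedged userspace sketch of the rto_loop/rto_loop_next handshake described
 * above, written with C11 atomics rather than the kernel's atomic_t helpers.
 * Requesters bump 'loop_next'; the single worker keeps re-scanning until it
 * has observed every bump, so a request made mid-scan still triggers another
 * pass.  The scan callback is a stand-in; this is not the kernel code.
 */
#include <stdatomic.h>

static atomic_int loop_next;
static int loop;		/* only touched by the single worker */

void request_scan(void)
{
	/* Any thread may ask for (at least) one more full pass. */
	atomic_fetch_add_explicit(&loop_next, 1, memory_order_release);
}

void worker(void (*scan_overloaded_cpus)(void))
{
	for (;;) {
		scan_overloaded_cpus();

		/* Acquire pairs with the release in request_scan(). */
		int next = atomic_load_explicit(&loop_next,
						memory_order_acquire);
		if (loop == next)
			break;		/* no new requests arrived mid-scan */
		loop = next;		/* something shifted: scan again */
	}
}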
2271 :
2272 : static inline bool rto_start_trylock(atomic_t *v)
2273 : {
2274 : return !atomic_cmpxchg_acquire(v, 0, 1);
2275 : }
2276 :
2277 : static inline void rto_start_unlock(atomic_t *v)
2278 : {
2279 : atomic_set_release(v, 0);
2280 : }
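
/*
 * Hedged sketch of the rto_start_trylock()/rto_start_unlock() idiom using
 * C11 atomics: a one-shot "did I win the race to start?" gate built from an
 * acquire compare-exchange and a release store.  Illustrative only, not the
 * kernel's atomic_cmpxchg_acquire()/atomic_set_release() implementation.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int loop_start;

bool start_trylock(void)
{
	int expected = 0;

	/* Exactly one caller succeeds until start_unlock() is called. */
	return atomic_compare_exchange_strong_explicit(&loop_start, &expected, 1,
						       memory_order_acquire,
						       memory_order_relaxed);
}

void start_unlock(void)
{
	atomic_store_explicit(&loop_start, 0, memory_order_release);
}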
2281 :
2282 : static void tell_cpu_to_push(struct rq *rq)
2283 : {
2284 : int cpu = -1;
2285 :
2286 : /* Keep the loop going if the IPI is currently active */
2287 : atomic_inc(&rq->rd->rto_loop_next);
2288 :
2289 : /* Only one CPU can initiate a loop at a time */
2290 : if (!rto_start_trylock(&rq->rd->rto_loop_start))
2291 : return;
2292 :
2293 : raw_spin_lock(&rq->rd->rto_lock);
2294 :
2295 : /*
2296 : * The rto_cpu is updated under the lock. If it holds a valid CPU,
2297 : * then the IPI is still running and will continue due to the
2298 : * update to rto_loop_next, and nothing needs to be done here.
2299 : * Otherwise it is finishing up and an IPI needs to be sent.
2300 : */
2301 : if (rq->rd->rto_cpu < 0)
2302 : cpu = rto_next_cpu(rq->rd);
2303 :
2304 : raw_spin_unlock(&rq->rd->rto_lock);
2305 :
2306 : rto_start_unlock(&rq->rd->rto_loop_start);
2307 :
2308 : if (cpu >= 0) {
2309 : /* Make sure the rd does not get freed while pushing */
2310 : sched_get_rd(rq->rd);
2311 : irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2312 : }
2313 : }
2314 :
2315 : /* Called from hardirq context */
2316 : void rto_push_irq_work_func(struct irq_work *work)
2317 : {
2318 : struct root_domain *rd =
2319 : container_of(work, struct root_domain, rto_push_work);
2320 : struct rq *rq;
2321 : int cpu;
2322 :
2323 : rq = this_rq();
2324 :
2325 : /*
2326 : * We do not need to grab the lock to check for has_pushable_tasks.
2327 : * When it gets updated, a check is made if a push is possible.
2328 : */
2329 : if (has_pushable_tasks(rq)) {
2330 : raw_spin_rq_lock(rq);
2331 : while (push_rt_task(rq, true))
2332 : ;
2333 : raw_spin_rq_unlock(rq);
2334 : }
2335 :
2336 : raw_spin_lock(&rd->rto_lock);
2337 :
2338 : /* Pass the IPI to the next rt overloaded queue */
2339 : cpu = rto_next_cpu(rd);
2340 :
2341 : raw_spin_unlock(&rd->rto_lock);
2342 :
2343 : if (cpu < 0) {
2344 : sched_put_rd(rd);
2345 : return;
2346 : }
2347 :
2348 : /* Try the next RT overloaded CPU */
2349 : irq_work_queue_on(&rd->rto_push_work, cpu);
2350 : }
2351 : #endif /* HAVE_RT_PUSH_IPI */
2352 :
2353 : static void pull_rt_task(struct rq *this_rq)
2354 : {
2355 : int this_cpu = this_rq->cpu, cpu;
2356 : bool resched = false;
2357 : struct task_struct *p, *push_task;
2358 : struct rq *src_rq;
2359 : int rt_overload_count = rt_overloaded(this_rq);
2360 :
2361 : if (likely(!rt_overload_count))
2362 : return;
2363 :
2364 : /*
2365 : * Match the barrier from rt_set_overload(); this guarantees that if we
2366 : * see overloaded we must also see the rto_mask bit.
2367 : */
2368 : smp_rmb();
2369 :
2370 : /* If we are the only overloaded CPU do nothing */
2371 : if (rt_overload_count == 1 &&
2372 : cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2373 : return;
2374 :
2375 : #ifdef HAVE_RT_PUSH_IPI
2376 : if (sched_feat(RT_PUSH_IPI)) {
2377 : tell_cpu_to_push(this_rq);
2378 : return;
2379 : }
2380 : #endif
2381 :
2382 : for_each_cpu(cpu, this_rq->rd->rto_mask) {
2383 : if (this_cpu == cpu)
2384 : continue;
2385 :
2386 : src_rq = cpu_rq(cpu);
2387 :
2388 : /*
2389 : * Don't bother taking the src_rq->lock if the next highest
2390 : * task is known to be lower-priority than our current task.
2391 : * This may look racy, but if this value is about to go
2392 : * logically higher, the src_rq will push this task away.
2393 : * And if it's going logically lower, we do not care.
2394 : */
2395 : if (src_rq->rt.highest_prio.next >=
2396 : this_rq->rt.highest_prio.curr)
2397 : continue;
2398 :
2399 : /*
2400 : * We can potentially drop this_rq's lock in
2401 : * double_lock_balance, and another CPU could
2402 : * alter this_rq
2403 : */
2404 : push_task = NULL;
2405 : double_lock_balance(this_rq, src_rq);
2406 :
2407 : /*
2408 : * We can only pull a task that is pushable
2409 : * on its rq, and no others.
2410 : */
2411 : p = pick_highest_pushable_task(src_rq, this_cpu);
2412 :
2413 : /*
2414 : * Do we have an RT task that preempts
2415 : * the to-be-scheduled task?
2416 : */
2417 : if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2418 : WARN_ON(p == src_rq->curr);
2419 : WARN_ON(!task_on_rq_queued(p));
2420 :
2421 : /*
2422 : * There's a chance that p is higher in priority
2423 : * than what's currently running on its CPU.
2424 : * That just means p is waking up and hasn't
2425 : * had a chance to schedule yet. We only pull
2426 : * p if it is lower in priority than the
2427 : * current task on its run queue.
2428 : */
2429 : if (p->prio < src_rq->curr->prio)
2430 : goto skip;
2431 :
2432 : if (is_migration_disabled(p)) {
2433 : push_task = get_push_task(src_rq);
2434 : } else {
2435 : deactivate_task(src_rq, p, 0);
2436 : set_task_cpu(p, this_cpu);
2437 : activate_task(this_rq, p, 0);
2438 : resched = true;
2439 : }
2440 : /*
2441 : * We continue with the search, just in
2442 : * case there's an even higher prio task
2443 : * in another runqueue. (low likelihood
2444 : * but possible)
2445 : */
2446 : }
2447 : skip:
2448 : double_unlock_balance(this_rq, src_rq);
2449 :
2450 : if (push_task) {
2451 : raw_spin_rq_unlock(this_rq);
2452 : stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
2453 : push_task, &src_rq->push_work);
2454 : raw_spin_rq_lock(this_rq);
2455 : }
2456 : }
2457 :
2458 : if (resched)
2459 : resched_curr(this_rq);
2460 : }
2461 :
2462 : /*
2463 : * If we are not running and we are not going to reschedule soon, we should
2464 : * try to push tasks away now
2465 : */
2466 : static void task_woken_rt(struct rq *rq, struct task_struct *p)
2467 : {
2468 : bool need_to_push = !task_on_cpu(rq, p) &&
2469 : !test_tsk_need_resched(rq->curr) &&
2470 : p->nr_cpus_allowed > 1 &&
2471 : (dl_task(rq->curr) || rt_task(rq->curr)) &&
2472 : (rq->curr->nr_cpus_allowed < 2 ||
2473 : rq->curr->prio <= p->prio);
2474 :
2475 : if (need_to_push)
2476 : push_rt_tasks(rq);
2477 : }
2478 :
2479 : /* Assumes rq->lock is held */
2480 : static void rq_online_rt(struct rq *rq)
2481 : {
2482 : if (rq->rt.overloaded)
2483 : rt_set_overload(rq);
2484 :
2485 : __enable_runtime(rq);
2486 :
2487 : cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2488 : }
2489 :
2490 : /* Assumes rq->lock is held */
2491 : static void rq_offline_rt(struct rq *rq)
2492 : {
2493 : if (rq->rt.overloaded)
2494 : rt_clear_overload(rq);
2495 :
2496 : __disable_runtime(rq);
2497 :
2498 : cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2499 : }
2500 :
2501 : /*
2502 : * When switching away from the rt queue, we bring ourselves to a position
2503 : * where we might want to pull RT tasks from other runqueues.
2504 : */
2505 : static void switched_from_rt(struct rq *rq, struct task_struct *p)
2506 : {
2507 : /*
2508 : * If there are other RT tasks then we will reschedule
2509 : * and the scheduling of the other RT tasks will handle
2510 : * the balancing. But if we are the last RT task
2511 : * we may need to handle the pulling of RT tasks
2512 : * now.
2513 : */
2514 : if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2515 : return;
2516 :
2517 : rt_queue_pull_task(rq);
2518 : }
2519 :
2520 : void __init init_sched_rt_class(void)
2521 : {
2522 : unsigned int i;
2523 :
2524 : for_each_possible_cpu(i) {
2525 : zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2526 : GFP_KERNEL, cpu_to_node(i));
2527 : }
2528 : }
2529 : #endif /* CONFIG_SMP */
2530 :
2531 : /*
2532 : * When switching a task to RT, we may overload the runqueue
2533 : * with RT tasks. In this case we try to push them off to
2534 : * other runqueues.
2535 : */
2536 0 : static void switched_to_rt(struct rq *rq, struct task_struct *p)
2537 : {
2538 : /*
2539 : * If we are running, update the avg_rt tracking, as the running time
2540 : * will from now on be accounted into the latter.
2541 : */
2542 0 : if (task_current(rq, p)) {
2543 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2544 : return;
2545 : }
2546 :
2547 : /*
2548 : * If we are not running we may need to preempt the currently
2549 : * running task. If that task is also an RT task
2550 : * then see if we can move to another run queue.
2551 : */
2552 0 : if (task_on_rq_queued(p)) {
2553 : #ifdef CONFIG_SMP
2554 : if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2555 : rt_queue_push_tasks(rq);
2556 : #endif /* CONFIG_SMP */
2557 0 : if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2558 0 : resched_curr(rq);
2559 : }
2560 : }
2561 :
2562 : /*
2563 : * Priority of the task has changed. This may cause
2564 : * us to initiate a push or pull.
2565 : */
2566 : static void
2567 0 : prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2568 : {
2569 0 : if (!task_on_rq_queued(p))
2570 : return;
2571 :
2572 0 : if (task_current(rq, p)) {
2573 : #ifdef CONFIG_SMP
2574 : /*
2575 : * If our priority decreases while running, we
2576 : * may need to pull tasks to this runqueue.
2577 : */
2578 : if (oldprio < p->prio)
2579 : rt_queue_pull_task(rq);
2580 :
2581 : /*
2582 : * If there's a higher priority task waiting to run
2583 : * then reschedule.
2584 : */
2585 : if (p->prio > rq->rt.highest_prio.curr)
2586 : resched_curr(rq);
2587 : #else
2588 : /* For UP simply resched on drop of prio */
2589 0 : if (oldprio < p->prio)
2590 0 : resched_curr(rq);
2591 : #endif /* CONFIG_SMP */
2592 : } else {
2593 : /*
2594 : * This task is not running, but if it is
2595 : * greater than the current running task
2596 : * then reschedule.
2597 : */
2598 0 : if (p->prio < rq->curr->prio)
2599 0 : resched_curr(rq);
2600 : }
2601 : }
2602 :
2603 : #ifdef CONFIG_POSIX_TIMERS
2604 0 : static void watchdog(struct rq *rq, struct task_struct *p)
2605 : {
2606 : unsigned long soft, hard;
2607 :
2608 : /* The hard limit may change after the soft limit is read; this will be fixed up on the next tick */
2609 0 : soft = task_rlimit(p, RLIMIT_RTTIME);
2610 0 : hard = task_rlimit_max(p, RLIMIT_RTTIME);
2611 :
2612 0 : if (soft != RLIM_INFINITY) {
2613 : unsigned long next;
2614 :
2615 0 : if (p->rt.watchdog_stamp != jiffies) {
2616 0 : p->rt.timeout++;
2617 0 : p->rt.watchdog_stamp = jiffies;
2618 : }
2619 :
2620 0 : next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2621 0 : if (p->rt.timeout > next) {
2622 0 : posix_cputimers_rt_watchdog(&p->posix_cputimers,
2623 : p->se.sum_exec_runtime);
2624 : }
2625 : }
2626 0 : }
2627 : #else
2628 : static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2629 : #endif
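
/*
 * Illustrative userspace sketch: the watchdog above enforces RLIMIT_RTTIME,
 * which a process can set on itself via setrlimit(2).  The limit is in
 * microseconds of RT CPU time; the 100 ms / 200 ms values below are
 * arbitrary examples.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl = {
		.rlim_cur = 100000,	/* soft limit: 100 ms, SIGXCPU beyond this */
		.rlim_max = 200000,	/* hard limit: 200 ms */
	};

	if (setrlimit(RLIMIT_RTTIME, &rl))
		perror("setrlimit(RLIMIT_RTTIME)");

	return 0;
}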
2630 :
2631 : /*
2632 : * scheduler tick hitting a task of our scheduling class.
2633 : *
2634 : * NOTE: This function can be called remotely by the tick offload that
2635 : * goes along full dynticks. Therefore no local assumption can be made
2636 : * and everything must be accessed through the @rq and @curr passed in
2637 : * parameters.
2638 : */
2639 0 : static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2640 : {
2641 0 : struct sched_rt_entity *rt_se = &p->rt;
2642 :
2643 0 : update_curr_rt(rq);
2644 0 : update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2645 :
2646 0 : watchdog(rq, p);
2647 :
2648 : /*
2649 : * RR tasks need a special form of timeslice management.
2650 : * FIFO tasks have no timeslices.
2651 : */
2652 0 : if (p->policy != SCHED_RR)
2653 : return;
2654 :
2655 0 : if (--p->rt.time_slice)
2656 : return;
2657 :
2658 0 : p->rt.time_slice = sched_rr_timeslice;
2659 :
2660 : /*
2661 : * Requeue to the end of the queue if we (and all of our ancestors) are not
2662 : * the only element on the queue.
2663 : */
2664 0 : for_each_sched_rt_entity(rt_se) {
2665 0 : if (rt_se->run_list.prev != rt_se->run_list.next) {
2666 0 : requeue_task_rt(rq, p, 0);
2667 0 : resched_curr(rq);
2668 0 : return;
2669 : }
2670 : }
2671 : }
2672 :
2673 0 : static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2674 : {
2675 : /*
2676 : * Time slice is 0 for SCHED_FIFO tasks
2677 : */
2678 0 : if (task->policy == SCHED_RR)
2679 0 : return sched_rr_timeslice;
2680 : else
2681 : return 0;
2682 : }
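
/*
 * Illustrative userspace sketch: get_rr_interval_rt() is what this class
 * reports through sched_rr_get_interval(2), so a SCHED_RR thread sees its
 * round-robin timeslice and a SCHED_FIFO thread sees zero.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts))	/* pid 0: the calling thread */
		perror("sched_rr_get_interval");
	else
		printf("RR timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);

	return 0;
}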
2683 :
2684 : #ifdef CONFIG_SCHED_CORE
2685 : static int task_is_throttled_rt(struct task_struct *p, int cpu)
2686 : {
2687 : struct rt_rq *rt_rq;
2688 :
2689 : #ifdef CONFIG_RT_GROUP_SCHED
2690 : rt_rq = task_group(p)->rt_rq[cpu];
2691 : #else
2692 : rt_rq = &cpu_rq(cpu)->rt;
2693 : #endif
2694 :
2695 : return rt_rq_throttled(rt_rq);
2696 : }
2697 : #endif
2698 :
2699 : DEFINE_SCHED_CLASS(rt) = {
2700 :
2701 : .enqueue_task = enqueue_task_rt,
2702 : .dequeue_task = dequeue_task_rt,
2703 : .yield_task = yield_task_rt,
2704 :
2705 : .check_preempt_curr = check_preempt_curr_rt,
2706 :
2707 : .pick_next_task = pick_next_task_rt,
2708 : .put_prev_task = put_prev_task_rt,
2709 : .set_next_task = set_next_task_rt,
2710 :
2711 : #ifdef CONFIG_SMP
2712 : .balance = balance_rt,
2713 : .pick_task = pick_task_rt,
2714 : .select_task_rq = select_task_rq_rt,
2715 : .set_cpus_allowed = set_cpus_allowed_common,
2716 : .rq_online = rq_online_rt,
2717 : .rq_offline = rq_offline_rt,
2718 : .task_woken = task_woken_rt,
2719 : .switched_from = switched_from_rt,
2720 : .find_lock_rq = find_lock_lowest_rq,
2721 : #endif
2722 :
2723 : .task_tick = task_tick_rt,
2724 :
2725 : .get_rr_interval = get_rr_interval_rt,
2726 :
2727 : .prio_changed = prio_changed_rt,
2728 : .switched_to = switched_to_rt,
2729 :
2730 : .update_curr = update_curr_rt,
2731 :
2732 : #ifdef CONFIG_SCHED_CORE
2733 : .task_is_throttled = task_is_throttled_rt,
2734 : #endif
2735 :
2736 : #ifdef CONFIG_UCLAMP_TASK
2737 : .uclamp_enabled = 1,
2738 : #endif
2739 : };
2740 :
2741 : #ifdef CONFIG_RT_GROUP_SCHED
2742 : /*
2743 : * Ensure that the real time constraints are schedulable.
2744 : */
2745 : static DEFINE_MUTEX(rt_constraints_mutex);
2746 :
2747 : static inline int tg_has_rt_tasks(struct task_group *tg)
2748 : {
2749 : struct task_struct *task;
2750 : struct css_task_iter it;
2751 : int ret = 0;
2752 :
2753 : /*
2754 : * Autogroups do not have RT tasks; see autogroup_create().
2755 : */
2756 : if (task_group_is_autogroup(tg))
2757 : return 0;
2758 :
2759 : css_task_iter_start(&tg->css, 0, &it);
2760 : while (!ret && (task = css_task_iter_next(&it)))
2761 : ret |= rt_task(task);
2762 : css_task_iter_end(&it);
2763 :
2764 : return ret;
2765 : }
2766 :
2767 : struct rt_schedulable_data {
2768 : struct task_group *tg;
2769 : u64 rt_period;
2770 : u64 rt_runtime;
2771 : };
2772 :
2773 : static int tg_rt_schedulable(struct task_group *tg, void *data)
2774 : {
2775 : struct rt_schedulable_data *d = data;
2776 : struct task_group *child;
2777 : unsigned long total, sum = 0;
2778 : u64 period, runtime;
2779 :
2780 : period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2781 : runtime = tg->rt_bandwidth.rt_runtime;
2782 :
2783 : if (tg == d->tg) {
2784 : period = d->rt_period;
2785 : runtime = d->rt_runtime;
2786 : }
2787 :
2788 : /*
2789 : * Cannot have more runtime than the period.
2790 : */
2791 : if (runtime > period && runtime != RUNTIME_INF)
2792 : return -EINVAL;
2793 :
2794 : /*
2795 : * Ensure we don't starve existing RT tasks if runtime turns zero.
2796 : */
2797 : if (rt_bandwidth_enabled() && !runtime &&
2798 : tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
2799 : return -EBUSY;
2800 :
2801 : total = to_ratio(period, runtime);
2802 :
2803 : /*
2804 : * Nobody can have more than the global setting allows.
2805 : */
2806 : if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2807 : return -EINVAL;
2808 :
2809 : /*
2810 : * The sum of our children's runtime should not exceed our own.
2811 : */
2812 : list_for_each_entry_rcu(child, &tg->children, siblings) {
2813 : period = ktime_to_ns(child->rt_bandwidth.rt_period);
2814 : runtime = child->rt_bandwidth.rt_runtime;
2815 :
2816 : if (child == d->tg) {
2817 : period = d->rt_period;
2818 : runtime = d->rt_runtime;
2819 : }
2820 :
2821 : sum += to_ratio(period, runtime);
2822 : }
2823 :
2824 : if (sum > total)
2825 : return -EINVAL;
2826 :
2827 : return 0;
2828 : }
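
/*
 * Hedged sketch of the hierarchical check tg_rt_schedulable() performs.
 * to_ratio() is not defined in this file, so the fixed-point stand-in below
 * (runtime scaled by an assumed 20-bit shift, divided by period) is only an
 * approximation used for illustration.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define TOY_BW_SHIFT	20	/* assumed scaling factor */

static uint64_t toy_ratio(uint64_t period, uint64_t runtime)
{
	return period ? (runtime << TOY_BW_SHIFT) / period : 0;
}

/* A group fits if the sum of its children's bandwidth stays within its own. */
bool toy_group_fits(uint64_t period, uint64_t runtime,
		    const uint64_t *child_period, const uint64_t *child_runtime,
		    size_t nr_children)
{
	uint64_t total = toy_ratio(period, runtime);
	uint64_t sum = 0;

	for (size_t i = 0; i < nr_children; i++)
		sum += toy_ratio(child_period[i], child_runtime[i]);

	return sum <= total;
}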
2829 :
2830 : static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2831 : {
2832 : int ret;
2833 :
2834 : struct rt_schedulable_data data = {
2835 : .tg = tg,
2836 : .rt_period = period,
2837 : .rt_runtime = runtime,
2838 : };
2839 :
2840 : rcu_read_lock();
2841 : ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2842 : rcu_read_unlock();
2843 :
2844 : return ret;
2845 : }
2846 :
2847 : static int tg_set_rt_bandwidth(struct task_group *tg,
2848 : u64 rt_period, u64 rt_runtime)
2849 : {
2850 : int i, err = 0;
2851 :
2852 : /*
2853 : * Disallowing the root group RT runtime is BAD; it would prevent the
2854 : * kernel from creating (and/or operating) RT threads.
2855 : */
2856 : if (tg == &root_task_group && rt_runtime == 0)
2857 : return -EINVAL;
2858 :
2859 : /* A period of zero doesn't make any sense. */
2860 : if (rt_period == 0)
2861 : return -EINVAL;
2862 :
2863 : /*
2864 : * Bound the quota to defend against overflow during the bandwidth shift.
2865 : */
2866 : if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2867 : return -EINVAL;
2868 :
2869 : mutex_lock(&rt_constraints_mutex);
2870 : err = __rt_schedulable(tg, rt_period, rt_runtime);
2871 : if (err)
2872 : goto unlock;
2873 :
2874 : raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2875 : tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2876 : tg->rt_bandwidth.rt_runtime = rt_runtime;
2877 :
2878 : for_each_possible_cpu(i) {
2879 : struct rt_rq *rt_rq = tg->rt_rq[i];
2880 :
2881 : raw_spin_lock(&rt_rq->rt_runtime_lock);
2882 : rt_rq->rt_runtime = rt_runtime;
2883 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
2884 : }
2885 : raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2886 : unlock:
2887 : mutex_unlock(&rt_constraints_mutex);
2888 :
2889 : return err;
2890 : }
2891 :
2892 : int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2893 : {
2894 : u64 rt_runtime, rt_period;
2895 :
2896 : rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2897 : rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2898 : if (rt_runtime_us < 0)
2899 : rt_runtime = RUNTIME_INF;
2900 : else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2901 : return -EINVAL;
2902 :
2903 : return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2904 : }
2905 :
2906 : long sched_group_rt_runtime(struct task_group *tg)
2907 : {
2908 : u64 rt_runtime_us;
2909 :
2910 : if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2911 : return -1;
2912 :
2913 : rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2914 : do_div(rt_runtime_us, NSEC_PER_USEC);
2915 : return rt_runtime_us;
2916 : }
2917 :
2918 : int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2919 : {
2920 : u64 rt_runtime, rt_period;
2921 :
2922 : if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2923 : return -EINVAL;
2924 :
2925 : rt_period = rt_period_us * NSEC_PER_USEC;
2926 : rt_runtime = tg->rt_bandwidth.rt_runtime;
2927 :
2928 : return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2929 : }
2930 :
2931 : long sched_group_rt_period(struct task_group *tg)
2932 : {
2933 : u64 rt_period_us;
2934 :
2935 : rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2936 : do_div(rt_period_us, NSEC_PER_USEC);
2937 : return rt_period_us;
2938 : }
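
/*
 * Illustrative userspace sketch: with CONFIG_RT_GROUP_SCHED, the setters
 * above are normally reached through the cgroup v1 cpu controller files
 * cpu.rt_period_us and cpu.rt_runtime_us.  The mount point and the group
 * name "rtgroup" below are assumptions made for the example.
 */
#include <stdio.h>

static int write_long(const char *path, long val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%ld\n", val);
	return fclose(f);
}

int main(void)
{
	/* Give the (hypothetical) group 300 ms of RT time per 1 s period. */
	write_long("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_period_us", 1000000);
	write_long("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_runtime_us", 300000);
	return 0;
}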
2939 :
2940 : #ifdef CONFIG_SYSCTL
2941 : static int sched_rt_global_constraints(void)
2942 : {
2943 : int ret = 0;
2944 :
2945 : mutex_lock(&rt_constraints_mutex);
2946 : ret = __rt_schedulable(NULL, 0, 0);
2947 : mutex_unlock(&rt_constraints_mutex);
2948 :
2949 : return ret;
2950 : }
2951 : #endif /* CONFIG_SYSCTL */
2952 :
2953 : int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2954 : {
2955 : /* Don't accept realtime tasks when there is no way for them to run */
2956 : if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2957 : return 0;
2958 :
2959 : return 1;
2960 : }
2961 :
2962 : #else /* !CONFIG_RT_GROUP_SCHED */
2963 :
2964 : #ifdef CONFIG_SYSCTL
2965 0 : static int sched_rt_global_constraints(void)
2966 : {
2967 : unsigned long flags;
2968 : int i;
2969 :
2970 0 : raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2971 0 : for_each_possible_cpu(i) {
2972 0 : struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2973 :
2974 0 : raw_spin_lock(&rt_rq->rt_runtime_lock);
2975 0 : rt_rq->rt_runtime = global_rt_runtime();
2976 0 : raw_spin_unlock(&rt_rq->rt_runtime_lock);
2977 : }
2978 0 : raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2979 :
2980 0 : return 0;
2981 : }
2982 : #endif /* CONFIG_SYSCTL */
2983 : #endif /* CONFIG_RT_GROUP_SCHED */
2984 :
2985 : #ifdef CONFIG_SYSCTL
2986 : static int sched_rt_global_validate(void)
2987 : {
2988 0 : if (sysctl_sched_rt_period <= 0)
2989 : return -EINVAL;
2990 :
2991 0 : if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2992 0 : ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
2993 0 : ((u64)sysctl_sched_rt_runtime *
2994 : NSEC_PER_USEC > max_rt_runtime)))
2995 : return -EINVAL;
2996 :
2997 : return 0;
2998 : }
2999 :
3000 0 : static void sched_rt_do_global(void)
3001 : {
3002 : unsigned long flags;
3003 :
3004 0 : raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
3005 0 : def_rt_bandwidth.rt_runtime = global_rt_runtime();
3006 0 : def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
3007 0 : raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
3008 0 : }
3009 :
3010 0 : static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
3011 : size_t *lenp, loff_t *ppos)
3012 : {
3013 : int old_period, old_runtime;
3014 : static DEFINE_MUTEX(mutex);
3015 : int ret;
3016 :
3017 0 : mutex_lock(&mutex);
3018 0 : old_period = sysctl_sched_rt_period;
3019 0 : old_runtime = sysctl_sched_rt_runtime;
3020 :
3021 0 : ret = proc_dointvec(table, write, buffer, lenp, ppos);
3022 :
3023 0 : if (!ret && write) {
3024 0 : ret = sched_rt_global_validate();
3025 0 : if (ret)
3026 : goto undo;
3027 :
3028 0 : ret = sched_dl_global_validate();
3029 0 : if (ret)
3030 : goto undo;
3031 :
3032 0 : ret = sched_rt_global_constraints();
3033 0 : if (ret)
3034 : goto undo;
3035 :
3036 0 : sched_rt_do_global();
3037 0 : sched_dl_do_global();
3038 : }
3039 : if (0) {
3040 : undo:
3041 0 : sysctl_sched_rt_period = old_period;
3042 0 : sysctl_sched_rt_runtime = old_runtime;
3043 : }
3044 0 : mutex_unlock(&mutex);
3045 :
3046 0 : return ret;
3047 : }
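
/*
 * Illustrative userspace sketch: sched_rt_handler() above services writes to
 * the sched_rt_period_us and sched_rt_runtime_us sysctls, commonly exposed
 * under /proc/sys/kernel/.  Writing -1 to the runtime selects RUNTIME_INF
 * (no RT throttling); the 900 ms value below is only an example and the
 * write requires root.
 */
#include <stdio.h>

int main(void)
{
	FILE *f;
	long period = -1, runtime = -1;

	/* Read the current settings. */
	f = fopen("/proc/sys/kernel/sched_rt_period_us", "r");
	if (f) {
		if (fscanf(f, "%ld", &period) != 1)
			period = -1;
		fclose(f);
	}
	f = fopen("/proc/sys/kernel/sched_rt_runtime_us", "r");
	if (f) {
		if (fscanf(f, "%ld", &runtime) != 1)
			runtime = -1;
		fclose(f);
	}
	printf("RT bandwidth: %ld us of every %ld us\n", runtime, period);

	/* Raise the runtime to 900 ms. */
	f = fopen("/proc/sys/kernel/sched_rt_runtime_us", "w");
	if (f) {
		fprintf(f, "%d\n", 900000);
		fclose(f);
	}
	return 0;
}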
3048 :
3049 0 : static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
3050 : size_t *lenp, loff_t *ppos)
3051 : {
3052 : int ret;
3053 : static DEFINE_MUTEX(mutex);
3054 :
3055 0 : mutex_lock(&mutex);
3056 0 : ret = proc_dointvec(table, write, buffer, lenp, ppos);
3057 : /*
3058 : * Make sure that internally we keep the timeslice in jiffies.
3059 : * Also, writing zero (or a negative value) resets the timeslice to the default:
3060 : */
3061 0 : if (!ret && write) {
3062 0 : sched_rr_timeslice =
3063 0 : sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
3064 0 : msecs_to_jiffies(sysctl_sched_rr_timeslice);
3065 : }
3066 0 : mutex_unlock(&mutex);
3067 :
3068 0 : return ret;
3069 : }
3070 : #endif /* CONFIG_SYSCTL */
3071 :
3072 : #ifdef CONFIG_SCHED_DEBUG
3073 : void print_rt_stats(struct seq_file *m, int cpu)
3074 : {
3075 : rt_rq_iter_t iter;
3076 : struct rt_rq *rt_rq;
3077 :
3078 : rcu_read_lock();
3079 : for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
3080 : print_rt_rq(m, cpu, rt_rq);
3081 : rcu_read_unlock();
3082 : }
3083 : #endif /* CONFIG_SCHED_DEBUG */