// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
        struct futex_pi_state *pi_state;

        if (likely(current->pi_state_cache))
                return 0;

        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

        if (!pi_state)
                return -ENOMEM;

        INIT_LIST_HEAD(&pi_state->list);
        /* pi_mutex gets initialized later */
        pi_state->owner = NULL;
        refcount_set(&pi_state->refcount, 1);
        pi_state->key = FUTEX_KEY_INIT;

        current->pi_state_cache = pi_state;

        return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
        struct futex_pi_state *pi_state = current->pi_state_cache;

        WARN_ON(!pi_state);
        current->pi_state_cache = NULL;

        return pi_state;
}

static void pi_state_update_owner(struct futex_pi_state *pi_state,
                                  struct task_struct *new_owner)
{
        struct task_struct *old_owner = pi_state->owner;

        lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

        if (old_owner) {
                raw_spin_lock(&old_owner->pi_lock);
                WARN_ON(list_empty(&pi_state->list));
                list_del_init(&pi_state->list);
                raw_spin_unlock(&old_owner->pi_lock);
        }

        if (new_owner) {
                raw_spin_lock(&new_owner->pi_lock);
                WARN_ON(!list_empty(&pi_state->list));
                list_add(&pi_state->list, &new_owner->pi_state_list);
                pi_state->owner = new_owner;
                raw_spin_unlock(&new_owner->pi_lock);
        }
}

void get_pi_state(struct futex_pi_state *pi_state)
{
        WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
        if (!pi_state)
                return;

        if (!refcount_dec_and_test(&pi_state->refcount))
                return;

        /*
         * If pi_state->owner is NULL, the owner is most probably dying
         * and has cleaned up the pi_state already
         */
        if (pi_state->owner) {
                unsigned long flags;

                raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
                pi_state_update_owner(pi_state, NULL);
                rt_mutex_proxy_unlock(&pi_state->pi_mutex);
                raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
        }

        if (current->pi_state_cache) {
                kfree(pi_state);
        } else {
                /*
                 * pi_state->list is already empty.
                 * clear pi_state->owner.
                 * refcount is at 0 - put it back to 1.
                 */
                pi_state->owner = NULL;
                refcount_set(&pi_state->refcount, 1);
                current->pi_state_cache = pi_state;
        }
}
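
/*
 * Editor's note: the three helpers above form a one-slot, per-task
 * object cache: refill_pi_state_cache() pre-allocates while sleeping
 * is still allowed, alloc_pi_state() consumes the slot on a path that
 * must not fail, and put_pi_state() refills the slot on release.
 * Below is a minimal userspace sketch of the same pattern. All names
 * (struct obj, refill_obj_cache(), etc.) are illustrative, not kernel
 * API, and error handling is reduced to the bare minimum.
 */
#include <stdlib.h>

struct obj { int payload; };

/* One-slot cache, one per thread (the kernel uses a per-task field). */
static _Thread_local struct obj *obj_cache;

/* Called where allocation may sleep/fail: make sure the slot is full. */
static int refill_obj_cache(void)
{
        if (obj_cache)
                return 0;
        obj_cache = calloc(1, sizeof(*obj_cache));
        return obj_cache ? 0 : -1;
}

/* Called on the must-not-fail path: consume the pre-allocated slot. */
static struct obj *alloc_obj(void)
{
        struct obj *o = obj_cache;

        obj_cache = NULL;
        return o;       /* NULL only if refill_obj_cache() was skipped */
}

/* Release: prefer stashing the object back into the empty slot. */
static void put_obj(struct obj *o)
{
        if (obj_cache)
                free(o);
        else
                obj_cache = o;
}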

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]  Indicates that the kernel can acquire the futex atomically. We
 *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]  Valid, if TID does not belong to a kernel thread. If no matching
 *      thread is found then it indicates that the owner TID has died.
 *
 * [3]  Invalid. The waiter is queued on a non PI futex.
 *
 * [4]  Valid state after exit_robust_list(), which sets the user space
 *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]  The user space value got manipulated between exit_robust_list()
 *      and exit_pi_state_list().
 *
 * [6]  Valid state after exit_pi_state_list() which sets the new owner in
 *      the pi_state but cannot access the user space value.
 *
 * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]  Owner and user space value match.
 *
 * [9]  There is no transient state which sets the user space TID to 0
 *      except exit_robust_list(), but this is indicated by the
 *      FUTEX_OWNER_DIED bit. See [4].
 *
 * [10] There is no transient state which leaves owner and user space
 *      TID out of sync. Except one error case where the kernel is denied
 *      write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *      hb -> futex_q, relation
 *      futex_q -> pi_state, relation
 *
 *      (cannot be raw because hb can contain an arbitrary number
 *       of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *      {uval, pi_state}
 *
 *      (and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *      p->pi_state_list -> pi_state->list, relation
 *      pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *      pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
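
/*
 * Editor's note: a minimal userspace sketch of the uval protocol that
 * the table above validates. The FUTEX_* constants and the futex()
 * syscall are the real UAPI; the helper names (sys_futex(), pi_lock(),
 * pi_unlock()) are illustrative. Error handling is omitted for brevity.
 */
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Raw futex() syscall wrapper (glibc does not export a futex() stub). */
static long sys_futex(uint32_t *uaddr, int op, uint32_t val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

/* Lock: try the 0 -> TID fast path, else take the kernel slow path. */
static void pi_lock(uint32_t *futex)
{
        uint32_t expected = 0;
        uint32_t tid = (uint32_t)syscall(SYS_gettid);

        if (__atomic_compare_exchange_n(futex, &expected, tid, 0,
                                        __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
                return; /* uncontended: *futex == our TID, no kernel entry */

        /* Contended: the kernel sets FUTEX_WAITERS and applies PI. */
        sys_futex(futex, FUTEX_LOCK_PI, 0);
}

/*
 * Unlock: try the TID -> 0 fast path; if FUTEX_WAITERS is set the
 * cmpxchg fails and the kernel hands the futex to the top waiter.
 */
static void pi_unlock(uint32_t *futex)
{
        uint32_t expected = (uint32_t)syscall(SYS_gettid);

        if (__atomic_compare_exchange_n(futex, &expected, 0, 0,
                                        __ATOMIC_RELEASE, __ATOMIC_RELAXED))
                return; /* no waiters were queued */

        sys_futex(futex, FUTEX_UNLOCK_PI, 0);
}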

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
                              struct futex_pi_state *pi_state,
                              struct futex_pi_state **ps)
{
        pid_t pid = uval & FUTEX_TID_MASK;
        u32 uval2;
        int ret;

        /*
         * Userspace might have messed up non-PI and PI futexes [3]
         */
        if (unlikely(!pi_state))
                return -EINVAL;

        /*
         * We get here with hb->lock held, and having found a
         * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
         * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
         * which in turn means that futex_lock_pi() still has a reference on
         * our pi_state.
         *
         * The waiter holding a reference on @pi_state also protects against
         * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
         * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
         * free pi_state before we can take a reference ourselves.
         */
        WARN_ON(!refcount_read(&pi_state->refcount));

        /*
         * Now that we have a pi_state, we can acquire wait_lock
         * and do the state validation.
         */
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

        /*
         * Since {uval, pi_state} is serialized by wait_lock, and our current
         * uval was read without holding it, it can have changed. Verify it
         * still is what we expect it to be, otherwise retry the entire
         * operation.
         */
        if (futex_get_value_locked(&uval2, uaddr))
                goto out_efault;

        if (uval != uval2)
                goto out_eagain;

        /*
         * Handle the owner died case:
         */
        if (uval & FUTEX_OWNER_DIED) {
                /*
                 * exit_pi_state_list() sets owner to NULL and wakes the
                 * topmost waiter. The task which acquires the
                 * pi_state->rt_mutex will fixup owner.
                 */
                if (!pi_state->owner) {
                        /*
                         * No pi state owner, but the user space TID
                         * is not 0. Inconsistent state. [5]
                         */
                        if (pid)
                                goto out_einval;
                        /*
                         * Take a ref on the state and return success. [4]
                         */
                        goto out_attach;
                }

                /*
                 * If TID is 0, then either the dying owner has not
                 * yet executed exit_pi_state_list() or some waiter
                 * acquired the rtmutex in the pi state, but did not
                 * yet fixup the TID in user space.
                 *
                 * Take a ref on the state and return success. [6]
                 */
                if (!pid)
                        goto out_attach;
        } else {
                /*
                 * If the owner died bit is not set, then the pi_state
                 * must have an owner. [7]
                 */
                if (!pi_state->owner)
                        goto out_einval;
        }

        /*
         * Bail out if user space manipulated the futex value. If pi
         * state exists then the owner TID must be the same as the
         * user space TID. [9/10]
         */
        if (pid != task_pid_vnr(pi_state->owner))
                goto out_einval;

out_attach:
        get_pi_state(pi_state);
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        *ps = pi_state;
        return 0;

out_einval:
        ret = -EINVAL;
        goto out_error;

out_eagain:
        ret = -EAGAIN;
        goto out_error;

out_efault:
        ret = -EFAULT;
        goto out_error;

out_error:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        return ret;
}

static int handle_exit_race(u32 __user *uaddr, u32 uval,
                            struct task_struct *tsk)
{
        u32 uval2;

        /*
         * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
         * caller that the alleged owner is busy.
         */
        if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
                return -EBUSY;

        /*
         * Reread the user space value to handle the following situation:
         *
         * CPU0                         CPU1
         *
         * sys_exit()                   sys_futex()
         *  do_exit()                    futex_lock_pi()
         *                                futex_lock_pi_atomic()
         *   exit_signals(tsk)              No waiters:
         *    tsk->flags |= PF_EXITING;     *uaddr == 0x00000PID
         *  mm_release(tsk)                 Set waiter bit
         *   exit_robust_list(tsk) {        *uaddr = 0x80000PID;
         *      Set owner died              attach_to_pi_owner() {
         *    *uaddr = 0xC0000000;           tsk = get_task(PID);
         *   }                               if (!tsk->flags & PF_EXITING) {
         *   ...                               attach();
         *   tsk->futex_state =              } else {
         *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
         *                                        FUTEX_STATE_DEAD)
         *                                       return -EAGAIN;
         *                                     return -ESRCH; <--- FAIL
         *                                   }
         *
         * Returning ESRCH unconditionally is wrong here because the
         * user space value has been changed by the exiting task.
         *
         * The same logic applies to the case where the exiting task is
         * already gone.
         */
        if (futex_get_value_locked(&uval2, uaddr))
                return -EFAULT;

        /* If the user space value has changed, try again. */
        if (uval2 != uval)
                return -EAGAIN;

        /*
         * The exiting task did not have a robust list, the robust list was
         * corrupted or the user space value in *uaddr is simply bogus.
         * Give up and tell user space.
         */
        return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
                                 struct futex_pi_state **ps)
{
        /*
         * No existing pi state. First waiter. [2]
         *
         * This creates pi_state, we have hb->lock held, this means nothing can
         * observe this state, wait_lock is irrelevant.
         */
        struct futex_pi_state *pi_state = alloc_pi_state();

        /*
         * Initialize the pi_mutex in locked state and make @p
         * the owner of it:
         */
        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

        /* Store the key for possible exit cleanups: */
        pi_state->key = *key;

        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &p->pi_state_list);
        /*
         * Assignment without holding pi_state->pi_mutex.wait_lock is safe
         * because there is no concurrency as the object is not published yet.
         */
        pi_state->owner = p;

        *ps = pi_state;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
                              struct futex_pi_state **ps,
                              struct task_struct **exiting)
{
        pid_t pid = uval & FUTEX_TID_MASK;
        struct task_struct *p;

        /*
         * We are the first waiter - try to look up the real owner and attach
         * the new pi_state to it, but bail out when TID = 0 [1]
         *
         * The !pid check is paranoid. None of the call sites should end up
         * with pid == 0, but better safe than sorry. Let the caller retry.
         */
        if (!pid)
                return -EAGAIN;
        p = find_get_task_by_vpid(pid);
        if (!p)
                return handle_exit_race(uaddr, uval, NULL);

        if (unlikely(p->flags & PF_KTHREAD)) {
                put_task_struct(p);
                return -EPERM;
        }

        /*
         * We need to look at the task state to figure out whether the
         * task is exiting. To protect against the change of the task state
         * in futex_exit_release(), we do this protected by p->pi_lock:
         */
        raw_spin_lock_irq(&p->pi_lock);
        if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
                /*
                 * The task is on the way out. When the futex state is
                 * FUTEX_STATE_DEAD, we know that the task has finished
                 * the cleanup:
                 */
                int ret = handle_exit_race(uaddr, uval, p);

                raw_spin_unlock_irq(&p->pi_lock);
                /*
                 * If the owner task is between FUTEX_STATE_EXITING and
                 * FUTEX_STATE_DEAD then store the task pointer and keep
                 * the reference on the task struct. The calling code will
                 * drop all locks, wait for the task to reach
                 * FUTEX_STATE_DEAD and then drop the refcount. This is
                 * required to prevent a live lock when the current task
                 * preempted the exiting task between the two states.
                 */
                if (ret == -EBUSY)
                        *exiting = p;
                else
                        put_task_struct(p);
                return ret;
        }

        __attach_to_pi_owner(p, key, ps);
        raw_spin_unlock_irq(&p->pi_lock);

        put_task_struct(p);

        return 0;
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
        int err;
        u32 curval;

        if (unlikely(should_fail_futex(true)))
                return -EFAULT;

        err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
        if (unlikely(err))
                return err;

        /* If user space value changed, let the caller retry */
        return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for. This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                         union futex_key *key,
                         struct futex_pi_state **ps,
                         struct task_struct *task,
                         struct task_struct **exiting,
                         int set_waiters)
{
        u32 uval, newval, vpid = task_pid_vnr(task);
        struct futex_q *top_waiter;
        int ret;

        /*
         * Read the user space value first so we can validate a few
         * things before proceeding further.
         */
        if (futex_get_value_locked(&uval, uaddr))
                return -EFAULT;

        if (unlikely(should_fail_futex(true)))
                return -EFAULT;

        /*
         * Detect deadlocks.
         */
        if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
                return -EDEADLK;

        if ((unlikely(should_fail_futex(true))))
                return -EDEADLK;

        /*
         * Lookup existing state first. If it exists, try to attach to
         * its pi_state.
         */
        top_waiter = futex_top_waiter(hb, key);
        if (top_waiter)
                return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

        /*
         * No waiter and user TID is 0. We are here because the waiters bit
         * or the owner died bit is set, we were called from requeue_cmp_pi,
         * or something else took us into the syscall.
         */
        if (!(uval & FUTEX_TID_MASK)) {
                /*
                 * We take over the futex. No other waiters and the user space
                 * TID is 0. We preserve the owner died bit.
                 */
                newval = uval & FUTEX_OWNER_DIED;
                newval |= vpid;

                /* The futex requeue_pi code can enforce the waiters bit */
                if (set_waiters)
                        newval |= FUTEX_WAITERS;

                ret = lock_pi_update_atomic(uaddr, uval, newval);
                if (ret)
                        return ret;

                /*
                 * If the waiter bit was requested the caller also needs PI
                 * state attached to the new owner of the user space futex.
                 *
                 * @task is guaranteed to be alive and it cannot be exiting
                 * because it is either sleeping or waiting in
                 * futex_requeue_pi_wakeup_sync().
                 *
                 * No need to do the full attach_to_pi_owner() exercise
                 * because @task is known and valid.
                 */
                if (set_waiters) {
                        raw_spin_lock_irq(&task->pi_lock);
                        __attach_to_pi_owner(task, key, ps);
                        raw_spin_unlock_irq(&task->pi_lock);
                }
                return 1;
        }

        /*
         * First waiter. Set the waiters bit before attaching ourself to
         * the owner. If owner tries to unlock, it will be forced into
         * the kernel and blocked on hb->lock.
         */
        newval = uval | FUTEX_WAITERS;
        ret = lock_pi_update_atomic(uaddr, uval, newval);
        if (ret)
                return ret;
        /*
         * If the update of the user space value succeeded, we try to
         * attach to the owner. If that fails, no harm done, we only
         * set the FUTEX_WAITERS bit in the user space variable.
         */
        return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
        struct rt_mutex_waiter *top_waiter;
        struct task_struct *new_owner;
        bool postunlock = false;
        DEFINE_RT_WAKE_Q(wqh);
        u32 curval, newval;
        int ret = 0;

        top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
        if (WARN_ON_ONCE(!top_waiter)) {
                /*
                 * As per the comment in futex_unlock_pi() this should not happen.
                 *
                 * When this happens, give up our locks and try again, giving
                 * the futex_lock_pi() instance time to complete, either by
                 * waiting on the rtmutex or removing itself from the futex
                 * queue.
                 */
                ret = -EAGAIN;
                goto out_unlock;
        }

        new_owner = top_waiter->task;

        /*
         * We pass it to the next owner. The WAITERS bit is always kept
         * enabled while there is PI state around. We cleanup the owner
         * died bit, because we are the owner.
         */
        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

        if (unlikely(should_fail_futex(true))) {
                ret = -EFAULT;
                goto out_unlock;
        }

        ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
        if (!ret && (curval != uval)) {
                /*
                 * If an unconditional UNLOCK_PI operation (user space did not
                 * try the TID->0 transition) raced with a waiter setting the
                 * FUTEX_WAITERS flag between get_user() and locking the hash
                 * bucket lock, retry the operation.
                 */
                if ((FUTEX_TID_MASK & curval) == uval)
                        ret = -EAGAIN;
                else
                        ret = -EINVAL;
        }

        if (!ret) {
                /*
                 * This is a point of no return; once we modified the uval
                 * there is no going back and subsequent operations must
                 * not fail.
                 */
                pi_state_update_owner(pi_state, new_owner);
                postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
        }

out_unlock:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

        if (postunlock)
                rt_mutex_postunlock(&wqh);

        return ret;
}

static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
                                  struct task_struct *argowner)
{
        struct futex_pi_state *pi_state = q->pi_state;
        struct task_struct *oldowner, *newowner;
        u32 uval, curval, newval, newtid;
        int err = 0;

        oldowner = pi_state->owner;

        /*
         * We are here because either:
         *
         *  - we stole the lock and pi_state->owner needs updating to reflect
         *    that (@argowner == current),
         *
         * or:
         *
         *  - someone stole our lock and we need to fix things to point to the
         *    new owner (@argowner == NULL).
         *
         * Either way, we have to replace the TID in the user space variable.
         * This must be atomic as we have to preserve the owner died bit here.
         *
         * Note: We write the user space value _before_ changing the pi_state
         * because we can fault here. Imagine swapped out pages or a fork
         * that marked all the anonymous memory readonly for cow.
         *
         * Modifying pi_state _before_ the user space value would leave the
         * pi_state in an inconsistent state when we fault here, because we
         * need to drop the locks to handle the fault. This might be observed
         * in the PID checks when attaching to PI state.
         */
retry:
        if (!argowner) {
                if (oldowner != current) {
                        /*
                         * We raced against a concurrent self; things are
                         * already fixed up. Nothing to do.
                         */
                        return 0;
                }

                if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
                        /* We got the lock. pi_state is correct. Tell caller. */
                        return 1;
                }

                /*
                 * The trylock just failed, so either there is an owner or
                 * there is a higher priority waiter than this one.
                 */
                newowner = rt_mutex_owner(&pi_state->pi_mutex);
                /*
                 * If the higher priority waiter has not yet taken over the
                 * rtmutex then newowner is NULL. We can't return here with
                 * that state because it's inconsistent vs. the user space
                 * state. So drop the locks and try again. It's a valid
                 * situation and not any different from the other retry
                 * conditions.
                 */
                if (unlikely(!newowner)) {
                        err = -EAGAIN;
                        goto handle_err;
                }
        } else {
                WARN_ON_ONCE(argowner != current);
                if (oldowner == current) {
                        /*
                         * We raced against a concurrent self; things are
                         * already fixed up. Nothing to do.
                         */
                        return 1;
                }
                newowner = argowner;
        }

        newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        /* Owner died? */
        if (!pi_state->owner)
                newtid |= FUTEX_OWNER_DIED;

        err = futex_get_value_locked(&uval, uaddr);
        if (err)
                goto handle_err;

        for (;;) {
                newval = (uval & FUTEX_OWNER_DIED) | newtid;

                err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
                if (err)
                        goto handle_err;

                if (curval == uval)
                        break;
                uval = curval;
        }

        /*
         * We fixed up user space. Now we need to fix the pi_state
         * itself.
         */
        pi_state_update_owner(pi_state, newowner);

        return argowner == current;

        /*
         * In order to reschedule or handle a page fault, we need to drop the
         * locks here. In the case of a fault, this gives the other task
         * (either the highest priority waiter itself or the task which stole
         * the rtmutex) the chance to try the fixup of the pi_state. So once we
         * are back from handling the fault we need to check the pi_state after
         * reacquiring the locks and before trying to do another fixup. When
         * the fixup has been done already we simply return.
         *
         * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
         * drop hb->lock since the caller owns the hb -> futex_q relation.
         * Dropping the pi_mutex->wait_lock requires the state revalidate.
         */
handle_err:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(q->lock_ptr);

        switch (err) {
        case -EFAULT:
                err = fault_in_user_writeable(uaddr);
                break;

        case -EAGAIN:
                cond_resched();
                err = 0;
                break;

        default:
                WARN_ON_ONCE(1);
                break;
        }

        spin_lock(q->lock_ptr);
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

        /*
         * Check if someone else fixed it for us:
         */
        if (pi_state->owner != oldowner)
                return argowner == current;

        /* Retry if err was -EAGAIN or the fault-in succeeded. */
        if (!err)
                goto retry;

        /*
         * fault_in_user_writeable() failed so user state is immutable. At
         * best we can make the kernel state consistent but user state will
         * be most likely hosed and any subsequent unlock operation will be
         * rejected due to PI futex rule [10].
         *
         * Ensure that the rtmutex owner is also the pi_state owner despite
         * the user space value claiming something different. There is no
         * point in unlocking the rtmutex if current is the owner as it
         * would need to wait until the next waiter has taken the rtmutex
         * to guarantee consistent state. Keep it simple. Userspace asked
         * for this wrecked state.
         *
         * The rtmutex has an owner - either current or some other
         * task. See the EAGAIN loop above.
         */
        pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

        return err;
}

static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
                                struct task_struct *argowner)
{
        struct futex_pi_state *pi_state = q->pi_state;
        int ret;

        lockdep_assert_held(q->lock_ptr);

        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
        ret = __fixup_pi_state_owner(uaddr, q, argowner);
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
        if (locked) {
                /*
                 * Got the lock. We might not be the anticipated owner if we
                 * did a lock-steal - fix up the PI-state in that case:
                 *
                 * Speculative pi_state->owner read (we don't hold wait_lock);
                 * since we own the lock pi_state->owner == current is the
                 * stable state, anything else needs more attention.
                 */
                if (q->pi_state->owner != current)
                        return fixup_pi_state_owner(uaddr, q, current);
                return 1;
        }

        /*
         * If we didn't get the lock; check if anybody stole it from us. In
         * that case, we need to fix up the uval to point to them instead of
         * us, otherwise bad things happen. [10]
         *
         * Another speculative read; pi_state->owner == current is unstable
         * but needs our attention.
         */
        if (q->pi_state->owner == current)
                return fixup_pi_state_owner(uaddr, q, NULL);

        /*
         * Paranoia check. If we did not take the lock, then we should not be
         * the owner of the rt_mutex. Warn and establish consistent state.
         */
        if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
                return fixup_pi_state_owner(uaddr, q, current);

        return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes; it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as the futex trylock_pi() operation, with the
 * corresponding semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
        struct hrtimer_sleeper timeout, *to;
        struct task_struct *exiting = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct futex_hash_bucket *hb;
        struct futex_q q = futex_q_init;
        int res, ret;

        if (!IS_ENABLED(CONFIG_FUTEX_PI))
                return -ENOSYS;

        if (refill_pi_state_cache())
                return -ENOMEM;

        to = futex_setup_timer(time, &timeout, flags, 0);

retry:
        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
        if (unlikely(ret != 0))
                goto out;

retry_private:
        hb = futex_q_lock(&q);

        ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
                                   &exiting, 0);
        if (unlikely(ret)) {
                /*
                 * Atomic work succeeded and we got the lock,
                 * or failed. Either way, we do _not_ block.
                 */
                switch (ret) {
                case 1:
                        /* We got the lock. */
                        ret = 0;
                        goto out_unlock_put_key;
                case -EFAULT:
                        goto uaddr_faulted;
                case -EBUSY:
                case -EAGAIN:
                        /*
                         * Two reasons for this:
                         * - EBUSY: Task is exiting and we just wait for the
                         *   exit to complete.
                         * - EAGAIN: The user space value changed.
                         */
                        futex_q_unlock(hb);
                        /*
                         * Handle the case where the owner is in the middle of
                         * exiting. Wait for the exit to complete otherwise
                         * this task might loop forever, aka. live lock.
                         */
                        wait_for_owner_exiting(ret, exiting);
                        cond_resched();
                        goto retry;
                default:
                        goto out_unlock_put_key;
                }
        }

        WARN_ON(!q.pi_state);

        /*
         * Only actually queue now that the atomic ops are done:
         */
        __futex_queue(&q, hb);

        if (trylock) {
                ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
                /* Fixup the trylock return value: */
                ret = ret ? 0 : -EWOULDBLOCK;
                goto no_block;
        }

        rt_mutex_init_waiter(&rt_waiter);

        /*
         * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
         * hold it while doing __rt_mutex_start_proxy_lock(), because then
         * it will include hb->lock in the blocking chain, even though we'll
         * not in fact hold it while blocking. This will lead it to report
         * -EDEADLK and BUG when futex_unlock_pi() interleaves with this.
         *
         * Therefore acquire wait_lock while holding hb->lock, but drop the
         * latter before calling __rt_mutex_start_proxy_lock(). This
         * interleaves with futex_unlock_pi() -- which does a similar lock
         * handoff -- such that the latter can observe the futex_q::pi_state
         * before __rt_mutex_start_proxy_lock() is done.
         */
        raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
        spin_unlock(q.lock_ptr);
        /*
         * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
         * such that futex_unlock_pi() is guaranteed to observe the waiter when
         * it sees the futex_q::pi_state.
         */
        ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
        raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

        if (ret) {
                if (ret == 1)
                        ret = 0;
                goto cleanup;
        }

        if (unlikely(to))
                hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

        ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
        spin_lock(q.lock_ptr);
        /*
         * If we failed to acquire the lock (deadlock/signal/timeout), we must
         * first acquire the hb->lock before removing the lock from the
         * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
         * lists consistent.
         *
         * In particular; it is important that futex_unlock_pi() cannot
         * observe this inconsistency.
         */
        if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
                ret = 0;

no_block:
        /*
         * Fixup the pi_state owner and possibly acquire the lock if we
         * haven't already.
         */
        res = fixup_pi_owner(uaddr, &q, !ret);
        /*
         * If fixup_pi_owner() returned an error, propagate that. If it acquired
         * the lock, clear our -ETIMEDOUT or -EINTR.
         */
        if (res)
                ret = (res < 0) ? res : 0;

        futex_unqueue_pi(&q);
        spin_unlock(q.lock_ptr);
        goto out;

out_unlock_put_key:
        futex_q_unlock(hb);

out:
        if (to) {
                hrtimer_cancel(&to->timer);
                destroy_hrtimer_on_stack(&to->timer);
        }
        return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
        futex_q_unlock(hb);

        ret = fault_in_user_writeable(uaddr);
        if (ret)
                goto out;

        if (!(flags & FLAGS_SHARED))
                goto retry_private;

        goto retry;
}
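
/*
 * Editor's note: userspace rarely calls FUTEX_LOCK_PI directly; on
 * Linux/glibc the usual entry point into futex_lock_pi() above is a
 * pthread mutex configured with PTHREAD_PRIO_INHERIT. A short sketch
 * (pi_mutex_init() is an illustrative helper name, not a library API):
 */
#include <pthread.h>

static int pi_mutex_init(pthread_mutex_t *m)
{
        pthread_mutexattr_t attr;
        int ret;

        ret = pthread_mutexattr_init(&attr);
        if (ret)
                return ret;
        /* Priority inheritance: contended lock/unlock end up in the
         * FUTEX_LOCK_PI/FUTEX_UNLOCK_PI slowpaths in this file. */
        ret = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
        if (!ret)
                ret = pthread_mutex_init(m, &attr);
        pthread_mutexattr_destroy(&attr);
        return ret;
}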

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
        u32 curval, uval, vpid = task_pid_vnr(current);
        union futex_key key = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb;
        struct futex_q *top_waiter;
        int ret;

        if (!IS_ENABLED(CONFIG_FUTEX_PI))
                return -ENOSYS;

retry:
        if (get_user(uval, uaddr))
                return -EFAULT;
        /*
         * We release only a lock we actually own:
         */
        if ((uval & FUTEX_TID_MASK) != vpid)
                return -EPERM;

        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
        if (ret)
                return ret;

        hb = futex_hash(&key);
        spin_lock(&hb->lock);

        /*
         * Check waiters first. We do not trust user space values at
         * all and we at least want to know if user space fiddled
         * with the futex value instead of blindly unlocking.
         */
        top_waiter = futex_top_waiter(hb, &key);
        if (top_waiter) {
                struct futex_pi_state *pi_state = top_waiter->pi_state;

                ret = -EINVAL;
                if (!pi_state)
                        goto out_unlock;

                /*
                 * If current does not own the pi_state then the futex is
                 * inconsistent and user space fiddled with the futex value.
                 */
                if (pi_state->owner != current)
                        goto out_unlock;

                get_pi_state(pi_state);
                /*
                 * By taking wait_lock while still holding hb->lock, we ensure
                 * there is no point where we hold neither; and therefore
                 * wake_futex_pi() must observe a state consistent with what we
                 * observed.
                 *
                 * In particular; this forces __rt_mutex_start_proxy_lock() to
                 * complete such that we're guaranteed to observe the
                 * rt_waiter. Also see the WARN in wake_futex_pi().
                 */
                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
                spin_unlock(&hb->lock);

                /* drops pi_state->pi_mutex.wait_lock */
                ret = wake_futex_pi(uaddr, uval, pi_state);

                put_pi_state(pi_state);

                /*
                 * Success, we're done! No tricky corner cases.
                 */
                if (!ret)
                        return ret;
                /*
                 * The atomic access to the futex value generated a
                 * pagefault, so retry the user-access and the wakeup:
                 */
                if (ret == -EFAULT)
                        goto pi_faulted;
                /*
                 * An unconditional UNLOCK_PI op raced against a waiter
                 * setting the FUTEX_WAITERS bit. Try again.
                 */
                if (ret == -EAGAIN)
                        goto pi_retry;
                /*
                 * wake_futex_pi() has detected invalid state. Tell user
                 * space.
                 */
                return ret;
        }

        /*
         * We have no kernel internal state, i.e. no waiters in the
         * kernel. Waiters which are about to queue themselves are stuck
         * on hb->lock. So we can safely ignore them. We preserve neither
         * the WAITERS bit nor the OWNER_DIED one. We are the owner.
         */
        if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
                spin_unlock(&hb->lock);
                switch (ret) {
                case -EFAULT:
                        goto pi_faulted;

                case -EAGAIN:
                        goto pi_retry;

                default:
                        WARN_ON_ONCE(1);
                        return ret;
                }
        }

        /*
         * If uval has changed, let user space handle it.
         */
        ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
        spin_unlock(&hb->lock);
        return ret;

pi_retry:
        cond_resched();
        goto retry;

pi_faulted:
        ret = fault_in_user_writeable(uaddr);
        if (!ret)
                goto retry;

        return ret;
}
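
/*
 * Editor's note: the FUTEX_OWNER_DIED states [4]-[6] handled above are
 * what userspace observes as EOWNERDEAD on a robust PI mutex. A short
 * recovery sketch, assuming the mutex was initialized with both
 * PTHREAD_PRIO_INHERIT and PTHREAD_MUTEX_ROBUST (lock_and_recover()
 * is an illustrative helper name, not a library API):
 */
#include <errno.h>
#include <pthread.h>

static int lock_and_recover(pthread_mutex_t *m)
{
        int ret = pthread_mutex_lock(m);

        if (ret == EOWNERDEAD) {
                /* We hold the lock, but the previous owner died with it;
                 * repair the protected data, then mark the mutex usable. */
                ret = pthread_mutex_consistent(m);
        }
        return ret;
}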