LCOV - code coverage report
Current view: top level - io_uring - io_uring.c (source / functions)
Test: coverage.info
Date: 2023-04-06 08:38:28
                 Hit    Total    Coverage
Lines:             4     1902       0.2 %
Functions:         1      133       0.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Shared application/kernel submission and completion ring pairs, for
       4             :  * supporting fast/efficient IO.
       5             :  *
       6             :  * A note on the read/write ordering memory barriers that are matched between
       7             :  * the application and kernel side.
       8             :  *
       9             :  * After the application reads the CQ ring tail, it must use an
      10             :  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
      11             :  * before writing the tail (using smp_load_acquire to read the tail will
       12             :  * do). It also needs an smp_mb() before updating CQ head (ordering the
      13             :  * entry load(s) with the head store), pairing with an implicit barrier
      14             :  * through a control-dependency in io_get_cqe (smp_store_release to
      15             :  * store head will do). Failure to do so could lead to reading invalid
      16             :  * CQ entries.
      17             :  *
      18             :  * Likewise, the application must use an appropriate smp_wmb() before
      19             :  * writing the SQ tail (ordering SQ entry stores with the tail store),
      20             :  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
      21             :  * to store the tail will do). And it needs a barrier ordering the SQ
      22             :  * head load before writing new SQ entries (smp_load_acquire to read
      23             :  * head will do).
      24             :  *
      25             :  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
      26             :  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
      27             :  * updating the SQ tail; a full memory barrier smp_mb() is needed
       28             :  * in between.
      29             :  *
      30             :  * Also see the examples in the liburing library:
      31             :  *
      32             :  *      git://git.kernel.dk/liburing
      33             :  *
       34             :  * io_uring also uses READ/WRITE_ONCE() for _any_ load from or store to data
       35             :  * shared between the kernel and the application. This is done both for
       36             :  * ordering purposes and to ensure that once a value is loaded from data
       37             :  * that the application could potentially modify, it remains stable.
      38             :  *
      39             :  * Copyright (C) 2018-2019 Jens Axboe
      40             :  * Copyright (c) 2018-2019 Christoph Hellwig
      41             :  */
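                      : /*
                      :  * Illustrative sketch of the userspace side of the CQ pairing described
                      :  * above. It is not part of this file: cq_head, cq_tail, cq_mask, cqes[]
                      :  * and handle_cqe() are placeholder names, and liburing wraps the same
                      :  * pattern in helpers such as io_uring_for_each_cqe() and
                      :  * io_uring_cq_advance().
                      :  *
                      :  *      unsigned head = *cq_head;
                      :  *      unsigned tail = smp_load_acquire(cq_tail);
                      :  *
                      :  *      while (head != tail) {
                      :  *              struct io_uring_cqe *cqe = &cqes[head & cq_mask];
                      :  *
                      :  *              handle_cqe(cqe);
                      :  *              head++;
                      :  *      }
                      :  *      smp_store_release(cq_head, head);
                      :  *
                      :  * The acquire on the tail orders the CQE loads after the kernel's tail
                      :  * store; the release on the head orders those loads before the head
                      :  * store that the kernel observes.
                      :  */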
      42             : #include <linux/kernel.h>
      43             : #include <linux/init.h>
      44             : #include <linux/errno.h>
      45             : #include <linux/syscalls.h>
      46             : #include <net/compat.h>
      47             : #include <linux/refcount.h>
      48             : #include <linux/uio.h>
      49             : #include <linux/bits.h>
      50             : 
      51             : #include <linux/sched/signal.h>
      52             : #include <linux/fs.h>
      53             : #include <linux/file.h>
      54             : #include <linux/fdtable.h>
      55             : #include <linux/mm.h>
      56             : #include <linux/mman.h>
      57             : #include <linux/percpu.h>
      58             : #include <linux/slab.h>
      59             : #include <linux/bvec.h>
      60             : #include <linux/net.h>
      61             : #include <net/sock.h>
      62             : #include <net/af_unix.h>
      63             : #include <net/scm.h>
      64             : #include <linux/anon_inodes.h>
      65             : #include <linux/sched/mm.h>
      66             : #include <linux/uaccess.h>
      67             : #include <linux/nospec.h>
      68             : #include <linux/highmem.h>
      69             : #include <linux/fsnotify.h>
      70             : #include <linux/fadvise.h>
      71             : #include <linux/task_work.h>
      72             : #include <linux/io_uring.h>
      73             : #include <linux/audit.h>
      74             : #include <linux/security.h>
      75             : 
      76             : #define CREATE_TRACE_POINTS
      77             : #include <trace/events/io_uring.h>
      78             : 
      79             : #include <uapi/linux/io_uring.h>
      80             : 
      81             : #include "io-wq.h"
      82             : 
      83             : #include "io_uring.h"
      84             : #include "opdef.h"
      85             : #include "refs.h"
      86             : #include "tctx.h"
      87             : #include "sqpoll.h"
      88             : #include "fdinfo.h"
      89             : #include "kbuf.h"
      90             : #include "rsrc.h"
      91             : #include "cancel.h"
      92             : #include "net.h"
      93             : #include "notif.h"
      94             : 
      95             : #include "timeout.h"
      96             : #include "poll.h"
      97             : #include "alloc_cache.h"
      98             : 
      99             : #define IORING_MAX_ENTRIES      32768
     100             : #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
     101             : 
     102             : #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
     103             :                                  IORING_REGISTER_LAST + IORING_OP_LAST)
     104             : 
     105             : #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
     106             :                           IOSQE_IO_HARDLINK | IOSQE_ASYNC)
     107             : 
     108             : #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
     109             :                         IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
     110             : 
     111             : #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
     112             :                                 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
     113             :                                 REQ_F_ASYNC_DATA)
     114             : 
     115             : #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
     116             :                                  IO_REQ_CLEAN_FLAGS)
     117             : 
     118             : #define IO_TCTX_REFS_CACHE_NR   (1U << 10)
     119             : 
     120             : #define IO_COMPL_BATCH                  32
     121             : #define IO_REQ_ALLOC_BATCH              8
     122             : 
     123             : enum {
     124             :         IO_CHECK_CQ_OVERFLOW_BIT,
     125             :         IO_CHECK_CQ_DROPPED_BIT,
     126             : };
     127             : 
     128             : enum {
     129             :         IO_EVENTFD_OP_SIGNAL_BIT,
     130             :         IO_EVENTFD_OP_FREE_BIT,
     131             : };
     132             : 
     133             : struct io_defer_entry {
     134             :         struct list_head        list;
     135             :         struct io_kiocb         *req;
     136             :         u32                     seq;
     137             : };
     138             : 
      139             : /* requests with any of these flags set should undergo io_disarm_next() */
     140             : #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
     141             : #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
     142             : 
     143             : static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
     144             :                                          struct task_struct *task,
     145             :                                          bool cancel_all);
     146             : 
     147             : static void io_dismantle_req(struct io_kiocb *req);
     148             : static void io_clean_op(struct io_kiocb *req);
     149             : static void io_queue_sqe(struct io_kiocb *req);
     150             : static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
     151             : static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
     152             : static __cold void io_fallback_tw(struct io_uring_task *tctx);
     153             : 
     154             : struct kmem_cache *req_cachep;
     155             : 
     156           0 : struct sock *io_uring_get_socket(struct file *file)
     157             : {
     158             : #if defined(CONFIG_UNIX)
     159             :         if (io_is_uring_fops(file)) {
     160             :                 struct io_ring_ctx *ctx = file->private_data;
     161             : 
     162             :                 return ctx->ring_sock->sk;
     163             :         }
     164             : #endif
     165           0 :         return NULL;
     166             : }
     167             : EXPORT_SYMBOL(io_uring_get_socket);
     168             : 
     169             : static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
     170             : {
     171           0 :         if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
     172           0 :             ctx->submit_state.cqes_count)
     173           0 :                 __io_submit_flush_completions(ctx);
     174             : }
     175             : 
     176             : static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
     177             : {
     178           0 :         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
     179             : }
     180             : 
     181             : static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
     182             : {
     183           0 :         return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
     184             : }
     185             : 
     186             : static bool io_match_linked(struct io_kiocb *head)
     187             : {
     188             :         struct io_kiocb *req;
     189             : 
     190           0 :         io_for_each_link(req, head) {
     191           0 :                 if (req->flags & REQ_F_INFLIGHT)
     192             :                         return true;
     193             :         }
     194             :         return false;
     195             : }
     196             : 
     197             : /*
     198             :  * As io_match_task() but protected against racing with linked timeouts.
     199             :  * User must not hold timeout_lock.
     200             :  */
     201           0 : bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
     202             :                         bool cancel_all)
     203             : {
     204             :         bool matched;
     205             : 
     206           0 :         if (task && head->task != task)
     207             :                 return false;
     208           0 :         if (cancel_all)
     209             :                 return true;
     210             : 
     211           0 :         if (head->flags & REQ_F_LINK_TIMEOUT) {
     212           0 :                 struct io_ring_ctx *ctx = head->ctx;
     213             : 
     214             :                 /* protect against races with linked timeouts */
     215           0 :                 spin_lock_irq(&ctx->timeout_lock);
     216           0 :                 matched = io_match_linked(head);
     217           0 :                 spin_unlock_irq(&ctx->timeout_lock);
     218             :         } else {
     219             :                 matched = io_match_linked(head);
     220             :         }
     221             :         return matched;
     222             : }
     223             : 
     224             : static inline void req_fail_link_node(struct io_kiocb *req, int res)
     225             : {
     226           0 :         req_set_fail(req);
     227           0 :         io_req_set_res(req, res, 0);
     228             : }
     229             : 
     230             : static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
     231             : {
     232           0 :         wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
     233           0 :         kasan_poison_object_data(req_cachep, req);
     234             : }
     235             : 
     236           0 : static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
     237             : {
     238           0 :         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
     239             : 
     240           0 :         complete(&ctx->ref_comp);
     241           0 : }
     242             : 
     243           0 : static __cold void io_fallback_req_func(struct work_struct *work)
     244             : {
     245           0 :         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
     246             :                                                 fallback_work.work);
     247           0 :         struct llist_node *node = llist_del_all(&ctx->fallback_llist);
     248             :         struct io_kiocb *req, *tmp;
     249           0 :         bool locked = true;
     250             : 
     251           0 :         mutex_lock(&ctx->uring_lock);
     252           0 :         llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
     253           0 :                 req->io_task_work.func(req, &locked);
     254           0 :         if (WARN_ON_ONCE(!locked))
     255           0 :                 return;
     256           0 :         io_submit_flush_completions(ctx);
     257           0 :         mutex_unlock(&ctx->uring_lock);
     258             : }
     259             : 
     260           0 : static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
     261             : {
     262           0 :         unsigned hash_buckets = 1U << bits;
     263           0 :         size_t hash_size = hash_buckets * sizeof(table->hbs[0]);
     264             : 
     265           0 :         table->hbs = kmalloc(hash_size, GFP_KERNEL);
     266           0 :         if (!table->hbs)
     267             :                 return -ENOMEM;
     268             : 
     269           0 :         table->hash_bits = bits;
     270           0 :         init_hash_table(table, hash_buckets);
     271           0 :         return 0;
     272             : }
     273             : 
     274           0 : static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
     275             : {
     276             :         struct io_ring_ctx *ctx;
     277             :         int hash_bits;
     278             : 
     279           0 :         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
     280           0 :         if (!ctx)
     281             :                 return NULL;
     282             : 
     283           0 :         xa_init(&ctx->io_bl_xa);
     284             : 
     285             :         /*
      286             :          * Use 5 bits less than the max cq entries; that should give us around
      287             :          * 32 entries per hash list if totally full and uniformly spread, but
      288             :          * don't keep too many buckets so we don't overconsume memory.
     289             :          */
     290           0 :         hash_bits = ilog2(p->cq_entries) - 5;
     291           0 :         hash_bits = clamp(hash_bits, 1, 8);
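                      :         /*
                      :          * Worked example (illustrative): p->cq_entries == 4096 gives
                      :          * ilog2() == 12, so hash_bits ends up as 7 and each cancel table
                      :          * gets 1 << 7 == 128 buckets, i.e. roughly 4096 / 128 == 32
                      :          * entries per bucket when completely full.
                      :          */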
     292           0 :         if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
     293             :                 goto err;
     294           0 :         if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
     295             :                 goto err;
     296             : 
     297           0 :         ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
     298           0 :         if (!ctx->dummy_ubuf)
     299             :                 goto err;
      300             :         /* set an invalid range so that io_import_fixed() fails when it hits it */
     301           0 :         ctx->dummy_ubuf->ubuf = -1UL;
     302             : 
     303           0 :         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
     304             :                             0, GFP_KERNEL))
     305             :                 goto err;
     306             : 
     307           0 :         ctx->flags = p->flags;
     308           0 :         init_waitqueue_head(&ctx->sqo_sq_wait);
     309           0 :         INIT_LIST_HEAD(&ctx->sqd_list);
     310           0 :         INIT_LIST_HEAD(&ctx->cq_overflow_list);
     311           0 :         INIT_LIST_HEAD(&ctx->io_buffers_cache);
     312           0 :         io_alloc_cache_init(&ctx->apoll_cache);
     313           0 :         io_alloc_cache_init(&ctx->netmsg_cache);
     314           0 :         init_completion(&ctx->ref_comp);
     315           0 :         xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
     316           0 :         mutex_init(&ctx->uring_lock);
     317           0 :         init_waitqueue_head(&ctx->cq_wait);
     318           0 :         init_waitqueue_head(&ctx->poll_wq);
     319           0 :         spin_lock_init(&ctx->completion_lock);
     320           0 :         spin_lock_init(&ctx->timeout_lock);
     321           0 :         INIT_WQ_LIST(&ctx->iopoll_list);
     322           0 :         INIT_LIST_HEAD(&ctx->io_buffers_pages);
     323           0 :         INIT_LIST_HEAD(&ctx->io_buffers_comp);
     324           0 :         INIT_LIST_HEAD(&ctx->defer_list);
     325           0 :         INIT_LIST_HEAD(&ctx->timeout_list);
     326           0 :         INIT_LIST_HEAD(&ctx->ltimeout_list);
     327           0 :         spin_lock_init(&ctx->rsrc_ref_lock);
     328           0 :         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
     329           0 :         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
     330           0 :         init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw);
     331           0 :         init_llist_head(&ctx->rsrc_put_llist);
     332           0 :         init_llist_head(&ctx->work_llist);
     333           0 :         INIT_LIST_HEAD(&ctx->tctx_list);
     334           0 :         ctx->submit_state.free_list.next = NULL;
     335           0 :         INIT_WQ_LIST(&ctx->locked_free_list);
     336           0 :         INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
     337           0 :         INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
     338             :         return ctx;
     339             : err:
     340           0 :         kfree(ctx->dummy_ubuf);
     341           0 :         kfree(ctx->cancel_table.hbs);
     342           0 :         kfree(ctx->cancel_table_locked.hbs);
     343           0 :         kfree(ctx->io_bl);
     344           0 :         xa_destroy(&ctx->io_bl_xa);
     345           0 :         kfree(ctx);
     346             :         return NULL;
     347             : }
     348             : 
     349             : static void io_account_cq_overflow(struct io_ring_ctx *ctx)
     350             : {
     351           0 :         struct io_rings *r = ctx->rings;
     352             : 
     353           0 :         WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
     354           0 :         ctx->cq_extra--;
     355             : }
     356             : 
     357             : static bool req_need_defer(struct io_kiocb *req, u32 seq)
     358             : {
     359           0 :         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
     360           0 :                 struct io_ring_ctx *ctx = req->ctx;
     361             : 
     362           0 :                 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
     363             :         }
     364             : 
     365             :         return false;
     366             : }
     367             : 
     368             : static inline void io_req_track_inflight(struct io_kiocb *req)
     369             : {
     370           0 :         if (!(req->flags & REQ_F_INFLIGHT)) {
     371           0 :                 req->flags |= REQ_F_INFLIGHT;
     372           0 :                 atomic_inc(&req->task->io_uring->inflight_tracked);
     373             :         }
     374             : }
     375             : 
     376           0 : static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
     377             : {
     378           0 :         if (WARN_ON_ONCE(!req->link))
     379             :                 return NULL;
     380             : 
     381           0 :         req->flags &= ~REQ_F_ARM_LTIMEOUT;
     382           0 :         req->flags |= REQ_F_LINK_TIMEOUT;
     383             : 
     384             :         /* linked timeouts should have two refs once prep'ed */
     385           0 :         io_req_set_refcount(req);
     386           0 :         __io_req_set_refcount(req->link, 2);
     387           0 :         return req->link;
     388             : }
     389             : 
     390             : static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
     391             : {
     392           0 :         if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
     393             :                 return NULL;
     394           0 :         return __io_prep_linked_timeout(req);
     395             : }
     396             : 
     397           0 : static noinline void __io_arm_ltimeout(struct io_kiocb *req)
     398             : {
     399           0 :         io_queue_linked_timeout(__io_prep_linked_timeout(req));
     400           0 : }
     401             : 
     402             : static inline void io_arm_ltimeout(struct io_kiocb *req)
     403             : {
     404           0 :         if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
     405           0 :                 __io_arm_ltimeout(req);
     406             : }
     407             : 
     408           0 : static void io_prep_async_work(struct io_kiocb *req)
     409             : {
     410           0 :         const struct io_issue_def *def = &io_issue_defs[req->opcode];
     411           0 :         struct io_ring_ctx *ctx = req->ctx;
     412             : 
     413           0 :         if (!(req->flags & REQ_F_CREDS)) {
     414           0 :                 req->flags |= REQ_F_CREDS;
     415           0 :                 req->creds = get_current_cred();
     416             :         }
     417             : 
     418           0 :         req->work.list.next = NULL;
     419           0 :         req->work.flags = 0;
     420           0 :         req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
     421           0 :         if (req->flags & REQ_F_FORCE_ASYNC)
     422           0 :                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
     423             : 
     424           0 :         if (req->file && !io_req_ffs_set(req))
     425           0 :                 req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT;
     426             : 
     427           0 :         if (req->flags & REQ_F_ISREG) {
     428           0 :                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
     429           0 :                         io_wq_hash_work(&req->work, file_inode(req->file));
     430           0 :         } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
     431           0 :                 if (def->unbound_nonreg_file)
     432           0 :                         req->work.flags |= IO_WQ_WORK_UNBOUND;
     433             :         }
     434           0 : }
     435             : 
     436           0 : static void io_prep_async_link(struct io_kiocb *req)
     437             : {
     438             :         struct io_kiocb *cur;
     439             : 
     440           0 :         if (req->flags & REQ_F_LINK_TIMEOUT) {
     441           0 :                 struct io_ring_ctx *ctx = req->ctx;
     442             : 
     443           0 :                 spin_lock_irq(&ctx->timeout_lock);
     444           0 :                 io_for_each_link(cur, req)
     445           0 :                         io_prep_async_work(cur);
     446           0 :                 spin_unlock_irq(&ctx->timeout_lock);
     447             :         } else {
     448           0 :                 io_for_each_link(cur, req)
     449           0 :                         io_prep_async_work(cur);
     450             :         }
     451           0 : }
     452             : 
     453           0 : void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
     454             : {
     455           0 :         struct io_kiocb *link = io_prep_linked_timeout(req);
     456           0 :         struct io_uring_task *tctx = req->task->io_uring;
     457             : 
     458           0 :         BUG_ON(!tctx);
     459           0 :         BUG_ON(!tctx->io_wq);
     460             : 
     461             :         /* init ->work of the whole link before punting */
     462           0 :         io_prep_async_link(req);
     463             : 
     464             :         /*
     465             :          * Not expected to happen, but if we do have a bug where this _can_
     466             :          * happen, catch it here and ensure the request is marked as
     467             :          * canceled. That will make io-wq go through the usual work cancel
     468             :          * procedure rather than attempt to run this request (or create a new
     469             :          * worker for it).
     470             :          */
     471           0 :         if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
     472           0 :                 req->work.flags |= IO_WQ_WORK_CANCEL;
     473             : 
     474           0 :         trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
     475           0 :         io_wq_enqueue(tctx->io_wq, &req->work);
     476           0 :         if (link)
     477           0 :                 io_queue_linked_timeout(link);
     478           0 : }
     479             : 
     480           0 : static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
     481             : {
     482           0 :         while (!list_empty(&ctx->defer_list)) {
     483           0 :                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
     484             :                                                 struct io_defer_entry, list);
     485             : 
     486           0 :                 if (req_need_defer(de->req, de->seq))
     487             :                         break;
     488           0 :                 list_del_init(&de->list);
     489           0 :                 io_req_task_queue(de->req);
     490           0 :                 kfree(de);
     491             :         }
     492           0 : }
     493             : 
     494             : 
     495           0 : static void io_eventfd_ops(struct rcu_head *rcu)
     496             : {
     497           0 :         struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
     498           0 :         int ops = atomic_xchg(&ev_fd->ops, 0);
     499             : 
     500           0 :         if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
     501           0 :                 eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
     502             : 
      503             :         /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
      504             :          * ordering in a race, but if the reference count drops to zero we
      505             :          * know we have to free it regardless.
     506             :          */
     507           0 :         if (atomic_dec_and_test(&ev_fd->refs)) {
     508           0 :                 eventfd_ctx_put(ev_fd->cq_ev_fd);
     509           0 :                 kfree(ev_fd);
     510             :         }
     511           0 : }
     512             : 
     513           0 : static void io_eventfd_signal(struct io_ring_ctx *ctx)
     514             : {
     515           0 :         struct io_ev_fd *ev_fd = NULL;
     516             : 
     517             :         rcu_read_lock();
     518             :         /*
      519             :          * rcu_dereference ctx->io_ev_fd once and use it for both the check
      520             :          * and the eventfd_signal call.
     521             :          */
     522           0 :         ev_fd = rcu_dereference(ctx->io_ev_fd);
     523             : 
     524             :         /*
      525             :          * Check again if ev_fd exists in case an io_eventfd_unregister call
     526             :          * completed between the NULL check of ctx->io_ev_fd at the start of
     527             :          * the function and rcu_read_lock.
     528             :          */
     529           0 :         if (unlikely(!ev_fd))
     530             :                 goto out;
     531           0 :         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
     532             :                 goto out;
     533           0 :         if (ev_fd->eventfd_async && !io_wq_current_is_worker())
     534             :                 goto out;
     535             : 
     536           0 :         if (likely(eventfd_signal_allowed())) {
     537           0 :                 eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
     538             :         } else {
     539           0 :                 atomic_inc(&ev_fd->refs);
     540           0 :                 if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
     541           0 :                         call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
     542             :                 else
     543           0 :                         atomic_dec(&ev_fd->refs);
     544             :         }
     545             : 
     546             : out:
     547             :         rcu_read_unlock();
     548           0 : }
     549             : 
     550             : static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
     551             : {
     552             :         bool skip;
     553             : 
     554           0 :         spin_lock(&ctx->completion_lock);
     555             : 
     556             :         /*
     557             :          * Eventfd should only get triggered when at least one event has been
     558             :          * posted. Some applications rely on the eventfd notification count
     559             :          * only changing IFF a new CQE has been added to the CQ ring. There's
      560             :          * no dependency on a 1:1 relationship between how many times this
      561             :          * function is called (and hence the eventfd count) and the number of
      562             :          * CQEs posted to the CQ ring.
     563             :          */
     564           0 :         skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
     565           0 :         ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
     566           0 :         spin_unlock(&ctx->completion_lock);
     567           0 :         if (skip)
     568             :                 return;
     569             : 
     570           0 :         io_eventfd_signal(ctx);
     571             : }
     572             : 
     573           0 : void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
     574             : {
     575           0 :         if (ctx->poll_activated)
     576             :                 io_poll_wq_wake(ctx);
     577           0 :         if (ctx->off_timeout_used)
     578           0 :                 io_flush_timeouts(ctx);
     579           0 :         if (ctx->drain_active) {
     580           0 :                 spin_lock(&ctx->completion_lock);
     581           0 :                 io_queue_deferred(ctx);
     582           0 :                 spin_unlock(&ctx->completion_lock);
     583             :         }
     584           0 :         if (ctx->has_evfd)
     585             :                 io_eventfd_flush_signal(ctx);
     586           0 : }
     587             : 
     588             : static inline void __io_cq_lock(struct io_ring_ctx *ctx)
     589             :         __acquires(ctx->completion_lock)
     590             : {
     591           0 :         if (!ctx->task_complete)
     592           0 :                 spin_lock(&ctx->completion_lock);
     593             : }
     594             : 
     595             : static inline void __io_cq_unlock(struct io_ring_ctx *ctx)
     596             : {
     597           0 :         if (!ctx->task_complete)
     598           0 :                 spin_unlock(&ctx->completion_lock);
     599             : }
     600             : 
     601             : static inline void io_cq_lock(struct io_ring_ctx *ctx)
     602             :         __acquires(ctx->completion_lock)
     603             : {
     604           0 :         spin_lock(&ctx->completion_lock);
     605             : }
     606             : 
     607             : static inline void io_cq_unlock(struct io_ring_ctx *ctx)
     608             :         __releases(ctx->completion_lock)
     609             : {
     610           0 :         spin_unlock(&ctx->completion_lock);
     611             : }
     612             : 
     613             : /* keep it inlined for io_submit_flush_completions() */
     614           0 : static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
     615             :         __releases(ctx->completion_lock)
     616             : {
     617           0 :         io_commit_cqring(ctx);
     618           0 :         __io_cq_unlock(ctx);
     619           0 :         io_commit_cqring_flush(ctx);
     620           0 :         io_cqring_wake(ctx);
     621           0 : }
     622             : 
     623           0 : static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx)
     624             :         __releases(ctx->completion_lock)
     625             : {
     626           0 :         io_commit_cqring(ctx);
     627           0 :         __io_cq_unlock(ctx);
     628           0 :         io_commit_cqring_flush(ctx);
     629             : 
     630             :         /*
     631             :          * As ->task_complete implies that the ring is single tasked, cq_wait
      632             :          * may only be waited on by the current task in io_cqring_wait(); since
      633             :          * it will re-check the wakeup conditions once we return, we can safely
     634             :          * skip waking it up.
     635             :          */
     636           0 :         if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
     637           0 :                 smp_mb();
     638             :                 __io_cqring_wake(ctx);
     639             :         }
     640           0 : }
     641             : 
     642           0 : void io_cq_unlock_post(struct io_ring_ctx *ctx)
     643             :         __releases(ctx->completion_lock)
     644             : {
     645           0 :         io_commit_cqring(ctx);
     646           0 :         spin_unlock(&ctx->completion_lock);
     647           0 :         io_commit_cqring_flush(ctx);
     648           0 :         io_cqring_wake(ctx);
     649           0 : }
     650             : 
      651             : /* drop all backlogged overflow CQEs without posting them to the CQ ring */
     652           0 : static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
     653             : {
     654             :         struct io_overflow_cqe *ocqe;
     655           0 :         LIST_HEAD(list);
     656             : 
     657           0 :         io_cq_lock(ctx);
     658           0 :         list_splice_init(&ctx->cq_overflow_list, &list);
     659           0 :         clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
     660           0 :         io_cq_unlock(ctx);
     661             : 
     662           0 :         while (!list_empty(&list)) {
     663           0 :                 ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
     664           0 :                 list_del(&ocqe->list);
     665           0 :                 kfree(ocqe);
     666             :         }
     667           0 : }
     668             : 
     669           0 : static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
     670             : {
     671           0 :         size_t cqe_size = sizeof(struct io_uring_cqe);
     672             : 
     673           0 :         if (__io_cqring_events(ctx) == ctx->cq_entries)
     674             :                 return;
     675             : 
     676           0 :         if (ctx->flags & IORING_SETUP_CQE32)
     677           0 :                 cqe_size <<= 1;
     678             : 
     679           0 :         io_cq_lock(ctx);
     680           0 :         while (!list_empty(&ctx->cq_overflow_list)) {
     681           0 :                 struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
     682             :                 struct io_overflow_cqe *ocqe;
     683             : 
     684           0 :                 if (!cqe)
     685             :                         break;
     686           0 :                 ocqe = list_first_entry(&ctx->cq_overflow_list,
     687             :                                         struct io_overflow_cqe, list);
     688           0 :                 memcpy(cqe, &ocqe->cqe, cqe_size);
     689           0 :                 list_del(&ocqe->list);
     690           0 :                 kfree(ocqe);
     691             :         }
     692             : 
     693           0 :         if (list_empty(&ctx->cq_overflow_list)) {
     694           0 :                 clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
     695           0 :                 atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
     696             :         }
     697           0 :         io_cq_unlock_post(ctx);
     698             : }
     699             : 
     700           0 : static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
     701             : {
     702             :         /* iopoll syncs against uring_lock, not completion_lock */
     703           0 :         if (ctx->flags & IORING_SETUP_IOPOLL)
     704           0 :                 mutex_lock(&ctx->uring_lock);
     705           0 :         __io_cqring_overflow_flush(ctx);
     706           0 :         if (ctx->flags & IORING_SETUP_IOPOLL)
     707           0 :                 mutex_unlock(&ctx->uring_lock);
     708           0 : }
     709             : 
     710           0 : static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
     711             : {
     712           0 :         if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
     713           0 :                 io_cqring_do_overflow_flush(ctx);
     714           0 : }
     715             : 
     716             : /* can be called by any task */
     717           0 : static void io_put_task_remote(struct task_struct *task, int nr)
     718             : {
     719           0 :         struct io_uring_task *tctx = task->io_uring;
     720             : 
     721           0 :         percpu_counter_sub(&tctx->inflight, nr);
     722           0 :         if (unlikely(atomic_read(&tctx->in_cancel)))
     723           0 :                 wake_up(&tctx->wait);
     724           0 :         put_task_struct_many(task, nr);
     725           0 : }
     726             : 
     727             : /* used by a task to put its own references */
     728             : static void io_put_task_local(struct task_struct *task, int nr)
     729             : {
     730           0 :         task->io_uring->cached_refs += nr;
     731             : }
     732             : 
      733             : /* must be called shortly after putting a request */
     734           0 : static inline void io_put_task(struct task_struct *task, int nr)
     735             : {
     736           0 :         if (likely(task == current))
     737           0 :                 io_put_task_local(task, nr);
     738             :         else
     739           0 :                 io_put_task_remote(task, nr);
     740           0 : }
     741             : 
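                      : /*
                      :  * Top the task's cached request refs back up to IO_TCTX_REFS_CACHE_NR,
                      :  * accounting the whole refill against the inflight counter and taking
                      :  * the matching number of task references up front.
                      :  */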
     742           0 : void io_task_refs_refill(struct io_uring_task *tctx)
     743             : {
     744           0 :         unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
     745             : 
     746           0 :         percpu_counter_add(&tctx->inflight, refill);
     747           0 :         refcount_add(refill, &current->usage);
     748           0 :         tctx->cached_refs += refill;
     749           0 : }
     750             : 
     751           0 : static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
     752             : {
     753           0 :         struct io_uring_task *tctx = task->io_uring;
     754           0 :         unsigned int refs = tctx->cached_refs;
     755             : 
     756           0 :         if (refs) {
     757           0 :                 tctx->cached_refs = 0;
     758           0 :                 percpu_counter_sub(&tctx->inflight, refs);
     759           0 :                 put_task_struct_many(task, refs);
     760             :         }
     761           0 : }
     762             : 
     763           0 : static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
     764             :                                      s32 res, u32 cflags, u64 extra1, u64 extra2)
     765             : {
     766             :         struct io_overflow_cqe *ocqe;
     767           0 :         size_t ocq_size = sizeof(struct io_overflow_cqe);
     768           0 :         bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
     769             : 
     770             :         lockdep_assert_held(&ctx->completion_lock);
     771             : 
     772           0 :         if (is_cqe32)
     773           0 :                 ocq_size += sizeof(struct io_uring_cqe);
     774             : 
     775           0 :         ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
     776           0 :         trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
     777           0 :         if (!ocqe) {
     778             :                 /*
     779             :                  * If we're in ring overflow flush mode, or in task cancel mode,
     780             :                  * or cannot allocate an overflow entry, then we need to drop it
     781             :                  * on the floor.
     782             :                  */
     783           0 :                 io_account_cq_overflow(ctx);
     784           0 :                 set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
     785           0 :                 return false;
     786             :         }
     787           0 :         if (list_empty(&ctx->cq_overflow_list)) {
     788           0 :                 set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
     789           0 :                 atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
     790             : 
     791             :         }
     792           0 :         ocqe->cqe.user_data = user_data;
     793           0 :         ocqe->cqe.res = res;
     794           0 :         ocqe->cqe.flags = cflags;
     795           0 :         if (is_cqe32) {
     796           0 :                 ocqe->cqe.big_cqe[0] = extra1;
     797           0 :                 ocqe->cqe.big_cqe[1] = extra2;
     798             :         }
     799           0 :         list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
     800           0 :         return true;
     801             : }
     802             : 
     803           0 : bool io_req_cqe_overflow(struct io_kiocb *req)
     804             : {
     805           0 :         if (!(req->flags & REQ_F_CQE32_INIT)) {
     806           0 :                 req->extra1 = 0;
     807           0 :                 req->extra2 = 0;
     808             :         }
     809           0 :         return io_cqring_event_overflow(req->ctx, req->cqe.user_data,
     810             :                                         req->cqe.res, req->cqe.flags,
     811             :                                         req->extra1, req->extra2);
     812             : }
     813             : 
     814             : /*
     815             :  * writes to the cq entry need to come after reading head; the
     816             :  * control dependency is enough as we're using WRITE_ONCE to
     817             :  * fill the cq entry
     818             :  */
     819           0 : struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
     820             : {
     821           0 :         struct io_rings *rings = ctx->rings;
     822           0 :         unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
     823             :         unsigned int free, queued, len;
     824             : 
     825             :         /*
     826             :          * Posting into the CQ when there are pending overflowed CQEs may break
     827             :          * ordering guarantees, which will affect links, F_MORE users and more.
     828             :          * Force overflow the completion.
     829             :          */
     830           0 :         if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
     831             :                 return NULL;
     832             : 
      833             :         /* userspace may cheat by modifying the tail; be safe and do min */
     834           0 :         queued = min(__io_cqring_events(ctx), ctx->cq_entries);
     835           0 :         free = ctx->cq_entries - queued;
     836             :         /* we need a contiguous range, limit based on the current array offset */
     837           0 :         len = min(free, ctx->cq_entries - off);
     838           0 :         if (!len)
     839             :                 return NULL;
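                      :         /*
                      :          * Example (illustrative): with cq_entries == 8, CQ head == 3 and
                      :          * cached_cq_tail == 6, we get off == 6, queued == 3, free == 5 and
                      :          * len == 2, so only the two slots at the end of the array are
                      :          * handed out here; the free slots at the start become reachable
                      :          * again once the tail wraps around.
                      :          */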
     840             : 
     841           0 :         if (ctx->flags & IORING_SETUP_CQE32) {
     842           0 :                 off <<= 1;
     843           0 :                 len <<= 1;
     844             :         }
     845             : 
     846           0 :         ctx->cqe_cached = &rings->cqes[off];
     847           0 :         ctx->cqe_sentinel = ctx->cqe_cached + len;
     848             : 
     849           0 :         ctx->cached_cq_tail++;
     850           0 :         ctx->cqe_cached++;
     851           0 :         if (ctx->flags & IORING_SETUP_CQE32)
     852           0 :                 ctx->cqe_cached++;
     853             :         return &rings->cqes[off];
     854             : }
     855             : 
     856           0 : static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
     857             :                               u32 cflags)
     858             : {
     859             :         struct io_uring_cqe *cqe;
     860             : 
     861           0 :         ctx->cq_extra++;
     862             : 
     863             :         /*
     864             :          * If we can't get a cq entry, userspace overflowed the
     865             :          * submission (by quite a lot). Increment the overflow count in
     866             :          * the ring.
     867             :          */
     868           0 :         cqe = io_get_cqe(ctx);
     869           0 :         if (likely(cqe)) {
     870           0 :                 trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
     871             : 
     872           0 :                 WRITE_ONCE(cqe->user_data, user_data);
     873           0 :                 WRITE_ONCE(cqe->res, res);
     874           0 :                 WRITE_ONCE(cqe->flags, cflags);
     875             : 
     876           0 :                 if (ctx->flags & IORING_SETUP_CQE32) {
     877           0 :                         WRITE_ONCE(cqe->big_cqe[0], 0);
     878           0 :                         WRITE_ONCE(cqe->big_cqe[1], 0);
     879             :                 }
     880             :                 return true;
     881             :         }
     882             :         return false;
     883             : }
     884             : 
     885           0 : static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
     886             :         __must_hold(&ctx->uring_lock)
     887             : {
     888           0 :         struct io_submit_state *state = &ctx->submit_state;
     889             :         unsigned int i;
     890             : 
     891             :         lockdep_assert_held(&ctx->uring_lock);
     892           0 :         for (i = 0; i < state->cqes_count; i++) {
     893           0 :                 struct io_uring_cqe *cqe = &state->cqes[i];
     894             : 
     895           0 :                 if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
     896           0 :                         if (ctx->task_complete) {
     897           0 :                                 spin_lock(&ctx->completion_lock);
     898           0 :                                 io_cqring_event_overflow(ctx, cqe->user_data,
     899             :                                                         cqe->res, cqe->flags, 0, 0);
     900           0 :                                 spin_unlock(&ctx->completion_lock);
     901             :                         } else {
     902           0 :                                 io_cqring_event_overflow(ctx, cqe->user_data,
     903             :                                                         cqe->res, cqe->flags, 0, 0);
     904             :                         }
     905             :                 }
     906             :         }
     907           0 :         state->cqes_count = 0;
     908           0 : }
     909             : 
     910           0 : static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
     911             :                               bool allow_overflow)
     912             : {
     913             :         bool filled;
     914             : 
     915           0 :         io_cq_lock(ctx);
     916           0 :         filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
     917           0 :         if (!filled && allow_overflow)
     918           0 :                 filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
     919             : 
     920           0 :         io_cq_unlock_post(ctx);
     921           0 :         return filled;
     922             : }
     923             : 
     924           0 : bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
     925             : {
     926           0 :         return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
     927             : }
     928             : 
     929           0 : bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags,
     930             :                 bool allow_overflow)
     931             : {
     932             :         struct io_uring_cqe *cqe;
     933             :         unsigned int length;
     934             : 
     935           0 :         if (!defer)
     936           0 :                 return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow);
     937             : 
     938           0 :         length = ARRAY_SIZE(ctx->submit_state.cqes);
     939             : 
     940             :         lockdep_assert_held(&ctx->uring_lock);
     941             : 
     942           0 :         if (ctx->submit_state.cqes_count == length) {
     943           0 :                 __io_cq_lock(ctx);
     944           0 :                 __io_flush_post_cqes(ctx);
     945             :                 /* no need to flush - flush is deferred */
     946           0 :                 __io_cq_unlock_post(ctx);
     947             :         }
     948             : 
      949             :         /* For deferred completions this is not as strict as it is otherwise,
      950             :          * however its main job is to prevent unbounded posted completions,
     951             :          * and in that it works just as well.
     952             :          */
     953           0 :         if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
     954             :                 return false;
     955             : 
     956           0 :         cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++];
     957           0 :         cqe->user_data = user_data;
     958           0 :         cqe->res = res;
     959           0 :         cqe->flags = cflags;
     960           0 :         return true;
     961             : }
     962             : 
     963           0 : static void __io_req_complete_post(struct io_kiocb *req)
     964             : {
     965           0 :         struct io_ring_ctx *ctx = req->ctx;
     966             : 
     967           0 :         io_cq_lock(ctx);
     968           0 :         if (!(req->flags & REQ_F_CQE_SKIP))
     969           0 :                 io_fill_cqe_req(ctx, req);
     970             : 
     971             :         /*
     972             :          * If we're the last reference to this request, add to our locked
     973             :          * free_list cache.
     974             :          */
     975           0 :         if (req_ref_put_and_test(req)) {
     976           0 :                 if (req->flags & IO_REQ_LINK_FLAGS) {
     977           0 :                         if (req->flags & IO_DISARM_MASK)
     978           0 :                                 io_disarm_next(req);
     979           0 :                         if (req->link) {
     980           0 :                                 io_req_task_queue(req->link);
     981           0 :                                 req->link = NULL;
     982             :                         }
     983             :                 }
     984           0 :                 io_put_kbuf_comp(req);
     985           0 :                 io_dismantle_req(req);
     986           0 :                 io_req_put_rsrc(req);
     987             :                 /*
     988             :                  * Selected buffer deallocation in io_clean_op() assumes that
     989             :                  * we don't hold ->completion_lock. Clean them here to avoid
     990             :                  * deadlocks.
     991             :                  */
     992           0 :                 io_put_task_remote(req->task, 1);
     993           0 :                 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
     994           0 :                 ctx->locked_free_nr++;
     995             :         }
     996           0 :         io_cq_unlock_post(ctx);
     997           0 : }
     998             : 
     999           0 : void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
    1000             : {
    1001           0 :         if (req->ctx->task_complete && (issue_flags & IO_URING_F_IOWQ)) {
    1002           0 :                 req->io_task_work.func = io_req_task_complete;
    1003             :                 io_req_task_work_add(req);
    1004           0 :         } else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
    1005           0 :                    !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
    1006           0 :                 __io_req_complete_post(req);
    1007             :         } else {
    1008           0 :                 struct io_ring_ctx *ctx = req->ctx;
    1009             : 
    1010           0 :                 mutex_lock(&ctx->uring_lock);
    1011           0 :                 __io_req_complete_post(req);
    1012           0 :                 mutex_unlock(&ctx->uring_lock);
    1013             :         }
    1014           0 : }
    1015             : 
    1016           0 : void io_req_defer_failed(struct io_kiocb *req, s32 res)
    1017             :         __must_hold(&ctx->uring_lock)
    1018             : {
    1019           0 :         const struct io_cold_def *def = &io_cold_defs[req->opcode];
    1020             : 
    1021             :         lockdep_assert_held(&req->ctx->uring_lock);
    1022             : 
    1023           0 :         req_set_fail(req);
    1024           0 :         io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
    1025           0 :         if (def->fail)
    1026           0 :                 def->fail(req);
    1027           0 :         io_req_complete_defer(req);
    1028           0 : }
    1029             : 
    1030             : /*
    1031             :  * Don't initialise the fields below on every allocation, but do that in
    1032             :  * advance and keep them valid across allocations.
    1033             :  */
    1034             : static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
    1035             : {
    1036           0 :         req->ctx = ctx;
    1037           0 :         req->link = NULL;
    1038           0 :         req->async_data = NULL;
    1039             :         /* not necessary, but safer to zero */
    1040           0 :         req->cqe.res = 0;
    1041             : }
    1042             : 
    1043             : static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
    1044             :                                         struct io_submit_state *state)
    1045             : {
    1046           0 :         spin_lock(&ctx->completion_lock);
    1047           0 :         wq_list_splice(&ctx->locked_free_list, &state->free_list);
    1048           0 :         ctx->locked_free_nr = 0;
    1049           0 :         spin_unlock(&ctx->completion_lock);
    1050             : }
    1051             : 
    1052             : /*
    1053             :  * A request might get retired back into the request caches even before opcode
     1054             :  * handlers and io_issue_sqe() are done with it, e.g. via the inline completion
     1055             :  * path. Because of that, io_alloc_req() should be called only under ->uring_lock
     1056             :  * and with extra caution not to hand out a request that is still being worked on.
    1057             :  */
    1058           0 : __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
    1059             :         __must_hold(&ctx->uring_lock)
    1060             : {
    1061           0 :         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
    1062             :         void *reqs[IO_REQ_ALLOC_BATCH];
    1063             :         int ret, i;
    1064             : 
    1065             :         /*
    1066             :          * If we have more than a batch's worth of requests in our IRQ side
    1067             :          * locked cache, grab the lock and move them over to our submission
    1068             :          * side cache.
    1069             :          */
    1070           0 :         if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
    1071           0 :                 io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
    1072           0 :                 if (!io_req_cache_empty(ctx))
    1073             :                         return true;
    1074             :         }
    1075             : 
    1076           0 :         ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
    1077             : 
    1078             :         /*
    1079             :          * Bulk alloc is all-or-nothing. If we fail to get a batch,
    1080             :          * retry single alloc to be on the safe side.
    1081             :          */
    1082           0 :         if (unlikely(ret <= 0)) {
    1083           0 :                 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
    1084           0 :                 if (!reqs[0])
    1085             :                         return false;
    1086             :                 ret = 1;
    1087             :         }
    1088             : 
    1089           0 :         percpu_ref_get_many(&ctx->refs, ret);
    1090           0 :         for (i = 0; i < ret; i++) {
    1091           0 :                 struct io_kiocb *req = reqs[i];
    1092             : 
    1093           0 :                 io_preinit_req(req, ctx);
    1094           0 :                 io_req_add_to_cache(req, ctx);
    1095             :         }
    1096             :         return true;
    1097             : }
    1098             : 
    1099           0 : static inline void io_dismantle_req(struct io_kiocb *req)
    1100             : {
    1101           0 :         unsigned int flags = req->flags;
    1102             : 
    1103           0 :         if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
    1104           0 :                 io_clean_op(req);
    1105           0 :         if (!(flags & REQ_F_FIXED_FILE))
    1106           0 :                 io_put_file(req->file);
    1107           0 : }
    1108             : 
    1109           0 : __cold void io_free_req(struct io_kiocb *req)
    1110             : {
    1111           0 :         struct io_ring_ctx *ctx = req->ctx;
    1112             : 
    1113           0 :         io_req_put_rsrc(req);
    1114           0 :         io_dismantle_req(req);
    1115           0 :         io_put_task_remote(req->task, 1);
    1116             : 
    1117           0 :         spin_lock(&ctx->completion_lock);
    1118           0 :         wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
    1119           0 :         ctx->locked_free_nr++;
    1120           0 :         spin_unlock(&ctx->completion_lock);
    1121           0 : }
    1122             : 
    1123             : static void __io_req_find_next_prep(struct io_kiocb *req)
    1124             : {
    1125           0 :         struct io_ring_ctx *ctx = req->ctx;
    1126             : 
    1127           0 :         spin_lock(&ctx->completion_lock);
    1128           0 :         io_disarm_next(req);
    1129           0 :         spin_unlock(&ctx->completion_lock);
    1130             : }
    1131             : 
    1132             : static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
    1133             : {
    1134             :         struct io_kiocb *nxt;
    1135             : 
    1136             :         /*
    1137             :          * If LINK is set, we have dependent requests in this chain. If we
    1138             :          * didn't fail this request, queue the first one up, moving any other
    1139             :          * dependencies to the next request. In case of failure, fail the rest
    1140             :          * of the chain.
    1141             :          */
    1142           0 :         if (unlikely(req->flags & IO_DISARM_MASK))
    1143             :                 __io_req_find_next_prep(req);
    1144           0 :         nxt = req->link;
    1145           0 :         req->link = NULL;
    1146             :         return nxt;
    1147             : }
    1148             : 
    1149           0 : static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
    1150             : {
    1151           0 :         if (!ctx)
    1152             :                 return;
    1153           0 :         if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
    1154           0 :                 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
    1155           0 :         if (*locked) {
    1156           0 :                 io_submit_flush_completions(ctx);
    1157           0 :                 mutex_unlock(&ctx->uring_lock);
    1158           0 :                 *locked = false;
    1159             :         }
    1160           0 :         percpu_ref_put(&ctx->refs);
    1161             : }
    1162             : 
    1163           0 : static unsigned int handle_tw_list(struct llist_node *node,
    1164             :                                    struct io_ring_ctx **ctx, bool *locked,
    1165             :                                    struct llist_node *last)
    1166             : {
    1167           0 :         unsigned int count = 0;
    1168             : 
    1169           0 :         while (node && node != last) {
    1170           0 :                 struct llist_node *next = node->next;
    1171           0 :                 struct io_kiocb *req = container_of(node, struct io_kiocb,
    1172             :                                                     io_task_work.node);
    1173             : 
    1174           0 :                 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
    1175             : 
    1176           0 :                 if (req->ctx != *ctx) {
    1177           0 :                         ctx_flush_and_put(*ctx, locked);
    1178           0 :                         *ctx = req->ctx;
     1179             :                         /* if not contended, grab the lock to improve batching */
    1180           0 :                         *locked = mutex_trylock(&(*ctx)->uring_lock);
    1181           0 :                         percpu_ref_get(&(*ctx)->refs);
    1182           0 :                 } else if (!*locked)
    1183           0 :                         *locked = mutex_trylock(&(*ctx)->uring_lock);
    1184           0 :                 req->io_task_work.func(req, locked);
    1185           0 :                 node = next;
    1186           0 :                 count++;
    1187           0 :                 if (unlikely(need_resched())) {
    1188           0 :                         ctx_flush_and_put(*ctx, locked);
    1189           0 :                         *ctx = NULL;
    1190           0 :                         cond_resched();
    1191             :                 }
    1192             :         }
    1193             : 
    1194           0 :         return count;
    1195             : }
    1196             : 
    1197             : /**
    1198             :  * io_llist_xchg - swap all entries in a lock-less list
     1199             :  * @head:       the head of the lock-less list to remove all entries from
    1200             :  * @new:        new entry as the head of the list
    1201             :  *
     1202             :  * If the list is empty, return NULL; otherwise, return a pointer to the first entry.
    1203             :  * The order of entries returned is from the newest to the oldest added one.
    1204             :  */
    1205             : static inline struct llist_node *io_llist_xchg(struct llist_head *head,
    1206             :                                                struct llist_node *new)
    1207             : {
    1208           0 :         return xchg(&head->first, new);
    1209             : }
    1210             : 
    1211             : /**
    1212             :  * io_llist_cmpxchg - possibly swap all entries in a lock-less list
     1213             :  * @head:       the head of the lock-less list to remove all entries from
    1214             :  * @old:        expected old value of the first entry of the list
    1215             :  * @new:        new entry as the head of the list
    1216             :  *
     1217             :  * Perform a cmpxchg on the first entry of the list.
    1218             :  */
    1219             : 
    1220             : static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
    1221             :                                                   struct llist_node *old,
    1222             :                                                   struct llist_node *new)
    1223             : {
    1224           0 :         return cmpxchg(&head->first, old, new);
    1225             : }
    1226             : 
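The two helpers above, together with llist_add() used later in __io_req_task_work_add(), follow the usual lock-less list pattern: producers push with a compare-and-swap and learn whether the list used to be empty, while the consumer takes the whole list with a single exchange. A minimal user-space sketch of that pattern, assuming C11 <stdatomic.h>; the names push(), drain_all() and struct node are illustrative, not kernel or io_uring APIs:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int value;
	struct node *next;
};

static _Atomic(struct node *) list_head;

/* Producer: push one node; return true if the list used to be empty
 * (the same contract as llist_add(), which lets callers elide wakeups). */
static bool push(struct node *n)
{
	struct node *old = atomic_load(&list_head);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&list_head, &old, n));
	return old == NULL;
}

/* Consumer: take every queued node with one exchange, like
 * io_llist_xchg(&head, NULL); entries come back newest first. */
static struct node *drain_all(void)
{
	return atomic_exchange(&list_head, NULL);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		n->value = i;
		push(n);
	}
	for (struct node *n = drain_all(); n;) {
		struct node *next = n->next;

		printf("%d\n", n->value);
		free(n);
		n = next;
	}
	return 0;
}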
    1227           0 : void tctx_task_work(struct callback_head *cb)
    1228             : {
    1229           0 :         bool uring_locked = false;
    1230           0 :         struct io_ring_ctx *ctx = NULL;
    1231           0 :         struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
    1232             :                                                   task_work);
    1233           0 :         struct llist_node fake = {};
    1234             :         struct llist_node *node;
    1235           0 :         unsigned int loops = 0;
    1236           0 :         unsigned int count = 0;
    1237             : 
    1238           0 :         if (unlikely(current->flags & PF_EXITING)) {
    1239           0 :                 io_fallback_tw(tctx);
    1240           0 :                 return;
    1241             :         }
    1242             : 
    1243             :         do {
    1244           0 :                 loops++;
    1245           0 :                 node = io_llist_xchg(&tctx->task_list, &fake);
    1246           0 :                 count += handle_tw_list(node, &ctx, &uring_locked, &fake);
    1247             : 
    1248             :                 /* skip expensive cmpxchg if there are items in the list */
    1249           0 :                 if (READ_ONCE(tctx->task_list.first) != &fake)
    1250           0 :                         continue;
    1251           0 :                 if (uring_locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
    1252           0 :                         io_submit_flush_completions(ctx);
    1253           0 :                         if (READ_ONCE(tctx->task_list.first) != &fake)
    1254           0 :                                 continue;
    1255             :                 }
    1256           0 :                 node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
    1257           0 :         } while (node != &fake);
    1258             : 
    1259           0 :         ctx_flush_and_put(ctx, &uring_locked);
    1260             : 
    1261             :         /* relaxed read is enough as only the task itself sets ->in_cancel */
    1262           0 :         if (unlikely(atomic_read(&tctx->in_cancel)))
    1263           0 :                 io_uring_drop_tctx_refs(current);
    1264             : 
    1265           0 :         trace_io_uring_task_work_run(tctx, count, loops);
    1266             : }
    1267             : 
    1268           0 : static __cold void io_fallback_tw(struct io_uring_task *tctx)
    1269             : {
    1270           0 :         struct llist_node *node = llist_del_all(&tctx->task_list);
    1271             :         struct io_kiocb *req;
    1272             : 
    1273           0 :         while (node) {
    1274           0 :                 req = container_of(node, struct io_kiocb, io_task_work.node);
    1275           0 :                 node = node->next;
    1276           0 :                 if (llist_add(&req->io_task_work.node,
    1277           0 :                               &req->ctx->fallback_llist))
    1278           0 :                         schedule_delayed_work(&req->ctx->fallback_work, 1);
    1279             :         }
    1280           0 : }
    1281             : 
    1282           0 : static void io_req_local_work_add(struct io_kiocb *req)
    1283             : {
    1284           0 :         struct io_ring_ctx *ctx = req->ctx;
    1285             : 
    1286           0 :         percpu_ref_get(&ctx->refs);
    1287             : 
    1288           0 :         if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
    1289             :                 goto put_ref;
    1290             : 
    1291             :         /* needed for the following wake up */
    1292           0 :         smp_mb__after_atomic();
    1293             : 
    1294           0 :         if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
    1295           0 :                 io_move_task_work_from_local(ctx);
    1296           0 :                 goto put_ref;
    1297             :         }
    1298             : 
    1299           0 :         if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
    1300           0 :                 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
    1301           0 :         if (ctx->has_evfd)
    1302           0 :                 io_eventfd_signal(ctx);
    1303             : 
    1304           0 :         if (READ_ONCE(ctx->cq_waiting))
    1305           0 :                 wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
    1306             : 
    1307             : put_ref:
    1308           0 :         percpu_ref_put(&ctx->refs);
    1309           0 : }
    1310             : 
    1311           0 : void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
    1312             : {
    1313           0 :         struct io_uring_task *tctx = req->task->io_uring;
    1314           0 :         struct io_ring_ctx *ctx = req->ctx;
    1315             : 
    1316           0 :         if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
    1317           0 :                 io_req_local_work_add(req);
    1318           0 :                 return;
    1319             :         }
    1320             : 
    1321             :         /* task_work already pending, we're done */
    1322           0 :         if (!llist_add(&req->io_task_work.node, &tctx->task_list))
    1323             :                 return;
    1324             : 
    1325           0 :         if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
    1326           0 :                 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
    1327             : 
    1328           0 :         if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
    1329             :                 return;
    1330             : 
    1331           0 :         io_fallback_tw(tctx);
    1332             : }
    1333             : 
    1334           0 : static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
    1335             : {
    1336             :         struct llist_node *node;
    1337             : 
    1338           0 :         node = llist_del_all(&ctx->work_llist);
    1339           0 :         while (node) {
    1340           0 :                 struct io_kiocb *req = container_of(node, struct io_kiocb,
    1341             :                                                     io_task_work.node);
    1342             : 
    1343           0 :                 node = node->next;
    1344           0 :                 __io_req_task_work_add(req, false);
    1345             :         }
    1346           0 : }
    1347             : 
    1348           0 : static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked)
    1349             : {
    1350             :         struct llist_node *node;
    1351           0 :         unsigned int loops = 0;
    1352           0 :         int ret = 0;
    1353             : 
    1354           0 :         if (WARN_ON_ONCE(ctx->submitter_task != current))
    1355             :                 return -EEXIST;
    1356           0 :         if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
    1357           0 :                 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
    1358             : again:
    1359           0 :         node = io_llist_xchg(&ctx->work_llist, NULL);
    1360           0 :         while (node) {
    1361           0 :                 struct llist_node *next = node->next;
    1362           0 :                 struct io_kiocb *req = container_of(node, struct io_kiocb,
    1363             :                                                     io_task_work.node);
    1364           0 :                 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
    1365           0 :                 req->io_task_work.func(req, locked);
    1366           0 :                 ret++;
    1367           0 :                 node = next;
    1368             :         }
    1369           0 :         loops++;
    1370             : 
    1371           0 :         if (!llist_empty(&ctx->work_llist))
    1372             :                 goto again;
    1373           0 :         if (*locked) {
    1374           0 :                 io_submit_flush_completions(ctx);
    1375           0 :                 if (!llist_empty(&ctx->work_llist))
    1376             :                         goto again;
    1377             :         }
    1378             :         trace_io_uring_local_work_run(ctx, ret, loops);
    1379             :         return ret;
    1380             : }
    1381             : 
    1382           0 : static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
    1383             : {
    1384             :         bool locked;
    1385             :         int ret;
    1386             : 
    1387           0 :         if (llist_empty(&ctx->work_llist))
    1388             :                 return 0;
    1389             : 
    1390           0 :         locked = true;
    1391           0 :         ret = __io_run_local_work(ctx, &locked);
    1392             :         /* shouldn't happen! */
    1393           0 :         if (WARN_ON_ONCE(!locked))
    1394           0 :                 mutex_lock(&ctx->uring_lock);
    1395             :         return ret;
    1396             : }
    1397             : 
    1398           0 : static int io_run_local_work(struct io_ring_ctx *ctx)
    1399             : {
    1400           0 :         bool locked = mutex_trylock(&ctx->uring_lock);
    1401             :         int ret;
    1402             : 
    1403           0 :         ret = __io_run_local_work(ctx, &locked);
    1404           0 :         if (locked)
    1405           0 :                 mutex_unlock(&ctx->uring_lock);
    1406             : 
    1407           0 :         return ret;
    1408             : }
    1409             : 
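io_run_local_work() only needs the ring lock opportunistically: it runs the pending work either way, but if the trylock succeeds it can also flush completions in batch before releasing. A small sketch of that trylock-then-conditionally-unlock shape using pthreads; run_work() and do_work() are illustrative stand-ins, not io_uring functions:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ring_lock = PTHREAD_MUTEX_INITIALIZER;

/* Run the work whether or not we got the lock; only unlock what we
 * actually locked, mirroring io_run_local_work(). */
static int run_work(int (*work_fn)(int locked))
{
	int locked = (pthread_mutex_trylock(&ring_lock) == 0);
	int ret = work_fn(locked);

	if (locked)
		pthread_mutex_unlock(&ring_lock);
	return ret;
}

static int do_work(int locked)
{
	return printf("ran %s the lock\n", locked ? "with" : "without");
}

int main(void)
{
	run_work(do_work);
	return 0;
}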
    1410           0 : static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
    1411             : {
    1412           0 :         io_tw_lock(req->ctx, locked);
    1413           0 :         io_req_defer_failed(req, req->cqe.res);
    1414           0 : }
    1415             : 
    1416           0 : void io_req_task_submit(struct io_kiocb *req, bool *locked)
    1417             : {
    1418           0 :         io_tw_lock(req->ctx, locked);
    1419             :         /* req->task == current here, checking PF_EXITING is safe */
    1420           0 :         if (unlikely(req->task->flags & PF_EXITING))
    1421           0 :                 io_req_defer_failed(req, -EFAULT);
    1422           0 :         else if (req->flags & REQ_F_FORCE_ASYNC)
    1423           0 :                 io_queue_iowq(req, locked);
    1424             :         else
    1425           0 :                 io_queue_sqe(req);
    1426           0 : }
    1427             : 
    1428           0 : void io_req_task_queue_fail(struct io_kiocb *req, int ret)
    1429             : {
    1430           0 :         io_req_set_res(req, ret, 0);
    1431           0 :         req->io_task_work.func = io_req_task_cancel;
    1432           0 :         io_req_task_work_add(req);
    1433           0 : }
    1434             : 
    1435           0 : void io_req_task_queue(struct io_kiocb *req)
    1436             : {
    1437           0 :         req->io_task_work.func = io_req_task_submit;
    1438           0 :         io_req_task_work_add(req);
    1439           0 : }
    1440             : 
    1441           0 : void io_queue_next(struct io_kiocb *req)
    1442             : {
    1443           0 :         struct io_kiocb *nxt = io_req_find_next(req);
    1444             : 
    1445           0 :         if (nxt)
    1446             :                 io_req_task_queue(nxt);
    1447           0 : }
    1448             : 
    1449           0 : void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
    1450             :         __must_hold(&ctx->uring_lock)
    1451             : {
    1452           0 :         struct task_struct *task = NULL;
    1453           0 :         int task_refs = 0;
    1454             : 
    1455             :         do {
    1456           0 :                 struct io_kiocb *req = container_of(node, struct io_kiocb,
    1457             :                                                     comp_list);
    1458             : 
    1459           0 :                 if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
    1460           0 :                         if (req->flags & REQ_F_REFCOUNT) {
    1461           0 :                                 node = req->comp_list.next;
    1462           0 :                                 if (!req_ref_put_and_test(req))
    1463           0 :                                         continue;
    1464             :                         }
    1465           0 :                         if ((req->flags & REQ_F_POLLED) && req->apoll) {
    1466           0 :                                 struct async_poll *apoll = req->apoll;
    1467             : 
    1468           0 :                                 if (apoll->double_poll)
    1469           0 :                                         kfree(apoll->double_poll);
    1470           0 :                                 if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache))
    1471           0 :                                         kfree(apoll);
    1472           0 :                                 req->flags &= ~REQ_F_POLLED;
    1473             :                         }
    1474           0 :                         if (req->flags & IO_REQ_LINK_FLAGS)
    1475           0 :                                 io_queue_next(req);
    1476           0 :                         if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
    1477           0 :                                 io_clean_op(req);
    1478             :                 }
    1479           0 :                 if (!(req->flags & REQ_F_FIXED_FILE))
    1480           0 :                         io_put_file(req->file);
    1481             : 
    1482           0 :                 io_req_put_rsrc_locked(req, ctx);
    1483             : 
    1484           0 :                 if (req->task != task) {
    1485           0 :                         if (task)
    1486           0 :                                 io_put_task(task, task_refs);
    1487           0 :                         task = req->task;
    1488           0 :                         task_refs = 0;
    1489             :                 }
    1490           0 :                 task_refs++;
    1491           0 :                 node = req->comp_list.next;
    1492             :                 io_req_add_to_cache(req, ctx);
    1493           0 :         } while (node);
    1494             : 
    1495           0 :         if (task)
    1496           0 :                 io_put_task(task, task_refs);
    1497           0 : }
    1498             : 
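One detail worth calling out in io_free_batch_list() is how task references are dropped: consecutive requests owned by the same task are counted and released with a single io_put_task() instead of one put per request. A user-space sketch of that batching; struct owner, put_owner() and free_batch() are made-up names for illustration:

#include <stdio.h>

struct owner {
	int refs;
};

static void put_owner(struct owner *o, int nr)
{
	o->refs -= nr;
}

/* Coalesce reference drops: one put per run of identical owners,
 * the same shape as the task/task_refs bookkeeping above. */
static void free_batch(struct owner **owners, int nr)
{
	struct owner *cur = NULL;
	int batched = 0;

	for (int i = 0; i < nr; i++) {
		if (owners[i] != cur) {
			if (cur)
				put_owner(cur, batched);
			cur = owners[i];
			batched = 0;
		}
		batched++;
	}
	if (cur)
		put_owner(cur, batched);
}

int main(void)
{
	struct owner a = { .refs = 3 }, b = { .refs = 2 };
	struct owner *list[] = { &a, &a, &a, &b, &b };

	free_batch(list, 5);
	printf("%d %d\n", a.refs, b.refs);	/* prints "0 0" */
	return 0;
}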
    1499           0 : static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
    1500             :         __must_hold(&ctx->uring_lock)
    1501             : {
    1502             :         struct io_wq_work_node *node, *prev;
    1503           0 :         struct io_submit_state *state = &ctx->submit_state;
    1504             : 
    1505           0 :         __io_cq_lock(ctx);
    1506             :         /* must come first to preserve CQE ordering in failure cases */
    1507           0 :         if (state->cqes_count)
    1508           0 :                 __io_flush_post_cqes(ctx);
    1509           0 :         wq_list_for_each(node, prev, &state->compl_reqs) {
    1510           0 :                 struct io_kiocb *req = container_of(node, struct io_kiocb,
    1511             :                                             comp_list);
    1512             : 
    1513           0 :                 if (!(req->flags & REQ_F_CQE_SKIP) &&
    1514           0 :                     unlikely(!__io_fill_cqe_req(ctx, req))) {
    1515           0 :                         if (ctx->task_complete) {
    1516           0 :                                 spin_lock(&ctx->completion_lock);
    1517           0 :                                 io_req_cqe_overflow(req);
    1518           0 :                                 spin_unlock(&ctx->completion_lock);
    1519             :                         } else {
    1520           0 :                                 io_req_cqe_overflow(req);
    1521             :                         }
    1522             :                 }
    1523             :         }
    1524           0 :         __io_cq_unlock_post_flush(ctx);
    1525             : 
    1526           0 :         if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
    1527           0 :                 io_free_batch_list(ctx, state->compl_reqs.first);
    1528           0 :                 INIT_WQ_LIST(&state->compl_reqs);
    1529             :         }
    1530           0 : }
    1531             : 
    1532             : /*
     1533             :  * Drop a reference to the request, and if this was the last reference, return
     1534             :  * the next request in the chain (if there is one).
    1535             :  */
    1536           0 : static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
    1537             : {
    1538           0 :         struct io_kiocb *nxt = NULL;
    1539             : 
    1540           0 :         if (req_ref_put_and_test(req)) {
    1541           0 :                 if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
    1542           0 :                         nxt = io_req_find_next(req);
    1543           0 :                 io_free_req(req);
    1544             :         }
    1545           0 :         return nxt;
    1546             : }
    1547             : 
    1548             : static unsigned io_cqring_events(struct io_ring_ctx *ctx)
    1549             : {
    1550             :         /* See comment at the top of this file */
    1551           0 :         smp_rmb();
    1552           0 :         return __io_cqring_events(ctx);
    1553             : }
    1554             : 
    1555             : /*
     1556             :  * We can't just wait for polled events to come to us; we have to actively
    1557             :  * find and complete them.
    1558             :  */
    1559           0 : static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
    1560             : {
    1561           0 :         if (!(ctx->flags & IORING_SETUP_IOPOLL))
    1562             :                 return;
    1563             : 
    1564           0 :         mutex_lock(&ctx->uring_lock);
    1565           0 :         while (!wq_list_empty(&ctx->iopoll_list)) {
    1566             :                 /* let it sleep and repeat later if can't complete a request */
    1567           0 :                 if (io_do_iopoll(ctx, true) == 0)
    1568             :                         break;
    1569             :                 /*
     1570             :                  * Ensure we allow local-to-the-cpu processing to take place;
     1571             :                  * in this case we need to ensure that we reap all events.
     1572             :                  * Also let task_work, etc., make progress by releasing the mutex.
    1573             :                  */
    1574           0 :                 if (need_resched()) {
    1575           0 :                         mutex_unlock(&ctx->uring_lock);
    1576           0 :                         cond_resched();
    1577           0 :                         mutex_lock(&ctx->uring_lock);
    1578             :                 }
    1579             :         }
    1580           0 :         mutex_unlock(&ctx->uring_lock);
    1581             : }
    1582             : 
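The reap loop above holds the ring lock while polling but deliberately drops it whenever a reschedule is pending, so task_work and other lock waiters can run. A rough user-space analogue of that "breather" pattern with pthreads; reap_some() and should_yield() are stand-in stubs, not kernel interfaces:

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

static pthread_mutex_t uring_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stubs standing in for io_do_iopoll() and need_resched(). */
static int pending = 5;
static int reap_some(void)
{
	return pending ? (pending--, 1) : 0;
}

static bool should_yield(void)
{
	return true;
}

/* Same shape as io_iopoll_try_reap_events(): reap under the lock, but
 * release it periodically so other lock waiters can make progress. */
static void reap_all(void)
{
	pthread_mutex_lock(&uring_lock);
	while (reap_some()) {
		if (should_yield()) {
			pthread_mutex_unlock(&uring_lock);
			sched_yield();
			pthread_mutex_lock(&uring_lock);
		}
	}
	pthread_mutex_unlock(&uring_lock);
}

int main(void)
{
	reap_all();
	return 0;
}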
    1583           0 : static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
    1584             : {
    1585           0 :         unsigned int nr_events = 0;
    1586           0 :         int ret = 0;
    1587             :         unsigned long check_cq;
    1588             : 
    1589           0 :         if (!io_allowed_run_tw(ctx))
    1590             :                 return -EEXIST;
    1591             : 
    1592           0 :         check_cq = READ_ONCE(ctx->check_cq);
    1593           0 :         if (unlikely(check_cq)) {
    1594           0 :                 if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
    1595           0 :                         __io_cqring_overflow_flush(ctx);
    1596             :                 /*
    1597             :                  * Similarly do not spin if we have not informed the user of any
    1598             :                  * dropped CQE.
    1599             :                  */
    1600           0 :                 if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
    1601             :                         return -EBADR;
    1602             :         }
    1603             :         /*
    1604             :          * Don't enter poll loop if we already have events pending.
    1605             :          * If we do, we can potentially be spinning for commands that
    1606             :          * already triggered a CQE (eg in error).
    1607             :          */
    1608           0 :         if (io_cqring_events(ctx))
    1609             :                 return 0;
    1610             : 
    1611             :         do {
    1612             :                 /*
    1613             :                  * If a submit got punted to a workqueue, we can have the
    1614             :                  * application entering polling for a command before it gets
    1615             :                  * issued. That app will hold the uring_lock for the duration
    1616             :                  * of the poll right here, so we need to take a breather every
    1617             :                  * now and then to ensure that the issue has a chance to add
    1618             :                  * the poll to the issued list. Otherwise we can spin here
    1619             :                  * forever, while the workqueue is stuck trying to acquire the
    1620             :                  * very same mutex.
    1621             :                  */
    1622           0 :                 if (wq_list_empty(&ctx->iopoll_list) ||
    1623           0 :                     io_task_work_pending(ctx)) {
    1624           0 :                         u32 tail = ctx->cached_cq_tail;
    1625             : 
    1626           0 :                         (void) io_run_local_work_locked(ctx);
    1627             : 
    1628           0 :                         if (task_work_pending(current) ||
    1629           0 :                             wq_list_empty(&ctx->iopoll_list)) {
    1630           0 :                                 mutex_unlock(&ctx->uring_lock);
    1631           0 :                                 io_run_task_work();
    1632           0 :                                 mutex_lock(&ctx->uring_lock);
    1633             :                         }
    1634             :                         /* some requests don't go through iopoll_list */
    1635           0 :                         if (tail != ctx->cached_cq_tail ||
    1636           0 :                             wq_list_empty(&ctx->iopoll_list))
    1637             :                                 break;
    1638             :                 }
    1639           0 :                 ret = io_do_iopoll(ctx, !min);
    1640           0 :                 if (ret < 0)
    1641             :                         break;
    1642           0 :                 nr_events += ret;
    1643           0 :                 ret = 0;
    1644           0 :         } while (nr_events < min && !need_resched());
    1645             : 
    1646             :         return ret;
    1647             : }
    1648             : 
    1649           0 : void io_req_task_complete(struct io_kiocb *req, bool *locked)
    1650             : {
    1651           0 :         if (*locked)
    1652             :                 io_req_complete_defer(req);
    1653             :         else
    1654           0 :                 io_req_complete_post(req, IO_URING_F_UNLOCKED);
    1655           0 : }
    1656             : 
    1657             : /*
    1658             :  * After the iocb has been issued, it's safe to be found on the poll list.
    1659             :  * Adding the kiocb to the list AFTER submission ensures that we don't
    1660             :  * find it from a io_do_iopoll() thread before the issuer is done
    1661             :  * accessing the kiocb cookie.
    1662             :  */
    1663           0 : static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
    1664             : {
    1665           0 :         struct io_ring_ctx *ctx = req->ctx;
    1666           0 :         const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
    1667             : 
    1668             :         /* workqueue context doesn't hold uring_lock, grab it now */
    1669           0 :         if (unlikely(needs_lock))
    1670           0 :                 mutex_lock(&ctx->uring_lock);
    1671             : 
    1672             :         /*
     1673             :          * Track whether we have multiple files in our lists. This will impact
     1674             :          * how we do polling eventually: we don't spin if we may be polling
     1675             :          * potentially different devices.
    1676             :          */
    1677           0 :         if (wq_list_empty(&ctx->iopoll_list)) {
    1678           0 :                 ctx->poll_multi_queue = false;
    1679           0 :         } else if (!ctx->poll_multi_queue) {
    1680             :                 struct io_kiocb *list_req;
    1681             : 
    1682           0 :                 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
    1683             :                                         comp_list);
    1684           0 :                 if (list_req->file != req->file)
    1685           0 :                         ctx->poll_multi_queue = true;
    1686             :         }
    1687             : 
    1688             :         /*
    1689             :          * For fast devices, IO may have already completed. If it has, add
    1690             :          * it to the front so we find it first.
    1691             :          */
    1692           0 :         if (READ_ONCE(req->iopoll_completed))
    1693           0 :                 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
    1694             :         else
    1695           0 :                 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
    1696             : 
    1697           0 :         if (unlikely(needs_lock)) {
    1698             :                 /*
     1699             :                  * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
     1700             :                  * in the sq thread's task context or in an io worker's task
     1701             :                  * context. If the current task context is the sq thread, we
     1702             :                  * don't need to check whether we should wake it up.
    1703             :                  */
    1704           0 :                 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
    1705           0 :                     wq_has_sleeper(&ctx->sq_data->wait))
    1706           0 :                         wake_up(&ctx->sq_data->wait);
    1707             : 
    1708           0 :                 mutex_unlock(&ctx->uring_lock);
    1709             :         }
    1710           0 : }
    1711             : 
    1712             : static bool io_bdev_nowait(struct block_device *bdev)
    1713             : {
    1714           0 :         return !bdev || bdev_nowait(bdev);
    1715             : }
    1716             : 
    1717             : /*
    1718             :  * If we tracked the file through the SCM inflight mechanism, we could support
    1719             :  * any file. For now, just ensure that anything potentially problematic is done
    1720             :  * inline.
    1721             :  */
    1722           0 : static bool __io_file_supports_nowait(struct file *file, umode_t mode)
    1723             : {
    1724           0 :         if (S_ISBLK(mode)) {
    1725           0 :                 if (IS_ENABLED(CONFIG_BLOCK) &&
    1726           0 :                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
    1727             :                         return true;
    1728           0 :                 return false;
    1729             :         }
    1730           0 :         if (S_ISSOCK(mode))
    1731             :                 return true;
    1732           0 :         if (S_ISREG(mode)) {
    1733           0 :                 if (IS_ENABLED(CONFIG_BLOCK) &&
    1734           0 :                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
    1735           0 :                     !io_is_uring_fops(file))
    1736             :                         return true;
    1737             :                 return false;
    1738             :         }
    1739             : 
    1740             :         /* any ->read/write should understand O_NONBLOCK */
    1741           0 :         if (file->f_flags & O_NONBLOCK)
    1742             :                 return true;
    1743           0 :         return file->f_mode & FMODE_NOWAIT;
    1744             : }
    1745             : 
     1746             : /*
     1747             :  * Compute the FFS_* flag bits that get cached alongside a file pointer:
     1748             :  * whether the file is a regular file, and whether it supports non-blocking
     1749             :  * (NOWAIT) IO.
     1750             :  */
    1751           0 : unsigned int io_file_get_flags(struct file *file)
    1752             : {
    1753           0 :         umode_t mode = file_inode(file)->i_mode;
    1754           0 :         unsigned int res = 0;
    1755             : 
    1756           0 :         if (S_ISREG(mode))
    1757           0 :                 res |= FFS_ISREG;
    1758           0 :         if (__io_file_supports_nowait(file, mode))
    1759           0 :                 res |= FFS_NOWAIT;
    1760           0 :         return res;
    1761             : }
    1762             : 
    1763           0 : bool io_alloc_async_data(struct io_kiocb *req)
    1764             : {
    1765           0 :         WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size);
    1766           0 :         req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL);
    1767           0 :         if (req->async_data) {
    1768           0 :                 req->flags |= REQ_F_ASYNC_DATA;
    1769           0 :                 return false;
    1770             :         }
    1771             :         return true;
    1772             : }
    1773             : 
    1774           0 : int io_req_prep_async(struct io_kiocb *req)
    1775             : {
    1776           0 :         const struct io_cold_def *cdef = &io_cold_defs[req->opcode];
    1777           0 :         const struct io_issue_def *def = &io_issue_defs[req->opcode];
    1778             : 
    1779             :         /* assign early for deferred execution for non-fixed file */
    1780           0 :         if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
    1781           0 :                 req->file = io_file_get_normal(req, req->cqe.fd);
    1782           0 :         if (!cdef->prep_async)
    1783             :                 return 0;
    1784           0 :         if (WARN_ON_ONCE(req_has_async_data(req)))
    1785             :                 return -EFAULT;
    1786           0 :         if (!def->manual_alloc) {
    1787           0 :                 if (io_alloc_async_data(req))
    1788             :                         return -EAGAIN;
    1789             :         }
    1790           0 :         return cdef->prep_async(req);
    1791             : }
    1792             : 
    1793             : static u32 io_get_sequence(struct io_kiocb *req)
    1794             : {
    1795           0 :         u32 seq = req->ctx->cached_sq_head;
    1796             :         struct io_kiocb *cur;
    1797             : 
     1798             :         /* need the original cached_sq_head, but it was incremented once per linked req */
    1799           0 :         io_for_each_link(cur, req)
    1800           0 :                 seq--;
    1801             :         return seq;
    1802             : }
    1803             : 
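Since ctx->cached_sq_head was bumped once for every SQE in the link chain, walking the chain and decrementing recovers the sequence number of the chain's head; for example, a 3-request link whose submission advanced cached_sq_head to 10 has sequence 7. A standalone sketch of that arithmetic, where struct req and get_sequence() are simplified stand-ins:

#include <stdio.h>

struct req {
	struct req *link;
};

/* Same arithmetic as io_get_sequence(): one decrement per request in
 * the link chain, starting from the current cached head. */
static unsigned int get_sequence(unsigned int cached_sq_head,
				 const struct req *req)
{
	unsigned int seq = cached_sq_head;

	for (const struct req *cur = req; cur; cur = cur->link)
		seq--;
	return seq;
}

int main(void)
{
	struct req c = { NULL }, b = { &c }, a = { &b };

	printf("%u\n", get_sequence(10, &a));	/* prints 7 */
	return 0;
}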
    1804           0 : static __cold void io_drain_req(struct io_kiocb *req)
    1805             :         __must_hold(&ctx->uring_lock)
    1806             : {
    1807           0 :         struct io_ring_ctx *ctx = req->ctx;
    1808             :         struct io_defer_entry *de;
    1809             :         int ret;
    1810           0 :         u32 seq = io_get_sequence(req);
    1811             : 
     1812             :         /* Still need to defer if there are pending reqs in the defer list. */
    1813           0 :         spin_lock(&ctx->completion_lock);
    1814           0 :         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
    1815           0 :                 spin_unlock(&ctx->completion_lock);
    1816             : queue:
    1817           0 :                 ctx->drain_active = false;
    1818             :                 io_req_task_queue(req);
    1819             :                 return;
    1820             :         }
    1821           0 :         spin_unlock(&ctx->completion_lock);
    1822             : 
    1823           0 :         io_prep_async_link(req);
    1824           0 :         de = kmalloc(sizeof(*de), GFP_KERNEL);
    1825           0 :         if (!de) {
    1826           0 :                 ret = -ENOMEM;
    1827           0 :                 io_req_defer_failed(req, ret);
    1828           0 :                 return;
    1829             :         }
    1830             : 
    1831           0 :         spin_lock(&ctx->completion_lock);
    1832           0 :         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
    1833           0 :                 spin_unlock(&ctx->completion_lock);
    1834           0 :                 kfree(de);
    1835           0 :                 goto queue;
    1836             :         }
    1837             : 
    1838           0 :         trace_io_uring_defer(req);
    1839           0 :         de->req = req;
    1840           0 :         de->seq = seq;
    1841           0 :         list_add_tail(&de->list, &ctx->defer_list);
    1842           0 :         spin_unlock(&ctx->completion_lock);
    1843             : }
    1844             : 
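io_drain_req() follows the classic "check under the lock, allocate with the lock dropped, then re-take the lock and re-check" pattern, since the allocation must not happen under ->completion_lock and the defer condition can change while the lock is released. A user-space sketch of that pattern; maybe_defer(), must_defer and struct entry are invented names for illustration:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool must_defer = true;		/* stand-in for the defer-list check */

struct entry {
	int payload;
};

/* Returns 1 if deferred, 0 if it can run now, -1 on allocation failure. */
static int maybe_defer(int payload)
{
	struct entry *e;

	pthread_mutex_lock(&lock);
	if (!must_defer) {
		pthread_mutex_unlock(&lock);
		return 0;			/* fast path: run it now */
	}
	pthread_mutex_unlock(&lock);

	e = malloc(sizeof(*e));			/* allocate outside the lock */
	if (!e)
		return -1;

	pthread_mutex_lock(&lock);
	if (!must_defer) {			/* re-check: it may have changed */
		pthread_mutex_unlock(&lock);
		free(e);
		return 0;
	}
	e->payload = payload;
	/* ... add e to a defer list here ... */
	pthread_mutex_unlock(&lock);
	return 1;
}

int main(void)
{
	return maybe_defer(42) == 1 ? 0 : 1;
}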
    1845           0 : static void io_clean_op(struct io_kiocb *req)
    1846             : {
    1847           0 :         if (req->flags & REQ_F_BUFFER_SELECTED) {
    1848           0 :                 spin_lock(&req->ctx->completion_lock);
    1849           0 :                 io_put_kbuf_comp(req);
    1850           0 :                 spin_unlock(&req->ctx->completion_lock);
    1851             :         }
    1852             : 
    1853           0 :         if (req->flags & REQ_F_NEED_CLEANUP) {
    1854           0 :                 const struct io_cold_def *def = &io_cold_defs[req->opcode];
    1855             : 
    1856           0 :                 if (def->cleanup)
    1857           0 :                         def->cleanup(req);
    1858             :         }
    1859           0 :         if ((req->flags & REQ_F_POLLED) && req->apoll) {
    1860           0 :                 kfree(req->apoll->double_poll);
    1861           0 :                 kfree(req->apoll);
    1862           0 :                 req->apoll = NULL;
    1863             :         }
    1864           0 :         if (req->flags & REQ_F_INFLIGHT) {
    1865           0 :                 struct io_uring_task *tctx = req->task->io_uring;
    1866             : 
    1867           0 :                 atomic_dec(&tctx->inflight_tracked);
    1868             :         }
    1869           0 :         if (req->flags & REQ_F_CREDS)
    1870           0 :                 put_cred(req->creds);
    1871           0 :         if (req->flags & REQ_F_ASYNC_DATA) {
    1872           0 :                 kfree(req->async_data);
    1873           0 :                 req->async_data = NULL;
    1874             :         }
    1875           0 :         req->flags &= ~IO_REQ_CLEAN_FLAGS;
    1876           0 : }
    1877             : 
    1878           0 : static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
    1879             :                            unsigned int issue_flags)
    1880             : {
    1881           0 :         if (req->file || !def->needs_file)
    1882             :                 return true;
    1883             : 
    1884           0 :         if (req->flags & REQ_F_FIXED_FILE)
    1885           0 :                 req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
    1886             :         else
    1887           0 :                 req->file = io_file_get_normal(req, req->cqe.fd);
    1888             : 
    1889           0 :         return !!req->file;
    1890             : }
    1891             : 
    1892           0 : static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
    1893             : {
    1894           0 :         const struct io_issue_def *def = &io_issue_defs[req->opcode];
    1895           0 :         const struct cred *creds = NULL;
    1896             :         int ret;
    1897             : 
    1898           0 :         if (unlikely(!io_assign_file(req, def, issue_flags)))
    1899             :                 return -EBADF;
    1900             : 
    1901           0 :         if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
    1902           0 :                 creds = override_creds(req->creds);
    1903             : 
    1904           0 :         if (!def->audit_skip)
    1905             :                 audit_uring_entry(req->opcode);
    1906             : 
    1907           0 :         ret = def->issue(req, issue_flags);
    1908             : 
    1909             :         if (!def->audit_skip)
    1910             :                 audit_uring_exit(!ret, ret);
    1911             : 
    1912           0 :         if (creds)
    1913           0 :                 revert_creds(creds);
    1914             : 
    1915           0 :         if (ret == IOU_OK) {
    1916           0 :                 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
    1917             :                         io_req_complete_defer(req);
    1918             :                 else
    1919           0 :                         io_req_complete_post(req, issue_flags);
    1920           0 :         } else if (ret != IOU_ISSUE_SKIP_COMPLETE)
    1921             :                 return ret;
    1922             : 
    1923             :         /* If the op doesn't have a file, we're not polling for it */
    1924           0 :         if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
    1925           0 :                 io_iopoll_req_issued(req, issue_flags);
    1926             : 
    1927             :         return 0;
    1928             : }
    1929             : 
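io_issue_sqe() brackets the actual issue with optional state overrides (credentials, audit enter/exit) that are always undone afterwards, whatever ->issue() returned. A minimal sketch of that override/restore bracket, with a plain global standing in for per-task credentials; current_cred_id, issue_with_creds() and dummy_op() are illustrative names, not kernel APIs:

#include <stdio.h>

static int current_cred_id = 1000;	/* stand-in for current_cred() */

static int override_cred_id(int new_id)
{
	int old = current_cred_id;

	current_cred_id = new_id;
	return old;
}

/* Override before the operation only when needed, and always restore
 * afterwards, regardless of the operation's result. */
static int issue_with_creds(int (*op)(void), int cred_id)
{
	int overridden = (cred_id != current_cred_id);
	int old = 0, ret;

	if (overridden)
		old = override_cred_id(cred_id);

	ret = op();

	if (overridden)
		override_cred_id(old);
	return ret;
}

static int dummy_op(void)
{
	return printf("running as %d\n", current_cred_id);
}

int main(void)
{
	issue_with_creds(dummy_op, 0);
	printf("restored to %d\n", current_cred_id);
	return 0;
}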
    1930           0 : int io_poll_issue(struct io_kiocb *req, bool *locked)
    1931             : {
    1932           0 :         io_tw_lock(req->ctx, locked);
    1933           0 :         return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT|
    1934             :                                  IO_URING_F_COMPLETE_DEFER);
    1935             : }
    1936             : 
    1937           0 : struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
    1938             : {
    1939           0 :         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
    1940             : 
    1941           0 :         req = io_put_req_find_next(req);
    1942           0 :         return req ? &req->work : NULL;
    1943             : }
    1944             : 
    1945           0 : void io_wq_submit_work(struct io_wq_work *work)
    1946             : {
    1947           0 :         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
    1948           0 :         const struct io_issue_def *def = &io_issue_defs[req->opcode];
    1949           0 :         unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ;
    1950           0 :         bool needs_poll = false;
    1951           0 :         int ret = 0, err = -ECANCELED;
    1952             : 
    1953             :         /* one will be dropped by ->io_wq_free_work() after returning to io-wq */
    1954           0 :         if (!(req->flags & REQ_F_REFCOUNT))
    1955             :                 __io_req_set_refcount(req, 2);
    1956             :         else
    1957           0 :                 req_ref_get(req);
    1958             : 
    1959           0 :         io_arm_ltimeout(req);
    1960             : 
    1961             :         /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
    1962           0 :         if (work->flags & IO_WQ_WORK_CANCEL) {
    1963             : fail:
    1964             :                 io_req_task_queue_fail(req, err);
    1965             :                 return;
    1966             :         }
    1967           0 :         if (!io_assign_file(req, def, issue_flags)) {
    1968           0 :                 err = -EBADF;
    1969           0 :                 work->flags |= IO_WQ_WORK_CANCEL;
    1970           0 :                 goto fail;
    1971             :         }
    1972             : 
    1973           0 :         if (req->flags & REQ_F_FORCE_ASYNC) {
    1974           0 :                 bool opcode_poll = def->pollin || def->pollout;
    1975             : 
    1976           0 :                 if (opcode_poll && file_can_poll(req->file)) {
    1977           0 :                         needs_poll = true;
    1978           0 :                         issue_flags |= IO_URING_F_NONBLOCK;
    1979             :                 }
    1980             :         }
    1981             : 
    1982             :         do {
    1983           0 :                 ret = io_issue_sqe(req, issue_flags);
    1984           0 :                 if (ret != -EAGAIN)
    1985             :                         break;
    1986             :                 /*
    1987             :                  * We can get EAGAIN for iopolled IO even though we're
    1988             :                  * forcing a sync submission from here, since we can't
    1989             :                  * wait for request slots on the block side.
    1990             :                  */
    1991           0 :                 if (!needs_poll) {
    1992           0 :                         if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
    1993             :                                 break;
    1994           0 :                         cond_resched();
    1995           0 :                         continue;
    1996             :                 }
    1997             : 
    1998           0 :                 if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
    1999             :                         return;
    2000             :                 /* aborted or ready, in either case retry blocking */
    2001             :                 needs_poll = false;
    2002             :                 issue_flags &= ~IO_URING_F_NONBLOCK;
    2003             :         } while (1);
    2004             : 
    2005             :         /* avoid locking problems by failing it from a clean context */
    2006           0 :         if (ret < 0)
    2007             :                 io_req_task_queue_fail(req, ret);
    2008             : }
    2009             : 
    2010           0 : inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
    2011             :                                       unsigned int issue_flags)
    2012             : {
    2013           0 :         struct io_ring_ctx *ctx = req->ctx;
    2014           0 :         struct file *file = NULL;
    2015             :         unsigned long file_ptr;
    2016             : 
    2017           0 :         io_ring_submit_lock(ctx, issue_flags);
    2018             : 
    2019           0 :         if (unlikely((unsigned int)fd >= ctx->nr_user_files))
    2020             :                 goto out;
    2021           0 :         fd = array_index_nospec(fd, ctx->nr_user_files);
    2022           0 :         file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
    2023           0 :         file = (struct file *) (file_ptr & FFS_MASK);
    2024           0 :         file_ptr &= ~FFS_MASK;
    2025             :         /* mask in overlapping REQ_F and FFS bits */
    2026           0 :         req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
    2027           0 :         io_req_set_rsrc_node(req, ctx, 0);
    2028             : out:
    2029           0 :         io_ring_submit_unlock(ctx, issue_flags);
    2030           0 :         return file;
    2031             : }
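
/*
 * Illustrative userspace sketch (not part of io_uring.c), assuming liburing:
 * the fixed-file lookup above services IOSQE_FIXED_FILE requests, for which
 * sqe->fd is an index into the table registered below rather than a normal
 * file descriptor.
 */
#include <errno.h>
#include <liburing.h>

static int read_via_fixed_slot(struct io_uring *ring, int *fds, unsigned int nr_fds,
			       void *buf, unsigned int len)
{
	struct io_uring_sqe *sqe;
	int ret;

	/* publish the descriptors into the ring's fixed file table */
	ret = io_uring_register_files(ring, fds, nr_fds);
	if (ret < 0)
		return ret;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EAGAIN;
	/* 0 here is the table slot, not an fd, because of IOSQE_FIXED_FILE */
	io_uring_prep_read(sqe, 0, buf, len, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE);
	return io_uring_submit(ring);
}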
    2032             : 
    2033           0 : struct file *io_file_get_normal(struct io_kiocb *req, int fd)
    2034             : {
    2035           0 :         struct file *file = fget(fd);
    2036             : 
    2037           0 :         trace_io_uring_file_get(req, fd);
    2038             : 
    2039             :         /* we don't allow fixed io_uring files */
    2040           0 :         if (file && io_is_uring_fops(file))
    2041           0 :                 io_req_track_inflight(req);
    2042           0 :         return file;
    2043             : }
    2044             : 
    2045           0 : static void io_queue_async(struct io_kiocb *req, int ret)
    2046             :         __must_hold(&req->ctx->uring_lock)
    2047             : {
    2048             :         struct io_kiocb *linked_timeout;
    2049             : 
    2050           0 :         if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
    2051           0 :                 io_req_defer_failed(req, ret);
    2052           0 :                 return;
    2053             :         }
    2054             : 
    2055           0 :         linked_timeout = io_prep_linked_timeout(req);
    2056             : 
    2057           0 :         switch (io_arm_poll_handler(req, 0)) {
    2058             :         case IO_APOLL_READY:
    2059           0 :                 io_kbuf_recycle(req, 0);
    2060             :                 io_req_task_queue(req);
    2061             :                 break;
    2062             :         case IO_APOLL_ABORTED:
    2063           0 :                 io_kbuf_recycle(req, 0);
    2064           0 :                 io_queue_iowq(req, NULL);
    2065           0 :                 break;
    2066             :         case IO_APOLL_OK:
    2067             :                 break;
    2068             :         }
    2069             : 
    2070           0 :         if (linked_timeout)
    2071           0 :                 io_queue_linked_timeout(linked_timeout);
    2072             : }
    2073             : 
    2074           0 : static inline void io_queue_sqe(struct io_kiocb *req)
    2075             :         __must_hold(&req->ctx->uring_lock)
    2076             : {
    2077             :         int ret;
    2078             : 
    2079           0 :         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
    2080             : 
    2081             :         /*
    2082             :          * We async punt it if the file wasn't marked NOWAIT, or if the file
    2083             :          * doesn't support non-blocking read/write attempts
    2084             :          */
    2085           0 :         if (likely(!ret))
    2086             :                 io_arm_ltimeout(req);
    2087             :         else
    2088           0 :                 io_queue_async(req, ret);
    2089           0 : }
    2090             : 
    2091           0 : static void io_queue_sqe_fallback(struct io_kiocb *req)
    2092             :         __must_hold(&req->ctx->uring_lock)
    2093             : {
    2094           0 :         if (unlikely(req->flags & REQ_F_FAIL)) {
    2095             :                 /*
    2096             :                  * We don't submit; fail them all. For that, replace hardlinks
    2097             :                  * with normal links. An extra REQ_F_LINK is tolerated.
    2098             :                  */
    2099           0 :                 req->flags &= ~REQ_F_HARDLINK;
    2100           0 :                 req->flags |= REQ_F_LINK;
    2101           0 :                 io_req_defer_failed(req, req->cqe.res);
    2102             :         } else {
    2103           0 :                 int ret = io_req_prep_async(req);
    2104             : 
    2105           0 :                 if (unlikely(ret)) {
    2106           0 :                         io_req_defer_failed(req, ret);
    2107           0 :                         return;
    2108             :                 }
    2109             : 
    2110           0 :                 if (unlikely(req->ctx->drain_active))
    2111           0 :                         io_drain_req(req);
    2112             :                 else
    2113           0 :                         io_queue_iowq(req, NULL);
    2114             :         }
    2115             : }
    2116             : 
    2117             : /*
    2118             :  * Check SQE restrictions (opcode and flags).
    2119             :  *
    2120             :  * Returns 'true' if SQE is allowed, 'false' otherwise.
    2121             :  */
    2122           0 : static inline bool io_check_restriction(struct io_ring_ctx *ctx,
    2123             :                                         struct io_kiocb *req,
    2124             :                                         unsigned int sqe_flags)
    2125             : {
    2126           0 :         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
    2127             :                 return false;
    2128             : 
    2129           0 :         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
    2130             :             ctx->restrictions.sqe_flags_required)
    2131             :                 return false;
    2132             : 
    2133           0 :         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
    2134             :                           ctx->restrictions.sqe_flags_required))
    2135             :                 return false;
    2136             : 
    2137             :         return true;
    2138             : }
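
/*
 * Illustrative userspace sketch (not part of io_uring.c), assuming liburing:
 * the restriction check above is configured via IORING_REGISTER_RESTRICTIONS
 * on a ring created with IORING_SETUP_R_DISABLED. In this sketch only NOP
 * and READ pass io_check_restriction(); everything else fails with -EACCES.
 */
#include <liburing.h>

static int setup_restricted_ring(struct io_uring *ring)
{
	struct io_uring_restriction res[] = {
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_NOP },
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READ },
	};
	int ret;

	/* start disabled so the restrictions can be installed first */
	ret = io_uring_queue_init(8, ring, IORING_SETUP_R_DISABLED);
	if (ret < 0)
		return ret;
	ret = io_uring_register_restrictions(ring, res, 2);
	if (ret < 0)
		return ret;
	/* submissions are only accepted once the ring is enabled */
	return io_uring_enable_rings(ring);
}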
    2139             : 
    2140             : static void io_init_req_drain(struct io_kiocb *req)
    2141             : {
    2142           0 :         struct io_ring_ctx *ctx = req->ctx;
    2143           0 :         struct io_kiocb *head = ctx->submit_state.link.head;
    2144             : 
    2145           0 :         ctx->drain_active = true;
    2146           0 :         if (head) {
    2147             :                 /*
    2148             :                  * If we need to drain a request in the middle of a link, drain
    2149             :                  * the head request and the next request/link after the current
    2150             :                  * link. Considering sequential execution of links,
    2151             :                  * REQ_F_IO_DRAIN will be maintained for every request of our
    2152             :                  * link.
    2153             :                  */
    2154           0 :                 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
    2155           0 :                 ctx->drain_next = true;
    2156             :         }
    2157             : }
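
/*
 * Illustrative userspace sketch (not part of io_uring.c), assuming liburing:
 * setting IOSQE_IO_DRAIN is what triggers the drain handling above; the
 * fsync below is only issued once every previously submitted request has
 * completed.
 */
#include <errno.h>
#include <liburing.h>

static int queue_draining_fsync(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EAGAIN;
	io_uring_prep_fsync(sqe, fd, 0);
	/* wait for all prior SQEs to complete before issuing this one */
	io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);
	return io_uring_submit(ring);
}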
    2158             : 
    2159           0 : static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
    2160             :                        const struct io_uring_sqe *sqe)
    2161             :         __must_hold(&ctx->uring_lock)
    2162             : {
    2163             :         const struct io_issue_def *def;
    2164             :         unsigned int sqe_flags;
    2165             :         int personality;
    2166             :         u8 opcode;
    2167             : 
    2168             :         /* req is partially pre-initialised, see io_preinit_req() */
    2169           0 :         req->opcode = opcode = READ_ONCE(sqe->opcode);
    2170             :         /* same numerical values with corresponding REQ_F_*, safe to copy */
    2171           0 :         req->flags = sqe_flags = READ_ONCE(sqe->flags);
    2172           0 :         req->cqe.user_data = READ_ONCE(sqe->user_data);
    2173           0 :         req->file = NULL;
    2174           0 :         req->rsrc_node = NULL;
    2175           0 :         req->task = current;
    2176             : 
    2177           0 :         if (unlikely(opcode >= IORING_OP_LAST)) {
    2178           0 :                 req->opcode = 0;
    2179           0 :                 return -EINVAL;
    2180             :         }
    2181           0 :         def = &io_issue_defs[opcode];
    2182           0 :         if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
    2183             :                 /* enforce forwards compatibility on users */
    2184           0 :                 if (sqe_flags & ~SQE_VALID_FLAGS)
    2185             :                         return -EINVAL;
    2186           0 :                 if (sqe_flags & IOSQE_BUFFER_SELECT) {
    2187           0 :                         if (!def->buffer_select)
    2188             :                                 return -EOPNOTSUPP;
    2189           0 :                         req->buf_index = READ_ONCE(sqe->buf_group);
    2190             :                 }
    2191           0 :                 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
    2192           0 :                         ctx->drain_disabled = true;
    2193           0 :                 if (sqe_flags & IOSQE_IO_DRAIN) {
    2194           0 :                         if (ctx->drain_disabled)
    2195             :                                 return -EOPNOTSUPP;
    2196           0 :                         io_init_req_drain(req);
    2197             :                 }
    2198             :         }
    2199           0 :         if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
    2200           0 :                 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
    2201             :                         return -EACCES;
    2202             :                 /* knock it to the slow queue path, will be drained there */
    2203           0 :                 if (ctx->drain_active)
    2204           0 :                         req->flags |= REQ_F_FORCE_ASYNC;
    2205             :                 /* if there is no link, we're at "next" request and need to drain */
    2206           0 :                 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
    2207           0 :                         ctx->drain_next = false;
    2208           0 :                         ctx->drain_active = true;
    2209           0 :                         req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
    2210             :                 }
    2211             :         }
    2212             : 
    2213           0 :         if (!def->ioprio && sqe->ioprio)
    2214             :                 return -EINVAL;
    2215           0 :         if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
    2216             :                 return -EINVAL;
    2217             : 
    2218           0 :         if (def->needs_file) {
    2219           0 :                 struct io_submit_state *state = &ctx->submit_state;
    2220             : 
    2221           0 :                 req->cqe.fd = READ_ONCE(sqe->fd);
    2222             : 
    2223             :                 /*
    2224             :                  * Plug now if we have more than 2 IO left after this, and the
    2225             :                  * target is potentially a read/write to block based storage.
    2226             :                  */
    2227           0 :                 if (state->need_plug && def->plug) {
    2228           0 :                         state->plug_started = true;
    2229           0 :                         state->need_plug = false;
    2230           0 :                         blk_start_plug_nr_ios(&state->plug, state->submit_nr);
    2231             :                 }
    2232             :         }
    2233             : 
    2234           0 :         personality = READ_ONCE(sqe->personality);
    2235           0 :         if (personality) {
    2236             :                 int ret;
    2237             : 
    2238           0 :                 req->creds = xa_load(&ctx->personalities, personality);
    2239           0 :                 if (!req->creds)
    2240             :                         return -EINVAL;
    2241           0 :                 get_cred(req->creds);
    2242           0 :                 ret = security_uring_override_creds(req->creds);
    2243             :                 if (ret) {
    2244             :                         put_cred(req->creds);
    2245             :                         return ret;
    2246             :                 }
    2247           0 :                 req->flags |= REQ_F_CREDS;
    2248             :         }
    2249             : 
    2250           0 :         return def->prep(req, sqe);
    2251             : }
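
/*
 * Illustrative userspace sketch (not part of io_uring.c), assuming liburing:
 * the IOSQE_BUFFER_SELECT handling above expects a buffer group id in
 * sqe->buf_group. The group id (1) and sizes are arbitrary example values;
 * the provide-buffers SQE is queued ahead of the read in the same batch so
 * the group exists by the time the read is issued.
 */
#include <errno.h>
#include <liburing.h>

#define EXAMPLE_BGID	1

static int read_with_selected_buffer(struct io_uring *ring, int fd, void *pool,
				     unsigned int buf_len, unsigned int nr_bufs)
{
	struct io_uring_sqe *sqe;

	/* hand a pool of equally sized buffers to the kernel as group 1 */
	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EAGAIN;
	io_uring_prep_provide_buffers(sqe, pool, buf_len, nr_bufs,
				      EXAMPLE_BGID, 0);

	/* the read picks any free buffer from the group at issue time */
	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EAGAIN;
	io_uring_prep_read(sqe, fd, NULL, buf_len, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_BUFFER_SELECT);
	sqe->buf_group = EXAMPLE_BGID;

	return io_uring_submit(ring);
}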
    2252             : 
    2253           0 : static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
    2254             :                                       struct io_kiocb *req, int ret)
    2255             : {
    2256           0 :         struct io_ring_ctx *ctx = req->ctx;
    2257           0 :         struct io_submit_link *link = &ctx->submit_state.link;
    2258           0 :         struct io_kiocb *head = link->head;
    2259             : 
    2260           0 :         trace_io_uring_req_failed(sqe, req, ret);
    2261             : 
    2262             :         /*
    2263             :          * Avoid breaking links in the middle as it renders links with SQPOLL
    2264             :          * unusable. Instead of failing eagerly, continue assembling the link if
    2265             :          * applicable and mark the head with REQ_F_FAIL. The link flushing code
    2266             :          * should find the flag and handle the rest.
    2267             :          */
    2268           0 :         req_fail_link_node(req, ret);
    2269           0 :         if (head && !(head->flags & REQ_F_FAIL))
    2270             :                 req_fail_link_node(head, -ECANCELED);
    2271             : 
    2272           0 :         if (!(req->flags & IO_REQ_LINK_FLAGS)) {
    2273           0 :                 if (head) {
    2274           0 :                         link->last->link = req;
    2275           0 :                         link->head = NULL;
    2276           0 :                         req = head;
    2277             :                 }
    2278           0 :                 io_queue_sqe_fallback(req);
    2279             :                 return ret;
    2280             :         }
    2281             : 
    2282           0 :         if (head)
    2283           0 :                 link->last->link = req;
    2284             :         else
    2285           0 :                 link->head = req;
    2286           0 :         link->last = req;
    2287             :         return 0;
    2288             : }
    2289             : 
    2290           0 : static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
    2291             :                          const struct io_uring_sqe *sqe)
    2292             :         __must_hold(&ctx->uring_lock)
    2293             : {
    2294           0 :         struct io_submit_link *link = &ctx->submit_state.link;
    2295             :         int ret;
    2296             : 
    2297           0 :         ret = io_init_req(ctx, req, sqe);
    2298           0 :         if (unlikely(ret))
    2299           0 :                 return io_submit_fail_init(sqe, req, ret);
    2300             : 
    2301             :         /* don't need @sqe from now on */
    2302           0 :         trace_io_uring_submit_sqe(req, true);
    2303             : 
    2304             :         /*
    2305             :          * If we already have a head request, queue this one for async
    2306             :          * submission once the head completes. If we don't have a head but
    2307             :          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
    2308             :          * submitted sync once the chain is complete. If none of those
    2309             :          * conditions are true (normal request), then just queue it.
    2310             :          */
    2311           0 :         if (unlikely(link->head)) {
    2312           0 :                 ret = io_req_prep_async(req);
    2313           0 :                 if (unlikely(ret))
    2314           0 :                         return io_submit_fail_init(sqe, req, ret);
    2315             : 
    2316           0 :                 trace_io_uring_link(req, link->head);
    2317           0 :                 link->last->link = req;
    2318           0 :                 link->last = req;
    2319             : 
    2320           0 :                 if (req->flags & IO_REQ_LINK_FLAGS)
    2321             :                         return 0;
    2322             :                 /* last request of the link, flush it */
    2323           0 :                 req = link->head;
    2324           0 :                 link->head = NULL;
    2325           0 :                 if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
    2326             :                         goto fallback;
    2327             : 
    2328           0 :         } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
    2329             :                                           REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
    2330           0 :                 if (req->flags & IO_REQ_LINK_FLAGS) {
    2331           0 :                         link->head = req;
    2332           0 :                         link->last = req;
    2333             :                 } else {
    2334             : fallback:
    2335           0 :                         io_queue_sqe_fallback(req);
    2336             :                 }
    2337             :                 return 0;
    2338             :         }
    2339             : 
    2340           0 :         io_queue_sqe(req);
    2341           0 :         return 0;
    2342             : }
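
/*
 * Illustrative userspace sketch (not part of io_uring.c), assuming liburing:
 * the link assembly above corresponds to chaining SQEs with IOSQE_IO_LINK.
 * The write only starts after the read completes; if the read fails, the
 * rest of the chain is cancelled with -ECANCELED.
 */
#include <errno.h>
#include <liburing.h>

static int queue_read_then_write(struct io_uring *ring, int in_fd, int out_fd,
				 void *buf, unsigned int len)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EAGAIN;
	io_uring_prep_read(sqe, in_fd, buf, len, 0);
	/* not the last member of the chain, so keep the link flag set */
	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EAGAIN;
	/* last member: no IOSQE_IO_LINK, which terminates the chain */
	io_uring_prep_write(sqe, out_fd, buf, len, 0);

	return io_uring_submit(ring);
}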
    2343             : 
    2344             : /*
    2345             :  * Batched submission is done, ensure local IO is flushed out.
    2346             :  */
    2347           0 : static void io_submit_state_end(struct io_ring_ctx *ctx)
    2348             : {
    2349           0 :         struct io_submit_state *state = &ctx->submit_state;
    2350             : 
    2351           0 :         if (unlikely(state->link.head))
    2352           0 :                 io_queue_sqe_fallback(state->link.head);
    2353             :         /* flush only after queuing links as they can generate completions */
    2354           0 :         io_submit_flush_completions(ctx);
    2355           0 :         if (state->plug_started)
    2356           0 :                 blk_finish_plug(&state->plug);
    2357           0 : }
    2358             : 
    2359             : /*
    2360             :  * Start submission side cache.
    2361             :  */
    2362             : static void io_submit_state_start(struct io_submit_state *state,
    2363             :                                   unsigned int max_ios)
    2364             : {
    2365           0 :         state->plug_started = false;
    2366           0 :         state->need_plug = max_ios > 2;
    2367           0 :         state->submit_nr = max_ios;
    2368             :         /* set only head, no need to init link_last in advance */
    2369           0 :         state->link.head = NULL;
    2370             : }
    2371             : 
    2372             : static void io_commit_sqring(struct io_ring_ctx *ctx)
    2373             : {
    2374           0 :         struct io_rings *rings = ctx->rings;
    2375             : 
    2376             :         /*
    2377             :          * Ensure any loads from the SQEs are done at this point,
    2378             :          * since once we write the new head, the application could
    2379             :          * write new data to them.
    2380             :          */
    2381           0 :         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
    2382             : }
    2383             : 
    2384             : /*
    2385             :  * Fetch an sqe, if one is available. Note this returns a pointer to memory
    2386             :  * that is mapped by userspace. This means that care needs to be taken to
    2387             :  * ensure that reads are stable, as we cannot rely on userspace always
    2388             :  * being a good citizen. If members of the sqe are validated and then later
    2389             :  * used, it's important that those reads are done through READ_ONCE() to
    2390             :  * prevent a re-load down the line.
    2391             :  */
    2392           0 : static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
    2393             : {
    2394           0 :         unsigned head, mask = ctx->sq_entries - 1;
    2395           0 :         unsigned sq_idx = ctx->cached_sq_head++ & mask;
    2396             : 
    2397             :         /*
    2398             :          * The cached sq head (or cq tail) serves two purposes:
    2399             :          *
    2400             :          * 1) allows us to batch the cost of updating the user visible
    2401             :          *    head updates.
    2402             :          * 2) allows the kernel side to track the head on its own, even
    2403             :          *    though the application is the one updating it.
    2404             :          */
    2405           0 :         head = READ_ONCE(ctx->sq_array[sq_idx]);
    2406           0 :         if (likely(head < ctx->sq_entries)) {
    2407             :                 /* double index for 128-byte SQEs, twice as long */
    2408           0 :                 if (ctx->flags & IORING_SETUP_SQE128)
    2409           0 :                         head <<= 1;
    2410           0 :                 *sqe = &ctx->sq_sqes[head];
    2411           0 :                 return true;
    2412             :         }
    2413             : 
    2414             :         /* drop invalid entries */
    2415           0 :         ctx->cq_extra--;
    2416           0 :         WRITE_ONCE(ctx->rings->sq_dropped,
    2417             :                    READ_ONCE(ctx->rings->sq_dropped) + 1);
    2418           0 :         return false;
    2419             : }
    2420             : 
    2421           0 : int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
    2422             :         __must_hold(&ctx->uring_lock)
    2423             : {
    2424           0 :         unsigned int entries = io_sqring_entries(ctx);
    2425             :         unsigned int left;
    2426             :         int ret;
    2427             : 
    2428           0 :         if (unlikely(!entries))
    2429             :                 return 0;
    2430             :         /* make sure SQ entry isn't read before tail */
    2431           0 :         ret = left = min3(nr, ctx->sq_entries, entries);
    2432           0 :         io_get_task_refs(left);
    2433           0 :         io_submit_state_start(&ctx->submit_state, left);
    2434             : 
    2435             :         do {
    2436             :                 const struct io_uring_sqe *sqe;
    2437             :                 struct io_kiocb *req;
    2438             : 
    2439           0 :                 if (unlikely(!io_alloc_req(ctx, &req)))
    2440             :                         break;
    2441           0 :                 if (unlikely(!io_get_sqe(ctx, &sqe))) {
    2442           0 :                         io_req_add_to_cache(req, ctx);
    2443             :                         break;
    2444             :                 }
    2445             : 
    2446             :                 /*
    2447             :                  * Continue submitting even for sqe failure if the
    2448             :                  * ring was set up with IORING_SETUP_SUBMIT_ALL
    2449             :                  */
    2450           0 :                 if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
    2451           0 :                     !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
    2452           0 :                         left--;
    2453           0 :                         break;
    2454             :                 }
    2455           0 :         } while (--left);
    2456             : 
    2457           0 :         if (unlikely(left)) {
    2458           0 :                 ret -= left;
    2459             :                 /* try again if it submitted nothing and can't allocate a req */
    2460           0 :                 if (!ret && io_req_cache_empty(ctx))
    2461           0 :                         ret = -EAGAIN;
    2462           0 :                 current->io_uring->cached_refs += left;
    2463             :         }
    2464             : 
    2465           0 :         io_submit_state_end(ctx);
    2466             :          /* Commit SQ ring head once we've consumed and submitted all SQEs */
    2467           0 :         io_commit_sqring(ctx);
    2468           0 :         return ret;
    2469             : }
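
/*
 * Illustrative userspace note (not part of io_uring.c), assuming liburing:
 * without IORING_SETUP_SUBMIT_ALL the loop above stops at the first SQE that
 * fails to initialise, so io_uring_submit() can return less than the number
 * of queued SQEs; with the flag, the remaining SQEs are still submitted.
 */
#include <liburing.h>

static int init_submit_all_ring(struct io_uring *ring, unsigned int entries)
{
	/* keep submitting the rest of a batch even if one SQE fails early */
	return io_uring_queue_init(entries, ring, IORING_SETUP_SUBMIT_ALL);
}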
    2470             : 
    2471             : struct io_wait_queue {
    2472             :         struct wait_queue_entry wq;
    2473             :         struct io_ring_ctx *ctx;
    2474             :         unsigned cq_tail;
    2475             :         unsigned nr_timeouts;
    2476             :         ktime_t timeout;
    2477             : };
    2478             : 
    2479             : static inline bool io_has_work(struct io_ring_ctx *ctx)
    2480             : {
    2481           0 :         return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
    2482           0 :                !llist_empty(&ctx->work_llist);
    2483             : }
    2484             : 
    2485             : static inline bool io_should_wake(struct io_wait_queue *iowq)
    2486             : {
    2487           0 :         struct io_ring_ctx *ctx = iowq->ctx;
    2488           0 :         int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
    2489             : 
    2490             :         /*
    2491             :          * Wake up if we have enough events, or if a timeout occurred since we
    2492             :          * started waiting. For timeouts, we always want to return to userspace,
    2493             :          * regardless of event count.
    2494             :          */
    2495           0 :         return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
    2496             : }
    2497             : 
    2498           0 : static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
    2499             :                             int wake_flags, void *key)
    2500             : {
    2501           0 :         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);
    2502             : 
    2503             :         /*
    2504             :          * Cannot safely flush overflowed CQEs from here, ensure we wake up
    2505             :          * the task, and the next invocation will do it.
    2506             :          */
    2507           0 :         if (io_should_wake(iowq) || io_has_work(iowq->ctx))
    2508           0 :                 return autoremove_wake_function(curr, mode, wake_flags, key);
    2509             :         return -1;
    2510             : }
    2511             : 
    2512           0 : int io_run_task_work_sig(struct io_ring_ctx *ctx)
    2513             : {
    2514           0 :         if (!llist_empty(&ctx->work_llist)) {
    2515           0 :                 __set_current_state(TASK_RUNNING);
    2516           0 :                 if (io_run_local_work(ctx) > 0)
    2517             :                         return 1;
    2518             :         }
    2519           0 :         if (io_run_task_work() > 0)
    2520             :                 return 1;
    2521           0 :         if (task_sigpending(current))
    2522             :                 return -EINTR;
    2523           0 :         return 0;
    2524             : }
    2525             : 
    2526             : /* when this returns >0, the caller should retry */
    2527           0 : static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
    2528             :                                           struct io_wait_queue *iowq)
    2529             : {
    2530           0 :         if (unlikely(READ_ONCE(ctx->check_cq)))
    2531             :                 return 1;
    2532           0 :         if (unlikely(!llist_empty(&ctx->work_llist)))
    2533             :                 return 1;
    2534           0 :         if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
    2535             :                 return 1;
    2536           0 :         if (unlikely(task_sigpending(current)))
    2537             :                 return -EINTR;
    2538           0 :         if (unlikely(io_should_wake(iowq)))
    2539             :                 return 0;
    2540           0 :         if (iowq->timeout == KTIME_MAX)
    2541           0 :                 schedule();
    2542           0 :         else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
    2543             :                 return -ETIME;
    2544             :         return 0;
    2545             : }
    2546             : 
    2547             : /*
    2548             :  * Wait until events become available, if we don't already have some. The
    2549             :  * application must reap them itself, as they reside on the shared cq ring.
    2550             :  */
    2551           0 : static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
    2552             :                           const sigset_t __user *sig, size_t sigsz,
    2553             :                           struct __kernel_timespec __user *uts)
    2554             : {
    2555             :         struct io_wait_queue iowq;
    2556           0 :         struct io_rings *rings = ctx->rings;
    2557             :         int ret;
    2558             : 
    2559           0 :         if (!io_allowed_run_tw(ctx))
    2560             :                 return -EEXIST;
    2561           0 :         if (!llist_empty(&ctx->work_llist))
    2562           0 :                 io_run_local_work(ctx);
    2563           0 :         io_run_task_work();
    2564           0 :         io_cqring_overflow_flush(ctx);
    2565             :         /* if user messes with these they will just get an early return */
    2566           0 :         if (__io_cqring_events_user(ctx) >= min_events)
    2567             :                 return 0;
    2568             : 
    2569           0 :         if (sig) {
    2570             : #ifdef CONFIG_COMPAT
    2571             :                 if (in_compat_syscall())
    2572             :                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
    2573             :                                                       sigsz);
    2574             :                 else
    2575             : #endif
    2576           0 :                         ret = set_user_sigmask(sig, sigsz);
    2577             : 
    2578           0 :                 if (ret)
    2579             :                         return ret;
    2580             :         }
    2581             : 
    2582           0 :         init_waitqueue_func_entry(&iowq.wq, io_wake_function);
    2583           0 :         iowq.wq.private = current;
    2584           0 :         INIT_LIST_HEAD(&iowq.wq.entry);
    2585           0 :         iowq.ctx = ctx;
    2586           0 :         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
    2587           0 :         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
    2588           0 :         iowq.timeout = KTIME_MAX;
    2589             : 
    2590           0 :         if (uts) {
    2591             :                 struct timespec64 ts;
    2592             : 
    2593           0 :                 if (get_timespec64(&ts, uts))
    2594           0 :                         return -EFAULT;
    2595           0 :                 iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
    2596             :         }
    2597             : 
    2598             :         trace_io_uring_cqring_wait(ctx, min_events);
    2599           0 :         do {
    2600             :                 unsigned long check_cq;
    2601             : 
    2602           0 :                 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
    2603           0 :                         WRITE_ONCE(ctx->cq_waiting, 1);
    2604           0 :                         set_current_state(TASK_INTERRUPTIBLE);
    2605             :                 } else {
    2606           0 :                         prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
    2607             :                                                         TASK_INTERRUPTIBLE);
    2608             :                 }
    2609             : 
    2610           0 :                 ret = io_cqring_wait_schedule(ctx, &iowq);
    2611           0 :                 __set_current_state(TASK_RUNNING);
    2612           0 :                 WRITE_ONCE(ctx->cq_waiting, 0);
    2613             : 
    2614           0 :                 if (ret < 0)
    2615             :                         break;
    2616             :                 /*
    2617             :                  * Run task_work after scheduling and before io_should_wake().
    2618             :                  * If we got woken because of task_work being processed, run it
    2619             :                  * now rather than let the caller do another wait loop.
    2620             :                  */
    2621           0 :                 io_run_task_work();
    2622           0 :                 if (!llist_empty(&ctx->work_llist))
    2623           0 :                         io_run_local_work(ctx);
    2624             : 
    2625           0 :                 check_cq = READ_ONCE(ctx->check_cq);
    2626           0 :                 if (unlikely(check_cq)) {
    2627             :                         /* let the caller flush overflows, retry */
    2628           0 :                         if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
    2629           0 :                                 io_cqring_do_overflow_flush(ctx);
    2630           0 :                         if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
    2631             :                                 ret = -EBADR;
    2632             :                                 break;
    2633             :                         }
    2634             :                 }
    2635             : 
    2636           0 :                 if (io_should_wake(&iowq)) {
    2637             :                         ret = 0;
    2638             :                         break;
    2639             :                 }
    2640           0 :                 cond_resched();
    2641             :         } while (1);
    2642             : 
    2643           0 :         if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
    2644           0 :                 finish_wait(&ctx->cq_wait, &iowq.wq);
    2645           0 :         restore_saved_sigmask_unless(ret == -EINTR);
    2646             : 
    2647           0 :         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
    2648             : }
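
/*
 * Illustrative userspace sketch (not part of io_uring.c), assuming liburing:
 * io_uring_wait_cqes() roughly corresponds to the wait path above, passing
 * both the minimum completion count and a relative timeout; it returns
 * -ETIME when the timeout expires before enough CQEs arrive.
 */
#include <liburing.h>

static int wait_and_reap(struct io_uring *ring, unsigned int min_events)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_cqe *cqe;
	unsigned int head, seen = 0;
	int ret;

	ret = io_uring_wait_cqes(ring, &cqe, min_events, &ts, NULL);
	if (ret < 0)
		return ret;

	/* completions live on the shared CQ ring; reap whatever is there */
	io_uring_for_each_cqe(ring, head, cqe)
		seen++;
	io_uring_cq_advance(ring, seen);
	return seen;
}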
    2649             : 
    2650           0 : static void io_mem_free(void *ptr)
    2651             : {
    2652             :         struct page *page;
    2653             : 
    2654           0 :         if (!ptr)
    2655             :                 return;
    2656             : 
    2657           0 :         page = virt_to_head_page(ptr);
    2658           0 :         if (put_page_testzero(page))
    2659           0 :                 free_compound_page(page);
    2660             : }
    2661             : 
    2662           0 : static void *io_mem_alloc(size_t size)
    2663             : {
    2664           0 :         gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
    2665             : 
    2666           0 :         return (void *) __get_free_pages(gfp, get_order(size));
    2667             : }
    2668             : 
    2669           0 : static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
    2670             :                                 unsigned int cq_entries, size_t *sq_offset)
    2671             : {
    2672             :         struct io_rings *rings;
    2673             :         size_t off, sq_array_size;
    2674             : 
    2675           0 :         off = struct_size(rings, cqes, cq_entries);
    2676           0 :         if (off == SIZE_MAX)
    2677             :                 return SIZE_MAX;
    2678           0 :         if (ctx->flags & IORING_SETUP_CQE32) {
    2679           0 :                 if (check_shl_overflow(off, 1, &off))
    2680             :                         return SIZE_MAX;
    2681             :         }
    2682             : 
    2683             : #ifdef CONFIG_SMP
    2684             :         off = ALIGN(off, SMP_CACHE_BYTES);
    2685             :         if (off == 0)
    2686             :                 return SIZE_MAX;
    2687             : #endif
    2688             : 
    2689           0 :         if (sq_offset)
    2690           0 :                 *sq_offset = off;
    2691             : 
    2692           0 :         sq_array_size = array_size(sizeof(u32), sq_entries);
    2693           0 :         if (sq_array_size == SIZE_MAX)
    2694             :                 return SIZE_MAX;
    2695             : 
    2696           0 :         if (check_add_overflow(off, sq_array_size, &off))
    2697             :                 return SIZE_MAX;
    2698             : 
    2699           0 :         return off;
    2700             : }
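
/*
 * Illustrative userspace sketch using the raw syscall interface (not part of
 * io_uring.c): the size computed above backs the IORING_OFF_SQ_RING mapping;
 * the u32 SQ index array starts at sq_off.array, after the CQE area that
 * rings_size() accounts for (and the mapping is shared with the CQ ring when
 * IORING_FEAT_SINGLE_MMAP is advertised).
 */
#include <linux/io_uring.h>
#include <stddef.h>
#include <sys/mman.h>

static void *map_sq_ring(const struct io_uring_params *p, int ring_fd,
			 size_t *out_sz)
{
	/* header, flags/dropped fields and the u32 index array */
	size_t sz = p->sq_off.array + p->sq_entries * sizeof(__u32);

	*out_sz = sz;
	return mmap(NULL, sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
}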
    2701             : 
    2702           0 : static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
    2703             :                                unsigned int eventfd_async)
    2704             : {
    2705             :         struct io_ev_fd *ev_fd;
    2706           0 :         __s32 __user *fds = arg;
    2707             :         int fd;
    2708             : 
    2709           0 :         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
    2710             :                                         lockdep_is_held(&ctx->uring_lock));
    2711           0 :         if (ev_fd)
    2712             :                 return -EBUSY;
    2713             : 
    2714           0 :         if (copy_from_user(&fd, fds, sizeof(*fds)))
    2715             :                 return -EFAULT;
    2716             : 
    2717           0 :         ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
    2718           0 :         if (!ev_fd)
    2719             :                 return -ENOMEM;
    2720             : 
    2721           0 :         ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
    2722           0 :         if (IS_ERR(ev_fd->cq_ev_fd)) {
    2723           0 :                 int ret = PTR_ERR(ev_fd->cq_ev_fd);
    2724           0 :                 kfree(ev_fd);
    2725           0 :                 return ret;
    2726             :         }
    2727             : 
    2728           0 :         spin_lock(&ctx->completion_lock);
    2729           0 :         ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
    2730           0 :         spin_unlock(&ctx->completion_lock);
    2731             : 
    2732           0 :         ev_fd->eventfd_async = eventfd_async;
    2733           0 :         ctx->has_evfd = true;
    2734           0 :         rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
    2735           0 :         atomic_set(&ev_fd->refs, 1);
    2736           0 :         atomic_set(&ev_fd->ops, 0);
    2737           0 :         return 0;
    2738             : }
    2739             : 
    2740           0 : static int io_eventfd_unregister(struct io_ring_ctx *ctx)
    2741             : {
    2742             :         struct io_ev_fd *ev_fd;
    2743             : 
    2744           0 :         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
    2745             :                                         lockdep_is_held(&ctx->uring_lock));
    2746           0 :         if (ev_fd) {
    2747           0 :                 ctx->has_evfd = false;
    2748           0 :                 rcu_assign_pointer(ctx->io_ev_fd, NULL);
    2749           0 :                 if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
    2750           0 :                         call_rcu(&ev_fd->rcu, io_eventfd_ops);
    2751             :                 return 0;
    2752             :         }
    2753             : 
    2754             :         return -ENXIO;
    2755             : }
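
/*
 * Illustrative userspace sketch (not part of io_uring.c), assuming liburing:
 * registering an eventfd as above lets completions be picked up from a
 * poll/epoll loop; io_uring_register_eventfd_async() corresponds to the
 * eventfd_async flag handled by the kernel side.
 */
#include <errno.h>
#include <liburing.h>
#include <sys/eventfd.h>
#include <unistd.h>

static int watch_completions(struct io_uring *ring)
{
	int efd = eventfd(0, EFD_CLOEXEC);
	int ret;

	if (efd < 0)
		return -errno;
	/* efd is signalled whenever new CQEs are posted to this ring */
	ret = io_uring_register_eventfd(ring, efd);
	if (ret < 0) {
		close(efd);
		return ret;
	}
	return efd;	/* caller adds efd to its epoll/poll set */
}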
    2756             : 
    2757           0 : static void io_req_caches_free(struct io_ring_ctx *ctx)
    2758             : {
    2759             :         struct io_kiocb *req;
    2760           0 :         int nr = 0;
    2761             : 
    2762           0 :         mutex_lock(&ctx->uring_lock);
    2763           0 :         io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
    2764             : 
    2765           0 :         while (!io_req_cache_empty(ctx)) {
    2766           0 :                 req = io_extract_req(ctx);
    2767           0 :                 kmem_cache_free(req_cachep, req);
    2768           0 :                 nr++;
    2769             :         }
    2770           0 :         if (nr)
    2771           0 :                 percpu_ref_put_many(&ctx->refs, nr);
    2772           0 :         mutex_unlock(&ctx->uring_lock);
    2773           0 : }
    2774             : 
    2775           0 : static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
    2776             : {
    2777           0 :         io_sq_thread_finish(ctx);
    2778           0 :         io_rsrc_refs_drop(ctx);
    2779             :         /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
    2780           0 :         io_wait_rsrc_data(ctx->buf_data);
    2781           0 :         io_wait_rsrc_data(ctx->file_data);
    2782             : 
    2783           0 :         mutex_lock(&ctx->uring_lock);
    2784           0 :         if (ctx->buf_data)
    2785           0 :                 __io_sqe_buffers_unregister(ctx);
    2786           0 :         if (ctx->file_data)
    2787           0 :                 __io_sqe_files_unregister(ctx);
    2788           0 :         io_cqring_overflow_kill(ctx);
    2789           0 :         io_eventfd_unregister(ctx);
    2790           0 :         io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
    2791           0 :         io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
    2792           0 :         mutex_unlock(&ctx->uring_lock);
    2793           0 :         io_destroy_buffers(ctx);
    2794           0 :         if (ctx->sq_creds)
    2795           0 :                 put_cred(ctx->sq_creds);
    2796           0 :         if (ctx->submitter_task)
    2797           0 :                 put_task_struct(ctx->submitter_task);
    2798             : 
    2799             :         /* there are no registered resources left, nobody uses it */
    2800           0 :         if (ctx->rsrc_node)
    2801           0 :                 io_rsrc_node_destroy(ctx->rsrc_node);
    2802           0 :         if (ctx->rsrc_backup_node)
    2803           0 :                 io_rsrc_node_destroy(ctx->rsrc_backup_node);
    2804           0 :         flush_delayed_work(&ctx->rsrc_put_work);
    2805           0 :         flush_delayed_work(&ctx->fallback_work);
    2806             : 
    2807           0 :         WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
    2808           0 :         WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
    2809             : 
    2810             : #if defined(CONFIG_UNIX)
    2811             :         if (ctx->ring_sock) {
    2812             :                 ctx->ring_sock->file = NULL; /* so that iput() is called */
    2813             :                 sock_release(ctx->ring_sock);
    2814             :         }
    2815             : #endif
    2816           0 :         WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
    2817             : 
    2818           0 :         if (ctx->mm_account) {
    2819           0 :                 mmdrop(ctx->mm_account);
    2820           0 :                 ctx->mm_account = NULL;
    2821             :         }
    2822           0 :         io_mem_free(ctx->rings);
    2823           0 :         io_mem_free(ctx->sq_sqes);
    2824             : 
    2825           0 :         percpu_ref_exit(&ctx->refs);
    2826           0 :         free_uid(ctx->user);
    2827           0 :         io_req_caches_free(ctx);
    2828           0 :         if (ctx->hash_map)
    2829           0 :                 io_wq_put_hash(ctx->hash_map);
    2830           0 :         kfree(ctx->cancel_table.hbs);
    2831           0 :         kfree(ctx->cancel_table_locked.hbs);
    2832           0 :         kfree(ctx->dummy_ubuf);
    2833           0 :         kfree(ctx->io_bl);
    2834           0 :         xa_destroy(&ctx->io_bl_xa);
    2835           0 :         kfree(ctx);
    2836           0 : }
    2837             : 
    2838           0 : static __cold void io_activate_pollwq_cb(struct callback_head *cb)
    2839             : {
    2840           0 :         struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
    2841             :                                                poll_wq_task_work);
    2842             : 
    2843           0 :         mutex_lock(&ctx->uring_lock);
    2844           0 :         ctx->poll_activated = true;
    2845           0 :         mutex_unlock(&ctx->uring_lock);
    2846             : 
    2847             :         /*
    2848             :          * Wake ups for some events between start of polling and activation
    2849             :          * might've been lost due to loose synchronisation.
    2850             :          */
    2851           0 :         wake_up_all(&ctx->poll_wq);
    2852           0 :         percpu_ref_put(&ctx->refs);
    2853           0 : }
    2854             : 
    2855           0 : static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
    2856             : {
    2857           0 :         spin_lock(&ctx->completion_lock);
    2858             :         /* already activated or in progress */
    2859           0 :         if (ctx->poll_activated || ctx->poll_wq_task_work.func)
    2860             :                 goto out;
    2861           0 :         if (WARN_ON_ONCE(!ctx->task_complete))
    2862             :                 goto out;
    2863           0 :         if (!ctx->submitter_task)
    2864             :                 goto out;
    2865             :         /*
    2866             :          * with ->submitter_task only the submitter task completes requests; we
    2867             :          * only need to sync with it, which is done by injecting a task_work item
    2868             :          */
    2869           0 :         init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
    2870           0 :         percpu_ref_get(&ctx->refs);
    2871           0 :         if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
    2872           0 :                 percpu_ref_put(&ctx->refs);
    2873             : out:
    2874           0 :         spin_unlock(&ctx->completion_lock);
    2875           0 : }
    2876             : 
    2877           0 : static __poll_t io_uring_poll(struct file *file, poll_table *wait)
    2878             : {
    2879           0 :         struct io_ring_ctx *ctx = file->private_data;
    2880           0 :         __poll_t mask = 0;
    2881             : 
    2882           0 :         if (unlikely(!ctx->poll_activated))
    2883           0 :                 io_activate_pollwq(ctx);
    2884             : 
    2885           0 :         poll_wait(file, &ctx->poll_wq, wait);
    2886             :         /*
    2887             :          * synchronizes with barrier from wq_has_sleeper call in
    2888             :          * io_commit_cqring
    2889             :          */
    2890           0 :         smp_rmb();
    2891           0 :         if (!io_sqring_full(ctx))
    2892           0 :                 mask |= EPOLLOUT | EPOLLWRNORM;
    2893             : 
    2894             :         /*
    2895             :          * Don't flush cqring overflow list here, just do a simple check.
    2896             :          * Otherwise there could possibly be an ABBA deadlock:
    2897             :          *      CPU0                    CPU1
    2898             :          *      ----                    ----
    2899             :          * lock(&ctx->uring_lock);
    2900             :          *                              lock(&ep->mtx);
    2901             :          *                              lock(&ctx->uring_lock);
    2902             :          * lock(&ep->mtx);
    2903             :          *
    2904             :          * Users may get EPOLLIN while seeing nothing in the cqring; this
    2905             :          * pushes them to do the flush.
    2906             :          */
    2907             : 
    2908           0 :         if (__io_cqring_events_user(ctx) || io_has_work(ctx))
    2909           0 :                 mask |= EPOLLIN | EPOLLRDNORM;
    2910             : 
    2911           0 :         return mask;
    2912             : }
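
/*
 * Illustrative userspace sketch (not part of io_uring.c): a plain poll() on
 * the ring fd ends up in the ->poll handler above; EPOLLIN means completions
 * (or an overflow condition) may be pending, EPOLLOUT means there is SQ
 * space. As the comment above notes, EPOLLIN can be reported while the CQ
 * ring looks empty, so peek/flush rather than assume a CQE is there.
 */
#include <liburing.h>
#include <poll.h>

static int wait_ring_readable(struct io_uring *ring, int timeout_ms)
{
	struct pollfd pfd = {
		.fd = ring->ring_fd,
		.events = POLLIN,
	};
	int ret = poll(&pfd, 1, timeout_ms);

	if (ret <= 0)
		return ret;	/* 0 on timeout, -1 on error */
	return pfd.revents & POLLIN;
}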
    2913             : 
    2914           0 : static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
    2915             : {
    2916             :         const struct cred *creds;
    2917             : 
    2918           0 :         creds = xa_erase(&ctx->personalities, id);
    2919           0 :         if (creds) {
    2920             :                 put_cred(creds);
    2921             :                 return 0;
    2922             :         }
    2923             : 
    2924             :         return -EINVAL;
    2925             : }
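
/*
 * Illustrative userspace sketch (not part of io_uring.c), assuming liburing:
 * io_uring_register_personality() snapshots the caller's credentials under
 * an id that sqe->personality selects in io_init_req(); the unregister path
 * above drops that snapshot again.
 */
#include <errno.h>
#include <liburing.h>

static int submit_with_personality(struct io_uring *ring, int fd,
				   void *buf, unsigned int len)
{
	struct io_uring_sqe *sqe;
	int id, ret;

	id = io_uring_register_personality(ring);
	if (id < 0)
		return id;

	sqe = io_uring_get_sqe(ring);
	if (!sqe) {
		io_uring_unregister_personality(ring, id);
		return -EAGAIN;
	}
	io_uring_prep_read(sqe, fd, buf, len, 0);
	/* issue this request with the snapshotted credentials */
	sqe->personality = id;
	ret = io_uring_submit(ring);

	/* the request keeps its own reference on the creds */
	io_uring_unregister_personality(ring, id);
	return ret;
}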
    2926             : 
    2927             : struct io_tctx_exit {
    2928             :         struct callback_head            task_work;
    2929             :         struct completion               completion;
    2930             :         struct io_ring_ctx              *ctx;
    2931             : };
    2932             : 
    2933           0 : static __cold void io_tctx_exit_cb(struct callback_head *cb)
    2934             : {
    2935           0 :         struct io_uring_task *tctx = current->io_uring;
    2936             :         struct io_tctx_exit *work;
    2937             : 
    2938           0 :         work = container_of(cb, struct io_tctx_exit, task_work);
    2939             :         /*
    2940             :          * When @in_cancel, we're in cancellation and it's racy to remove the
    2941             :          * node. It'll be removed by the end of cancellation, just ignore it.
    2942             :          * tctx can be NULL if the queueing of this task_work raced with
    2943             :          * work cancellation off the exec path.
    2944             :          */
    2945           0 :         if (tctx && !atomic_read(&tctx->in_cancel))
    2946           0 :                 io_uring_del_tctx_node((unsigned long)work->ctx);
    2947           0 :         complete(&work->completion);
    2948           0 : }
    2949             : 
    2950           0 : static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
    2951             : {
    2952           0 :         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
    2953             : 
    2954           0 :         return req->ctx == data;
    2955             : }
    2956             : 
    2957           0 : static __cold void io_ring_exit_work(struct work_struct *work)
    2958             : {
    2959           0 :         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
    2960           0 :         unsigned long timeout = jiffies + HZ * 60 * 5;
    2961           0 :         unsigned long interval = HZ / 20;
    2962             :         struct io_tctx_exit exit;
    2963             :         struct io_tctx_node *node;
    2964             :         int ret;
    2965             : 
    2966             :         /*
    2967             :          * If we're doing polled IO and end up having requests being
    2968             :          * submitted async (out-of-line), then completions can come in while
    2969             :          * we're waiting for refs to drop. We need to reap these manually,
    2970             :          * as nobody else will be looking for them.
    2971             :          */
    2972             :         do {
    2973           0 :                 if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
    2974           0 :                         mutex_lock(&ctx->uring_lock);
    2975           0 :                         io_cqring_overflow_kill(ctx);
    2976           0 :                         mutex_unlock(&ctx->uring_lock);
    2977             :                 }
    2978             : 
    2979           0 :                 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
    2980           0 :                         io_move_task_work_from_local(ctx);
    2981             : 
    2982           0 :                 while (io_uring_try_cancel_requests(ctx, NULL, true))
    2983           0 :                         cond_resched();
    2984             : 
    2985           0 :                 if (ctx->sq_data) {
    2986           0 :                         struct io_sq_data *sqd = ctx->sq_data;
    2987             :                         struct task_struct *tsk;
    2988             : 
    2989           0 :                         io_sq_thread_park(sqd);
    2990           0 :                         tsk = sqd->thread;
    2991           0 :                         if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
    2992           0 :                                 io_wq_cancel_cb(tsk->io_uring->io_wq,
    2993             :                                                 io_cancel_ctx_cb, ctx, true);
    2994           0 :                         io_sq_thread_unpark(sqd);
    2995             :                 }
    2996             : 
    2997           0 :                 io_req_caches_free(ctx);
    2998             : 
    2999           0 :                 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
    3000             :                         /* there is little hope left, don't run it too often */
    3001           0 :                         interval = HZ * 60;
    3002             :                 }
    3003           0 :         } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
    3004             : 
    3005           0 :         init_completion(&exit.completion);
    3006           0 :         init_task_work(&exit.task_work, io_tctx_exit_cb);
    3007           0 :         exit.ctx = ctx;
    3008             :         /*
    3009             :          * Some may use the context even when all refs and requests have been put,
    3010             :          * and they are free to do so while still holding uring_lock or
    3011             :          * completion_lock; see io_req_task_submit(). Apart from other work,
    3012             :          * this lock/unlock section also waits for them to finish.
    3013             :          */
    3014           0 :         mutex_lock(&ctx->uring_lock);
    3015           0 :         while (!list_empty(&ctx->tctx_list)) {
    3016           0 :                 WARN_ON_ONCE(time_after(jiffies, timeout));
    3017             : 
    3018           0 :                 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
    3019             :                                         ctx_node);
    3020             :                 /* don't spin on a single task if cancellation failed */
    3021           0 :                 list_rotate_left(&ctx->tctx_list);
    3022           0 :                 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
    3023           0 :                 if (WARN_ON_ONCE(ret))
    3024           0 :                         continue;
    3025             : 
    3026           0 :                 mutex_unlock(&ctx->uring_lock);
    3027           0 :                 wait_for_completion(&exit.completion);
    3028           0 :                 mutex_lock(&ctx->uring_lock);
    3029             :         }
    3030           0 :         mutex_unlock(&ctx->uring_lock);
    3031           0 :         spin_lock(&ctx->completion_lock);
    3032           0 :         spin_unlock(&ctx->completion_lock);
    3033             : 
    3034           0 :         io_ring_ctx_free(ctx);
    3035           0 : }
    3036             : 
    3037           0 : static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
    3038             : {
    3039             :         unsigned long index;
    3040             :         struct creds *creds;
    3041             : 
    3042           0 :         mutex_lock(&ctx->uring_lock);
    3043           0 :         percpu_ref_kill(&ctx->refs);
    3044           0 :         xa_for_each(&ctx->personalities, index, creds)
    3045           0 :                 io_unregister_personality(ctx, index);
    3046           0 :         if (ctx->rings)
    3047           0 :                 io_poll_remove_all(ctx, NULL, true);
    3048           0 :         mutex_unlock(&ctx->uring_lock);
    3049             : 
    3050             :         /*
    3051             :          * If we failed to set up the ctx, we might not have any rings
    3052             :          * and therefore did not submit any requests.
    3053             :          */
    3054           0 :         if (ctx->rings)
    3055           0 :                 io_kill_timeouts(ctx, NULL, true);
    3056             : 
    3057           0 :         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
    3058             :         /*
    3059             :          * Use system_unbound_wq to avoid spawning tons of event kworkers
    3060             :          * if we're exiting a ton of rings at the same time. It just adds
    3061             :          * noise and overhead, there's no discernible change in runtime
    3062             :          * over using system_wq.
    3063             :          */
    3064           0 :         queue_work(system_unbound_wq, &ctx->exit_work);
    3065           0 : }
    3066             : 
    3067           0 : static int io_uring_release(struct inode *inode, struct file *file)
    3068             : {
    3069           0 :         struct io_ring_ctx *ctx = file->private_data;
    3070             : 
    3071           0 :         file->private_data = NULL;
    3072           0 :         io_ring_ctx_wait_and_kill(ctx);
    3073           0 :         return 0;
    3074             : }
    3075             : 
    3076             : struct io_task_cancel {
    3077             :         struct task_struct *task;
    3078             :         bool all;
    3079             : };
    3080             : 
    3081           0 : static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
    3082             : {
    3083           0 :         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
    3084           0 :         struct io_task_cancel *cancel = data;
    3085             : 
    3086           0 :         return io_match_task_safe(req, cancel->task, cancel->all);
    3087             : }
    3088             : 
    3089           0 : static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
    3090             :                                          struct task_struct *task,
    3091             :                                          bool cancel_all)
    3092             : {
    3093             :         struct io_defer_entry *de;
    3094           0 :         LIST_HEAD(list);
    3095             : 
    3096           0 :         spin_lock(&ctx->completion_lock);
    3097           0 :         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
    3098           0 :                 if (io_match_task_safe(de->req, task, cancel_all)) {
    3099           0 :                         list_cut_position(&list, &ctx->defer_list, &de->list);
    3100           0 :                         break;
    3101             :                 }
    3102             :         }
    3103           0 :         spin_unlock(&ctx->completion_lock);
    3104           0 :         if (list_empty(&list))
    3105             :                 return false;
    3106             : 
    3107           0 :         while (!list_empty(&list)) {
    3108           0 :                 de = list_first_entry(&list, struct io_defer_entry, list);
    3109           0 :                 list_del_init(&de->list);
    3110           0 :                 io_req_task_queue_fail(de->req, -ECANCELED);
    3111           0 :                 kfree(de);
    3112             :         }
    3113             :         return true;
    3114             : }
    3115             : 
    3116           0 : static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
    3117             : {
    3118             :         struct io_tctx_node *node;
    3119             :         enum io_wq_cancel cret;
    3120           0 :         bool ret = false;
    3121             : 
    3122           0 :         mutex_lock(&ctx->uring_lock);
    3123           0 :         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
    3124           0 :                 struct io_uring_task *tctx = node->task->io_uring;
    3125             : 
    3126             :                 /*
    3127             :                  * io_wq will stay alive while we hold uring_lock, because it's
    3128             :                  * killed after ctx nodes, which requires taking the lock.
    3129             :                  */
    3130           0 :                 if (!tctx || !tctx->io_wq)
    3131           0 :                         continue;
    3132           0 :                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
    3133           0 :                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
    3134             :         }
    3135           0 :         mutex_unlock(&ctx->uring_lock);
    3136             : 
    3137           0 :         return ret;
    3138             : }
    3139             : 
    3140           0 : static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
    3141             :                                                 struct task_struct *task,
    3142             :                                                 bool cancel_all)
    3143             : {
    3144           0 :         struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
    3145           0 :         struct io_uring_task *tctx = task ? task->io_uring : NULL;
    3146             :         enum io_wq_cancel cret;
    3147           0 :         bool ret = false;
    3148             : 
    3149             :         /* failed during ring init, it couldn't have issued any requests */
    3150           0 :         if (!ctx->rings)
    3151             :                 return false;
    3152             : 
    3153           0 :         if (!task) {
    3154           0 :                 ret |= io_uring_try_cancel_iowq(ctx);
    3155           0 :         } else if (tctx && tctx->io_wq) {
    3156             :                 /*
    3157             :                  * Cancels requests of all rings, not only @ctx, but
    3158             :                  * it's fine as the task is in exit/exec.
    3159             :                  */
    3160           0 :                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
    3161             :                                        &cancel, true);
    3162           0 :                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
    3163             :         }
    3164             : 
    3165             :         /* SQPOLL thread does its own polling */
    3166           0 :         if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
    3167           0 :             (ctx->sq_data && ctx->sq_data->thread == current)) {
    3168           0 :                 while (!wq_list_empty(&ctx->iopoll_list)) {
    3169           0 :                         io_iopoll_try_reap_events(ctx);
    3170           0 :                         ret = true;
    3171           0 :                         cond_resched();
    3172             :                 }
    3173             :         }
    3174             : 
    3175           0 :         if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
    3176           0 :             io_allowed_defer_tw_run(ctx))
    3177           0 :                 ret |= io_run_local_work(ctx) > 0;
    3178           0 :         ret |= io_cancel_defer_files(ctx, task, cancel_all);
    3179           0 :         mutex_lock(&ctx->uring_lock);
    3180           0 :         ret |= io_poll_remove_all(ctx, task, cancel_all);
    3181           0 :         mutex_unlock(&ctx->uring_lock);
    3182           0 :         ret |= io_kill_timeouts(ctx, task, cancel_all);
    3183           0 :         if (task)
    3184           0 :                 ret |= io_run_task_work() > 0;
    3185             :         return ret;
    3186             : }
    3187             : 
    3188             : static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
    3189             : {
    3190           0 :         if (tracked)
    3191           0 :                 return atomic_read(&tctx->inflight_tracked);
    3192           0 :         return percpu_counter_sum(&tctx->inflight);
    3193             : }
    3194             : 
    3195             : /*
    3196             :  * Find any io_uring ctx that this task has registered or done IO on, and cancel
    3197             :  * requests. @sqd should be non-NULL iff this is an SQPOLL thread cancellation.
    3198             :  */
    3199           0 : __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
    3200             : {
    3201           0 :         struct io_uring_task *tctx = current->io_uring;
    3202             :         struct io_ring_ctx *ctx;
    3203             :         s64 inflight;
    3204           0 :         DEFINE_WAIT(wait);
    3205             : 
    3206           0 :         WARN_ON_ONCE(sqd && sqd->thread != current);
    3207             : 
    3208           0 :         if (!current->io_uring)
    3209           0 :                 return;
    3210           0 :         if (tctx->io_wq)
    3211           0 :                 io_wq_exit_start(tctx->io_wq);
    3212             : 
    3213           0 :         atomic_inc(&tctx->in_cancel);
    3214             :         do {
    3215           0 :                 bool loop = false;
    3216             : 
    3217           0 :                 io_uring_drop_tctx_refs(current);
    3218             :                 /* read completions before cancelations */
    3219           0 :                 inflight = tctx_inflight(tctx, !cancel_all);
    3220           0 :                 if (!inflight)
    3221             :                         break;
    3222             : 
    3223           0 :                 if (!sqd) {
    3224             :                         struct io_tctx_node *node;
    3225             :                         unsigned long index;
    3226             : 
    3227           0 :                         xa_for_each(&tctx->xa, index, node) {
    3228             :                                 /* sqpoll task will cancel all its requests */
    3229           0 :                                 if (node->ctx->sq_data)
    3230           0 :                                         continue;
    3231           0 :                                 loop |= io_uring_try_cancel_requests(node->ctx,
    3232           0 :                                                         current, cancel_all);
    3233             :                         }
    3234             :                 } else {
    3235           0 :                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
    3236           0 :                                 loop |= io_uring_try_cancel_requests(ctx,
    3237           0 :                                                                      current,
    3238             :                                                                      cancel_all);
    3239             :                 }
    3240             : 
    3241           0 :                 if (loop) {
    3242           0 :                         cond_resched();
    3243           0 :                         continue;
    3244             :                 }
    3245             : 
    3246           0 :                 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
    3247           0 :                 io_run_task_work();
    3248           0 :                 io_uring_drop_tctx_refs(current);
    3249             : 
    3250             :                 /*
    3251             :                  * If we've seen completions, retry without waiting. This
    3252             :                  * avoids a race where a completion comes in before we did
    3253             :                  * prepare_to_wait().
    3254             :                  */
    3255           0 :                 if (inflight == tctx_inflight(tctx, !cancel_all))
    3256           0 :                         schedule();
    3257           0 :                 finish_wait(&tctx->wait, &wait);
    3258             :         } while (1);
    3259             : 
    3260           0 :         io_uring_clean_tctx(tctx);
    3261           0 :         if (cancel_all) {
    3262             :                 /*
    3263             :                  * We shouldn't run task_works after cancel, so just leave
    3264             :                  * ->in_cancel set for normal exit.
    3265             :                  */
    3266           0 :                 atomic_dec(&tctx->in_cancel);
    3267             :                 /* for exec all current's requests should be gone, kill tctx */
    3268           0 :                 __io_uring_free(current);
    3269             :         }
    3270             : }
    3271             : 
    3272           0 : void __io_uring_cancel(bool cancel_all)
    3273             : {
    3274           0 :         io_uring_cancel_generic(cancel_all, NULL);
    3275           0 : }
    3276             : 
    3277           0 : static void *io_uring_validate_mmap_request(struct file *file,
    3278             :                                             loff_t pgoff, size_t sz)
    3279             : {
    3280           0 :         struct io_ring_ctx *ctx = file->private_data;
    3281           0 :         loff_t offset = pgoff << PAGE_SHIFT;
    3282             :         struct page *page;
    3283             :         void *ptr;
    3284             : 
    3285           0 :         switch (offset) {
    3286             :         case IORING_OFF_SQ_RING:
    3287             :         case IORING_OFF_CQ_RING:
    3288           0 :                 ptr = ctx->rings;
    3289             :                 break;
    3290             :         case IORING_OFF_SQES:
    3291           0 :                 ptr = ctx->sq_sqes;
    3292             :                 break;
    3293             :         default:
    3294             :                 return ERR_PTR(-EINVAL);
    3295             :         }
    3296             : 
    3297           0 :         page = virt_to_head_page(ptr);
    3298           0 :         if (sz > page_size(page))
    3299             :                 return ERR_PTR(-EINVAL);
    3300             : 
    3301             :         return ptr;
    3302             : }
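
The three offsets accepted above (IORING_OFF_SQ_RING, IORING_OFF_CQ_RING and IORING_OFF_SQES) are the only regions a program may map, and a request larger than the backing allocation is rejected. A minimal userspace sketch of the corresponding mmap() calls, assuming the raw syscall numbers are exposed via <sys/syscall.h> and the UAPI definitions via <linux/io_uring.h> (error handling trimmed):

#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct io_uring_params p;
        void *sq_ring, *cq_ring, *sqes;
        size_t sq_sz, cq_sz;
        int fd;

        memset(&p, 0, sizeof(p));
        fd = syscall(__NR_io_uring_setup, 8, &p);
        if (fd < 0)
                return 1;

        /* ring sizes are derived from the offsets the kernel filled in */
        sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
        cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);

        sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
        cq_ring = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
        sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                    fd, IORING_OFF_SQES);
        if (sq_ring == MAP_FAILED || cq_ring == MAP_FAILED || sqes == MAP_FAILED)
                return 1;

        /* ... fill SQEs, bump the SQ tail, call io_uring_enter() ... */
        close(fd);
        return 0;
}

With IORING_FEAT_SINGLE_MMAP the SQ and CQ rings can be covered by a single mapping, but mapping them separately as above remains valid.
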
    3303             : 
    3304             : #ifdef CONFIG_MMU
    3305             : 
    3306           0 : static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
    3307             : {
    3308           0 :         size_t sz = vma->vm_end - vma->vm_start;
    3309             :         unsigned long pfn;
    3310             :         void *ptr;
    3311             : 
    3312           0 :         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
    3313           0 :         if (IS_ERR(ptr))
    3314           0 :                 return PTR_ERR(ptr);
    3315             : 
    3316           0 :         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
    3317           0 :         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
    3318             : }
    3319             : 
    3320             : #else /* !CONFIG_MMU */
    3321             : 
    3322             : static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
    3323             : {
    3324             :         return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
    3325             : }
    3326             : 
    3327             : static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
    3328             : {
    3329             :         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
    3330             : }
    3331             : 
    3332             : static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
    3333             :         unsigned long addr, unsigned long len,
    3334             :         unsigned long pgoff, unsigned long flags)
    3335             : {
    3336             :         void *ptr;
    3337             : 
    3338             :         ptr = io_uring_validate_mmap_request(file, pgoff, len);
    3339             :         if (IS_ERR(ptr))
    3340             :                 return PTR_ERR(ptr);
    3341             : 
    3342             :         return (unsigned long) ptr;
    3343             : }
    3344             : 
    3345             : #endif /* !CONFIG_MMU */
    3346             : 
    3347             : static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
    3348             : {
    3349           0 :         if (flags & IORING_ENTER_EXT_ARG) {
    3350             :                 struct io_uring_getevents_arg arg;
    3351             : 
    3352           0 :                 if (argsz != sizeof(arg))
    3353           0 :                         return -EINVAL;
    3354           0 :                 if (copy_from_user(&arg, argp, sizeof(arg)))
    3355             :                         return -EFAULT;
    3356             :         }
    3357             :         return 0;
    3358             : }
    3359             : 
    3360           0 : static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
    3361             :                           struct __kernel_timespec __user **ts,
    3362             :                           const sigset_t __user **sig)
    3363             : {
    3364             :         struct io_uring_getevents_arg arg;
    3365             : 
    3366             :         /*
    3367             :          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
    3368             :          * is just a pointer to the sigset_t.
    3369             :          */
    3370           0 :         if (!(flags & IORING_ENTER_EXT_ARG)) {
    3371           0 :                 *sig = (const sigset_t __user *) argp;
    3372           0 :                 *ts = NULL;
    3373           0 :                 return 0;
    3374             :         }
    3375             : 
    3376             :         /*
    3377             :          * EXT_ARG is set; ensure we agree on its size and copy in our
    3378             :          * timespec and sigset_t pointers if everything checks out.
    3379             :          */
    3380           0 :         if (*argsz != sizeof(arg))
    3381             :                 return -EINVAL;
    3382           0 :         if (copy_from_user(&arg, argp, sizeof(arg)))
    3383             :                 return -EFAULT;
    3384           0 :         if (arg.pad)
    3385             :                 return -EINVAL;
    3386           0 :         *sig = u64_to_user_ptr(arg.sigmask);
    3387           0 :         *argsz = arg.sigmask_sz;
    3388           0 :         *ts = u64_to_user_ptr(arg.ts);
    3389           0 :         return 0;
    3390             : }
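
As io_get_ext_arg() above shows, with IORING_ENTER_EXT_ARG the final argument to io_uring_enter() is a struct io_uring_getevents_arg bundling the sigmask and a timeout, and argsz must equal the size of that struct. A hedged raw-syscall sketch; the helper name wait_cqe_timeout is made up for illustration:

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Wait for at least one CQE, with a 1s timeout and a temporary sigmask. */
static int wait_cqe_timeout(int ring_fd, const sigset_t *mask)
{
        struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
        struct io_uring_getevents_arg arg;

        memset(&arg, 0, sizeof(arg));
        arg.sigmask = (uintptr_t)mask;
        arg.sigmask_sz = _NSIG / 8;     /* kernel sigset size, not glibc's */
        arg.ts = (uintptr_t)&ts;

        /* argsz must be sizeof(arg), as checked above */
        return syscall(__NR_io_uring_enter, ring_fd, 0, 1,
                       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
                       &arg, sizeof(arg));
}
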
    3391             : 
    3392           0 : SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
    3393             :                 u32, min_complete, u32, flags, const void __user *, argp,
    3394             :                 size_t, argsz)
    3395             : {
    3396             :         struct io_ring_ctx *ctx;
    3397             :         struct fd f;
    3398             :         long ret;
    3399             : 
    3400           0 :         if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
    3401             :                                IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
    3402             :                                IORING_ENTER_REGISTERED_RING)))
    3403             :                 return -EINVAL;
    3404             : 
    3405             :         /*
    3406             :          * The ring fd has been registered via IORING_REGISTER_RING_FDS; we
    3407             :          * need only dereference our task-private array to find it.
    3408             :          */
    3409           0 :         if (flags & IORING_ENTER_REGISTERED_RING) {
    3410           0 :                 struct io_uring_task *tctx = current->io_uring;
    3411             : 
    3412           0 :                 if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
    3413             :                         return -EINVAL;
    3414           0 :                 fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
    3415           0 :                 f.file = tctx->registered_rings[fd];
    3416           0 :                 f.flags = 0;
    3417           0 :                 if (unlikely(!f.file))
    3418             :                         return -EBADF;
    3419             :         } else {
    3420           0 :                 f = fdget(fd);
    3421           0 :                 if (unlikely(!f.file))
    3422             :                         return -EBADF;
    3423           0 :                 ret = -EOPNOTSUPP;
    3424           0 :                 if (unlikely(!io_is_uring_fops(f.file)))
    3425             :                         goto out;
    3426             :         }
    3427             : 
    3428           0 :         ctx = f.file->private_data;
    3429           0 :         ret = -EBADFD;
    3430           0 :         if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
    3431             :                 goto out;
    3432             : 
    3433             :         /*
    3434             :          * For SQ polling, the thread will do all submissions and completions.
    3435             :          * Just return the requested submit count, and wake the thread if
    3436             :          * we were asked to.
    3437             :          */
    3438           0 :         ret = 0;
    3439           0 :         if (ctx->flags & IORING_SETUP_SQPOLL) {
    3440           0 :                 io_cqring_overflow_flush(ctx);
    3441             : 
    3442           0 :                 if (unlikely(ctx->sq_data->thread == NULL)) {
    3443             :                         ret = -EOWNERDEAD;
    3444             :                         goto out;
    3445             :                 }
    3446           0 :                 if (flags & IORING_ENTER_SQ_WAKEUP)
    3447           0 :                         wake_up(&ctx->sq_data->wait);
    3448           0 :                 if (flags & IORING_ENTER_SQ_WAIT)
    3449           0 :                         io_sqpoll_wait_sq(ctx);
    3450             : 
    3451           0 :                 ret = to_submit;
    3452           0 :         } else if (to_submit) {
    3453           0 :                 ret = io_uring_add_tctx_node(ctx);
    3454           0 :                 if (unlikely(ret))
    3455             :                         goto out;
    3456             : 
    3457           0 :                 mutex_lock(&ctx->uring_lock);
    3458           0 :                 ret = io_submit_sqes(ctx, to_submit);
    3459           0 :                 if (ret != to_submit) {
    3460           0 :                         mutex_unlock(&ctx->uring_lock);
    3461           0 :                         goto out;
    3462             :                 }
    3463           0 :                 if (flags & IORING_ENTER_GETEVENTS) {
    3464           0 :                         if (ctx->syscall_iopoll)
    3465             :                                 goto iopoll_locked;
    3466             :                         /*
    3467             :                          * Ignore errors, we'll soon call io_cqring_wait() and
    3468             :                          * it should handle ownership problems if any.
    3469             :                          */
    3470           0 :                         if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
    3471           0 :                                 (void)io_run_local_work_locked(ctx);
    3472             :                 }
    3473           0 :                 mutex_unlock(&ctx->uring_lock);
    3474             :         }
    3475             : 
    3476           0 :         if (flags & IORING_ENTER_GETEVENTS) {
    3477             :                 int ret2;
    3478             : 
    3479           0 :                 if (ctx->syscall_iopoll) {
    3480             :                         /*
    3481             :                          * We disallow the app entering submit/complete with
    3482             :                          * polling, but we still need to lock the ring to
    3483             :                          * prevent racing with polled issue that got punted to
    3484             :                          * a workqueue.
    3485             :                          */
    3486           0 :                         mutex_lock(&ctx->uring_lock);
    3487             : iopoll_locked:
    3488           0 :                         ret2 = io_validate_ext_arg(flags, argp, argsz);
    3489           0 :                         if (likely(!ret2)) {
    3490           0 :                                 min_complete = min(min_complete,
    3491             :                                                    ctx->cq_entries);
    3492           0 :                                 ret2 = io_iopoll_check(ctx, min_complete);
    3493             :                         }
    3494           0 :                         mutex_unlock(&ctx->uring_lock);
    3495             :                 } else {
    3496             :                         const sigset_t __user *sig;
    3497             :                         struct __kernel_timespec __user *ts;
    3498             : 
    3499           0 :                         ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
    3500           0 :                         if (likely(!ret2)) {
    3501           0 :                                 min_complete = min(min_complete,
    3502             :                                                    ctx->cq_entries);
    3503           0 :                                 ret2 = io_cqring_wait(ctx, min_complete, sig,
    3504             :                                                       argsz, ts);
    3505             :                         }
    3506             :                 }
    3507             : 
    3508           0 :                 if (!ret) {
    3509           0 :                         ret = ret2;
    3510             : 
    3511             :                         /*
    3512             :                          * EBADR indicates that one or more CQEs were dropped.
    3513             :                          * Once the user has been informed, we can clear the bit,
    3514             :                          * as they are obviously OK with those drops.
    3515             :                          */
    3516           0 :                         if (unlikely(ret2 == -EBADR))
    3517             :                                 clear_bit(IO_CHECK_CQ_DROPPED_BIT,
    3518           0 :                                           &ctx->check_cq);
    3519             :                 }
    3520             :         }
    3521             : out:
    3522           0 :         fdput(f);
    3523             :         return ret;
    3524             : }
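
The IORING_ENTER_REGISTERED_RING branch above looks the ring up in the task's registered_rings array rather than the fdtable, avoiding the fdget()/fdput() per call. With liburing (version 2.2 or newer is assumed here) the registration and the enter flag are handled by io_uring_register_ring_fd(); a sketch:

#include <liburing.h>
#include <stdio.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(8, &ring, 0) < 0)
                return 1;

        /* After this, liburing passes IORING_ENTER_REGISTERED_RING and the
         * registered index instead of the real ring fd on every enter. */
        if (io_uring_register_ring_fd(&ring) < 0)
                fprintf(stderr, "ring fd registration not available\n");

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_nop(sqe);
        io_uring_submit(&ring);

        if (io_uring_wait_cqe(&ring, &cqe) == 0)
                io_uring_cqe_seen(&ring, cqe);

        io_uring_queue_exit(&ring);
        return 0;
}
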
    3525             : 
    3526             : static const struct file_operations io_uring_fops = {
    3527             :         .release        = io_uring_release,
    3528             :         .mmap           = io_uring_mmap,
    3529             : #ifndef CONFIG_MMU
    3530             :         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
    3531             :         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
    3532             : #endif
    3533             :         .poll           = io_uring_poll,
    3534             : #ifdef CONFIG_PROC_FS
    3535             :         .show_fdinfo    = io_uring_show_fdinfo,
    3536             : #endif
    3537             : };
    3538             : 
    3539           0 : bool io_is_uring_fops(struct file *file)
    3540             : {
    3541           0 :         return file->f_op == &io_uring_fops;
    3542             : }
    3543             : 
    3544           0 : static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
    3545             :                                          struct io_uring_params *p)
    3546             : {
    3547             :         struct io_rings *rings;
    3548             :         size_t size, sq_array_offset;
    3549             : 
    3550             :         /* make sure these are sane, as we already accounted them */
    3551           0 :         ctx->sq_entries = p->sq_entries;
    3552           0 :         ctx->cq_entries = p->cq_entries;
    3553             : 
    3554           0 :         size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
    3555           0 :         if (size == SIZE_MAX)
    3556             :                 return -EOVERFLOW;
    3557             : 
    3558           0 :         rings = io_mem_alloc(size);
    3559           0 :         if (!rings)
    3560             :                 return -ENOMEM;
    3561             : 
    3562           0 :         ctx->rings = rings;
    3563           0 :         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
    3564           0 :         rings->sq_ring_mask = p->sq_entries - 1;
    3565           0 :         rings->cq_ring_mask = p->cq_entries - 1;
    3566           0 :         rings->sq_ring_entries = p->sq_entries;
    3567           0 :         rings->cq_ring_entries = p->cq_entries;
    3568             : 
    3569           0 :         if (p->flags & IORING_SETUP_SQE128)
    3570           0 :                 size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
    3571             :         else
    3572           0 :                 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
    3573           0 :         if (size == SIZE_MAX) {
    3574           0 :                 io_mem_free(ctx->rings);
    3575           0 :                 ctx->rings = NULL;
    3576           0 :                 return -EOVERFLOW;
    3577             :         }
    3578             : 
    3579           0 :         ctx->sq_sqes = io_mem_alloc(size);
    3580           0 :         if (!ctx->sq_sqes) {
    3581           0 :                 io_mem_free(ctx->rings);
    3582           0 :                 ctx->rings = NULL;
    3583           0 :                 return -ENOMEM;
    3584             :         }
    3585             : 
    3586             :         return 0;
    3587             : }
    3588             : 
    3589           0 : static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
    3590             : {
    3591             :         int ret, fd;
    3592             : 
    3593           0 :         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
    3594           0 :         if (fd < 0)
    3595             :                 return fd;
    3596             : 
    3597           0 :         ret = __io_uring_add_tctx_node(ctx);
    3598           0 :         if (ret) {
    3599           0 :                 put_unused_fd(fd);
    3600           0 :                 return ret;
    3601             :         }
    3602           0 :         fd_install(fd, file);
    3603           0 :         return fd;
    3604             : }
    3605             : 
    3606             : /*
    3607             :  * Allocate an anonymous fd; this is what constitutes the application-
    3608             :  * visible backing of an io_uring instance. The application mmaps this
    3609             :  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
    3610             :  * we have to tie this fd to a socket for file garbage collection purposes.
    3611             :  */
    3612             : static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
    3613             : {
    3614             :         struct file *file;
    3615             : #if defined(CONFIG_UNIX)
    3616             :         int ret;
    3617             : 
    3618             :         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
    3619             :                                 &ctx->ring_sock);
    3620             :         if (ret)
    3621             :                 return ERR_PTR(ret);
    3622             : #endif
    3623             : 
    3624           0 :         file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
    3625             :                                          O_RDWR | O_CLOEXEC, NULL);
    3626             : #if defined(CONFIG_UNIX)
    3627             :         if (IS_ERR(file)) {
    3628             :                 sock_release(ctx->ring_sock);
    3629             :                 ctx->ring_sock = NULL;
    3630             :         } else {
    3631             :                 ctx->ring_sock->file = file;
    3632             :         }
    3633             : #endif
    3634             :         return file;
    3635             : }
    3636             : 
    3637           0 : static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
    3638             :                                   struct io_uring_params __user *params)
    3639             : {
    3640             :         struct io_ring_ctx *ctx;
    3641             :         struct file *file;
    3642             :         int ret;
    3643             : 
    3644           0 :         if (!entries)
    3645             :                 return -EINVAL;
    3646           0 :         if (entries > IORING_MAX_ENTRIES) {
    3647           0 :                 if (!(p->flags & IORING_SETUP_CLAMP))
    3648             :                         return -EINVAL;
    3649             :                 entries = IORING_MAX_ENTRIES;
    3650             :         }
    3651             : 
    3652             :         /*
    3653             :          * Use twice as many entries for the CQ ring. It's possible for the
    3654             :          * application to drive a higher depth than the size of the SQ ring,
    3655             :          * since the sqes are only used at submission time. This allows for
    3656             :          * some flexibility in overcommitting a bit. If the application has
    3657             :          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
    3658             :          * of CQ ring entries manually.
    3659             :          */
    3660           0 :         p->sq_entries = roundup_pow_of_two(entries);
    3661           0 :         if (p->flags & IORING_SETUP_CQSIZE) {
    3662             :                 /*
    3663             :                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
    3664             :                  * to a power-of-two, if it isn't already. We do NOT impose
    3665             :                  * any cq vs sq ring sizing.
    3666             :                  */
    3667           0 :                 if (!p->cq_entries)
    3668             :                         return -EINVAL;
    3669           0 :                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
    3670           0 :                         if (!(p->flags & IORING_SETUP_CLAMP))
    3671             :                                 return -EINVAL;
    3672           0 :                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
    3673             :                 }
    3674           0 :                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
    3675           0 :                 if (p->cq_entries < p->sq_entries)
    3676             :                         return -EINVAL;
    3677             :         } else {
    3678           0 :                 p->cq_entries = 2 * p->sq_entries;
    3679             :         }
    3680             : 
    3681           0 :         ctx = io_ring_ctx_alloc(p);
    3682           0 :         if (!ctx)
    3683             :                 return -ENOMEM;
    3684             : 
    3685           0 :         if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
    3686           0 :             !(ctx->flags & IORING_SETUP_IOPOLL) &&
    3687             :             !(ctx->flags & IORING_SETUP_SQPOLL))
    3688           0 :                 ctx->task_complete = true;
    3689             : 
    3690             :         /*
    3691             :          * Lazy poll_wq activation relies on ->task_complete for synchronisation
    3692             :          * purposes; see io_activate_pollwq().
    3693             :          */
    3694           0 :         if (!ctx->task_complete)
    3695           0 :                 ctx->poll_activated = true;
    3696             : 
    3697             :         /*
    3698             :          * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
    3699             :          * space applications don't need to do io completion events
    3700             :          * polling again, they can rely on io_sq_thread to do polling
    3701             :          * work, which can reduce cpu usage and uring_lock contention.
    3702             :          */
    3703           0 :         if (ctx->flags & IORING_SETUP_IOPOLL &&
    3704             :             !(ctx->flags & IORING_SETUP_SQPOLL))
    3705           0 :                 ctx->syscall_iopoll = 1;
    3706             : 
    3707           0 :         ctx->compat = in_compat_syscall();
    3708           0 :         if (!capable(CAP_IPC_LOCK))
    3709           0 :                 ctx->user = get_uid(current_user());
    3710             : 
    3711             :         /*
    3712             :          * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
    3713             :          * COOP_TASKRUN is set, then IPIs are never needed by the app.
    3714             :          */
    3715           0 :         ret = -EINVAL;
    3716           0 :         if (ctx->flags & IORING_SETUP_SQPOLL) {
    3717             :                 /* IPI related flags don't make sense with SQPOLL */
    3718           0 :                 if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
    3719             :                                   IORING_SETUP_TASKRUN_FLAG |
    3720             :                                   IORING_SETUP_DEFER_TASKRUN))
    3721             :                         goto err;
    3722           0 :                 ctx->notify_method = TWA_SIGNAL_NO_IPI;
    3723           0 :         } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
    3724           0 :                 ctx->notify_method = TWA_SIGNAL_NO_IPI;
    3725             :         } else {
    3726           0 :                 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG &&
    3727             :                     !(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
    3728             :                         goto err;
    3729           0 :                 ctx->notify_method = TWA_SIGNAL;
    3730             :         }
    3731             : 
    3732             :         /*
    3733             :          * For DEFER_TASKRUN we require the completion task to be the same as the
    3734             :          * submission task. This implies that there is only one submitter, so enforce
    3735             :          * that.
    3736             :          */
    3737           0 :         if (ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
    3738             :             !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
    3739             :                 goto err;
    3740             :         }
    3741             : 
    3742             :         /*
    3743             :          * This is just grabbed for accounting purposes. When a process exits,
    3744             :          * the mm is exited and dropped before the files, hence we need to hang
    3745             :          * on to this mm purely for the purposes of being able to unaccount
    3746             :          * memory (locked/pinned vm). It's not used for anything else.
    3747             :          */
    3748           0 :         mmgrab(current->mm);
    3749           0 :         ctx->mm_account = current->mm;
    3750             : 
    3751           0 :         ret = io_allocate_scq_urings(ctx, p);
    3752           0 :         if (ret)
    3753             :                 goto err;
    3754             : 
    3755           0 :         ret = io_sq_offload_create(ctx, p);
    3756           0 :         if (ret)
    3757             :                 goto err;
    3758             :         /* always set a rsrc node */
    3759           0 :         ret = io_rsrc_node_switch_start(ctx);
    3760           0 :         if (ret)
    3761             :                 goto err;
    3762           0 :         io_rsrc_node_switch(ctx, NULL);
    3763             : 
    3764           0 :         memset(&p->sq_off, 0, sizeof(p->sq_off));
    3765           0 :         p->sq_off.head = offsetof(struct io_rings, sq.head);
    3766           0 :         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
    3767           0 :         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
    3768           0 :         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
    3769           0 :         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
    3770           0 :         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
    3771           0 :         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
    3772             : 
    3773           0 :         memset(&p->cq_off, 0, sizeof(p->cq_off));
    3774           0 :         p->cq_off.head = offsetof(struct io_rings, cq.head);
    3775           0 :         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
    3776           0 :         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
    3777           0 :         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
    3778           0 :         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
    3779           0 :         p->cq_off.cqes = offsetof(struct io_rings, cqes);
    3780           0 :         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
    3781             : 
    3782           0 :         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
    3783             :                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
    3784             :                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
    3785             :                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
    3786             :                         IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
    3787             :                         IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
    3788             :                         IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING;
    3789             : 
    3790           0 :         if (copy_to_user(params, p, sizeof(*p))) {
    3791             :                 ret = -EFAULT;
    3792             :                 goto err;
    3793             :         }
    3794             : 
    3795           0 :         if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
    3796           0 :             && !(ctx->flags & IORING_SETUP_R_DISABLED))
    3797           0 :                 WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
    3798             : 
    3799           0 :         file = io_uring_get_file(ctx);
    3800           0 :         if (IS_ERR(file)) {
    3801           0 :                 ret = PTR_ERR(file);
    3802           0 :                 goto err;
    3803             :         }
    3804             : 
    3805             :         /*
    3806             :          * Install ring fd as the very last thing, so we don't risk someone
    3807             :          * having closed it before we finish setup
    3808             :          */
    3809           0 :         ret = io_uring_install_fd(ctx, file);
    3810           0 :         if (ret < 0) {
    3811             :                 /* fput will clean it up */
    3812           0 :                 fput(file);
    3813           0 :                 return ret;
    3814             :         }
    3815             : 
    3816             :         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
    3817             :         return ret;
    3818             : err:
    3819           0 :         io_ring_ctx_wait_and_kill(ctx);
    3820           0 :         return ret;
    3821             : }
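
Per the flag checks in io_uring_create() above, IORING_SETUP_DEFER_TASKRUN is rejected unless IORING_SETUP_SINGLE_ISSUER is also set, and it cannot be combined with SQPOLL. A minimal liburing sketch, assuming a kernel and liburing recent enough to expose both flags:

#include <liburing.h>

/* Defer task work until the (single) submitter enters the kernel. */
static int setup_defer_taskrun(struct io_uring *ring)
{
        return io_uring_queue_init(64, ring,
                                   IORING_SETUP_SINGLE_ISSUER |
                                   IORING_SETUP_DEFER_TASKRUN);
}
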
    3822             : 
    3823             : /*
    3824             :  * Sets up an io_uring context and returns the fd. The application asks for a
    3825             :  * ring size, we return the actual sq/cq ring sizes (among other things) in the
    3826             :  * params structure passed in.
    3827             :  */
    3828           0 : static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
    3829             : {
    3830             :         struct io_uring_params p;
    3831             :         int i;
    3832             : 
    3833           0 :         if (copy_from_user(&p, params, sizeof(p)))
    3834             :                 return -EFAULT;
    3835           0 :         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
    3836           0 :                 if (p.resv[i])
    3837             :                         return -EINVAL;
    3838             :         }
    3839             : 
    3840           0 :         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
    3841             :                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
    3842             :                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
    3843             :                         IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
    3844             :                         IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
    3845             :                         IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
    3846             :                         IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN))
    3847             :                 return -EINVAL;
    3848             : 
    3849           0 :         return io_uring_create(entries, &p, params);
    3850             : }
    3851             : 
    3852           0 : SYSCALL_DEFINE2(io_uring_setup, u32, entries,
    3853             :                 struct io_uring_params __user *, params)
    3854             : {
    3855           0 :         return io_uring_setup(entries, params);
    3856             : }
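
io_uring_create() rounds the requested SQ size up to a power of two and, unless IORING_SETUP_CQSIZE is given, sizes the CQ ring at twice the SQ ring; IORING_SETUP_CLAMP caps oversized requests instead of failing them. A raw-syscall sketch (syscall numbers assumed available) that requests an explicit CQ size and reads back the sizes the kernel actually chose:

#include <linux/io_uring.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct io_uring_params p;
        int fd;

        memset(&p, 0, sizeof(p));
        p.flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
        p.cq_entries = 4096;    /* must end up >= sq_entries */

        fd = syscall(__NR_io_uring_setup, 100, &p);     /* sq rounds up to 128 */
        if (fd < 0)
                return 1;

        /* the kernel reports the actual power-of-two ring sizes back */
        printf("sq_entries=%u cq_entries=%u\n", p.sq_entries, p.cq_entries);
        close(fd);
        return 0;
}
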
    3857             : 
    3858           0 : static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
    3859             :                            unsigned nr_args)
    3860             : {
    3861             :         struct io_uring_probe *p;
    3862             :         size_t size;
    3863             :         int i, ret;
    3864             : 
    3865           0 :         size = struct_size(p, ops, nr_args);
    3866           0 :         if (size == SIZE_MAX)
    3867             :                 return -EOVERFLOW;
    3868           0 :         p = kzalloc(size, GFP_KERNEL);
    3869           0 :         if (!p)
    3870             :                 return -ENOMEM;
    3871             : 
    3872           0 :         ret = -EFAULT;
    3873           0 :         if (copy_from_user(p, arg, size))
    3874             :                 goto out;
    3875           0 :         ret = -EINVAL;
    3876           0 :         if (memchr_inv(p, 0, size))
    3877             :                 goto out;
    3878             : 
    3879           0 :         p->last_op = IORING_OP_LAST - 1;
    3880           0 :         if (nr_args > IORING_OP_LAST)
    3881           0 :                 nr_args = IORING_OP_LAST;
    3882             : 
    3883           0 :         for (i = 0; i < nr_args; i++) {
    3884           0 :                 p->ops[i].op = i;
    3885           0 :                 if (!io_issue_defs[i].not_supported)
    3886           0 :                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
    3887             :         }
    3888           0 :         p->ops_len = i;
    3889             : 
    3890           0 :         ret = 0;
    3891           0 :         if (copy_to_user(arg, p, size))
    3892           0 :                 ret = -EFAULT;
    3893             : out:
    3894           0 :         kfree(p);
    3895             :         return ret;
    3896             : }
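
io_probe() above reports, per opcode, whether IO_URING_OP_SUPPORTED is set. From userspace this is reached through the IORING_REGISTER_PROBE registration opcode; the sketch below assumes liburing's io_uring_get_probe() and io_uring_opcode_supported() helpers are available:

#include <liburing.h>
#include <stdio.h>

int main(void)
{
        struct io_uring_probe *probe;

        /* allocates a probe and fills it via IORING_REGISTER_PROBE */
        probe = io_uring_get_probe();
        if (!probe)
                return 1;

        printf("IORING_OP_READV supported: %d\n",
               io_uring_opcode_supported(probe, IORING_OP_READV));

        io_uring_free_probe(probe);
        return 0;
}
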
    3897             : 
    3898           0 : static int io_register_personality(struct io_ring_ctx *ctx)
    3899             : {
    3900             :         const struct cred *creds;
    3901             :         u32 id;
    3902             :         int ret;
    3903             : 
    3904           0 :         creds = get_current_cred();
    3905             : 
    3906           0 :         ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
    3907           0 :                         XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
    3908           0 :         if (ret < 0) {
    3909             :                 put_cred(creds);
    3910             :                 return ret;
    3911             :         }
    3912           0 :         return id;
    3913             : }
    3914             : 
    3915           0 : static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
    3916             :                                            void __user *arg, unsigned int nr_args)
    3917             : {
    3918             :         struct io_uring_restriction *res;
    3919             :         size_t size;
    3920             :         int i, ret;
    3921             : 
    3922             :         /* Restrictions allowed only if rings started disabled */
    3923           0 :         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
    3924             :                 return -EBADFD;
    3925             : 
    3926             :         /* We allow only a single restrictions registration */
    3927           0 :         if (ctx->restrictions.registered)
    3928             :                 return -EBUSY;
    3929             : 
    3930           0 :         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
    3931             :                 return -EINVAL;
    3932             : 
    3933           0 :         size = array_size(nr_args, sizeof(*res));
    3934           0 :         if (size == SIZE_MAX)
    3935             :                 return -EOVERFLOW;
    3936             : 
    3937           0 :         res = memdup_user(arg, size);
    3938           0 :         if (IS_ERR(res))
    3939           0 :                 return PTR_ERR(res);
    3940             : 
    3941             :         ret = 0;
    3942             : 
    3943           0 :         for (i = 0; i < nr_args; i++) {
    3944           0 :                 switch (res[i].opcode) {
    3945             :                 case IORING_RESTRICTION_REGISTER_OP:
    3946           0 :                         if (res[i].register_op >= IORING_REGISTER_LAST) {
    3947             :                                 ret = -EINVAL;
    3948             :                                 goto out;
    3949             :                         }
    3950             : 
    3951           0 :                         __set_bit(res[i].register_op,
    3952             :                                   ctx->restrictions.register_op);
    3953             :                         break;
    3954             :                 case IORING_RESTRICTION_SQE_OP:
    3955           0 :                         if (res[i].sqe_op >= IORING_OP_LAST) {
    3956             :                                 ret = -EINVAL;
    3957             :                                 goto out;
    3958             :                         }
    3959             : 
    3960           0 :                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
    3961             :                         break;
    3962             :                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
    3963           0 :                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
    3964           0 :                         break;
    3965             :                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
    3966           0 :                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
    3967           0 :                         break;
    3968             :                 default:
    3969             :                         ret = -EINVAL;
    3970             :                         goto out;
    3971             :                 }
    3972             :         }
    3973             : 
    3974             : out:
    3975             :         /* Reset all restrictions if an error happened */
    3976           0 :         if (ret != 0)
    3977           0 :                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
    3978             :         else
    3979           0 :                 ctx->restrictions.registered = true;
    3980             : 
    3981           0 :         kfree(res);
    3982           0 :         return ret;
    3983             : }
    3984             : 
    3985           0 : static int io_register_enable_rings(struct io_ring_ctx *ctx)
    3986             : {
    3987           0 :         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
    3988             :                 return -EBADFD;
    3989             : 
    3990           0 :         if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
    3991           0 :                 WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
    3992             :                 /*
     3993             :                  * Lazy activation attempts would fail if the ring was polled
     3994             :                  * before submitter_task was set.
    3995             :                  */
    3996           0 :                 if (wq_has_sleeper(&ctx->poll_wq))
    3997           0 :                         io_activate_pollwq(ctx);
    3998             :         }
    3999             : 
    4000           0 :         if (ctx->restrictions.registered)
    4001           0 :                 ctx->restricted = 1;
    4002             : 
    4003           0 :         ctx->flags &= ~IORING_SETUP_R_DISABLED;
    4004           0 :         if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
    4005           0 :                 wake_up(&ctx->sq_data->wait);
    4006             :         return 0;
    4007             : }
    4008             : 
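A hedged sketch of the sequence io_register_enable_rings() completes from the application side: create the ring with IORING_SETUP_R_DISABLED, perform any registrations (restrictions, files, buffers), then enable it. Per the dispatch in __io_uring_register() below, arg must be NULL and nr_args 0 for IORING_REGISTER_ENABLE_RINGS; liburing's io_uring_enable_rings() helper wraps the same call.

    /*
     * Sketch only: set up a disabled ring, do registrations, then enable it.
     * Error handling elided.
     */
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    static int setup_disabled_and_enable(unsigned entries)
    {
            struct io_uring_params p;
            int ring_fd;

            memset(&p, 0, sizeof(p));
            p.flags = IORING_SETUP_R_DISABLED;

            ring_fd = syscall(__NR_io_uring_setup, entries, &p);
            if (ring_fd < 0)
                    return ring_fd;

            /* ... register restrictions, files, buffers, etc. ... */

            /* arg must be NULL and nr_args 0 for ENABLE_RINGS */
            return syscall(__NR_io_uring_register, ring_fd,
                           IORING_REGISTER_ENABLE_RINGS, NULL, 0);
    }
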
    4009           0 : static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
    4010             :                                        void __user *arg, unsigned len)
    4011             : {
    4012           0 :         struct io_uring_task *tctx = current->io_uring;
    4013             :         cpumask_var_t new_mask;
    4014             :         int ret;
    4015             : 
    4016           0 :         if (!tctx || !tctx->io_wq)
    4017             :                 return -EINVAL;
    4018             : 
    4019           0 :         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
    4020             :                 return -ENOMEM;
    4021             : 
    4022           0 :         cpumask_clear(new_mask);
    4023           0 :         if (len > cpumask_size())
    4024           0 :                 len = cpumask_size();
    4025             : 
    4026             :         if (in_compat_syscall()) {
    4027             :                 ret = compat_get_bitmap(cpumask_bits(new_mask),
    4028             :                                         (const compat_ulong_t __user *)arg,
    4029             :                                         len * 8 /* CHAR_BIT */);
    4030             :         } else {
    4031           0 :                 ret = copy_from_user(new_mask, arg, len);
    4032             :         }
    4033             : 
    4034           0 :         if (ret) {
    4035             :                 free_cpumask_var(new_mask);
    4036             :                 return -EFAULT;
    4037             :         }
    4038             : 
    4039           0 :         ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
    4040           0 :         free_cpumask_var(new_mask);
    4041             :         return ret;
    4042             : }
    4043             : 
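A hedged userspace sketch of the affinity registration handled above: nr_args is the size of the CPU bitmap in bytes (the kernel copies at most cpumask_size() of it), and the calling task must already have an io-wq attached (tctx->io_wq). The cpu_set_t type here is a glibc convenience, not something the ABI requires.

    /*
     * Sketch only: pin this task's io-wq workers to CPU 0. The mask is a
     * plain bitmap; its byte length is passed as nr_args.
     */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    static int pin_iowq_to_cpu0(int ring_fd)
    {
            cpu_set_t mask;

            CPU_ZERO(&mask);
            CPU_SET(0, &mask);

            return syscall(__NR_io_uring_register, ring_fd,
                           IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
    }

Clearing the affinity again is IORING_UNREGISTER_IOWQ_AFF with a NULL arg and nr_args of 0, handled by io_unregister_iowq_aff() just below.
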
    4044           0 : static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
    4045             : {
    4046           0 :         struct io_uring_task *tctx = current->io_uring;
    4047             : 
    4048           0 :         if (!tctx || !tctx->io_wq)
    4049             :                 return -EINVAL;
    4050             : 
    4051           0 :         return io_wq_cpu_affinity(tctx->io_wq, NULL);
    4052             : }
    4053             : 
    4054           0 : static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
    4055             :                                                void __user *arg)
    4056             :         __must_hold(&ctx->uring_lock)
    4057             : {
    4058             :         struct io_tctx_node *node;
    4059           0 :         struct io_uring_task *tctx = NULL;
    4060           0 :         struct io_sq_data *sqd = NULL;
    4061             :         __u32 new_count[2];
    4062             :         int i, ret;
    4063             : 
    4064           0 :         if (copy_from_user(new_count, arg, sizeof(new_count)))
    4065             :                 return -EFAULT;
    4066           0 :         for (i = 0; i < ARRAY_SIZE(new_count); i++)
    4067           0 :                 if (new_count[i] > INT_MAX)
    4068             :                         return -EINVAL;
    4069             : 
    4070           0 :         if (ctx->flags & IORING_SETUP_SQPOLL) {
    4071           0 :                 sqd = ctx->sq_data;
    4072           0 :                 if (sqd) {
    4073             :                         /*
    4074             :                          * Observe the correct sqd->lock -> ctx->uring_lock
     4075             :                          * ordering. It is fine to drop uring_lock here, since
     4076             :                          * we hold a ref to the ctx.
    4077             :                          */
    4078           0 :                         refcount_inc(&sqd->refs);
    4079           0 :                         mutex_unlock(&ctx->uring_lock);
    4080           0 :                         mutex_lock(&sqd->lock);
    4081           0 :                         mutex_lock(&ctx->uring_lock);
    4082           0 :                         if (sqd->thread)
    4083           0 :                                 tctx = sqd->thread->io_uring;
    4084             :                 }
    4085             :         } else {
    4086           0 :                 tctx = current->io_uring;
    4087             :         }
    4088             : 
    4089             :         BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
    4090             : 
    4091           0 :         for (i = 0; i < ARRAY_SIZE(new_count); i++)
    4092           0 :                 if (new_count[i])
    4093           0 :                         ctx->iowq_limits[i] = new_count[i];
    4094           0 :         ctx->iowq_limits_set = true;
    4095             : 
    4096           0 :         if (tctx && tctx->io_wq) {
    4097           0 :                 ret = io_wq_max_workers(tctx->io_wq, new_count);
    4098           0 :                 if (ret)
    4099             :                         goto err;
    4100             :         } else {
    4101           0 :                 memset(new_count, 0, sizeof(new_count));
    4102             :         }
    4103             : 
    4104           0 :         if (sqd) {
    4105           0 :                 mutex_unlock(&sqd->lock);
    4106           0 :                 io_put_sq_data(sqd);
    4107             :         }
    4108             : 
    4109           0 :         if (copy_to_user(arg, new_count, sizeof(new_count)))
    4110             :                 return -EFAULT;
    4111             : 
    4112             :         /* that's it for SQPOLL, only the SQPOLL task creates requests */
    4113           0 :         if (sqd)
    4114             :                 return 0;
    4115             : 
    4116             :         /* now propagate the restriction to all registered users */
    4117           0 :         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
    4118           0 :                 struct io_uring_task *tctx = node->task->io_uring;
    4119             : 
    4120           0 :                 if (WARN_ON_ONCE(!tctx->io_wq))
    4121           0 :                         continue;
    4122             : 
    4123           0 :                 for (i = 0; i < ARRAY_SIZE(new_count); i++)
    4124           0 :                         new_count[i] = ctx->iowq_limits[i];
    4125             :                 /* ignore errors, it always returns zero anyway */
    4126           0 :                 (void)io_wq_max_workers(tctx->io_wq, new_count);
    4127             :         }
    4128             :         return 0;
    4129             : err:
    4130           0 :         if (sqd) {
    4131           0 :                 mutex_unlock(&sqd->lock);
    4132           0 :                 io_put_sq_data(sqd);
    4133             :         }
    4134             :         return ret;
    4135             : }
    4136             : 
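A hedged sketch of the worker-limit interface implemented above: the two-element array is {bounded, unbounded}, a zero entry leaves that limit untouched, and the values in effect before the call are copied back to userspace, so passing {0, 0} acts as a pure query. Per the dispatch below, nr_args must be 2.

    /*
     * Sketch only: cap this ring's io-wq at 4 bounded and 2 unbounded
     * workers. On success counts[] holds the previous limits.
     */
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    static int cap_iowq_workers(int ring_fd)
    {
            __u32 counts[2] = { 4, 2 };     /* [0] = bounded, [1] = unbounded */
            int ret;

            ret = syscall(__NR_io_uring_register, ring_fd,
                          IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
            /* on success, counts[] now holds the limits in effect before */
            return ret;
    }
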
    4137           0 : static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
    4138             :                                void __user *arg, unsigned nr_args)
    4139             :         __releases(ctx->uring_lock)
    4140             :         __acquires(ctx->uring_lock)
    4141             : {
    4142             :         int ret;
    4143             : 
    4144             :         /*
     4145             :          * We no longer quiesce the refs for register, so the ctx can't be
     4146             :          * dying while we hold a file reference here.
    4147             :          */
    4148           0 :         if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
    4149             :                 return -ENXIO;
    4150             : 
    4151           0 :         if (ctx->submitter_task && ctx->submitter_task != current)
    4152             :                 return -EEXIST;
    4153             : 
    4154           0 :         if (ctx->restricted) {
    4155           0 :                 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
    4156           0 :                 if (!test_bit(opcode, ctx->restrictions.register_op))
    4157             :                         return -EACCES;
    4158             :         }
    4159             : 
    4160           0 :         switch (opcode) {
    4161             :         case IORING_REGISTER_BUFFERS:
    4162           0 :                 ret = -EFAULT;
    4163           0 :                 if (!arg)
    4164             :                         break;
    4165           0 :                 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
    4166           0 :                 break;
    4167             :         case IORING_UNREGISTER_BUFFERS:
    4168           0 :                 ret = -EINVAL;
    4169           0 :                 if (arg || nr_args)
    4170             :                         break;
    4171           0 :                 ret = io_sqe_buffers_unregister(ctx);
    4172           0 :                 break;
    4173             :         case IORING_REGISTER_FILES:
    4174           0 :                 ret = -EFAULT;
    4175           0 :                 if (!arg)
    4176             :                         break;
    4177           0 :                 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
    4178           0 :                 break;
    4179             :         case IORING_UNREGISTER_FILES:
    4180           0 :                 ret = -EINVAL;
    4181           0 :                 if (arg || nr_args)
    4182             :                         break;
    4183           0 :                 ret = io_sqe_files_unregister(ctx);
    4184           0 :                 break;
    4185             :         case IORING_REGISTER_FILES_UPDATE:
    4186           0 :                 ret = io_register_files_update(ctx, arg, nr_args);
    4187           0 :                 break;
    4188             :         case IORING_REGISTER_EVENTFD:
    4189           0 :                 ret = -EINVAL;
    4190           0 :                 if (nr_args != 1)
    4191             :                         break;
    4192           0 :                 ret = io_eventfd_register(ctx, arg, 0);
    4193           0 :                 break;
    4194             :         case IORING_REGISTER_EVENTFD_ASYNC:
    4195           0 :                 ret = -EINVAL;
    4196           0 :                 if (nr_args != 1)
    4197             :                         break;
    4198           0 :                 ret = io_eventfd_register(ctx, arg, 1);
    4199           0 :                 break;
    4200             :         case IORING_UNREGISTER_EVENTFD:
    4201           0 :                 ret = -EINVAL;
    4202           0 :                 if (arg || nr_args)
    4203             :                         break;
    4204           0 :                 ret = io_eventfd_unregister(ctx);
    4205           0 :                 break;
    4206             :         case IORING_REGISTER_PROBE:
    4207           0 :                 ret = -EINVAL;
    4208           0 :                 if (!arg || nr_args > 256)
    4209             :                         break;
    4210           0 :                 ret = io_probe(ctx, arg, nr_args);
    4211           0 :                 break;
    4212             :         case IORING_REGISTER_PERSONALITY:
    4213           0 :                 ret = -EINVAL;
    4214           0 :                 if (arg || nr_args)
    4215             :                         break;
    4216           0 :                 ret = io_register_personality(ctx);
    4217           0 :                 break;
    4218             :         case IORING_UNREGISTER_PERSONALITY:
    4219           0 :                 ret = -EINVAL;
    4220           0 :                 if (arg)
    4221             :                         break;
    4222           0 :                 ret = io_unregister_personality(ctx, nr_args);
    4223           0 :                 break;
    4224             :         case IORING_REGISTER_ENABLE_RINGS:
    4225           0 :                 ret = -EINVAL;
    4226           0 :                 if (arg || nr_args)
    4227             :                         break;
    4228           0 :                 ret = io_register_enable_rings(ctx);
    4229           0 :                 break;
    4230             :         case IORING_REGISTER_RESTRICTIONS:
    4231           0 :                 ret = io_register_restrictions(ctx, arg, nr_args);
    4232           0 :                 break;
    4233             :         case IORING_REGISTER_FILES2:
    4234           0 :                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
    4235           0 :                 break;
    4236             :         case IORING_REGISTER_FILES_UPDATE2:
    4237           0 :                 ret = io_register_rsrc_update(ctx, arg, nr_args,
    4238             :                                               IORING_RSRC_FILE);
    4239           0 :                 break;
    4240             :         case IORING_REGISTER_BUFFERS2:
    4241           0 :                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
    4242           0 :                 break;
    4243             :         case IORING_REGISTER_BUFFERS_UPDATE:
    4244           0 :                 ret = io_register_rsrc_update(ctx, arg, nr_args,
    4245             :                                               IORING_RSRC_BUFFER);
    4246           0 :                 break;
    4247             :         case IORING_REGISTER_IOWQ_AFF:
    4248           0 :                 ret = -EINVAL;
    4249           0 :                 if (!arg || !nr_args)
    4250             :                         break;
    4251           0 :                 ret = io_register_iowq_aff(ctx, arg, nr_args);
    4252           0 :                 break;
    4253             :         case IORING_UNREGISTER_IOWQ_AFF:
    4254           0 :                 ret = -EINVAL;
    4255           0 :                 if (arg || nr_args)
    4256             :                         break;
    4257           0 :                 ret = io_unregister_iowq_aff(ctx);
    4258           0 :                 break;
    4259             :         case IORING_REGISTER_IOWQ_MAX_WORKERS:
    4260           0 :                 ret = -EINVAL;
    4261           0 :                 if (!arg || nr_args != 2)
    4262             :                         break;
    4263           0 :                 ret = io_register_iowq_max_workers(ctx, arg);
    4264           0 :                 break;
    4265             :         case IORING_REGISTER_RING_FDS:
    4266           0 :                 ret = io_ringfd_register(ctx, arg, nr_args);
    4267           0 :                 break;
    4268             :         case IORING_UNREGISTER_RING_FDS:
    4269           0 :                 ret = io_ringfd_unregister(ctx, arg, nr_args);
    4270           0 :                 break;
    4271             :         case IORING_REGISTER_PBUF_RING:
    4272           0 :                 ret = -EINVAL;
    4273           0 :                 if (!arg || nr_args != 1)
    4274             :                         break;
    4275           0 :                 ret = io_register_pbuf_ring(ctx, arg);
    4276           0 :                 break;
    4277             :         case IORING_UNREGISTER_PBUF_RING:
    4278           0 :                 ret = -EINVAL;
    4279           0 :                 if (!arg || nr_args != 1)
    4280             :                         break;
    4281           0 :                 ret = io_unregister_pbuf_ring(ctx, arg);
    4282           0 :                 break;
    4283             :         case IORING_REGISTER_SYNC_CANCEL:
    4284           0 :                 ret = -EINVAL;
    4285           0 :                 if (!arg || nr_args != 1)
    4286             :                         break;
    4287           0 :                 ret = io_sync_cancel(ctx, arg);
    4288           0 :                 break;
    4289             :         case IORING_REGISTER_FILE_ALLOC_RANGE:
    4290           0 :                 ret = -EINVAL;
    4291           0 :                 if (!arg || nr_args)
    4292             :                         break;
    4293           0 :                 ret = io_register_file_alloc_range(ctx, arg);
    4294           0 :                 break;
    4295             :         default:
    4296             :                 ret = -EINVAL;
    4297             :                 break;
    4298             :         }
    4299             : 
    4300             :         return ret;
    4301             : }
    4302             : 
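To make the dispatch above concrete, here is a hedged sketch for one of the simpler opcodes, IORING_REGISTER_EVENTFD: nr_args must be 1 and arg points at the eventfd descriptor. The _ASYNC variant differs only in passing eventfd_async = 1 to io_eventfd_register(), as seen in the switch.

    /*
     * Sketch only: register an eventfd that is signalled when completions
     * are posted to this ring's CQ.
     */
    #include <unistd.h>
    #include <sys/eventfd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    static int register_cq_eventfd(int ring_fd)
    {
            int efd = eventfd(0, EFD_CLOEXEC);

            if (efd < 0)
                    return -1;

            return syscall(__NR_io_uring_register, ring_fd,
                           IORING_REGISTER_EVENTFD, &efd, 1);
    }
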
    4303           0 : SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
    4304             :                 void __user *, arg, unsigned int, nr_args)
    4305             : {
    4306             :         struct io_ring_ctx *ctx;
    4307           0 :         long ret = -EBADF;
    4308             :         struct fd f;
    4309             :         bool use_registered_ring;
    4310             : 
    4311           0 :         use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
    4312           0 :         opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
    4313             : 
    4314           0 :         if (opcode >= IORING_REGISTER_LAST)
    4315             :                 return -EINVAL;
    4316             : 
    4317           0 :         if (use_registered_ring) {
    4318             :                 /*
     4319             :                  * The ring fd has been registered via IORING_REGISTER_RING_FDS,
     4320             :                  * so we need only dereference our task-private array to find it.
    4321             :                  */
    4322           0 :                 struct io_uring_task *tctx = current->io_uring;
    4323             : 
    4324           0 :                 if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
    4325             :                         return -EINVAL;
    4326           0 :                 fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
    4327           0 :                 f.file = tctx->registered_rings[fd];
    4328           0 :                 f.flags = 0;
    4329           0 :                 if (unlikely(!f.file))
    4330             :                         return -EBADF;
    4331             :         } else {
    4332           0 :                 f = fdget(fd);
    4333           0 :                 if (unlikely(!f.file))
    4334             :                         return -EBADF;
    4335           0 :                 ret = -EOPNOTSUPP;
    4336           0 :                 if (!io_is_uring_fops(f.file))
    4337             :                         goto out_fput;
    4338             :         }
    4339             : 
    4340           0 :         ctx = f.file->private_data;
    4341             : 
    4342           0 :         mutex_lock(&ctx->uring_lock);
    4343           0 :         ret = __io_uring_register(ctx, opcode, arg, nr_args);
    4344           0 :         mutex_unlock(&ctx->uring_lock);
    4345           0 :         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
    4346             : out_fput:
    4347           0 :         fdput(f);
    4348             :         return ret;
    4349             : }
    4350             : 
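A hedged sketch of the registered-ring path decoded at the top of the syscall above: register the ring fd once with IORING_REGISTER_RING_FDS, then pass the returned index together with IORING_REGISTER_USE_REGISTERED_RING in place of the real fd, which skips the fdget()/fdput() on every register call. The registration is per task (it lives in current->io_uring). This assumes the usual UAPI convention that an offset of -1U asks the kernel to pick a free slot and write the chosen index back, and that the call returns the number of fds registered.

    /*
     * Sketch only: register the ring fd, then use the returned index plus
     * IORING_REGISTER_USE_REGISTERED_RING for later register calls.
     */
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    static int register_ring_fd(int ring_fd, unsigned int *reg_index)
    {
            struct io_uring_rsrc_update up;
            int ret;

            memset(&up, 0, sizeof(up));
            up.data = ring_fd;
            up.offset = -1U;        /* let the kernel pick a free slot */

            ret = syscall(__NR_io_uring_register, ring_fd,
                          IORING_REGISTER_RING_FDS, &up, 1);
            if (ret == 1)
                    *reg_index = up.offset;
            return ret;
    }

    /* later: the index stands in for the fd */
    static int enable_rings_via_index(unsigned int reg_index)
    {
            return syscall(__NR_io_uring_register, reg_index,
                           IORING_REGISTER_ENABLE_RINGS |
                           IORING_REGISTER_USE_REGISTERED_RING, NULL, 0);
    }
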
    4351           1 : static int __init io_uring_init(void)
    4352             : {
    4353             : #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
    4354             :         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
    4355             :         BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
    4356             : } while (0)
    4357             : 
    4358             : #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
    4359             :         __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
    4360             : #define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
    4361             :         __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)
    4362             :         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
    4363             :         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
    4364             :         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
    4365             :         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
    4366             :         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
    4367             :         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
    4368             :         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
    4369             :         BUILD_BUG_SQE_ELEM(8,  __u32,  cmd_op);
    4370             :         BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
    4371             :         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
    4372             :         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
    4373             :         BUILD_BUG_SQE_ELEM(24, __u32,  len);
    4374             :         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
    4375             :         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
    4376             :         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
    4377             :         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
    4378             :         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
    4379             :         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
    4380             :         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
    4381             :         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
    4382             :         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
    4383             :         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
    4384             :         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
    4385             :         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
    4386             :         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
    4387             :         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
    4388             :         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
    4389             :         BUILD_BUG_SQE_ELEM(28, __u32,  rename_flags);
    4390             :         BUILD_BUG_SQE_ELEM(28, __u32,  unlink_flags);
    4391             :         BUILD_BUG_SQE_ELEM(28, __u32,  hardlink_flags);
    4392             :         BUILD_BUG_SQE_ELEM(28, __u32,  xattr_flags);
    4393             :         BUILD_BUG_SQE_ELEM(28, __u32,  msg_ring_flags);
    4394             :         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
    4395             :         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
    4396             :         BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
    4397             :         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
    4398             :         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
    4399             :         BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
    4400             :         BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
    4401             :         BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
    4402             :         BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
    4403             :         BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
    4404             :         BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
    4405             : 
    4406             :         BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
    4407             :                      sizeof(struct io_uring_rsrc_update));
    4408             :         BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
    4409             :                      sizeof(struct io_uring_rsrc_update2));
    4410             : 
    4411             :         /* ->buf_index is u16 */
    4412             :         BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
    4413             :         BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
    4414             :                      offsetof(struct io_uring_buf_ring, tail));
    4415             : 
    4416             :         /* should fit into one byte */
    4417             :         BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
    4418             :         BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
    4419             :         BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
    4420             : 
    4421             :         BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
    4422             : 
    4423             :         BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
    4424             : 
    4425           1 :         io_uring_optable_init();
    4426             : 
    4427           1 :         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
    4428             :                                 SLAB_ACCOUNT);
    4429           1 :         return 0;
    4430             : };
    4431             : __initcall(io_uring_init);
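
The BUILD_BUG_SQE_ELEM() checks in io_uring_init() above pin the SQE layout at compile time. The same technique can be reproduced in userspace with C11 _Static_assert against the UAPI header; the offsets below mirror a few of the ones asserted above and are shown only as an illustration.

    /* Userspace analogue of the layout checks in io_uring_init(). */
    #include <stddef.h>
    #include <linux/io_uring.h>

    _Static_assert(sizeof(struct io_uring_sqe) == 64, "SQE must stay 64 bytes");
    _Static_assert(offsetof(struct io_uring_sqe, opcode) == 0, "opcode at 0");
    _Static_assert(offsetof(struct io_uring_sqe, fd) == 4, "fd at 4");
    _Static_assert(offsetof(struct io_uring_sqe, off) == 8, "off at 8");
    _Static_assert(offsetof(struct io_uring_sqe, user_data) == 32, "user_data at 32");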

Generated by: LCOV version 1.14