Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Shared application/kernel submission and completion ring pairs, for
4 : * supporting fast/efficient IO.
5 : *
6 : * A note on the read/write ordering memory barriers that are matched between
7 : * the application and kernel side.
8 : *
9 : * After the application reads the CQ ring tail, it must use an
10 : * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 : * before writing the tail (using smp_load_acquire to read the tail will
12 : * do). It also needs a smp_mb() before updating CQ head (ordering the
13 : * entry load(s) with the head store), pairing with an implicit barrier
14 : * through a control-dependency in io_get_cqe (smp_store_release to
15 : * store head will do). Failure to do so could lead to reading invalid
16 : * CQ entries.
17 : *
18 : * Likewise, the application must use an appropriate smp_wmb() before
19 : * writing the SQ tail (ordering SQ entry stores with the tail store),
20 : * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 : * to store the tail will do). And it needs a barrier ordering the SQ
22 : * head load before writing new SQ entries (smp_load_acquire to read
23 : * head will do).
24 : *
25 : * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 : * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 : * updating the SQ tail; a full memory barrier smp_mb() is needed
28 : * between.
29 : *
30 : * Also see the examples in the liburing library:
31 : *
32 : * git://git.kernel.dk/liburing
33 : *
34 : * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 : * from data shared between the kernel and application. This is done both
36 : * for ordering purposes and to ensure that once a value is loaded from
37 : * data that the application could potentially modify, it remains stable.
38 : *
39 : * Copyright (C) 2018-2019 Jens Axboe
40 : * Copyright (c) 2018-2019 Christoph Hellwig
41 : */
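/*
 * Illustrative userspace sketch of the ring protocol described above; this is
 * an editorial example, not part of the kernel source. The ring field names
 * (khead, ktail, ring_mask, cqes, kflags) follow liburing's mmap'ed layout
 * and are assumptions here:
 *
 *	// Reap completions: acquire-load the tail published by the kernel,
 *	// read entries, then release-store the new head so that the entry
 *	// loads cannot be reordered after the head update.
 *	unsigned head = *cq->khead;
 *	unsigned tail = smp_load_acquire(cq->ktail);
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cq->cqes[head & *cq->ring_mask];
 *		handle_cqe(cqe);	// hypothetical consumer
 *		head++;
 *	}
 *	smp_store_release(cq->khead, head);
 *
 *	// Submit with IORING_SETUP_SQPOLL: release-store the new SQ tail,
 *	// then a full barrier before checking whether the poll thread must
 *	// be woken via io_uring_enter().
 *	smp_store_release(sq->ktail, new_tail);
 *	smp_mb();
 *	if (READ_ONCE(*sq->kflags) & IORING_SQ_NEED_WAKEUP)
 *		io_uring_enter(ring_fd, to_submit, 0, IORING_ENTER_SQ_WAKEUP, NULL);
 */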
42 : #include <linux/kernel.h>
43 : #include <linux/init.h>
44 : #include <linux/errno.h>
45 : #include <linux/syscalls.h>
46 : #include <net/compat.h>
47 : #include <linux/refcount.h>
48 : #include <linux/uio.h>
49 : #include <linux/bits.h>
50 :
51 : #include <linux/sched/signal.h>
52 : #include <linux/fs.h>
53 : #include <linux/file.h>
54 : #include <linux/fdtable.h>
55 : #include <linux/mm.h>
56 : #include <linux/mman.h>
57 : #include <linux/percpu.h>
58 : #include <linux/slab.h>
59 : #include <linux/bvec.h>
60 : #include <linux/net.h>
61 : #include <net/sock.h>
62 : #include <net/af_unix.h>
63 : #include <net/scm.h>
64 : #include <linux/anon_inodes.h>
65 : #include <linux/sched/mm.h>
66 : #include <linux/uaccess.h>
67 : #include <linux/nospec.h>
68 : #include <linux/highmem.h>
69 : #include <linux/fsnotify.h>
70 : #include <linux/fadvise.h>
71 : #include <linux/task_work.h>
72 : #include <linux/io_uring.h>
73 : #include <linux/audit.h>
74 : #include <linux/security.h>
75 : #include <asm/shmparam.h>
76 :
77 : #define CREATE_TRACE_POINTS
78 : #include <trace/events/io_uring.h>
79 :
80 : #include <uapi/linux/io_uring.h>
81 :
82 : #include "io-wq.h"
83 :
84 : #include "io_uring.h"
85 : #include "opdef.h"
86 : #include "refs.h"
87 : #include "tctx.h"
88 : #include "sqpoll.h"
89 : #include "fdinfo.h"
90 : #include "kbuf.h"
91 : #include "rsrc.h"
92 : #include "cancel.h"
93 : #include "net.h"
94 : #include "notif.h"
95 :
96 : #include "timeout.h"
97 : #include "poll.h"
98 : #include "alloc_cache.h"
99 :
100 : #define IORING_MAX_ENTRIES 32768
101 : #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
102 :
103 : #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
104 : IORING_REGISTER_LAST + IORING_OP_LAST)
105 :
106 : #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
107 : IOSQE_IO_HARDLINK | IOSQE_ASYNC)
108 :
109 : #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
110 : IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
111 :
112 : #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
113 : REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
114 : REQ_F_ASYNC_DATA)
115 :
116 : #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
117 : IO_REQ_CLEAN_FLAGS)
118 :
119 : #define IO_TCTX_REFS_CACHE_NR (1U << 10)
120 :
121 : #define IO_COMPL_BATCH 32
122 : #define IO_REQ_ALLOC_BATCH 8
123 :
124 : enum {
125 : IO_CHECK_CQ_OVERFLOW_BIT,
126 : IO_CHECK_CQ_DROPPED_BIT,
127 : };
128 :
129 : enum {
130 : IO_EVENTFD_OP_SIGNAL_BIT,
131 : IO_EVENTFD_OP_FREE_BIT,
132 : };
133 :
134 : struct io_defer_entry {
135 : struct list_head list;
136 : struct io_kiocb *req;
137 : u32 seq;
138 : };
139 :
140 : /* requests with any of those set should undergo io_disarm_next() */
141 : #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
142 : #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
143 :
144 : static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
145 : struct task_struct *task,
146 : bool cancel_all);
147 :
148 : static void io_dismantle_req(struct io_kiocb *req);
149 : static void io_clean_op(struct io_kiocb *req);
150 : static void io_queue_sqe(struct io_kiocb *req);
151 : static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
152 : static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
153 : static __cold void io_fallback_tw(struct io_uring_task *tctx);
154 :
155 : struct kmem_cache *req_cachep;
156 :
157 0 : struct sock *io_uring_get_socket(struct file *file)
158 : {
159 : #if defined(CONFIG_UNIX)
160 : if (io_is_uring_fops(file)) {
161 : struct io_ring_ctx *ctx = file->private_data;
162 :
163 : return ctx->ring_sock->sk;
164 : }
165 : #endif
166 0 : return NULL;
167 : }
168 : EXPORT_SYMBOL(io_uring_get_socket);
169 :
170 : static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
171 : {
172 0 : if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
173 0 : ctx->submit_state.cqes_count)
174 0 : __io_submit_flush_completions(ctx);
175 : }
176 :
177 : static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
178 : {
179 0 : return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
180 : }
181 :
182 : static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
183 : {
184 0 : return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
185 : }
186 :
187 : static bool io_match_linked(struct io_kiocb *head)
188 : {
189 : struct io_kiocb *req;
190 :
191 0 : io_for_each_link(req, head) {
192 0 : if (req->flags & REQ_F_INFLIGHT)
193 : return true;
194 : }
195 : return false;
196 : }
197 :
198 : /*
199 : * As io_match_task() but protected against racing with linked timeouts.
200 : * User must not hold timeout_lock.
201 : */
202 0 : bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
203 : bool cancel_all)
204 : {
205 : bool matched;
206 :
207 0 : if (task && head->task != task)
208 : return false;
209 0 : if (cancel_all)
210 : return true;
211 :
212 0 : if (head->flags & REQ_F_LINK_TIMEOUT) {
213 0 : struct io_ring_ctx *ctx = head->ctx;
214 :
215 : /* protect against races with linked timeouts */
216 0 : spin_lock_irq(&ctx->timeout_lock);
217 0 : matched = io_match_linked(head);
218 0 : spin_unlock_irq(&ctx->timeout_lock);
219 : } else {
220 : matched = io_match_linked(head);
221 : }
222 : return matched;
223 : }
224 :
225 : static inline void req_fail_link_node(struct io_kiocb *req, int res)
226 : {
227 0 : req_set_fail(req);
228 0 : io_req_set_res(req, res, 0);
229 : }
230 :
231 : static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
232 : {
233 0 : wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
234 0 : kasan_poison_object_data(req_cachep, req);
235 : }
236 :
237 0 : static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
238 : {
239 0 : struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
240 :
241 0 : complete(&ctx->ref_comp);
242 0 : }
243 :
244 0 : static __cold void io_fallback_req_func(struct work_struct *work)
245 : {
246 0 : struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
247 : fallback_work.work);
248 0 : struct llist_node *node = llist_del_all(&ctx->fallback_llist);
249 : struct io_kiocb *req, *tmp;
250 0 : struct io_tw_state ts = { .locked = true, };
251 :
252 0 : mutex_lock(&ctx->uring_lock);
253 0 : llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
254 0 : req->io_task_work.func(req, &ts);
255 0 : if (WARN_ON_ONCE(!ts.locked))
256 0 : return;
257 0 : io_submit_flush_completions(ctx);
258 0 : mutex_unlock(&ctx->uring_lock);
259 : }
260 :
261 0 : static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
262 : {
263 0 : unsigned hash_buckets = 1U << bits;
264 0 : size_t hash_size = hash_buckets * sizeof(table->hbs[0]);
265 :
266 0 : table->hbs = kmalloc(hash_size, GFP_KERNEL);
267 0 : if (!table->hbs)
268 : return -ENOMEM;
269 :
270 0 : table->hash_bits = bits;
271 0 : init_hash_table(table, hash_buckets);
272 0 : return 0;
273 : }
274 :
275 0 : static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
276 : {
277 : struct io_ring_ctx *ctx;
278 : int hash_bits;
279 :
280 0 : ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
281 0 : if (!ctx)
282 : return NULL;
283 :
284 0 : xa_init(&ctx->io_bl_xa);
285 :
286 : /*
287 : * Use 5 bits less than the max cq entries; that should give us around
288 : * 32 entries per hash list if totally full and uniformly spread, but
289 : * don't keep too many buckets, so as not to overconsume memory.
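 * For example (illustrative numbers): with cq_entries = 4096, ilog2(4096) - 5 = 7,
 * so hash_bits stays 7 after clamping to [1, 8]: 128 buckets, or roughly
 * 4096 / 128 = 32 entries per bucket when the table is completely full.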
290 : */
291 0 : hash_bits = ilog2(p->cq_entries) - 5;
292 0 : hash_bits = clamp(hash_bits, 1, 8);
293 0 : if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
294 : goto err;
295 0 : if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
296 : goto err;
297 :
298 0 : ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
299 0 : if (!ctx->dummy_ubuf)
300 : goto err;
301 : /* set an invalid range, so that io_import_fixed() fails when it hits it */
302 0 : ctx->dummy_ubuf->ubuf = -1UL;
303 :
304 0 : if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
305 : 0, GFP_KERNEL))
306 : goto err;
307 :
308 0 : ctx->flags = p->flags;
309 0 : init_waitqueue_head(&ctx->sqo_sq_wait);
310 0 : INIT_LIST_HEAD(&ctx->sqd_list);
311 0 : INIT_LIST_HEAD(&ctx->cq_overflow_list);
312 0 : INIT_LIST_HEAD(&ctx->io_buffers_cache);
313 0 : io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
314 : sizeof(struct io_rsrc_node));
315 0 : io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
316 : sizeof(struct async_poll));
317 0 : io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
318 : sizeof(struct io_async_msghdr));
319 0 : init_completion(&ctx->ref_comp);
320 0 : xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
321 0 : mutex_init(&ctx->uring_lock);
322 0 : init_waitqueue_head(&ctx->cq_wait);
323 0 : init_waitqueue_head(&ctx->poll_wq);
324 0 : init_waitqueue_head(&ctx->rsrc_quiesce_wq);
325 0 : spin_lock_init(&ctx->completion_lock);
326 0 : spin_lock_init(&ctx->timeout_lock);
327 0 : INIT_WQ_LIST(&ctx->iopoll_list);
328 0 : INIT_LIST_HEAD(&ctx->io_buffers_pages);
329 0 : INIT_LIST_HEAD(&ctx->io_buffers_comp);
330 0 : INIT_LIST_HEAD(&ctx->defer_list);
331 0 : INIT_LIST_HEAD(&ctx->timeout_list);
332 0 : INIT_LIST_HEAD(&ctx->ltimeout_list);
333 0 : INIT_LIST_HEAD(&ctx->rsrc_ref_list);
334 0 : init_llist_head(&ctx->work_llist);
335 0 : INIT_LIST_HEAD(&ctx->tctx_list);
336 0 : ctx->submit_state.free_list.next = NULL;
337 0 : INIT_WQ_LIST(&ctx->locked_free_list);
338 0 : INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
339 0 : INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
340 : return ctx;
341 : err:
342 0 : kfree(ctx->dummy_ubuf);
343 0 : kfree(ctx->cancel_table.hbs);
344 0 : kfree(ctx->cancel_table_locked.hbs);
345 0 : kfree(ctx->io_bl);
346 0 : xa_destroy(&ctx->io_bl_xa);
347 0 : kfree(ctx);
348 : return NULL;
349 : }
350 :
351 : static void io_account_cq_overflow(struct io_ring_ctx *ctx)
352 : {
353 0 : struct io_rings *r = ctx->rings;
354 :
355 0 : WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
356 0 : ctx->cq_extra--;
357 : }
358 :
359 : static bool req_need_defer(struct io_kiocb *req, u32 seq)
360 : {
361 0 : if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
362 0 : struct io_ring_ctx *ctx = req->ctx;
363 :
364 0 : return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
365 : }
366 :
367 : return false;
368 : }
369 :
370 : static inline void io_req_track_inflight(struct io_kiocb *req)
371 : {
372 0 : if (!(req->flags & REQ_F_INFLIGHT)) {
373 0 : req->flags |= REQ_F_INFLIGHT;
374 0 : atomic_inc(&req->task->io_uring->inflight_tracked);
375 : }
376 : }
377 :
378 0 : static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
379 : {
380 0 : if (WARN_ON_ONCE(!req->link))
381 : return NULL;
382 :
383 0 : req->flags &= ~REQ_F_ARM_LTIMEOUT;
384 0 : req->flags |= REQ_F_LINK_TIMEOUT;
385 :
386 : /* linked timeouts should have two refs once prep'ed */
387 0 : io_req_set_refcount(req);
388 0 : __io_req_set_refcount(req->link, 2);
389 0 : return req->link;
390 : }
391 :
392 : static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
393 : {
394 0 : if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
395 : return NULL;
396 0 : return __io_prep_linked_timeout(req);
397 : }
398 :
399 0 : static noinline void __io_arm_ltimeout(struct io_kiocb *req)
400 : {
401 0 : io_queue_linked_timeout(__io_prep_linked_timeout(req));
402 0 : }
403 :
404 : static inline void io_arm_ltimeout(struct io_kiocb *req)
405 : {
406 0 : if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
407 0 : __io_arm_ltimeout(req);
408 : }
409 :
410 0 : static void io_prep_async_work(struct io_kiocb *req)
411 : {
412 0 : const struct io_issue_def *def = &io_issue_defs[req->opcode];
413 0 : struct io_ring_ctx *ctx = req->ctx;
414 :
415 0 : if (!(req->flags & REQ_F_CREDS)) {
416 0 : req->flags |= REQ_F_CREDS;
417 0 : req->creds = get_current_cred();
418 : }
419 :
420 0 : req->work.list.next = NULL;
421 0 : req->work.flags = 0;
422 0 : req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
423 0 : if (req->flags & REQ_F_FORCE_ASYNC)
424 0 : req->work.flags |= IO_WQ_WORK_CONCURRENT;
425 :
426 0 : if (req->file && !io_req_ffs_set(req))
427 0 : req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT;
428 :
429 0 : if (req->file && (req->flags & REQ_F_ISREG)) {
430 0 : bool should_hash = def->hash_reg_file;
431 :
432 : /* don't serialize this request if the fs doesn't need it */
433 0 : if (should_hash && (req->file->f_flags & O_DIRECT) &&
434 : (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
435 0 : should_hash = false;
436 0 : if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
437 0 : io_wq_hash_work(&req->work, file_inode(req->file));
438 0 : } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
439 0 : if (def->unbound_nonreg_file)
440 0 : req->work.flags |= IO_WQ_WORK_UNBOUND;
441 : }
442 0 : }
443 :
444 0 : static void io_prep_async_link(struct io_kiocb *req)
445 : {
446 : struct io_kiocb *cur;
447 :
448 0 : if (req->flags & REQ_F_LINK_TIMEOUT) {
449 0 : struct io_ring_ctx *ctx = req->ctx;
450 :
451 0 : spin_lock_irq(&ctx->timeout_lock);
452 0 : io_for_each_link(cur, req)
453 0 : io_prep_async_work(cur);
454 0 : spin_unlock_irq(&ctx->timeout_lock);
455 : } else {
456 0 : io_for_each_link(cur, req)
457 0 : io_prep_async_work(cur);
458 : }
459 0 : }
460 :
461 0 : void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use)
462 : {
463 0 : struct io_kiocb *link = io_prep_linked_timeout(req);
464 0 : struct io_uring_task *tctx = req->task->io_uring;
465 :
466 0 : BUG_ON(!tctx);
467 0 : BUG_ON(!tctx->io_wq);
468 :
469 : /* init ->work of the whole link before punting */
470 0 : io_prep_async_link(req);
471 :
472 : /*
473 : * Not expected to happen, but if we do have a bug where this _can_
474 : * happen, catch it here and ensure the request is marked as
475 : * canceled. That will make io-wq go through the usual work cancel
476 : * procedure rather than attempt to run this request (or create a new
477 : * worker for it).
478 : */
479 0 : if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
480 0 : req->work.flags |= IO_WQ_WORK_CANCEL;
481 :
482 0 : trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
483 0 : io_wq_enqueue(tctx->io_wq, &req->work);
484 0 : if (link)
485 0 : io_queue_linked_timeout(link);
486 0 : }
487 :
488 0 : static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
489 : {
490 0 : while (!list_empty(&ctx->defer_list)) {
491 0 : struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
492 : struct io_defer_entry, list);
493 :
494 0 : if (req_need_defer(de->req, de->seq))
495 : break;
496 0 : list_del_init(&de->list);
497 0 : io_req_task_queue(de->req);
498 0 : kfree(de);
499 : }
500 0 : }
501 :
502 :
503 0 : static void io_eventfd_ops(struct rcu_head *rcu)
504 : {
505 0 : struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
506 0 : int ops = atomic_xchg(&ev_fd->ops, 0);
507 :
508 0 : if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
509 0 : eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
510 :
511 : /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
512 : * ordering in a race but if references are 0 we know we have to free
513 : * it regardless.
514 : */
515 0 : if (atomic_dec_and_test(&ev_fd->refs)) {
516 0 : eventfd_ctx_put(ev_fd->cq_ev_fd);
517 0 : kfree(ev_fd);
518 : }
519 0 : }
520 :
521 0 : static void io_eventfd_signal(struct io_ring_ctx *ctx)
522 : {
523 0 : struct io_ev_fd *ev_fd = NULL;
524 :
525 : rcu_read_lock();
526 : /*
527 : * rcu_dereference ctx->io_ev_fd once and use it both for checking
528 : * and eventfd_signal
529 : */
530 0 : ev_fd = rcu_dereference(ctx->io_ev_fd);
531 :
532 : /*
533 : * Check again if ev_fd exists in case an io_eventfd_unregister call
534 : * completed between the NULL check of ctx->io_ev_fd at the start of
535 : * the function and rcu_read_lock.
536 : */
537 0 : if (unlikely(!ev_fd))
538 : goto out;
539 0 : if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
540 : goto out;
541 0 : if (ev_fd->eventfd_async && !io_wq_current_is_worker())
542 : goto out;
543 :
544 0 : if (likely(eventfd_signal_allowed())) {
545 0 : eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
546 : } else {
547 0 : atomic_inc(&ev_fd->refs);
548 0 : if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
549 0 : call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
550 : else
551 0 : atomic_dec(&ev_fd->refs);
552 : }
553 :
554 : out:
555 : rcu_read_unlock();
556 0 : }
557 :
558 : static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
559 : {
560 : bool skip;
561 :
562 0 : spin_lock(&ctx->completion_lock);
563 :
564 : /*
565 : * Eventfd should only get triggered when at least one event has been
566 : * posted. Some applications rely on the eventfd notification count
567 : * only changing IFF a new CQE has been added to the CQ ring. There's
568 : * no dependency on a 1:1 relationship between how many times this
569 : * function is called (and hence the eventfd count) and number of CQEs
570 : * posted to the CQ ring.
571 : */
572 0 : skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
573 0 : ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
574 0 : spin_unlock(&ctx->completion_lock);
575 0 : if (skip)
576 : return;
577 :
578 0 : io_eventfd_signal(ctx);
579 : }
580 :
581 0 : void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
582 : {
583 0 : if (ctx->poll_activated)
584 : io_poll_wq_wake(ctx);
585 0 : if (ctx->off_timeout_used)
586 0 : io_flush_timeouts(ctx);
587 0 : if (ctx->drain_active) {
588 0 : spin_lock(&ctx->completion_lock);
589 0 : io_queue_deferred(ctx);
590 0 : spin_unlock(&ctx->completion_lock);
591 : }
592 0 : if (ctx->has_evfd)
593 : io_eventfd_flush_signal(ctx);
594 0 : }
595 :
596 : static inline void __io_cq_lock(struct io_ring_ctx *ctx)
597 : __acquires(ctx->completion_lock)
598 : {
599 0 : if (!ctx->task_complete)
600 0 : spin_lock(&ctx->completion_lock);
601 : }
602 :
603 : static inline void __io_cq_unlock(struct io_ring_ctx *ctx)
604 : {
605 0 : if (!ctx->task_complete)
606 0 : spin_unlock(&ctx->completion_lock);
607 : }
608 :
609 : static inline void io_cq_lock(struct io_ring_ctx *ctx)
610 : __acquires(ctx->completion_lock)
611 : {
612 0 : spin_lock(&ctx->completion_lock);
613 : }
614 :
615 : static inline void io_cq_unlock(struct io_ring_ctx *ctx)
616 : __releases(ctx->completion_lock)
617 : {
618 0 : spin_unlock(&ctx->completion_lock);
619 : }
620 :
621 : /* keep it inlined for io_submit_flush_completions() */
622 0 : static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
623 : __releases(ctx->completion_lock)
624 : {
625 0 : io_commit_cqring(ctx);
626 0 : __io_cq_unlock(ctx);
627 0 : io_commit_cqring_flush(ctx);
628 0 : io_cqring_wake(ctx);
629 0 : }
630 :
631 0 : static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx)
632 : __releases(ctx->completion_lock)
633 : {
634 0 : io_commit_cqring(ctx);
635 :
636 0 : if (ctx->task_complete) {
637 : /*
638 : * ->task_complete implies that only current might be waiting
639 : * for CQEs, and obviously we are not waiting right now. No one is
640 : * waiting, so wakeups are futile; skip them.
641 : */
642 : io_commit_cqring_flush(ctx);
643 : } else {
644 0 : __io_cq_unlock(ctx);
645 0 : io_commit_cqring_flush(ctx);
646 : io_cqring_wake(ctx);
647 : }
648 0 : }
649 :
650 0 : void io_cq_unlock_post(struct io_ring_ctx *ctx)
651 : __releases(ctx->completion_lock)
652 : {
653 0 : io_commit_cqring(ctx);
654 0 : spin_unlock(&ctx->completion_lock);
655 0 : io_commit_cqring_flush(ctx);
656 0 : io_cqring_wake(ctx);
657 0 : }
658 :
659 : /* Discard all backlogged overflow entries without posting them to the CQ */
660 0 : static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
661 : {
662 : struct io_overflow_cqe *ocqe;
663 0 : LIST_HEAD(list);
664 :
665 0 : io_cq_lock(ctx);
666 0 : list_splice_init(&ctx->cq_overflow_list, &list);
667 0 : clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
668 0 : io_cq_unlock(ctx);
669 :
670 0 : while (!list_empty(&list)) {
671 0 : ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
672 0 : list_del(&ocqe->list);
673 0 : kfree(ocqe);
674 : }
675 0 : }
676 :
677 0 : static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
678 : {
679 0 : size_t cqe_size = sizeof(struct io_uring_cqe);
680 :
681 0 : if (__io_cqring_events(ctx) == ctx->cq_entries)
682 : return;
683 :
684 0 : if (ctx->flags & IORING_SETUP_CQE32)
685 0 : cqe_size <<= 1;
686 :
687 0 : io_cq_lock(ctx);
688 0 : while (!list_empty(&ctx->cq_overflow_list)) {
689 0 : struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
690 : struct io_overflow_cqe *ocqe;
691 :
692 0 : if (!cqe)
693 : break;
694 0 : ocqe = list_first_entry(&ctx->cq_overflow_list,
695 : struct io_overflow_cqe, list);
696 0 : memcpy(cqe, &ocqe->cqe, cqe_size);
697 0 : list_del(&ocqe->list);
698 0 : kfree(ocqe);
699 : }
700 :
701 0 : if (list_empty(&ctx->cq_overflow_list)) {
702 0 : clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
703 0 : atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
704 : }
705 0 : io_cq_unlock_post(ctx);
706 : }
707 :
708 0 : static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
709 : {
710 : /* iopoll syncs against uring_lock, not completion_lock */
711 0 : if (ctx->flags & IORING_SETUP_IOPOLL)
712 0 : mutex_lock(&ctx->uring_lock);
713 0 : __io_cqring_overflow_flush(ctx);
714 0 : if (ctx->flags & IORING_SETUP_IOPOLL)
715 0 : mutex_unlock(&ctx->uring_lock);
716 0 : }
717 :
718 0 : static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
719 : {
720 0 : if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
721 0 : io_cqring_do_overflow_flush(ctx);
722 0 : }
723 :
724 : /* can be called by any task */
725 0 : static void io_put_task_remote(struct task_struct *task, int nr)
726 : {
727 0 : struct io_uring_task *tctx = task->io_uring;
728 :
729 0 : percpu_counter_sub(&tctx->inflight, nr);
730 0 : if (unlikely(atomic_read(&tctx->in_cancel)))
731 0 : wake_up(&tctx->wait);
732 0 : put_task_struct_many(task, nr);
733 0 : }
734 :
735 : /* used by a task to put its own references */
736 : static void io_put_task_local(struct task_struct *task, int nr)
737 : {
738 0 : task->io_uring->cached_refs += nr;
739 : }
740 :
741 : /* must be called shortly after putting a request */
742 0 : static inline void io_put_task(struct task_struct *task, int nr)
743 : {
744 0 : if (likely(task == current))
745 0 : io_put_task_local(task, nr);
746 : else
747 0 : io_put_task_remote(task, nr);
748 0 : }
749 :
750 0 : void io_task_refs_refill(struct io_uring_task *tctx)
751 : {
752 0 : unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
753 :
754 0 : percpu_counter_add(&tctx->inflight, refill);
755 0 : refcount_add(refill, &current->usage);
756 0 : tctx->cached_refs += refill;
757 0 : }
758 :
759 0 : static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
760 : {
761 0 : struct io_uring_task *tctx = task->io_uring;
762 0 : unsigned int refs = tctx->cached_refs;
763 :
764 0 : if (refs) {
765 0 : tctx->cached_refs = 0;
766 0 : percpu_counter_sub(&tctx->inflight, refs);
767 0 : put_task_struct_many(task, refs);
768 : }
769 0 : }
770 :
771 0 : static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
772 : s32 res, u32 cflags, u64 extra1, u64 extra2)
773 : {
774 : struct io_overflow_cqe *ocqe;
775 0 : size_t ocq_size = sizeof(struct io_overflow_cqe);
776 0 : bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
777 :
778 : lockdep_assert_held(&ctx->completion_lock);
779 :
780 0 : if (is_cqe32)
781 0 : ocq_size += sizeof(struct io_uring_cqe);
782 :
783 0 : ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
784 0 : trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
785 0 : if (!ocqe) {
786 : /*
787 : * If we're in ring overflow flush mode, or in task cancel mode,
788 : * or cannot allocate an overflow entry, then we need to drop it
789 : * on the floor.
790 : */
791 0 : io_account_cq_overflow(ctx);
792 0 : set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
793 0 : return false;
794 : }
795 0 : if (list_empty(&ctx->cq_overflow_list)) {
796 0 : set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
797 0 : atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
798 :
799 : }
800 0 : ocqe->cqe.user_data = user_data;
801 0 : ocqe->cqe.res = res;
802 0 : ocqe->cqe.flags = cflags;
803 0 : if (is_cqe32) {
804 0 : ocqe->cqe.big_cqe[0] = extra1;
805 0 : ocqe->cqe.big_cqe[1] = extra2;
806 : }
807 0 : list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
808 0 : return true;
809 : }
810 :
811 0 : bool io_req_cqe_overflow(struct io_kiocb *req)
812 : {
813 0 : if (!(req->flags & REQ_F_CQE32_INIT)) {
814 0 : req->extra1 = 0;
815 0 : req->extra2 = 0;
816 : }
817 0 : return io_cqring_event_overflow(req->ctx, req->cqe.user_data,
818 : req->cqe.res, req->cqe.flags,
819 : req->extra1, req->extra2);
820 : }
821 :
822 : /*
823 : * writes to the cq entry need to come after reading head; the
824 : * control dependency is enough as we're using WRITE_ONCE to
825 : * fill the cq entry
826 : */
827 0 : struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
828 : {
829 0 : struct io_rings *rings = ctx->rings;
830 0 : unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
831 : unsigned int free, queued, len;
832 :
833 : /*
834 : * Posting into the CQ when there are pending overflowed CQEs may break
835 : * ordering guarantees, which will affect links, F_MORE users and more.
836 : * Force overflow the completion.
837 : */
838 0 : if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
839 : return NULL;
840 :
841 : /* userspace may cheat by modifying the tail; be safe and do min */
842 0 : queued = min(__io_cqring_events(ctx), ctx->cq_entries);
843 0 : free = ctx->cq_entries - queued;
844 : /* we need a contiguous range, limit based on the current array offset */
845 0 : len = min(free, ctx->cq_entries - off);
846 0 : if (!len)
847 : return NULL;
848 :
849 0 : if (ctx->flags & IORING_SETUP_CQE32) {
850 0 : off <<= 1;
851 0 : len <<= 1;
852 : }
853 :
854 0 : ctx->cqe_cached = &rings->cqes[off];
855 0 : ctx->cqe_sentinel = ctx->cqe_cached + len;
856 :
857 0 : ctx->cached_cq_tail++;
858 0 : ctx->cqe_cached++;
859 0 : if (ctx->flags & IORING_SETUP_CQE32)
860 0 : ctx->cqe_cached++;
861 : return &rings->cqes[off];
862 : }
863 :
864 0 : static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
865 : u32 cflags)
866 : {
867 : struct io_uring_cqe *cqe;
868 :
869 0 : ctx->cq_extra++;
870 :
871 : /*
872 : * If we can't get a cq entry, userspace overflowed the
873 : * submission (by quite a lot). Increment the overflow count in
874 : * the ring.
875 : */
876 0 : cqe = io_get_cqe(ctx);
877 0 : if (likely(cqe)) {
878 0 : trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
879 :
880 0 : WRITE_ONCE(cqe->user_data, user_data);
881 0 : WRITE_ONCE(cqe->res, res);
882 0 : WRITE_ONCE(cqe->flags, cflags);
883 :
884 0 : if (ctx->flags & IORING_SETUP_CQE32) {
885 0 : WRITE_ONCE(cqe->big_cqe[0], 0);
886 0 : WRITE_ONCE(cqe->big_cqe[1], 0);
887 : }
888 : return true;
889 : }
890 : return false;
891 : }
892 :
893 0 : static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
894 : __must_hold(&ctx->uring_lock)
895 : {
896 0 : struct io_submit_state *state = &ctx->submit_state;
897 : unsigned int i;
898 :
899 : lockdep_assert_held(&ctx->uring_lock);
900 0 : for (i = 0; i < state->cqes_count; i++) {
901 0 : struct io_uring_cqe *cqe = &state->cqes[i];
902 :
903 0 : if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
904 0 : if (ctx->task_complete) {
905 0 : spin_lock(&ctx->completion_lock);
906 0 : io_cqring_event_overflow(ctx, cqe->user_data,
907 : cqe->res, cqe->flags, 0, 0);
908 0 : spin_unlock(&ctx->completion_lock);
909 : } else {
910 0 : io_cqring_event_overflow(ctx, cqe->user_data,
911 : cqe->res, cqe->flags, 0, 0);
912 : }
913 : }
914 : }
915 0 : state->cqes_count = 0;
916 0 : }
917 :
918 0 : static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
919 : bool allow_overflow)
920 : {
921 : bool filled;
922 :
923 0 : io_cq_lock(ctx);
924 0 : filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
925 0 : if (!filled && allow_overflow)
926 0 : filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
927 :
928 0 : io_cq_unlock_post(ctx);
929 0 : return filled;
930 : }
931 :
932 0 : bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
933 : {
934 0 : return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
935 : }
936 :
937 0 : bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags,
938 : bool allow_overflow)
939 : {
940 : struct io_uring_cqe *cqe;
941 : unsigned int length;
942 :
943 0 : if (!defer)
944 0 : return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow);
945 :
946 0 : length = ARRAY_SIZE(ctx->submit_state.cqes);
947 :
948 : lockdep_assert_held(&ctx->uring_lock);
949 :
950 0 : if (ctx->submit_state.cqes_count == length) {
951 0 : __io_cq_lock(ctx);
952 0 : __io_flush_post_cqes(ctx);
953 : /* no need to flush - flush is deferred */
954 0 : __io_cq_unlock_post(ctx);
955 : }
956 :
957 : /* For deferred completions this is not as strict as it is otherwise,
958 : * however its main job is to prevent unbounded posted completions,
959 : * and in that it works just as well.
960 : */
961 0 : if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
962 : return false;
963 :
964 0 : cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++];
965 0 : cqe->user_data = user_data;
966 0 : cqe->res = res;
967 0 : cqe->flags = cflags;
968 0 : return true;
969 : }
970 :
971 0 : static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
972 : {
973 0 : struct io_ring_ctx *ctx = req->ctx;
974 0 : struct io_rsrc_node *rsrc_node = NULL;
975 :
976 0 : io_cq_lock(ctx);
977 0 : if (!(req->flags & REQ_F_CQE_SKIP))
978 0 : io_fill_cqe_req(ctx, req);
979 :
980 : /*
981 : * If we're the last reference to this request, add to our locked
982 : * free_list cache.
983 : */
984 0 : if (req_ref_put_and_test(req)) {
985 0 : if (req->flags & IO_REQ_LINK_FLAGS) {
986 0 : if (req->flags & IO_DISARM_MASK)
987 0 : io_disarm_next(req);
988 0 : if (req->link) {
989 0 : io_req_task_queue(req->link);
990 0 : req->link = NULL;
991 : }
992 : }
993 0 : io_put_kbuf_comp(req);
994 0 : io_dismantle_req(req);
995 0 : rsrc_node = req->rsrc_node;
996 : /*
997 : * Selected buffer deallocation in io_clean_op() assumes that
998 : * we don't hold ->completion_lock. Clean them here to avoid
999 : * deadlocks.
1000 : */
1001 0 : io_put_task_remote(req->task, 1);
1002 0 : wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
1003 0 : ctx->locked_free_nr++;
1004 : }
1005 0 : io_cq_unlock_post(ctx);
1006 :
1007 0 : if (rsrc_node) {
1008 0 : io_ring_submit_lock(ctx, issue_flags);
1009 0 : io_put_rsrc_node(ctx, rsrc_node);
1010 : io_ring_submit_unlock(ctx, issue_flags);
1011 : }
1012 0 : }
1013 :
1014 0 : void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
1015 : {
1016 0 : if (req->ctx->task_complete && req->ctx->submitter_task != current) {
1017 0 : req->io_task_work.func = io_req_task_complete;
1018 : io_req_task_work_add(req);
1019 0 : } else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
1020 0 : !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
1021 0 : __io_req_complete_post(req, issue_flags);
1022 : } else {
1023 0 : struct io_ring_ctx *ctx = req->ctx;
1024 :
1025 0 : mutex_lock(&ctx->uring_lock);
1026 0 : __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED);
1027 0 : mutex_unlock(&ctx->uring_lock);
1028 : }
1029 0 : }
1030 :
1031 0 : void io_req_defer_failed(struct io_kiocb *req, s32 res)
1032 : __must_hold(&ctx->uring_lock)
1033 : {
1034 0 : const struct io_cold_def *def = &io_cold_defs[req->opcode];
1035 :
1036 : lockdep_assert_held(&req->ctx->uring_lock);
1037 :
1038 0 : req_set_fail(req);
1039 0 : io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
1040 0 : if (def->fail)
1041 0 : def->fail(req);
1042 0 : io_req_complete_defer(req);
1043 0 : }
1044 :
1045 : /*
1046 : * Don't initialise the fields below on every allocation, but do that in
1047 : * advance and keep them valid across allocations.
1048 : */
1049 : static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
1050 : {
1051 0 : req->ctx = ctx;
1052 0 : req->link = NULL;
1053 0 : req->async_data = NULL;
1054 : /* not necessary, but safer to zero */
1055 0 : req->cqe.res = 0;
1056 : }
1057 :
1058 : static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
1059 : struct io_submit_state *state)
1060 : {
1061 0 : spin_lock(&ctx->completion_lock);
1062 0 : wq_list_splice(&ctx->locked_free_list, &state->free_list);
1063 0 : ctx->locked_free_nr = 0;
1064 0 : spin_unlock(&ctx->completion_lock);
1065 : }
1066 :
1067 : /*
1068 : * A request might get retired back into the request caches even before opcode
1069 : * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
1070 : * Because of that, io_alloc_req() should be called only under ->uring_lock
1071 : * and with extra caution to not get a request that is still worked on.
1072 : */
1073 0 : __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
1074 : __must_hold(&ctx->uring_lock)
1075 : {
1076 0 : gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1077 : void *reqs[IO_REQ_ALLOC_BATCH];
1078 : int ret, i;
1079 :
1080 : /*
1081 : * If we have more than a batch's worth of requests in our IRQ side
1082 : * locked cache, grab the lock and move them over to our submission
1083 : * side cache.
1084 : */
1085 0 : if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
1086 0 : io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
1087 0 : if (!io_req_cache_empty(ctx))
1088 : return true;
1089 : }
1090 :
1091 0 : ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
1092 :
1093 : /*
1094 : * Bulk alloc is all-or-nothing. If we fail to get a batch,
1095 : * retry single alloc to be on the safe side.
1096 : */
1097 0 : if (unlikely(ret <= 0)) {
1098 0 : reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1099 0 : if (!reqs[0])
1100 : return false;
1101 : ret = 1;
1102 : }
1103 :
1104 0 : percpu_ref_get_many(&ctx->refs, ret);
1105 0 : for (i = 0; i < ret; i++) {
1106 0 : struct io_kiocb *req = reqs[i];
1107 :
1108 0 : io_preinit_req(req, ctx);
1109 0 : io_req_add_to_cache(req, ctx);
1110 : }
1111 : return true;
1112 : }
1113 :
1114 0 : static inline void io_dismantle_req(struct io_kiocb *req)
1115 : {
1116 0 : unsigned int flags = req->flags;
1117 :
1118 0 : if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
1119 0 : io_clean_op(req);
1120 0 : if (!(flags & REQ_F_FIXED_FILE))
1121 0 : io_put_file(req->file);
1122 0 : }
1123 :
1124 0 : static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts)
1125 : {
1126 0 : struct io_ring_ctx *ctx = req->ctx;
1127 :
1128 0 : if (req->rsrc_node) {
1129 0 : io_tw_lock(ctx, ts);
1130 0 : io_put_rsrc_node(ctx, req->rsrc_node);
1131 : }
1132 0 : io_dismantle_req(req);
1133 0 : io_put_task_remote(req->task, 1);
1134 :
1135 0 : spin_lock(&ctx->completion_lock);
1136 0 : wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
1137 0 : ctx->locked_free_nr++;
1138 0 : spin_unlock(&ctx->completion_lock);
1139 0 : }
1140 :
1141 0 : __cold void io_free_req(struct io_kiocb *req)
1142 : {
1143 0 : req->io_task_work.func = io_free_req_tw;
1144 0 : io_req_task_work_add(req);
1145 0 : }
1146 :
1147 : static void __io_req_find_next_prep(struct io_kiocb *req)
1148 : {
1149 0 : struct io_ring_ctx *ctx = req->ctx;
1150 :
1151 0 : spin_lock(&ctx->completion_lock);
1152 0 : io_disarm_next(req);
1153 0 : spin_unlock(&ctx->completion_lock);
1154 : }
1155 :
1156 : static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1157 : {
1158 : struct io_kiocb *nxt;
1159 :
1160 : /*
1161 : * If LINK is set, we have dependent requests in this chain. If we
1162 : * didn't fail this request, queue the first one up, moving any other
1163 : * dependencies to the next request. In case of failure, fail the rest
1164 : * of the chain.
1165 : */
1166 0 : if (unlikely(req->flags & IO_DISARM_MASK))
1167 : __io_req_find_next_prep(req);
1168 0 : nxt = req->link;
1169 0 : req->link = NULL;
1170 : return nxt;
1171 : }
1172 :
1173 0 : static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
1174 : {
1175 0 : if (!ctx)
1176 : return;
1177 0 : if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1178 0 : atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1179 0 : if (ts->locked) {
1180 0 : io_submit_flush_completions(ctx);
1181 0 : mutex_unlock(&ctx->uring_lock);
1182 0 : ts->locked = false;
1183 : }
1184 0 : percpu_ref_put(&ctx->refs);
1185 : }
1186 :
1187 0 : static unsigned int handle_tw_list(struct llist_node *node,
1188 : struct io_ring_ctx **ctx,
1189 : struct io_tw_state *ts,
1190 : struct llist_node *last)
1191 : {
1192 0 : unsigned int count = 0;
1193 :
1194 0 : while (node && node != last) {
1195 0 : struct llist_node *next = node->next;
1196 0 : struct io_kiocb *req = container_of(node, struct io_kiocb,
1197 : io_task_work.node);
1198 :
1199 0 : prefetch(container_of(next, struct io_kiocb, io_task_work.node));
1200 :
1201 0 : if (req->ctx != *ctx) {
1202 0 : ctx_flush_and_put(*ctx, ts);
1203 0 : *ctx = req->ctx;
1204 : /* if not contended, grab and improve batching */
1205 0 : ts->locked = mutex_trylock(&(*ctx)->uring_lock);
1206 0 : percpu_ref_get(&(*ctx)->refs);
1207 : }
1208 0 : req->io_task_work.func(req, ts);
1209 0 : node = next;
1210 0 : count++;
1211 0 : if (unlikely(need_resched())) {
1212 0 : ctx_flush_and_put(*ctx, ts);
1213 0 : *ctx = NULL;
1214 0 : cond_resched();
1215 : }
1216 : }
1217 :
1218 0 : return count;
1219 : }
1220 :
1221 : /**
1222 : * io_llist_xchg - swap all entries in a lock-less list
1223 : * @head: the head of lock-less list to delete all entries
1224 : * @new: new entry as the head of the list
1225 : *
1226 : * If list is empty, return NULL, otherwise, return the pointer to the first entry.
1227 : * The order of entries returned is from the newest to the oldest added one.
1228 : */
1229 : static inline struct llist_node *io_llist_xchg(struct llist_head *head,
1230 : struct llist_node *new)
1231 : {
1232 0 : return xchg(&head->first, new);
1233 : }
1234 :
1235 : /**
1236 : * io_llist_cmpxchg - possibly swap all entries in a lock-less list
1237 : * @head: the head of lock-less list to delete all entries
1238 : * @old: expected old value of the first entry of the list
1239 : * @new: new entry as the head of the list
1240 : *
1241 : * perform a cmpxchg on the first entry of the list.
1242 : */
1243 :
1244 : static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
1245 : struct llist_node *old,
1246 : struct llist_node *new)
1247 : {
1248 0 : return cmpxchg(&head->first, old, new);
1249 : }
1250 :
1251 0 : void tctx_task_work(struct callback_head *cb)
1252 : {
1253 0 : struct io_tw_state ts = {};
1254 0 : struct io_ring_ctx *ctx = NULL;
1255 0 : struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
1256 : task_work);
1257 0 : struct llist_node fake = {};
1258 : struct llist_node *node;
1259 0 : unsigned int loops = 0;
1260 0 : unsigned int count = 0;
1261 :
1262 0 : if (unlikely(current->flags & PF_EXITING)) {
1263 0 : io_fallback_tw(tctx);
1264 0 : return;
1265 : }
1266 :
1267 : do {
1268 0 : loops++;
1269 0 : node = io_llist_xchg(&tctx->task_list, &fake);
1270 0 : count += handle_tw_list(node, &ctx, &ts, &fake);
1271 :
1272 : /* skip expensive cmpxchg if there are items in the list */
1273 0 : if (READ_ONCE(tctx->task_list.first) != &fake)
1274 0 : continue;
1275 0 : if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
1276 0 : io_submit_flush_completions(ctx);
1277 0 : if (READ_ONCE(tctx->task_list.first) != &fake)
1278 0 : continue;
1279 : }
1280 0 : node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
1281 0 : } while (node != &fake);
1282 :
1283 0 : ctx_flush_and_put(ctx, &ts);
1284 :
1285 : /* relaxed read is enough as only the task itself sets ->in_cancel */
1286 0 : if (unlikely(atomic_read(&tctx->in_cancel)))
1287 0 : io_uring_drop_tctx_refs(current);
1288 :
1289 0 : trace_io_uring_task_work_run(tctx, count, loops);
1290 : }
1291 :
1292 0 : static __cold void io_fallback_tw(struct io_uring_task *tctx)
1293 : {
1294 0 : struct llist_node *node = llist_del_all(&tctx->task_list);
1295 : struct io_kiocb *req;
1296 :
1297 0 : while (node) {
1298 0 : req = container_of(node, struct io_kiocb, io_task_work.node);
1299 0 : node = node->next;
1300 0 : if (llist_add(&req->io_task_work.node,
1301 0 : &req->ctx->fallback_llist))
1302 0 : schedule_delayed_work(&req->ctx->fallback_work, 1);
1303 : }
1304 0 : }
1305 :
1306 0 : static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
1307 : {
1308 0 : struct io_ring_ctx *ctx = req->ctx;
1309 : unsigned nr_wait, nr_tw, nr_tw_prev;
1310 : struct llist_node *first;
1311 :
1312 0 : if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
1313 0 : flags &= ~IOU_F_TWQ_LAZY_WAKE;
1314 :
1315 0 : first = READ_ONCE(ctx->work_llist.first);
1316 : do {
1317 0 : nr_tw_prev = 0;
1318 0 : if (first) {
1319 0 : struct io_kiocb *first_req = container_of(first,
1320 : struct io_kiocb,
1321 : io_task_work.node);
1322 : /*
1323 : * Might be executed at any moment, rely on
1324 : * SLAB_TYPESAFE_BY_RCU to keep it alive.
1325 : */
1326 0 : nr_tw_prev = READ_ONCE(first_req->nr_tw);
1327 : }
1328 0 : nr_tw = nr_tw_prev + 1;
1329 : /* Large enough to fail the nr_wait comparison below */
1330 0 : if (!(flags & IOU_F_TWQ_LAZY_WAKE))
1331 0 : nr_tw = -1U;
1332 :
1333 0 : req->nr_tw = nr_tw;
1334 0 : req->io_task_work.node.next = first;
1335 0 : } while (!try_cmpxchg(&ctx->work_llist.first, &first,
1336 : &req->io_task_work.node));
1337 :
1338 0 : if (!first) {
1339 0 : if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1340 0 : atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1341 0 : if (ctx->has_evfd)
1342 0 : io_eventfd_signal(ctx);
1343 : }
1344 :
1345 0 : nr_wait = atomic_read(&ctx->cq_wait_nr);
1346 : /* no one is waiting */
1347 0 : if (!nr_wait)
1348 : return;
1349 : /* either not enough or the previous add has already woken it up */
1350 0 : if (nr_wait > nr_tw || nr_tw_prev >= nr_wait)
1351 : return;
1352 : /* pairs with set_current_state() in io_cqring_wait() */
1353 0 : smp_mb__after_atomic();
1354 0 : wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
1355 : }
1356 :
1357 0 : void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
1358 : {
1359 0 : struct io_uring_task *tctx = req->task->io_uring;
1360 0 : struct io_ring_ctx *ctx = req->ctx;
1361 :
1362 0 : if (!(flags & IOU_F_TWQ_FORCE_NORMAL) &&
1363 0 : (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
1364 : rcu_read_lock();
1365 0 : io_req_local_work_add(req, flags);
1366 : rcu_read_unlock();
1367 : return;
1368 : }
1369 :
1370 : /* task_work already pending, we're done */
1371 0 : if (!llist_add(&req->io_task_work.node, &tctx->task_list))
1372 : return;
1373 :
1374 0 : if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1375 0 : atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1376 :
1377 0 : if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
1378 : return;
1379 :
1380 0 : io_fallback_tw(tctx);
1381 : }
1382 :
1383 0 : static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
1384 : {
1385 : struct llist_node *node;
1386 :
1387 0 : node = llist_del_all(&ctx->work_llist);
1388 0 : while (node) {
1389 0 : struct io_kiocb *req = container_of(node, struct io_kiocb,
1390 : io_task_work.node);
1391 :
1392 0 : node = node->next;
1393 0 : __io_req_task_work_add(req, IOU_F_TWQ_FORCE_NORMAL);
1394 : }
1395 0 : }
1396 :
1397 0 : static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
1398 : {
1399 : struct llist_node *node;
1400 0 : unsigned int loops = 0;
1401 0 : int ret = 0;
1402 :
1403 0 : if (WARN_ON_ONCE(ctx->submitter_task != current))
1404 : return -EEXIST;
1405 0 : if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1406 0 : atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1407 : again:
1408 0 : node = io_llist_xchg(&ctx->work_llist, NULL);
1409 0 : while (node) {
1410 0 : struct llist_node *next = node->next;
1411 0 : struct io_kiocb *req = container_of(node, struct io_kiocb,
1412 : io_task_work.node);
1413 0 : prefetch(container_of(next, struct io_kiocb, io_task_work.node));
1414 0 : req->io_task_work.func(req, ts);
1415 0 : ret++;
1416 0 : node = next;
1417 : }
1418 0 : loops++;
1419 :
1420 0 : if (!llist_empty(&ctx->work_llist))
1421 : goto again;
1422 0 : if (ts->locked) {
1423 0 : io_submit_flush_completions(ctx);
1424 0 : if (!llist_empty(&ctx->work_llist))
1425 : goto again;
1426 : }
1427 : trace_io_uring_local_work_run(ctx, ret, loops);
1428 : return ret;
1429 : }
1430 :
1431 0 : static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
1432 : {
1433 0 : struct io_tw_state ts = { .locked = true, };
1434 : int ret;
1435 :
1436 0 : if (llist_empty(&ctx->work_llist))
1437 : return 0;
1438 :
1439 0 : ret = __io_run_local_work(ctx, &ts);
1440 : /* shouldn't happen! */
1441 0 : if (WARN_ON_ONCE(!ts.locked))
1442 0 : mutex_lock(&ctx->uring_lock);
1443 : return ret;
1444 : }
1445 :
1446 0 : static int io_run_local_work(struct io_ring_ctx *ctx)
1447 : {
1448 0 : struct io_tw_state ts = {};
1449 : int ret;
1450 :
1451 0 : ts.locked = mutex_trylock(&ctx->uring_lock);
1452 0 : ret = __io_run_local_work(ctx, &ts);
1453 0 : if (ts.locked)
1454 0 : mutex_unlock(&ctx->uring_lock);
1455 :
1456 0 : return ret;
1457 : }
1458 :
1459 0 : static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
1460 : {
1461 0 : io_tw_lock(req->ctx, ts);
1462 0 : io_req_defer_failed(req, req->cqe.res);
1463 0 : }
1464 :
1465 0 : void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
1466 : {
1467 0 : io_tw_lock(req->ctx, ts);
1468 : /* req->task == current here, checking PF_EXITING is safe */
1469 0 : if (unlikely(req->task->flags & PF_EXITING))
1470 0 : io_req_defer_failed(req, -EFAULT);
1471 0 : else if (req->flags & REQ_F_FORCE_ASYNC)
1472 0 : io_queue_iowq(req, ts);
1473 : else
1474 0 : io_queue_sqe(req);
1475 0 : }
1476 :
1477 0 : void io_req_task_queue_fail(struct io_kiocb *req, int ret)
1478 : {
1479 0 : io_req_set_res(req, ret, 0);
1480 0 : req->io_task_work.func = io_req_task_cancel;
1481 0 : io_req_task_work_add(req);
1482 0 : }
1483 :
1484 0 : void io_req_task_queue(struct io_kiocb *req)
1485 : {
1486 0 : req->io_task_work.func = io_req_task_submit;
1487 0 : io_req_task_work_add(req);
1488 0 : }
1489 :
1490 0 : void io_queue_next(struct io_kiocb *req)
1491 : {
1492 0 : struct io_kiocb *nxt = io_req_find_next(req);
1493 :
1494 0 : if (nxt)
1495 : io_req_task_queue(nxt);
1496 0 : }
1497 :
1498 0 : void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
1499 : __must_hold(&ctx->uring_lock)
1500 : {
1501 0 : struct task_struct *task = NULL;
1502 0 : int task_refs = 0;
1503 :
1504 : do {
1505 0 : struct io_kiocb *req = container_of(node, struct io_kiocb,
1506 : comp_list);
1507 :
1508 0 : if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
1509 0 : if (req->flags & REQ_F_REFCOUNT) {
1510 0 : node = req->comp_list.next;
1511 0 : if (!req_ref_put_and_test(req))
1512 0 : continue;
1513 : }
1514 0 : if ((req->flags & REQ_F_POLLED) && req->apoll) {
1515 0 : struct async_poll *apoll = req->apoll;
1516 :
1517 0 : if (apoll->double_poll)
1518 0 : kfree(apoll->double_poll);
1519 0 : if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache))
1520 0 : kfree(apoll);
1521 0 : req->flags &= ~REQ_F_POLLED;
1522 : }
1523 0 : if (req->flags & IO_REQ_LINK_FLAGS)
1524 0 : io_queue_next(req);
1525 0 : if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
1526 0 : io_clean_op(req);
1527 : }
1528 0 : if (!(req->flags & REQ_F_FIXED_FILE))
1529 0 : io_put_file(req->file);
1530 :
1531 0 : io_req_put_rsrc_locked(req, ctx);
1532 :
1533 0 : if (req->task != task) {
1534 0 : if (task)
1535 0 : io_put_task(task, task_refs);
1536 0 : task = req->task;
1537 0 : task_refs = 0;
1538 : }
1539 0 : task_refs++;
1540 0 : node = req->comp_list.next;
1541 : io_req_add_to_cache(req, ctx);
1542 0 : } while (node);
1543 :
1544 0 : if (task)
1545 0 : io_put_task(task, task_refs);
1546 0 : }
1547 :
1548 0 : static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
1549 : __must_hold(&ctx->uring_lock)
1550 : {
1551 0 : struct io_submit_state *state = &ctx->submit_state;
1552 : struct io_wq_work_node *node;
1553 :
1554 0 : __io_cq_lock(ctx);
1555 : /* must come first to preserve CQE ordering in failure cases */
1556 0 : if (state->cqes_count)
1557 0 : __io_flush_post_cqes(ctx);
1558 0 : __wq_list_for_each(node, &state->compl_reqs) {
1559 0 : struct io_kiocb *req = container_of(node, struct io_kiocb,
1560 : comp_list);
1561 :
1562 0 : if (!(req->flags & REQ_F_CQE_SKIP) &&
1563 0 : unlikely(!__io_fill_cqe_req(ctx, req))) {
1564 0 : if (ctx->task_complete) {
1565 0 : spin_lock(&ctx->completion_lock);
1566 0 : io_req_cqe_overflow(req);
1567 0 : spin_unlock(&ctx->completion_lock);
1568 : } else {
1569 0 : io_req_cqe_overflow(req);
1570 : }
1571 : }
1572 : }
1573 0 : __io_cq_unlock_post_flush(ctx);
1574 :
1575 0 : if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
1576 0 : io_free_batch_list(ctx, state->compl_reqs.first);
1577 0 : INIT_WQ_LIST(&state->compl_reqs);
1578 : }
1579 0 : }
1580 :
1581 : /*
1582 : * Drop reference to request, return next in chain (if there is one) if this
1583 : * was the last reference to this request.
1584 : */
1585 0 : static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
1586 : {
1587 0 : struct io_kiocb *nxt = NULL;
1588 :
1589 0 : if (req_ref_put_and_test(req)) {
1590 0 : if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
1591 0 : nxt = io_req_find_next(req);
1592 0 : io_free_req(req);
1593 : }
1594 0 : return nxt;
1595 : }
1596 :
1597 : static unsigned io_cqring_events(struct io_ring_ctx *ctx)
1598 : {
1599 : /* See comment at the top of this file */
1600 0 : smp_rmb();
1601 0 : return __io_cqring_events(ctx);
1602 : }
1603 :
1604 : /*
1605 : * We can't just wait for polled events to come to us, we have to actively
1606 : * find and complete them.
1607 : */
1608 0 : static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
1609 : {
1610 0 : if (!(ctx->flags & IORING_SETUP_IOPOLL))
1611 : return;
1612 :
1613 0 : mutex_lock(&ctx->uring_lock);
1614 0 : while (!wq_list_empty(&ctx->iopoll_list)) {
1615 : /* let it sleep and repeat later if can't complete a request */
1616 0 : if (io_do_iopoll(ctx, true) == 0)
1617 : break;
1618 : /*
1619 : * Ensure we allow local-to-the-cpu processing to take place,
1620 : * in this case we need to ensure that we reap all events.
1621 : * Also let task_work, etc. progress by releasing the mutex
1622 : */
1623 0 : if (need_resched()) {
1624 0 : mutex_unlock(&ctx->uring_lock);
1625 0 : cond_resched();
1626 0 : mutex_lock(&ctx->uring_lock);
1627 : }
1628 : }
1629 0 : mutex_unlock(&ctx->uring_lock);
1630 : }
1631 :
1632 0 : static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
1633 : {
1634 0 : unsigned int nr_events = 0;
1635 0 : int ret = 0;
1636 : unsigned long check_cq;
1637 :
1638 0 : if (!io_allowed_run_tw(ctx))
1639 : return -EEXIST;
1640 :
1641 0 : check_cq = READ_ONCE(ctx->check_cq);
1642 0 : if (unlikely(check_cq)) {
1643 0 : if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
1644 0 : __io_cqring_overflow_flush(ctx);
1645 : /*
1646 : * Similarly do not spin if we have not informed the user of any
1647 : * dropped CQE.
1648 : */
1649 0 : if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
1650 : return -EBADR;
1651 : }
1652 : /*
1653 : * Don't enter poll loop if we already have events pending.
1654 : * If we do, we can potentially be spinning for commands that
1655 : * already triggered a CQE (eg in error).
1656 : */
1657 0 : if (io_cqring_events(ctx))
1658 : return 0;
1659 :
1660 : do {
1661 : /*
1662 : * If a submit got punted to a workqueue, we can have the
1663 : * application entering polling for a command before it gets
1664 : * issued. That app will hold the uring_lock for the duration
1665 : * of the poll right here, so we need to take a breather every
1666 : * now and then to ensure that the issue has a chance to add
1667 : * the poll to the issued list. Otherwise we can spin here
1668 : * forever, while the workqueue is stuck trying to acquire the
1669 : * very same mutex.
1670 : */
1671 0 : if (wq_list_empty(&ctx->iopoll_list) ||
1672 0 : io_task_work_pending(ctx)) {
1673 0 : u32 tail = ctx->cached_cq_tail;
1674 :
1675 0 : (void) io_run_local_work_locked(ctx);
1676 :
1677 0 : if (task_work_pending(current) ||
1678 0 : wq_list_empty(&ctx->iopoll_list)) {
1679 0 : mutex_unlock(&ctx->uring_lock);
1680 0 : io_run_task_work();
1681 0 : mutex_lock(&ctx->uring_lock);
1682 : }
1683 : /* some requests don't go through iopoll_list */
1684 0 : if (tail != ctx->cached_cq_tail ||
1685 0 : wq_list_empty(&ctx->iopoll_list))
1686 : break;
1687 : }
1688 0 : ret = io_do_iopoll(ctx, !min);
1689 0 : if (ret < 0)
1690 : break;
1691 0 : nr_events += ret;
1692 0 : ret = 0;
1693 0 : } while (nr_events < min && !need_resched());
1694 :
1695 : return ret;
1696 : }
1697 :
1698 0 : void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
1699 : {
1700 0 : if (ts->locked)
1701 : io_req_complete_defer(req);
1702 : else
1703 0 : io_req_complete_post(req, IO_URING_F_UNLOCKED);
1704 0 : }
1705 :
1706 : /*
1707 : * After the iocb has been issued, it's safe to be found on the poll list.
1708 : * Adding the kiocb to the list AFTER submission ensures that we don't
1709 : * find it from a io_do_iopoll() thread before the issuer is done
1710 : * accessing the kiocb cookie.
1711 : */
1712 0 : static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
1713 : {
1714 0 : struct io_ring_ctx *ctx = req->ctx;
1715 0 : const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
1716 :
1717 : /* workqueue context doesn't hold uring_lock, grab it now */
1718 0 : if (unlikely(needs_lock))
1719 0 : mutex_lock(&ctx->uring_lock);
1720 :
1721 : /*
1722 : * Track whether we have multiple files in our lists. This will impact
1723 : * how we do polling eventually, not spinning if we're on potentially
1724 : * different devices.
1725 : */
1726 0 : if (wq_list_empty(&ctx->iopoll_list)) {
1727 0 : ctx->poll_multi_queue = false;
1728 0 : } else if (!ctx->poll_multi_queue) {
1729 : struct io_kiocb *list_req;
1730 :
1731 0 : list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
1732 : comp_list);
1733 0 : if (list_req->file != req->file)
1734 0 : ctx->poll_multi_queue = true;
1735 : }
1736 :
1737 : /*
1738 : * For fast devices, IO may have already completed. If it has, add
1739 : * it to the front so we find it first.
1740 : */
1741 0 : if (READ_ONCE(req->iopoll_completed))
1742 0 : wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
1743 : else
1744 0 : wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
1745 :
1746 0 : if (unlikely(needs_lock)) {
1747 : /*
1748           : 		 * If IORING_SETUP_SQPOLL is enabled, SQEs are handled either in
1749           : 		 * the SQ thread task context or in an io worker task context. If
1750           : 		 * the current task context is the SQ thread, we don't need to
1751           : 		 * check whether we should wake up the SQ thread.
1752 : */
1753 0 : if ((ctx->flags & IORING_SETUP_SQPOLL) &&
1754 0 : wq_has_sleeper(&ctx->sq_data->wait))
1755 0 : wake_up(&ctx->sq_data->wait);
1756 :
1757 0 : mutex_unlock(&ctx->uring_lock);
1758 : }
1759 0 : }
1760 :
1761 : static bool io_bdev_nowait(struct block_device *bdev)
1762 : {
1763 0 : return !bdev || bdev_nowait(bdev);
1764 : }
1765 :
1766 : /*
1767 : * If we tracked the file through the SCM inflight mechanism, we could support
1768 : * any file. For now, just ensure that anything potentially problematic is done
1769 : * inline.
1770 : */
1771 0 : static bool __io_file_supports_nowait(struct file *file, umode_t mode)
1772 : {
1773 0 : if (S_ISBLK(mode)) {
1774 0 : if (IS_ENABLED(CONFIG_BLOCK) &&
1775 0 : io_bdev_nowait(I_BDEV(file->f_mapping->host)))
1776 : return true;
1777 0 : return false;
1778 : }
1779 0 : if (S_ISSOCK(mode))
1780 : return true;
1781 0 : if (S_ISREG(mode)) {
1782 0 : if (IS_ENABLED(CONFIG_BLOCK) &&
1783 0 : io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
1784 0 : !io_is_uring_fops(file))
1785 : return true;
1786 : return false;
1787 : }
1788 :
1789 : /* any ->read/write should understand O_NONBLOCK */
1790 0 : if (file->f_flags & O_NONBLOCK)
1791 : return true;
1792 0 : return file->f_mode & FMODE_NOWAIT;
1793 : }
1794 :
1795 : /*
1796 : * If we tracked the file through the SCM inflight mechanism, we could support
1797 : * any file. For now, just ensure that anything potentially problematic is done
1798 : * inline.
1799 : */
1800 0 : unsigned int io_file_get_flags(struct file *file)
1801 : {
1802 0 : umode_t mode = file_inode(file)->i_mode;
1803 0 : unsigned int res = 0;
1804 :
1805 0 : if (S_ISREG(mode))
1806 0 : res |= FFS_ISREG;
1807 0 : if (__io_file_supports_nowait(file, mode))
1808 0 : res |= FFS_NOWAIT;
1809 0 : return res;
1810 : }
1811 :
1812 0 : bool io_alloc_async_data(struct io_kiocb *req)
1813 : {
1814 0 : WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size);
1815 0 : req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL);
1816 0 : if (req->async_data) {
1817 0 : req->flags |= REQ_F_ASYNC_DATA;
1818 0 : return false;
1819 : }
1820 : return true;
1821 : }
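               :
               : /*
               :  * Note the inverted return convention of io_alloc_async_data():
               :  * false means async data was allocated and REQ_F_ASYNC_DATA set,
               :  * true means the allocation failed (callers such as
               :  * io_req_prep_async() below map that to -EAGAIN).
               :  */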
1822 :
1823 0 : int io_req_prep_async(struct io_kiocb *req)
1824 : {
1825 0 : const struct io_cold_def *cdef = &io_cold_defs[req->opcode];
1826 0 : const struct io_issue_def *def = &io_issue_defs[req->opcode];
1827 :
1828 : /* assign early for deferred execution for non-fixed file */
1829 0 : if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
1830 0 : req->file = io_file_get_normal(req, req->cqe.fd);
1831 0 : if (!cdef->prep_async)
1832 : return 0;
1833 0 : if (WARN_ON_ONCE(req_has_async_data(req)))
1834 : return -EFAULT;
1835 0 : if (!def->manual_alloc) {
1836 0 : if (io_alloc_async_data(req))
1837 : return -EAGAIN;
1838 : }
1839 0 : return cdef->prep_async(req);
1840 : }
1841 :
1842 : static u32 io_get_sequence(struct io_kiocb *req)
1843 : {
1844 0 : u32 seq = req->ctx->cached_sq_head;
1845 : struct io_kiocb *cur;
1846 :
1847 : /* need original cached_sq_head, but it was increased for each req */
1848 0 : io_for_each_link(cur, req)
1849 0 : seq--;
1850 : return seq;
1851 : }
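               :
               : /*
               :  * io_get_sequence() undoes the per-request increments of
               :  * ->cached_sq_head made while fetching this request and its linked
               :  * successors, yielding the request's submission sequence number.
               :  * io_drain_req() below stores it in the defer entry and
               :  * req_need_defer() compares against it.
               :  */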
1852 :
1853 0 : static __cold void io_drain_req(struct io_kiocb *req)
1854 : __must_hold(&ctx->uring_lock)
1855 : {
1856 0 : struct io_ring_ctx *ctx = req->ctx;
1857 : struct io_defer_entry *de;
1858 : int ret;
1859 0 : u32 seq = io_get_sequence(req);
1860 :
1861 : /* Still need defer if there is pending req in defer list. */
1862 0 : spin_lock(&ctx->completion_lock);
1863 0 : if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
1864 0 : spin_unlock(&ctx->completion_lock);
1865 : queue:
1866 0 : ctx->drain_active = false;
1867 : io_req_task_queue(req);
1868 : return;
1869 : }
1870 0 : spin_unlock(&ctx->completion_lock);
1871 :
1872 0 : io_prep_async_link(req);
1873 0 : de = kmalloc(sizeof(*de), GFP_KERNEL);
1874 0 : if (!de) {
1875 0 : ret = -ENOMEM;
1876 0 : io_req_defer_failed(req, ret);
1877 0 : return;
1878 : }
1879 :
1880 0 : spin_lock(&ctx->completion_lock);
1881 0 : if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
1882 0 : spin_unlock(&ctx->completion_lock);
1883 0 : kfree(de);
1884 0 : goto queue;
1885 : }
1886 :
1887 0 : trace_io_uring_defer(req);
1888 0 : de->req = req;
1889 0 : de->seq = seq;
1890 0 : list_add_tail(&de->list, &ctx->defer_list);
1891 0 : spin_unlock(&ctx->completion_lock);
1892 : }
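               :
               : /*
               :  * io_drain_req() uses a check / allocate / re-check pattern: the
               :  * defer condition is first tested under ->completion_lock, the
               :  * io_defer_entry is allocated with the lock dropped, and the
               :  * condition is re-tested before queueing so that a request which
               :  * no longer needs deferring is issued immediately instead.
               :  */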
1893 :
1894 0 : static void io_clean_op(struct io_kiocb *req)
1895 : {
1896 0 : if (req->flags & REQ_F_BUFFER_SELECTED) {
1897 0 : spin_lock(&req->ctx->completion_lock);
1898 0 : io_put_kbuf_comp(req);
1899 0 : spin_unlock(&req->ctx->completion_lock);
1900 : }
1901 :
1902 0 : if (req->flags & REQ_F_NEED_CLEANUP) {
1903 0 : const struct io_cold_def *def = &io_cold_defs[req->opcode];
1904 :
1905 0 : if (def->cleanup)
1906 0 : def->cleanup(req);
1907 : }
1908 0 : if ((req->flags & REQ_F_POLLED) && req->apoll) {
1909 0 : kfree(req->apoll->double_poll);
1910 0 : kfree(req->apoll);
1911 0 : req->apoll = NULL;
1912 : }
1913 0 : if (req->flags & REQ_F_INFLIGHT) {
1914 0 : struct io_uring_task *tctx = req->task->io_uring;
1915 :
1916 0 : atomic_dec(&tctx->inflight_tracked);
1917 : }
1918 0 : if (req->flags & REQ_F_CREDS)
1919 0 : put_cred(req->creds);
1920 0 : if (req->flags & REQ_F_ASYNC_DATA) {
1921 0 : kfree(req->async_data);
1922 0 : req->async_data = NULL;
1923 : }
1924 0 : req->flags &= ~IO_REQ_CLEAN_FLAGS;
1925 0 : }
1926 :
1927 0 : static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
1928 : unsigned int issue_flags)
1929 : {
1930 0 : if (req->file || !def->needs_file)
1931 : return true;
1932 :
1933 0 : if (req->flags & REQ_F_FIXED_FILE)
1934 0 : req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
1935 : else
1936 0 : req->file = io_file_get_normal(req, req->cqe.fd);
1937 :
1938 0 : return !!req->file;
1939 : }
1940 :
1941 0 : static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
1942 : {
1943 0 : const struct io_issue_def *def = &io_issue_defs[req->opcode];
1944 0 : const struct cred *creds = NULL;
1945 : int ret;
1946 :
1947 0 : if (unlikely(!io_assign_file(req, def, issue_flags)))
1948 : return -EBADF;
1949 :
1950 0 : if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
1951 0 : creds = override_creds(req->creds);
1952 :
1953 0 : if (!def->audit_skip)
1954 : audit_uring_entry(req->opcode);
1955 :
1956 0 : ret = def->issue(req, issue_flags);
1957 :
1958 : if (!def->audit_skip)
1959 : audit_uring_exit(!ret, ret);
1960 :
1961 0 : if (creds)
1962 0 : revert_creds(creds);
1963 :
1964 0 : if (ret == IOU_OK) {
1965 0 : if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1966 : io_req_complete_defer(req);
1967 : else
1968 0 : io_req_complete_post(req, issue_flags);
1969 0 : } else if (ret != IOU_ISSUE_SKIP_COMPLETE)
1970 : return ret;
1971 :
1972 : /* If the op doesn't have a file, we're not polling for it */
1973 0 : if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
1974 0 : io_iopoll_req_issued(req, issue_flags);
1975 :
1976 : return 0;
1977 : }
1978 :
1979 0 : int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
1980 : {
1981 0 : io_tw_lock(req->ctx, ts);
1982 0 : return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT|
1983 : IO_URING_F_COMPLETE_DEFER);
1984 : }
1985 :
1986 0 : struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
1987 : {
1988 0 : struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1989 :
1990 0 : req = io_put_req_find_next(req);
1991 0 : return req ? &req->work : NULL;
1992 : }
1993 :
1994 0 : void io_wq_submit_work(struct io_wq_work *work)
1995 : {
1996 0 : struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1997 0 : const struct io_issue_def *def = &io_issue_defs[req->opcode];
1998 0 : unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ;
1999 0 : bool needs_poll = false;
2000 0 : int ret = 0, err = -ECANCELED;
2001 :
2002 : /* one will be dropped by ->io_wq_free_work() after returning to io-wq */
2003 0 : if (!(req->flags & REQ_F_REFCOUNT))
2004 : __io_req_set_refcount(req, 2);
2005 : else
2006 0 : req_ref_get(req);
2007 :
2008 0 : io_arm_ltimeout(req);
2009 :
2010 : /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
2011 0 : if (work->flags & IO_WQ_WORK_CANCEL) {
2012 : fail:
2013 : io_req_task_queue_fail(req, err);
2014 : return;
2015 : }
2016 0 : if (!io_assign_file(req, def, issue_flags)) {
2017 0 : err = -EBADF;
2018 0 : work->flags |= IO_WQ_WORK_CANCEL;
2019 0 : goto fail;
2020 : }
2021 :
2022 0 : if (req->flags & REQ_F_FORCE_ASYNC) {
2023 0 : bool opcode_poll = def->pollin || def->pollout;
2024 :
2025 0 : if (opcode_poll && file_can_poll(req->file)) {
2026 0 : needs_poll = true;
2027 0 : issue_flags |= IO_URING_F_NONBLOCK;
2028 : }
2029 : }
2030 :
2031 : do {
2032 0 : ret = io_issue_sqe(req, issue_flags);
2033 0 : if (ret != -EAGAIN)
2034 : break;
2035 : /*
2036 : * We can get EAGAIN for iopolled IO even though we're
2037 : * forcing a sync submission from here, since we can't
2038 : * wait for request slots on the block side.
2039 : */
2040 0 : if (!needs_poll) {
2041 0 : if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
2042 : break;
2043 0 : cond_resched();
2044 0 : continue;
2045 : }
2046 :
2047 0 : if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
2048 : return;
2049 : /* aborted or ready, in either case retry blocking */
2050 : needs_poll = false;
2051 : issue_flags &= ~IO_URING_F_NONBLOCK;
2052 : } while (1);
2053 :
2054 : /* avoid locking problems by failing it from a clean context */
2055 0 : if (ret < 0)
2056 : io_req_task_queue_fail(req, ret);
2057 : }
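               :
               : /*
               :  * The retry loop in io_wq_submit_work() handles -EAGAIN from
               :  * io_issue_sqe() as follows: IOPOLL rings spin with cond_resched()
               :  * since request slots can't be waited on, pollable requests issued
               :  * with IO_URING_F_NONBLOCK are handed to io_arm_poll_handler(),
               :  * and if arming doesn't take the request over, the nonblock flag
               :  * is cleared and the issue is retried blocking.
               :  */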
2058 :
2059 0 : inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
2060 : unsigned int issue_flags)
2061 : {
2062 0 : struct io_ring_ctx *ctx = req->ctx;
2063 0 : struct file *file = NULL;
2064 : unsigned long file_ptr;
2065 :
2066 0 : io_ring_submit_lock(ctx, issue_flags);
2067 :
2068 0 : if (unlikely((unsigned int)fd >= ctx->nr_user_files))
2069 : goto out;
2070 0 : fd = array_index_nospec(fd, ctx->nr_user_files);
2071 0 : file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
2072 0 : file = (struct file *) (file_ptr & FFS_MASK);
2073 0 : file_ptr &= ~FFS_MASK;
2074 : /* mask in overlapping REQ_F and FFS bits */
2075 0 : req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
2076 0 : io_req_set_rsrc_node(req, ctx, 0);
2077 : out:
2078 0 : io_ring_submit_unlock(ctx, issue_flags);
2079 0 : return file;
2080 : }
2081 :
2082 0 : struct file *io_file_get_normal(struct io_kiocb *req, int fd)
2083 : {
2084 0 : struct file *file = fget(fd);
2085 :
2086 0 : trace_io_uring_file_get(req, fd);
2087 :
2088 : /* we don't allow fixed io_uring files */
2089           : 	/* io_uring files aren't allowed as fixed files, so track them as inflight here */
2090 0 : io_req_track_inflight(req);
2091 0 : return file;
2092 : }
2093 :
2094 0 : static void io_queue_async(struct io_kiocb *req, int ret)
2095 : __must_hold(&req->ctx->uring_lock)
2096 : {
2097 : struct io_kiocb *linked_timeout;
2098 :
2099 0 : if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
2100 0 : io_req_defer_failed(req, ret);
2101 0 : return;
2102 : }
2103 :
2104 0 : linked_timeout = io_prep_linked_timeout(req);
2105 :
2106 0 : switch (io_arm_poll_handler(req, 0)) {
2107 : case IO_APOLL_READY:
2108 0 : io_kbuf_recycle(req, 0);
2109 : io_req_task_queue(req);
2110 : break;
2111 : case IO_APOLL_ABORTED:
2112 0 : io_kbuf_recycle(req, 0);
2113 0 : io_queue_iowq(req, NULL);
2114 0 : break;
2115 : case IO_APOLL_OK:
2116 : break;
2117 : }
2118 :
2119 0 : if (linked_timeout)
2120 0 : io_queue_linked_timeout(linked_timeout);
2121 : }
2122 :
2123 0 : static inline void io_queue_sqe(struct io_kiocb *req)
2124 : __must_hold(&req->ctx->uring_lock)
2125 : {
2126 : int ret;
2127 :
2128 0 : ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
2129 :
2130 : /*
2131 : * We async punt it if the file wasn't marked NOWAIT, or if the file
2132 : * doesn't support non-blocking read/write attempts
2133 : */
2134 0 : if (likely(!ret))
2135 : io_arm_ltimeout(req);
2136 : else
2137 0 : io_queue_async(req, ret);
2138 0 : }
2139 :
2140 0 : static void io_queue_sqe_fallback(struct io_kiocb *req)
2141 : __must_hold(&req->ctx->uring_lock)
2142 : {
2143 0 : if (unlikely(req->flags & REQ_F_FAIL)) {
2144 : /*
2145           : 		 * We don't submit; fail them all. For that, replace hardlinks
2146           : 		 * with normal links. An extra REQ_F_LINK is tolerated.
2147 : */
2148 0 : req->flags &= ~REQ_F_HARDLINK;
2149 0 : req->flags |= REQ_F_LINK;
2150 0 : io_req_defer_failed(req, req->cqe.res);
2151 : } else {
2152 0 : int ret = io_req_prep_async(req);
2153 :
2154 0 : if (unlikely(ret)) {
2155 0 : io_req_defer_failed(req, ret);
2156 0 : return;
2157 : }
2158 :
2159 0 : if (unlikely(req->ctx->drain_active))
2160 0 : io_drain_req(req);
2161 : else
2162 0 : io_queue_iowq(req, NULL);
2163 : }
2164 : }
2165 :
2166 : /*
2167 : * Check SQE restrictions (opcode and flags).
2168 : *
2169 : * Returns 'true' if SQE is allowed, 'false' otherwise.
2170 : */
2171 0 : static inline bool io_check_restriction(struct io_ring_ctx *ctx,
2172 : struct io_kiocb *req,
2173 : unsigned int sqe_flags)
2174 : {
2175 0 : if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
2176 : return false;
2177 :
2178 0 : if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
2179 : ctx->restrictions.sqe_flags_required)
2180 : return false;
2181 :
2182 0 : if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
2183 : ctx->restrictions.sqe_flags_required))
2184 : return false;
2185 :
2186 : return true;
2187 : }
2188 :
2189 : static void io_init_req_drain(struct io_kiocb *req)
2190 : {
2191 0 : struct io_ring_ctx *ctx = req->ctx;
2192 0 : struct io_kiocb *head = ctx->submit_state.link.head;
2193 :
2194 0 : ctx->drain_active = true;
2195 0 : if (head) {
2196 : /*
2197 : * If we need to drain a request in the middle of a link, drain
2198 : * the head request and the next request/link after the current
2199 : * link. Considering sequential execution of links,
2200 : * REQ_F_IO_DRAIN will be maintained for every request of our
2201 : * link.
2202 : */
2203 0 : head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
2204 0 : ctx->drain_next = true;
2205 : }
2206 : }
2207 :
2208 0 : static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
2209 : const struct io_uring_sqe *sqe)
2210 : __must_hold(&ctx->uring_lock)
2211 : {
2212 : const struct io_issue_def *def;
2213 : unsigned int sqe_flags;
2214 : int personality;
2215 : u8 opcode;
2216 :
2217 : /* req is partially pre-initialised, see io_preinit_req() */
2218 0 : req->opcode = opcode = READ_ONCE(sqe->opcode);
2219 : /* same numerical values with corresponding REQ_F_*, safe to copy */
2220 0 : req->flags = sqe_flags = READ_ONCE(sqe->flags);
2221 0 : req->cqe.user_data = READ_ONCE(sqe->user_data);
2222 0 : req->file = NULL;
2223 0 : req->rsrc_node = NULL;
2224 0 : req->task = current;
2225 :
2226 0 : if (unlikely(opcode >= IORING_OP_LAST)) {
2227 0 : req->opcode = 0;
2228 0 : return -EINVAL;
2229 : }
2230 0 : def = &io_issue_defs[opcode];
2231 0 : if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
2232 : /* enforce forwards compatibility on users */
2233 0 : if (sqe_flags & ~SQE_VALID_FLAGS)
2234 : return -EINVAL;
2235 0 : if (sqe_flags & IOSQE_BUFFER_SELECT) {
2236 0 : if (!def->buffer_select)
2237 : return -EOPNOTSUPP;
2238 0 : req->buf_index = READ_ONCE(sqe->buf_group);
2239 : }
2240 0 : if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
2241 0 : ctx->drain_disabled = true;
2242 0 : if (sqe_flags & IOSQE_IO_DRAIN) {
2243 0 : if (ctx->drain_disabled)
2244 : return -EOPNOTSUPP;
2245 0 : io_init_req_drain(req);
2246 : }
2247 : }
2248 0 : if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
2249 0 : if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
2250 : return -EACCES;
2251 : /* knock it to the slow queue path, will be drained there */
2252 0 : if (ctx->drain_active)
2253 0 : req->flags |= REQ_F_FORCE_ASYNC;
2254 : /* if there is no link, we're at "next" request and need to drain */
2255 0 : if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
2256 0 : ctx->drain_next = false;
2257 0 : ctx->drain_active = true;
2258 0 : req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
2259 : }
2260 : }
2261 :
2262 0 : if (!def->ioprio && sqe->ioprio)
2263 : return -EINVAL;
2264 0 : if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
2265 : return -EINVAL;
2266 :
2267 0 : if (def->needs_file) {
2268 0 : struct io_submit_state *state = &ctx->submit_state;
2269 :
2270 0 : req->cqe.fd = READ_ONCE(sqe->fd);
2271 :
2272 : /*
2273 : * Plug now if we have more than 2 IO left after this, and the
2274 : * target is potentially a read/write to block based storage.
2275 : */
2276 0 : if (state->need_plug && def->plug) {
2277 0 : state->plug_started = true;
2278 0 : state->need_plug = false;
2279 0 : blk_start_plug_nr_ios(&state->plug, state->submit_nr);
2280 : }
2281 : }
2282 :
2283 0 : personality = READ_ONCE(sqe->personality);
2284 0 : if (personality) {
2285 : int ret;
2286 :
2287 0 : req->creds = xa_load(&ctx->personalities, personality);
2288 0 : if (!req->creds)
2289 : return -EINVAL;
2290 0 : get_cred(req->creds);
2291 0 : ret = security_uring_override_creds(req->creds);
2292 : if (ret) {
2293 : put_cred(req->creds);
2294 : return ret;
2295 : }
2296 0 : req->flags |= REQ_F_CREDS;
2297 : }
2298 :
2299 0 : return def->prep(req, sqe);
2300 : }
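               :
               : /*
               :  * Order of checks in io_init_req() above: opcode range, SQE flag
               :  * validity and registered restrictions, drain bookkeeping, fd
               :  * capture plus block plugging for file-backed opcodes, personality
               :  * credentials, and finally the opcode's ->prep() on the raw SQE.
               :  */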
2301 :
2302 0 : static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
2303 : struct io_kiocb *req, int ret)
2304 : {
2305 0 : struct io_ring_ctx *ctx = req->ctx;
2306 0 : struct io_submit_link *link = &ctx->submit_state.link;
2307 0 : struct io_kiocb *head = link->head;
2308 :
2309 0 : trace_io_uring_req_failed(sqe, req, ret);
2310 :
2311 : /*
2312 : * Avoid breaking links in the middle as it renders links with SQPOLL
2313 : * unusable. Instead of failing eagerly, continue assembling the link if
2314 : * applicable and mark the head with REQ_F_FAIL. The link flushing code
2315 : * should find the flag and handle the rest.
2316 : */
2317 0 : req_fail_link_node(req, ret);
2318 0 : if (head && !(head->flags & REQ_F_FAIL))
2319 : req_fail_link_node(head, -ECANCELED);
2320 :
2321 0 : if (!(req->flags & IO_REQ_LINK_FLAGS)) {
2322 0 : if (head) {
2323 0 : link->last->link = req;
2324 0 : link->head = NULL;
2325 0 : req = head;
2326 : }
2327 0 : io_queue_sqe_fallback(req);
2328 : return ret;
2329 : }
2330 :
2331 0 : if (head)
2332 0 : link->last->link = req;
2333 : else
2334 0 : link->head = req;
2335 0 : link->last = req;
2336 : return 0;
2337 : }
2338 :
2339 0 : static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
2340 : const struct io_uring_sqe *sqe)
2341 : __must_hold(&ctx->uring_lock)
2342 : {
2343 0 : struct io_submit_link *link = &ctx->submit_state.link;
2344 : int ret;
2345 :
2346 0 : ret = io_init_req(ctx, req, sqe);
2347 0 : if (unlikely(ret))
2348 0 : return io_submit_fail_init(sqe, req, ret);
2349 :
2350 0 : trace_io_uring_submit_req(req);
2351 :
2352 : /*
2353 : * If we already have a head request, queue this one for async
2354 : * submittal once the head completes. If we don't have a head but
2355 : * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
2356 : * submitted sync once the chain is complete. If none of those
2357 : * conditions are true (normal request), then just queue it.
2358 : */
2359 0 : if (unlikely(link->head)) {
2360 0 : ret = io_req_prep_async(req);
2361 0 : if (unlikely(ret))
2362 0 : return io_submit_fail_init(sqe, req, ret);
2363 :
2364 0 : trace_io_uring_link(req, link->head);
2365 0 : link->last->link = req;
2366 0 : link->last = req;
2367 :
2368 0 : if (req->flags & IO_REQ_LINK_FLAGS)
2369 : return 0;
2370 : /* last request of the link, flush it */
2371 0 : req = link->head;
2372 0 : link->head = NULL;
2373 0 : if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
2374 : goto fallback;
2375 :
2376 0 : } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
2377 : REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
2378 0 : if (req->flags & IO_REQ_LINK_FLAGS) {
2379 0 : link->head = req;
2380 0 : link->last = req;
2381 : } else {
2382 : fallback:
2383 0 : io_queue_sqe_fallback(req);
2384 : }
2385 : return 0;
2386 : }
2387 :
2388 0 : io_queue_sqe(req);
2389 0 : return 0;
2390 : }
2391 :
2392 : /*
2393 : * Batched submission is done, ensure local IO is flushed out.
2394 : */
2395 0 : static void io_submit_state_end(struct io_ring_ctx *ctx)
2396 : {
2397 0 : struct io_submit_state *state = &ctx->submit_state;
2398 :
2399 0 : if (unlikely(state->link.head))
2400 0 : io_queue_sqe_fallback(state->link.head);
2401 : /* flush only after queuing links as they can generate completions */
2402 0 : io_submit_flush_completions(ctx);
2403 0 : if (state->plug_started)
2404 0 : blk_finish_plug(&state->plug);
2405 0 : }
2406 :
2407 : /*
2408 : * Start submission side cache.
2409 : */
2410 : static void io_submit_state_start(struct io_submit_state *state,
2411 : unsigned int max_ios)
2412 : {
2413 0 : state->plug_started = false;
2414 0 : state->need_plug = max_ios > 2;
2415 0 : state->submit_nr = max_ios;
2416 : /* set only head, no need to init link_last in advance */
2417 0 : state->link.head = NULL;
2418 : }
2419 :
2420 : static void io_commit_sqring(struct io_ring_ctx *ctx)
2421 : {
2422 0 : struct io_rings *rings = ctx->rings;
2423 :
2424 : /*
2425 : * Ensure any loads from the SQEs are done at this point,
2426 : * since once we write the new head, the application could
2427 : * write new data to them.
2428 : */
2429 0 : smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2430 : }
2431 :
2432 : /*
2433 : * Fetch an sqe, if one is available. Note this returns a pointer to memory
2434 : * that is mapped by userspace. This means that care needs to be taken to
2435 : * ensure that reads are stable, as we cannot rely on userspace always
2436 : * being a good citizen. If members of the sqe are validated and then later
2437 : * used, it's important that those reads are done through READ_ONCE() to
2438 : * prevent a re-load down the line.
2439 : */
2440 0 : static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
2441 : {
2442 0 : unsigned head, mask = ctx->sq_entries - 1;
2443 0 : unsigned sq_idx = ctx->cached_sq_head++ & mask;
2444 :
2445 : /*
2446 : * The cached sq head (or cq tail) serves two purposes:
2447 : *
2448 : * 1) allows us to batch the cost of updating the user visible
2449 : * head updates.
2450 : * 2) allows the kernel side to track the head on its own, even
2451 : * though the application is the one updating it.
2452 : */
2453 0 : head = READ_ONCE(ctx->sq_array[sq_idx]);
2454 0 : if (likely(head < ctx->sq_entries)) {
2455 : /* double index for 128-byte SQEs, twice as long */
2456 0 : if (ctx->flags & IORING_SETUP_SQE128)
2457 0 : head <<= 1;
2458 0 : *sqe = &ctx->sq_sqes[head];
2459 0 : return true;
2460 : }
2461 :
2462 : /* drop invalid entries */
2463 0 : ctx->cq_extra--;
2464 0 : WRITE_ONCE(ctx->rings->sq_dropped,
2465 : READ_ONCE(ctx->rings->sq_dropped) + 1);
2466 0 : return false;
2467 : }
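               :
               : /*
               :  * The index read from the user-shared sq_array above is bounds
               :  * checked against sq_entries before use; an out-of-range value is
               :  * dropped, accounted in ->sq_dropped and compensated via ->cq_extra.
               :  * With IORING_SETUP_SQE128 each SQE occupies two slots, hence the
               :  * doubling of the index.
               :  */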
2468 :
2469 0 : int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
2470 : __must_hold(&ctx->uring_lock)
2471 : {
2472 0 : unsigned int entries = io_sqring_entries(ctx);
2473 : unsigned int left;
2474 : int ret;
2475 :
2476 0 : if (unlikely(!entries))
2477 : return 0;
2478 : /* make sure SQ entry isn't read before tail */
2479 0 : ret = left = min(nr, entries);
2480 0 : io_get_task_refs(left);
2481 0 : io_submit_state_start(&ctx->submit_state, left);
2482 :
2483 : do {
2484 : const struct io_uring_sqe *sqe;
2485 : struct io_kiocb *req;
2486 :
2487 0 : if (unlikely(!io_alloc_req(ctx, &req)))
2488 : break;
2489 0 : if (unlikely(!io_get_sqe(ctx, &sqe))) {
2490 0 : io_req_add_to_cache(req, ctx);
2491 : break;
2492 : }
2493 :
2494 : /*
2495 : * Continue submitting even for sqe failure if the
2496 : * ring was setup with IORING_SETUP_SUBMIT_ALL
2497 : */
2498 0 : if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
2499 0 : !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
2500 0 : left--;
2501 0 : break;
2502 : }
2503 0 : } while (--left);
2504 :
2505 0 : if (unlikely(left)) {
2506 0 : ret -= left;
2507 : /* try again if it submitted nothing and can't allocate a req */
2508 0 : if (!ret && io_req_cache_empty(ctx))
2509 0 : ret = -EAGAIN;
2510 0 : current->io_uring->cached_refs += left;
2511 : }
2512 :
2513 0 : io_submit_state_end(ctx);
2514 : /* Commit SQ ring head once we've consumed and submitted all SQEs */
2515 0 : io_commit_sqring(ctx);
2516 0 : return ret;
2517 : }
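               :
               : /*
               :  * io_submit_sqes() returns how many SQEs were consumed, or -EAGAIN
               :  * when nothing was submitted because no request could be allocated.
               :  * Task refs taken up front for the whole batch are handed back via
               :  * ->cached_refs for the part that wasn't submitted.
               :  */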
2518 :
2519 : struct io_wait_queue {
2520 : struct wait_queue_entry wq;
2521 : struct io_ring_ctx *ctx;
2522 : unsigned cq_tail;
2523 : unsigned nr_timeouts;
2524 : ktime_t timeout;
2525 : };
2526 :
2527 : static inline bool io_has_work(struct io_ring_ctx *ctx)
2528 : {
2529 0 : return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
2530 0 : !llist_empty(&ctx->work_llist);
2531 : }
2532 :
2533 : static inline bool io_should_wake(struct io_wait_queue *iowq)
2534 : {
2535 0 : struct io_ring_ctx *ctx = iowq->ctx;
2536 0 : int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
2537 :
2538 : /*
2539 : * Wake up if we have enough events, or if a timeout occurred since we
2540 : * started waiting. For timeouts, we always want to return to userspace,
2541 : * regardless of event count.
2542 : */
2543 0 : return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
2544 : }
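               :
               : /*
               :  * The comparison above is done as a signed difference so that u32
               :  * ring position wrap-around is handled: dist >= 0 once the CQ tail
               :  * has reached iowq->cq_tail, the target set up by io_cqring_wait()
               :  * (CQ head at wait start plus the requested event count).
               :  */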
2545 :
2546 0 : static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
2547 : int wake_flags, void *key)
2548 : {
2549 0 : struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);
2550 :
2551 : /*
2552 : * Cannot safely flush overflowed CQEs from here, ensure we wake up
2553 : * the task, and the next invocation will do it.
2554 : */
2555 0 : if (io_should_wake(iowq) || io_has_work(iowq->ctx))
2556 0 : return autoremove_wake_function(curr, mode, wake_flags, key);
2557 : return -1;
2558 : }
2559 :
2560 0 : int io_run_task_work_sig(struct io_ring_ctx *ctx)
2561 : {
2562 0 : if (!llist_empty(&ctx->work_llist)) {
2563 0 : __set_current_state(TASK_RUNNING);
2564 0 : if (io_run_local_work(ctx) > 0)
2565 : return 1;
2566 : }
2567 0 : if (io_run_task_work() > 0)
2568 : return 1;
2569 0 : if (task_sigpending(current))
2570 : return -EINTR;
2571 0 : return 0;
2572 : }
2573 :
2574 : /* when returns >0, the caller should retry */
2575 0 : static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
2576 : struct io_wait_queue *iowq)
2577 : {
2578 0 : if (unlikely(READ_ONCE(ctx->check_cq)))
2579 : return 1;
2580 0 : if (unlikely(!llist_empty(&ctx->work_llist)))
2581 : return 1;
2582 0 : if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
2583 : return 1;
2584 0 : if (unlikely(task_sigpending(current)))
2585 : return -EINTR;
2586 0 : if (unlikely(io_should_wake(iowq)))
2587 : return 0;
2588 0 : if (iowq->timeout == KTIME_MAX)
2589 0 : schedule();
2590 0 : else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
2591 : return -ETIME;
2592 : return 0;
2593 : }
2594 :
2595 : /*
2596 : * Wait until events become available, if we don't already have some. The
2597 : * application must reap them itself, as they reside on the shared cq ring.
2598 : */
2599 0 : static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
2600 : const sigset_t __user *sig, size_t sigsz,
2601 : struct __kernel_timespec __user *uts)
2602 : {
2603 : struct io_wait_queue iowq;
2604 0 : struct io_rings *rings = ctx->rings;
2605 : int ret;
2606 :
2607 0 : if (!io_allowed_run_tw(ctx))
2608 : return -EEXIST;
2609 0 : if (!llist_empty(&ctx->work_llist))
2610 0 : io_run_local_work(ctx);
2611 0 : io_run_task_work();
2612 0 : io_cqring_overflow_flush(ctx);
2613 : /* if user messes with these they will just get an early return */
2614 0 : if (__io_cqring_events_user(ctx) >= min_events)
2615 : return 0;
2616 :
2617 0 : if (sig) {
2618 : #ifdef CONFIG_COMPAT
2619 : if (in_compat_syscall())
2620 : ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
2621 : sigsz);
2622 : else
2623 : #endif
2624 0 : ret = set_user_sigmask(sig, sigsz);
2625 :
2626 0 : if (ret)
2627 : return ret;
2628 : }
2629 :
2630 0 : init_waitqueue_func_entry(&iowq.wq, io_wake_function);
2631 0 : iowq.wq.private = current;
2632 0 : INIT_LIST_HEAD(&iowq.wq.entry);
2633 0 : iowq.ctx = ctx;
2634 0 : iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
2635 0 : iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
2636 0 : iowq.timeout = KTIME_MAX;
2637 :
2638 0 : if (uts) {
2639 : struct timespec64 ts;
2640 :
2641 0 : if (get_timespec64(&ts, uts))
2642 0 : return -EFAULT;
2643 0 : iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
2644 : }
2645 :
2646 : trace_io_uring_cqring_wait(ctx, min_events);
2647 0 : do {
2648 : unsigned long check_cq;
2649 :
2650 0 : if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
2651 0 : int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
2652 :
2653 0 : atomic_set(&ctx->cq_wait_nr, nr_wait);
2654 0 : set_current_state(TASK_INTERRUPTIBLE);
2655 : } else {
2656 0 : prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
2657 : TASK_INTERRUPTIBLE);
2658 : }
2659 :
2660 0 : ret = io_cqring_wait_schedule(ctx, &iowq);
2661 0 : __set_current_state(TASK_RUNNING);
2662 0 : atomic_set(&ctx->cq_wait_nr, 0);
2663 :
2664 0 : if (ret < 0)
2665 : break;
2666 : /*
2667 : * Run task_work after scheduling and before io_should_wake().
2668 : * If we got woken because of task_work being processed, run it
2669 : * now rather than let the caller do another wait loop.
2670 : */
2671 0 : io_run_task_work();
2672 0 : if (!llist_empty(&ctx->work_llist))
2673 0 : io_run_local_work(ctx);
2674 :
2675 0 : check_cq = READ_ONCE(ctx->check_cq);
2676 0 : if (unlikely(check_cq)) {
2677 : /* let the caller flush overflows, retry */
2678 0 : if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
2679 0 : io_cqring_do_overflow_flush(ctx);
2680 0 : if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
2681 : ret = -EBADR;
2682 : break;
2683 : }
2684 : }
2685 :
2686 0 : if (io_should_wake(&iowq)) {
2687 : ret = 0;
2688 : break;
2689 : }
2690 0 : cond_resched();
2691 : } while (1);
2692 :
2693 0 : if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
2694 0 : finish_wait(&ctx->cq_wait, &iowq.wq);
2695 0 : restore_saved_sigmask_unless(ret == -EINTR);
2696 :
2697 0 : return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2698 : }
2699 :
2700 0 : static void io_mem_free(void *ptr)
2701 : {
2702 : struct page *page;
2703 :
2704 0 : if (!ptr)
2705 : return;
2706 :
2707 0 : page = virt_to_head_page(ptr);
2708 0 : if (put_page_testzero(page))
2709 0 : free_compound_page(page);
2710 : }
2711 :
2712 0 : static void *io_mem_alloc(size_t size)
2713 : {
2714 0 : gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
2715 :
2716 0 : return (void *) __get_free_pages(gfp, get_order(size));
2717 : }
2718 :
2719 0 : static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
2720 : unsigned int cq_entries, size_t *sq_offset)
2721 : {
2722 : struct io_rings *rings;
2723 : size_t off, sq_array_size;
2724 :
2725 0 : off = struct_size(rings, cqes, cq_entries);
2726 0 : if (off == SIZE_MAX)
2727 : return SIZE_MAX;
2728 0 : if (ctx->flags & IORING_SETUP_CQE32) {
2729 0 : if (check_shl_overflow(off, 1, &off))
2730 : return SIZE_MAX;
2731 : }
2732 :
2733 : #ifdef CONFIG_SMP
2734 : off = ALIGN(off, SMP_CACHE_BYTES);
2735 : if (off == 0)
2736 : return SIZE_MAX;
2737 : #endif
2738 :
2739 0 : if (sq_offset)
2740 0 : *sq_offset = off;
2741 :
2742 0 : sq_array_size = array_size(sizeof(u32), sq_entries);
2743 0 : if (sq_array_size == SIZE_MAX)
2744 : return SIZE_MAX;
2745 :
2746 0 : if (check_add_overflow(off, sq_array_size, &off))
2747 : return SIZE_MAX;
2748 :
2749 0 : return off;
2750 : }
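               :
               : /*
               :  * Ring allocation layout computed above: the io_rings structure
               :  * with cq_entries CQEs (doubled in size for IORING_SETUP_CQE32)
               :  * comes first, aligned to SMP_CACHE_BYTES on SMP, followed by the
               :  * array of sq_entries u32 SQ indices whose start is reported in
               :  * *sq_offset.  Every step is overflow-checked and the whole thing
               :  * collapses to SIZE_MAX on overflow.
               :  */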
2751 :
2752 0 : static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
2753 : unsigned int eventfd_async)
2754 : {
2755 : struct io_ev_fd *ev_fd;
2756 0 : __s32 __user *fds = arg;
2757 : int fd;
2758 :
2759 0 : ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
2760 : lockdep_is_held(&ctx->uring_lock));
2761 0 : if (ev_fd)
2762 : return -EBUSY;
2763 :
2764 0 : if (copy_from_user(&fd, fds, sizeof(*fds)))
2765 : return -EFAULT;
2766 :
2767 0 : ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
2768 0 : if (!ev_fd)
2769 : return -ENOMEM;
2770 :
2771 0 : ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
2772 0 : if (IS_ERR(ev_fd->cq_ev_fd)) {
2773 0 : int ret = PTR_ERR(ev_fd->cq_ev_fd);
2774 0 : kfree(ev_fd);
2775 0 : return ret;
2776 : }
2777 :
2778 0 : spin_lock(&ctx->completion_lock);
2779 0 : ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
2780 0 : spin_unlock(&ctx->completion_lock);
2781 :
2782 0 : ev_fd->eventfd_async = eventfd_async;
2783 0 : ctx->has_evfd = true;
2784 0 : rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
2785 0 : atomic_set(&ev_fd->refs, 1);
2786 0 : atomic_set(&ev_fd->ops, 0);
2787 0 : return 0;
2788 : }
2789 :
2790 0 : static int io_eventfd_unregister(struct io_ring_ctx *ctx)
2791 : {
2792 : struct io_ev_fd *ev_fd;
2793 :
2794 0 : ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
2795 : lockdep_is_held(&ctx->uring_lock));
2796 0 : if (ev_fd) {
2797 0 : ctx->has_evfd = false;
2798 0 : rcu_assign_pointer(ctx->io_ev_fd, NULL);
2799 0 : if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
2800 0 : call_rcu(&ev_fd->rcu, io_eventfd_ops);
2801 : return 0;
2802 : }
2803 :
2804 : return -ENXIO;
2805 : }
2806 :
2807 0 : static void io_req_caches_free(struct io_ring_ctx *ctx)
2808 : {
2809 : struct io_kiocb *req;
2810 0 : int nr = 0;
2811 :
2812 0 : mutex_lock(&ctx->uring_lock);
2813 0 : io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
2814 :
2815 0 : while (!io_req_cache_empty(ctx)) {
2816 0 : req = io_extract_req(ctx);
2817 0 : kmem_cache_free(req_cachep, req);
2818 0 : nr++;
2819 : }
2820 0 : if (nr)
2821 0 : percpu_ref_put_many(&ctx->refs, nr);
2822 0 : mutex_unlock(&ctx->uring_lock);
2823 0 : }
2824 :
2825 0 : static void io_rsrc_node_cache_free(struct io_cache_entry *entry)
2826 : {
2827 0 : kfree(container_of(entry, struct io_rsrc_node, cache));
2828 0 : }
2829 :
2830 0 : static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
2831 : {
2832 0 : io_sq_thread_finish(ctx);
2833 : /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
2834 0 : if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
2835 : return;
2836 :
2837 0 : mutex_lock(&ctx->uring_lock);
2838 0 : if (ctx->buf_data)
2839 0 : __io_sqe_buffers_unregister(ctx);
2840 0 : if (ctx->file_data)
2841 0 : __io_sqe_files_unregister(ctx);
2842 0 : io_cqring_overflow_kill(ctx);
2843 0 : io_eventfd_unregister(ctx);
2844 0 : io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
2845 0 : io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
2846 0 : io_destroy_buffers(ctx);
2847 0 : mutex_unlock(&ctx->uring_lock);
2848 0 : if (ctx->sq_creds)
2849 0 : put_cred(ctx->sq_creds);
2850 0 : if (ctx->submitter_task)
2851 0 : put_task_struct(ctx->submitter_task);
2852 :
2853 : /* there are no registered resources left, nobody uses it */
2854 0 : if (ctx->rsrc_node)
2855 0 : io_rsrc_node_destroy(ctx, ctx->rsrc_node);
2856 :
2857 0 : WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
2858 :
2859 : #if defined(CONFIG_UNIX)
2860 : if (ctx->ring_sock) {
2861 : ctx->ring_sock->file = NULL; /* so that iput() is called */
2862 : sock_release(ctx->ring_sock);
2863 : }
2864 : #endif
2865 0 : WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
2866 :
2867 0 : io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
2868 0 : if (ctx->mm_account) {
2869 0 : mmdrop(ctx->mm_account);
2870 0 : ctx->mm_account = NULL;
2871 : }
2872 0 : io_mem_free(ctx->rings);
2873 0 : io_mem_free(ctx->sq_sqes);
2874 :
2875 0 : percpu_ref_exit(&ctx->refs);
2876 0 : free_uid(ctx->user);
2877 0 : io_req_caches_free(ctx);
2878 0 : if (ctx->hash_map)
2879 0 : io_wq_put_hash(ctx->hash_map);
2880 0 : kfree(ctx->cancel_table.hbs);
2881 0 : kfree(ctx->cancel_table_locked.hbs);
2882 0 : kfree(ctx->dummy_ubuf);
2883 0 : kfree(ctx->io_bl);
2884 0 : xa_destroy(&ctx->io_bl_xa);
2885 0 : kfree(ctx);
2886 : }
2887 :
2888 0 : static __cold void io_activate_pollwq_cb(struct callback_head *cb)
2889 : {
2890 0 : struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
2891 : poll_wq_task_work);
2892 :
2893 0 : mutex_lock(&ctx->uring_lock);
2894 0 : ctx->poll_activated = true;
2895 0 : mutex_unlock(&ctx->uring_lock);
2896 :
2897 : /*
2898 : * Wake ups for some events between start of polling and activation
2899 : * might've been lost due to loose synchronisation.
2900 : */
2901 0 : wake_up_all(&ctx->poll_wq);
2902 0 : percpu_ref_put(&ctx->refs);
2903 0 : }
2904 :
2905 0 : static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
2906 : {
2907 0 : spin_lock(&ctx->completion_lock);
2908 : /* already activated or in progress */
2909 0 : if (ctx->poll_activated || ctx->poll_wq_task_work.func)
2910 : goto out;
2911 0 : if (WARN_ON_ONCE(!ctx->task_complete))
2912 : goto out;
2913 0 : if (!ctx->submitter_task)
2914 : goto out;
2915 : /*
2916 : * with ->submitter_task only the submitter task completes requests, we
2917 : * only need to sync with it, which is done by injecting a tw
2918 : */
2919 0 : init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
2920 0 : percpu_ref_get(&ctx->refs);
2921 0 : if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
2922 0 : percpu_ref_put(&ctx->refs);
2923 : out:
2924 0 : spin_unlock(&ctx->completion_lock);
2925 0 : }
2926 :
2927 0 : static __poll_t io_uring_poll(struct file *file, poll_table *wait)
2928 : {
2929 0 : struct io_ring_ctx *ctx = file->private_data;
2930 0 : __poll_t mask = 0;
2931 :
2932 0 : if (unlikely(!ctx->poll_activated))
2933 0 : io_activate_pollwq(ctx);
2934 :
2935 0 : poll_wait(file, &ctx->poll_wq, wait);
2936 : /*
2937 : * synchronizes with barrier from wq_has_sleeper call in
2938 : * io_commit_cqring
2939 : */
2940 0 : smp_rmb();
2941 0 : if (!io_sqring_full(ctx))
2942 0 : mask |= EPOLLOUT | EPOLLWRNORM;
2943 :
2944 : /*
2945 : * Don't flush cqring overflow list here, just do a simple check.
2946           : 	 * Otherwise there could possibly be an ABBA deadlock:
2947 : * CPU0 CPU1
2948 : * ---- ----
2949 : * lock(&ctx->uring_lock);
2950 : * lock(&ep->mtx);
2951 : * lock(&ctx->uring_lock);
2952 : * lock(&ep->mtx);
2953 : *
2954           : 	 * Users may get EPOLLIN while seeing nothing in the cqring; this
2955           : 	 * pushes them to do the flush.
2956 : */
2957 :
2958 0 : if (__io_cqring_events_user(ctx) || io_has_work(ctx))
2959 0 : mask |= EPOLLIN | EPOLLRDNORM;
2960 :
2961 0 : return mask;
2962 : }
2963 :
2964 0 : static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
2965 : {
2966 : const struct cred *creds;
2967 :
2968 0 : creds = xa_erase(&ctx->personalities, id);
2969 0 : if (creds) {
2970 : put_cred(creds);
2971 : return 0;
2972 : }
2973 :
2974 : return -EINVAL;
2975 : }
2976 :
2977 : struct io_tctx_exit {
2978 : struct callback_head task_work;
2979 : struct completion completion;
2980 : struct io_ring_ctx *ctx;
2981 : };
2982 :
2983 0 : static __cold void io_tctx_exit_cb(struct callback_head *cb)
2984 : {
2985 0 : struct io_uring_task *tctx = current->io_uring;
2986 : struct io_tctx_exit *work;
2987 :
2988 0 : work = container_of(cb, struct io_tctx_exit, task_work);
2989 : /*
2990 : * When @in_cancel, we're in cancellation and it's racy to remove the
2991 : * node. It'll be removed by the end of cancellation, just ignore it.
2992 : * tctx can be NULL if the queueing of this task_work raced with
2993 : * work cancelation off the exec path.
2994 : */
2995 0 : if (tctx && !atomic_read(&tctx->in_cancel))
2996 0 : io_uring_del_tctx_node((unsigned long)work->ctx);
2997 0 : complete(&work->completion);
2998 0 : }
2999 :
3000 0 : static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
3001 : {
3002 0 : struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3003 :
3004 0 : return req->ctx == data;
3005 : }
3006 :
3007 0 : static __cold void io_ring_exit_work(struct work_struct *work)
3008 : {
3009 0 : struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
3010 0 : unsigned long timeout = jiffies + HZ * 60 * 5;
3011 0 : unsigned long interval = HZ / 20;
3012 : struct io_tctx_exit exit;
3013 : struct io_tctx_node *node;
3014 : int ret;
3015 :
3016 : /*
3017 : * If we're doing polled IO and end up having requests being
3018 : * submitted async (out-of-line), then completions can come in while
3019 : * we're waiting for refs to drop. We need to reap these manually,
3020 : * as nobody else will be looking for them.
3021 : */
3022 : do {
3023 0 : if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
3024 0 : mutex_lock(&ctx->uring_lock);
3025 0 : io_cqring_overflow_kill(ctx);
3026 0 : mutex_unlock(&ctx->uring_lock);
3027 : }
3028 :
3029 0 : if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
3030 0 : io_move_task_work_from_local(ctx);
3031 :
3032 0 : while (io_uring_try_cancel_requests(ctx, NULL, true))
3033 0 : cond_resched();
3034 :
3035 0 : if (ctx->sq_data) {
3036 0 : struct io_sq_data *sqd = ctx->sq_data;
3037 : struct task_struct *tsk;
3038 :
3039 0 : io_sq_thread_park(sqd);
3040 0 : tsk = sqd->thread;
3041 0 : if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
3042 0 : io_wq_cancel_cb(tsk->io_uring->io_wq,
3043 : io_cancel_ctx_cb, ctx, true);
3044 0 : io_sq_thread_unpark(sqd);
3045 : }
3046 :
3047 0 : io_req_caches_free(ctx);
3048 :
3049 0 : if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
3050 : /* there is little hope left, don't run it too often */
3051 0 : interval = HZ * 60;
3052 : }
3053 0 : } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
3054 :
3055 0 : init_completion(&exit.completion);
3056 0 : init_task_work(&exit.task_work, io_tctx_exit_cb);
3057 0 : exit.ctx = ctx;
3058 : /*
3059 : * Some may use context even when all refs and requests have been put,
3060 : * and they are free to do so while still holding uring_lock or
3061 : * completion_lock, see io_req_task_submit(). Apart from other work,
3062 : * this lock/unlock section also waits them to finish.
3063 : */
3064 0 : mutex_lock(&ctx->uring_lock);
3065 0 : while (!list_empty(&ctx->tctx_list)) {
3066 0 : WARN_ON_ONCE(time_after(jiffies, timeout));
3067 :
3068 0 : node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
3069 : ctx_node);
3070 : /* don't spin on a single task if cancellation failed */
3071 0 : list_rotate_left(&ctx->tctx_list);
3072 0 : ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
3073 0 : if (WARN_ON_ONCE(ret))
3074 0 : continue;
3075 :
3076 0 : mutex_unlock(&ctx->uring_lock);
3077 0 : wait_for_completion(&exit.completion);
3078 0 : mutex_lock(&ctx->uring_lock);
3079 : }
3080 0 : mutex_unlock(&ctx->uring_lock);
3081 0 : spin_lock(&ctx->completion_lock);
3082 0 : spin_unlock(&ctx->completion_lock);
3083 :
3084 : /* pairs with RCU read section in io_req_local_work_add() */
3085 0 : if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
3086 0 : synchronize_rcu();
3087 :
3088 0 : io_ring_ctx_free(ctx);
3089 0 : }
3090 :
3091 0 : static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
3092 : {
3093 : unsigned long index;
3094 : struct creds *creds;
3095 :
3096 0 : mutex_lock(&ctx->uring_lock);
3097 0 : percpu_ref_kill(&ctx->refs);
3098 0 : xa_for_each(&ctx->personalities, index, creds)
3099 0 : io_unregister_personality(ctx, index);
3100 0 : if (ctx->rings)
3101 0 : io_poll_remove_all(ctx, NULL, true);
3102 0 : mutex_unlock(&ctx->uring_lock);
3103 :
3104 : /*
3105 : * If we failed setting up the ctx, we might not have any rings
3106 : * and therefore did not submit any requests
3107 : */
3108 0 : if (ctx->rings)
3109 0 : io_kill_timeouts(ctx, NULL, true);
3110 :
3111 0 : INIT_WORK(&ctx->exit_work, io_ring_exit_work);
3112 : /*
3113 : * Use system_unbound_wq to avoid spawning tons of event kworkers
3114 : * if we're exiting a ton of rings at the same time. It just adds
3115 : * noise and overhead, there's no discernable change in runtime
3116 : * over using system_wq.
3117 : */
3118 0 : queue_work(system_unbound_wq, &ctx->exit_work);
3119 0 : }
3120 :
3121 0 : static int io_uring_release(struct inode *inode, struct file *file)
3122 : {
3123 0 : struct io_ring_ctx *ctx = file->private_data;
3124 :
3125 0 : file->private_data = NULL;
3126 0 : io_ring_ctx_wait_and_kill(ctx);
3127 0 : return 0;
3128 : }
3129 :
3130 : struct io_task_cancel {
3131 : struct task_struct *task;
3132 : bool all;
3133 : };
3134 :
3135 0 : static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
3136 : {
3137 0 : struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3138 0 : struct io_task_cancel *cancel = data;
3139 :
3140 0 : return io_match_task_safe(req, cancel->task, cancel->all);
3141 : }
3142 :
3143 0 : static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
3144 : struct task_struct *task,
3145 : bool cancel_all)
3146 : {
3147 : struct io_defer_entry *de;
3148 0 : LIST_HEAD(list);
3149 :
3150 0 : spin_lock(&ctx->completion_lock);
3151 0 : list_for_each_entry_reverse(de, &ctx->defer_list, list) {
3152 0 : if (io_match_task_safe(de->req, task, cancel_all)) {
3153 0 : list_cut_position(&list, &ctx->defer_list, &de->list);
3154 0 : break;
3155 : }
3156 : }
3157 0 : spin_unlock(&ctx->completion_lock);
3158 0 : if (list_empty(&list))
3159 : return false;
3160 :
3161 0 : while (!list_empty(&list)) {
3162 0 : de = list_first_entry(&list, struct io_defer_entry, list);
3163 0 : list_del_init(&de->list);
3164 0 : io_req_task_queue_fail(de->req, -ECANCELED);
3165 0 : kfree(de);
3166 : }
3167 : return true;
3168 : }
3169 :
3170 0 : static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
3171 : {
3172 : struct io_tctx_node *node;
3173 : enum io_wq_cancel cret;
3174 0 : bool ret = false;
3175 :
3176 0 : mutex_lock(&ctx->uring_lock);
3177 0 : list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
3178 0 : struct io_uring_task *tctx = node->task->io_uring;
3179 :
3180 : /*
3181 : * io_wq will stay alive while we hold uring_lock, because it's
3182 : * killed after ctx nodes, which requires to take the lock.
3183           : 		 * killed after ctx nodes, which requires taking the lock.
3184 0 : if (!tctx || !tctx->io_wq)
3185 0 : continue;
3186 0 : cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
3187 0 : ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
3188 : }
3189 0 : mutex_unlock(&ctx->uring_lock);
3190 :
3191 0 : return ret;
3192 : }
3193 :
3194 0 : static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
3195 : struct task_struct *task,
3196 : bool cancel_all)
3197 : {
3198 0 : struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
3199 0 : struct io_uring_task *tctx = task ? task->io_uring : NULL;
3200 : enum io_wq_cancel cret;
3201 0 : bool ret = false;
3202 :
3203 : /* set it so io_req_local_work_add() would wake us up */
3204 0 : if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
3205 0 : atomic_set(&ctx->cq_wait_nr, 1);
3206 0 : smp_mb();
3207 : }
3208 :
3209 : /* failed during ring init, it couldn't have issued any requests */
3210 0 : if (!ctx->rings)
3211 : return false;
3212 :
3213 0 : if (!task) {
3214 0 : ret |= io_uring_try_cancel_iowq(ctx);
3215 0 : } else if (tctx && tctx->io_wq) {
3216 : /*
3217 : * Cancels requests of all rings, not only @ctx, but
3218 : * it's fine as the task is in exit/exec.
3219 : */
3220 0 : cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
3221 : &cancel, true);
3222 0 : ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
3223 : }
3224 :
3225 : /* SQPOLL thread does its own polling */
3226 0 : if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
3227 0 : (ctx->sq_data && ctx->sq_data->thread == current)) {
3228 0 : while (!wq_list_empty(&ctx->iopoll_list)) {
3229 0 : io_iopoll_try_reap_events(ctx);
3230 0 : ret = true;
3231 0 : cond_resched();
3232 : }
3233 : }
3234 :
3235 0 : if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
3236 0 : io_allowed_defer_tw_run(ctx))
3237 0 : ret |= io_run_local_work(ctx) > 0;
3238 0 : ret |= io_cancel_defer_files(ctx, task, cancel_all);
3239 0 : mutex_lock(&ctx->uring_lock);
3240 0 : ret |= io_poll_remove_all(ctx, task, cancel_all);
3241 0 : mutex_unlock(&ctx->uring_lock);
3242 0 : ret |= io_kill_timeouts(ctx, task, cancel_all);
3243 0 : if (task)
3244 0 : ret |= io_run_task_work() > 0;
3245 : return ret;
3246 : }
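               :
               : /*
               :  * io_uring_try_cancel_requests() sweeps every place a request can
               :  * hide: io-wq work, the iopoll list, deferred (drain) entries,
               :  * armed poll requests and timeouts, plus pending local/task work.
               :  * The return value tells the caller whether anything was found so
               :  * it knows to loop again.
               :  */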
3247 :
3248 : static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
3249 : {
3250 0 : if (tracked)
3251 0 : return atomic_read(&tctx->inflight_tracked);
3252 0 : return percpu_counter_sum(&tctx->inflight);
3253 : }
3254 :
3255 : /*
3256 : * Find any io_uring ctx that this task has registered or done IO on, and cancel
3257 : * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
3258 : */
3259 0 : __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
3260 : {
3261 0 : struct io_uring_task *tctx = current->io_uring;
3262 : struct io_ring_ctx *ctx;
3263 : struct io_tctx_node *node;
3264 : unsigned long index;
3265 : s64 inflight;
3266 0 : DEFINE_WAIT(wait);
3267 :
3268 0 : WARN_ON_ONCE(sqd && sqd->thread != current);
3269 :
3270 0 : if (!current->io_uring)
3271 0 : return;
3272 0 : if (tctx->io_wq)
3273 0 : io_wq_exit_start(tctx->io_wq);
3274 :
3275 0 : atomic_inc(&tctx->in_cancel);
3276 : do {
3277 0 : bool loop = false;
3278 :
3279 0 : io_uring_drop_tctx_refs(current);
3280 : /* read completions before cancelations */
3281 0 : inflight = tctx_inflight(tctx, !cancel_all);
3282 0 : if (!inflight)
3283 : break;
3284 :
3285 0 : if (!sqd) {
3286 0 : xa_for_each(&tctx->xa, index, node) {
3287 : /* sqpoll task will cancel all its requests */
3288 0 : if (node->ctx->sq_data)
3289 0 : continue;
3290 0 : loop |= io_uring_try_cancel_requests(node->ctx,
3291 0 : current, cancel_all);
3292 : }
3293 : } else {
3294 0 : list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
3295 0 : loop |= io_uring_try_cancel_requests(ctx,
3296 0 : current,
3297 : cancel_all);
3298 : }
3299 :
3300 0 : if (loop) {
3301 0 : cond_resched();
3302 0 : continue;
3303 : }
3304 :
3305 0 : prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
3306 0 : io_run_task_work();
3307 0 : io_uring_drop_tctx_refs(current);
3308 0 : xa_for_each(&tctx->xa, index, node) {
3309 0 : if (!llist_empty(&node->ctx->work_llist)) {
3310 0 : WARN_ON_ONCE(node->ctx->submitter_task &&
3311 : node->ctx->submitter_task != current);
3312 : goto end_wait;
3313 : }
3314 : }
3315 : /*
3316 : * If we've seen completions, retry without waiting. This
3317 : * avoids a race where a completion comes in before we did
3318 : * prepare_to_wait().
3319 : */
3320 0 : if (inflight == tctx_inflight(tctx, !cancel_all))
3321 0 : schedule();
3322 : end_wait:
3323 0 : finish_wait(&tctx->wait, &wait);
3324 : } while (1);
3325 :
3326 0 : io_uring_clean_tctx(tctx);
3327 0 : if (cancel_all) {
3328 : /*
3329 : * We shouldn't run task_works after cancel, so just leave
3330 : * ->in_cancel set for normal exit.
3331 : */
3332 0 : atomic_dec(&tctx->in_cancel);
3333 : /* for exec all current's requests should be gone, kill tctx */
3334 0 : __io_uring_free(current);
3335 : }
3336 : }
3337 :
3338 0 : void __io_uring_cancel(bool cancel_all)
3339 : {
3340 0 : io_uring_cancel_generic(cancel_all, NULL);
3341 0 : }
3342 :
3343 0 : static void *io_uring_validate_mmap_request(struct file *file,
3344 : loff_t pgoff, size_t sz)
3345 : {
3346 0 : struct io_ring_ctx *ctx = file->private_data;
3347 0 : loff_t offset = pgoff << PAGE_SHIFT;
3348 : struct page *page;
3349 : void *ptr;
3350 :
3351 0 : switch (offset & IORING_OFF_MMAP_MASK) {
3352 : case IORING_OFF_SQ_RING:
3353 : case IORING_OFF_CQ_RING:
3354 0 : ptr = ctx->rings;
3355 : break;
3356 : case IORING_OFF_SQES:
3357 0 : ptr = ctx->sq_sqes;
3358 : break;
3359 : case IORING_OFF_PBUF_RING: {
3360 : unsigned int bgid;
3361 :
3362 0 : bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
3363 0 : mutex_lock(&ctx->uring_lock);
3364 0 : ptr = io_pbuf_get_address(ctx, bgid);
3365 0 : mutex_unlock(&ctx->uring_lock);
3366 0 : if (!ptr)
3367 : return ERR_PTR(-EINVAL);
3368 : break;
3369 : }
3370 : default:
3371 : return ERR_PTR(-EINVAL);
3372 : }
3373 :
3374 0 : page = virt_to_head_page(ptr);
3375 0 : if (sz > page_size(page))
3376 : return ERR_PTR(-EINVAL);
3377 :
3378 : return ptr;
3379 : }
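               :
               : /*
               :  * The mmap offset selects which region is mapped: the SQ and CQ
               :  * ring metadata share one allocation (ctx->rings), the SQE array
               :  * is separate (ctx->sq_sqes), and IORING_OFF_PBUF_RING encodes the
               :  * buffer group id above IORING_OFF_PBUF_SHIFT.  The requested size
               :  * is also capped to the backing compound page.
               :  */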
3380 :
3381 : #ifdef CONFIG_MMU
3382 :
3383 0 : static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
3384 : {
3385 0 : size_t sz = vma->vm_end - vma->vm_start;
3386 : unsigned long pfn;
3387 : void *ptr;
3388 :
3389 0 : ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
3390 0 : if (IS_ERR(ptr))
3391 0 : return PTR_ERR(ptr);
3392 :
3393 0 : pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
3394 0 : return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
3395 : }
3396 :
3397 0 : static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
3398 : unsigned long addr, unsigned long len,
3399 : unsigned long pgoff, unsigned long flags)
3400 : {
3401 0 : const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
3402 : struct vm_unmapped_area_info info;
3403 : void *ptr;
3404 :
3405 : /*
3406           : 	 * Do not allow mapping to a user-provided address, to avoid breaking
3407           : 	 * the aliasing rules. Userspace is not able to guess the offset
3408           : 	 * address of the kernel kmalloc()ed memory area.
3409 : */
3410 0 : if (addr)
3411 : return -EINVAL;
3412 :
3413 0 : ptr = io_uring_validate_mmap_request(filp, pgoff, len);
3414 0 : if (IS_ERR(ptr))
3415 : return -ENOMEM;
3416 :
3417 0 : info.flags = VM_UNMAPPED_AREA_TOPDOWN;
3418 0 : info.length = len;
3419 0 : info.low_limit = max(PAGE_SIZE, mmap_min_addr);
3420 0 : info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
3421 : #ifdef SHM_COLOUR
3422 : info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
3423 : #else
3424 0 : info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
3425 : #endif
3426 0 : info.align_offset = (unsigned long) ptr;
3427 :
3428 : /*
3429 : * A failed mmap() very likely causes application failure,
3430 : * so fall back to the bottom-up function here. This scenario
3431 : * can happen with large stack limits and large mmap()
3432 : * allocations.
3433 : */
3434 0 : addr = vm_unmapped_area(&info);
3435 0 : if (offset_in_page(addr)) {
3436 0 : info.flags = 0;
3437 0 : info.low_limit = TASK_UNMAPPED_BASE;
3438 0 : info.high_limit = mmap_end;
3439 0 : addr = vm_unmapped_area(&info);
3440 : }
3441 :
3442 : return addr;
3443 : }
3444 :
3445 : #else /* !CONFIG_MMU */
3446 :
3447 : static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
3448 : {
3449 : return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
3450 : }
3451 :
3452 : static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
3453 : {
3454 : return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
3455 : }
3456 :
3457 : static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
3458 : unsigned long addr, unsigned long len,
3459 : unsigned long pgoff, unsigned long flags)
3460 : {
3461 : void *ptr;
3462 :
3463 : ptr = io_uring_validate_mmap_request(file, pgoff, len);
3464 : if (IS_ERR(ptr))
3465 : return PTR_ERR(ptr);
3466 :
3467 : return (unsigned long) ptr;
3468 : }
3469 :
3470 : #endif /* !CONFIG_MMU */
3471 :
3472 : static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
3473 : {
3474 0 : if (flags & IORING_ENTER_EXT_ARG) {
3475 : struct io_uring_getevents_arg arg;
3476 :
3477 0 : if (argsz != sizeof(arg))
3478 0 : return -EINVAL;
3479 0 : if (copy_from_user(&arg, argp, sizeof(arg)))
3480 : return -EFAULT;
3481 : }
3482 : return 0;
3483 : }
3484 :
3485 0 : static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
3486 : struct __kernel_timespec __user **ts,
3487 : const sigset_t __user **sig)
3488 : {
3489 : struct io_uring_getevents_arg arg;
3490 :
3491 : /*
3492 : * If EXT_ARG isn't set, then we have no timespec and the argp pointer
3493 : * is just a pointer to the sigset_t.
3494 : */
3495 0 : if (!(flags & IORING_ENTER_EXT_ARG)) {
3496 0 : *sig = (const sigset_t __user *) argp;
3497 0 : *ts = NULL;
3498 0 : return 0;
3499 : }
3500 :
3501 : /*
3502 : * EXT_ARG is set - ensure we agree on the size of it and copy in our
3503 : * timespec and sigset_t pointers if good.
3504 : */
3505 0 : if (*argsz != sizeof(arg))
3506 : return -EINVAL;
3507 0 : if (copy_from_user(&arg, argp, sizeof(arg)))
3508 : return -EFAULT;
3509 0 : if (arg.pad)
3510 : return -EINVAL;
3511 0 : *sig = u64_to_user_ptr(arg.sigmask);
3512 0 : *argsz = arg.sigmask_sz;
3513 0 : *ts = u64_to_user_ptr(arg.ts);
3514 0 : return 0;
3515 : }
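/*
 * Editorial illustration (not part of this file): a hedged sketch of the
 * userspace side of io_get_ext_arg() above. With IORING_ENTER_EXT_ARG set,
 * argp points at struct io_uring_getevents_arg and argsz must be its size;
 * the sigmask and timeout then travel inside that struct. "ring_fd" is
 * assumed to be an io_uring fd.
 */
#if 0	/* example only */
#include <stdint.h>
#include <signal.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <linux/time_types.h>

static int wait_one_cqe_with_timeout(int ring_fd)
{
	struct __kernel_timespec ts = { .tv_sec = 1 };
	sigset_t mask;
	struct io_uring_getevents_arg arg = {
		.sigmask	= (unsigned long long)(uintptr_t)&mask,
		.sigmask_sz	= _NSIG / 8,	/* kernel sigset size */
		.ts		= (unsigned long long)(uintptr_t)&ts,
	};

	sigfillset(&mask);
	sigdelset(&mask, SIGINT);

	return syscall(__NR_io_uring_enter, ring_fd, 0, 1,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}
#endif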
3516 :
3517 0 : SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
3518 : u32, min_complete, u32, flags, const void __user *, argp,
3519 : size_t, argsz)
3520 : {
3521 : struct io_ring_ctx *ctx;
3522 : struct fd f;
3523 : long ret;
3524 :
3525 0 : if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
3526 : IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
3527 : IORING_ENTER_REGISTERED_RING)))
3528 : return -EINVAL;
3529 :
3530 : /*
3531 : * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
3532 : * need only dereference our task private array to find it.
3533 : */
3534 0 : if (flags & IORING_ENTER_REGISTERED_RING) {
3535 0 : struct io_uring_task *tctx = current->io_uring;
3536 :
3537 0 : if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
3538 : return -EINVAL;
3539 0 : fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
3540 0 : f.file = tctx->registered_rings[fd];
3541 0 : f.flags = 0;
3542 0 : if (unlikely(!f.file))
3543 : return -EBADF;
3544 : } else {
3545 0 : f = fdget(fd);
3546 0 : if (unlikely(!f.file))
3547 : return -EBADF;
3548 0 : ret = -EOPNOTSUPP;
3549 0 : if (unlikely(!io_is_uring_fops(f.file)))
3550 : goto out;
3551 : }
3552 :
3553 0 : ctx = f.file->private_data;
3554 0 : ret = -EBADFD;
3555 0 : if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
3556 : goto out;
3557 :
3558 : /*
3559 : * For SQ polling, the thread will do all submissions and completions.
3560 : * Just return the requested submit count, and wake the thread if
3561 : * we were asked to.
3562 : */
3563 0 : ret = 0;
3564 0 : if (ctx->flags & IORING_SETUP_SQPOLL) {
3565 0 : io_cqring_overflow_flush(ctx);
3566 :
3567 0 : if (unlikely(ctx->sq_data->thread == NULL)) {
3568 : ret = -EOWNERDEAD;
3569 : goto out;
3570 : }
3571 0 : if (flags & IORING_ENTER_SQ_WAKEUP)
3572 0 : wake_up(&ctx->sq_data->wait);
3573 0 : if (flags & IORING_ENTER_SQ_WAIT)
3574 0 : io_sqpoll_wait_sq(ctx);
3575 :
3576 0 : ret = to_submit;
3577 0 : } else if (to_submit) {
3578 0 : ret = io_uring_add_tctx_node(ctx);
3579 0 : if (unlikely(ret))
3580 : goto out;
3581 :
3582 0 : mutex_lock(&ctx->uring_lock);
3583 0 : ret = io_submit_sqes(ctx, to_submit);
3584 0 : if (ret != to_submit) {
3585 0 : mutex_unlock(&ctx->uring_lock);
3586 0 : goto out;
3587 : }
3588 0 : if (flags & IORING_ENTER_GETEVENTS) {
3589 0 : if (ctx->syscall_iopoll)
3590 : goto iopoll_locked;
3591 : /*
3592 : * Ignore errors, we'll soon call io_cqring_wait() and
3593 : * it should handle ownership problems if any.
3594 : */
3595 0 : if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
3596 0 : (void)io_run_local_work_locked(ctx);
3597 : }
3598 0 : mutex_unlock(&ctx->uring_lock);
3599 : }
3600 :
3601 0 : if (flags & IORING_ENTER_GETEVENTS) {
3602 : int ret2;
3603 :
3604 0 : if (ctx->syscall_iopoll) {
3605 : /*
3606 : * We disallow the app entering submit/complete with
3607 : * polling, but we still need to lock the ring to
3608 : * prevent racing with polled issue that got punted to
3609 : * a workqueue.
3610 : */
3611 0 : mutex_lock(&ctx->uring_lock);
3612 : iopoll_locked:
3613 0 : ret2 = io_validate_ext_arg(flags, argp, argsz);
3614 0 : if (likely(!ret2)) {
3615 0 : min_complete = min(min_complete,
3616 : ctx->cq_entries);
3617 0 : ret2 = io_iopoll_check(ctx, min_complete);
3618 : }
3619 0 : mutex_unlock(&ctx->uring_lock);
3620 : } else {
3621 : const sigset_t __user *sig;
3622 : struct __kernel_timespec __user *ts;
3623 :
3624 0 : ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
3625 0 : if (likely(!ret2)) {
3626 0 : min_complete = min(min_complete,
3627 : ctx->cq_entries);
3628 0 : ret2 = io_cqring_wait(ctx, min_complete, sig,
3629 : argsz, ts);
3630 : }
3631 : }
3632 :
3633 0 : if (!ret) {
3634 0 : ret = ret2;
3635 :
3636 : /*
3637 : * EBADR indicates that one or more CQEs were dropped.
3638 : * Once the user has been informed, we can clear the bit
3639 : * as they are obviously ok with those drops.
3640 : */
3641 0 : if (unlikely(ret2 == -EBADR))
3642 : clear_bit(IO_CHECK_CQ_DROPPED_BIT,
3643 0 : &ctx->check_cq);
3644 : }
3645 : }
3646 : out:
3647 0 : fdput(f);
3648 : return ret;
3649 : }
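/*
 * Editorial illustration (not part of this file): a sketch of the SQPOLL
 * branch handled above. An application using IORING_SETUP_SQPOLL normally
 * skips the syscall and only calls io_uring_enter() with
 * IORING_ENTER_SQ_WAKEUP once the poll thread has gone idle and set
 * IORING_SQ_NEED_WAKEUP in the SQ ring flags. "sq_flags" is assumed to
 * point at the mapped SQ ring flags word.
 */
#if 0	/* example only */
#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static void submit_sqpoll(int ring_fd, unsigned to_submit, unsigned *sq_flags)
{
	/* The required memory ordering around this check is omitted here. */
	if (*(volatile unsigned *)sq_flags & IORING_SQ_NEED_WAKEUP)
		syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
			IORING_ENTER_SQ_WAKEUP, NULL, 0);
}
#endif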
3650 :
3651 : static const struct file_operations io_uring_fops = {
3652 : .release = io_uring_release,
3653 : .mmap = io_uring_mmap,
3654 : #ifndef CONFIG_MMU
3655 : .get_unmapped_area = io_uring_nommu_get_unmapped_area,
3656 : .mmap_capabilities = io_uring_nommu_mmap_capabilities,
3657 : #else
3658 : .get_unmapped_area = io_uring_mmu_get_unmapped_area,
3659 : #endif
3660 : .poll = io_uring_poll,
3661 : #ifdef CONFIG_PROC_FS
3662 : .show_fdinfo = io_uring_show_fdinfo,
3663 : #endif
3664 : };
3665 :
3666 0 : bool io_is_uring_fops(struct file *file)
3667 : {
3668 0 : return file->f_op == &io_uring_fops;
3669 : }
3670 :
3671 0 : static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
3672 : struct io_uring_params *p)
3673 : {
3674 : struct io_rings *rings;
3675 : size_t size, sq_array_offset;
3676 :
3677 : /* make sure these are sane, as we already accounted them */
3678 0 : ctx->sq_entries = p->sq_entries;
3679 0 : ctx->cq_entries = p->cq_entries;
3680 :
3681 0 : size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
3682 0 : if (size == SIZE_MAX)
3683 : return -EOVERFLOW;
3684 :
3685 0 : rings = io_mem_alloc(size);
3686 0 : if (!rings)
3687 : return -ENOMEM;
3688 :
3689 0 : ctx->rings = rings;
3690 0 : ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
3691 0 : rings->sq_ring_mask = p->sq_entries - 1;
3692 0 : rings->cq_ring_mask = p->cq_entries - 1;
3693 0 : rings->sq_ring_entries = p->sq_entries;
3694 0 : rings->cq_ring_entries = p->cq_entries;
3695 :
3696 0 : if (p->flags & IORING_SETUP_SQE128)
3697 0 : size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
3698 : else
3699 0 : size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
3700 0 : if (size == SIZE_MAX) {
3701 0 : io_mem_free(ctx->rings);
3702 0 : ctx->rings = NULL;
3703 0 : return -EOVERFLOW;
3704 : }
3705 :
3706 0 : ctx->sq_sqes = io_mem_alloc(size);
3707 0 : if (!ctx->sq_sqes) {
3708 0 : io_mem_free(ctx->rings);
3709 0 : ctx->rings = NULL;
3710 0 : return -ENOMEM;
3711 : }
3712 :
3713 : return 0;
3714 : }
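/*
 * Editorial illustration (not part of this file): because io_uring_create()
 * rounds both ring sizes up to a power of two, the masks written above let
 * either side turn a free-running 32-bit index into a ring slot with a
 * single AND instead of a modulo. For example, with sq_entries == 8:
 *
 *	mask = 8 - 1 = 7
 *	tail = 9  ->  slot = 9 & 7 = 1	(wraps past the end of the ring)
 */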
3715 :
3716 0 : static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
3717 : {
3718 : int ret, fd;
3719 :
3720 0 : fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
3721 0 : if (fd < 0)
3722 : return fd;
3723 :
3724 0 : ret = __io_uring_add_tctx_node(ctx);
3725 0 : if (ret) {
3726 0 : put_unused_fd(fd);
3727 0 : return ret;
3728 : }
3729 0 : fd_install(fd, file);
3730 0 : return fd;
3731 : }
3732 :
3733 : /*
3734 : * Allocate an anonymous fd; this is what constitutes the application-visible
3735 : * backing of an io_uring instance. The application mmaps this fd to gain
3736 : * access to the SQ/CQ ring details. If UNIX sockets are enabled, we have to
3737 : * tie this fd to a socket for file garbage collection purposes.
3738 : */
3739 : static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
3740 : {
3741 : struct file *file;
3742 : #if defined(CONFIG_UNIX)
3743 : int ret;
3744 :
3745 : ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
3746 : &ctx->ring_sock);
3747 : if (ret)
3748 : return ERR_PTR(ret);
3749 : #endif
3750 :
3751 0 : file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
3752 : O_RDWR | O_CLOEXEC, NULL);
3753 : #if defined(CONFIG_UNIX)
3754 : if (IS_ERR(file)) {
3755 : sock_release(ctx->ring_sock);
3756 : ctx->ring_sock = NULL;
3757 : } else {
3758 : ctx->ring_sock->file = file;
3759 : }
3760 : #endif
3761 : return file;
3762 : }
3763 :
3764 0 : static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
3765 : struct io_uring_params __user *params)
3766 : {
3767 : struct io_ring_ctx *ctx;
3768 : struct file *file;
3769 : int ret;
3770 :
3771 0 : if (!entries)
3772 : return -EINVAL;
3773 0 : if (entries > IORING_MAX_ENTRIES) {
3774 0 : if (!(p->flags & IORING_SETUP_CLAMP))
3775 : return -EINVAL;
3776 : entries = IORING_MAX_ENTRIES;
3777 : }
3778 :
3779 : /*
3780 : * Use twice as many entries for the CQ ring. It's possible for the
3781 : * application to drive a higher depth than the size of the SQ ring,
3782 : * since the sqes are only used at submission time. This allows for
3783 : * some flexibility in overcommitting a bit. If the application has
3784 : * set IORING_SETUP_CQSIZE, it will have passed in the desired number
3785 : * of CQ ring entries manually.
3786 : */
3787 0 : p->sq_entries = roundup_pow_of_two(entries);
3788 0 : if (p->flags & IORING_SETUP_CQSIZE) {
3789 : /*
3790 : * If IORING_SETUP_CQSIZE is set, we do the same roundup
3791 : * to a power-of-two, if it isn't already. Other than requiring cq_entries
3792 : * to be at least as large as sq_entries, we do NOT impose any cq vs sq sizing.
3793 : */
3794 0 : if (!p->cq_entries)
3795 : return -EINVAL;
3796 0 : if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
3797 0 : if (!(p->flags & IORING_SETUP_CLAMP))
3798 : return -EINVAL;
3799 0 : p->cq_entries = IORING_MAX_CQ_ENTRIES;
3800 : }
3801 0 : p->cq_entries = roundup_pow_of_two(p->cq_entries);
3802 0 : if (p->cq_entries < p->sq_entries)
3803 : return -EINVAL;
3804 : } else {
3805 0 : p->cq_entries = 2 * p->sq_entries;
3806 : }
3807 :
3808 0 : ctx = io_ring_ctx_alloc(p);
3809 0 : if (!ctx)
3810 : return -ENOMEM;
3811 :
3812 0 : if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
3813 0 : !(ctx->flags & IORING_SETUP_IOPOLL) &&
3814 : !(ctx->flags & IORING_SETUP_SQPOLL))
3815 0 : ctx->task_complete = true;
3816 :
3817 : /*
3818 : * lazy poll_wq activation relies on ->task_complete for synchronisation
3819 : * purposes, see io_activate_pollwq()
3820 : */
3821 0 : if (!ctx->task_complete)
3822 0 : ctx->poll_activated = true;
3823 :
3824 : /*
3825 : * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
3826 : * space applications don't need to poll for io completion
3827 : * events themselves; they can rely on io_sq_thread to do the
3828 : * polling work, which reduces cpu usage and uring_lock contention.
3829 : */
3830 0 : if (ctx->flags & IORING_SETUP_IOPOLL &&
3831 : !(ctx->flags & IORING_SETUP_SQPOLL))
3832 0 : ctx->syscall_iopoll = 1;
3833 :
3834 0 : ctx->compat = in_compat_syscall();
3835 0 : if (!capable(CAP_IPC_LOCK))
3836 0 : ctx->user = get_uid(current_user());
3837 :
3838 : /*
3839 : * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
3840 : * COOP_TASKRUN is set, then IPIs are never needed by the app.
3841 : */
3842 0 : ret = -EINVAL;
3843 0 : if (ctx->flags & IORING_SETUP_SQPOLL) {
3844 : /* IPI related flags don't make sense with SQPOLL */
3845 0 : if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
3846 : IORING_SETUP_TASKRUN_FLAG |
3847 : IORING_SETUP_DEFER_TASKRUN))
3848 : goto err;
3849 0 : ctx->notify_method = TWA_SIGNAL_NO_IPI;
3850 0 : } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
3851 0 : ctx->notify_method = TWA_SIGNAL_NO_IPI;
3852 : } else {
3853 0 : if (ctx->flags & IORING_SETUP_TASKRUN_FLAG &&
3854 : !(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
3855 : goto err;
3856 0 : ctx->notify_method = TWA_SIGNAL;
3857 : }
3858 :
3859 : /*
3860 : * For DEFER_TASKRUN we require the completion task to be the same as the
3861 : * submission task. This implies that there is only one submitter, so enforce
3862 : * that.
3863 : */
3864 0 : if (ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
3865 : !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
3866 : goto err;
3867 : }
3868 :
3869 : /*
3870 : * This is just grabbed for accounting purposes. When a process exits,
3871 : * the mm is exited and dropped before the files, hence we need to hang
3872 : * on to this mm purely for the purposes of being able to unaccount
3873 : * memory (locked/pinned vm). It's not used for anything else.
3874 : */
3875 0 : mmgrab(current->mm);
3876 0 : ctx->mm_account = current->mm;
3877 :
3878 0 : ret = io_allocate_scq_urings(ctx, p);
3879 0 : if (ret)
3880 : goto err;
3881 :
3882 0 : ret = io_sq_offload_create(ctx, p);
3883 0 : if (ret)
3884 : goto err;
3885 :
3886 0 : ret = io_rsrc_init(ctx);
3887 0 : if (ret)
3888 : goto err;
3889 :
3890 0 : memset(&p->sq_off, 0, sizeof(p->sq_off));
3891 0 : p->sq_off.head = offsetof(struct io_rings, sq.head);
3892 0 : p->sq_off.tail = offsetof(struct io_rings, sq.tail);
3893 0 : p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
3894 0 : p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
3895 0 : p->sq_off.flags = offsetof(struct io_rings, sq_flags);
3896 0 : p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
3897 0 : p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
3898 :
3899 0 : memset(&p->cq_off, 0, sizeof(p->cq_off));
3900 0 : p->cq_off.head = offsetof(struct io_rings, cq.head);
3901 0 : p->cq_off.tail = offsetof(struct io_rings, cq.tail);
3902 0 : p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
3903 0 : p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
3904 0 : p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
3905 0 : p->cq_off.cqes = offsetof(struct io_rings, cqes);
3906 0 : p->cq_off.flags = offsetof(struct io_rings, cq_flags);
3907 :
3908 0 : p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
3909 : IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
3910 : IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
3911 : IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
3912 : IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
3913 : IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
3914 : IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING;
3915 :
3916 0 : if (copy_to_user(params, p, sizeof(*p))) {
3917 : ret = -EFAULT;
3918 : goto err;
3919 : }
3920 :
3921 0 : if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
3922 0 : && !(ctx->flags & IORING_SETUP_R_DISABLED))
3923 0 : WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
3924 :
3925 0 : file = io_uring_get_file(ctx);
3926 0 : if (IS_ERR(file)) {
3927 0 : ret = PTR_ERR(file);
3928 0 : goto err;
3929 : }
3930 :
3931 : /*
3932 : * Install ring fd as the very last thing, so we don't risk someone
3933 : * having closed it before we finish setup
3934 : */
3935 0 : ret = io_uring_install_fd(ctx, file);
3936 0 : if (ret < 0) {
3937 : /* fput will clean it up */
3938 0 : fput(file);
3939 0 : return ret;
3940 : }
3941 :
3942 : trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
3943 : return ret;
3944 : err:
3945 0 : io_ring_ctx_wait_and_kill(ctx);
3946 0 : return ret;
3947 : }
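/*
 * Editorial illustration (not part of this file): a sketch of how the
 * sizing rules implemented above look from userspace. Both requested sizes
 * are rounded up to powers of two, and IORING_SETUP_CLAMP caps rather than
 * rejects oversized requests.
 */
#if 0	/* example only */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int setup_ring_with_big_cq(void)
{
	struct io_uring_params p;
	int ring_fd;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
	p.cq_entries = 4096;	/* must end up >= sq_entries after rounding */

	ring_fd = syscall(__NR_io_uring_setup, 64, &p);
	if (ring_fd >= 0) {
		/* p.sq_entries / p.cq_entries now hold the actual sizes */
	}
	return ring_fd;
}
#endif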
3948 :
3949 : /*
3950 : * Sets up an io_uring context, and returns the fd. The application asks for a
3951 : * ring size; we return the actual sq/cq ring sizes (among other things) in the
3952 : * params structure passed in.
3953 : */
3954 0 : static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
3955 : {
3956 : struct io_uring_params p;
3957 : int i;
3958 :
3959 0 : if (copy_from_user(&p, params, sizeof(p)))
3960 : return -EFAULT;
3961 0 : for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
3962 0 : if (p.resv[i])
3963 : return -EINVAL;
3964 : }
3965 :
3966 0 : if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
3967 : IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
3968 : IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
3969 : IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
3970 : IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
3971 : IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
3972 : IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN))
3973 : return -EINVAL;
3974 :
3975 0 : return io_uring_create(entries, &p, params);
3976 : }
3977 :
3978 0 : SYSCALL_DEFINE2(io_uring_setup, u32, entries,
3979 : struct io_uring_params __user *, params)
3980 : {
3981 0 : return io_uring_setup(entries, params);
3982 : }
3983 :
3984 0 : static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
3985 : unsigned nr_args)
3986 : {
3987 : struct io_uring_probe *p;
3988 : size_t size;
3989 : int i, ret;
3990 :
3991 0 : size = struct_size(p, ops, nr_args);
3992 0 : if (size == SIZE_MAX)
3993 : return -EOVERFLOW;
3994 0 : p = kzalloc(size, GFP_KERNEL);
3995 0 : if (!p)
3996 : return -ENOMEM;
3997 :
3998 0 : ret = -EFAULT;
3999 0 : if (copy_from_user(p, arg, size))
4000 : goto out;
4001 0 : ret = -EINVAL;
4002 0 : if (memchr_inv(p, 0, size))
4003 : goto out;
4004 :
4005 0 : p->last_op = IORING_OP_LAST - 1;
4006 0 : if (nr_args > IORING_OP_LAST)
4007 0 : nr_args = IORING_OP_LAST;
4008 :
4009 0 : for (i = 0; i < nr_args; i++) {
4010 0 : p->ops[i].op = i;
4011 0 : if (!io_issue_defs[i].not_supported)
4012 0 : p->ops[i].flags = IO_URING_OP_SUPPORTED;
4013 : }
4014 0 : p->ops_len = i;
4015 :
4016 0 : ret = 0;
4017 0 : if (copy_to_user(arg, p, size))
4018 0 : ret = -EFAULT;
4019 : out:
4020 0 : kfree(p);
4021 : return ret;
4022 : }
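/*
 * Editorial illustration (not part of this file): a sketch of consuming the
 * probe data filled in by io_probe() above, e.g. to check whether a given
 * opcode is supported by the running kernel.
 */
#if 0	/* example only */
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int opcode_supported(int ring_fd, unsigned op)
{
	struct io_uring_probe *p;
	size_t len = sizeof(*p) + 256 * sizeof(struct io_uring_probe_op);
	int ret, supported = 0;

	p = calloc(1, len);	/* struct must be zeroed before registering */
	if (!p)
		return -1;

	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_PROBE, p, 256);
	if (!ret && op < p->ops_len)
		supported = p->ops[op].flags & IO_URING_OP_SUPPORTED;

	free(p);
	return ret ? -1 : !!supported;
}
#endif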
4023 :
4024 0 : static int io_register_personality(struct io_ring_ctx *ctx)
4025 : {
4026 : const struct cred *creds;
4027 : u32 id;
4028 : int ret;
4029 :
4030 0 : creds = get_current_cred();
4031 :
4032 0 : ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
4033 0 : XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
4034 0 : if (ret < 0) {
4035 : put_cred(creds);
4036 : return ret;
4037 : }
4038 0 : return id;
4039 : }
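/*
 * Editorial illustration (not part of this file): a sketch of using the
 * personality id returned above. The id is registered while holding one set
 * of credentials and later referenced from an SQE, so that individual
 * requests run with those saved credentials.
 */
#if 0	/* example only */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int register_current_creds(int ring_fd)
{
	/* returns a personality id (>= 0) or a negative error */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PERSONALITY, NULL, 0);
}

static void use_personality(struct io_uring_sqe *sqe, int id)
{
	sqe->personality = id;	/* request runs with the registered creds */
}
#endif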
4040 :
4041 0 : static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
4042 : void __user *arg, unsigned int nr_args)
4043 : {
4044 : struct io_uring_restriction *res;
4045 : size_t size;
4046 : int i, ret;
4047 :
4048 : /* Restrictions allowed only if rings started disabled */
4049 0 : if (!(ctx->flags & IORING_SETUP_R_DISABLED))
4050 : return -EBADFD;
4051 :
4052 : /* We allow only a single restrictions registration */
4053 0 : if (ctx->restrictions.registered)
4054 : return -EBUSY;
4055 :
4056 0 : if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
4057 : return -EINVAL;
4058 :
4059 0 : size = array_size(nr_args, sizeof(*res));
4060 0 : if (size == SIZE_MAX)
4061 : return -EOVERFLOW;
4062 :
4063 0 : res = memdup_user(arg, size);
4064 0 : if (IS_ERR(res))
4065 0 : return PTR_ERR(res);
4066 :
4067 : ret = 0;
4068 :
4069 0 : for (i = 0; i < nr_args; i++) {
4070 0 : switch (res[i].opcode) {
4071 : case IORING_RESTRICTION_REGISTER_OP:
4072 0 : if (res[i].register_op >= IORING_REGISTER_LAST) {
4073 : ret = -EINVAL;
4074 : goto out;
4075 : }
4076 :
4077 0 : __set_bit(res[i].register_op,
4078 : ctx->restrictions.register_op);
4079 : break;
4080 : case IORING_RESTRICTION_SQE_OP:
4081 0 : if (res[i].sqe_op >= IORING_OP_LAST) {
4082 : ret = -EINVAL;
4083 : goto out;
4084 : }
4085 :
4086 0 : __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
4087 : break;
4088 : case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
4089 0 : ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
4090 0 : break;
4091 : case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
4092 0 : ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
4093 0 : break;
4094 : default:
4095 : ret = -EINVAL;
4096 : goto out;
4097 : }
4098 : }
4099 :
4100 : out:
4101 : /* Reset all restrictions if an error happened */
4102 0 : if (ret != 0)
4103 0 : memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
4104 : else
4105 0 : ctx->restrictions.registered = true;
4106 :
4107 0 : kfree(res);
4108 0 : return ret;
4109 : }
4110 :
4111 0 : static int io_register_enable_rings(struct io_ring_ctx *ctx)
4112 : {
4113 0 : if (!(ctx->flags & IORING_SETUP_R_DISABLED))
4114 : return -EBADFD;
4115 :
4116 0 : if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
4117 0 : WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
4118 : /*
4119 : * Lazy activation attempts would fail if the ring was polled before
4120 : * submitter_task is set.
4121 : */
4122 0 : if (wq_has_sleeper(&ctx->poll_wq))
4123 0 : io_activate_pollwq(ctx);
4124 : }
4125 :
4126 0 : if (ctx->restrictions.registered)
4127 0 : ctx->restricted = 1;
4128 :
4129 0 : ctx->flags &= ~IORING_SETUP_R_DISABLED;
4130 0 : if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
4131 0 : wake_up(&ctx->sq_data->wait);
4132 : return 0;
4133 : }
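/*
 * Editorial illustration (not part of this file): a sketch of the intended
 * flow for the two registration handlers above. A ring created with
 * IORING_SETUP_R_DISABLED first gets its restrictions registered and is
 * then enabled; after that, only the whitelisted operations are accepted.
 */
#if 0	/* example only */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int restrict_to_reads(int ring_fd)
{
	struct io_uring_restriction res[2] = {
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READ },
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READV },
	};
	int ret;

	/* ring must have been created with IORING_SETUP_R_DISABLED */
	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_RESTRICTIONS, res, 2);
	if (ret)
		return ret;

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}
#endif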
4134 :
4135 0 : static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
4136 : void __user *arg, unsigned len)
4137 : {
4138 0 : struct io_uring_task *tctx = current->io_uring;
4139 : cpumask_var_t new_mask;
4140 : int ret;
4141 :
4142 0 : if (!tctx || !tctx->io_wq)
4143 : return -EINVAL;
4144 :
4145 0 : if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4146 : return -ENOMEM;
4147 :
4148 0 : cpumask_clear(new_mask);
4149 0 : if (len > cpumask_size())
4150 0 : len = cpumask_size();
4151 :
4152 : if (in_compat_syscall()) {
4153 : ret = compat_get_bitmap(cpumask_bits(new_mask),
4154 : (const compat_ulong_t __user *)arg,
4155 : len * 8 /* CHAR_BIT */);
4156 : } else {
4157 0 : ret = copy_from_user(new_mask, arg, len);
4158 : }
4159 :
4160 0 : if (ret) {
4161 : free_cpumask_var(new_mask);
4162 : return -EFAULT;
4163 : }
4164 :
4165 0 : ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
4166 0 : free_cpumask_var(new_mask);
4167 : return ret;
4168 : }
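/*
 * Editorial illustration (not part of this file): a sketch of the userspace
 * side of io_register_iowq_aff() above. nr_args is the size of the mask in
 * bytes; the io-wq workers of the calling task are then bound to the given
 * CPUs.
 */
#if 0	/* example only */
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int pin_iowq_to_cpu0(int ring_fd)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
}
#endif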
4169 :
4170 0 : static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
4171 : {
4172 0 : struct io_uring_task *tctx = current->io_uring;
4173 :
4174 0 : if (!tctx || !tctx->io_wq)
4175 : return -EINVAL;
4176 :
4177 0 : return io_wq_cpu_affinity(tctx->io_wq, NULL);
4178 : }
4179 :
4180 0 : static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
4181 : void __user *arg)
4182 : __must_hold(&ctx->uring_lock)
4183 : {
4184 : struct io_tctx_node *node;
4185 0 : struct io_uring_task *tctx = NULL;
4186 0 : struct io_sq_data *sqd = NULL;
4187 : __u32 new_count[2];
4188 : int i, ret;
4189 :
4190 0 : if (copy_from_user(new_count, arg, sizeof(new_count)))
4191 : return -EFAULT;
4192 0 : for (i = 0; i < ARRAY_SIZE(new_count); i++)
4193 0 : if (new_count[i] > INT_MAX)
4194 : return -EINVAL;
4195 :
4196 0 : if (ctx->flags & IORING_SETUP_SQPOLL) {
4197 0 : sqd = ctx->sq_data;
4198 0 : if (sqd) {
4199 : /*
4200 : * Observe the correct sqd->lock -> ctx->uring_lock
4201 : * ordering. Fine to drop uring_lock here, we hold
4202 : * a ref to the ctx.
4203 : */
4204 0 : refcount_inc(&sqd->refs);
4205 0 : mutex_unlock(&ctx->uring_lock);
4206 0 : mutex_lock(&sqd->lock);
4207 0 : mutex_lock(&ctx->uring_lock);
4208 0 : if (sqd->thread)
4209 0 : tctx = sqd->thread->io_uring;
4210 : }
4211 : } else {
4212 0 : tctx = current->io_uring;
4213 : }
4214 :
4215 : BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
4216 :
4217 0 : for (i = 0; i < ARRAY_SIZE(new_count); i++)
4218 0 : if (new_count[i])
4219 0 : ctx->iowq_limits[i] = new_count[i];
4220 0 : ctx->iowq_limits_set = true;
4221 :
4222 0 : if (tctx && tctx->io_wq) {
4223 0 : ret = io_wq_max_workers(tctx->io_wq, new_count);
4224 0 : if (ret)
4225 : goto err;
4226 : } else {
4227 0 : memset(new_count, 0, sizeof(new_count));
4228 : }
4229 :
4230 0 : if (sqd) {
4231 0 : mutex_unlock(&sqd->lock);
4232 0 : io_put_sq_data(sqd);
4233 : }
4234 :
4235 0 : if (copy_to_user(arg, new_count, sizeof(new_count)))
4236 : return -EFAULT;
4237 :
4238 : /* that's it for SQPOLL, only the SQPOLL task creates requests */
4239 0 : if (sqd)
4240 : return 0;
4241 :
4242 : /* now propagate the restriction to all registered users */
4243 0 : list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
4244 0 : struct io_uring_task *tctx = node->task->io_uring;
4245 :
4246 0 : if (WARN_ON_ONCE(!tctx->io_wq))
4247 0 : continue;
4248 :
4249 0 : for (i = 0; i < ARRAY_SIZE(new_count); i++)
4250 0 : new_count[i] = ctx->iowq_limits[i];
4251 : /* ignore errors, it always returns zero anyway */
4252 0 : (void)io_wq_max_workers(tctx->io_wq, new_count);
4253 : }
4254 : return 0;
4255 : err:
4256 0 : if (sqd) {
4257 0 : mutex_unlock(&sqd->lock);
4258 0 : io_put_sq_data(sqd);
4259 : }
4260 : return ret;
4261 : }
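/*
 * Editorial illustration (not part of this file): a sketch of the userspace
 * side of io_register_iowq_max_workers() above. The two-element array holds
 * the bounded and unbounded worker limits; a zero entry leaves that limit
 * untouched, and the previously effective values are copied back on return.
 */
#if 0	/* example only */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int cap_unbounded_workers(int ring_fd, unsigned int max_unbounded)
{
	unsigned int counts[2] = { 0, max_unbounded };	/* [bounded, unbounded] */
	int ret;

	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
	/* on success, counts[] holds the limits that were in effect before */
	return ret;
}
#endif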
4262 :
4263 0 : static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
4264 : void __user *arg, unsigned nr_args)
4265 : __releases(ctx->uring_lock)
4266 : __acquires(ctx->uring_lock)
4267 : {
4268 : int ret;
4269 :
4270 : /*
4271 : * We don't quiesce the refs for register anymore and so it can't be
4272 : * dying as we're holding a file ref here.
4273 : */
4274 0 : if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
4275 : return -ENXIO;
4276 :
4277 0 : if (ctx->submitter_task && ctx->submitter_task != current)
4278 : return -EEXIST;
4279 :
4280 0 : if (ctx->restricted) {
4281 0 : opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
4282 0 : if (!test_bit(opcode, ctx->restrictions.register_op))
4283 : return -EACCES;
4284 : }
4285 :
4286 0 : switch (opcode) {
4287 : case IORING_REGISTER_BUFFERS:
4288 0 : ret = -EFAULT;
4289 0 : if (!arg)
4290 : break;
4291 0 : ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
4292 0 : break;
4293 : case IORING_UNREGISTER_BUFFERS:
4294 0 : ret = -EINVAL;
4295 0 : if (arg || nr_args)
4296 : break;
4297 0 : ret = io_sqe_buffers_unregister(ctx);
4298 0 : break;
4299 : case IORING_REGISTER_FILES:
4300 0 : ret = -EFAULT;
4301 0 : if (!arg)
4302 : break;
4303 0 : ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
4304 0 : break;
4305 : case IORING_UNREGISTER_FILES:
4306 0 : ret = -EINVAL;
4307 0 : if (arg || nr_args)
4308 : break;
4309 0 : ret = io_sqe_files_unregister(ctx);
4310 0 : break;
4311 : case IORING_REGISTER_FILES_UPDATE:
4312 0 : ret = io_register_files_update(ctx, arg, nr_args);
4313 0 : break;
4314 : case IORING_REGISTER_EVENTFD:
4315 0 : ret = -EINVAL;
4316 0 : if (nr_args != 1)
4317 : break;
4318 0 : ret = io_eventfd_register(ctx, arg, 0);
4319 0 : break;
4320 : case IORING_REGISTER_EVENTFD_ASYNC:
4321 0 : ret = -EINVAL;
4322 0 : if (nr_args != 1)
4323 : break;
4324 0 : ret = io_eventfd_register(ctx, arg, 1);
4325 0 : break;
4326 : case IORING_UNREGISTER_EVENTFD:
4327 0 : ret = -EINVAL;
4328 0 : if (arg || nr_args)
4329 : break;
4330 0 : ret = io_eventfd_unregister(ctx);
4331 0 : break;
4332 : case IORING_REGISTER_PROBE:
4333 0 : ret = -EINVAL;
4334 0 : if (!arg || nr_args > 256)
4335 : break;
4336 0 : ret = io_probe(ctx, arg, nr_args);
4337 0 : break;
4338 : case IORING_REGISTER_PERSONALITY:
4339 0 : ret = -EINVAL;
4340 0 : if (arg || nr_args)
4341 : break;
4342 0 : ret = io_register_personality(ctx);
4343 0 : break;
4344 : case IORING_UNREGISTER_PERSONALITY:
4345 0 : ret = -EINVAL;
4346 0 : if (arg)
4347 : break;
4348 0 : ret = io_unregister_personality(ctx, nr_args);
4349 0 : break;
4350 : case IORING_REGISTER_ENABLE_RINGS:
4351 0 : ret = -EINVAL;
4352 0 : if (arg || nr_args)
4353 : break;
4354 0 : ret = io_register_enable_rings(ctx);
4355 0 : break;
4356 : case IORING_REGISTER_RESTRICTIONS:
4357 0 : ret = io_register_restrictions(ctx, arg, nr_args);
4358 0 : break;
4359 : case IORING_REGISTER_FILES2:
4360 0 : ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
4361 0 : break;
4362 : case IORING_REGISTER_FILES_UPDATE2:
4363 0 : ret = io_register_rsrc_update(ctx, arg, nr_args,
4364 : IORING_RSRC_FILE);
4365 0 : break;
4366 : case IORING_REGISTER_BUFFERS2:
4367 0 : ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
4368 0 : break;
4369 : case IORING_REGISTER_BUFFERS_UPDATE:
4370 0 : ret = io_register_rsrc_update(ctx, arg, nr_args,
4371 : IORING_RSRC_BUFFER);
4372 0 : break;
4373 : case IORING_REGISTER_IOWQ_AFF:
4374 0 : ret = -EINVAL;
4375 0 : if (!arg || !nr_args)
4376 : break;
4377 0 : ret = io_register_iowq_aff(ctx, arg, nr_args);
4378 0 : break;
4379 : case IORING_UNREGISTER_IOWQ_AFF:
4380 0 : ret = -EINVAL;
4381 0 : if (arg || nr_args)
4382 : break;
4383 0 : ret = io_unregister_iowq_aff(ctx);
4384 0 : break;
4385 : case IORING_REGISTER_IOWQ_MAX_WORKERS:
4386 0 : ret = -EINVAL;
4387 0 : if (!arg || nr_args != 2)
4388 : break;
4389 0 : ret = io_register_iowq_max_workers(ctx, arg);
4390 0 : break;
4391 : case IORING_REGISTER_RING_FDS:
4392 0 : ret = io_ringfd_register(ctx, arg, nr_args);
4393 0 : break;
4394 : case IORING_UNREGISTER_RING_FDS:
4395 0 : ret = io_ringfd_unregister(ctx, arg, nr_args);
4396 0 : break;
4397 : case IORING_REGISTER_PBUF_RING:
4398 0 : ret = -EINVAL;
4399 0 : if (!arg || nr_args != 1)
4400 : break;
4401 0 : ret = io_register_pbuf_ring(ctx, arg);
4402 0 : break;
4403 : case IORING_UNREGISTER_PBUF_RING:
4404 0 : ret = -EINVAL;
4405 0 : if (!arg || nr_args != 1)
4406 : break;
4407 0 : ret = io_unregister_pbuf_ring(ctx, arg);
4408 0 : break;
4409 : case IORING_REGISTER_SYNC_CANCEL:
4410 0 : ret = -EINVAL;
4411 0 : if (!arg || nr_args != 1)
4412 : break;
4413 0 : ret = io_sync_cancel(ctx, arg);
4414 0 : break;
4415 : case IORING_REGISTER_FILE_ALLOC_RANGE:
4416 0 : ret = -EINVAL;
4417 0 : if (!arg || nr_args)
4418 : break;
4419 0 : ret = io_register_file_alloc_range(ctx, arg);
4420 0 : break;
4421 : default:
4422 : ret = -EINVAL;
4423 : break;
4424 : }
4425 :
4426 : return ret;
4427 : }
4428 :
4429 0 : SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
4430 : void __user *, arg, unsigned int, nr_args)
4431 : {
4432 : struct io_ring_ctx *ctx;
4433 0 : long ret = -EBADF;
4434 : struct fd f;
4435 : bool use_registered_ring;
4436 :
4437 0 : use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
4438 0 : opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
4439 :
4440 0 : if (opcode >= IORING_REGISTER_LAST)
4441 : return -EINVAL;
4442 :
4443 0 : if (use_registered_ring) {
4444 : /*
4445 : * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
4446 : * need only dereference our task private array to find it.
4447 : */
4448 0 : struct io_uring_task *tctx = current->io_uring;
4449 :
4450 0 : if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
4451 : return -EINVAL;
4452 0 : fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
4453 0 : f.file = tctx->registered_rings[fd];
4454 0 : f.flags = 0;
4455 0 : if (unlikely(!f.file))
4456 : return -EBADF;
4457 : } else {
4458 0 : f = fdget(fd);
4459 0 : if (unlikely(!f.file))
4460 : return -EBADF;
4461 0 : ret = -EOPNOTSUPP;
4462 0 : if (!io_is_uring_fops(f.file))
4463 : goto out_fput;
4464 : }
4465 :
4466 0 : ctx = f.file->private_data;
4467 :
4468 0 : mutex_lock(&ctx->uring_lock);
4469 0 : ret = __io_uring_register(ctx, opcode, arg, nr_args);
4470 0 : mutex_unlock(&ctx->uring_lock);
4471 0 : trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
4472 : out_fput:
4473 0 : fdput(f);
4474 : return ret;
4475 : }
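/*
 * Editorial illustration (not part of this file): when the ring fd itself
 * has been registered with IORING_REGISTER_RING_FDS, the handler above
 * accepts a registered-ring index in place of a real fd, selected by or'ing
 * IORING_REGISTER_USE_REGISTERED_RING into the opcode. The index 0 used
 * here is an assumption about where the ring was registered.
 */
#if 0	/* example only */
#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int enable_rings_via_registered_index(void)
{
	return syscall(__NR_io_uring_register, 0 /* registered index, not an fd */,
		       IORING_REGISTER_ENABLE_RINGS |
		       IORING_REGISTER_USE_REGISTERED_RING, NULL, 0);
}
#endif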
4476 :
4477 1 : static int __init io_uring_init(void)
4478 : {
4479 : #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
4480 : BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
4481 : BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
4482 : } while (0)
4483 :
4484 : #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
4485 : __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
4486 : #define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
4487 : __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)
4488 : BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
4489 : BUILD_BUG_SQE_ELEM(0, __u8, opcode);
4490 : BUILD_BUG_SQE_ELEM(1, __u8, flags);
4491 : BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
4492 : BUILD_BUG_SQE_ELEM(4, __s32, fd);
4493 : BUILD_BUG_SQE_ELEM(8, __u64, off);
4494 : BUILD_BUG_SQE_ELEM(8, __u64, addr2);
4495 : BUILD_BUG_SQE_ELEM(8, __u32, cmd_op);
4496 : BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
4497 : BUILD_BUG_SQE_ELEM(16, __u64, addr);
4498 : BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
4499 : BUILD_BUG_SQE_ELEM(24, __u32, len);
4500 : BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
4501 : BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
4502 : BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
4503 : BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
4504 : BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
4505 : BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
4506 : BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
4507 : BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
4508 : BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
4509 : BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
4510 : BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
4511 : BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
4512 : BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
4513 : BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
4514 : BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
4515 : BUILD_BUG_SQE_ELEM(28, __u32, rename_flags);
4516 : BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags);
4517 : BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags);
4518 : BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags);
4519 : BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags);
4520 : BUILD_BUG_SQE_ELEM(32, __u64, user_data);
4521 : BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
4522 : BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
4523 : BUILD_BUG_SQE_ELEM(42, __u16, personality);
4524 : BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
4525 : BUILD_BUG_SQE_ELEM(44, __u32, file_index);
4526 : BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
4527 : BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
4528 : BUILD_BUG_SQE_ELEM(48, __u64, addr3);
4529 : BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
4530 : BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
4531 :
4532 : BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
4533 : sizeof(struct io_uring_rsrc_update));
4534 : BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
4535 : sizeof(struct io_uring_rsrc_update2));
4536 :
4537 : /* ->buf_index is u16 */
4538 : BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
4539 : BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
4540 : offsetof(struct io_uring_buf_ring, tail));
4541 :
4542 : /* should fit into one byte */
4543 : BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
4544 : BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
4545 : BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
4546 :
4547 : BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
4548 :
4549 : BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
4550 :
4551 1 : io_uring_optable_init();
4552 :
4553 1 : req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
4554 : SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
4555 1 : return 0;
4556 : };
4557 : __initcall(io_uring_init);