LCOV - code coverage report
Current view: top level - io_uring - io_uring.c (source / functions)
Test: coverage.info
Date: 2023-08-24 13:40:31

                 Hit     Total   Coverage
Lines:             4      1963      0.2 %
Functions:         1       135      0.7 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Shared application/kernel submission and completion ring pairs, for
       4             :  * supporting fast/efficient IO.
       5             :  *
       6             :  * A note on the read/write ordering memory barriers that are matched between
       7             :  * the application and kernel side.
       8             :  *
       9             :  * After the application reads the CQ ring tail, it must use an
      10             :  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
      11             :  * before writing the tail (using smp_load_acquire to read the tail will
      12             :  * do). It also needs a smp_mb() before updating CQ head (ordering the
      13             :  * entry load(s) with the head store), pairing with an implicit barrier
      14             :  * through a control-dependency in io_get_cqe (smp_store_release to
      15             :  * store head will do). Failure to do so could lead to reading invalid
      16             :  * CQ entries.
      17             :  *
      18             :  * Likewise, the application must use an appropriate smp_wmb() before
      19             :  * writing the SQ tail (ordering SQ entry stores with the tail store),
      20             :  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
      21             :  * to store the tail will do). And it needs a barrier ordering the SQ
      22             :  * head load before writing new SQ entries (smp_load_acquire to read
      23             :  * head will do).
      24             :  *
      25             :  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
      26             :  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
       27             :  * updating the SQ tail; a full memory barrier (smp_mb()) is needed
       28             :  * between the two.
      29             :  *
      30             :  * Also see the examples in the liburing library:
      31             :  *
      32             :  *      git://git.kernel.dk/liburing
      33             :  *
      34             :  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
      35             :  * from data shared between the kernel and application. This is done both
       36             :  * for ordering purposes and to ensure that once a value is loaded from
      37             :  * data that the application could potentially modify, it remains stable.
      38             :  *
      39             :  * Copyright (C) 2018-2019 Jens Axboe
      40             :  * Copyright (c) 2018-2019 Christoph Hellwig
      41             :  */
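
/*
 * A minimal userspace sketch (not part of io_uring.c, hence no coverage
 * data) of the CQ reap loop and the acquire/release pairing described
 * above. It assumes cq_khead, cq_ktail, cq_ring_mask and cqes have already
 * been obtained by mmap()ing the ring fd at IORING_OFF_CQ_RING, and that
 * <linux/io_uring.h> is included for struct io_uring_cqe; the function and
 * variable names are illustrative only.
 */
static unsigned example_reap_cqes(unsigned *cq_khead, unsigned *cq_ktail,
				  unsigned cq_ring_mask,
				  struct io_uring_cqe *cqes)
{
	/* acquire-load of the tail pairs with the kernel's release-store */
	unsigned tail = __atomic_load_n(cq_ktail, __ATOMIC_ACQUIRE);
	unsigned head = *cq_khead;	/* only this thread updates the head */
	unsigned seen = 0;

	while (head != tail) {
		struct io_uring_cqe *cqe = &cqes[head & cq_ring_mask];

		/* consume cqe->user_data / cqe->res here */
		(void)cqe;
		head++;
		seen++;
	}
	/* release-store of head orders the CQE loads before the head update */
	__atomic_store_n(cq_khead, head, __ATOMIC_RELEASE);
	return seen;
}

/*
 * For IORING_SETUP_SQPOLL, the analogous sketch would publish the new SQ
 * tail with a release-store, issue a full barrier such as
 * __atomic_thread_fence(__ATOMIC_SEQ_CST), and only then read the SQ flags
 * to decide whether io_uring_enter() with IORING_ENTER_SQ_WAKEUP is needed.
 */
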
      42             : #include <linux/kernel.h>
      43             : #include <linux/init.h>
      44             : #include <linux/errno.h>
      45             : #include <linux/syscalls.h>
      46             : #include <net/compat.h>
      47             : #include <linux/refcount.h>
      48             : #include <linux/uio.h>
      49             : #include <linux/bits.h>
      50             : 
      51             : #include <linux/sched/signal.h>
      52             : #include <linux/fs.h>
      53             : #include <linux/file.h>
      54             : #include <linux/fdtable.h>
      55             : #include <linux/mm.h>
      56             : #include <linux/mman.h>
      57             : #include <linux/percpu.h>
      58             : #include <linux/slab.h>
      59             : #include <linux/bvec.h>
      60             : #include <linux/net.h>
      61             : #include <net/sock.h>
      62             : #include <net/af_unix.h>
      63             : #include <net/scm.h>
      64             : #include <linux/anon_inodes.h>
      65             : #include <linux/sched/mm.h>
      66             : #include <linux/uaccess.h>
      67             : #include <linux/nospec.h>
      68             : #include <linux/highmem.h>
      69             : #include <linux/fsnotify.h>
      70             : #include <linux/fadvise.h>
      71             : #include <linux/task_work.h>
      72             : #include <linux/io_uring.h>
      73             : #include <linux/audit.h>
      74             : #include <linux/security.h>
      75             : #include <asm/shmparam.h>
      76             : 
      77             : #define CREATE_TRACE_POINTS
      78             : #include <trace/events/io_uring.h>
      79             : 
      80             : #include <uapi/linux/io_uring.h>
      81             : 
      82             : #include "io-wq.h"
      83             : 
      84             : #include "io_uring.h"
      85             : #include "opdef.h"
      86             : #include "refs.h"
      87             : #include "tctx.h"
      88             : #include "sqpoll.h"
      89             : #include "fdinfo.h"
      90             : #include "kbuf.h"
      91             : #include "rsrc.h"
      92             : #include "cancel.h"
      93             : #include "net.h"
      94             : #include "notif.h"
      95             : 
      96             : #include "timeout.h"
      97             : #include "poll.h"
      98             : #include "rw.h"
      99             : #include "alloc_cache.h"
     100             : 
     101             : #define IORING_MAX_ENTRIES      32768
     102             : #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
     103             : 
     104             : #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
     105             :                                  IORING_REGISTER_LAST + IORING_OP_LAST)
     106             : 
     107             : #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
     108             :                           IOSQE_IO_HARDLINK | IOSQE_ASYNC)
     109             : 
     110             : #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
     111             :                         IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
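
/*
 * A minimal liburing sketch (not part of io_uring.c) of how these limits
 * and flags surface to applications: the requested ring size is capped at
 * IORING_MAX_ENTRIES, and per-SQE flags must stay within SQE_VALID_FLAGS,
 * e.g. IOSQE_IO_LINK to chain two submissions. Error handling is omitted
 * and the function name is illustrative.
 */
#include <liburing.h>

static void example_linked_nops(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;

	io_uring_queue_init(256, &ring, 0);	/* 256 is well below IORING_MAX_ENTRIES */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);	/* link to the next SQE */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);		/* started only after the linked NOP completes */

	io_uring_submit(&ring);
	io_uring_queue_exit(&ring);
}
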
     112             : 
     113             : #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
     114             :                                 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
     115             :                                 REQ_F_ASYNC_DATA)
     116             : 
     117             : #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
     118             :                                  IO_REQ_CLEAN_FLAGS)
     119             : 
     120             : #define IO_TCTX_REFS_CACHE_NR   (1U << 10)
     121             : 
     122             : #define IO_COMPL_BATCH                  32
     123             : #define IO_REQ_ALLOC_BATCH              8
     124             : 
     125             : enum {
     126             :         IO_CHECK_CQ_OVERFLOW_BIT,
     127             :         IO_CHECK_CQ_DROPPED_BIT,
     128             : };
     129             : 
     130             : enum {
     131             :         IO_EVENTFD_OP_SIGNAL_BIT,
     132             :         IO_EVENTFD_OP_FREE_BIT,
     133             : };
     134             : 
     135             : struct io_defer_entry {
     136             :         struct list_head        list;
     137             :         struct io_kiocb         *req;
     138             :         u32                     seq;
     139             : };
     140             : 
     141             : /* requests with any of those set should undergo io_disarm_next() */
     142             : #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
     143             : #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
     144             : 
     145             : static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
     146             :                                          struct task_struct *task,
     147             :                                          bool cancel_all);
     148             : 
     149             : static void io_queue_sqe(struct io_kiocb *req);
     150             : static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
     151             : static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
     152             : 
     153             : struct kmem_cache *req_cachep;
     154             : 
     155           0 : struct sock *io_uring_get_socket(struct file *file)
     156             : {
     157             : #if defined(CONFIG_UNIX)
     158             :         if (io_is_uring_fops(file)) {
     159             :                 struct io_ring_ctx *ctx = file->private_data;
     160             : 
     161             :                 return ctx->ring_sock->sk;
     162             :         }
     163             : #endif
     164           0 :         return NULL;
     165             : }
     166             : EXPORT_SYMBOL(io_uring_get_socket);
     167             : 
     168             : static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
     169             : {
     170           0 :         if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
     171           0 :             ctx->submit_state.cqes_count)
     172           0 :                 __io_submit_flush_completions(ctx);
     173             : }
     174             : 
     175             : static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
     176             : {
     177           0 :         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
     178             : }
     179             : 
     180             : static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
     181             : {
     182           0 :         return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
     183             : }
     184             : 
     185             : static bool io_match_linked(struct io_kiocb *head)
     186             : {
     187             :         struct io_kiocb *req;
     188             : 
     189           0 :         io_for_each_link(req, head) {
     190           0 :                 if (req->flags & REQ_F_INFLIGHT)
     191             :                         return true;
     192             :         }
     193             :         return false;
     194             : }
     195             : 
     196             : /*
     197             :  * As io_match_task() but protected against racing with linked timeouts.
     198             :  * User must not hold timeout_lock.
     199             :  */
     200           0 : bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
     201             :                         bool cancel_all)
     202             : {
     203             :         bool matched;
     204             : 
     205           0 :         if (task && head->task != task)
     206             :                 return false;
     207           0 :         if (cancel_all)
     208             :                 return true;
     209             : 
     210           0 :         if (head->flags & REQ_F_LINK_TIMEOUT) {
     211           0 :                 struct io_ring_ctx *ctx = head->ctx;
     212             : 
     213             :                 /* protect against races with linked timeouts */
     214           0 :                 spin_lock_irq(&ctx->timeout_lock);
     215           0 :                 matched = io_match_linked(head);
     216           0 :                 spin_unlock_irq(&ctx->timeout_lock);
     217             :         } else {
     218             :                 matched = io_match_linked(head);
     219             :         }
     220             :         return matched;
     221             : }
     222             : 
     223             : static inline void req_fail_link_node(struct io_kiocb *req, int res)
     224             : {
     225           0 :         req_set_fail(req);
     226           0 :         io_req_set_res(req, res, 0);
     227             : }
     228             : 
     229             : static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
     230             : {
     231           0 :         wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
     232           0 :         kasan_poison_object_data(req_cachep, req);
     233             : }
     234             : 
     235           0 : static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
     236             : {
     237           0 :         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
     238             : 
     239           0 :         complete(&ctx->ref_comp);
     240           0 : }
     241             : 
     242           0 : static __cold void io_fallback_req_func(struct work_struct *work)
     243             : {
     244           0 :         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
     245             :                                                 fallback_work.work);
     246           0 :         struct llist_node *node = llist_del_all(&ctx->fallback_llist);
     247             :         struct io_kiocb *req, *tmp;
     248           0 :         struct io_tw_state ts = { .locked = true, };
     249             : 
     250           0 :         mutex_lock(&ctx->uring_lock);
     251           0 :         llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
     252           0 :                 req->io_task_work.func(req, &ts);
     253           0 :         if (WARN_ON_ONCE(!ts.locked))
     254           0 :                 return;
     255           0 :         io_submit_flush_completions(ctx);
     256           0 :         mutex_unlock(&ctx->uring_lock);
     257             : }
     258             : 
     259           0 : static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
     260             : {
     261           0 :         unsigned hash_buckets = 1U << bits;
     262           0 :         size_t hash_size = hash_buckets * sizeof(table->hbs[0]);
     263             : 
     264           0 :         table->hbs = kmalloc(hash_size, GFP_KERNEL);
     265           0 :         if (!table->hbs)
     266             :                 return -ENOMEM;
     267             : 
     268           0 :         table->hash_bits = bits;
     269           0 :         init_hash_table(table, hash_buckets);
     270           0 :         return 0;
     271             : }
     272             : 
     273           0 : static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
     274             : {
     275             :         struct io_ring_ctx *ctx;
     276             :         int hash_bits;
     277             : 
     278           0 :         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
     279           0 :         if (!ctx)
     280             :                 return NULL;
     281             : 
     282           0 :         xa_init(&ctx->io_bl_xa);
     283             : 
     284             :         /*
      285             :          * Use 5 bits less than the max cq entries; that should give us around
      286             :          * 32 entries per hash list if totally full and uniformly spread, but
      287             :          * don't keep too many buckets, so as not to overconsume memory.
     288             :          */
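        /*
         * Worked example (illustrative numbers, not from a real run): with
         * p->cq_entries == 4096, ilog2(4096) == 12, so hash_bits becomes
         * clamp(12 - 5, 1, 8) == 7, giving 1U << 7 == 128 buckets and about
         * 4096 / 128 == 32 entries per bucket when the table is full.
         */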
     289           0 :         hash_bits = ilog2(p->cq_entries) - 5;
     290           0 :         hash_bits = clamp(hash_bits, 1, 8);
     291           0 :         if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
     292             :                 goto err;
     293           0 :         if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
     294             :                 goto err;
     295             : 
     296           0 :         ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
     297           0 :         if (!ctx->dummy_ubuf)
     298             :                 goto err;
     299             :         /* set invalid range, so io_import_fixed() fails meeting it */
     300           0 :         ctx->dummy_ubuf->ubuf = -1UL;
     301             : 
     302           0 :         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
     303             :                             0, GFP_KERNEL))
     304             :                 goto err;
     305             : 
     306           0 :         ctx->flags = p->flags;
     307           0 :         init_waitqueue_head(&ctx->sqo_sq_wait);
     308           0 :         INIT_LIST_HEAD(&ctx->sqd_list);
     309           0 :         INIT_LIST_HEAD(&ctx->cq_overflow_list);
     310           0 :         INIT_LIST_HEAD(&ctx->io_buffers_cache);
     311           0 :         io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
     312             :                             sizeof(struct io_rsrc_node));
     313           0 :         io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
     314             :                             sizeof(struct async_poll));
     315           0 :         io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
     316             :                             sizeof(struct io_async_msghdr));
     317           0 :         init_completion(&ctx->ref_comp);
     318           0 :         xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
     319           0 :         mutex_init(&ctx->uring_lock);
     320           0 :         init_waitqueue_head(&ctx->cq_wait);
     321           0 :         init_waitqueue_head(&ctx->poll_wq);
     322           0 :         init_waitqueue_head(&ctx->rsrc_quiesce_wq);
     323           0 :         spin_lock_init(&ctx->completion_lock);
     324           0 :         spin_lock_init(&ctx->timeout_lock);
     325           0 :         INIT_WQ_LIST(&ctx->iopoll_list);
     326           0 :         INIT_LIST_HEAD(&ctx->io_buffers_pages);
     327           0 :         INIT_LIST_HEAD(&ctx->io_buffers_comp);
     328           0 :         INIT_LIST_HEAD(&ctx->defer_list);
     329           0 :         INIT_LIST_HEAD(&ctx->timeout_list);
     330           0 :         INIT_LIST_HEAD(&ctx->ltimeout_list);
     331           0 :         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
     332           0 :         init_llist_head(&ctx->work_llist);
     333           0 :         INIT_LIST_HEAD(&ctx->tctx_list);
     334           0 :         ctx->submit_state.free_list.next = NULL;
     335           0 :         INIT_WQ_LIST(&ctx->locked_free_list);
     336           0 :         INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
     337           0 :         INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
     338             :         return ctx;
     339             : err:
     340           0 :         kfree(ctx->dummy_ubuf);
     341           0 :         kfree(ctx->cancel_table.hbs);
     342           0 :         kfree(ctx->cancel_table_locked.hbs);
     343           0 :         kfree(ctx->io_bl);
     344           0 :         xa_destroy(&ctx->io_bl_xa);
     345           0 :         kfree(ctx);
     346             :         return NULL;
     347             : }
     348             : 
     349             : static void io_account_cq_overflow(struct io_ring_ctx *ctx)
     350             : {
     351           0 :         struct io_rings *r = ctx->rings;
     352             : 
     353           0 :         WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
     354           0 :         ctx->cq_extra--;
     355             : }
     356             : 
     357             : static bool req_need_defer(struct io_kiocb *req, u32 seq)
     358             : {
     359           0 :         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
     360           0 :                 struct io_ring_ctx *ctx = req->ctx;
     361             : 
     362           0 :                 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
     363             :         }
     364             : 
     365             :         return false;
     366             : }
     367             : 
     368           0 : static void io_clean_op(struct io_kiocb *req)
     369             : {
     370           0 :         if (req->flags & REQ_F_BUFFER_SELECTED) {
     371           0 :                 spin_lock(&req->ctx->completion_lock);
     372           0 :                 io_put_kbuf_comp(req);
     373           0 :                 spin_unlock(&req->ctx->completion_lock);
     374             :         }
     375             : 
     376           0 :         if (req->flags & REQ_F_NEED_CLEANUP) {
     377           0 :                 const struct io_cold_def *def = &io_cold_defs[req->opcode];
     378             : 
     379           0 :                 if (def->cleanup)
     380           0 :                         def->cleanup(req);
     381             :         }
     382           0 :         if ((req->flags & REQ_F_POLLED) && req->apoll) {
     383           0 :                 kfree(req->apoll->double_poll);
     384           0 :                 kfree(req->apoll);
     385           0 :                 req->apoll = NULL;
     386             :         }
     387           0 :         if (req->flags & REQ_F_INFLIGHT) {
     388           0 :                 struct io_uring_task *tctx = req->task->io_uring;
     389             : 
     390           0 :                 atomic_dec(&tctx->inflight_tracked);
     391             :         }
     392           0 :         if (req->flags & REQ_F_CREDS)
     393           0 :                 put_cred(req->creds);
     394           0 :         if (req->flags & REQ_F_ASYNC_DATA) {
     395           0 :                 kfree(req->async_data);
     396           0 :                 req->async_data = NULL;
     397             :         }
     398           0 :         req->flags &= ~IO_REQ_CLEAN_FLAGS;
     399           0 : }
     400             : 
     401             : static inline void io_req_track_inflight(struct io_kiocb *req)
     402             : {
     403           0 :         if (!(req->flags & REQ_F_INFLIGHT)) {
     404           0 :                 req->flags |= REQ_F_INFLIGHT;
     405           0 :                 atomic_inc(&req->task->io_uring->inflight_tracked);
     406             :         }
     407             : }
     408             : 
     409           0 : static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
     410             : {
     411           0 :         if (WARN_ON_ONCE(!req->link))
     412             :                 return NULL;
     413             : 
     414           0 :         req->flags &= ~REQ_F_ARM_LTIMEOUT;
     415           0 :         req->flags |= REQ_F_LINK_TIMEOUT;
     416             : 
     417             :         /* linked timeouts should have two refs once prep'ed */
     418           0 :         io_req_set_refcount(req);
     419           0 :         __io_req_set_refcount(req->link, 2);
     420           0 :         return req->link;
     421             : }
     422             : 
     423             : static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
     424             : {
     425           0 :         if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
     426             :                 return NULL;
     427           0 :         return __io_prep_linked_timeout(req);
     428             : }
     429             : 
     430           0 : static noinline void __io_arm_ltimeout(struct io_kiocb *req)
     431             : {
     432           0 :         io_queue_linked_timeout(__io_prep_linked_timeout(req));
     433           0 : }
     434             : 
     435             : static inline void io_arm_ltimeout(struct io_kiocb *req)
     436             : {
     437           0 :         if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
     438           0 :                 __io_arm_ltimeout(req);
     439             : }
     440             : 
     441           0 : static void io_prep_async_work(struct io_kiocb *req)
     442             : {
     443           0 :         const struct io_issue_def *def = &io_issue_defs[req->opcode];
     444           0 :         struct io_ring_ctx *ctx = req->ctx;
     445             : 
     446           0 :         if (!(req->flags & REQ_F_CREDS)) {
     447           0 :                 req->flags |= REQ_F_CREDS;
     448           0 :                 req->creds = get_current_cred();
     449             :         }
     450             : 
     451           0 :         req->work.list.next = NULL;
     452           0 :         req->work.flags = 0;
     453           0 :         req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
     454           0 :         if (req->flags & REQ_F_FORCE_ASYNC)
     455           0 :                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
     456             : 
     457           0 :         if (req->file && !(req->flags & REQ_F_FIXED_FILE))
     458           0 :                 req->flags |= io_file_get_flags(req->file);
     459             : 
     460           0 :         if (req->file && (req->flags & REQ_F_ISREG)) {
     461           0 :                 bool should_hash = def->hash_reg_file;
     462             : 
     463             :                 /* don't serialize this request if the fs doesn't need it */
     464           0 :                 if (should_hash && (req->file->f_flags & O_DIRECT) &&
     465           0 :                     (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
     466           0 :                         should_hash = false;
     467           0 :                 if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
     468           0 :                         io_wq_hash_work(&req->work, file_inode(req->file));
     469           0 :         } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
     470           0 :                 if (def->unbound_nonreg_file)
     471           0 :                         req->work.flags |= IO_WQ_WORK_UNBOUND;
     472             :         }
     473           0 : }
     474             : 
     475           0 : static void io_prep_async_link(struct io_kiocb *req)
     476             : {
     477             :         struct io_kiocb *cur;
     478             : 
     479           0 :         if (req->flags & REQ_F_LINK_TIMEOUT) {
     480           0 :                 struct io_ring_ctx *ctx = req->ctx;
     481             : 
     482           0 :                 spin_lock_irq(&ctx->timeout_lock);
     483           0 :                 io_for_each_link(cur, req)
     484           0 :                         io_prep_async_work(cur);
     485           0 :                 spin_unlock_irq(&ctx->timeout_lock);
     486             :         } else {
     487           0 :                 io_for_each_link(cur, req)
     488           0 :                         io_prep_async_work(cur);
     489             :         }
     490           0 : }
     491             : 
     492           0 : void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use)
     493             : {
     494           0 :         struct io_kiocb *link = io_prep_linked_timeout(req);
     495           0 :         struct io_uring_task *tctx = req->task->io_uring;
     496             : 
     497           0 :         BUG_ON(!tctx);
     498           0 :         BUG_ON(!tctx->io_wq);
     499             : 
     500             :         /* init ->work of the whole link before punting */
     501           0 :         io_prep_async_link(req);
     502             : 
     503             :         /*
     504             :          * Not expected to happen, but if we do have a bug where this _can_
     505             :          * happen, catch it here and ensure the request is marked as
     506             :          * canceled. That will make io-wq go through the usual work cancel
     507             :          * procedure rather than attempt to run this request (or create a new
     508             :          * worker for it).
     509             :          */
     510           0 :         if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
     511           0 :                 req->work.flags |= IO_WQ_WORK_CANCEL;
     512             : 
     513           0 :         trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
     514           0 :         io_wq_enqueue(tctx->io_wq, &req->work);
     515           0 :         if (link)
     516           0 :                 io_queue_linked_timeout(link);
     517           0 : }
     518             : 
     519           0 : static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
     520             : {
     521           0 :         while (!list_empty(&ctx->defer_list)) {
     522           0 :                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
     523             :                                                 struct io_defer_entry, list);
     524             : 
     525           0 :                 if (req_need_defer(de->req, de->seq))
     526             :                         break;
     527           0 :                 list_del_init(&de->list);
     528           0 :                 io_req_task_queue(de->req);
     529           0 :                 kfree(de);
     530             :         }
     531           0 : }
     532             : 
     533             : 
     534           0 : static void io_eventfd_ops(struct rcu_head *rcu)
     535             : {
     536           0 :         struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
     537           0 :         int ops = atomic_xchg(&ev_fd->ops, 0);
     538             : 
     539           0 :         if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
     540           0 :                 eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
     541             : 
     542             :         /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
      543             :          * ordering in a race, but if references are 0 we know we have to free
     544             :          * it regardless.
     545             :          */
     546           0 :         if (atomic_dec_and_test(&ev_fd->refs)) {
     547           0 :                 eventfd_ctx_put(ev_fd->cq_ev_fd);
     548           0 :                 kfree(ev_fd);
     549             :         }
     550           0 : }
     551             : 
     552           0 : static void io_eventfd_signal(struct io_ring_ctx *ctx)
     553             : {
     554           0 :         struct io_ev_fd *ev_fd = NULL;
     555             : 
     556             :         rcu_read_lock();
     557             :         /*
      558             :          * rcu_dereference ctx->io_ev_fd once and use it both for checking
     559             :          * and eventfd_signal
     560             :          */
     561           0 :         ev_fd = rcu_dereference(ctx->io_ev_fd);
     562             : 
     563             :         /*
      564             :          * Check again if ev_fd exists in case an io_eventfd_unregister call
     565             :          * completed between the NULL check of ctx->io_ev_fd at the start of
     566             :          * the function and rcu_read_lock.
     567             :          */
     568           0 :         if (unlikely(!ev_fd))
     569             :                 goto out;
     570           0 :         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
     571             :                 goto out;
     572           0 :         if (ev_fd->eventfd_async && !io_wq_current_is_worker())
     573             :                 goto out;
     574             : 
     575           0 :         if (likely(eventfd_signal_allowed())) {
     576           0 :                 eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
     577             :         } else {
     578           0 :                 atomic_inc(&ev_fd->refs);
     579           0 :                 if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
     580           0 :                         call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
     581             :                 else
     582           0 :                         atomic_dec(&ev_fd->refs);
     583             :         }
     584             : 
     585             : out:
     586             :         rcu_read_unlock();
     587           0 : }
     588             : 
     589             : static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
     590             : {
     591             :         bool skip;
     592             : 
     593           0 :         spin_lock(&ctx->completion_lock);
     594             : 
     595             :         /*
     596             :          * Eventfd should only get triggered when at least one event has been
     597             :          * posted. Some applications rely on the eventfd notification count
     598             :          * only changing IFF a new CQE has been added to the CQ ring. There's
      599             :          * no dependency on a 1:1 relationship between how many times this
     600             :          * function is called (and hence the eventfd count) and number of CQEs
     601             :          * posted to the CQ ring.
     602             :          */
     603           0 :         skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
     604           0 :         ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
     605           0 :         spin_unlock(&ctx->completion_lock);
     606           0 :         if (skip)
     607             :                 return;
     608             : 
     609           0 :         io_eventfd_signal(ctx);
     610             : }
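
/*
 * A minimal liburing sketch (not part of io_uring.c) of the consumer side
 * of this signalling: register an eventfd with the ring and block on
 * read(2) of that fd until at least one new CQE has been posted. Error
 * handling is omitted and the function name is illustrative.
 */
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <liburing.h>

static void example_wait_via_eventfd(struct io_uring *ring)
{
	uint64_t count;
	int efd = eventfd(0, EFD_CLOEXEC);

	io_uring_register_eventfd(ring, efd);

	/* blocks until the kernel signals that new CQEs were posted */
	if (read(efd, &count, sizeof(count)) < 0)
		return;

	/* now reap CQEs, e.g. with io_uring_peek_cqe()/io_uring_cqe_seen() */
}
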
     611             : 
     612           0 : void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
     613             : {
     614           0 :         if (ctx->poll_activated)
     615             :                 io_poll_wq_wake(ctx);
     616           0 :         if (ctx->off_timeout_used)
     617           0 :                 io_flush_timeouts(ctx);
     618           0 :         if (ctx->drain_active) {
     619           0 :                 spin_lock(&ctx->completion_lock);
     620           0 :                 io_queue_deferred(ctx);
     621           0 :                 spin_unlock(&ctx->completion_lock);
     622             :         }
     623           0 :         if (ctx->has_evfd)
     624             :                 io_eventfd_flush_signal(ctx);
     625           0 : }
     626             : 
     627             : static inline void __io_cq_lock(struct io_ring_ctx *ctx)
     628             : {
     629           0 :         if (!ctx->task_complete)
     630           0 :                 spin_lock(&ctx->completion_lock);
     631             : }
     632             : 
     633             : static inline void io_cq_lock(struct io_ring_ctx *ctx)
     634             :         __acquires(ctx->completion_lock)
     635             : {
     636           0 :         spin_lock(&ctx->completion_lock);
     637             : }
     638             : 
     639           0 : static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
     640             : {
     641           0 :         io_commit_cqring(ctx);
     642             : 
     643           0 :         if (ctx->task_complete) {
     644             :                 /*
     645             :                  * ->task_complete implies that only current might be waiting
      646             :                  * for CQEs, and obviously, we currently aren't. No one is
      647             :                  * waiting, so wakeups are futile; skip them.
     648             :                  */
     649             :                 io_commit_cqring_flush(ctx);
     650             :         } else {
     651           0 :                 spin_unlock(&ctx->completion_lock);
     652           0 :                 io_commit_cqring_flush(ctx);
     653             :                 io_cqring_wake(ctx);
     654             :         }
     655           0 : }
     656             : 
     657           0 : static void io_cq_unlock_post(struct io_ring_ctx *ctx)
     658             :         __releases(ctx->completion_lock)
     659             : {
     660           0 :         io_commit_cqring(ctx);
     661           0 :         spin_unlock(&ctx->completion_lock);
     662           0 :         io_commit_cqring_flush(ctx);
     663           0 :         io_cqring_wake(ctx);
     664           0 : }
     665             : 
      666             : /* Drop all backlogged overflow CQEs; they are discarded, not posted */
     667           0 : static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
     668             : {
     669             :         struct io_overflow_cqe *ocqe;
     670           0 :         LIST_HEAD(list);
     671             : 
     672           0 :         spin_lock(&ctx->completion_lock);
     673           0 :         list_splice_init(&ctx->cq_overflow_list, &list);
     674           0 :         clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
     675           0 :         spin_unlock(&ctx->completion_lock);
     676             : 
     677           0 :         while (!list_empty(&list)) {
     678           0 :                 ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
     679           0 :                 list_del(&ocqe->list);
     680           0 :                 kfree(ocqe);
     681             :         }
     682           0 : }
     683             : 
     684           0 : static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
     685             : {
     686           0 :         size_t cqe_size = sizeof(struct io_uring_cqe);
     687             : 
     688           0 :         if (__io_cqring_events(ctx) == ctx->cq_entries)
     689             :                 return;
     690             : 
     691           0 :         if (ctx->flags & IORING_SETUP_CQE32)
     692           0 :                 cqe_size <<= 1;
     693             : 
     694           0 :         io_cq_lock(ctx);
     695           0 :         while (!list_empty(&ctx->cq_overflow_list)) {
     696           0 :                 struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
     697             :                 struct io_overflow_cqe *ocqe;
     698             : 
     699           0 :                 if (!cqe)
     700             :                         break;
     701           0 :                 ocqe = list_first_entry(&ctx->cq_overflow_list,
     702             :                                         struct io_overflow_cqe, list);
     703           0 :                 memcpy(cqe, &ocqe->cqe, cqe_size);
     704           0 :                 list_del(&ocqe->list);
     705           0 :                 kfree(ocqe);
     706             :         }
     707             : 
     708           0 :         if (list_empty(&ctx->cq_overflow_list)) {
     709           0 :                 clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
     710           0 :                 atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
     711             :         }
     712           0 :         io_cq_unlock_post(ctx);
     713             : }
     714             : 
     715           0 : static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
     716             : {
     717             :         /* iopoll syncs against uring_lock, not completion_lock */
     718           0 :         if (ctx->flags & IORING_SETUP_IOPOLL)
     719           0 :                 mutex_lock(&ctx->uring_lock);
     720           0 :         __io_cqring_overflow_flush(ctx);
     721           0 :         if (ctx->flags & IORING_SETUP_IOPOLL)
     722           0 :                 mutex_unlock(&ctx->uring_lock);
     723           0 : }
     724             : 
     725           0 : static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
     726             : {
     727           0 :         if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
     728           0 :                 io_cqring_do_overflow_flush(ctx);
     729           0 : }
     730             : 
     731             : /* can be called by any task */
     732           0 : static void io_put_task_remote(struct task_struct *task)
     733             : {
     734           0 :         struct io_uring_task *tctx = task->io_uring;
     735             : 
     736           0 :         percpu_counter_sub(&tctx->inflight, 1);
     737           0 :         if (unlikely(atomic_read(&tctx->in_cancel)))
     738           0 :                 wake_up(&tctx->wait);
     739           0 :         put_task_struct(task);
     740           0 : }
     741             : 
     742             : /* used by a task to put its own references */
     743             : static void io_put_task_local(struct task_struct *task)
     744             : {
     745           0 :         task->io_uring->cached_refs++;
     746             : }
     747             : 
      748             : /* must be called shortly after putting a request */
     749           0 : static inline void io_put_task(struct task_struct *task)
     750             : {
     751           0 :         if (likely(task == current))
     752           0 :                 io_put_task_local(task);
     753             :         else
     754           0 :                 io_put_task_remote(task);
     755           0 : }
     756             : 
     757           0 : void io_task_refs_refill(struct io_uring_task *tctx)
     758             : {
     759           0 :         unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
     760             : 
     761           0 :         percpu_counter_add(&tctx->inflight, refill);
     762           0 :         refcount_add(refill, &current->usage);
     763           0 :         tctx->cached_refs += refill;
     764           0 : }
     765             : 
     766           0 : static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
     767             : {
     768           0 :         struct io_uring_task *tctx = task->io_uring;
     769           0 :         unsigned int refs = tctx->cached_refs;
     770             : 
     771           0 :         if (refs) {
     772           0 :                 tctx->cached_refs = 0;
     773           0 :                 percpu_counter_sub(&tctx->inflight, refs);
     774           0 :                 put_task_struct_many(task, refs);
     775             :         }
     776           0 : }
     777             : 
     778           0 : static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
     779             :                                      s32 res, u32 cflags, u64 extra1, u64 extra2)
     780             : {
     781             :         struct io_overflow_cqe *ocqe;
     782           0 :         size_t ocq_size = sizeof(struct io_overflow_cqe);
     783           0 :         bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
     784             : 
     785             :         lockdep_assert_held(&ctx->completion_lock);
     786             : 
     787           0 :         if (is_cqe32)
     788           0 :                 ocq_size += sizeof(struct io_uring_cqe);
     789             : 
     790           0 :         ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
     791           0 :         trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
     792           0 :         if (!ocqe) {
     793             :                 /*
     794             :                  * If we're in ring overflow flush mode, or in task cancel mode,
     795             :                  * or cannot allocate an overflow entry, then we need to drop it
     796             :                  * on the floor.
     797             :                  */
     798           0 :                 io_account_cq_overflow(ctx);
     799           0 :                 set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
     800           0 :                 return false;
     801             :         }
     802           0 :         if (list_empty(&ctx->cq_overflow_list)) {
     803           0 :                 set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
     804           0 :                 atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
     805             : 
     806             :         }
     807           0 :         ocqe->cqe.user_data = user_data;
     808           0 :         ocqe->cqe.res = res;
     809           0 :         ocqe->cqe.flags = cflags;
     810           0 :         if (is_cqe32) {
     811           0 :                 ocqe->cqe.big_cqe[0] = extra1;
     812           0 :                 ocqe->cqe.big_cqe[1] = extra2;
     813             :         }
     814           0 :         list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
     815           0 :         return true;
     816             : }
     817             : 
     818           0 : bool io_req_cqe_overflow(struct io_kiocb *req)
     819             : {
     820           0 :         if (!(req->flags & REQ_F_CQE32_INIT)) {
     821           0 :                 req->extra1 = 0;
     822           0 :                 req->extra2 = 0;
     823             :         }
     824           0 :         return io_cqring_event_overflow(req->ctx, req->cqe.user_data,
     825             :                                         req->cqe.res, req->cqe.flags,
     826             :                                         req->extra1, req->extra2);
     827             : }
     828             : 
     829             : /*
     830             :  * writes to the cq entry need to come after reading head; the
     831             :  * control dependency is enough as we're using WRITE_ONCE to
     832             :  * fill the cq entry
     833             :  */
     834           0 : struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
     835             : {
     836           0 :         struct io_rings *rings = ctx->rings;
     837           0 :         unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
     838             :         unsigned int free, queued, len;
     839             : 
     840             :         /*
     841             :          * Posting into the CQ when there are pending overflowed CQEs may break
     842             :          * ordering guarantees, which will affect links, F_MORE users and more.
     843             :          * Force overflow the completion.
     844             :          */
     845           0 :         if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
     846             :                 return NULL;
     847             : 
      848             :         /* userspace may cheat by modifying the tail, be safe and do min */
     849           0 :         queued = min(__io_cqring_events(ctx), ctx->cq_entries);
     850           0 :         free = ctx->cq_entries - queued;
     851             :         /* we need a contiguous range, limit based on the current array offset */
     852           0 :         len = min(free, ctx->cq_entries - off);
     853           0 :         if (!len)
     854             :                 return NULL;
     855             : 
     856           0 :         if (ctx->flags & IORING_SETUP_CQE32) {
     857           0 :                 off <<= 1;
     858           0 :                 len <<= 1;
     859             :         }
     860             : 
     861           0 :         ctx->cqe_cached = &rings->cqes[off];
     862           0 :         ctx->cqe_sentinel = ctx->cqe_cached + len;
     863             : 
     864           0 :         ctx->cached_cq_tail++;
     865           0 :         ctx->cqe_cached++;
     866           0 :         if (ctx->flags & IORING_SETUP_CQE32)
     867           0 :                 ctx->cqe_cached++;
     868             :         return &rings->cqes[off];
     869             : }
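
/*
 * Worked example for the indexing above (illustrative numbers): with
 * cq_entries == 8 and cached_cq_tail == 13, off == (13 & 7) == 5, so the
 * next CQE is written at cqes[5]. Under IORING_SETUP_CQE32 both off and
 * len are doubled, since each 32-byte CQE occupies two slots of cqes[].
 */
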
     870             : 
     871           0 : static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
     872             :                               u32 cflags)
     873             : {
     874             :         struct io_uring_cqe *cqe;
     875             : 
     876           0 :         ctx->cq_extra++;
     877             : 
     878             :         /*
     879             :          * If we can't get a cq entry, userspace overflowed the
     880             :          * submission (by quite a lot). Increment the overflow count in
     881             :          * the ring.
     882             :          */
     883           0 :         cqe = io_get_cqe(ctx);
     884           0 :         if (likely(cqe)) {
     885           0 :                 trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
     886             : 
     887           0 :                 WRITE_ONCE(cqe->user_data, user_data);
     888           0 :                 WRITE_ONCE(cqe->res, res);
     889           0 :                 WRITE_ONCE(cqe->flags, cflags);
     890             : 
     891           0 :                 if (ctx->flags & IORING_SETUP_CQE32) {
     892           0 :                         WRITE_ONCE(cqe->big_cqe[0], 0);
     893           0 :                         WRITE_ONCE(cqe->big_cqe[1], 0);
     894             :                 }
     895             :                 return true;
     896             :         }
     897             :         return false;
     898             : }
     899             : 
     900           0 : static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
     901             :         __must_hold(&ctx->uring_lock)
     902             : {
     903           0 :         struct io_submit_state *state = &ctx->submit_state;
     904             :         unsigned int i;
     905             : 
     906             :         lockdep_assert_held(&ctx->uring_lock);
     907           0 :         for (i = 0; i < state->cqes_count; i++) {
     908           0 :                 struct io_uring_cqe *cqe = &state->cqes[i];
     909             : 
     910           0 :                 if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
     911           0 :                         if (ctx->task_complete) {
     912           0 :                                 spin_lock(&ctx->completion_lock);
     913           0 :                                 io_cqring_event_overflow(ctx, cqe->user_data,
     914             :                                                         cqe->res, cqe->flags, 0, 0);
     915           0 :                                 spin_unlock(&ctx->completion_lock);
     916             :                         } else {
     917           0 :                                 io_cqring_event_overflow(ctx, cqe->user_data,
     918             :                                                         cqe->res, cqe->flags, 0, 0);
     919             :                         }
     920             :                 }
     921             :         }
     922           0 :         state->cqes_count = 0;
     923           0 : }
     924             : 
     925           0 : static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
     926             :                               bool allow_overflow)
     927             : {
     928             :         bool filled;
     929             : 
     930           0 :         io_cq_lock(ctx);
     931           0 :         filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
     932           0 :         if (!filled && allow_overflow)
     933           0 :                 filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
     934             : 
     935           0 :         io_cq_unlock_post(ctx);
     936           0 :         return filled;
     937             : }
     938             : 
     939           0 : bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
     940             : {
     941           0 :         return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
     942             : }
     943             : 
     944           0 : bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags,
     945             :                 bool allow_overflow)
     946             : {
     947           0 :         struct io_ring_ctx *ctx = req->ctx;
     948           0 :         u64 user_data = req->cqe.user_data;
     949             :         struct io_uring_cqe *cqe;
     950             : 
     951           0 :         if (!defer)
     952           0 :                 return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow);
     953             : 
     954             :         lockdep_assert_held(&ctx->uring_lock);
     955             : 
     956           0 :         if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) {
     957           0 :                 __io_cq_lock(ctx);
     958           0 :                 __io_flush_post_cqes(ctx);
     959             :                 /* no need to flush - flush is deferred */
     960           0 :                 __io_cq_unlock_post(ctx);
     961             :         }
     962             : 
      963             :         /* For deferred completions this is not as strict as it is otherwise,
      964             :          * however its main job is to prevent unbounded posted completions,
     965             :          * and in that it works just as well.
     966             :          */
     967           0 :         if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
     968             :                 return false;
     969             : 
     970           0 :         cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++];
     971           0 :         cqe->user_data = user_data;
     972           0 :         cqe->res = res;
     973           0 :         cqe->flags = cflags;
     974           0 :         return true;
     975             : }
     976             : 
     977           0 : static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
     978             : {
     979           0 :         struct io_ring_ctx *ctx = req->ctx;
     980           0 :         struct io_rsrc_node *rsrc_node = NULL;
     981             : 
     982           0 :         io_cq_lock(ctx);
     983           0 :         if (!(req->flags & REQ_F_CQE_SKIP))
     984           0 :                 io_fill_cqe_req(ctx, req);
     985             : 
     986             :         /*
     987             :          * If we're the last reference to this request, add to our locked
     988             :          * free_list cache.
     989             :          */
     990           0 :         if (req_ref_put_and_test(req)) {
     991           0 :                 if (req->flags & IO_REQ_LINK_FLAGS) {
     992           0 :                         if (req->flags & IO_DISARM_MASK)
     993           0 :                                 io_disarm_next(req);
     994           0 :                         if (req->link) {
     995           0 :                                 io_req_task_queue(req->link);
     996           0 :                                 req->link = NULL;
     997             :                         }
     998             :                 }
     999           0 :                 io_put_kbuf_comp(req);
    1000           0 :                 if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
    1001           0 :                         io_clean_op(req);
    1002           0 :                 if (!(req->flags & REQ_F_FIXED_FILE))
    1003           0 :                         io_put_file(req->file);
    1004             : 
    1005           0 :                 rsrc_node = req->rsrc_node;
    1006             :                 /*
    1007             :                  * Selected buffer deallocation in io_clean_op() assumes that
    1008             :                  * we don't hold ->completion_lock. Clean them here to avoid
    1009             :                  * deadlocks.
    1010             :                  */
    1011           0 :                 io_put_task_remote(req->task);
    1012           0 :                 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
    1013           0 :                 ctx->locked_free_nr++;
    1014             :         }
    1015           0 :         io_cq_unlock_post(ctx);
    1016             : 
    1017           0 :         if (rsrc_node) {
    1018           0 :                 io_ring_submit_lock(ctx, issue_flags);
    1019           0 :                 io_put_rsrc_node(ctx, rsrc_node);
    1020             :                 io_ring_submit_unlock(ctx, issue_flags);
    1021             :         }
    1022           0 : }
    1023             : 
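                     : /*
                     :  * Post the completion for @req. If completions must be posted from the
                     :  * submitter task (ctx->task_complete) and we aren't it, hand the request
                     :  * over via task_work. Otherwise post directly, taking ->uring_lock first
                     :  * when called unlocked on an IOPOLL ring.
                     :  */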
    1024           0 : void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
    1025             : {
    1026           0 :         if (req->ctx->task_complete && req->ctx->submitter_task != current) {
    1027           0 :                 req->io_task_work.func = io_req_task_complete;
    1028             :                 io_req_task_work_add(req);
    1029           0 :         } else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
    1030           0 :                    !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
    1031           0 :                 __io_req_complete_post(req, issue_flags);
    1032             :         } else {
    1033           0 :                 struct io_ring_ctx *ctx = req->ctx;
    1034             : 
    1035           0 :                 mutex_lock(&ctx->uring_lock);
    1036           0 :                 __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED);
    1037           0 :                 mutex_unlock(&ctx->uring_lock);
    1038             :         }
    1039           0 : }
    1040             : 
    1041           0 : void io_req_defer_failed(struct io_kiocb *req, s32 res)
    1042             :         __must_hold(&ctx->uring_lock)
    1043             : {
    1044           0 :         const struct io_cold_def *def = &io_cold_defs[req->opcode];
    1045             : 
    1046             :         lockdep_assert_held(&req->ctx->uring_lock);
    1047             : 
    1048           0 :         req_set_fail(req);
    1049           0 :         io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
    1050           0 :         if (def->fail)
    1051           0 :                 def->fail(req);
    1052           0 :         io_req_complete_defer(req);
    1053           0 : }
    1054             : 
    1055             : /*
    1056             :  * Don't initialise the fields below on every allocation, but do that in
    1057             :  * advance and keep them valid across allocations.
    1058             :  */
    1059             : static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
    1060             : {
    1061           0 :         req->ctx = ctx;
    1062           0 :         req->link = NULL;
    1063           0 :         req->async_data = NULL;
    1064             :         /* not necessary, but safer to zero */
    1065           0 :         req->cqe.res = 0;
    1066             : }
    1067             : 
    1068             : static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
    1069             :                                         struct io_submit_state *state)
    1070             : {
    1071           0 :         spin_lock(&ctx->completion_lock);
    1072           0 :         wq_list_splice(&ctx->locked_free_list, &state->free_list);
    1073           0 :         ctx->locked_free_nr = 0;
    1074           0 :         spin_unlock(&ctx->completion_lock);
    1075             : }
    1076             : 
    1077             : /*
    1078             :  * A request might get retired back into the request caches even before opcode
    1079             :  * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
    1080             :  * Because of that, io_alloc_req() should be called only under ->uring_lock
    1081             :  * and with extra caution to not get a request that is still being worked on.
    1082             :  */
    1083           0 : __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
    1084             :         __must_hold(&ctx->uring_lock)
    1085             : {
    1086           0 :         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
    1087             :         void *reqs[IO_REQ_ALLOC_BATCH];
    1088             :         int ret, i;
    1089             : 
    1090             :         /*
    1091             :          * If we have more than a batch's worth of requests in our IRQ side
    1092             :          * locked cache, grab the lock and move them over to our submission
    1093             :          * side cache.
    1094             :          */
    1095           0 :         if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
    1096           0 :                 io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
    1097           0 :                 if (!io_req_cache_empty(ctx))
    1098             :                         return true;
    1099             :         }
    1100             : 
    1101           0 :         ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
    1102             : 
    1103             :         /*
    1104             :          * Bulk alloc is all-or-nothing. If we fail to get a batch,
    1105             :          * retry single alloc to be on the safe side.
    1106             :          */
    1107           0 :         if (unlikely(ret <= 0)) {
    1108           0 :                 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
    1109           0 :                 if (!reqs[0])
    1110             :                         return false;
    1111             :                 ret = 1;
    1112             :         }
    1113             : 
    1114           0 :         percpu_ref_get_many(&ctx->refs, ret);
    1115           0 :         for (i = 0; i < ret; i++) {
    1116           0 :                 struct io_kiocb *req = reqs[i];
    1117             : 
    1118           0 :                 io_preinit_req(req, ctx);
    1119           0 :                 io_req_add_to_cache(req, ctx);
    1120             :         }
    1121             :         return true;
    1122             : }
    1123             : 
    1124           0 : __cold void io_free_req(struct io_kiocb *req)
    1125             : {
    1126             :         /* refs were already put, restore them for io_req_task_complete() */
    1127           0 :         req->flags &= ~REQ_F_REFCOUNT;
    1128             :         /* we only want to free it, don't post CQEs */
    1129           0 :         req->flags |= REQ_F_CQE_SKIP;
    1130           0 :         req->io_task_work.func = io_req_task_complete;
    1131           0 :         io_req_task_work_add(req);
    1132           0 : }
    1133             : 
    1134             : static void __io_req_find_next_prep(struct io_kiocb *req)
    1135             : {
    1136           0 :         struct io_ring_ctx *ctx = req->ctx;
    1137             : 
    1138           0 :         spin_lock(&ctx->completion_lock);
    1139           0 :         io_disarm_next(req);
    1140           0 :         spin_unlock(&ctx->completion_lock);
    1141             : }
    1142             : 
    1143             : static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
    1144             : {
    1145             :         struct io_kiocb *nxt;
    1146             : 
    1147             :         /*
    1148             :          * If LINK is set, we have dependent requests in this chain. If we
    1149             :          * didn't fail this request, queue the first one up, moving any other
    1150             :          * dependencies to the next request. In case of failure, fail the rest
    1151             :          * of the chain.
    1152             :          */
    1153           0 :         if (unlikely(req->flags & IO_DISARM_MASK))
    1154             :                 __io_req_find_next_prep(req);
    1155           0 :         nxt = req->link;
    1156           0 :         req->link = NULL;
    1157             :         return nxt;
    1158             : }
    1159             : 
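                     : /*
                     :  * Drop the ctx reference taken while running task_work for @ctx. If
                     :  * ->uring_lock was grabbed for batching, flush the deferred completions
                     :  * and release it; the TASKRUN hint in the SQ flags is cleared as well.
                     :  */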
    1160           0 : static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
    1161             : {
    1162           0 :         if (!ctx)
    1163             :                 return;
    1164           0 :         if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
    1165           0 :                 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
    1166           0 :         if (ts->locked) {
    1167           0 :                 io_submit_flush_completions(ctx);
    1168           0 :                 mutex_unlock(&ctx->uring_lock);
    1169           0 :                 ts->locked = false;
    1170             :         }
    1171           0 :         percpu_ref_put(&ctx->refs);
    1172             : }
    1173             : 
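                     : /*
                     :  * Run task_work entries from @node until @last is reached. Work is
                     :  * batched per ctx: when the ctx changes, the previous one is flushed via
                     :  * ctx_flush_and_put() and ->uring_lock is trylocked on the new one to
                     :  * improve completion batching. Returns the number of entries processed.
                     :  */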
    1174           0 : static unsigned int handle_tw_list(struct llist_node *node,
    1175             :                                    struct io_ring_ctx **ctx,
    1176             :                                    struct io_tw_state *ts,
    1177             :                                    struct llist_node *last)
    1178             : {
    1179           0 :         unsigned int count = 0;
    1180             : 
    1181           0 :         while (node && node != last) {
    1182           0 :                 struct llist_node *next = node->next;
    1183           0 :                 struct io_kiocb *req = container_of(node, struct io_kiocb,
    1184             :                                                     io_task_work.node);
    1185             : 
    1186           0 :                 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
    1187             : 
    1188           0 :                 if (req->ctx != *ctx) {
    1189           0 :                         ctx_flush_and_put(*ctx, ts);
    1190           0 :                         *ctx = req->ctx;
    1191             :                         /* if not contended, grab and improve batching */
    1192           0 :                         ts->locked = mutex_trylock(&(*ctx)->uring_lock);
    1193           0 :                         percpu_ref_get(&(*ctx)->refs);
    1194             :                 }
    1195           0 :                 INDIRECT_CALL_2(req->io_task_work.func,
    1196             :                                 io_poll_task_func, io_req_rw_complete,
    1197             :                                 req, ts);
    1198           0 :                 node = next;
    1199           0 :                 count++;
    1200           0 :                 if (unlikely(need_resched())) {
    1201           0 :                         ctx_flush_and_put(*ctx, ts);
    1202           0 :                         *ctx = NULL;
    1203           0 :                         cond_resched();
    1204             :                 }
    1205             :         }
    1206             : 
    1207           0 :         return count;
    1208             : }
    1209             : 
    1210             : /**
    1211             :  * io_llist_xchg - swap all entries in a lock-less list
    1212             :  * @head:       the head of the lock-less list to remove all entries from
    1213             :  * @new:        new entry as the head of the list
    1214             :  *
    1215             :  * If the list is empty, return NULL; otherwise, return the pointer to the first entry.
    1216             :  * The order of entries returned is from the newest to the oldest added one.
    1217             :  */
    1218             : static inline struct llist_node *io_llist_xchg(struct llist_head *head,
    1219             :                                                struct llist_node *new)
    1220             : {
    1221           0 :         return xchg(&head->first, new);
    1222             : }
    1223             : 
    1224             : /**
    1225             :  * io_llist_cmpxchg - possibly swap all entries in a lock-less list
    1226             :  * @head:       the head of the lock-less list whose first entry may be swapped
    1227             :  * @old:        expected old value of the first entry of the list
    1228             :  * @new:        new entry as the head of the list
    1229             :  *
    1230             :  * Perform a cmpxchg on the first entry of the list; it is only swapped if the first entry matches @old.
    1231             :  */
    1232             : 
    1233             : static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
    1234             :                                                   struct llist_node *old,
    1235             :                                                   struct llist_node *new)
    1236             : {
    1237           0 :         return cmpxchg(&head->first, old, new);
    1238             : }
    1239             : 
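                     : /*
                     :  * Punt all pending task_work for @tctx to each ctx's fallback workqueue,
                     :  * used when task_work_add() fails or can't be used because the task is
                     :  * exiting. With @sync set, wait for the fallback work of each ctx to
                     :  * finish, holding a ctx reference across the flush.
                     :  */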
    1240           0 : static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
    1241             : {
    1242           0 :         struct llist_node *node = llist_del_all(&tctx->task_list);
    1243           0 :         struct io_ring_ctx *last_ctx = NULL;
    1244             :         struct io_kiocb *req;
    1245             : 
    1246           0 :         while (node) {
    1247           0 :                 req = container_of(node, struct io_kiocb, io_task_work.node);
    1248           0 :                 node = node->next;
    1249           0 :                 if (sync && last_ctx != req->ctx) {
    1250           0 :                         if (last_ctx) {
    1251           0 :                                 flush_delayed_work(&last_ctx->fallback_work);
    1252           0 :                                 percpu_ref_put(&last_ctx->refs);
    1253             :                         }
    1254           0 :                         last_ctx = req->ctx;
    1255           0 :                         percpu_ref_get(&last_ctx->refs);
    1256             :                 }
    1257           0 :                 if (llist_add(&req->io_task_work.node,
    1258           0 :                               &req->ctx->fallback_llist))
    1259           0 :                         schedule_delayed_work(&req->ctx->fallback_work, 1);
    1260             :         }
    1261             : 
    1262           0 :         if (last_ctx) {
    1263           0 :                 flush_delayed_work(&last_ctx->fallback_work);
    1264           0 :                 percpu_ref_put(&last_ctx->refs);
    1265             :         }
    1266           0 : }
    1267             : 
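                     : /*
                     :  * task_work callback: drain tctx->task_list. The list is repeatedly
                     :  * swapped out for a dummy 'fake' node and run via handle_tw_list(); the
                     :  * final cmpxchg() only removes the fake node if no new work was queued
                     :  * concurrently, otherwise another pass is made.
                     :  */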
    1268           0 : void tctx_task_work(struct callback_head *cb)
    1269             : {
    1270           0 :         struct io_tw_state ts = {};
    1271           0 :         struct io_ring_ctx *ctx = NULL;
    1272           0 :         struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
    1273             :                                                   task_work);
    1274           0 :         struct llist_node fake = {};
    1275             :         struct llist_node *node;
    1276           0 :         unsigned int loops = 0;
    1277           0 :         unsigned int count = 0;
    1278             : 
    1279           0 :         if (unlikely(current->flags & PF_EXITING)) {
    1280           0 :                 io_fallback_tw(tctx, true);
    1281           0 :                 return;
    1282             :         }
    1283             : 
    1284             :         do {
    1285           0 :                 loops++;
    1286           0 :                 node = io_llist_xchg(&tctx->task_list, &fake);
    1287           0 :                 count += handle_tw_list(node, &ctx, &ts, &fake);
    1288             : 
    1289             :                 /* skip expensive cmpxchg if there are items in the list */
    1290           0 :                 if (READ_ONCE(tctx->task_list.first) != &fake)
    1291           0 :                         continue;
    1292           0 :                 if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) {
    1293           0 :                         io_submit_flush_completions(ctx);
    1294           0 :                         if (READ_ONCE(tctx->task_list.first) != &fake)
    1295           0 :                                 continue;
    1296             :                 }
    1297           0 :                 node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
    1298           0 :         } while (node != &fake);
    1299             : 
    1300           0 :         ctx_flush_and_put(ctx, &ts);
    1301             : 
    1302             :         /* relaxed read is enough as only the task itself sets ->in_cancel */
    1303           0 :         if (unlikely(atomic_read(&tctx->in_cancel)))
    1304           0 :                 io_uring_drop_tctx_refs(current);
    1305             : 
    1306           0 :         trace_io_uring_task_work_run(tctx, count, loops);
    1307             : }
    1308             : 
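                     : /*
                     :  * Queue @req on ctx->work_llist for DEFER_TASKRUN processing. req->nr_tw
                     :  * tracks how much work is pending so that, with IOU_F_TWQ_LAZY_WAKE, the
                     :  * submitter task is only woken once enough work is queued to satisfy the
                     :  * waiter's cq_wait_nr threshold.
                     :  */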
    1309           0 : static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
    1310             : {
    1311           0 :         struct io_ring_ctx *ctx = req->ctx;
    1312             :         unsigned nr_wait, nr_tw, nr_tw_prev;
    1313             :         struct llist_node *first;
    1314             : 
    1315           0 :         if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
    1316           0 :                 flags &= ~IOU_F_TWQ_LAZY_WAKE;
    1317             : 
    1318           0 :         first = READ_ONCE(ctx->work_llist.first);
    1319             :         do {
    1320           0 :                 nr_tw_prev = 0;
    1321           0 :                 if (first) {
    1322           0 :                         struct io_kiocb *first_req = container_of(first,
    1323             :                                                         struct io_kiocb,
    1324             :                                                         io_task_work.node);
    1325             :                         /*
    1326             :                          * Might be executed at any moment, rely on
    1327             :                          * SLAB_TYPESAFE_BY_RCU to keep it alive.
    1328             :                          */
    1329           0 :                         nr_tw_prev = READ_ONCE(first_req->nr_tw);
    1330             :                 }
    1331           0 :                 nr_tw = nr_tw_prev + 1;
    1332             :                 /* Large enough to fail the nr_wait comparison below */
    1333           0 :                 if (!(flags & IOU_F_TWQ_LAZY_WAKE))
    1334           0 :                         nr_tw = -1U;
    1335             : 
    1336           0 :                 req->nr_tw = nr_tw;
    1337           0 :                 req->io_task_work.node.next = first;
    1338           0 :         } while (!try_cmpxchg(&ctx->work_llist.first, &first,
    1339             :                               &req->io_task_work.node));
    1340             : 
    1341           0 :         if (!first) {
    1342           0 :                 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
    1343           0 :                         atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
    1344           0 :                 if (ctx->has_evfd)
    1345           0 :                         io_eventfd_signal(ctx);
    1346             :         }
    1347             : 
    1348           0 :         nr_wait = atomic_read(&ctx->cq_wait_nr);
    1349             :         /* no one is waiting */
    1350           0 :         if (!nr_wait)
    1351             :                 return;
    1352             :         /* either not enough or the previous add has already woken it up */
    1353           0 :         if (nr_wait > nr_tw || nr_tw_prev >= nr_wait)
    1354             :                 return;
    1355             :         /* pairs with set_current_state() in io_cqring_wait() */
    1356           0 :         smp_mb__after_atomic();
    1357           0 :         wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
    1358             : }
    1359             : 
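                     : /*
                     :  * Queue @req on the task's task_list and notify it via task_work_add().
                     :  * If the list was already non-empty a notification is already pending;
                     :  * if task_work_add() fails, the work is punted to the fallback workqueue.
                     :  */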
    1360           0 : static void io_req_normal_work_add(struct io_kiocb *req)
    1361             : {
    1362           0 :         struct io_uring_task *tctx = req->task->io_uring;
    1363           0 :         struct io_ring_ctx *ctx = req->ctx;
    1364             : 
    1365             :         /* task_work already pending, we're done */
    1366           0 :         if (!llist_add(&req->io_task_work.node, &tctx->task_list))
    1367             :                 return;
    1368             : 
    1369           0 :         if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
    1370           0 :                 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
    1371             : 
    1372           0 :         if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
    1373             :                 return;
    1374             : 
    1375           0 :         io_fallback_tw(tctx, false);
    1376             : }
    1377             : 
    1378           0 : void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
    1379             : {
    1380           0 :         if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
    1381             :                 rcu_read_lock();
    1382           0 :                 io_req_local_work_add(req, flags);
    1383             :                 rcu_read_unlock();
    1384             :         } else {
    1385           0 :                 io_req_normal_work_add(req);
    1386             :         }
    1387           0 : }
    1388             : 
    1389           0 : static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
    1390             : {
    1391             :         struct llist_node *node;
    1392             : 
    1393           0 :         node = llist_del_all(&ctx->work_llist);
    1394           0 :         while (node) {
    1395           0 :                 struct io_kiocb *req = container_of(node, struct io_kiocb,
    1396             :                                                     io_task_work.node);
    1397             : 
    1398           0 :                 node = node->next;
    1399           0 :                 io_req_normal_work_add(req);
    1400             :         }
    1401           0 : }
    1402             : 
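                     : /*
                     :  * Run DEFER_TASKRUN work queued on ctx->work_llist; only the submitter
                     :  * task may do so. The llist is reversed so entries run in queueing order,
                     :  * and newly arriving work is picked up until the list stays empty.
                     :  */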
    1403           0 : static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts)
    1404             : {
    1405             :         struct llist_node *node;
    1406           0 :         unsigned int loops = 0;
    1407           0 :         int ret = 0;
    1408             : 
    1409           0 :         if (WARN_ON_ONCE(ctx->submitter_task != current))
    1410             :                 return -EEXIST;
    1411           0 :         if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
    1412           0 :                 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
    1413             : again:
    1414             :         /*
    1415             :          * llist entries are in reverse order; flip them back the right way before
    1416             :          * running the pending items.
    1417             :          */
    1418           0 :         node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL));
    1419           0 :         while (node) {
    1420           0 :                 struct llist_node *next = node->next;
    1421           0 :                 struct io_kiocb *req = container_of(node, struct io_kiocb,
    1422             :                                                     io_task_work.node);
    1423           0 :                 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
    1424           0 :                 INDIRECT_CALL_2(req->io_task_work.func,
    1425             :                                 io_poll_task_func, io_req_rw_complete,
    1426             :                                 req, ts);
    1427           0 :                 ret++;
    1428           0 :                 node = next;
    1429             :         }
    1430           0 :         loops++;
    1431             : 
    1432           0 :         if (!llist_empty(&ctx->work_llist))
    1433             :                 goto again;
    1434           0 :         if (ts->locked) {
    1435           0 :                 io_submit_flush_completions(ctx);
    1436           0 :                 if (!llist_empty(&ctx->work_llist))
    1437             :                         goto again;
    1438             :         }
    1439             :         trace_io_uring_local_work_run(ctx, ret, loops);
    1440             :         return ret;
    1441             : }
    1442             : 
    1443           0 : static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
    1444             : {
    1445           0 :         struct io_tw_state ts = { .locked = true, };
    1446             :         int ret;
    1447             : 
    1448           0 :         if (llist_empty(&ctx->work_llist))
    1449             :                 return 0;
    1450             : 
    1451           0 :         ret = __io_run_local_work(ctx, &ts);
    1452             :         /* shouldn't happen! */
    1453           0 :         if (WARN_ON_ONCE(!ts.locked))
    1454           0 :                 mutex_lock(&ctx->uring_lock);
    1455             :         return ret;
    1456             : }
    1457             : 
    1458           0 : static int io_run_local_work(struct io_ring_ctx *ctx)
    1459             : {
    1460           0 :         struct io_tw_state ts = {};
    1461             :         int ret;
    1462             : 
    1463           0 :         ts.locked = mutex_trylock(&ctx->uring_lock);
    1464           0 :         ret = __io_run_local_work(ctx, &ts);
    1465           0 :         if (ts.locked)
    1466           0 :                 mutex_unlock(&ctx->uring_lock);
    1467             : 
    1468           0 :         return ret;
    1469             : }
    1470             : 
    1471           0 : static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
    1472             : {
    1473           0 :         io_tw_lock(req->ctx, ts);
    1474           0 :         io_req_defer_failed(req, req->cqe.res);
    1475           0 : }
    1476             : 
    1477           0 : void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
    1478             : {
    1479           0 :         io_tw_lock(req->ctx, ts);
    1480             :         /* req->task == current here, checking PF_EXITING is safe */
    1481           0 :         if (unlikely(req->task->flags & PF_EXITING))
    1482           0 :                 io_req_defer_failed(req, -EFAULT);
    1483           0 :         else if (req->flags & REQ_F_FORCE_ASYNC)
    1484           0 :                 io_queue_iowq(req, ts);
    1485             :         else
    1486           0 :                 io_queue_sqe(req);
    1487           0 : }
    1488             : 
    1489           0 : void io_req_task_queue_fail(struct io_kiocb *req, int ret)
    1490             : {
    1491           0 :         io_req_set_res(req, ret, 0);
    1492           0 :         req->io_task_work.func = io_req_task_cancel;
    1493           0 :         io_req_task_work_add(req);
    1494           0 : }
    1495             : 
    1496           0 : void io_req_task_queue(struct io_kiocb *req)
    1497             : {
    1498           0 :         req->io_task_work.func = io_req_task_submit;
    1499           0 :         io_req_task_work_add(req);
    1500           0 : }
    1501             : 
    1502           0 : void io_queue_next(struct io_kiocb *req)
    1503             : {
    1504           0 :         struct io_kiocb *nxt = io_req_find_next(req);
    1505             : 
    1506           0 :         if (nxt)
    1507             :                 io_req_task_queue(nxt);
    1508           0 : }
    1509             : 
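                     : /*
                     :  * Retire a batch of completed requests: drop remaining references,
                     :  * recycle async poll entries, queue any linked requests, put files, rsrc
                     :  * nodes and task refs, and return each request to the allocation cache.
                     :  */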
    1510           0 : void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
    1511             :         __must_hold(&ctx->uring_lock)
    1512             : {
    1513             :         do {
    1514           0 :                 struct io_kiocb *req = container_of(node, struct io_kiocb,
    1515             :                                                     comp_list);
    1516             : 
    1517           0 :                 if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
    1518           0 :                         if (req->flags & REQ_F_REFCOUNT) {
    1519           0 :                                 node = req->comp_list.next;
    1520           0 :                                 if (!req_ref_put_and_test(req))
    1521           0 :                                         continue;
    1522             :                         }
    1523           0 :                         if ((req->flags & REQ_F_POLLED) && req->apoll) {
    1524           0 :                                 struct async_poll *apoll = req->apoll;
    1525             : 
    1526           0 :                                 if (apoll->double_poll)
    1527           0 :                                         kfree(apoll->double_poll);
    1528           0 :                                 if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache))
    1529           0 :                                         kfree(apoll);
    1530           0 :                                 req->flags &= ~REQ_F_POLLED;
    1531             :                         }
    1532           0 :                         if (req->flags & IO_REQ_LINK_FLAGS)
    1533           0 :                                 io_queue_next(req);
    1534           0 :                         if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
    1535           0 :                                 io_clean_op(req);
    1536             :                 }
    1537           0 :                 if (!(req->flags & REQ_F_FIXED_FILE))
    1538           0 :                         io_put_file(req->file);
    1539             : 
    1540           0 :                 io_req_put_rsrc_locked(req, ctx);
    1541             : 
    1542           0 :                 io_put_task(req->task);
    1543           0 :                 node = req->comp_list.next;
    1544             :                 io_req_add_to_cache(req, ctx);
    1545           0 :         } while (node);
    1546           0 : }
    1547             : 
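                     : /*
                     :  * Flush deferred completions: post any pending aux CQEs first to keep
                     :  * CQE ordering, then fill a CQE for each request on ->compl_reqs (using
                     :  * the overflow path if the CQ ring is full) and free the batch.
                     :  */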
    1548           0 : static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
    1549             :         __must_hold(&ctx->uring_lock)
    1550             : {
    1551           0 :         struct io_submit_state *state = &ctx->submit_state;
    1552             :         struct io_wq_work_node *node;
    1553             : 
    1554           0 :         __io_cq_lock(ctx);
    1555             :         /* must come first to preserve CQE ordering in failure cases */
    1556           0 :         if (state->cqes_count)
    1557           0 :                 __io_flush_post_cqes(ctx);
    1558           0 :         __wq_list_for_each(node, &state->compl_reqs) {
    1559           0 :                 struct io_kiocb *req = container_of(node, struct io_kiocb,
    1560             :                                             comp_list);
    1561             : 
    1562           0 :                 if (!(req->flags & REQ_F_CQE_SKIP) &&
    1563           0 :                     unlikely(!__io_fill_cqe_req(ctx, req))) {
    1564           0 :                         if (ctx->task_complete) {
    1565           0 :                                 spin_lock(&ctx->completion_lock);
    1566           0 :                                 io_req_cqe_overflow(req);
    1567           0 :                                 spin_unlock(&ctx->completion_lock);
    1568             :                         } else {
    1569           0 :                                 io_req_cqe_overflow(req);
    1570             :                         }
    1571             :                 }
    1572             :         }
    1573           0 :         __io_cq_unlock_post(ctx);
    1574             : 
    1575           0 :         if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
    1576           0 :                 io_free_batch_list(ctx, state->compl_reqs.first);
    1577           0 :                 INIT_WQ_LIST(&state->compl_reqs);
    1578             :         }
    1579           0 : }
    1580             : 
    1581             : static unsigned io_cqring_events(struct io_ring_ctx *ctx)
    1582             : {
    1583             :         /* See comment at the top of this file */
    1584           0 :         smp_rmb();
    1585           0 :         return __io_cqring_events(ctx);
    1586             : }
    1587             : 
    1588             : /*
    1589             :  * We can't just wait for polled events to come to us; we have to actively
    1590             :  * find and complete them.
    1591             :  */
    1592           0 : static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
    1593             : {
    1594           0 :         if (!(ctx->flags & IORING_SETUP_IOPOLL))
    1595             :                 return;
    1596             : 
    1597           0 :         mutex_lock(&ctx->uring_lock);
    1598           0 :         while (!wq_list_empty(&ctx->iopoll_list)) {
    1599             :                 /* let it sleep and repeat later if we can't complete a request */
    1600           0 :                 if (io_do_iopoll(ctx, true) == 0)
    1601             :                         break;
    1602             :                 /*
    1603             :                  * Ensure we allow local-to-the-cpu processing to take place;
    1604             :                  * in this case we need to ensure that we reap all events.
    1605             :                  * Also let task_work, etc. make progress by releasing the mutex.
    1606             :                  */
    1607           0 :                 if (need_resched()) {
    1608           0 :                         mutex_unlock(&ctx->uring_lock);
    1609           0 :                         cond_resched();
    1610           0 :                         mutex_lock(&ctx->uring_lock);
    1611             :                 }
    1612             :         }
    1613           0 :         mutex_unlock(&ctx->uring_lock);
    1614             : }
    1615             : 
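                     : /*
                     :  * Reap IOPOLL completions until at least @min CQEs have been posted, a
                     :  * reschedule is needed, or an error occurs. Pending task_work is run
                     :  * along the way so punted submissions get a chance to add their requests
                     :  * to the iopoll list.
                     :  */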
    1616           0 : static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
    1617             : {
    1618           0 :         unsigned int nr_events = 0;
    1619           0 :         int ret = 0;
    1620             :         unsigned long check_cq;
    1621             : 
    1622           0 :         if (!io_allowed_run_tw(ctx))
    1623             :                 return -EEXIST;
    1624             : 
    1625           0 :         check_cq = READ_ONCE(ctx->check_cq);
    1626           0 :         if (unlikely(check_cq)) {
    1627           0 :                 if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
    1628           0 :                         __io_cqring_overflow_flush(ctx);
    1629             :                 /*
    1630             :                  * Similarly do not spin if we have not informed the user of any
    1631             :                  * dropped CQE.
    1632             :                  */
    1633           0 :                 if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
    1634             :                         return -EBADR;
    1635             :         }
    1636             :         /*
    1637             :          * Don't enter poll loop if we already have events pending.
    1638             :          * If we do, we can potentially be spinning for commands that
    1639             :          * already triggered a CQE (e.g. in error).
    1640             :          */
    1641           0 :         if (io_cqring_events(ctx))
    1642             :                 return 0;
    1643             : 
    1644             :         do {
    1645             :                 /*
    1646             :                  * If a submit got punted to a workqueue, we can have the
    1647             :                  * application entering polling for a command before it gets
    1648             :                  * issued. That app will hold the uring_lock for the duration
    1649             :                  * of the poll right here, so we need to take a breather every
    1650             :                  * now and then to ensure that the issue has a chance to add
    1651             :                  * the poll to the issued list. Otherwise we can spin here
    1652             :                  * forever, while the workqueue is stuck trying to acquire the
    1653             :                  * very same mutex.
    1654             :                  */
    1655           0 :                 if (wq_list_empty(&ctx->iopoll_list) ||
    1656           0 :                     io_task_work_pending(ctx)) {
    1657           0 :                         u32 tail = ctx->cached_cq_tail;
    1658             : 
    1659           0 :                         (void) io_run_local_work_locked(ctx);
    1660             : 
    1661           0 :                         if (task_work_pending(current) ||
    1662           0 :                             wq_list_empty(&ctx->iopoll_list)) {
    1663           0 :                                 mutex_unlock(&ctx->uring_lock);
    1664           0 :                                 io_run_task_work();
    1665           0 :                                 mutex_lock(&ctx->uring_lock);
    1666             :                         }
    1667             :                         /* some requests don't go through iopoll_list */
    1668           0 :                         if (tail != ctx->cached_cq_tail ||
    1669           0 :                             wq_list_empty(&ctx->iopoll_list))
    1670             :                                 break;
    1671             :                 }
    1672           0 :                 ret = io_do_iopoll(ctx, !min);
    1673           0 :                 if (ret < 0)
    1674             :                         break;
    1675           0 :                 nr_events += ret;
    1676           0 :                 ret = 0;
    1677           0 :         } while (nr_events < min && !need_resched());
    1678             : 
    1679             :         return ret;
    1680             : }
    1681             : 
    1682           0 : void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
    1683             : {
    1684           0 :         if (ts->locked)
    1685             :                 io_req_complete_defer(req);
    1686             :         else
    1687           0 :                 io_req_complete_post(req, IO_URING_F_UNLOCKED);
    1688           0 : }
    1689             : 
    1690             : /*
    1691             :  * After the iocb has been issued, it's safe to be found on the poll list.
    1692             :  * Adding the kiocb to the list AFTER submission ensures that we don't
    1693             :  * find it from an io_do_iopoll() thread before the issuer is done
    1694             :  * accessing the kiocb cookie.
    1695             :  */
    1696           0 : static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
    1697             : {
    1698           0 :         struct io_ring_ctx *ctx = req->ctx;
    1699           0 :         const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
    1700             : 
    1701             :         /* workqueue context doesn't hold uring_lock, grab it now */
    1702           0 :         if (unlikely(needs_lock))
    1703           0 :                 mutex_lock(&ctx->uring_lock);
    1704             : 
    1705             :         /*
    1706             :          * Track whether we have multiple files in our lists. This will impact
    1707             :          * how we do polling eventually: we don't spin if we're on potentially
    1708             :          * different devices.
    1709             :          */
    1710           0 :         if (wq_list_empty(&ctx->iopoll_list)) {
    1711           0 :                 ctx->poll_multi_queue = false;
    1712           0 :         } else if (!ctx->poll_multi_queue) {
    1713             :                 struct io_kiocb *list_req;
    1714             : 
    1715           0 :                 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
    1716             :                                         comp_list);
    1717           0 :                 if (list_req->file != req->file)
    1718           0 :                         ctx->poll_multi_queue = true;
    1719             :         }
    1720             : 
    1721             :         /*
    1722             :          * For fast devices, IO may have already completed. If it has, add
    1723             :          * it to the front so we find it first.
    1724             :          */
    1725           0 :         if (READ_ONCE(req->iopoll_completed))
    1726           0 :                 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
    1727             :         else
    1728           0 :                 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
    1729             : 
    1730           0 :         if (unlikely(needs_lock)) {
    1731             :                 /*
    1732             :                  * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
    1733             :                  * in sq thread task context or in io worker task context. If
    1734             :                  * the current task context is the sq thread, we don't need to
    1735             :                  * check whether we should wake up the sq thread.
    1736             :                  */
    1737           0 :                 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
    1738           0 :                     wq_has_sleeper(&ctx->sq_data->wait))
    1739           0 :                         wake_up(&ctx->sq_data->wait);
    1740             : 
    1741           0 :                 mutex_unlock(&ctx->uring_lock);
    1742             :         }
    1743           0 : }
    1744             : 
    1745           0 : unsigned int io_file_get_flags(struct file *file)
    1746             : {
    1747           0 :         unsigned int res = 0;
    1748             : 
    1749           0 :         if (S_ISREG(file_inode(file)->i_mode))
    1750           0 :                 res |= REQ_F_ISREG;
    1751           0 :         if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT))
    1752           0 :                 res |= REQ_F_SUPPORT_NOWAIT;
    1753           0 :         return res;
    1754             : }
    1755             : 
    1756           0 : bool io_alloc_async_data(struct io_kiocb *req)
    1757             : {
    1758           0 :         WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size);
    1759           0 :         req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL);
    1760           0 :         if (req->async_data) {
    1761           0 :                 req->flags |= REQ_F_ASYNC_DATA;
    1762           0 :                 return false;
    1763             :         }
    1764             :         return true;
    1765             : }
    1766             : 
    1767           0 : int io_req_prep_async(struct io_kiocb *req)
    1768             : {
    1769           0 :         const struct io_cold_def *cdef = &io_cold_defs[req->opcode];
    1770           0 :         const struct io_issue_def *def = &io_issue_defs[req->opcode];
    1771             : 
    1772             :         /* assign early for deferred execution for non-fixed file */
    1773           0 :         if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
    1774           0 :                 req->file = io_file_get_normal(req, req->cqe.fd);
    1775           0 :         if (!cdef->prep_async)
    1776             :                 return 0;
    1777           0 :         if (WARN_ON_ONCE(req_has_async_data(req)))
    1778             :                 return -EFAULT;
    1779           0 :         if (!def->manual_alloc) {
    1780           0 :                 if (io_alloc_async_data(req))
    1781             :                         return -EAGAIN;
    1782             :         }
    1783           0 :         return cdef->prep_async(req);
    1784             : }
    1785             : 
    1786             : static u32 io_get_sequence(struct io_kiocb *req)
    1787             : {
    1788           0 :         u32 seq = req->ctx->cached_sq_head;
    1789             :         struct io_kiocb *cur;
    1790             : 
    1791             :         /* need original cached_sq_head, but it was increased for each req */
    1792           0 :         io_for_each_link(cur, req)
    1793           0 :                 seq--;
    1794             :         return seq;
    1795             : }
    1796             : 
    1797           0 : static __cold void io_drain_req(struct io_kiocb *req)
    1798             :         __must_hold(&ctx->uring_lock)
    1799             : {
    1800           0 :         struct io_ring_ctx *ctx = req->ctx;
    1801             :         struct io_defer_entry *de;
    1802             :         int ret;
    1803           0 :         u32 seq = io_get_sequence(req);
    1804             : 
    1805             :         /* Still need defer if there is pending req in defer list. */
    1806           0 :         spin_lock(&ctx->completion_lock);
    1807           0 :         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
    1808           0 :                 spin_unlock(&ctx->completion_lock);
    1809             : queue:
    1810           0 :                 ctx->drain_active = false;
    1811             :                 io_req_task_queue(req);
    1812             :                 return;
    1813             :         }
    1814           0 :         spin_unlock(&ctx->completion_lock);
    1815             : 
    1816           0 :         io_prep_async_link(req);
    1817           0 :         de = kmalloc(sizeof(*de), GFP_KERNEL);
    1818           0 :         if (!de) {
    1819           0 :                 ret = -ENOMEM;
    1820           0 :                 io_req_defer_failed(req, ret);
    1821           0 :                 return;
    1822             :         }
    1823             : 
    1824           0 :         spin_lock(&ctx->completion_lock);
    1825           0 :         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
    1826           0 :                 spin_unlock(&ctx->completion_lock);
    1827           0 :                 kfree(de);
    1828           0 :                 goto queue;
    1829             :         }
    1830             : 
    1831           0 :         trace_io_uring_defer(req);
    1832           0 :         de->req = req;
    1833           0 :         de->seq = seq;
    1834           0 :         list_add_tail(&de->list, &ctx->defer_list);
    1835           0 :         spin_unlock(&ctx->completion_lock);
    1836             : }
    1837             : 
    1838           0 : static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
    1839             :                            unsigned int issue_flags)
    1840             : {
    1841           0 :         if (req->file || !def->needs_file)
    1842             :                 return true;
    1843             : 
    1844           0 :         if (req->flags & REQ_F_FIXED_FILE)
    1845           0 :                 req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
    1846             :         else
    1847           0 :                 req->file = io_file_get_normal(req, req->cqe.fd);
    1848             : 
    1849           0 :         return !!req->file;
    1850             : }
    1851             : 
    1852           0 : static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
    1853             : {
    1854           0 :         const struct io_issue_def *def = &io_issue_defs[req->opcode];
    1855           0 :         const struct cred *creds = NULL;
    1856             :         int ret;
    1857             : 
    1858           0 :         if (unlikely(!io_assign_file(req, def, issue_flags)))
    1859             :                 return -EBADF;
    1860             : 
    1861           0 :         if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
    1862           0 :                 creds = override_creds(req->creds);
    1863             : 
    1864           0 :         if (!def->audit_skip)
    1865             :                 audit_uring_entry(req->opcode);
    1866             : 
    1867           0 :         ret = def->issue(req, issue_flags);
    1868             : 
    1869             :         if (!def->audit_skip)
    1870             :                 audit_uring_exit(!ret, ret);
    1871             : 
    1872           0 :         if (creds)
    1873           0 :                 revert_creds(creds);
    1874             : 
    1875           0 :         if (ret == IOU_OK) {
    1876           0 :                 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
    1877             :                         io_req_complete_defer(req);
    1878             :                 else
    1879           0 :                         io_req_complete_post(req, issue_flags);
    1880           0 :         } else if (ret != IOU_ISSUE_SKIP_COMPLETE)
    1881             :                 return ret;
    1882             : 
    1883             :         /* If the op doesn't have a file, we're not polling for it */
    1884           0 :         if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
    1885           0 :                 io_iopoll_req_issued(req, issue_flags);
    1886             : 
    1887             :         return 0;
    1888             : }
    1889             : 
    1890           0 : int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
    1891             : {
    1892           0 :         io_tw_lock(req->ctx, ts);
    1893           0 :         return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT|
    1894             :                                  IO_URING_F_COMPLETE_DEFER);
    1895             : }
    1896             : 
    1897           0 : struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
    1898             : {
    1899           0 :         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
    1900           0 :         struct io_kiocb *nxt = NULL;
    1901             : 
    1902           0 :         if (req_ref_put_and_test(req)) {
    1903           0 :                 if (req->flags & IO_REQ_LINK_FLAGS)
    1904           0 :                         nxt = io_req_find_next(req);
    1905           0 :                 io_free_req(req);
    1906             :         }
    1907           0 :         return nxt ? &nxt->work : NULL;
    1908             : }
    1909             : 
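                     : /*
                     :  * io-wq handler: issue the request from worker context. For
                     :  * REQ_F_FORCE_ASYNC requests on pollable files a non-blocking attempt is
                     :  * made first so poll can be armed instead of tying up a worker; if poll
                     :  * arming doesn't stick, the issue is retried in blocking mode. IOPOLL
                     :  * rings may see -EAGAIN even for sync issue and simply retry.
                     :  */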
    1910           0 : void io_wq_submit_work(struct io_wq_work *work)
    1911             : {
    1912           0 :         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
    1913           0 :         const struct io_issue_def *def = &io_issue_defs[req->opcode];
    1914           0 :         unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ;
    1915           0 :         bool needs_poll = false;
    1916           0 :         int ret = 0, err = -ECANCELED;
    1917             : 
    1918             :         /* one will be dropped by ->io_wq_free_work() after returning to io-wq */
    1919           0 :         if (!(req->flags & REQ_F_REFCOUNT))
    1920             :                 __io_req_set_refcount(req, 2);
    1921             :         else
    1922           0 :                 req_ref_get(req);
    1923             : 
    1924           0 :         io_arm_ltimeout(req);
    1925             : 
    1926             :         /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
    1927           0 :         if (work->flags & IO_WQ_WORK_CANCEL) {
    1928             : fail:
    1929             :                 io_req_task_queue_fail(req, err);
    1930             :                 return;
    1931             :         }
    1932           0 :         if (!io_assign_file(req, def, issue_flags)) {
    1933           0 :                 err = -EBADF;
    1934           0 :                 work->flags |= IO_WQ_WORK_CANCEL;
    1935           0 :                 goto fail;
    1936             :         }
    1937             : 
    1938           0 :         if (req->flags & REQ_F_FORCE_ASYNC) {
    1939           0 :                 bool opcode_poll = def->pollin || def->pollout;
    1940             : 
    1941           0 :                 if (opcode_poll && file_can_poll(req->file)) {
    1942           0 :                         needs_poll = true;
    1943           0 :                         issue_flags |= IO_URING_F_NONBLOCK;
    1944             :                 }
    1945             :         }
    1946             : 
    1947             :         do {
    1948           0 :                 ret = io_issue_sqe(req, issue_flags);
    1949           0 :                 if (ret != -EAGAIN)
    1950             :                         break;
    1951             :                 /*
    1952             :                  * We can get EAGAIN for iopolled IO even though we're
    1953             :                  * forcing a sync submission from here, since we can't
    1954             :                  * wait for request slots on the block side.
    1955             :                  */
    1956           0 :                 if (!needs_poll) {
    1957           0 :                         if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
    1958             :                                 break;
    1959           0 :                         cond_resched();
    1960           0 :                         continue;
    1961             :                 }
    1962             : 
    1963           0 :                 if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
    1964             :                         return;
    1965             :                 /* aborted or ready, in either case retry blocking */
    1966             :                 needs_poll = false;
    1967             :                 issue_flags &= ~IO_URING_F_NONBLOCK;
    1968             :         } while (1);
    1969             : 
    1970             :         /* avoid locking problems by failing it from a clean context */
    1971           0 :         if (ret < 0)
    1972             :                 io_req_task_queue_fail(req, ret);
    1973             : }
    1974             : 
    1975           0 : inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
    1976             :                                       unsigned int issue_flags)
    1977             : {
    1978           0 :         struct io_ring_ctx *ctx = req->ctx;
    1979             :         struct io_fixed_file *slot;
    1980           0 :         struct file *file = NULL;
    1981             : 
    1982           0 :         io_ring_submit_lock(ctx, issue_flags);
    1983             : 
    1984           0 :         if (unlikely((unsigned int)fd >= ctx->nr_user_files))
    1985             :                 goto out;
    1986           0 :         fd = array_index_nospec(fd, ctx->nr_user_files);
    1987           0 :         slot = io_fixed_file_slot(&ctx->file_table, fd);
    1988           0 :         file = io_slot_file(slot);
    1989           0 :         req->flags |= io_slot_flags(slot);
    1990           0 :         io_req_set_rsrc_node(req, ctx, 0);
    1991             : out:
    1992           0 :         io_ring_submit_unlock(ctx, issue_flags);
    1993           0 :         return file;
    1994             : }
    1995             : 
    1996           0 : struct file *io_file_get_normal(struct io_kiocb *req, int fd)
    1997             : {
    1998           0 :         struct file *file = fget(fd);
    1999             : 
    2000           0 :         trace_io_uring_file_get(req, fd);
    2001             : 
    2002             :         /* we don't allow fixed io_uring files */
    2003           0 :         if (file && io_is_uring_fops(file))
    2004           0 :                 io_req_track_inflight(req);
    2005           0 :         return file;
    2006             : }
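
io_file_get_fixed() resolves a registered-file index, while io_file_get_normal() resolves a plain descriptor via fget(). A hedged liburing sketch of the registered path (read_registered is a made-up helper; error handling is kept minimal):

#include <errno.h>
#include <liburing.h>

static int read_registered(struct io_uring *ring, int fd, void *buf, unsigned len)
{
        struct io_uring_sqe *sqe;
        int ret;

        /* fd becomes index 0 of the ring's fixed file table */
        ret = io_uring_register_files(ring, &fd, 1);
        if (ret < 0)
                return ret;

        sqe = io_uring_get_sqe(ring);
        if (!sqe)
                return -EBUSY;
        /* with IOSQE_FIXED_FILE the "fd" argument is the table index,
         * looked up by io_file_get_fixed() instead of fget() */
        io_uring_prep_read(sqe, 0, buf, len, 0);
        io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE);
        return io_uring_submit(ring);
}
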
    2007             : 
    2008           0 : static void io_queue_async(struct io_kiocb *req, int ret)
    2009             :         __must_hold(&req->ctx->uring_lock)
    2010             : {
    2011             :         struct io_kiocb *linked_timeout;
    2012             : 
    2013           0 :         if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
    2014           0 :                 io_req_defer_failed(req, ret);
    2015           0 :                 return;
    2016             :         }
    2017             : 
    2018           0 :         linked_timeout = io_prep_linked_timeout(req);
    2019             : 
    2020           0 :         switch (io_arm_poll_handler(req, 0)) {
    2021             :         case IO_APOLL_READY:
    2022           0 :                 io_kbuf_recycle(req, 0);
    2023             :                 io_req_task_queue(req);
    2024             :                 break;
    2025             :         case IO_APOLL_ABORTED:
    2026           0 :                 io_kbuf_recycle(req, 0);
    2027           0 :                 io_queue_iowq(req, NULL);
    2028           0 :                 break;
    2029             :         case IO_APOLL_OK:
    2030             :                 break;
    2031             :         }
    2032             : 
    2033           0 :         if (linked_timeout)
    2034           0 :                 io_queue_linked_timeout(linked_timeout);
    2035             : }
    2036             : 
    2037           0 : static inline void io_queue_sqe(struct io_kiocb *req)
    2038             :         __must_hold(&req->ctx->uring_lock)
    2039             : {
    2040             :         int ret;
    2041             : 
    2042           0 :         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
    2043             : 
    2044             :         /*
    2045             :          * We async punt it if the file wasn't marked NOWAIT, or if the file
    2046             :          * doesn't support non-blocking read/write attempts
    2047             :          */
    2048           0 :         if (likely(!ret))
    2049             :                 io_arm_ltimeout(req);
    2050             :         else
    2051           0 :                 io_queue_async(req, ret);
    2052           0 : }
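
io_queue_sqe() and io_queue_async() arm linked timeouts around the issue attempt. From userspace that corresponds to an IOSQE_IO_LINK request followed by an IORING_OP_LINK_TIMEOUT. A sketch assuming liburing (recv_with_timeout is a hypothetical helper; SQ-full checks are omitted):

#include <liburing.h>

static int recv_with_timeout(struct io_uring *ring, int sock,
                             void *buf, size_t len)
{
        struct __kernel_timespec ts = { .tv_sec = 1 };
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_recv(sqe, sock, buf, len, 0);
        io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);     /* timeout links to this */

        sqe = io_uring_get_sqe(ring);
        /* cancels the recv with -ECANCELED if it hasn't completed within 1s;
         * otherwise the timeout CQE itself completes with -ECANCELED */
        io_uring_prep_link_timeout(sqe, &ts, 0);

        return io_uring_submit(ring);
}
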
    2053             : 
    2054           0 : static void io_queue_sqe_fallback(struct io_kiocb *req)
    2055             :         __must_hold(&req->ctx->uring_lock)
    2056             : {
    2057           0 :         if (unlikely(req->flags & REQ_F_FAIL)) {
    2058             :                 /*
    2059             :                  * We don't submit; fail them all. For that, replace hardlinks
    2060             :                  * with normal links. An extra REQ_F_LINK is tolerated.
    2061             :                  */
    2062           0 :                 req->flags &= ~REQ_F_HARDLINK;
    2063           0 :                 req->flags |= REQ_F_LINK;
    2064           0 :                 io_req_defer_failed(req, req->cqe.res);
    2065             :         } else {
    2066           0 :                 int ret = io_req_prep_async(req);
    2067             : 
    2068           0 :                 if (unlikely(ret)) {
    2069           0 :                         io_req_defer_failed(req, ret);
    2070           0 :                         return;
    2071             :                 }
    2072             : 
    2073           0 :                 if (unlikely(req->ctx->drain_active))
    2074           0 :                         io_drain_req(req);
    2075             :                 else
    2076           0 :                         io_queue_iowq(req, NULL);
    2077             :         }
    2078             : }
    2079             : 
    2080             : /*
    2081             :  * Check SQE restrictions (opcode and flags).
    2082             :  *
    2083             :  * Returns 'true' if SQE is allowed, 'false' otherwise.
    2084             :  */
    2085           0 : static inline bool io_check_restriction(struct io_ring_ctx *ctx,
    2086             :                                         struct io_kiocb *req,
    2087             :                                         unsigned int sqe_flags)
    2088             : {
    2089           0 :         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
    2090             :                 return false;
    2091             : 
    2092           0 :         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
    2093             :             ctx->restrictions.sqe_flags_required)
    2094             :                 return false;
    2095             : 
    2096           0 :         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
    2097             :                           ctx->restrictions.sqe_flags_required))
    2098             :                 return false;
    2099             : 
    2100             :         return true;
    2101             : }
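
The restriction bitmaps checked here are installed from userspace via IORING_REGISTER_RESTRICTIONS on a ring created with IORING_SETUP_R_DISABLED. A hedged sketch using liburing's wrappers (setup_restricted_ring is a made-up name; exact ABI details should be checked against io_uring_register(2)):

#include <liburing.h>

/* Allow only NOP and READ; every other opcode then fails the
 * io_check_restriction() test above with -EACCES. */
static int setup_restricted_ring(struct io_uring *ring)
{
        struct io_uring_restriction res[] = {
                { .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_NOP  },
                { .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READ },
        };
        int ret;

        ret = io_uring_queue_init(8, ring, IORING_SETUP_R_DISABLED);
        if (ret < 0)
                return ret;
        ret = io_uring_register_restrictions(ring, res, 2);
        if (ret < 0)
                return ret;
        /* no sqe_flags_allowed entries were added, so only flag-less SQEs pass */
        return io_uring_enable_rings(ring);
}
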
    2102             : 
    2103             : static void io_init_req_drain(struct io_kiocb *req)
    2104             : {
    2105           0 :         struct io_ring_ctx *ctx = req->ctx;
    2106           0 :         struct io_kiocb *head = ctx->submit_state.link.head;
    2107             : 
    2108           0 :         ctx->drain_active = true;
    2109           0 :         if (head) {
    2110             :                 /*
    2111             :                  * If we need to drain a request in the middle of a link, drain
    2112             :                  * the head request and the next request/link after the current
    2113             :                  * link. Considering sequential execution of links,
    2114             :                  * REQ_F_IO_DRAIN will be maintained for every request of our
    2115             :                  * link.
    2116             :                  */
    2117           0 :                 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
    2118           0 :                 ctx->drain_next = true;
    2119             :         }
    2120             : }
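
IOSQE_IO_DRAIN is what triggers io_init_req_drain(): the drained request only starts once everything submitted before it has completed. A small liburing sketch (write_then_drained_fsync is hypothetical; SQ-full checks are omitted):

#include <liburing.h>

static int write_then_drained_fsync(struct io_uring *ring, int fd,
                                    const void *buf, unsigned len)
{
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_write(sqe, fd, buf, len, 0);

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_fsync(sqe, fd, 0);
        /* unlike a link, drain orders against *all* prior submissions */
        io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);

        return io_uring_submit(ring);
}
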
    2121             : 
    2122           0 : static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
    2123             :                        const struct io_uring_sqe *sqe)
    2124             :         __must_hold(&ctx->uring_lock)
    2125             : {
    2126             :         const struct io_issue_def *def;
    2127             :         unsigned int sqe_flags;
    2128             :         int personality;
    2129             :         u8 opcode;
    2130             : 
    2131             :         /* req is partially pre-initialised, see io_preinit_req() */
    2132           0 :         req->opcode = opcode = READ_ONCE(sqe->opcode);
    2133             :         /* same numerical values as the corresponding REQ_F_*, safe to copy */
    2134           0 :         req->flags = sqe_flags = READ_ONCE(sqe->flags);
    2135           0 :         req->cqe.user_data = READ_ONCE(sqe->user_data);
    2136           0 :         req->file = NULL;
    2137           0 :         req->rsrc_node = NULL;
    2138           0 :         req->task = current;
    2139             : 
    2140           0 :         if (unlikely(opcode >= IORING_OP_LAST)) {
    2141           0 :                 req->opcode = 0;
    2142           0 :                 return -EINVAL;
    2143             :         }
    2144           0 :         def = &io_issue_defs[opcode];
    2145           0 :         if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
    2146             :                 /* enforce forwards compatibility on users */
    2147           0 :                 if (sqe_flags & ~SQE_VALID_FLAGS)
    2148             :                         return -EINVAL;
    2149           0 :                 if (sqe_flags & IOSQE_BUFFER_SELECT) {
    2150           0 :                         if (!def->buffer_select)
    2151             :                                 return -EOPNOTSUPP;
    2152           0 :                         req->buf_index = READ_ONCE(sqe->buf_group);
    2153             :                 }
    2154           0 :                 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
    2155           0 :                         ctx->drain_disabled = true;
    2156           0 :                 if (sqe_flags & IOSQE_IO_DRAIN) {
    2157           0 :                         if (ctx->drain_disabled)
    2158             :                                 return -EOPNOTSUPP;
    2159           0 :                         io_init_req_drain(req);
    2160             :                 }
    2161             :         }
    2162           0 :         if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
    2163           0 :                 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
    2164             :                         return -EACCES;
    2165             :                 /* knock it to the slow queue path, will be drained there */
    2166           0 :                 if (ctx->drain_active)
    2167           0 :                         req->flags |= REQ_F_FORCE_ASYNC;
    2168             :                 /* if there is no link, we're at "next" request and need to drain */
    2169           0 :                 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
    2170           0 :                         ctx->drain_next = false;
    2171           0 :                         ctx->drain_active = true;
    2172           0 :                         req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
    2173             :                 }
    2174             :         }
    2175             : 
    2176           0 :         if (!def->ioprio && sqe->ioprio)
    2177             :                 return -EINVAL;
    2178           0 :         if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
    2179             :                 return -EINVAL;
    2180             : 
    2181           0 :         if (def->needs_file) {
    2182           0 :                 struct io_submit_state *state = &ctx->submit_state;
    2183             : 
    2184           0 :                 req->cqe.fd = READ_ONCE(sqe->fd);
    2185             : 
    2186             :                 /*
    2187             :                  * Plug now if we have more than 2 IOs left after this, and the
    2188             :                  * target is potentially a read/write to block-based storage.
    2189             :                  */
    2190           0 :                 if (state->need_plug && def->plug) {
    2191           0 :                         state->plug_started = true;
    2192           0 :                         state->need_plug = false;
    2193           0 :                         blk_start_plug_nr_ios(&state->plug, state->submit_nr);
    2194             :                 }
    2195             :         }
    2196             : 
    2197           0 :         personality = READ_ONCE(sqe->personality);
    2198           0 :         if (personality) {
    2199             :                 int ret;
    2200             : 
    2201           0 :                 req->creds = xa_load(&ctx->personalities, personality);
    2202           0 :                 if (!req->creds)
    2203             :                         return -EINVAL;
    2204           0 :                 get_cred(req->creds);
    2205           0 :                 ret = security_uring_override_creds(req->creds);
    2206             :                 if (ret) {
    2207             :                         put_cred(req->creds);
    2208             :                         return ret;
    2209             :                 }
    2210           0 :                 req->flags |= REQ_F_CREDS;
    2211             :         }
    2212             : 
    2213           0 :         return def->prep(req, sqe);
    2214             : }
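
The personality lookup above resolves an id that userspace obtained from IORING_REGISTER_PERSONALITY. A hedged liburing sketch (submit_as_personality is a made-up helper); the id can later be dropped again with io_uring_unregister_personality():

#include <liburing.h>

static int submit_as_personality(struct io_uring *ring, int fd,
                                 void *buf, unsigned len)
{
        /* snapshot the calling task's credentials and get an id for them */
        int id = io_uring_register_personality(ring);
        struct io_uring_sqe *sqe;

        if (id < 0)
                return id;
        sqe = io_uring_get_sqe(ring);
        io_uring_prep_read(sqe, fd, buf, len, 0);
        /* io_init_req() looks this id up in ctx->personalities and issues
         * the request with those credentials */
        sqe->personality = (__u16)id;
        return io_uring_submit(ring);
}
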
    2215             : 
    2216           0 : static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
    2217             :                                       struct io_kiocb *req, int ret)
    2218             : {
    2219           0 :         struct io_ring_ctx *ctx = req->ctx;
    2220           0 :         struct io_submit_link *link = &ctx->submit_state.link;
    2221           0 :         struct io_kiocb *head = link->head;
    2222             : 
    2223           0 :         trace_io_uring_req_failed(sqe, req, ret);
    2224             : 
    2225             :         /*
    2226             :          * Avoid breaking links in the middle as it renders links with SQPOLL
    2227             :          * unusable. Instead of failing eagerly, continue assembling the link if
    2228             :          * applicable and mark the head with REQ_F_FAIL. The link flushing code
    2229             :          * should find the flag and handle the rest.
    2230             :          */
    2231           0 :         req_fail_link_node(req, ret);
    2232           0 :         if (head && !(head->flags & REQ_F_FAIL))
    2233             :                 req_fail_link_node(head, -ECANCELED);
    2234             : 
    2235           0 :         if (!(req->flags & IO_REQ_LINK_FLAGS)) {
    2236           0 :                 if (head) {
    2237           0 :                         link->last->link = req;
    2238           0 :                         link->head = NULL;
    2239           0 :                         req = head;
    2240             :                 }
    2241           0 :                 io_queue_sqe_fallback(req);
    2242             :                 return ret;
    2243             :         }
    2244             : 
    2245           0 :         if (head)
    2246           0 :                 link->last->link = req;
    2247             :         else
    2248           0 :                 link->head = req;
    2249           0 :         link->last = req;
    2250             :         return 0;
    2251             : }
    2252             : 
    2253           0 : static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
    2254             :                          const struct io_uring_sqe *sqe)
    2255             :         __must_hold(&ctx->uring_lock)
    2256             : {
    2257           0 :         struct io_submit_link *link = &ctx->submit_state.link;
    2258             :         int ret;
    2259             : 
    2260           0 :         ret = io_init_req(ctx, req, sqe);
    2261           0 :         if (unlikely(ret))
    2262           0 :                 return io_submit_fail_init(sqe, req, ret);
    2263             : 
    2264           0 :         trace_io_uring_submit_req(req);
    2265             : 
    2266             :         /*
    2267             :          * If we already have a head request, queue this one for async
    2268             :          * submittal once the head completes. If we don't have a head but
    2269             :          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
    2270             :          * submitted sync once the chain is complete. If none of those
    2271             :          * conditions are true (normal request), then just queue it.
    2272             :          */
    2273           0 :         if (unlikely(link->head)) {
    2274           0 :                 ret = io_req_prep_async(req);
    2275           0 :                 if (unlikely(ret))
    2276           0 :                         return io_submit_fail_init(sqe, req, ret);
    2277             : 
    2278           0 :                 trace_io_uring_link(req, link->head);
    2279           0 :                 link->last->link = req;
    2280           0 :                 link->last = req;
    2281             : 
    2282           0 :                 if (req->flags & IO_REQ_LINK_FLAGS)
    2283             :                         return 0;
    2284             :                 /* last request of the link, flush it */
    2285           0 :                 req = link->head;
    2286           0 :                 link->head = NULL;
    2287           0 :                 if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
    2288             :                         goto fallback;
    2289             : 
    2290           0 :         } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
    2291             :                                           REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
    2292           0 :                 if (req->flags & IO_REQ_LINK_FLAGS) {
    2293           0 :                         link->head = req;
    2294           0 :                         link->last = req;
    2295             :                 } else {
    2296             : fallback:
    2297           0 :                         io_queue_sqe_fallback(req);
    2298             :                 }
    2299             :                 return 0;
    2300             :         }
    2301             : 
    2302           0 :         io_queue_sqe(req);
    2303           0 :         return 0;
    2304             : }
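
The link assembly above is driven by IOSQE_IO_LINK/IOSQE_IO_HARDLINK on the SQEs. A minimal liburing sketch of a two-request chain (write_then_fsync is hypothetical); if the write fails, the linked fsync completes with -ECANCELED:

#include <liburing.h>

static int write_then_fsync(struct io_uring *ring, int fd,
                            const void *buf, unsigned len)
{
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_write(sqe, fd, buf, len, 0);
        io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);     /* head of the chain */

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_fsync(sqe, fd, 0);                /* runs after the write */

        return io_uring_submit(ring);
}
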
    2305             : 
    2306             : /*
    2307             :  * Batched submission is done; ensure local IO is flushed out.
    2308             :  */
    2309           0 : static void io_submit_state_end(struct io_ring_ctx *ctx)
    2310             : {
    2311           0 :         struct io_submit_state *state = &ctx->submit_state;
    2312             : 
    2313           0 :         if (unlikely(state->link.head))
    2314           0 :                 io_queue_sqe_fallback(state->link.head);
    2315             :         /* flush only after queuing links as they can generate completions */
    2316           0 :         io_submit_flush_completions(ctx);
    2317           0 :         if (state->plug_started)
    2318           0 :                 blk_finish_plug(&state->plug);
    2319           0 : }
    2320             : 
    2321             : /*
    2322             :  * Start submission side cache.
    2323             :  */
    2324             : static void io_submit_state_start(struct io_submit_state *state,
    2325             :                                   unsigned int max_ios)
    2326             : {
    2327           0 :         state->plug_started = false;
    2328           0 :         state->need_plug = max_ios > 2;
    2329           0 :         state->submit_nr = max_ios;
    2330             :         /* set only head, no need to init link_last in advance */
    2331           0 :         state->link.head = NULL;
    2332             : }
    2333             : 
    2334             : static void io_commit_sqring(struct io_ring_ctx *ctx)
    2335             : {
    2336           0 :         struct io_rings *rings = ctx->rings;
    2337             : 
    2338             :         /*
    2339             :          * Ensure any loads from the SQEs are done at this point,
    2340             :          * since once we write the new head, the application could
    2341             :          * write new data to them.
    2342             :          */
    2343           0 :         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
    2344             : }
    2345             : 
    2346             : /*
    2347             :  * Fetch an sqe, if one is available. Note this returns a pointer to memory
    2348             :  * that is mapped by userspace. This means that care needs to be taken to
    2349             :  * ensure that reads are stable, as we cannot rely on userspace always
    2350             :  * being a good citizen. If members of the sqe are validated and then later
    2351             :  * used, it's important that those reads are done through READ_ONCE() to
    2352             :  * prevent a re-load down the line.
    2353             :  */
    2354           0 : static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
    2355             : {
    2356           0 :         unsigned head, mask = ctx->sq_entries - 1;
    2357           0 :         unsigned sq_idx = ctx->cached_sq_head++ & mask;
    2358             : 
    2359             :         /*
    2360             :          * The cached sq head (or cq tail) serves two purposes:
    2361             :          *
    2362             :          * 1) allows us to batch the cost of updating the user visible
    2363             :          *    head.
    2364             :          * 2) allows the kernel side to track the head on its own, even
    2365             :          *    though the application is the one updating it.
    2366             :          */
    2367           0 :         head = READ_ONCE(ctx->sq_array[sq_idx]);
    2368           0 :         if (likely(head < ctx->sq_entries)) {
    2369             :                 /* double index for 128-byte SQEs, twice as long */
    2370           0 :                 if (ctx->flags & IORING_SETUP_SQE128)
    2371           0 :                         head <<= 1;
    2372           0 :                 *sqe = &ctx->sq_sqes[head];
    2373           0 :                 return true;
    2374             :         }
    2375             : 
    2376             :         /* drop invalid entries */
    2377           0 :         ctx->cq_extra--;
    2378           0 :         WRITE_ONCE(ctx->rings->sq_dropped,
    2379             :                    READ_ONCE(ctx->rings->sq_dropped) + 1);
    2380           0 :         return false;
    2381             : }
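
io_get_sqe() is the kernel half of the SQ ring protocol; the userspace half fills an SQE slot, stores its index into the array, and publishes the new tail with a release store (roughly what liburing does internally). A sketch with hypothetical names (struct sq_ring, sq_push), assuming the ring was mmap()ed as described in io_uring_setup(2) and default 64-byte SQEs (no IORING_SETUP_SQE128):

#include <stdatomic.h>
#include <linux/io_uring.h>

struct sq_ring {
        _Atomic unsigned *khead;        /* written by the kernel */
        _Atomic unsigned *ktail;        /* written by the application */
        unsigned *karray;               /* SQE index array */
        unsigned ring_mask;
        struct io_uring_sqe *sqes;
};

static int sq_push(struct sq_ring *sq, const struct io_uring_sqe *sqe)
{
        unsigned tail = atomic_load_explicit(sq->ktail, memory_order_relaxed);
        /* acquire pairs with the kernel's release store in io_commit_sqring(),
         * so a consumed slot is really free before we overwrite it */
        unsigned head = atomic_load_explicit(sq->khead, memory_order_acquire);

        if (tail - head > sq->ring_mask)
                return -1;                              /* ring is full */

        sq->sqes[tail & sq->ring_mask] = *sqe;          /* fill the slot */
        sq->karray[tail & sq->ring_mask] = tail & sq->ring_mask;
        /* release: SQE and index stores become visible before the new tail,
         * pairing with the kernel's acquire load of the tail */
        atomic_store_explicit(sq->ktail, tail + 1, memory_order_release);
        return 0;
}
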
    2382             : 
    2383           0 : int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
    2384             :         __must_hold(&ctx->uring_lock)
    2385             : {
    2386           0 :         unsigned int entries = io_sqring_entries(ctx);
    2387             :         unsigned int left;
    2388             :         int ret;
    2389             : 
    2390           0 :         if (unlikely(!entries))
    2391             :                 return 0;
    2392             :         /* make sure SQ entry isn't read before tail */
    2393           0 :         ret = left = min(nr, entries);
    2394           0 :         io_get_task_refs(left);
    2395           0 :         io_submit_state_start(&ctx->submit_state, left);
    2396             : 
    2397             :         do {
    2398             :                 const struct io_uring_sqe *sqe;
    2399             :                 struct io_kiocb *req;
    2400             : 
    2401           0 :                 if (unlikely(!io_alloc_req(ctx, &req)))
    2402             :                         break;
    2403           0 :                 if (unlikely(!io_get_sqe(ctx, &sqe))) {
    2404           0 :                         io_req_add_to_cache(req, ctx);
    2405             :                         break;
    2406             :                 }
    2407             : 
    2408             :                 /*
    2409             :                  * Continue submitting even for sqe failure if the
    2410             :                  * ring was set up with IORING_SETUP_SUBMIT_ALL
    2411             :                  */
    2412           0 :                 if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
    2413           0 :                     !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
    2414           0 :                         left--;
    2415           0 :                         break;
    2416             :                 }
    2417           0 :         } while (--left);
    2418             : 
    2419           0 :         if (unlikely(left)) {
    2420           0 :                 ret -= left;
    2421             :                 /* try again if it submitted nothing and can't allocate a req */
    2422           0 :                 if (!ret && io_req_cache_empty(ctx))
    2423           0 :                         ret = -EAGAIN;
    2424           0 :                 current->io_uring->cached_refs += left;
    2425             :         }
    2426             : 
    2427           0 :         io_submit_state_end(ctx);
    2428             :         /* Commit SQ ring head once we've consumed and submitted all SQEs */
    2429           0 :         io_commit_sqring(ctx);
    2430           0 :         return ret;
    2431             : }
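
io_submit_sqes() runs when userspace calls io_uring_enter(2) with a non-zero to_submit (unless the SQ poll thread is submitting on the application's behalf). A raw-syscall sketch (enter_submit is a hypothetical wrapper):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int enter_submit(int ring_fd, unsigned to_submit, unsigned min_complete)
{
        /* consumes up to to_submit SQEs via io_submit_sqes(); with
         * IORING_ENTER_GETEVENTS it also waits for min_complete CQEs */
        return syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
                       IORING_ENTER_GETEVENTS, NULL, 0);
}
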
    2432             : 
    2433             : struct io_wait_queue {
    2434             :         struct wait_queue_entry wq;
    2435             :         struct io_ring_ctx *ctx;
    2436             :         unsigned cq_tail;
    2437             :         unsigned nr_timeouts;
    2438             :         ktime_t timeout;
    2439             : };
    2440             : 
    2441             : static inline bool io_has_work(struct io_ring_ctx *ctx)
    2442             : {
    2443           0 :         return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
    2444           0 :                !llist_empty(&ctx->work_llist);
    2445             : }
    2446             : 
    2447             : static inline bool io_should_wake(struct io_wait_queue *iowq)
    2448             : {
    2449           0 :         struct io_ring_ctx *ctx = iowq->ctx;
    2450           0 :         int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
    2451             : 
    2452             :         /*
    2453             :          * Wake up if we have enough events, or if a timeout occurred since we
    2454             :          * started waiting. For timeouts, we always want to return to userspace,
    2455             :          * regardless of event count.
    2456             :          */
    2457           0 :         return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
    2458             : }
    2459             : 
    2460           0 : static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
    2461             :                             int wake_flags, void *key)
    2462             : {
    2463           0 :         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);
    2464             : 
    2465             :         /*
    2466             :          * Cannot safely flush overflowed CQEs from here; ensure we wake up
    2467             :          * the task, and the next invocation will do it.
    2468             :          */
    2469           0 :         if (io_should_wake(iowq) || io_has_work(iowq->ctx))
    2470           0 :                 return autoremove_wake_function(curr, mode, wake_flags, key);
    2471             :         return -1;
    2472             : }
    2473             : 
    2474           0 : int io_run_task_work_sig(struct io_ring_ctx *ctx)
    2475             : {
    2476           0 :         if (!llist_empty(&ctx->work_llist)) {
    2477           0 :                 __set_current_state(TASK_RUNNING);
    2478           0 :                 if (io_run_local_work(ctx) > 0)
    2479             :                         return 1;
    2480             :         }
    2481           0 :         if (io_run_task_work() > 0)
    2482             :                 return 1;
    2483           0 :         if (task_sigpending(current))
    2484             :                 return -EINTR;
    2485           0 :         return 0;
    2486             : }
    2487             : 
    2488             : /* when returns >0, the caller should retry */
    2489           0 : static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
    2490             :                                           struct io_wait_queue *iowq)
    2491             : {
    2492             :         int token, ret;
    2493             : 
    2494           0 :         if (unlikely(READ_ONCE(ctx->check_cq)))
    2495             :                 return 1;
    2496           0 :         if (unlikely(!llist_empty(&ctx->work_llist)))
    2497             :                 return 1;
    2498           0 :         if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
    2499             :                 return 1;
    2500           0 :         if (unlikely(task_sigpending(current)))
    2501             :                 return -EINTR;
    2502           0 :         if (unlikely(io_should_wake(iowq)))
    2503             :                 return 0;
    2504             : 
    2505             :         /*
    2506             :          * Use io_schedule_prepare/finish, so cpufreq can take into account
    2507             :          * that the task is waiting for IO - turns out to be important for low
    2508             :          * QD IO.
    2509             :          */
    2510           0 :         token = io_schedule_prepare();
    2511           0 :         ret = 0;
    2512           0 :         if (iowq->timeout == KTIME_MAX)
    2513           0 :                 schedule();
    2514           0 :         else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
    2515           0 :                 ret = -ETIME;
    2516           0 :         io_schedule_finish(token);
    2517           0 :         return ret;
    2518             : }
    2519             : 
    2520             : /*
    2521             :  * Wait until events become available, if we don't already have some. The
    2522             :  * application must reap them itself, as they reside on the shared cq ring.
    2523             :  */
    2524           0 : static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
    2525             :                           const sigset_t __user *sig, size_t sigsz,
    2526             :                           struct __kernel_timespec __user *uts)
    2527             : {
    2528             :         struct io_wait_queue iowq;
    2529           0 :         struct io_rings *rings = ctx->rings;
    2530             :         int ret;
    2531             : 
    2532           0 :         if (!io_allowed_run_tw(ctx))
    2533             :                 return -EEXIST;
    2534           0 :         if (!llist_empty(&ctx->work_llist))
    2535           0 :                 io_run_local_work(ctx);
    2536           0 :         io_run_task_work();
    2537           0 :         io_cqring_overflow_flush(ctx);
    2538             :         /* if user messes with these they will just get an early return */
    2539           0 :         if (__io_cqring_events_user(ctx) >= min_events)
    2540             :                 return 0;
    2541             : 
    2542           0 :         if (sig) {
    2543             : #ifdef CONFIG_COMPAT
    2544             :                 if (in_compat_syscall())
    2545             :                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
    2546             :                                                       sigsz);
    2547             :                 else
    2548             : #endif
    2549           0 :                         ret = set_user_sigmask(sig, sigsz);
    2550             : 
    2551           0 :                 if (ret)
    2552             :                         return ret;
    2553             :         }
    2554             : 
    2555           0 :         init_waitqueue_func_entry(&iowq.wq, io_wake_function);
    2556           0 :         iowq.wq.private = current;
    2557           0 :         INIT_LIST_HEAD(&iowq.wq.entry);
    2558           0 :         iowq.ctx = ctx;
    2559           0 :         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
    2560           0 :         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
    2561           0 :         iowq.timeout = KTIME_MAX;
    2562             : 
    2563           0 :         if (uts) {
    2564             :                 struct timespec64 ts;
    2565             : 
    2566           0 :                 if (get_timespec64(&ts, uts))
    2567           0 :                         return -EFAULT;
    2568           0 :                 iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
    2569             :         }
    2570             : 
    2571             :         trace_io_uring_cqring_wait(ctx, min_events);
    2572           0 :         do {
    2573             :                 unsigned long check_cq;
    2574             : 
    2575           0 :                 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
    2576           0 :                         int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
    2577             : 
    2578           0 :                         atomic_set(&ctx->cq_wait_nr, nr_wait);
    2579           0 :                         set_current_state(TASK_INTERRUPTIBLE);
    2580             :                 } else {
    2581           0 :                         prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
    2582             :                                                         TASK_INTERRUPTIBLE);
    2583             :                 }
    2584             : 
    2585           0 :                 ret = io_cqring_wait_schedule(ctx, &iowq);
    2586           0 :                 __set_current_state(TASK_RUNNING);
    2587           0 :                 atomic_set(&ctx->cq_wait_nr, 0);
    2588             : 
    2589           0 :                 if (ret < 0)
    2590             :                         break;
    2591             :                 /*
    2592             :                  * Run task_work after scheduling and before io_should_wake().
    2593             :                  * If we got woken because of task_work being processed, run it
    2594             :                  * now rather than let the caller do another wait loop.
    2595             :                  */
    2596           0 :                 io_run_task_work();
    2597           0 :                 if (!llist_empty(&ctx->work_llist))
    2598           0 :                         io_run_local_work(ctx);
    2599             : 
    2600           0 :                 check_cq = READ_ONCE(ctx->check_cq);
    2601           0 :                 if (unlikely(check_cq)) {
    2602             :                         /* let the caller flush overflows, retry */
    2603           0 :                         if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
    2604           0 :                                 io_cqring_do_overflow_flush(ctx);
    2605           0 :                         if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
    2606             :                                 ret = -EBADR;
    2607             :                                 break;
    2608             :                         }
    2609             :                 }
    2610             : 
    2611           0 :                 if (io_should_wake(&iowq)) {
    2612             :                         ret = 0;
    2613             :                         break;
    2614             :                 }
    2615           0 :                 cond_resched();
    2616             :         } while (1);
    2617             : 
    2618           0 :         if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
    2619           0 :                 finish_wait(&ctx->cq_wait, &iowq.wq);
    2620           0 :         restore_saved_sigmask_unless(ret == -EINTR);
    2621             : 
    2622           0 :         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
    2623             : }
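
From userspace this wait is reached through io_uring_enter(2) with IORING_ENTER_GETEVENTS, or more conveniently through liburing. A hedged sketch (wait_one_cqe is a made-up helper) that also exercises the timeout path above (-ETIME on expiry):

#include <stdio.h>
#include <liburing.h>

static int wait_one_cqe(struct io_uring *ring)
{
        struct __kernel_timespec ts = { .tv_sec = 2 };
        struct io_uring_cqe *cqe;
        int ret;

        /* blocks in io_cqring_wait() until a CQE arrives, a signal is
         * pending (-EINTR), or the 2s timeout elapses (-ETIME) */
        ret = io_uring_wait_cqe_timeout(ring, &cqe, &ts);
        if (ret < 0)
                return ret;
        printf("user_data=%llu res=%d\n",
               (unsigned long long)cqe->user_data, cqe->res);
        io_uring_cqe_seen(ring, cqe);   /* advance the CQ head */
        return 0;
}
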
    2624             : 
    2625           0 : static void io_mem_free(void *ptr)
    2626             : {
    2627             :         struct page *page;
    2628             : 
    2629           0 :         if (!ptr)
    2630             :                 return;
    2631             : 
    2632           0 :         page = virt_to_head_page(ptr);
    2633           0 :         if (put_page_testzero(page))
    2634           0 :                 free_compound_page(page);
    2635             : }
    2636             : 
    2637           0 : static void io_pages_free(struct page ***pages, int npages)
    2638             : {
    2639             :         struct page **page_array;
    2640             :         int i;
    2641             : 
    2642           0 :         if (!pages)
    2643             :                 return;
    2644           0 :         page_array = *pages;
    2645           0 :         for (i = 0; i < npages; i++)
    2646           0 :                 unpin_user_page(page_array[i]);
    2647           0 :         kvfree(page_array);
    2648           0 :         *pages = NULL;
    2649             : }
    2650             : 
    2651           0 : static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
    2652             :                             unsigned long uaddr, size_t size)
    2653             : {
    2654             :         struct page **page_array;
    2655             :         unsigned int nr_pages;
    2656             :         int ret;
    2657             : 
    2658           0 :         *npages = 0;
    2659             : 
    2660           0 :         if (uaddr & (PAGE_SIZE - 1) || !size)
    2661             :                 return ERR_PTR(-EINVAL);
    2662             : 
    2663           0 :         nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
    2664           0 :         if (nr_pages > USHRT_MAX)
    2665             :                 return ERR_PTR(-EINVAL);
    2666           0 :         page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
    2667           0 :         if (!page_array)
    2668             :                 return ERR_PTR(-ENOMEM);
    2669             : 
    2670           0 :         ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
    2671             :                                         page_array);
    2672           0 :         if (ret != nr_pages) {
    2673             : err:
    2674           0 :                 io_pages_free(&page_array, ret > 0 ? ret : 0);
    2675           0 :                 return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
    2676             :         }
    2677             :         /*
    2678             :          * Should be a single page. If the ring is small enough that we can
    2679             :          * use a normal page, that is fine. If we need multiple pages, then
    2680             :          * userspace should use a huge page. That's the only way to guarantee
    2681             :          * that we get contigious memory, outside of just being lucky or
    2682             :          * (currently) having low memory fragmentation.
    2683             :          */
    2684           0 :         if (page_array[0] != page_array[ret - 1])
    2685             :                 goto err;
    2686           0 :         *pages = page_array;
    2687           0 :         *npages = nr_pages;
    2688           0 :         return page_to_virt(page_array[0]);
    2689             : }
    2690             : 
    2691             : static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
    2692             :                           size_t size)
    2693             : {
    2694           0 :         return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
    2695             :                                 size);
    2696             : }
    2697             : 
    2698             : static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
    2699             :                          size_t size)
    2700             : {
    2701           0 :         return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
    2702             :                                 size);
    2703             : }
    2704             : 
    2705           0 : static void io_rings_free(struct io_ring_ctx *ctx)
    2706             : {
    2707           0 :         if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
    2708           0 :                 io_mem_free(ctx->rings);
    2709           0 :                 io_mem_free(ctx->sq_sqes);
    2710           0 :                 ctx->rings = NULL;
    2711           0 :                 ctx->sq_sqes = NULL;
    2712             :         } else {
    2713           0 :                 io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
    2714           0 :                 io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
    2715             :         }
    2716           0 : }
    2717             : 
    2718           0 : static void *io_mem_alloc(size_t size)
    2719             : {
    2720           0 :         gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
    2721             :         void *ret;
    2722             : 
    2723           0 :         ret = (void *) __get_free_pages(gfp, get_order(size));
    2724           0 :         if (ret)
    2725             :                 return ret;
    2726           0 :         return ERR_PTR(-ENOMEM);
    2727             : }
    2728             : 
    2729           0 : static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
    2730             :                                 unsigned int cq_entries, size_t *sq_offset)
    2731             : {
    2732             :         struct io_rings *rings;
    2733             :         size_t off, sq_array_size;
    2734             : 
    2735           0 :         off = struct_size(rings, cqes, cq_entries);
    2736           0 :         if (off == SIZE_MAX)
    2737             :                 return SIZE_MAX;
    2738           0 :         if (ctx->flags & IORING_SETUP_CQE32) {
    2739           0 :                 if (check_shl_overflow(off, 1, &off))
    2740             :                         return SIZE_MAX;
    2741             :         }
    2742             : 
    2743             : #ifdef CONFIG_SMP
    2744             :         off = ALIGN(off, SMP_CACHE_BYTES);
    2745             :         if (off == 0)
    2746             :                 return SIZE_MAX;
    2747             : #endif
    2748             : 
    2749           0 :         if (sq_offset)
    2750           0 :                 *sq_offset = off;
    2751             : 
    2752           0 :         sq_array_size = array_size(sizeof(u32), sq_entries);
    2753           0 :         if (sq_array_size == SIZE_MAX)
    2754             :                 return SIZE_MAX;
    2755             : 
    2756           0 :         if (check_add_overflow(off, sq_array_size, &off))
    2757             :                 return SIZE_MAX;
    2758             : 
    2759           0 :         return off;
    2760             : }
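
rings_size() determines how much memory backs the shared rings; userspace sees the result through the offsets and entry counts returned in struct io_uring_params and sizes its mmap() accordingly. A sketch assuming the IORING_FEAT_SINGLE_MMAP feature and default 16-byte CQEs, i.e. no IORING_SETUP_CQE32 (map_rings is a hypothetical helper):

#include <sys/mman.h>
#include <linux/io_uring.h>

static void *map_rings(int ring_fd, const struct io_uring_params *p)
{
        size_t sq_bytes = p->sq_off.array + p->sq_entries * sizeof(__u32);
        size_t cq_bytes = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
        size_t bytes = sq_bytes > cq_bytes ? sq_bytes : cq_bytes;

        /* one mapping covers both rings when IORING_FEAT_SINGLE_MMAP is set */
        return mmap(NULL, bytes, PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
}
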
    2761             : 
    2762           0 : static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
    2763             :                                unsigned int eventfd_async)
    2764             : {
    2765             :         struct io_ev_fd *ev_fd;
    2766           0 :         __s32 __user *fds = arg;
    2767             :         int fd;
    2768             : 
    2769           0 :         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
    2770             :                                         lockdep_is_held(&ctx->uring_lock));
    2771           0 :         if (ev_fd)
    2772             :                 return -EBUSY;
    2773             : 
    2774           0 :         if (copy_from_user(&fd, fds, sizeof(*fds)))
    2775             :                 return -EFAULT;
    2776             : 
    2777           0 :         ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
    2778           0 :         if (!ev_fd)
    2779             :                 return -ENOMEM;
    2780             : 
    2781           0 :         ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
    2782           0 :         if (IS_ERR(ev_fd->cq_ev_fd)) {
    2783           0 :                 int ret = PTR_ERR(ev_fd->cq_ev_fd);
    2784           0 :                 kfree(ev_fd);
    2785           0 :                 return ret;
    2786             :         }
    2787             : 
    2788           0 :         spin_lock(&ctx->completion_lock);
    2789           0 :         ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
    2790           0 :         spin_unlock(&ctx->completion_lock);
    2791             : 
    2792           0 :         ev_fd->eventfd_async = eventfd_async;
    2793           0 :         ctx->has_evfd = true;
    2794           0 :         rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
    2795           0 :         atomic_set(&ev_fd->refs, 1);
    2796           0 :         atomic_set(&ev_fd->ops, 0);
    2797           0 :         return 0;
    2798             : }
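
Userspace reaches io_eventfd_register() through IORING_REGISTER_EVENTFD (or IORING_REGISTER_EVENTFD_ASYNC). A small liburing sketch (attach_eventfd is a made-up helper):

#include <sys/eventfd.h>
#include <unistd.h>
#include <liburing.h>

static int attach_eventfd(struct io_uring *ring)
{
        int efd = eventfd(0, EFD_CLOEXEC);

        if (efd < 0)
                return -1;
        /* completions posted to the ring now signal efd, using the state
         * set up by io_eventfd_register() above */
        if (io_uring_register_eventfd(ring, efd) < 0) {
                close(efd);
                return -1;
        }
        return efd;
}
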
    2799             : 
    2800           0 : static int io_eventfd_unregister(struct io_ring_ctx *ctx)
    2801             : {
    2802             :         struct io_ev_fd *ev_fd;
    2803             : 
    2804           0 :         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
    2805             :                                         lockdep_is_held(&ctx->uring_lock));
    2806           0 :         if (ev_fd) {
    2807           0 :                 ctx->has_evfd = false;
    2808           0 :                 rcu_assign_pointer(ctx->io_ev_fd, NULL);
    2809           0 :                 if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
    2810           0 :                         call_rcu(&ev_fd->rcu, io_eventfd_ops);
    2811             :                 return 0;
    2812             :         }
    2813             : 
    2814             :         return -ENXIO;
    2815             : }
    2816             : 
    2817           0 : static void io_req_caches_free(struct io_ring_ctx *ctx)
    2818             : {
    2819             :         struct io_kiocb *req;
    2820           0 :         int nr = 0;
    2821             : 
    2822           0 :         mutex_lock(&ctx->uring_lock);
    2823           0 :         io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
    2824             : 
    2825           0 :         while (!io_req_cache_empty(ctx)) {
    2826           0 :                 req = io_extract_req(ctx);
    2827           0 :                 kmem_cache_free(req_cachep, req);
    2828           0 :                 nr++;
    2829             :         }
    2830           0 :         if (nr)
    2831           0 :                 percpu_ref_put_many(&ctx->refs, nr);
    2832           0 :         mutex_unlock(&ctx->uring_lock);
    2833           0 : }
    2834             : 
    2835           0 : static void io_rsrc_node_cache_free(struct io_cache_entry *entry)
    2836             : {
    2837           0 :         kfree(container_of(entry, struct io_rsrc_node, cache));
    2838           0 : }
    2839             : 
    2840           0 : static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
    2841             : {
    2842           0 :         io_sq_thread_finish(ctx);
    2843             :         /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
    2844           0 :         if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
    2845             :                 return;
    2846             : 
    2847           0 :         mutex_lock(&ctx->uring_lock);
    2848           0 :         if (ctx->buf_data)
    2849           0 :                 __io_sqe_buffers_unregister(ctx);
    2850           0 :         if (ctx->file_data)
    2851           0 :                 __io_sqe_files_unregister(ctx);
    2852           0 :         io_cqring_overflow_kill(ctx);
    2853           0 :         io_eventfd_unregister(ctx);
    2854           0 :         io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
    2855           0 :         io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
    2856           0 :         io_destroy_buffers(ctx);
    2857           0 :         mutex_unlock(&ctx->uring_lock);
    2858           0 :         if (ctx->sq_creds)
    2859           0 :                 put_cred(ctx->sq_creds);
    2860           0 :         if (ctx->submitter_task)
    2861           0 :                 put_task_struct(ctx->submitter_task);
    2862             : 
    2863             :         /* there are no registered resources left, nobody uses it */
    2864           0 :         if (ctx->rsrc_node)
    2865           0 :                 io_rsrc_node_destroy(ctx, ctx->rsrc_node);
    2866             : 
    2867           0 :         WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
    2868             : 
    2869             : #if defined(CONFIG_UNIX)
    2870             :         if (ctx->ring_sock) {
    2871             :                 ctx->ring_sock->file = NULL; /* so that iput() is called */
    2872             :                 sock_release(ctx->ring_sock);
    2873             :         }
    2874             : #endif
    2875           0 :         WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
    2876             : 
    2877           0 :         io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
    2878           0 :         if (ctx->mm_account) {
    2879           0 :                 mmdrop(ctx->mm_account);
    2880           0 :                 ctx->mm_account = NULL;
    2881             :         }
    2882           0 :         io_rings_free(ctx);
    2883             : 
    2884           0 :         percpu_ref_exit(&ctx->refs);
    2885           0 :         free_uid(ctx->user);
    2886           0 :         io_req_caches_free(ctx);
    2887           0 :         if (ctx->hash_map)
    2888           0 :                 io_wq_put_hash(ctx->hash_map);
    2889           0 :         kfree(ctx->cancel_table.hbs);
    2890           0 :         kfree(ctx->cancel_table_locked.hbs);
    2891           0 :         kfree(ctx->dummy_ubuf);
    2892           0 :         kfree(ctx->io_bl);
    2893           0 :         xa_destroy(&ctx->io_bl_xa);
    2894           0 :         kfree(ctx);
    2895             : }
    2896             : 
    2897           0 : static __cold void io_activate_pollwq_cb(struct callback_head *cb)
    2898             : {
    2899           0 :         struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
    2900             :                                                poll_wq_task_work);
    2901             : 
    2902           0 :         mutex_lock(&ctx->uring_lock);
    2903           0 :         ctx->poll_activated = true;
    2904           0 :         mutex_unlock(&ctx->uring_lock);
    2905             : 
    2906             :         /*
    2907             :          * might have been lost due to loose synchronisation.
    2908             :          * might've been lost due to loose synchronisation.
    2909             :          */
    2910           0 :         wake_up_all(&ctx->poll_wq);
    2911           0 :         percpu_ref_put(&ctx->refs);
    2912           0 : }
    2913             : 
    2914           0 : static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
    2915             : {
    2916           0 :         spin_lock(&ctx->completion_lock);
    2917             :         /* already activated or in progress */
    2918           0 :         if (ctx->poll_activated || ctx->poll_wq_task_work.func)
    2919             :                 goto out;
    2920           0 :         if (WARN_ON_ONCE(!ctx->task_complete))
    2921             :                 goto out;
    2922           0 :         if (!ctx->submitter_task)
    2923             :                 goto out;
    2924             :         /*
    2925             :          * With ->submitter_task, only the submitter task completes requests, so
    2926             :          * we only need to sync with it, which is done by injecting a task_work.
    2927             :          */
    2928           0 :         init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
    2929           0 :         percpu_ref_get(&ctx->refs);
    2930           0 :         if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
    2931           0 :                 percpu_ref_put(&ctx->refs);
    2932             : out:
    2933           0 :         spin_unlock(&ctx->completion_lock);
    2934           0 : }
    2935             : 
    2936           0 : static __poll_t io_uring_poll(struct file *file, poll_table *wait)
    2937             : {
    2938           0 :         struct io_ring_ctx *ctx = file->private_data;
    2939           0 :         __poll_t mask = 0;
    2940             : 
    2941           0 :         if (unlikely(!ctx->poll_activated))
    2942           0 :                 io_activate_pollwq(ctx);
    2943             : 
    2944           0 :         poll_wait(file, &ctx->poll_wq, wait);
    2945             :         /*
    2946             :          * synchronizes with barrier from wq_has_sleeper call in
    2947             :          * io_commit_cqring
    2948             :          */
    2949           0 :         smp_rmb();
    2950           0 :         if (!io_sqring_full(ctx))
    2951           0 :                 mask |= EPOLLOUT | EPOLLWRNORM;
    2952             : 
    2953             :         /*
    2954             :          * Don't flush cqring overflow list here, just do a simple check.
    2955             :          * Otherwise there could possibly be an ABBA deadlock:
    2956             :          *      CPU0                    CPU1
    2957             :          *      ----                    ----
    2958             :          * lock(&ctx->uring_lock);
    2959             :          *                              lock(&ep->mtx);
    2960             :          *                              lock(&ctx->uring_lock);
    2961             :          * lock(&ep->mtx);
    2962             :          *
    2963             :          * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
    2964             :          * pushes them to do the flush.
    2965             :          */
    2966             : 
    2967           0 :         if (__io_cqring_events_user(ctx) || io_has_work(ctx))
    2968           0 :                 mask |= EPOLLIN | EPOLLRDNORM;
    2969             : 
    2970           0 :         return mask;
    2971             : }
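
Because the ring fd implements ->poll via io_uring_poll(), it can be dropped into poll(2)/epoll(7) loops. A minimal sketch (wait_for_cq_readable is hypothetical); per the comment above, EPOLLIN may be reported while the visible CQ ring is still empty, so a reaping attempt (e.g. io_uring_enter(2)) should follow to flush overflows:

#include <poll.h>

static int wait_for_cq_readable(int ring_fd, int timeout_ms)
{
        struct pollfd pfd = { .fd = ring_fd, .events = POLLIN };

        /* POLLIN: completions (or overflow) pending; POLLOUT would mean
         * there is room in the SQ ring */
        return poll(&pfd, 1, timeout_ms);
}
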
    2972             : 
    2973           0 : static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
    2974             : {
    2975             :         const struct cred *creds;
    2976             : 
    2977           0 :         creds = xa_erase(&ctx->personalities, id);
    2978           0 :         if (creds) {
    2979             :                 put_cred(creds);
    2980             :                 return 0;
    2981             :         }
    2982             : 
    2983             :         return -EINVAL;
    2984             : }
    2985             : 
    2986             : struct io_tctx_exit {
    2987             :         struct callback_head            task_work;
    2988             :         struct completion               completion;
    2989             :         struct io_ring_ctx              *ctx;
    2990             : };
    2991             : 
    2992           0 : static __cold void io_tctx_exit_cb(struct callback_head *cb)
    2993             : {
    2994           0 :         struct io_uring_task *tctx = current->io_uring;
    2995             :         struct io_tctx_exit *work;
    2996             : 
    2997           0 :         work = container_of(cb, struct io_tctx_exit, task_work);
    2998             :         /*
    2999             :          * When @in_cancel, we're in cancellation and it's racy to remove the
    3000             :          * node. It'll be removed by the end of cancellation; just ignore it.
    3001             :          * tctx can be NULL if the queueing of this task_work raced with
    3002             :          * work cancellation off the exec path.
    3003             :          */
    3004           0 :         if (tctx && !atomic_read(&tctx->in_cancel))
    3005           0 :                 io_uring_del_tctx_node((unsigned long)work->ctx);
    3006           0 :         complete(&work->completion);
    3007           0 : }
    3008             : 
    3009           0 : static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
    3010             : {
    3011           0 :         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
    3012             : 
    3013           0 :         return req->ctx == data;
    3014             : }
    3015             : 
    3016           0 : static __cold void io_ring_exit_work(struct work_struct *work)
    3017             : {
    3018           0 :         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
    3019           0 :         unsigned long timeout = jiffies + HZ * 60 * 5;
    3020           0 :         unsigned long interval = HZ / 20;
    3021             :         struct io_tctx_exit exit;
    3022             :         struct io_tctx_node *node;
    3023             :         int ret;
    3024             : 
    3025             :         /*
    3026             :          * If we're doing polled IO and end up with requests being
    3027             :          * submitted async (out-of-line), then completions can come in while
    3028             :          * we're waiting for refs to drop. We need to reap these manually,
    3029             :          * as nobody else will be looking for them.
    3030             :          */
    3031             :         do {
    3032           0 :                 if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
    3033           0 :                         mutex_lock(&ctx->uring_lock);
    3034           0 :                         io_cqring_overflow_kill(ctx);
    3035           0 :                         mutex_unlock(&ctx->uring_lock);
    3036             :                 }
    3037             : 
    3038           0 :                 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
    3039           0 :                         io_move_task_work_from_local(ctx);
    3040             : 
    3041           0 :                 while (io_uring_try_cancel_requests(ctx, NULL, true))
    3042           0 :                         cond_resched();
    3043             : 
    3044           0 :                 if (ctx->sq_data) {
    3045           0 :                         struct io_sq_data *sqd = ctx->sq_data;
    3046             :                         struct task_struct *tsk;
    3047             : 
    3048           0 :                         io_sq_thread_park(sqd);
    3049           0 :                         tsk = sqd->thread;
    3050           0 :                         if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
    3051           0 :                                 io_wq_cancel_cb(tsk->io_uring->io_wq,
    3052             :                                                 io_cancel_ctx_cb, ctx, true);
    3053           0 :                         io_sq_thread_unpark(sqd);
    3054             :                 }
    3055             : 
    3056           0 :                 io_req_caches_free(ctx);
    3057             : 
    3058           0 :                 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
    3059             :                         /* there is little hope left, don't run it too often */
    3060           0 :                         interval = HZ * 60;
    3061             :                 }
    3062             :                 /*
    3063             :          * This is really an uninterruptible wait, as it has to run to
    3064             :          * completion. But it's also run from a kworker, which doesn't
    3065             :                  * take signals, so it's fine to make it interruptible. This
    3066             :                  * avoids scenarios where we knowingly can wait much longer
    3067             :                  * on completions, for example if someone does a SIGSTOP on
    3068             :                  * a task that needs to finish task_work to make this loop
    3069             :                  * complete. That's a synthetic situation that should not
    3070             :                  * cause a stuck task backtrace, and hence a potential panic
    3071             :                  * on stuck tasks if that is enabled.
    3072             :                  */
    3073           0 :         } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
    3074             : 
    3075           0 :         init_completion(&exit.completion);
    3076           0 :         init_task_work(&exit.task_work, io_tctx_exit_cb);
    3077           0 :         exit.ctx = ctx;
    3078             :         /*
    3079             :          * Some may use the context even when all refs and requests have been put,
    3080             :          * and they are free to do so while still holding uring_lock or
    3081             :          * completion_lock; see io_req_task_submit(). Apart from other work,
    3082             :          * this lock/unlock section also waits for them to finish.
    3083             :          */
    3084           0 :         mutex_lock(&ctx->uring_lock);
    3085           0 :         while (!list_empty(&ctx->tctx_list)) {
    3086           0 :                 WARN_ON_ONCE(time_after(jiffies, timeout));
    3087             : 
    3088           0 :                 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
    3089             :                                         ctx_node);
    3090             :                 /* don't spin on a single task if cancellation failed */
    3091           0 :                 list_rotate_left(&ctx->tctx_list);
    3092           0 :                 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
    3093           0 :                 if (WARN_ON_ONCE(ret))
    3094           0 :                         continue;
    3095             : 
    3096           0 :                 mutex_unlock(&ctx->uring_lock);
    3097             :                 /*
    3098             :                  * See comment above for
    3099             :                  * wait_for_completion_interruptible_timeout() on why this
    3100             :                  * wait is marked as interruptible.
    3101             :                  */
    3102           0 :                 wait_for_completion_interruptible(&exit.completion);
    3103           0 :                 mutex_lock(&ctx->uring_lock);
    3104             :         }
    3105           0 :         mutex_unlock(&ctx->uring_lock);
    3106           0 :         spin_lock(&ctx->completion_lock);
    3107           0 :         spin_unlock(&ctx->completion_lock);
    3108             : 
    3109             :         /* pairs with RCU read section in io_req_local_work_add() */
    3110           0 :         if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
    3111           0 :                 synchronize_rcu();
    3112             : 
    3113           0 :         io_ring_ctx_free(ctx);
    3114           0 : }
    3115             : 
    3116           0 : static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
    3117             : {
    3118             :         unsigned long index;
    3119             :         const struct cred *creds;
    3120             : 
    3121           0 :         mutex_lock(&ctx->uring_lock);
    3122           0 :         percpu_ref_kill(&ctx->refs);
    3123           0 :         xa_for_each(&ctx->personalities, index, creds)
    3124           0 :                 io_unregister_personality(ctx, index);
    3125           0 :         if (ctx->rings)
    3126           0 :                 io_poll_remove_all(ctx, NULL, true);
    3127           0 :         mutex_unlock(&ctx->uring_lock);
    3128             : 
    3129             :         /*
    3130             :          * If we failed setting up the ctx, we might not have any rings
    3131             :          * and therefore did not submit any requests
    3132             :          */
    3133           0 :         if (ctx->rings)
    3134           0 :                 io_kill_timeouts(ctx, NULL, true);
    3135             : 
    3136           0 :         flush_delayed_work(&ctx->fallback_work);
    3137             : 
    3138           0 :         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
    3139             :         /*
    3140             :          * Use system_unbound_wq to avoid spawning tons of event kworkers
    3141             :          * if we're exiting a ton of rings at the same time. It just adds
    3142             :          * noise and overhead; there's no discernible change in runtime
    3143             :          * over using system_wq.
    3144             :          */
    3145           0 :         queue_work(system_unbound_wq, &ctx->exit_work);
    3146           0 : }
    3147             : 
    3148           0 : static int io_uring_release(struct inode *inode, struct file *file)
    3149             : {
    3150           0 :         struct io_ring_ctx *ctx = file->private_data;
    3151             : 
    3152           0 :         file->private_data = NULL;
    3153           0 :         io_ring_ctx_wait_and_kill(ctx);
    3154           0 :         return 0;
    3155             : }
    3156             : 
    3157             : struct io_task_cancel {
    3158             :         struct task_struct *task;
    3159             :         bool all;
    3160             : };
    3161             : 
    3162           0 : static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
    3163             : {
    3164           0 :         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
    3165           0 :         struct io_task_cancel *cancel = data;
    3166             : 
    3167           0 :         return io_match_task_safe(req, cancel->task, cancel->all);
    3168             : }
    3169             : 
    3170           0 : static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
    3171             :                                          struct task_struct *task,
    3172             :                                          bool cancel_all)
    3173             : {
    3174             :         struct io_defer_entry *de;
    3175           0 :         LIST_HEAD(list);
    3176             : 
    3177           0 :         spin_lock(&ctx->completion_lock);
    3178           0 :         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
    3179           0 :                 if (io_match_task_safe(de->req, task, cancel_all)) {
    3180           0 :                         list_cut_position(&list, &ctx->defer_list, &de->list);
    3181           0 :                         break;
    3182             :                 }
    3183             :         }
    3184           0 :         spin_unlock(&ctx->completion_lock);
    3185           0 :         if (list_empty(&list))
    3186             :                 return false;
    3187             : 
    3188           0 :         while (!list_empty(&list)) {
    3189           0 :                 de = list_first_entry(&list, struct io_defer_entry, list);
    3190           0 :                 list_del_init(&de->list);
    3191           0 :                 io_req_task_queue_fail(de->req, -ECANCELED);
    3192           0 :                 kfree(de);
    3193             :         }
    3194             :         return true;
    3195             : }
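
/*
 * Editor's illustrative sketch (not part of io_uring.c): the requests that
 * io_cancel_defer_files() sweeps off ctx->defer_list are drained requests,
 * i.e. ones submitted with IOSQE_IO_DRAIN. A minimal liburing submission
 * that can land on that path looks like the following; drain_barrier_nop()
 * is a hypothetical helper name.
 */
#include <errno.h>
#include <liburing.h>

static int drain_barrier_nop(struct io_uring *ring)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EAGAIN;
	/* don't start this request until all prior requests have completed */
	io_uring_prep_nop(sqe);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);
	return io_uring_submit(ring);
}
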
    3196             : 
    3197           0 : static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
    3198             : {
    3199             :         struct io_tctx_node *node;
    3200             :         enum io_wq_cancel cret;
    3201           0 :         bool ret = false;
    3202             : 
    3203           0 :         mutex_lock(&ctx->uring_lock);
    3204           0 :         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
    3205           0 :                 struct io_uring_task *tctx = node->task->io_uring;
    3206             : 
    3207             :                 /*
    3208             :                  * io_wq will stay alive while we hold uring_lock, because it's
    3209             :                  * killed after ctx nodes, which requires taking the lock.
    3210             :                  */
    3211           0 :                 if (!tctx || !tctx->io_wq)
    3212           0 :                         continue;
    3213           0 :                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
    3214           0 :                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
    3215             :         }
    3216           0 :         mutex_unlock(&ctx->uring_lock);
    3217             : 
    3218           0 :         return ret;
    3219             : }
    3220             : 
    3221           0 : static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
    3222             :                                                 struct task_struct *task,
    3223             :                                                 bool cancel_all)
    3224             : {
    3225           0 :         struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
    3226           0 :         struct io_uring_task *tctx = task ? task->io_uring : NULL;
    3227             :         enum io_wq_cancel cret;
    3228           0 :         bool ret = false;
    3229             : 
    3230             :         /* set it so io_req_local_work_add() would wake us up */
    3231           0 :         if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
    3232           0 :                 atomic_set(&ctx->cq_wait_nr, 1);
    3233           0 :                 smp_mb();
    3234             :         }
    3235             : 
    3236             :         /* failed during ring init, it couldn't have issued any requests */
    3237           0 :         if (!ctx->rings)
    3238             :                 return false;
    3239             : 
    3240           0 :         if (!task) {
    3241           0 :                 ret |= io_uring_try_cancel_iowq(ctx);
    3242           0 :         } else if (tctx && tctx->io_wq) {
    3243             :                 /*
    3244             :                  * Cancels requests of all rings, not only @ctx, but
    3245             :                  * it's fine as the task is in exit/exec.
    3246             :                  */
    3247           0 :                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
    3248             :                                        &cancel, true);
    3249           0 :                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
    3250             :         }
    3251             : 
    3252             :         /* SQPOLL thread does its own polling */
    3253           0 :         if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
    3254           0 :             (ctx->sq_data && ctx->sq_data->thread == current)) {
    3255           0 :                 while (!wq_list_empty(&ctx->iopoll_list)) {
    3256           0 :                         io_iopoll_try_reap_events(ctx);
    3257           0 :                         ret = true;
    3258           0 :                         cond_resched();
    3259             :                 }
    3260             :         }
    3261             : 
    3262           0 :         if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
    3263           0 :             io_allowed_defer_tw_run(ctx))
    3264           0 :                 ret |= io_run_local_work(ctx) > 0;
    3265           0 :         ret |= io_cancel_defer_files(ctx, task, cancel_all);
    3266           0 :         mutex_lock(&ctx->uring_lock);
    3267           0 :         ret |= io_poll_remove_all(ctx, task, cancel_all);
    3268           0 :         mutex_unlock(&ctx->uring_lock);
    3269           0 :         ret |= io_kill_timeouts(ctx, task, cancel_all);
    3270           0 :         if (task)
    3271           0 :                 ret |= io_run_task_work() > 0;
    3272             :         return ret;
    3273             : }
    3274             : 
    3275             : static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
    3276             : {
    3277           0 :         if (tracked)
    3278           0 :                 return atomic_read(&tctx->inflight_tracked);
    3279           0 :         return percpu_counter_sum(&tctx->inflight);
    3280             : }
    3281             : 
    3282             : /*
    3283             :  * Find any io_uring ctx that this task has registered or done IO on, and cancel
    3284             :  * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
    3285             :  */
    3286           0 : __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
    3287             : {
    3288           0 :         struct io_uring_task *tctx = current->io_uring;
    3289             :         struct io_ring_ctx *ctx;
    3290             :         struct io_tctx_node *node;
    3291             :         unsigned long index;
    3292             :         s64 inflight;
    3293           0 :         DEFINE_WAIT(wait);
    3294             : 
    3295           0 :         WARN_ON_ONCE(sqd && sqd->thread != current);
    3296             : 
    3297           0 :         if (!current->io_uring)
    3298           0 :                 return;
    3299           0 :         if (tctx->io_wq)
    3300           0 :                 io_wq_exit_start(tctx->io_wq);
    3301             : 
    3302           0 :         atomic_inc(&tctx->in_cancel);
    3303             :         do {
    3304           0 :                 bool loop = false;
    3305             : 
    3306           0 :                 io_uring_drop_tctx_refs(current);
    3307             :                 /* read completions before cancellations */
    3308           0 :                 inflight = tctx_inflight(tctx, !cancel_all);
    3309           0 :                 if (!inflight)
    3310             :                         break;
    3311             : 
    3312           0 :                 if (!sqd) {
    3313           0 :                         xa_for_each(&tctx->xa, index, node) {
    3314             :                                 /* sqpoll task will cancel all its requests */
    3315           0 :                                 if (node->ctx->sq_data)
    3316           0 :                                         continue;
    3317           0 :                                 loop |= io_uring_try_cancel_requests(node->ctx,
    3318           0 :                                                         current, cancel_all);
    3319             :                         }
    3320             :                 } else {
    3321           0 :                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
    3322           0 :                                 loop |= io_uring_try_cancel_requests(ctx,
    3323           0 :                                                                      current,
    3324             :                                                                      cancel_all);
    3325             :                 }
    3326             : 
    3327           0 :                 if (loop) {
    3328           0 :                         cond_resched();
    3329           0 :                         continue;
    3330             :                 }
    3331             : 
    3332           0 :                 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
    3333           0 :                 io_run_task_work();
    3334           0 :                 io_uring_drop_tctx_refs(current);
    3335           0 :                 xa_for_each(&tctx->xa, index, node) {
    3336           0 :                         if (!llist_empty(&node->ctx->work_llist)) {
    3337           0 :                                 WARN_ON_ONCE(node->ctx->submitter_task &&
    3338             :                                              node->ctx->submitter_task != current);
    3339             :                                 goto end_wait;
    3340             :                         }
    3341             :                 }
    3342             :                 /*
    3343             :                  * If we've seen completions, retry without waiting. This
    3344             :                  * avoids a race where a completion comes in before we did
    3345             :                  * prepare_to_wait().
    3346             :                  */
    3347           0 :                 if (inflight == tctx_inflight(tctx, !cancel_all))
    3348           0 :                         schedule();
    3349             : end_wait:
    3350           0 :                 finish_wait(&tctx->wait, &wait);
    3351             :         } while (1);
    3352             : 
    3353           0 :         io_uring_clean_tctx(tctx);
    3354           0 :         if (cancel_all) {
    3355             :                 /*
    3356             :                  * We shouldn't run task_works after cancel, so just leave
    3357             :                  * ->in_cancel set for normal exit.
    3358             :                  */
    3359           0 :                 atomic_dec(&tctx->in_cancel);
    3360             :                 /* for exec all current's requests should be gone, kill tctx */
    3361           0 :                 __io_uring_free(current);
    3362             :         }
    3363             : }
    3364             : 
    3365           0 : void __io_uring_cancel(bool cancel_all)
    3366             : {
    3367           0 :         io_uring_cancel_generic(cancel_all, NULL);
    3368           0 : }
    3369             : 
    3370           0 : static void *io_uring_validate_mmap_request(struct file *file,
    3371             :                                             loff_t pgoff, size_t sz)
    3372             : {
    3373           0 :         struct io_ring_ctx *ctx = file->private_data;
    3374           0 :         loff_t offset = pgoff << PAGE_SHIFT;
    3375             :         struct page *page;
    3376             :         void *ptr;
    3377             : 
    3378             :         /* Don't allow mmap if the ring was set up without it */
    3379           0 :         if (ctx->flags & IORING_SETUP_NO_MMAP)
    3380             :                 return ERR_PTR(-EINVAL);
    3381             : 
    3382           0 :         switch (offset & IORING_OFF_MMAP_MASK) {
    3383             :         case IORING_OFF_SQ_RING:
    3384             :         case IORING_OFF_CQ_RING:
    3385           0 :                 ptr = ctx->rings;
    3386             :                 break;
    3387             :         case IORING_OFF_SQES:
    3388           0 :                 ptr = ctx->sq_sqes;
    3389             :                 break;
    3390             :         case IORING_OFF_PBUF_RING: {
    3391             :                 unsigned int bgid;
    3392             : 
    3393           0 :                 bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
    3394           0 :                 mutex_lock(&ctx->uring_lock);
    3395           0 :                 ptr = io_pbuf_get_address(ctx, bgid);
    3396           0 :                 mutex_unlock(&ctx->uring_lock);
    3397           0 :                 if (!ptr)
    3398             :                         return ERR_PTR(-EINVAL);
    3399             :                 break;
    3400             :                 }
    3401             :         default:
    3402             :                 return ERR_PTR(-EINVAL);
    3403             :         }
    3404             : 
    3405           0 :         page = virt_to_head_page(ptr);
    3406           0 :         if (sz > page_size(page))
    3407             :                 return ERR_PTR(-EINVAL);
    3408             : 
    3409             :         return ptr;
    3410             : }
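
/*
 * Editor's illustrative sketch (not part of io_uring.c): the userspace side
 * of the mmap offsets validated above, using raw syscalls rather than
 * liburing. Only the SQ ring and SQE array are mapped; the CQ ring and the
 * IORING_FEAT_SINGLE_MMAP optimisation are omitted, and the helper name
 * map_sq_rings() is hypothetical.
 */
#include <linux/io_uring.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int map_sq_rings(unsigned int entries)
{
	struct io_uring_params p = { };
	void *sq_ring, *sqes;
	int fd;

	fd = syscall(__NR_io_uring_setup, entries, &p);
	if (fd < 0)
		return -1;

	/* SQ ring: head/tail/flags plus the sqe index array */
	sq_ring = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		       fd, IORING_OFF_SQ_RING);
	/* SQE array: the submission queue entries themselves */
	sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		    fd, IORING_OFF_SQES);
	if (sq_ring == MAP_FAILED || sqes == MAP_FAILED) {
		close(fd);
		return -1;
	}
	return fd;
}
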
    3411             : 
    3412             : #ifdef CONFIG_MMU
    3413             : 
    3414           0 : static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
    3415             : {
    3416           0 :         size_t sz = vma->vm_end - vma->vm_start;
    3417             :         unsigned long pfn;
    3418             :         void *ptr;
    3419             : 
    3420           0 :         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
    3421           0 :         if (IS_ERR(ptr))
    3422           0 :                 return PTR_ERR(ptr);
    3423             : 
    3424           0 :         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
    3425           0 :         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
    3426             : }
    3427             : 
    3428           0 : static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
    3429             :                         unsigned long addr, unsigned long len,
    3430             :                         unsigned long pgoff, unsigned long flags)
    3431             : {
    3432           0 :         const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
    3433             :         struct vm_unmapped_area_info info;
    3434             :         void *ptr;
    3435             : 
    3436             :         /*
    3437             :          * Do not allow mapping to a user-provided address, to avoid breaking
    3438             :          * the aliasing rules. Userspace is not able to guess the offset
    3439             :          * address of the kernel kmalloc()ed memory area.
    3440             :          */
    3441           0 :         if (addr)
    3442             :                 return -EINVAL;
    3443             : 
    3444           0 :         ptr = io_uring_validate_mmap_request(filp, pgoff, len);
    3445           0 :         if (IS_ERR(ptr))
    3446             :                 return -ENOMEM;
    3447             : 
    3448           0 :         info.flags = VM_UNMAPPED_AREA_TOPDOWN;
    3449           0 :         info.length = len;
    3450           0 :         info.low_limit = max(PAGE_SIZE, mmap_min_addr);
    3451           0 :         info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
    3452             : #ifdef SHM_COLOUR
    3453             :         info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
    3454             : #else
    3455           0 :         info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
    3456             : #endif
    3457           0 :         info.align_offset = (unsigned long) ptr;
    3458             : 
    3459             :         /*
    3460             :          * A failed mmap() very likely causes application failure,
    3461             :          * so fall back to the bottom-up function here. This scenario
    3462             :          * can happen with large stack limits and large mmap()
    3463             :          * allocations.
    3464             :          */
    3465           0 :         addr = vm_unmapped_area(&info);
    3466           0 :         if (offset_in_page(addr)) {
    3467           0 :                 info.flags = 0;
    3468           0 :                 info.low_limit = TASK_UNMAPPED_BASE;
    3469           0 :                 info.high_limit = mmap_end;
    3470           0 :                 addr = vm_unmapped_area(&info);
    3471             :         }
    3472             : 
    3473             :         return addr;
    3474             : }
    3475             : 
    3476             : #else /* !CONFIG_MMU */
    3477             : 
    3478             : static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
    3479             : {
    3480             :         return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
    3481             : }
    3482             : 
    3483             : static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
    3484             : {
    3485             :         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
    3486             : }
    3487             : 
    3488             : static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
    3489             :         unsigned long addr, unsigned long len,
    3490             :         unsigned long pgoff, unsigned long flags)
    3491             : {
    3492             :         void *ptr;
    3493             : 
    3494             :         ptr = io_uring_validate_mmap_request(file, pgoff, len);
    3495             :         if (IS_ERR(ptr))
    3496             :                 return PTR_ERR(ptr);
    3497             : 
    3498             :         return (unsigned long) ptr;
    3499             : }
    3500             : 
    3501             : #endif /* !CONFIG_MMU */
    3502             : 
    3503             : static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
    3504             : {
    3505           0 :         if (flags & IORING_ENTER_EXT_ARG) {
    3506             :                 struct io_uring_getevents_arg arg;
    3507             : 
    3508           0 :                 if (argsz != sizeof(arg))
    3509           0 :                         return -EINVAL;
    3510           0 :                 if (copy_from_user(&arg, argp, sizeof(arg)))
    3511             :                         return -EFAULT;
    3512             :         }
    3513             :         return 0;
    3514             : }
    3515             : 
    3516           0 : static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
    3517             :                           struct __kernel_timespec __user **ts,
    3518             :                           const sigset_t __user **sig)
    3519             : {
    3520             :         struct io_uring_getevents_arg arg;
    3521             : 
    3522             :         /*
    3523             :          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
    3524             :          * is just a pointer to the sigset_t.
    3525             :          */
    3526           0 :         if (!(flags & IORING_ENTER_EXT_ARG)) {
    3527           0 :                 *sig = (const sigset_t __user *) argp;
    3528           0 :                 *ts = NULL;
    3529           0 :                 return 0;
    3530             :         }
    3531             : 
    3532             :         /*
    3533             :          * EXT_ARG is set - ensure we agree on the size of it and, if so,
    3534             :          * copy in our timespec and sigset_t pointers.
    3535             :          */
    3536           0 :         if (*argsz != sizeof(arg))
    3537             :                 return -EINVAL;
    3538           0 :         if (copy_from_user(&arg, argp, sizeof(arg)))
    3539             :                 return -EFAULT;
    3540           0 :         if (arg.pad)
    3541             :                 return -EINVAL;
    3542           0 :         *sig = u64_to_user_ptr(arg.sigmask);
    3543           0 :         *argsz = arg.sigmask_sz;
    3544           0 :         *ts = u64_to_user_ptr(arg.ts);
    3545           0 :         return 0;
    3546             : }
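
/*
 * Editor's illustrative sketch (not part of io_uring.c): passing the
 * extended argument parsed by io_get_ext_arg() above. With
 * IORING_ENTER_EXT_ARG, argp points at struct io_uring_getevents_arg and
 * argsz must be its size, which lets a wait carry a timeout without a
 * separate timeout request. Raw syscall, no liburing; the helper name
 * wait_one_cqe_timeout() is hypothetical.
 */
#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long wait_one_cqe_timeout(int ring_fd, long long timeout_ns)
{
	struct __kernel_timespec ts = {
		.tv_sec		= timeout_ns / 1000000000LL,
		.tv_nsec	= timeout_ns % 1000000000LL,
	};
	struct io_uring_getevents_arg arg = {
		.sigmask	= 0,	/* no signal mask replacement */
		.sigmask_sz	= 0,
		.ts		= (unsigned long long)(uintptr_t)&ts,
	};

	/* .pad stays zero or the kernel returns -EINVAL, see io_get_ext_arg() */
	return syscall(__NR_io_uring_enter, ring_fd, 0, 1,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}
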
    3547             : 
    3548           0 : SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
    3549             :                 u32, min_complete, u32, flags, const void __user *, argp,
    3550             :                 size_t, argsz)
    3551             : {
    3552             :         struct io_ring_ctx *ctx;
    3553             :         struct fd f;
    3554             :         long ret;
    3555             : 
    3556           0 :         if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
    3557             :                                IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
    3558             :                                IORING_ENTER_REGISTERED_RING)))
    3559             :                 return -EINVAL;
    3560             : 
    3561             :         /*
    3562             :          * The ring fd has been registered via IORING_REGISTER_RING_FDS; we
    3563             :          * need only dereference our task private array to find it.
    3564             :          */
    3565           0 :         if (flags & IORING_ENTER_REGISTERED_RING) {
    3566           0 :                 struct io_uring_task *tctx = current->io_uring;
    3567             : 
    3568           0 :                 if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
    3569             :                         return -EINVAL;
    3570           0 :                 fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
    3571           0 :                 f.file = tctx->registered_rings[fd];
    3572           0 :                 f.flags = 0;
    3573           0 :                 if (unlikely(!f.file))
    3574             :                         return -EBADF;
    3575             :         } else {
    3576           0 :                 f = fdget(fd);
    3577           0 :                 if (unlikely(!f.file))
    3578             :                         return -EBADF;
    3579           0 :                 ret = -EOPNOTSUPP;
    3580           0 :                 if (unlikely(!io_is_uring_fops(f.file)))
    3581             :                         goto out;
    3582             :         }
    3583             : 
    3584           0 :         ctx = f.file->private_data;
    3585           0 :         ret = -EBADFD;
    3586           0 :         if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
    3587             :                 goto out;
    3588             : 
    3589             :         /*
    3590             :          * For SQ polling, the thread will do all submissions and completions.
    3591             :          * Just return the requested submit count, and wake the thread if
    3592             :          * we were asked to.
    3593             :          */
    3594           0 :         ret = 0;
    3595           0 :         if (ctx->flags & IORING_SETUP_SQPOLL) {
    3596           0 :                 io_cqring_overflow_flush(ctx);
    3597             : 
    3598           0 :                 if (unlikely(ctx->sq_data->thread == NULL)) {
    3599             :                         ret = -EOWNERDEAD;
    3600             :                         goto out;
    3601             :                 }
    3602           0 :                 if (flags & IORING_ENTER_SQ_WAKEUP)
    3603           0 :                         wake_up(&ctx->sq_data->wait);
    3604           0 :                 if (flags & IORING_ENTER_SQ_WAIT)
    3605           0 :                         io_sqpoll_wait_sq(ctx);
    3606             : 
    3607           0 :                 ret = to_submit;
    3608           0 :         } else if (to_submit) {
    3609           0 :                 ret = io_uring_add_tctx_node(ctx);
    3610           0 :                 if (unlikely(ret))
    3611             :                         goto out;
    3612             : 
    3613           0 :                 mutex_lock(&ctx->uring_lock);
    3614           0 :                 ret = io_submit_sqes(ctx, to_submit);
    3615           0 :                 if (ret != to_submit) {
    3616           0 :                         mutex_unlock(&ctx->uring_lock);
    3617           0 :                         goto out;
    3618             :                 }
    3619           0 :                 if (flags & IORING_ENTER_GETEVENTS) {
    3620           0 :                         if (ctx->syscall_iopoll)
    3621             :                                 goto iopoll_locked;
    3622             :                         /*
    3623             :                          * Ignore errors, we'll soon call io_cqring_wait() and
    3624             :                          * it should handle ownership problems if any.
    3625             :                          */
    3626           0 :                         if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
    3627           0 :                                 (void)io_run_local_work_locked(ctx);
    3628             :                 }
    3629           0 :                 mutex_unlock(&ctx->uring_lock);
    3630             :         }
    3631             : 
    3632           0 :         if (flags & IORING_ENTER_GETEVENTS) {
    3633             :                 int ret2;
    3634             : 
    3635           0 :                 if (ctx->syscall_iopoll) {
    3636             :                         /*
    3637             :                          * We disallow the app entering submit/complete with
    3638             :                          * polling, but we still need to lock the ring to
    3639             :                          * prevent racing with polled issue that got punted to
    3640             :                          * a workqueue.
    3641             :                          */
    3642           0 :                         mutex_lock(&ctx->uring_lock);
    3643             : iopoll_locked:
    3644           0 :                         ret2 = io_validate_ext_arg(flags, argp, argsz);
    3645           0 :                         if (likely(!ret2)) {
    3646           0 :                                 min_complete = min(min_complete,
    3647             :                                                    ctx->cq_entries);
    3648           0 :                                 ret2 = io_iopoll_check(ctx, min_complete);
    3649             :                         }
    3650           0 :                         mutex_unlock(&ctx->uring_lock);
    3651             :                 } else {
    3652             :                         const sigset_t __user *sig;
    3653             :                         struct __kernel_timespec __user *ts;
    3654             : 
    3655           0 :                         ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
    3656           0 :                         if (likely(!ret2)) {
    3657           0 :                                 min_complete = min(min_complete,
    3658             :                                                    ctx->cq_entries);
    3659           0 :                                 ret2 = io_cqring_wait(ctx, min_complete, sig,
    3660             :                                                       argsz, ts);
    3661             :                         }
    3662             :                 }
    3663             : 
    3664           0 :                 if (!ret) {
    3665           0 :                         ret = ret2;
    3666             : 
    3667             :                         /*
    3668             :                          * EBADR indicates that one or more CQE were dropped.
    3669             :                          * Once the user has been informed we can clear the bit
    3670             :                          * as they are obviously ok with those drops.
    3671             :                          */
    3672           0 :                         if (unlikely(ret2 == -EBADR))
    3673             :                                 clear_bit(IO_CHECK_CQ_DROPPED_BIT,
    3674           0 :                                           &ctx->check_cq);
    3675             :                 }
    3676             :         }
    3677             : out:
    3678           0 :         fdput(f);
    3679             :         return ret;
    3680             : }
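
/*
 * Editor's illustrative sketch (not part of io_uring.c): the userspace side
 * of the SQPOLL branch in io_uring_enter() above. With IORING_SETUP_SQPOLL
 * the kernel thread performs submission, so the syscall is only needed to
 * wake it (IORING_ENTER_SQ_WAKEUP) once it has gone idle and advertised
 * IORING_SQ_NEED_WAKEUP in the SQ ring flags. Assumes sq_flags points at the
 * mapped SQ ring flags word (offset sq_off.flags); the helper name
 * sqpoll_wake_if_needed() is hypothetical.
 */
#include <linux/io_uring.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static long sqpoll_wake_if_needed(int ring_fd, _Atomic unsigned int *sq_flags)
{
	/* has the SQPOLL thread gone idle and asked to be woken? */
	if (!(atomic_load(sq_flags) & IORING_SQ_NEED_WAKEUP))
		return 0;

	/* nothing to submit via the syscall itself; just deliver the wakeup */
	return syscall(__NR_io_uring_enter, ring_fd, 0, 0,
		       IORING_ENTER_SQ_WAKEUP, NULL, 0);
}
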
    3681             : 
    3682             : static const struct file_operations io_uring_fops = {
    3683             :         .release        = io_uring_release,
    3684             :         .mmap           = io_uring_mmap,
    3685             : #ifndef CONFIG_MMU
    3686             :         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
    3687             :         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
    3688             : #else
    3689             :         .get_unmapped_area = io_uring_mmu_get_unmapped_area,
    3690             : #endif
    3691             :         .poll           = io_uring_poll,
    3692             : #ifdef CONFIG_PROC_FS
    3693             :         .show_fdinfo    = io_uring_show_fdinfo,
    3694             : #endif
    3695             : };
    3696             : 
    3697           0 : bool io_is_uring_fops(struct file *file)
    3698             : {
    3699           0 :         return file->f_op == &io_uring_fops;
    3700             : }
    3701             : 
    3702           0 : static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
    3703             :                                          struct io_uring_params *p)
    3704             : {
    3705             :         struct io_rings *rings;
    3706             :         size_t size, sq_array_offset;
    3707             :         void *ptr;
    3708             : 
    3709             :         /* make sure these are sane, as we already accounted them */
    3710           0 :         ctx->sq_entries = p->sq_entries;
    3711           0 :         ctx->cq_entries = p->cq_entries;
    3712             : 
    3713           0 :         size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
    3714           0 :         if (size == SIZE_MAX)
    3715             :                 return -EOVERFLOW;
    3716             : 
    3717           0 :         if (!(ctx->flags & IORING_SETUP_NO_MMAP))
    3718           0 :                 rings = io_mem_alloc(size);
    3719             :         else
    3720           0 :                 rings = io_rings_map(ctx, p->cq_off.user_addr, size);
    3721             : 
    3722           0 :         if (IS_ERR(rings))
    3723           0 :                 return PTR_ERR(rings);
    3724             : 
    3725           0 :         ctx->rings = rings;
    3726           0 :         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
    3727           0 :         rings->sq_ring_mask = p->sq_entries - 1;
    3728           0 :         rings->cq_ring_mask = p->cq_entries - 1;
    3729           0 :         rings->sq_ring_entries = p->sq_entries;
    3730           0 :         rings->cq_ring_entries = p->cq_entries;
    3731             : 
    3732           0 :         if (p->flags & IORING_SETUP_SQE128)
    3733           0 :                 size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
    3734             :         else
    3735           0 :                 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
    3736           0 :         if (size == SIZE_MAX) {
    3737           0 :                 io_rings_free(ctx);
    3738           0 :                 return -EOVERFLOW;
    3739             :         }
    3740             : 
    3741           0 :         if (!(ctx->flags & IORING_SETUP_NO_MMAP))
    3742           0 :                 ptr = io_mem_alloc(size);
    3743             :         else
    3744           0 :                 ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
    3745             : 
    3746           0 :         if (IS_ERR(ptr)) {
    3747           0 :                 io_rings_free(ctx);
    3748           0 :                 return PTR_ERR(ptr);
    3749             :         }
    3750             : 
    3751           0 :         ctx->sq_sqes = ptr;
    3752           0 :         return 0;
    3753             : }
    3754             : 
    3755           0 : static int io_uring_install_fd(struct file *file)
    3756             : {
    3757             :         int fd;
    3758             : 
    3759           0 :         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
    3760           0 :         if (fd < 0)
    3761             :                 return fd;
    3762           0 :         fd_install(fd, file);
    3763           0 :         return fd;
    3764             : }
    3765             : 
    3766             : /*
    3767             :  * Allocate an anonymous fd; this is what constitutes the application-visible
    3768             :  * backing of an io_uring instance. The application mmaps this fd to gain
    3769             :  * access to the SQ/CQ ring details. If UNIX sockets are enabled, we have to
    3770             :  * tie this fd to a socket for file garbage collection purposes.
    3771             :  */
    3772             : static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
    3773             : {
    3774             :         struct file *file;
    3775             : #if defined(CONFIG_UNIX)
    3776             :         int ret;
    3777             : 
    3778             :         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
    3779             :                                 &ctx->ring_sock);
    3780             :         if (ret)
    3781             :                 return ERR_PTR(ret);
    3782             : #endif
    3783             : 
    3784           0 :         file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
    3785             :                                          O_RDWR | O_CLOEXEC, NULL);
    3786             : #if defined(CONFIG_UNIX)
    3787             :         if (IS_ERR(file)) {
    3788             :                 sock_release(ctx->ring_sock);
    3789             :                 ctx->ring_sock = NULL;
    3790             :         } else {
    3791             :                 ctx->ring_sock->file = file;
    3792             :         }
    3793             : #endif
    3794             :         return file;
    3795             : }
    3796             : 
    3797           0 : static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
    3798             :                                   struct io_uring_params __user *params)
    3799             : {
    3800             :         struct io_ring_ctx *ctx;
    3801             :         struct io_uring_task *tctx;
    3802             :         struct file *file;
    3803             :         int ret;
    3804             : 
    3805           0 :         if (!entries)
    3806             :                 return -EINVAL;
    3807           0 :         if (entries > IORING_MAX_ENTRIES) {
    3808           0 :                 if (!(p->flags & IORING_SETUP_CLAMP))
    3809             :                         return -EINVAL;
    3810             :                 entries = IORING_MAX_ENTRIES;
    3811             :         }
    3812             : 
    3813           0 :         if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
    3814           0 :             && !(p->flags & IORING_SETUP_NO_MMAP))
    3815             :                 return -EINVAL;
    3816             : 
    3817             :         /*
    3818             :          * Use twice as many entries for the CQ ring. It's possible for the
    3819             :          * application to drive a higher depth than the size of the SQ ring,
    3820             :          * since the sqes are only used at submission time. This allows for
    3821             :          * some flexibility in overcommitting a bit. If the application has
    3822             :          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
    3823             :          * of CQ ring entries manually.
    3824             :          */
    3825           0 :         p->sq_entries = roundup_pow_of_two(entries);
    3826           0 :         if (p->flags & IORING_SETUP_CQSIZE) {
    3827             :                 /*
    3828             :                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
    3829             :                  * to a power-of-two, if it isn't already. We do NOT impose
    3830             :                  * any cq vs sq ring sizing.
    3831             :                  */
    3832           0 :                 if (!p->cq_entries)
    3833             :                         return -EINVAL;
    3834           0 :                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
    3835           0 :                         if (!(p->flags & IORING_SETUP_CLAMP))
    3836             :                                 return -EINVAL;
    3837           0 :                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
    3838             :                 }
    3839           0 :                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
    3840           0 :                 if (p->cq_entries < p->sq_entries)
    3841             :                         return -EINVAL;
    3842             :         } else {
    3843           0 :                 p->cq_entries = 2 * p->sq_entries;
    3844             :         }
    3845             : 
    3846           0 :         ctx = io_ring_ctx_alloc(p);
    3847           0 :         if (!ctx)
    3848             :                 return -ENOMEM;
    3849             : 
    3850           0 :         if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
    3851           0 :             !(ctx->flags & IORING_SETUP_IOPOLL) &&
    3852             :             !(ctx->flags & IORING_SETUP_SQPOLL))
    3853           0 :                 ctx->task_complete = true;
    3854             : 
    3855             :         /*
    3856             :          * lazy poll_wq activation relies on ->task_complete for synchronisation
    3857             :          * purposes, see io_activate_pollwq()
    3858             :          */
    3859           0 :         if (!ctx->task_complete)
    3860           0 :                 ctx->poll_activated = true;
    3861             : 
    3862             :         /*
    3863             :          * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, userspace
    3864             :          * applications don't need to poll for IO completion events
    3865             :          * themselves; they can rely on io_sq_thread to do the polling
    3866             :          * work, which can reduce CPU usage and uring_lock contention.
    3867             :          */
    3868           0 :         if (ctx->flags & IORING_SETUP_IOPOLL &&
    3869             :             !(ctx->flags & IORING_SETUP_SQPOLL))
    3870           0 :                 ctx->syscall_iopoll = 1;
    3871             : 
    3872           0 :         ctx->compat = in_compat_syscall();
    3873           0 :         if (!capable(CAP_IPC_LOCK))
    3874           0 :                 ctx->user = get_uid(current_user());
    3875             : 
    3876             :         /*
    3877             :          * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
    3878             :          * COOP_TASKRUN is set, then IPIs are never needed by the app.
    3879             :          */
    3880           0 :         ret = -EINVAL;
    3881           0 :         if (ctx->flags & IORING_SETUP_SQPOLL) {
    3882             :                 /* IPI related flags don't make sense with SQPOLL */
    3883           0 :                 if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
    3884             :                                   IORING_SETUP_TASKRUN_FLAG |
    3885             :                                   IORING_SETUP_DEFER_TASKRUN))
    3886             :                         goto err;
    3887           0 :                 ctx->notify_method = TWA_SIGNAL_NO_IPI;
    3888           0 :         } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
    3889           0 :                 ctx->notify_method = TWA_SIGNAL_NO_IPI;
    3890             :         } else {
    3891           0 :                 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG &&
    3892             :                     !(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
    3893             :                         goto err;
    3894           0 :                 ctx->notify_method = TWA_SIGNAL;
    3895             :         }
    3896             : 
    3897             :         /*
    3898             :          * For DEFER_TASKRUN we require the completion task to be the same as the
    3899             :          * submission task. This implies that there is only one submitter, so enforce
    3900             :          * that.
    3901             :          */
    3902           0 :         if (ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
    3903             :             !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
    3904             :                 goto err;
    3905             :         }
    3906             : 
    3907             :         /*
    3908             :          * This is just grabbed for accounting purposes. When a process exits,
    3909             :          * the mm is exited and dropped before the files, hence we need to hang
    3910             :          * on to this mm purely for the purposes of being able to unaccount
    3911             :          * memory (locked/pinned vm). It's not used for anything else.
    3912             :          */
    3913           0 :         mmgrab(current->mm);
    3914           0 :         ctx->mm_account = current->mm;
    3915             : 
    3916           0 :         ret = io_allocate_scq_urings(ctx, p);
    3917           0 :         if (ret)
    3918             :                 goto err;
    3919             : 
    3920           0 :         ret = io_sq_offload_create(ctx, p);
    3921           0 :         if (ret)
    3922             :                 goto err;
    3923             : 
    3924           0 :         ret = io_rsrc_init(ctx);
    3925           0 :         if (ret)
    3926             :                 goto err;
    3927             : 
    3928           0 :         p->sq_off.head = offsetof(struct io_rings, sq.head);
    3929           0 :         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
    3930           0 :         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
    3931           0 :         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
    3932           0 :         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
    3933           0 :         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
    3934           0 :         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
    3935           0 :         p->sq_off.resv1 = 0;
    3936           0 :         if (!(ctx->flags & IORING_SETUP_NO_MMAP))
    3937           0 :                 p->sq_off.user_addr = 0;
    3938             : 
    3939           0 :         p->cq_off.head = offsetof(struct io_rings, cq.head);
    3940           0 :         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
    3941           0 :         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
    3942           0 :         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
    3943           0 :         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
    3944           0 :         p->cq_off.cqes = offsetof(struct io_rings, cqes);
    3945           0 :         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
    3946           0 :         p->cq_off.resv1 = 0;
    3947           0 :         if (!(ctx->flags & IORING_SETUP_NO_MMAP))
    3948           0 :                 p->cq_off.user_addr = 0;
    3949             : 
    3950           0 :         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
    3951             :                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
    3952             :                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
    3953             :                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
    3954             :                         IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
    3955             :                         IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
    3956             :                         IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING;
    3957             : 
    3958           0 :         if (copy_to_user(params, p, sizeof(*p))) {
    3959             :                 ret = -EFAULT;
    3960             :                 goto err;
    3961             :         }
    3962             : 
    3963           0 :         if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
    3964           0 :             && !(ctx->flags & IORING_SETUP_R_DISABLED))
    3965           0 :                 WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
    3966             : 
    3967           0 :         file = io_uring_get_file(ctx);
    3968           0 :         if (IS_ERR(file)) {
    3969           0 :                 ret = PTR_ERR(file);
    3970           0 :                 goto err;
    3971             :         }
    3972             : 
    3973           0 :         ret = __io_uring_add_tctx_node(ctx);
    3974           0 :         if (ret)
    3975             :                 goto err_fput;
    3976           0 :         tctx = current->io_uring;
    3977             : 
    3978             :         /*
    3979             :          * Install ring fd as the very last thing, so we don't risk someone
    3980             :          * having closed it before we finish setup
    3981             :          */
    3982           0 :         if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
    3983           0 :                 ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX);
    3984             :         else
    3985           0 :                 ret = io_uring_install_fd(file);
    3986           0 :         if (ret < 0)
    3987             :                 goto err_fput;
    3988             : 
    3989             :         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
    3990             :         return ret;
    3991             : err:
    3992           0 :         io_ring_ctx_wait_and_kill(ctx);
    3993           0 :         return ret;
    3994             : err_fput:
    3995           0 :         fput(file);
    3996           0 :         return ret;
    3997             : }
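
The sq_off/cq_off values filled in just above are the contract userspace relies on to mmap() the shared rings from the returned fd. Below is a minimal consumer-side sketch using raw syscall(2)/mmap(2) rather than liburing, with most error handling omitted; the 8-entry ring size and the printed fields are arbitrary illustration choices (liburing's io_uring_queue_init() does the equivalent work for you).

#include <linux/io_uring.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct io_uring_params p;
        size_t sq_sz, cq_sz;
        void *sq_ring, *cq_ring, *sqes;
        int fd;

        memset(&p, 0, sizeof(p));
        fd = syscall(__NR_io_uring_setup, 8, &p);
        if (fd < 0)
                return 1;

        /* Ring sizes follow directly from the offsets published above. */
        sq_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
        cq_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
        if (p.features & IORING_FEAT_SINGLE_MMAP) {
                /* SQ and CQ rings share one mapping; size it for both. */
                if (cq_sz > sq_sz)
                        sq_sz = cq_sz;
        }

        sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
        cq_ring = (p.features & IORING_FEAT_SINGLE_MMAP) ? sq_ring :
                  mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
        sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
                    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                    fd, IORING_OFF_SQES);

        printf("sq tail at +%u, cq head at +%u, %u sqes mapped at %p\n",
               p.sq_off.tail, p.cq_off.head, p.sq_entries, sqes);
        (void)sq_ring;
        (void)cq_ring;
        close(fd);
        return 0;
}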
    3998             : 
    3999             : /*
    4000             :  * Sets up an io_uring context and returns the fd. The application asks for a
    4001             :  * ring size; we return the actual sq/cq ring sizes (among other things) in the
    4002             :  * params structure passed in.
    4003             :  */
    4004           0 : static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
    4005             : {
    4006             :         struct io_uring_params p;
    4007             :         int i;
    4008             : 
    4009           0 :         if (copy_from_user(&p, params, sizeof(p)))
    4010             :                 return -EFAULT;
    4011           0 :         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
    4012           0 :                 if (p.resv[i])
    4013             :                         return -EINVAL;
    4014             :         }
    4015             : 
    4016           0 :         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
    4017             :                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
    4018             :                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
    4019             :                         IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
    4020             :                         IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
    4021             :                         IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
    4022             :                         IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
    4023             :                         IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY))
    4024             :                 return -EINVAL;
    4025             : 
    4026           0 :         return io_uring_create(entries, &p, params);
    4027             : }
    4028             : 
    4029           0 : SYSCALL_DEFINE2(io_uring_setup, u32, entries,
    4030             :                 struct io_uring_params __user *, params)
    4031             : {
    4032           0 :         return io_uring_setup(entries, params);
    4033             : }
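
Because any unknown flag or non-zero reserved field is rejected with -EINVAL here, callers that want newer setup flags usually probe by retrying without them. A small sketch of that negotiation, assuming headers recent enough to define IORING_SETUP_SINGLE_ISSUER and IORING_SETUP_DEFER_TASKRUN; setup_ring() is just an illustrative helper name.

#include <errno.h>
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Illustrative helper: try optional setup flags, drop them on EINVAL. */
static int setup_ring(unsigned entries, struct io_uring_params *p)
{
        int fd;

        memset(p, 0, sizeof(*p));       /* reserved fields must be zero */
        p->flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
        fd = syscall(__NR_io_uring_setup, entries, p);
        if (fd >= 0 || errno != EINVAL)
                return fd;

        /* Older kernel (or kernel/header mismatch): retry with no flags. */
        memset(p, 0, sizeof(*p));
        return syscall(__NR_io_uring_setup, entries, p);
}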
    4034             : 
    4035           0 : static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
    4036             :                            unsigned nr_args)
    4037             : {
    4038             :         struct io_uring_probe *p;
    4039             :         size_t size;
    4040             :         int i, ret;
    4041             : 
    4042           0 :         size = struct_size(p, ops, nr_args);
    4043           0 :         if (size == SIZE_MAX)
    4044             :                 return -EOVERFLOW;
    4045           0 :         p = kzalloc(size, GFP_KERNEL);
    4046           0 :         if (!p)
    4047             :                 return -ENOMEM;
    4048             : 
    4049           0 :         ret = -EFAULT;
    4050           0 :         if (copy_from_user(p, arg, size))
    4051             :                 goto out;
    4052           0 :         ret = -EINVAL;
    4053           0 :         if (memchr_inv(p, 0, size))
    4054             :                 goto out;
    4055             : 
    4056           0 :         p->last_op = IORING_OP_LAST - 1;
    4057           0 :         if (nr_args > IORING_OP_LAST)
    4058           0 :                 nr_args = IORING_OP_LAST;
    4059             : 
    4060           0 :         for (i = 0; i < nr_args; i++) {
    4061           0 :                 p->ops[i].op = i;
    4062           0 :                 if (!io_issue_defs[i].not_supported)
    4063           0 :                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
    4064             :         }
    4065           0 :         p->ops_len = i;
    4066             : 
    4067           0 :         ret = 0;
    4068           0 :         if (copy_to_user(arg, p, size))
    4069           0 :                 ret = -EFAULT;
    4070             : out:
    4071           0 :         kfree(p);
    4072             :         return ret;
    4073             : }
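
Userspace drives io_probe() through IORING_REGISTER_PROBE, typically with a zeroed 256-slot buffer since nr_args is capped at 256 by the register path below. A sketch, assuming ring_fd is an already created ring and using the raw register syscall (liburing wraps this as io_uring_get_probe()).

#include <linux/io_uring.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

/* ring_fd is assumed to be an existing io_uring fd. */
static void print_supported_ops(int ring_fd)
{
        struct io_uring_probe *probe;
        int i;

        /* Must be zero-filled: io_probe() rejects non-zero input. */
        probe = calloc(1, sizeof(*probe) + 256 * sizeof(struct io_uring_probe_op));
        if (!probe)
                return;

        if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
                    probe, 256) < 0) {
                free(probe);
                return;
        }

        /* ops[i].flags carries IO_URING_OP_SUPPORTED for known opcodes. */
        for (i = 0; i < probe->ops_len; i++)
                printf("op %u %s\n", probe->ops[i].op,
                       (probe->ops[i].flags & IO_URING_OP_SUPPORTED) ?
                       "supported" : "not supported");
        free(probe);
}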
    4074             : 
    4075           0 : static int io_register_personality(struct io_ring_ctx *ctx)
    4076             : {
    4077             :         const struct cred *creds;
    4078             :         u32 id;
    4079             :         int ret;
    4080             : 
    4081           0 :         creds = get_current_cred();
    4082             : 
    4083           0 :         ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
    4084           0 :                         XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
    4085           0 :         if (ret < 0) {
    4086             :                 put_cred(creds);
    4087             :                 return ret;
    4088             :         }
    4089           0 :         return id;
    4090             : }
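
A registered personality is just a snapshot of the caller's credentials keyed by the returned id; a request can later opt into it by writing that id to sqe->personality. A minimal sketch, with ring_fd assumed to be an existing ring; the id can be dropped again with IORING_UNREGISTER_PERSONALITY.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Snapshot the caller's credentials; the returned id can be written to
 * sqe->personality so an individual request runs with those creds.
 */
static int register_current_creds(int ring_fd)
{
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_PERSONALITY, NULL, 0);
}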
    4091             : 
    4092           0 : static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
    4093             :                                            void __user *arg, unsigned int nr_args)
    4094             : {
    4095             :         struct io_uring_restriction *res;
    4096             :         size_t size;
    4097             :         int i, ret;
    4098             : 
    4099             :         /* Restrictions allowed only if rings started disabled */
    4100           0 :         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
    4101             :                 return -EBADFD;
    4102             : 
    4103             :         /* We allow only a single restrictions registration */
    4104           0 :         if (ctx->restrictions.registered)
    4105             :                 return -EBUSY;
    4106             : 
    4107           0 :         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
    4108             :                 return -EINVAL;
    4109             : 
    4110           0 :         size = array_size(nr_args, sizeof(*res));
    4111           0 :         if (size == SIZE_MAX)
    4112             :                 return -EOVERFLOW;
    4113             : 
    4114           0 :         res = memdup_user(arg, size);
    4115           0 :         if (IS_ERR(res))
    4116           0 :                 return PTR_ERR(res);
    4117             : 
    4118             :         ret = 0;
    4119             : 
    4120           0 :         for (i = 0; i < nr_args; i++) {
    4121           0 :                 switch (res[i].opcode) {
    4122             :                 case IORING_RESTRICTION_REGISTER_OP:
    4123           0 :                         if (res[i].register_op >= IORING_REGISTER_LAST) {
    4124             :                                 ret = -EINVAL;
    4125             :                                 goto out;
    4126             :                         }
    4127             : 
    4128           0 :                         __set_bit(res[i].register_op,
    4129             :                                   ctx->restrictions.register_op);
    4130             :                         break;
    4131             :                 case IORING_RESTRICTION_SQE_OP:
    4132           0 :                         if (res[i].sqe_op >= IORING_OP_LAST) {
    4133             :                                 ret = -EINVAL;
    4134             :                                 goto out;
    4135             :                         }
    4136             : 
    4137           0 :                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
    4138             :                         break;
    4139             :                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
    4140           0 :                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
    4141           0 :                         break;
    4142             :                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
    4143           0 :                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
    4144           0 :                         break;
    4145             :                 default:
    4146             :                         ret = -EINVAL;
    4147             :                         goto out;
    4148             :                 }
    4149             :         }
    4150             : 
    4151             : out:
    4152             :         /* Reset all restrictions if an error happened */
    4153           0 :         if (ret != 0)
    4154           0 :                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
    4155             :         else
    4156           0 :                 ctx->restrictions.registered = true;
    4157             : 
    4158           0 :         kfree(res);
    4159           0 :         return ret;
    4160             : }
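
Restrictions can only be installed once, and only while the ring is still IORING_SETUP_R_DISABLED, so the usual pattern is: create disabled, restrict, drop privileges, enable. A sketch of the restriction step, with restrict_ring() an illustrative helper and ring_fd assumed to be such a disabled ring.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Limit the ring to READ/WRITE SQEs and the enable-rings register op. */
static int restrict_ring(int ring_fd)
{
        struct io_uring_restriction res[3];

        memset(res, 0, sizeof(res));    /* reserved fields must be zero */
        res[0].opcode = IORING_RESTRICTION_SQE_OP;
        res[0].sqe_op = IORING_OP_READ;
        res[1].opcode = IORING_RESTRICTION_SQE_OP;
        res[1].sqe_op = IORING_OP_WRITE;
        res[2].opcode = IORING_RESTRICTION_REGISTER_OP;
        res[2].register_op = IORING_REGISTER_ENABLE_RINGS;

        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_RESTRICTIONS, res, 3);
}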
    4161             : 
    4162           0 : static int io_register_enable_rings(struct io_ring_ctx *ctx)
    4163             : {
    4164           0 :         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
    4165             :                 return -EBADFD;
    4166             : 
    4167           0 :         if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
    4168           0 :                 WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
    4169             :                 /*
    4170             :                  * Lazy activation attempts would fail if the ring was polled
    4171             :                  * before submitter_task is set.
    4172             :                  */
    4173           0 :                 if (wq_has_sleeper(&ctx->poll_wq))
    4174           0 :                         io_activate_pollwq(ctx);
    4175             :         }
    4176             : 
    4177           0 :         if (ctx->restrictions.registered)
    4178           0 :                 ctx->restricted = 1;
    4179             : 
    4180           0 :         ctx->flags &= ~IORING_SETUP_R_DISABLED;
    4181           0 :         if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
    4182           0 :                 wake_up(&ctx->sq_data->wait);
    4183             :         return 0;
    4184             : }
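
Enabling is the final step of that pattern. Note that for IORING_SETUP_SINGLE_ISSUER rings, the task that enables the ring becomes the recorded submitter per the WRITE_ONCE() above. A one-call sketch, with ring_fd assumed to be a ring created with IORING_SETUP_R_DISABLED.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Enable a ring that was created with IORING_SETUP_R_DISABLED. */
static int enable_ring(int ring_fd)
{
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}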
    4185             : 
    4186           0 : static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
    4187             :                                        void __user *arg, unsigned len)
    4188             : {
    4189           0 :         struct io_uring_task *tctx = current->io_uring;
    4190             :         cpumask_var_t new_mask;
    4191             :         int ret;
    4192             : 
    4193           0 :         if (!tctx || !tctx->io_wq)
    4194             :                 return -EINVAL;
    4195             : 
    4196           0 :         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
    4197             :                 return -ENOMEM;
    4198             : 
    4199           0 :         cpumask_clear(new_mask);
    4200           0 :         if (len > cpumask_size())
    4201           0 :                 len = cpumask_size();
    4202             : 
    4203             :         if (in_compat_syscall()) {
    4204             :                 ret = compat_get_bitmap(cpumask_bits(new_mask),
    4205             :                                         (const compat_ulong_t __user *)arg,
    4206             :                                         len * 8 /* CHAR_BIT */);
    4207             :         } else {
    4208           0 :                 ret = copy_from_user(new_mask, arg, len);
    4209             :         }
    4210             : 
    4211           0 :         if (ret) {
    4212             :                 free_cpumask_var(new_mask);
    4213             :                 return -EFAULT;
    4214             :         }
    4215             : 
    4216           0 :         ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
    4217           0 :         free_cpumask_var(new_mask);
    4218             :         return ret;
    4219             : }
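
IORING_REGISTER_IOWQ_AFF takes a raw CPU bitmap plus its size in bytes, so a plain cpu_set_t works as the argument. A sketch pinning the ring's io-wq workers to CPUs 0 and 1, with ring_fd assumed to be a ring owned by the calling task; IORING_UNREGISTER_IOWQ_AFF (handled just below) resets the affinity again.

#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sched.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Pin this ring's io-wq workers to CPUs 0 and 1. */
static int pin_iowq(int ring_fd)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);
        CPU_SET(1, &mask);
        /* nr_args is the mask size in bytes, not a CPU count. */
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
}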
    4220             : 
    4221           0 : static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
    4222             : {
    4223           0 :         struct io_uring_task *tctx = current->io_uring;
    4224             : 
    4225           0 :         if (!tctx || !tctx->io_wq)
    4226             :                 return -EINVAL;
    4227             : 
    4228           0 :         return io_wq_cpu_affinity(tctx->io_wq, NULL);
    4229             : }
    4230             : 
    4231           0 : static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
    4232             :                                                void __user *arg)
    4233             :         __must_hold(&ctx->uring_lock)
    4234             : {
    4235             :         struct io_tctx_node *node;
    4236           0 :         struct io_uring_task *tctx = NULL;
    4237           0 :         struct io_sq_data *sqd = NULL;
    4238             :         __u32 new_count[2];
    4239             :         int i, ret;
    4240             : 
    4241           0 :         if (copy_from_user(new_count, arg, sizeof(new_count)))
    4242             :                 return -EFAULT;
    4243           0 :         for (i = 0; i < ARRAY_SIZE(new_count); i++)
    4244           0 :                 if (new_count[i] > INT_MAX)
    4245             :                         return -EINVAL;
    4246             : 
    4247           0 :         if (ctx->flags & IORING_SETUP_SQPOLL) {
    4248           0 :                 sqd = ctx->sq_data;
    4249           0 :                 if (sqd) {
    4250             :                         /*
    4251             :                          * Observe the correct sqd->lock -> ctx->uring_lock
    4252             :                          * ordering. Fine to drop uring_lock here, we hold
    4253             :                          * a ref to the ctx.
    4254             :                          */
    4255           0 :                         refcount_inc(&sqd->refs);
    4256           0 :                         mutex_unlock(&ctx->uring_lock);
    4257           0 :                         mutex_lock(&sqd->lock);
    4258           0 :                         mutex_lock(&ctx->uring_lock);
    4259           0 :                         if (sqd->thread)
    4260           0 :                                 tctx = sqd->thread->io_uring;
    4261             :                 }
    4262             :         } else {
    4263           0 :                 tctx = current->io_uring;
    4264             :         }
    4265             : 
    4266             :         BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
    4267             : 
    4268           0 :         for (i = 0; i < ARRAY_SIZE(new_count); i++)
    4269           0 :                 if (new_count[i])
    4270           0 :                         ctx->iowq_limits[i] = new_count[i];
    4271           0 :         ctx->iowq_limits_set = true;
    4272             : 
    4273           0 :         if (tctx && tctx->io_wq) {
    4274           0 :                 ret = io_wq_max_workers(tctx->io_wq, new_count);
    4275           0 :                 if (ret)
    4276             :                         goto err;
    4277             :         } else {
    4278           0 :                 memset(new_count, 0, sizeof(new_count));
    4279             :         }
    4280             : 
    4281           0 :         if (sqd) {
    4282           0 :                 mutex_unlock(&sqd->lock);
    4283           0 :                 io_put_sq_data(sqd);
    4284             :         }
    4285             : 
    4286           0 :         if (copy_to_user(arg, new_count, sizeof(new_count)))
    4287             :                 return -EFAULT;
    4288             : 
    4289             :         /* that's it for SQPOLL, only the SQPOLL task creates requests */
    4290           0 :         if (sqd)
    4291             :                 return 0;
    4292             : 
    4293             :         /* now propagate the restriction to all registered users */
    4294           0 :         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
    4295           0 :                 struct io_uring_task *tctx = node->task->io_uring;
    4296             : 
    4297           0 :                 if (WARN_ON_ONCE(!tctx->io_wq))
    4298           0 :                         continue;
    4299             : 
    4300           0 :                 for (i = 0; i < ARRAY_SIZE(new_count); i++)
    4301           0 :                         new_count[i] = ctx->iowq_limits[i];
    4302             :                 /* ignore errors, it always returns zero anyway */
    4303           0 :                 (void)io_wq_max_workers(tctx->io_wq, new_count);
    4304             :         }
    4305             :         return 0;
    4306             : err:
    4307           0 :         if (sqd) {
    4308           0 :                 mutex_unlock(&sqd->lock);
    4309           0 :                 io_put_sq_data(sqd);
    4310             :         }
    4311             :         return ret;
    4312             : }
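
The two counts map to the bounded and unbounded io-wq pools, a zero entry leaves the current limit unchanged, and the previous limits are copied back on return. A sketch, with cap_iowq_workers() an illustrative helper and the 4/16 limits arbitrary; ring_fd is assumed to be an existing ring.

#include <linux/io_uring.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Cap bounded (file I/O) and unbounded (e.g. socket) io-wq workers. */
static int cap_iowq_workers(int ring_fd)
{
        __u32 new_count[2] = { 4, 16 };
        int ret;

        ret = syscall(__NR_io_uring_register, ring_fd,
                      IORING_REGISTER_IOWQ_MAX_WORKERS, new_count, 2);
        if (!ret)
                printf("previous limits: bounded %u, unbounded %u\n",
                       new_count[0], new_count[1]);
        return ret;
}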
    4313             : 
    4314           0 : static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
    4315             :                                void __user *arg, unsigned nr_args)
    4316             :         __releases(ctx->uring_lock)
    4317             :         __acquires(ctx->uring_lock)
    4318             : {
    4319             :         int ret;
    4320             : 
    4321             :         /*
    4322             :          * We don't quiesce the refs for register anymore and so it can't be
    4323             :          * dying as we're holding a file ref here.
    4324             :          */
    4325           0 :         if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
    4326             :                 return -ENXIO;
    4327             : 
    4328           0 :         if (ctx->submitter_task && ctx->submitter_task != current)
    4329             :                 return -EEXIST;
    4330             : 
    4331           0 :         if (ctx->restricted) {
    4332           0 :                 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
    4333           0 :                 if (!test_bit(opcode, ctx->restrictions.register_op))
    4334             :                         return -EACCES;
    4335             :         }
    4336             : 
    4337           0 :         switch (opcode) {
    4338             :         case IORING_REGISTER_BUFFERS:
    4339           0 :                 ret = -EFAULT;
    4340           0 :                 if (!arg)
    4341             :                         break;
    4342           0 :                 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
    4343           0 :                 break;
    4344             :         case IORING_UNREGISTER_BUFFERS:
    4345           0 :                 ret = -EINVAL;
    4346           0 :                 if (arg || nr_args)
    4347             :                         break;
    4348           0 :                 ret = io_sqe_buffers_unregister(ctx);
    4349           0 :                 break;
    4350             :         case IORING_REGISTER_FILES:
    4351           0 :                 ret = -EFAULT;
    4352           0 :                 if (!arg)
    4353             :                         break;
    4354           0 :                 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
    4355           0 :                 break;
    4356             :         case IORING_UNREGISTER_FILES:
    4357           0 :                 ret = -EINVAL;
    4358           0 :                 if (arg || nr_args)
    4359             :                         break;
    4360           0 :                 ret = io_sqe_files_unregister(ctx);
    4361           0 :                 break;
    4362             :         case IORING_REGISTER_FILES_UPDATE:
    4363           0 :                 ret = io_register_files_update(ctx, arg, nr_args);
    4364           0 :                 break;
    4365             :         case IORING_REGISTER_EVENTFD:
    4366           0 :                 ret = -EINVAL;
    4367           0 :                 if (nr_args != 1)
    4368             :                         break;
    4369           0 :                 ret = io_eventfd_register(ctx, arg, 0);
    4370           0 :                 break;
    4371             :         case IORING_REGISTER_EVENTFD_ASYNC:
    4372           0 :                 ret = -EINVAL;
    4373           0 :                 if (nr_args != 1)
    4374             :                         break;
    4375           0 :                 ret = io_eventfd_register(ctx, arg, 1);
    4376           0 :                 break;
    4377             :         case IORING_UNREGISTER_EVENTFD:
    4378           0 :                 ret = -EINVAL;
    4379           0 :                 if (arg || nr_args)
    4380             :                         break;
    4381           0 :                 ret = io_eventfd_unregister(ctx);
    4382           0 :                 break;
    4383             :         case IORING_REGISTER_PROBE:
    4384           0 :                 ret = -EINVAL;
    4385           0 :                 if (!arg || nr_args > 256)
    4386             :                         break;
    4387           0 :                 ret = io_probe(ctx, arg, nr_args);
    4388           0 :                 break;
    4389             :         case IORING_REGISTER_PERSONALITY:
    4390           0 :                 ret = -EINVAL;
    4391           0 :                 if (arg || nr_args)
    4392             :                         break;
    4393           0 :                 ret = io_register_personality(ctx);
    4394           0 :                 break;
    4395             :         case IORING_UNREGISTER_PERSONALITY:
    4396           0 :                 ret = -EINVAL;
    4397           0 :                 if (arg)
    4398             :                         break;
    4399           0 :                 ret = io_unregister_personality(ctx, nr_args);
    4400           0 :                 break;
    4401             :         case IORING_REGISTER_ENABLE_RINGS:
    4402           0 :                 ret = -EINVAL;
    4403           0 :                 if (arg || nr_args)
    4404             :                         break;
    4405           0 :                 ret = io_register_enable_rings(ctx);
    4406           0 :                 break;
    4407             :         case IORING_REGISTER_RESTRICTIONS:
    4408           0 :                 ret = io_register_restrictions(ctx, arg, nr_args);
    4409           0 :                 break;
    4410             :         case IORING_REGISTER_FILES2:
    4411           0 :                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
    4412           0 :                 break;
    4413             :         case IORING_REGISTER_FILES_UPDATE2:
    4414           0 :                 ret = io_register_rsrc_update(ctx, arg, nr_args,
    4415             :                                               IORING_RSRC_FILE);
    4416           0 :                 break;
    4417             :         case IORING_REGISTER_BUFFERS2:
    4418           0 :                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
    4419           0 :                 break;
    4420             :         case IORING_REGISTER_BUFFERS_UPDATE:
    4421           0 :                 ret = io_register_rsrc_update(ctx, arg, nr_args,
    4422             :                                               IORING_RSRC_BUFFER);
    4423           0 :                 break;
    4424             :         case IORING_REGISTER_IOWQ_AFF:
    4425           0 :                 ret = -EINVAL;
    4426           0 :                 if (!arg || !nr_args)
    4427             :                         break;
    4428           0 :                 ret = io_register_iowq_aff(ctx, arg, nr_args);
    4429           0 :                 break;
    4430             :         case IORING_UNREGISTER_IOWQ_AFF:
    4431           0 :                 ret = -EINVAL;
    4432           0 :                 if (arg || nr_args)
    4433             :                         break;
    4434           0 :                 ret = io_unregister_iowq_aff(ctx);
    4435           0 :                 break;
    4436             :         case IORING_REGISTER_IOWQ_MAX_WORKERS:
    4437           0 :                 ret = -EINVAL;
    4438           0 :                 if (!arg || nr_args != 2)
    4439             :                         break;
    4440           0 :                 ret = io_register_iowq_max_workers(ctx, arg);
    4441           0 :                 break;
    4442             :         case IORING_REGISTER_RING_FDS:
    4443           0 :                 ret = io_ringfd_register(ctx, arg, nr_args);
    4444           0 :                 break;
    4445             :         case IORING_UNREGISTER_RING_FDS:
    4446           0 :                 ret = io_ringfd_unregister(ctx, arg, nr_args);
    4447           0 :                 break;
    4448             :         case IORING_REGISTER_PBUF_RING:
    4449           0 :                 ret = -EINVAL;
    4450           0 :                 if (!arg || nr_args != 1)
    4451             :                         break;
    4452           0 :                 ret = io_register_pbuf_ring(ctx, arg);
    4453           0 :                 break;
    4454             :         case IORING_UNREGISTER_PBUF_RING:
    4455           0 :                 ret = -EINVAL;
    4456           0 :                 if (!arg || nr_args != 1)
    4457             :                         break;
    4458           0 :                 ret = io_unregister_pbuf_ring(ctx, arg);
    4459           0 :                 break;
    4460             :         case IORING_REGISTER_SYNC_CANCEL:
    4461           0 :                 ret = -EINVAL;
    4462           0 :                 if (!arg || nr_args != 1)
    4463             :                         break;
    4464           0 :                 ret = io_sync_cancel(ctx, arg);
    4465           0 :                 break;
    4466             :         case IORING_REGISTER_FILE_ALLOC_RANGE:
    4467           0 :                 ret = -EINVAL;
    4468           0 :                 if (!arg || nr_args)
    4469             :                         break;
    4470           0 :                 ret = io_register_file_alloc_range(ctx, arg);
    4471           0 :                 break;
    4472             :         default:
    4473             :                 ret = -EINVAL;
    4474             :                 break;
    4475             :         }
    4476             : 
    4477             :         return ret;
    4478             : }
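
As one example of the opcodes dispatched above, IORING_REGISTER_PBUF_RING registers a provided-buffer ring described by struct io_uring_buf_reg. A sketch, assuming ring_fd is an existing ring, entries is a power of two, and the application later fills the mapped area with io_uring_buf entries before use.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register a provided-buffer ring for buffer group bgid; returns its base. */
static void *register_pbuf_ring(int ring_fd, unsigned entries, unsigned short bgid)
{
        struct io_uring_buf_reg reg;
        size_t ring_sz = entries * sizeof(struct io_uring_buf);
        void *ring;

        ring = mmap(NULL, ring_sz, PROT_READ | PROT_WRITE,
                    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        if (ring == MAP_FAILED)
                return NULL;

        memset(&reg, 0, sizeof(reg));   /* reserved fields must be zero */
        reg.ring_addr = (unsigned long)ring;
        reg.ring_entries = entries;
        reg.bgid = bgid;

        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_PBUF_RING, &reg, 1) < 0) {
                munmap(ring, ring_sz);
                return NULL;
        }
        return ring;
}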
    4479             : 
    4480           0 : SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
    4481             :                 void __user *, arg, unsigned int, nr_args)
    4482             : {
    4483             :         struct io_ring_ctx *ctx;
    4484           0 :         long ret = -EBADF;
    4485             :         struct fd f;
    4486             :         bool use_registered_ring;
    4487             : 
    4488           0 :         use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
    4489           0 :         opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
    4490             : 
    4491           0 :         if (opcode >= IORING_REGISTER_LAST)
    4492             :                 return -EINVAL;
    4493             : 
    4494           0 :         if (use_registered_ring) {
    4495             :                 /*
    4496             :                  * Ring fd has been registered via IORING_REGISTER_RING_FDS; we
    4497             :                  * need only dereference our task-private array to find it.
    4498             :                  */
    4499           0 :                 struct io_uring_task *tctx = current->io_uring;
    4500             : 
    4501           0 :                 if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
    4502             :                         return -EINVAL;
    4503           0 :                 fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
    4504           0 :                 f.file = tctx->registered_rings[fd];
    4505           0 :                 f.flags = 0;
    4506           0 :                 if (unlikely(!f.file))
    4507             :                         return -EBADF;
    4508             :         } else {
    4509           0 :                 f = fdget(fd);
    4510           0 :                 if (unlikely(!f.file))
    4511             :                         return -EBADF;
    4512           0 :                 ret = -EOPNOTSUPP;
    4513           0 :                 if (!io_is_uring_fops(f.file))
    4514             :                         goto out_fput;
    4515             :         }
    4516             : 
    4517           0 :         ctx = f.file->private_data;
    4518             : 
    4519           0 :         mutex_lock(&ctx->uring_lock);
    4520           0 :         ret = __io_uring_register(ctx, opcode, arg, nr_args);
    4521           0 :         mutex_unlock(&ctx->uring_lock);
    4522           0 :         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
    4523             : out_fput:
    4524           0 :         fdput(f);
    4525             :         return ret;
    4526             : }
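
The fdget()/fdput() on every call is what IORING_REGISTER_RING_FDS and the IORING_REGISTER_USE_REGISTERED_RING opcode bit let frequent callers skip. A sketch of registering the ring fd itself, assuming a uapi header new enough to define that flag; register_ring_fd() is an illustrative helper.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register ring_fd into the task-private table; returns the index or -1. */
static int register_ring_fd(int ring_fd)
{
        struct io_uring_rsrc_update upd;

        memset(&upd, 0, sizeof(upd));
        upd.offset = -1U;       /* let the kernel pick a free slot */
        upd.data = ring_fd;

        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_RING_FDS, &upd, 1) != 1)
                return -1;

        /*
         * Later calls pass the returned index as "fd" and OR
         * IORING_REGISTER_USE_REGISTERED_RING into the opcode, taking the
         * registered-ring branch above instead of fdget().
         */
        return (int)upd.offset;
}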
    4527             : 
    4528           1 : static int __init io_uring_init(void)
    4529             : {
    4530             : #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
    4531             :         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
    4532             :         BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
    4533             : } while (0)
    4534             : 
    4535             : #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
    4536             :         __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
    4537             : #define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
    4538             :         __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)
    4539             :         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
    4540             :         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
    4541             :         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
    4542             :         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
    4543             :         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
    4544             :         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
    4545             :         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
    4546             :         BUILD_BUG_SQE_ELEM(8,  __u32,  cmd_op);
    4547             :         BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
    4548             :         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
    4549             :         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
    4550             :         BUILD_BUG_SQE_ELEM(24, __u32,  len);
    4551             :         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
    4552             :         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
    4553             :         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
    4554             :         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
    4555             :         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
    4556             :         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
    4557             :         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
    4558             :         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
    4559             :         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
    4560             :         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
    4561             :         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
    4562             :         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
    4563             :         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
    4564             :         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
    4565             :         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
    4566             :         BUILD_BUG_SQE_ELEM(28, __u32,  rename_flags);
    4567             :         BUILD_BUG_SQE_ELEM(28, __u32,  unlink_flags);
    4568             :         BUILD_BUG_SQE_ELEM(28, __u32,  hardlink_flags);
    4569             :         BUILD_BUG_SQE_ELEM(28, __u32,  xattr_flags);
    4570             :         BUILD_BUG_SQE_ELEM(28, __u32,  msg_ring_flags);
    4571             :         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
    4572             :         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
    4573             :         BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
    4574             :         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
    4575             :         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
    4576             :         BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
    4577             :         BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
    4578             :         BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
    4579             :         BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
    4580             :         BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
    4581             :         BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
    4582             : 
    4583             :         BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
    4584             :                      sizeof(struct io_uring_rsrc_update));
    4585             :         BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
    4586             :                      sizeof(struct io_uring_rsrc_update2));
    4587             : 
    4588             :         /* ->buf_index is u16 */
    4589             :         BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
    4590             :         BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
    4591             :                      offsetof(struct io_uring_buf_ring, tail));
    4592             : 
    4593             :         /* should fit into one byte */
    4594             :         BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
    4595             :         BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
    4596             :         BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
    4597             : 
    4598             :         BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
    4599             : 
    4600             :         BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
    4601             : 
    4602           1 :         io_uring_optable_init();
    4603             : 
    4604           1 :         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
    4605             :                                 SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
    4606           1 :         return 0;
    4607             : };
    4608             : __initcall(io_uring_init);
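
The BUILD_BUG_ON() checks above pin the uapi layout; the same guarantees can be asserted from the userspace side of the ABI, for example with C11 static_assert. A mirror of a few of the checks, not an exhaustive list.

#include <assert.h>
#include <linux/io_uring.h>
#include <stddef.h>

/* Userspace view of the SQE layout the kernel asserts above. */
static_assert(sizeof(struct io_uring_sqe) == 64, "SQE is 64 bytes");
static_assert(offsetof(struct io_uring_sqe, opcode) == 0, "opcode first");
static_assert(offsetof(struct io_uring_sqe, user_data) == 32, "user_data at 32");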

Generated by: LCOV version 1.14