// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"

#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

#define BGID_ARRAY	64

struct io_provide_buf {
	struct file *file;
	__u64 addr;
	__u32 len;
	__u32 bgid;
	__u16 nbufs;
	__u16 bid;
};

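/*
 * Look up the buffer list for a buffer group ID: low group IDs live in the
 * flat ctx->io_bl array, anything else is tracked in the io_bl_xa xarray.
 */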
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							unsigned int bgid)
{
	if (ctx->io_bl && bgid < BGID_ARRAY)
		return &ctx->io_bl[bgid];

	return xa_load(&ctx->io_bl_xa, bgid);
}

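/*
 * Register a buffer list under its group ID. IDs below BGID_ARRAY are
 * already backed by the flat array, so only larger IDs get an xarray entry.
 */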
static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	bl->bgid = bgid;
	if (bgid < BGID_ARRAY)
		return 0;

	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

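/*
 * Put an unused legacy provided buffer back on its group's free list so a
 * later buffer select can hand it out again.
 */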
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	/*
	 * For legacy provided buffer mode, don't recycle if we already did
	 * IO to this buffer. For ring-mapped provided buffer mode, we should
	 * increment ring->head to explicitly monopolize the buffer to avoid
	 * multiple use.
	 */
	if (req->flags & REQ_F_PARTIAL_IO)
		return;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
}

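/*
 * Drop the buffer a request selected and return the CQE flags that tell
 * userspace which buffer ID was consumed.
 */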
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
	unsigned int cflags;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (req->flags & REQ_F_BUFFER_RING) {
		/* no buffers to recycle for this case */
		cflags = __io_put_kbuf_list(req, NULL);
	} else if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
	}
	return cflags;
}

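/*
 * Hand out the next buffer from a classic (linked-list) provided buffer
 * group, clamping *len to the size of the chosen buffer.
 */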
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len == 0 || *len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}
	return NULL;
}

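/*
 * Hand out the next buffer from a ring-mapped buffer group. The tail is
 * written by userspace and read here with acquire semantics; the kernel
 * owns and advances the head.
 */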
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct io_uring_buf *buf;
	__u16 head = bl->head;

	if (unlikely(smp_load_acquire(&br->tail) == head))
		return NULL;

	head &= bl->mask;
	if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
		buf = &br->bufs[head];
	} else {
		int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
		int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
		buf = page_address(bl->buf_pages[index]);
		buf += off;
	}
	if (*len == 0 || *len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = buf->bid;

	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes, coming in unlocked means we're being called from
		 * io-wq context and there may be further retries in async hybrid
		 * mode. For the locked case, the caller must call commit when
		 * the transfer completes (or if we get -EAGAIN and must poll or
		 * retry).
		 */
		req->buf_list = NULL;
		bl->head++;
	}
	return u64_to_user_ptr(buf->addr);
}

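/*
 * Select a buffer for a request from the group given by req->buf_index,
 * taking the ring-mapped path if the group was registered as a buffer ring.
 */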
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (likely(bl)) {
		if (bl->buf_nr_pages)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}

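/* Allocate the flat array of buffer lists used for group IDs below BGID_ARRAY. */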
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
	int i;

	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
				GFP_KERNEL);
	if (!ctx->io_bl)
		return -ENOMEM;

	for (i = 0; i < BGID_ARRAY; i++) {
		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
		ctx->io_bl[i].bgid = i;
	}

	return 0;
}

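/*
 * Remove up to @nbufs buffers from @bl and return how many were freed. For
 * a ring-mapped group this unpins the ring pages rather than walking a list.
 */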
static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	if (bl->buf_nr_pages) {
		int j;

		i = bl->buf_ring->tail - bl->head;
		for (j = 0; j < bl->buf_nr_pages; j++)
			unpin_user_page(bl->buf_pages[j]);
		kvfree(bl->buf_pages);
		bl->buf_pages = NULL;
		bl->buf_nr_pages = 0;
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
		return i;
	}

	/* the head kbuf is the list itself */
	while (!list_empty(&bl->buf_list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&nxt->list);
		if (++i == nbufs)
			return i;
		cond_resched();
	}
	i++;

	return i;
}

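/*
 * Release all provided buffer state owned by this ring, including the pages
 * backing the cached io_buffer structs.
 */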
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	unsigned long index;
	int i;

	for (i = 0; i < BGID_ARRAY; i++) {
		if (!ctx->io_bl)
			break;
		__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
	}

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		__io_remove_buffers(ctx, bl, -1U);
		kfree(bl);
	}

	while (!list_empty(&ctx->io_buffers_pages)) {
		struct page *page;

		page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
		list_del_init(&page->lru);
		__free_page(page);
	}
}

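/*
 * Prep for IORING_OP_REMOVE_BUFFERS: takes only a buffer count (in sqe->fd)
 * and a buffer group ID.
 */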
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

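/* Remove up to the requested number of buffers from a classic buffer group. */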
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!bl->buf_nr_pages)
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

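/*
 * Prep for IORING_OP_PROVIDE_BUFFERS: validate the buffer count, the user
 * address range, the group ID and the starting buffer ID from the SQE.
 */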
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
				&size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	if (tmp + p->nbufs >= USHRT_MAX)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}

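/*
 * Refill ctx->io_buffers_cache, either by splicing over buffers that were
 * returned on the completion side or by allocating a fresh page worth of
 * io_buffer structs.
 */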
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *buf;
	struct page *page;
	int bufs_in_page;

	/*
	 * Completions that don't happen inline (eg not under uring_lock) will
	 * add to ->io_buffers_comp. If we don't have any free buffers, check
	 * the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
					 &ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * page worth of buffer entries and add those to our freelist.
	 */
	page = alloc_page(GFP_KERNEL_ACCOUNT);
	if (!page)
		return -ENOMEM;

	list_add(&page->lru, &ctx->io_buffers_pages);

	buf = page_address(page);
	bufs_in_page = PAGE_SIZE / sizeof(*buf);
	while (bufs_in_page) {
		list_add_tail(&buf->list, &ctx->io_buffers_cache);
		buf++;
		bufs_in_page--;
	}

	return 0;
}

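/*
 * Carve pbuf->nbufs buffers out of the user-supplied address range and add
 * them to @bl, assigning consecutive buffer IDs starting at pbuf->bid.
 */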
static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
				       list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : -ENOMEM;
}

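/*
 * Add user-provided buffers to a classic buffer group, creating the group
 * on first use.
 */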
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
		ret = io_init_bl_list(ctx);
		if (ret)
			goto err;
	}

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			kfree(bl);
			goto err;
		}
	}
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->buf_nr_pages) {
		ret = -EINVAL;
		goto err;
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	io_ring_submit_unlock(ctx, issue_flags);

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

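/*
 * Register a ring-mapped provided buffer group (IORING_REGISTER_PBUF_RING).
 * The user memory must be page aligned and hold a power-of-2 number of
 * entries below 65536; its pages stay pinned until the ring is unregistered
 * or the ctx is torn down.
 */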
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	struct page **pages;
	int nr_pages;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (!reg.ring_addr)
		return -EFAULT;
	if (reg.ring_addr & ~PAGE_MASK)
		return -EINVAL;
	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
			return -EEXIST;
	} else {
		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl)
			return -ENOMEM;
	}

	pages = io_pin_pages(reg.ring_addr,
			     flex_array_size(br, bufs, reg.ring_entries),
			     &nr_pages);
	if (IS_ERR(pages)) {
		kfree(free_bl);
		return PTR_ERR(pages);
	}

	br = page_address(pages[0]);
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->nr_entries = reg.ring_entries;
	bl->buf_ring = br;
	bl->mask = reg.ring_entries - 1;
	io_buffer_add_list(ctx, bl, reg.bgid);
	return 0;
}

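/*
 * Unregister a ring-mapped buffer group (IORING_UNREGISTER_PBUF_RING) and
 * unpin its pages; classic provided buffer groups are rejected with -EINVAL.
 */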
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->buf_nr_pages)
		return -EINVAL;

	__io_remove_buffers(ctx, bl, -1U);
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
	}
	return 0;
}