Jens Axboe2b188cc2019-01-07 10:46:33 -07001// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side. When the application reads the CQ ring
8 * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
9 * the kernel uses after writing the tail. Failure to do so could cause a
 10 * delay in when the application notices that completion events are available.
11 * This isn't a fatal condition. Likewise, the application must use an
12 * appropriate smp_wmb() both before writing the SQ tail, and after writing
13 * the SQ tail. The first one orders the sqe writes with the tail write, and
14 * the latter is paired with the smp_rmb() the kernel will issue before
15 * reading the SQ tail on submission.
16 *
17 * Also see the examples in the liburing library:
18 *
19 * git://git.kernel.dk/liburing
20 *
21 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
22 * from data shared between the kernel and application. This is done both
 23 * for ordering purposes and to ensure that once a value is loaded from
24 * data that the application could potentially modify, it remains stable.
25 *
26 * Copyright (C) 2018-2019 Jens Axboe
Christoph Hellwigc992fe22019-01-11 09:43:02 -070027 * Copyright (c) 2018-2019 Christoph Hellwig
Jens Axboe2b188cc2019-01-07 10:46:33 -070028 */
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/errno.h>
32#include <linux/syscalls.h>
33#include <linux/compat.h>
34#include <linux/refcount.h>
35#include <linux/uio.h>
36
37#include <linux/sched/signal.h>
38#include <linux/fs.h>
39#include <linux/file.h>
40#include <linux/fdtable.h>
41#include <linux/mm.h>
42#include <linux/mman.h>
43#include <linux/mmu_context.h>
44#include <linux/percpu.h>
45#include <linux/slab.h>
46#include <linux/workqueue.h>
Jens Axboe6c271ce2019-01-10 11:22:30 -070047#include <linux/kthread.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070048#include <linux/blkdev.h>
Jens Axboeedafcce2019-01-09 09:16:05 -070049#include <linux/bvec.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070050#include <linux/net.h>
51#include <net/sock.h>
52#include <net/af_unix.h>
Jens Axboe6b063142019-01-10 22:13:58 -070053#include <net/scm.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070054#include <linux/anon_inodes.h>
55#include <linux/sched/mm.h>
56#include <linux/uaccess.h>
57#include <linux/nospec.h>
Jens Axboeedafcce2019-01-09 09:16:05 -070058#include <linux/sizes.h>
59#include <linux/hugetlb.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070060
61#include <uapi/linux/io_uring.h>
62
63#include "internal.h"
64
65#define IORING_MAX_ENTRIES 4096
Jens Axboe6b063142019-01-10 22:13:58 -070066#define IORING_MAX_FIXED_FILES 1024
Jens Axboe2b188cc2019-01-07 10:46:33 -070067
68struct io_uring {
69 u32 head ____cacheline_aligned_in_smp;
70 u32 tail ____cacheline_aligned_in_smp;
71};
72
73struct io_sq_ring {
74 struct io_uring r;
75 u32 ring_mask;
76 u32 ring_entries;
77 u32 dropped;
78 u32 flags;
79 u32 array[];
80};
81
82struct io_cq_ring {
83 struct io_uring r;
84 u32 ring_mask;
85 u32 ring_entries;
86 u32 overflow;
87 struct io_uring_cqe cqes[];
88};
89
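/*
 * Illustrative sketch (not part of this file, and not compiled here) of how
 * an application would drain the CQ ring above, following the barrier rules
 * from the comment at the top of this file. Userspace works on the mmap'ed
 * ring with its own barrier primitives; the kernel names and consume_cqe()
 * below are stand-ins:
 *
 *	unsigned head = cq_ring->r.head;		// only the app writes head
 *	unsigned tail = READ_ONCE(cq_ring->r.tail);
 *	smp_rmb();		// pairs with the kernel's smp_wmb() after the tail store
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cq_ring->cqes[head & cq_ring->ring_mask];
 *		consume_cqe(cqe);	// hypothetical application handler
 *		head++;
 *	}
 *	WRITE_ONCE(cq_ring->r.head, head);	// hand the consumed slots back to the kernel
 *
 * See the liburing examples referenced above for the real implementation.
 */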
Jens Axboeedafcce2019-01-09 09:16:05 -070090struct io_mapped_ubuf {
91 u64 ubuf;
92 size_t len;
93 struct bio_vec *bvec;
94 unsigned int nr_bvecs;
95};
96
Jens Axboe31b51512019-01-18 22:56:34 -070097struct async_list {
98 spinlock_t lock;
99 atomic_t cnt;
100 struct list_head list;
101
102 struct file *file;
103 off_t io_end;
104 size_t io_pages;
105};
106
Jens Axboe2b188cc2019-01-07 10:46:33 -0700107struct io_ring_ctx {
108 struct {
109 struct percpu_ref refs;
110 } ____cacheline_aligned_in_smp;
111
112 struct {
113 unsigned int flags;
114 bool compat;
115 bool account_mem;
116
117 /* SQ ring */
118 struct io_sq_ring *sq_ring;
119 unsigned cached_sq_head;
120 unsigned sq_entries;
121 unsigned sq_mask;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700122 unsigned sq_thread_idle;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700123 struct io_uring_sqe *sq_sqes;
124 } ____cacheline_aligned_in_smp;
125
126 /* IO offload */
127 struct workqueue_struct *sqo_wq;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700128 struct task_struct *sqo_thread; /* if using sq thread polling */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700129 struct mm_struct *sqo_mm;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700130 wait_queue_head_t sqo_wait;
131 unsigned sqo_stop;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700132
133 struct {
134 /* CQ ring */
135 struct io_cq_ring *cq_ring;
136 unsigned cached_cq_tail;
137 unsigned cq_entries;
138 unsigned cq_mask;
139 struct wait_queue_head cq_wait;
140 struct fasync_struct *cq_fasync;
141 } ____cacheline_aligned_in_smp;
142
Jens Axboe6b063142019-01-10 22:13:58 -0700143 /*
144 * If used, fixed file set. Writers must ensure that ->refs is dead,
145 * readers must ensure that ->refs is alive as long as the file* is
146 * used. Only updated through io_uring_register(2).
147 */
148 struct file **user_files;
149 unsigned nr_user_files;
150
Jens Axboeedafcce2019-01-09 09:16:05 -0700151 /* if used, fixed mapped user buffers */
152 unsigned nr_user_bufs;
153 struct io_mapped_ubuf *user_bufs;
154
Jens Axboe2b188cc2019-01-07 10:46:33 -0700155 struct user_struct *user;
156
157 struct completion ctx_done;
158
159 struct {
160 struct mutex uring_lock;
161 wait_queue_head_t wait;
162 } ____cacheline_aligned_in_smp;
163
164 struct {
165 spinlock_t completion_lock;
Jens Axboedef596e2019-01-09 08:59:42 -0700166 bool poll_multi_file;
167 /*
168 * ->poll_list is protected by the ctx->uring_lock for
169 * io_uring instances that don't use IORING_SETUP_SQPOLL.
170 * For SQPOLL, only the single threaded io_sq_thread() will
171 * manipulate the list, hence no extra locking is needed there.
172 */
173 struct list_head poll_list;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700174 struct list_head cancel_list;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700175 } ____cacheline_aligned_in_smp;
176
Jens Axboe31b51512019-01-18 22:56:34 -0700177 struct async_list pending_async[2];
178
Jens Axboe2b188cc2019-01-07 10:46:33 -0700179#if defined(CONFIG_UNIX)
180 struct socket *ring_sock;
181#endif
182};
183
184struct sqe_submit {
185 const struct io_uring_sqe *sqe;
186 unsigned short index;
187 bool has_user;
Jens Axboedef596e2019-01-09 08:59:42 -0700188 bool needs_lock;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700189 bool needs_fixed_file;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700190};
191
Jens Axboe221c5eb2019-01-17 09:41:58 -0700192struct io_poll_iocb {
193 struct file *file;
194 struct wait_queue_head *head;
195 __poll_t events;
196 bool woken;
197 bool canceled;
198 struct wait_queue_entry wait;
199};
200
Jens Axboe2b188cc2019-01-07 10:46:33 -0700201struct io_kiocb {
Jens Axboe221c5eb2019-01-17 09:41:58 -0700202 union {
203 struct kiocb rw;
204 struct io_poll_iocb poll;
205 };
Jens Axboe2b188cc2019-01-07 10:46:33 -0700206
207 struct sqe_submit submit;
208
209 struct io_ring_ctx *ctx;
210 struct list_head list;
211 unsigned int flags;
Jens Axboec16361c2019-01-17 08:39:48 -0700212 refcount_t refs;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700213#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
Jens Axboedef596e2019-01-09 08:59:42 -0700214#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
Jens Axboe6b063142019-01-10 22:13:58 -0700215#define REQ_F_FIXED_FILE 4 /* ctx owns file */
Jens Axboe31b51512019-01-18 22:56:34 -0700216#define REQ_F_SEQ_PREV 8 /* sequential with previous */
Jens Axboed530a402019-03-13 12:15:01 -0600217#define REQ_F_PREPPED 16 /* prep already done */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700218 u64 user_data;
Jens Axboedef596e2019-01-09 08:59:42 -0700219 u64 error;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700220
221 struct work_struct work;
222};
223
224#define IO_PLUG_THRESHOLD 2
Jens Axboedef596e2019-01-09 08:59:42 -0700225#define IO_IOPOLL_BATCH 8
Jens Axboe2b188cc2019-01-07 10:46:33 -0700226
Jens Axboe9a56a232019-01-09 09:06:50 -0700227struct io_submit_state {
228 struct blk_plug plug;
229
230 /*
Jens Axboe2579f912019-01-09 09:10:43 -0700231 * io_kiocb alloc cache
232 */
233 void *reqs[IO_IOPOLL_BATCH];
234 unsigned int free_reqs;
235 unsigned int cur_req;
236
237 /*
Jens Axboe9a56a232019-01-09 09:06:50 -0700238 * File reference cache
239 */
240 struct file *file;
241 unsigned int fd;
242 unsigned int has_refs;
243 unsigned int used_refs;
244 unsigned int ios_left;
245};
246
Jens Axboe2b188cc2019-01-07 10:46:33 -0700247static struct kmem_cache *req_cachep;
248
249static const struct file_operations io_uring_fops;
250
251struct sock *io_uring_get_socket(struct file *file)
252{
253#if defined(CONFIG_UNIX)
254 if (file->f_op == &io_uring_fops) {
255 struct io_ring_ctx *ctx = file->private_data;
256
257 return ctx->ring_sock->sk;
258 }
259#endif
260 return NULL;
261}
262EXPORT_SYMBOL(io_uring_get_socket);
263
264static void io_ring_ctx_ref_free(struct percpu_ref *ref)
265{
266 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
267
268 complete(&ctx->ctx_done);
269}
270
271static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
272{
273 struct io_ring_ctx *ctx;
Jens Axboe31b51512019-01-18 22:56:34 -0700274 int i;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700275
276 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
277 if (!ctx)
278 return NULL;
279
280 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
281 kfree(ctx);
282 return NULL;
283 }
284
285 ctx->flags = p->flags;
286 init_waitqueue_head(&ctx->cq_wait);
287 init_completion(&ctx->ctx_done);
288 mutex_init(&ctx->uring_lock);
289 init_waitqueue_head(&ctx->wait);
Jens Axboe31b51512019-01-18 22:56:34 -0700290 for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
291 spin_lock_init(&ctx->pending_async[i].lock);
292 INIT_LIST_HEAD(&ctx->pending_async[i].list);
293 atomic_set(&ctx->pending_async[i].cnt, 0);
294 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700295 spin_lock_init(&ctx->completion_lock);
Jens Axboedef596e2019-01-09 08:59:42 -0700296 INIT_LIST_HEAD(&ctx->poll_list);
Jens Axboe221c5eb2019-01-17 09:41:58 -0700297 INIT_LIST_HEAD(&ctx->cancel_list);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700298 return ctx;
299}
300
301static void io_commit_cqring(struct io_ring_ctx *ctx)
302{
303 struct io_cq_ring *ring = ctx->cq_ring;
304
305 if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
306 /* order cqe stores with ring update */
307 smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
308
309 /*
 310 * Write side barrier of tail update, app has read side. See
311 * comment at the top of this file.
312 */
313 smp_wmb();
314
315 if (wq_has_sleeper(&ctx->cq_wait)) {
316 wake_up_interruptible(&ctx->cq_wait);
317 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
318 }
319 }
320}
321
322static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
323{
324 struct io_cq_ring *ring = ctx->cq_ring;
325 unsigned tail;
326
327 tail = ctx->cached_cq_tail;
328 /* See comment at the top of the file */
329 smp_rmb();
330 if (tail + 1 == READ_ONCE(ring->r.head))
331 return NULL;
332
333 ctx->cached_cq_tail++;
334 return &ring->cqes[tail & ctx->cq_mask];
335}
336
337static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
338 long res, unsigned ev_flags)
339{
340 struct io_uring_cqe *cqe;
341
342 /*
343 * If we can't get a cq entry, userspace overflowed the
344 * submission (by quite a lot). Increment the overflow count in
345 * the ring.
346 */
347 cqe = io_get_cqring(ctx);
348 if (cqe) {
349 WRITE_ONCE(cqe->user_data, ki_user_data);
350 WRITE_ONCE(cqe->res, res);
351 WRITE_ONCE(cqe->flags, ev_flags);
352 } else {
353 unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
354
355 WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
356 }
357}
358
359static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
360 long res, unsigned ev_flags)
361{
362 unsigned long flags;
363
364 spin_lock_irqsave(&ctx->completion_lock, flags);
365 io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
366 io_commit_cqring(ctx);
367 spin_unlock_irqrestore(&ctx->completion_lock, flags);
368
369 if (waitqueue_active(&ctx->wait))
370 wake_up(&ctx->wait);
Jens Axboe6c271ce2019-01-10 11:22:30 -0700371 if (waitqueue_active(&ctx->sqo_wait))
372 wake_up(&ctx->sqo_wait);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700373}
374
375static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
376{
377 percpu_ref_put_many(&ctx->refs, refs);
378
379 if (waitqueue_active(&ctx->wait))
380 wake_up(&ctx->wait);
381}
382
Jens Axboe2579f912019-01-09 09:10:43 -0700383static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
384 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700385{
386 struct io_kiocb *req;
387
388 if (!percpu_ref_tryget(&ctx->refs))
389 return NULL;
390
Jens Axboe2579f912019-01-09 09:10:43 -0700391 if (!state) {
392 req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
393 if (unlikely(!req))
394 goto out;
395 } else if (!state->free_reqs) {
396 size_t sz;
397 int ret;
398
399 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
400 ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
401 state->reqs);
402 if (unlikely(ret <= 0))
403 goto out;
404 state->free_reqs = ret - 1;
405 state->cur_req = 1;
406 req = state->reqs[0];
407 } else {
408 req = state->reqs[state->cur_req];
409 state->free_reqs--;
410 state->cur_req++;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700411 }
412
Jens Axboe2579f912019-01-09 09:10:43 -0700413 req->ctx = ctx;
414 req->flags = 0;
Jens Axboee65ef562019-03-12 10:16:44 -0600415 /* one is dropped after submission, the other at completion */
416 refcount_set(&req->refs, 2);
Jens Axboe2579f912019-01-09 09:10:43 -0700417 return req;
418out:
Jens Axboe2b188cc2019-01-07 10:46:33 -0700419 io_ring_drop_ctx_refs(ctx, 1);
420 return NULL;
421}
422
Jens Axboedef596e2019-01-09 08:59:42 -0700423static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
424{
425 if (*nr) {
426 kmem_cache_free_bulk(req_cachep, *nr, reqs);
427 io_ring_drop_ctx_refs(ctx, *nr);
428 *nr = 0;
429 }
430}
431
Jens Axboe2b188cc2019-01-07 10:46:33 -0700432static void io_free_req(struct io_kiocb *req)
433{
Jens Axboee65ef562019-03-12 10:16:44 -0600434 io_ring_drop_ctx_refs(req->ctx, 1);
435 kmem_cache_free(req_cachep, req);
436}
437
438static void io_put_req(struct io_kiocb *req)
439{
440 if (refcount_dec_and_test(&req->refs))
441 io_free_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700442}
443
Jens Axboedef596e2019-01-09 08:59:42 -0700444/*
445 * Find and free completed poll iocbs
446 */
447static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
448 struct list_head *done)
449{
450 void *reqs[IO_IOPOLL_BATCH];
Jens Axboe9a56a232019-01-09 09:06:50 -0700451 int file_count, to_free;
452 struct file *file = NULL;
Jens Axboedef596e2019-01-09 08:59:42 -0700453 struct io_kiocb *req;
Jens Axboedef596e2019-01-09 08:59:42 -0700454
Jens Axboe9a56a232019-01-09 09:06:50 -0700455 file_count = to_free = 0;
Jens Axboedef596e2019-01-09 08:59:42 -0700456 while (!list_empty(done)) {
457 req = list_first_entry(done, struct io_kiocb, list);
458 list_del(&req->list);
459
460 io_cqring_fill_event(ctx, req->user_data, req->error, 0);
461
Jens Axboee65ef562019-03-12 10:16:44 -0600462 if (refcount_dec_and_test(&req->refs))
463 reqs[to_free++] = req;
Jens Axboedef596e2019-01-09 08:59:42 -0700464 (*nr_events)++;
465
Jens Axboe9a56a232019-01-09 09:06:50 -0700466 /*
467 * Batched puts of the same file, to avoid dirtying the
468 * file usage count multiple times, if avoidable.
469 */
Jens Axboe6b063142019-01-10 22:13:58 -0700470 if (!(req->flags & REQ_F_FIXED_FILE)) {
471 if (!file) {
472 file = req->rw.ki_filp;
473 file_count = 1;
474 } else if (file == req->rw.ki_filp) {
475 file_count++;
476 } else {
477 fput_many(file, file_count);
478 file = req->rw.ki_filp;
479 file_count = 1;
480 }
Jens Axboe9a56a232019-01-09 09:06:50 -0700481 }
482
Jens Axboedef596e2019-01-09 08:59:42 -0700483 if (to_free == ARRAY_SIZE(reqs))
484 io_free_req_many(ctx, reqs, &to_free);
485 }
486 io_commit_cqring(ctx);
487
Jens Axboe9a56a232019-01-09 09:06:50 -0700488 if (file)
489 fput_many(file, file_count);
Jens Axboedef596e2019-01-09 08:59:42 -0700490 io_free_req_many(ctx, reqs, &to_free);
491}
492
493static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
494 long min)
495{
496 struct io_kiocb *req, *tmp;
497 LIST_HEAD(done);
498 bool spin;
499 int ret;
500
501 /*
502 * Only spin for completions if we don't have multiple devices hanging
503 * off our complete list, and we're under the requested amount.
504 */
505 spin = !ctx->poll_multi_file && *nr_events < min;
506
507 ret = 0;
508 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
509 struct kiocb *kiocb = &req->rw;
510
511 /*
512 * Move completed entries to our local list. If we find a
513 * request that requires polling, break out and complete
514 * the done list first, if we have entries there.
515 */
516 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
517 list_move_tail(&req->list, &done);
518 continue;
519 }
520 if (!list_empty(&done))
521 break;
522
523 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
524 if (ret < 0)
525 break;
526
527 if (ret && spin)
528 spin = false;
529 ret = 0;
530 }
531
532 if (!list_empty(&done))
533 io_iopoll_complete(ctx, nr_events, &done);
534
535 return ret;
536}
537
538/*
 539 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
540 * non-spinning poll check - we'll still enter the driver poll loop, but only
541 * as a non-spinning completion check.
542 */
543static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
544 long min)
545{
546 while (!list_empty(&ctx->poll_list)) {
547 int ret;
548
549 ret = io_do_iopoll(ctx, nr_events, min);
550 if (ret < 0)
551 return ret;
552 if (!min || *nr_events >= min)
553 return 0;
554 }
555
556 return 1;
557}
558
559/*
560 * We can't just wait for polled events to come to us, we have to actively
561 * find and complete them.
562 */
563static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
564{
565 if (!(ctx->flags & IORING_SETUP_IOPOLL))
566 return;
567
568 mutex_lock(&ctx->uring_lock);
569 while (!list_empty(&ctx->poll_list)) {
570 unsigned int nr_events = 0;
571
572 io_iopoll_getevents(ctx, &nr_events, 1);
573 }
574 mutex_unlock(&ctx->uring_lock);
575}
576
577static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
578 long min)
579{
580 int ret = 0;
581
582 do {
583 int tmin = 0;
584
585 if (*nr_events < min)
586 tmin = min - *nr_events;
587
588 ret = io_iopoll_getevents(ctx, nr_events, tmin);
589 if (ret <= 0)
590 break;
591 ret = 0;
592 } while (min && !*nr_events && !need_resched());
593
594 return ret;
595}
596
Jens Axboe2b188cc2019-01-07 10:46:33 -0700597static void kiocb_end_write(struct kiocb *kiocb)
598{
599 if (kiocb->ki_flags & IOCB_WRITE) {
600 struct inode *inode = file_inode(kiocb->ki_filp);
601
602 /*
603 * Tell lockdep we inherited freeze protection from submission
604 * thread.
605 */
606 if (S_ISREG(inode->i_mode))
607 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
608 file_end_write(kiocb->ki_filp);
609 }
610}
611
Jens Axboe6b063142019-01-10 22:13:58 -0700612static void io_fput(struct io_kiocb *req)
613{
614 if (!(req->flags & REQ_F_FIXED_FILE))
615 fput(req->rw.ki_filp);
616}
617
Jens Axboe2b188cc2019-01-07 10:46:33 -0700618static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
619{
620 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
621
622 kiocb_end_write(kiocb);
623
Jens Axboe6b063142019-01-10 22:13:58 -0700624 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700625 io_cqring_add_event(req->ctx, req->user_data, res, 0);
Jens Axboee65ef562019-03-12 10:16:44 -0600626 io_put_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700627}
628
Jens Axboedef596e2019-01-09 08:59:42 -0700629static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
630{
631 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
632
633 kiocb_end_write(kiocb);
634
635 req->error = res;
636 if (res != -EAGAIN)
637 req->flags |= REQ_F_IOPOLL_COMPLETED;
638}
639
640/*
641 * After the iocb has been issued, it's safe to be found on the poll list.
642 * Adding the kiocb to the list AFTER submission ensures that we don't
 643 * find it from an io_iopoll_getevents() thread before the issuer is done
644 * accessing the kiocb cookie.
645 */
646static void io_iopoll_req_issued(struct io_kiocb *req)
647{
648 struct io_ring_ctx *ctx = req->ctx;
649
650 /*
651 * Track whether we have multiple files in our lists. This will impact
652 * how we do polling eventually, not spinning if we're on potentially
653 * different devices.
654 */
655 if (list_empty(&ctx->poll_list)) {
656 ctx->poll_multi_file = false;
657 } else if (!ctx->poll_multi_file) {
658 struct io_kiocb *list_req;
659
660 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
661 list);
662 if (list_req->rw.ki_filp != req->rw.ki_filp)
663 ctx->poll_multi_file = true;
664 }
665
666 /*
667 * For fast devices, IO may have already completed. If it has, add
668 * it to the front so we find it first.
669 */
670 if (req->flags & REQ_F_IOPOLL_COMPLETED)
671 list_add(&req->list, &ctx->poll_list);
672 else
673 list_add_tail(&req->list, &ctx->poll_list);
674}
675
Jens Axboe9a56a232019-01-09 09:06:50 -0700676static void io_file_put(struct io_submit_state *state, struct file *file)
677{
678 if (!state) {
679 fput(file);
680 } else if (state->file) {
681 int diff = state->has_refs - state->used_refs;
682
683 if (diff)
684 fput_many(state->file, diff);
685 state->file = NULL;
686 }
687}
688
689/*
690 * Get as many references to a file as we have IOs left in this submission,
691 * assuming most submissions are for one file, or at least that each file
692 * has more than one submission.
693 */
694static struct file *io_file_get(struct io_submit_state *state, int fd)
695{
696 if (!state)
697 return fget(fd);
698
699 if (state->file) {
700 if (state->fd == fd) {
701 state->used_refs++;
702 state->ios_left--;
703 return state->file;
704 }
705 io_file_put(state, NULL);
706 }
707 state->file = fget_many(fd, state->ios_left);
708 if (!state->file)
709 return NULL;
710
711 state->fd = fd;
712 state->has_refs = state->ios_left;
713 state->used_refs = 1;
714 state->ios_left--;
715 return state->file;
716}
717
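/*
 * Illustrative example of the saving from the reference cache above, assuming
 * a batch of 8 sqes that all target the same fd: the first io_file_get() call
 * does one fget_many(fd, 8), the next seven only bump used_refs, and
 * io_file_put() at the end of the batch hands back any unused references with
 * a single fput_many(). Without the cache this would be eight separate
 * fget()/fput() round trips on the file's reference count.
 */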
Jens Axboe2b188cc2019-01-07 10:46:33 -0700718/*
719 * If we tracked the file through the SCM inflight mechanism, we could support
720 * any file. For now, just ensure that anything potentially problematic is done
721 * inline.
722 */
723static bool io_file_supports_async(struct file *file)
724{
725 umode_t mode = file_inode(file)->i_mode;
726
727 if (S_ISBLK(mode) || S_ISCHR(mode))
728 return true;
729 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
730 return true;
731
732 return false;
733}
734
Jens Axboe6c271ce2019-01-10 11:22:30 -0700735static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
Jens Axboe9a56a232019-01-09 09:06:50 -0700736 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700737{
Jens Axboe6c271ce2019-01-10 11:22:30 -0700738 const struct io_uring_sqe *sqe = s->sqe;
Jens Axboedef596e2019-01-09 08:59:42 -0700739 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700740 struct kiocb *kiocb = &req->rw;
Jens Axboe6b063142019-01-10 22:13:58 -0700741 unsigned ioprio, flags;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700742 int fd, ret;
743
744 /* For -EAGAIN retry, everything is already prepped */
Jens Axboed530a402019-03-13 12:15:01 -0600745 if (req->flags & REQ_F_PREPPED)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700746 return 0;
747
Jens Axboe6b063142019-01-10 22:13:58 -0700748 flags = READ_ONCE(sqe->flags);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700749 fd = READ_ONCE(sqe->fd);
Jens Axboe6b063142019-01-10 22:13:58 -0700750
751 if (flags & IOSQE_FIXED_FILE) {
752 if (unlikely(!ctx->user_files ||
753 (unsigned) fd >= ctx->nr_user_files))
754 return -EBADF;
755 kiocb->ki_filp = ctx->user_files[fd];
756 req->flags |= REQ_F_FIXED_FILE;
757 } else {
Jens Axboe6c271ce2019-01-10 11:22:30 -0700758 if (s->needs_fixed_file)
759 return -EBADF;
Jens Axboe6b063142019-01-10 22:13:58 -0700760 kiocb->ki_filp = io_file_get(state, fd);
761 if (unlikely(!kiocb->ki_filp))
762 return -EBADF;
763 if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
764 force_nonblock = false;
765 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700766 kiocb->ki_pos = READ_ONCE(sqe->off);
767 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
768 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
769
770 ioprio = READ_ONCE(sqe->ioprio);
771 if (ioprio) {
772 ret = ioprio_check_cap(ioprio);
773 if (ret)
774 goto out_fput;
775
776 kiocb->ki_ioprio = ioprio;
777 } else
778 kiocb->ki_ioprio = get_current_ioprio();
779
780 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
781 if (unlikely(ret))
782 goto out_fput;
783 if (force_nonblock) {
784 kiocb->ki_flags |= IOCB_NOWAIT;
785 req->flags |= REQ_F_FORCE_NONBLOCK;
786 }
Jens Axboedef596e2019-01-09 08:59:42 -0700787 if (ctx->flags & IORING_SETUP_IOPOLL) {
788 ret = -EOPNOTSUPP;
789 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
790 !kiocb->ki_filp->f_op->iopoll)
791 goto out_fput;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700792
Jens Axboedef596e2019-01-09 08:59:42 -0700793 req->error = 0;
794 kiocb->ki_flags |= IOCB_HIPRI;
795 kiocb->ki_complete = io_complete_rw_iopoll;
796 } else {
797 if (kiocb->ki_flags & IOCB_HIPRI) {
798 ret = -EINVAL;
799 goto out_fput;
800 }
801 kiocb->ki_complete = io_complete_rw;
802 }
Jens Axboed530a402019-03-13 12:15:01 -0600803 req->flags |= REQ_F_PREPPED;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700804 return 0;
805out_fput:
Jens Axboe6b063142019-01-10 22:13:58 -0700806 if (!(flags & IOSQE_FIXED_FILE)) {
807 /*
808 * in case of error, we didn't use this file reference. drop it.
809 */
810 if (state)
811 state->used_refs--;
812 io_file_put(state, kiocb->ki_filp);
813 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700814 return ret;
815}
816
817static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
818{
819 switch (ret) {
820 case -EIOCBQUEUED:
821 break;
822 case -ERESTARTSYS:
823 case -ERESTARTNOINTR:
824 case -ERESTARTNOHAND:
825 case -ERESTART_RESTARTBLOCK:
826 /*
827 * We can't just restart the syscall, since previously
828 * submitted sqes may already be in progress. Just fail this
829 * IO with EINTR.
830 */
831 ret = -EINTR;
832 /* fall through */
833 default:
834 kiocb->ki_complete(kiocb, ret, 0);
835 }
836}
837
Jens Axboeedafcce2019-01-09 09:16:05 -0700838static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
839 const struct io_uring_sqe *sqe,
840 struct iov_iter *iter)
841{
842 size_t len = READ_ONCE(sqe->len);
843 struct io_mapped_ubuf *imu;
844 unsigned index, buf_index;
845 size_t offset;
846 u64 buf_addr;
847
848 /* attempt to use fixed buffers without having provided iovecs */
849 if (unlikely(!ctx->user_bufs))
850 return -EFAULT;
851
852 buf_index = READ_ONCE(sqe->buf_index);
853 if (unlikely(buf_index >= ctx->nr_user_bufs))
854 return -EFAULT;
855
856 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
857 imu = &ctx->user_bufs[index];
858 buf_addr = READ_ONCE(sqe->addr);
859
860 /* overflow */
861 if (buf_addr + len < buf_addr)
862 return -EFAULT;
863 /* not inside the mapped region */
864 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
865 return -EFAULT;
866
867 /*
868 * May not be a start of buffer, set size appropriately
869 * and advance us to the beginning.
870 */
871 offset = buf_addr - imu->ubuf;
872 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
873 if (offset)
874 iov_iter_advance(iter, offset);
875 return 0;
876}
877
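/*
 * Worked example for the fixed-buffer math above (illustrative numbers only):
 * with a registered buffer of imu->ubuf == 0x10000 and imu->len == 64KB, an
 * sqe with addr == 0x12000 and len == 4KB passes both range checks, giving
 * offset == 0x2000. The iterator is then set up over the whole bvec for
 * offset + len == 0x3000 bytes and advanced by 0x2000, leaving exactly the
 * requested 4KB starting at 0x12000.
 */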
Jens Axboe2b188cc2019-01-07 10:46:33 -0700878static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
879 const struct sqe_submit *s, struct iovec **iovec,
880 struct iov_iter *iter)
881{
882 const struct io_uring_sqe *sqe = s->sqe;
883 void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
884 size_t sqe_len = READ_ONCE(sqe->len);
Jens Axboeedafcce2019-01-09 09:16:05 -0700885 u8 opcode;
886
887 /*
888 * We're reading ->opcode for the second time, but the first read
889 * doesn't care whether it's _FIXED or not, so it doesn't matter
890 * whether ->opcode changes concurrently. The first read does care
891 * about whether it is a READ or a WRITE, so we don't trust this read
892 * for that purpose and instead let the caller pass in the read/write
893 * flag.
894 */
895 opcode = READ_ONCE(sqe->opcode);
896 if (opcode == IORING_OP_READ_FIXED ||
897 opcode == IORING_OP_WRITE_FIXED) {
Jens Axboee0c5c572019-03-12 10:18:47 -0600898 int ret = io_import_fixed(ctx, rw, sqe, iter);
Jens Axboeedafcce2019-01-09 09:16:05 -0700899 *iovec = NULL;
900 return ret;
901 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700902
903 if (!s->has_user)
904 return -EFAULT;
905
906#ifdef CONFIG_COMPAT
907 if (ctx->compat)
908 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
909 iovec, iter);
910#endif
911
912 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
913}
914
Jens Axboe31b51512019-01-18 22:56:34 -0700915/*
916 * Make a note of the last file/offset/direction we punted to async
917 * context. We'll use this information to see if we can piggy back a
 918 * sequential request onto the previous one, if it still hasn't been
919 * completed by the async worker.
920 */
921static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
922{
923 struct async_list *async_list = &req->ctx->pending_async[rw];
924 struct kiocb *kiocb = &req->rw;
925 struct file *filp = kiocb->ki_filp;
926 off_t io_end = kiocb->ki_pos + len;
927
928 if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
929 unsigned long max_pages;
930
931 /* Use 8x RA size as a decent limiter for both reads/writes */
932 max_pages = filp->f_ra.ra_pages;
933 if (!max_pages)
934 max_pages = VM_MAX_READAHEAD >> (PAGE_SHIFT - 10);
935 max_pages *= 8;
936
937 /* If max pages are exceeded, reset the state */
938 len >>= PAGE_SHIFT;
939 if (async_list->io_pages + len <= max_pages) {
940 req->flags |= REQ_F_SEQ_PREV;
941 async_list->io_pages += len;
942 } else {
943 io_end = 0;
944 async_list->io_pages = 0;
945 }
946 }
947
948 /* New file? Reset state. */
949 if (async_list->file != filp) {
950 async_list->io_pages = 0;
951 async_list->file = filp;
952 }
953 async_list->io_end = io_end;
954}
955
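/*
 * Rough numbers for the limiter above (illustrative, assuming 4KB pages and
 * the default 128KB readahead window): max_pages ends up as 32 * 8 == 256
 * pages, so about 1MB of strictly sequential punted IO can piggy back on the
 * currently running async worker before io_end and io_pages are reset and the
 * next request is queued as fresh work.
 */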
Jens Axboee0c5c572019-03-12 10:18:47 -0600956static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
957 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700958{
959 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
960 struct kiocb *kiocb = &req->rw;
961 struct iov_iter iter;
962 struct file *file;
Jens Axboe31b51512019-01-18 22:56:34 -0700963 size_t iov_count;
Jens Axboee0c5c572019-03-12 10:18:47 -0600964 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700965
Jens Axboe6c271ce2019-01-10 11:22:30 -0700966 ret = io_prep_rw(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700967 if (ret)
968 return ret;
969 file = kiocb->ki_filp;
970
971 ret = -EBADF;
972 if (unlikely(!(file->f_mode & FMODE_READ)))
973 goto out_fput;
974 ret = -EINVAL;
975 if (unlikely(!file->f_op->read_iter))
976 goto out_fput;
977
978 ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
979 if (ret)
980 goto out_fput;
981
Jens Axboe31b51512019-01-18 22:56:34 -0700982 iov_count = iov_iter_count(&iter);
983 ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700984 if (!ret) {
985 ssize_t ret2;
986
987 /* Catch -EAGAIN return for forced non-blocking submission */
988 ret2 = call_read_iter(file, kiocb, &iter);
Jens Axboe31b51512019-01-18 22:56:34 -0700989 if (!force_nonblock || ret2 != -EAGAIN) {
Jens Axboe2b188cc2019-01-07 10:46:33 -0700990 io_rw_done(kiocb, ret2);
Jens Axboe31b51512019-01-18 22:56:34 -0700991 } else {
992 /*
993 * If ->needs_lock is true, we're already in async
994 * context.
995 */
996 if (!s->needs_lock)
997 io_async_list_note(READ, req, iov_count);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700998 ret = -EAGAIN;
Jens Axboe31b51512019-01-18 22:56:34 -0700999 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001000 }
1001 kfree(iovec);
1002out_fput:
1003 /* Hold on to the file for -EAGAIN */
1004 if (unlikely(ret && ret != -EAGAIN))
Jens Axboe6b063142019-01-10 22:13:58 -07001005 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001006 return ret;
1007}
1008
Jens Axboee0c5c572019-03-12 10:18:47 -06001009static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
1010 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001011{
1012 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1013 struct kiocb *kiocb = &req->rw;
1014 struct iov_iter iter;
1015 struct file *file;
Jens Axboe31b51512019-01-18 22:56:34 -07001016 size_t iov_count;
Jens Axboee0c5c572019-03-12 10:18:47 -06001017 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001018
Jens Axboe6c271ce2019-01-10 11:22:30 -07001019 ret = io_prep_rw(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001020 if (ret)
1021 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001022
1023 ret = -EBADF;
1024 file = kiocb->ki_filp;
1025 if (unlikely(!(file->f_mode & FMODE_WRITE)))
1026 goto out_fput;
1027 ret = -EINVAL;
1028 if (unlikely(!file->f_op->write_iter))
1029 goto out_fput;
1030
1031 ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
1032 if (ret)
1033 goto out_fput;
1034
Jens Axboe31b51512019-01-18 22:56:34 -07001035 iov_count = iov_iter_count(&iter);
1036
1037 ret = -EAGAIN;
1038 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
1039 /* If ->needs_lock is true, we're already in async context. */
1040 if (!s->needs_lock)
1041 io_async_list_note(WRITE, req, iov_count);
1042 goto out_free;
1043 }
1044
1045 ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001046 if (!ret) {
1047 /*
1048 * Open-code file_start_write here to grab freeze protection,
1049 * which will be released by another thread in
1050 * io_complete_rw(). Fool lockdep by telling it the lock got
1051 * released so that it doesn't complain about the held lock when
1052 * we return to userspace.
1053 */
1054 if (S_ISREG(file_inode(file)->i_mode)) {
1055 __sb_start_write(file_inode(file)->i_sb,
1056 SB_FREEZE_WRITE, true);
1057 __sb_writers_release(file_inode(file)->i_sb,
1058 SB_FREEZE_WRITE);
1059 }
1060 kiocb->ki_flags |= IOCB_WRITE;
1061 io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
1062 }
Jens Axboe31b51512019-01-18 22:56:34 -07001063out_free:
Jens Axboe2b188cc2019-01-07 10:46:33 -07001064 kfree(iovec);
1065out_fput:
Jens Axboe31b51512019-01-18 22:56:34 -07001066 /* Hold on to the file for -EAGAIN */
1067 if (unlikely(ret && ret != -EAGAIN))
Jens Axboe6b063142019-01-10 22:13:58 -07001068 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001069 return ret;
1070}
1071
1072/*
1073 * IORING_OP_NOP just posts a completion event, nothing else.
1074 */
1075static int io_nop(struct io_kiocb *req, u64 user_data)
1076{
1077 struct io_ring_ctx *ctx = req->ctx;
1078 long err = 0;
1079
Jens Axboedef596e2019-01-09 08:59:42 -07001080 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1081 return -EINVAL;
1082
Jens Axboe2b188cc2019-01-07 10:46:33 -07001083 /*
1084 * Twilight zone - it's possible that someone issued an opcode that
1085 * has a file attached, then got -EAGAIN on submission, and changed
1086 * the sqe before we retried it from async context. Avoid dropping
1087 * a file reference for this malicious case, and flag the error.
1088 */
1089 if (req->rw.ki_filp) {
1090 err = -EBADF;
Jens Axboe6b063142019-01-10 22:13:58 -07001091 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001092 }
1093 io_cqring_add_event(ctx, user_data, err, 0);
Jens Axboee65ef562019-03-12 10:16:44 -06001094 io_put_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001095 return 0;
1096}
1097
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001098static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1099{
Jens Axboe6b063142019-01-10 22:13:58 -07001100 struct io_ring_ctx *ctx = req->ctx;
1101 unsigned flags;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001102 int fd;
1103
Jens Axboed530a402019-03-13 12:15:01 -06001104 /* Prep already done (EAGAIN retry) */
1105 if (req->flags & REQ_F_PREPPED)
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001106 return 0;
1107
Jens Axboe6b063142019-01-10 22:13:58 -07001108 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboedef596e2019-01-09 08:59:42 -07001109 return -EINVAL;
Jens Axboeedafcce2019-01-09 09:16:05 -07001110 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001111 return -EINVAL;
1112
1113 fd = READ_ONCE(sqe->fd);
Jens Axboe6b063142019-01-10 22:13:58 -07001114 flags = READ_ONCE(sqe->flags);
1115
1116 if (flags & IOSQE_FIXED_FILE) {
1117 if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
1118 return -EBADF;
1119 req->rw.ki_filp = ctx->user_files[fd];
1120 req->flags |= REQ_F_FIXED_FILE;
1121 } else {
1122 req->rw.ki_filp = fget(fd);
1123 if (unlikely(!req->rw.ki_filp))
1124 return -EBADF;
1125 }
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001126
Jens Axboed530a402019-03-13 12:15:01 -06001127 req->flags |= REQ_F_PREPPED;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001128 return 0;
1129}
1130
1131static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1132 bool force_nonblock)
1133{
1134 loff_t sqe_off = READ_ONCE(sqe->off);
1135 loff_t sqe_len = READ_ONCE(sqe->len);
1136 loff_t end = sqe_off + sqe_len;
1137 unsigned fsync_flags;
1138 int ret;
1139
1140 fsync_flags = READ_ONCE(sqe->fsync_flags);
1141 if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1142 return -EINVAL;
1143
1144 ret = io_prep_fsync(req, sqe);
1145 if (ret)
1146 return ret;
1147
1148 /* fsync always requires a blocking context */
1149 if (force_nonblock)
1150 return -EAGAIN;
1151
1152 ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1153 end > 0 ? end : LLONG_MAX,
1154 fsync_flags & IORING_FSYNC_DATASYNC);
1155
Jens Axboe6b063142019-01-10 22:13:58 -07001156 io_fput(req);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001157 io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
Jens Axboee65ef562019-03-12 10:16:44 -06001158 io_put_req(req);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001159 return 0;
1160}
1161
Jens Axboe221c5eb2019-01-17 09:41:58 -07001162static void io_poll_remove_one(struct io_kiocb *req)
1163{
1164 struct io_poll_iocb *poll = &req->poll;
1165
1166 spin_lock(&poll->head->lock);
1167 WRITE_ONCE(poll->canceled, true);
1168 if (!list_empty(&poll->wait.entry)) {
1169 list_del_init(&poll->wait.entry);
1170 queue_work(req->ctx->sqo_wq, &req->work);
1171 }
1172 spin_unlock(&poll->head->lock);
1173
1174 list_del_init(&req->list);
1175}
1176
1177static void io_poll_remove_all(struct io_ring_ctx *ctx)
1178{
1179 struct io_kiocb *req;
1180
1181 spin_lock_irq(&ctx->completion_lock);
1182 while (!list_empty(&ctx->cancel_list)) {
1183 req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
1184 io_poll_remove_one(req);
1185 }
1186 spin_unlock_irq(&ctx->completion_lock);
1187}
1188
1189/*
1190 * Find a running poll command that matches one specified in sqe->addr,
1191 * and remove it if found.
1192 */
1193static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1194{
1195 struct io_ring_ctx *ctx = req->ctx;
1196 struct io_kiocb *poll_req, *next;
1197 int ret = -ENOENT;
1198
1199 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1200 return -EINVAL;
1201 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
1202 sqe->poll_events)
1203 return -EINVAL;
1204
1205 spin_lock_irq(&ctx->completion_lock);
1206 list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
1207 if (READ_ONCE(sqe->addr) == poll_req->user_data) {
1208 io_poll_remove_one(poll_req);
1209 ret = 0;
1210 break;
1211 }
1212 }
1213 spin_unlock_irq(&ctx->completion_lock);
1214
1215 io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
Jens Axboee65ef562019-03-12 10:16:44 -06001216 io_put_req(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07001217 return 0;
1218}
1219
1220static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
1221{
1222 io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
1223 io_fput(req);
Jens Axboee65ef562019-03-12 10:16:44 -06001224 io_put_req(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07001225}
1226
1227static void io_poll_complete_work(struct work_struct *work)
1228{
1229 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1230 struct io_poll_iocb *poll = &req->poll;
1231 struct poll_table_struct pt = { ._key = poll->events };
1232 struct io_ring_ctx *ctx = req->ctx;
1233 __poll_t mask = 0;
1234
1235 if (!READ_ONCE(poll->canceled))
1236 mask = vfs_poll(poll->file, &pt) & poll->events;
1237
1238 /*
1239 * Note that ->ki_cancel callers also delete iocb from active_reqs after
1240 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
1241 * synchronize with them. In the cancellation case the list_del_init
1242 * itself is not actually needed, but harmless so we keep it in to
1243 * avoid further branches in the fast path.
1244 */
1245 spin_lock_irq(&ctx->completion_lock);
1246 if (!mask && !READ_ONCE(poll->canceled)) {
1247 add_wait_queue(poll->head, &poll->wait);
1248 spin_unlock_irq(&ctx->completion_lock);
1249 return;
1250 }
1251 list_del_init(&req->list);
1252 spin_unlock_irq(&ctx->completion_lock);
1253
1254 io_poll_complete(req, mask);
1255}
1256
1257static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
1258 void *key)
1259{
1260 struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
1261 wait);
1262 struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
1263 struct io_ring_ctx *ctx = req->ctx;
1264 __poll_t mask = key_to_poll(key);
1265
1266 poll->woken = true;
1267
1268 /* for instances that support it check for an event match first: */
1269 if (mask) {
1270 unsigned long flags;
1271
1272 if (!(mask & poll->events))
1273 return 0;
1274
1275 /* try to complete the iocb inline if we can: */
1276 if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
1277 list_del(&req->list);
1278 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1279
1280 list_del_init(&poll->wait.entry);
1281 io_poll_complete(req, mask);
1282 return 1;
1283 }
1284 }
1285
1286 list_del_init(&poll->wait.entry);
1287 queue_work(ctx->sqo_wq, &req->work);
1288 return 1;
1289}
1290
1291struct io_poll_table {
1292 struct poll_table_struct pt;
1293 struct io_kiocb *req;
1294 int error;
1295};
1296
1297static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
1298 struct poll_table_struct *p)
1299{
1300 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
1301
1302 if (unlikely(pt->req->poll.head)) {
1303 pt->error = -EINVAL;
1304 return;
1305 }
1306
1307 pt->error = 0;
1308 pt->req->poll.head = head;
1309 add_wait_queue(head, &pt->req->poll.wait);
1310}
1311
1312static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1313{
1314 struct io_poll_iocb *poll = &req->poll;
1315 struct io_ring_ctx *ctx = req->ctx;
1316 struct io_poll_table ipt;
1317 unsigned flags;
1318 __poll_t mask;
1319 u16 events;
1320 int fd;
1321
1322 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1323 return -EINVAL;
1324 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
1325 return -EINVAL;
1326
1327 INIT_WORK(&req->work, io_poll_complete_work);
1328 events = READ_ONCE(sqe->poll_events);
1329 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
1330
1331 flags = READ_ONCE(sqe->flags);
1332 fd = READ_ONCE(sqe->fd);
1333
1334 if (flags & IOSQE_FIXED_FILE) {
1335 if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
1336 return -EBADF;
1337 poll->file = ctx->user_files[fd];
1338 req->flags |= REQ_F_FIXED_FILE;
1339 } else {
1340 poll->file = fget(fd);
1341 }
1342 if (unlikely(!poll->file))
1343 return -EBADF;
1344
1345 poll->head = NULL;
1346 poll->woken = false;
1347 poll->canceled = false;
1348
1349 ipt.pt._qproc = io_poll_queue_proc;
1350 ipt.pt._key = poll->events;
1351 ipt.req = req;
1352 ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
1353
 1354 /* initialize the list so that we can do list_empty checks */
1355 INIT_LIST_HEAD(&poll->wait.entry);
1356 init_waitqueue_func_entry(&poll->wait, io_poll_wake);
1357
Jens Axboe221c5eb2019-01-17 09:41:58 -07001358 mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
1359 if (unlikely(!poll->head)) {
1360 /* we did not manage to set up a waitqueue, done */
1361 goto out;
1362 }
1363
1364 spin_lock_irq(&ctx->completion_lock);
1365 spin_lock(&poll->head->lock);
1366 if (poll->woken) {
1367 /* wake_up context handles the rest */
1368 mask = 0;
1369 ipt.error = 0;
1370 } else if (mask || ipt.error) {
1371 /* if we get an error or a mask we are done */
1372 WARN_ON_ONCE(list_empty(&poll->wait.entry));
1373 list_del_init(&poll->wait.entry);
1374 } else {
1375 /* actually waiting for an event */
1376 list_add_tail(&req->list, &ctx->cancel_list);
1377 }
1378 spin_unlock(&poll->head->lock);
1379 spin_unlock_irq(&ctx->completion_lock);
1380
1381out:
1382 if (unlikely(ipt.error)) {
1383 if (!(flags & IOSQE_FIXED_FILE))
1384 fput(poll->file);
1385 /*
1386 * Drop one of our refs to this req, __io_submit_sqe() will
1387 * drop the other one since we're returning an error.
1388 */
Jens Axboee65ef562019-03-12 10:16:44 -06001389 io_put_req(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07001390 return ipt.error;
1391 }
1392
1393 if (mask)
1394 io_poll_complete(req, mask);
Jens Axboe221c5eb2019-01-17 09:41:58 -07001395 return 0;
1396}
1397
Jens Axboe2b188cc2019-01-07 10:46:33 -07001398static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
Jens Axboe9a56a232019-01-09 09:06:50 -07001399 const struct sqe_submit *s, bool force_nonblock,
1400 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001401{
Jens Axboee0c5c572019-03-12 10:18:47 -06001402 int ret, opcode;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001403
1404 if (unlikely(s->index >= ctx->sq_entries))
1405 return -EINVAL;
1406 req->user_data = READ_ONCE(s->sqe->user_data);
1407
1408 opcode = READ_ONCE(s->sqe->opcode);
1409 switch (opcode) {
1410 case IORING_OP_NOP:
1411 ret = io_nop(req, req->user_data);
1412 break;
1413 case IORING_OP_READV:
Jens Axboeedafcce2019-01-09 09:16:05 -07001414 if (unlikely(s->sqe->buf_index))
1415 return -EINVAL;
Jens Axboe9a56a232019-01-09 09:06:50 -07001416 ret = io_read(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001417 break;
1418 case IORING_OP_WRITEV:
Jens Axboeedafcce2019-01-09 09:16:05 -07001419 if (unlikely(s->sqe->buf_index))
1420 return -EINVAL;
1421 ret = io_write(req, s, force_nonblock, state);
1422 break;
1423 case IORING_OP_READ_FIXED:
1424 ret = io_read(req, s, force_nonblock, state);
1425 break;
1426 case IORING_OP_WRITE_FIXED:
Jens Axboe9a56a232019-01-09 09:06:50 -07001427 ret = io_write(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001428 break;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001429 case IORING_OP_FSYNC:
1430 ret = io_fsync(req, s->sqe, force_nonblock);
1431 break;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001432 case IORING_OP_POLL_ADD:
1433 ret = io_poll_add(req, s->sqe);
1434 break;
1435 case IORING_OP_POLL_REMOVE:
1436 ret = io_poll_remove(req, s->sqe);
1437 break;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001438 default:
1439 ret = -EINVAL;
1440 break;
1441 }
1442
Jens Axboedef596e2019-01-09 08:59:42 -07001443 if (ret)
1444 return ret;
1445
1446 if (ctx->flags & IORING_SETUP_IOPOLL) {
1447 if (req->error == -EAGAIN)
1448 return -EAGAIN;
1449
1450 /* workqueue context doesn't hold uring_lock, grab it now */
1451 if (s->needs_lock)
1452 mutex_lock(&ctx->uring_lock);
1453 io_iopoll_req_issued(req);
1454 if (s->needs_lock)
1455 mutex_unlock(&ctx->uring_lock);
1456 }
1457
1458 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001459}
1460
Jens Axboe31b51512019-01-18 22:56:34 -07001461static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
1462 const struct io_uring_sqe *sqe)
1463{
1464 switch (sqe->opcode) {
1465 case IORING_OP_READV:
1466 case IORING_OP_READ_FIXED:
1467 return &ctx->pending_async[READ];
1468 case IORING_OP_WRITEV:
1469 case IORING_OP_WRITE_FIXED:
1470 return &ctx->pending_async[WRITE];
1471 default:
1472 return NULL;
1473 }
1474}
1475
Jens Axboeedafcce2019-01-09 09:16:05 -07001476static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
1477{
1478 u8 opcode = READ_ONCE(sqe->opcode);
1479
1480 return !(opcode == IORING_OP_READ_FIXED ||
1481 opcode == IORING_OP_WRITE_FIXED);
1482}
1483
Jens Axboe2b188cc2019-01-07 10:46:33 -07001484static void io_sq_wq_submit_work(struct work_struct *work)
1485{
1486 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001487 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe31b51512019-01-18 22:56:34 -07001488 struct mm_struct *cur_mm = NULL;
1489 struct async_list *async_list;
1490 LIST_HEAD(req_list);
Jens Axboeedafcce2019-01-09 09:16:05 -07001491 mm_segment_t old_fs;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001492 int ret;
1493
Jens Axboe31b51512019-01-18 22:56:34 -07001494 async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
1495restart:
1496 do {
1497 struct sqe_submit *s = &req->submit;
1498 const struct io_uring_sqe *sqe = s->sqe;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001499
Jens Axboe31b51512019-01-18 22:56:34 -07001500 /* Ensure we clear previously set forced non-block flag */
1501 req->flags &= ~REQ_F_FORCE_NONBLOCK;
1502 req->rw.ki_flags &= ~IOCB_NOWAIT;
1503
1504 ret = 0;
1505 if (io_sqe_needs_user(sqe) && !cur_mm) {
1506 if (!mmget_not_zero(ctx->sqo_mm)) {
1507 ret = -EFAULT;
1508 } else {
1509 cur_mm = ctx->sqo_mm;
1510 use_mm(cur_mm);
1511 old_fs = get_fs();
1512 set_fs(USER_DS);
1513 }
1514 }
1515
1516 if (!ret) {
1517 s->has_user = cur_mm != NULL;
1518 s->needs_lock = true;
1519 do {
1520 ret = __io_submit_sqe(ctx, req, s, false, NULL);
1521 /*
1522 * We can get EAGAIN for polled IO even though
1523 * we're forcing a sync submission from here,
1524 * since we can't wait for request slots on the
1525 * block side.
1526 */
1527 if (ret != -EAGAIN)
1528 break;
1529 cond_resched();
1530 } while (1);
Jens Axboee65ef562019-03-12 10:16:44 -06001531
1532 /* drop submission reference */
1533 io_put_req(req);
Jens Axboe31b51512019-01-18 22:56:34 -07001534 }
1535 if (ret) {
1536 io_cqring_add_event(ctx, sqe->user_data, ret, 0);
Jens Axboee65ef562019-03-12 10:16:44 -06001537 io_put_req(req);
Jens Axboe31b51512019-01-18 22:56:34 -07001538 }
1539
 1540 /* async context always uses a copy of the sqe */
1541 kfree(sqe);
1542
1543 if (!async_list)
1544 break;
1545 if (!list_empty(&req_list)) {
1546 req = list_first_entry(&req_list, struct io_kiocb,
1547 list);
1548 list_del(&req->list);
1549 continue;
1550 }
1551 if (list_empty(&async_list->list))
1552 break;
1553
1554 req = NULL;
1555 spin_lock(&async_list->lock);
1556 if (list_empty(&async_list->list)) {
1557 spin_unlock(&async_list->lock);
1558 break;
1559 }
1560 list_splice_init(&async_list->list, &req_list);
1561 spin_unlock(&async_list->lock);
1562
1563 req = list_first_entry(&req_list, struct io_kiocb, list);
1564 list_del(&req->list);
1565 } while (req);
Jens Axboeedafcce2019-01-09 09:16:05 -07001566
1567 /*
Jens Axboe31b51512019-01-18 22:56:34 -07001568 * Rare case of racing with a submitter. If we find the count has
1569 * dropped to zero AND we have pending work items, then restart
1570 * the processing. This is a tiny race window.
Jens Axboeedafcce2019-01-09 09:16:05 -07001571 */
Jens Axboe31b51512019-01-18 22:56:34 -07001572 if (async_list) {
1573 ret = atomic_dec_return(&async_list->cnt);
1574 while (!ret && !list_empty(&async_list->list)) {
1575 spin_lock(&async_list->lock);
1576 atomic_inc(&async_list->cnt);
1577 list_splice_init(&async_list->list, &req_list);
1578 spin_unlock(&async_list->lock);
1579
1580 if (!list_empty(&req_list)) {
1581 req = list_first_entry(&req_list,
1582 struct io_kiocb, list);
1583 list_del(&req->list);
1584 goto restart;
1585 }
1586 ret = atomic_dec_return(&async_list->cnt);
Jens Axboeedafcce2019-01-09 09:16:05 -07001587 }
Jens Axboeedafcce2019-01-09 09:16:05 -07001588 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001589
Jens Axboe31b51512019-01-18 22:56:34 -07001590 if (cur_mm) {
Jens Axboeedafcce2019-01-09 09:16:05 -07001591 set_fs(old_fs);
Jens Axboe31b51512019-01-18 22:56:34 -07001592 unuse_mm(cur_mm);
1593 mmput(cur_mm);
Jens Axboeedafcce2019-01-09 09:16:05 -07001594 }
Jens Axboe31b51512019-01-18 22:56:34 -07001595}
Jens Axboe2b188cc2019-01-07 10:46:33 -07001596
Jens Axboe31b51512019-01-18 22:56:34 -07001597/*
 1598 * See if we can piggy back onto previously submitted work that is still
1599 * running. We currently only allow this if the new request is sequential
1600 * to the previous one we punted.
1601 */
1602static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
1603{
1604 bool ret = false;
1605
1606 if (!list)
1607 return false;
1608 if (!(req->flags & REQ_F_SEQ_PREV))
1609 return false;
1610 if (!atomic_read(&list->cnt))
1611 return false;
1612
1613 ret = true;
1614 spin_lock(&list->lock);
1615 list_add_tail(&req->list, &list->list);
1616 if (!atomic_read(&list->cnt)) {
1617 list_del_init(&req->list);
1618 ret = false;
1619 }
1620 spin_unlock(&list->lock);
1621 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001622}
1623
Jens Axboe9a56a232019-01-09 09:06:50 -07001624static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
1625 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001626{
1627 struct io_kiocb *req;
Jens Axboee0c5c572019-03-12 10:18:47 -06001628 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001629
1630 /* enforce forwards compatibility on users */
Jens Axboe6b063142019-01-10 22:13:58 -07001631 if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
Jens Axboe2b188cc2019-01-07 10:46:33 -07001632 return -EINVAL;
1633
Jens Axboe2579f912019-01-09 09:10:43 -07001634 req = io_get_req(ctx, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001635 if (unlikely(!req))
1636 return -EAGAIN;
1637
Jens Axboe9a56a232019-01-09 09:06:50 -07001638 ret = __io_submit_sqe(ctx, req, s, true, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001639 if (ret == -EAGAIN) {
1640 struct io_uring_sqe *sqe_copy;
1641
1642 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
1643 if (sqe_copy) {
Jens Axboe31b51512019-01-18 22:56:34 -07001644 struct async_list *list;
1645
Jens Axboe2b188cc2019-01-07 10:46:33 -07001646 memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
1647 s->sqe = sqe_copy;
1648
1649 memcpy(&req->submit, s, sizeof(*s));
Jens Axboe31b51512019-01-18 22:56:34 -07001650 list = io_async_list_from_sqe(ctx, s->sqe);
1651 if (!io_add_to_prev_work(list, req)) {
1652 if (list)
1653 atomic_inc(&list->cnt);
1654 INIT_WORK(&req->work, io_sq_wq_submit_work);
1655 queue_work(ctx->sqo_wq, &req->work);
1656 }
Jens Axboee65ef562019-03-12 10:16:44 -06001657
1658 /*
1659 * Queued up for async execution, worker will release
1660 * submit reference when the iocb is actually
1661 * submitted.
1662 */
1663 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001664 }
1665 }
Jens Axboee65ef562019-03-12 10:16:44 -06001666
1667 /* drop submission reference */
1668 io_put_req(req);
1669
1670 /* and drop final reference, if we failed */
Jens Axboe2b188cc2019-01-07 10:46:33 -07001671 if (ret)
Jens Axboee65ef562019-03-12 10:16:44 -06001672 io_put_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001673
1674 return ret;
1675}
1676
Jens Axboe9a56a232019-01-09 09:06:50 -07001677/*
1678 * Batched submission is done, ensure local IO is flushed out.
1679 */
1680static void io_submit_state_end(struct io_submit_state *state)
1681{
1682 blk_finish_plug(&state->plug);
1683 io_file_put(state, NULL);
Jens Axboe2579f912019-01-09 09:10:43 -07001684 if (state->free_reqs)
1685 kmem_cache_free_bulk(req_cachep, state->free_reqs,
1686 &state->reqs[state->cur_req]);
Jens Axboe9a56a232019-01-09 09:06:50 -07001687}
1688
1689/*
1690 * Start submission side cache.
1691 */
1692static void io_submit_state_start(struct io_submit_state *state,
1693 struct io_ring_ctx *ctx, unsigned max_ios)
1694{
1695 blk_start_plug(&state->plug);
Jens Axboe2579f912019-01-09 09:10:43 -07001696 state->free_reqs = 0;
Jens Axboe9a56a232019-01-09 09:06:50 -07001697 state->file = NULL;
1698 state->ios_left = max_ios;
1699}
1700
Jens Axboe2b188cc2019-01-07 10:46:33 -07001701static void io_commit_sqring(struct io_ring_ctx *ctx)
1702{
1703 struct io_sq_ring *ring = ctx->sq_ring;
1704
1705 if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
1706 /*
1707 * Ensure any loads from the SQEs are done at this point,
1708 * since once we write the new head, the application could
1709 * write new data to them.
1710 */
1711 smp_store_release(&ring->r.head, ctx->cached_sq_head);
1712
1713 /*
1714 * write side barrier of head update, app has read side. See
1715 * comment at the top of this file
1716 */
1717 smp_wmb();
1718 }
1719}
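/*
 * Example (userspace-side sketch, not kernel code): the application read side
 * that pairs with the smp_store_release() above. It assumes the SQ ring head
 * and tail words were obtained by mmap()ing the ring fd (see io_uring_mmap()
 * further down) and that the GCC __atomic builtins stand in for the kernel
 * barriers; the helper name is illustrative, not part of the uapi.
 */
static inline unsigned example_sq_space_left(const unsigned *khead,
					     unsigned local_tail,
					     unsigned ring_entries)
{
	/* acquire-load of the head published by io_commit_sqring() */
	unsigned head = __atomic_load_n(khead, __ATOMIC_ACQUIRE);

	/* local_tail - head entries are still owned by the kernel */
	return ring_entries - (local_tail - head);
}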
1720
1721/*
1722 * Undo last io_get_sqring()
1723 */
1724static void io_drop_sqring(struct io_ring_ctx *ctx)
1725{
1726 ctx->cached_sq_head--;
1727}
1728
1729/*
1730 * Fetch an sqe, if one is available. Note that s->sqe will point to memory
1731 * that is mapped by userspace. This means that care needs to be taken to
1732 * ensure that reads are stable, as we cannot rely on userspace always
1733 * being a good citizen. If members of the sqe are validated and then later
1734 * used, it's important that those reads are done through READ_ONCE() to
1735 * prevent a re-load down the line.
1736 */
1737static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
1738{
1739 struct io_sq_ring *ring = ctx->sq_ring;
1740 unsigned head;
1741
1742 /*
1743 * The cached sq head (or cq tail) serves two purposes:
1744 *
1745 * 1) allows us to batch the cost of updating the user visible
1746 * head.
1747 * 2) allows the kernel side to track the head on its own, even
1748 * though the application is the one updating it.
1749 */
1750 head = ctx->cached_sq_head;
1751 /* See comment at the top of this file */
1752 smp_rmb();
1753 if (head == READ_ONCE(ring->r.tail))
1754 return false;
1755
1756 head = READ_ONCE(ring->array[head & ctx->sq_mask]);
1757 if (head < ctx->sq_entries) {
1758 s->index = head;
1759 s->sqe = &ctx->sq_sqes[head];
1760 ctx->cached_sq_head++;
1761 return true;
1762 }
1763
1764 /* drop invalid entries */
1765 ctx->cached_sq_head++;
1766 ring->dropped++;
1767 /* See comment at the top of this file */
1768 smp_wmb();
1769 return false;
1770}
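/*
 * Example (userspace-side sketch, not kernel code): the producer that
 * io_get_sqring() consumes. The sqes/array/ktail pointers are assumed to come
 * from mmap()ing the ring fd with the offsets published in struct
 * io_uring_params, and the __atomic builtins stand in for the smp_wmb()/
 * smp_rmb() pairing described at the top of this file. Helper name and lack
 * of error handling are illustrative only.
 */
#include <string.h>
#include <sys/uio.h>
#include <linux/io_uring.h>

static void example_queue_readv(struct io_uring_sqe *sqes, unsigned *array,
				unsigned *ktail, unsigned ring_mask, int fd,
				const struct iovec *iov, unsigned nr_iov,
				unsigned long long file_off,
				unsigned long long user_data)
{
	unsigned tail = *ktail;			/* only the app writes the tail */
	unsigned index = tail & ring_mask;
	struct io_uring_sqe *sqe = &sqes[index];

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READV;
	sqe->fd = fd;
	sqe->addr = (unsigned long) iov;
	sqe->len = nr_iov;
	sqe->off = file_off;
	sqe->user_data = user_data;

	array[index] = index;
	/* publish the SQE and array slot before the kernel sees the new tail */
	__atomic_store_n(ktail, tail + 1, __ATOMIC_RELEASE);
}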
1771
Jens Axboe6c271ce2019-01-10 11:22:30 -07001772static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
1773 unsigned int nr, bool has_user, bool mm_fault)
1774{
1775 struct io_submit_state state, *statep = NULL;
1776 int ret, i, submitted = 0;
1777
1778 if (nr > IO_PLUG_THRESHOLD) {
1779 io_submit_state_start(&state, ctx, nr);
1780 statep = &state;
1781 }
1782
1783 for (i = 0; i < nr; i++) {
1784 if (unlikely(mm_fault)) {
1785 ret = -EFAULT;
1786 } else {
1787 sqes[i].has_user = has_user;
1788 sqes[i].needs_lock = true;
1789 sqes[i].needs_fixed_file = true;
1790 ret = io_submit_sqe(ctx, &sqes[i], statep);
1791 }
1792 if (!ret) {
1793 submitted++;
1794 continue;
1795 }
1796
1797 io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
1798 }
1799
1800 if (statep)
1801 io_submit_state_end(&state);
1802
1803 return submitted;
1804}
1805
1806static int io_sq_thread(void *data)
1807{
1808 struct sqe_submit sqes[IO_IOPOLL_BATCH];
1809 struct io_ring_ctx *ctx = data;
1810 struct mm_struct *cur_mm = NULL;
1811 mm_segment_t old_fs;
1812 DEFINE_WAIT(wait);
1813 unsigned inflight;
1814 unsigned long timeout;
1815
1816 old_fs = get_fs();
1817 set_fs(USER_DS);
1818
1819 timeout = inflight = 0;
1820 while (!kthread_should_stop() && !ctx->sqo_stop) {
1821 bool all_fixed, mm_fault = false;
1822 int i;
1823
1824 if (inflight) {
1825 unsigned nr_events = 0;
1826
1827 if (ctx->flags & IORING_SETUP_IOPOLL) {
1828 /*
1829 * We disallow the app entering submit/complete
1830 * with polling, but we still need to lock the
1831 * ring to prevent racing with polled issue
1832 * that got punted to a workqueue.
1833 */
1834 mutex_lock(&ctx->uring_lock);
1835 io_iopoll_check(ctx, &nr_events, 0);
1836 mutex_unlock(&ctx->uring_lock);
1837 } else {
1838 /*
1839 * Normal IO, just pretend everything completed.
1840 * We don't have to poll completions for that.
1841 */
1842 nr_events = inflight;
1843 }
1844
1845 inflight -= nr_events;
1846 if (!inflight)
1847 timeout = jiffies + ctx->sq_thread_idle;
1848 }
1849
1850 if (!io_get_sqring(ctx, &sqes[0])) {
1851 /*
1852 * We're polling. If we're within the defined idle
1853 * period, then let us spin without work before going
1854 * to sleep.
1855 */
1856 if (inflight || !time_after(jiffies, timeout)) {
1857 cpu_relax();
1858 continue;
1859 }
1860
1861 /*
1862 * Drop cur_mm before scheduling, we can't hold it for
1863 * long periods (or over schedule()). Do this before
1864 * adding ourselves to the waitqueue, as the unuse/drop
1865 * may sleep.
1866 */
1867 if (cur_mm) {
1868 unuse_mm(cur_mm);
1869 mmput(cur_mm);
1870 cur_mm = NULL;
1871 }
1872
1873 prepare_to_wait(&ctx->sqo_wait, &wait,
1874 TASK_INTERRUPTIBLE);
1875
1876 /* Tell userspace we may need a wakeup call */
1877 ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
1878 smp_wmb();
1879
1880 if (!io_get_sqring(ctx, &sqes[0])) {
1881 if (kthread_should_stop()) {
1882 finish_wait(&ctx->sqo_wait, &wait);
1883 break;
1884 }
1885 if (signal_pending(current))
1886 flush_signals(current);
1887 schedule();
1888 finish_wait(&ctx->sqo_wait, &wait);
1889
1890 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1891 smp_wmb();
1892 continue;
1893 }
1894 finish_wait(&ctx->sqo_wait, &wait);
1895
1896 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1897 smp_wmb();
1898 }
1899
1900 i = 0;
1901 all_fixed = true;
1902 do {
1903 if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
1904 all_fixed = false;
1905
1906 i++;
1907 if (i == ARRAY_SIZE(sqes))
1908 break;
1909 } while (io_get_sqring(ctx, &sqes[i]));
1910
1911 /* Unless all new commands are FIXED regions, grab mm */
1912 if (!all_fixed && !cur_mm) {
1913 mm_fault = !mmget_not_zero(ctx->sqo_mm);
1914 if (!mm_fault) {
1915 use_mm(ctx->sqo_mm);
1916 cur_mm = ctx->sqo_mm;
1917 }
1918 }
1919
1920 inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
1921 mm_fault);
1922
1923 /* Commit SQ ring head once we've consumed all SQEs */
1924 io_commit_sqring(ctx);
1925 }
1926
1927 set_fs(old_fs);
1928 if (cur_mm) {
1929 unuse_mm(cur_mm);
1930 mmput(cur_mm);
1931 }
1932 return 0;
1933}
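/*
 * Example (userspace-side sketch, not kernel code): the application half of
 * the IORING_SQ_NEED_WAKEUP handshake above when IORING_SETUP_SQPOLL is in
 * use. The flags pointer is assumed to be the mapped sq_ring->flags word and
 * the syscall number is the x86-64 assignment for io_uring_enter; both are
 * assumptions of this sketch.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

#ifndef __NR_io_uring_enter
#define __NR_io_uring_enter	426
#endif

static void example_sqpoll_kick(int ring_fd, const unsigned *kflags)
{
	/*
	 * With SQPOLL the kernel thread above reaps the SQ ring on its own;
	 * only enter the kernel if it advertised that it went to sleep.
	 */
	if (__atomic_load_n(kflags, __ATOMIC_ACQUIRE) & IORING_SQ_NEED_WAKEUP)
		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			IORING_ENTER_SQ_WAKEUP, NULL, 0);
}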
1934
Jens Axboe2b188cc2019-01-07 10:46:33 -07001935static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
1936{
Jens Axboe9a56a232019-01-09 09:06:50 -07001937 struct io_submit_state state, *statep = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001938 int i, ret = 0, submit = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001939
Jens Axboe9a56a232019-01-09 09:06:50 -07001940 if (to_submit > IO_PLUG_THRESHOLD) {
1941 io_submit_state_start(&state, ctx, to_submit);
1942 statep = &state;
1943 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001944
1945 for (i = 0; i < to_submit; i++) {
1946 struct sqe_submit s;
1947
1948 if (!io_get_sqring(ctx, &s))
1949 break;
1950
1951 s.has_user = true;
Jens Axboedef596e2019-01-09 08:59:42 -07001952 s.needs_lock = false;
Jens Axboe6c271ce2019-01-10 11:22:30 -07001953 s.needs_fixed_file = false;
Jens Axboedef596e2019-01-09 08:59:42 -07001954
Jens Axboe9a56a232019-01-09 09:06:50 -07001955 ret = io_submit_sqe(ctx, &s, statep);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001956 if (ret) {
1957 io_drop_sqring(ctx);
1958 break;
1959 }
1960
1961 submit++;
1962 }
1963 io_commit_sqring(ctx);
1964
Jens Axboe9a56a232019-01-09 09:06:50 -07001965 if (statep)
1966 io_submit_state_end(statep);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001967
1968 return submit ? submit : ret;
1969}
1970
1971static unsigned io_cqring_events(struct io_cq_ring *ring)
1972{
1973 return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
1974}
1975
1976/*
1977 * Wait until events become available, if we don't already have some. The
1978 * application must reap them itself, as they reside on the shared cq ring.
1979 */
1980static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
1981 const sigset_t __user *sig, size_t sigsz)
1982{
1983 struct io_cq_ring *ring = ctx->cq_ring;
1984 sigset_t ksigmask, sigsaved;
1985 DEFINE_WAIT(wait);
1986 int ret;
1987
1988 /* See comment at the top of this file */
1989 smp_rmb();
1990 if (io_cqring_events(ring) >= min_events)
1991 return 0;
1992
1993 if (sig) {
1994 ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
1995 if (ret)
1996 return ret;
1997 }
1998
1999 do {
2000 prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
2001
2002 ret = 0;
2003 /* See comment at the top of this file */
2004 smp_rmb();
2005 if (io_cqring_events(ring) >= min_events)
2006 break;
2007
2008 schedule();
2009
2010 ret = -EINTR;
2011 if (signal_pending(current))
2012 break;
2013 } while (1);
2014
2015 finish_wait(&ctx->wait, &wait);
2016
2017 if (sig)
2018 restore_user_sigmask(sig, &sigsaved);
2019
2020 return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
2021}
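/*
 * Example (userspace-side sketch, not kernel code): the "reap them itself"
 * loop the comment above refers to. khead/ktail/cqes/ring_mask are assumed to
 * come from the mmap()ed CQ ring, and the __atomic builtins provide the
 * ordering described at the top of this file.
 */
#include <linux/io_uring.h>

static unsigned example_reap_cqes(unsigned *khead, const unsigned *ktail,
				  const struct io_uring_cqe *cqes,
				  unsigned ring_mask,
				  void (*handle)(const struct io_uring_cqe *))
{
	unsigned head = *khead;		/* only the app writes the CQ head */
	unsigned reaped = 0;

	while (head != __atomic_load_n(ktail, __ATOMIC_ACQUIRE)) {
		handle(&cqes[head & ring_mask]);
		head++;
		reaped++;
	}

	/* let the kernel reuse the consumed CQ ring slots */
	__atomic_store_n(khead, head, __ATOMIC_RELEASE);
	return reaped;
}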
2022
Jens Axboe6b063142019-01-10 22:13:58 -07002023static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
2024{
2025#if defined(CONFIG_UNIX)
2026 if (ctx->ring_sock) {
2027 struct sock *sock = ctx->ring_sock->sk;
2028 struct sk_buff *skb;
2029
2030 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
2031 kfree_skb(skb);
2032 }
2033#else
2034 int i;
2035
2036 for (i = 0; i < ctx->nr_user_files; i++)
2037 fput(ctx->user_files[i]);
2038#endif
2039}
2040
2041static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
2042{
2043 if (!ctx->user_files)
2044 return -ENXIO;
2045
2046 __io_sqe_files_unregister(ctx);
2047 kfree(ctx->user_files);
2048 ctx->user_files = NULL;
2049 ctx->nr_user_files = 0;
2050 return 0;
2051}
2052
Jens Axboe6c271ce2019-01-10 11:22:30 -07002053static void io_sq_thread_stop(struct io_ring_ctx *ctx)
2054{
2055 if (ctx->sqo_thread) {
2056 ctx->sqo_stop = 1;
2057 mb();
2058 kthread_stop(ctx->sqo_thread);
2059 ctx->sqo_thread = NULL;
2060 }
2061}
2062
Jens Axboe6b063142019-01-10 22:13:58 -07002063static void io_finish_async(struct io_ring_ctx *ctx)
2064{
Jens Axboe6c271ce2019-01-10 11:22:30 -07002065 io_sq_thread_stop(ctx);
2066
Jens Axboe6b063142019-01-10 22:13:58 -07002067 if (ctx->sqo_wq) {
2068 destroy_workqueue(ctx->sqo_wq);
2069 ctx->sqo_wq = NULL;
2070 }
2071}
2072
2073#if defined(CONFIG_UNIX)
2074static void io_destruct_skb(struct sk_buff *skb)
2075{
2076 struct io_ring_ctx *ctx = skb->sk->sk_user_data;
2077
2078 io_finish_async(ctx);
2079 unix_destruct_scm(skb);
2080}
2081
2082/*
2083 * Ensure the UNIX gc is aware of our file set, so we are certain that
2084 * the io_uring can be safely unregistered on process exit, even if we have
2085 * loops in the file referencing.
2086 */
2087static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
2088{
2089 struct sock *sk = ctx->ring_sock->sk;
2090 struct scm_fp_list *fpl;
2091 struct sk_buff *skb;
2092 int i;
2093
2094 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
2095 unsigned long inflight = ctx->user->unix_inflight + nr;
2096
2097 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
2098 return -EMFILE;
2099 }
2100
2101 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
2102 if (!fpl)
2103 return -ENOMEM;
2104
2105 skb = alloc_skb(0, GFP_KERNEL);
2106 if (!skb) {
2107 kfree(fpl);
2108 return -ENOMEM;
2109 }
2110
2111 skb->sk = sk;
2112 skb->destructor = io_destruct_skb;
2113
2114 fpl->user = get_uid(ctx->user);
2115 for (i = 0; i < nr; i++) {
2116 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
2117 unix_inflight(fpl->user, fpl->fp[i]);
2118 }
2119
2120 fpl->max = fpl->count = nr;
2121 UNIXCB(skb).fp = fpl;
2122 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2123 skb_queue_head(&sk->sk_receive_queue, skb);
2124
2125 for (i = 0; i < nr; i++)
2126 fput(fpl->fp[i]);
2127
2128 return 0;
2129}
2130
2131/*
2132 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
2133 * causes regular reference counting to break down. We rely on the UNIX
2134 * garbage collection to take care of this problem for us.
2135 */
2136static int io_sqe_files_scm(struct io_ring_ctx *ctx)
2137{
2138 unsigned left, total;
2139 int ret = 0;
2140
2141 total = 0;
2142 left = ctx->nr_user_files;
2143 while (left) {
2144 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
2145 int ret;
2146
2147 ret = __io_sqe_files_scm(ctx, this_files, total);
2148 if (ret)
2149 break;
2150 left -= this_files;
2151 total += this_files;
2152 }
2153
2154 if (!ret)
2155 return 0;
2156
2157 while (total < ctx->nr_user_files) {
2158 fput(ctx->user_files[total]);
2159 total++;
2160 }
2161
2162 return ret;
2163}
2164#else
2165static int io_sqe_files_scm(struct io_ring_ctx *ctx)
2166{
2167 return 0;
2168}
2169#endif
2170
2171static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
2172 unsigned nr_args)
2173{
2174 __s32 __user *fds = (__s32 __user *) arg;
2175 int fd, ret = 0;
2176 unsigned i;
2177
2178 if (ctx->user_files)
2179 return -EBUSY;
2180 if (!nr_args)
2181 return -EINVAL;
2182 if (nr_args > IORING_MAX_FIXED_FILES)
2183 return -EMFILE;
2184
2185 ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
2186 if (!ctx->user_files)
2187 return -ENOMEM;
2188
2189 for (i = 0; i < nr_args; i++) {
2190 ret = -EFAULT;
2191 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
2192 break;
2193
2194 ctx->user_files[i] = fget(fd);
2195
2196 ret = -EBADF;
2197 if (!ctx->user_files[i])
2198 break;
2199 /*
2200 * Don't allow io_uring instances to be registered. If UNIX
2201 * isn't enabled, then this causes a reference cycle and this
2202 * instance can never get freed. If UNIX is enabled we'll
2203 * handle it just fine, but there's still no point in allowing
2204 * a ring fd as it doesn't support regular read/write anyway.
2205 */
2206 if (ctx->user_files[i]->f_op == &io_uring_fops) {
2207 fput(ctx->user_files[i]);
2208 break;
2209 }
2210 ctx->nr_user_files++;
2211 ret = 0;
2212 }
2213
2214 if (ret) {
2215 for (i = 0; i < ctx->nr_user_files; i++)
2216 fput(ctx->user_files[i]);
2217
2218 kfree(ctx->user_files);
2219 ctx->nr_user_files = 0;
2220 return ret;
2221 }
2222
2223 ret = io_sqe_files_scm(ctx);
2224 if (ret)
2225 io_sqe_files_unregister(ctx);
2226
2227 return ret;
2228}
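/*
 * Example (userspace-side sketch, not kernel code): registering a fixed file
 * set and then referring to an entry by index. The syscall number is the
 * x86-64 assignment for io_uring_register; helper names are illustrative.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

#ifndef __NR_io_uring_register
#define __NR_io_uring_register	427
#endif

static int example_register_files(int ring_fd, const int *fds, unsigned nr)
{
	/* ends up in io_sqe_files_register() above */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILES, fds, nr);
}

static void example_use_fixed_file(struct io_uring_sqe *sqe, int file_index)
{
	/*
	 * With IOSQE_FIXED_FILE set, sqe->fd is an index into the registered
	 * file table rather than a process file descriptor.
	 */
	sqe->flags |= IOSQE_FIXED_FILE;
	sqe->fd = file_index;
}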
2229
Jens Axboe6c271ce2019-01-10 11:22:30 -07002230static int io_sq_offload_start(struct io_ring_ctx *ctx,
2231 struct io_uring_params *p)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002232{
2233 int ret;
2234
Jens Axboe6c271ce2019-01-10 11:22:30 -07002235 init_waitqueue_head(&ctx->sqo_wait);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002236 mmgrab(current->mm);
2237 ctx->sqo_mm = current->mm;
2238
Jens Axboe6c271ce2019-01-10 11:22:30 -07002239 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
2240 if (!ctx->sq_thread_idle)
2241 ctx->sq_thread_idle = HZ;
2242
2243 ret = -EINVAL;
2244 if (!cpu_possible(p->sq_thread_cpu))
2245 goto err;
2246
2247 if (ctx->flags & IORING_SETUP_SQPOLL) {
2248 if (p->flags & IORING_SETUP_SQ_AFF) {
2249 int cpu;
2250
2251 cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
2252 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
2253 ctx, cpu,
2254 "io_uring-sq");
2255 } else {
2256 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
2257 "io_uring-sq");
2258 }
2259 if (IS_ERR(ctx->sqo_thread)) {
2260 ret = PTR_ERR(ctx->sqo_thread);
2261 ctx->sqo_thread = NULL;
2262 goto err;
2263 }
2264 wake_up_process(ctx->sqo_thread);
2265 } else if (p->flags & IORING_SETUP_SQ_AFF) {
2266 /* Can't have SQ_AFF without SQPOLL */
2267 ret = -EINVAL;
2268 goto err;
2269 }
2270
Jens Axboe2b188cc2019-01-07 10:46:33 -07002271 /* Do QD, or 2 * CPUS, whatever is smallest */
2272 ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
2273 min(ctx->sq_entries - 1, 2 * num_online_cpus()));
2274 if (!ctx->sqo_wq) {
2275 ret = -ENOMEM;
2276 goto err;
2277 }
2278
2279 return 0;
2280err:
Jens Axboe6c271ce2019-01-10 11:22:30 -07002281 io_sq_thread_stop(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002282 mmdrop(ctx->sqo_mm);
2283 ctx->sqo_mm = NULL;
2284 return ret;
2285}
2286
2287static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
2288{
2289 atomic_long_sub(nr_pages, &user->locked_vm);
2290}
2291
2292static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
2293{
2294 unsigned long page_limit, cur_pages, new_pages;
2295
2296 /* Don't allow more pages than we can safely lock */
2297 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
2298
2299 do {
2300 cur_pages = atomic_long_read(&user->locked_vm);
2301 new_pages = cur_pages + nr_pages;
2302 if (new_pages > page_limit)
2303 return -ENOMEM;
2304 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
2305 new_pages) != cur_pages);
2306
2307 return 0;
2308}
2309
2310static void io_mem_free(void *ptr)
2311{
2312 struct page *page = virt_to_head_page(ptr);
2313
2314 if (put_page_testzero(page))
2315 free_compound_page(page);
2316}
2317
2318static void *io_mem_alloc(size_t size)
2319{
2320 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
2321 __GFP_NORETRY;
2322
2323 return (void *) __get_free_pages(gfp_flags, get_order(size));
2324}
2325
2326static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
2327{
2328 struct io_sq_ring *sq_ring;
2329 struct io_cq_ring *cq_ring;
2330 size_t bytes;
2331
2332 bytes = struct_size(sq_ring, array, sq_entries);
2333 bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
2334 bytes += struct_size(cq_ring, cqes, cq_entries);
2335
2336 return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
2337}
2338
Jens Axboeedafcce2019-01-09 09:16:05 -07002339static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
2340{
2341 int i, j;
2342
2343 if (!ctx->user_bufs)
2344 return -ENXIO;
2345
2346 for (i = 0; i < ctx->nr_user_bufs; i++) {
2347 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
2348
2349 for (j = 0; j < imu->nr_bvecs; j++)
2350 put_page(imu->bvec[j].bv_page);
2351
2352 if (ctx->account_mem)
2353 io_unaccount_mem(ctx->user, imu->nr_bvecs);
2354 kfree(imu->bvec);
2355 imu->nr_bvecs = 0;
2356 }
2357
2358 kfree(ctx->user_bufs);
2359 ctx->user_bufs = NULL;
2360 ctx->nr_user_bufs = 0;
2361 return 0;
2362}
2363
2364static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
2365 void __user *arg, unsigned index)
2366{
2367 struct iovec __user *src;
2368
2369#ifdef CONFIG_COMPAT
2370 if (ctx->compat) {
2371 struct compat_iovec __user *ciovs;
2372 struct compat_iovec ciov;
2373
2374 ciovs = (struct compat_iovec __user *) arg;
2375 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
2376 return -EFAULT;
2377
2378 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
2379 dst->iov_len = ciov.iov_len;
2380 return 0;
2381 }
2382#endif
2383 src = (struct iovec __user *) arg;
2384 if (copy_from_user(dst, &src[index], sizeof(*dst)))
2385 return -EFAULT;
2386 return 0;
2387}
2388
2389static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
2390 unsigned nr_args)
2391{
2392 struct vm_area_struct **vmas = NULL;
2393 struct page **pages = NULL;
2394 int i, j, got_pages = 0;
2395 int ret = -EINVAL;
2396
2397 if (ctx->user_bufs)
2398 return -EBUSY;
2399 if (!nr_args || nr_args > UIO_MAXIOV)
2400 return -EINVAL;
2401
2402 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
2403 GFP_KERNEL);
2404 if (!ctx->user_bufs)
2405 return -ENOMEM;
2406
2407 for (i = 0; i < nr_args; i++) {
2408 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
2409 unsigned long off, start, end, ubuf;
2410 int pret, nr_pages;
2411 struct iovec iov;
2412 size_t size;
2413
2414 ret = io_copy_iov(ctx, &iov, arg, i);
2415 if (ret)
2416 break;
2417
2418 /*
2419 * Don't impose further limits on the size and buffer
2420 * constraints here, we'll -EINVAL later when IO is
2421 * submitted if they are wrong.
2422 */
2423 ret = -EFAULT;
2424 if (!iov.iov_base || !iov.iov_len)
2425 goto err;
2426
2427 /* arbitrary limit, but we need something */
2428 if (iov.iov_len > SZ_1G)
2429 goto err;
2430
2431 ubuf = (unsigned long) iov.iov_base;
2432 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
2433 start = ubuf >> PAGE_SHIFT;
2434 nr_pages = end - start;
2435
2436 if (ctx->account_mem) {
2437 ret = io_account_mem(ctx->user, nr_pages);
2438 if (ret)
2439 goto err;
2440 }
2441
2442 ret = 0;
2443 if (!pages || nr_pages > got_pages) {
2444 kfree(vmas);
2445 kfree(pages);
2446 pages = kmalloc_array(nr_pages, sizeof(struct page *),
2447 GFP_KERNEL);
2448 vmas = kmalloc_array(nr_pages,
2449 sizeof(struct vm_area_struct *),
2450 GFP_KERNEL);
2451 if (!pages || !vmas) {
2452 ret = -ENOMEM;
2453 if (ctx->account_mem)
2454 io_unaccount_mem(ctx->user, nr_pages);
2455 goto err;
2456 }
2457 got_pages = nr_pages;
2458 }
2459
2460 imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
2461 GFP_KERNEL);
2462 ret = -ENOMEM;
2463 if (!imu->bvec) {
2464 if (ctx->account_mem)
2465 io_unaccount_mem(ctx->user, nr_pages);
2466 goto err;
2467 }
2468
2469 ret = 0;
2470 down_read(&current->mm->mmap_sem);
2471 pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
2472 pages, vmas);
2473 if (pret == nr_pages) {
2474 /* don't support file backed memory */
2475 for (j = 0; j < nr_pages; j++) {
2476 struct vm_area_struct *vma = vmas[j];
2477
2478 if (vma->vm_file &&
2479 !is_file_hugepages(vma->vm_file)) {
2480 ret = -EOPNOTSUPP;
2481 break;
2482 }
2483 }
2484 } else {
2485 ret = pret < 0 ? pret : -EFAULT;
2486 }
2487 up_read(&current->mm->mmap_sem);
2488 if (ret) {
2489 /*
2490 * if we did partial map, or found file backed vmas,
2491 * release any pages we did get
2492 */
2493 if (pret > 0) {
2494 for (j = 0; j < pret; j++)
2495 put_page(pages[j]);
2496 }
2497 if (ctx->account_mem)
2498 io_unaccount_mem(ctx->user, nr_pages);
2499 goto err;
2500 }
2501
2502 off = ubuf & ~PAGE_MASK;
2503 size = iov.iov_len;
2504 for (j = 0; j < nr_pages; j++) {
2505 size_t vec_len;
2506
2507 vec_len = min_t(size_t, size, PAGE_SIZE - off);
2508 imu->bvec[j].bv_page = pages[j];
2509 imu->bvec[j].bv_len = vec_len;
2510 imu->bvec[j].bv_offset = off;
2511 off = 0;
2512 size -= vec_len;
2513 }
2514 /* store original address for later verification */
2515 imu->ubuf = ubuf;
2516 imu->len = iov.iov_len;
2517 imu->nr_bvecs = nr_pages;
2518
2519 ctx->nr_user_bufs++;
2520 }
2521 kfree(pages);
2522 kfree(vmas);
2523 return 0;
2524err:
2525 kfree(pages);
2526 kfree(vmas);
2527 io_sqe_buffer_unregister(ctx);
2528 return ret;
2529}
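/*
 * Example (userspace-side sketch, not kernel code): registering pre-mapped
 * buffers and issuing a fixed-buffer read against one of them. The syscall
 * number is the x86-64 assignment for io_uring_register; error handling is
 * omitted and helper names are illustrative.
 */
#include <string.h>
#include <unistd.h>
#include <sys/uio.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

#ifndef __NR_io_uring_register
#define __NR_io_uring_register	427
#endif

static int example_register_buffers(int ring_fd, const struct iovec *iovs,
				    unsigned nr)
{
	/* each iovec gets pinned by io_sqe_buffer_register() above */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_BUFFERS, iovs, nr);
}

static void example_prep_read_fixed(struct io_uring_sqe *sqe, int fd,
				    const struct iovec *reg_iov,
				    unsigned short buf_index,
				    unsigned long long file_off)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READ_FIXED;
	sqe->fd = fd;
	sqe->addr = (unsigned long) reg_iov->iov_base;	/* inside the registered buffer */
	sqe->len = reg_iov->iov_len;
	sqe->off = file_off;
	sqe->buf_index = buf_index;	/* which registered buffer to use */
}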
2530
Jens Axboe2b188cc2019-01-07 10:46:33 -07002531static void io_ring_ctx_free(struct io_ring_ctx *ctx)
2532{
Jens Axboe6b063142019-01-10 22:13:58 -07002533 io_finish_async(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002534 if (ctx->sqo_mm)
2535 mmdrop(ctx->sqo_mm);
Jens Axboedef596e2019-01-09 08:59:42 -07002536
2537 io_iopoll_reap_events(ctx);
Jens Axboeedafcce2019-01-09 09:16:05 -07002538 io_sqe_buffer_unregister(ctx);
Jens Axboe6b063142019-01-10 22:13:58 -07002539 io_sqe_files_unregister(ctx);
Jens Axboedef596e2019-01-09 08:59:42 -07002540
Jens Axboe2b188cc2019-01-07 10:46:33 -07002541#if defined(CONFIG_UNIX)
2542 if (ctx->ring_sock)
2543 sock_release(ctx->ring_sock);
2544#endif
2545
2546 io_mem_free(ctx->sq_ring);
2547 io_mem_free(ctx->sq_sqes);
2548 io_mem_free(ctx->cq_ring);
2549
2550 percpu_ref_exit(&ctx->refs);
2551 if (ctx->account_mem)
2552 io_unaccount_mem(ctx->user,
2553 ring_pages(ctx->sq_entries, ctx->cq_entries));
2554 free_uid(ctx->user);
2555 kfree(ctx);
2556}
2557
2558static __poll_t io_uring_poll(struct file *file, poll_table *wait)
2559{
2560 struct io_ring_ctx *ctx = file->private_data;
2561 __poll_t mask = 0;
2562
2563 poll_wait(file, &ctx->cq_wait, wait);
2564 /* See comment at the top of this file */
2565 smp_rmb();
2566 if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
2567 mask |= EPOLLOUT | EPOLLWRNORM;
2568 if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
2569 mask |= EPOLLIN | EPOLLRDNORM;
2570
2571 return mask;
2572}
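/*
 * Example (userspace-side sketch, not kernel code): because the ring fd
 * implements ->poll, it can be added to an epoll set and the CQ ring reaped
 * only when EPOLLIN fires. Error handling is omitted.
 */
#include <sys/epoll.h>

static int example_arm_ring_epoll(int epoll_fd, int ring_fd)
{
	struct epoll_event ev = {
		.events = EPOLLIN,	/* completions pending, per io_uring_poll() */
		.data.fd = ring_fd,
	};

	return epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ring_fd, &ev);
}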
2573
2574static int io_uring_fasync(int fd, struct file *file, int on)
2575{
2576 struct io_ring_ctx *ctx = file->private_data;
2577
2578 return fasync_helper(fd, file, on, &ctx->cq_fasync);
2579}
2580
2581static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2582{
2583 mutex_lock(&ctx->uring_lock);
2584 percpu_ref_kill(&ctx->refs);
2585 mutex_unlock(&ctx->uring_lock);
2586
Jens Axboe221c5eb2019-01-17 09:41:58 -07002587 io_poll_remove_all(ctx);
Jens Axboedef596e2019-01-09 08:59:42 -07002588 io_iopoll_reap_events(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002589 wait_for_completion(&ctx->ctx_done);
2590 io_ring_ctx_free(ctx);
2591}
2592
2593static int io_uring_release(struct inode *inode, struct file *file)
2594{
2595 struct io_ring_ctx *ctx = file->private_data;
2596
2597 file->private_data = NULL;
2598 io_ring_ctx_wait_and_kill(ctx);
2599 return 0;
2600}
2601
2602static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2603{
2604 loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
2605 unsigned long sz = vma->vm_end - vma->vm_start;
2606 struct io_ring_ctx *ctx = file->private_data;
2607 unsigned long pfn;
2608 struct page *page;
2609 void *ptr;
2610
2611 switch (offset) {
2612 case IORING_OFF_SQ_RING:
2613 ptr = ctx->sq_ring;
2614 break;
2615 case IORING_OFF_SQES:
2616 ptr = ctx->sq_sqes;
2617 break;
2618 case IORING_OFF_CQ_RING:
2619 ptr = ctx->cq_ring;
2620 break;
2621 default:
2622 return -EINVAL;
2623 }
2624
2625 page = virt_to_head_page(ptr);
2626 if (sz > (PAGE_SIZE << compound_order(page)))
2627 return -EINVAL;
2628
2629 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
2630 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2631}
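/*
 * Example (userspace-side sketch, not kernel code): how an application maps
 * the three regions served above and locates the ring fields through the
 * offsets that io_uring_create() publishes in struct io_uring_params. The
 * syscall number is the x86-64 assignment for io_uring_setup; error handling
 * is omitted and the struct/helper names are illustrative.
 */
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

#ifndef __NR_io_uring_setup
#define __NR_io_uring_setup	425
#endif

struct example_ring {
	int fd;
	unsigned *sq_head, *sq_tail, *sq_array, sq_mask;
	unsigned *cq_head, *cq_tail, cq_mask;
	struct io_uring_sqe *sqes;
	struct io_uring_cqe *cqes;
};

static int example_map_ring(struct example_ring *r, unsigned entries)
{
	struct io_uring_params p = { 0 };
	unsigned char *sq, *cq;

	r->fd = syscall(__NR_io_uring_setup, entries, &p);
	if (r->fd < 0)
		return -1;

	sq = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(unsigned),
		  PROT_READ | PROT_WRITE, MAP_SHARED, r->fd, IORING_OFF_SQ_RING);
	cq = mmap(NULL, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
		  PROT_READ | PROT_WRITE, MAP_SHARED, r->fd, IORING_OFF_CQ_RING);
	r->sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
		       PROT_READ | PROT_WRITE, MAP_SHARED, r->fd, IORING_OFF_SQES);

	r->sq_head = (unsigned *)(sq + p.sq_off.head);
	r->sq_tail = (unsigned *)(sq + p.sq_off.tail);
	r->sq_array = (unsigned *)(sq + p.sq_off.array);
	r->sq_mask = *(unsigned *)(sq + p.sq_off.ring_mask);
	r->cq_head = (unsigned *)(cq + p.cq_off.head);
	r->cq_tail = (unsigned *)(cq + p.cq_off.tail);
	r->cqes = (struct io_uring_cqe *)(cq + p.cq_off.cqes);
	r->cq_mask = *(unsigned *)(cq + p.cq_off.ring_mask);
	return 0;
}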
2632
2633SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
2634 u32, min_complete, u32, flags, const sigset_t __user *, sig,
2635 size_t, sigsz)
2636{
2637 struct io_ring_ctx *ctx;
2638 long ret = -EBADF;
2639 int submitted = 0;
2640 struct fd f;
2641
Jens Axboe6c271ce2019-01-10 11:22:30 -07002642 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
Jens Axboe2b188cc2019-01-07 10:46:33 -07002643 return -EINVAL;
2644
2645 f = fdget(fd);
2646 if (!f.file)
2647 return -EBADF;
2648
2649 ret = -EOPNOTSUPP;
2650 if (f.file->f_op != &io_uring_fops)
2651 goto out_fput;
2652
2653 ret = -ENXIO;
2654 ctx = f.file->private_data;
2655 if (!percpu_ref_tryget(&ctx->refs))
2656 goto out_fput;
2657
Jens Axboe6c271ce2019-01-10 11:22:30 -07002658 /*
2659 * For SQ polling, the thread will do all submissions and completions.
2660 * Just return the requested submit count, and wake the thread if
2661 * we were asked to.
2662 */
2663 if (ctx->flags & IORING_SETUP_SQPOLL) {
2664 if (flags & IORING_ENTER_SQ_WAKEUP)
2665 wake_up(&ctx->sqo_wait);
2666 submitted = to_submit;
2667 goto out_ctx;
2668 }
2669
Jens Axboe2b188cc2019-01-07 10:46:33 -07002670 ret = 0;
2671 if (to_submit) {
2672 to_submit = min(to_submit, ctx->sq_entries);
2673
2674 mutex_lock(&ctx->uring_lock);
2675 submitted = io_ring_submit(ctx, to_submit);
2676 mutex_unlock(&ctx->uring_lock);
2677
2678 if (submitted < 0)
2679 goto out_ctx;
2680 }
2681 if (flags & IORING_ENTER_GETEVENTS) {
Jens Axboedef596e2019-01-09 08:59:42 -07002682 unsigned nr_events = 0;
2683
Jens Axboe2b188cc2019-01-07 10:46:33 -07002684 min_complete = min(min_complete, ctx->cq_entries);
2685
2686 /*
2687 * The application could have included the 'to_submit' count
2688 * in how many events it wanted to wait for. If we failed to
2689 * submit the desired count, we may need to adjust the number
2690 * of events to poll/wait for.
2691 */
2692 if (submitted < to_submit)
2693 min_complete = min_t(unsigned, submitted, min_complete);
2694
Jens Axboedef596e2019-01-09 08:59:42 -07002695 if (ctx->flags & IORING_SETUP_IOPOLL) {
2696 mutex_lock(&ctx->uring_lock);
2697 ret = io_iopoll_check(ctx, &nr_events, min_complete);
2698 mutex_unlock(&ctx->uring_lock);
2699 } else {
2700 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
2701 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07002702 }
2703
2704out_ctx:
2705 io_ring_drop_ctx_refs(ctx, 1);
2706out_fput:
2707 fdput(f);
2708 return submitted ? submitted : ret;
2709}
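/*
 * Example (userspace-side sketch, not kernel code): a plain submit-and-wait
 * call matching the to_submit/min_complete handling above (no SQPOLL). The
 * syscall number is the x86-64 assignment for io_uring_enter.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

#ifndef __NR_io_uring_enter
#define __NR_io_uring_enter	426
#endif

static int example_submit_and_wait(int ring_fd, unsigned to_submit,
				   unsigned min_complete)
{
	/*
	 * Returns the number of SQEs consumed; IORING_ENTER_GETEVENTS makes
	 * the call also wait until at least min_complete CQEs are available.
	 */
	return syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
		       IORING_ENTER_GETEVENTS, NULL, 0);
}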
2710
2711static const struct file_operations io_uring_fops = {
2712 .release = io_uring_release,
2713 .mmap = io_uring_mmap,
2714 .poll = io_uring_poll,
2715 .fasync = io_uring_fasync,
2716};
2717
2718static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
2719 struct io_uring_params *p)
2720{
2721 struct io_sq_ring *sq_ring;
2722 struct io_cq_ring *cq_ring;
2723 size_t size;
2724
2725 sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
2726 if (!sq_ring)
2727 return -ENOMEM;
2728
2729 ctx->sq_ring = sq_ring;
2730 sq_ring->ring_mask = p->sq_entries - 1;
2731 sq_ring->ring_entries = p->sq_entries;
2732 ctx->sq_mask = sq_ring->ring_mask;
2733 ctx->sq_entries = sq_ring->ring_entries;
2734
2735 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
2736 if (size == SIZE_MAX)
2737 return -EOVERFLOW;
2738
2739 ctx->sq_sqes = io_mem_alloc(size);
2740 if (!ctx->sq_sqes) {
2741 io_mem_free(ctx->sq_ring);
2742 return -ENOMEM;
2743 }
2744
2745 cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
2746 if (!cq_ring) {
2747 io_mem_free(ctx->sq_ring);
2748 io_mem_free(ctx->sq_sqes);
2749 return -ENOMEM;
2750 }
2751
2752 ctx->cq_ring = cq_ring;
2753 cq_ring->ring_mask = p->cq_entries - 1;
2754 cq_ring->ring_entries = p->cq_entries;
2755 ctx->cq_mask = cq_ring->ring_mask;
2756 ctx->cq_entries = cq_ring->ring_entries;
2757 return 0;
2758}
2759
2760/*
2761 * Allocate an anonymous fd; this is what constitutes the application
2762 * visible backing of an io_uring instance. The application mmaps this
2763 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
2764 * we have to tie this fd to a socket for file garbage collection purposes.
2765 */
2766static int io_uring_get_fd(struct io_ring_ctx *ctx)
2767{
2768 struct file *file;
2769 int ret;
2770
2771#if defined(CONFIG_UNIX)
2772 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
2773 &ctx->ring_sock);
2774 if (ret)
2775 return ret;
2776#endif
2777
2778 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2779 if (ret < 0)
2780 goto err;
2781
2782 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
2783 O_RDWR | O_CLOEXEC);
2784 if (IS_ERR(file)) {
2785 put_unused_fd(ret);
2786 ret = PTR_ERR(file);
2787 goto err;
2788 }
2789
2790#if defined(CONFIG_UNIX)
2791 ctx->ring_sock->file = file;
Jens Axboe6b063142019-01-10 22:13:58 -07002792 ctx->ring_sock->sk->sk_user_data = ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002793#endif
2794 fd_install(ret, file);
2795 return ret;
2796err:
2797#if defined(CONFIG_UNIX)
2798 sock_release(ctx->ring_sock);
2799 ctx->ring_sock = NULL;
2800#endif
2801 return ret;
2802}
2803
2804static int io_uring_create(unsigned entries, struct io_uring_params *p)
2805{
2806 struct user_struct *user = NULL;
2807 struct io_ring_ctx *ctx;
2808 bool account_mem;
2809 int ret;
2810
2811 if (!entries || entries > IORING_MAX_ENTRIES)
2812 return -EINVAL;
2813
2814 /*
2815 * Use twice as many entries for the CQ ring. It's possible for the
2816 * application to drive a higher depth than the size of the SQ ring,
2817 * since the sqes are only used at submission time. This allows for
2818 * some flexibility in overcommitting a bit.
2819 */
2820 p->sq_entries = roundup_pow_of_two(entries);
2821 p->cq_entries = 2 * p->sq_entries;
2822
2823 user = get_uid(current_user());
2824 account_mem = !capable(CAP_IPC_LOCK);
2825
2826 if (account_mem) {
2827 ret = io_account_mem(user,
2828 ring_pages(p->sq_entries, p->cq_entries));
2829 if (ret) {
2830 free_uid(user);
2831 return ret;
2832 }
2833 }
2834
2835 ctx = io_ring_ctx_alloc(p);
2836 if (!ctx) {
2837 if (account_mem)
2838 io_unaccount_mem(user, ring_pages(p->sq_entries,
2839 p->cq_entries));
2840 free_uid(user);
2841 return -ENOMEM;
2842 }
2843 ctx->compat = in_compat_syscall();
2844 ctx->account_mem = account_mem;
2845 ctx->user = user;
2846
2847 ret = io_allocate_scq_urings(ctx, p);
2848 if (ret)
2849 goto err;
2850
Jens Axboe6c271ce2019-01-10 11:22:30 -07002851 ret = io_sq_offload_start(ctx, p);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002852 if (ret)
2853 goto err;
2854
2855 ret = io_uring_get_fd(ctx);
2856 if (ret < 0)
2857 goto err;
2858
2859 memset(&p->sq_off, 0, sizeof(p->sq_off));
2860 p->sq_off.head = offsetof(struct io_sq_ring, r.head);
2861 p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
2862 p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
2863 p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
2864 p->sq_off.flags = offsetof(struct io_sq_ring, flags);
2865 p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
2866 p->sq_off.array = offsetof(struct io_sq_ring, array);
2867
2868 memset(&p->cq_off, 0, sizeof(p->cq_off));
2869 p->cq_off.head = offsetof(struct io_cq_ring, r.head);
2870 p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
2871 p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
2872 p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
2873 p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
2874 p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
2875 return ret;
2876err:
2877 io_ring_ctx_wait_and_kill(ctx);
2878 return ret;
2879}
2880
2881/*
2882 * Sets up an io_uring context, and returns the fd. The application asks for a
2883 * ring size; we return the actual sq/cq ring sizes (among other things) in the
2884 * params structure passed in.
2885 */
2886static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
2887{
2888 struct io_uring_params p;
2889 long ret;
2890 int i;
2891
2892 if (copy_from_user(&p, params, sizeof(p)))
2893 return -EFAULT;
2894 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
2895 if (p.resv[i])
2896 return -EINVAL;
2897 }
2898
Jens Axboe6c271ce2019-01-10 11:22:30 -07002899 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
2900 IORING_SETUP_SQ_AFF))
Jens Axboe2b188cc2019-01-07 10:46:33 -07002901 return -EINVAL;
2902
2903 ret = io_uring_create(entries, &p);
2904 if (ret < 0)
2905 return ret;
2906
2907 if (copy_to_user(params, &p, sizeof(p)))
2908 return -EFAULT;
2909
2910 return ret;
2911}
2912
2913SYSCALL_DEFINE2(io_uring_setup, u32, entries,
2914 struct io_uring_params __user *, params)
2915{
2916 return io_uring_setup(entries, params);
2917}
2918
Jens Axboeedafcce2019-01-09 09:16:05 -07002919static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
2920 void __user *arg, unsigned nr_args)
2921{
2922 int ret;
2923
2924 percpu_ref_kill(&ctx->refs);
2925 wait_for_completion(&ctx->ctx_done);
2926
2927 switch (opcode) {
2928 case IORING_REGISTER_BUFFERS:
2929 ret = io_sqe_buffer_register(ctx, arg, nr_args);
2930 break;
2931 case IORING_UNREGISTER_BUFFERS:
2932 ret = -EINVAL;
2933 if (arg || nr_args)
2934 break;
2935 ret = io_sqe_buffer_unregister(ctx);
2936 break;
Jens Axboe6b063142019-01-10 22:13:58 -07002937 case IORING_REGISTER_FILES:
2938 ret = io_sqe_files_register(ctx, arg, nr_args);
2939 break;
2940 case IORING_UNREGISTER_FILES:
2941 ret = -EINVAL;
2942 if (arg || nr_args)
2943 break;
2944 ret = io_sqe_files_unregister(ctx);
2945 break;
Jens Axboeedafcce2019-01-09 09:16:05 -07002946 default:
2947 ret = -EINVAL;
2948 break;
2949 }
2950
2951 /* bring the ctx back to life */
2952 reinit_completion(&ctx->ctx_done);
2953 percpu_ref_reinit(&ctx->refs);
2954 return ret;
2955}
2956
2957SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
2958 void __user *, arg, unsigned int, nr_args)
2959{
2960 struct io_ring_ctx *ctx;
2961 long ret = -EBADF;
2962 struct fd f;
2963
2964 f = fdget(fd);
2965 if (!f.file)
2966 return -EBADF;
2967
2968 ret = -EOPNOTSUPP;
2969 if (f.file->f_op != &io_uring_fops)
2970 goto out_fput;
2971
2972 ctx = f.file->private_data;
2973
2974 mutex_lock(&ctx->uring_lock);
2975 ret = __io_uring_register(ctx, opcode, arg, nr_args);
2976 mutex_unlock(&ctx->uring_lock);
2977out_fput:
2978 fdput(f);
2979 return ret;
2980}
2981
Jens Axboe2b188cc2019-01-07 10:46:33 -07002982static int __init io_uring_init(void)
2983{
2984 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
2985 return 0;
2986};
2987__initcall(io_uring_init);