Jens Axboe2b188cc2019-01-07 10:46:33 -07001// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side. When the application reads the CQ ring
8 * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
9 * the kernel uses after writing the tail. Failure to do so could cause a
 10 * delay in when the application notices that completion events are available.
11 * This isn't a fatal condition. Likewise, the application must use an
12 * appropriate smp_wmb() both before writing the SQ tail, and after writing
13 * the SQ tail. The first one orders the sqe writes with the tail write, and
14 * the latter is paired with the smp_rmb() the kernel will issue before
15 * reading the SQ tail on submission.
16 *
17 * Also see the examples in the liburing library:
18 *
19 * git://git.kernel.dk/liburing
20 *
21 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
22 * from data shared between the kernel and application. This is done both
 23 * for ordering purposes and to ensure that once a value is loaded from
24 * data that the application could potentially modify, it remains stable.
25 *
26 * Copyright (C) 2018-2019 Jens Axboe
Christoph Hellwigc992fe22019-01-11 09:43:02 -070027 * Copyright (c) 2018-2019 Christoph Hellwig
Jens Axboe2b188cc2019-01-07 10:46:33 -070028 */
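
/*
 * Illustrative userspace-side sketch of the CQ consumption protocol
 * described above; not part of this file. The pointer names are
 * assumptions, and liburing wraps the same steps in helpers such as
 * io_uring_wait_cqe() and io_uring_cqe_seen(). The smp_rmb() pairs with
 * the kernel's smp_wmb() after the tail store, and the final head store
 * is what the kernel later picks up with READ_ONCE():
 *
 *	unsigned head = *cq_head;
 *	unsigned tail = *cq_tail;
 *	smp_rmb();
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_ring_mask];
 *		... consume cqe->user_data and cqe->res ...
 *		head++;
 *	}
 *	*cq_head = head;
 */
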
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/errno.h>
32#include <linux/syscalls.h>
33#include <linux/compat.h>
34#include <linux/refcount.h>
35#include <linux/uio.h>
36
37#include <linux/sched/signal.h>
38#include <linux/fs.h>
39#include <linux/file.h>
40#include <linux/fdtable.h>
41#include <linux/mm.h>
42#include <linux/mman.h>
43#include <linux/mmu_context.h>
44#include <linux/percpu.h>
45#include <linux/slab.h>
46#include <linux/workqueue.h>
Jens Axboe6c271ce2019-01-10 11:22:30 -070047#include <linux/kthread.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070048#include <linux/blkdev.h>
Jens Axboeedafcce2019-01-09 09:16:05 -070049#include <linux/bvec.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070050#include <linux/net.h>
51#include <net/sock.h>
52#include <net/af_unix.h>
Jens Axboe6b063142019-01-10 22:13:58 -070053#include <net/scm.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070054#include <linux/anon_inodes.h>
55#include <linux/sched/mm.h>
56#include <linux/uaccess.h>
57#include <linux/nospec.h>
Jens Axboeedafcce2019-01-09 09:16:05 -070058#include <linux/sizes.h>
59#include <linux/hugetlb.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070060
61#include <uapi/linux/io_uring.h>
62
63#include "internal.h"
64
65#define IORING_MAX_ENTRIES 4096
Jens Axboe6b063142019-01-10 22:13:58 -070066#define IORING_MAX_FIXED_FILES 1024
Jens Axboe2b188cc2019-01-07 10:46:33 -070067
68struct io_uring {
69 u32 head ____cacheline_aligned_in_smp;
70 u32 tail ____cacheline_aligned_in_smp;
71};
72
73struct io_sq_ring {
74 struct io_uring r;
75 u32 ring_mask;
76 u32 ring_entries;
77 u32 dropped;
78 u32 flags;
79 u32 array[];
80};
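
/*
 * Note that array[] holds indices into the separately mapped SQE array
 * rather than the SQEs themselves, so an application can fill any free SQE
 * slot and only publish its index when it advances the tail.
 */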
81
82struct io_cq_ring {
83 struct io_uring r;
84 u32 ring_mask;
85 u32 ring_entries;
86 u32 overflow;
87 struct io_uring_cqe cqes[];
88};
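
/*
 * Completions that don't fit in the CQ ring are not silently dropped; the
 * ->overflow counter above is bumped instead (see io_cqring_fill_event()),
 * so the application can detect that it fell behind.
 */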
89
Jens Axboeedafcce2019-01-09 09:16:05 -070090struct io_mapped_ubuf {
91 u64 ubuf;
92 size_t len;
93 struct bio_vec *bvec;
94 unsigned int nr_bvecs;
95};
96
Jens Axboe31b51512019-01-18 22:56:34 -070097struct async_list {
98 spinlock_t lock;
99 atomic_t cnt;
100 struct list_head list;
101
102 struct file *file;
103 off_t io_end;
104 size_t io_pages;
105};
106
Jens Axboe2b188cc2019-01-07 10:46:33 -0700107struct io_ring_ctx {
108 struct {
109 struct percpu_ref refs;
110 } ____cacheline_aligned_in_smp;
111
112 struct {
113 unsigned int flags;
114 bool compat;
115 bool account_mem;
116
117 /* SQ ring */
118 struct io_sq_ring *sq_ring;
119 unsigned cached_sq_head;
120 unsigned sq_entries;
121 unsigned sq_mask;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700122 unsigned sq_thread_idle;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700123 struct io_uring_sqe *sq_sqes;
124 } ____cacheline_aligned_in_smp;
125
126 /* IO offload */
127 struct workqueue_struct *sqo_wq;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700128 struct task_struct *sqo_thread; /* if using sq thread polling */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700129 struct mm_struct *sqo_mm;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700130 wait_queue_head_t sqo_wait;
131 unsigned sqo_stop;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700132
133 struct {
134 /* CQ ring */
135 struct io_cq_ring *cq_ring;
136 unsigned cached_cq_tail;
137 unsigned cq_entries;
138 unsigned cq_mask;
139 struct wait_queue_head cq_wait;
140 struct fasync_struct *cq_fasync;
141 } ____cacheline_aligned_in_smp;
142
Jens Axboe6b063142019-01-10 22:13:58 -0700143 /*
144 * If used, fixed file set. Writers must ensure that ->refs is dead,
145 * readers must ensure that ->refs is alive as long as the file* is
146 * used. Only updated through io_uring_register(2).
147 */
148 struct file **user_files;
149 unsigned nr_user_files;
150
Jens Axboeedafcce2019-01-09 09:16:05 -0700151 /* if used, fixed mapped user buffers */
152 unsigned nr_user_bufs;
153 struct io_mapped_ubuf *user_bufs;
154
Jens Axboe2b188cc2019-01-07 10:46:33 -0700155 struct user_struct *user;
156
157 struct completion ctx_done;
158
159 struct {
160 struct mutex uring_lock;
161 wait_queue_head_t wait;
162 } ____cacheline_aligned_in_smp;
163
164 struct {
165 spinlock_t completion_lock;
Jens Axboedef596e2019-01-09 08:59:42 -0700166 bool poll_multi_file;
167 /*
168 * ->poll_list is protected by the ctx->uring_lock for
169 * io_uring instances that don't use IORING_SETUP_SQPOLL.
170 * For SQPOLL, only the single threaded io_sq_thread() will
171 * manipulate the list, hence no extra locking is needed there.
172 */
173 struct list_head poll_list;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700174 struct list_head cancel_list;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700175 } ____cacheline_aligned_in_smp;
176
Jens Axboe31b51512019-01-18 22:56:34 -0700177 struct async_list pending_async[2];
178
Jens Axboe2b188cc2019-01-07 10:46:33 -0700179#if defined(CONFIG_UNIX)
180 struct socket *ring_sock;
181#endif
182};
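
/*
 * The ____cacheline_aligned_in_smp groupings above keep the reference
 * counting, submission-side (SQ), and completion-side (CQ) hot fields on
 * separate cache lines, so submitters and completers don't false-share.
 */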
183
184struct sqe_submit {
185 const struct io_uring_sqe *sqe;
186 unsigned short index;
187 bool has_user;
Jens Axboedef596e2019-01-09 08:59:42 -0700188 bool needs_lock;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700189 bool needs_fixed_file;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700190};
191
Jens Axboe09bb8392019-03-13 12:39:28 -0600192/*
193 * First field must be the file pointer in all the
194 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
195 */
Jens Axboe221c5eb2019-01-17 09:41:58 -0700196struct io_poll_iocb {
197 struct file *file;
198 struct wait_queue_head *head;
199 __poll_t events;
Jens Axboe8c838782019-03-12 15:48:16 -0600200 bool done;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700201 bool canceled;
202 struct wait_queue_entry wait;
203};
204
Jens Axboe09bb8392019-03-13 12:39:28 -0600205/*
206 * NOTE! Each of the iocb union members has the file pointer
207 * as the first entry in their struct definition. So you can
208 * access the file pointer through any of the sub-structs,
209 * or directly as just 'ki_filp' in this struct.
210 */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700211struct io_kiocb {
Jens Axboe221c5eb2019-01-17 09:41:58 -0700212 union {
Jens Axboe09bb8392019-03-13 12:39:28 -0600213 struct file *file;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700214 struct kiocb rw;
215 struct io_poll_iocb poll;
216 };
Jens Axboe2b188cc2019-01-07 10:46:33 -0700217
218 struct sqe_submit submit;
219
220 struct io_ring_ctx *ctx;
221 struct list_head list;
222 unsigned int flags;
Jens Axboec16361c2019-01-17 08:39:48 -0700223 refcount_t refs;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700224#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
Jens Axboedef596e2019-01-09 08:59:42 -0700225#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
Jens Axboe6b063142019-01-10 22:13:58 -0700226#define REQ_F_FIXED_FILE 4 /* ctx owns file */
Jens Axboe31b51512019-01-18 22:56:34 -0700227#define REQ_F_SEQ_PREV 8 /* sequential with previous */
Jens Axboed530a402019-03-13 12:15:01 -0600228#define REQ_F_PREPPED 16 /* prep already done */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700229 u64 user_data;
Jens Axboedef596e2019-01-09 08:59:42 -0700230 u64 error;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700231
232 struct work_struct work;
233};
234
235#define IO_PLUG_THRESHOLD 2
Jens Axboedef596e2019-01-09 08:59:42 -0700236#define IO_IOPOLL_BATCH 8
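
/*
 * IO_PLUG_THRESHOLD is presumably the minimum number of SQEs in a single
 * submission before setting up a block plug is worthwhile; IO_IOPOLL_BATCH
 * sizes both the io_kiocb allocation cache and the bulk free of completed
 * polled requests (see io_iopoll_complete()).
 */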
Jens Axboe2b188cc2019-01-07 10:46:33 -0700237
Jens Axboe9a56a232019-01-09 09:06:50 -0700238struct io_submit_state {
239 struct blk_plug plug;
240
241 /*
Jens Axboe2579f912019-01-09 09:10:43 -0700242 * io_kiocb alloc cache
243 */
244 void *reqs[IO_IOPOLL_BATCH];
245 unsigned int free_reqs;
246 unsigned int cur_req;
247
248 /*
Jens Axboe9a56a232019-01-09 09:06:50 -0700249 * File reference cache
250 */
251 struct file *file;
252 unsigned int fd;
253 unsigned int has_refs;
254 unsigned int used_refs;
255 unsigned int ios_left;
256};
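
/*
 * io_submit_state batches work across one submission loop: a block plug,
 * a small cache of preallocated io_kiocbs, and extra references on the
 * most recently used file, so back-to-back IO against the same fd avoids
 * a full fget()/fput() per request (see io_file_get()).
 */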
257
Jens Axboe2b188cc2019-01-07 10:46:33 -0700258static struct kmem_cache *req_cachep;
259
260static const struct file_operations io_uring_fops;
261
262struct sock *io_uring_get_socket(struct file *file)
263{
264#if defined(CONFIG_UNIX)
265 if (file->f_op == &io_uring_fops) {
266 struct io_ring_ctx *ctx = file->private_data;
267
268 return ctx->ring_sock->sk;
269 }
270#endif
271 return NULL;
272}
273EXPORT_SYMBOL(io_uring_get_socket);
274
275static void io_ring_ctx_ref_free(struct percpu_ref *ref)
276{
277 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
278
279 complete(&ctx->ctx_done);
280}
281
282static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
283{
284 struct io_ring_ctx *ctx;
Jens Axboe31b51512019-01-18 22:56:34 -0700285 int i;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700286
287 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
288 if (!ctx)
289 return NULL;
290
291 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
292 kfree(ctx);
293 return NULL;
294 }
295
296 ctx->flags = p->flags;
297 init_waitqueue_head(&ctx->cq_wait);
298 init_completion(&ctx->ctx_done);
299 mutex_init(&ctx->uring_lock);
300 init_waitqueue_head(&ctx->wait);
Jens Axboe31b51512019-01-18 22:56:34 -0700301 for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
302 spin_lock_init(&ctx->pending_async[i].lock);
303 INIT_LIST_HEAD(&ctx->pending_async[i].list);
304 atomic_set(&ctx->pending_async[i].cnt, 0);
305 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700306 spin_lock_init(&ctx->completion_lock);
Jens Axboedef596e2019-01-09 08:59:42 -0700307 INIT_LIST_HEAD(&ctx->poll_list);
Jens Axboe221c5eb2019-01-17 09:41:58 -0700308 INIT_LIST_HEAD(&ctx->cancel_list);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700309 return ctx;
310}
311
312static void io_commit_cqring(struct io_ring_ctx *ctx)
313{
314 struct io_cq_ring *ring = ctx->cq_ring;
315
316 if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
317 /* order cqe stores with ring update */
318 smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
319
320 /*
321 * Write sider barrier of tail update, app has read side. See
322 * comment at the top of this file.
323 */
324 smp_wmb();
325
326 if (wq_has_sleeper(&ctx->cq_wait)) {
327 wake_up_interruptible(&ctx->cq_wait);
328 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
329 }
330 }
331}
332
333static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
334{
335 struct io_cq_ring *ring = ctx->cq_ring;
336 unsigned tail;
337
338 tail = ctx->cached_cq_tail;
339 /* See comment at the top of the file */
340 smp_rmb();
Jens Axboe74f464e2019-04-17 08:57:48 -0600341 if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700342 return NULL;
343
344 ctx->cached_cq_tail++;
345 return &ring->cqes[tail & ctx->cq_mask];
346}
347
348static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
349 long res, unsigned ev_flags)
350{
351 struct io_uring_cqe *cqe;
352
353 /*
354 * If we can't get a cq entry, userspace overflowed the
355 * submission (by quite a lot). Increment the overflow count in
356 * the ring.
357 */
358 cqe = io_get_cqring(ctx);
359 if (cqe) {
360 WRITE_ONCE(cqe->user_data, ki_user_data);
361 WRITE_ONCE(cqe->res, res);
362 WRITE_ONCE(cqe->flags, ev_flags);
363 } else {
364 unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
365
366 WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
367 }
368}
369
Jens Axboe8c838782019-03-12 15:48:16 -0600370static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
371{
372 if (waitqueue_active(&ctx->wait))
373 wake_up(&ctx->wait);
374 if (waitqueue_active(&ctx->sqo_wait))
375 wake_up(&ctx->sqo_wait);
376}
377
378static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
Jens Axboe2b188cc2019-01-07 10:46:33 -0700379 long res, unsigned ev_flags)
380{
381 unsigned long flags;
382
383 spin_lock_irqsave(&ctx->completion_lock, flags);
Jens Axboe8c838782019-03-12 15:48:16 -0600384 io_cqring_fill_event(ctx, user_data, res, ev_flags);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700385 io_commit_cqring(ctx);
386 spin_unlock_irqrestore(&ctx->completion_lock, flags);
387
Jens Axboe8c838782019-03-12 15:48:16 -0600388 io_cqring_ev_posted(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700389}
390
391static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
392{
393 percpu_ref_put_many(&ctx->refs, refs);
394
395 if (waitqueue_active(&ctx->wait))
396 wake_up(&ctx->wait);
397}
398
Jens Axboe2579f912019-01-09 09:10:43 -0700399static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
400 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700401{
Jens Axboefd6fab22019-03-14 16:30:06 -0600402 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700403 struct io_kiocb *req;
404
405 if (!percpu_ref_tryget(&ctx->refs))
406 return NULL;
407
Jens Axboe2579f912019-01-09 09:10:43 -0700408 if (!state) {
Jens Axboefd6fab22019-03-14 16:30:06 -0600409 req = kmem_cache_alloc(req_cachep, gfp);
Jens Axboe2579f912019-01-09 09:10:43 -0700410 if (unlikely(!req))
411 goto out;
412 } else if (!state->free_reqs) {
413 size_t sz;
414 int ret;
415
416 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
Jens Axboefd6fab22019-03-14 16:30:06 -0600417 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
418
419 /*
420 * Bulk alloc is all-or-nothing. If we fail to get a batch,
421 * retry single alloc to be on the safe side.
422 */
423 if (unlikely(ret <= 0)) {
424 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
425 if (!state->reqs[0])
426 goto out;
427 ret = 1;
428 }
Jens Axboe2579f912019-01-09 09:10:43 -0700429 state->free_reqs = ret - 1;
430 state->cur_req = 1;
431 req = state->reqs[0];
432 } else {
433 req = state->reqs[state->cur_req];
434 state->free_reqs--;
435 state->cur_req++;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700436 }
437
Jens Axboe2579f912019-01-09 09:10:43 -0700438 req->ctx = ctx;
439 req->flags = 0;
Jens Axboee65ef562019-03-12 10:16:44 -0600440 /* one is dropped after submission, the other at completion */
441 refcount_set(&req->refs, 2);
Jens Axboe2579f912019-01-09 09:10:43 -0700442 return req;
443out:
Jens Axboe2b188cc2019-01-07 10:46:33 -0700444 io_ring_drop_ctx_refs(ctx, 1);
445 return NULL;
446}
447
Jens Axboedef596e2019-01-09 08:59:42 -0700448static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
449{
450 if (*nr) {
451 kmem_cache_free_bulk(req_cachep, *nr, reqs);
452 io_ring_drop_ctx_refs(ctx, *nr);
453 *nr = 0;
454 }
455}
456
Jens Axboe2b188cc2019-01-07 10:46:33 -0700457static void io_free_req(struct io_kiocb *req)
458{
Jens Axboe09bb8392019-03-13 12:39:28 -0600459 if (req->file && !(req->flags & REQ_F_FIXED_FILE))
460 fput(req->file);
Jens Axboee65ef562019-03-12 10:16:44 -0600461 io_ring_drop_ctx_refs(req->ctx, 1);
462 kmem_cache_free(req_cachep, req);
463}
464
465static void io_put_req(struct io_kiocb *req)
466{
467 if (refcount_dec_and_test(&req->refs))
468 io_free_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700469}
470
Jens Axboedef596e2019-01-09 08:59:42 -0700471/*
472 * Find and free completed poll iocbs
473 */
474static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
475 struct list_head *done)
476{
477 void *reqs[IO_IOPOLL_BATCH];
478 struct io_kiocb *req;
Jens Axboe09bb8392019-03-13 12:39:28 -0600479 int to_free;
Jens Axboedef596e2019-01-09 08:59:42 -0700480
Jens Axboe09bb8392019-03-13 12:39:28 -0600481 to_free = 0;
Jens Axboedef596e2019-01-09 08:59:42 -0700482 while (!list_empty(done)) {
483 req = list_first_entry(done, struct io_kiocb, list);
484 list_del(&req->list);
485
486 io_cqring_fill_event(ctx, req->user_data, req->error, 0);
Jens Axboedef596e2019-01-09 08:59:42 -0700487 (*nr_events)++;
488
Jens Axboe09bb8392019-03-13 12:39:28 -0600489 if (refcount_dec_and_test(&req->refs)) {
490 /* If we're not using fixed files, we have to pair the
491 * completion part with the file put. Use regular
492 * completions for those, only batch free for fixed
493 * file.
494 */
495 if (req->flags & REQ_F_FIXED_FILE) {
496 reqs[to_free++] = req;
497 if (to_free == ARRAY_SIZE(reqs))
498 io_free_req_many(ctx, reqs, &to_free);
Jens Axboe6b063142019-01-10 22:13:58 -0700499 } else {
Jens Axboe09bb8392019-03-13 12:39:28 -0600500 io_free_req(req);
Jens Axboe6b063142019-01-10 22:13:58 -0700501 }
Jens Axboe9a56a232019-01-09 09:06:50 -0700502 }
Jens Axboedef596e2019-01-09 08:59:42 -0700503 }
Jens Axboedef596e2019-01-09 08:59:42 -0700504
Jens Axboe09bb8392019-03-13 12:39:28 -0600505 io_commit_cqring(ctx);
Jens Axboedef596e2019-01-09 08:59:42 -0700506 io_free_req_many(ctx, reqs, &to_free);
507}
508
509static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
510 long min)
511{
512 struct io_kiocb *req, *tmp;
513 LIST_HEAD(done);
514 bool spin;
515 int ret;
516
517 /*
518 * Only spin for completions if we don't have multiple devices hanging
519 * off our complete list, and we're under the requested amount.
520 */
521 spin = !ctx->poll_multi_file && *nr_events < min;
522
523 ret = 0;
524 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
525 struct kiocb *kiocb = &req->rw;
526
527 /*
528 * Move completed entries to our local list. If we find a
529 * request that requires polling, break out and complete
530 * the done list first, if we have entries there.
531 */
532 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
533 list_move_tail(&req->list, &done);
534 continue;
535 }
536 if (!list_empty(&done))
537 break;
538
539 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
540 if (ret < 0)
541 break;
542
543 if (ret && spin)
544 spin = false;
545 ret = 0;
546 }
547
548 if (!list_empty(&done))
549 io_iopoll_complete(ctx, nr_events, &done);
550
551 return ret;
552}
553
554/*
 555 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
556 * non-spinning poll check - we'll still enter the driver poll loop, but only
557 * as a non-spinning completion check.
558 */
559static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
560 long min)
561{
562 while (!list_empty(&ctx->poll_list)) {
563 int ret;
564
565 ret = io_do_iopoll(ctx, nr_events, min);
566 if (ret < 0)
567 return ret;
568 if (!min || *nr_events >= min)
569 return 0;
570 }
571
572 return 1;
573}
574
575/*
576 * We can't just wait for polled events to come to us, we have to actively
577 * find and complete them.
578 */
579static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
580{
581 if (!(ctx->flags & IORING_SETUP_IOPOLL))
582 return;
583
584 mutex_lock(&ctx->uring_lock);
585 while (!list_empty(&ctx->poll_list)) {
586 unsigned int nr_events = 0;
587
588 io_iopoll_getevents(ctx, &nr_events, 1);
589 }
590 mutex_unlock(&ctx->uring_lock);
591}
592
593static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
594 long min)
595{
596 int ret = 0;
597
598 do {
599 int tmin = 0;
600
601 if (*nr_events < min)
602 tmin = min - *nr_events;
603
604 ret = io_iopoll_getevents(ctx, nr_events, tmin);
605 if (ret <= 0)
606 break;
607 ret = 0;
608 } while (min && !*nr_events && !need_resched());
609
610 return ret;
611}
612
Jens Axboe2b188cc2019-01-07 10:46:33 -0700613static void kiocb_end_write(struct kiocb *kiocb)
614{
615 if (kiocb->ki_flags & IOCB_WRITE) {
616 struct inode *inode = file_inode(kiocb->ki_filp);
617
618 /*
619 * Tell lockdep we inherited freeze protection from submission
620 * thread.
621 */
622 if (S_ISREG(inode->i_mode))
623 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
624 file_end_write(kiocb->ki_filp);
625 }
626}
627
628static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
629{
630 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
631
632 kiocb_end_write(kiocb);
633
Jens Axboe2b188cc2019-01-07 10:46:33 -0700634 io_cqring_add_event(req->ctx, req->user_data, res, 0);
Jens Axboee65ef562019-03-12 10:16:44 -0600635 io_put_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700636}
637
Jens Axboedef596e2019-01-09 08:59:42 -0700638static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
639{
640 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
641
642 kiocb_end_write(kiocb);
643
644 req->error = res;
645 if (res != -EAGAIN)
646 req->flags |= REQ_F_IOPOLL_COMPLETED;
647}
648
649/*
650 * After the iocb has been issued, it's safe to be found on the poll list.
651 * Adding the kiocb to the list AFTER submission ensures that we don't
 652 * find it from an io_iopoll_getevents() thread before the issuer is done
653 * accessing the kiocb cookie.
654 */
655static void io_iopoll_req_issued(struct io_kiocb *req)
656{
657 struct io_ring_ctx *ctx = req->ctx;
658
659 /*
660 * Track whether we have multiple files in our lists. This will impact
661 * how we do polling eventually, not spinning if we're on potentially
662 * different devices.
663 */
664 if (list_empty(&ctx->poll_list)) {
665 ctx->poll_multi_file = false;
666 } else if (!ctx->poll_multi_file) {
667 struct io_kiocb *list_req;
668
669 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
670 list);
671 if (list_req->rw.ki_filp != req->rw.ki_filp)
672 ctx->poll_multi_file = true;
673 }
674
675 /*
676 * For fast devices, IO may have already completed. If it has, add
677 * it to the front so we find it first.
678 */
679 if (req->flags & REQ_F_IOPOLL_COMPLETED)
680 list_add(&req->list, &ctx->poll_list);
681 else
682 list_add_tail(&req->list, &ctx->poll_list);
683}
684
Jens Axboe3d6770f2019-04-13 11:50:54 -0600685static void io_file_put(struct io_submit_state *state)
Jens Axboe9a56a232019-01-09 09:06:50 -0700686{
Jens Axboe3d6770f2019-04-13 11:50:54 -0600687 if (state->file) {
Jens Axboe9a56a232019-01-09 09:06:50 -0700688 int diff = state->has_refs - state->used_refs;
689
690 if (diff)
691 fput_many(state->file, diff);
692 state->file = NULL;
693 }
694}
695
696/*
697 * Get as many references to a file as we have IOs left in this submission,
698 * assuming most submissions are for one file, or at least that each file
699 * has more than one submission.
700 */
701static struct file *io_file_get(struct io_submit_state *state, int fd)
702{
703 if (!state)
704 return fget(fd);
705
706 if (state->file) {
707 if (state->fd == fd) {
708 state->used_refs++;
709 state->ios_left--;
710 return state->file;
711 }
Jens Axboe3d6770f2019-04-13 11:50:54 -0600712 io_file_put(state);
Jens Axboe9a56a232019-01-09 09:06:50 -0700713 }
714 state->file = fget_many(fd, state->ios_left);
715 if (!state->file)
716 return NULL;
717
718 state->fd = fd;
719 state->has_refs = state->ios_left;
720 state->used_refs = 1;
721 state->ios_left--;
722 return state->file;
723}
724
Jens Axboe2b188cc2019-01-07 10:46:33 -0700725/*
726 * If we tracked the file through the SCM inflight mechanism, we could support
727 * any file. For now, just ensure that anything potentially problematic is done
728 * inline.
729 */
730static bool io_file_supports_async(struct file *file)
731{
732 umode_t mode = file_inode(file)->i_mode;
733
734 if (S_ISBLK(mode) || S_ISCHR(mode))
735 return true;
736 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
737 return true;
738
739 return false;
740}
741
Jens Axboe6c271ce2019-01-10 11:22:30 -0700742static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
Jens Axboe9a56a232019-01-09 09:06:50 -0700743 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700744{
Jens Axboe6c271ce2019-01-10 11:22:30 -0700745 const struct io_uring_sqe *sqe = s->sqe;
Jens Axboedef596e2019-01-09 08:59:42 -0700746 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700747 struct kiocb *kiocb = &req->rw;
Jens Axboe09bb8392019-03-13 12:39:28 -0600748 unsigned ioprio;
749 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700750
Jens Axboe09bb8392019-03-13 12:39:28 -0600751 if (!req->file)
752 return -EBADF;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700753 /* For -EAGAIN retry, everything is already prepped */
Jens Axboed530a402019-03-13 12:15:01 -0600754 if (req->flags & REQ_F_PREPPED)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700755 return 0;
756
Jens Axboe09bb8392019-03-13 12:39:28 -0600757 if (force_nonblock && !io_file_supports_async(req->file))
758 force_nonblock = false;
Jens Axboe6b063142019-01-10 22:13:58 -0700759
Jens Axboe2b188cc2019-01-07 10:46:33 -0700760 kiocb->ki_pos = READ_ONCE(sqe->off);
761 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
762 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
763
764 ioprio = READ_ONCE(sqe->ioprio);
765 if (ioprio) {
766 ret = ioprio_check_cap(ioprio);
767 if (ret)
Jens Axboe09bb8392019-03-13 12:39:28 -0600768 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700769
770 kiocb->ki_ioprio = ioprio;
771 } else
772 kiocb->ki_ioprio = get_current_ioprio();
773
774 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
775 if (unlikely(ret))
Jens Axboe09bb8392019-03-13 12:39:28 -0600776 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700777 if (force_nonblock) {
778 kiocb->ki_flags |= IOCB_NOWAIT;
779 req->flags |= REQ_F_FORCE_NONBLOCK;
780 }
Jens Axboedef596e2019-01-09 08:59:42 -0700781 if (ctx->flags & IORING_SETUP_IOPOLL) {
Jens Axboedef596e2019-01-09 08:59:42 -0700782 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
783 !kiocb->ki_filp->f_op->iopoll)
Jens Axboe09bb8392019-03-13 12:39:28 -0600784 return -EOPNOTSUPP;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700785
Jens Axboedef596e2019-01-09 08:59:42 -0700786 req->error = 0;
787 kiocb->ki_flags |= IOCB_HIPRI;
788 kiocb->ki_complete = io_complete_rw_iopoll;
789 } else {
Jens Axboe09bb8392019-03-13 12:39:28 -0600790 if (kiocb->ki_flags & IOCB_HIPRI)
791 return -EINVAL;
Jens Axboedef596e2019-01-09 08:59:42 -0700792 kiocb->ki_complete = io_complete_rw;
793 }
Jens Axboed530a402019-03-13 12:15:01 -0600794 req->flags |= REQ_F_PREPPED;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700795 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700796}
797
798static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
799{
800 switch (ret) {
801 case -EIOCBQUEUED:
802 break;
803 case -ERESTARTSYS:
804 case -ERESTARTNOINTR:
805 case -ERESTARTNOHAND:
806 case -ERESTART_RESTARTBLOCK:
807 /*
808 * We can't just restart the syscall, since previously
809 * submitted sqes may already be in progress. Just fail this
810 * IO with EINTR.
811 */
812 ret = -EINTR;
813 /* fall through */
814 default:
815 kiocb->ki_complete(kiocb, ret, 0);
816 }
817}
818
Jens Axboeedafcce2019-01-09 09:16:05 -0700819static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
820 const struct io_uring_sqe *sqe,
821 struct iov_iter *iter)
822{
823 size_t len = READ_ONCE(sqe->len);
824 struct io_mapped_ubuf *imu;
825 unsigned index, buf_index;
826 size_t offset;
827 u64 buf_addr;
828
829 /* attempt to use fixed buffers without having provided iovecs */
830 if (unlikely(!ctx->user_bufs))
831 return -EFAULT;
832
833 buf_index = READ_ONCE(sqe->buf_index);
834 if (unlikely(buf_index >= ctx->nr_user_bufs))
835 return -EFAULT;
836
837 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
838 imu = &ctx->user_bufs[index];
839 buf_addr = READ_ONCE(sqe->addr);
840
841 /* overflow */
842 if (buf_addr + len < buf_addr)
843 return -EFAULT;
844 /* not inside the mapped region */
845 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
846 return -EFAULT;
847
848 /*
 849 * May not be the start of the buffer; set the size appropriately
 850 * and advance to the beginning.
851 */
852 offset = buf_addr - imu->ubuf;
853 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
854 if (offset)
855 iov_iter_advance(iter, offset);
Jens Axboe875f1d02019-02-27 13:05:25 -0700856
857 /* don't drop a reference to these pages */
858 iter->type |= ITER_BVEC_FLAG_NO_REF;
Jens Axboeedafcce2019-01-09 09:16:05 -0700859 return 0;
860}
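
/*
 * Userspace-side sketch of how fixed buffers are used (illustrative only,
 * not part of this file): the application registers its buffers once via
 * the io_uring_register(2) syscall and then refers to them by index, with
 * sqe->addr pointing somewhere inside the registered range:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = len };
 *	io_uring_register(ring_fd, IORING_REGISTER_BUFFERS, &iov, 1);
 *	...
 *	sqe->opcode = IORING_OP_READ_FIXED;
 *	sqe->addr = (unsigned long) buf;
 *	sqe->len = len;
 *	sqe->buf_index = 0;
 */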
861
Jens Axboe2b188cc2019-01-07 10:46:33 -0700862static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
863 const struct sqe_submit *s, struct iovec **iovec,
864 struct iov_iter *iter)
865{
866 const struct io_uring_sqe *sqe = s->sqe;
867 void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
868 size_t sqe_len = READ_ONCE(sqe->len);
Jens Axboeedafcce2019-01-09 09:16:05 -0700869 u8 opcode;
870
871 /*
872 * We're reading ->opcode for the second time, but the first read
873 * doesn't care whether it's _FIXED or not, so it doesn't matter
874 * whether ->opcode changes concurrently. The first read does care
875 * about whether it is a READ or a WRITE, so we don't trust this read
876 * for that purpose and instead let the caller pass in the read/write
877 * flag.
878 */
879 opcode = READ_ONCE(sqe->opcode);
880 if (opcode == IORING_OP_READ_FIXED ||
881 opcode == IORING_OP_WRITE_FIXED) {
Jens Axboee0c5c572019-03-12 10:18:47 -0600882 int ret = io_import_fixed(ctx, rw, sqe, iter);
Jens Axboeedafcce2019-01-09 09:16:05 -0700883 *iovec = NULL;
884 return ret;
885 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700886
887 if (!s->has_user)
888 return -EFAULT;
889
890#ifdef CONFIG_COMPAT
891 if (ctx->compat)
892 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
893 iovec, iter);
894#endif
895
896 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
897}
898
Jens Axboe31b51512019-01-18 22:56:34 -0700899/*
900 * Make a note of the last file/offset/direction we punted to async
 901 * context. We'll use this information to see if we can piggyback a
 902 * sequential request onto the previous one, if it still hasn't been
903 * completed by the async worker.
904 */
905static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
906{
907 struct async_list *async_list = &req->ctx->pending_async[rw];
908 struct kiocb *kiocb = &req->rw;
909 struct file *filp = kiocb->ki_filp;
910 off_t io_end = kiocb->ki_pos + len;
911
912 if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
913 unsigned long max_pages;
914
915 /* Use 8x RA size as a decent limiter for both reads/writes */
916 max_pages = filp->f_ra.ra_pages;
917 if (!max_pages)
Nikolay Borisovb5420232019-03-11 23:28:13 -0700918 max_pages = VM_READAHEAD_PAGES;
Jens Axboe31b51512019-01-18 22:56:34 -0700919 max_pages *= 8;
920
921 /* If max pages are exceeded, reset the state */
922 len >>= PAGE_SHIFT;
923 if (async_list->io_pages + len <= max_pages) {
924 req->flags |= REQ_F_SEQ_PREV;
925 async_list->io_pages += len;
926 } else {
927 io_end = 0;
928 async_list->io_pages = 0;
929 }
930 }
931
932 /* New file? Reset state. */
933 if (async_list->file != filp) {
934 async_list->io_pages = 0;
935 async_list->file = filp;
936 }
937 async_list->io_end = io_end;
938}
939
Jens Axboee0c5c572019-03-12 10:18:47 -0600940static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
941 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700942{
943 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
944 struct kiocb *kiocb = &req->rw;
945 struct iov_iter iter;
946 struct file *file;
Jens Axboe31b51512019-01-18 22:56:34 -0700947 size_t iov_count;
Jens Axboee0c5c572019-03-12 10:18:47 -0600948 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700949
Jens Axboe6c271ce2019-01-10 11:22:30 -0700950 ret = io_prep_rw(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700951 if (ret)
952 return ret;
953 file = kiocb->ki_filp;
954
Jens Axboe2b188cc2019-01-07 10:46:33 -0700955 if (unlikely(!(file->f_mode & FMODE_READ)))
Jens Axboe09bb8392019-03-13 12:39:28 -0600956 return -EBADF;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700957 if (unlikely(!file->f_op->read_iter))
Jens Axboe09bb8392019-03-13 12:39:28 -0600958 return -EINVAL;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700959
960 ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
961 if (ret)
Jens Axboe09bb8392019-03-13 12:39:28 -0600962 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700963
Jens Axboe31b51512019-01-18 22:56:34 -0700964 iov_count = iov_iter_count(&iter);
965 ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700966 if (!ret) {
967 ssize_t ret2;
968
969 /* Catch -EAGAIN return for forced non-blocking submission */
970 ret2 = call_read_iter(file, kiocb, &iter);
Jens Axboe31b51512019-01-18 22:56:34 -0700971 if (!force_nonblock || ret2 != -EAGAIN) {
Jens Axboe2b188cc2019-01-07 10:46:33 -0700972 io_rw_done(kiocb, ret2);
Jens Axboe31b51512019-01-18 22:56:34 -0700973 } else {
974 /*
975 * If ->needs_lock is true, we're already in async
976 * context.
977 */
978 if (!s->needs_lock)
979 io_async_list_note(READ, req, iov_count);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700980 ret = -EAGAIN;
Jens Axboe31b51512019-01-18 22:56:34 -0700981 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700982 }
983 kfree(iovec);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700984 return ret;
985}
986
Jens Axboee0c5c572019-03-12 10:18:47 -0600987static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
988 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700989{
990 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
991 struct kiocb *kiocb = &req->rw;
992 struct iov_iter iter;
993 struct file *file;
Jens Axboe31b51512019-01-18 22:56:34 -0700994 size_t iov_count;
Jens Axboee0c5c572019-03-12 10:18:47 -0600995 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700996
Jens Axboe6c271ce2019-01-10 11:22:30 -0700997 ret = io_prep_rw(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700998 if (ret)
999 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001000
Jens Axboe2b188cc2019-01-07 10:46:33 -07001001 file = kiocb->ki_filp;
1002 if (unlikely(!(file->f_mode & FMODE_WRITE)))
Jens Axboe09bb8392019-03-13 12:39:28 -06001003 return -EBADF;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001004 if (unlikely(!file->f_op->write_iter))
Jens Axboe09bb8392019-03-13 12:39:28 -06001005 return -EINVAL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001006
1007 ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
1008 if (ret)
Jens Axboe09bb8392019-03-13 12:39:28 -06001009 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001010
Jens Axboe31b51512019-01-18 22:56:34 -07001011 iov_count = iov_iter_count(&iter);
1012
1013 ret = -EAGAIN;
1014 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
1015 /* If ->needs_lock is true, we're already in async context. */
1016 if (!s->needs_lock)
1017 io_async_list_note(WRITE, req, iov_count);
1018 goto out_free;
1019 }
1020
1021 ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001022 if (!ret) {
Roman Penyaev9bf79332019-03-25 20:09:24 +01001023 ssize_t ret2;
1024
Jens Axboe2b188cc2019-01-07 10:46:33 -07001025 /*
1026 * Open-code file_start_write here to grab freeze protection,
1027 * which will be released by another thread in
1028 * io_complete_rw(). Fool lockdep by telling it the lock got
1029 * released so that it doesn't complain about the held lock when
1030 * we return to userspace.
1031 */
1032 if (S_ISREG(file_inode(file)->i_mode)) {
1033 __sb_start_write(file_inode(file)->i_sb,
1034 SB_FREEZE_WRITE, true);
1035 __sb_writers_release(file_inode(file)->i_sb,
1036 SB_FREEZE_WRITE);
1037 }
1038 kiocb->ki_flags |= IOCB_WRITE;
Roman Penyaev9bf79332019-03-25 20:09:24 +01001039
1040 ret2 = call_write_iter(file, kiocb, &iter);
1041 if (!force_nonblock || ret2 != -EAGAIN) {
1042 io_rw_done(kiocb, ret2);
1043 } else {
1044 /*
1045 * If ->needs_lock is true, we're already in async
1046 * context.
1047 */
1048 if (!s->needs_lock)
1049 io_async_list_note(WRITE, req, iov_count);
1050 ret = -EAGAIN;
1051 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001052 }
Jens Axboe31b51512019-01-18 22:56:34 -07001053out_free:
Jens Axboe2b188cc2019-01-07 10:46:33 -07001054 kfree(iovec);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001055 return ret;
1056}
1057
1058/*
1059 * IORING_OP_NOP just posts a completion event, nothing else.
1060 */
1061static int io_nop(struct io_kiocb *req, u64 user_data)
1062{
1063 struct io_ring_ctx *ctx = req->ctx;
1064 long err = 0;
1065
Jens Axboedef596e2019-01-09 08:59:42 -07001066 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1067 return -EINVAL;
1068
Jens Axboe2b188cc2019-01-07 10:46:33 -07001069 io_cqring_add_event(ctx, user_data, err, 0);
Jens Axboee65ef562019-03-12 10:16:44 -06001070 io_put_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001071 return 0;
1072}
1073
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001074static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1075{
Jens Axboe6b063142019-01-10 22:13:58 -07001076 struct io_ring_ctx *ctx = req->ctx;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001077
Jens Axboe09bb8392019-03-13 12:39:28 -06001078 if (!req->file)
1079 return -EBADF;
Jens Axboed530a402019-03-13 12:15:01 -06001080 /* Prep already done (EAGAIN retry) */
1081 if (req->flags & REQ_F_PREPPED)
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001082 return 0;
1083
Jens Axboe6b063142019-01-10 22:13:58 -07001084 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboedef596e2019-01-09 08:59:42 -07001085 return -EINVAL;
Jens Axboeedafcce2019-01-09 09:16:05 -07001086 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001087 return -EINVAL;
1088
Jens Axboed530a402019-03-13 12:15:01 -06001089 req->flags |= REQ_F_PREPPED;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001090 return 0;
1091}
1092
1093static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1094 bool force_nonblock)
1095{
1096 loff_t sqe_off = READ_ONCE(sqe->off);
1097 loff_t sqe_len = READ_ONCE(sqe->len);
1098 loff_t end = sqe_off + sqe_len;
1099 unsigned fsync_flags;
1100 int ret;
1101
1102 fsync_flags = READ_ONCE(sqe->fsync_flags);
1103 if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1104 return -EINVAL;
1105
1106 ret = io_prep_fsync(req, sqe);
1107 if (ret)
1108 return ret;
1109
1110 /* fsync always requires a blocking context */
1111 if (force_nonblock)
1112 return -EAGAIN;
1113
1114 ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1115 end > 0 ? end : LLONG_MAX,
1116 fsync_flags & IORING_FSYNC_DATASYNC);
1117
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001118 io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
Jens Axboee65ef562019-03-12 10:16:44 -06001119 io_put_req(req);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001120 return 0;
1121}
1122
Jens Axboe221c5eb2019-01-17 09:41:58 -07001123static void io_poll_remove_one(struct io_kiocb *req)
1124{
1125 struct io_poll_iocb *poll = &req->poll;
1126
1127 spin_lock(&poll->head->lock);
1128 WRITE_ONCE(poll->canceled, true);
1129 if (!list_empty(&poll->wait.entry)) {
1130 list_del_init(&poll->wait.entry);
1131 queue_work(req->ctx->sqo_wq, &req->work);
1132 }
1133 spin_unlock(&poll->head->lock);
1134
1135 list_del_init(&req->list);
1136}
1137
1138static void io_poll_remove_all(struct io_ring_ctx *ctx)
1139{
1140 struct io_kiocb *req;
1141
1142 spin_lock_irq(&ctx->completion_lock);
1143 while (!list_empty(&ctx->cancel_list)) {
 1144 req = list_first_entry(&ctx->cancel_list, struct io_kiocb, list);
1145 io_poll_remove_one(req);
1146 }
1147 spin_unlock_irq(&ctx->completion_lock);
1148}
1149
1150/*
1151 * Find a running poll command that matches one specified in sqe->addr,
1152 * and remove it if found.
1153 */
1154static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1155{
1156 struct io_ring_ctx *ctx = req->ctx;
1157 struct io_kiocb *poll_req, *next;
1158 int ret = -ENOENT;
1159
1160 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1161 return -EINVAL;
1162 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
1163 sqe->poll_events)
1164 return -EINVAL;
1165
1166 spin_lock_irq(&ctx->completion_lock);
1167 list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
1168 if (READ_ONCE(sqe->addr) == poll_req->user_data) {
1169 io_poll_remove_one(poll_req);
1170 ret = 0;
1171 break;
1172 }
1173 }
1174 spin_unlock_irq(&ctx->completion_lock);
1175
1176 io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
Jens Axboee65ef562019-03-12 10:16:44 -06001177 io_put_req(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07001178 return 0;
1179}
1180
Jens Axboe8c838782019-03-12 15:48:16 -06001181static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
1182 __poll_t mask)
Jens Axboe221c5eb2019-01-17 09:41:58 -07001183{
Jens Axboe8c838782019-03-12 15:48:16 -06001184 req->poll.done = true;
1185 io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
1186 io_commit_cqring(ctx);
Jens Axboe221c5eb2019-01-17 09:41:58 -07001187}
1188
1189static void io_poll_complete_work(struct work_struct *work)
1190{
1191 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1192 struct io_poll_iocb *poll = &req->poll;
1193 struct poll_table_struct pt = { ._key = poll->events };
1194 struct io_ring_ctx *ctx = req->ctx;
1195 __poll_t mask = 0;
1196
1197 if (!READ_ONCE(poll->canceled))
1198 mask = vfs_poll(poll->file, &pt) & poll->events;
1199
1200 /*
1201 * Note that ->ki_cancel callers also delete iocb from active_reqs after
1202 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
1203 * synchronize with them. In the cancellation case the list_del_init
1204 * itself is not actually needed, but harmless so we keep it in to
1205 * avoid further branches in the fast path.
1206 */
1207 spin_lock_irq(&ctx->completion_lock);
1208 if (!mask && !READ_ONCE(poll->canceled)) {
1209 add_wait_queue(poll->head, &poll->wait);
1210 spin_unlock_irq(&ctx->completion_lock);
1211 return;
1212 }
1213 list_del_init(&req->list);
Jens Axboe8c838782019-03-12 15:48:16 -06001214 io_poll_complete(ctx, req, mask);
Jens Axboe221c5eb2019-01-17 09:41:58 -07001215 spin_unlock_irq(&ctx->completion_lock);
1216
Jens Axboe8c838782019-03-12 15:48:16 -06001217 io_cqring_ev_posted(ctx);
1218 io_put_req(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07001219}
1220
1221static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
1222 void *key)
1223{
1224 struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
1225 wait);
1226 struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
1227 struct io_ring_ctx *ctx = req->ctx;
1228 __poll_t mask = key_to_poll(key);
Jens Axboe8c838782019-03-12 15:48:16 -06001229 unsigned long flags;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001230
 1231 /* for instances that support it, check for an event match first: */
Jens Axboe8c838782019-03-12 15:48:16 -06001232 if (mask && !(mask & poll->events))
1233 return 0;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001234
1235 list_del_init(&poll->wait.entry);
Jens Axboe8c838782019-03-12 15:48:16 -06001236
1237 if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
1238 list_del(&req->list);
1239 io_poll_complete(ctx, req, mask);
1240 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1241
1242 io_cqring_ev_posted(ctx);
1243 io_put_req(req);
1244 } else {
1245 queue_work(ctx->sqo_wq, &req->work);
1246 }
1247
Jens Axboe221c5eb2019-01-17 09:41:58 -07001248 return 1;
1249}
1250
1251struct io_poll_table {
1252 struct poll_table_struct pt;
1253 struct io_kiocb *req;
1254 int error;
1255};
1256
1257static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
1258 struct poll_table_struct *p)
1259{
1260 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
1261
1262 if (unlikely(pt->req->poll.head)) {
1263 pt->error = -EINVAL;
1264 return;
1265 }
1266
1267 pt->error = 0;
1268 pt->req->poll.head = head;
1269 add_wait_queue(head, &pt->req->poll.wait);
1270}
1271
1272static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1273{
1274 struct io_poll_iocb *poll = &req->poll;
1275 struct io_ring_ctx *ctx = req->ctx;
1276 struct io_poll_table ipt;
Jens Axboe8c838782019-03-12 15:48:16 -06001277 bool cancel = false;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001278 __poll_t mask;
1279 u16 events;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001280
1281 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1282 return -EINVAL;
1283 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
1284 return -EINVAL;
Jens Axboe09bb8392019-03-13 12:39:28 -06001285 if (!poll->file)
1286 return -EBADF;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001287
1288 INIT_WORK(&req->work, io_poll_complete_work);
1289 events = READ_ONCE(sqe->poll_events);
1290 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
1291
Jens Axboe221c5eb2019-01-17 09:41:58 -07001292 poll->head = NULL;
Jens Axboe8c838782019-03-12 15:48:16 -06001293 poll->done = false;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001294 poll->canceled = false;
1295
1296 ipt.pt._qproc = io_poll_queue_proc;
1297 ipt.pt._key = poll->events;
1298 ipt.req = req;
1299 ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
1300
 1301 /* initialize the list so that we can do list_empty checks */
1302 INIT_LIST_HEAD(&poll->wait.entry);
1303 init_waitqueue_func_entry(&poll->wait, io_poll_wake);
1304
Jens Axboe221c5eb2019-01-17 09:41:58 -07001305 mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001306
1307 spin_lock_irq(&ctx->completion_lock);
Jens Axboe8c838782019-03-12 15:48:16 -06001308 if (likely(poll->head)) {
1309 spin_lock(&poll->head->lock);
1310 if (unlikely(list_empty(&poll->wait.entry))) {
1311 if (ipt.error)
1312 cancel = true;
1313 ipt.error = 0;
1314 mask = 0;
1315 }
1316 if (mask || ipt.error)
1317 list_del_init(&poll->wait.entry);
1318 else if (cancel)
1319 WRITE_ONCE(poll->canceled, true);
1320 else if (!poll->done) /* actually waiting for an event */
1321 list_add_tail(&req->list, &ctx->cancel_list);
1322 spin_unlock(&poll->head->lock);
Jens Axboe221c5eb2019-01-17 09:41:58 -07001323 }
Jens Axboe8c838782019-03-12 15:48:16 -06001324 if (mask) { /* no async, we'd stolen it */
1325 req->error = mangle_poll(mask);
1326 ipt.error = 0;
1327 io_poll_complete(ctx, req, mask);
1328 }
Jens Axboe221c5eb2019-01-17 09:41:58 -07001329 spin_unlock_irq(&ctx->completion_lock);
1330
Jens Axboe8c838782019-03-12 15:48:16 -06001331 if (mask) {
1332 io_cqring_ev_posted(ctx);
Jens Axboee65ef562019-03-12 10:16:44 -06001333 io_put_req(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07001334 }
Jens Axboe8c838782019-03-12 15:48:16 -06001335 return ipt.error;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001336}
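
/*
 * Note that this is a one-shot poll: once armed, the request completes a
 * single time with the triggered mask (see io_poll_complete()) and is not
 * re-armed, unlike a persistent epoll registration.
 */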
1337
Jens Axboe2b188cc2019-01-07 10:46:33 -07001338static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
Jens Axboe9a56a232019-01-09 09:06:50 -07001339 const struct sqe_submit *s, bool force_nonblock,
1340 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001341{
Jens Axboee0c5c572019-03-12 10:18:47 -06001342 int ret, opcode;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001343
1344 if (unlikely(s->index >= ctx->sq_entries))
1345 return -EINVAL;
1346 req->user_data = READ_ONCE(s->sqe->user_data);
1347
1348 opcode = READ_ONCE(s->sqe->opcode);
1349 switch (opcode) {
1350 case IORING_OP_NOP:
1351 ret = io_nop(req, req->user_data);
1352 break;
1353 case IORING_OP_READV:
Jens Axboeedafcce2019-01-09 09:16:05 -07001354 if (unlikely(s->sqe->buf_index))
1355 return -EINVAL;
Jens Axboe9a56a232019-01-09 09:06:50 -07001356 ret = io_read(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001357 break;
1358 case IORING_OP_WRITEV:
Jens Axboeedafcce2019-01-09 09:16:05 -07001359 if (unlikely(s->sqe->buf_index))
1360 return -EINVAL;
1361 ret = io_write(req, s, force_nonblock, state);
1362 break;
1363 case IORING_OP_READ_FIXED:
1364 ret = io_read(req, s, force_nonblock, state);
1365 break;
1366 case IORING_OP_WRITE_FIXED:
Jens Axboe9a56a232019-01-09 09:06:50 -07001367 ret = io_write(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001368 break;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001369 case IORING_OP_FSYNC:
1370 ret = io_fsync(req, s->sqe, force_nonblock);
1371 break;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001372 case IORING_OP_POLL_ADD:
1373 ret = io_poll_add(req, s->sqe);
1374 break;
1375 case IORING_OP_POLL_REMOVE:
1376 ret = io_poll_remove(req, s->sqe);
1377 break;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001378 default:
1379 ret = -EINVAL;
1380 break;
1381 }
1382
Jens Axboedef596e2019-01-09 08:59:42 -07001383 if (ret)
1384 return ret;
1385
1386 if (ctx->flags & IORING_SETUP_IOPOLL) {
1387 if (req->error == -EAGAIN)
1388 return -EAGAIN;
1389
1390 /* workqueue context doesn't hold uring_lock, grab it now */
1391 if (s->needs_lock)
1392 mutex_lock(&ctx->uring_lock);
1393 io_iopoll_req_issued(req);
1394 if (s->needs_lock)
1395 mutex_unlock(&ctx->uring_lock);
1396 }
1397
1398 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001399}
1400
Jens Axboe31b51512019-01-18 22:56:34 -07001401static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
1402 const struct io_uring_sqe *sqe)
1403{
1404 switch (sqe->opcode) {
1405 case IORING_OP_READV:
1406 case IORING_OP_READ_FIXED:
1407 return &ctx->pending_async[READ];
1408 case IORING_OP_WRITEV:
1409 case IORING_OP_WRITE_FIXED:
1410 return &ctx->pending_async[WRITE];
1411 default:
1412 return NULL;
1413 }
1414}
1415
Jens Axboeedafcce2019-01-09 09:16:05 -07001416static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
1417{
1418 u8 opcode = READ_ONCE(sqe->opcode);
1419
1420 return !(opcode == IORING_OP_READ_FIXED ||
1421 opcode == IORING_OP_WRITE_FIXED);
1422}
1423
Jens Axboe2b188cc2019-01-07 10:46:33 -07001424static void io_sq_wq_submit_work(struct work_struct *work)
1425{
1426 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001427 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe31b51512019-01-18 22:56:34 -07001428 struct mm_struct *cur_mm = NULL;
1429 struct async_list *async_list;
1430 LIST_HEAD(req_list);
Jens Axboeedafcce2019-01-09 09:16:05 -07001431 mm_segment_t old_fs;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001432 int ret;
1433
Jens Axboe31b51512019-01-18 22:56:34 -07001434 async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
1435restart:
1436 do {
1437 struct sqe_submit *s = &req->submit;
1438 const struct io_uring_sqe *sqe = s->sqe;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001439
Jens Axboe31b51512019-01-18 22:56:34 -07001440 /* Ensure we clear previously set forced non-block flag */
1441 req->flags &= ~REQ_F_FORCE_NONBLOCK;
1442 req->rw.ki_flags &= ~IOCB_NOWAIT;
1443
1444 ret = 0;
1445 if (io_sqe_needs_user(sqe) && !cur_mm) {
1446 if (!mmget_not_zero(ctx->sqo_mm)) {
1447 ret = -EFAULT;
1448 } else {
1449 cur_mm = ctx->sqo_mm;
1450 use_mm(cur_mm);
1451 old_fs = get_fs();
1452 set_fs(USER_DS);
1453 }
1454 }
1455
1456 if (!ret) {
1457 s->has_user = cur_mm != NULL;
1458 s->needs_lock = true;
1459 do {
1460 ret = __io_submit_sqe(ctx, req, s, false, NULL);
1461 /*
1462 * We can get EAGAIN for polled IO even though
1463 * we're forcing a sync submission from here,
1464 * since we can't wait for request slots on the
1465 * block side.
1466 */
1467 if (ret != -EAGAIN)
1468 break;
1469 cond_resched();
1470 } while (1);
Jens Axboee65ef562019-03-12 10:16:44 -06001471
1472 /* drop submission reference */
1473 io_put_req(req);
Jens Axboe31b51512019-01-18 22:56:34 -07001474 }
1475 if (ret) {
1476 io_cqring_add_event(ctx, sqe->user_data, ret, 0);
Jens Axboee65ef562019-03-12 10:16:44 -06001477 io_put_req(req);
Jens Axboe31b51512019-01-18 22:56:34 -07001478 }
1479
 1480 /* async context always uses a copy of the sqe */
1481 kfree(sqe);
1482
1483 if (!async_list)
1484 break;
1485 if (!list_empty(&req_list)) {
1486 req = list_first_entry(&req_list, struct io_kiocb,
1487 list);
1488 list_del(&req->list);
1489 continue;
1490 }
1491 if (list_empty(&async_list->list))
1492 break;
1493
1494 req = NULL;
1495 spin_lock(&async_list->lock);
1496 if (list_empty(&async_list->list)) {
1497 spin_unlock(&async_list->lock);
1498 break;
1499 }
1500 list_splice_init(&async_list->list, &req_list);
1501 spin_unlock(&async_list->lock);
1502
1503 req = list_first_entry(&req_list, struct io_kiocb, list);
1504 list_del(&req->list);
1505 } while (req);
Jens Axboeedafcce2019-01-09 09:16:05 -07001506
1507 /*
Jens Axboe31b51512019-01-18 22:56:34 -07001508 * Rare case of racing with a submitter. If we find the count has
1509 * dropped to zero AND we have pending work items, then restart
1510 * the processing. This is a tiny race window.
Jens Axboeedafcce2019-01-09 09:16:05 -07001511 */
Jens Axboe31b51512019-01-18 22:56:34 -07001512 if (async_list) {
1513 ret = atomic_dec_return(&async_list->cnt);
1514 while (!ret && !list_empty(&async_list->list)) {
1515 spin_lock(&async_list->lock);
1516 atomic_inc(&async_list->cnt);
1517 list_splice_init(&async_list->list, &req_list);
1518 spin_unlock(&async_list->lock);
1519
1520 if (!list_empty(&req_list)) {
1521 req = list_first_entry(&req_list,
1522 struct io_kiocb, list);
1523 list_del(&req->list);
1524 goto restart;
1525 }
1526 ret = atomic_dec_return(&async_list->cnt);
Jens Axboeedafcce2019-01-09 09:16:05 -07001527 }
Jens Axboeedafcce2019-01-09 09:16:05 -07001528 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001529
Jens Axboe31b51512019-01-18 22:56:34 -07001530 if (cur_mm) {
Jens Axboeedafcce2019-01-09 09:16:05 -07001531 set_fs(old_fs);
Jens Axboe31b51512019-01-18 22:56:34 -07001532 unuse_mm(cur_mm);
1533 mmput(cur_mm);
Jens Axboeedafcce2019-01-09 09:16:05 -07001534 }
Jens Axboe31b51512019-01-18 22:56:34 -07001535}
Jens Axboe2b188cc2019-01-07 10:46:33 -07001536
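/*
 * A note on the worker above: since it runs from workqueue (kernel thread)
 * context, it temporarily adopts the submitter's mm with use_mm() and
 * switches to USER_DS so the user-space iovecs referenced by the sqe can
 * still be accessed while performing the punted IO.
 */
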
Jens Axboe31b51512019-01-18 22:56:34 -07001537/*
 1538 * See if we can piggyback onto previously submitted work that is still
1539 * running. We currently only allow this if the new request is sequential
1540 * to the previous one we punted.
1541 */
1542static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
1543{
1544 bool ret = false;
1545
1546 if (!list)
1547 return false;
1548 if (!(req->flags & REQ_F_SEQ_PREV))
1549 return false;
1550 if (!atomic_read(&list->cnt))
1551 return false;
1552
1553 ret = true;
1554 spin_lock(&list->lock);
1555 list_add_tail(&req->list, &list->list);
1556 if (!atomic_read(&list->cnt)) {
1557 list_del_init(&req->list);
1558 ret = false;
1559 }
1560 spin_unlock(&list->lock);
1561 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001562}
1563
Jens Axboe09bb8392019-03-13 12:39:28 -06001564static bool io_op_needs_file(const struct io_uring_sqe *sqe)
1565{
1566 int op = READ_ONCE(sqe->opcode);
1567
1568 switch (op) {
1569 case IORING_OP_NOP:
1570 case IORING_OP_POLL_REMOVE:
1571 return false;
1572 default:
1573 return true;
1574 }
1575}
1576
1577static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
1578 struct io_submit_state *state, struct io_kiocb *req)
1579{
1580 unsigned flags;
1581 int fd;
1582
1583 flags = READ_ONCE(s->sqe->flags);
1584 fd = READ_ONCE(s->sqe->fd);
1585
1586 if (!io_op_needs_file(s->sqe)) {
1587 req->file = NULL;
1588 return 0;
1589 }
1590
1591 if (flags & IOSQE_FIXED_FILE) {
1592 if (unlikely(!ctx->user_files ||
1593 (unsigned) fd >= ctx->nr_user_files))
1594 return -EBADF;
1595 req->file = ctx->user_files[fd];
1596 req->flags |= REQ_F_FIXED_FILE;
1597 } else {
1598 if (s->needs_fixed_file)
1599 return -EBADF;
1600 req->file = io_file_get(state, fd);
1601 if (unlikely(!req->file))
1602 return -EBADF;
1603 }
1604
1605 return 0;
1606}
1607
Jens Axboe9a56a232019-01-09 09:06:50 -07001608static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
1609 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001610{
1611 struct io_kiocb *req;
Jens Axboee0c5c572019-03-12 10:18:47 -06001612 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001613
1614 /* enforce forwards compatibility on users */
Jens Axboe6b063142019-01-10 22:13:58 -07001615 if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
Jens Axboe2b188cc2019-01-07 10:46:33 -07001616 return -EINVAL;
1617
Jens Axboe2579f912019-01-09 09:10:43 -07001618 req = io_get_req(ctx, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001619 if (unlikely(!req))
1620 return -EAGAIN;
1621
Jens Axboe09bb8392019-03-13 12:39:28 -06001622 ret = io_req_set_file(ctx, s, state, req);
1623 if (unlikely(ret))
1624 goto out;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001625
Jens Axboe9a56a232019-01-09 09:06:50 -07001626 ret = __io_submit_sqe(ctx, req, s, true, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001627 if (ret == -EAGAIN) {
1628 struct io_uring_sqe *sqe_copy;
1629
1630 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
1631 if (sqe_copy) {
Jens Axboe31b51512019-01-18 22:56:34 -07001632 struct async_list *list;
1633
Jens Axboe2b188cc2019-01-07 10:46:33 -07001634 memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
1635 s->sqe = sqe_copy;
1636
1637 memcpy(&req->submit, s, sizeof(*s));
Jens Axboe31b51512019-01-18 22:56:34 -07001638 list = io_async_list_from_sqe(ctx, s->sqe);
1639 if (!io_add_to_prev_work(list, req)) {
1640 if (list)
1641 atomic_inc(&list->cnt);
1642 INIT_WORK(&req->work, io_sq_wq_submit_work);
1643 queue_work(ctx->sqo_wq, &req->work);
1644 }
Jens Axboee65ef562019-03-12 10:16:44 -06001645
1646 /*
1647 * Queued up for async execution; the worker will release
1648 * the submit reference when the iocb is actually
1649 * submitted.
1650 */
1651 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001652 }
1653 }
Jens Axboee65ef562019-03-12 10:16:44 -06001654
Jens Axboe09bb8392019-03-13 12:39:28 -06001655out:
Jens Axboee65ef562019-03-12 10:16:44 -06001656 /* drop submission reference */
1657 io_put_req(req);
1658
1659 /* and drop final reference, if we failed */
Jens Axboe2b188cc2019-01-07 10:46:33 -07001660 if (ret)
Jens Axboee65ef562019-03-12 10:16:44 -06001661 io_put_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001662
1663 return ret;
1664}
1665
Jens Axboe9a56a232019-01-09 09:06:50 -07001666/*
1667 * Batched submission is done, ensure local IO is flushed out.
1668 */
1669static void io_submit_state_end(struct io_submit_state *state)
1670{
1671 blk_finish_plug(&state->plug);
Jens Axboe3d6770f2019-04-13 11:50:54 -06001672 io_file_put(state);
Jens Axboe2579f912019-01-09 09:10:43 -07001673 if (state->free_reqs)
1674 kmem_cache_free_bulk(req_cachep, state->free_reqs,
1675 &state->reqs[state->cur_req]);
Jens Axboe9a56a232019-01-09 09:06:50 -07001676}
1677
1678/*
1679 * Start submission side cache.
1680 */
1681static void io_submit_state_start(struct io_submit_state *state,
1682 struct io_ring_ctx *ctx, unsigned max_ios)
1683{
1684 blk_start_plug(&state->plug);
Jens Axboe2579f912019-01-09 09:10:43 -07001685 state->free_reqs = 0;
Jens Axboe9a56a232019-01-09 09:06:50 -07001686 state->file = NULL;
1687 state->ios_left = max_ios;
1688}
1689
Jens Axboe2b188cc2019-01-07 10:46:33 -07001690static void io_commit_sqring(struct io_ring_ctx *ctx)
1691{
1692 struct io_sq_ring *ring = ctx->sq_ring;
1693
1694 if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
1695 /*
1696 * Ensure any loads from the SQEs are done at this point,
1697 * since once we write the new head, the application could
1698 * write new data to them.
1699 */
1700 smp_store_release(&ring->r.head, ctx->cached_sq_head);
1701
1702 /*
1703 * Write-side barrier for the head update; the app has the read
1704 * side. See the comment at the top of this file.
1705 */
1706 smp_wmb();
1707 }
1708}
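
/*
 * For illustration only: a liburing-style sketch of the application side
 * that pairs with the head publication above (names like sq_khead and
 * sq_ktail are stand-ins for the mmap'ed ring pointers):
 *
 *	unsigned head = smp_load_acquire(sq_khead);	// pairs with the
 *							// store-release above
 *	unsigned tail = *sq_ktail;			// written only by the app
 *	unsigned free = ring_entries - (tail - head);
 *
 * Once the new head is observed with acquire semantics, the SQE slots it
 * frees up can be reused for new submissions without racing with the
 * kernel's reads of those entries.
 */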
1709
1710/*
1711 * Undo last io_get_sqring()
1712 */
1713static void io_drop_sqring(struct io_ring_ctx *ctx)
1714{
1715 ctx->cached_sq_head--;
1716}
1717
1718/*
1719 * Fetch an sqe, if one is available. Note that s->sqe will point to memory
1720 * that is mapped by userspace. This means that care needs to be taken to
1721 * ensure that reads are stable, as we cannot rely on userspace always
1722 * being a good citizen. If members of the sqe are validated and then later
1723 * used, it's important that those reads are done through READ_ONCE() to
1724 * prevent a re-load down the line.
1725 */
1726static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
1727{
1728 struct io_sq_ring *ring = ctx->sq_ring;
1729 unsigned head;
1730
1731 /*
1732 * The cached sq head (or cq tail) serves two purposes:
1733 *
1734 * 1) allows us to batch the cost of updating the user visible
1735 * head.
1736 * 2) allows the kernel side to track the head on its own, even
1737 * though the application is the one updating it.
1738 */
1739 head = ctx->cached_sq_head;
1740 /* See comment at the top of this file */
1741 smp_rmb();
Stefan Bühlere523a292019-04-19 11:57:44 +02001742 /* make sure SQ entry isn't read before tail */
1743 if (head == smp_load_acquire(&ring->r.tail))
Jens Axboe2b188cc2019-01-07 10:46:33 -07001744 return false;
1745
1746 head = READ_ONCE(ring->array[head & ctx->sq_mask]);
1747 if (head < ctx->sq_entries) {
1748 s->index = head;
1749 s->sqe = &ctx->sq_sqes[head];
1750 ctx->cached_sq_head++;
1751 return true;
1752 }
1753
1754 /* drop invalid entries */
1755 ctx->cached_sq_head++;
1756 ring->dropped++;
1757 /* See comment at the top of this file */
1758 smp_wmb();
1759 return false;
1760}
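
/*
 * To make the READ_ONCE() requirement above concrete, a sketch of the
 * pattern it prevents (not code from this file): validating and then
 * re-reading a field of the shared SQE with two plain loads
 *
 *	if (sqe->len > MAX_LEN)		// first load passes the check
 *		return -EINVAL;
 *	do_io(sqe->addr, sqe->len);	// second load may observe a larger
 *					// value the application wrote in
 *					// the meantime
 *
 * Reading the field once via READ_ONCE() into a local variable, and only
 * ever using that local afterwards, closes the window.
 */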
1761
Jens Axboe6c271ce2019-01-10 11:22:30 -07001762static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
1763 unsigned int nr, bool has_user, bool mm_fault)
1764{
1765 struct io_submit_state state, *statep = NULL;
1766 int ret, i, submitted = 0;
1767
1768 if (nr > IO_PLUG_THRESHOLD) {
1769 io_submit_state_start(&state, ctx, nr);
1770 statep = &state;
1771 }
1772
1773 for (i = 0; i < nr; i++) {
1774 if (unlikely(mm_fault)) {
1775 ret = -EFAULT;
1776 } else {
1777 sqes[i].has_user = has_user;
1778 sqes[i].needs_lock = true;
1779 sqes[i].needs_fixed_file = true;
1780 ret = io_submit_sqe(ctx, &sqes[i], statep);
1781 }
1782 if (!ret) {
1783 submitted++;
1784 continue;
1785 }
1786
1787 io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
1788 }
1789
1790 if (statep)
1791 io_submit_state_end(&state);
1792
1793 return submitted;
1794}
1795
1796static int io_sq_thread(void *data)
1797{
1798 struct sqe_submit sqes[IO_IOPOLL_BATCH];
1799 struct io_ring_ctx *ctx = data;
1800 struct mm_struct *cur_mm = NULL;
1801 mm_segment_t old_fs;
1802 DEFINE_WAIT(wait);
1803 unsigned inflight;
1804 unsigned long timeout;
1805
1806 old_fs = get_fs();
1807 set_fs(USER_DS);
1808
1809 timeout = inflight = 0;
1810 while (!kthread_should_stop() && !ctx->sqo_stop) {
1811 bool all_fixed, mm_fault = false;
1812 int i;
1813
1814 if (inflight) {
1815 unsigned nr_events = 0;
1816
1817 if (ctx->flags & IORING_SETUP_IOPOLL) {
1818 /*
1819 * We disallow the app entering submit/complete
1820 * with polling, but we still need to lock the
1821 * ring to prevent racing with polled issue
1822 * that got punted to a workqueue.
1823 */
1824 mutex_lock(&ctx->uring_lock);
1825 io_iopoll_check(ctx, &nr_events, 0);
1826 mutex_unlock(&ctx->uring_lock);
1827 } else {
1828 /*
1829 * Normal IO, just pretend everything completed.
1830 * We don't have to poll completions for that.
1831 */
1832 nr_events = inflight;
1833 }
1834
1835 inflight -= nr_events;
1836 if (!inflight)
1837 timeout = jiffies + ctx->sq_thread_idle;
1838 }
1839
1840 if (!io_get_sqring(ctx, &sqes[0])) {
1841 /*
1842 * We're polling. If we're within the defined idle
1843 * period, then let us spin without work before going
1844 * to sleep.
1845 */
1846 if (inflight || !time_after(jiffies, timeout)) {
1847 cpu_relax();
1848 continue;
1849 }
1850
1851 /*
1852 * Drop cur_mm before scheduling, we can't hold it for
1853 * long periods (or over schedule()). Do this before
1854 * adding ourselves to the waitqueue, as the unuse/drop
1855 * may sleep.
1856 */
1857 if (cur_mm) {
1858 unuse_mm(cur_mm);
1859 mmput(cur_mm);
1860 cur_mm = NULL;
1861 }
1862
1863 prepare_to_wait(&ctx->sqo_wait, &wait,
1864 TASK_INTERRUPTIBLE);
1865
1866 /* Tell userspace we may need a wakeup call */
1867 ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
Stefan Bühler0d7bae62019-04-19 11:57:45 +02001868 /* make sure to read SQ tail after writing flags */
1869 smp_mb();
Jens Axboe6c271ce2019-01-10 11:22:30 -07001870
1871 if (!io_get_sqring(ctx, &sqes[0])) {
1872 if (kthread_should_stop()) {
1873 finish_wait(&ctx->sqo_wait, &wait);
1874 break;
1875 }
1876 if (signal_pending(current))
1877 flush_signals(current);
1878 schedule();
1879 finish_wait(&ctx->sqo_wait, &wait);
1880
1881 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1882 smp_wmb();
1883 continue;
1884 }
1885 finish_wait(&ctx->sqo_wait, &wait);
1886
1887 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1888 smp_wmb();
1889 }
1890
1891 i = 0;
1892 all_fixed = true;
1893 do {
1894 if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
1895 all_fixed = false;
1896
1897 i++;
1898 if (i == ARRAY_SIZE(sqes))
1899 break;
1900 } while (io_get_sqring(ctx, &sqes[i]));
1901
1902 /* Unless all new commands are FIXED regions, grab mm */
1903 if (!all_fixed && !cur_mm) {
1904 mm_fault = !mmget_not_zero(ctx->sqo_mm);
1905 if (!mm_fault) {
1906 use_mm(ctx->sqo_mm);
1907 cur_mm = ctx->sqo_mm;
1908 }
1909 }
1910
1911 inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
1912 mm_fault);
1913
1914 /* Commit SQ ring head once we've consumed all SQEs */
1915 io_commit_sqring(ctx);
1916 }
1917
1918 set_fs(old_fs);
1919 if (cur_mm) {
1920 unuse_mm(cur_mm);
1921 mmput(cur_mm);
1922 }
Jens Axboe06058632019-04-13 09:26:03 -06001923
1924 if (kthread_should_park())
1925 kthread_parkme();
1926
Jens Axboe6c271ce2019-01-10 11:22:30 -07001927 return 0;
1928}
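
/*
 * The NEED_WAKEUP flag handling above has a userspace counterpart. A rough
 * sketch of the submission path an SQPOLL application might use (helper
 * names are illustrative, error handling omitted):
 *
 *	write_sqes_and_advance_tail(ring);
 *	smp_mb();			// order the tail write vs the flags read
 *	if (READ_ONCE(*ring->sq_flags) & IORING_SQ_NEED_WAKEUP)
 *		io_uring_enter(ring_fd, to_submit, 0,
 *			       IORING_ENTER_SQ_WAKEUP, NULL, 0);
 *
 * While the thread is still inside its idle period the flag stays clear and
 * submission is syscall-free; once it has gone to sleep, the enter call with
 * IORING_ENTER_SQ_WAKEUP wakes it back up.
 */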
1929
Jens Axboe2b188cc2019-01-07 10:46:33 -07001930static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
1931{
Jens Axboe9a56a232019-01-09 09:06:50 -07001932 struct io_submit_state state, *statep = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001933 int i, ret = 0, submit = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001934
Jens Axboe9a56a232019-01-09 09:06:50 -07001935 if (to_submit > IO_PLUG_THRESHOLD) {
1936 io_submit_state_start(&state, ctx, to_submit);
1937 statep = &state;
1938 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001939
1940 for (i = 0; i < to_submit; i++) {
1941 struct sqe_submit s;
1942
1943 if (!io_get_sqring(ctx, &s))
1944 break;
1945
1946 s.has_user = true;
Jens Axboedef596e2019-01-09 08:59:42 -07001947 s.needs_lock = false;
Jens Axboe6c271ce2019-01-10 11:22:30 -07001948 s.needs_fixed_file = false;
Jens Axboedef596e2019-01-09 08:59:42 -07001949
Jens Axboe9a56a232019-01-09 09:06:50 -07001950 ret = io_submit_sqe(ctx, &s, statep);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001951 if (ret) {
1952 io_drop_sqring(ctx);
1953 break;
1954 }
1955
1956 submit++;
1957 }
1958 io_commit_sqring(ctx);
1959
Jens Axboe9a56a232019-01-09 09:06:50 -07001960 if (statep)
1961 io_submit_state_end(statep);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001962
1963 return submit ? submit : ret;
1964}
1965
1966static unsigned io_cqring_events(struct io_cq_ring *ring)
1967{
1968 return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
1969}
1970
1971/*
1972 * Wait until events become available, if we don't already have some. The
1973 * application must reap them itself, as they reside on the shared cq ring.
1974 */
1975static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
1976 const sigset_t __user *sig, size_t sigsz)
1977{
1978 struct io_cq_ring *ring = ctx->cq_ring;
1979 sigset_t ksigmask, sigsaved;
1980 DEFINE_WAIT(wait);
1981 int ret;
1982
1983 /* See comment at the top of this file */
1984 smp_rmb();
1985 if (io_cqring_events(ring) >= min_events)
1986 return 0;
1987
1988 if (sig) {
Arnd Bergmann9e75ad52019-03-25 15:34:53 +01001989#ifdef CONFIG_COMPAT
1990 if (in_compat_syscall())
1991 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
1992 &ksigmask, &sigsaved, sigsz);
1993 else
1994#endif
1995 ret = set_user_sigmask(sig, &ksigmask,
1996 &sigsaved, sigsz);
1997
Jens Axboe2b188cc2019-01-07 10:46:33 -07001998 if (ret)
1999 return ret;
2000 }
2001
2002 do {
2003 prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
2004
2005 ret = 0;
2006 /* See comment at the top of this file */
2007 smp_rmb();
2008 if (io_cqring_events(ring) >= min_events)
2009 break;
2010
2011 schedule();
2012
2013 ret = -EINTR;
2014 if (signal_pending(current))
2015 break;
2016 } while (1);
2017
2018 finish_wait(&ctx->wait, &wait);
2019
2020 if (sig)
2021 restore_user_sigmask(sig, &sigsaved);
2022
2023 return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
2024}
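
/*
 * The application-side counterpart of the wait above is a plain reap of the
 * CQ ring. A minimal sketch (cq_khead, cq_ktail, cq_mask and cqes stand in
 * for the mmap'ed CQ ring fields; liburing provides equivalents):
 *
 *	io_uring_enter(ring_fd, 0, 1, IORING_ENTER_GETEVENTS, NULL, 0);
 *
 *	unsigned head = *cq_khead;
 *	smp_rmb();			// see the comment at the top of this file
 *	while (head != READ_ONCE(*cq_ktail)) {
 *		struct io_uring_cqe *cqe = &cqes[head & cq_mask];
 *		handle_completion(cqe->user_data, cqe->res);
 *		head++;
 *	}
 *	WRITE_ONCE(*cq_khead, head);	// tell the kernel the slots are free
 *
 * Only entries the application has consumed (head advanced past them) may
 * be overwritten by new completions.
 */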
2025
Jens Axboe6b063142019-01-10 22:13:58 -07002026static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
2027{
2028#if defined(CONFIG_UNIX)
2029 if (ctx->ring_sock) {
2030 struct sock *sock = ctx->ring_sock->sk;
2031 struct sk_buff *skb;
2032
2033 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
2034 kfree_skb(skb);
2035 }
2036#else
2037 int i;
2038
2039 for (i = 0; i < ctx->nr_user_files; i++)
2040 fput(ctx->user_files[i]);
2041#endif
2042}
2043
2044static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
2045{
2046 if (!ctx->user_files)
2047 return -ENXIO;
2048
2049 __io_sqe_files_unregister(ctx);
2050 kfree(ctx->user_files);
2051 ctx->user_files = NULL;
2052 ctx->nr_user_files = 0;
2053 return 0;
2054}
2055
Jens Axboe6c271ce2019-01-10 11:22:30 -07002056static void io_sq_thread_stop(struct io_ring_ctx *ctx)
2057{
2058 if (ctx->sqo_thread) {
2059 ctx->sqo_stop = 1;
2060 mb();
Jens Axboe06058632019-04-13 09:26:03 -06002061 kthread_park(ctx->sqo_thread);
Jens Axboe6c271ce2019-01-10 11:22:30 -07002062 kthread_stop(ctx->sqo_thread);
2063 ctx->sqo_thread = NULL;
2064 }
2065}
2066
Jens Axboe6b063142019-01-10 22:13:58 -07002067static void io_finish_async(struct io_ring_ctx *ctx)
2068{
Jens Axboe6c271ce2019-01-10 11:22:30 -07002069 io_sq_thread_stop(ctx);
2070
Jens Axboe6b063142019-01-10 22:13:58 -07002071 if (ctx->sqo_wq) {
2072 destroy_workqueue(ctx->sqo_wq);
2073 ctx->sqo_wq = NULL;
2074 }
2075}
2076
2077#if defined(CONFIG_UNIX)
2078static void io_destruct_skb(struct sk_buff *skb)
2079{
2080 struct io_ring_ctx *ctx = skb->sk->sk_user_data;
2081
2082 io_finish_async(ctx);
2083 unix_destruct_scm(skb);
2084}
2085
2086/*
2087 * Ensure the UNIX gc is aware of our file set, so we are certain that
2088 * the io_uring can be safely unregistered on process exit, even if we have
2089 * reference loops among the files.
2090 */
2091static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
2092{
2093 struct sock *sk = ctx->ring_sock->sk;
2094 struct scm_fp_list *fpl;
2095 struct sk_buff *skb;
2096 int i;
2097
2098 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
2099 unsigned long inflight = ctx->user->unix_inflight + nr;
2100
2101 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
2102 return -EMFILE;
2103 }
2104
2105 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
2106 if (!fpl)
2107 return -ENOMEM;
2108
2109 skb = alloc_skb(0, GFP_KERNEL);
2110 if (!skb) {
2111 kfree(fpl);
2112 return -ENOMEM;
2113 }
2114
2115 skb->sk = sk;
2116 skb->destructor = io_destruct_skb;
2117
2118 fpl->user = get_uid(ctx->user);
2119 for (i = 0; i < nr; i++) {
2120 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
2121 unix_inflight(fpl->user, fpl->fp[i]);
2122 }
2123
2124 fpl->max = fpl->count = nr;
2125 UNIXCB(skb).fp = fpl;
2126 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2127 skb_queue_head(&sk->sk_receive_queue, skb);
2128
2129 for (i = 0; i < nr; i++)
2130 fput(fpl->fp[i]);
2131
2132 return 0;
2133}
2134
2135/*
2136 * If UNIX sockets are enabled, fd passing can create a reference cycle,
2137 * which breaks regular reference counting. We rely on the UNIX
2138 * garbage collection to take care of this problem for us.
2139 */
2140static int io_sqe_files_scm(struct io_ring_ctx *ctx)
2141{
2142 unsigned left, total;
2143 int ret = 0;
2144
2145 total = 0;
2146 left = ctx->nr_user_files;
2147 while (left) {
2148 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
2149 int ret;
2150
2151 ret = __io_sqe_files_scm(ctx, this_files, total);
2152 if (ret)
2153 break;
2154 left -= this_files;
2155 total += this_files;
2156 }
2157
2158 if (!ret)
2159 return 0;
2160
2161 while (total < ctx->nr_user_files) {
2162 fput(ctx->user_files[total]);
2163 total++;
2164 }
2165
2166 return ret;
2167}
2168#else
2169static int io_sqe_files_scm(struct io_ring_ctx *ctx)
2170{
2171 return 0;
2172}
2173#endif
2174
2175static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
2176 unsigned nr_args)
2177{
2178 __s32 __user *fds = (__s32 __user *) arg;
2179 int fd, ret = 0;
2180 unsigned i;
2181
2182 if (ctx->user_files)
2183 return -EBUSY;
2184 if (!nr_args)
2185 return -EINVAL;
2186 if (nr_args > IORING_MAX_FIXED_FILES)
2187 return -EMFILE;
2188
2189 ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
2190 if (!ctx->user_files)
2191 return -ENOMEM;
2192
2193 for (i = 0; i < nr_args; i++) {
2194 ret = -EFAULT;
2195 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
2196 break;
2197
2198 ctx->user_files[i] = fget(fd);
2199
2200 ret = -EBADF;
2201 if (!ctx->user_files[i])
2202 break;
2203 /*
2204 * Don't allow io_uring instances to be registered. If UNIX
2205 * isn't enabled, then this causes a reference cycle and this
2206 * instance can never get freed. If UNIX is enabled we'll
2207 * handle it just fine, but there's still no point in allowing
2208 * a ring fd as it doesn't support regular read/write anyway.
2209 */
2210 if (ctx->user_files[i]->f_op == &io_uring_fops) {
2211 fput(ctx->user_files[i]);
2212 break;
2213 }
2214 ctx->nr_user_files++;
2215 ret = 0;
2216 }
2217
2218 if (ret) {
2219 for (i = 0; i < ctx->nr_user_files; i++)
2220 fput(ctx->user_files[i]);
2221
2222 kfree(ctx->user_files);
Jens Axboe25adf502019-04-03 09:52:40 -06002223 ctx->user_files = NULL;
Jens Axboe6b063142019-01-10 22:13:58 -07002224 ctx->nr_user_files = 0;
2225 return ret;
2226 }
2227
2228 ret = io_sqe_files_scm(ctx);
2229 if (ret)
2230 io_sqe_files_unregister(ctx);
2231
2232 return ret;
2233}
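
/*
 * Registered ("fixed") files are referenced by index rather than by
 * descriptor. A rough sketch of the userspace side (wrapper names are
 * illustrative):
 *
 *	int fds[2] = { open("a", O_RDONLY), open("b", O_RDONLY) };
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES, fds, 2);
 *
 *	sqe->opcode = IORING_OP_READV;
 *	sqe->flags |= IOSQE_FIXED_FILE;
 *	sqe->fd = 1;		// index into the registered set ("b"),
 *				// not a regular file descriptor
 *
 * This is what lets io_req_set_file() skip the per-request fget()/fput() on
 * the submission fast path.
 */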
2234
Jens Axboe6c271ce2019-01-10 11:22:30 -07002235static int io_sq_offload_start(struct io_ring_ctx *ctx,
2236 struct io_uring_params *p)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002237{
2238 int ret;
2239
Jens Axboe6c271ce2019-01-10 11:22:30 -07002240 init_waitqueue_head(&ctx->sqo_wait);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002241 mmgrab(current->mm);
2242 ctx->sqo_mm = current->mm;
2243
Jens Axboe6c271ce2019-01-10 11:22:30 -07002244 ret = -EINVAL;
2245 if (!cpu_possible(p->sq_thread_cpu))
2246 goto err;
2247
2248 if (ctx->flags & IORING_SETUP_SQPOLL) {
Jens Axboe3ec482d2019-04-08 10:51:01 -06002249 ret = -EPERM;
2250 if (!capable(CAP_SYS_ADMIN))
2251 goto err;
2252
Jens Axboe917257d2019-04-13 09:28:55 -06002253 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
2254 if (!ctx->sq_thread_idle)
2255 ctx->sq_thread_idle = HZ;
2256
Jens Axboe6c271ce2019-01-10 11:22:30 -07002257 if (p->flags & IORING_SETUP_SQ_AFF) {
2258 int cpu;
2259
2260 cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
Jens Axboe917257d2019-04-13 09:28:55 -06002261 ret = -EINVAL;
2262 if (!cpu_possible(p->sq_thread_cpu))
2263 goto err;
2264
Jens Axboe6c271ce2019-01-10 11:22:30 -07002265 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
2266 ctx, cpu,
2267 "io_uring-sq");
2268 } else {
2269 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
2270 "io_uring-sq");
2271 }
2272 if (IS_ERR(ctx->sqo_thread)) {
2273 ret = PTR_ERR(ctx->sqo_thread);
2274 ctx->sqo_thread = NULL;
2275 goto err;
2276 }
2277 wake_up_process(ctx->sqo_thread);
2278 } else if (p->flags & IORING_SETUP_SQ_AFF) {
2279 /* Can't have SQ_AFF without SQPOLL */
2280 ret = -EINVAL;
2281 goto err;
2282 }
2283
Jens Axboe2b188cc2019-01-07 10:46:33 -07002284 /* Do QD, or 2 * CPUS, whichever is smaller */
2285 ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
2286 min(ctx->sq_entries - 1, 2 * num_online_cpus()));
2287 if (!ctx->sqo_wq) {
2288 ret = -ENOMEM;
2289 goto err;
2290 }
2291
2292 return 0;
2293err:
Jens Axboe6c271ce2019-01-10 11:22:30 -07002294 io_sq_thread_stop(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002295 mmdrop(ctx->sqo_mm);
2296 ctx->sqo_mm = NULL;
2297 return ret;
2298}
2299
2300static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
2301{
2302 atomic_long_sub(nr_pages, &user->locked_vm);
2303}
2304
2305static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
2306{
2307 unsigned long page_limit, cur_pages, new_pages;
2308
2309 /* Don't allow more pages than we can safely lock */
2310 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
2311
2312 do {
2313 cur_pages = atomic_long_read(&user->locked_vm);
2314 new_pages = cur_pages + nr_pages;
2315 if (new_pages > page_limit)
2316 return -ENOMEM;
2317 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
2318 new_pages) != cur_pages);
2319
2320 return 0;
2321}
2322
2323static void io_mem_free(void *ptr)
2324{
2325 struct page *page = virt_to_head_page(ptr);
2326
2327 if (put_page_testzero(page))
2328 free_compound_page(page);
2329}
2330
2331static void *io_mem_alloc(size_t size)
2332{
2333 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
2334 __GFP_NORETRY;
2335
2336 return (void *) __get_free_pages(gfp_flags, get_order(size));
2337}
2338
2339static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
2340{
2341 struct io_sq_ring *sq_ring;
2342 struct io_cq_ring *cq_ring;
2343 size_t bytes;
2344
2345 bytes = struct_size(sq_ring, array, sq_entries);
2346 bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
2347 bytes += struct_size(cq_ring, cqes, cq_entries);
2348
2349 return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
2350}
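
/*
 * As a rough worked example of what this accounts for, assuming 64-byte
 * SQEs and 16-byte CQEs: a setup request for 4096 SQ entries gets 8192 CQ
 * entries, so the memory charged against RLIMIT_MEMLOCK is approximately
 *
 *	sq_ring header + array:	~16 KB
 *	sqes:		4096 * 64 = 256 KB
 *	cq_ring + cqes:	8192 * 16 = ~128 KB
 *
 * i.e. roughly 400 KB, or about 100 pages with 4 KB pages.
 */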
2351
Jens Axboeedafcce2019-01-09 09:16:05 -07002352static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
2353{
2354 int i, j;
2355
2356 if (!ctx->user_bufs)
2357 return -ENXIO;
2358
2359 for (i = 0; i < ctx->nr_user_bufs; i++) {
2360 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
2361
2362 for (j = 0; j < imu->nr_bvecs; j++)
2363 put_page(imu->bvec[j].bv_page);
2364
2365 if (ctx->account_mem)
2366 io_unaccount_mem(ctx->user, imu->nr_bvecs);
2367 kfree(imu->bvec);
2368 imu->nr_bvecs = 0;
2369 }
2370
2371 kfree(ctx->user_bufs);
2372 ctx->user_bufs = NULL;
2373 ctx->nr_user_bufs = 0;
2374 return 0;
2375}
2376
2377static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
2378 void __user *arg, unsigned index)
2379{
2380 struct iovec __user *src;
2381
2382#ifdef CONFIG_COMPAT
2383 if (ctx->compat) {
2384 struct compat_iovec __user *ciovs;
2385 struct compat_iovec ciov;
2386
2387 ciovs = (struct compat_iovec __user *) arg;
2388 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
2389 return -EFAULT;
2390
2391 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
2392 dst->iov_len = ciov.iov_len;
2393 return 0;
2394 }
2395#endif
2396 src = (struct iovec __user *) arg;
2397 if (copy_from_user(dst, &src[index], sizeof(*dst)))
2398 return -EFAULT;
2399 return 0;
2400}
2401
2402static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
2403 unsigned nr_args)
2404{
2405 struct vm_area_struct **vmas = NULL;
2406 struct page **pages = NULL;
2407 int i, j, got_pages = 0;
2408 int ret = -EINVAL;
2409
2410 if (ctx->user_bufs)
2411 return -EBUSY;
2412 if (!nr_args || nr_args > UIO_MAXIOV)
2413 return -EINVAL;
2414
2415 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
2416 GFP_KERNEL);
2417 if (!ctx->user_bufs)
2418 return -ENOMEM;
2419
2420 for (i = 0; i < nr_args; i++) {
2421 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
2422 unsigned long off, start, end, ubuf;
2423 int pret, nr_pages;
2424 struct iovec iov;
2425 size_t size;
2426
2427 ret = io_copy_iov(ctx, &iov, arg, i);
2428 if (ret)
2429 break;
2430
2431 /*
2432 * Don't impose further limits on the size and buffer
2433 * constraints here, we'll -EINVAL later when IO is
2434 * submitted if they are wrong.
2435 */
2436 ret = -EFAULT;
2437 if (!iov.iov_base || !iov.iov_len)
2438 goto err;
2439
2440 /* arbitrary limit, but we need something */
2441 if (iov.iov_len > SZ_1G)
2442 goto err;
2443
2444 ubuf = (unsigned long) iov.iov_base;
2445 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
2446 start = ubuf >> PAGE_SHIFT;
2447 nr_pages = end - start;
2448
2449 if (ctx->account_mem) {
2450 ret = io_account_mem(ctx->user, nr_pages);
2451 if (ret)
2452 goto err;
2453 }
2454
2455 ret = 0;
2456 if (!pages || nr_pages > got_pages) {
2457 kfree(vmas);
2458 kfree(pages);
2459 pages = kmalloc_array(nr_pages, sizeof(struct page *),
2460 GFP_KERNEL);
2461 vmas = kmalloc_array(nr_pages,
2462 sizeof(struct vm_area_struct *),
2463 GFP_KERNEL);
2464 if (!pages || !vmas) {
2465 ret = -ENOMEM;
2466 if (ctx->account_mem)
2467 io_unaccount_mem(ctx->user, nr_pages);
2468 goto err;
2469 }
2470 got_pages = nr_pages;
2471 }
2472
2473 imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
2474 GFP_KERNEL);
2475 ret = -ENOMEM;
2476 if (!imu->bvec) {
2477 if (ctx->account_mem)
2478 io_unaccount_mem(ctx->user, nr_pages);
2479 goto err;
2480 }
2481
2482 ret = 0;
2483 down_read(&current->mm->mmap_sem);
2484 pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
2485 pages, vmas);
2486 if (pret == nr_pages) {
2487 /* don't support file backed memory */
2488 for (j = 0; j < nr_pages; j++) {
2489 struct vm_area_struct *vma = vmas[j];
2490
2491 if (vma->vm_file &&
2492 !is_file_hugepages(vma->vm_file)) {
2493 ret = -EOPNOTSUPP;
2494 break;
2495 }
2496 }
2497 } else {
2498 ret = pret < 0 ? pret : -EFAULT;
2499 }
2500 up_read(&current->mm->mmap_sem);
2501 if (ret) {
2502 /*
2503 * if we did partial map, or found file backed vmas,
2504 * release any pages we did get
2505 */
2506 if (pret > 0) {
2507 for (j = 0; j < pret; j++)
2508 put_page(pages[j]);
2509 }
2510 if (ctx->account_mem)
2511 io_unaccount_mem(ctx->user, nr_pages);
2512 goto err;
2513 }
2514
2515 off = ubuf & ~PAGE_MASK;
2516 size = iov.iov_len;
2517 for (j = 0; j < nr_pages; j++) {
2518 size_t vec_len;
2519
2520 vec_len = min_t(size_t, size, PAGE_SIZE - off);
2521 imu->bvec[j].bv_page = pages[j];
2522 imu->bvec[j].bv_len = vec_len;
2523 imu->bvec[j].bv_offset = off;
2524 off = 0;
2525 size -= vec_len;
2526 }
2527 /* store original address for later verification */
2528 imu->ubuf = ubuf;
2529 imu->len = iov.iov_len;
2530 imu->nr_bvecs = nr_pages;
2531
2532 ctx->nr_user_bufs++;
2533 }
2534 kfree(pages);
2535 kfree(vmas);
2536 return 0;
2537err:
2538 kfree(pages);
2539 kfree(vmas);
2540 io_sqe_buffer_unregister(ctx);
2541 return ret;
2542}
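
/*
 * A rough sketch of the userspace half of buffer registration (wrapper
 * names are illustrative): the iovecs describe memory that is pinned here
 * and later referenced by index via the *_FIXED opcodes.
 *
 *	struct iovec iov = {
 *		.iov_base = buf,	// e.g. from aligned_alloc()
 *		.iov_len  = BUF_SIZE,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_BUFFERS, &iov, 1);
 *
 *	sqe->opcode	= IORING_OP_READ_FIXED;
 *	sqe->addr	= (unsigned long) buf;
 *	sqe->len	= BUF_SIZE;
 *	sqe->buf_index	= 0;		// index into the registered array
 *
 * IO against registered buffers reuses the bvecs built above and avoids the
 * per-IO get_user_pages() cost.
 */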
2543
Jens Axboe2b188cc2019-01-07 10:46:33 -07002544static void io_ring_ctx_free(struct io_ring_ctx *ctx)
2545{
Jens Axboe6b063142019-01-10 22:13:58 -07002546 io_finish_async(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002547 if (ctx->sqo_mm)
2548 mmdrop(ctx->sqo_mm);
Jens Axboedef596e2019-01-09 08:59:42 -07002549
2550 io_iopoll_reap_events(ctx);
Jens Axboeedafcce2019-01-09 09:16:05 -07002551 io_sqe_buffer_unregister(ctx);
Jens Axboe6b063142019-01-10 22:13:58 -07002552 io_sqe_files_unregister(ctx);
Jens Axboedef596e2019-01-09 08:59:42 -07002553
Jens Axboe2b188cc2019-01-07 10:46:33 -07002554#if defined(CONFIG_UNIX)
2555 if (ctx->ring_sock)
2556 sock_release(ctx->ring_sock);
2557#endif
2558
2559 io_mem_free(ctx->sq_ring);
2560 io_mem_free(ctx->sq_sqes);
2561 io_mem_free(ctx->cq_ring);
2562
2563 percpu_ref_exit(&ctx->refs);
2564 if (ctx->account_mem)
2565 io_unaccount_mem(ctx->user,
2566 ring_pages(ctx->sq_entries, ctx->cq_entries));
2567 free_uid(ctx->user);
2568 kfree(ctx);
2569}
2570
2571static __poll_t io_uring_poll(struct file *file, poll_table *wait)
2572{
2573 struct io_ring_ctx *ctx = file->private_data;
2574 __poll_t mask = 0;
2575
2576 poll_wait(file, &ctx->cq_wait, wait);
2577 /* See comment at the top of this file */
2578 smp_rmb();
Stefan Bühlerfb775fa2019-04-19 11:57:46 +02002579 if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head !=
2580 ctx->sq_ring->ring_entries)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002581 mask |= EPOLLOUT | EPOLLWRNORM;
2582 if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
2583 mask |= EPOLLIN | EPOLLRDNORM;
2584
2585 return mask;
2586}
2587
2588static int io_uring_fasync(int fd, struct file *file, int on)
2589{
2590 struct io_ring_ctx *ctx = file->private_data;
2591
2592 return fasync_helper(fd, file, on, &ctx->cq_fasync);
2593}
2594
2595static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2596{
2597 mutex_lock(&ctx->uring_lock);
2598 percpu_ref_kill(&ctx->refs);
2599 mutex_unlock(&ctx->uring_lock);
2600
Jens Axboe221c5eb2019-01-17 09:41:58 -07002601 io_poll_remove_all(ctx);
Jens Axboedef596e2019-01-09 08:59:42 -07002602 io_iopoll_reap_events(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002603 wait_for_completion(&ctx->ctx_done);
2604 io_ring_ctx_free(ctx);
2605}
2606
2607static int io_uring_release(struct inode *inode, struct file *file)
2608{
2609 struct io_ring_ctx *ctx = file->private_data;
2610
2611 file->private_data = NULL;
2612 io_ring_ctx_wait_and_kill(ctx);
2613 return 0;
2614}
2615
2616static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2617{
2618 loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
2619 unsigned long sz = vma->vm_end - vma->vm_start;
2620 struct io_ring_ctx *ctx = file->private_data;
2621 unsigned long pfn;
2622 struct page *page;
2623 void *ptr;
2624
2625 switch (offset) {
2626 case IORING_OFF_SQ_RING:
2627 ptr = ctx->sq_ring;
2628 break;
2629 case IORING_OFF_SQES:
2630 ptr = ctx->sq_sqes;
2631 break;
2632 case IORING_OFF_CQ_RING:
2633 ptr = ctx->cq_ring;
2634 break;
2635 default:
2636 return -EINVAL;
2637 }
2638
2639 page = virt_to_head_page(ptr);
2640 if (sz > (PAGE_SIZE << compound_order(page)))
2641 return -EINVAL;
2642
2643 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
2644 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2645}
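
/*
 * The offsets switched on above correspond to the three regions an
 * application maps after io_uring_setup(). A rough sketch, with sq_off,
 * cq_off and the entry counts taken from the io_uring_params returned at
 * setup time:
 *
 *	sq_ptr = mmap(NULL, sq_off.array + sq_entries * sizeof(__u32),
 *		      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		      ring_fd, IORING_OFF_SQ_RING);
 *	sqes   = mmap(NULL, sq_entries * sizeof(struct io_uring_sqe),
 *		      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		      ring_fd, IORING_OFF_SQES);
 *	cq_ptr = mmap(NULL, cq_off.cqes + cq_entries * sizeof(struct io_uring_cqe),
 *		      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		      ring_fd, IORING_OFF_CQ_RING);
 *
 * The individual ring fields are then reached by adding the sq_off/cq_off
 * members to the returned base pointers.
 */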
2646
2647SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
2648 u32, min_complete, u32, flags, const sigset_t __user *, sig,
2649 size_t, sigsz)
2650{
2651 struct io_ring_ctx *ctx;
2652 long ret = -EBADF;
2653 int submitted = 0;
2654 struct fd f;
2655
Jens Axboe6c271ce2019-01-10 11:22:30 -07002656 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
Jens Axboe2b188cc2019-01-07 10:46:33 -07002657 return -EINVAL;
2658
2659 f = fdget(fd);
2660 if (!f.file)
2661 return -EBADF;
2662
2663 ret = -EOPNOTSUPP;
2664 if (f.file->f_op != &io_uring_fops)
2665 goto out_fput;
2666
2667 ret = -ENXIO;
2668 ctx = f.file->private_data;
2669 if (!percpu_ref_tryget(&ctx->refs))
2670 goto out_fput;
2671
Jens Axboe6c271ce2019-01-10 11:22:30 -07002672 /*
2673 * For SQ polling, the thread will do all submissions and completions.
2674 * Just return the requested submit count, and wake the thread if
2675 * we were asked to.
2676 */
2677 if (ctx->flags & IORING_SETUP_SQPOLL) {
2678 if (flags & IORING_ENTER_SQ_WAKEUP)
2679 wake_up(&ctx->sqo_wait);
2680 submitted = to_submit;
2681 goto out_ctx;
2682 }
2683
Jens Axboe2b188cc2019-01-07 10:46:33 -07002684 ret = 0;
2685 if (to_submit) {
2686 to_submit = min(to_submit, ctx->sq_entries);
2687
2688 mutex_lock(&ctx->uring_lock);
2689 submitted = io_ring_submit(ctx, to_submit);
2690 mutex_unlock(&ctx->uring_lock);
2691
2692 if (submitted < 0)
2693 goto out_ctx;
2694 }
2695 if (flags & IORING_ENTER_GETEVENTS) {
Jens Axboedef596e2019-01-09 08:59:42 -07002696 unsigned nr_events = 0;
2697
Jens Axboe2b188cc2019-01-07 10:46:33 -07002698 min_complete = min(min_complete, ctx->cq_entries);
2699
2700 /*
2701 * The application could have included the 'to_submit' count
2702 * in how many events it wanted to wait for. If we failed to
2703 * submit the desired count, we may need to adjust the number
2704 * of events to poll/wait for.
2705 */
2706 if (submitted < to_submit)
2707 min_complete = min_t(unsigned, submitted, min_complete);
2708
Jens Axboedef596e2019-01-09 08:59:42 -07002709 if (ctx->flags & IORING_SETUP_IOPOLL) {
2710 mutex_lock(&ctx->uring_lock);
2711 ret = io_iopoll_check(ctx, &nr_events, min_complete);
2712 mutex_unlock(&ctx->uring_lock);
2713 } else {
2714 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
2715 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07002716 }
2717
2718out_ctx:
2719 io_ring_drop_ctx_refs(ctx, 1);
2720out_fput:
2721 fdput(f);
2722 return submitted ? submitted : ret;
2723}
2724
2725static const struct file_operations io_uring_fops = {
2726 .release = io_uring_release,
2727 .mmap = io_uring_mmap,
2728 .poll = io_uring_poll,
2729 .fasync = io_uring_fasync,
2730};
2731
2732static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
2733 struct io_uring_params *p)
2734{
2735 struct io_sq_ring *sq_ring;
2736 struct io_cq_ring *cq_ring;
2737 size_t size;
2738
2739 sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
2740 if (!sq_ring)
2741 return -ENOMEM;
2742
2743 ctx->sq_ring = sq_ring;
2744 sq_ring->ring_mask = p->sq_entries - 1;
2745 sq_ring->ring_entries = p->sq_entries;
2746 ctx->sq_mask = sq_ring->ring_mask;
2747 ctx->sq_entries = sq_ring->ring_entries;
2748
2749 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
2750 if (size == SIZE_MAX)
2751 return -EOVERFLOW;
2752
2753 ctx->sq_sqes = io_mem_alloc(size);
2754 if (!ctx->sq_sqes) {
2755 io_mem_free(ctx->sq_ring);
2756 return -ENOMEM;
2757 }
2758
2759 cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
2760 if (!cq_ring) {
2761 io_mem_free(ctx->sq_ring);
2762 io_mem_free(ctx->sq_sqes);
2763 return -ENOMEM;
2764 }
2765
2766 ctx->cq_ring = cq_ring;
2767 cq_ring->ring_mask = p->cq_entries - 1;
2768 cq_ring->ring_entries = p->cq_entries;
2769 ctx->cq_mask = cq_ring->ring_mask;
2770 ctx->cq_entries = cq_ring->ring_entries;
2771 return 0;
2772}
2773
2774/*
2775 * Allocate an anonymous fd; this is what constitutes the application
2776 * visible backing of an io_uring instance. The application mmaps this
2777 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
2778 * we have to tie this fd to a socket for file garbage collection purposes.
2779 */
2780static int io_uring_get_fd(struct io_ring_ctx *ctx)
2781{
2782 struct file *file;
2783 int ret;
2784
2785#if defined(CONFIG_UNIX)
2786 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
2787 &ctx->ring_sock);
2788 if (ret)
2789 return ret;
2790#endif
2791
2792 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2793 if (ret < 0)
2794 goto err;
2795
2796 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
2797 O_RDWR | O_CLOEXEC);
2798 if (IS_ERR(file)) {
2799 put_unused_fd(ret);
2800 ret = PTR_ERR(file);
2801 goto err;
2802 }
2803
2804#if defined(CONFIG_UNIX)
2805 ctx->ring_sock->file = file;
Jens Axboe6b063142019-01-10 22:13:58 -07002806 ctx->ring_sock->sk->sk_user_data = ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002807#endif
2808 fd_install(ret, file);
2809 return ret;
2810err:
2811#if defined(CONFIG_UNIX)
2812 sock_release(ctx->ring_sock);
2813 ctx->ring_sock = NULL;
2814#endif
2815 return ret;
2816}
2817
2818static int io_uring_create(unsigned entries, struct io_uring_params *p)
2819{
2820 struct user_struct *user = NULL;
2821 struct io_ring_ctx *ctx;
2822 bool account_mem;
2823 int ret;
2824
2825 if (!entries || entries > IORING_MAX_ENTRIES)
2826 return -EINVAL;
2827
2828 /*
2829 * Use twice as many entries for the CQ ring. It's possible for the
2830 * application to drive a higher depth than the size of the SQ ring,
2831 * since the sqes are only used at submission time. This allows for
2832 * some flexibility in overcommitting.
2833 */
2834 p->sq_entries = roundup_pow_of_two(entries);
2835 p->cq_entries = 2 * p->sq_entries;
2836
2837 user = get_uid(current_user());
2838 account_mem = !capable(CAP_IPC_LOCK);
2839
2840 if (account_mem) {
2841 ret = io_account_mem(user,
2842 ring_pages(p->sq_entries, p->cq_entries));
2843 if (ret) {
2844 free_uid(user);
2845 return ret;
2846 }
2847 }
2848
2849 ctx = io_ring_ctx_alloc(p);
2850 if (!ctx) {
2851 if (account_mem)
2852 io_unaccount_mem(user, ring_pages(p->sq_entries,
2853 p->cq_entries));
2854 free_uid(user);
2855 return -ENOMEM;
2856 }
2857 ctx->compat = in_compat_syscall();
2858 ctx->account_mem = account_mem;
2859 ctx->user = user;
2860
2861 ret = io_allocate_scq_urings(ctx, p);
2862 if (ret)
2863 goto err;
2864
Jens Axboe6c271ce2019-01-10 11:22:30 -07002865 ret = io_sq_offload_start(ctx, p);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002866 if (ret)
2867 goto err;
2868
2869 ret = io_uring_get_fd(ctx);
2870 if (ret < 0)
2871 goto err;
2872
2873 memset(&p->sq_off, 0, sizeof(p->sq_off));
2874 p->sq_off.head = offsetof(struct io_sq_ring, r.head);
2875 p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
2876 p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
2877 p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
2878 p->sq_off.flags = offsetof(struct io_sq_ring, flags);
2879 p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
2880 p->sq_off.array = offsetof(struct io_sq_ring, array);
2881
2882 memset(&p->cq_off, 0, sizeof(p->cq_off));
2883 p->cq_off.head = offsetof(struct io_cq_ring, r.head);
2884 p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
2885 p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
2886 p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
2887 p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
2888 p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
2889 return ret;
2890err:
2891 io_ring_ctx_wait_and_kill(ctx);
2892 return ret;
2893}
2894
2895/*
2896 * Sets up an io_uring context and returns the fd. The application asks for a
2897 * ring size; we return the actual sq/cq ring sizes (among other things) in the
2898 * params structure passed in.
2899 */
2900static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
2901{
2902 struct io_uring_params p;
2903 long ret;
2904 int i;
2905
2906 if (copy_from_user(&p, params, sizeof(p)))
2907 return -EFAULT;
2908 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
2909 if (p.resv[i])
2910 return -EINVAL;
2911 }
2912
Jens Axboe6c271ce2019-01-10 11:22:30 -07002913 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
2914 IORING_SETUP_SQ_AFF))
Jens Axboe2b188cc2019-01-07 10:46:33 -07002915 return -EINVAL;
2916
2917 ret = io_uring_create(entries, &p);
2918 if (ret < 0)
2919 return ret;
2920
2921 if (copy_to_user(params, &p, sizeof(p)))
2922 return -EFAULT;
2923
2924 return ret;
2925}
2926
2927SYSCALL_DEFINE2(io_uring_setup, u32, entries,
2928 struct io_uring_params __user *, params)
2929{
2930 return io_uring_setup(entries, params);
2931}
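
/*
 * A minimal invocation sketch (raw syscall shown; liburing wraps this).
 * Because the ring size is rounded up to a power of two and the CQ ring is
 * twice the SQ ring, asking for 100 entries comes back with
 * sq_entries == 128 and cq_entries == 256:
 *
 *	struct io_uring_params p = { 0 };
 *	int ring_fd = syscall(__NR_io_uring_setup, 100, &p);
 *
 * The returned fd is then mmap'ed as described near io_uring_mmap() above.
 */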
2932
Jens Axboeedafcce2019-01-09 09:16:05 -07002933static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
2934 void __user *arg, unsigned nr_args)
Jens Axboeb19062a2019-04-15 10:49:38 -06002935 __releases(ctx->uring_lock)
2936 __acquires(ctx->uring_lock)
Jens Axboeedafcce2019-01-09 09:16:05 -07002937{
2938 int ret;
2939
Jens Axboe35fa71a2019-04-22 10:23:23 -06002940 /*
2941 * We're inside the ring mutex; if the ref is already dying, then
2942 * someone else has killed the ctx or is already going through
2943 * io_uring_register().
2944 */
2945 if (percpu_ref_is_dying(&ctx->refs))
2946 return -ENXIO;
2947
Jens Axboeedafcce2019-01-09 09:16:05 -07002948 percpu_ref_kill(&ctx->refs);
Jens Axboeb19062a2019-04-15 10:49:38 -06002949
2950 /*
2951 * Drop uring mutex before waiting for references to exit. If another
2952 * thread is currently inside io_uring_enter() it might need to grab
2953 * the uring_lock to make progress. If we hold it here across the drain
2954 * wait, then we can deadlock. It's safe to drop the mutex here, since
2955 * no new references will come in after we've killed the percpu ref.
2956 */
2957 mutex_unlock(&ctx->uring_lock);
Jens Axboeedafcce2019-01-09 09:16:05 -07002958 wait_for_completion(&ctx->ctx_done);
Jens Axboeb19062a2019-04-15 10:49:38 -06002959 mutex_lock(&ctx->uring_lock);
Jens Axboeedafcce2019-01-09 09:16:05 -07002960
2961 switch (opcode) {
2962 case IORING_REGISTER_BUFFERS:
2963 ret = io_sqe_buffer_register(ctx, arg, nr_args);
2964 break;
2965 case IORING_UNREGISTER_BUFFERS:
2966 ret = -EINVAL;
2967 if (arg || nr_args)
2968 break;
2969 ret = io_sqe_buffer_unregister(ctx);
2970 break;
Jens Axboe6b063142019-01-10 22:13:58 -07002971 case IORING_REGISTER_FILES:
2972 ret = io_sqe_files_register(ctx, arg, nr_args);
2973 break;
2974 case IORING_UNREGISTER_FILES:
2975 ret = -EINVAL;
2976 if (arg || nr_args)
2977 break;
2978 ret = io_sqe_files_unregister(ctx);
2979 break;
Jens Axboeedafcce2019-01-09 09:16:05 -07002980 default:
2981 ret = -EINVAL;
2982 break;
2983 }
2984
2985 /* bring the ctx back to life */
2986 reinit_completion(&ctx->ctx_done);
2987 percpu_ref_reinit(&ctx->refs);
2988 return ret;
2989}
2990
2991SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
2992 void __user *, arg, unsigned int, nr_args)
2993{
2994 struct io_ring_ctx *ctx;
2995 long ret = -EBADF;
2996 struct fd f;
2997
2998 f = fdget(fd);
2999 if (!f.file)
3000 return -EBADF;
3001
3002 ret = -EOPNOTSUPP;
3003 if (f.file->f_op != &io_uring_fops)
3004 goto out_fput;
3005
3006 ctx = f.file->private_data;
3007
3008 mutex_lock(&ctx->uring_lock);
3009 ret = __io_uring_register(ctx, opcode, arg, nr_args);
3010 mutex_unlock(&ctx->uring_lock);
3011out_fput:
3012 fdput(f);
3013 return ret;
3014}
3015
Jens Axboe2b188cc2019-01-07 10:46:33 -07003016static int __init io_uring_init(void)
3017{
3018 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3019 return 0;
3020};
3021__initcall(io_uring_init);