// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqring (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes and to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
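
/*
 * Illustrative sketch, not code used by this file: the application side of
 * the CQ ordering rules above, roughly as the liburing examples do it. The
 * names cq_head, cq_tail, cq_ring_mask and cqes stand for the mmap'ed ring
 * fields, and the acquire/release calls for their userspace equivalents;
 * all of them are assumptions for the example, not kernel symbols.
 *
 *	unsigned head = *cq_head;
 *
 *	while (head != smp_load_acquire(cq_tail)) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_ring_mask];
 *		consume_cqe(cqe);
 *		head++;
 *	}
 *	smp_store_release(cq_head, head);
 *
 * The acquire on the tail pairs with the kernel's release store of cq.tail,
 * and the release store of the head orders the entry loads before the head
 * update, matching the barrier pairing described above.
 */
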
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "internal.h"
#include "io-wq.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

/*
 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
 */
#define IORING_FILE_TABLE_SHIFT	9
#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
#define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)

struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32 sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32 sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32 sq_dropped;
	/*
	 * Runtime flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32 sq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32 cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
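
/*
 * Example of the "compare to cached value" pattern mentioned above (a
 * sketch with assumed application-side variables, not kernel code): the
 * application keeps its own last-seen copy of sq_dropped/cq_overflow and
 * derives the number of new events from the difference, e.g.
 *
 *	u32 dropped = READ_ONCE(rings->sq_dropped);
 *	u32 newly_dropped = dropped - app_cached_sq_dropped;
 *	app_cached_sq_dropped = dropped;
 *
 * which works because the counters only ever increase.
 */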

struct io_mapped_ubuf {
	u64 ubuf;
	size_t len;
	struct bio_vec *bvec;
	unsigned int nr_bvecs;
};

struct fixed_file_table {
	struct file **files;
};

struct fixed_file_data {
	struct fixed_file_table *table;
	struct io_ring_ctx *ctx;

	struct percpu_ref refs;
	struct llist_head put_llist;
	struct work_struct ref_work;
	struct completion done;
};

struct io_buffer {
	struct list_head list;
	__u64 addr;
	__s32 len;
	__u16 bid;
};

struct io_ring_ctx {
	struct {
		struct percpu_ref refs;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned int flags;
		unsigned int compat: 1;
		unsigned int account_mem: 1;
		unsigned int cq_overflow_flushed: 1;
		unsigned int drain_next: 1;
		unsigned int eventfd_async: 1;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32 *sq_array;
		unsigned cached_sq_head;
		unsigned sq_entries;
		unsigned sq_mask;
		unsigned sq_thread_idle;
		unsigned cached_sq_dropped;
		atomic_t cached_cq_overflow;
		unsigned long sq_check_overflow;

		struct list_head defer_list;
		struct list_head timeout_list;
		struct list_head cq_overflow_list;

		wait_queue_head_t inflight_wait;
		struct io_uring_sqe *sq_sqes;
	} ____cacheline_aligned_in_smp;

	struct io_rings *rings;

	/* IO offload */
	struct io_wq *io_wq;
	struct task_struct *sqo_thread; /* if using sq thread polling */
	struct mm_struct *sqo_mm;
	wait_queue_head_t sqo_wait;

	/*
	 * If used, fixed file set. Writers must ensure that ->refs is dead,
	 * readers must ensure that ->refs is alive as long as the file* is
	 * used. Only updated through io_uring_register(2).
	 */
	struct fixed_file_data *file_data;
	unsigned nr_user_files;
	int ring_fd;
	struct file *ring_file;

	/* if used, fixed mapped user buffers */
	unsigned nr_user_bufs;
	struct io_mapped_ubuf *user_bufs;

	struct user_struct *user;

	const struct cred *creds;

	/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
	struct completion *completions;

	/* if all else fails... */
	struct io_kiocb *fallback_req;

#if defined(CONFIG_UNIX)
	struct socket *ring_sock;
#endif

	struct idr io_buffer_idr;

	struct idr personality_idr;

	struct {
		unsigned cached_cq_tail;
		unsigned cq_entries;
		unsigned cq_mask;
		atomic_t cq_timeouts;
		unsigned long cq_check_overflow;
		struct wait_queue_head cq_wait;
		struct fasync_struct *cq_fasync;
		struct eventfd_ctx *cq_ev_fd;
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex uring_lock;
		wait_queue_head_t wait;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t completion_lock;

		/*
		 * ->poll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct list_head poll_list;
		struct hlist_head *cancel_hash;
		unsigned cancel_hash_bits;
		bool poll_multi_file;

		spinlock_t inflight_lock;
		struct list_head inflight_list;
	} ____cacheline_aligned_in_smp;
};
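
/*
 * Submission-side sketch of the sq_array indirection documented above (an
 * illustration with assumed application-side variables, not kernel code):
 * the application fills an io_uring_sqe in the IORING_OFF_SQES mmap, then
 * publishes its index through sq_array before moving the SQ tail, e.g.
 *
 *	unsigned tail = *sq_tail;
 *	unsigned index = tail & *sq_ring_mask;
 *
 *	fill_sqe(&sqes[index]);
 *	sq_array[index] = index;
 *	smp_store_release(sq_tail, tail + 1);
 *
 * Any free sqe slot may be published this way. The release store on the
 * tail pairs with the kernel's acquire load in io_get_sqring(), as
 * described in the comment at the top of this file.
 */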

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct io_poll_iocb {
	struct file *file;
	union {
		struct wait_queue_head *head;
		u64 addr;
	};
	__poll_t events;
	bool done;
	bool canceled;
	struct wait_queue_entry wait;
};

struct io_close {
	struct file *file;
	struct file *put_file;
	int fd;
};

struct io_timeout_data {
	struct io_kiocb *req;
	struct hrtimer timer;
	struct timespec64 ts;
	enum hrtimer_mode mode;
	u32 seq_offset;
};

struct io_accept {
	struct file *file;
	struct sockaddr __user *addr;
	int __user *addr_len;
	int flags;
};

struct io_sync {
	struct file *file;
	loff_t len;
	loff_t off;
	int flags;
	int mode;
};

struct io_cancel {
	struct file *file;
	u64 addr;
};

struct io_timeout {
	struct file *file;
	u64 addr;
	int flags;
	unsigned count;
};

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb kiocb;
	u64 addr;
	u64 len;
};

struct io_connect {
	struct file *file;
	struct sockaddr __user *addr;
	int addr_len;
};

struct io_sr_msg {
	struct file *file;
	union {
		struct user_msghdr __user *msg;
		void __user *buf;
	};
	int msg_flags;
	size_t len;
};

struct io_open {
	struct file *file;
	int dfd;
	union {
		unsigned mask;
	};
	struct filename *filename;
	struct statx __user *buffer;
	struct open_how how;
};

struct io_files_update {
	struct file *file;
	u64 arg;
	u32 nr_args;
	u32 offset;
};

struct io_fadvise {
	struct file *file;
	u64 offset;
	u32 len;
	u32 advice;
};

struct io_madvise {
	struct file *file;
	u64 addr;
	u32 len;
	u32 advice;
};

struct io_epoll {
	struct file *file;
	int epfd;
	int op;
	int fd;
	struct epoll_event event;
};

struct io_splice {
	struct file *file_out;
	struct file *file_in;
	loff_t off_out;
	loff_t off_in;
	u64 len;
	unsigned int flags;
};

struct io_provide_buf {
	struct file *file;
	__u64 addr;
	__s32 len;
	__u32 bgid;
	__u16 nbufs;
	__u16 bid;
};

struct io_async_connect {
	struct sockaddr_storage address;
};

struct io_async_msghdr {
	struct iovec fast_iov[UIO_FASTIOV];
	struct iovec *iov;
	struct sockaddr __user *uaddr;
	struct msghdr msg;
	struct sockaddr_storage addr;
};

struct io_async_rw {
	struct iovec fast_iov[UIO_FASTIOV];
	struct iovec *iov;
	ssize_t nr_segs;
	ssize_t size;
};

struct io_async_ctx {
	union {
		struct io_async_rw rw;
		struct io_async_msghdr msg;
		struct io_async_connect connect;
		struct io_timeout_data timeout;
	};
};

enum {
	REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,

	REQ_F_LINK_NEXT_BIT,
	REQ_F_FAIL_LINK_BIT,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_IOPOLL_COMPLETED_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_TIMEOUT_BIT,
	REQ_F_ISREG_BIT,
	REQ_F_MUST_PUNT_BIT,
	REQ_F_TIMEOUT_NOSEQ_BIT,
	REQ_F_COMP_LOCKED_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_OVERFLOW_BIT,
	REQ_F_POLLED_BIT,
};

enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK = BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),

	/* already grabbed next link */
	REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
	/* fail rest of links */
	REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
	/* on inflight list */
	REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
	/* polled IO has completed */
	REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
	/* has linked timeout */
	REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* timeout request */
	REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
	/* regular file */
	REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
	/* must be punted even for NONBLOCK */
	REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
	/* no timeout sequence */
	REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
	/* completion under lock */
	REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
	/* in overflow list */
	REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT),
	/* already went through poll handler */
	REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
};
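
/*
 * Because the first few REQ_F_* bits above are defined to the corresponding
 * IOSQE_* bits, sqe flags can be carried over into req->flags directly. A
 * hedged sketch of that use (the real submission path does the equivalent,
 * after validating the flags first):
 *
 *	req->flags |= sqe_flags & (IOSQE_FIXED_FILE | IOSQE_IO_DRAIN |
 *				   IOSQE_IO_LINK | IOSQE_IO_HARDLINK |
 *				   IOSQE_ASYNC);
 */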

struct async_poll {
	struct io_poll_iocb poll;
	struct io_wq_work work;
};

/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'ki_filp' in this struct.
 */
struct io_kiocb {
	union {
		struct file *file;
		struct io_rw rw;
		struct io_poll_iocb poll;
		struct io_accept accept;
		struct io_sync sync;
		struct io_cancel cancel;
		struct io_timeout timeout;
		struct io_connect connect;
		struct io_sr_msg sr_msg;
		struct io_open open;
		struct io_close close;
		struct io_files_update files_update;
		struct io_fadvise fadvise;
		struct io_madvise madvise;
		struct io_epoll epoll;
		struct io_splice splice;
		struct io_provide_buf pbuf;
	};

	struct io_async_ctx *io;
	bool needs_fixed_file;
	u8 opcode;

	struct io_ring_ctx *ctx;
	struct list_head list;
	unsigned int flags;
	refcount_t refs;
	struct task_struct *task;
	u64 user_data;
	u32 result;
	u32 sequence;

	struct list_head link_list;

	struct list_head inflight_entry;

	union {
		/*
		 * Only commands that never go async can use the below fields,
		 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
		 * async armed poll handlers for regular commands. The latter
		 * restore the work, if needed.
		 */
		struct {
			struct callback_head task_work;
			struct hlist_node hash_node;
			struct async_poll *apoll;
		};
		struct io_wq_work work;
	};
};

#define IO_PLUG_THRESHOLD	2
#define IO_IOPOLL_BATCH		8

struct io_submit_state {
	struct blk_plug plug;

	/*
	 * io_kiocb alloc cache
	 */
	void *reqs[IO_IOPOLL_BATCH];
	unsigned int free_reqs;

	/*
	 * File reference cache
	 */
	struct file *file;
	unsigned int fd;
	unsigned int has_refs;
	unsigned int used_refs;
	unsigned int ios_left;
};

struct io_op_def {
	/* needs req->io allocated for deferral/async */
	unsigned async_ctx : 1;
	/* needs current->mm setup, does mm access */
	unsigned needs_mm : 1;
	/* needs req->file assigned */
	unsigned needs_file : 1;
	/* needs req->file assigned IFF fd is >= 0 */
	unsigned fd_non_neg : 1;
	/* hash wq insertion if file is a regular file */
	unsigned hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned unbound_nonreg_file : 1;
	/* opcode is not supported by this kernel */
	unsigned not_supported : 1;
	/* needs file table */
	unsigned file_table : 1;
	/* needs ->fs */
	unsigned needs_fs : 1;
	/* set if opcode supports polled "wait" */
	unsigned pollin : 1;
	unsigned pollout : 1;
};

static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.async_ctx = 1,
		.needs_mm = 1,
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollin = 1,
	},
	[IORING_OP_WRITEV] = {
		.async_ctx = 1,
		.needs_mm = 1,
		.needs_file = 1,
		.hash_reg_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
	},
	[IORING_OP_FSYNC] = {
		.needs_file = 1,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollin = 1,
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file = 1,
		.hash_reg_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
	},
	[IORING_OP_POLL_REMOVE] = {},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file = 1,
	},
	[IORING_OP_SENDMSG] = {
		.async_ctx = 1,
		.needs_mm = 1,
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.needs_fs = 1,
		.pollout = 1,
	},
	[IORING_OP_RECVMSG] = {
		.async_ctx = 1,
		.needs_mm = 1,
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.needs_fs = 1,
		.pollin = 1,
	},
	[IORING_OP_TIMEOUT] = {
		.async_ctx = 1,
		.needs_mm = 1,
	},
	[IORING_OP_TIMEOUT_REMOVE] = {},
	[IORING_OP_ACCEPT] = {
		.needs_mm = 1,
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.file_table = 1,
		.pollin = 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {},
	[IORING_OP_LINK_TIMEOUT] = {
		.async_ctx = 1,
		.needs_mm = 1,
	},
	[IORING_OP_CONNECT] = {
		.async_ctx = 1,
		.needs_mm = 1,
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file = 1,
	},
	[IORING_OP_OPENAT] = {
		.needs_file = 1,
		.fd_non_neg = 1,
		.file_table = 1,
		.needs_fs = 1,
	},
	[IORING_OP_CLOSE] = {
		.needs_file = 1,
		.file_table = 1,
	},
	[IORING_OP_FILES_UPDATE] = {
		.needs_mm = 1,
		.file_table = 1,
	},
	[IORING_OP_STATX] = {
		.needs_mm = 1,
		.needs_file = 1,
		.fd_non_neg = 1,
		.needs_fs = 1,
	},
	[IORING_OP_READ] = {
		.needs_mm = 1,
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollin = 1,
	},
	[IORING_OP_WRITE] = {
		.needs_mm = 1,
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
	},
	[IORING_OP_FADVISE] = {
		.needs_file = 1,
	},
	[IORING_OP_MADVISE] = {
		.needs_mm = 1,
	},
	[IORING_OP_SEND] = {
		.needs_mm = 1,
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
	},
	[IORING_OP_RECV] = {
		.needs_mm = 1,
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollin = 1,
	},
	[IORING_OP_OPENAT2] = {
		.needs_file = 1,
		.fd_non_neg = 1,
		.file_table = 1,
		.needs_fs = 1,
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file = 1,
		.file_table = 1,
	},
	[IORING_OP_SPLICE] = {
		.needs_file = 1,
		.hash_reg_file = 1,
		.unbound_nonreg_file = 1,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {},
};
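
/*
 * The table above is consulted per request by the prep and dispatch paths;
 * a minimal sketch of that lookup (io_prep_async_work() and
 * io_req_work_grab_env() below do the real work):
 *
 *	const struct io_op_def *def = &io_op_defs[req->opcode];
 *
 *	if (def->needs_mm)
 *		... grab current->mm before going async ...
 *	if (def->unbound_nonreg_file)
 *		... queue to the unbound io-wq list ...
 */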

static void io_wq_submit_work(struct io_wq_work **workptr);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_files_update *ip,
				 unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
static void io_ring_file_ref_flush(struct fixed_file_data *data);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
		       int fd, struct file **out_file, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req,
			   const struct io_uring_sqe *sqe);

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (file->f_op == &io_uring_fops) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->completions[0]);
}

static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
	if (!ctx->fallback_req)
		goto err;

	ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
	if (!ctx->completions)
		goto err;

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
	 */
	hash_bits = ilog2(p->cq_entries);
	hash_bits -= 5;
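	/* e.g. cq_entries == 4096: ilog2() gives 12, minus 5 is 7 bits = 128 buckets */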
	if (hash_bits <= 0)
		hash_bits = 1;
	ctx->cancel_hash_bits = hash_bits;
	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
					GFP_KERNEL);
	if (!ctx->cancel_hash)
		goto err;
	__hash_init(ctx->cancel_hash, 1U << hash_bits);

	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->cq_wait);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	init_completion(&ctx->completions[0]);
	init_completion(&ctx->completions[1]);
	idr_init(&ctx->io_buffer_idr);
	idr_init(&ctx->personality_idr);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->wait);
	spin_lock_init(&ctx->completion_lock);
	INIT_LIST_HEAD(&ctx->poll_list);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	init_waitqueue_head(&ctx->inflight_wait);
	spin_lock_init(&ctx->inflight_lock);
	INIT_LIST_HEAD(&ctx->inflight_list);
	return ctx;
err:
	if (ctx->fallback_req)
		kmem_cache_free(req_cachep, ctx->fallback_req);
	kfree(ctx->completions);
	kfree(ctx->cancel_hash);
	kfree(ctx);
	return NULL;
}

static inline bool __req_need_defer(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
					+ atomic_read(&ctx->cached_cq_overflow);
}

static inline bool req_need_defer(struct io_kiocb *req)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN))
		return __req_need_defer(req);

	return false;
}

static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
	if (req && !req_need_defer(req)) {
		list_del_init(&req->list);
		return req;
	}

	return NULL;
}

static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
	if (req) {
		if (req->flags & REQ_F_TIMEOUT_NOSEQ)
			return NULL;
		if (!__req_need_defer(req)) {
			list_del_init(&req->list);
			return req;
		}
	}

	return NULL;
}

static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/* order cqe stores with ring update */
	smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);

	if (wq_has_sleeper(&ctx->cq_wait)) {
		wake_up_interruptible(&ctx->cq_wait);
		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
	}
}

static inline void io_req_work_grab_env(struct io_kiocb *req,
					const struct io_op_def *def)
{
	if (!req->work.mm && def->needs_mm) {
		mmgrab(current->mm);
		req->work.mm = current->mm;
	}
	if (!req->work.creds)
		req->work.creds = get_current_cred();
	if (!req->work.fs && def->needs_fs) {
		spin_lock(&current->fs->lock);
		if (!current->fs->in_exec) {
			req->work.fs = current->fs;
			req->work.fs->users++;
		} else {
			req->work.flags |= IO_WQ_WORK_CANCEL;
		}
		spin_unlock(&current->fs->lock);
	}
	if (!req->work.task_pid)
		req->work.task_pid = task_pid_vnr(current);
}

static inline void io_req_work_drop_env(struct io_kiocb *req)
{
	if (req->work.mm) {
		mmdrop(req->work.mm);
		req->work.mm = NULL;
	}
	if (req->work.creds) {
		put_cred(req->work.creds);
		req->work.creds = NULL;
	}
	if (req->work.fs) {
		struct fs_struct *fs = req->work.fs;

		spin_lock(&req->work.fs->lock);
		if (--fs->users)
			fs = NULL;
		spin_unlock(&req->work.fs->lock);
		if (fs)
			free_fs_struct(fs);
	}
}

static inline bool io_prep_async_work(struct io_kiocb *req,
				      struct io_kiocb **link)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	bool do_hashed = false;

	if (req->flags & REQ_F_ISREG) {
		if (def->hash_reg_file)
			do_hashed = true;
	} else {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}

	io_req_work_grab_env(req, def);

	*link = io_prep_linked_timeout(req);
	return do_hashed;
}

static inline void io_queue_async_work(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *link;
	bool do_hashed;

	do_hashed = io_prep_async_work(req, &link);

	trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
					req->flags);
	if (!do_hashed) {
		io_wq_enqueue(ctx->io_wq, &req->work);
	} else {
		io_wq_enqueue_hashed(ctx->io_wq, &req->work,
					file_inode(req->file));
	}

	if (link)
		io_queue_linked_timeout(link);
}

static void io_kill_timeout(struct io_kiocb *req)
{
	int ret;

	ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
	if (ret != -1) {
		atomic_inc(&req->ctx->cq_timeouts);
		list_del_init(&req->list);
		io_cqring_fill_event(req, 0);
		io_put_req(req);
	}
}

static void io_kill_timeouts(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req, *tmp;

	spin_lock_irq(&ctx->completion_lock);
	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
		io_kill_timeout(req);
	spin_unlock_irq(&ctx->completion_lock);
}

static void io_commit_cqring(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	while ((req = io_get_timeout_req(ctx)) != NULL)
		io_kill_timeout(req);

	__io_commit_cqring(ctx);

	while ((req = io_get_deferred_req(ctx)) != NULL)
		io_queue_async_work(req);
}

static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;
	unsigned tail;

	tail = ctx->cached_cq_tail;
	/*
	 * writes to the cq entry need to come after reading head; the
	 * control dependency is enough as we're using WRITE_ONCE to
	 * fill the cq entry
	 */
	if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
		return NULL;

	ctx->cached_cq_tail++;
	return &rings->cqes[tail & ctx->cq_mask];
}

static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
	if (!ctx->cq_ev_fd)
		return false;
	if (!ctx->eventfd_async)
		return true;
	return io_wq_current_is_worker();
}

static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
	if (waitqueue_active(&ctx->wait))
		wake_up(&ctx->wait);
	if (waitqueue_active(&ctx->sqo_wait))
		wake_up(&ctx->sqo_wait);
	if (io_should_trigger_evfd(ctx))
		eventfd_signal(ctx->cq_ev_fd, 1);
}

/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
	struct io_rings *rings = ctx->rings;
	struct io_uring_cqe *cqe;
	struct io_kiocb *req;
	unsigned long flags;
	LIST_HEAD(list);

	if (!force) {
		if (list_empty_careful(&ctx->cq_overflow_list))
			return true;
		if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
		    rings->cq_ring_entries))
			return false;
	}

	spin_lock_irqsave(&ctx->completion_lock, flags);

	/* if force is set, the ring is going away. always drop after that */
	if (force)
		ctx->cq_overflow_flushed = 1;

	cqe = NULL;
	while (!list_empty(&ctx->cq_overflow_list)) {
		cqe = io_get_cqring(ctx);
		if (!cqe && !force)
			break;

		req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
						list);
		list_move(&req->list, &list);
		req->flags &= ~REQ_F_OVERFLOW;
		if (cqe) {
			WRITE_ONCE(cqe->user_data, req->user_data);
			WRITE_ONCE(cqe->res, req->result);
			WRITE_ONCE(cqe->flags, 0);
		} else {
			WRITE_ONCE(ctx->rings->cq_overflow,
				atomic_inc_return(&ctx->cached_cq_overflow));
		}
	}

	io_commit_cqring(ctx);
	if (cqe) {
		clear_bit(0, &ctx->sq_check_overflow);
		clear_bit(0, &ctx->cq_check_overflow);
	}
	spin_unlock_irqrestore(&ctx->completion_lock, flags);
	io_cqring_ev_posted(ctx);

	while (!list_empty(&list)) {
		req = list_first_entry(&list, struct io_kiocb, list);
		list_del(&req->list);
		io_put_req(req);
	}

	return cqe != NULL;
}

static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_cqe *cqe;

	trace_io_uring_complete(ctx, req->user_data, res);

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	cqe = io_get_cqring(ctx);
	if (likely(cqe)) {
		WRITE_ONCE(cqe->user_data, req->user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, 0);
	} else if (ctx->cq_overflow_flushed) {
		WRITE_ONCE(ctx->rings->cq_overflow,
				atomic_inc_return(&ctx->cached_cq_overflow));
	} else {
		if (list_empty(&ctx->cq_overflow_list)) {
			set_bit(0, &ctx->sq_check_overflow);
			set_bit(0, &ctx->cq_check_overflow);
		}
		req->flags |= REQ_F_OVERFLOW;
		refcount_inc(&req->refs);
		req->result = res;
		list_add_tail(&req->list, &ctx->cq_overflow_list);
	}
}

static void io_cqring_add_event(struct io_kiocb *req, long res)
{
	struct io_ring_ctx *ctx = req->ctx;
	unsigned long flags;

	spin_lock_irqsave(&ctx->completion_lock, flags);
	io_cqring_fill_event(req, res);
	io_commit_cqring(ctx);
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	io_cqring_ev_posted(ctx);
}

static inline bool io_is_fallback_req(struct io_kiocb *req)
{
	return req == (struct io_kiocb *)
		((unsigned long) req->ctx->fallback_req & ~1UL);
}

static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	req = ctx->fallback_req;
	if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
		return req;

	return NULL;
}

static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
				   struct io_submit_state *state)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct io_kiocb *req;

	if (!state) {
		req = kmem_cache_alloc(req_cachep, gfp);
		if (unlikely(!req))
			goto fallback;
	} else if (!state->free_reqs) {
		size_t sz;
		int ret;

		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);

		/*
		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
		 * retry single alloc to be on the safe side.
		 */
		if (unlikely(ret <= 0)) {
			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
			if (!state->reqs[0])
				goto fallback;
			ret = 1;
		}
		state->free_reqs = ret - 1;
		req = state->reqs[ret - 1];
	} else {
		state->free_reqs--;
		req = state->reqs[state->free_reqs];
	}

got_it:
	req->io = NULL;
	req->file = NULL;
	req->ctx = ctx;
	req->flags = 0;
	/* one is dropped after submission, the other at completion */
	refcount_set(&req->refs, 2);
	req->result = 0;
	INIT_IO_WORK(&req->work, io_wq_submit_work);
	return req;
fallback:
	req = io_get_fallback_req(ctx);
	if (req)
		goto got_it;
	percpu_ref_put(&ctx->refs);
	return NULL;
}

static inline void io_put_file(struct io_kiocb *req, struct file *file,
			       bool fixed)
{
	if (fixed)
		percpu_ref_put(&req->ctx->file_data->refs);
	else
		fput(file);
}

static void __io_req_do_free(struct io_kiocb *req)
{
	if (likely(!io_is_fallback_req(req)))
		kmem_cache_free(req_cachep, req);
	else
		clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
}

static void __io_req_aux_free(struct io_kiocb *req)
{
	if (req->flags & REQ_F_NEED_CLEANUP)
		io_cleanup_req(req);

	kfree(req->io);
	if (req->file)
		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));

	io_req_work_drop_env(req);
}

static void __io_free_req(struct io_kiocb *req)
{
	__io_req_aux_free(req);

	if (req->flags & REQ_F_INFLIGHT) {
		struct io_ring_ctx *ctx = req->ctx;
		unsigned long flags;

		spin_lock_irqsave(&ctx->inflight_lock, flags);
		list_del(&req->inflight_entry);
		if (waitqueue_active(&ctx->inflight_wait))
			wake_up(&ctx->inflight_wait);
		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
	}

	percpu_ref_put(&req->ctx->refs);
	__io_req_do_free(req);
}

struct req_batch {
	void *reqs[IO_IOPOLL_BATCH];
	int to_free;
	int need_iter;
};

static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
{
	int fixed_refs = rb->to_free;

	if (!rb->to_free)
		return;
	if (rb->need_iter) {
		int i, inflight = 0;
		unsigned long flags;

		fixed_refs = 0;
		for (i = 0; i < rb->to_free; i++) {
			struct io_kiocb *req = rb->reqs[i];

			if (req->flags & REQ_F_FIXED_FILE) {
				req->file = NULL;
				fixed_refs++;
			}
			if (req->flags & REQ_F_INFLIGHT)
				inflight++;
			__io_req_aux_free(req);
		}
		if (!inflight)
			goto do_free;

		spin_lock_irqsave(&ctx->inflight_lock, flags);
		for (i = 0; i < rb->to_free; i++) {
			struct io_kiocb *req = rb->reqs[i];

			if (req->flags & REQ_F_INFLIGHT) {
				list_del(&req->inflight_entry);
				if (!--inflight)
					break;
			}
		}
		spin_unlock_irqrestore(&ctx->inflight_lock, flags);

		if (waitqueue_active(&ctx->inflight_wait))
			wake_up(&ctx->inflight_wait);
	}
do_free:
	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
	if (fixed_refs)
1408 percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
Jens Axboec6ca97b302019-12-28 12:11:08 -07001409 percpu_ref_put_many(&ctx->refs, rb->to_free);
Jens Axboec6ca97b302019-12-28 12:11:08 -07001410 rb->to_free = rb->need_iter = 0;
Jens Axboee65ef562019-03-12 10:16:44 -06001411}
1412
Jackie Liua197f662019-11-08 08:09:12 -07001413static bool io_link_cancel_timeout(struct io_kiocb *req)
Jens Axboe9e645e112019-05-10 16:07:28 -06001414{
Jackie Liua197f662019-11-08 08:09:12 -07001415 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2665abf2019-11-05 12:40:47 -07001416 int ret;
1417
Jens Axboe2d283902019-12-04 11:08:05 -07001418 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
Jens Axboe2665abf2019-11-05 12:40:47 -07001419 if (ret != -1) {
Jens Axboe78e19bb2019-11-06 15:21:34 -07001420 io_cqring_fill_event(req, -ECANCELED);
Jens Axboe2665abf2019-11-05 12:40:47 -07001421 io_commit_cqring(ctx);
1422 req->flags &= ~REQ_F_LINK;
Jackie Liuec9c02a2019-11-08 23:50:36 +08001423 io_put_req(req);
Jens Axboe2665abf2019-11-05 12:40:47 -07001424 return true;
1425 }
1426
1427 return false;
1428}
1429
Jens Axboeba816ad2019-09-28 11:36:45 -06001430static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
Jens Axboe9e645e112019-05-10 16:07:28 -06001431{
Jens Axboe2665abf2019-11-05 12:40:47 -07001432 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2665abf2019-11-05 12:40:47 -07001433 bool wake_ev = false;
Jens Axboe9e645e112019-05-10 16:07:28 -06001434
Jens Axboe4d7dd462019-11-20 13:03:52 -07001435 /* Already got next link */
1436 if (req->flags & REQ_F_LINK_NEXT)
1437 return;
1438
Jens Axboe9e645e112019-05-10 16:07:28 -06001439 /*
1440	 * The list should never be empty when we are called here, but it could
1441	 * potentially happen if the chain is messed up; check to be on the
1442	 * safe side.
1443 */
Pavel Begunkov44932332019-12-05 16:16:35 +03001444 while (!list_empty(&req->link_list)) {
1445 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1446 struct io_kiocb, link_list);
Jens Axboe94ae5e72019-11-14 19:39:52 -07001447
Pavel Begunkov44932332019-12-05 16:16:35 +03001448 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1449 (nxt->flags & REQ_F_TIMEOUT))) {
1450 list_del_init(&nxt->link_list);
Jens Axboe94ae5e72019-11-14 19:39:52 -07001451 wake_ev |= io_link_cancel_timeout(nxt);
Jens Axboe94ae5e72019-11-14 19:39:52 -07001452 req->flags &= ~REQ_F_LINK_TIMEOUT;
1453 continue;
1454 }
Jens Axboe9e645e112019-05-10 16:07:28 -06001455
Pavel Begunkov44932332019-12-05 16:16:35 +03001456 list_del_init(&req->link_list);
1457 if (!list_empty(&nxt->link_list))
1458 nxt->flags |= REQ_F_LINK;
Pavel Begunkovb18fdf72019-11-21 23:21:02 +03001459 *nxtptr = nxt;
Jens Axboe94ae5e72019-11-14 19:39:52 -07001460 break;
Jens Axboe9e645e112019-05-10 16:07:28 -06001461 }
Jens Axboe2665abf2019-11-05 12:40:47 -07001462
Jens Axboe4d7dd462019-11-20 13:03:52 -07001463 req->flags |= REQ_F_LINK_NEXT;
Jens Axboe2665abf2019-11-05 12:40:47 -07001464 if (wake_ev)
1465 io_cqring_ev_posted(ctx);
Jens Axboe9e645e112019-05-10 16:07:28 -06001466}
1467
1468/*
1469 * Called if REQ_F_LINK is set, and we fail the head request
1470 */
1471static void io_fail_links(struct io_kiocb *req)
1472{
Jens Axboe2665abf2019-11-05 12:40:47 -07001473 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2665abf2019-11-05 12:40:47 -07001474 unsigned long flags;
1475
1476 spin_lock_irqsave(&ctx->completion_lock, flags);
Jens Axboe9e645e112019-05-10 16:07:28 -06001477
1478 while (!list_empty(&req->link_list)) {
Pavel Begunkov44932332019-12-05 16:16:35 +03001479 struct io_kiocb *link = list_first_entry(&req->link_list,
1480 struct io_kiocb, link_list);
Jens Axboe9e645e112019-05-10 16:07:28 -06001481
Pavel Begunkov44932332019-12-05 16:16:35 +03001482 list_del_init(&link->link_list);
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02001483 trace_io_uring_fail_link(req, link);
Jens Axboe2665abf2019-11-05 12:40:47 -07001484
1485 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
Jens Axboed625c6e2019-12-17 19:53:05 -07001486 link->opcode == IORING_OP_LINK_TIMEOUT) {
Jackie Liua197f662019-11-08 08:09:12 -07001487 io_link_cancel_timeout(link);
Jens Axboe2665abf2019-11-05 12:40:47 -07001488 } else {
Jens Axboe78e19bb2019-11-06 15:21:34 -07001489 io_cqring_fill_event(link, -ECANCELED);
Jens Axboe978db572019-11-14 22:39:04 -07001490 __io_double_put_req(link);
Jens Axboe2665abf2019-11-05 12:40:47 -07001491 }
Jens Axboe5d960722019-11-19 15:31:28 -07001492 req->flags &= ~REQ_F_LINK_TIMEOUT;
Jens Axboe9e645e112019-05-10 16:07:28 -06001493 }
Jens Axboe2665abf2019-11-05 12:40:47 -07001494
1495 io_commit_cqring(ctx);
1496 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1497 io_cqring_ev_posted(ctx);
Jens Axboe9e645e112019-05-10 16:07:28 -06001498}
1499
Jens Axboe4d7dd462019-11-20 13:03:52 -07001500static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
Jens Axboe9e645e112019-05-10 16:07:28 -06001501{
Jens Axboe4d7dd462019-11-20 13:03:52 -07001502 if (likely(!(req->flags & REQ_F_LINK)))
Jens Axboe2665abf2019-11-05 12:40:47 -07001503 return;
Jens Axboe2665abf2019-11-05 12:40:47 -07001504
Jens Axboe9e645e112019-05-10 16:07:28 -06001505 /*
1506 * If LINK is set, we have dependent requests in this chain. If we
1507 * didn't fail this request, queue the first one up, moving any other
1508 * dependencies to the next request. In case of failure, fail the rest
1509 * of the chain.
1510 */
Jens Axboe2665abf2019-11-05 12:40:47 -07001511 if (req->flags & REQ_F_FAIL_LINK) {
1512 io_fail_links(req);
Jens Axboe7c9e7f02019-11-12 08:15:53 -07001513 } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1514 REQ_F_LINK_TIMEOUT) {
Jens Axboe2665abf2019-11-05 12:40:47 -07001515 struct io_ring_ctx *ctx = req->ctx;
1516 unsigned long flags;
1517
1518 /*
1519 * If this is a timeout link, we could be racing with the
1520 * timeout timer. Grab the completion lock for this case to
Jens Axboe7c9e7f02019-11-12 08:15:53 -07001521 * protect against that.
Jens Axboe2665abf2019-11-05 12:40:47 -07001522 */
1523 spin_lock_irqsave(&ctx->completion_lock, flags);
1524 io_req_link_next(req, nxt);
1525 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1526 } else {
1527 io_req_link_next(req, nxt);
Jens Axboe9e645e112019-05-10 16:07:28 -06001528 }
Jens Axboe4d7dd462019-11-20 13:03:52 -07001529}
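
/*
 * The link bookkeeping above is what backs IOSQE_IO_LINK from userspace:
 * a failed request fails the rest of its chain, otherwise the next link
 * is queued. A rough liburing-style sketch, with the ring and fd/iov
 * names purely as placeholders:
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_readv(sqe, in_fd, &iov, 1, 0);
 *	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_writev(sqe, out_fd, &iov, 1, 0);	(runs only if the read succeeds)
 *	io_uring_submit(&ring);
 */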
Jens Axboe9e645e112019-05-10 16:07:28 -06001530
Jackie Liuc69f8db2019-11-09 11:00:08 +08001531static void io_free_req(struct io_kiocb *req)
1532{
Pavel Begunkov944e58b2019-11-21 23:21:01 +03001533 struct io_kiocb *nxt = NULL;
1534
1535 io_req_find_next(req, &nxt);
Pavel Begunkov70cf9f32019-11-21 23:21:00 +03001536 __io_free_req(req);
Pavel Begunkov944e58b2019-11-21 23:21:01 +03001537
1538 if (nxt)
1539 io_queue_async_work(nxt);
Jackie Liuc69f8db2019-11-09 11:00:08 +08001540}
1541
Pavel Begunkov7a743e22020-03-03 21:33:13 +03001542static void io_link_work_cb(struct io_wq_work **workptr)
1543{
1544 struct io_wq_work *work = *workptr;
1545 struct io_kiocb *link = work->data;
1546
1547 io_queue_linked_timeout(link);
1548 io_wq_submit_work(workptr);
1549}
1550
1551static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
1552{
1553 struct io_kiocb *link;
1554
1555 *workptr = &nxt->work;
1556 link = io_prep_linked_timeout(nxt);
1557 if (link) {
1558 nxt->work.func = io_link_work_cb;
1559 nxt->work.data = link;
1560 }
1561}
1562
Jens Axboeba816ad2019-09-28 11:36:45 -06001563/*
1564 * Drop reference to request, return next in chain (if there is one) if this
1565 * was the last reference to this request.
1566 */
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03001567__attribute__((nonnull))
Jackie Liuec9c02a2019-11-08 23:50:36 +08001568static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
Jens Axboee65ef562019-03-12 10:16:44 -06001569{
Jens Axboe2a44f462020-02-25 13:25:41 -07001570 if (refcount_dec_and_test(&req->refs)) {
1571 io_req_find_next(req, nxtptr);
Jens Axboe4d7dd462019-11-20 13:03:52 -07001572 __io_free_req(req);
Jens Axboe2a44f462020-02-25 13:25:41 -07001573 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001574}
1575
Jens Axboe2b188cc2019-01-07 10:46:33 -07001576static void io_put_req(struct io_kiocb *req)
1577{
Jens Axboedef596e2019-01-09 08:59:42 -07001578 if (refcount_dec_and_test(&req->refs))
1579 io_free_req(req);
1580}
1581
Pavel Begunkove9fd9392020-03-04 16:14:12 +03001582static void io_steal_work(struct io_kiocb *req,
1583 struct io_wq_work **workptr)
Pavel Begunkov7a743e22020-03-03 21:33:13 +03001584{
1585 /*
1586	 * It's in an io-wq worker, so there should always be at least
1587 * one reference, which will be dropped in io_put_work() just
1588 * after the current handler returns.
1589 *
1590	 * It also means that if the counter dropped to 1, then there are
1591	 * no asynchronous users left, so it's safe to steal the next work.
1592 */
Pavel Begunkov7a743e22020-03-03 21:33:13 +03001593 if (refcount_read(&req->refs) == 1) {
1594 struct io_kiocb *nxt = NULL;
1595
1596 io_req_find_next(req, &nxt);
1597 if (nxt)
1598 io_wq_assign_next(workptr, nxt);
1599 }
1600}
1601
Jens Axboe978db572019-11-14 22:39:04 -07001602/*
1603 * Must only be used if we don't need to care about links, usually from
1604 * within the completion handling itself.
1605 */
1606static void __io_double_put_req(struct io_kiocb *req)
Jens Axboea3a0e432019-08-20 11:03:11 -06001607{
Jens Axboe78e19bb2019-11-06 15:21:34 -07001608 /* drop both submit and complete references */
1609 if (refcount_sub_and_test(2, &req->refs))
1610 __io_free_req(req);
1611}
1612
Jens Axboe978db572019-11-14 22:39:04 -07001613static void io_double_put_req(struct io_kiocb *req)
1614{
1615 /* drop both submit and complete references */
1616 if (refcount_sub_and_test(2, &req->refs))
1617 io_free_req(req);
1618}
1619
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001620static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
Jens Axboea3a0e432019-08-20 11:03:11 -06001621{
Jens Axboe84f97dc2019-11-06 11:27:53 -07001622 struct io_rings *rings = ctx->rings;
1623
Jens Axboead3eb2c2019-12-18 17:12:20 -07001624 if (test_bit(0, &ctx->cq_check_overflow)) {
1625 /*
1626	 * noflush == true is from the waitqueue handler; just ensure
1627	 * we wake up the task, and the next invocation will flush the
1628	 * entries. We cannot safely do it from here.
1629 */
1630 if (noflush && !list_empty(&ctx->cq_overflow_list))
1631 return -1U;
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001632
Jens Axboead3eb2c2019-12-18 17:12:20 -07001633 io_cqring_overflow_flush(ctx, false);
1634 }
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001635
Jens Axboea3a0e432019-08-20 11:03:11 -06001636 /* See comment at the top of this file */
1637 smp_rmb();
Jens Axboead3eb2c2019-12-18 17:12:20 -07001638 return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
Jens Axboea3a0e432019-08-20 11:03:11 -06001639}
1640
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03001641static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1642{
1643 struct io_rings *rings = ctx->rings;
1644
1645 /* make sure SQ entry isn't read before tail */
1646 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1647}
1648
Jens Axboe8237e042019-12-28 10:48:22 -07001649static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
Jens Axboee94f1412019-12-19 12:06:02 -07001650{
Jens Axboec6ca97b302019-12-28 12:11:08 -07001651 if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
1652 return false;
Jens Axboee94f1412019-12-19 12:06:02 -07001653
Jens Axboec6ca97b302019-12-28 12:11:08 -07001654 if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
1655 rb->need_iter++;
1656
1657 rb->reqs[rb->to_free++] = req;
1658 if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1659 io_free_req_many(req->ctx, rb);
1660 return true;
Jens Axboee94f1412019-12-19 12:06:02 -07001661}
1662
Jens Axboedef596e2019-01-09 08:59:42 -07001663/*
1664 * Find and free completed poll iocbs
1665 */
1666static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1667 struct list_head *done)
1668{
Jens Axboe8237e042019-12-28 10:48:22 -07001669 struct req_batch rb;
Jens Axboedef596e2019-01-09 08:59:42 -07001670 struct io_kiocb *req;
Jens Axboedef596e2019-01-09 08:59:42 -07001671
Jens Axboec6ca97b302019-12-28 12:11:08 -07001672 rb.to_free = rb.need_iter = 0;
Jens Axboedef596e2019-01-09 08:59:42 -07001673 while (!list_empty(done)) {
1674 req = list_first_entry(done, struct io_kiocb, list);
1675 list_del(&req->list);
1676
Jens Axboe78e19bb2019-11-06 15:21:34 -07001677 io_cqring_fill_event(req, req->result);
Jens Axboedef596e2019-01-09 08:59:42 -07001678 (*nr_events)++;
1679
Jens Axboe8237e042019-12-28 10:48:22 -07001680 if (refcount_dec_and_test(&req->refs) &&
1681 !io_req_multi_free(&rb, req))
1682 io_free_req(req);
Jens Axboedef596e2019-01-09 08:59:42 -07001683 }
Jens Axboedef596e2019-01-09 08:59:42 -07001684
Jens Axboe09bb8392019-03-13 12:39:28 -06001685 io_commit_cqring(ctx);
Jens Axboe8237e042019-12-28 10:48:22 -07001686 io_free_req_many(ctx, &rb);
Jens Axboedef596e2019-01-09 08:59:42 -07001687}
1688
1689static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1690 long min)
1691{
1692 struct io_kiocb *req, *tmp;
1693 LIST_HEAD(done);
1694 bool spin;
1695 int ret;
1696
1697 /*
1698 * Only spin for completions if we don't have multiple devices hanging
1699 * off our complete list, and we're under the requested amount.
1700 */
1701 spin = !ctx->poll_multi_file && *nr_events < min;
1702
1703 ret = 0;
1704 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
Jens Axboe9adbd452019-12-20 08:45:55 -07001705 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboedef596e2019-01-09 08:59:42 -07001706
1707 /*
1708 * Move completed entries to our local list. If we find a
1709 * request that requires polling, break out and complete
1710 * the done list first, if we have entries there.
1711 */
1712 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1713 list_move_tail(&req->list, &done);
1714 continue;
1715 }
1716 if (!list_empty(&done))
1717 break;
1718
1719 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1720 if (ret < 0)
1721 break;
1722
1723 if (ret && spin)
1724 spin = false;
1725 ret = 0;
1726 }
1727
1728 if (!list_empty(&done))
1729 io_iopoll_complete(ctx, nr_events, &done);
1730
1731 return ret;
1732}
1733
1734/*
Brian Gianforcarod195a662019-12-13 03:09:50 -08001735 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
Jens Axboedef596e2019-01-09 08:59:42 -07001736 * non-spinning poll check - we'll still enter the driver poll loop, but only
1737 * as a non-spinning completion check.
1738 */
1739static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1740 long min)
1741{
Jens Axboe08f54392019-08-21 22:19:11 -06001742 while (!list_empty(&ctx->poll_list) && !need_resched()) {
Jens Axboedef596e2019-01-09 08:59:42 -07001743 int ret;
1744
1745 ret = io_do_iopoll(ctx, nr_events, min);
1746 if (ret < 0)
1747 return ret;
1748 if (!min || *nr_events >= min)
1749 return 0;
1750 }
1751
1752 return 1;
1753}
1754
1755/*
1756 * We can't just wait for polled events to come to us, we have to actively
1757 * find and complete them.
1758 */
1759static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1760{
1761 if (!(ctx->flags & IORING_SETUP_IOPOLL))
1762 return;
1763
1764 mutex_lock(&ctx->uring_lock);
1765 while (!list_empty(&ctx->poll_list)) {
1766 unsigned int nr_events = 0;
1767
1768 io_iopoll_getevents(ctx, &nr_events, 1);
Jens Axboe08f54392019-08-21 22:19:11 -06001769
1770 /*
1771	 * Ensure we allow local-to-the-cpu processing to take place; in
1772	 * this case we need to ensure that we reap all events.
1773 */
1774 cond_resched();
Jens Axboedef596e2019-01-09 08:59:42 -07001775 }
1776 mutex_unlock(&ctx->uring_lock);
1777}
1778
Xiaoguang Wangc7849be2020-02-22 14:46:05 +08001779static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1780 long min)
Jens Axboedef596e2019-01-09 08:59:42 -07001781{
Jens Axboe2b2ed972019-10-25 10:06:15 -06001782 int iters = 0, ret = 0;
Jens Axboedef596e2019-01-09 08:59:42 -07001783
Xiaoguang Wangc7849be2020-02-22 14:46:05 +08001784 /*
1785 * We disallow the app entering submit/complete with polling, but we
1786 * still need to lock the ring to prevent racing with polled issue
1787 * that got punted to a workqueue.
1788 */
1789 mutex_lock(&ctx->uring_lock);
Jens Axboedef596e2019-01-09 08:59:42 -07001790 do {
1791 int tmin = 0;
1792
Jens Axboe500f9fb2019-08-19 12:15:59 -06001793 /*
Jens Axboea3a0e432019-08-20 11:03:11 -06001794 * Don't enter poll loop if we already have events pending.
1795 * If we do, we can potentially be spinning for commands that
1796 * already triggered a CQE (eg in error).
1797 */
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07001798 if (io_cqring_events(ctx, false))
Jens Axboea3a0e432019-08-20 11:03:11 -06001799 break;
1800
1801 /*
Jens Axboe500f9fb2019-08-19 12:15:59 -06001802 * If a submit got punted to a workqueue, we can have the
1803 * application entering polling for a command before it gets
1804 * issued. That app will hold the uring_lock for the duration
1805 * of the poll right here, so we need to take a breather every
1806 * now and then to ensure that the issue has a chance to add
1807 * the poll to the issued list. Otherwise we can spin here
1808 * forever, while the workqueue is stuck trying to acquire the
1809 * very same mutex.
1810 */
1811 if (!(++iters & 7)) {
1812 mutex_unlock(&ctx->uring_lock);
1813 mutex_lock(&ctx->uring_lock);
1814 }
1815
Jens Axboedef596e2019-01-09 08:59:42 -07001816 if (*nr_events < min)
1817 tmin = min - *nr_events;
1818
1819 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1820 if (ret <= 0)
1821 break;
1822 ret = 0;
1823 } while (min && !*nr_events && !need_resched());
1824
Jens Axboe500f9fb2019-08-19 12:15:59 -06001825 mutex_unlock(&ctx->uring_lock);
Jens Axboedef596e2019-01-09 08:59:42 -07001826 return ret;
1827}
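
/*
 * A rough userspace sketch of the polled-completion mode these helpers
 * serve, assuming liburing and an O_DIRECT-capable file (names are
 * placeholders):
 *
 *	struct io_uring ring;
 *	struct io_uring_cqe *cqe;
 *
 *	io_uring_queue_init(64, &ring, IORING_SETUP_IOPOLL);
 *	(submit O_DIRECT reads/writes as usual)
 *	io_uring_wait_cqe(&ring, &cqe);		(reaps via io_iopoll_check())
 *	io_uring_cqe_seen(&ring, cqe);
 */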
1828
Jens Axboe491381ce2019-10-17 09:20:46 -06001829static void kiocb_end_write(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001830{
Jens Axboe491381ce2019-10-17 09:20:46 -06001831 /*
1832 * Tell lockdep we inherited freeze protection from submission
1833 * thread.
1834 */
1835 if (req->flags & REQ_F_ISREG) {
1836 struct inode *inode = file_inode(req->file);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001837
Jens Axboe491381ce2019-10-17 09:20:46 -06001838 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001839 }
Jens Axboe491381ce2019-10-17 09:20:46 -06001840 file_end_write(req->file);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001841}
1842
Jens Axboe4e88d6e2019-12-07 20:59:47 -07001843static inline void req_set_fail_links(struct io_kiocb *req)
1844{
1845 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1846 req->flags |= REQ_F_FAIL_LINK;
1847}
1848
Jens Axboeba816ad2019-09-28 11:36:45 -06001849static void io_complete_rw_common(struct kiocb *kiocb, long res)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001850{
Jens Axboe9adbd452019-12-20 08:45:55 -07001851 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001852
Jens Axboe491381ce2019-10-17 09:20:46 -06001853 if (kiocb->ki_flags & IOCB_WRITE)
1854 kiocb_end_write(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001855
Jens Axboe4e88d6e2019-12-07 20:59:47 -07001856 if (res != req->result)
1857 req_set_fail_links(req);
Jens Axboe78e19bb2019-11-06 15:21:34 -07001858 io_cqring_add_event(req, res);
Jens Axboeba816ad2019-09-28 11:36:45 -06001859}
1860
1861static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1862{
Jens Axboe9adbd452019-12-20 08:45:55 -07001863 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jens Axboeba816ad2019-09-28 11:36:45 -06001864
1865 io_complete_rw_common(kiocb, res);
Jens Axboee65ef562019-03-12 10:16:44 -06001866 io_put_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001867}
1868
Jens Axboedef596e2019-01-09 08:59:42 -07001869static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1870{
Jens Axboe9adbd452019-12-20 08:45:55 -07001871 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
Jens Axboedef596e2019-01-09 08:59:42 -07001872
Jens Axboe491381ce2019-10-17 09:20:46 -06001873 if (kiocb->ki_flags & IOCB_WRITE)
1874 kiocb_end_write(req);
Jens Axboedef596e2019-01-09 08:59:42 -07001875
Jens Axboe4e88d6e2019-12-07 20:59:47 -07001876 if (res != req->result)
1877 req_set_fail_links(req);
Jens Axboe9e645e112019-05-10 16:07:28 -06001878 req->result = res;
Jens Axboedef596e2019-01-09 08:59:42 -07001879 if (res != -EAGAIN)
1880 req->flags |= REQ_F_IOPOLL_COMPLETED;
1881}
1882
1883/*
1884 * After the iocb has been issued, it's safe to be found on the poll list.
1885 * Adding the kiocb to the list AFTER submission ensures that we don't
1886	 * find it from an io_iopoll_getevents() thread before the issuer is done
1887 * accessing the kiocb cookie.
1888 */
1889static void io_iopoll_req_issued(struct io_kiocb *req)
1890{
1891 struct io_ring_ctx *ctx = req->ctx;
1892
1893 /*
1894 * Track whether we have multiple files in our lists. This will impact
1895 * how we do polling eventually, not spinning if we're on potentially
1896 * different devices.
1897 */
1898 if (list_empty(&ctx->poll_list)) {
1899 ctx->poll_multi_file = false;
1900 } else if (!ctx->poll_multi_file) {
1901 struct io_kiocb *list_req;
1902
1903 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1904 list);
Jens Axboe9adbd452019-12-20 08:45:55 -07001905 if (list_req->file != req->file)
Jens Axboedef596e2019-01-09 08:59:42 -07001906 ctx->poll_multi_file = true;
1907 }
1908
1909 /*
1910 * For fast devices, IO may have already completed. If it has, add
1911 * it to the front so we find it first.
1912 */
1913 if (req->flags & REQ_F_IOPOLL_COMPLETED)
1914 list_add(&req->list, &ctx->poll_list);
1915 else
1916 list_add_tail(&req->list, &ctx->poll_list);
Xiaoguang Wangbdcd3ea2020-02-25 22:12:08 +08001917
1918 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
1919 wq_has_sleeper(&ctx->sqo_wait))
1920 wake_up(&ctx->sqo_wait);
Jens Axboedef596e2019-01-09 08:59:42 -07001921}
1922
Jens Axboe3d6770f2019-04-13 11:50:54 -06001923static void io_file_put(struct io_submit_state *state)
Jens Axboe9a56a232019-01-09 09:06:50 -07001924{
Jens Axboe3d6770f2019-04-13 11:50:54 -06001925 if (state->file) {
Jens Axboe9a56a232019-01-09 09:06:50 -07001926 int diff = state->has_refs - state->used_refs;
1927
1928 if (diff)
1929 fput_many(state->file, diff);
1930 state->file = NULL;
1931 }
1932}
1933
1934/*
1935 * Get as many references to a file as we have IOs left in this submission,
1936 * assuming most submissions are for one file, or at least that each file
1937 * has more than one submission.
1938 */
Pavel Begunkov8da11c12020-02-24 11:32:44 +03001939static struct file *__io_file_get(struct io_submit_state *state, int fd)
Jens Axboe9a56a232019-01-09 09:06:50 -07001940{
1941 if (!state)
1942 return fget(fd);
1943
1944 if (state->file) {
1945 if (state->fd == fd) {
1946 state->used_refs++;
1947 state->ios_left--;
1948 return state->file;
1949 }
Jens Axboe3d6770f2019-04-13 11:50:54 -06001950 io_file_put(state);
Jens Axboe9a56a232019-01-09 09:06:50 -07001951 }
1952 state->file = fget_many(fd, state->ios_left);
1953 if (!state->file)
1954 return NULL;
1955
1956 state->fd = fd;
1957 state->has_refs = state->ios_left;
1958 state->used_refs = 1;
1959 state->ios_left--;
1960 return state->file;
1961}
1962
Jens Axboe2b188cc2019-01-07 10:46:33 -07001963/*
1964 * If we tracked the file through the SCM inflight mechanism, we could support
1965 * any file. For now, just ensure that anything potentially problematic is done
1966 * inline.
1967 */
1968static bool io_file_supports_async(struct file *file)
1969{
1970 umode_t mode = file_inode(file)->i_mode;
1971
Jens Axboe10d59342019-12-09 20:16:22 -07001972 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
Jens Axboe2b188cc2019-01-07 10:46:33 -07001973 return true;
1974 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1975 return true;
1976
1977 return false;
1978}
1979
Jens Axboe3529d8c2019-12-19 18:24:38 -07001980static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1981 bool force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001982{
Jens Axboedef596e2019-01-09 08:59:42 -07001983 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe9adbd452019-12-20 08:45:55 -07001984 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboe09bb8392019-03-13 12:39:28 -06001985 unsigned ioprio;
1986 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001987
Jens Axboe491381ce2019-10-17 09:20:46 -06001988 if (S_ISREG(file_inode(req->file)->i_mode))
1989 req->flags |= REQ_F_ISREG;
1990
Jens Axboe2b188cc2019-01-07 10:46:33 -07001991 kiocb->ki_pos = READ_ONCE(sqe->off);
Jens Axboeba042912019-12-25 16:33:42 -07001992 if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
1993 req->flags |= REQ_F_CUR_POS;
1994 kiocb->ki_pos = req->file->f_pos;
1995 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001996 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
Pavel Begunkov3e577dc2020-02-01 03:58:42 +03001997 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1998 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1999 if (unlikely(ret))
2000 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002001
2002 ioprio = READ_ONCE(sqe->ioprio);
2003 if (ioprio) {
2004 ret = ioprio_check_cap(ioprio);
2005 if (ret)
Jens Axboe09bb8392019-03-13 12:39:28 -06002006 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002007
2008 kiocb->ki_ioprio = ioprio;
2009 } else
2010 kiocb->ki_ioprio = get_current_ioprio();
2011
Stefan Bühler8449eed2019-04-27 20:34:19 +02002012 /* don't allow async punt if RWF_NOWAIT was requested */
Jens Axboe491381ce2019-10-17 09:20:46 -06002013 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
2014 (req->file->f_flags & O_NONBLOCK))
Stefan Bühler8449eed2019-04-27 20:34:19 +02002015 req->flags |= REQ_F_NOWAIT;
2016
2017 if (force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002018 kiocb->ki_flags |= IOCB_NOWAIT;
Stefan Bühler8449eed2019-04-27 20:34:19 +02002019
Jens Axboedef596e2019-01-09 08:59:42 -07002020 if (ctx->flags & IORING_SETUP_IOPOLL) {
Jens Axboedef596e2019-01-09 08:59:42 -07002021 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2022 !kiocb->ki_filp->f_op->iopoll)
Jens Axboe09bb8392019-03-13 12:39:28 -06002023 return -EOPNOTSUPP;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002024
Jens Axboedef596e2019-01-09 08:59:42 -07002025 kiocb->ki_flags |= IOCB_HIPRI;
2026 kiocb->ki_complete = io_complete_rw_iopoll;
Jens Axboe6873e0b2019-10-30 13:53:09 -06002027 req->result = 0;
Jens Axboedef596e2019-01-09 08:59:42 -07002028 } else {
Jens Axboe09bb8392019-03-13 12:39:28 -06002029 if (kiocb->ki_flags & IOCB_HIPRI)
2030 return -EINVAL;
Jens Axboedef596e2019-01-09 08:59:42 -07002031 kiocb->ki_complete = io_complete_rw;
2032 }
Jens Axboe9adbd452019-12-20 08:45:55 -07002033
Jens Axboe3529d8c2019-12-19 18:24:38 -07002034 req->rw.addr = READ_ONCE(sqe->addr);
2035 req->rw.len = READ_ONCE(sqe->len);
Jens Axboe9adbd452019-12-20 08:45:55 -07002036 /* we own ->private, reuse it for the buffer index */
2037 req->rw.kiocb.private = (void *) (unsigned long)
Jens Axboe3529d8c2019-12-19 18:24:38 -07002038 READ_ONCE(sqe->buf_index);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002039 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002040}
2041
2042static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2043{
2044 switch (ret) {
2045 case -EIOCBQUEUED:
2046 break;
2047 case -ERESTARTSYS:
2048 case -ERESTARTNOINTR:
2049 case -ERESTARTNOHAND:
2050 case -ERESTART_RESTARTBLOCK:
2051 /*
2052 * We can't just restart the syscall, since previously
2053 * submitted sqes may already be in progress. Just fail this
2054 * IO with EINTR.
2055 */
2056 ret = -EINTR;
2057 /* fall through */
2058 default:
2059 kiocb->ki_complete(kiocb, ret, 0);
2060 }
2061}
2062
Pavel Begunkov014db002020-03-03 21:33:12 +03002063static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
Jens Axboeba816ad2019-09-28 11:36:45 -06002064{
Jens Axboeba042912019-12-25 16:33:42 -07002065 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2066
2067 if (req->flags & REQ_F_CUR_POS)
2068 req->file->f_pos = kiocb->ki_pos;
Pavel Begunkovbcaec082020-02-24 11:30:18 +03002069 if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
Pavel Begunkov014db002020-03-03 21:33:12 +03002070 io_complete_rw(kiocb, ret, 0);
Jens Axboeba816ad2019-09-28 11:36:45 -06002071 else
2072 io_rw_done(kiocb, ret);
2073}
2074
Jens Axboe9adbd452019-12-20 08:45:55 -07002075static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
Pavel Begunkov7d009162019-11-25 23:14:40 +03002076 struct iov_iter *iter)
Jens Axboeedafcce2019-01-09 09:16:05 -07002077{
Jens Axboe9adbd452019-12-20 08:45:55 -07002078 struct io_ring_ctx *ctx = req->ctx;
2079 size_t len = req->rw.len;
Jens Axboeedafcce2019-01-09 09:16:05 -07002080 struct io_mapped_ubuf *imu;
2081 unsigned index, buf_index;
2082 size_t offset;
2083 u64 buf_addr;
2084
2085 /* attempt to use fixed buffers without having provided iovecs */
2086 if (unlikely(!ctx->user_bufs))
2087 return -EFAULT;
2088
Jens Axboe9adbd452019-12-20 08:45:55 -07002089 buf_index = (unsigned long) req->rw.kiocb.private;
Jens Axboeedafcce2019-01-09 09:16:05 -07002090 if (unlikely(buf_index >= ctx->nr_user_bufs))
2091 return -EFAULT;
2092
2093 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2094 imu = &ctx->user_bufs[index];
Jens Axboe9adbd452019-12-20 08:45:55 -07002095 buf_addr = req->rw.addr;
Jens Axboeedafcce2019-01-09 09:16:05 -07002096
2097 /* overflow */
2098 if (buf_addr + len < buf_addr)
2099 return -EFAULT;
2100 /* not inside the mapped region */
2101 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2102 return -EFAULT;
2103
2104 /*
2105	 * May not be the start of the buffer; set the size appropriately
2106	 * and advance us to the beginning.
2107 */
2108 offset = buf_addr - imu->ubuf;
2109 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
Jens Axboebd11b3a2019-07-20 08:37:31 -06002110
2111 if (offset) {
2112 /*
2113 * Don't use iov_iter_advance() here, as it's really slow for
2114 * using the latter parts of a big fixed buffer - it iterates
2115 * over each segment manually. We can cheat a bit here, because
2116 * we know that:
2117 *
2118 * 1) it's a BVEC iter, we set it up
2119 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2120 * first and last bvec
2121 *
2122 * So just find our index, and adjust the iterator afterwards.
2123 * If the offset is within the first bvec (or the whole first
2124	 * If the offset is within the first bvec (or the whole first
2125	 * bvec), just use iov_iter_advance(). This makes it easier
2126 * be PAGE_SIZE aligned.
2127 */
2128 const struct bio_vec *bvec = imu->bvec;
2129
2130 if (offset <= bvec->bv_len) {
2131 iov_iter_advance(iter, offset);
2132 } else {
2133 unsigned long seg_skip;
2134
2135 /* skip first vec */
2136 offset -= bvec->bv_len;
2137 seg_skip = 1 + (offset >> PAGE_SHIFT);
2138
2139 iter->bvec = bvec + seg_skip;
2140 iter->nr_segs -= seg_skip;
Aleix Roca Nonell99c79f62019-08-15 14:03:22 +02002141 iter->count -= bvec->bv_len + offset;
Jens Axboebd11b3a2019-07-20 08:37:31 -06002142 iter->iov_offset = offset & ~PAGE_MASK;
Jens Axboebd11b3a2019-07-20 08:37:31 -06002143 }
2144 }
2145
Jens Axboe5e559562019-11-13 16:12:46 -07002146 return len;
Jens Axboeedafcce2019-01-09 09:16:05 -07002147}
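
/*
 * io_import_fixed() resolves buffers that the application registered up
 * front. A rough liburing-style sketch of registration and use, with
 * "buf", "fd" and the ring as placeholders:
 *
 *	struct iovec reg = { .iov_base = buf, .iov_len = 4096 };
 *
 *	io_uring_register_buffers(&ring, &reg, 1);
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_read_fixed(sqe, fd, buf, 4096, 0, 0);	(buf_index 0)
 *	io_uring_submit(&ring);
 */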
2148
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03002149static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2150 struct iovec **iovec, struct iov_iter *iter)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002151{
Jens Axboe9adbd452019-12-20 08:45:55 -07002152 void __user *buf = u64_to_user_ptr(req->rw.addr);
2153 size_t sqe_len = req->rw.len;
Jens Axboeedafcce2019-01-09 09:16:05 -07002154 u8 opcode;
2155
Jens Axboed625c6e2019-12-17 19:53:05 -07002156 opcode = req->opcode;
Pavel Begunkov7d009162019-11-25 23:14:40 +03002157 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
Jens Axboeedafcce2019-01-09 09:16:05 -07002158 *iovec = NULL;
Jens Axboe9adbd452019-12-20 08:45:55 -07002159 return io_import_fixed(req, rw, iter);
Jens Axboeedafcce2019-01-09 09:16:05 -07002160 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07002161
Jens Axboe9adbd452019-12-20 08:45:55 -07002162 /* buffer index only valid with fixed read/write */
2163 if (req->rw.kiocb.private)
2164 return -EINVAL;
2165
Jens Axboe3a6820f2019-12-22 15:19:35 -07002166 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2167 ssize_t ret;
2168 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2169 *iovec = NULL;
Jens Axboe3a901592020-02-25 17:48:55 -07002170 return ret < 0 ? ret : sqe_len;
Jens Axboe3a6820f2019-12-22 15:19:35 -07002171 }
2172
Jens Axboef67676d2019-12-02 11:03:47 -07002173 if (req->io) {
2174 struct io_async_rw *iorw = &req->io->rw;
2175
2176 *iovec = iorw->iov;
2177 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2178 if (iorw->iov == iorw->fast_iov)
2179 *iovec = NULL;
2180 return iorw->size;
2181 }
2182
Jens Axboe2b188cc2019-01-07 10:46:33 -07002183#ifdef CONFIG_COMPAT
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03002184 if (req->ctx->compat)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002185 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2186 iovec, iter);
2187#endif
2188
2189 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2190}
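
/*
 * The opcode checks above are why IORING_OP_READ/WRITE carry a plain
 * address/length pair while IORING_OP_READV/WRITEV carry an iovec array.
 * A rough illustration, assuming a liburing recent enough to provide
 * both prep helpers (fd/buf/iovs are placeholders):
 *
 *	io_uring_prep_read(sqe, fd, buf, 4096, 0);	(single range)
 *	io_uring_prep_readv(sqe, fd, iovs, nr_iovs, 0);	(iovec import)
 */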
2191
Jens Axboe32960612019-09-23 11:05:34 -06002192/*
2193 * For files that don't have ->read_iter() and ->write_iter(), handle them
2194 * by looping over ->read() or ->write() manually.
2195 */
2196static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2197 struct iov_iter *iter)
2198{
2199 ssize_t ret = 0;
2200
2201 /*
2202 * Don't support polled IO through this interface, and we can't
2203 * support non-blocking either. For the latter, this just causes
2204 * the kiocb to be handled from an async context.
2205 */
2206 if (kiocb->ki_flags & IOCB_HIPRI)
2207 return -EOPNOTSUPP;
2208 if (kiocb->ki_flags & IOCB_NOWAIT)
2209 return -EAGAIN;
2210
2211 while (iov_iter_count(iter)) {
Pavel Begunkov311ae9e2019-11-24 11:58:24 +03002212 struct iovec iovec;
Jens Axboe32960612019-09-23 11:05:34 -06002213 ssize_t nr;
2214
Pavel Begunkov311ae9e2019-11-24 11:58:24 +03002215 if (!iov_iter_is_bvec(iter)) {
2216 iovec = iov_iter_iovec(iter);
2217 } else {
2218 /* fixed buffers import bvec */
2219 iovec.iov_base = kmap(iter->bvec->bv_page)
2220 + iter->iov_offset;
2221 iovec.iov_len = min(iter->count,
2222 iter->bvec->bv_len - iter->iov_offset);
2223 }
2224
Jens Axboe32960612019-09-23 11:05:34 -06002225 if (rw == READ) {
2226 nr = file->f_op->read(file, iovec.iov_base,
2227 iovec.iov_len, &kiocb->ki_pos);
2228 } else {
2229 nr = file->f_op->write(file, iovec.iov_base,
2230 iovec.iov_len, &kiocb->ki_pos);
2231 }
2232
Pavel Begunkov311ae9e2019-11-24 11:58:24 +03002233 if (iov_iter_is_bvec(iter))
2234 kunmap(iter->bvec->bv_page);
2235
Jens Axboe32960612019-09-23 11:05:34 -06002236 if (nr < 0) {
2237 if (!ret)
2238 ret = nr;
2239 break;
2240 }
2241 ret += nr;
2242 if (nr != iovec.iov_len)
2243 break;
2244 iov_iter_advance(iter, nr);
2245 }
2246
2247 return ret;
2248}
2249
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002250static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
Jens Axboef67676d2019-12-02 11:03:47 -07002251 struct iovec *iovec, struct iovec *fast_iov,
2252 struct iov_iter *iter)
2253{
2254 req->io->rw.nr_segs = iter->nr_segs;
2255 req->io->rw.size = io_size;
2256 req->io->rw.iov = iovec;
2257 if (!req->io->rw.iov) {
2258 req->io->rw.iov = req->io->rw.fast_iov;
2259 memcpy(req->io->rw.iov, fast_iov,
2260 sizeof(struct iovec) * iter->nr_segs);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03002261 } else {
2262 req->flags |= REQ_F_NEED_CLEANUP;
Jens Axboef67676d2019-12-02 11:03:47 -07002263 }
2264}
2265
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002266static int io_alloc_async_ctx(struct io_kiocb *req)
Jens Axboef67676d2019-12-02 11:03:47 -07002267{
Jens Axboed3656342019-12-18 09:50:26 -07002268 if (!io_op_defs[req->opcode].async_ctx)
2269 return 0;
Jens Axboef67676d2019-12-02 11:03:47 -07002270 req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
Jens Axboe06b76d42019-12-19 14:44:26 -07002271 return req->io == NULL;
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002272}
2273
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002274static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2275 struct iovec *iovec, struct iovec *fast_iov,
2276 struct iov_iter *iter)
2277{
Jens Axboe980ad262020-01-24 23:08:54 -07002278 if (!io_op_defs[req->opcode].async_ctx)
Jens Axboe74566df2020-01-13 19:23:24 -07002279 return 0;
Jens Axboe5d204bc2020-01-31 12:06:52 -07002280 if (!req->io) {
2281 if (io_alloc_async_ctx(req))
2282 return -ENOMEM;
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002283
Jens Axboe5d204bc2020-01-31 12:06:52 -07002284 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2285 }
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002286 return 0;
Jens Axboef67676d2019-12-02 11:03:47 -07002287}
2288
Jens Axboe3529d8c2019-12-19 18:24:38 -07002289static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2290 bool force_nonblock)
Jens Axboef67676d2019-12-02 11:03:47 -07002291{
Jens Axboe3529d8c2019-12-19 18:24:38 -07002292 struct io_async_ctx *io;
2293 struct iov_iter iter;
Jens Axboef67676d2019-12-02 11:03:47 -07002294 ssize_t ret;
2295
Jens Axboe3529d8c2019-12-19 18:24:38 -07002296 ret = io_prep_rw(req, sqe, force_nonblock);
2297 if (ret)
2298 return ret;
Jens Axboef67676d2019-12-02 11:03:47 -07002299
Jens Axboe3529d8c2019-12-19 18:24:38 -07002300 if (unlikely(!(req->file->f_mode & FMODE_READ)))
2301 return -EBADF;
Jens Axboef67676d2019-12-02 11:03:47 -07002302
Pavel Begunkov5f798be2020-02-08 13:28:02 +03002303 /* either don't need iovec imported or already have it */
2304 if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
Jens Axboe3529d8c2019-12-19 18:24:38 -07002305 return 0;
2306
2307 io = req->io;
2308 io->rw.iov = io->rw.fast_iov;
2309 req->io = NULL;
2310 ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
2311 req->io = io;
2312 if (ret < 0)
2313 return ret;
2314
2315 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2316 return 0;
Jens Axboef67676d2019-12-02 11:03:47 -07002317}
2318
Pavel Begunkov014db002020-03-03 21:33:12 +03002319static int io_read(struct io_kiocb *req, bool force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002320{
2321 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
Jens Axboe9adbd452019-12-20 08:45:55 -07002322 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002323 struct iov_iter iter;
Jens Axboe31b51512019-01-18 22:56:34 -07002324 size_t iov_count;
Jens Axboef67676d2019-12-02 11:03:47 -07002325 ssize_t io_size, ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002326
Jens Axboe3529d8c2019-12-19 18:24:38 -07002327 ret = io_import_iovec(READ, req, &iovec, &iter);
Jens Axboe06b76d42019-12-19 14:44:26 -07002328 if (ret < 0)
2329 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002330
Jens Axboefd6c2e42019-12-18 12:19:41 -07002331 /* Ensure we clear previously set non-block flag */
2332 if (!force_nonblock)
Jens Axboe29de5f62020-02-20 09:56:08 -07002333 kiocb->ki_flags &= ~IOCB_NOWAIT;
Jens Axboefd6c2e42019-12-18 12:19:41 -07002334
Bijan Mottahedeh797f3f52020-01-15 18:37:45 -08002335 req->result = 0;
Jens Axboef67676d2019-12-02 11:03:47 -07002336 io_size = ret;
Jens Axboe9e645e112019-05-10 16:07:28 -06002337 if (req->flags & REQ_F_LINK)
Jens Axboef67676d2019-12-02 11:03:47 -07002338 req->result = io_size;
2339
2340 /*
2341 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2342 * we know to async punt it even if it was opened O_NONBLOCK
2343 */
Jens Axboe29de5f62020-02-20 09:56:08 -07002344 if (force_nonblock && !io_file_supports_async(req->file))
Jens Axboef67676d2019-12-02 11:03:47 -07002345 goto copy_iov;
Jens Axboe9e645e112019-05-10 16:07:28 -06002346
Jens Axboe31b51512019-01-18 22:56:34 -07002347 iov_count = iov_iter_count(&iter);
Jens Axboe9adbd452019-12-20 08:45:55 -07002348 ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002349 if (!ret) {
2350 ssize_t ret2;
2351
Jens Axboe9adbd452019-12-20 08:45:55 -07002352 if (req->file->f_op->read_iter)
2353 ret2 = call_read_iter(req->file, kiocb, &iter);
Jens Axboe32960612019-09-23 11:05:34 -06002354 else
Jens Axboe9adbd452019-12-20 08:45:55 -07002355 ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
Jens Axboe32960612019-09-23 11:05:34 -06002356
Jens Axboe9d93a3f2019-05-15 13:53:07 -06002357 /* Catch -EAGAIN return for forced non-blocking submission */
Jens Axboef67676d2019-12-02 11:03:47 -07002358 if (!force_nonblock || ret2 != -EAGAIN) {
Pavel Begunkov014db002020-03-03 21:33:12 +03002359 kiocb_done(kiocb, ret2);
Jens Axboef67676d2019-12-02 11:03:47 -07002360 } else {
2361copy_iov:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002362 ret = io_setup_async_rw(req, io_size, iovec,
Jens Axboef67676d2019-12-02 11:03:47 -07002363 inline_vecs, &iter);
2364 if (ret)
2365 goto out_free;
Jens Axboe29de5f62020-02-20 09:56:08 -07002366		/* any deferral here is final; we must do a blocking retry */
2367 if (!(req->flags & REQ_F_NOWAIT))
2368 req->flags |= REQ_F_MUST_PUNT;
Jens Axboef67676d2019-12-02 11:03:47 -07002369 return -EAGAIN;
2370 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07002371 }
Jens Axboef67676d2019-12-02 11:03:47 -07002372out_free:
Pavel Begunkov1e950812020-02-06 19:51:16 +03002373 kfree(iovec);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03002374 req->flags &= ~REQ_F_NEED_CLEANUP;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002375 return ret;
2376}
2377
Jens Axboe3529d8c2019-12-19 18:24:38 -07002378static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2379 bool force_nonblock)
Jens Axboef67676d2019-12-02 11:03:47 -07002380{
Jens Axboe3529d8c2019-12-19 18:24:38 -07002381 struct io_async_ctx *io;
2382 struct iov_iter iter;
Jens Axboef67676d2019-12-02 11:03:47 -07002383 ssize_t ret;
2384
Jens Axboe3529d8c2019-12-19 18:24:38 -07002385 ret = io_prep_rw(req, sqe, force_nonblock);
2386 if (ret)
2387 return ret;
Jens Axboef67676d2019-12-02 11:03:47 -07002388
Jens Axboe3529d8c2019-12-19 18:24:38 -07002389 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2390 return -EBADF;
Jens Axboef67676d2019-12-02 11:03:47 -07002391
Pavel Begunkov5f798be2020-02-08 13:28:02 +03002392 /* either don't need iovec imported or already have it */
2393 if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
Jens Axboe3529d8c2019-12-19 18:24:38 -07002394 return 0;
2395
2396 io = req->io;
2397 io->rw.iov = io->rw.fast_iov;
2398 req->io = NULL;
2399 ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
2400 req->io = io;
2401 if (ret < 0)
2402 return ret;
2403
2404 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2405 return 0;
Jens Axboef67676d2019-12-02 11:03:47 -07002406}
2407
Pavel Begunkov014db002020-03-03 21:33:12 +03002408static int io_write(struct io_kiocb *req, bool force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002409{
2410 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
Jens Axboe9adbd452019-12-20 08:45:55 -07002411 struct kiocb *kiocb = &req->rw.kiocb;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002412 struct iov_iter iter;
Jens Axboe31b51512019-01-18 22:56:34 -07002413 size_t iov_count;
Jens Axboef67676d2019-12-02 11:03:47 -07002414 ssize_t ret, io_size;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002415
Jens Axboe3529d8c2019-12-19 18:24:38 -07002416 ret = io_import_iovec(WRITE, req, &iovec, &iter);
Jens Axboe06b76d42019-12-19 14:44:26 -07002417 if (ret < 0)
2418 return ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002419
Jens Axboefd6c2e42019-12-18 12:19:41 -07002420 /* Ensure we clear previously set non-block flag */
2421 if (!force_nonblock)
Jens Axboe9adbd452019-12-20 08:45:55 -07002422 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
Jens Axboefd6c2e42019-12-18 12:19:41 -07002423
Bijan Mottahedeh797f3f52020-01-15 18:37:45 -08002424 req->result = 0;
Jens Axboef67676d2019-12-02 11:03:47 -07002425 io_size = ret;
Jens Axboe9e645e112019-05-10 16:07:28 -06002426 if (req->flags & REQ_F_LINK)
Jens Axboef67676d2019-12-02 11:03:47 -07002427 req->result = io_size;
2428
2429 /*
2430 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2431 * we know to async punt it even if it was opened O_NONBLOCK
2432 */
Jens Axboe29de5f62020-02-20 09:56:08 -07002433 if (force_nonblock && !io_file_supports_async(req->file))
Jens Axboef67676d2019-12-02 11:03:47 -07002434 goto copy_iov;
Jens Axboef67676d2019-12-02 11:03:47 -07002435
Jens Axboe10d59342019-12-09 20:16:22 -07002436 /* file path doesn't support NOWAIT for non-direct_IO */
2437 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
2438 (req->flags & REQ_F_ISREG))
Jens Axboef67676d2019-12-02 11:03:47 -07002439 goto copy_iov;
Jens Axboe9e645e112019-05-10 16:07:28 -06002440
Jens Axboe31b51512019-01-18 22:56:34 -07002441 iov_count = iov_iter_count(&iter);
Jens Axboe9adbd452019-12-20 08:45:55 -07002442 ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002443 if (!ret) {
Roman Penyaev9bf79332019-03-25 20:09:24 +01002444 ssize_t ret2;
2445
Jens Axboe2b188cc2019-01-07 10:46:33 -07002446 /*
2447 * Open-code file_start_write here to grab freeze protection,
2448 * which will be released by another thread in
2449 * io_complete_rw(). Fool lockdep by telling it the lock got
2450 * released so that it doesn't complain about the held lock when
2451 * we return to userspace.
2452 */
Jens Axboe491381ce2019-10-17 09:20:46 -06002453 if (req->flags & REQ_F_ISREG) {
Jens Axboe9adbd452019-12-20 08:45:55 -07002454 __sb_start_write(file_inode(req->file)->i_sb,
Jens Axboe2b188cc2019-01-07 10:46:33 -07002455 SB_FREEZE_WRITE, true);
Jens Axboe9adbd452019-12-20 08:45:55 -07002456 __sb_writers_release(file_inode(req->file)->i_sb,
Jens Axboe2b188cc2019-01-07 10:46:33 -07002457 SB_FREEZE_WRITE);
2458 }
2459 kiocb->ki_flags |= IOCB_WRITE;
Roman Penyaev9bf79332019-03-25 20:09:24 +01002460
Jens Axboe9adbd452019-12-20 08:45:55 -07002461 if (req->file->f_op->write_iter)
2462 ret2 = call_write_iter(req->file, kiocb, &iter);
Jens Axboe32960612019-09-23 11:05:34 -06002463 else
Jens Axboe9adbd452019-12-20 08:45:55 -07002464 ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
Jens Axboefaac9962020-02-07 15:45:22 -07002465 /*
2466		 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
2467 * retry them without IOCB_NOWAIT.
2468 */
2469 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
2470 ret2 = -EAGAIN;
Jens Axboef67676d2019-12-02 11:03:47 -07002471 if (!force_nonblock || ret2 != -EAGAIN) {
Pavel Begunkov014db002020-03-03 21:33:12 +03002472 kiocb_done(kiocb, ret2);
Jens Axboef67676d2019-12-02 11:03:47 -07002473 } else {
2474copy_iov:
Jens Axboeb7bb4f72019-12-15 22:13:43 -07002475 ret = io_setup_async_rw(req, io_size, iovec,
Jens Axboef67676d2019-12-02 11:03:47 -07002476 inline_vecs, &iter);
2477 if (ret)
2478 goto out_free;
Jens Axboe29de5f62020-02-20 09:56:08 -07002479		/* any deferral here is final; we must do a blocking retry */
2480 req->flags |= REQ_F_MUST_PUNT;
Jens Axboef67676d2019-12-02 11:03:47 -07002481 return -EAGAIN;
2482 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07002483 }
Jens Axboe31b51512019-01-18 22:56:34 -07002484out_free:
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03002485 req->flags &= ~REQ_F_NEED_CLEANUP;
Pavel Begunkov1e950812020-02-06 19:51:16 +03002486 kfree(iovec);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002487 return ret;
2488}
2489
Pavel Begunkov7d67af22020-02-24 11:32:45 +03002490static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2491{
2492	struct io_splice *sp = &req->splice;
2493 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
2494 int ret;
2495
2496 if (req->flags & REQ_F_NEED_CLEANUP)
2497 return 0;
2498
2499 sp->file_in = NULL;
2500 sp->off_in = READ_ONCE(sqe->splice_off_in);
2501 sp->off_out = READ_ONCE(sqe->off);
2502 sp->len = READ_ONCE(sqe->len);
2503 sp->flags = READ_ONCE(sqe->splice_flags);
2504
2505 if (unlikely(sp->flags & ~valid_flags))
2506 return -EINVAL;
2507
2508 ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
2509 (sp->flags & SPLICE_F_FD_IN_FIXED));
2510 if (ret)
2511 return ret;
2512 req->flags |= REQ_F_NEED_CLEANUP;
2513
2514 if (!S_ISREG(file_inode(sp->file_in)->i_mode))
2515 req->work.flags |= IO_WQ_WORK_UNBOUND;
2516
2517 return 0;
2518}
2519
2520static bool io_splice_punt(struct file *file)
2521{
2522 if (get_pipe_info(file))
2523 return false;
2524 if (!io_file_supports_async(file))
2525 return true;
2526	return !(file->f_flags & O_NONBLOCK);
2527}
2528
Pavel Begunkov014db002020-03-03 21:33:12 +03002529static int io_splice(struct io_kiocb *req, bool force_nonblock)
Pavel Begunkov7d67af22020-02-24 11:32:45 +03002530{
2531 struct io_splice *sp = &req->splice;
2532 struct file *in = sp->file_in;
2533 struct file *out = sp->file_out;
2534 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
2535 loff_t *poff_in, *poff_out;
2536 long ret;
2537
2538 if (force_nonblock) {
2539 if (io_splice_punt(in) || io_splice_punt(out))
2540 return -EAGAIN;
2541 flags |= SPLICE_F_NONBLOCK;
2542 }
2543
2544 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
2545 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
2546 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
2547 if (force_nonblock && ret == -EAGAIN)
2548 return -EAGAIN;
2549
2550 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
2551 req->flags &= ~REQ_F_NEED_CLEANUP;
2552
2553 io_cqring_add_event(req, ret);
2554 if (ret != sp->len)
2555 req_set_fail_links(req);
Pavel Begunkov014db002020-03-03 21:33:12 +03002556 io_put_req(req);
Pavel Begunkov7d67af22020-02-24 11:32:45 +03002557 return 0;
2558}
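
/*
 * A rough sketch of driving IORING_OP_SPLICE from userspace, assuming a
 * liburing recent enough to provide the splice helper (fds are
 * placeholders; the pipe side must pass -1 for its offset):
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_splice(sqe, pipe_fd, -1, out_fd, -1, 4096, 0);
 *	io_uring_submit(&ring);
 */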
2559
Jens Axboe2b188cc2019-01-07 10:46:33 -07002560/*
2561 * IORING_OP_NOP just posts a completion event, nothing else.
2562 */
Jens Axboe78e19bb2019-11-06 15:21:34 -07002563static int io_nop(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002564{
2565 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002566
Jens Axboedef596e2019-01-09 08:59:42 -07002567 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2568 return -EINVAL;
2569
Jens Axboe78e19bb2019-11-06 15:21:34 -07002570 io_cqring_add_event(req, 0);
Jens Axboee65ef562019-03-12 10:16:44 -06002571 io_put_req(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002572 return 0;
2573}
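
/*
 * The userspace counterpart is handy as a ring liveness check, assuming
 * liburing (the ring is a placeholder):
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_nop(sqe);
 *	io_uring_submit(&ring);		(completes with res == 0)
 */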
2574
Jens Axboe3529d8c2019-12-19 18:24:38 -07002575static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002576{
Jens Axboe6b063142019-01-10 22:13:58 -07002577 struct io_ring_ctx *ctx = req->ctx;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002578
Jens Axboe09bb8392019-03-13 12:39:28 -06002579 if (!req->file)
2580 return -EBADF;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002581
Jens Axboe6b063142019-01-10 22:13:58 -07002582 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboedef596e2019-01-09 08:59:42 -07002583 return -EINVAL;
Jens Axboeedafcce2019-01-09 09:16:05 -07002584 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002585 return -EINVAL;
2586
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002587 req->sync.flags = READ_ONCE(sqe->fsync_flags);
2588 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2589 return -EINVAL;
2590
2591 req->sync.off = READ_ONCE(sqe->off);
2592 req->sync.len = READ_ONCE(sqe->len);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002593 return 0;
2594}
2595
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002596static bool io_req_cancelled(struct io_kiocb *req)
2597{
2598 if (req->work.flags & IO_WQ_WORK_CANCEL) {
2599 req_set_fail_links(req);
2600 io_cqring_add_event(req, -ECANCELED);
Pavel Begunkove9fd9392020-03-04 16:14:12 +03002601 io_put_req(req);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002602 return true;
2603 }
2604
2605 return false;
2606}
2607
Pavel Begunkov014db002020-03-03 21:33:12 +03002608static void __io_fsync(struct io_kiocb *req)
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002609{
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002610 loff_t end = req->sync.off + req->sync.len;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002611 int ret;
2612
Jens Axboe9adbd452019-12-20 08:45:55 -07002613 ret = vfs_fsync_range(req->file, req->sync.off,
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002614 end > 0 ? end : LLONG_MAX,
2615 req->sync.flags & IORING_FSYNC_DATASYNC);
2616 if (ret < 0)
2617 req_set_fail_links(req);
2618 io_cqring_add_event(req, ret);
Pavel Begunkov014db002020-03-03 21:33:12 +03002619 io_put_req(req);
Pavel Begunkov5ea62162020-02-24 11:30:16 +03002620}
2621
2622static void io_fsync_finish(struct io_wq_work **workptr)
2623{
2624 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
Pavel Begunkov5ea62162020-02-24 11:30:16 +03002625
2626 if (io_req_cancelled(req))
2627 return;
Pavel Begunkov014db002020-03-03 21:33:12 +03002628 __io_fsync(req);
Pavel Begunkove9fd9392020-03-04 16:14:12 +03002629 io_steal_work(req, workptr);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002630}
2631
Pavel Begunkov014db002020-03-03 21:33:12 +03002632static int io_fsync(struct io_kiocb *req, bool force_nonblock)
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002633{
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002634 /* fsync always requires a blocking context */
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002635 if (force_nonblock) {
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002636 req->work.func = io_fsync_finish;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002637 return -EAGAIN;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07002638 }
Pavel Begunkov014db002020-03-03 21:33:12 +03002639 __io_fsync(req);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07002640 return 0;
2641}
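
/*
 * Userspace sketch (assumes liburing's io_uring_prep_fsync() and a
 * hypothetical descriptor 'fd'): an fdatasync-like request.
 *
 *	io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
 *
 * io_prep_fsync() above rejects any flag other than
 * IORING_FSYNC_DATASYNC, and io_fsync() always punts to a blocking
 * context when issued with force_nonblock set.
 */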
2642
Pavel Begunkov014db002020-03-03 21:33:12 +03002643static void __io_fallocate(struct io_kiocb *req)
Jens Axboed63d1b52019-12-10 10:38:56 -07002644{
Jens Axboed63d1b52019-12-10 10:38:56 -07002645 int ret;
2646
2647 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2648 req->sync.len);
2649 if (ret < 0)
2650 req_set_fail_links(req);
2651 io_cqring_add_event(req, ret);
Pavel Begunkov014db002020-03-03 21:33:12 +03002652 io_put_req(req);
Pavel Begunkov5ea62162020-02-24 11:30:16 +03002653}
2654
2655static void io_fallocate_finish(struct io_wq_work **workptr)
2656{
2657 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
Pavel Begunkov5ea62162020-02-24 11:30:16 +03002658
Pavel Begunkov594506f2020-03-03 21:33:11 +03002659 if (io_req_cancelled(req))
2660 return;
Pavel Begunkov014db002020-03-03 21:33:12 +03002661 __io_fallocate(req);
Pavel Begunkove9fd9392020-03-04 16:14:12 +03002662 io_steal_work(req, workptr);
Jens Axboed63d1b52019-12-10 10:38:56 -07002663}
2664
2665static int io_fallocate_prep(struct io_kiocb *req,
2666 const struct io_uring_sqe *sqe)
2667{
2668 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2669 return -EINVAL;
2670
2671 req->sync.off = READ_ONCE(sqe->off);
2672 req->sync.len = READ_ONCE(sqe->addr);
2673 req->sync.mode = READ_ONCE(sqe->len);
2674 return 0;
2675}
2676
Pavel Begunkov014db002020-03-03 21:33:12 +03002677static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
Jens Axboed63d1b52019-12-10 10:38:56 -07002678{
Jens Axboed63d1b52019-12-10 10:38:56 -07002679	/* fallocate always requires a blocking context */
2680 if (force_nonblock) {
Jens Axboed63d1b52019-12-10 10:38:56 -07002681 req->work.func = io_fallocate_finish;
2682 return -EAGAIN;
2683 }
2684
Pavel Begunkov014db002020-03-03 21:33:12 +03002685 __io_fallocate(req);
Jens Axboed63d1b52019-12-10 10:38:56 -07002686 return 0;
2687}
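
/*
 * SQE field mapping sketch for IORING_OP_FALLOCATE, matching the
 * decoding in io_fallocate_prep() above ('fd', 'mode', 'offset' and
 * 'nbytes' are hypothetical userspace values):
 *
 *	sqe->opcode = IORING_OP_FALLOCATE;
 *	sqe->fd = fd;
 *	sqe->len = mode;
 *	sqe->off = offset;
 *	sqe->addr = nbytes;
 *
 * Note the unusual layout: the allocation mode travels in ->len and the
 * length in ->addr, exactly as io_fallocate_prep() reads them back.
 */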
2688
Jens Axboe15b71ab2019-12-11 11:20:36 -07002689static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2690{
Jens Axboef8748882020-01-08 17:47:02 -07002691 const char __user *fname;
Jens Axboe15b71ab2019-12-11 11:20:36 -07002692 int ret;
2693
2694 if (sqe->ioprio || sqe->buf_index)
2695 return -EINVAL;
Jens Axboecf3040c2020-02-06 21:31:40 -07002696 if (sqe->flags & IOSQE_FIXED_FILE)
2697 return -EBADF;
Pavel Begunkov0bdbdd02020-02-08 13:28:03 +03002698 if (req->flags & REQ_F_NEED_CLEANUP)
2699 return 0;
Jens Axboe15b71ab2019-12-11 11:20:36 -07002700
2701 req->open.dfd = READ_ONCE(sqe->fd);
Jens Axboec12cedf2020-01-08 17:41:21 -07002702 req->open.how.mode = READ_ONCE(sqe->len);
Jens Axboef8748882020-01-08 17:47:02 -07002703 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboec12cedf2020-01-08 17:41:21 -07002704 req->open.how.flags = READ_ONCE(sqe->open_flags);
Jens Axboe15b71ab2019-12-11 11:20:36 -07002705
Jens Axboef8748882020-01-08 17:47:02 -07002706 req->open.filename = getname(fname);
Jens Axboe15b71ab2019-12-11 11:20:36 -07002707 if (IS_ERR(req->open.filename)) {
2708 ret = PTR_ERR(req->open.filename);
2709 req->open.filename = NULL;
2710 return ret;
2711 }
2712
Pavel Begunkov8fef80b2020-02-07 23:59:53 +03002713 req->flags |= REQ_F_NEED_CLEANUP;
Jens Axboe15b71ab2019-12-11 11:20:36 -07002714 return 0;
2715}
2716
Jens Axboecebdb982020-01-08 17:59:24 -07002717static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2718{
2719 struct open_how __user *how;
2720 const char __user *fname;
2721 size_t len;
2722 int ret;
2723
2724 if (sqe->ioprio || sqe->buf_index)
2725 return -EINVAL;
Jens Axboecf3040c2020-02-06 21:31:40 -07002726 if (sqe->flags & IOSQE_FIXED_FILE)
2727 return -EBADF;
Pavel Begunkov0bdbdd02020-02-08 13:28:03 +03002728 if (req->flags & REQ_F_NEED_CLEANUP)
2729 return 0;
Jens Axboecebdb982020-01-08 17:59:24 -07002730
2731 req->open.dfd = READ_ONCE(sqe->fd);
2732 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2733 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2734 len = READ_ONCE(sqe->len);
2735
2736 if (len < OPEN_HOW_SIZE_VER0)
2737 return -EINVAL;
2738
2739 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
2740 len);
2741 if (ret)
2742 return ret;
2743
2744 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
2745 req->open.how.flags |= O_LARGEFILE;
2746
2747 req->open.filename = getname(fname);
2748 if (IS_ERR(req->open.filename)) {
2749 ret = PTR_ERR(req->open.filename);
2750 req->open.filename = NULL;
2751 return ret;
2752 }
2753
Pavel Begunkov8fef80b2020-02-07 23:59:53 +03002754 req->flags |= REQ_F_NEED_CLEANUP;
Jens Axboecebdb982020-01-08 17:59:24 -07002755 return 0;
2756}
2757
Pavel Begunkov014db002020-03-03 21:33:12 +03002758static int io_openat2(struct io_kiocb *req, bool force_nonblock)
Jens Axboe15b71ab2019-12-11 11:20:36 -07002759{
2760 struct open_flags op;
Jens Axboe15b71ab2019-12-11 11:20:36 -07002761 struct file *file;
2762 int ret;
2763
Jens Axboef86cd202020-01-29 13:46:44 -07002764 if (force_nonblock)
Jens Axboe15b71ab2019-12-11 11:20:36 -07002765 return -EAGAIN;
Jens Axboe15b71ab2019-12-11 11:20:36 -07002766
Jens Axboecebdb982020-01-08 17:59:24 -07002767 ret = build_open_flags(&req->open.how, &op);
Jens Axboe15b71ab2019-12-11 11:20:36 -07002768 if (ret)
2769 goto err;
2770
Jens Axboecebdb982020-01-08 17:59:24 -07002771 ret = get_unused_fd_flags(req->open.how.flags);
Jens Axboe15b71ab2019-12-11 11:20:36 -07002772 if (ret < 0)
2773 goto err;
2774
2775 file = do_filp_open(req->open.dfd, req->open.filename, &op);
2776 if (IS_ERR(file)) {
2777 put_unused_fd(ret);
2778 ret = PTR_ERR(file);
2779 } else {
2780 fsnotify_open(file);
2781 fd_install(ret, file);
2782 }
2783err:
2784 putname(req->open.filename);
Pavel Begunkov8fef80b2020-02-07 23:59:53 +03002785 req->flags &= ~REQ_F_NEED_CLEANUP;
Jens Axboe15b71ab2019-12-11 11:20:36 -07002786 if (ret < 0)
2787 req_set_fail_links(req);
2788 io_cqring_add_event(req, ret);
Pavel Begunkov014db002020-03-03 21:33:12 +03002789 io_put_req(req);
Jens Axboe15b71ab2019-12-11 11:20:36 -07002790 return 0;
2791}
2792
Pavel Begunkov014db002020-03-03 21:33:12 +03002793static int io_openat(struct io_kiocb *req, bool force_nonblock)
Jens Axboecebdb982020-01-08 17:59:24 -07002794{
2795 req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
Pavel Begunkov014db002020-03-03 21:33:12 +03002796 return io_openat2(req, force_nonblock);
Jens Axboecebdb982020-01-08 17:59:24 -07002797}
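
/*
 * Userspace sketch (assumes liburing's io_uring_prep_openat2(); 'dfd',
 * the path and 'how' are hypothetical):
 *
 *	struct open_how how = { .flags = O_RDONLY | O_CLOEXEC };
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_openat2(sqe, dfd, "some/path", &how);
 *	io_uring_submit(&ring);
 *
 * sqe->len carries the size of the open_how struct the application
 * used; io_openat2_prep() checks it against OPEN_HOW_SIZE_VER0 before
 * copy_struct_from_user(). Plain IORING_OP_OPENAT takes flags/mode
 * directly, and io_openat() converts them with build_open_how() before
 * reusing io_openat2().
 */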
2798
Jens Axboeddf0322d2020-02-23 16:41:33 -07002799static int io_provide_buffers_prep(struct io_kiocb *req,
2800 const struct io_uring_sqe *sqe)
2801{
2802 struct io_provide_buf *p = &req->pbuf;
2803 u64 tmp;
2804
2805 if (sqe->ioprio || sqe->rw_flags)
2806 return -EINVAL;
2807
2808 tmp = READ_ONCE(sqe->fd);
2809 if (!tmp || tmp > USHRT_MAX)
2810 return -E2BIG;
2811 p->nbufs = tmp;
2812 p->addr = READ_ONCE(sqe->addr);
2813 p->len = READ_ONCE(sqe->len);
2814
2815 if (!access_ok(u64_to_user_ptr(p->addr), p->len))
2816 return -EFAULT;
2817
2818 p->bgid = READ_ONCE(sqe->buf_group);
2819 tmp = READ_ONCE(sqe->off);
2820 if (tmp > USHRT_MAX)
2821 return -E2BIG;
2822 p->bid = tmp;
2823 return 0;
2824}
2825
2826static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
2827{
2828 struct io_buffer *buf;
2829 u64 addr = pbuf->addr;
2830 int i, bid = pbuf->bid;
2831
2832 for (i = 0; i < pbuf->nbufs; i++) {
2833 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
2834 if (!buf)
2835 break;
2836
2837 buf->addr = addr;
2838 buf->len = pbuf->len;
2839 buf->bid = bid;
2840 addr += pbuf->len;
2841 bid++;
2842 if (!*head) {
2843 INIT_LIST_HEAD(&buf->list);
2844 *head = buf;
2845 } else {
2846 list_add_tail(&buf->list, &(*head)->list);
2847 }
2848 }
2849
2850 return i ? i : -ENOMEM;
2851}
2852
2853static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2854{
2855 if (needs_lock)
2856 mutex_unlock(&ctx->uring_lock);
2857}
2858
2859static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2860{
2861 /*
2862 * "Normal" inline submissions always hold the uring_lock, since we
2863 * grab it from the system call. Same is true for the SQPOLL offload.
2864 * The only exception is when we've detached the request and issue it
2865	 * from an async worker thread; grab the lock in that case.
2866 */
2867 if (needs_lock)
2868 mutex_lock(&ctx->uring_lock);
2869}
2870
2871static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
2872{
2873 struct io_provide_buf *p = &req->pbuf;
2874 struct io_ring_ctx *ctx = req->ctx;
2875 struct io_buffer *head, *list;
2876 int ret = 0;
2877
2878 io_ring_submit_lock(ctx, !force_nonblock);
2879
2880 lockdep_assert_held(&ctx->uring_lock);
2881
2882 list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
2883
2884 ret = io_add_buffers(p, &head);
2885 if (ret < 0)
2886 goto out;
2887
2888 if (!list) {
2889 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
2890 GFP_KERNEL);
2891 if (ret < 0) {
2892 while (!list_empty(&head->list)) {
2893 struct io_buffer *buf;
2894
2895 buf = list_first_entry(&head->list,
2896 struct io_buffer, list);
2897 list_del(&buf->list);
2898 kfree(buf);
2899 }
2900 kfree(head);
2901 goto out;
2902 }
2903 }
2904out:
2905 io_ring_submit_unlock(ctx, !force_nonblock);
2906 if (ret < 0)
2907 req_set_fail_links(req);
2908 io_cqring_add_event(req, ret);
2909 io_put_req(req);
2910 return 0;
2911}
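
/*
 * SQE field mapping sketch for IORING_OP_PROVIDE_BUFFERS, matching
 * io_provide_buffers_prep() above ('base', 'nr_bufs', 'buf_len',
 * 'group_id' and 'start_bid' are hypothetical userspace values):
 *
 *	sqe->opcode = IORING_OP_PROVIDE_BUFFERS;
 *	sqe->fd = nr_bufs;
 *	sqe->addr = (unsigned long) base;
 *	sqe->len = buf_len;
 *	sqe->buf_group = group_id;
 *	sqe->off = start_bid;
 *
 * 'base' points at nr_bufs contiguous buffers of buf_len bytes each;
 * both nr_bufs and start_bid must fit in 16 bits (USHRT_MAX).
 */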
2912
Jens Axboe3e4827b2020-01-08 15:18:09 -07002913static int io_epoll_ctl_prep(struct io_kiocb *req,
2914 const struct io_uring_sqe *sqe)
2915{
2916#if defined(CONFIG_EPOLL)
2917 if (sqe->ioprio || sqe->buf_index)
2918 return -EINVAL;
2919
2920 req->epoll.epfd = READ_ONCE(sqe->fd);
2921 req->epoll.op = READ_ONCE(sqe->len);
2922 req->epoll.fd = READ_ONCE(sqe->off);
2923
2924 if (ep_op_has_event(req->epoll.op)) {
2925 struct epoll_event __user *ev;
2926
2927 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
2928 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
2929 return -EFAULT;
2930 }
2931
2932 return 0;
2933#else
2934 return -EOPNOTSUPP;
2935#endif
2936}
2937
Pavel Begunkov014db002020-03-03 21:33:12 +03002938static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
Jens Axboe3e4827b2020-01-08 15:18:09 -07002939{
2940#if defined(CONFIG_EPOLL)
2941 struct io_epoll *ie = &req->epoll;
2942 int ret;
2943
2944 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
2945 if (force_nonblock && ret == -EAGAIN)
2946 return -EAGAIN;
2947
2948 if (ret < 0)
2949 req_set_fail_links(req);
2950 io_cqring_add_event(req, ret);
Pavel Begunkov014db002020-03-03 21:33:12 +03002951 io_put_req(req);
Jens Axboe3e4827b2020-01-08 15:18:09 -07002952 return 0;
2953#else
2954 return -EOPNOTSUPP;
2955#endif
2956}
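
/*
 * SQE field mapping sketch for IORING_OP_EPOLL_CTL, matching
 * io_epoll_ctl_prep() above ('epfd', 'fd' and 'ev' are hypothetical):
 *
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };
 *
 *	sqe->opcode = IORING_OP_EPOLL_CTL;
 *	sqe->fd = epfd;
 *	sqe->len = EPOLL_CTL_ADD;
 *	sqe->off = fd;
 *	sqe->addr = (unsigned long) &ev;
 *
 * io_epoll_ctl() tries do_epoll_ctl() non-blocking first and only punts
 * to the async context if that returns -EAGAIN.
 */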
2957
Jens Axboec1ca7572019-12-25 22:18:28 -07002958static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2959{
2960#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2961 if (sqe->ioprio || sqe->buf_index || sqe->off)
2962 return -EINVAL;
2963
2964 req->madvise.addr = READ_ONCE(sqe->addr);
2965 req->madvise.len = READ_ONCE(sqe->len);
2966 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
2967 return 0;
2968#else
2969 return -EOPNOTSUPP;
2970#endif
2971}
2972
Pavel Begunkov014db002020-03-03 21:33:12 +03002973static int io_madvise(struct io_kiocb *req, bool force_nonblock)
Jens Axboec1ca7572019-12-25 22:18:28 -07002974{
2975#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
2976 struct io_madvise *ma = &req->madvise;
2977 int ret;
2978
2979 if (force_nonblock)
2980 return -EAGAIN;
2981
2982 ret = do_madvise(ma->addr, ma->len, ma->advice);
2983 if (ret < 0)
2984 req_set_fail_links(req);
2985 io_cqring_add_event(req, ret);
Pavel Begunkov014db002020-03-03 21:33:12 +03002986 io_put_req(req);
Jens Axboec1ca7572019-12-25 22:18:28 -07002987 return 0;
2988#else
2989 return -EOPNOTSUPP;
2990#endif
2991}
2992
Jens Axboe4840e412019-12-25 22:03:45 -07002993static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2994{
2995 if (sqe->ioprio || sqe->buf_index || sqe->addr)
2996 return -EINVAL;
2997
2998 req->fadvise.offset = READ_ONCE(sqe->off);
2999 req->fadvise.len = READ_ONCE(sqe->len);
3000 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3001 return 0;
3002}
3003
Pavel Begunkov014db002020-03-03 21:33:12 +03003004static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
Jens Axboe4840e412019-12-25 22:03:45 -07003005{
3006 struct io_fadvise *fa = &req->fadvise;
3007 int ret;
3008
Jens Axboe3e694262020-02-01 09:22:49 -07003009 if (force_nonblock) {
3010 switch (fa->advice) {
3011 case POSIX_FADV_NORMAL:
3012 case POSIX_FADV_RANDOM:
3013 case POSIX_FADV_SEQUENTIAL:
3014 break;
3015 default:
3016 return -EAGAIN;
3017 }
3018 }
Jens Axboe4840e412019-12-25 22:03:45 -07003019
3020 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3021 if (ret < 0)
3022 req_set_fail_links(req);
3023 io_cqring_add_event(req, ret);
Pavel Begunkov014db002020-03-03 21:33:12 +03003024 io_put_req(req);
Jens Axboe4840e412019-12-25 22:03:45 -07003025 return 0;
3026}
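
/*
 * Userspace sketch (assumes liburing's io_uring_prep_fadvise() and
 * io_uring_prep_madvise() on two separate sqes; 'fd', 'buf' and 'len'
 * are hypothetical):
 *
 *	io_uring_prep_fadvise(sqe, fd, 0, len, POSIX_FADV_SEQUENTIAL);
 *	io_uring_prep_madvise(sqe2, buf, len, MADV_WILLNEED);
 *
 * Per io_fadvise() above, only POSIX_FADV_NORMAL/RANDOM/SEQUENTIAL are
 * served from the non-blocking path; other advice values, and every
 * madvise request, are punted to the async context.
 */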
3027
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003028static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3029{
Jens Axboef8748882020-01-08 17:47:02 -07003030 const char __user *fname;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003031 unsigned lookup_flags;
3032 int ret;
3033
3034 if (sqe->ioprio || sqe->buf_index)
3035 return -EINVAL;
Jens Axboecf3040c2020-02-06 21:31:40 -07003036 if (sqe->flags & IOSQE_FIXED_FILE)
3037 return -EBADF;
Pavel Begunkov0bdbdd02020-02-08 13:28:03 +03003038 if (req->flags & REQ_F_NEED_CLEANUP)
3039 return 0;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003040
3041 req->open.dfd = READ_ONCE(sqe->fd);
3042 req->open.mask = READ_ONCE(sqe->len);
Jens Axboef8748882020-01-08 17:47:02 -07003043 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003044 req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
Jens Axboec12cedf2020-01-08 17:41:21 -07003045 req->open.how.flags = READ_ONCE(sqe->statx_flags);
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003046
Jens Axboec12cedf2020-01-08 17:41:21 -07003047 if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003048 return -EINVAL;
3049
Jens Axboef8748882020-01-08 17:47:02 -07003050 req->open.filename = getname_flags(fname, lookup_flags, NULL);
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003051 if (IS_ERR(req->open.filename)) {
3052 ret = PTR_ERR(req->open.filename);
3053 req->open.filename = NULL;
3054 return ret;
3055 }
3056
Pavel Begunkov8fef80b2020-02-07 23:59:53 +03003057 req->flags |= REQ_F_NEED_CLEANUP;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003058 return 0;
3059}
3060
Pavel Begunkov014db002020-03-03 21:33:12 +03003061static int io_statx(struct io_kiocb *req, bool force_nonblock)
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003062{
3063 struct io_open *ctx = &req->open;
3064 unsigned lookup_flags;
3065 struct path path;
3066 struct kstat stat;
3067 int ret;
3068
3069 if (force_nonblock)
3070 return -EAGAIN;
3071
Jens Axboec12cedf2020-01-08 17:41:21 -07003072 if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003073 return -EINVAL;
3074
3075retry:
3076 /* filename_lookup() drops it, keep a reference */
3077 ctx->filename->refcnt++;
3078
3079 ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
3080 NULL);
3081 if (ret)
3082 goto err;
3083
Jens Axboec12cedf2020-01-08 17:41:21 -07003084 ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003085 path_put(&path);
3086 if (retry_estale(ret, lookup_flags)) {
3087 lookup_flags |= LOOKUP_REVAL;
3088 goto retry;
3089 }
3090 if (!ret)
3091 ret = cp_statx(&stat, ctx->buffer);
3092err:
3093 putname(ctx->filename);
Pavel Begunkov8fef80b2020-02-07 23:59:53 +03003094 req->flags &= ~REQ_F_NEED_CLEANUP;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003095 if (ret < 0)
3096 req_set_fail_links(req);
3097 io_cqring_add_event(req, ret);
Pavel Begunkov014db002020-03-03 21:33:12 +03003098 io_put_req(req);
Jens Axboeeddc7ef2019-12-13 21:18:10 -07003099 return 0;
3100}
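
/*
 * SQE field mapping sketch for IORING_OP_STATX, matching io_statx_prep()
 * above ('dfd', 'path' and 'stxbuf' are hypothetical):
 *
 *	sqe->opcode = IORING_OP_STATX;
 *	sqe->fd = dfd;
 *	sqe->addr = (unsigned long) path;
 *	sqe->len = STATX_BASIC_STATS;
 *	sqe->addr2 = (unsigned long) stxbuf;
 *	sqe->statx_flags = AT_STATX_SYNC_AS_STAT;
 *
 * The request mask rides in ->len and the result buffer in ->addr2,
 * reusing the io_open fields as io_statx_prep() shows.
 */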
3101
Jens Axboeb5dba592019-12-11 14:02:38 -07003102static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3103{
3104 /*
3105 * If we queue this for async, it must not be cancellable. That would
3106	 * leave the 'file' in an indeterminate state.
3107 */
3108 req->work.flags |= IO_WQ_WORK_NO_CANCEL;
3109
3110 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
3111 sqe->rw_flags || sqe->buf_index)
3112 return -EINVAL;
3113 if (sqe->flags & IOSQE_FIXED_FILE)
Jens Axboecf3040c2020-02-06 21:31:40 -07003114 return -EBADF;
Jens Axboeb5dba592019-12-11 14:02:38 -07003115
3116 req->close.fd = READ_ONCE(sqe->fd);
3117 if (req->file->f_op == &io_uring_fops ||
Pavel Begunkovb14cca02020-01-17 04:45:59 +03003118 req->close.fd == req->ctx->ring_fd)
Jens Axboeb5dba592019-12-11 14:02:38 -07003119 return -EBADF;
3120
3121 return 0;
3122}
3123
Pavel Begunkova93b3332020-02-08 14:04:34 +03003124/* only called when __close_fd_get_file() is done */
Pavel Begunkov014db002020-03-03 21:33:12 +03003125static void __io_close_finish(struct io_kiocb *req)
Pavel Begunkova93b3332020-02-08 14:04:34 +03003126{
3127 int ret;
3128
3129 ret = filp_close(req->close.put_file, req->work.files);
3130 if (ret < 0)
3131 req_set_fail_links(req);
3132 io_cqring_add_event(req, ret);
3133 fput(req->close.put_file);
Pavel Begunkov014db002020-03-03 21:33:12 +03003134 io_put_req(req);
Pavel Begunkova93b3332020-02-08 14:04:34 +03003135}
3136
Jens Axboeb5dba592019-12-11 14:02:38 -07003137static void io_close_finish(struct io_wq_work **workptr)
3138{
3139 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
Jens Axboeb5dba592019-12-11 14:02:38 -07003140
Pavel Begunkov7fbeb952020-02-16 01:01:18 +03003141 /* not cancellable, don't do io_req_cancelled() */
Pavel Begunkov014db002020-03-03 21:33:12 +03003142 __io_close_finish(req);
Pavel Begunkove9fd9392020-03-04 16:14:12 +03003143 io_steal_work(req, workptr);
Jens Axboeb5dba592019-12-11 14:02:38 -07003144}
3145
Pavel Begunkov014db002020-03-03 21:33:12 +03003146static int io_close(struct io_kiocb *req, bool force_nonblock)
Jens Axboeb5dba592019-12-11 14:02:38 -07003147{
3148 int ret;
3149
3150 req->close.put_file = NULL;
3151 ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
3152 if (ret < 0)
3153 return ret;
3154
3155 /* if the file has a flush method, be safe and punt to async */
Pavel Begunkova2100672020-03-02 23:45:16 +03003156 if (req->close.put_file->f_op->flush && force_nonblock) {
Pavel Begunkov594506f2020-03-03 21:33:11 +03003157 /* submission ref will be dropped, take it for async */
3158 refcount_inc(&req->refs);
3159
Pavel Begunkova2100672020-03-02 23:45:16 +03003160 req->work.func = io_close_finish;
3161 /*
3162	 * Queue the async work manually here to avoid grabbing files - we don't
3163	 * need the files, and grabbing them would cause io_close_finish() to
3164	 * close the file again and post a double CQE entry for this request.
3165 */
3166 io_queue_async_work(req);
3167 return 0;
3168 }
Jens Axboeb5dba592019-12-11 14:02:38 -07003169
3170 /*
3171 * No ->flush(), safely close from here and just punt the
3172 * fput() to async context.
3173 */
Pavel Begunkov014db002020-03-03 21:33:12 +03003174 __io_close_finish(req);
Pavel Begunkova93b3332020-02-08 14:04:34 +03003175 return 0;
Jens Axboeb5dba592019-12-11 14:02:38 -07003176}
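
/*
 * Userspace sketch (assumes liburing's io_uring_prep_close(); 'fd' is
 * hypothetical):
 *
 *	io_uring_prep_close(sqe, fd);
 *
 * io_close_prep() above refuses to close the ring fd itself and rejects
 * IOSQE_FIXED_FILE, and io_close() only punts to async if the file has
 * a ->flush() method.
 */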
3177
Jens Axboe3529d8c2019-12-19 18:24:38 -07003178static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003179{
3180 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003181
3182 if (!req->file)
3183 return -EBADF;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003184
3185 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3186 return -EINVAL;
3187 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3188 return -EINVAL;
3189
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003190 req->sync.off = READ_ONCE(sqe->off);
3191 req->sync.len = READ_ONCE(sqe->len);
3192 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003193 return 0;
3194}
3195
Pavel Begunkov014db002020-03-03 21:33:12 +03003196static void __io_sync_file_range(struct io_kiocb *req)
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003197{
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003198 int ret;
3199
Jens Axboe9adbd452019-12-20 08:45:55 -07003200 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003201 req->sync.flags);
3202 if (ret < 0)
3203 req_set_fail_links(req);
3204 io_cqring_add_event(req, ret);
Pavel Begunkov014db002020-03-03 21:33:12 +03003205 io_put_req(req);
Pavel Begunkov5ea62162020-02-24 11:30:16 +03003206}
3207
3209static void io_sync_file_range_finish(struct io_wq_work **workptr)
3210{
3211 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3213
3214 if (io_req_cancelled(req))
3215 return;
Pavel Begunkov014db002020-03-03 21:33:12 +03003216 __io_sync_file_range(req);
Pavel Begunkov594506f2020-03-03 21:33:11 +03003217 io_put_req(req); /* put submission ref */
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003220}
3221
Pavel Begunkov014db002020-03-03 21:33:12 +03003222static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003223{
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003224 /* sync_file_range always requires a blocking context */
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003225 if (force_nonblock) {
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003226 req->work.func = io_sync_file_range_finish;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003227 return -EAGAIN;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003228 }
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003229
Pavel Begunkov014db002020-03-03 21:33:12 +03003230 __io_sync_file_range(req);
Jens Axboe5d17b4a2019-04-09 14:56:44 -06003231 return 0;
3232}
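
/*
 * SQE field mapping sketch for IORING_OP_SYNC_FILE_RANGE, matching
 * io_prep_sfr() above ('fd', 'offset' and 'nbytes' are hypothetical):
 *
 *	sqe->opcode = IORING_OP_SYNC_FILE_RANGE;
 *	sqe->fd = fd;
 *	sqe->off = offset;
 *	sqe->len = nbytes;
 *	sqe->sync_range_flags = SYNC_FILE_RANGE_WRITE;
 *
 * Like fsync, the actual sync_file_range() call always runs from a
 * blocking context.
 */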
3233
Pavel Begunkov02d27d82020-02-28 10:36:36 +03003234static int io_setup_async_msg(struct io_kiocb *req,
3235 struct io_async_msghdr *kmsg)
3236{
3237 if (req->io)
3238 return -EAGAIN;
3239 if (io_alloc_async_ctx(req)) {
3240 if (kmsg->iov != kmsg->fast_iov)
3241 kfree(kmsg->iov);
3242 return -ENOMEM;
3243 }
3244 req->flags |= REQ_F_NEED_CLEANUP;
3245 memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
3246 return -EAGAIN;
3247}
3248
Jens Axboe3529d8c2019-12-19 18:24:38 -07003249static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboeaa1fa282019-04-19 13:38:09 -06003250{
Jens Axboe03b12302019-12-02 18:50:25 -07003251#if defined(CONFIG_NET)
Jens Axboee47293f2019-12-20 08:58:21 -07003252 struct io_sr_msg *sr = &req->sr_msg;
Jens Axboe3529d8c2019-12-19 18:24:38 -07003253 struct io_async_ctx *io = req->io;
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03003254 int ret;
Jens Axboe03b12302019-12-02 18:50:25 -07003255
Jens Axboee47293f2019-12-20 08:58:21 -07003256 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3257 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboefddafac2020-01-04 20:19:44 -07003258 sr->len = READ_ONCE(sqe->len);
Jens Axboe3529d8c2019-12-19 18:24:38 -07003259
Jens Axboed8768362020-02-27 14:17:49 -07003260#ifdef CONFIG_COMPAT
3261 if (req->ctx->compat)
3262 sr->msg_flags |= MSG_CMSG_COMPAT;
3263#endif
3264
Jens Axboefddafac2020-01-04 20:19:44 -07003265 if (!io || req->opcode == IORING_OP_SEND)
Jens Axboe3529d8c2019-12-19 18:24:38 -07003266 return 0;
Pavel Begunkov5f798be2020-02-08 13:28:02 +03003267 /* iovec is already imported */
3268 if (req->flags & REQ_F_NEED_CLEANUP)
3269 return 0;
Jens Axboe3529d8c2019-12-19 18:24:38 -07003270
Jens Axboed9688562019-12-09 19:35:20 -07003271 io->msg.iov = io->msg.fast_iov;
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03003272 ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
Jens Axboee47293f2019-12-20 08:58:21 -07003273 &io->msg.iov);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03003274 if (!ret)
3275 req->flags |= REQ_F_NEED_CLEANUP;
3276 return ret;
Jens Axboe03b12302019-12-02 18:50:25 -07003277#else
Jens Axboee47293f2019-12-20 08:58:21 -07003278 return -EOPNOTSUPP;
Jens Axboe03b12302019-12-02 18:50:25 -07003279#endif
3280}
3281
Pavel Begunkov014db002020-03-03 21:33:12 +03003282static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
Jens Axboe03b12302019-12-02 18:50:25 -07003283{
3284#if defined(CONFIG_NET)
Jens Axboe0b416c32019-12-15 10:57:46 -07003285 struct io_async_msghdr *kmsg = NULL;
Jens Axboe03b12302019-12-02 18:50:25 -07003286 struct socket *sock;
3287 int ret;
3288
3289 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3290 return -EINVAL;
3291
3292 sock = sock_from_file(req->file, &ret);
3293 if (sock) {
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003294 struct io_async_ctx io;
Jens Axboe03b12302019-12-02 18:50:25 -07003295 unsigned flags;
3296
Jens Axboe03b12302019-12-02 18:50:25 -07003297 if (req->io) {
Jens Axboe0b416c32019-12-15 10:57:46 -07003298 kmsg = &req->io->msg;
Jens Axboeb5379162020-02-09 11:29:15 -07003299 kmsg->msg.msg_name = &req->io->msg.addr;
Jens Axboe0b416c32019-12-15 10:57:46 -07003300 /* if iov is set, it's allocated already */
3301 if (!kmsg->iov)
3302 kmsg->iov = kmsg->fast_iov;
3303 kmsg->msg.msg_iter.iov = kmsg->iov;
Jens Axboe03b12302019-12-02 18:50:25 -07003304 } else {
Jens Axboe3529d8c2019-12-19 18:24:38 -07003305 struct io_sr_msg *sr = &req->sr_msg;
3306
Jens Axboe0b416c32019-12-15 10:57:46 -07003307 kmsg = &io.msg;
Jens Axboeb5379162020-02-09 11:29:15 -07003308 kmsg->msg.msg_name = &io.msg.addr;
Jens Axboe3529d8c2019-12-19 18:24:38 -07003309
3310 io.msg.iov = io.msg.fast_iov;
3311 ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3312 sr->msg_flags, &io.msg.iov);
Jens Axboe03b12302019-12-02 18:50:25 -07003313 if (ret)
Jens Axboe3529d8c2019-12-19 18:24:38 -07003314 return ret;
Jens Axboe03b12302019-12-02 18:50:25 -07003315 }
3316
Jens Axboee47293f2019-12-20 08:58:21 -07003317 flags = req->sr_msg.msg_flags;
3318 if (flags & MSG_DONTWAIT)
3319 req->flags |= REQ_F_NOWAIT;
3320 else if (force_nonblock)
3321 flags |= MSG_DONTWAIT;
3322
Jens Axboe0b416c32019-12-15 10:57:46 -07003323 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
Pavel Begunkov02d27d82020-02-28 10:36:36 +03003324 if (force_nonblock && ret == -EAGAIN)
3325 return io_setup_async_msg(req, kmsg);
Jens Axboe03b12302019-12-02 18:50:25 -07003326 if (ret == -ERESTARTSYS)
3327 ret = -EINTR;
3328 }
3329
Pavel Begunkov1e950812020-02-06 19:51:16 +03003330 if (kmsg && kmsg->iov != kmsg->fast_iov)
Jens Axboe0b416c32019-12-15 10:57:46 -07003331 kfree(kmsg->iov);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03003332 req->flags &= ~REQ_F_NEED_CLEANUP;
Jens Axboe03b12302019-12-02 18:50:25 -07003333 io_cqring_add_event(req, ret);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003334 if (ret < 0)
3335 req_set_fail_links(req);
Pavel Begunkov014db002020-03-03 21:33:12 +03003336 io_put_req(req);
Jens Axboe03b12302019-12-02 18:50:25 -07003337 return 0;
3338#else
3339 return -EOPNOTSUPP;
3340#endif
3341}
3342
Pavel Begunkov014db002020-03-03 21:33:12 +03003343static int io_send(struct io_kiocb *req, bool force_nonblock)
Jens Axboefddafac2020-01-04 20:19:44 -07003344{
3345#if defined(CONFIG_NET)
3346 struct socket *sock;
3347 int ret;
3348
3349 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3350 return -EINVAL;
3351
3352 sock = sock_from_file(req->file, &ret);
3353 if (sock) {
3354 struct io_sr_msg *sr = &req->sr_msg;
3355 struct msghdr msg;
3356 struct iovec iov;
3357 unsigned flags;
3358
3359 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3360 &msg.msg_iter);
3361 if (ret)
3362 return ret;
3363
3364 msg.msg_name = NULL;
3365 msg.msg_control = NULL;
3366 msg.msg_controllen = 0;
3367 msg.msg_namelen = 0;
3368
3369 flags = req->sr_msg.msg_flags;
3370 if (flags & MSG_DONTWAIT)
3371 req->flags |= REQ_F_NOWAIT;
3372 else if (force_nonblock)
3373 flags |= MSG_DONTWAIT;
3374
Jens Axboe0b7b21e2020-01-31 08:34:59 -07003375 msg.msg_flags = flags;
3376 ret = sock_sendmsg(sock, &msg);
Jens Axboefddafac2020-01-04 20:19:44 -07003377 if (force_nonblock && ret == -EAGAIN)
3378 return -EAGAIN;
3379 if (ret == -ERESTARTSYS)
3380 ret = -EINTR;
3381 }
3382
3383 io_cqring_add_event(req, ret);
3384 if (ret < 0)
3385 req_set_fail_links(req);
Pavel Begunkov014db002020-03-03 21:33:12 +03003386 io_put_req(req);
Jens Axboefddafac2020-01-04 20:19:44 -07003387 return 0;
3388#else
3389 return -EOPNOTSUPP;
3390#endif
3391}
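
/*
 * Userspace sketch (assumes liburing; 'sockfd', 'buf', 'len' and 'msg'
 * are hypothetical): the two send variants handled above.
 *
 *	io_uring_prep_send(sqe, sockfd, buf, len, 0);
 *	io_uring_prep_sendmsg(sqe2, sockfd, &msg, 0);
 *
 * As io_sendmsg_prep() shows, only IORING_OP_SENDMSG copies the msghdr
 * and iovec into the async context up front; IORING_OP_SEND imports a
 * single range at issue time in io_send().
 */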
3392
Jens Axboe3529d8c2019-12-19 18:24:38 -07003393static int io_recvmsg_prep(struct io_kiocb *req,
3394 const struct io_uring_sqe *sqe)
Jens Axboe03b12302019-12-02 18:50:25 -07003395{
3396#if defined(CONFIG_NET)
Jens Axboee47293f2019-12-20 08:58:21 -07003397 struct io_sr_msg *sr = &req->sr_msg;
Jens Axboe3529d8c2019-12-19 18:24:38 -07003398 struct io_async_ctx *io = req->io;
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03003399 int ret;
Jens Axboe06b76d42019-12-19 14:44:26 -07003400
Jens Axboe3529d8c2019-12-19 18:24:38 -07003401 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3402 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
Jens Axboe0b7b21e2020-01-31 08:34:59 -07003403 sr->len = READ_ONCE(sqe->len);
Jens Axboe3529d8c2019-12-19 18:24:38 -07003404
Jens Axboed8768362020-02-27 14:17:49 -07003405#ifdef CONFIG_COMPAT
3406 if (req->ctx->compat)
3407 sr->msg_flags |= MSG_CMSG_COMPAT;
3408#endif
3409
Jens Axboefddafac2020-01-04 20:19:44 -07003410 if (!io || req->opcode == IORING_OP_RECV)
Jens Axboe06b76d42019-12-19 14:44:26 -07003411 return 0;
Pavel Begunkov5f798be2020-02-08 13:28:02 +03003412 /* iovec is already imported */
3413 if (req->flags & REQ_F_NEED_CLEANUP)
3414 return 0;
Jens Axboe03b12302019-12-02 18:50:25 -07003415
Jens Axboed9688562019-12-09 19:35:20 -07003416 io->msg.iov = io->msg.fast_iov;
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03003417 ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
Jens Axboee47293f2019-12-20 08:58:21 -07003418 &io->msg.uaddr, &io->msg.iov);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03003419 if (!ret)
3420 req->flags |= REQ_F_NEED_CLEANUP;
3421 return ret;
Jens Axboe03b12302019-12-02 18:50:25 -07003422#else
Jens Axboee47293f2019-12-20 08:58:21 -07003423 return -EOPNOTSUPP;
Jens Axboe03b12302019-12-02 18:50:25 -07003424#endif
3425}
3426
Pavel Begunkov014db002020-03-03 21:33:12 +03003427static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
Jens Axboe03b12302019-12-02 18:50:25 -07003428{
3429#if defined(CONFIG_NET)
Jens Axboe0b416c32019-12-15 10:57:46 -07003430 struct io_async_msghdr *kmsg = NULL;
Jens Axboe0fa03c62019-04-19 13:34:07 -06003431 struct socket *sock;
3432 int ret;
3433
3434 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3435 return -EINVAL;
3436
3437 sock = sock_from_file(req->file, &ret);
3438 if (sock) {
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003439 struct io_async_ctx io;
Jens Axboe0fa03c62019-04-19 13:34:07 -06003440 unsigned flags;
3441
Jens Axboe03b12302019-12-02 18:50:25 -07003442 if (req->io) {
Jens Axboe0b416c32019-12-15 10:57:46 -07003443 kmsg = &req->io->msg;
Jens Axboeb5379162020-02-09 11:29:15 -07003444 kmsg->msg.msg_name = &req->io->msg.addr;
Jens Axboe0b416c32019-12-15 10:57:46 -07003445 /* if iov is set, it's allocated already */
3446 if (!kmsg->iov)
3447 kmsg->iov = kmsg->fast_iov;
3448 kmsg->msg.msg_iter.iov = kmsg->iov;
Jens Axboe03b12302019-12-02 18:50:25 -07003449 } else {
Jens Axboe3529d8c2019-12-19 18:24:38 -07003450 struct io_sr_msg *sr = &req->sr_msg;
3451
Jens Axboe0b416c32019-12-15 10:57:46 -07003452 kmsg = &io.msg;
Jens Axboeb5379162020-02-09 11:29:15 -07003453 kmsg->msg.msg_name = &io.msg.addr;
Jens Axboe3529d8c2019-12-19 18:24:38 -07003454
3455 io.msg.iov = io.msg.fast_iov;
3456 ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
3457 sr->msg_flags, &io.msg.uaddr,
3458 &io.msg.iov);
Jens Axboe03b12302019-12-02 18:50:25 -07003459 if (ret)
Jens Axboe3529d8c2019-12-19 18:24:38 -07003460 return ret;
Jens Axboe03b12302019-12-02 18:50:25 -07003461 }
Jens Axboe0fa03c62019-04-19 13:34:07 -06003462
Jens Axboee47293f2019-12-20 08:58:21 -07003463 flags = req->sr_msg.msg_flags;
3464 if (flags & MSG_DONTWAIT)
3465 req->flags |= REQ_F_NOWAIT;
3466 else if (force_nonblock)
3467 flags |= MSG_DONTWAIT;
3468
3469 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
3470 kmsg->uaddr, flags);
Pavel Begunkov02d27d82020-02-28 10:36:36 +03003471 if (force_nonblock && ret == -EAGAIN)
3472 return io_setup_async_msg(req, kmsg);
Jens Axboe441cdbd2019-12-02 18:49:10 -07003473 if (ret == -ERESTARTSYS)
3474 ret = -EINTR;
Jens Axboe0fa03c62019-04-19 13:34:07 -06003475 }
3476
Pavel Begunkov1e950812020-02-06 19:51:16 +03003477 if (kmsg && kmsg->iov != kmsg->fast_iov)
Jens Axboe0b416c32019-12-15 10:57:46 -07003478 kfree(kmsg->iov);
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03003479 req->flags &= ~REQ_F_NEED_CLEANUP;
Jens Axboe78e19bb2019-11-06 15:21:34 -07003480 io_cqring_add_event(req, ret);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003481 if (ret < 0)
3482 req_set_fail_links(req);
Pavel Begunkov014db002020-03-03 21:33:12 +03003483 io_put_req(req);
Jens Axboe0fa03c62019-04-19 13:34:07 -06003484 return 0;
3485#else
3486 return -EOPNOTSUPP;
3487#endif
3488}
3489
Pavel Begunkov014db002020-03-03 21:33:12 +03003490static int io_recv(struct io_kiocb *req, bool force_nonblock)
Jens Axboefddafac2020-01-04 20:19:44 -07003491{
3492#if defined(CONFIG_NET)
3493 struct socket *sock;
3494 int ret;
3495
3496 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3497 return -EINVAL;
3498
3499 sock = sock_from_file(req->file, &ret);
3500 if (sock) {
3501 struct io_sr_msg *sr = &req->sr_msg;
3502 struct msghdr msg;
3503 struct iovec iov;
3504 unsigned flags;
3505
3506 ret = import_single_range(READ, sr->buf, sr->len, &iov,
3507 &msg.msg_iter);
3508 if (ret)
3509 return ret;
3510
3511 msg.msg_name = NULL;
3512 msg.msg_control = NULL;
3513 msg.msg_controllen = 0;
3514 msg.msg_namelen = 0;
3515 msg.msg_iocb = NULL;
3516 msg.msg_flags = 0;
3517
3518 flags = req->sr_msg.msg_flags;
3519 if (flags & MSG_DONTWAIT)
3520 req->flags |= REQ_F_NOWAIT;
3521 else if (force_nonblock)
3522 flags |= MSG_DONTWAIT;
3523
Jens Axboe0b7b21e2020-01-31 08:34:59 -07003524 ret = sock_recvmsg(sock, &msg, flags);
Jens Axboefddafac2020-01-04 20:19:44 -07003525 if (force_nonblock && ret == -EAGAIN)
3526 return -EAGAIN;
3527 if (ret == -ERESTARTSYS)
3528 ret = -EINTR;
3529 }
3530
3531 io_cqring_add_event(req, ret);
3532 if (ret < 0)
3533 req_set_fail_links(req);
Pavel Begunkov014db002020-03-03 21:33:12 +03003534 io_put_req(req);
Jens Axboefddafac2020-01-04 20:19:44 -07003535 return 0;
3536#else
3537 return -EOPNOTSUPP;
3538#endif
3539}
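
/*
 * Userspace sketch (assumes liburing; 'sockfd', 'buf', 'len' and 'msg'
 * are hypothetical): the receive-side counterparts.
 *
 *	io_uring_prep_recv(sqe, sockfd, buf, len, 0);
 *	io_uring_prep_recvmsg(sqe2, sockfd, &msg, 0);
 *
 * MSG_DONTWAIT in the flags marks the request REQ_F_NOWAIT; otherwise a
 * non-blocking attempt that hits -EAGAIN is retried from async context,
 * with io_setup_async_msg() preserving the copied msghdr for RECVMSG.
 */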
3540
Jens Axboe3529d8c2019-12-19 18:24:38 -07003542static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboe17f2fe32019-10-17 14:42:58 -06003543{
3544#if defined(CONFIG_NET)
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003545 struct io_accept *accept = &req->accept;
3546
Jens Axboe17f2fe32019-10-17 14:42:58 -06003547 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3548 return -EINVAL;
Hrvoje Zeba8042d6c2019-11-25 14:40:22 -05003549 if (sqe->ioprio || sqe->len || sqe->buf_index)
Jens Axboe17f2fe32019-10-17 14:42:58 -06003550 return -EINVAL;
3551
Jens Axboed55e5f52019-12-11 16:12:15 -07003552 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3553 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003554 accept->flags = READ_ONCE(sqe->accept_flags);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003555 return 0;
3556#else
3557 return -EOPNOTSUPP;
3558#endif
3559}
Jens Axboe17f2fe32019-10-17 14:42:58 -06003560
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003561#if defined(CONFIG_NET)
Pavel Begunkov014db002020-03-03 21:33:12 +03003562static int __io_accept(struct io_kiocb *req, bool force_nonblock)
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003563{
3564 struct io_accept *accept = &req->accept;
3565 unsigned file_flags;
3566 int ret;
3567
3568 file_flags = force_nonblock ? O_NONBLOCK : 0;
3569 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
3570 accept->addr_len, accept->flags);
3571 if (ret == -EAGAIN && force_nonblock)
Jens Axboe17f2fe32019-10-17 14:42:58 -06003572 return -EAGAIN;
Jens Axboe8e3cca12019-11-09 19:52:33 -07003573 if (ret == -ERESTARTSYS)
3574 ret = -EINTR;
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003575 if (ret < 0)
3576 req_set_fail_links(req);
Jens Axboe78e19bb2019-11-06 15:21:34 -07003577 io_cqring_add_event(req, ret);
Pavel Begunkov014db002020-03-03 21:33:12 +03003578 io_put_req(req);
Jens Axboe17f2fe32019-10-17 14:42:58 -06003579 return 0;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003580}
3581
3582static void io_accept_finish(struct io_wq_work **workptr)
3583{
3584 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003585
3586 if (io_req_cancelled(req))
3587 return;
Pavel Begunkov014db002020-03-03 21:33:12 +03003588 __io_accept(req, false);
Pavel Begunkove9fd9392020-03-04 16:14:12 +03003589 io_steal_work(req, workptr);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003590}
3591#endif
3592
Pavel Begunkov014db002020-03-03 21:33:12 +03003593static int io_accept(struct io_kiocb *req, bool force_nonblock)
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003594{
3595#if defined(CONFIG_NET)
3596 int ret;
3597
Pavel Begunkov014db002020-03-03 21:33:12 +03003598 ret = __io_accept(req, force_nonblock);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003599 if (ret == -EAGAIN && force_nonblock) {
3600 req->work.func = io_accept_finish;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07003601 return -EAGAIN;
3602 }
3603 return 0;
Jens Axboe17f2fe32019-10-17 14:42:58 -06003604#else
3605 return -EOPNOTSUPP;
3606#endif
3607}
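
/*
 * Userspace sketch (assumes liburing's io_uring_prep_accept();
 * 'listen_fd' is hypothetical):
 *
 *	struct sockaddr_storage ss;
 *	socklen_t sslen = sizeof(ss);
 *
 *	io_uring_prep_accept(sqe, listen_fd, (struct sockaddr *) &ss,
 *			     &sslen, SOCK_CLOEXEC);
 *
 * On success cqe->res is the new descriptor, mirroring accept4(); a
 * non-blocking -EAGAIN is retried via io_accept_finish().
 */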
3608
Jens Axboe3529d8c2019-12-19 18:24:38 -07003609static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboef499a022019-12-02 16:28:46 -07003610{
3611#if defined(CONFIG_NET)
Jens Axboe3529d8c2019-12-19 18:24:38 -07003612 struct io_connect *conn = &req->connect;
3613 struct io_async_ctx *io = req->io;
Jens Axboef499a022019-12-02 16:28:46 -07003614
Jens Axboe3fbb51c2019-12-20 08:51:52 -07003615 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3616 return -EINVAL;
3617 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
3618 return -EINVAL;
3619
Jens Axboe3529d8c2019-12-19 18:24:38 -07003620 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3621 conn->addr_len = READ_ONCE(sqe->addr2);
3622
3623 if (!io)
3624 return 0;
3625
3626 return move_addr_to_kernel(conn->addr, conn->addr_len,
Jens Axboe3fbb51c2019-12-20 08:51:52 -07003627 &io->connect.address);
Jens Axboef499a022019-12-02 16:28:46 -07003628#else
Jens Axboe3fbb51c2019-12-20 08:51:52 -07003629 return -EOPNOTSUPP;
Jens Axboef499a022019-12-02 16:28:46 -07003630#endif
3631}
3632
Pavel Begunkov014db002020-03-03 21:33:12 +03003633static int io_connect(struct io_kiocb *req, bool force_nonblock)
Jens Axboef8e85cf2019-11-23 14:24:24 -07003634{
3635#if defined(CONFIG_NET)
Jens Axboef499a022019-12-02 16:28:46 -07003636 struct io_async_ctx __io, *io;
Jens Axboef8e85cf2019-11-23 14:24:24 -07003637 unsigned file_flags;
Jens Axboe3fbb51c2019-12-20 08:51:52 -07003638 int ret;
Jens Axboef8e85cf2019-11-23 14:24:24 -07003639
Jens Axboef499a022019-12-02 16:28:46 -07003640 if (req->io) {
3641 io = req->io;
3642 } else {
Jens Axboe3529d8c2019-12-19 18:24:38 -07003643 ret = move_addr_to_kernel(req->connect.addr,
3644 req->connect.addr_len,
3645 &__io.connect.address);
Jens Axboef499a022019-12-02 16:28:46 -07003646 if (ret)
3647 goto out;
3648 io = &__io;
3649 }
3650
Jens Axboe3fbb51c2019-12-20 08:51:52 -07003651 file_flags = force_nonblock ? O_NONBLOCK : 0;
3652
3653 ret = __sys_connect_file(req->file, &io->connect.address,
3654 req->connect.addr_len, file_flags);
Jens Axboe87f80d62019-12-03 11:23:54 -07003655 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003656 if (req->io)
3657 return -EAGAIN;
3658 if (io_alloc_async_ctx(req)) {
Jens Axboef499a022019-12-02 16:28:46 -07003659 ret = -ENOMEM;
3660 goto out;
3661 }
Jens Axboeb7bb4f72019-12-15 22:13:43 -07003662 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
Jens Axboef8e85cf2019-11-23 14:24:24 -07003663 return -EAGAIN;
Jens Axboef499a022019-12-02 16:28:46 -07003664 }
Jens Axboef8e85cf2019-11-23 14:24:24 -07003665 if (ret == -ERESTARTSYS)
3666 ret = -EINTR;
Jens Axboef499a022019-12-02 16:28:46 -07003667out:
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003668 if (ret < 0)
3669 req_set_fail_links(req);
Jens Axboef8e85cf2019-11-23 14:24:24 -07003670 io_cqring_add_event(req, ret);
Pavel Begunkov014db002020-03-03 21:33:12 +03003671 io_put_req(req);
Jens Axboef8e85cf2019-11-23 14:24:24 -07003672 return 0;
3673#else
3674 return -EOPNOTSUPP;
3675#endif
3676}
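
/*
 * Userspace sketch (assumes liburing's io_uring_prep_connect();
 * 'sockfd' and 'daddr' are hypothetical):
 *
 *	struct sockaddr_in daddr = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(80),
 *	};
 *
 *	io_uring_prep_connect(sqe, sockfd, (struct sockaddr *) &daddr,
 *			      sizeof(daddr));
 *
 * io_connect() above stashes the kernel copy of the address and retries
 * -EAGAIN/-EINPROGRESS from the async context instead of returning them
 * to the application.
 */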
3677
Jens Axboed7718a92020-02-14 22:23:12 -07003678struct io_poll_table {
3679 struct poll_table_struct pt;
3680 struct io_kiocb *req;
3681 int error;
3682};
3683
3684static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
3685 struct wait_queue_head *head)
Jens Axboe221c5eb2019-01-17 09:41:58 -07003686{
Jens Axboed7718a92020-02-14 22:23:12 -07003687 if (unlikely(poll->head)) {
3688 pt->error = -EINVAL;
3689 return;
3690 }
3691
3692 pt->error = 0;
3693 poll->head = head;
3694 add_wait_queue(head, &poll->wait);
3695}
3696
3697static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
3698 struct poll_table_struct *p)
3699{
3700 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
3701
3702 __io_queue_proc(&pt->req->apoll->poll, pt, head);
3703}
3704
3705static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
3706 __poll_t mask, task_work_func_t func)
3707{
3708 struct task_struct *tsk;
3709
3710	/* for instances that support it, check for an event match first: */
3711 if (mask && !(mask & poll->events))
3712 return 0;
3713
3714 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
3715
3716 list_del_init(&poll->wait.entry);
3717
3718 tsk = req->task;
3719 req->result = mask;
3720 init_task_work(&req->task_work, func);
3721 /*
3722 * If this fails, then the task is exiting. If that is the case, then
3723 * the exit check will ultimately cancel these work items. Hence we
3724 * don't need to check here and handle it specifically.
3725 */
3726 task_work_add(tsk, &req->task_work, true);
3727 wake_up_process(tsk);
3728 return 1;
3729}
3730
3731static void io_async_task_func(struct callback_head *cb)
3732{
3733 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
3734 struct async_poll *apoll = req->apoll;
3735 struct io_ring_ctx *ctx = req->ctx;
3736
3737 trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
3738
3739 WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
3740
3741 if (hash_hashed(&req->hash_node)) {
3742 spin_lock_irq(&ctx->completion_lock);
3743 hash_del(&req->hash_node);
3744 spin_unlock_irq(&ctx->completion_lock);
3745 }
3746
3747	/* restore ->work in case we need to retry */
3748 memcpy(&req->work, &apoll->work, sizeof(req->work));
3749
3750 __set_current_state(TASK_RUNNING);
3751 mutex_lock(&ctx->uring_lock);
3752 __io_queue_sqe(req, NULL);
3753 mutex_unlock(&ctx->uring_lock);
3754
3755 kfree(apoll);
3756}
3757
3758static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
3759 void *key)
3760{
3761 struct io_kiocb *req = wait->private;
3762 struct io_poll_iocb *poll = &req->apoll->poll;
3763
3764 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
3765 key_to_poll(key));
3766
3767 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
3768}
3769
3770static void io_poll_req_insert(struct io_kiocb *req)
3771{
3772 struct io_ring_ctx *ctx = req->ctx;
3773 struct hlist_head *list;
3774
3775 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
3776 hlist_add_head(&req->hash_node, list);
3777}
3778
3779static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
3780 struct io_poll_iocb *poll,
3781 struct io_poll_table *ipt, __poll_t mask,
3782 wait_queue_func_t wake_func)
3783 __acquires(&ctx->completion_lock)
3784{
3785 struct io_ring_ctx *ctx = req->ctx;
3786 bool cancel = false;
3787
3788 poll->file = req->file;
3789 poll->head = NULL;
3790 poll->done = poll->canceled = false;
3791 poll->events = mask;
3792
3793 ipt->pt._key = mask;
3794 ipt->req = req;
3795 ipt->error = -EINVAL;
3796
3797 INIT_LIST_HEAD(&poll->wait.entry);
3798 init_waitqueue_func_entry(&poll->wait, wake_func);
3799 poll->wait.private = req;
3800
3801 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
3802
3803 spin_lock_irq(&ctx->completion_lock);
3804 if (likely(poll->head)) {
3805 spin_lock(&poll->head->lock);
3806 if (unlikely(list_empty(&poll->wait.entry))) {
3807 if (ipt->error)
3808 cancel = true;
3809 ipt->error = 0;
3810 mask = 0;
3811 }
3812 if (mask || ipt->error)
3813 list_del_init(&poll->wait.entry);
3814 else if (cancel)
3815 WRITE_ONCE(poll->canceled, true);
3816 else if (!poll->done) /* actually waiting for an event */
3817 io_poll_req_insert(req);
3818 spin_unlock(&poll->head->lock);
3819 }
3820
3821 return mask;
3822}
3823
3824static bool io_arm_poll_handler(struct io_kiocb *req)
3825{
3826 const struct io_op_def *def = &io_op_defs[req->opcode];
3827 struct io_ring_ctx *ctx = req->ctx;
3828 struct async_poll *apoll;
3829 struct io_poll_table ipt;
3830 __poll_t mask, ret;
3831
3832 if (!req->file || !file_can_poll(req->file))
3833 return false;
3834 if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
3835 return false;
3836 if (!def->pollin && !def->pollout)
3837 return false;
3838
3839 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
3840 if (unlikely(!apoll))
3841 return false;
3842
3843 req->flags |= REQ_F_POLLED;
3844 memcpy(&apoll->work, &req->work, sizeof(req->work));
3845
3846 /*
3847	 * Don't need a reference here, as we're adding it to the task's
3848 * task_works list. If the task exits, the list is pruned.
3849 */
3850 req->task = current;
3851 req->apoll = apoll;
3852 INIT_HLIST_NODE(&req->hash_node);
3853
Nathan Chancellor8755d972020-03-02 16:01:19 -07003854 mask = 0;
Jens Axboed7718a92020-02-14 22:23:12 -07003855 if (def->pollin)
Nathan Chancellor8755d972020-03-02 16:01:19 -07003856 mask |= POLLIN | POLLRDNORM;
Jens Axboed7718a92020-02-14 22:23:12 -07003857 if (def->pollout)
3858 mask |= POLLOUT | POLLWRNORM;
3859 mask |= POLLERR | POLLPRI;
3860
3861 ipt.pt._qproc = io_async_queue_proc;
3862
3863 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
3864 io_async_wake);
3865 if (ret) {
3866 ipt.error = 0;
3867 apoll->poll.done = true;
3868 spin_unlock_irq(&ctx->completion_lock);
3869 memcpy(&req->work, &apoll->work, sizeof(req->work));
3870 kfree(apoll);
3871 return false;
3872 }
3873 spin_unlock_irq(&ctx->completion_lock);
3874 trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
3875 apoll->poll.events);
3876 return true;
3877}
3878
3879static bool __io_poll_remove_one(struct io_kiocb *req,
3880 struct io_poll_iocb *poll)
3881{
Jens Axboeb41e9852020-02-17 09:52:41 -07003882 bool do_complete = false;
Jens Axboe221c5eb2019-01-17 09:41:58 -07003883
3884 spin_lock(&poll->head->lock);
3885 WRITE_ONCE(poll->canceled, true);
Jens Axboe392edb42019-12-09 17:52:20 -07003886 if (!list_empty(&poll->wait.entry)) {
3887 list_del_init(&poll->wait.entry);
Jens Axboeb41e9852020-02-17 09:52:41 -07003888 do_complete = true;
Jens Axboe221c5eb2019-01-17 09:41:58 -07003889 }
3890 spin_unlock(&poll->head->lock);
Jens Axboed7718a92020-02-14 22:23:12 -07003891 return do_complete;
3892}
3893
3894static bool io_poll_remove_one(struct io_kiocb *req)
3895{
3896 bool do_complete;
3897
3898 if (req->opcode == IORING_OP_POLL_ADD) {
3899 do_complete = __io_poll_remove_one(req, &req->poll);
3900 } else {
3901 /* non-poll requests have submit ref still */
3902 do_complete = __io_poll_remove_one(req, &req->apoll->poll);
3903 if (do_complete)
3904 io_put_req(req);
3905 }
3906
Jens Axboe78076bb2019-12-04 19:56:40 -07003907 hash_del(&req->hash_node);
Jens Axboed7718a92020-02-14 22:23:12 -07003908
Jens Axboeb41e9852020-02-17 09:52:41 -07003909 if (do_complete) {
3910 io_cqring_fill_event(req, -ECANCELED);
3911 io_commit_cqring(req->ctx);
3912 req->flags |= REQ_F_COMP_LOCKED;
3913 io_put_req(req);
3914 }
3915
3916 return do_complete;
Jens Axboe221c5eb2019-01-17 09:41:58 -07003917}
3918
3919static void io_poll_remove_all(struct io_ring_ctx *ctx)
3920{
Jens Axboe78076bb2019-12-04 19:56:40 -07003921 struct hlist_node *tmp;
Jens Axboe221c5eb2019-01-17 09:41:58 -07003922 struct io_kiocb *req;
Jens Axboe78076bb2019-12-04 19:56:40 -07003923 int i;
Jens Axboe221c5eb2019-01-17 09:41:58 -07003924
3925 spin_lock_irq(&ctx->completion_lock);
Jens Axboe78076bb2019-12-04 19:56:40 -07003926 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
3927 struct hlist_head *list;
3928
3929 list = &ctx->cancel_hash[i];
3930 hlist_for_each_entry_safe(req, tmp, list, hash_node)
3931 io_poll_remove_one(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07003932 }
3933 spin_unlock_irq(&ctx->completion_lock);
Jens Axboeb41e9852020-02-17 09:52:41 -07003934
3935 io_cqring_ev_posted(ctx);
Jens Axboe221c5eb2019-01-17 09:41:58 -07003936}
3937
Jens Axboe47f46762019-11-09 17:43:02 -07003938static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
3939{
Jens Axboe78076bb2019-12-04 19:56:40 -07003940 struct hlist_head *list;
Jens Axboe47f46762019-11-09 17:43:02 -07003941 struct io_kiocb *req;
3942
Jens Axboe78076bb2019-12-04 19:56:40 -07003943 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
3944 hlist_for_each_entry(req, list, hash_node) {
Jens Axboeb41e9852020-02-17 09:52:41 -07003945 if (sqe_addr != req->user_data)
3946 continue;
3947 if (io_poll_remove_one(req))
Jens Axboeeac406c2019-11-14 12:09:58 -07003948 return 0;
Jens Axboeb41e9852020-02-17 09:52:41 -07003949 return -EALREADY;
Jens Axboe47f46762019-11-09 17:43:02 -07003950 }
3951
3952 return -ENOENT;
3953}
3954
Jens Axboe3529d8c2019-12-19 18:24:38 -07003955static int io_poll_remove_prep(struct io_kiocb *req,
3956 const struct io_uring_sqe *sqe)
Jens Axboe221c5eb2019-01-17 09:41:58 -07003957{
Jens Axboe221c5eb2019-01-17 09:41:58 -07003958 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3959 return -EINVAL;
3960 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3961 sqe->poll_events)
3962 return -EINVAL;
3963
Jens Axboe0969e782019-12-17 18:40:57 -07003964 req->poll.addr = READ_ONCE(sqe->addr);
Jens Axboe0969e782019-12-17 18:40:57 -07003965 return 0;
3966}
3967
3968/*
3969 * Find a running poll command that matches one specified in sqe->addr,
3970 * and remove it if found.
3971 */
3972static int io_poll_remove(struct io_kiocb *req)
3973{
3974 struct io_ring_ctx *ctx = req->ctx;
3975 u64 addr;
3976 int ret;
3977
Jens Axboe0969e782019-12-17 18:40:57 -07003978 addr = req->poll.addr;
Jens Axboe221c5eb2019-01-17 09:41:58 -07003979 spin_lock_irq(&ctx->completion_lock);
Jens Axboe0969e782019-12-17 18:40:57 -07003980 ret = io_poll_cancel(ctx, addr);
Jens Axboe221c5eb2019-01-17 09:41:58 -07003981 spin_unlock_irq(&ctx->completion_lock);
3982
Jens Axboe78e19bb2019-11-06 15:21:34 -07003983 io_cqring_add_event(req, ret);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07003984 if (ret < 0)
3985 req_set_fail_links(req);
Jens Axboee65ef562019-03-12 10:16:44 -06003986 io_put_req(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07003987 return 0;
3988}
3989
Jens Axboeb0dd8a42019-11-18 12:14:54 -07003990static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
Jens Axboe221c5eb2019-01-17 09:41:58 -07003991{
Jackie Liua197f662019-11-08 08:09:12 -07003992 struct io_ring_ctx *ctx = req->ctx;
3993
Jens Axboe8c838782019-03-12 15:48:16 -06003994 req->poll.done = true;
Pavel Begunkovb0a20342020-02-28 10:36:35 +03003995 io_cqring_fill_event(req, error ? error : mangle_poll(mask));
Jens Axboe8c838782019-03-12 15:48:16 -06003996 io_commit_cqring(ctx);
Jens Axboe221c5eb2019-01-17 09:41:58 -07003997}
3998
Jens Axboeb41e9852020-02-17 09:52:41 -07003999static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
Jens Axboe221c5eb2019-01-17 09:41:58 -07004000{
Jens Axboe221c5eb2019-01-17 09:41:58 -07004001 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe221c5eb2019-01-17 09:41:58 -07004002
Jens Axboe221c5eb2019-01-17 09:41:58 -07004003 spin_lock_irq(&ctx->completion_lock);
Jens Axboe78076bb2019-12-04 19:56:40 -07004004 hash_del(&req->hash_node);
Jens Axboeb41e9852020-02-17 09:52:41 -07004005 io_poll_complete(req, req->result, 0);
4006 req->flags |= REQ_F_COMP_LOCKED;
4007 io_put_req_find_next(req, nxt);
Jens Axboe221c5eb2019-01-17 09:41:58 -07004008 spin_unlock_irq(&ctx->completion_lock);
4009
Jens Axboe8c838782019-03-12 15:48:16 -06004010 io_cqring_ev_posted(ctx);
Jens Axboeb41e9852020-02-17 09:52:41 -07004011}
Jens Axboe89723d02019-11-05 15:32:58 -07004012
Jens Axboeb41e9852020-02-17 09:52:41 -07004013static void io_poll_task_func(struct callback_head *cb)
4014{
4015 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4016 struct io_kiocb *nxt = NULL;
4017
4018 io_poll_task_handler(req, &nxt);
Jens Axboed7718a92020-02-14 22:23:12 -07004019 if (nxt) {
4020 struct io_ring_ctx *ctx = nxt->ctx;
4021
4022 mutex_lock(&ctx->uring_lock);
Jens Axboeb41e9852020-02-17 09:52:41 -07004023 __io_queue_sqe(nxt, NULL);
Jens Axboed7718a92020-02-14 22:23:12 -07004024 mutex_unlock(&ctx->uring_lock);
4025 }
Jens Axboef0b493e2020-02-01 21:30:11 -07004026}
4027
Jens Axboe221c5eb2019-01-17 09:41:58 -07004028static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4029 void *key)
4030{
Jens Axboec2f2eb72020-02-10 09:07:05 -07004031 struct io_kiocb *req = wait->private;
4032 struct io_poll_iocb *poll = &req->poll;
Jens Axboe221c5eb2019-01-17 09:41:58 -07004033
Jens Axboed7718a92020-02-14 22:23:12 -07004034 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
Jens Axboe221c5eb2019-01-17 09:41:58 -07004035}
4036
Jens Axboe221c5eb2019-01-17 09:41:58 -07004037static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
4038 struct poll_table_struct *p)
4039{
4040 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4041
Jens Axboed7718a92020-02-14 22:23:12 -07004042 __io_queue_proc(&pt->req->poll, pt, head);
Jens Axboeeac406c2019-11-14 12:09:58 -07004043}
4044
Jens Axboe3529d8c2019-12-19 18:24:38 -07004045static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboe221c5eb2019-01-17 09:41:58 -07004046{
4047 struct io_poll_iocb *poll = &req->poll;
Jens Axboe221c5eb2019-01-17 09:41:58 -07004048 u16 events;
Jens Axboe221c5eb2019-01-17 09:41:58 -07004049
4050 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4051 return -EINVAL;
4052 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
4053 return -EINVAL;
Jens Axboe09bb8392019-03-13 12:39:28 -06004054 if (!poll->file)
4055 return -EBADF;
Jens Axboe221c5eb2019-01-17 09:41:58 -07004056
Jens Axboe221c5eb2019-01-17 09:41:58 -07004057 events = READ_ONCE(sqe->poll_events);
4058 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
Jens Axboeb41e9852020-02-17 09:52:41 -07004059
Jens Axboed7718a92020-02-14 22:23:12 -07004060 /*
4061	 * Don't need a reference here, as we're adding it to the task's
4062 * task_works list. If the task exits, the list is pruned.
4063 */
Jens Axboeb41e9852020-02-17 09:52:41 -07004064 req->task = current;
Jens Axboe0969e782019-12-17 18:40:57 -07004065 return 0;
4066}
4067
Pavel Begunkov014db002020-03-03 21:33:12 +03004068static int io_poll_add(struct io_kiocb *req)
Jens Axboe0969e782019-12-17 18:40:57 -07004069{
4070 struct io_poll_iocb *poll = &req->poll;
4071 struct io_ring_ctx *ctx = req->ctx;
4072 struct io_poll_table ipt;
Jens Axboe0969e782019-12-17 18:40:57 -07004073 __poll_t mask;
Jens Axboe0969e782019-12-17 18:40:57 -07004074
Jens Axboe78076bb2019-12-04 19:56:40 -07004075 INIT_HLIST_NODE(&req->hash_node);
Jens Axboe36703242019-07-25 10:20:18 -06004076 INIT_LIST_HEAD(&req->list);
Jens Axboed7718a92020-02-14 22:23:12 -07004077 ipt.pt._qproc = io_poll_queue_proc;
Jens Axboe36703242019-07-25 10:20:18 -06004078
Jens Axboed7718a92020-02-14 22:23:12 -07004079 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
4080 io_poll_wake);
Jens Axboe221c5eb2019-01-17 09:41:58 -07004081
Jens Axboe8c838782019-03-12 15:48:16 -06004082 if (mask) { /* no async, we'd stolen it */
Jens Axboe8c838782019-03-12 15:48:16 -06004083 ipt.error = 0;
Jens Axboeb0dd8a42019-11-18 12:14:54 -07004084 io_poll_complete(req, mask, 0);
Jens Axboe8c838782019-03-12 15:48:16 -06004085 }
Jens Axboe221c5eb2019-01-17 09:41:58 -07004086 spin_unlock_irq(&ctx->completion_lock);
4087
Jens Axboe8c838782019-03-12 15:48:16 -06004088 if (mask) {
4089 io_cqring_ev_posted(ctx);
Pavel Begunkov014db002020-03-03 21:33:12 +03004090 io_put_req(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07004091 }
Jens Axboe8c838782019-03-12 15:48:16 -06004092 return ipt.error;
Jens Axboe221c5eb2019-01-17 09:41:58 -07004093}
4094
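/*
 * hrtimer callback for a timeout request, running in irq context (hence
 * the irqsave locking). If the timeout is still queued, the timeouts ahead
 * of it in ->timeout_list have their sequence bumped to account for the
 * CQE this one posts, and the request completes with -ETIME.
 */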
Jens Axboe5262f562019-09-17 12:26:57 -06004095static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
4096{
Jens Axboead8a48a2019-11-15 08:49:11 -07004097 struct io_timeout_data *data = container_of(timer,
4098 struct io_timeout_data, timer);
4099 struct io_kiocb *req = data->req;
4100 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe5262f562019-09-17 12:26:57 -06004101 unsigned long flags;
4102
Jens Axboe5262f562019-09-17 12:26:57 -06004103 atomic_inc(&ctx->cq_timeouts);
4104
4105 spin_lock_irqsave(&ctx->completion_lock, flags);
zhangyi (F)ef036812019-10-23 15:10:08 +08004106 /*
Jens Axboe11365042019-10-16 09:08:32 -06004107 * We could be racing with timeout deletion. If the list is empty,
4108 * then timeout lookup already found it and will be handling it.
zhangyi (F)ef036812019-10-23 15:10:08 +08004109 */
Jens Axboe842f9612019-10-29 12:34:10 -06004110 if (!list_empty(&req->list)) {
Jens Axboe11365042019-10-16 09:08:32 -06004111 struct io_kiocb *prev;
Jens Axboe5262f562019-09-17 12:26:57 -06004112
Jens Axboe11365042019-10-16 09:08:32 -06004113 /*
 4114	 * Adjust the sequence of requests queued before the current one,
Brian Gianforcarod195a662019-12-13 03:09:50 -08004115	 * because this request will consume a slot in the cq_ring and
Jens Axboe11365042019-10-16 09:08:32 -06004116	 * advance the cq_tail pointer; otherwise other timeout requests
 4117	 * may complete early without waiting for enough wait_nr events.
4118 */
4119 prev = req;
4120 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
4121 prev->sequence++;
Jens Axboe11365042019-10-16 09:08:32 -06004122 list_del_init(&req->list);
Jens Axboe11365042019-10-16 09:08:32 -06004123 }
Jens Axboe842f9612019-10-29 12:34:10 -06004124
Jens Axboe78e19bb2019-11-06 15:21:34 -07004125 io_cqring_fill_event(req, -ETIME);
Jens Axboe5262f562019-09-17 12:26:57 -06004126 io_commit_cqring(ctx);
4127 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4128
4129 io_cqring_ev_posted(ctx);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07004130 req_set_fail_links(req);
Jens Axboe5262f562019-09-17 12:26:57 -06004131 io_put_req(req);
4132 return HRTIMER_NORESTART;
4133}
4134
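/*
 * Find a queued timeout by user_data and try to cancel it: -ENOENT if no
 * match is queued, -EALREADY if the hrtimer already fired, 0 (and a
 * -ECANCELED completion) on success.
 */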
Jens Axboe47f46762019-11-09 17:43:02 -07004135static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
4136{
4137 struct io_kiocb *req;
4138 int ret = -ENOENT;
4139
4140 list_for_each_entry(req, &ctx->timeout_list, list) {
4141 if (user_data == req->user_data) {
4142 list_del_init(&req->list);
4143 ret = 0;
4144 break;
4145 }
4146 }
4147
4148 if (ret == -ENOENT)
4149 return ret;
4150
Jens Axboe2d283902019-12-04 11:08:05 -07004151 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
Jens Axboe47f46762019-11-09 17:43:02 -07004152 if (ret == -1)
4153 return -EALREADY;
4154
Jens Axboe4e88d6e2019-12-07 20:59:47 -07004155 req_set_fail_links(req);
Jens Axboe47f46762019-11-09 17:43:02 -07004156 io_cqring_fill_event(req, -ECANCELED);
4157 io_put_req(req);
4158 return 0;
4159}
4160
Jens Axboe3529d8c2019-12-19 18:24:38 -07004161static int io_timeout_remove_prep(struct io_kiocb *req,
4162 const struct io_uring_sqe *sqe)
Jens Axboeb29472e2019-12-17 18:50:29 -07004163{
Jens Axboeb29472e2019-12-17 18:50:29 -07004164 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4165 return -EINVAL;
4166 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
4167 return -EINVAL;
4168
4169 req->timeout.addr = READ_ONCE(sqe->addr);
4170 req->timeout.flags = READ_ONCE(sqe->timeout_flags);
4171 if (req->timeout.flags)
4172 return -EINVAL;
4173
Jens Axboeb29472e2019-12-17 18:50:29 -07004174 return 0;
4175}
4176
Jens Axboe11365042019-10-16 09:08:32 -06004177/*
4178 * Remove or update an existing timeout command
4179 */
Jens Axboefc4df992019-12-10 14:38:45 -07004180static int io_timeout_remove(struct io_kiocb *req)
Jens Axboe11365042019-10-16 09:08:32 -06004181{
4182 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe47f46762019-11-09 17:43:02 -07004183 int ret;
Jens Axboe11365042019-10-16 09:08:32 -06004184
Jens Axboe11365042019-10-16 09:08:32 -06004185 spin_lock_irq(&ctx->completion_lock);
Jens Axboeb29472e2019-12-17 18:50:29 -07004186 ret = io_timeout_cancel(ctx, req->timeout.addr);
Jens Axboe11365042019-10-16 09:08:32 -06004187
Jens Axboe47f46762019-11-09 17:43:02 -07004188 io_cqring_fill_event(req, ret);
Jens Axboe11365042019-10-16 09:08:32 -06004189 io_commit_cqring(ctx);
4190 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe5262f562019-09-17 12:26:57 -06004191 io_cqring_ev_posted(ctx);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07004192 if (ret < 0)
4193 req_set_fail_links(req);
Jackie Liuec9c02a2019-11-08 23:50:36 +08004194 io_put_req(req);
Jens Axboe11365042019-10-16 09:08:32 -06004195 return 0;
Jens Axboe5262f562019-09-17 12:26:57 -06004196}
4197
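/*
 * Prepare a timeout or linked timeout: copy the timespec from user memory
 * into the async context and pick absolute vs. relative hrtimer mode based
 * on IORING_TIMEOUT_ABS.
 */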
Jens Axboe3529d8c2019-12-19 18:24:38 -07004198static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
Jens Axboe2d283902019-12-04 11:08:05 -07004199 bool is_timeout_link)
Jens Axboe5262f562019-09-17 12:26:57 -06004200{
Jens Axboead8a48a2019-11-15 08:49:11 -07004201 struct io_timeout_data *data;
Jens Axboea41525a2019-10-15 16:48:15 -06004202 unsigned flags;
Jens Axboe5262f562019-09-17 12:26:57 -06004203
Jens Axboead8a48a2019-11-15 08:49:11 -07004204 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboe5262f562019-09-17 12:26:57 -06004205 return -EINVAL;
Jens Axboead8a48a2019-11-15 08:49:11 -07004206 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
Jens Axboea41525a2019-10-15 16:48:15 -06004207 return -EINVAL;
Jens Axboe2d283902019-12-04 11:08:05 -07004208 if (sqe->off && is_timeout_link)
4209 return -EINVAL;
Jens Axboea41525a2019-10-15 16:48:15 -06004210 flags = READ_ONCE(sqe->timeout_flags);
4211 if (flags & ~IORING_TIMEOUT_ABS)
Jens Axboe5262f562019-09-17 12:26:57 -06004212 return -EINVAL;
Arnd Bergmannbdf20072019-10-01 09:53:29 -06004213
Jens Axboe26a61672019-12-20 09:02:01 -07004214 req->timeout.count = READ_ONCE(sqe->off);
4215
Jens Axboe3529d8c2019-12-19 18:24:38 -07004216 if (!req->io && io_alloc_async_ctx(req))
Jens Axboe26a61672019-12-20 09:02:01 -07004217 return -ENOMEM;
4218
4219 data = &req->io->timeout;
Jens Axboead8a48a2019-11-15 08:49:11 -07004220 data->req = req;
Jens Axboead8a48a2019-11-15 08:49:11 -07004221 req->flags |= REQ_F_TIMEOUT;
4222
4223 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
Jens Axboe5262f562019-09-17 12:26:57 -06004224 return -EFAULT;
4225
Jens Axboe11365042019-10-16 09:08:32 -06004226 if (flags & IORING_TIMEOUT_ABS)
Jens Axboead8a48a2019-11-15 08:49:11 -07004227 data->mode = HRTIMER_MODE_ABS;
Jens Axboe11365042019-10-16 09:08:32 -06004228 else
Jens Axboead8a48a2019-11-15 08:49:11 -07004229 data->mode = HRTIMER_MODE_REL;
Jens Axboe11365042019-10-16 09:08:32 -06004230
Jens Axboead8a48a2019-11-15 08:49:11 -07004231 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
4232 return 0;
4233}
4234
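/*
 * Queue a timeout. A pure timeout (off == 0) is flagged REQ_F_TIMEOUT_NOSEQ
 * and appended; otherwise the request is insertion-sorted by target
 * sequence. Comparisons are done in 64 bits so a wrapped cached_sq_head
 * still orders correctly: e.g. a timeout armed at head 2 after a wrap gets
 * UINT_MAX added to its key and sorts after one armed at 0xfffffffe.
 */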
Jens Axboefc4df992019-12-10 14:38:45 -07004235static int io_timeout(struct io_kiocb *req)
Jens Axboead8a48a2019-11-15 08:49:11 -07004236{
4237 unsigned count;
4238 struct io_ring_ctx *ctx = req->ctx;
4239 struct io_timeout_data *data;
4240 struct list_head *entry;
4241 unsigned span = 0;
Jens Axboead8a48a2019-11-15 08:49:11 -07004242
Jens Axboe2d283902019-12-04 11:08:05 -07004243 data = &req->io->timeout;
Jens Axboe93bd25b2019-11-11 23:34:31 -07004244
Jens Axboe5262f562019-09-17 12:26:57 -06004245 /*
 4246	 * sqe->off holds how many events need to occur for this
Jens Axboe93bd25b2019-11-11 23:34:31 -07004247	 * timeout event to be satisfied. If it isn't set, then this is
 4248	 * a pure timeout request and the sequence isn't used.
Jens Axboe5262f562019-09-17 12:26:57 -06004249 */
Jens Axboe26a61672019-12-20 09:02:01 -07004250 count = req->timeout.count;
Jens Axboe93bd25b2019-11-11 23:34:31 -07004251 if (!count) {
4252 req->flags |= REQ_F_TIMEOUT_NOSEQ;
4253 spin_lock_irq(&ctx->completion_lock);
4254 entry = ctx->timeout_list.prev;
4255 goto add;
4256 }
Jens Axboe5262f562019-09-17 12:26:57 -06004257
4258 req->sequence = ctx->cached_sq_head + count - 1;
Jens Axboe2d283902019-12-04 11:08:05 -07004259 data->seq_offset = count;
Jens Axboe5262f562019-09-17 12:26:57 -06004260
4261 /*
4262 * Insertion sort, ensuring the first entry in the list is always
4263 * the one we need first.
4264 */
Jens Axboe5262f562019-09-17 12:26:57 -06004265 spin_lock_irq(&ctx->completion_lock);
4266 list_for_each_prev(entry, &ctx->timeout_list) {
4267 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
yangerkun5da0fb12019-10-15 21:59:29 +08004268 unsigned nxt_sq_head;
4269 long long tmp, tmp_nxt;
Jens Axboe2d283902019-12-04 11:08:05 -07004270 u32 nxt_offset = nxt->io->timeout.seq_offset;
Jens Axboe5262f562019-09-17 12:26:57 -06004271
Jens Axboe93bd25b2019-11-11 23:34:31 -07004272 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
4273 continue;
4274
yangerkun5da0fb12019-10-15 21:59:29 +08004275 /*
4276 * Since cached_sq_head + count - 1 can overflow, use type long
4277 * long to store it.
4278 */
4279 tmp = (long long)ctx->cached_sq_head + count - 1;
Pavel Begunkovcc42e0a2019-11-25 23:14:38 +03004280 nxt_sq_head = nxt->sequence - nxt_offset + 1;
4281 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
yangerkun5da0fb12019-10-15 21:59:29 +08004282
4283 /*
 4284	 * cached_sq_head may overflow, but it can never overflow twice
 4285	 * while any timeout request is still valid.
4286 */
4287 if (ctx->cached_sq_head < nxt_sq_head)
yangerkun8b07a652019-10-17 12:12:35 +08004288 tmp += UINT_MAX;
yangerkun5da0fb12019-10-15 21:59:29 +08004289
zhangyi (F)a1f58ba2019-10-23 15:10:09 +08004290 if (tmp > tmp_nxt)
Jens Axboe5262f562019-09-17 12:26:57 -06004291 break;
zhangyi (F)a1f58ba2019-10-23 15:10:09 +08004292
4293 /*
 4294	 * Requests queued after the inserted one, and the inserted one
 4295	 * itself, need their sequence adjusted: each timeout consumes a slot.
4296 */
4297 span++;
4298 nxt->sequence++;
Jens Axboe5262f562019-09-17 12:26:57 -06004299 }
zhangyi (F)a1f58ba2019-10-23 15:10:09 +08004300 req->sequence -= span;
Jens Axboe93bd25b2019-11-11 23:34:31 -07004301add:
Jens Axboe5262f562019-09-17 12:26:57 -06004302 list_add(&req->list, entry);
Jens Axboead8a48a2019-11-15 08:49:11 -07004303 data->timer.function = io_timeout_fn;
4304 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
Jens Axboe842f9612019-10-29 12:34:10 -06004305 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe5262f562019-09-17 12:26:57 -06004306 return 0;
4307}
4308
Jens Axboe62755e32019-10-28 21:49:21 -06004309static bool io_cancel_cb(struct io_wq_work *work, void *data)
Jens Axboede0617e2019-04-06 21:51:27 -06004310{
Jens Axboe62755e32019-10-28 21:49:21 -06004311 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
Jens Axboede0617e2019-04-06 21:51:27 -06004312
Jens Axboe62755e32019-10-28 21:49:21 -06004313 return req->user_data == (unsigned long) data;
4314}
4315
Jens Axboee977d6d2019-11-05 12:39:45 -07004316static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
Jens Axboe62755e32019-10-28 21:49:21 -06004317{
Jens Axboe62755e32019-10-28 21:49:21 -06004318 enum io_wq_cancel cancel_ret;
Jens Axboe62755e32019-10-28 21:49:21 -06004319 int ret = 0;
4320
Jens Axboe62755e32019-10-28 21:49:21 -06004321 cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
4322 switch (cancel_ret) {
4323 case IO_WQ_CANCEL_OK:
4324 ret = 0;
4325 break;
4326 case IO_WQ_CANCEL_RUNNING:
4327 ret = -EALREADY;
4328 break;
4329 case IO_WQ_CANCEL_NOTFOUND:
4330 ret = -ENOENT;
4331 break;
4332 }
4333
Jens Axboee977d6d2019-11-05 12:39:45 -07004334 return ret;
4335}
4336
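/*
 * Cancel by user_data, trying the io-wq queue first, then the timeout
 * list, then the poll hash. The outcome (or success_ret on success) is
 * posted as the completion of the cancel request itself.
 */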
Jens Axboe47f46762019-11-09 17:43:02 -07004337static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
4338 struct io_kiocb *req, __u64 sqe_addr,
Pavel Begunkov014db002020-03-03 21:33:12 +03004339 int success_ret)
Jens Axboe47f46762019-11-09 17:43:02 -07004340{
4341 unsigned long flags;
4342 int ret;
4343
4344 ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
4345 if (ret != -ENOENT) {
4346 spin_lock_irqsave(&ctx->completion_lock, flags);
4347 goto done;
4348 }
4349
4350 spin_lock_irqsave(&ctx->completion_lock, flags);
4351 ret = io_timeout_cancel(ctx, sqe_addr);
4352 if (ret != -ENOENT)
4353 goto done;
4354 ret = io_poll_cancel(ctx, sqe_addr);
4355done:
Jens Axboeb0dd8a42019-11-18 12:14:54 -07004356 if (!ret)
4357 ret = success_ret;
Jens Axboe47f46762019-11-09 17:43:02 -07004358 io_cqring_fill_event(req, ret);
4359 io_commit_cqring(ctx);
4360 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4361 io_cqring_ev_posted(ctx);
4362
Jens Axboe4e88d6e2019-12-07 20:59:47 -07004363 if (ret < 0)
4364 req_set_fail_links(req);
Pavel Begunkov014db002020-03-03 21:33:12 +03004365 io_put_req(req);
Jens Axboe47f46762019-11-09 17:43:02 -07004366}
4367
Jens Axboe3529d8c2019-12-19 18:24:38 -07004368static int io_async_cancel_prep(struct io_kiocb *req,
4369 const struct io_uring_sqe *sqe)
Jens Axboee977d6d2019-11-05 12:39:45 -07004370{
Jens Axboefbf23842019-12-17 18:45:56 -07004371 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboee977d6d2019-11-05 12:39:45 -07004372 return -EINVAL;
4373 if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
4374 sqe->cancel_flags)
4375 return -EINVAL;
4376
Jens Axboefbf23842019-12-17 18:45:56 -07004377 req->cancel.addr = READ_ONCE(sqe->addr);
4378 return 0;
4379}
4380
Pavel Begunkov014db002020-03-03 21:33:12 +03004381static int io_async_cancel(struct io_kiocb *req)
Jens Axboefbf23842019-12-17 18:45:56 -07004382{
4383 struct io_ring_ctx *ctx = req->ctx;
Jens Axboefbf23842019-12-17 18:45:56 -07004384
Pavel Begunkov014db002020-03-03 21:33:12 +03004385 io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
Jens Axboe62755e32019-10-28 21:49:21 -06004386 return 0;
4387}
4388
Jens Axboe05f3fb32019-12-09 11:22:50 -07004389static int io_files_update_prep(struct io_kiocb *req,
4390 const struct io_uring_sqe *sqe)
4391{
4392 if (sqe->flags || sqe->ioprio || sqe->rw_flags)
4393 return -EINVAL;
4394
4395 req->files_update.offset = READ_ONCE(sqe->off);
4396 req->files_update.nr_args = READ_ONCE(sqe->len);
4397 if (!req->files_update.nr_args)
4398 return -EINVAL;
4399 req->files_update.arg = READ_ONCE(sqe->addr);
4400 return 0;
4401}
4402
4403static int io_files_update(struct io_kiocb *req, bool force_nonblock)
4404{
4405 struct io_ring_ctx *ctx = req->ctx;
4406 struct io_uring_files_update up;
4407 int ret;
4408
Jens Axboef86cd202020-01-29 13:46:44 -07004409 if (force_nonblock)
Jens Axboe05f3fb32019-12-09 11:22:50 -07004410 return -EAGAIN;
Jens Axboe05f3fb32019-12-09 11:22:50 -07004411
4412 up.offset = req->files_update.offset;
4413 up.fds = req->files_update.arg;
4414
4415 mutex_lock(&ctx->uring_lock);
4416 ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
4417 mutex_unlock(&ctx->uring_lock);
4418
4419 if (ret < 0)
4420 req_set_fail_links(req);
4421 io_cqring_add_event(req, ret);
4422 io_put_req(req);
4423 return 0;
4424}
4425
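/*
 * Prepare a request for out-of-line execution: grab the file table if the
 * opcode needs it, take the work environment references, and run the
 * opcode-specific prep so everything needed from the sqe is copied into
 * the request.
 */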
Jens Axboe3529d8c2019-12-19 18:24:38 -07004426static int io_req_defer_prep(struct io_kiocb *req,
4427 const struct io_uring_sqe *sqe)
Jens Axboef67676d2019-12-02 11:03:47 -07004428{
Jens Axboee7815732019-12-17 19:45:06 -07004429 ssize_t ret = 0;
Jens Axboef67676d2019-12-02 11:03:47 -07004430
Jens Axboef86cd202020-01-29 13:46:44 -07004431 if (io_op_defs[req->opcode].file_table) {
4432 ret = io_grab_files(req);
4433 if (unlikely(ret))
4434 return ret;
4435 }
4436
Jens Axboecccf0ee2020-01-27 16:34:48 -07004437 io_req_work_grab_env(req, &io_op_defs[req->opcode]);
4438
Jens Axboed625c6e2019-12-17 19:53:05 -07004439 switch (req->opcode) {
Jens Axboee7815732019-12-17 19:45:06 -07004440 case IORING_OP_NOP:
4441 break;
Jens Axboef67676d2019-12-02 11:03:47 -07004442 case IORING_OP_READV:
4443 case IORING_OP_READ_FIXED:
Jens Axboe3a6820f2019-12-22 15:19:35 -07004444 case IORING_OP_READ:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004445 ret = io_read_prep(req, sqe, true);
Jens Axboef67676d2019-12-02 11:03:47 -07004446 break;
4447 case IORING_OP_WRITEV:
4448 case IORING_OP_WRITE_FIXED:
Jens Axboe3a6820f2019-12-22 15:19:35 -07004449 case IORING_OP_WRITE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004450 ret = io_write_prep(req, sqe, true);
Jens Axboef67676d2019-12-02 11:03:47 -07004451 break;
Jens Axboe0969e782019-12-17 18:40:57 -07004452 case IORING_OP_POLL_ADD:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004453 ret = io_poll_add_prep(req, sqe);
Jens Axboe0969e782019-12-17 18:40:57 -07004454 break;
4455 case IORING_OP_POLL_REMOVE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004456 ret = io_poll_remove_prep(req, sqe);
Jens Axboe0969e782019-12-17 18:40:57 -07004457 break;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004458 case IORING_OP_FSYNC:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004459 ret = io_prep_fsync(req, sqe);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004460 break;
4461 case IORING_OP_SYNC_FILE_RANGE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004462 ret = io_prep_sfr(req, sqe);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004463 break;
Jens Axboe03b12302019-12-02 18:50:25 -07004464 case IORING_OP_SENDMSG:
Jens Axboefddafac2020-01-04 20:19:44 -07004465 case IORING_OP_SEND:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004466 ret = io_sendmsg_prep(req, sqe);
Jens Axboe03b12302019-12-02 18:50:25 -07004467 break;
4468 case IORING_OP_RECVMSG:
Jens Axboefddafac2020-01-04 20:19:44 -07004469 case IORING_OP_RECV:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004470 ret = io_recvmsg_prep(req, sqe);
Jens Axboe03b12302019-12-02 18:50:25 -07004471 break;
Jens Axboef499a022019-12-02 16:28:46 -07004472 case IORING_OP_CONNECT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004473 ret = io_connect_prep(req, sqe);
Jens Axboef499a022019-12-02 16:28:46 -07004474 break;
Jens Axboe2d283902019-12-04 11:08:05 -07004475 case IORING_OP_TIMEOUT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004476 ret = io_timeout_prep(req, sqe, false);
Jens Axboeb7bb4f72019-12-15 22:13:43 -07004477 break;
Jens Axboeb29472e2019-12-17 18:50:29 -07004478 case IORING_OP_TIMEOUT_REMOVE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004479 ret = io_timeout_remove_prep(req, sqe);
Jens Axboeb29472e2019-12-17 18:50:29 -07004480 break;
Jens Axboefbf23842019-12-17 18:45:56 -07004481 case IORING_OP_ASYNC_CANCEL:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004482 ret = io_async_cancel_prep(req, sqe);
Jens Axboefbf23842019-12-17 18:45:56 -07004483 break;
Jens Axboe2d283902019-12-04 11:08:05 -07004484 case IORING_OP_LINK_TIMEOUT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004485 ret = io_timeout_prep(req, sqe, true);
Jens Axboeb7bb4f72019-12-15 22:13:43 -07004486 break;
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004487 case IORING_OP_ACCEPT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004488 ret = io_accept_prep(req, sqe);
Jens Axboe8ed8d3c2019-12-16 11:55:28 -07004489 break;
Jens Axboed63d1b52019-12-10 10:38:56 -07004490 case IORING_OP_FALLOCATE:
4491 ret = io_fallocate_prep(req, sqe);
4492 break;
Jens Axboe15b71ab2019-12-11 11:20:36 -07004493 case IORING_OP_OPENAT:
4494 ret = io_openat_prep(req, sqe);
4495 break;
Jens Axboeb5dba592019-12-11 14:02:38 -07004496 case IORING_OP_CLOSE:
4497 ret = io_close_prep(req, sqe);
4498 break;
Jens Axboe05f3fb32019-12-09 11:22:50 -07004499 case IORING_OP_FILES_UPDATE:
4500 ret = io_files_update_prep(req, sqe);
4501 break;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07004502 case IORING_OP_STATX:
4503 ret = io_statx_prep(req, sqe);
4504 break;
Jens Axboe4840e412019-12-25 22:03:45 -07004505 case IORING_OP_FADVISE:
4506 ret = io_fadvise_prep(req, sqe);
4507 break;
Jens Axboec1ca7572019-12-25 22:18:28 -07004508 case IORING_OP_MADVISE:
4509 ret = io_madvise_prep(req, sqe);
4510 break;
Jens Axboecebdb982020-01-08 17:59:24 -07004511 case IORING_OP_OPENAT2:
4512 ret = io_openat2_prep(req, sqe);
4513 break;
Jens Axboe3e4827b2020-01-08 15:18:09 -07004514 case IORING_OP_EPOLL_CTL:
4515 ret = io_epoll_ctl_prep(req, sqe);
4516 break;
Pavel Begunkov7d67af22020-02-24 11:32:45 +03004517 case IORING_OP_SPLICE:
4518 ret = io_splice_prep(req, sqe);
4519 break;
Jens Axboeddf0322d2020-02-23 16:41:33 -07004520 case IORING_OP_PROVIDE_BUFFERS:
4521 ret = io_provide_buffers_prep(req, sqe);
4522 break;
Jens Axboef67676d2019-12-02 11:03:47 -07004523 default:
Jens Axboee7815732019-12-17 19:45:06 -07004524 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
4525 req->opcode);
4526 ret = -EINVAL;
Jens Axboeb7bb4f72019-12-15 22:13:43 -07004527 break;
Jens Axboef67676d2019-12-02 11:03:47 -07004528 }
4529
Jens Axboeb7bb4f72019-12-15 22:13:43 -07004530 return ret;
Jens Axboef67676d2019-12-02 11:03:47 -07004531}
4532
Jens Axboe3529d8c2019-12-19 18:24:38 -07004533static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboede0617e2019-04-06 21:51:27 -06004534{
Jackie Liua197f662019-11-08 08:09:12 -07004535 struct io_ring_ctx *ctx = req->ctx;
Jens Axboef67676d2019-12-02 11:03:47 -07004536 int ret;
Jens Axboede0617e2019-04-06 21:51:27 -06004537
Bob Liu9d858b22019-11-13 18:06:25 +08004538 /* Still need defer if there is pending req in defer list. */
4539 if (!req_need_defer(req) && list_empty(&ctx->defer_list))
Jens Axboede0617e2019-04-06 21:51:27 -06004540 return 0;
4541
Jens Axboe3529d8c2019-12-19 18:24:38 -07004542 if (!req->io && io_alloc_async_ctx(req))
Jens Axboede0617e2019-04-06 21:51:27 -06004543 return -EAGAIN;
4544
Jens Axboe3529d8c2019-12-19 18:24:38 -07004545 ret = io_req_defer_prep(req, sqe);
Jens Axboeb7bb4f72019-12-15 22:13:43 -07004546 if (ret < 0)
Jens Axboe2d283902019-12-04 11:08:05 -07004547 return ret;
Jens Axboe2d283902019-12-04 11:08:05 -07004548
Jens Axboede0617e2019-04-06 21:51:27 -06004549 spin_lock_irq(&ctx->completion_lock);
Bob Liu9d858b22019-11-13 18:06:25 +08004550 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
Jens Axboede0617e2019-04-06 21:51:27 -06004551 spin_unlock_irq(&ctx->completion_lock);
Jens Axboede0617e2019-04-06 21:51:27 -06004552 return 0;
4553 }
4554
Jens Axboe915967f2019-11-21 09:01:20 -07004555 trace_io_uring_defer(ctx, req, req->user_data);
Jens Axboede0617e2019-04-06 21:51:27 -06004556 list_add_tail(&req->list, &ctx->defer_list);
4557 spin_unlock_irq(&ctx->completion_lock);
4558 return -EIOCBQUEUED;
4559}
4560
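/*
 * Free per-opcode resources allocated at prep time (iovecs, open
 * filenames, splice file references) and clear REQ_F_NEED_CLEANUP.
 */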
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03004561static void io_cleanup_req(struct io_kiocb *req)
4562{
4563 struct io_async_ctx *io = req->io;
4564
4565 switch (req->opcode) {
4566 case IORING_OP_READV:
4567 case IORING_OP_READ_FIXED:
4568 case IORING_OP_READ:
4569 case IORING_OP_WRITEV:
4570 case IORING_OP_WRITE_FIXED:
4571 case IORING_OP_WRITE:
4572 if (io->rw.iov != io->rw.fast_iov)
4573 kfree(io->rw.iov);
4574 break;
4575 case IORING_OP_SENDMSG:
4576 case IORING_OP_RECVMSG:
4577 if (io->msg.iov != io->msg.fast_iov)
4578 kfree(io->msg.iov);
4579 break;
Pavel Begunkov8fef80b2020-02-07 23:59:53 +03004580 case IORING_OP_OPENAT:
4581 case IORING_OP_OPENAT2:
4582 case IORING_OP_STATX:
4583 putname(req->open.filename);
4584 break;
Pavel Begunkov7d67af22020-02-24 11:32:45 +03004585 case IORING_OP_SPLICE:
4586 io_put_file(req, req->splice.file_in,
4587 (req->splice.flags & SPLICE_F_FD_IN_FIXED));
4588 break;
Pavel Begunkov99bc4c32020-02-07 22:04:45 +03004589 }
4590
4591 req->flags &= ~REQ_F_NEED_CLEANUP;
4592}
4593
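/*
 * Central issue switch. With a non-NULL sqe the opcode-specific prep runs
 * inline first; io-wq workers pass sqe == NULL since prep already ran at
 * queue/defer time. For IOPOLL rings the issued request is added to the
 * poll list, taking ->uring_lock when called from a workqueue context.
 */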
Jens Axboe3529d8c2019-12-19 18:24:38 -07004594static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
Pavel Begunkov014db002020-03-03 21:33:12 +03004595 bool force_nonblock)
Jens Axboe2b188cc2019-01-07 10:46:33 -07004596{
Jackie Liua197f662019-11-08 08:09:12 -07004597 struct io_ring_ctx *ctx = req->ctx;
Jens Axboed625c6e2019-12-17 19:53:05 -07004598 int ret;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004599
Jens Axboed625c6e2019-12-17 19:53:05 -07004600 switch (req->opcode) {
Jens Axboe2b188cc2019-01-07 10:46:33 -07004601 case IORING_OP_NOP:
Jens Axboe78e19bb2019-11-06 15:21:34 -07004602 ret = io_nop(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004603 break;
4604 case IORING_OP_READV:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004605 case IORING_OP_READ_FIXED:
Jens Axboe3a6820f2019-12-22 15:19:35 -07004606 case IORING_OP_READ:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004607 if (sqe) {
4608 ret = io_read_prep(req, sqe, force_nonblock);
4609 if (ret < 0)
4610 break;
4611 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004612 ret = io_read(req, force_nonblock);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004613 break;
4614 case IORING_OP_WRITEV:
Jens Axboeedafcce2019-01-09 09:16:05 -07004615 case IORING_OP_WRITE_FIXED:
Jens Axboe3a6820f2019-12-22 15:19:35 -07004616 case IORING_OP_WRITE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004617 if (sqe) {
4618 ret = io_write_prep(req, sqe, force_nonblock);
4619 if (ret < 0)
4620 break;
4621 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004622 ret = io_write(req, force_nonblock);
Jens Axboe2b188cc2019-01-07 10:46:33 -07004623 break;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07004624 case IORING_OP_FSYNC:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004625 if (sqe) {
4626 ret = io_prep_fsync(req, sqe);
4627 if (ret < 0)
4628 break;
4629 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004630 ret = io_fsync(req, force_nonblock);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07004631 break;
Jens Axboe221c5eb2019-01-17 09:41:58 -07004632 case IORING_OP_POLL_ADD:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004633 if (sqe) {
4634 ret = io_poll_add_prep(req, sqe);
4635 if (ret)
4636 break;
4637 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004638 ret = io_poll_add(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07004639 break;
4640 case IORING_OP_POLL_REMOVE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004641 if (sqe) {
4642 ret = io_poll_remove_prep(req, sqe);
4643 if (ret < 0)
4644 break;
4645 }
Jens Axboefc4df992019-12-10 14:38:45 -07004646 ret = io_poll_remove(req);
Jens Axboe221c5eb2019-01-17 09:41:58 -07004647 break;
Jens Axboe5d17b4a2019-04-09 14:56:44 -06004648 case IORING_OP_SYNC_FILE_RANGE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004649 if (sqe) {
4650 ret = io_prep_sfr(req, sqe);
4651 if (ret < 0)
4652 break;
4653 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004654 ret = io_sync_file_range(req, force_nonblock);
Jens Axboe5d17b4a2019-04-09 14:56:44 -06004655 break;
Jens Axboe0fa03c62019-04-19 13:34:07 -06004656 case IORING_OP_SENDMSG:
Jens Axboefddafac2020-01-04 20:19:44 -07004657 case IORING_OP_SEND:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004658 if (sqe) {
4659 ret = io_sendmsg_prep(req, sqe);
4660 if (ret < 0)
4661 break;
4662 }
Jens Axboefddafac2020-01-04 20:19:44 -07004663 if (req->opcode == IORING_OP_SENDMSG)
Pavel Begunkov014db002020-03-03 21:33:12 +03004664 ret = io_sendmsg(req, force_nonblock);
Jens Axboefddafac2020-01-04 20:19:44 -07004665 else
Pavel Begunkov014db002020-03-03 21:33:12 +03004666 ret = io_send(req, force_nonblock);
Jens Axboe0fa03c62019-04-19 13:34:07 -06004667 break;
Jens Axboeaa1fa282019-04-19 13:38:09 -06004668 case IORING_OP_RECVMSG:
Jens Axboefddafac2020-01-04 20:19:44 -07004669 case IORING_OP_RECV:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004670 if (sqe) {
4671 ret = io_recvmsg_prep(req, sqe);
4672 if (ret)
4673 break;
4674 }
Jens Axboefddafac2020-01-04 20:19:44 -07004675 if (req->opcode == IORING_OP_RECVMSG)
Pavel Begunkov014db002020-03-03 21:33:12 +03004676 ret = io_recvmsg(req, force_nonblock);
Jens Axboefddafac2020-01-04 20:19:44 -07004677 else
Pavel Begunkov014db002020-03-03 21:33:12 +03004678 ret = io_recv(req, force_nonblock);
Jens Axboeaa1fa282019-04-19 13:38:09 -06004679 break;
Jens Axboe5262f562019-09-17 12:26:57 -06004680 case IORING_OP_TIMEOUT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004681 if (sqe) {
4682 ret = io_timeout_prep(req, sqe, false);
4683 if (ret)
4684 break;
4685 }
Jens Axboefc4df992019-12-10 14:38:45 -07004686 ret = io_timeout(req);
Jens Axboe5262f562019-09-17 12:26:57 -06004687 break;
Jens Axboe11365042019-10-16 09:08:32 -06004688 case IORING_OP_TIMEOUT_REMOVE:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004689 if (sqe) {
4690 ret = io_timeout_remove_prep(req, sqe);
4691 if (ret)
4692 break;
4693 }
Jens Axboefc4df992019-12-10 14:38:45 -07004694 ret = io_timeout_remove(req);
Jens Axboe11365042019-10-16 09:08:32 -06004695 break;
Jens Axboe17f2fe32019-10-17 14:42:58 -06004696 case IORING_OP_ACCEPT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004697 if (sqe) {
4698 ret = io_accept_prep(req, sqe);
4699 if (ret)
4700 break;
4701 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004702 ret = io_accept(req, force_nonblock);
Jens Axboe17f2fe32019-10-17 14:42:58 -06004703 break;
Jens Axboef8e85cf2019-11-23 14:24:24 -07004704 case IORING_OP_CONNECT:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004705 if (sqe) {
4706 ret = io_connect_prep(req, sqe);
4707 if (ret)
4708 break;
4709 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004710 ret = io_connect(req, force_nonblock);
Jens Axboef8e85cf2019-11-23 14:24:24 -07004711 break;
Jens Axboe62755e32019-10-28 21:49:21 -06004712 case IORING_OP_ASYNC_CANCEL:
Jens Axboe3529d8c2019-12-19 18:24:38 -07004713 if (sqe) {
4714 ret = io_async_cancel_prep(req, sqe);
4715 if (ret)
4716 break;
4717 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004718 ret = io_async_cancel(req);
Jens Axboe62755e32019-10-28 21:49:21 -06004719 break;
Jens Axboed63d1b52019-12-10 10:38:56 -07004720 case IORING_OP_FALLOCATE:
4721 if (sqe) {
4722 ret = io_fallocate_prep(req, sqe);
4723 if (ret)
4724 break;
4725 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004726 ret = io_fallocate(req, force_nonblock);
Jens Axboed63d1b52019-12-10 10:38:56 -07004727 break;
Jens Axboe15b71ab2019-12-11 11:20:36 -07004728 case IORING_OP_OPENAT:
4729 if (sqe) {
4730 ret = io_openat_prep(req, sqe);
4731 if (ret)
4732 break;
4733 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004734 ret = io_openat(req, force_nonblock);
Jens Axboe15b71ab2019-12-11 11:20:36 -07004735 break;
Jens Axboeb5dba592019-12-11 14:02:38 -07004736 case IORING_OP_CLOSE:
4737 if (sqe) {
4738 ret = io_close_prep(req, sqe);
4739 if (ret)
4740 break;
4741 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004742 ret = io_close(req, force_nonblock);
Jens Axboeb5dba592019-12-11 14:02:38 -07004743 break;
Jens Axboe05f3fb32019-12-09 11:22:50 -07004744 case IORING_OP_FILES_UPDATE:
4745 if (sqe) {
4746 ret = io_files_update_prep(req, sqe);
4747 if (ret)
4748 break;
4749 }
4750 ret = io_files_update(req, force_nonblock);
4751 break;
Jens Axboeeddc7ef2019-12-13 21:18:10 -07004752 case IORING_OP_STATX:
4753 if (sqe) {
4754 ret = io_statx_prep(req, sqe);
4755 if (ret)
4756 break;
4757 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004758 ret = io_statx(req, force_nonblock);
Jens Axboeeddc7ef2019-12-13 21:18:10 -07004759 break;
Jens Axboe4840e412019-12-25 22:03:45 -07004760 case IORING_OP_FADVISE:
4761 if (sqe) {
4762 ret = io_fadvise_prep(req, sqe);
4763 if (ret)
4764 break;
4765 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004766 ret = io_fadvise(req, force_nonblock);
Jens Axboe4840e412019-12-25 22:03:45 -07004767 break;
Jens Axboec1ca7572019-12-25 22:18:28 -07004768 case IORING_OP_MADVISE:
4769 if (sqe) {
4770 ret = io_madvise_prep(req, sqe);
4771 if (ret)
4772 break;
4773 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004774 ret = io_madvise(req, force_nonblock);
Jens Axboec1ca7572019-12-25 22:18:28 -07004775 break;
Jens Axboecebdb982020-01-08 17:59:24 -07004776 case IORING_OP_OPENAT2:
4777 if (sqe) {
4778 ret = io_openat2_prep(req, sqe);
4779 if (ret)
4780 break;
4781 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004782 ret = io_openat2(req, force_nonblock);
Jens Axboecebdb982020-01-08 17:59:24 -07004783 break;
Jens Axboe3e4827b2020-01-08 15:18:09 -07004784 case IORING_OP_EPOLL_CTL:
4785 if (sqe) {
4786 ret = io_epoll_ctl_prep(req, sqe);
4787 if (ret)
4788 break;
4789 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004790 ret = io_epoll_ctl(req, force_nonblock);
Jens Axboe3e4827b2020-01-08 15:18:09 -07004791 break;
Pavel Begunkov7d67af22020-02-24 11:32:45 +03004792 case IORING_OP_SPLICE:
4793 if (sqe) {
4794 ret = io_splice_prep(req, sqe);
4795 if (ret < 0)
4796 break;
4797 }
Pavel Begunkov014db002020-03-03 21:33:12 +03004798 ret = io_splice(req, force_nonblock);
Pavel Begunkov7d67af22020-02-24 11:32:45 +03004799 break;
Jens Axboeddf0322d2020-02-23 16:41:33 -07004800 case IORING_OP_PROVIDE_BUFFERS:
4801 if (sqe) {
4802 ret = io_provide_buffers_prep(req, sqe);
4803 if (ret)
4804 break;
4805 }
4806 ret = io_provide_buffers(req, force_nonblock);
4807 break;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004808 default:
4809 ret = -EINVAL;
4810 break;
4811 }
4812
Jens Axboedef596e2019-01-09 08:59:42 -07004813 if (ret)
4814 return ret;
4815
4816 if (ctx->flags & IORING_SETUP_IOPOLL) {
Jens Axboe11ba8202020-01-15 21:51:17 -07004817 const bool in_async = io_wq_current_is_worker();
4818
Jens Axboe9e645e112019-05-10 16:07:28 -06004819 if (req->result == -EAGAIN)
Jens Axboedef596e2019-01-09 08:59:42 -07004820 return -EAGAIN;
4821
Jens Axboe11ba8202020-01-15 21:51:17 -07004822 /* workqueue context doesn't hold uring_lock, grab it now */
4823 if (in_async)
4824 mutex_lock(&ctx->uring_lock);
4825
Jens Axboedef596e2019-01-09 08:59:42 -07004826 io_iopoll_req_issued(req);
Jens Axboe11ba8202020-01-15 21:51:17 -07004827
4828 if (in_async)
4829 mutex_unlock(&ctx->uring_lock);
Jens Axboedef596e2019-01-09 08:59:42 -07004830 }
4831
4832 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004833}
4834
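/*
 * io-wq worker entry point: issue with force_nonblock == false, retrying
 * on -EAGAIN for polled IO since the worker cannot wait on request slots.
 * Cancelled work still runs if IO_WQ_WORK_NO_CANCEL is set.
 */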
Jens Axboe561fb042019-10-24 07:25:42 -06004835static void io_wq_submit_work(struct io_wq_work **workptr)
Jens Axboe31b51512019-01-18 22:56:34 -07004836{
Jens Axboe561fb042019-10-24 07:25:42 -06004837 struct io_wq_work *work = *workptr;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004838 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
Jens Axboe561fb042019-10-24 07:25:42 -06004839 int ret = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07004840
Jens Axboe0c9d5cc2019-12-11 19:29:43 -07004841 /* if NO_CANCEL is set, we must still run the work */
4842 if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
4843 IO_WQ_WORK_CANCEL) {
Jens Axboe561fb042019-10-24 07:25:42 -06004844 ret = -ECANCELED;
Jens Axboe0c9d5cc2019-12-11 19:29:43 -07004845 }
Jens Axboe31b51512019-01-18 22:56:34 -07004846
Jens Axboe561fb042019-10-24 07:25:42 -06004847 if (!ret) {
Jens Axboe561fb042019-10-24 07:25:42 -06004848 do {
Pavel Begunkov014db002020-03-03 21:33:12 +03004849 ret = io_issue_sqe(req, NULL, false);
Jens Axboe561fb042019-10-24 07:25:42 -06004850 /*
4851 * We can get EAGAIN for polled IO even though we're
4852 * forcing a sync submission from here, since we can't
4853 * wait for request slots on the block side.
4854 */
4855 if (ret != -EAGAIN)
4856 break;
4857 cond_resched();
4858 } while (1);
4859 }
Jens Axboe31b51512019-01-18 22:56:34 -07004860
Jens Axboe561fb042019-10-24 07:25:42 -06004861 if (ret) {
Jens Axboe4e88d6e2019-12-07 20:59:47 -07004862 req_set_fail_links(req);
Jens Axboe78e19bb2019-11-06 15:21:34 -07004863 io_cqring_add_event(req, ret);
Jens Axboe817869d2019-04-30 14:44:05 -06004864 io_put_req(req);
Jens Axboeedafcce2019-01-09 09:16:05 -07004865 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07004866
Pavel Begunkove9fd9392020-03-04 16:14:12 +03004867 io_steal_work(req, workptr);
Jens Axboe31b51512019-01-18 22:56:34 -07004868}
Jens Axboe2b188cc2019-01-07 10:46:33 -07004869
Jens Axboe15b71ab2019-12-11 11:20:36 -07004870static int io_req_needs_file(struct io_kiocb *req, int fd)
Jens Axboe9e3aa612019-12-11 15:55:43 -07004871{
Jens Axboed3656342019-12-18 09:50:26 -07004872 if (!io_op_defs[req->opcode].needs_file)
Jens Axboe9e3aa612019-12-11 15:55:43 -07004873 return 0;
Jens Axboe0b5faf62020-02-06 21:42:51 -07004874 if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg)
Jens Axboed3656342019-12-18 09:50:26 -07004875 return 0;
4876 return 1;
Jens Axboe09bb8392019-03-13 12:39:28 -06004877}
4878
Jens Axboe65e19f52019-10-26 07:20:21 -06004879static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
4880 int index)
Jens Axboe09bb8392019-03-13 12:39:28 -06004881{
Jens Axboe65e19f52019-10-26 07:20:21 -06004882 struct fixed_file_table *table;
4883
Jens Axboe05f3fb32019-12-09 11:22:50 -07004884 table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
 4885	 return table->files[index & IORING_FILE_TABLE_MASK];
Jens Axboe65e19f52019-10-26 07:20:21 -06004886}
4887
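/*
 * Resolve the file for a request: a fixed file is looked up in the
 * registered file table (taking a ref on ctx->file_data), a normal fd is
 * resolved through __io_file_get().
 */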
Pavel Begunkov8da11c12020-02-24 11:32:44 +03004888static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
4889 int fd, struct file **out_file, bool fixed)
4890{
4891 struct io_ring_ctx *ctx = req->ctx;
4892 struct file *file;
4893
4894 if (fixed) {
4895 if (unlikely(!ctx->file_data ||
4896 (unsigned) fd >= ctx->nr_user_files))
4897 return -EBADF;
4898 fd = array_index_nospec(fd, ctx->nr_user_files);
4899 file = io_file_from_index(ctx, fd);
4900 if (!file)
4901 return -EBADF;
4902 percpu_ref_get(&ctx->file_data->refs);
4903 } else {
4904 trace_io_uring_file_get(ctx, fd);
4905 file = __io_file_get(state, fd);
4906 if (unlikely(!file))
4907 return -EBADF;
4908 }
4909
4910 *out_file = file;
4911 return 0;
4912}
4913
Jens Axboe3529d8c2019-12-19 18:24:38 -07004914static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
4915 const struct io_uring_sqe *sqe)
Jens Axboe09bb8392019-03-13 12:39:28 -06004916{
4917 unsigned flags;
Jens Axboed3656342019-12-18 09:50:26 -07004918 int fd;
Pavel Begunkov8da11c12020-02-24 11:32:44 +03004919 bool fixed;
Jens Axboe09bb8392019-03-13 12:39:28 -06004920
Jens Axboe3529d8c2019-12-19 18:24:38 -07004921 flags = READ_ONCE(sqe->flags);
4922 fd = READ_ONCE(sqe->fd);
Jens Axboe09bb8392019-03-13 12:39:28 -06004923
Jens Axboed3656342019-12-18 09:50:26 -07004924 if (!io_req_needs_file(req, fd))
4925 return 0;
Jens Axboe09bb8392019-03-13 12:39:28 -06004926
Pavel Begunkov8da11c12020-02-24 11:32:44 +03004927 fixed = (flags & IOSQE_FIXED_FILE);
4928 if (unlikely(!fixed && req->needs_fixed_file))
4929 return -EBADF;
Jens Axboe09bb8392019-03-13 12:39:28 -06004930
Pavel Begunkov8da11c12020-02-24 11:32:44 +03004931 return io_file_get(state, req, fd, &req->file, fixed);
Jens Axboe09bb8392019-03-13 12:39:28 -06004932}
4933
Jackie Liua197f662019-11-08 08:09:12 -07004934static int io_grab_files(struct io_kiocb *req)
Jens Axboe2b188cc2019-01-07 10:46:33 -07004935{
Jens Axboefcb323c2019-10-24 12:39:47 -06004936 int ret = -EBADF;
Jackie Liua197f662019-11-08 08:09:12 -07004937 struct io_ring_ctx *ctx = req->ctx;
Jens Axboefcb323c2019-10-24 12:39:47 -06004938
Jens Axboef86cd202020-01-29 13:46:44 -07004939 if (req->work.files)
4940 return 0;
Pavel Begunkovb14cca02020-01-17 04:45:59 +03004941 if (!ctx->ring_file)
Jens Axboeb5dba592019-12-11 14:02:38 -07004942 return -EBADF;
4943
Jens Axboefcb323c2019-10-24 12:39:47 -06004944 rcu_read_lock();
4945 spin_lock_irq(&ctx->inflight_lock);
4946 /*
4947 * We use the f_ops->flush() handler to ensure that we can flush
4948 * out work accessing these files if the fd is closed. Check if
4949 * the fd has changed since we started down this path, and disallow
4950 * this operation if it has.
4951 */
Pavel Begunkovb14cca02020-01-17 04:45:59 +03004952 if (fcheck(ctx->ring_fd) == ctx->ring_file) {
Jens Axboefcb323c2019-10-24 12:39:47 -06004953 list_add(&req->inflight_entry, &ctx->inflight_list);
4954 req->flags |= REQ_F_INFLIGHT;
4955 req->work.files = current->files;
4956 ret = 0;
4957 }
4958 spin_unlock_irq(&ctx->inflight_lock);
4959 rcu_read_unlock();
4960
4961 return ret;
4962}
4963
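/*
 * hrtimer callback for a linked timeout: if the linked request is still
 * pending it is cancelled, and the timeout request itself completes with
 * -ETIME.
 */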
Jens Axboe2665abf2019-11-05 12:40:47 -07004964static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
4965{
Jens Axboead8a48a2019-11-15 08:49:11 -07004966 struct io_timeout_data *data = container_of(timer,
4967 struct io_timeout_data, timer);
4968 struct io_kiocb *req = data->req;
Jens Axboe2665abf2019-11-05 12:40:47 -07004969 struct io_ring_ctx *ctx = req->ctx;
4970 struct io_kiocb *prev = NULL;
4971 unsigned long flags;
Jens Axboe2665abf2019-11-05 12:40:47 -07004972
4973 spin_lock_irqsave(&ctx->completion_lock, flags);
4974
4975 /*
 4976	 * We don't expect the list to be empty; that will only happen if we
4977 * race with the completion of the linked work.
4978 */
Pavel Begunkov44932332019-12-05 16:16:35 +03004979 if (!list_empty(&req->link_list)) {
4980 prev = list_entry(req->link_list.prev, struct io_kiocb,
4981 link_list);
Jens Axboe5d960722019-11-19 15:31:28 -07004982 if (refcount_inc_not_zero(&prev->refs)) {
Pavel Begunkov44932332019-12-05 16:16:35 +03004983 list_del_init(&req->link_list);
Jens Axboe5d960722019-11-19 15:31:28 -07004984 prev->flags &= ~REQ_F_LINK_TIMEOUT;
4985 } else
Jens Axboe76a46e02019-11-10 23:34:16 -07004986 prev = NULL;
Jens Axboe2665abf2019-11-05 12:40:47 -07004987 }
4988
4989 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4990
4991 if (prev) {
Jens Axboe4e88d6e2019-12-07 20:59:47 -07004992 req_set_fail_links(prev);
Pavel Begunkov014db002020-03-03 21:33:12 +03004993 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
Jens Axboe76a46e02019-11-10 23:34:16 -07004994 io_put_req(prev);
Jens Axboe47f46762019-11-09 17:43:02 -07004995 } else {
4996 io_cqring_add_event(req, -ETIME);
4997 io_put_req(req);
Jens Axboe2665abf2019-11-05 12:40:47 -07004998 }
Jens Axboe2665abf2019-11-05 12:40:47 -07004999 return HRTIMER_NORESTART;
5000}
5001
Jens Axboead8a48a2019-11-15 08:49:11 -07005002static void io_queue_linked_timeout(struct io_kiocb *req)
Jens Axboe2665abf2019-11-05 12:40:47 -07005003{
Jens Axboe76a46e02019-11-10 23:34:16 -07005004 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2665abf2019-11-05 12:40:47 -07005005
Jens Axboe76a46e02019-11-10 23:34:16 -07005006 /*
5007 * If the list is now empty, then our linked request finished before
 5008	 * we got a chance to set up the timer.
5009 */
5010 spin_lock_irq(&ctx->completion_lock);
Pavel Begunkov44932332019-12-05 16:16:35 +03005011 if (!list_empty(&req->link_list)) {
Jens Axboe2d283902019-12-04 11:08:05 -07005012 struct io_timeout_data *data = &req->io->timeout;
Jens Axboe94ae5e72019-11-14 19:39:52 -07005013
Jens Axboead8a48a2019-11-15 08:49:11 -07005014 data->timer.function = io_link_timeout_fn;
5015 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
5016 data->mode);
Jens Axboe2665abf2019-11-05 12:40:47 -07005017 }
Jens Axboe76a46e02019-11-10 23:34:16 -07005018 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe2665abf2019-11-05 12:40:47 -07005019
Jens Axboe2665abf2019-11-05 12:40:47 -07005020 /* drop submission reference */
Jens Axboe76a46e02019-11-10 23:34:16 -07005021 io_put_req(req);
Jens Axboe2665abf2019-11-05 12:40:47 -07005022}
5023
Jens Axboead8a48a2019-11-15 08:49:11 -07005024static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
Jens Axboe2665abf2019-11-05 12:40:47 -07005025{
5026 struct io_kiocb *nxt;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005027
Jens Axboe2665abf2019-11-05 12:40:47 -07005028 if (!(req->flags & REQ_F_LINK))
5029 return NULL;
Jens Axboed7718a92020-02-14 22:23:12 -07005030 /* for polled retry, if flag is set, we already went through here */
5031 if (req->flags & REQ_F_POLLED)
5032 return NULL;
Jens Axboe2665abf2019-11-05 12:40:47 -07005033
Pavel Begunkov44932332019-12-05 16:16:35 +03005034 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
5035 link_list);
Jens Axboed625c6e2019-12-17 19:53:05 -07005036 if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
Jens Axboe76a46e02019-11-10 23:34:16 -07005037 return NULL;
Jens Axboe2665abf2019-11-05 12:40:47 -07005038
Jens Axboe76a46e02019-11-10 23:34:16 -07005039 req->flags |= REQ_F_LINK_TIMEOUT;
Jens Axboe76a46e02019-11-10 23:34:16 -07005040 return nxt;
Jens Axboe2665abf2019-11-05 12:40:47 -07005041}
5042
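/*
 * Issue a request inline. On -EAGAIN from a file that can't do
 * non-blocking IO, first try to arm a poll handler and, failing that,
 * punt to io-wq for blocking execution. Personality credentials, linked
 * timeouts and linked next-requests are all handled here.
 */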
Jens Axboe3529d8c2019-12-19 18:24:38 -07005043static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jens Axboe2b188cc2019-01-07 10:46:33 -07005044{
Jens Axboe4a0a7a12019-12-09 20:01:01 -07005045 struct io_kiocb *linked_timeout;
Pavel Begunkov4bc44942020-02-29 22:48:24 +03005046 struct io_kiocb *nxt;
Jens Axboe193155c2020-02-22 23:22:19 -07005047 const struct cred *old_creds = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005048 int ret;
5049
Jens Axboe4a0a7a12019-12-09 20:01:01 -07005050again:
5051 linked_timeout = io_prep_linked_timeout(req);
5052
Jens Axboe193155c2020-02-22 23:22:19 -07005053 if (req->work.creds && req->work.creds != current_cred()) {
5054 if (old_creds)
5055 revert_creds(old_creds);
5056 if (old_creds == req->work.creds)
5057 old_creds = NULL; /* restored original creds */
5058 else
5059 old_creds = override_creds(req->work.creds);
5060 }
5061
Pavel Begunkov014db002020-03-03 21:33:12 +03005062 ret = io_issue_sqe(req, sqe, true);
Jens Axboe491381ce2019-10-17 09:20:46 -06005063
5064 /*
5065 * We async punt it if the file wasn't marked NOWAIT, or if the file
5066 * doesn't support non-blocking read/write attempts
5067 */
5068 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
5069 (req->flags & REQ_F_MUST_PUNT))) {
Jens Axboed7718a92020-02-14 22:23:12 -07005070 if (io_arm_poll_handler(req)) {
5071 if (linked_timeout)
5072 io_queue_linked_timeout(linked_timeout);
Pavel Begunkov4bc44942020-02-29 22:48:24 +03005073 goto exit;
Jens Axboed7718a92020-02-14 22:23:12 -07005074 }
Pavel Begunkov86a761f2020-01-22 23:09:36 +03005075punt:
Jens Axboef86cd202020-01-29 13:46:44 -07005076 if (io_op_defs[req->opcode].file_table) {
Pavel Begunkovbbad27b2019-11-19 23:32:47 +03005077 ret = io_grab_files(req);
5078 if (ret)
5079 goto err;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005080 }
Pavel Begunkovbbad27b2019-11-19 23:32:47 +03005081
5082 /*
5083 * Queued up for async execution, worker will release
5084 * submit reference when the iocb is actually submitted.
5085 */
5086 io_queue_async_work(req);
Pavel Begunkov4bc44942020-02-29 22:48:24 +03005087 goto exit;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005088 }
Jens Axboee65ef562019-03-12 10:16:44 -06005089
Jens Axboefcb323c2019-10-24 12:39:47 -06005090err:
Pavel Begunkov4bc44942020-02-29 22:48:24 +03005091 nxt = NULL;
Jens Axboee65ef562019-03-12 10:16:44 -06005092 /* drop submission reference */
Jens Axboe2a44f462020-02-25 13:25:41 -07005093 io_put_req_find_next(req, &nxt);
Jens Axboee65ef562019-03-12 10:16:44 -06005094
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03005095 if (linked_timeout) {
Jens Axboe76a46e02019-11-10 23:34:16 -07005096 if (!ret)
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03005097 io_queue_linked_timeout(linked_timeout);
Jens Axboe76a46e02019-11-10 23:34:16 -07005098 else
Pavel Begunkovf9bd67f2019-11-21 23:21:03 +03005099 io_put_req(linked_timeout);
Jens Axboe76a46e02019-11-10 23:34:16 -07005100 }
5101
Jens Axboee65ef562019-03-12 10:16:44 -06005102 /* and drop final reference, if we failed */
Jens Axboe9e645e112019-05-10 16:07:28 -06005103 if (ret) {
Jens Axboe78e19bb2019-11-06 15:21:34 -07005104 io_cqring_add_event(req, ret);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07005105 req_set_fail_links(req);
Jens Axboee65ef562019-03-12 10:16:44 -06005106 io_put_req(req);
Jens Axboe9e645e112019-05-10 16:07:28 -06005107 }
Jens Axboe4a0a7a12019-12-09 20:01:01 -07005108 if (nxt) {
5109 req = nxt;
Pavel Begunkov86a761f2020-01-22 23:09:36 +03005110
5111 if (req->flags & REQ_F_FORCE_ASYNC)
5112 goto punt;
Jens Axboe4a0a7a12019-12-09 20:01:01 -07005113 goto again;
5114 }
Pavel Begunkov4bc44942020-02-29 22:48:24 +03005115exit:
Jens Axboe193155c2020-02-22 23:22:19 -07005116 if (old_creds)
5117 revert_creds(old_creds);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005118}
5119
Jens Axboe3529d8c2019-12-19 18:24:38 -07005120static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
Jackie Liu4fe2c962019-09-09 20:50:40 +08005121{
5122 int ret;
5123
Jens Axboe3529d8c2019-12-19 18:24:38 -07005124 ret = io_req_defer(req, sqe);
Jackie Liu4fe2c962019-09-09 20:50:40 +08005125 if (ret) {
5126 if (ret != -EIOCBQUEUED) {
Pavel Begunkov11185912020-01-22 23:09:35 +03005127fail_req:
Jens Axboe78e19bb2019-11-06 15:21:34 -07005128 io_cqring_add_event(req, ret);
Jens Axboe4e88d6e2019-12-07 20:59:47 -07005129 req_set_fail_links(req);
Jens Axboe78e19bb2019-11-06 15:21:34 -07005130 io_double_put_req(req);
Jackie Liu4fe2c962019-09-09 20:50:40 +08005131 }
Pavel Begunkov25508782019-12-30 21:24:47 +03005132 } else if (req->flags & REQ_F_FORCE_ASYNC) {
Pavel Begunkov11185912020-01-22 23:09:35 +03005133 ret = io_req_defer_prep(req, sqe);
5134 if (unlikely(ret < 0))
5135 goto fail_req;
Jens Axboece35a472019-12-17 08:04:44 -07005136 /*
 5137	 * Never try inline submit if IOSQE_ASYNC is set, go straight
5138 * to async execution.
5139 */
5140 req->work.flags |= IO_WQ_WORK_CONCURRENT;
5141 io_queue_async_work(req);
5142 } else {
Jens Axboe3529d8c2019-12-19 18:24:38 -07005143 __io_queue_sqe(req, sqe);
Jens Axboece35a472019-12-17 08:04:44 -07005144 }
Jackie Liu4fe2c962019-09-09 20:50:40 +08005145}
5146
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +03005147static inline void io_queue_link_head(struct io_kiocb *req)
Jackie Liu4fe2c962019-09-09 20:50:40 +08005148{
Jens Axboe94ae5e72019-11-14 19:39:52 -07005149 if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +03005150 io_cqring_add_event(req, -ECANCELED);
5151 io_double_put_req(req);
5152 } else
Jens Axboe3529d8c2019-12-19 18:24:38 -07005153 io_queue_sqe(req, NULL);
Jackie Liu4fe2c962019-09-09 20:50:40 +08005154}
5155
Jens Axboe4e88d6e2019-12-07 20:59:47 -07005156#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
Jens Axboece35a472019-12-17 08:04:44 -07005157 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
Jens Axboe9e645e112019-05-10 16:07:28 -06005158
Jens Axboe3529d8c2019-12-19 18:24:38 -07005159static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5160 struct io_submit_state *state, struct io_kiocb **link)
Jens Axboe9e645e112019-05-10 16:07:28 -06005161{
Jackie Liua197f662019-11-08 08:09:12 -07005162 struct io_ring_ctx *ctx = req->ctx;
Pavel Begunkov32fe5252019-12-17 22:26:58 +03005163 unsigned int sqe_flags;
Jens Axboe75c6a032020-01-28 10:15:23 -07005164 int ret, id;
Jens Axboe9e645e112019-05-10 16:07:28 -06005165
Pavel Begunkov32fe5252019-12-17 22:26:58 +03005166 sqe_flags = READ_ONCE(sqe->flags);
Jens Axboe9e645e112019-05-10 16:07:28 -06005167
5168 /* enforce forwards compatibility on users */
Pavel Begunkov32fe5252019-12-17 22:26:58 +03005169 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
Jens Axboe9e645e112019-05-10 16:07:28 -06005170 ret = -EINVAL;
Pavel Begunkov196be952019-11-07 01:41:06 +03005171 goto err_req;
Jens Axboe9e645e112019-05-10 16:07:28 -06005172 }
5173
Jens Axboe75c6a032020-01-28 10:15:23 -07005174 id = READ_ONCE(sqe->personality);
5175 if (id) {
Jens Axboe193155c2020-02-22 23:22:19 -07005176 req->work.creds = idr_find(&ctx->personality_idr, id);
5177 if (unlikely(!req->work.creds)) {
Jens Axboe75c6a032020-01-28 10:15:23 -07005178 ret = -EINVAL;
5179 goto err_req;
5180 }
Jens Axboe193155c2020-02-22 23:22:19 -07005181 get_cred(req->work.creds);
Jens Axboe75c6a032020-01-28 10:15:23 -07005182 }
5183
Pavel Begunkov6b47ee62020-01-18 20:22:41 +03005184 /* same numerical values with corresponding REQ_F_*, safe to copy */
Pavel Begunkov8da11c12020-02-24 11:32:44 +03005185 req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
5186 IOSQE_ASYNC | IOSQE_FIXED_FILE);
Jens Axboe9e645e112019-05-10 16:07:28 -06005187
Jens Axboe3529d8c2019-12-19 18:24:38 -07005188 ret = io_req_set_file(state, req, sqe);
Jens Axboe9e645e112019-05-10 16:07:28 -06005189 if (unlikely(ret)) {
5190err_req:
Jens Axboe78e19bb2019-11-06 15:21:34 -07005191 io_cqring_add_event(req, ret);
5192 io_double_put_req(req);
Pavel Begunkov2e6e1fd2019-12-05 16:15:45 +03005193 return false;
Jens Axboe9e645e112019-05-10 16:07:28 -06005194 }
5195
Jens Axboe9e645e112019-05-10 16:07:28 -06005196 /*
5197 * If we already have a head request, queue this one for async
5198 * submittal once the head completes. If we don't have a head but
5199 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
5200 * submitted sync once the chain is complete. If none of those
5201 * conditions are true (normal request), then just queue it.
5202 */
5203 if (*link) {
Pavel Begunkov9d763772019-12-17 02:22:07 +03005204 struct io_kiocb *head = *link;
Jens Axboe9e645e112019-05-10 16:07:28 -06005205
Pavel Begunkov8cdf2192020-01-25 00:40:24 +03005206 /*
 5207	 * Since a link executes sequentially, draining both sides of
 5208	 * the link also fulfills IOSQE_IO_DRAIN semantics for all
 5209	 * requests in the link. So it drains the head as well as the
 5210	 * request following the link. The latter is done via the
 5211	 * drain_next flag to persist the effect across calls.
5212 */
Pavel Begunkov711be032020-01-17 03:57:59 +03005213 if (sqe_flags & IOSQE_IO_DRAIN) {
5214 head->flags |= REQ_F_IO_DRAIN;
5215 ctx->drain_next = 1;
5216 }
Jens Axboeb7bb4f72019-12-15 22:13:43 -07005217 if (io_alloc_async_ctx(req)) {
Jens Axboe9e645e112019-05-10 16:07:28 -06005218 ret = -EAGAIN;
5219 goto err_req;
5220 }
5221
Jens Axboe3529d8c2019-12-19 18:24:38 -07005222 ret = io_req_defer_prep(req, sqe);
Jens Axboe2d283902019-12-04 11:08:05 -07005223 if (ret) {
Jens Axboe4e88d6e2019-12-07 20:59:47 -07005224 /* fail even hard links since we don't submit */
Pavel Begunkov9d763772019-12-17 02:22:07 +03005225 head->flags |= REQ_F_FAIL_LINK;
Jens Axboef67676d2019-12-02 11:03:47 -07005226 goto err_req;
Jens Axboe2d283902019-12-04 11:08:05 -07005227 }
Pavel Begunkov9d763772019-12-17 02:22:07 +03005228 trace_io_uring_link(ctx, req, head);
5229 list_add_tail(&req->link_list, &head->link_list);
Jens Axboe9e645e112019-05-10 16:07:28 -06005230
Pavel Begunkov32fe5252019-12-17 22:26:58 +03005231 /* last request of a link, enqueue the link */
5232 if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
5233 io_queue_link_head(head);
5234 *link = NULL;
5235 }
Jens Axboe9e645e112019-05-10 16:07:28 -06005236 } else {
Pavel Begunkov711be032020-01-17 03:57:59 +03005237 if (unlikely(ctx->drain_next)) {
5238 req->flags |= REQ_F_IO_DRAIN;
5239 req->ctx->drain_next = 0;
5240 }
5241 if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
5242 req->flags |= REQ_F_LINK;
Pavel Begunkov711be032020-01-17 03:57:59 +03005243 INIT_LIST_HEAD(&req->link_list);
5244 ret = io_req_defer_prep(req, sqe);
5245 if (ret)
5246 req->flags |= REQ_F_FAIL_LINK;
5247 *link = req;
5248 } else {
5249 io_queue_sqe(req, sqe);
5250 }
Jens Axboe9e645e112019-05-10 16:07:28 -06005251 }
Pavel Begunkov2e6e1fd2019-12-05 16:15:45 +03005252
5253 return true;
Jens Axboe9e645e112019-05-10 16:07:28 -06005254}
5255
Jens Axboe9a56a232019-01-09 09:06:50 -07005256/*
5257 * Batched submission is done, ensure local IO is flushed out.
5258 */
5259static void io_submit_state_end(struct io_submit_state *state)
5260{
5261 blk_finish_plug(&state->plug);
Jens Axboe3d6770f2019-04-13 11:50:54 -06005262 io_file_put(state);
Jens Axboe2579f912019-01-09 09:10:43 -07005263 if (state->free_reqs)
Pavel Begunkov6c8a3132020-02-01 03:58:00 +03005264 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
Jens Axboe9a56a232019-01-09 09:06:50 -07005265}
5266
5267/*
5268 * Start submission side cache.
5269 */
5270static void io_submit_state_start(struct io_submit_state *state,
Jackie Liu22efde52019-12-02 17:14:52 +08005271 unsigned int max_ios)
Jens Axboe9a56a232019-01-09 09:06:50 -07005272{
5273 blk_start_plug(&state->plug);
Jens Axboe2579f912019-01-09 09:10:43 -07005274 state->free_reqs = 0;
Jens Axboe9a56a232019-01-09 09:06:50 -07005275 state->file = NULL;
5276 state->ios_left = max_ios;
5277}
5278
Jens Axboe2b188cc2019-01-07 10:46:33 -07005279static void io_commit_sqring(struct io_ring_ctx *ctx)
5280{
Hristo Venev75b28af2019-08-26 17:23:46 +00005281 struct io_rings *rings = ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005282
Pavel Begunkovcaf582c2019-12-30 21:24:46 +03005283 /*
5284 * Ensure any loads from the SQEs are done at this point,
5285 * since once we write the new head, the application could
5286 * write new data to them.
5287 */
5288 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005289}
5290
5291/*
Jens Axboe3529d8c2019-12-19 18:24:38 -07005292 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
Jens Axboe2b188cc2019-01-07 10:46:33 -07005293 * that is mapped by userspace. This means that care needs to be taken to
5294 * ensure that reads are stable, as we cannot rely on userspace always
5295 * being a good citizen. If members of the sqe are validated and then later
5296 * used, it's important that those reads are done through READ_ONCE() to
5297 * prevent a re-load down the line.
5298 */
Jens Axboe3529d8c2019-12-19 18:24:38 -07005299static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
5300 const struct io_uring_sqe **sqe_ptr)
Jens Axboe2b188cc2019-01-07 10:46:33 -07005301{
Hristo Venev75b28af2019-08-26 17:23:46 +00005302 u32 *sq_array = ctx->sq_array;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005303 unsigned head;
5304
5305 /*
5306 * The cached sq head (or cq tail) serves two purposes:
5307 *
5308 * 1) allows us to batch the cost of updating the user visible
 5309	 *    head.
5310 * 2) allows the kernel side to track the head on its own, even
5311 * though the application is the one updating it.
5312 */
Pavel Begunkovee7d46d2019-12-30 21:24:45 +03005313 head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
Pavel Begunkov9835d6f2019-11-21 21:24:56 +03005314 if (likely(head < ctx->sq_entries)) {
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03005315 /*
 5316	 * All io needs to record the previous position so that, for
 5317	 * LINK or DRAIN, it can be used to mark the position of the
 5318	 * first IO in the link list.
5319 */
5320 req->sequence = ctx->cached_sq_head;
Jens Axboe3529d8c2019-12-19 18:24:38 -07005321 *sqe_ptr = &ctx->sq_sqes[head];
5322 req->opcode = READ_ONCE((*sqe_ptr)->opcode);
5323 req->user_data = READ_ONCE((*sqe_ptr)->user_data);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005324 ctx->cached_sq_head++;
5325 return true;
5326 }
5327
5328 /* drop invalid entries */
5329 ctx->cached_sq_head++;
Jens Axboe498ccd92019-10-25 10:04:25 -06005330 ctx->cached_sq_dropped++;
Pavel Begunkovee7d46d2019-12-30 21:24:45 +03005331 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005332 return false;
5333}
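/*
 * Rough sketch (not kernel code) of the application-side counterpart of the
 * indirection resolved above; fill() is a placeholder and memory barriers are
 * omitted for brevity:
 *
 *	idx = <any free slot in sq_sqes[]>;
 *	fill(&sq_sqes[idx]);              <- write the SQE itself
 *	sq_array[tail & sq_mask] = idx;   <- publish its index in the ring
 *	tail = tail + 1;                  <- advance the SQ tail
 *
 * io_get_sqring() walks the same indirection in reverse: it loads the index
 * stored at sq_array[head & sq_mask] and resolves it to an SQE.
 */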
5334
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03005335static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
Pavel Begunkovae9428c2019-11-06 00:22:14 +03005336 struct file *ring_file, int ring_fd,
5337 struct mm_struct **mm, bool async)
Jens Axboe6c271ce2019-01-10 11:22:30 -07005338{
5339 struct io_submit_state state, *statep = NULL;
Jens Axboe9e645e112019-05-10 16:07:28 -06005340 struct io_kiocb *link = NULL;
Jens Axboe9e645e112019-05-10 16:07:28 -06005341 int i, submitted = 0;
Pavel Begunkov95a1b3ff2019-10-27 23:15:41 +03005342 bool mm_fault = false;
Jens Axboe6c271ce2019-01-10 11:22:30 -07005343
Jens Axboec4a2ed72019-11-21 21:01:26 -07005344 /* if we have a backlog and couldn't flush it all, return BUSY */
Jens Axboead3eb2c2019-12-18 17:12:20 -07005345 if (test_bit(0, &ctx->sq_check_overflow)) {
5346 if (!list_empty(&ctx->cq_overflow_list) &&
5347 !io_cqring_overflow_flush(ctx, false))
5348 return -EBUSY;
5349 }
Jens Axboe6c271ce2019-01-10 11:22:30 -07005350
Pavel Begunkovee7d46d2019-12-30 21:24:45 +03005351 /* make sure SQ entry isn't read before tail */
5352 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
Pavel Begunkov9ef4f122019-12-30 21:24:44 +03005353
Pavel Begunkov2b85edf2019-12-28 14:13:03 +03005354 if (!percpu_ref_tryget_many(&ctx->refs, nr))
5355 return -EAGAIN;
Jens Axboe6c271ce2019-01-10 11:22:30 -07005356
5357 if (nr > IO_PLUG_THRESHOLD) {
Jackie Liu22efde52019-12-02 17:14:52 +08005358 io_submit_state_start(&state, nr);
Jens Axboe6c271ce2019-01-10 11:22:30 -07005359 statep = &state;
5360 }
5361
Pavel Begunkovb14cca02020-01-17 04:45:59 +03005362 ctx->ring_fd = ring_fd;
5363 ctx->ring_file = ring_file;
5364
Jens Axboe6c271ce2019-01-10 11:22:30 -07005365 for (i = 0; i < nr; i++) {
Jens Axboe3529d8c2019-12-19 18:24:38 -07005366 const struct io_uring_sqe *sqe;
Pavel Begunkov196be952019-11-07 01:41:06 +03005367 struct io_kiocb *req;
Pavel Begunkov1cb1edb2020-02-06 21:16:09 +03005368 int err;
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03005369
Pavel Begunkov196be952019-11-07 01:41:06 +03005370 req = io_get_req(ctx, statep);
5371 if (unlikely(!req)) {
5372 if (!submitted)
5373 submitted = -EAGAIN;
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03005374 break;
Jens Axboe9e645e112019-05-10 16:07:28 -06005375 }
Jens Axboe3529d8c2019-12-19 18:24:38 -07005376 if (!io_get_sqring(ctx, req, &sqe)) {
Pavel Begunkov2b85edf2019-12-28 14:13:03 +03005377 __io_req_do_free(req);
Pavel Begunkov196be952019-11-07 01:41:06 +03005378 break;
5379 }
Jens Axboe9e645e112019-05-10 16:07:28 -06005380
Jens Axboed3656342019-12-18 09:50:26 -07005381 /* will complete beyond this point, count as submitted */
5382 submitted++;
5383
5384 if (unlikely(req->opcode >= IORING_OP_LAST)) {
Pavel Begunkov1cb1edb2020-02-06 21:16:09 +03005385 err = -EINVAL;
5386fail_req:
5387 io_cqring_add_event(req, err);
Jens Axboed3656342019-12-18 09:50:26 -07005388 io_double_put_req(req);
5389 break;
5390 }
5391
5392 if (io_op_defs[req->opcode].needs_mm && !*mm) {
Pavel Begunkov95a1b3ff2019-10-27 23:15:41 +03005393 mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
Pavel Begunkov1cb1edb2020-02-06 21:16:09 +03005394 if (unlikely(mm_fault)) {
5395 err = -EFAULT;
5396 goto fail_req;
Pavel Begunkov95a1b3ff2019-10-27 23:15:41 +03005397 }
Pavel Begunkov1cb1edb2020-02-06 21:16:09 +03005398 use_mm(ctx->sqo_mm);
5399 *mm = ctx->sqo_mm;
Pavel Begunkov95a1b3ff2019-10-27 23:15:41 +03005400 }
5401
Pavel Begunkovcf6fd4b2019-11-25 23:14:39 +03005402 req->needs_fixed_file = async;
Jens Axboe354420f2020-01-08 18:55:15 -07005403 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
5404 true, async);
Jens Axboe3529d8c2019-12-19 18:24:38 -07005405 if (!io_submit_sqe(req, sqe, statep, &link))
Pavel Begunkov2e6e1fd2019-12-05 16:15:45 +03005406 break;
Jens Axboe6c271ce2019-01-10 11:22:30 -07005407 }
5408
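	/*
	 * Return any ctx references taken above for requests that were never
	 * submitted, e.g. nr == 8 with only 3 submissions leaves 5 refs to put.
	 */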
Pavel Begunkov9466f432020-01-25 22:34:01 +03005409 if (unlikely(submitted != nr)) {
5410 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
5411
5412 percpu_ref_put_many(&ctx->refs, nr - ref_used);
5413 }
Jens Axboe9e645e112019-05-10 16:07:28 -06005414 if (link)
Pavel Begunkov1b4a51b2019-11-21 11:54:28 +03005415 io_queue_link_head(link);
Jens Axboe6c271ce2019-01-10 11:22:30 -07005416 if (statep)
5417 io_submit_state_end(&state);
5418
Pavel Begunkovae9428c2019-11-06 00:22:14 +03005419 /* Commit SQ ring head once we've consumed and submitted all SQEs */
5420 io_commit_sqring(ctx);
5421
Jens Axboe6c271ce2019-01-10 11:22:30 -07005422 return submitted;
5423}
5424
5425static int io_sq_thread(void *data)
5426{
Jens Axboe6c271ce2019-01-10 11:22:30 -07005427 struct io_ring_ctx *ctx = data;
5428 struct mm_struct *cur_mm = NULL;
Jens Axboe181e4482019-11-25 08:52:30 -07005429 const struct cred *old_cred;
Jens Axboe6c271ce2019-01-10 11:22:30 -07005430 mm_segment_t old_fs;
5431 DEFINE_WAIT(wait);
Jens Axboe6c271ce2019-01-10 11:22:30 -07005432 unsigned long timeout;
Xiaoguang Wangbdcd3ea2020-02-25 22:12:08 +08005433 int ret = 0;
Jens Axboe6c271ce2019-01-10 11:22:30 -07005434
Jens Axboe206aefd2019-11-07 18:27:42 -07005435 complete(&ctx->completions[1]);
Jackie Liua4c0b3d2019-07-08 13:41:12 +08005436
Jens Axboe6c271ce2019-01-10 11:22:30 -07005437 old_fs = get_fs();
5438 set_fs(USER_DS);
Jens Axboe181e4482019-11-25 08:52:30 -07005439 old_cred = override_creds(ctx->creds);
Jens Axboe6c271ce2019-01-10 11:22:30 -07005440
Xiaoguang Wangbdcd3ea2020-02-25 22:12:08 +08005441 timeout = jiffies + ctx->sq_thread_idle;
Roman Penyaev2bbcd6d2019-05-16 10:53:57 +02005442 while (!kthread_should_park()) {
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03005443 unsigned int to_submit;
Jens Axboe6c271ce2019-01-10 11:22:30 -07005444
Xiaoguang Wangbdcd3ea2020-02-25 22:12:08 +08005445 if (!list_empty(&ctx->poll_list)) {
Jens Axboe6c271ce2019-01-10 11:22:30 -07005446 unsigned nr_events = 0;
5447
Xiaoguang Wangbdcd3ea2020-02-25 22:12:08 +08005448 mutex_lock(&ctx->uring_lock);
5449 if (!list_empty(&ctx->poll_list))
5450 io_iopoll_getevents(ctx, &nr_events, 0);
5451 else
Jens Axboe6c271ce2019-01-10 11:22:30 -07005452 timeout = jiffies + ctx->sq_thread_idle;
Xiaoguang Wangbdcd3ea2020-02-25 22:12:08 +08005453 mutex_unlock(&ctx->uring_lock);
Jens Axboe6c271ce2019-01-10 11:22:30 -07005454 }
5455
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03005456 to_submit = io_sqring_entries(ctx);
Jens Axboec1edbf52019-11-10 16:56:04 -07005457
5458 /*
5459 * If submit got -EBUSY, flag us as needing the application
5460 * to enter the kernel to reap and flush events.
5461 */
5462 if (!to_submit || ret == -EBUSY) {
Jens Axboe6c271ce2019-01-10 11:22:30 -07005463 /*
Stefano Garzarella7143b5a2020-02-21 16:42:16 +01005464 * Drop cur_mm before scheduling, we can't hold it for
5465 * long periods (or over schedule()). Do this before
5466 * adding ourselves to the waitqueue, as the unuse/drop
5467 * may sleep.
5468 */
5469 if (cur_mm) {
5470 unuse_mm(cur_mm);
5471 mmput(cur_mm);
5472 cur_mm = NULL;
5473 }
5474
5475 /*
Jens Axboe6c271ce2019-01-10 11:22:30 -07005476 * We're polling. If we're within the defined idle
5477 * period, then let us spin without work before going
Jens Axboec1edbf52019-11-10 16:56:04 -07005478	 * to sleep. The exception is if we got EBUSY doing
 5479	 * more IO; in that case we should wait for the
 5480	 * application to reap events and wake us up.
Jens Axboe6c271ce2019-01-10 11:22:30 -07005481 */
Xiaoguang Wangbdcd3ea2020-02-25 22:12:08 +08005482 if (!list_empty(&ctx->poll_list) ||
Jens Axboedf069d82020-02-04 16:48:34 -07005483 (!time_after(jiffies, timeout) && ret != -EBUSY &&
5484 !percpu_ref_is_dying(&ctx->refs))) {
Jens Axboeb41e9852020-02-17 09:52:41 -07005485 if (current->task_works)
5486 task_work_run();
Jens Axboe9831a902019-09-19 09:48:55 -06005487 cond_resched();
Jens Axboe6c271ce2019-01-10 11:22:30 -07005488 continue;
5489 }
5490
Jens Axboe6c271ce2019-01-10 11:22:30 -07005491 prepare_to_wait(&ctx->sqo_wait, &wait,
5492 TASK_INTERRUPTIBLE);
5493
Xiaoguang Wangbdcd3ea2020-02-25 22:12:08 +08005494 /*
 5495	 * While doing polled IO, before going to sleep we need to
 5496	 * check if there are new reqs added to poll_list; reqs may
 5497	 * have been punted to the io worker and only get added to
 5498	 * poll_list later, hence check the poll_list again before
 5499	 * sleeping.
5500 */
5501 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
5502 !list_empty_careful(&ctx->poll_list)) {
5503 finish_wait(&ctx->sqo_wait, &wait);
5504 continue;
5505 }
5506
Jens Axboe6c271ce2019-01-10 11:22:30 -07005507 /* Tell userspace we may need a wakeup call */
Hristo Venev75b28af2019-08-26 17:23:46 +00005508 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
Stefan Bühler0d7bae62019-04-19 11:57:45 +02005509 /* make sure to read SQ tail after writing flags */
5510 smp_mb();
Jens Axboe6c271ce2019-01-10 11:22:30 -07005511
Pavel Begunkovfb5ccc92019-10-25 12:31:30 +03005512 to_submit = io_sqring_entries(ctx);
Jens Axboec1edbf52019-11-10 16:56:04 -07005513 if (!to_submit || ret == -EBUSY) {
Roman Penyaev2bbcd6d2019-05-16 10:53:57 +02005514 if (kthread_should_park()) {
Jens Axboe6c271ce2019-01-10 11:22:30 -07005515 finish_wait(&ctx->sqo_wait, &wait);
5516 break;
5517 }
Jens Axboeb41e9852020-02-17 09:52:41 -07005518 if (current->task_works) {
5519 task_work_run();
5520 continue;
5521 }
Jens Axboe6c271ce2019-01-10 11:22:30 -07005522 if (signal_pending(current))
5523 flush_signals(current);
5524 schedule();
5525 finish_wait(&ctx->sqo_wait, &wait);
5526
Hristo Venev75b28af2019-08-26 17:23:46 +00005527 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
Jens Axboe6c271ce2019-01-10 11:22:30 -07005528 continue;
5529 }
5530 finish_wait(&ctx->sqo_wait, &wait);
5531
Hristo Venev75b28af2019-08-26 17:23:46 +00005532 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
Jens Axboe6c271ce2019-01-10 11:22:30 -07005533 }
5534
Jens Axboe8a4955f2019-12-09 14:52:35 -07005535 mutex_lock(&ctx->uring_lock);
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07005536 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
Jens Axboe8a4955f2019-12-09 14:52:35 -07005537 mutex_unlock(&ctx->uring_lock);
Xiaoguang Wangbdcd3ea2020-02-25 22:12:08 +08005538 timeout = jiffies + ctx->sq_thread_idle;
Jens Axboe6c271ce2019-01-10 11:22:30 -07005539 }
5540
Jens Axboeb41e9852020-02-17 09:52:41 -07005541 if (current->task_works)
5542 task_work_run();
5543
Jens Axboe6c271ce2019-01-10 11:22:30 -07005544 set_fs(old_fs);
5545 if (cur_mm) {
5546 unuse_mm(cur_mm);
5547 mmput(cur_mm);
5548 }
Jens Axboe181e4482019-11-25 08:52:30 -07005549 revert_creds(old_cred);
Jens Axboe06058632019-04-13 09:26:03 -06005550
Roman Penyaev2bbcd6d2019-05-16 10:53:57 +02005551 kthread_parkme();
Jens Axboe06058632019-04-13 09:26:03 -06005552
Jens Axboe6c271ce2019-01-10 11:22:30 -07005553 return 0;
5554}
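/*
 * Rough userspace sketch (not kernel code) of how a sleeping SQ poll thread
 * is kicked once it has set IORING_SQ_NEED_WAKEUP above; arguments other
 * than the enter flag are illustrative:
 *
 *	if (*sq_flags & IORING_SQ_NEED_WAKEUP)
 *		io_uring_enter(ring_fd, to_submit, 0, IORING_ENTER_SQ_WAKEUP,
 *			       NULL, 0);
 */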
5555
Jens Axboebda52162019-09-24 13:47:15 -06005556struct io_wait_queue {
5557 struct wait_queue_entry wq;
5558 struct io_ring_ctx *ctx;
5559 unsigned to_wait;
5560 unsigned nr_timeouts;
5561};
5562
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07005563static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
Jens Axboebda52162019-09-24 13:47:15 -06005564{
5565 struct io_ring_ctx *ctx = iowq->ctx;
5566
5567 /*
Brian Gianforcarod195a662019-12-13 03:09:50 -08005568 * Wake up if we have enough events, or if a timeout occurred since we
Jens Axboebda52162019-09-24 13:47:15 -06005569 * started waiting. For timeouts, we always want to return to userspace,
5570 * regardless of event count.
5571 */
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07005572 return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
Jens Axboebda52162019-09-24 13:47:15 -06005573 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
5574}
5575
5576static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
5577 int wake_flags, void *key)
5578{
5579 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
5580 wq);
5581
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07005582 /* use noflush == true, as we can't safely rely on locking context */
5583 if (!io_should_wake(iowq, true))
Jens Axboebda52162019-09-24 13:47:15 -06005584 return -1;
5585
5586 return autoremove_wake_function(curr, mode, wake_flags, key);
5587}
5588
Jens Axboe2b188cc2019-01-07 10:46:33 -07005589/*
5590 * Wait until events become available, if we don't already have some. The
5591 * application must reap them itself, as they reside on the shared cq ring.
5592 */
5593static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
5594 const sigset_t __user *sig, size_t sigsz)
5595{
Jens Axboebda52162019-09-24 13:47:15 -06005596 struct io_wait_queue iowq = {
5597 .wq = {
5598 .private = current,
5599 .func = io_wake_function,
5600 .entry = LIST_HEAD_INIT(iowq.wq.entry),
5601 },
5602 .ctx = ctx,
5603 .to_wait = min_events,
5604 };
Hristo Venev75b28af2019-08-26 17:23:46 +00005605 struct io_rings *rings = ctx->rings;
Jackie Liue9ffa5c2019-10-29 11:16:42 +08005606 int ret = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005607
Jens Axboeb41e9852020-02-17 09:52:41 -07005608 do {
5609 if (io_cqring_events(ctx, false) >= min_events)
5610 return 0;
5611 if (!current->task_works)
5612 break;
5613 task_work_run();
5614 } while (1);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005615
5616 if (sig) {
Arnd Bergmann9e75ad52019-03-25 15:34:53 +01005617#ifdef CONFIG_COMPAT
5618 if (in_compat_syscall())
5619 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
Oleg Nesterovb7724342019-07-16 16:29:53 -07005620 sigsz);
Arnd Bergmann9e75ad52019-03-25 15:34:53 +01005621 else
5622#endif
Oleg Nesterovb7724342019-07-16 16:29:53 -07005623 ret = set_user_sigmask(sig, sigsz);
Arnd Bergmann9e75ad52019-03-25 15:34:53 +01005624
Jens Axboe2b188cc2019-01-07 10:46:33 -07005625 if (ret)
5626 return ret;
5627 }
5628
Jens Axboebda52162019-09-24 13:47:15 -06005629 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02005630 trace_io_uring_cqring_wait(ctx, min_events);
Jens Axboebda52162019-09-24 13:47:15 -06005631 do {
5632 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
5633 TASK_INTERRUPTIBLE);
Jens Axboeb41e9852020-02-17 09:52:41 -07005634 if (current->task_works)
5635 task_work_run();
Jens Axboe1d7bb1d2019-11-06 11:31:17 -07005636 if (io_should_wake(&iowq, false))
Jens Axboebda52162019-09-24 13:47:15 -06005637 break;
5638 schedule();
5639 if (signal_pending(current)) {
Jackie Liue9ffa5c2019-10-29 11:16:42 +08005640 ret = -EINTR;
Jens Axboebda52162019-09-24 13:47:15 -06005641 break;
5642 }
5643 } while (1);
5644 finish_wait(&ctx->wait, &iowq.wq);
5645
Jackie Liue9ffa5c2019-10-29 11:16:42 +08005646 restore_saved_sigmask_unless(ret == -EINTR);
Jens Axboe2b188cc2019-01-07 10:46:33 -07005647
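	/*
	 * If completions are already queued, report success even if we were
	 * interrupted; the application reaps them directly from the CQ ring.
	 */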
Hristo Venev75b28af2019-08-26 17:23:46 +00005648 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07005649}
5650
Jens Axboe6b063142019-01-10 22:13:58 -07005651static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
5652{
5653#if defined(CONFIG_UNIX)
5654 if (ctx->ring_sock) {
5655 struct sock *sock = ctx->ring_sock->sk;
5656 struct sk_buff *skb;
5657
5658 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
5659 kfree_skb(skb);
5660 }
5661#else
5662 int i;
5663
Jens Axboe65e19f52019-10-26 07:20:21 -06005664 for (i = 0; i < ctx->nr_user_files; i++) {
5665 struct file *file;
5666
5667 file = io_file_from_index(ctx, i);
5668 if (file)
5669 fput(file);
5670 }
Jens Axboe6b063142019-01-10 22:13:58 -07005671#endif
5672}
5673
Jens Axboe05f3fb32019-12-09 11:22:50 -07005674static void io_file_ref_kill(struct percpu_ref *ref)
5675{
5676 struct fixed_file_data *data;
5677
5678 data = container_of(ref, struct fixed_file_data, refs);
5679 complete(&data->done);
5680}
5681
Jens Axboe6b063142019-01-10 22:13:58 -07005682static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
5683{
Jens Axboe05f3fb32019-12-09 11:22:50 -07005684 struct fixed_file_data *data = ctx->file_data;
Jens Axboe65e19f52019-10-26 07:20:21 -06005685 unsigned nr_tables, i;
5686
Jens Axboe05f3fb32019-12-09 11:22:50 -07005687 if (!data)
Jens Axboe6b063142019-01-10 22:13:58 -07005688 return -ENXIO;
5689
Jens Axboe05f3fb32019-12-09 11:22:50 -07005690 percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
Jens Axboee46a7952020-01-17 11:15:34 -07005691 flush_work(&data->ref_work);
Jens Axboe2faf8522020-02-04 19:54:55 -07005692 wait_for_completion(&data->done);
5693 io_ring_file_ref_flush(data);
Jens Axboe05f3fb32019-12-09 11:22:50 -07005694 percpu_ref_exit(&data->refs);
5695
Jens Axboe6b063142019-01-10 22:13:58 -07005696 __io_sqe_files_unregister(ctx);
Jens Axboe65e19f52019-10-26 07:20:21 -06005697 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
5698 for (i = 0; i < nr_tables; i++)
Jens Axboe05f3fb32019-12-09 11:22:50 -07005699 kfree(data->table[i].files);
5700 kfree(data->table);
5701 kfree(data);
5702 ctx->file_data = NULL;
Jens Axboe6b063142019-01-10 22:13:58 -07005703 ctx->nr_user_files = 0;
5704 return 0;
5705}
5706
Jens Axboe6c271ce2019-01-10 11:22:30 -07005707static void io_sq_thread_stop(struct io_ring_ctx *ctx)
5708{
5709 if (ctx->sqo_thread) {
Jens Axboe206aefd2019-11-07 18:27:42 -07005710 wait_for_completion(&ctx->completions[1]);
Roman Penyaev2bbcd6d2019-05-16 10:53:57 +02005711 /*
5712 * The park is a bit of a work-around, without it we get
5713 * warning spews on shutdown with SQPOLL set and affinity
5714 * set to a single CPU.
5715 */
Jens Axboe06058632019-04-13 09:26:03 -06005716 kthread_park(ctx->sqo_thread);
Jens Axboe6c271ce2019-01-10 11:22:30 -07005717 kthread_stop(ctx->sqo_thread);
5718 ctx->sqo_thread = NULL;
5719 }
5720}
5721
Jens Axboe6b063142019-01-10 22:13:58 -07005722static void io_finish_async(struct io_ring_ctx *ctx)
5723{
Jens Axboe6c271ce2019-01-10 11:22:30 -07005724 io_sq_thread_stop(ctx);
5725
Jens Axboe561fb042019-10-24 07:25:42 -06005726 if (ctx->io_wq) {
5727 io_wq_destroy(ctx->io_wq);
5728 ctx->io_wq = NULL;
Jens Axboe6b063142019-01-10 22:13:58 -07005729 }
5730}
5731
5732#if defined(CONFIG_UNIX)
Jens Axboe6b063142019-01-10 22:13:58 -07005733/*
5734 * Ensure the UNIX gc is aware of our file set, so we are certain that
5735 * the io_uring can be safely unregistered on process exit, even if we have
5736 * loops in the file referencing.
5737 */
5738static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
5739{
5740 struct sock *sk = ctx->ring_sock->sk;
5741 struct scm_fp_list *fpl;
5742 struct sk_buff *skb;
Jens Axboe08a45172019-10-03 08:11:03 -06005743 int i, nr_files;
Jens Axboe6b063142019-01-10 22:13:58 -07005744
5745 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
5746 unsigned long inflight = ctx->user->unix_inflight + nr;
5747
5748 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
5749 return -EMFILE;
5750 }
5751
5752 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
5753 if (!fpl)
5754 return -ENOMEM;
5755
5756 skb = alloc_skb(0, GFP_KERNEL);
5757 if (!skb) {
5758 kfree(fpl);
5759 return -ENOMEM;
5760 }
5761
5762 skb->sk = sk;
Jens Axboe6b063142019-01-10 22:13:58 -07005763
Jens Axboe08a45172019-10-03 08:11:03 -06005764 nr_files = 0;
Jens Axboe6b063142019-01-10 22:13:58 -07005765 fpl->user = get_uid(ctx->user);
5766 for (i = 0; i < nr; i++) {
Jens Axboe65e19f52019-10-26 07:20:21 -06005767 struct file *file = io_file_from_index(ctx, i + offset);
5768
5769 if (!file)
Jens Axboe08a45172019-10-03 08:11:03 -06005770 continue;
Jens Axboe65e19f52019-10-26 07:20:21 -06005771 fpl->fp[nr_files] = get_file(file);
Jens Axboe08a45172019-10-03 08:11:03 -06005772 unix_inflight(fpl->user, fpl->fp[nr_files]);
5773 nr_files++;
Jens Axboe6b063142019-01-10 22:13:58 -07005774 }
5775
Jens Axboe08a45172019-10-03 08:11:03 -06005776 if (nr_files) {
5777 fpl->max = SCM_MAX_FD;
5778 fpl->count = nr_files;
5779 UNIXCB(skb).fp = fpl;
Jens Axboe05f3fb32019-12-09 11:22:50 -07005780 skb->destructor = unix_destruct_scm;
Jens Axboe08a45172019-10-03 08:11:03 -06005781 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
5782 skb_queue_head(&sk->sk_receive_queue, skb);
Jens Axboe6b063142019-01-10 22:13:58 -07005783
Jens Axboe08a45172019-10-03 08:11:03 -06005784 for (i = 0; i < nr_files; i++)
5785 fput(fpl->fp[i]);
5786 } else {
5787 kfree_skb(skb);
5788 kfree(fpl);
5789 }
Jens Axboe6b063142019-01-10 22:13:58 -07005790
5791 return 0;
5792}
5793
5794/*
5795 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
5796 * causes regular reference counting to break down. We rely on the UNIX
5797 * garbage collection to take care of this problem for us.
5798 */
5799static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5800{
5801 unsigned left, total;
5802 int ret = 0;
5803
5804 total = 0;
5805 left = ctx->nr_user_files;
5806 while (left) {
5807 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
Jens Axboe6b063142019-01-10 22:13:58 -07005808
5809 ret = __io_sqe_files_scm(ctx, this_files, total);
5810 if (ret)
5811 break;
5812 left -= this_files;
5813 total += this_files;
5814 }
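	/*
	 * Each skb carries at most SCM_MAX_FD files, so bigger sets get
	 * flushed in chunks; e.g. assuming SCM_MAX_FD is 253, registering
	 * 300 files queues two skbs (253 files, then 47).
	 */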
5815
5816 if (!ret)
5817 return 0;
5818
5819 while (total < ctx->nr_user_files) {
Jens Axboe65e19f52019-10-26 07:20:21 -06005820 struct file *file = io_file_from_index(ctx, total);
5821
5822 if (file)
5823 fput(file);
Jens Axboe6b063142019-01-10 22:13:58 -07005824 total++;
5825 }
5826
5827 return ret;
5828}
5829#else
5830static int io_sqe_files_scm(struct io_ring_ctx *ctx)
5831{
5832 return 0;
5833}
5834#endif
5835
Jens Axboe65e19f52019-10-26 07:20:21 -06005836static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
5837 unsigned nr_files)
5838{
5839 int i;
5840
5841 for (i = 0; i < nr_tables; i++) {
Jens Axboe05f3fb32019-12-09 11:22:50 -07005842 struct fixed_file_table *table = &ctx->file_data->table[i];
Jens Axboe65e19f52019-10-26 07:20:21 -06005843 unsigned this_files;
5844
5845 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
5846 table->files = kcalloc(this_files, sizeof(struct file *),
5847 GFP_KERNEL);
5848 if (!table->files)
5849 break;
5850 nr_files -= this_files;
5851 }
5852
5853 if (i == nr_tables)
5854 return 0;
5855
5856 for (i = 0; i < nr_tables; i++) {
Jens Axboe05f3fb32019-12-09 11:22:50 -07005857 struct fixed_file_table *table = &ctx->file_data->table[i];
Jens Axboe65e19f52019-10-26 07:20:21 -06005858 kfree(table->files);
5859 }
5860 return 1;
5861}
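/*
 * Fixed files allocated here are later looked up as
 * table[i >> IORING_FILE_TABLE_SHIFT], slot i & IORING_FILE_TABLE_MASK;
 * e.g. assuming a shift of 9 (512 files per table), fixed file 1000 lives
 * in table[1], slot 488.
 */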
5862
Jens Axboe05f3fb32019-12-09 11:22:50 -07005863static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
Jens Axboec3a31e62019-10-03 13:59:56 -06005864{
5865#if defined(CONFIG_UNIX)
Jens Axboec3a31e62019-10-03 13:59:56 -06005866 struct sock *sock = ctx->ring_sock->sk;
5867 struct sk_buff_head list, *head = &sock->sk_receive_queue;
5868 struct sk_buff *skb;
5869 int i;
5870
5871 __skb_queue_head_init(&list);
5872
5873 /*
5874 * Find the skb that holds this file in its SCM_RIGHTS. When found,
5875 * remove this entry and rearrange the file array.
5876 */
5877 skb = skb_dequeue(head);
5878 while (skb) {
5879 struct scm_fp_list *fp;
5880
5881 fp = UNIXCB(skb).fp;
5882 for (i = 0; i < fp->count; i++) {
5883 int left;
5884
5885 if (fp->fp[i] != file)
5886 continue;
5887
5888 unix_notinflight(fp->user, fp->fp[i]);
5889 left = fp->count - 1 - i;
5890 if (left) {
5891 memmove(&fp->fp[i], &fp->fp[i + 1],
5892 left * sizeof(struct file *));
5893 }
5894 fp->count--;
5895 if (!fp->count) {
5896 kfree_skb(skb);
5897 skb = NULL;
5898 } else {
5899 __skb_queue_tail(&list, skb);
5900 }
5901 fput(file);
5902 file = NULL;
5903 break;
5904 }
5905
5906 if (!file)
5907 break;
5908
5909 __skb_queue_tail(&list, skb);
5910
5911 skb = skb_dequeue(head);
5912 }
5913
5914 if (skb_peek(&list)) {
5915 spin_lock_irq(&head->lock);
5916 while ((skb = __skb_dequeue(&list)) != NULL)
5917 __skb_queue_tail(head, skb);
5918 spin_unlock_irq(&head->lock);
5919 }
5920#else
Jens Axboe05f3fb32019-12-09 11:22:50 -07005921 fput(file);
Jens Axboec3a31e62019-10-03 13:59:56 -06005922#endif
5923}
5924
Jens Axboe05f3fb32019-12-09 11:22:50 -07005925struct io_file_put {
5926 struct llist_node llist;
5927 struct file *file;
5928 struct completion *done;
5929};
5930
Jens Axboe2faf8522020-02-04 19:54:55 -07005931static void io_ring_file_ref_flush(struct fixed_file_data *data)
Jens Axboe05f3fb32019-12-09 11:22:50 -07005932{
5933 struct io_file_put *pfile, *tmp;
Jens Axboe05f3fb32019-12-09 11:22:50 -07005934 struct llist_node *node;
5935
Jens Axboe05f3fb32019-12-09 11:22:50 -07005936 while ((node = llist_del_all(&data->put_llist)) != NULL) {
5937 llist_for_each_entry_safe(pfile, tmp, node, llist) {
5938 io_ring_file_put(data->ctx, pfile->file);
5939 if (pfile->done)
5940 complete(pfile->done);
5941 else
5942 kfree(pfile);
5943 }
5944 }
Jens Axboe2faf8522020-02-04 19:54:55 -07005945}
Jens Axboe05f3fb32019-12-09 11:22:50 -07005946
Jens Axboe2faf8522020-02-04 19:54:55 -07005947static void io_ring_file_ref_switch(struct work_struct *work)
5948{
5949 struct fixed_file_data *data;
5950
5951 data = container_of(work, struct fixed_file_data, ref_work);
5952 io_ring_file_ref_flush(data);
Jens Axboe05f3fb32019-12-09 11:22:50 -07005953 percpu_ref_switch_to_percpu(&data->refs);
5954}
5955
5956static void io_file_data_ref_zero(struct percpu_ref *ref)
5957{
5958 struct fixed_file_data *data;
5959
5960 data = container_of(ref, struct fixed_file_data, refs);
5961
Jens Axboe2faf8522020-02-04 19:54:55 -07005962 /*
5963 * We can't safely switch from inside this context, punt to wq. If
5964 * the table ref is going away, the table is being unregistered.
5965 * Don't queue up the async work for that case, the caller will
5966 * handle it.
5967 */
5968 if (!percpu_ref_is_dying(&data->refs))
5969 queue_work(system_wq, &data->ref_work);
Jens Axboe05f3fb32019-12-09 11:22:50 -07005970}
5971
5972static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
5973 unsigned nr_args)
5974{
5975 __s32 __user *fds = (__s32 __user *) arg;
5976 unsigned nr_tables;
5977 struct file *file;
5978 int fd, ret = 0;
5979 unsigned i;
5980
5981 if (ctx->file_data)
5982 return -EBUSY;
5983 if (!nr_args)
5984 return -EINVAL;
5985 if (nr_args > IORING_MAX_FIXED_FILES)
5986 return -EMFILE;
5987
5988 ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
5989 if (!ctx->file_data)
5990 return -ENOMEM;
5991 ctx->file_data->ctx = ctx;
5992 init_completion(&ctx->file_data->done);
5993
5994 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
5995 ctx->file_data->table = kcalloc(nr_tables,
5996 sizeof(struct fixed_file_table),
5997 GFP_KERNEL);
5998 if (!ctx->file_data->table) {
5999 kfree(ctx->file_data);
6000 ctx->file_data = NULL;
6001 return -ENOMEM;
6002 }
6003
6004 if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
6005 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
6006 kfree(ctx->file_data->table);
6007 kfree(ctx->file_data);
6008 ctx->file_data = NULL;
6009 return -ENOMEM;
6010 }
6011 ctx->file_data->put_llist.first = NULL;
6012 INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
6013
6014 if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
6015 percpu_ref_exit(&ctx->file_data->refs);
6016 kfree(ctx->file_data->table);
6017 kfree(ctx->file_data);
6018 ctx->file_data = NULL;
6019 return -ENOMEM;
6020 }
6021
6022 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
6023 struct fixed_file_table *table;
6024 unsigned index;
6025
6026 ret = -EFAULT;
6027 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
6028 break;
6029 /* allow sparse sets */
6030 if (fd == -1) {
6031 ret = 0;
6032 continue;
6033 }
6034
6035 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6036 index = i & IORING_FILE_TABLE_MASK;
6037 file = fget(fd);
6038
6039 ret = -EBADF;
6040 if (!file)
6041 break;
6042
6043 /*
6044 * Don't allow io_uring instances to be registered. If UNIX
6045 * isn't enabled, then this causes a reference cycle and this
6046 * instance can never get freed. If UNIX is enabled we'll
6047 * handle it just fine, but there's still no point in allowing
6048 * a ring fd as it doesn't support regular read/write anyway.
6049 */
6050 if (file->f_op == &io_uring_fops) {
6051 fput(file);
6052 break;
6053 }
6054 ret = 0;
6055 table->files[index] = file;
6056 }
6057
6058 if (ret) {
6059 for (i = 0; i < ctx->nr_user_files; i++) {
6060 file = io_file_from_index(ctx, i);
6061 if (file)
6062 fput(file);
6063 }
6064 for (i = 0; i < nr_tables; i++)
6065 kfree(ctx->file_data->table[i].files);
6066
6067 kfree(ctx->file_data->table);
6068 kfree(ctx->file_data);
6069 ctx->file_data = NULL;
6070 ctx->nr_user_files = 0;
6071 return ret;
6072 }
6073
6074 ret = io_sqe_files_scm(ctx);
6075 if (ret)
6076 io_sqe_files_unregister(ctx);
6077
6078 return ret;
6079}
6080
Jens Axboec3a31e62019-10-03 13:59:56 -06006081static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
6082 int index)
6083{
6084#if defined(CONFIG_UNIX)
6085 struct sock *sock = ctx->ring_sock->sk;
6086 struct sk_buff_head *head = &sock->sk_receive_queue;
6087 struct sk_buff *skb;
6088
6089 /*
6090 * See if we can merge this file into an existing skb SCM_RIGHTS
6091 * file set. If there's no room, fall back to allocating a new skb
6092 * and filling it in.
6093 */
6094 spin_lock_irq(&head->lock);
6095 skb = skb_peek(head);
6096 if (skb) {
6097 struct scm_fp_list *fpl = UNIXCB(skb).fp;
6098
6099 if (fpl->count < SCM_MAX_FD) {
6100 __skb_unlink(skb, head);
6101 spin_unlock_irq(&head->lock);
6102 fpl->fp[fpl->count] = get_file(file);
6103 unix_inflight(fpl->user, fpl->fp[fpl->count]);
6104 fpl->count++;
6105 spin_lock_irq(&head->lock);
6106 __skb_queue_head(head, skb);
6107 } else {
6108 skb = NULL;
6109 }
6110 }
6111 spin_unlock_irq(&head->lock);
6112
6113 if (skb) {
6114 fput(file);
6115 return 0;
6116 }
6117
6118 return __io_sqe_files_scm(ctx, 1, index);
6119#else
6120 return 0;
6121#endif
6122}
6123
Jens Axboe05f3fb32019-12-09 11:22:50 -07006124static void io_atomic_switch(struct percpu_ref *ref)
Jens Axboec3a31e62019-10-03 13:59:56 -06006125{
Jens Axboe05f3fb32019-12-09 11:22:50 -07006126 struct fixed_file_data *data;
6127
Jens Axboedd3db2a2020-02-26 10:23:43 -07006128 /*
6129 * Juggle reference to ensure we hit zero, if needed, so we can
6130 * switch back to percpu mode
6131 */
Jens Axboe05f3fb32019-12-09 11:22:50 -07006132 data = container_of(ref, struct fixed_file_data, refs);
Jens Axboedd3db2a2020-02-26 10:23:43 -07006133 percpu_ref_put(&data->refs);
6134 percpu_ref_get(&data->refs);
Jens Axboe05f3fb32019-12-09 11:22:50 -07006135}
6136
6137static bool io_queue_file_removal(struct fixed_file_data *data,
6138 struct file *file)
6139{
6140 struct io_file_put *pfile, pfile_stack;
6141 DECLARE_COMPLETION_ONSTACK(done);
6142
6143 /*
 6144	 * If we fail allocating the struct we need for doing async removal
6145 * of this file, just punt to sync and wait for it.
6146 */
6147 pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
6148 if (!pfile) {
6149 pfile = &pfile_stack;
6150 pfile->done = &done;
6151 }
6152
6153 pfile->file = file;
6154 llist_add(&pfile->llist, &data->put_llist);
6155
6156 if (pfile == &pfile_stack) {
Jens Axboedd3db2a2020-02-26 10:23:43 -07006157 percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
Jens Axboe05f3fb32019-12-09 11:22:50 -07006158 wait_for_completion(&done);
6159 flush_work(&data->ref_work);
6160 return false;
6161 }
6162
6163 return true;
6164}
6165
6166static int __io_sqe_files_update(struct io_ring_ctx *ctx,
6167 struct io_uring_files_update *up,
6168 unsigned nr_args)
6169{
6170 struct fixed_file_data *data = ctx->file_data;
6171 bool ref_switch = false;
6172 struct file *file;
Jens Axboec3a31e62019-10-03 13:59:56 -06006173 __s32 __user *fds;
6174 int fd, i, err;
6175 __u32 done;
6176
Jens Axboe05f3fb32019-12-09 11:22:50 -07006177 if (check_add_overflow(up->offset, nr_args, &done))
Jens Axboec3a31e62019-10-03 13:59:56 -06006178 return -EOVERFLOW;
6179 if (done > ctx->nr_user_files)
6180 return -EINVAL;
6181
6182 done = 0;
Jens Axboe05f3fb32019-12-09 11:22:50 -07006183 fds = u64_to_user_ptr(up->fds);
Jens Axboec3a31e62019-10-03 13:59:56 -06006184 while (nr_args) {
Jens Axboe65e19f52019-10-26 07:20:21 -06006185 struct fixed_file_table *table;
6186 unsigned index;
6187
Jens Axboec3a31e62019-10-03 13:59:56 -06006188 err = 0;
6189 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
6190 err = -EFAULT;
6191 break;
6192 }
Jens Axboe05f3fb32019-12-09 11:22:50 -07006193 i = array_index_nospec(up->offset, ctx->nr_user_files);
6194 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
Jens Axboe65e19f52019-10-26 07:20:21 -06006195 index = i & IORING_FILE_TABLE_MASK;
6196 if (table->files[index]) {
Jens Axboe05f3fb32019-12-09 11:22:50 -07006197 file = io_file_from_index(ctx, index);
Jens Axboe65e19f52019-10-26 07:20:21 -06006198 table->files[index] = NULL;
Jens Axboe05f3fb32019-12-09 11:22:50 -07006199 if (io_queue_file_removal(data, file))
6200 ref_switch = true;
Jens Axboec3a31e62019-10-03 13:59:56 -06006201 }
6202 if (fd != -1) {
Jens Axboec3a31e62019-10-03 13:59:56 -06006203 file = fget(fd);
6204 if (!file) {
6205 err = -EBADF;
6206 break;
6207 }
6208 /*
6209 * Don't allow io_uring instances to be registered. If
6210 * UNIX isn't enabled, then this causes a reference
6211 * cycle and this instance can never get freed. If UNIX
6212 * is enabled we'll handle it just fine, but there's
6213 * still no point in allowing a ring fd as it doesn't
6214 * support regular read/write anyway.
6215 */
6216 if (file->f_op == &io_uring_fops) {
6217 fput(file);
6218 err = -EBADF;
6219 break;
6220 }
Jens Axboe65e19f52019-10-26 07:20:21 -06006221 table->files[index] = file;
Jens Axboec3a31e62019-10-03 13:59:56 -06006222 err = io_sqe_file_register(ctx, file, i);
6223 if (err)
6224 break;
6225 }
6226 nr_args--;
6227 done++;
Jens Axboe05f3fb32019-12-09 11:22:50 -07006228 up->offset++;
6229 }
6230
Jens Axboedd3db2a2020-02-26 10:23:43 -07006231 if (ref_switch)
Jens Axboe05f3fb32019-12-09 11:22:50 -07006232 percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
Jens Axboec3a31e62019-10-03 13:59:56 -06006233
6234 return done ? done : err;
6235}

Jens Axboe05f3fb32019-12-09 11:22:50 -07006236static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
6237 unsigned nr_args)
6238{
6239 struct io_uring_files_update up;
6240
6241 if (!ctx->file_data)
6242 return -ENXIO;
6243 if (!nr_args)
6244 return -EINVAL;
6245 if (copy_from_user(&up, arg, sizeof(up)))
6246 return -EFAULT;
6247 if (up.resv)
6248 return -EINVAL;
6249
6250 return __io_sqe_files_update(ctx, &up, nr_args);
6251}
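/*
 * Rough userspace sketch (not kernel code) of the update interface handled
 * above; a -1 in fds[] clears the slot at that offset, any other value
 * installs the given fd there:
 *
 *	struct io_uring_files_update up = {
 *		.offset = 4,
 *		.fds    = (__u64)(unsigned long) fds,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES_UPDATE, &up, nr);
 */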
Jens Axboec3a31e62019-10-03 13:59:56 -06006252
Pavel Begunkove9fd9392020-03-04 16:14:12 +03006253static void io_free_work(struct io_wq_work *work)
Jens Axboe7d723062019-11-12 22:31:31 -07006254{
6255 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6256
Pavel Begunkove9fd9392020-03-04 16:14:12 +03006257 /* Consider that io_steal_work() relies on this ref */
Jens Axboe7d723062019-11-12 22:31:31 -07006258 io_put_req(req);
6259}
6260
Pavel Begunkov24369c22020-01-28 03:15:48 +03006261static int io_init_wq_offload(struct io_ring_ctx *ctx,
6262 struct io_uring_params *p)
6263{
6264 struct io_wq_data data;
6265 struct fd f;
6266 struct io_ring_ctx *ctx_attach;
6267 unsigned int concurrency;
6268 int ret = 0;
6269
6270 data.user = ctx->user;
Pavel Begunkove9fd9392020-03-04 16:14:12 +03006271 data.free_work = io_free_work;
Pavel Begunkov24369c22020-01-28 03:15:48 +03006272
6273 if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
6274 /* Do QD, or 4 * CPUS, whatever is smallest */
6275 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
6276
6277 ctx->io_wq = io_wq_create(concurrency, &data);
6278 if (IS_ERR(ctx->io_wq)) {
6279 ret = PTR_ERR(ctx->io_wq);
6280 ctx->io_wq = NULL;
6281 }
6282 return ret;
6283 }
6284
6285 f = fdget(p->wq_fd);
6286 if (!f.file)
6287 return -EBADF;
6288
6289 if (f.file->f_op != &io_uring_fops) {
6290 ret = -EINVAL;
6291 goto out_fput;
6292 }
6293
6294 ctx_attach = f.file->private_data;
6295 /* @io_wq is protected by holding the fd */
6296 if (!io_wq_get(ctx_attach->io_wq, &data)) {
6297 ret = -EINVAL;
6298 goto out_fput;
6299 }
6300
6301 ctx->io_wq = ctx_attach->io_wq;
6302out_fput:
6303 fdput(f);
6304 return ret;
6305}
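/*
 * Rough userspace sketch (not kernel code) of sharing one io-wq backend
 * between rings via the attach path above; existing_ring_fd must itself be
 * an io_uring fd:
 *
 *	struct io_uring_params p = {
 *		.flags = IORING_SETUP_ATTACH_WQ,
 *		.wq_fd = existing_ring_fd,
 *	};
 *	new_ring_fd = io_uring_setup(entries, &p);
 */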
6306
Jens Axboe6c271ce2019-01-10 11:22:30 -07006307static int io_sq_offload_start(struct io_ring_ctx *ctx,
6308 struct io_uring_params *p)
Jens Axboe2b188cc2019-01-07 10:46:33 -07006309{
6310 int ret;
6311
Jens Axboe6c271ce2019-01-10 11:22:30 -07006312 init_waitqueue_head(&ctx->sqo_wait);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006313 mmgrab(current->mm);
6314 ctx->sqo_mm = current->mm;
6315
Jens Axboe6c271ce2019-01-10 11:22:30 -07006316 if (ctx->flags & IORING_SETUP_SQPOLL) {
Jens Axboe3ec482d2019-04-08 10:51:01 -06006317 ret = -EPERM;
6318 if (!capable(CAP_SYS_ADMIN))
6319 goto err;
6320
Jens Axboe917257d2019-04-13 09:28:55 -06006321 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
6322 if (!ctx->sq_thread_idle)
6323 ctx->sq_thread_idle = HZ;
6324
Jens Axboe6c271ce2019-01-10 11:22:30 -07006325 if (p->flags & IORING_SETUP_SQ_AFF) {
Jens Axboe44a9bd12019-05-14 20:00:30 -06006326 int cpu = p->sq_thread_cpu;
Jens Axboe6c271ce2019-01-10 11:22:30 -07006327
Jens Axboe917257d2019-04-13 09:28:55 -06006328 ret = -EINVAL;
Jens Axboe44a9bd12019-05-14 20:00:30 -06006329 if (cpu >= nr_cpu_ids)
6330 goto err;
Shenghui Wang7889f442019-05-07 16:03:19 +08006331 if (!cpu_online(cpu))
Jens Axboe917257d2019-04-13 09:28:55 -06006332 goto err;
6333
Jens Axboe6c271ce2019-01-10 11:22:30 -07006334 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
6335 ctx, cpu,
6336 "io_uring-sq");
6337 } else {
6338 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
6339 "io_uring-sq");
6340 }
6341 if (IS_ERR(ctx->sqo_thread)) {
6342 ret = PTR_ERR(ctx->sqo_thread);
6343 ctx->sqo_thread = NULL;
6344 goto err;
6345 }
6346 wake_up_process(ctx->sqo_thread);
6347 } else if (p->flags & IORING_SETUP_SQ_AFF) {
6348 /* Can't have SQ_AFF without SQPOLL */
6349 ret = -EINVAL;
6350 goto err;
6351 }
6352
Pavel Begunkov24369c22020-01-28 03:15:48 +03006353 ret = io_init_wq_offload(ctx, p);
6354 if (ret)
Jens Axboe2b188cc2019-01-07 10:46:33 -07006355 goto err;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006356
6357 return 0;
6358err:
Jens Axboe54a91f32019-09-10 09:15:04 -06006359 io_finish_async(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006360 mmdrop(ctx->sqo_mm);
6361 ctx->sqo_mm = NULL;
6362 return ret;
6363}
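/*
 * Rough userspace sketch (not kernel code) of the setup parameters consumed
 * above; sq_thread_idle is in milliseconds and SQPOLL requires CAP_SYS_ADMIN
 * here:
 *
 *	struct io_uring_params p = {
 *		.flags          = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF,
 *		.sq_thread_cpu  = 3,
 *		.sq_thread_idle = 2000,
 *	};
 */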
6364
6365static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
6366{
6367 atomic_long_sub(nr_pages, &user->locked_vm);
6368}
6369
6370static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
6371{
6372 unsigned long page_limit, cur_pages, new_pages;
6373
6374 /* Don't allow more pages than we can safely lock */
6375 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
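	/* e.g. a 64 MiB RLIMIT_MEMLOCK with 4 KiB pages gives a 16384 page limit */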
6376
6377 do {
6378 cur_pages = atomic_long_read(&user->locked_vm);
6379 new_pages = cur_pages + nr_pages;
6380 if (new_pages > page_limit)
6381 return -ENOMEM;
6382 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
6383 new_pages) != cur_pages);
6384
6385 return 0;
6386}
6387
6388static void io_mem_free(void *ptr)
6389{
Mark Rutland52e04ef2019-04-30 17:30:21 +01006390 struct page *page;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006391
Mark Rutland52e04ef2019-04-30 17:30:21 +01006392 if (!ptr)
6393 return;
6394
6395 page = virt_to_head_page(ptr);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006396 if (put_page_testzero(page))
6397 free_compound_page(page);
6398}
6399
6400static void *io_mem_alloc(size_t size)
6401{
6402 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
6403 __GFP_NORETRY;
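	/*
	 * __GFP_COMP makes higher-order allocations compound pages, so that
	 * io_mem_free() can resolve any address inside them through
	 * virt_to_head_page(); __GFP_NORETRY fails big allocations early
	 * instead of retrying hard.
	 */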
6404
6405 return (void *) __get_free_pages(gfp_flags, get_order(size));
6406}
6407
Hristo Venev75b28af2019-08-26 17:23:46 +00006408static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
6409 size_t *sq_offset)
6410{
6411 struct io_rings *rings;
6412 size_t off, sq_array_size;
6413
6414 off = struct_size(rings, cqes, cq_entries);
6415 if (off == SIZE_MAX)
6416 return SIZE_MAX;
6417
6418#ifdef CONFIG_SMP
6419 off = ALIGN(off, SMP_CACHE_BYTES);
6420 if (off == 0)
6421 return SIZE_MAX;
6422#endif
6423
6424 sq_array_size = array_size(sizeof(u32), sq_entries);
6425 if (sq_array_size == SIZE_MAX)
6426 return SIZE_MAX;
6427
6428 if (check_add_overflow(off, sq_array_size, &off))
6429 return SIZE_MAX;
6430
6431 if (sq_offset)
6432 *sq_offset = off;
6433
6434 return off;
6435}
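/*
 * The size computed above corresponds to a layout of: the struct io_rings
 * header (heads, tails, masks, flags), immediately followed by cq_entries
 * CQEs in its cqes[] flexible array, then, after cache-line alignment on
 * SMP, the sq_array of sq_entries u32 SQE indexes; *sq_offset reports where
 * that index array starts.
 */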
6436
Jens Axboe2b188cc2019-01-07 10:46:33 -07006437static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
6438{
Hristo Venev75b28af2019-08-26 17:23:46 +00006439 size_t pages;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006440
Hristo Venev75b28af2019-08-26 17:23:46 +00006441 pages = (size_t)1 << get_order(
6442 rings_size(sq_entries, cq_entries, NULL));
6443 pages += (size_t)1 << get_order(
6444 array_size(sizeof(struct io_uring_sqe), sq_entries));
Jens Axboe2b188cc2019-01-07 10:46:33 -07006445
Hristo Venev75b28af2019-08-26 17:23:46 +00006446 return pages;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006447}
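/*
 * The second term above covers the SQE array; e.g. 128 SQ entries at 64
 * bytes per SQE is 8 KiB, i.e. two pages with 4 KiB pages.
 */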
6448
Jens Axboeedafcce2019-01-09 09:16:05 -07006449static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
6450{
6451 int i, j;
6452
6453 if (!ctx->user_bufs)
6454 return -ENXIO;
6455
6456 for (i = 0; i < ctx->nr_user_bufs; i++) {
6457 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6458
6459 for (j = 0; j < imu->nr_bvecs; j++)
John Hubbardf1f6a7d2020-01-30 22:13:35 -08006460 unpin_user_page(imu->bvec[j].bv_page);
Jens Axboeedafcce2019-01-09 09:16:05 -07006461
6462 if (ctx->account_mem)
6463 io_unaccount_mem(ctx->user, imu->nr_bvecs);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01006464 kvfree(imu->bvec);
Jens Axboeedafcce2019-01-09 09:16:05 -07006465 imu->nr_bvecs = 0;
6466 }
6467
6468 kfree(ctx->user_bufs);
6469 ctx->user_bufs = NULL;
6470 ctx->nr_user_bufs = 0;
6471 return 0;
6472}
6473
6474static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
6475 void __user *arg, unsigned index)
6476{
6477 struct iovec __user *src;
6478
6479#ifdef CONFIG_COMPAT
6480 if (ctx->compat) {
6481 struct compat_iovec __user *ciovs;
6482 struct compat_iovec ciov;
6483
6484 ciovs = (struct compat_iovec __user *) arg;
6485 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
6486 return -EFAULT;
6487
Jens Axboed55e5f52019-12-11 16:12:15 -07006488 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
Jens Axboeedafcce2019-01-09 09:16:05 -07006489 dst->iov_len = ciov.iov_len;
6490 return 0;
6491 }
6492#endif
6493 src = (struct iovec __user *) arg;
6494 if (copy_from_user(dst, &src[index], sizeof(*dst)))
6495 return -EFAULT;
6496 return 0;
6497}
6498
6499static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
6500 unsigned nr_args)
6501{
6502 struct vm_area_struct **vmas = NULL;
6503 struct page **pages = NULL;
6504 int i, j, got_pages = 0;
6505 int ret = -EINVAL;
6506
6507 if (ctx->user_bufs)
6508 return -EBUSY;
6509 if (!nr_args || nr_args > UIO_MAXIOV)
6510 return -EINVAL;
6511
6512 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
6513 GFP_KERNEL);
6514 if (!ctx->user_bufs)
6515 return -ENOMEM;
6516
6517 for (i = 0; i < nr_args; i++) {
6518 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6519 unsigned long off, start, end, ubuf;
6520 int pret, nr_pages;
6521 struct iovec iov;
6522 size_t size;
6523
6524 ret = io_copy_iov(ctx, &iov, arg, i);
6525 if (ret)
Pavel Begunkova2786822019-05-26 12:35:47 +03006526 goto err;
Jens Axboeedafcce2019-01-09 09:16:05 -07006527
6528 /*
6529 * Don't impose further limits on the size and buffer
6530 * constraints here, we'll -EINVAL later when IO is
6531 * submitted if they are wrong.
6532 */
6533 ret = -EFAULT;
6534 if (!iov.iov_base || !iov.iov_len)
6535 goto err;
6536
6537 /* arbitrary limit, but we need something */
6538 if (iov.iov_len > SZ_1G)
6539 goto err;
6540
6541 ubuf = (unsigned long) iov.iov_base;
6542 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
6543 start = ubuf >> PAGE_SHIFT;
6544 nr_pages = end - start;
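		/*
		 * e.g. with 4 KiB pages, a 10 KiB buffer starting 512 bytes
		 * into a page rounds out to 3 pinned pages.
		 */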
6545
6546 if (ctx->account_mem) {
6547 ret = io_account_mem(ctx->user, nr_pages);
6548 if (ret)
6549 goto err;
6550 }
6551
6552 ret = 0;
6553 if (!pages || nr_pages > got_pages) {
6554 kfree(vmas);
6555 kfree(pages);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01006556 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
Jens Axboeedafcce2019-01-09 09:16:05 -07006557 GFP_KERNEL);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01006558 vmas = kvmalloc_array(nr_pages,
Jens Axboeedafcce2019-01-09 09:16:05 -07006559 sizeof(struct vm_area_struct *),
6560 GFP_KERNEL);
6561 if (!pages || !vmas) {
6562 ret = -ENOMEM;
6563 if (ctx->account_mem)
6564 io_unaccount_mem(ctx->user, nr_pages);
6565 goto err;
6566 }
6567 got_pages = nr_pages;
6568 }
6569
Mark Rutlandd4ef6472019-05-01 16:59:16 +01006570 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
Jens Axboeedafcce2019-01-09 09:16:05 -07006571 GFP_KERNEL);
6572 ret = -ENOMEM;
6573 if (!imu->bvec) {
6574 if (ctx->account_mem)
6575 io_unaccount_mem(ctx->user, nr_pages);
6576 goto err;
6577 }
6578
6579 ret = 0;
6580 down_read(&current->mm->mmap_sem);
John Hubbard2113b052020-01-30 22:13:13 -08006581 pret = pin_user_pages(ubuf, nr_pages,
Ira Weiny932f4a62019-05-13 17:17:03 -07006582 FOLL_WRITE | FOLL_LONGTERM,
6583 pages, vmas);
Jens Axboeedafcce2019-01-09 09:16:05 -07006584 if (pret == nr_pages) {
6585 /* don't support file backed memory */
6586 for (j = 0; j < nr_pages; j++) {
6587 struct vm_area_struct *vma = vmas[j];
6588
6589 if (vma->vm_file &&
6590 !is_file_hugepages(vma->vm_file)) {
6591 ret = -EOPNOTSUPP;
6592 break;
6593 }
6594 }
6595 } else {
6596 ret = pret < 0 ? pret : -EFAULT;
6597 }
6598 up_read(&current->mm->mmap_sem);
6599 if (ret) {
6600 /*
6601 * if we did partial map, or found file backed vmas,
6602 * release any pages we did get
6603 */
John Hubbard27c4d3a2019-08-04 19:32:06 -07006604 if (pret > 0)
John Hubbardf1f6a7d2020-01-30 22:13:35 -08006605 unpin_user_pages(pages, pret);
Jens Axboeedafcce2019-01-09 09:16:05 -07006606 if (ctx->account_mem)
6607 io_unaccount_mem(ctx->user, nr_pages);
Mark Rutlandd4ef6472019-05-01 16:59:16 +01006608 kvfree(imu->bvec);
Jens Axboeedafcce2019-01-09 09:16:05 -07006609 goto err;
6610 }
6611
6612 off = ubuf & ~PAGE_MASK;
6613 size = iov.iov_len;
6614 for (j = 0; j < nr_pages; j++) {
6615 size_t vec_len;
6616
6617 vec_len = min_t(size_t, size, PAGE_SIZE - off);
6618 imu->bvec[j].bv_page = pages[j];
6619 imu->bvec[j].bv_len = vec_len;
6620 imu->bvec[j].bv_offset = off;
6621 off = 0;
6622 size -= vec_len;
6623 }
6624 /* store original address for later verification */
6625 imu->ubuf = ubuf;
6626 imu->len = iov.iov_len;
6627 imu->nr_bvecs = nr_pages;
6628
6629 ctx->nr_user_bufs++;
6630 }
Mark Rutlandd4ef6472019-05-01 16:59:16 +01006631 kvfree(pages);
6632 kvfree(vmas);
Jens Axboeedafcce2019-01-09 09:16:05 -07006633 return 0;
6634err:
Mark Rutlandd4ef6472019-05-01 16:59:16 +01006635 kvfree(pages);
6636 kvfree(vmas);
Jens Axboeedafcce2019-01-09 09:16:05 -07006637 io_sqe_buffer_unregister(ctx);
6638 return ret;
6639}
6640
Jens Axboe9b402842019-04-11 11:45:41 -06006641static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
6642{
6643 __s32 __user *fds = arg;
6644 int fd;
6645
6646 if (ctx->cq_ev_fd)
6647 return -EBUSY;
6648
6649 if (copy_from_user(&fd, fds, sizeof(*fds)))
6650 return -EFAULT;
6651
6652 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
6653 if (IS_ERR(ctx->cq_ev_fd)) {
6654 int ret = PTR_ERR(ctx->cq_ev_fd);
6655 ctx->cq_ev_fd = NULL;
6656 return ret;
6657 }
6658
6659 return 0;
6660}
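/*
 * Rough userspace sketch (not kernel code) of the registration handled
 * above, so completions can be waited on through poll/epoll on an eventfd:
 *
 *	int efd = eventfd(0, 0);
 *	io_uring_register(ring_fd, IORING_REGISTER_EVENTFD, &efd, 1);
 */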
6661
6662static int io_eventfd_unregister(struct io_ring_ctx *ctx)
6663{
6664 if (ctx->cq_ev_fd) {
6665 eventfd_ctx_put(ctx->cq_ev_fd);
6666 ctx->cq_ev_fd = NULL;
6667 return 0;
6668 }
6669
6670 return -ENXIO;
6671}
6672
Jens Axboe5a2e7452020-02-23 16:23:11 -07006673static int __io_destroy_buffers(int id, void *p, void *data)
6674{
6675 struct io_ring_ctx *ctx = data;
6676 struct io_buffer *buf = p;
6677
6678 /* the head kbuf is the list itself */
6679 while (!list_empty(&buf->list)) {
6680 struct io_buffer *nxt;
6681
6682 nxt = list_first_entry(&buf->list, struct io_buffer, list);
6683 list_del(&nxt->list);
6684 kfree(nxt);
6685 }
6686 kfree(buf);
6687 idr_remove(&ctx->io_buffer_idr, id);
6688 return 0;
6689}
6690
6691static void io_destroy_buffers(struct io_ring_ctx *ctx)
6692{
6693 idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
6694 idr_destroy(&ctx->io_buffer_idr);
6695}
6696
Jens Axboe2b188cc2019-01-07 10:46:33 -07006697static void io_ring_ctx_free(struct io_ring_ctx *ctx)
6698{
Jens Axboe6b063142019-01-10 22:13:58 -07006699 io_finish_async(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006700 if (ctx->sqo_mm)
6701 mmdrop(ctx->sqo_mm);
Jens Axboedef596e2019-01-09 08:59:42 -07006702
6703 io_iopoll_reap_events(ctx);
Jens Axboeedafcce2019-01-09 09:16:05 -07006704 io_sqe_buffer_unregister(ctx);
Jens Axboe6b063142019-01-10 22:13:58 -07006705 io_sqe_files_unregister(ctx);
Jens Axboe9b402842019-04-11 11:45:41 -06006706 io_eventfd_unregister(ctx);
Jens Axboe5a2e7452020-02-23 16:23:11 -07006707 io_destroy_buffers(ctx);
Jens Axboe41726c92020-02-23 13:11:42 -07006708 idr_destroy(&ctx->personality_idr);
Jens Axboedef596e2019-01-09 08:59:42 -07006709
Jens Axboe2b188cc2019-01-07 10:46:33 -07006710#if defined(CONFIG_UNIX)
Eric Biggers355e8d22019-06-12 14:58:43 -07006711 if (ctx->ring_sock) {
6712 ctx->ring_sock->file = NULL; /* so that iput() is called */
Jens Axboe2b188cc2019-01-07 10:46:33 -07006713 sock_release(ctx->ring_sock);
Eric Biggers355e8d22019-06-12 14:58:43 -07006714 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07006715#endif
6716
Hristo Venev75b28af2019-08-26 17:23:46 +00006717 io_mem_free(ctx->rings);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006718 io_mem_free(ctx->sq_sqes);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006719
6720 percpu_ref_exit(&ctx->refs);
6721 if (ctx->account_mem)
6722 io_unaccount_mem(ctx->user,
6723 ring_pages(ctx->sq_entries, ctx->cq_entries));
6724 free_uid(ctx->user);
Jens Axboe181e4482019-11-25 08:52:30 -07006725 put_cred(ctx->creds);
Jens Axboe206aefd2019-11-07 18:27:42 -07006726 kfree(ctx->completions);
Jens Axboe78076bb2019-12-04 19:56:40 -07006727 kfree(ctx->cancel_hash);
Jens Axboe0ddf92e2019-11-08 08:52:53 -07006728 kmem_cache_free(req_cachep, ctx->fallback_req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006729 kfree(ctx);
6730}
6731
6732static __poll_t io_uring_poll(struct file *file, poll_table *wait)
6733{
6734 struct io_ring_ctx *ctx = file->private_data;
6735 __poll_t mask = 0;
6736
6737 poll_wait(file, &ctx->cq_wait, wait);
Stefan Bühler4f7067c2019-04-24 23:54:17 +02006738 /*
6739 * synchronizes with barrier from wq_has_sleeper call in
6740 * io_commit_cqring
6741 */
Jens Axboe2b188cc2019-01-07 10:46:33 -07006742 smp_rmb();
Hristo Venev75b28af2019-08-26 17:23:46 +00006743 if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
6744 ctx->rings->sq_ring_entries)
Jens Axboe2b188cc2019-01-07 10:46:33 -07006745 mask |= EPOLLOUT | EPOLLWRNORM;
Stefano Garzarella63e5d812020-02-07 13:18:28 +01006746 if (io_cqring_events(ctx, false))
Jens Axboe2b188cc2019-01-07 10:46:33 -07006747 mask |= EPOLLIN | EPOLLRDNORM;
6748
6749 return mask;
6750}
6751
6752static int io_uring_fasync(int fd, struct file *file, int on)
6753{
6754 struct io_ring_ctx *ctx = file->private_data;
6755
6756 return fasync_helper(fd, file, on, &ctx->cq_fasync);
6757}
6758
Jens Axboe071698e2020-01-28 10:04:42 -07006759static int io_remove_personalities(int id, void *p, void *data)
6760{
6761 struct io_ring_ctx *ctx = data;
6762 const struct cred *cred;
6763
6764 cred = idr_remove(&ctx->personality_idr, id);
6765 if (cred)
6766 put_cred(cred);
6767 return 0;
6768}
6769
Jens Axboe2b188cc2019-01-07 10:46:33 -07006770static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
6771{
6772 mutex_lock(&ctx->uring_lock);
6773 percpu_ref_kill(&ctx->refs);
6774 mutex_unlock(&ctx->uring_lock);
6775
Jens Axboedf069d82020-02-04 16:48:34 -07006776 /*
6777 * Wait for sq thread to idle, if we have one. It won't spin on new
6778 * work after we've killed the ctx ref above. This is important to do
6779 * before we cancel existing commands, as the thread could otherwise
6780 * be queueing new work post that. If that's work we need to cancel,
6781 * it could cause shutdown to hang.
6782 */
6783 while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
6784 cpu_relax();
6785
Jens Axboe5262f562019-09-17 12:26:57 -06006786 io_kill_timeouts(ctx);
Jens Axboe221c5eb2019-01-17 09:41:58 -07006787 io_poll_remove_all(ctx);
Jens Axboe561fb042019-10-24 07:25:42 -06006788
6789 if (ctx->io_wq)
6790 io_wq_cancel_all(ctx->io_wq);
6791
Jens Axboedef596e2019-01-09 08:59:42 -07006792 io_iopoll_reap_events(ctx);
Jens Axboe15dff282019-11-13 09:09:23 -07006793 /* if we failed setting up the ctx, we might not have any rings */
6794 if (ctx->rings)
6795 io_cqring_overflow_flush(ctx, true);
Jens Axboe071698e2020-01-28 10:04:42 -07006796 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
Jens Axboe206aefd2019-11-07 18:27:42 -07006797 wait_for_completion(&ctx->completions[0]);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006798 io_ring_ctx_free(ctx);
6799}
6800
6801static int io_uring_release(struct inode *inode, struct file *file)
6802{
6803 struct io_ring_ctx *ctx = file->private_data;
6804
6805 file->private_data = NULL;
6806 io_ring_ctx_wait_and_kill(ctx);
6807 return 0;
6808}
6809
Jens Axboefcb323c2019-10-24 12:39:47 -06006810static void io_uring_cancel_files(struct io_ring_ctx *ctx,
6811 struct files_struct *files)
6812{
6813 struct io_kiocb *req;
6814 DEFINE_WAIT(wait);
6815
6816 while (!list_empty_careful(&ctx->inflight_list)) {
Jens Axboe768134d2019-11-10 20:30:53 -07006817 struct io_kiocb *cancel_req = NULL;
Jens Axboefcb323c2019-10-24 12:39:47 -06006818
6819 spin_lock_irq(&ctx->inflight_lock);
6820 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
Jens Axboe768134d2019-11-10 20:30:53 -07006821 if (req->work.files != files)
6822 continue;
6823 /* req is being completed, ignore */
6824 if (!refcount_inc_not_zero(&req->refs))
6825 continue;
6826 cancel_req = req;
6827 break;
Jens Axboefcb323c2019-10-24 12:39:47 -06006828 }
Jens Axboe768134d2019-11-10 20:30:53 -07006829 if (cancel_req)
Jens Axboefcb323c2019-10-24 12:39:47 -06006830 prepare_to_wait(&ctx->inflight_wait, &wait,
Jens Axboe768134d2019-11-10 20:30:53 -07006831 TASK_UNINTERRUPTIBLE);
Jens Axboefcb323c2019-10-24 12:39:47 -06006832 spin_unlock_irq(&ctx->inflight_lock);
6833
Jens Axboe768134d2019-11-10 20:30:53 -07006834 /* We need to keep going until we don't find a matching req */
6835 if (!cancel_req)
Jens Axboefcb323c2019-10-24 12:39:47 -06006836 break;
Bob Liu2f6d9b92019-11-13 18:06:24 +08006837
Jens Axboe2ca10252020-02-13 17:17:35 -07006838 if (cancel_req->flags & REQ_F_OVERFLOW) {
6839 spin_lock_irq(&ctx->completion_lock);
6840 list_del(&cancel_req->list);
6841 cancel_req->flags &= ~REQ_F_OVERFLOW;
6842 if (list_empty(&ctx->cq_overflow_list)) {
6843 clear_bit(0, &ctx->sq_check_overflow);
6844 clear_bit(0, &ctx->cq_check_overflow);
6845 }
6846 spin_unlock_irq(&ctx->completion_lock);
6847
6848 WRITE_ONCE(ctx->rings->cq_overflow,
6849 atomic_inc_return(&ctx->cached_cq_overflow));
6850
6851 /*
6852 * Put inflight ref and overflow ref. If that's
6853 * all we had, then we're done with this request.
6854 */
6855 if (refcount_sub_and_test(2, &cancel_req->refs)) {
6856 io_put_req(cancel_req);
6857 continue;
6858 }
6859 }
6860
Bob Liu2f6d9b92019-11-13 18:06:24 +08006861 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
6862 io_put_req(cancel_req);
Jens Axboefcb323c2019-10-24 12:39:47 -06006863 schedule();
6864 }
Jens Axboe768134d2019-11-10 20:30:53 -07006865 finish_wait(&ctx->inflight_wait, &wait);
Jens Axboefcb323c2019-10-24 12:39:47 -06006866}
6867
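/*
 * ->flush() runs on every close of the ring fd, not just the last one:
 * cancel requests tied to the closing task's file table, and if the task
 * itself is exiting, cancel any io-wq work it has queued.
 */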
6868static int io_uring_flush(struct file *file, void *data)
6869{
6870 struct io_ring_ctx *ctx = file->private_data;
6871
6872 io_uring_cancel_files(ctx, data);
Jens Axboe6ab23142020-02-08 20:23:59 -07006873
6874	/*
6875	 * If the task is going away, cancel any work it still has pending.
6876	 */
6877 if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
6878 io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
6879
Jens Axboefcb323c2019-10-24 12:39:47 -06006880 return 0;
6881}
6882
Roman Penyaev6c5c2402019-11-28 12:53:22 +01006883static void *io_uring_validate_mmap_request(struct file *file,
6884 loff_t pgoff, size_t sz)
Jens Axboe2b188cc2019-01-07 10:46:33 -07006885{
Jens Axboe2b188cc2019-01-07 10:46:33 -07006886 struct io_ring_ctx *ctx = file->private_data;
Roman Penyaev6c5c2402019-11-28 12:53:22 +01006887 loff_t offset = pgoff << PAGE_SHIFT;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006888 struct page *page;
6889 void *ptr;
6890
6891 switch (offset) {
6892 case IORING_OFF_SQ_RING:
Hristo Venev75b28af2019-08-26 17:23:46 +00006893 case IORING_OFF_CQ_RING:
6894 ptr = ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006895 break;
6896 case IORING_OFF_SQES:
6897 ptr = ctx->sq_sqes;
6898 break;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006899 default:
Roman Penyaev6c5c2402019-11-28 12:53:22 +01006900 return ERR_PTR(-EINVAL);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006901 }
6902
6903 page = virt_to_head_page(ptr);
Matthew Wilcox (Oracle)a50b8542019-09-23 15:34:25 -07006904 if (sz > page_size(page))
Roman Penyaev6c5c2402019-11-28 12:53:22 +01006905 return ERR_PTR(-EINVAL);
6906
6907 return ptr;
6908}
6909
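/*
 * The offsets handled by io_uring_validate_mmap_request() above are the
 * application's names for the three shared regions. A minimal userspace
 * sketch (illustrative only, not kernel code; "fd" is a ring fd and the
 * IORING_OFF_* constants come from the uapi header):
 *
 *	struct io_uring_params p = { 0 };
 *	int fd = syscall(__NR_io_uring_setup, 32, &p);
 *	size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
 *	void *sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, fd,
 *			     IORING_OFF_SQ_RING);
 *	void *sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
 *			  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			  fd, IORING_OFF_SQES);
 *
 * With IORING_FEAT_SINGLE_MMAP the CQ ring lives in the same mapping as
 * the SQ ring; otherwise it is mapped separately at IORING_OFF_CQ_RING.
 * liburing's io_uring_queue_init() wraps all of this.
 */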
6910#ifdef CONFIG_MMU
6911
6912static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6913{
6914 size_t sz = vma->vm_end - vma->vm_start;
6915 unsigned long pfn;
6916 void *ptr;
6917
6918 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
6919 if (IS_ERR(ptr))
6920 return PTR_ERR(ptr);
Jens Axboe2b188cc2019-01-07 10:46:33 -07006921
6922 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
6923 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
6924}
6925
Roman Penyaev6c5c2402019-11-28 12:53:22 +01006926#else /* !CONFIG_MMU */
6927
6928static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6929{
6930 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
6931}
6932
6933static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
6934{
6935 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
6936}
6937
6938static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
6939 unsigned long addr, unsigned long len,
6940 unsigned long pgoff, unsigned long flags)
6941{
6942 void *ptr;
6943
6944 ptr = io_uring_validate_mmap_request(file, pgoff, len);
6945 if (IS_ERR(ptr))
6946 return PTR_ERR(ptr);
6947
6948 return (unsigned long) ptr;
6949}
6950
6951#endif /* !CONFIG_MMU */
6952
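/*
 * io_uring_enter() is how the application tells the kernel about new SQ
 * ring entries and/or waits for completions. With IORING_SETUP_SQPOLL the
 * kernel-side thread consumes the SQ ring on its own, so "submission" here
 * reduces to an optional IORING_ENTER_SQ_WAKEUP. An illustrative userspace
 * call (a sketch, not kernel code; "ring_fd" and "to_submit" are
 * application-side, and liburing's io_uring_submit_and_wait() wraps this):
 *
 *	syscall(__NR_io_uring_enter, ring_fd, to_submit, 1,
 *		IORING_ENTER_GETEVENTS, NULL, 0);
 */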
Jens Axboe2b188cc2019-01-07 10:46:33 -07006953SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
6954 u32, min_complete, u32, flags, const sigset_t __user *, sig,
6955 size_t, sigsz)
6956{
6957 struct io_ring_ctx *ctx;
6958 long ret = -EBADF;
6959 int submitted = 0;
6960 struct fd f;
6961
Jens Axboeb41e9852020-02-17 09:52:41 -07006962 if (current->task_works)
6963 task_work_run();
6964
Jens Axboe6c271ce2019-01-10 11:22:30 -07006965 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
Jens Axboe2b188cc2019-01-07 10:46:33 -07006966 return -EINVAL;
6967
6968 f = fdget(fd);
6969 if (!f.file)
6970 return -EBADF;
6971
6972 ret = -EOPNOTSUPP;
6973 if (f.file->f_op != &io_uring_fops)
6974 goto out_fput;
6975
6976 ret = -ENXIO;
6977 ctx = f.file->private_data;
6978 if (!percpu_ref_tryget(&ctx->refs))
6979 goto out_fput;
6980
Jens Axboe6c271ce2019-01-10 11:22:30 -07006981 /*
6982 * For SQ polling, the thread will do all submissions and completions.
6983 * Just return the requested submit count, and wake the thread if
6984 * we were asked to.
6985 */
Jens Axboeb2a9ead2019-09-12 14:19:16 -06006986 ret = 0;
Jens Axboe6c271ce2019-01-10 11:22:30 -07006987 if (ctx->flags & IORING_SETUP_SQPOLL) {
Jens Axboec1edbf52019-11-10 16:56:04 -07006988 if (!list_empty_careful(&ctx->cq_overflow_list))
6989 io_cqring_overflow_flush(ctx, false);
Jens Axboe6c271ce2019-01-10 11:22:30 -07006990 if (flags & IORING_ENTER_SQ_WAKEUP)
6991 wake_up(&ctx->sqo_wait);
6992 submitted = to_submit;
Jens Axboeb2a9ead2019-09-12 14:19:16 -06006993 } else if (to_submit) {
Pavel Begunkovae9428c2019-11-06 00:22:14 +03006994 struct mm_struct *cur_mm;
Jens Axboe2b188cc2019-01-07 10:46:33 -07006995
6996 mutex_lock(&ctx->uring_lock);
Pavel Begunkovae9428c2019-11-06 00:22:14 +03006997 /* already have mm, so io_submit_sqes() won't try to grab it */
6998 cur_mm = ctx->sqo_mm;
6999 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
7000 &cur_mm, false);
Jens Axboe2b188cc2019-01-07 10:46:33 -07007001 mutex_unlock(&ctx->uring_lock);
Pavel Begunkov7c504e652019-12-18 19:53:45 +03007002
7003 if (submitted != to_submit)
7004 goto out;
Jens Axboe2b188cc2019-01-07 10:46:33 -07007005 }
7006 if (flags & IORING_ENTER_GETEVENTS) {
Jens Axboedef596e2019-01-09 08:59:42 -07007007 unsigned nr_events = 0;
7008
Jens Axboe2b188cc2019-01-07 10:46:33 -07007009 min_complete = min(min_complete, ctx->cq_entries);
7010
Jens Axboedef596e2019-01-09 08:59:42 -07007011 if (ctx->flags & IORING_SETUP_IOPOLL) {
Jens Axboedef596e2019-01-09 08:59:42 -07007012 ret = io_iopoll_check(ctx, &nr_events, min_complete);
Jens Axboedef596e2019-01-09 08:59:42 -07007013 } else {
7014 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
7015 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07007016 }
7017
Pavel Begunkov7c504e652019-12-18 19:53:45 +03007018out:
Pavel Begunkov6805b322019-10-08 02:18:42 +03007019 percpu_ref_put(&ctx->refs);
Jens Axboe2b188cc2019-01-07 10:46:33 -07007020out_fput:
7021 fdput(f);
7022 return submitted ? submitted : ret;
7023}
7024
Tobias Klauserbebdb652020-02-26 18:38:32 +01007025#ifdef CONFIG_PROC_FS
Jens Axboe87ce9552020-01-30 08:25:34 -07007026static int io_uring_show_cred(int id, void *p, void *data)
7027{
7028 const struct cred *cred = p;
7029 struct seq_file *m = data;
7030 struct user_namespace *uns = seq_user_ns(m);
7031 struct group_info *gi;
7032 kernel_cap_t cap;
7033 unsigned __capi;
7034 int g;
7035
7036 seq_printf(m, "%5d\n", id);
7037 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
7038 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
7039 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
7040 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
7041 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
7042 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
7043 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
7044 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
7045 seq_puts(m, "\n\tGroups:\t");
7046 gi = cred->group_info;
7047 for (g = 0; g < gi->ngroups; g++) {
7048 seq_put_decimal_ull(m, g ? " " : "",
7049 from_kgid_munged(uns, gi->gid[g]));
7050 }
7051 seq_puts(m, "\n\tCapEff:\t");
7052 cap = cred->cap_effective;
7053 CAP_FOR_EACH_U32(__capi)
7054 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
7055 seq_putc(m, '\n');
7056 return 0;
7057}
7058
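/*
 * Backs /proc/<pid>/fdinfo/<ring fd> when CONFIG_PROC_FS is enabled:
 * reports the registered files (UserFiles), registered buffers (UserBufs),
 * registered personalities and the currently armed poll requests
 * (PollList) for the ring.
 */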
7059static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
7060{
7061 int i;
7062
7063 mutex_lock(&ctx->uring_lock);
7064 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
7065 for (i = 0; i < ctx->nr_user_files; i++) {
7066 struct fixed_file_table *table;
7067 struct file *f;
7068
7069 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7070 f = table->files[i & IORING_FILE_TABLE_MASK];
7071 if (f)
7072 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
7073 else
7074 seq_printf(m, "%5u: <none>\n", i);
7075 }
7076 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
7077 for (i = 0; i < ctx->nr_user_bufs; i++) {
7078 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
7079
7080 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
7081 (unsigned int) buf->len);
7082 }
7083 if (!idr_is_empty(&ctx->personality_idr)) {
7084 seq_printf(m, "Personalities:\n");
7085 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
7086 }
Jens Axboed7718a92020-02-14 22:23:12 -07007087 seq_printf(m, "PollList:\n");
7088 spin_lock_irq(&ctx->completion_lock);
7089 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
7090 struct hlist_head *list = &ctx->cancel_hash[i];
7091 struct io_kiocb *req;
7092
7093 hlist_for_each_entry(req, list, hash_node)
7094 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
7095 req->task->task_works != NULL);
7096 }
7097 spin_unlock_irq(&ctx->completion_lock);
Jens Axboe87ce9552020-01-30 08:25:34 -07007098 mutex_unlock(&ctx->uring_lock);
7099}
7100
7101static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
7102{
7103 struct io_ring_ctx *ctx = f->private_data;
7104
7105 if (percpu_ref_tryget(&ctx->refs)) {
7106 __io_uring_show_fdinfo(ctx, m);
7107 percpu_ref_put(&ctx->refs);
7108 }
7109}
Tobias Klauserbebdb652020-02-26 18:38:32 +01007110#endif
Jens Axboe87ce9552020-01-30 08:25:34 -07007111
Jens Axboe2b188cc2019-01-07 10:46:33 -07007112static const struct file_operations io_uring_fops = {
7113 .release = io_uring_release,
Jens Axboefcb323c2019-10-24 12:39:47 -06007114 .flush = io_uring_flush,
Jens Axboe2b188cc2019-01-07 10:46:33 -07007115 .mmap = io_uring_mmap,
Roman Penyaev6c5c2402019-11-28 12:53:22 +01007116#ifndef CONFIG_MMU
7117 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
7118 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
7119#endif
Jens Axboe2b188cc2019-01-07 10:46:33 -07007120 .poll = io_uring_poll,
7121 .fasync = io_uring_fasync,
Tobias Klauserbebdb652020-02-26 18:38:32 +01007122#ifdef CONFIG_PROC_FS
Jens Axboe87ce9552020-01-30 08:25:34 -07007123 .show_fdinfo = io_uring_show_fdinfo,
Tobias Klauserbebdb652020-02-26 18:38:32 +01007124#endif
Jens Axboe2b188cc2019-01-07 10:46:33 -07007125};
7126
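/*
 * Ring memory layout set up below (one io_mem_alloc() for the rings, one
 * for the sqe array; both entry counts are powers of two):
 *
 *	ctx->rings:   [ struct io_rings | SQ index array, sq_entries * u32 ]
 *	ctx->sq_sqes: [ sq_entries * struct io_uring_sqe ]
 *
 * The application later maps these regions via IORING_OFF_SQ_RING,
 * IORING_OFF_CQ_RING and IORING_OFF_SQES.
 */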
7127static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
7128 struct io_uring_params *p)
7129{
Hristo Venev75b28af2019-08-26 17:23:46 +00007130 struct io_rings *rings;
7131 size_t size, sq_array_offset;
Jens Axboe2b188cc2019-01-07 10:46:33 -07007132
Hristo Venev75b28af2019-08-26 17:23:46 +00007133 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
7134 if (size == SIZE_MAX)
7135 return -EOVERFLOW;
7136
7137 rings = io_mem_alloc(size);
7138 if (!rings)
Jens Axboe2b188cc2019-01-07 10:46:33 -07007139 return -ENOMEM;
7140
Hristo Venev75b28af2019-08-26 17:23:46 +00007141 ctx->rings = rings;
7142 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
7143 rings->sq_ring_mask = p->sq_entries - 1;
7144 rings->cq_ring_mask = p->cq_entries - 1;
7145 rings->sq_ring_entries = p->sq_entries;
7146 rings->cq_ring_entries = p->cq_entries;
7147 ctx->sq_mask = rings->sq_ring_mask;
7148 ctx->cq_mask = rings->cq_ring_mask;
7149 ctx->sq_entries = rings->sq_ring_entries;
7150 ctx->cq_entries = rings->cq_ring_entries;
Jens Axboe2b188cc2019-01-07 10:46:33 -07007151
7152 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
Jens Axboeeb065d32019-11-20 09:26:29 -07007153 if (size == SIZE_MAX) {
7154 io_mem_free(ctx->rings);
7155 ctx->rings = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07007156 return -EOVERFLOW;
Jens Axboeeb065d32019-11-20 09:26:29 -07007157 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07007158
7159 ctx->sq_sqes = io_mem_alloc(size);
Jens Axboeeb065d32019-11-20 09:26:29 -07007160 if (!ctx->sq_sqes) {
7161 io_mem_free(ctx->rings);
7162 ctx->rings = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07007163 return -ENOMEM;
Jens Axboeeb065d32019-11-20 09:26:29 -07007164 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07007165
Jens Axboe2b188cc2019-01-07 10:46:33 -07007166 return 0;
7167}
7168
7169/*
7170 * Allocate an anonymous fd; this is what constitutes the application-
7171 * visible backing of an io_uring instance. The application mmaps this
7172 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
7173 * we have to tie this fd to a socket for file garbage collection purposes.
7174 */
7175static int io_uring_get_fd(struct io_ring_ctx *ctx)
7176{
7177 struct file *file;
7178 int ret;
7179
7180#if defined(CONFIG_UNIX)
7181 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
7182 &ctx->ring_sock);
7183 if (ret)
7184 return ret;
7185#endif
7186
7187 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
7188 if (ret < 0)
7189 goto err;
7190
7191 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
7192 O_RDWR | O_CLOEXEC);
7193 if (IS_ERR(file)) {
7194 put_unused_fd(ret);
7195 ret = PTR_ERR(file);
7196 goto err;
7197 }
7198
7199#if defined(CONFIG_UNIX)
7200 ctx->ring_sock->file = file;
7201#endif
7202 fd_install(ret, file);
7203 return ret;
7204err:
7205#if defined(CONFIG_UNIX)
7206 sock_release(ctx->ring_sock);
7207 ctx->ring_sock = NULL;
7208#endif
7209 return ret;
7210}
7211
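/*
 * Core of io_uring_setup(): validate (and, with IORING_SETUP_CLAMP, clamp)
 * the requested sizes, round them up to powers of two, charge locked
 * memory to the user unless it has CAP_IPC_LOCK, allocate the ctx and
 * rings, start the SQPOLL/io-wq offload machinery, report ring offsets and
 * feature flags back through *p, and only then install the ring fd.
 */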
7212static int io_uring_create(unsigned entries, struct io_uring_params *p)
7213{
7214 struct user_struct *user = NULL;
7215 struct io_ring_ctx *ctx;
7216 bool account_mem;
7217 int ret;
7218
Jens Axboe8110c1a2019-12-28 15:39:54 -07007219 if (!entries)
Jens Axboe2b188cc2019-01-07 10:46:33 -07007220 return -EINVAL;
Jens Axboe8110c1a2019-12-28 15:39:54 -07007221 if (entries > IORING_MAX_ENTRIES) {
7222 if (!(p->flags & IORING_SETUP_CLAMP))
7223 return -EINVAL;
7224 entries = IORING_MAX_ENTRIES;
7225 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07007226
7227	/*
7228	 * Use twice as many entries for the CQ ring by default. The application
7229	 * can drive a higher completion depth than the SQ ring size, since the
7230	 * sqes are only used at submission time, so a larger CQ ring allows a
Jens Axboe33a107f2019-10-04 12:10:03 -06007231	 * bit of overcommit. If the application has set IORING_SETUP_CQSIZE,
7232	 * it will have passed in the desired number of CQ ring entries
7233	 * manually.
Jens Axboe2b188cc2019-01-07 10:46:33 -07007234	 */
7235 p->sq_entries = roundup_pow_of_two(entries);
Jens Axboe33a107f2019-10-04 12:10:03 -06007236 if (p->flags & IORING_SETUP_CQSIZE) {
7237		/*
7238		 * If IORING_SETUP_CQSIZE is set, round the given CQ size up
7239		 * to a power-of-two as well. The only constraint we impose is
7240		 * that the CQ ring must be at least as large as the SQ ring.
7241		 */
Jens Axboe8110c1a2019-12-28 15:39:54 -07007242 if (p->cq_entries < p->sq_entries)
Jens Axboe33a107f2019-10-04 12:10:03 -06007243 return -EINVAL;
Jens Axboe8110c1a2019-12-28 15:39:54 -07007244 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
7245 if (!(p->flags & IORING_SETUP_CLAMP))
7246 return -EINVAL;
7247 p->cq_entries = IORING_MAX_CQ_ENTRIES;
7248 }
Jens Axboe33a107f2019-10-04 12:10:03 -06007249 p->cq_entries = roundup_pow_of_two(p->cq_entries);
7250 } else {
7251 p->cq_entries = 2 * p->sq_entries;
7252 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07007253
7254 user = get_uid(current_user());
7255 account_mem = !capable(CAP_IPC_LOCK);
7256
7257 if (account_mem) {
7258 ret = io_account_mem(user,
7259 ring_pages(p->sq_entries, p->cq_entries));
7260 if (ret) {
7261 free_uid(user);
7262 return ret;
7263 }
7264 }
7265
7266 ctx = io_ring_ctx_alloc(p);
7267 if (!ctx) {
7268 if (account_mem)
7269 io_unaccount_mem(user, ring_pages(p->sq_entries,
7270 p->cq_entries));
7271 free_uid(user);
7272 return -ENOMEM;
7273 }
7274 ctx->compat = in_compat_syscall();
7275 ctx->account_mem = account_mem;
7276 ctx->user = user;
Jens Axboe0b8c0ec2019-12-02 08:50:00 -07007277 ctx->creds = get_current_cred();
Jens Axboe2b188cc2019-01-07 10:46:33 -07007278
7279 ret = io_allocate_scq_urings(ctx, p);
7280 if (ret)
7281 goto err;
7282
Jens Axboe6c271ce2019-01-10 11:22:30 -07007283 ret = io_sq_offload_start(ctx, p);
Jens Axboe2b188cc2019-01-07 10:46:33 -07007284 if (ret)
7285 goto err;
7286
Jens Axboe2b188cc2019-01-07 10:46:33 -07007287 memset(&p->sq_off, 0, sizeof(p->sq_off));
Hristo Venev75b28af2019-08-26 17:23:46 +00007288 p->sq_off.head = offsetof(struct io_rings, sq.head);
7289 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
7290 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
7291 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
7292 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
7293 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
7294 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
Jens Axboe2b188cc2019-01-07 10:46:33 -07007295
7296 memset(&p->cq_off, 0, sizeof(p->cq_off));
Hristo Venev75b28af2019-08-26 17:23:46 +00007297 p->cq_off.head = offsetof(struct io_rings, cq.head);
7298 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
7299 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
7300 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
7301 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
7302 p->cq_off.cqes = offsetof(struct io_rings, cqes);
Jens Axboeac90f242019-09-06 10:26:21 -06007303
Jens Axboe044c1ab2019-10-28 09:15:33 -06007304 /*
7305 * Install ring fd as the very last thing, so we don't risk someone
7306 * having closed it before we finish setup
7307 */
7308 ret = io_uring_get_fd(ctx);
7309 if (ret < 0)
7310 goto err;
7311
Jens Axboeda8c9692019-12-02 18:51:26 -07007312 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
Jens Axboecccf0ee2020-01-27 16:34:48 -07007313 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
Jens Axboed7718a92020-02-14 22:23:12 -07007314 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02007315 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
Jens Axboe2b188cc2019-01-07 10:46:33 -07007316 return ret;
7317err:
7318 io_ring_ctx_wait_and_kill(ctx);
7319 return ret;
7320}
7321
7322/*
7323 * Sets up an io_uring context and returns the fd. The application asks for a
7324 * ring size; we return the actual sq/cq ring sizes (among other things) in the
7325 * params structure passed in.
7326 */
7327static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
7328{
7329 struct io_uring_params p;
7330 long ret;
7331 int i;
7332
7333 if (copy_from_user(&p, params, sizeof(p)))
7334 return -EFAULT;
7335 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
7336 if (p.resv[i])
7337 return -EINVAL;
7338 }
7339
Jens Axboe6c271ce2019-01-10 11:22:30 -07007340 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
Jens Axboe8110c1a2019-12-28 15:39:54 -07007341 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
Pavel Begunkov24369c22020-01-28 03:15:48 +03007342 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
Jens Axboe2b188cc2019-01-07 10:46:33 -07007343 return -EINVAL;
7344
7345 ret = io_uring_create(entries, &p);
7346 if (ret < 0)
7347 return ret;
7348
7349 if (copy_to_user(params, &p, sizeof(p)))
7350 return -EFAULT;
7351
7352 return ret;
7353}
7354
7355SYSCALL_DEFINE2(io_uring_setup, u32, entries,
7356 struct io_uring_params __user *, params)
7357{
7358 return io_uring_setup(entries, params);
7359}
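
/*
 * Illustrative userspace use of the setup syscall above (a sketch, not
 * kernel code; error handling omitted):
 *
 *	struct io_uring_params p = { 0 };
 *	int ring_fd = syscall(__NR_io_uring_setup, 256, &p);
 *
 * On return, p.sq_entries and p.cq_entries hold the actual (rounded) ring
 * sizes, p.sq_off/p.cq_off describe the ring field offsets for mmap(), and
 * p.features advertises flags such as IORING_FEAT_SINGLE_MMAP.
 */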
7360
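/*
 * IORING_REGISTER_PROBE lets the application discover which opcodes this
 * kernel supports. A hedged userspace sketch (not kernel code; the probe
 * structure must be zeroed, and 256 is the maximum op count accepted
 * below):
 *
 *	struct io_uring_probe *probe;
 *
 *	probe = calloc(1, sizeof(*probe) +
 *			  256 * sizeof(struct io_uring_probe_op));
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		probe, 256);
 *	if (probe->ops[IORING_OP_READV].flags & IO_URING_OP_SUPPORTED)
 *		... IORING_OP_READV is available on this kernel ...
 */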
Jens Axboe66f4af92020-01-16 15:36:52 -07007361static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
7362{
7363 struct io_uring_probe *p;
7364 size_t size;
7365 int i, ret;
7366
7367 size = struct_size(p, ops, nr_args);
7368 if (size == SIZE_MAX)
7369 return -EOVERFLOW;
7370 p = kzalloc(size, GFP_KERNEL);
7371 if (!p)
7372 return -ENOMEM;
7373
7374 ret = -EFAULT;
7375 if (copy_from_user(p, arg, size))
7376 goto out;
7377 ret = -EINVAL;
7378 if (memchr_inv(p, 0, size))
7379 goto out;
7380
7381 p->last_op = IORING_OP_LAST - 1;
7382 if (nr_args > IORING_OP_LAST)
7383 nr_args = IORING_OP_LAST;
7384
7385 for (i = 0; i < nr_args; i++) {
7386 p->ops[i].op = i;
7387 if (!io_op_defs[i].not_supported)
7388 p->ops[i].flags = IO_URING_OP_SUPPORTED;
7389 }
7390 p->ops_len = i;
7391
7392 ret = 0;
7393 if (copy_to_user(arg, p, size))
7394 ret = -EFAULT;
7395out:
7396 kfree(p);
7397 return ret;
7398}
7399
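/*
 * Personalities: IORING_REGISTER_PERSONALITY snapshots the caller's
 * credentials and returns an id (>= 1). An sqe can then name that id in
 * its personality field so the request is issued with those credentials
 * rather than the ones the ring was created with; the id stays valid until
 * IORING_UNREGISTER_PERSONALITY or ring teardown.
 */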
Jens Axboe071698e2020-01-28 10:04:42 -07007400static int io_register_personality(struct io_ring_ctx *ctx)
7401{
7402 const struct cred *creds = get_current_cred();
7403 int id;
7404
7405 id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
7406 USHRT_MAX, GFP_KERNEL);
7407 if (id < 0)
7408 put_cred(creds);
7409 return id;
7410}
7411
7412static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
7413{
7414 const struct cred *old_creds;
7415
7416 old_creds = idr_remove(&ctx->personality_idr, id);
7417 if (old_creds) {
7418 put_cred(old_creds);
7419 return 0;
7420 }
7421
7422 return -EINVAL;
7423}
7424
7425static bool io_register_op_must_quiesce(int op)
7426{
7427 switch (op) {
7428 case IORING_UNREGISTER_FILES:
7429 case IORING_REGISTER_FILES_UPDATE:
7430 case IORING_REGISTER_PROBE:
7431 case IORING_REGISTER_PERSONALITY:
7432 case IORING_UNREGISTER_PERSONALITY:
7433 return false;
7434 default:
7435 return true;
7436 }
7437}
7438
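/*
 * Most registration opcodes require the ring to be quiesced: kill the
 * percpu ref, drop uring_lock while waiting for existing requests to
 * drain, apply the update, then resurrect the ref. The opcodes filtered
 * out by io_register_op_must_quiesce() are safe to run with requests still
 * in flight.
 */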
Jens Axboeedafcce2019-01-09 09:16:05 -07007439static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
7440 void __user *arg, unsigned nr_args)
Jens Axboeb19062a2019-04-15 10:49:38 -06007441 __releases(ctx->uring_lock)
7442 __acquires(ctx->uring_lock)
Jens Axboeedafcce2019-01-09 09:16:05 -07007443{
7444 int ret;
7445
Jens Axboe35fa71a2019-04-22 10:23:23 -06007446	/*
7447	 * We're inside the ring mutex; if the ref is already dying, then
7448	 * someone else killed the ctx or is already going through
7449	 * io_uring_register().
7450	 */
7451 if (percpu_ref_is_dying(&ctx->refs))
7452 return -ENXIO;
7453
Jens Axboe071698e2020-01-28 10:04:42 -07007454 if (io_register_op_must_quiesce(opcode)) {
Jens Axboe05f3fb32019-12-09 11:22:50 -07007455 percpu_ref_kill(&ctx->refs);
Jens Axboeb19062a2019-04-15 10:49:38 -06007456
Jens Axboe05f3fb32019-12-09 11:22:50 -07007457 /*
7458 * Drop uring mutex before waiting for references to exit. If
7459 * another thread is currently inside io_uring_enter() it might
7460 * need to grab the uring_lock to make progress. If we hold it
7461 * here across the drain wait, then we can deadlock. It's safe
7462 * to drop the mutex here, since no new references will come in
7463 * after we've killed the percpu ref.
7464 */
7465 mutex_unlock(&ctx->uring_lock);
Jens Axboec1503682020-01-08 08:26:07 -07007466 ret = wait_for_completion_interruptible(&ctx->completions[0]);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007467 mutex_lock(&ctx->uring_lock);
Jens Axboec1503682020-01-08 08:26:07 -07007468 if (ret) {
7469 percpu_ref_resurrect(&ctx->refs);
7470 ret = -EINTR;
7471 goto out;
7472 }
Jens Axboe05f3fb32019-12-09 11:22:50 -07007473 }
Jens Axboeedafcce2019-01-09 09:16:05 -07007474
7475 switch (opcode) {
7476 case IORING_REGISTER_BUFFERS:
7477 ret = io_sqe_buffer_register(ctx, arg, nr_args);
7478 break;
7479 case IORING_UNREGISTER_BUFFERS:
7480 ret = -EINVAL;
7481 if (arg || nr_args)
7482 break;
7483 ret = io_sqe_buffer_unregister(ctx);
7484 break;
Jens Axboe6b063142019-01-10 22:13:58 -07007485 case IORING_REGISTER_FILES:
7486 ret = io_sqe_files_register(ctx, arg, nr_args);
7487 break;
7488 case IORING_UNREGISTER_FILES:
7489 ret = -EINVAL;
7490 if (arg || nr_args)
7491 break;
7492 ret = io_sqe_files_unregister(ctx);
7493 break;
Jens Axboec3a31e62019-10-03 13:59:56 -06007494 case IORING_REGISTER_FILES_UPDATE:
7495 ret = io_sqe_files_update(ctx, arg, nr_args);
7496 break;
Jens Axboe9b402842019-04-11 11:45:41 -06007497 case IORING_REGISTER_EVENTFD:
Jens Axboef2842ab2020-01-08 11:04:00 -07007498 case IORING_REGISTER_EVENTFD_ASYNC:
Jens Axboe9b402842019-04-11 11:45:41 -06007499 ret = -EINVAL;
7500 if (nr_args != 1)
7501 break;
7502 ret = io_eventfd_register(ctx, arg);
Jens Axboef2842ab2020-01-08 11:04:00 -07007503 if (ret)
7504 break;
7505 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
7506 ctx->eventfd_async = 1;
7507 else
7508 ctx->eventfd_async = 0;
Jens Axboe9b402842019-04-11 11:45:41 -06007509 break;
7510 case IORING_UNREGISTER_EVENTFD:
7511 ret = -EINVAL;
7512 if (arg || nr_args)
7513 break;
7514 ret = io_eventfd_unregister(ctx);
7515 break;
Jens Axboe66f4af92020-01-16 15:36:52 -07007516 case IORING_REGISTER_PROBE:
7517 ret = -EINVAL;
7518 if (!arg || nr_args > 256)
7519 break;
7520 ret = io_probe(ctx, arg, nr_args);
7521 break;
Jens Axboe071698e2020-01-28 10:04:42 -07007522 case IORING_REGISTER_PERSONALITY:
7523 ret = -EINVAL;
7524 if (arg || nr_args)
7525 break;
7526 ret = io_register_personality(ctx);
7527 break;
7528 case IORING_UNREGISTER_PERSONALITY:
7529 ret = -EINVAL;
7530 if (arg)
7531 break;
7532 ret = io_unregister_personality(ctx, nr_args);
7533 break;
Jens Axboeedafcce2019-01-09 09:16:05 -07007534 default:
7535 ret = -EINVAL;
7536 break;
7537 }
7538
Jens Axboe071698e2020-01-28 10:04:42 -07007539 if (io_register_op_must_quiesce(opcode)) {
Jens Axboe05f3fb32019-12-09 11:22:50 -07007540 /* bring the ctx back to life */
Jens Axboe05f3fb32019-12-09 11:22:50 -07007541 percpu_ref_reinit(&ctx->refs);
Jens Axboec1503682020-01-08 08:26:07 -07007542out:
7543 reinit_completion(&ctx->completions[0]);
Jens Axboe05f3fb32019-12-09 11:22:50 -07007544 }
Jens Axboeedafcce2019-01-09 09:16:05 -07007545 return ret;
7546}
7547
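/*
 * Example userspace call registering a fixed buffer (an illustrative
 * sketch, not kernel code; "ring_fd" and "buf" are application-side):
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_BUFFERS,
 *		&iov, 1);
 *
 * The buffer can then be used by IORING_OP_READ_FIXED/WRITE_FIXED with
 * sqe->buf_index set to 0.
 */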
7548SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
7549 void __user *, arg, unsigned int, nr_args)
7550{
7551 struct io_ring_ctx *ctx;
7552 long ret = -EBADF;
7553 struct fd f;
7554
7555 f = fdget(fd);
7556 if (!f.file)
7557 return -EBADF;
7558
7559 ret = -EOPNOTSUPP;
7560 if (f.file->f_op != &io_uring_fops)
7561 goto out_fput;
7562
7563 ctx = f.file->private_data;
7564
7565 mutex_lock(&ctx->uring_lock);
7566 ret = __io_uring_register(ctx, opcode, arg, nr_args);
7567 mutex_unlock(&ctx->uring_lock);
Dmitrii Dolgovc826bd72019-10-15 19:02:01 +02007568 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
7569 ctx->cq_ev_fd != NULL, ret);
Jens Axboeedafcce2019-01-09 09:16:05 -07007570out_fput:
7571 fdput(f);
7572 return ret;
7573}
7574
Jens Axboe2b188cc2019-01-07 10:46:33 -07007575static int __init io_uring_init(void)
7576{
Stefan Metzmacherd7f62e82020-01-29 14:39:41 +01007577#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
7578 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
7579 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
7580} while (0)
7581
7582#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
7583 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
7584 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
7585 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
7586 BUILD_BUG_SQE_ELEM(1, __u8, flags);
7587 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
7588 BUILD_BUG_SQE_ELEM(4, __s32, fd);
7589 BUILD_BUG_SQE_ELEM(8, __u64, off);
7590 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
7591 BUILD_BUG_SQE_ELEM(16, __u64, addr);
Pavel Begunkov7d67af22020-02-24 11:32:45 +03007592 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
Stefan Metzmacherd7f62e82020-01-29 14:39:41 +01007593 BUILD_BUG_SQE_ELEM(24, __u32, len);
7594 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
7595 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
7596 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
7597 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
7598 BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
7599 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
7600 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
7601 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
7602 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
7603 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
7604 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
7605 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
7606 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
Pavel Begunkov7d67af22020-02-24 11:32:45 +03007607 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
Stefan Metzmacherd7f62e82020-01-29 14:39:41 +01007608 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
7609 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
7610 BUILD_BUG_SQE_ELEM(42, __u16, personality);
Pavel Begunkov7d67af22020-02-24 11:32:45 +03007611 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
Stefan Metzmacherd7f62e82020-01-29 14:39:41 +01007612
Jens Axboed3656342019-12-18 09:50:26 -07007613 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
Jens Axboe2b188cc2019-01-07 10:46:33 -07007614 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
7615 return 0;
7616};
7617__initcall(io_uring_init);