Blame - fs/io_uring.c - yocto/kernel/common

blob: d2a3a1bc85cc053a687fd7760f78038cb88c4f87 [file] [log] [blame]

Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Shared application/kernel submission and completion ring pairs, for
				4	* supporting fast/efficient IO.
				5	*
				6	* A note on the read/write ordering memory barriers that are matched between
				7	* the application and kernel side. When the application reads the CQ ring
				8	* tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
				9	* the kernel uses after writing the tail. Failure to do so could cause a
				10	* delay in when the application notices that completion events are available.
				11	* This isn't a fatal condition. Likewise, the application must use an
				12	* appropriate smp_wmb() both before writing the SQ tail, and after writing
				13	* the SQ tail. The first one orders the sqe writes with the tail write, and
				14	* the latter is paired with the smp_rmb() the kernel will issue before
				15	* reading the SQ tail on submission.
				16	*
				17	* Also see the examples in the liburing library:
				18	*
				19	* git://git.kernel.dk/liburing
				20	*
				21	* io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
				22	* from data shared between the kernel and application. This is done both
				23	* for ordering purposes, but also to ensure that once a value is loaded from
				24	* data that the application could potentially modify, it remains stable.
				25	*
				26	* Copyright (C) 2018-2019 Jens Axboe
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	27	* Copyright (c) 2018-2019 Christoph Hellwig
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	28	*/
				29	#include <linux/kernel.h>
				30	#include <linux/init.h>
				31	#include <linux/errno.h>
				32	#include <linux/syscalls.h>
				33	#include <linux/compat.h>
				34	#include <linux/refcount.h>
				35	#include <linux/uio.h>
				36
				37	#include <linux/sched/signal.h>
				38	#include <linux/fs.h>
				39	#include <linux/file.h>
				40	#include <linux/fdtable.h>
				41	#include <linux/mm.h>
				42	#include <linux/mman.h>
				43	#include <linux/mmu_context.h>
				44	#include <linux/percpu.h>
				45	#include <linux/slab.h>
				46	#include <linux/workqueue.h>
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	47	#include <linux/kthread.h>
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	48	#include <linux/blkdev.h>
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	49	#include <linux/bvec.h>
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	50	#include <linux/net.h>
				51	#include <net/sock.h>
				52	#include <net/af_unix.h>
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	53	#include <net/scm.h>
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	54	#include <linux/anon_inodes.h>
				55	#include <linux/sched/mm.h>
				56	#include <linux/uaccess.h>
				57	#include <linux/nospec.h>
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	58	#include <linux/sizes.h>
				59	#include <linux/hugetlb.h>
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	60
				61	#include <uapi/linux/io_uring.h>
				62
				63	#include "internal.h"
				64
				65	#define IORING_MAX_ENTRIES 4096
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	66	#define IORING_MAX_FIXED_FILES 1024
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	67
				68	struct io_uring {
				69	u32 head ____cacheline_aligned_in_smp;
				70	u32 tail ____cacheline_aligned_in_smp;
				71	};
				72
				73	struct io_sq_ring {
				74	struct io_uring r;
				75	u32 ring_mask;
				76	u32 ring_entries;
				77	u32 dropped;
				78	u32 flags;
				79	u32 array[];
				80	};
				81
				82	struct io_cq_ring {
				83	struct io_uring r;
				84	u32 ring_mask;
				85	u32 ring_entries;
				86	u32 overflow;
				87	struct io_uring_cqe cqes[];
				88	};
				89
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	90	struct io_mapped_ubuf {
				91	u64 ubuf;
				92	size_t len;
				93	struct bio_vec *bvec;
				94	unsigned int nr_bvecs;
				95	};
				96
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	97	struct io_ring_ctx {
				98	struct {
				99	struct percpu_ref refs;
				100	} ____cacheline_aligned_in_smp;
				101
				102	struct {
				103	unsigned int flags;
				104	bool compat;
				105	bool account_mem;
				106
				107	/* SQ ring */
				108	struct io_sq_ring *sq_ring;
				109	unsigned cached_sq_head;
				110	unsigned sq_entries;
				111	unsigned sq_mask;
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	112	unsigned sq_thread_idle;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	113	struct io_uring_sqe *sq_sqes;
				114	} ____cacheline_aligned_in_smp;
				115
				116	/* IO offload */
				117	struct workqueue_struct *sqo_wq;
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	118	struct task_struct sqo_thread; / if using sq thread polling */
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	119	struct mm_struct *sqo_mm;
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	120	wait_queue_head_t sqo_wait;
				121	unsigned sqo_stop;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	122
				123	struct {
				124	/* CQ ring */
				125	struct io_cq_ring *cq_ring;
				126	unsigned cached_cq_tail;
				127	unsigned cq_entries;
				128	unsigned cq_mask;
				129	struct wait_queue_head cq_wait;
				130	struct fasync_struct *cq_fasync;
				131	} ____cacheline_aligned_in_smp;
				132
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	133	/*
				134	* If used, fixed file set. Writers must ensure that ->refs is dead,
				135	* readers must ensure that ->refs is alive as long as the file* is
				136	* used. Only updated through io_uring_register(2).
				137	*/
				138	struct file **user_files;
				139	unsigned nr_user_files;
				140
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	141	/* if used, fixed mapped user buffers */
				142	unsigned nr_user_bufs;
				143	struct io_mapped_ubuf *user_bufs;
				144
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	145	struct user_struct *user;
				146
				147	struct completion ctx_done;
				148
				149	struct {
				150	struct mutex uring_lock;
				151	wait_queue_head_t wait;
				152	} ____cacheline_aligned_in_smp;
				153
				154	struct {
				155	spinlock_t completion_lock;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	156	bool poll_multi_file;
				157	/*
				158	* ->poll_list is protected by the ctx->uring_lock for
				159	* io_uring instances that don't use IORING_SETUP_SQPOLL.
				160	* For SQPOLL, only the single threaded io_sq_thread() will
				161	* manipulate the list, hence no extra locking is needed there.
				162	*/
				163	struct list_head poll_list;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	164	} ____cacheline_aligned_in_smp;
				165
				166	#if defined(CONFIG_UNIX)
				167	struct socket *ring_sock;
				168	#endif
				169	};
				170
				171	struct sqe_submit {
				172	const struct io_uring_sqe *sqe;
				173	unsigned short index;
				174	bool has_user;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	175	bool needs_lock;
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	176	bool needs_fixed_file;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	177	};
				178
				179	struct io_kiocb {
				180	struct kiocb rw;
				181
				182	struct sqe_submit submit;
				183
				184	struct io_ring_ctx *ctx;
				185	struct list_head list;
				186	unsigned int flags;
Jens Axboe	c16361c	2019-01-17 08:39:48 -0700	[diff] [blame^]	187	refcount_t refs;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	188	#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	189	#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	190	#define REQ_F_FIXED_FILE 4 /* ctx owns file */
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	191	u64 user_data;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	192	u64 error;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	193
				194	struct work_struct work;
				195	};
				196
				197	#define IO_PLUG_THRESHOLD 2
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	198	#define IO_IOPOLL_BATCH 8
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	199
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	200	struct io_submit_state {
				201	struct blk_plug plug;
				202
				203	/*
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	204	* io_kiocb alloc cache
				205	*/
				206	void *reqs[IO_IOPOLL_BATCH];
				207	unsigned int free_reqs;
				208	unsigned int cur_req;
				209
				210	/*
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	211	* File reference cache
				212	*/
				213	struct file *file;
				214	unsigned int fd;
				215	unsigned int has_refs;
				216	unsigned int used_refs;
				217	unsigned int ios_left;
				218	};
				219
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	220	static struct kmem_cache *req_cachep;
				221
				222	static const struct file_operations io_uring_fops;
				223
				224	struct sock io_uring_get_socket(struct file file)
				225	{
				226	#if defined(CONFIG_UNIX)
				227	if (file->f_op == &io_uring_fops) {
				228	struct io_ring_ctx *ctx = file->private_data;
				229
				230	return ctx->ring_sock->sk;
				231	}
				232	#endif
				233	return NULL;
				234	}
				235	EXPORT_SYMBOL(io_uring_get_socket);
				236
				237	static void io_ring_ctx_ref_free(struct percpu_ref *ref)
				238	{
				239	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
				240
				241	complete(&ctx->ctx_done);
				242	}
				243
				244	static struct io_ring_ctx io_ring_ctx_alloc(struct io_uring_params p)
				245	{
				246	struct io_ring_ctx *ctx;
				247
				248	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
				249	if (!ctx)
				250	return NULL;
				251
				252	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
				253	kfree(ctx);
				254	return NULL;
				255	}
				256
				257	ctx->flags = p->flags;
				258	init_waitqueue_head(&ctx->cq_wait);
				259	init_completion(&ctx->ctx_done);
				260	mutex_init(&ctx->uring_lock);
				261	init_waitqueue_head(&ctx->wait);
				262	spin_lock_init(&ctx->completion_lock);
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	263	INIT_LIST_HEAD(&ctx->poll_list);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	264	return ctx;
				265	}
				266
				267	static void io_commit_cqring(struct io_ring_ctx *ctx)
				268	{
				269	struct io_cq_ring *ring = ctx->cq_ring;
				270
				271	if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
				272	/* order cqe stores with ring update */
				273	smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
				274
				275	/*
				276	* Write sider barrier of tail update, app has read side. See
				277	* comment at the top of this file.
				278	*/
				279	smp_wmb();
				280
				281	if (wq_has_sleeper(&ctx->cq_wait)) {
				282	wake_up_interruptible(&ctx->cq_wait);
				283	kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
				284	}
				285	}
				286	}
				287
				288	static struct io_uring_cqe io_get_cqring(struct io_ring_ctx ctx)
				289	{
				290	struct io_cq_ring *ring = ctx->cq_ring;
				291	unsigned tail;
				292
				293	tail = ctx->cached_cq_tail;
				294	/* See comment at the top of the file */
				295	smp_rmb();
				296	if (tail + 1 == READ_ONCE(ring->r.head))
				297	return NULL;
				298
				299	ctx->cached_cq_tail++;
				300	return &ring->cqes[tail & ctx->cq_mask];
				301	}
				302
				303	static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
				304	long res, unsigned ev_flags)
				305	{
				306	struct io_uring_cqe *cqe;
				307
				308	/*
				309	* If we can't get a cq entry, userspace overflowed the
				310	* submission (by quite a lot). Increment the overflow count in
				311	* the ring.
				312	*/
				313	cqe = io_get_cqring(ctx);
				314	if (cqe) {
				315	WRITE_ONCE(cqe->user_data, ki_user_data);
				316	WRITE_ONCE(cqe->res, res);
				317	WRITE_ONCE(cqe->flags, ev_flags);
				318	} else {
				319	unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
				320
				321	WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
				322	}
				323	}
				324
				325	static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
				326	long res, unsigned ev_flags)
				327	{
				328	unsigned long flags;
				329
				330	spin_lock_irqsave(&ctx->completion_lock, flags);
				331	io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
				332	io_commit_cqring(ctx);
				333	spin_unlock_irqrestore(&ctx->completion_lock, flags);
				334
				335	if (waitqueue_active(&ctx->wait))
				336	wake_up(&ctx->wait);
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	337	if (waitqueue_active(&ctx->sqo_wait))
				338	wake_up(&ctx->sqo_wait);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	339	}
				340
				341	static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
				342	{
				343	percpu_ref_put_many(&ctx->refs, refs);
				344
				345	if (waitqueue_active(&ctx->wait))
				346	wake_up(&ctx->wait);
				347	}
				348
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	349	static struct io_kiocb io_get_req(struct io_ring_ctx ctx,
				350	struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	351	{
				352	struct io_kiocb *req;
				353
				354	if (!percpu_ref_tryget(&ctx->refs))
				355	return NULL;
				356
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	357	if (!state) {
				358	req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
				359	if (unlikely(!req))
				360	goto out;
				361	} else if (!state->free_reqs) {
				362	size_t sz;
				363	int ret;
				364
				365	sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
				366	ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
				367	state->reqs);
				368	if (unlikely(ret <= 0))
				369	goto out;
				370	state->free_reqs = ret - 1;
				371	state->cur_req = 1;
				372	req = state->reqs[0];
				373	} else {
				374	req = state->reqs[state->cur_req];
				375	state->free_reqs--;
				376	state->cur_req++;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	377	}
				378
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	379	req->ctx = ctx;
				380	req->flags = 0;
Jens Axboe	c16361c	2019-01-17 08:39:48 -0700	[diff] [blame^]	381	refcount_set(&req->refs, 0);
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	382	return req;
				383	out:
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	384	io_ring_drop_ctx_refs(ctx, 1);
				385	return NULL;
				386	}
				387
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	388	static void io_free_req_many(struct io_ring_ctx ctx, void reqs, int nr)
				389	{
				390	if (*nr) {
				391	kmem_cache_free_bulk(req_cachep, *nr, reqs);
				392	io_ring_drop_ctx_refs(ctx, *nr);
				393	*nr = 0;
				394	}
				395	}
				396
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	397	static void io_free_req(struct io_kiocb *req)
				398	{
Jens Axboe	c16361c	2019-01-17 08:39:48 -0700	[diff] [blame^]	399	if (!refcount_read(&req->refs) \|\| refcount_dec_and_test(&req->refs)) {
				400	io_ring_drop_ctx_refs(req->ctx, 1);
				401	kmem_cache_free(req_cachep, req);
				402	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	403	}
				404
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	405	/*
				406	* Find and free completed poll iocbs
				407	*/
				408	static void io_iopoll_complete(struct io_ring_ctx ctx, unsigned int nr_events,
				409	struct list_head *done)
				410	{
				411	void *reqs[IO_IOPOLL_BATCH];
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	412	int file_count, to_free;
				413	struct file *file = NULL;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	414	struct io_kiocb *req;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	415
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	416	file_count = to_free = 0;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	417	while (!list_empty(done)) {
				418	req = list_first_entry(done, struct io_kiocb, list);
				419	list_del(&req->list);
				420
				421	io_cqring_fill_event(ctx, req->user_data, req->error, 0);
				422
				423	reqs[to_free++] = req;
				424	(*nr_events)++;
				425
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	426	/*
				427	* Batched puts of the same file, to avoid dirtying the
				428	* file usage count multiple times, if avoidable.
				429	*/
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	430	if (!(req->flags & REQ_F_FIXED_FILE)) {
				431	if (!file) {
				432	file = req->rw.ki_filp;
				433	file_count = 1;
				434	} else if (file == req->rw.ki_filp) {
				435	file_count++;
				436	} else {
				437	fput_many(file, file_count);
				438	file = req->rw.ki_filp;
				439	file_count = 1;
				440	}
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	441	}
				442
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	443	if (to_free == ARRAY_SIZE(reqs))
				444	io_free_req_many(ctx, reqs, &to_free);
				445	}
				446	io_commit_cqring(ctx);
				447
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	448	if (file)
				449	fput_many(file, file_count);
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	450	io_free_req_many(ctx, reqs, &to_free);
				451	}
				452
				453	static int io_do_iopoll(struct io_ring_ctx ctx, unsigned int nr_events,
				454	long min)
				455	{
				456	struct io_kiocb req, tmp;
				457	LIST_HEAD(done);
				458	bool spin;
				459	int ret;
				460
				461	/*
				462	* Only spin for completions if we don't have multiple devices hanging
				463	* off our complete list, and we're under the requested amount.
				464	*/
				465	spin = !ctx->poll_multi_file && *nr_events < min;
				466
				467	ret = 0;
				468	list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
				469	struct kiocb *kiocb = &req->rw;
				470
				471	/*
				472	* Move completed entries to our local list. If we find a
				473	* request that requires polling, break out and complete
				474	* the done list first, if we have entries there.
				475	*/
				476	if (req->flags & REQ_F_IOPOLL_COMPLETED) {
				477	list_move_tail(&req->list, &done);
				478	continue;
				479	}
				480	if (!list_empty(&done))
				481	break;
				482
				483	ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
				484	if (ret < 0)
				485	break;
				486
				487	if (ret && spin)
				488	spin = false;
				489	ret = 0;
				490	}
				491
				492	if (!list_empty(&done))
				493	io_iopoll_complete(ctx, nr_events, &done);
				494
				495	return ret;
				496	}
				497
				498	/*
				499	* Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
				500	* non-spinning poll check - we'll still enter the driver poll loop, but only
				501	* as a non-spinning completion check.
				502	*/
				503	static int io_iopoll_getevents(struct io_ring_ctx ctx, unsigned int nr_events,
				504	long min)
				505	{
				506	while (!list_empty(&ctx->poll_list)) {
				507	int ret;
				508
				509	ret = io_do_iopoll(ctx, nr_events, min);
				510	if (ret < 0)
				511	return ret;
				512	if (!min \|\| *nr_events >= min)
				513	return 0;
				514	}
				515
				516	return 1;
				517	}
				518
				519	/*
				520	* We can't just wait for polled events to come to us, we have to actively
				521	* find and complete them.
				522	*/
				523	static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
				524	{
				525	if (!(ctx->flags & IORING_SETUP_IOPOLL))
				526	return;
				527
				528	mutex_lock(&ctx->uring_lock);
				529	while (!list_empty(&ctx->poll_list)) {
				530	unsigned int nr_events = 0;
				531
				532	io_iopoll_getevents(ctx, &nr_events, 1);
				533	}
				534	mutex_unlock(&ctx->uring_lock);
				535	}
				536
				537	static int io_iopoll_check(struct io_ring_ctx ctx, unsigned nr_events,
				538	long min)
				539	{
				540	int ret = 0;
				541
				542	do {
				543	int tmin = 0;
				544
				545	if (*nr_events < min)
				546	tmin = min - *nr_events;
				547
				548	ret = io_iopoll_getevents(ctx, nr_events, tmin);
				549	if (ret <= 0)
				550	break;
				551	ret = 0;
				552	} while (min && !*nr_events && !need_resched());
				553
				554	return ret;
				555	}
				556
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	557	static void kiocb_end_write(struct kiocb *kiocb)
				558	{
				559	if (kiocb->ki_flags & IOCB_WRITE) {
				560	struct inode *inode = file_inode(kiocb->ki_filp);
				561
				562	/*
				563	* Tell lockdep we inherited freeze protection from submission
				564	* thread.
				565	*/
				566	if (S_ISREG(inode->i_mode))
				567	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
				568	file_end_write(kiocb->ki_filp);
				569	}
				570	}
				571
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	572	static void io_fput(struct io_kiocb *req)
				573	{
				574	if (!(req->flags & REQ_F_FIXED_FILE))
				575	fput(req->rw.ki_filp);
				576	}
				577
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	578	static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
				579	{
				580	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
				581
				582	kiocb_end_write(kiocb);
				583
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	584	io_fput(req);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	585	io_cqring_add_event(req->ctx, req->user_data, res, 0);
				586	io_free_req(req);
				587	}
				588
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	589	static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
				590	{
				591	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
				592
				593	kiocb_end_write(kiocb);
				594
				595	req->error = res;
				596	if (res != -EAGAIN)
				597	req->flags \|= REQ_F_IOPOLL_COMPLETED;
				598	}
				599
				600	/*
				601	* After the iocb has been issued, it's safe to be found on the poll list.
				602	* Adding the kiocb to the list AFTER submission ensures that we don't
				603	* find it from a io_iopoll_getevents() thread before the issuer is done
				604	* accessing the kiocb cookie.
				605	*/
				606	static void io_iopoll_req_issued(struct io_kiocb *req)
				607	{
				608	struct io_ring_ctx *ctx = req->ctx;
				609
				610	/*
				611	* Track whether we have multiple files in our lists. This will impact
				612	* how we do polling eventually, not spinning if we're on potentially
				613	* different devices.
				614	*/
				615	if (list_empty(&ctx->poll_list)) {
				616	ctx->poll_multi_file = false;
				617	} else if (!ctx->poll_multi_file) {
				618	struct io_kiocb *list_req;
				619
				620	list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
				621	list);
				622	if (list_req->rw.ki_filp != req->rw.ki_filp)
				623	ctx->poll_multi_file = true;
				624	}
				625
				626	/*
				627	* For fast devices, IO may have already completed. If it has, add
				628	* it to the front so we find it first.
				629	*/
				630	if (req->flags & REQ_F_IOPOLL_COMPLETED)
				631	list_add(&req->list, &ctx->poll_list);
				632	else
				633	list_add_tail(&req->list, &ctx->poll_list);
				634	}
				635
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	636	static void io_file_put(struct io_submit_state state, struct file file)
				637	{
				638	if (!state) {
				639	fput(file);
				640	} else if (state->file) {
				641	int diff = state->has_refs - state->used_refs;
				642
				643	if (diff)
				644	fput_many(state->file, diff);
				645	state->file = NULL;
				646	}
				647	}
				648
				649	/*
				650	* Get as many references to a file as we have IOs left in this submission,
				651	* assuming most submissions are for one file, or at least that each file
				652	* has more than one submission.
				653	*/
				654	static struct file io_file_get(struct io_submit_state state, int fd)
				655	{
				656	if (!state)
				657	return fget(fd);
				658
				659	if (state->file) {
				660	if (state->fd == fd) {
				661	state->used_refs++;
				662	state->ios_left--;
				663	return state->file;
				664	}
				665	io_file_put(state, NULL);
				666	}
				667	state->file = fget_many(fd, state->ios_left);
				668	if (!state->file)
				669	return NULL;
				670
				671	state->fd = fd;
				672	state->has_refs = state->ios_left;
				673	state->used_refs = 1;
				674	state->ios_left--;
				675	return state->file;
				676	}
				677
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	678	/*
				679	* If we tracked the file through the SCM inflight mechanism, we could support
				680	* any file. For now, just ensure that anything potentially problematic is done
				681	* inline.
				682	*/
				683	static bool io_file_supports_async(struct file *file)
				684	{
				685	umode_t mode = file_inode(file)->i_mode;
				686
				687	if (S_ISBLK(mode) \|\| S_ISCHR(mode))
				688	return true;
				689	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
				690	return true;
				691
				692	return false;
				693	}
				694
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	695	static int io_prep_rw(struct io_kiocb req, const struct sqe_submit s,
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	696	bool force_nonblock, struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	697	{
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	698	const struct io_uring_sqe *sqe = s->sqe;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	699	struct io_ring_ctx *ctx = req->ctx;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	700	struct kiocb *kiocb = &req->rw;
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	701	unsigned ioprio, flags;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	702	int fd, ret;
				703
				704	/* For -EAGAIN retry, everything is already prepped */
				705	if (kiocb->ki_filp)
				706	return 0;
				707
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	708	flags = READ_ONCE(sqe->flags);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	709	fd = READ_ONCE(sqe->fd);
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	710
				711	if (flags & IOSQE_FIXED_FILE) {
				712	if (unlikely(!ctx->user_files \|\|
				713	(unsigned) fd >= ctx->nr_user_files))
				714	return -EBADF;
				715	kiocb->ki_filp = ctx->user_files[fd];
				716	req->flags \|= REQ_F_FIXED_FILE;
				717	} else {
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	718	if (s->needs_fixed_file)
				719	return -EBADF;
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	720	kiocb->ki_filp = io_file_get(state, fd);
				721	if (unlikely(!kiocb->ki_filp))
				722	return -EBADF;
				723	if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
				724	force_nonblock = false;
				725	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	726	kiocb->ki_pos = READ_ONCE(sqe->off);
				727	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
				728	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
				729
				730	ioprio = READ_ONCE(sqe->ioprio);
				731	if (ioprio) {
				732	ret = ioprio_check_cap(ioprio);
				733	if (ret)
				734	goto out_fput;
				735
				736	kiocb->ki_ioprio = ioprio;
				737	} else
				738	kiocb->ki_ioprio = get_current_ioprio();
				739
				740	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
				741	if (unlikely(ret))
				742	goto out_fput;
				743	if (force_nonblock) {
				744	kiocb->ki_flags \|= IOCB_NOWAIT;
				745	req->flags \|= REQ_F_FORCE_NONBLOCK;
				746	}
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	747	if (ctx->flags & IORING_SETUP_IOPOLL) {
				748	ret = -EOPNOTSUPP;
				749	if (!(kiocb->ki_flags & IOCB_DIRECT) \|\|
				750	!kiocb->ki_filp->f_op->iopoll)
				751	goto out_fput;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	752
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	753	req->error = 0;
				754	kiocb->ki_flags \|= IOCB_HIPRI;
				755	kiocb->ki_complete = io_complete_rw_iopoll;
				756	} else {
				757	if (kiocb->ki_flags & IOCB_HIPRI) {
				758	ret = -EINVAL;
				759	goto out_fput;
				760	}
				761	kiocb->ki_complete = io_complete_rw;
				762	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	763	return 0;
				764	out_fput:
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	765	if (!(flags & IOSQE_FIXED_FILE)) {
				766	/*
				767	* in case of error, we didn't use this file reference. drop it.
				768	*/
				769	if (state)
				770	state->used_refs--;
				771	io_file_put(state, kiocb->ki_filp);
				772	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	773	return ret;
				774	}
				775
				776	static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
				777	{
				778	switch (ret) {
				779	case -EIOCBQUEUED:
				780	break;
				781	case -ERESTARTSYS:
				782	case -ERESTARTNOINTR:
				783	case -ERESTARTNOHAND:
				784	case -ERESTART_RESTARTBLOCK:
				785	/*
				786	* We can't just restart the syscall, since previously
				787	* submitted sqes may already be in progress. Just fail this
				788	* IO with EINTR.
				789	*/
				790	ret = -EINTR;
				791	/* fall through */
				792	default:
				793	kiocb->ki_complete(kiocb, ret, 0);
				794	}
				795	}
				796
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	797	static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
				798	const struct io_uring_sqe *sqe,
				799	struct iov_iter *iter)
				800	{
				801	size_t len = READ_ONCE(sqe->len);
				802	struct io_mapped_ubuf *imu;
				803	unsigned index, buf_index;
				804	size_t offset;
				805	u64 buf_addr;
				806
				807	/* attempt to use fixed buffers without having provided iovecs */
				808	if (unlikely(!ctx->user_bufs))
				809	return -EFAULT;
				810
				811	buf_index = READ_ONCE(sqe->buf_index);
				812	if (unlikely(buf_index >= ctx->nr_user_bufs))
				813	return -EFAULT;
				814
				815	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
				816	imu = &ctx->user_bufs[index];
				817	buf_addr = READ_ONCE(sqe->addr);
				818
				819	/* overflow */
				820	if (buf_addr + len < buf_addr)
				821	return -EFAULT;
				822	/* not inside the mapped region */
				823	if (buf_addr < imu->ubuf \|\| buf_addr + len > imu->ubuf + imu->len)
				824	return -EFAULT;
				825
				826	/*
				827	* May not be a start of buffer, set size appropriately
				828	* and advance us to the beginning.
				829	*/
				830	offset = buf_addr - imu->ubuf;
				831	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
				832	if (offset)
				833	iov_iter_advance(iter, offset);
				834	return 0;
				835	}
				836
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	837	static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
				838	const struct sqe_submit s, struct iovec *iovec,
				839	struct iov_iter *iter)
				840	{
				841	const struct io_uring_sqe *sqe = s->sqe;
				842	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
				843	size_t sqe_len = READ_ONCE(sqe->len);
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	844	u8 opcode;
				845
				846	/*
				847	* We're reading ->opcode for the second time, but the first read
				848	* doesn't care whether it's _FIXED or not, so it doesn't matter
				849	* whether ->opcode changes concurrently. The first read does care
				850	* about whether it is a READ or a WRITE, so we don't trust this read
				851	* for that purpose and instead let the caller pass in the read/write
				852	* flag.
				853	*/
				854	opcode = READ_ONCE(sqe->opcode);
				855	if (opcode == IORING_OP_READ_FIXED \|\|
				856	opcode == IORING_OP_WRITE_FIXED) {
				857	ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
				858	*iovec = NULL;
				859	return ret;
				860	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	861
				862	if (!s->has_user)
				863	return -EFAULT;
				864
				865	#ifdef CONFIG_COMPAT
				866	if (ctx->compat)
				867	return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
				868	iovec, iter);
				869	#endif
				870
				871	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
				872	}
				873
				874	static ssize_t io_read(struct io_kiocb req, const struct sqe_submit s,
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	875	bool force_nonblock, struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	876	{
				877	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
				878	struct kiocb *kiocb = &req->rw;
				879	struct iov_iter iter;
				880	struct file *file;
				881	ssize_t ret;
				882
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	883	ret = io_prep_rw(req, s, force_nonblock, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	884	if (ret)
				885	return ret;
				886	file = kiocb->ki_filp;
				887
				888	ret = -EBADF;
				889	if (unlikely(!(file->f_mode & FMODE_READ)))
				890	goto out_fput;
				891	ret = -EINVAL;
				892	if (unlikely(!file->f_op->read_iter))
				893	goto out_fput;
				894
				895	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
				896	if (ret)
				897	goto out_fput;
				898
				899	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
				900	if (!ret) {
				901	ssize_t ret2;
				902
				903	/* Catch -EAGAIN return for forced non-blocking submission */
				904	ret2 = call_read_iter(file, kiocb, &iter);
				905	if (!force_nonblock \|\| ret2 != -EAGAIN)
				906	io_rw_done(kiocb, ret2);
				907	else
				908	ret = -EAGAIN;
				909	}
				910	kfree(iovec);
				911	out_fput:
				912	/* Hold on to the file for -EAGAIN */
				913	if (unlikely(ret && ret != -EAGAIN))
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	914	io_fput(req);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	915	return ret;
				916	}
				917
				918	static ssize_t io_write(struct io_kiocb req, const struct sqe_submit s,
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	919	bool force_nonblock, struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	920	{
				921	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
				922	struct kiocb *kiocb = &req->rw;
				923	struct iov_iter iter;
				924	struct file *file;
				925	ssize_t ret;
				926
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	927	ret = io_prep_rw(req, s, force_nonblock, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	928	if (ret)
				929	return ret;
				930	/* Hold on to the file for -EAGAIN */
				931	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
				932	return -EAGAIN;
				933
				934	ret = -EBADF;
				935	file = kiocb->ki_filp;
				936	if (unlikely(!(file->f_mode & FMODE_WRITE)))
				937	goto out_fput;
				938	ret = -EINVAL;
				939	if (unlikely(!file->f_op->write_iter))
				940	goto out_fput;
				941
				942	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
				943	if (ret)
				944	goto out_fput;
				945
				946	ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
				947	iov_iter_count(&iter));
				948	if (!ret) {
				949	/*
				950	* Open-code file_start_write here to grab freeze protection,
				951	* which will be released by another thread in
				952	* io_complete_rw(). Fool lockdep by telling it the lock got
				953	* released so that it doesn't complain about the held lock when
				954	* we return to userspace.
				955	*/
				956	if (S_ISREG(file_inode(file)->i_mode)) {
				957	__sb_start_write(file_inode(file)->i_sb,
				958	SB_FREEZE_WRITE, true);
				959	__sb_writers_release(file_inode(file)->i_sb,
				960	SB_FREEZE_WRITE);
				961	}
				962	kiocb->ki_flags \|= IOCB_WRITE;
				963	io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
				964	}
				965	kfree(iovec);
				966	out_fput:
				967	if (unlikely(ret))
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	968	io_fput(req);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	969	return ret;
				970	}
				971
				972	/*
				973	* IORING_OP_NOP just posts a completion event, nothing else.
				974	*/
				975	static int io_nop(struct io_kiocb *req, u64 user_data)
				976	{
				977	struct io_ring_ctx *ctx = req->ctx;
				978	long err = 0;
				979
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	980	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
				981	return -EINVAL;
				982
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	983	/*
				984	* Twilight zone - it's possible that someone issued an opcode that
				985	* has a file attached, then got -EAGAIN on submission, and changed
				986	* the sqe before we retried it from async context. Avoid dropping
				987	* a file reference for this malicious case, and flag the error.
				988	*/
				989	if (req->rw.ki_filp) {
				990	err = -EBADF;
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	991	io_fput(req);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	992	}
				993	io_cqring_add_event(ctx, user_data, err, 0);
				994	io_free_req(req);
				995	return 0;
				996	}
				997
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	998	static int io_prep_fsync(struct io_kiocb req, const struct io_uring_sqe sqe)
				999	{
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	1000	struct io_ring_ctx *ctx = req->ctx;
				1001	unsigned flags;
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	1002	int fd;
				1003
				1004	/* Prep already done */
				1005	if (req->rw.ki_filp)
				1006	return 0;
				1007
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	1008	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1009	return -EINVAL;
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	1010	if (unlikely(sqe->addr \|\| sqe->ioprio \|\| sqe->buf_index))
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	1011	return -EINVAL;
				1012
				1013	fd = READ_ONCE(sqe->fd);
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	1014	flags = READ_ONCE(sqe->flags);
				1015
				1016	if (flags & IOSQE_FIXED_FILE) {
				1017	if (unlikely(!ctx->user_files \|\| fd >= ctx->nr_user_files))
				1018	return -EBADF;
				1019	req->rw.ki_filp = ctx->user_files[fd];
				1020	req->flags \|= REQ_F_FIXED_FILE;
				1021	} else {
				1022	req->rw.ki_filp = fget(fd);
				1023	if (unlikely(!req->rw.ki_filp))
				1024	return -EBADF;
				1025	}
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	1026
				1027	return 0;
				1028	}
				1029
				1030	static int io_fsync(struct io_kiocb req, const struct io_uring_sqe sqe,
				1031	bool force_nonblock)
				1032	{
				1033	loff_t sqe_off = READ_ONCE(sqe->off);
				1034	loff_t sqe_len = READ_ONCE(sqe->len);
				1035	loff_t end = sqe_off + sqe_len;
				1036	unsigned fsync_flags;
				1037	int ret;
				1038
				1039	fsync_flags = READ_ONCE(sqe->fsync_flags);
				1040	if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
				1041	return -EINVAL;
				1042
				1043	ret = io_prep_fsync(req, sqe);
				1044	if (ret)
				1045	return ret;
				1046
				1047	/* fsync always requires a blocking context */
				1048	if (force_nonblock)
				1049	return -EAGAIN;
				1050
				1051	ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
				1052	end > 0 ? end : LLONG_MAX,
				1053	fsync_flags & IORING_FSYNC_DATASYNC);
				1054
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	1055	io_fput(req);
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	1056	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
				1057	io_free_req(req);
				1058	return 0;
				1059	}
				1060
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1061	static int __io_submit_sqe(struct io_ring_ctx ctx, struct io_kiocb req,
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1062	const struct sqe_submit *s, bool force_nonblock,
				1063	struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1064	{
				1065	ssize_t ret;
				1066	int opcode;
				1067
				1068	if (unlikely(s->index >= ctx->sq_entries))
				1069	return -EINVAL;
				1070	req->user_data = READ_ONCE(s->sqe->user_data);
				1071
				1072	opcode = READ_ONCE(s->sqe->opcode);
				1073	switch (opcode) {
				1074	case IORING_OP_NOP:
				1075	ret = io_nop(req, req->user_data);
				1076	break;
				1077	case IORING_OP_READV:
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	1078	if (unlikely(s->sqe->buf_index))
				1079	return -EINVAL;
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1080	ret = io_read(req, s, force_nonblock, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1081	break;
				1082	case IORING_OP_WRITEV:
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	1083	if (unlikely(s->sqe->buf_index))
				1084	return -EINVAL;
				1085	ret = io_write(req, s, force_nonblock, state);
				1086	break;
				1087	case IORING_OP_READ_FIXED:
				1088	ret = io_read(req, s, force_nonblock, state);
				1089	break;
				1090	case IORING_OP_WRITE_FIXED:
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1091	ret = io_write(req, s, force_nonblock, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1092	break;
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	1093	case IORING_OP_FSYNC:
				1094	ret = io_fsync(req, s->sqe, force_nonblock);
				1095	break;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1096	default:
				1097	ret = -EINVAL;
				1098	break;
				1099	}
				1100
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1101	if (ret)
				1102	return ret;
				1103
				1104	if (ctx->flags & IORING_SETUP_IOPOLL) {
				1105	if (req->error == -EAGAIN)
				1106	return -EAGAIN;
				1107
				1108	/* workqueue context doesn't hold uring_lock, grab it now */
				1109	if (s->needs_lock)
				1110	mutex_lock(&ctx->uring_lock);
				1111	io_iopoll_req_issued(req);
				1112	if (s->needs_lock)
				1113	mutex_unlock(&ctx->uring_lock);
				1114	}
				1115
				1116	return 0;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1117	}
				1118
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	1119	static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
				1120	{
				1121	u8 opcode = READ_ONCE(sqe->opcode);
				1122
				1123	return !(opcode == IORING_OP_READ_FIXED \|\|
				1124	opcode == IORING_OP_WRITE_FIXED);
				1125	}
				1126
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1127	static void io_sq_wq_submit_work(struct work_struct *work)
				1128	{
				1129	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
				1130	struct sqe_submit *s = &req->submit;
				1131	const struct io_uring_sqe *sqe = s->sqe;
				1132	struct io_ring_ctx *ctx = req->ctx;
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	1133	mm_segment_t old_fs;
				1134	bool needs_user;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1135	int ret;
				1136
				1137	/* Ensure we clear previously set forced non-block flag */
				1138	req->flags &= ~REQ_F_FORCE_NONBLOCK;
				1139	req->rw.ki_flags &= ~IOCB_NOWAIT;
				1140
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1141	s->needs_lock = true;
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	1142	s->has_user = false;
				1143
				1144	/*
				1145	* If we're doing IO to fixed buffers, we don't need to get/set
				1146	* user context
				1147	*/
				1148	needs_user = io_sqe_needs_user(s->sqe);
				1149	if (needs_user) {
				1150	if (!mmget_not_zero(ctx->sqo_mm)) {
				1151	ret = -EFAULT;
				1152	goto err;
				1153	}
				1154	use_mm(ctx->sqo_mm);
				1155	old_fs = get_fs();
				1156	set_fs(USER_DS);
				1157	s->has_user = true;
				1158	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1159
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1160	do {
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1161	ret = __io_submit_sqe(ctx, req, s, false, NULL);
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1162	/*
				1163	* We can get EAGAIN for polled IO even though we're forcing
				1164	* a sync submission from here, since we can't wait for
				1165	* request slots on the block side.
				1166	*/
				1167	if (ret != -EAGAIN)
				1168	break;
				1169	cond_resched();
				1170	} while (1);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1171
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	1172	if (needs_user) {
				1173	set_fs(old_fs);
				1174	unuse_mm(ctx->sqo_mm);
				1175	mmput(ctx->sqo_mm);
				1176	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1177	err:
				1178	if (ret) {
				1179	io_cqring_add_event(ctx, sqe->user_data, ret, 0);
				1180	io_free_req(req);
				1181	}
				1182
				1183	/* async context always use a copy of the sqe */
				1184	kfree(sqe);
				1185	}
				1186
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1187	static int io_submit_sqe(struct io_ring_ctx ctx, struct sqe_submit s,
				1188	struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1189	{
				1190	struct io_kiocb *req;
				1191	ssize_t ret;
				1192
				1193	/* enforce forwards compatibility on users */
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	1194	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1195	return -EINVAL;
				1196
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	1197	req = io_get_req(ctx, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1198	if (unlikely(!req))
				1199	return -EAGAIN;
				1200
				1201	req->rw.ki_filp = NULL;
				1202
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1203	ret = __io_submit_sqe(ctx, req, s, true, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1204	if (ret == -EAGAIN) {
				1205	struct io_uring_sqe *sqe_copy;
				1206
				1207	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
				1208	if (sqe_copy) {
				1209	memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
				1210	s->sqe = sqe_copy;
				1211
				1212	memcpy(&req->submit, s, sizeof(*s));
				1213	INIT_WORK(&req->work, io_sq_wq_submit_work);
				1214	queue_work(ctx->sqo_wq, &req->work);
				1215	ret = 0;
				1216	}
				1217	}
				1218	if (ret)
				1219	io_free_req(req);
				1220
				1221	return ret;
				1222	}
				1223
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1224	/*
				1225	* Batched submission is done, ensure local IO is flushed out.
				1226	*/
				1227	static void io_submit_state_end(struct io_submit_state *state)
				1228	{
				1229	blk_finish_plug(&state->plug);
				1230	io_file_put(state, NULL);
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	1231	if (state->free_reqs)
				1232	kmem_cache_free_bulk(req_cachep, state->free_reqs,
				1233	&state->reqs[state->cur_req]);
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1234	}
				1235
				1236	/*
				1237	* Start submission side cache.
				1238	*/
				1239	static void io_submit_state_start(struct io_submit_state *state,
				1240	struct io_ring_ctx *ctx, unsigned max_ios)
				1241	{
				1242	blk_start_plug(&state->plug);
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	1243	state->free_reqs = 0;
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1244	state->file = NULL;
				1245	state->ios_left = max_ios;
				1246	}
				1247
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1248	static void io_commit_sqring(struct io_ring_ctx *ctx)
				1249	{
				1250	struct io_sq_ring *ring = ctx->sq_ring;
				1251
				1252	if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
				1253	/*
				1254	* Ensure any loads from the SQEs are done at this point,
				1255	* since once we write the new head, the application could
				1256	* write new data to them.
				1257	*/
				1258	smp_store_release(&ring->r.head, ctx->cached_sq_head);
				1259
				1260	/*
				1261	* write side barrier of head update, app has read side. See
				1262	* comment at the top of this file
				1263	*/
				1264	smp_wmb();
				1265	}
				1266	}
				1267
				1268	/*
				1269	* Undo last io_get_sqring()
				1270	*/
				1271	static void io_drop_sqring(struct io_ring_ctx *ctx)
				1272	{
				1273	ctx->cached_sq_head--;
				1274	}
				1275
				1276	/*
				1277	* Fetch an sqe, if one is available. Note that s->sqe will point to memory
				1278	* that is mapped by userspace. This means that care needs to be taken to
				1279	* ensure that reads are stable, as we cannot rely on userspace always
				1280	* being a good citizen. If members of the sqe are validated and then later
				1281	* used, it's important that those reads are done through READ_ONCE() to
				1282	* prevent a re-load down the line.
				1283	*/
				1284	static bool io_get_sqring(struct io_ring_ctx ctx, struct sqe_submit s)
				1285	{
				1286	struct io_sq_ring *ring = ctx->sq_ring;
				1287	unsigned head;
				1288
				1289	/*
				1290	* The cached sq head (or cq tail) serves two purposes:
				1291	*
				1292	* 1) allows us to batch the cost of updating the user visible
				1293	* head updates.
				1294	* 2) allows the kernel side to track the head on its own, even
				1295	* though the application is the one updating it.
				1296	*/
				1297	head = ctx->cached_sq_head;
				1298	/* See comment at the top of this file */
				1299	smp_rmb();
				1300	if (head == READ_ONCE(ring->r.tail))
				1301	return false;
				1302
				1303	head = READ_ONCE(ring->array[head & ctx->sq_mask]);
				1304	if (head < ctx->sq_entries) {
				1305	s->index = head;
				1306	s->sqe = &ctx->sq_sqes[head];
				1307	ctx->cached_sq_head++;
				1308	return true;
				1309	}
				1310
				1311	/* drop invalid entries */
				1312	ctx->cached_sq_head++;
				1313	ring->dropped++;
				1314	/* See comment at the top of this file */
				1315	smp_wmb();
				1316	return false;
				1317	}
				1318
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	1319	static int io_submit_sqes(struct io_ring_ctx ctx, struct sqe_submit sqes,
				1320	unsigned int nr, bool has_user, bool mm_fault)
				1321	{
				1322	struct io_submit_state state, *statep = NULL;
				1323	int ret, i, submitted = 0;
				1324
				1325	if (nr > IO_PLUG_THRESHOLD) {
				1326	io_submit_state_start(&state, ctx, nr);
				1327	statep = &state;
				1328	}
				1329
				1330	for (i = 0; i < nr; i++) {
				1331	if (unlikely(mm_fault)) {
				1332	ret = -EFAULT;
				1333	} else {
				1334	sqes[i].has_user = has_user;
				1335	sqes[i].needs_lock = true;
				1336	sqes[i].needs_fixed_file = true;
				1337	ret = io_submit_sqe(ctx, &sqes[i], statep);
				1338	}
				1339	if (!ret) {
				1340	submitted++;
				1341	continue;
				1342	}
				1343
				1344	io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
				1345	}
				1346
				1347	if (statep)
				1348	io_submit_state_end(&state);
				1349
				1350	return submitted;
				1351	}
				1352
				1353	static int io_sq_thread(void *data)
				1354	{
				1355	struct sqe_submit sqes[IO_IOPOLL_BATCH];
				1356	struct io_ring_ctx *ctx = data;
				1357	struct mm_struct *cur_mm = NULL;
				1358	mm_segment_t old_fs;
				1359	DEFINE_WAIT(wait);
				1360	unsigned inflight;
				1361	unsigned long timeout;
				1362
				1363	old_fs = get_fs();
				1364	set_fs(USER_DS);
				1365
				1366	timeout = inflight = 0;
				1367	while (!kthread_should_stop() && !ctx->sqo_stop) {
				1368	bool all_fixed, mm_fault = false;
				1369	int i;
				1370
				1371	if (inflight) {
				1372	unsigned nr_events = 0;
				1373
				1374	if (ctx->flags & IORING_SETUP_IOPOLL) {
				1375	/*
				1376	* We disallow the app entering submit/complete
				1377	* with polling, but we still need to lock the
				1378	* ring to prevent racing with polled issue
				1379	* that got punted to a workqueue.
				1380	*/
				1381	mutex_lock(&ctx->uring_lock);
				1382	io_iopoll_check(ctx, &nr_events, 0);
				1383	mutex_unlock(&ctx->uring_lock);
				1384	} else {
				1385	/*
				1386	* Normal IO, just pretend everything completed.
				1387	* We don't have to poll completions for that.
				1388	*/
				1389	nr_events = inflight;
				1390	}
				1391
				1392	inflight -= nr_events;
				1393	if (!inflight)
				1394	timeout = jiffies + ctx->sq_thread_idle;
				1395	}
				1396
				1397	if (!io_get_sqring(ctx, &sqes[0])) {
				1398	/*
				1399	* We're polling. If we're within the defined idle
				1400	* period, then let us spin without work before going
				1401	* to sleep.
				1402	*/
				1403	if (inflight \|\| !time_after(jiffies, timeout)) {
				1404	cpu_relax();
				1405	continue;
				1406	}
				1407
				1408	/*
				1409	* Drop cur_mm before scheduling, we can't hold it for
				1410	* long periods (or over schedule()). Do this before
				1411	* adding ourselves to the waitqueue, as the unuse/drop
				1412	* may sleep.
				1413	*/
				1414	if (cur_mm) {
				1415	unuse_mm(cur_mm);
				1416	mmput(cur_mm);
				1417	cur_mm = NULL;
				1418	}
				1419
				1420	prepare_to_wait(&ctx->sqo_wait, &wait,
				1421	TASK_INTERRUPTIBLE);
				1422
				1423	/* Tell userspace we may need a wakeup call */
				1424	ctx->sq_ring->flags \|= IORING_SQ_NEED_WAKEUP;
				1425	smp_wmb();
				1426
				1427	if (!io_get_sqring(ctx, &sqes[0])) {
				1428	if (kthread_should_stop()) {
				1429	finish_wait(&ctx->sqo_wait, &wait);
				1430	break;
				1431	}
				1432	if (signal_pending(current))
				1433	flush_signals(current);
				1434	schedule();
				1435	finish_wait(&ctx->sqo_wait, &wait);
				1436
				1437	ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
				1438	smp_wmb();
				1439	continue;
				1440	}
				1441	finish_wait(&ctx->sqo_wait, &wait);
				1442
				1443	ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
				1444	smp_wmb();
				1445	}
				1446
				1447	i = 0;
				1448	all_fixed = true;
				1449	do {
				1450	if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
				1451	all_fixed = false;
				1452
				1453	i++;
				1454	if (i == ARRAY_SIZE(sqes))
				1455	break;
				1456	} while (io_get_sqring(ctx, &sqes[i]));
				1457
				1458	/* Unless all new commands are FIXED regions, grab mm */
				1459	if (!all_fixed && !cur_mm) {
				1460	mm_fault = !mmget_not_zero(ctx->sqo_mm);
				1461	if (!mm_fault) {
				1462	use_mm(ctx->sqo_mm);
				1463	cur_mm = ctx->sqo_mm;
				1464	}
				1465	}
				1466
				1467	inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
				1468	mm_fault);
				1469
				1470	/* Commit SQ ring head once we've consumed all SQEs */
				1471	io_commit_sqring(ctx);
				1472	}
				1473
				1474	set_fs(old_fs);
				1475	if (cur_mm) {
				1476	unuse_mm(cur_mm);
				1477	mmput(cur_mm);
				1478	}
				1479	return 0;
				1480	}
				1481
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1482	static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
				1483	{
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1484	struct io_submit_state state, *statep = NULL;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1485	int i, ret = 0, submit = 0;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1486
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1487	if (to_submit > IO_PLUG_THRESHOLD) {
				1488	io_submit_state_start(&state, ctx, to_submit);
				1489	statep = &state;
				1490	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1491
				1492	for (i = 0; i < to_submit; i++) {
				1493	struct sqe_submit s;
				1494
				1495	if (!io_get_sqring(ctx, &s))
				1496	break;
				1497
				1498	s.has_user = true;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1499	s.needs_lock = false;
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	1500	s.needs_fixed_file = false;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1501
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1502	ret = io_submit_sqe(ctx, &s, statep);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1503	if (ret) {
				1504	io_drop_sqring(ctx);
				1505	break;
				1506	}
				1507
				1508	submit++;
				1509	}
				1510	io_commit_sqring(ctx);
				1511
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1512	if (statep)
				1513	io_submit_state_end(statep);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1514
				1515	return submit ? submit : ret;
				1516	}
				1517
				1518	static unsigned io_cqring_events(struct io_cq_ring *ring)
				1519	{
				1520	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
				1521	}
				1522
				1523	/*
				1524	* Wait until events become available, if we don't already have some. The
				1525	* application must reap them itself, as they reside on the shared cq ring.
				1526	*/
				1527	static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
				1528	const sigset_t __user *sig, size_t sigsz)
				1529	{
				1530	struct io_cq_ring *ring = ctx->cq_ring;
				1531	sigset_t ksigmask, sigsaved;
				1532	DEFINE_WAIT(wait);
				1533	int ret;
				1534
				1535	/* See comment at the top of this file */
				1536	smp_rmb();
				1537	if (io_cqring_events(ring) >= min_events)
				1538	return 0;
				1539
				1540	if (sig) {
				1541	ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
				1542	if (ret)
				1543	return ret;
				1544	}
				1545
				1546	do {
				1547	prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
				1548
				1549	ret = 0;
				1550	/* See comment at the top of this file */
				1551	smp_rmb();
				1552	if (io_cqring_events(ring) >= min_events)
				1553	break;
				1554
				1555	schedule();
				1556
				1557	ret = -EINTR;
				1558	if (signal_pending(current))
				1559	break;
				1560	} while (1);
				1561
				1562	finish_wait(&ctx->wait, &wait);
				1563
				1564	if (sig)
				1565	restore_user_sigmask(sig, &sigsaved);
				1566
				1567	return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
				1568	}
				1569
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	1570	static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
				1571	{
				1572	#if defined(CONFIG_UNIX)
				1573	if (ctx->ring_sock) {
				1574	struct sock *sock = ctx->ring_sock->sk;
				1575	struct sk_buff *skb;
				1576
				1577	while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
				1578	kfree_skb(skb);
				1579	}
				1580	#else
				1581	int i;
				1582
				1583	for (i = 0; i < ctx->nr_user_files; i++)
				1584	fput(ctx->user_files[i]);
				1585	#endif
				1586	}
				1587
				1588	static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
				1589	{
				1590	if (!ctx->user_files)
				1591	return -ENXIO;
				1592
				1593	__io_sqe_files_unregister(ctx);
				1594	kfree(ctx->user_files);
				1595	ctx->user_files = NULL;
				1596	ctx->nr_user_files = 0;
				1597	return 0;
				1598	}
				1599
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	1600	static void io_sq_thread_stop(struct io_ring_ctx *ctx)
				1601	{
				1602	if (ctx->sqo_thread) {
				1603	ctx->sqo_stop = 1;
				1604	mb();
				1605	kthread_stop(ctx->sqo_thread);
				1606	ctx->sqo_thread = NULL;
				1607	}
				1608	}
				1609
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	1610	static void io_finish_async(struct io_ring_ctx *ctx)
				1611	{
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	1612	io_sq_thread_stop(ctx);
				1613
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	1614	if (ctx->sqo_wq) {
				1615	destroy_workqueue(ctx->sqo_wq);
				1616	ctx->sqo_wq = NULL;
				1617	}
				1618	}
				1619
				1620	#if defined(CONFIG_UNIX)
				1621	static void io_destruct_skb(struct sk_buff *skb)
				1622	{
				1623	struct io_ring_ctx *ctx = skb->sk->sk_user_data;
				1624
				1625	io_finish_async(ctx);
				1626	unix_destruct_scm(skb);
				1627	}
				1628
				1629	/*
				1630	* Ensure the UNIX gc is aware of our file set, so we are certain that
				1631	* the io_uring can be safely unregistered on process exit, even if we have
				1632	* loops in the file referencing.
				1633	*/
				1634	static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
				1635	{
				1636	struct sock *sk = ctx->ring_sock->sk;
				1637	struct scm_fp_list *fpl;
				1638	struct sk_buff *skb;
				1639	int i;
				1640
				1641	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
				1642	unsigned long inflight = ctx->user->unix_inflight + nr;
				1643
				1644	if (inflight > task_rlimit(current, RLIMIT_NOFILE))
				1645	return -EMFILE;
				1646	}
				1647
				1648	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
				1649	if (!fpl)
				1650	return -ENOMEM;
				1651
				1652	skb = alloc_skb(0, GFP_KERNEL);
				1653	if (!skb) {
				1654	kfree(fpl);
				1655	return -ENOMEM;
				1656	}
				1657
				1658	skb->sk = sk;
				1659	skb->destructor = io_destruct_skb;
				1660
				1661	fpl->user = get_uid(ctx->user);
				1662	for (i = 0; i < nr; i++) {
				1663	fpl->fp[i] = get_file(ctx->user_files[i + offset]);
				1664	unix_inflight(fpl->user, fpl->fp[i]);
				1665	}
				1666
				1667	fpl->max = fpl->count = nr;
				1668	UNIXCB(skb).fp = fpl;
				1669	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
				1670	skb_queue_head(&sk->sk_receive_queue, skb);
				1671
				1672	for (i = 0; i < nr; i++)
				1673	fput(fpl->fp[i]);
				1674
				1675	return 0;
				1676	}
				1677
				1678	/*
				1679	* If UNIX sockets are enabled, fd passing can cause a reference cycle which
				1680	* causes regular reference counting to break down. We rely on the UNIX
				1681	* garbage collection to take care of this problem for us.
				1682	*/
				1683	static int io_sqe_files_scm(struct io_ring_ctx *ctx)
				1684	{
				1685	unsigned left, total;
				1686	int ret = 0;
				1687
				1688	total = 0;
				1689	left = ctx->nr_user_files;
				1690	while (left) {
				1691	unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
				1692	int ret;
				1693
				1694	ret = __io_sqe_files_scm(ctx, this_files, total);
				1695	if (ret)
				1696	break;
				1697	left -= this_files;
				1698	total += this_files;
				1699	}
				1700
				1701	if (!ret)
				1702	return 0;
				1703
				1704	while (total < ctx->nr_user_files) {
				1705	fput(ctx->user_files[total]);
				1706	total++;
				1707	}
				1708
				1709	return ret;
				1710	}
				1711	#else
				1712	static int io_sqe_files_scm(struct io_ring_ctx *ctx)
				1713	{
				1714	return 0;
				1715	}
				1716	#endif
				1717
				1718	static int io_sqe_files_register(struct io_ring_ctx ctx, void __user arg,
				1719	unsigned nr_args)
				1720	{
				1721	__s32 __user fds = (__s32 __user ) arg;
				1722	int fd, ret = 0;
				1723	unsigned i;
				1724
				1725	if (ctx->user_files)
				1726	return -EBUSY;
				1727	if (!nr_args)
				1728	return -EINVAL;
				1729	if (nr_args > IORING_MAX_FIXED_FILES)
				1730	return -EMFILE;
				1731
				1732	ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
				1733	if (!ctx->user_files)
				1734	return -ENOMEM;
				1735
				1736	for (i = 0; i < nr_args; i++) {
				1737	ret = -EFAULT;
				1738	if (copy_from_user(&fd, &fds[i], sizeof(fd)))
				1739	break;
				1740
				1741	ctx->user_files[i] = fget(fd);
				1742
				1743	ret = -EBADF;
				1744	if (!ctx->user_files[i])
				1745	break;
				1746	/*
				1747	* Don't allow io_uring instances to be registered. If UNIX
				1748	* isn't enabled, then this causes a reference cycle and this
				1749	* instance can never get freed. If UNIX is enabled we'll
				1750	* handle it just fine, but there's still no point in allowing
				1751	* a ring fd as it doesn't support regular read/write anyway.
				1752	*/
				1753	if (ctx->user_files[i]->f_op == &io_uring_fops) {
				1754	fput(ctx->user_files[i]);
				1755	break;
				1756	}
				1757	ctx->nr_user_files++;
				1758	ret = 0;
				1759	}
				1760
				1761	if (ret) {
				1762	for (i = 0; i < ctx->nr_user_files; i++)
				1763	fput(ctx->user_files[i]);
				1764
				1765	kfree(ctx->user_files);
				1766	ctx->nr_user_files = 0;
				1767	return ret;
				1768	}
				1769
				1770	ret = io_sqe_files_scm(ctx);
				1771	if (ret)
				1772	io_sqe_files_unregister(ctx);
				1773
				1774	return ret;
				1775	}
				1776
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	1777	static int io_sq_offload_start(struct io_ring_ctx *ctx,
				1778	struct io_uring_params *p)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1779	{
				1780	int ret;
				1781
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	1782	init_waitqueue_head(&ctx->sqo_wait);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1783	mmgrab(current->mm);
				1784	ctx->sqo_mm = current->mm;
				1785
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	1786	ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
				1787	if (!ctx->sq_thread_idle)
				1788	ctx->sq_thread_idle = HZ;
				1789
				1790	ret = -EINVAL;
				1791	if (!cpu_possible(p->sq_thread_cpu))
				1792	goto err;
				1793
				1794	if (ctx->flags & IORING_SETUP_SQPOLL) {
				1795	if (p->flags & IORING_SETUP_SQ_AFF) {
				1796	int cpu;
				1797
				1798	cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
				1799	ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
				1800	ctx, cpu,
				1801	"io_uring-sq");
				1802	} else {
				1803	ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
				1804	"io_uring-sq");
				1805	}
				1806	if (IS_ERR(ctx->sqo_thread)) {
				1807	ret = PTR_ERR(ctx->sqo_thread);
				1808	ctx->sqo_thread = NULL;
				1809	goto err;
				1810	}
				1811	wake_up_process(ctx->sqo_thread);
				1812	} else if (p->flags & IORING_SETUP_SQ_AFF) {
				1813	/* Can't have SQ_AFF without SQPOLL */
				1814	ret = -EINVAL;
				1815	goto err;
				1816	}
				1817
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1818	/* Do QD, or 2 * CPUS, whatever is smallest */
				1819	ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND \| WQ_FREEZABLE,
				1820	min(ctx->sq_entries - 1, 2 * num_online_cpus()));
				1821	if (!ctx->sqo_wq) {
				1822	ret = -ENOMEM;
				1823	goto err;
				1824	}
				1825
				1826	return 0;
				1827	err:
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	1828	io_sq_thread_stop(ctx);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1829	mmdrop(ctx->sqo_mm);
				1830	ctx->sqo_mm = NULL;
				1831	return ret;
				1832	}
				1833
/* Subtract @nr_pages from @user's locked_vm counter. */
static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
{
	atomic_long_sub(nr_pages, &user->locked_vm);
}
				1838
				1839	static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
				1840	{
				1841	unsigned long page_limit, cur_pages, new_pages;
				1842
				1843	/* Don't allow more pages than we can safely lock */
				1844	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
				1845
				1846	do {
				1847	cur_pages = atomic_long_read(&user->locked_vm);
				1848	new_pages = cur_pages + nr_pages;
				1849	if (new_pages > page_limit)
				1850	return -ENOMEM;
				1851	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
				1852	new_pages) != cur_pages);
				1853
				1854	return 0;
				1855	}
				1856
				1857	static void io_mem_free(void *ptr)
				1858	{
				1859	struct page *page = virt_to_head_page(ptr);
				1860
				1861	if (put_page_testzero(page))
				1862	free_compound_page(page);
				1863	}
				1864
				1865	static void *io_mem_alloc(size_t size)
				1866	{
				1867	gfp_t gfp_flags = GFP_KERNEL \| __GFP_ZERO \| __GFP_NOWARN \| __GFP_COMP \|
				1868	__GFP_NORETRY;
				1869
				1870	return (void *) __get_free_pages(gfp_flags, get_order(size));
				1871	}
				1872
				1873	static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
				1874	{
				1875	struct io_sq_ring *sq_ring;
				1876	struct io_cq_ring *cq_ring;
				1877	size_t bytes;
				1878
				1879	bytes = struct_size(sq_ring, array, sq_entries);
				1880	bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
				1881	bytes += struct_size(cq_ring, cqes, cq_entries);
				1882
				1883	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
				1884	}
				1885
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	1886	static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
				1887	{
				1888	int i, j;
				1889
				1890	if (!ctx->user_bufs)
				1891	return -ENXIO;
				1892
				1893	for (i = 0; i < ctx->nr_user_bufs; i++) {
				1894	struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
				1895
				1896	for (j = 0; j < imu->nr_bvecs; j++)
				1897	put_page(imu->bvec[j].bv_page);
				1898
				1899	if (ctx->account_mem)
				1900	io_unaccount_mem(ctx->user, imu->nr_bvecs);
				1901	kfree(imu->bvec);
				1902	imu->nr_bvecs = 0;
				1903	}
				1904
				1905	kfree(ctx->user_bufs);
				1906	ctx->user_bufs = NULL;
				1907	ctx->nr_user_bufs = 0;
				1908	return 0;
				1909	}
				1910
				1911	static int io_copy_iov(struct io_ring_ctx ctx, struct iovec dst,
				1912	void __user *arg, unsigned index)
				1913	{
				1914	struct iovec __user *src;
				1915
				1916	#ifdef CONFIG_COMPAT
				1917	if (ctx->compat) {
				1918	struct compat_iovec __user *ciovs;
				1919	struct compat_iovec ciov;
				1920
				1921	ciovs = (struct compat_iovec __user *) arg;
				1922	if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
				1923	return -EFAULT;
				1924
				1925	dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
				1926	dst->iov_len = ciov.iov_len;
				1927	return 0;
				1928	}
				1929	#endif
				1930	src = (struct iovec __user *) arg;
				1931	if (copy_from_user(dst, &src[index], sizeof(*dst)))
				1932	return -EFAULT;
				1933	return 0;
				1934	}
				1935
				1936	static int io_sqe_buffer_register(struct io_ring_ctx ctx, void __user arg,
				1937	unsigned nr_args)
				1938	{
				1939	struct vm_area_struct **vmas = NULL;
				1940	struct page **pages = NULL;
				1941	int i, j, got_pages = 0;
				1942	int ret = -EINVAL;
				1943
				1944	if (ctx->user_bufs)
				1945	return -EBUSY;
				1946	if (!nr_args \|\| nr_args > UIO_MAXIOV)
				1947	return -EINVAL;
				1948
				1949	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
				1950	GFP_KERNEL);
				1951	if (!ctx->user_bufs)
				1952	return -ENOMEM;
				1953
				1954	for (i = 0; i < nr_args; i++) {
				1955	struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
				1956	unsigned long off, start, end, ubuf;
				1957	int pret, nr_pages;
				1958	struct iovec iov;
				1959	size_t size;
				1960
				1961	ret = io_copy_iov(ctx, &iov, arg, i);
				1962	if (ret)
				1963	break;
				1964
				1965	/*
				1966	* Don't impose further limits on the size and buffer
				1967	* constraints here, we'll -EINVAL later when IO is
				1968	* submitted if they are wrong.
				1969	*/
				1970	ret = -EFAULT;
				1971	if (!iov.iov_base \|\| !iov.iov_len)
				1972	goto err;
				1973
				1974	/* arbitrary limit, but we need something */
				1975	if (iov.iov_len > SZ_1G)
				1976	goto err;
				1977
				1978	ubuf = (unsigned long) iov.iov_base;
				1979	end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				1980	start = ubuf >> PAGE_SHIFT;
				1981	nr_pages = end - start;
				1982
				1983	if (ctx->account_mem) {
				1984	ret = io_account_mem(ctx->user, nr_pages);
				1985	if (ret)
				1986	goto err;
				1987	}
				1988
				1989	ret = 0;
				1990	if (!pages \|\| nr_pages > got_pages) {
				1991	kfree(vmas);
				1992	kfree(pages);
				1993	pages = kmalloc_array(nr_pages, sizeof(struct page *),
				1994	GFP_KERNEL);
				1995	vmas = kmalloc_array(nr_pages,
				1996	sizeof(struct vm_area_struct *),
				1997	GFP_KERNEL);
				1998	if (!pages \|\| !vmas) {
				1999	ret = -ENOMEM;
				2000	if (ctx->account_mem)
				2001	io_unaccount_mem(ctx->user, nr_pages);
				2002	goto err;
				2003	}
				2004	got_pages = nr_pages;
				2005	}
				2006
				2007	imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
				2008	GFP_KERNEL);
				2009	ret = -ENOMEM;
				2010	if (!imu->bvec) {
				2011	if (ctx->account_mem)
				2012	io_unaccount_mem(ctx->user, nr_pages);
				2013	goto err;
				2014	}
				2015
				2016	ret = 0;
				2017	down_read(&current->mm->mmap_sem);
				2018	pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
				2019	pages, vmas);
				2020	if (pret == nr_pages) {
				2021	/* don't support file backed memory */
				2022	for (j = 0; j < nr_pages; j++) {
				2023	struct vm_area_struct *vma = vmas[j];
				2024
				2025	if (vma->vm_file &&
				2026	!is_file_hugepages(vma->vm_file)) {
				2027	ret = -EOPNOTSUPP;
				2028	break;
				2029	}
				2030	}
				2031	} else {
				2032	ret = pret < 0 ? pret : -EFAULT;
				2033	}
				2034	up_read(&current->mm->mmap_sem);
				2035	if (ret) {
				2036	/*
				2037	* if we did partial map, or found file backed vmas,
				2038	* release any pages we did get
				2039	*/
				2040	if (pret > 0) {
				2041	for (j = 0; j < pret; j++)
				2042	put_page(pages[j]);
				2043	}
				2044	if (ctx->account_mem)
				2045	io_unaccount_mem(ctx->user, nr_pages);
				2046	goto err;
				2047	}
				2048
				2049	off = ubuf & ~PAGE_MASK;
				2050	size = iov.iov_len;
				2051	for (j = 0; j < nr_pages; j++) {
				2052	size_t vec_len;
				2053
				2054	vec_len = min_t(size_t, size, PAGE_SIZE - off);
				2055	imu->bvec[j].bv_page = pages[j];
				2056	imu->bvec[j].bv_len = vec_len;
				2057	imu->bvec[j].bv_offset = off;
				2058	off = 0;
				2059	size -= vec_len;
				2060	}
				2061	/* store original address for later verification */
				2062	imu->ubuf = ubuf;
				2063	imu->len = iov.iov_len;
				2064	imu->nr_bvecs = nr_pages;
				2065
				2066	ctx->nr_user_bufs++;
				2067	}
				2068	kfree(pages);
				2069	kfree(vmas);
				2070	return 0;
				2071	err:
				2072	kfree(pages);
				2073	kfree(vmas);
				2074	io_sqe_buffer_unregister(ctx);
				2075	return ret;
				2076	}
				2077
/*
 * Final teardown of a ring context; frees @ctx itself.
 *
 * Called from io_ring_ctx_wait_and_kill() after ctx->ctx_done has
 * completed.
 */
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_finish_async(ctx);
	/* sqo_mm is NULL if io_sq_offload_start() failed or never ran */
	if (ctx->sqo_mm)
		mmdrop(ctx->sqo_mm);

	io_iopoll_reap_events(ctx);
	/* return values ignored: nothing may be registered at this point */
	io_sqe_buffer_unregister(ctx);
	io_sqe_files_unregister(ctx);

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock)
		sock_release(ctx->ring_sock);
#endif

	io_mem_free(ctx->sq_ring);
	io_mem_free(ctx->sq_sqes);
	io_mem_free(ctx->cq_ring);

	percpu_ref_exit(&ctx->refs);
	/* undo the ring memory charge taken in io_uring_create() */
	if (ctx->account_mem)
		io_unaccount_mem(ctx->user,
				ring_pages(ctx->sq_entries, ctx->cq_entries));
	free_uid(ctx->user);
	kfree(ctx);
}
				2104
				2105	static __poll_t io_uring_poll(struct file file, poll_table wait)
				2106	{
				2107	struct io_ring_ctx *ctx = file->private_data;
				2108	__poll_t mask = 0;
				2109
				2110	poll_wait(file, &ctx->cq_wait, wait);
				2111	/* See comment at the top of this file */
				2112	smp_rmb();
				2113	if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
				2114	mask \|= EPOLLOUT \| EPOLLWRNORM;
				2115	if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
				2116	mask \|= EPOLLIN \| EPOLLRDNORM;
				2117
				2118	return mask;
				2119	}
				2120
/* ->fasync() handler: forwards to fasync_helper() on the ctx->cq_fasync list. */
static int io_uring_fasync(int fd, struct file *file, int on)
{
	struct io_ring_ctx *ctx = file->private_data;

	return fasync_helper(fd, file, on, &ctx->cq_fasync);
}
				2127
/*
 * Kill the context's percpu reference, wait for ctx->ctx_done to complete
 * and then free the context.
 */
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
	/* the kill is done under uring_lock */
	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
	mutex_unlock(&ctx->uring_lock);

	io_iopoll_reap_events(ctx);
	wait_for_completion(&ctx->ctx_done);
	io_ring_ctx_free(ctx);
}
				2138
				2139	static int io_uring_release(struct inode inode, struct file file)
				2140	{
				2141	struct io_ring_ctx *ctx = file->private_data;
				2142
				2143	file->private_data = NULL;
				2144	io_ring_ctx_wait_and_kill(ctx);
				2145	return 0;
				2146	}
				2147
				2148	static int io_uring_mmap(struct file file, struct vm_area_struct vma)
				2149	{
				2150	loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
				2151	unsigned long sz = vma->vm_end - vma->vm_start;
				2152	struct io_ring_ctx *ctx = file->private_data;
				2153	unsigned long pfn;
				2154	struct page *page;
				2155	void *ptr;
				2156
				2157	switch (offset) {
				2158	case IORING_OFF_SQ_RING:
				2159	ptr = ctx->sq_ring;
				2160	break;
				2161	case IORING_OFF_SQES:
				2162	ptr = ctx->sq_sqes;
				2163	break;
				2164	case IORING_OFF_CQ_RING:
				2165	ptr = ctx->cq_ring;
				2166	break;
				2167	default:
				2168	return -EINVAL;
				2169	}
				2170
				2171	page = virt_to_head_page(ptr);
				2172	if (sz > (PAGE_SIZE << compound_order(page)))
				2173	return -EINVAL;
				2174
				2175	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
				2176	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
				2177	}
				2178
				2179	SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
				2180	u32, min_complete, u32, flags, const sigset_t __user *, sig,
				2181	size_t, sigsz)
				2182	{
				2183	struct io_ring_ctx *ctx;
				2184	long ret = -EBADF;
				2185	int submitted = 0;
				2186	struct fd f;
				2187
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	2188	if (flags & ~(IORING_ENTER_GETEVENTS \| IORING_ENTER_SQ_WAKEUP))
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	2189	return -EINVAL;
				2190
				2191	f = fdget(fd);
				2192	if (!f.file)
				2193	return -EBADF;
				2194
				2195	ret = -EOPNOTSUPP;
				2196	if (f.file->f_op != &io_uring_fops)
				2197	goto out_fput;
				2198
				2199	ret = -ENXIO;
				2200	ctx = f.file->private_data;
				2201	if (!percpu_ref_tryget(&ctx->refs))
				2202	goto out_fput;
				2203
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	2204	/*
				2205	* For SQ polling, the thread will do all submissions and completions.
				2206	* Just return the requested submit count, and wake the thread if
				2207	* we were asked to.
				2208	*/
				2209	if (ctx->flags & IORING_SETUP_SQPOLL) {
				2210	if (flags & IORING_ENTER_SQ_WAKEUP)
				2211	wake_up(&ctx->sqo_wait);
				2212	submitted = to_submit;
				2213	goto out_ctx;
				2214	}
				2215
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	2216	ret = 0;
				2217	if (to_submit) {
				2218	to_submit = min(to_submit, ctx->sq_entries);
				2219
				2220	mutex_lock(&ctx->uring_lock);
				2221	submitted = io_ring_submit(ctx, to_submit);
				2222	mutex_unlock(&ctx->uring_lock);
				2223
				2224	if (submitted < 0)
				2225	goto out_ctx;
				2226	}
				2227	if (flags & IORING_ENTER_GETEVENTS) {
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	2228	unsigned nr_events = 0;
				2229
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	2230	min_complete = min(min_complete, ctx->cq_entries);
				2231
				2232	/*
				2233	* The application could have included the 'to_submit' count
				2234	* in how many events it wanted to wait for. If we failed to
				2235	* submit the desired count, we may need to adjust the number
				2236	* of events to poll/wait for.
				2237	*/
				2238	if (submitted < to_submit)
				2239	min_complete = min_t(unsigned, submitted, min_complete);
				2240
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	2241	if (ctx->flags & IORING_SETUP_IOPOLL) {
				2242	mutex_lock(&ctx->uring_lock);
				2243	ret = io_iopoll_check(ctx, &nr_events, min_complete);
				2244	mutex_unlock(&ctx->uring_lock);
				2245	} else {
				2246	ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
				2247	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	2248	}
				2249
				2250	out_ctx:
				2251	io_ring_drop_ctx_refs(ctx, 1);
				2252	out_fput:
				2253	fdput(f);
				2254	return submitted ? submitted : ret;
				2255	}
				2256
/*
 * File operations of the ring fd. Other code in this file compares a
 * file's f_op against this table to recognise io_uring files.
 */
static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.mmap		= io_uring_mmap,
	.poll		= io_uring_poll,
	.fasync		= io_uring_fasync,
};
				2263
				2264	static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
				2265	struct io_uring_params *p)
				2266	{
				2267	struct io_sq_ring *sq_ring;
				2268	struct io_cq_ring *cq_ring;
				2269	size_t size;
				2270
				2271	sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
				2272	if (!sq_ring)
				2273	return -ENOMEM;
				2274
				2275	ctx->sq_ring = sq_ring;
				2276	sq_ring->ring_mask = p->sq_entries - 1;
				2277	sq_ring->ring_entries = p->sq_entries;
				2278	ctx->sq_mask = sq_ring->ring_mask;
				2279	ctx->sq_entries = sq_ring->ring_entries;
				2280
				2281	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
				2282	if (size == SIZE_MAX)
				2283	return -EOVERFLOW;
				2284
				2285	ctx->sq_sqes = io_mem_alloc(size);
				2286	if (!ctx->sq_sqes) {
				2287	io_mem_free(ctx->sq_ring);
				2288	return -ENOMEM;
				2289	}
				2290
				2291	cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
				2292	if (!cq_ring) {
				2293	io_mem_free(ctx->sq_ring);
				2294	io_mem_free(ctx->sq_sqes);
				2295	return -ENOMEM;
				2296	}
				2297
				2298	ctx->cq_ring = cq_ring;
				2299	cq_ring->ring_mask = p->cq_entries - 1;
				2300	cq_ring->ring_entries = p->cq_entries;
				2301	ctx->cq_mask = cq_ring->ring_mask;
				2302	ctx->cq_entries = cq_ring->ring_entries;
				2303	return 0;
				2304	}
				2305
				2306	/*
				2307	* Allocate an anonymous fd, this is what constitutes the application
				2308	* visible backing of an io_uring instance. The application mmaps this
				2309	* fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
				2310	* we have to tie this fd to a socket for file garbage collection purposes.
				2311	*/
				2312	static int io_uring_get_fd(struct io_ring_ctx *ctx)
				2313	{
				2314	struct file *file;
				2315	int ret;
				2316
				2317	#if defined(CONFIG_UNIX)
				2318	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
				2319	&ctx->ring_sock);
				2320	if (ret)
				2321	return ret;
				2322	#endif
				2323
				2324	ret = get_unused_fd_flags(O_RDWR \| O_CLOEXEC);
				2325	if (ret < 0)
				2326	goto err;
				2327
				2328	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
				2329	O_RDWR \| O_CLOEXEC);
				2330	if (IS_ERR(file)) {
				2331	put_unused_fd(ret);
				2332	ret = PTR_ERR(file);
				2333	goto err;
				2334	}
				2335
				2336	#if defined(CONFIG_UNIX)
				2337	ctx->ring_sock->file = file;
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	2338	ctx->ring_sock->sk->sk_user_data = ctx;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	2339	#endif
				2340	fd_install(ret, file);
				2341	return ret;
				2342	err:
				2343	#if defined(CONFIG_UNIX)
				2344	sock_release(ctx->ring_sock);
				2345	ctx->ring_sock = NULL;
				2346	#endif
				2347	return ret;
				2348	}
				2349
				2350	static int io_uring_create(unsigned entries, struct io_uring_params *p)
				2351	{
				2352	struct user_struct *user = NULL;
				2353	struct io_ring_ctx *ctx;
				2354	bool account_mem;
				2355	int ret;
				2356
				2357	if (!entries \|\| entries > IORING_MAX_ENTRIES)
				2358	return -EINVAL;
				2359
				2360	/*
				2361	* Use twice as many entries for the CQ ring. It's possible for the
				2362	* application to drive a higher depth than the size of the SQ ring,
				2363	* since the sqes are only used at submission time. This allows for
				2364	* some flexibility in overcommitting a bit.
				2365	*/
				2366	p->sq_entries = roundup_pow_of_two(entries);
				2367	p->cq_entries = 2 * p->sq_entries;
				2368
				2369	user = get_uid(current_user());
				2370	account_mem = !capable(CAP_IPC_LOCK);
				2371
				2372	if (account_mem) {
				2373	ret = io_account_mem(user,
				2374	ring_pages(p->sq_entries, p->cq_entries));
				2375	if (ret) {
				2376	free_uid(user);
				2377	return ret;
				2378	}
				2379	}
				2380
				2381	ctx = io_ring_ctx_alloc(p);
				2382	if (!ctx) {
				2383	if (account_mem)
				2384	io_unaccount_mem(user, ring_pages(p->sq_entries,
				2385	p->cq_entries));
				2386	free_uid(user);
				2387	return -ENOMEM;
				2388	}
				2389	ctx->compat = in_compat_syscall();
				2390	ctx->account_mem = account_mem;
				2391	ctx->user = user;
				2392
				2393	ret = io_allocate_scq_urings(ctx, p);
				2394	if (ret)
				2395	goto err;
				2396
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	2397	ret = io_sq_offload_start(ctx, p);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	2398	if (ret)
				2399	goto err;
				2400
				2401	ret = io_uring_get_fd(ctx);
				2402	if (ret < 0)
				2403	goto err;
				2404
				2405	memset(&p->sq_off, 0, sizeof(p->sq_off));
				2406	p->sq_off.head = offsetof(struct io_sq_ring, r.head);
				2407	p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
				2408	p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
				2409	p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
				2410	p->sq_off.flags = offsetof(struct io_sq_ring, flags);
				2411	p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
				2412	p->sq_off.array = offsetof(struct io_sq_ring, array);
				2413
				2414	memset(&p->cq_off, 0, sizeof(p->cq_off));
				2415	p->cq_off.head = offsetof(struct io_cq_ring, r.head);
				2416	p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
				2417	p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
				2418	p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
				2419	p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
				2420	p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
				2421	return ret;
				2422	err:
				2423	io_ring_ctx_wait_and_kill(ctx);
				2424	return ret;
				2425	}
				2426
				2427	/*
				2428	* Sets up an aio uring context, and returns the fd. Applications asks for a
				2429	* ring size, we return the actual sq/cq ring sizes (among other things) in the
				2430	* params structure passed in.
				2431	*/
				2432	static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
				2433	{
				2434	struct io_uring_params p;
				2435	long ret;
				2436	int i;
				2437
				2438	if (copy_from_user(&p, params, sizeof(p)))
				2439	return -EFAULT;
				2440	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
				2441	if (p.resv[i])
				2442	return -EINVAL;
				2443	}
				2444
Jens Axboe	6c271ce	2019-01-10 11:22:30 -0700	[diff] [blame]	2445	if (p.flags & ~(IORING_SETUP_IOPOLL \| IORING_SETUP_SQPOLL \|
				2446	IORING_SETUP_SQ_AFF))
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	2447	return -EINVAL;
				2448
				2449	ret = io_uring_create(entries, &p);
				2450	if (ret < 0)
				2451	return ret;
				2452
				2453	if (copy_to_user(params, &p, sizeof(p)))
				2454	return -EFAULT;
				2455
				2456	return ret;
				2457	}
				2458
/* Syscall entry point; all work is done by the io_uring_setup() helper. */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}
				2464
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	2465	static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
				2466	void __user *arg, unsigned nr_args)
				2467	{
				2468	int ret;
				2469
				2470	percpu_ref_kill(&ctx->refs);
				2471	wait_for_completion(&ctx->ctx_done);
				2472
				2473	switch (opcode) {
				2474	case IORING_REGISTER_BUFFERS:
				2475	ret = io_sqe_buffer_register(ctx, arg, nr_args);
				2476	break;
				2477	case IORING_UNREGISTER_BUFFERS:
				2478	ret = -EINVAL;
				2479	if (arg \|\| nr_args)
				2480	break;
				2481	ret = io_sqe_buffer_unregister(ctx);
				2482	break;
Jens Axboe	6b06314	2019-01-10 22:13:58 -0700	[diff] [blame]	2483	case IORING_REGISTER_FILES:
				2484	ret = io_sqe_files_register(ctx, arg, nr_args);
				2485	break;
				2486	case IORING_UNREGISTER_FILES:
				2487	ret = -EINVAL;
				2488	if (arg \|\| nr_args)
				2489	break;
				2490	ret = io_sqe_files_unregister(ctx);
				2491	break;
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame]	2492	default:
				2493	ret = -EINVAL;
				2494	break;
				2495	}
				2496
				2497	/* bring the ctx back to life */
				2498	reinit_completion(&ctx->ctx_done);
				2499	percpu_ref_reinit(&ctx->refs);
				2500	return ret;
				2501	}
				2502
				2503	SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
				2504	void __user *, arg, unsigned int, nr_args)
				2505	{
				2506	struct io_ring_ctx *ctx;
				2507	long ret = -EBADF;
				2508	struct fd f;
				2509
				2510	f = fdget(fd);
				2511	if (!f.file)
				2512	return -EBADF;
				2513
				2514	ret = -EOPNOTSUPP;
				2515	if (f.file->f_op != &io_uring_fops)
				2516	goto out_fput;
				2517
				2518	ctx = f.file->private_data;
				2519
				2520	mutex_lock(&ctx->uring_lock);
				2521	ret = __io_uring_register(ctx, opcode, arg, nr_args);
				2522	mutex_unlock(&ctx->uring_lock);
				2523	out_fput:
				2524	fdput(f);
				2525	return ret;
				2526	}
				2527
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	2528	static int __init io_uring_init(void)
				2529	{
				2530	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN \| SLAB_PANIC);
				2531	return 0;
				2532	};
				2533	__initcall(io_uring_init);