Jens Axboe2b188cc2019-01-07 10:46:33 -07001// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side. When the application reads the CQ ring
8 * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
9 * the kernel uses after writing the tail. Failure to do so could cause a
 10 * delay in when the application notices that completion events are available.
11 * This isn't a fatal condition. Likewise, the application must use an
12 * appropriate smp_wmb() both before writing the SQ tail, and after writing
13 * the SQ tail. The first one orders the sqe writes with the tail write, and
14 * the latter is paired with the smp_rmb() the kernel will issue before
15 * reading the SQ tail on submission.
16 *
17 * Also see the examples in the liburing library:
18 *
19 * git://git.kernel.dk/liburing
20 *
21 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
22 * from data shared between the kernel and application. This is done both
23 * for ordering purposes, but also to ensure that once a value is loaded from
24 * data that the application could potentially modify, it remains stable.
25 *
26 * Copyright (C) 2018-2019 Jens Axboe
Christoph Hellwigc992fe22019-01-11 09:43:02 -070027 * Copyright (c) 2018-2019 Christoph Hellwig
Jens Axboe2b188cc2019-01-07 10:46:33 -070028 */
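/*
 * Illustrative sketch (added commentary, not compiled as part of this file):
 * the application side of the ordering rules described above, assuming the
 * rings and sqe array have already been mmap()ed. The names used here
 * (*cq_head, *cq_tail, cqes[], sq_array[], *sq_tail, the *_ring_mask values
 * and 'idx') are placeholders, and smp_rmb()/smp_wmb() stand in for the
 * equivalent userspace barriers. See liburing for a real implementation.
 *
 *	// Reap completions; pairs with the kernel smp_wmb() issued after
 *	// it writes the CQ tail.
 *	smp_rmb();
 *	while (*cq_head != *cq_tail) {
 *		struct io_uring_cqe *cqe = &cqes[*cq_head & *cq_ring_mask];
 *
 *		// ... consume cqe->user_data, cqe->res ...
 *		(*cq_head)++;
 *	}
 *
 *	// Submit the sqe filled in at index 'idx' of the sqe array.
 *	sq_array[*sq_tail & *sq_ring_mask] = idx;
 *	smp_wmb();	// order sqe/array stores before the tail store
 *	(*sq_tail)++;
 *	smp_wmb();	// pairs with the kernel smp_rmb() before it reads the tail
 */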
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/errno.h>
32#include <linux/syscalls.h>
33#include <linux/compat.h>
34#include <linux/refcount.h>
35#include <linux/uio.h>
36
37#include <linux/sched/signal.h>
38#include <linux/fs.h>
39#include <linux/file.h>
40#include <linux/fdtable.h>
41#include <linux/mm.h>
42#include <linux/mman.h>
43#include <linux/mmu_context.h>
44#include <linux/percpu.h>
45#include <linux/slab.h>
46#include <linux/workqueue.h>
Jens Axboe6c271ce2019-01-10 11:22:30 -070047#include <linux/kthread.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070048#include <linux/blkdev.h>
Jens Axboeedafcce2019-01-09 09:16:05 -070049#include <linux/bvec.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070050#include <linux/net.h>
51#include <net/sock.h>
52#include <net/af_unix.h>
Jens Axboe6b063142019-01-10 22:13:58 -070053#include <net/scm.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070054#include <linux/anon_inodes.h>
55#include <linux/sched/mm.h>
56#include <linux/uaccess.h>
57#include <linux/nospec.h>
Jens Axboeedafcce2019-01-09 09:16:05 -070058#include <linux/sizes.h>
59#include <linux/hugetlb.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070060
61#include <uapi/linux/io_uring.h>
62
63#include "internal.h"
64
65#define IORING_MAX_ENTRIES 4096
Jens Axboe6b063142019-01-10 22:13:58 -070066#define IORING_MAX_FIXED_FILES 1024
Jens Axboe2b188cc2019-01-07 10:46:33 -070067
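/*
 * struct io_uring holds a head/tail pair; io_sq_ring and io_cq_ring below
 * are the ring headers shared with the application. ->array[] in the SQ
 * ring holds indexes into the separate sqe array (ctx->sq_sqes, see
 * io_get_sqring()), while completions are written directly into
 * cqes[tail & ring_mask] (see io_get_cqring()).
 */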
68struct io_uring {
69 u32 head ____cacheline_aligned_in_smp;
70 u32 tail ____cacheline_aligned_in_smp;
71};
72
73struct io_sq_ring {
74 struct io_uring r;
75 u32 ring_mask;
76 u32 ring_entries;
77 u32 dropped;
78 u32 flags;
79 u32 array[];
80};
81
82struct io_cq_ring {
83 struct io_uring r;
84 u32 ring_mask;
85 u32 ring_entries;
86 u32 overflow;
87 struct io_uring_cqe cqes[];
88};
89
Jens Axboeedafcce2019-01-09 09:16:05 -070090struct io_mapped_ubuf {
91 u64 ubuf;
92 size_t len;
93 struct bio_vec *bvec;
94 unsigned int nr_bvecs;
95};
96
Jens Axboe2b188cc2019-01-07 10:46:33 -070097struct io_ring_ctx {
98 struct {
99 struct percpu_ref refs;
100 } ____cacheline_aligned_in_smp;
101
102 struct {
103 unsigned int flags;
104 bool compat;
105 bool account_mem;
106
107 /* SQ ring */
108 struct io_sq_ring *sq_ring;
109 unsigned cached_sq_head;
110 unsigned sq_entries;
111 unsigned sq_mask;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700112 unsigned sq_thread_idle;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700113 struct io_uring_sqe *sq_sqes;
114 } ____cacheline_aligned_in_smp;
115
116 /* IO offload */
117 struct workqueue_struct *sqo_wq;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700118 struct task_struct *sqo_thread; /* if using sq thread polling */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700119 struct mm_struct *sqo_mm;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700120 wait_queue_head_t sqo_wait;
121 unsigned sqo_stop;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700122
123 struct {
124 /* CQ ring */
125 struct io_cq_ring *cq_ring;
126 unsigned cached_cq_tail;
127 unsigned cq_entries;
128 unsigned cq_mask;
129 struct wait_queue_head cq_wait;
130 struct fasync_struct *cq_fasync;
131 } ____cacheline_aligned_in_smp;
132
Jens Axboe6b063142019-01-10 22:13:58 -0700133 /*
134 * If used, fixed file set. Writers must ensure that ->refs is dead,
135 * readers must ensure that ->refs is alive as long as the file* is
136 * used. Only updated through io_uring_register(2).
137 */
138 struct file **user_files;
139 unsigned nr_user_files;
140
Jens Axboeedafcce2019-01-09 09:16:05 -0700141 /* if used, fixed mapped user buffers */
142 unsigned nr_user_bufs;
143 struct io_mapped_ubuf *user_bufs;
144
Jens Axboe2b188cc2019-01-07 10:46:33 -0700145 struct user_struct *user;
146
147 struct completion ctx_done;
148
149 struct {
150 struct mutex uring_lock;
151 wait_queue_head_t wait;
152 } ____cacheline_aligned_in_smp;
153
154 struct {
155 spinlock_t completion_lock;
Jens Axboedef596e2019-01-09 08:59:42 -0700156 bool poll_multi_file;
157 /*
158 * ->poll_list is protected by the ctx->uring_lock for
159 * io_uring instances that don't use IORING_SETUP_SQPOLL.
160 * For SQPOLL, only the single threaded io_sq_thread() will
161 * manipulate the list, hence no extra locking is needed there.
162 */
163 struct list_head poll_list;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700164 } ____cacheline_aligned_in_smp;
165
166#if defined(CONFIG_UNIX)
167 struct socket *ring_sock;
168#endif
169};
170
171struct sqe_submit {
172 const struct io_uring_sqe *sqe;
173 unsigned short index;
174 bool has_user;
Jens Axboedef596e2019-01-09 08:59:42 -0700175 bool needs_lock;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700176 bool needs_fixed_file;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700177};
178
179struct io_kiocb {
180 struct kiocb rw;
181
182 struct sqe_submit submit;
183
184 struct io_ring_ctx *ctx;
185 struct list_head list;
186 unsigned int flags;
187#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
Jens Axboedef596e2019-01-09 08:59:42 -0700188#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
Jens Axboe6b063142019-01-10 22:13:58 -0700189#define REQ_F_FIXED_FILE 4 /* ctx owns file */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700190 u64 user_data;
Jens Axboedef596e2019-01-09 08:59:42 -0700191 u64 error;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700192
193 struct work_struct work;
194};
195
196#define IO_PLUG_THRESHOLD 2
Jens Axboedef596e2019-01-09 08:59:42 -0700197#define IO_IOPOLL_BATCH 8
Jens Axboe2b188cc2019-01-07 10:46:33 -0700198
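/*
 * Per-batch submission state, kept on the submitter's stack for the
 * duration of one submit call (see io_submit_sqes()/io_ring_submit()):
 * it holds the block plug plus small caches of io_kiocb allocations
 * (kmem_cache_alloc_bulk()) and file references (fget_many()), so a run
 * of sqes against the same file doesn't pay the per-IO alloc/fget cost.
 */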
Jens Axboe9a56a232019-01-09 09:06:50 -0700199struct io_submit_state {
200 struct blk_plug plug;
201
202 /*
Jens Axboe2579f912019-01-09 09:10:43 -0700203 * io_kiocb alloc cache
204 */
205 void *reqs[IO_IOPOLL_BATCH];
206 unsigned int free_reqs;
207 unsigned int cur_req;
208
209 /*
Jens Axboe9a56a232019-01-09 09:06:50 -0700210 * File reference cache
211 */
212 struct file *file;
213 unsigned int fd;
214 unsigned int has_refs;
215 unsigned int used_refs;
216 unsigned int ios_left;
217};
218
Jens Axboe2b188cc2019-01-07 10:46:33 -0700219static struct kmem_cache *req_cachep;
220
221static const struct file_operations io_uring_fops;
222
223struct sock *io_uring_get_socket(struct file *file)
224{
225#if defined(CONFIG_UNIX)
226 if (file->f_op == &io_uring_fops) {
227 struct io_ring_ctx *ctx = file->private_data;
228
229 return ctx->ring_sock->sk;
230 }
231#endif
232 return NULL;
233}
234EXPORT_SYMBOL(io_uring_get_socket);
235
236static void io_ring_ctx_ref_free(struct percpu_ref *ref)
237{
238 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
239
240 complete(&ctx->ctx_done);
241}
242
243static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
244{
245 struct io_ring_ctx *ctx;
246
247 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
248 if (!ctx)
249 return NULL;
250
251 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
252 kfree(ctx);
253 return NULL;
254 }
255
256 ctx->flags = p->flags;
257 init_waitqueue_head(&ctx->cq_wait);
258 init_completion(&ctx->ctx_done);
259 mutex_init(&ctx->uring_lock);
260 init_waitqueue_head(&ctx->wait);
261 spin_lock_init(&ctx->completion_lock);
Jens Axboedef596e2019-01-09 08:59:42 -0700262 INIT_LIST_HEAD(&ctx->poll_list);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700263 return ctx;
264}
265
266static void io_commit_cqring(struct io_ring_ctx *ctx)
267{
268 struct io_cq_ring *ring = ctx->cq_ring;
269
270 if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
271 /* order cqe stores with ring update */
272 smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
273
274 /*
 275 * Write side barrier of tail update, app has read side. See
276 * comment at the top of this file.
277 */
278 smp_wmb();
279
280 if (wq_has_sleeper(&ctx->cq_wait)) {
281 wake_up_interruptible(&ctx->cq_wait);
282 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
283 }
284 }
285}
286
287static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
288{
289 struct io_cq_ring *ring = ctx->cq_ring;
290 unsigned tail;
291
292 tail = ctx->cached_cq_tail;
293 /* See comment at the top of the file */
294 smp_rmb();
295 if (tail + 1 == READ_ONCE(ring->r.head))
296 return NULL;
297
298 ctx->cached_cq_tail++;
299 return &ring->cqes[tail & ctx->cq_mask];
300}
301
302static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
303 long res, unsigned ev_flags)
304{
305 struct io_uring_cqe *cqe;
306
307 /*
308 * If we can't get a cq entry, userspace overflowed the
309 * submission (by quite a lot). Increment the overflow count in
310 * the ring.
311 */
312 cqe = io_get_cqring(ctx);
313 if (cqe) {
314 WRITE_ONCE(cqe->user_data, ki_user_data);
315 WRITE_ONCE(cqe->res, res);
316 WRITE_ONCE(cqe->flags, ev_flags);
317 } else {
318 unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
319
320 WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
321 }
322}
323
324static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
325 long res, unsigned ev_flags)
326{
327 unsigned long flags;
328
329 spin_lock_irqsave(&ctx->completion_lock, flags);
330 io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
331 io_commit_cqring(ctx);
332 spin_unlock_irqrestore(&ctx->completion_lock, flags);
333
334 if (waitqueue_active(&ctx->wait))
335 wake_up(&ctx->wait);
Jens Axboe6c271ce2019-01-10 11:22:30 -0700336 if (waitqueue_active(&ctx->sqo_wait))
337 wake_up(&ctx->sqo_wait);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700338}
339
340static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
341{
342 percpu_ref_put_many(&ctx->refs, refs);
343
344 if (waitqueue_active(&ctx->wait))
345 wake_up(&ctx->wait);
346}
347
Jens Axboe2579f912019-01-09 09:10:43 -0700348static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
349 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700350{
351 struct io_kiocb *req;
352
353 if (!percpu_ref_tryget(&ctx->refs))
354 return NULL;
355
Jens Axboe2579f912019-01-09 09:10:43 -0700356 if (!state) {
357 req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
358 if (unlikely(!req))
359 goto out;
360 } else if (!state->free_reqs) {
361 size_t sz;
362 int ret;
363
364 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
365 ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
366 state->reqs);
367 if (unlikely(ret <= 0))
368 goto out;
369 state->free_reqs = ret - 1;
370 state->cur_req = 1;
371 req = state->reqs[0];
372 } else {
373 req = state->reqs[state->cur_req];
374 state->free_reqs--;
375 state->cur_req++;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700376 }
377
Jens Axboe2579f912019-01-09 09:10:43 -0700378 req->ctx = ctx;
379 req->flags = 0;
380 return req;
381out:
Jens Axboe2b188cc2019-01-07 10:46:33 -0700382 io_ring_drop_ctx_refs(ctx, 1);
383 return NULL;
384}
385
Jens Axboedef596e2019-01-09 08:59:42 -0700386static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
387{
388 if (*nr) {
389 kmem_cache_free_bulk(req_cachep, *nr, reqs);
390 io_ring_drop_ctx_refs(ctx, *nr);
391 *nr = 0;
392 }
393}
394
Jens Axboe2b188cc2019-01-07 10:46:33 -0700395static void io_free_req(struct io_kiocb *req)
396{
397 io_ring_drop_ctx_refs(req->ctx, 1);
398 kmem_cache_free(req_cachep, req);
399}
400
Jens Axboedef596e2019-01-09 08:59:42 -0700401/*
402 * Find and free completed poll iocbs
403 */
404static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
405 struct list_head *done)
406{
407 void *reqs[IO_IOPOLL_BATCH];
Jens Axboe9a56a232019-01-09 09:06:50 -0700408 int file_count, to_free;
409 struct file *file = NULL;
Jens Axboedef596e2019-01-09 08:59:42 -0700410 struct io_kiocb *req;
Jens Axboedef596e2019-01-09 08:59:42 -0700411
Jens Axboe9a56a232019-01-09 09:06:50 -0700412 file_count = to_free = 0;
Jens Axboedef596e2019-01-09 08:59:42 -0700413 while (!list_empty(done)) {
414 req = list_first_entry(done, struct io_kiocb, list);
415 list_del(&req->list);
416
417 io_cqring_fill_event(ctx, req->user_data, req->error, 0);
418
419 reqs[to_free++] = req;
420 (*nr_events)++;
421
Jens Axboe9a56a232019-01-09 09:06:50 -0700422 /*
423 * Batched puts of the same file, to avoid dirtying the
424 * file usage count multiple times, if avoidable.
425 */
Jens Axboe6b063142019-01-10 22:13:58 -0700426 if (!(req->flags & REQ_F_FIXED_FILE)) {
427 if (!file) {
428 file = req->rw.ki_filp;
429 file_count = 1;
430 } else if (file == req->rw.ki_filp) {
431 file_count++;
432 } else {
433 fput_many(file, file_count);
434 file = req->rw.ki_filp;
435 file_count = 1;
436 }
Jens Axboe9a56a232019-01-09 09:06:50 -0700437 }
438
Jens Axboedef596e2019-01-09 08:59:42 -0700439 if (to_free == ARRAY_SIZE(reqs))
440 io_free_req_many(ctx, reqs, &to_free);
441 }
442 io_commit_cqring(ctx);
443
Jens Axboe9a56a232019-01-09 09:06:50 -0700444 if (file)
445 fput_many(file, file_count);
Jens Axboedef596e2019-01-09 08:59:42 -0700446 io_free_req_many(ctx, reqs, &to_free);
447}
448
449static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
450 long min)
451{
452 struct io_kiocb *req, *tmp;
453 LIST_HEAD(done);
454 bool spin;
455 int ret;
456
457 /*
458 * Only spin for completions if we don't have multiple devices hanging
459 * off our complete list, and we're under the requested amount.
460 */
461 spin = !ctx->poll_multi_file && *nr_events < min;
462
463 ret = 0;
464 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
465 struct kiocb *kiocb = &req->rw;
466
467 /*
468 * Move completed entries to our local list. If we find a
469 * request that requires polling, break out and complete
470 * the done list first, if we have entries there.
471 */
472 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
473 list_move_tail(&req->list, &done);
474 continue;
475 }
476 if (!list_empty(&done))
477 break;
478
479 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
480 if (ret < 0)
481 break;
482
483 if (ret && spin)
484 spin = false;
485 ret = 0;
486 }
487
488 if (!list_empty(&done))
489 io_iopoll_complete(ctx, nr_events, &done);
490
491 return ret;
492}
493
494/*
 495 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
496 * non-spinning poll check - we'll still enter the driver poll loop, but only
497 * as a non-spinning completion check.
498 */
499static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
500 long min)
501{
502 while (!list_empty(&ctx->poll_list)) {
503 int ret;
504
505 ret = io_do_iopoll(ctx, nr_events, min);
506 if (ret < 0)
507 return ret;
508 if (!min || *nr_events >= min)
509 return 0;
510 }
511
512 return 1;
513}
514
515/*
516 * We can't just wait for polled events to come to us, we have to actively
517 * find and complete them.
518 */
519static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
520{
521 if (!(ctx->flags & IORING_SETUP_IOPOLL))
522 return;
523
524 mutex_lock(&ctx->uring_lock);
525 while (!list_empty(&ctx->poll_list)) {
526 unsigned int nr_events = 0;
527
528 io_iopoll_getevents(ctx, &nr_events, 1);
529 }
530 mutex_unlock(&ctx->uring_lock);
531}
532
533static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
534 long min)
535{
536 int ret = 0;
537
538 do {
539 int tmin = 0;
540
541 if (*nr_events < min)
542 tmin = min - *nr_events;
543
544 ret = io_iopoll_getevents(ctx, nr_events, tmin);
545 if (ret <= 0)
546 break;
547 ret = 0;
548 } while (min && !*nr_events && !need_resched());
549
550 return ret;
551}
552
Jens Axboe2b188cc2019-01-07 10:46:33 -0700553static void kiocb_end_write(struct kiocb *kiocb)
554{
555 if (kiocb->ki_flags & IOCB_WRITE) {
556 struct inode *inode = file_inode(kiocb->ki_filp);
557
558 /*
559 * Tell lockdep we inherited freeze protection from submission
560 * thread.
561 */
562 if (S_ISREG(inode->i_mode))
563 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
564 file_end_write(kiocb->ki_filp);
565 }
566}
567
Jens Axboe6b063142019-01-10 22:13:58 -0700568static void io_fput(struct io_kiocb *req)
569{
570 if (!(req->flags & REQ_F_FIXED_FILE))
571 fput(req->rw.ki_filp);
572}
573
Jens Axboe2b188cc2019-01-07 10:46:33 -0700574static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
575{
576 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
577
578 kiocb_end_write(kiocb);
579
Jens Axboe6b063142019-01-10 22:13:58 -0700580 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700581 io_cqring_add_event(req->ctx, req->user_data, res, 0);
582 io_free_req(req);
583}
584
Jens Axboedef596e2019-01-09 08:59:42 -0700585static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
586{
587 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
588
589 kiocb_end_write(kiocb);
590
591 req->error = res;
592 if (res != -EAGAIN)
593 req->flags |= REQ_F_IOPOLL_COMPLETED;
594}
595
596/*
597 * After the iocb has been issued, it's safe to be found on the poll list.
598 * Adding the kiocb to the list AFTER submission ensures that we don't
 599 * find it from an io_iopoll_getevents() thread before the issuer is done
600 * accessing the kiocb cookie.
601 */
602static void io_iopoll_req_issued(struct io_kiocb *req)
603{
604 struct io_ring_ctx *ctx = req->ctx;
605
606 /*
607 * Track whether we have multiple files in our lists. This will impact
608 * how we do polling eventually, not spinning if we're on potentially
609 * different devices.
610 */
611 if (list_empty(&ctx->poll_list)) {
612 ctx->poll_multi_file = false;
613 } else if (!ctx->poll_multi_file) {
614 struct io_kiocb *list_req;
615
616 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
617 list);
618 if (list_req->rw.ki_filp != req->rw.ki_filp)
619 ctx->poll_multi_file = true;
620 }
621
622 /*
623 * For fast devices, IO may have already completed. If it has, add
624 * it to the front so we find it first.
625 */
626 if (req->flags & REQ_F_IOPOLL_COMPLETED)
627 list_add(&req->list, &ctx->poll_list);
628 else
629 list_add_tail(&req->list, &ctx->poll_list);
630}
631
Jens Axboe9a56a232019-01-09 09:06:50 -0700632static void io_file_put(struct io_submit_state *state, struct file *file)
633{
634 if (!state) {
635 fput(file);
636 } else if (state->file) {
637 int diff = state->has_refs - state->used_refs;
638
639 if (diff)
640 fput_many(state->file, diff);
641 state->file = NULL;
642 }
643}
644
645/*
646 * Get as many references to a file as we have IOs left in this submission,
647 * assuming most submissions are for one file, or at least that each file
648 * has more than one submission.
649 */
650static struct file *io_file_get(struct io_submit_state *state, int fd)
651{
652 if (!state)
653 return fget(fd);
654
655 if (state->file) {
656 if (state->fd == fd) {
657 state->used_refs++;
658 state->ios_left--;
659 return state->file;
660 }
661 io_file_put(state, NULL);
662 }
663 state->file = fget_many(fd, state->ios_left);
664 if (!state->file)
665 return NULL;
666
667 state->fd = fd;
668 state->has_refs = state->ios_left;
669 state->used_refs = 1;
670 state->ios_left--;
671 return state->file;
672}
673
Jens Axboe2b188cc2019-01-07 10:46:33 -0700674/*
675 * If we tracked the file through the SCM inflight mechanism, we could support
676 * any file. For now, just ensure that anything potentially problematic is done
677 * inline.
678 */
679static bool io_file_supports_async(struct file *file)
680{
681 umode_t mode = file_inode(file)->i_mode;
682
683 if (S_ISBLK(mode) || S_ISCHR(mode))
684 return true;
685 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
686 return true;
687
688 return false;
689}
690
Jens Axboe6c271ce2019-01-10 11:22:30 -0700691static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
Jens Axboe9a56a232019-01-09 09:06:50 -0700692 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700693{
Jens Axboe6c271ce2019-01-10 11:22:30 -0700694 const struct io_uring_sqe *sqe = s->sqe;
Jens Axboedef596e2019-01-09 08:59:42 -0700695 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700696 struct kiocb *kiocb = &req->rw;
Jens Axboe6b063142019-01-10 22:13:58 -0700697 unsigned ioprio, flags;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700698 int fd, ret;
699
700 /* For -EAGAIN retry, everything is already prepped */
701 if (kiocb->ki_filp)
702 return 0;
703
Jens Axboe6b063142019-01-10 22:13:58 -0700704 flags = READ_ONCE(sqe->flags);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700705 fd = READ_ONCE(sqe->fd);
Jens Axboe6b063142019-01-10 22:13:58 -0700706
707 if (flags & IOSQE_FIXED_FILE) {
708 if (unlikely(!ctx->user_files ||
709 (unsigned) fd >= ctx->nr_user_files))
710 return -EBADF;
711 kiocb->ki_filp = ctx->user_files[fd];
712 req->flags |= REQ_F_FIXED_FILE;
713 } else {
Jens Axboe6c271ce2019-01-10 11:22:30 -0700714 if (s->needs_fixed_file)
715 return -EBADF;
Jens Axboe6b063142019-01-10 22:13:58 -0700716 kiocb->ki_filp = io_file_get(state, fd);
717 if (unlikely(!kiocb->ki_filp))
718 return -EBADF;
719 if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
720 force_nonblock = false;
721 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700722 kiocb->ki_pos = READ_ONCE(sqe->off);
723 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
724 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
725
726 ioprio = READ_ONCE(sqe->ioprio);
727 if (ioprio) {
728 ret = ioprio_check_cap(ioprio);
729 if (ret)
730 goto out_fput;
731
732 kiocb->ki_ioprio = ioprio;
733 } else
734 kiocb->ki_ioprio = get_current_ioprio();
735
736 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
737 if (unlikely(ret))
738 goto out_fput;
739 if (force_nonblock) {
740 kiocb->ki_flags |= IOCB_NOWAIT;
741 req->flags |= REQ_F_FORCE_NONBLOCK;
742 }
Jens Axboedef596e2019-01-09 08:59:42 -0700743 if (ctx->flags & IORING_SETUP_IOPOLL) {
744 ret = -EOPNOTSUPP;
745 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
746 !kiocb->ki_filp->f_op->iopoll)
747 goto out_fput;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700748
Jens Axboedef596e2019-01-09 08:59:42 -0700749 req->error = 0;
750 kiocb->ki_flags |= IOCB_HIPRI;
751 kiocb->ki_complete = io_complete_rw_iopoll;
752 } else {
753 if (kiocb->ki_flags & IOCB_HIPRI) {
754 ret = -EINVAL;
755 goto out_fput;
756 }
757 kiocb->ki_complete = io_complete_rw;
758 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700759 return 0;
760out_fput:
Jens Axboe6b063142019-01-10 22:13:58 -0700761 if (!(flags & IOSQE_FIXED_FILE)) {
762 /*
763 * in case of error, we didn't use this file reference. drop it.
764 */
765 if (state)
766 state->used_refs--;
767 io_file_put(state, kiocb->ki_filp);
768 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700769 return ret;
770}
771
772static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
773{
774 switch (ret) {
775 case -EIOCBQUEUED:
776 break;
777 case -ERESTARTSYS:
778 case -ERESTARTNOINTR:
779 case -ERESTARTNOHAND:
780 case -ERESTART_RESTARTBLOCK:
781 /*
782 * We can't just restart the syscall, since previously
783 * submitted sqes may already be in progress. Just fail this
784 * IO with EINTR.
785 */
786 ret = -EINTR;
787 /* fall through */
788 default:
789 kiocb->ki_complete(kiocb, ret, 0);
790 }
791}
792
Jens Axboeedafcce2019-01-09 09:16:05 -0700793static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
794 const struct io_uring_sqe *sqe,
795 struct iov_iter *iter)
796{
797 size_t len = READ_ONCE(sqe->len);
798 struct io_mapped_ubuf *imu;
799 unsigned index, buf_index;
800 size_t offset;
801 u64 buf_addr;
802
803 /* attempt to use fixed buffers without having provided iovecs */
804 if (unlikely(!ctx->user_bufs))
805 return -EFAULT;
806
807 buf_index = READ_ONCE(sqe->buf_index);
808 if (unlikely(buf_index >= ctx->nr_user_bufs))
809 return -EFAULT;
810
811 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
812 imu = &ctx->user_bufs[index];
813 buf_addr = READ_ONCE(sqe->addr);
814
815 /* overflow */
816 if (buf_addr + len < buf_addr)
817 return -EFAULT;
818 /* not inside the mapped region */
819 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
820 return -EFAULT;
821
822 /*
 823 * May not be the start of the buffer, so set the size appropriately
824 * and advance us to the beginning.
825 */
826 offset = buf_addr - imu->ubuf;
827 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
828 if (offset)
829 iov_iter_advance(iter, offset);
830 return 0;
831}
832
Jens Axboe2b188cc2019-01-07 10:46:33 -0700833static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
834 const struct sqe_submit *s, struct iovec **iovec,
835 struct iov_iter *iter)
836{
837 const struct io_uring_sqe *sqe = s->sqe;
838 void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
839 size_t sqe_len = READ_ONCE(sqe->len);
Jens Axboeedafcce2019-01-09 09:16:05 -0700840 u8 opcode;
841
842 /*
843 * We're reading ->opcode for the second time, but the first read
844 * doesn't care whether it's _FIXED or not, so it doesn't matter
845 * whether ->opcode changes concurrently. The first read does care
846 * about whether it is a READ or a WRITE, so we don't trust this read
847 * for that purpose and instead let the caller pass in the read/write
848 * flag.
849 */
850 opcode = READ_ONCE(sqe->opcode);
851 if (opcode == IORING_OP_READ_FIXED ||
852 opcode == IORING_OP_WRITE_FIXED) {
853 ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
854 *iovec = NULL;
855 return ret;
856 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700857
858 if (!s->has_user)
859 return -EFAULT;
860
861#ifdef CONFIG_COMPAT
862 if (ctx->compat)
863 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
864 iovec, iter);
865#endif
866
867 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
868}
869
870static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
Jens Axboe9a56a232019-01-09 09:06:50 -0700871 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700872{
873 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
874 struct kiocb *kiocb = &req->rw;
875 struct iov_iter iter;
876 struct file *file;
877 ssize_t ret;
878
Jens Axboe6c271ce2019-01-10 11:22:30 -0700879 ret = io_prep_rw(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700880 if (ret)
881 return ret;
882 file = kiocb->ki_filp;
883
884 ret = -EBADF;
885 if (unlikely(!(file->f_mode & FMODE_READ)))
886 goto out_fput;
887 ret = -EINVAL;
888 if (unlikely(!file->f_op->read_iter))
889 goto out_fput;
890
891 ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
892 if (ret)
893 goto out_fput;
894
895 ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
896 if (!ret) {
897 ssize_t ret2;
898
899 /* Catch -EAGAIN return for forced non-blocking submission */
900 ret2 = call_read_iter(file, kiocb, &iter);
901 if (!force_nonblock || ret2 != -EAGAIN)
902 io_rw_done(kiocb, ret2);
903 else
904 ret = -EAGAIN;
905 }
906 kfree(iovec);
907out_fput:
908 /* Hold on to the file for -EAGAIN */
909 if (unlikely(ret && ret != -EAGAIN))
Jens Axboe6b063142019-01-10 22:13:58 -0700910 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700911 return ret;
912}
913
914static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
Jens Axboe9a56a232019-01-09 09:06:50 -0700915 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700916{
917 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
918 struct kiocb *kiocb = &req->rw;
919 struct iov_iter iter;
920 struct file *file;
921 ssize_t ret;
922
Jens Axboe6c271ce2019-01-10 11:22:30 -0700923 ret = io_prep_rw(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700924 if (ret)
925 return ret;
926 /* Hold on to the file for -EAGAIN */
927 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
928 return -EAGAIN;
929
930 ret = -EBADF;
931 file = kiocb->ki_filp;
932 if (unlikely(!(file->f_mode & FMODE_WRITE)))
933 goto out_fput;
934 ret = -EINVAL;
935 if (unlikely(!file->f_op->write_iter))
936 goto out_fput;
937
938 ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
939 if (ret)
940 goto out_fput;
941
942 ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
943 iov_iter_count(&iter));
944 if (!ret) {
945 /*
946 * Open-code file_start_write here to grab freeze protection,
947 * which will be released by another thread in
948 * io_complete_rw(). Fool lockdep by telling it the lock got
949 * released so that it doesn't complain about the held lock when
950 * we return to userspace.
951 */
952 if (S_ISREG(file_inode(file)->i_mode)) {
953 __sb_start_write(file_inode(file)->i_sb,
954 SB_FREEZE_WRITE, true);
955 __sb_writers_release(file_inode(file)->i_sb,
956 SB_FREEZE_WRITE);
957 }
958 kiocb->ki_flags |= IOCB_WRITE;
959 io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
960 }
961 kfree(iovec);
962out_fput:
963 if (unlikely(ret))
Jens Axboe6b063142019-01-10 22:13:58 -0700964 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700965 return ret;
966}
967
968/*
969 * IORING_OP_NOP just posts a completion event, nothing else.
970 */
971static int io_nop(struct io_kiocb *req, u64 user_data)
972{
973 struct io_ring_ctx *ctx = req->ctx;
974 long err = 0;
975
Jens Axboedef596e2019-01-09 08:59:42 -0700976 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
977 return -EINVAL;
978
Jens Axboe2b188cc2019-01-07 10:46:33 -0700979 /*
980 * Twilight zone - it's possible that someone issued an opcode that
981 * has a file attached, then got -EAGAIN on submission, and changed
982 * the sqe before we retried it from async context. Avoid dropping
983 * a file reference for this malicious case, and flag the error.
984 */
985 if (req->rw.ki_filp) {
986 err = -EBADF;
Jens Axboe6b063142019-01-10 22:13:58 -0700987 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700988 }
989 io_cqring_add_event(ctx, user_data, err, 0);
990 io_free_req(req);
991 return 0;
992}
993
Christoph Hellwigc992fe22019-01-11 09:43:02 -0700994static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
995{
Jens Axboe6b063142019-01-10 22:13:58 -0700996 struct io_ring_ctx *ctx = req->ctx;
997 unsigned flags;
Christoph Hellwigc992fe22019-01-11 09:43:02 -0700998 int fd;
999
1000 /* Prep already done */
1001 if (req->rw.ki_filp)
1002 return 0;
1003
Jens Axboe6b063142019-01-10 22:13:58 -07001004 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboedef596e2019-01-09 08:59:42 -07001005 return -EINVAL;
Jens Axboeedafcce2019-01-09 09:16:05 -07001006 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001007 return -EINVAL;
1008
1009 fd = READ_ONCE(sqe->fd);
Jens Axboe6b063142019-01-10 22:13:58 -07001010 flags = READ_ONCE(sqe->flags);
1011
1012 if (flags & IOSQE_FIXED_FILE) {
1013 if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
1014 return -EBADF;
1015 req->rw.ki_filp = ctx->user_files[fd];
1016 req->flags |= REQ_F_FIXED_FILE;
1017 } else {
1018 req->rw.ki_filp = fget(fd);
1019 if (unlikely(!req->rw.ki_filp))
1020 return -EBADF;
1021 }
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001022
1023 return 0;
1024}
1025
1026static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1027 bool force_nonblock)
1028{
1029 loff_t sqe_off = READ_ONCE(sqe->off);
1030 loff_t sqe_len = READ_ONCE(sqe->len);
1031 loff_t end = sqe_off + sqe_len;
1032 unsigned fsync_flags;
1033 int ret;
1034
1035 fsync_flags = READ_ONCE(sqe->fsync_flags);
1036 if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1037 return -EINVAL;
1038
1039 ret = io_prep_fsync(req, sqe);
1040 if (ret)
1041 return ret;
1042
1043 /* fsync always requires a blocking context */
1044 if (force_nonblock)
1045 return -EAGAIN;
1046
1047 ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1048 end > 0 ? end : LLONG_MAX,
1049 fsync_flags & IORING_FSYNC_DATASYNC);
1050
Jens Axboe6b063142019-01-10 22:13:58 -07001051 io_fput(req);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001052 io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
1053 io_free_req(req);
1054 return 0;
1055}
1056
Jens Axboe2b188cc2019-01-07 10:46:33 -07001057static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
Jens Axboe9a56a232019-01-09 09:06:50 -07001058 const struct sqe_submit *s, bool force_nonblock,
1059 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001060{
1061 ssize_t ret;
1062 int opcode;
1063
1064 if (unlikely(s->index >= ctx->sq_entries))
1065 return -EINVAL;
1066 req->user_data = READ_ONCE(s->sqe->user_data);
1067
1068 opcode = READ_ONCE(s->sqe->opcode);
1069 switch (opcode) {
1070 case IORING_OP_NOP:
1071 ret = io_nop(req, req->user_data);
1072 break;
1073 case IORING_OP_READV:
Jens Axboeedafcce2019-01-09 09:16:05 -07001074 if (unlikely(s->sqe->buf_index))
1075 return -EINVAL;
Jens Axboe9a56a232019-01-09 09:06:50 -07001076 ret = io_read(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001077 break;
1078 case IORING_OP_WRITEV:
Jens Axboeedafcce2019-01-09 09:16:05 -07001079 if (unlikely(s->sqe->buf_index))
1080 return -EINVAL;
1081 ret = io_write(req, s, force_nonblock, state);
1082 break;
1083 case IORING_OP_READ_FIXED:
1084 ret = io_read(req, s, force_nonblock, state);
1085 break;
1086 case IORING_OP_WRITE_FIXED:
Jens Axboe9a56a232019-01-09 09:06:50 -07001087 ret = io_write(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001088 break;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001089 case IORING_OP_FSYNC:
1090 ret = io_fsync(req, s->sqe, force_nonblock);
1091 break;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001092 default:
1093 ret = -EINVAL;
1094 break;
1095 }
1096
Jens Axboedef596e2019-01-09 08:59:42 -07001097 if (ret)
1098 return ret;
1099
1100 if (ctx->flags & IORING_SETUP_IOPOLL) {
1101 if (req->error == -EAGAIN)
1102 return -EAGAIN;
1103
1104 /* workqueue context doesn't hold uring_lock, grab it now */
1105 if (s->needs_lock)
1106 mutex_lock(&ctx->uring_lock);
1107 io_iopoll_req_issued(req);
1108 if (s->needs_lock)
1109 mutex_unlock(&ctx->uring_lock);
1110 }
1111
1112 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001113}
1114
Jens Axboeedafcce2019-01-09 09:16:05 -07001115static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
1116{
1117 u8 opcode = READ_ONCE(sqe->opcode);
1118
1119 return !(opcode == IORING_OP_READ_FIXED ||
1120 opcode == IORING_OP_WRITE_FIXED);
1121}
1122
Jens Axboe2b188cc2019-01-07 10:46:33 -07001123static void io_sq_wq_submit_work(struct work_struct *work)
1124{
1125 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1126 struct sqe_submit *s = &req->submit;
1127 const struct io_uring_sqe *sqe = s->sqe;
1128 struct io_ring_ctx *ctx = req->ctx;
Jens Axboeedafcce2019-01-09 09:16:05 -07001129 mm_segment_t old_fs;
1130 bool needs_user;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001131 int ret;
1132
1133 /* Ensure we clear previously set forced non-block flag */
1134 req->flags &= ~REQ_F_FORCE_NONBLOCK;
1135 req->rw.ki_flags &= ~IOCB_NOWAIT;
1136
Jens Axboedef596e2019-01-09 08:59:42 -07001137 s->needs_lock = true;
Jens Axboeedafcce2019-01-09 09:16:05 -07001138 s->has_user = false;
1139
1140 /*
1141 * If we're doing IO to fixed buffers, we don't need to get/set
1142 * user context
1143 */
1144 needs_user = io_sqe_needs_user(s->sqe);
1145 if (needs_user) {
1146 if (!mmget_not_zero(ctx->sqo_mm)) {
1147 ret = -EFAULT;
1148 goto err;
1149 }
1150 use_mm(ctx->sqo_mm);
1151 old_fs = get_fs();
1152 set_fs(USER_DS);
1153 s->has_user = true;
1154 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001155
Jens Axboedef596e2019-01-09 08:59:42 -07001156 do {
Jens Axboe9a56a232019-01-09 09:06:50 -07001157 ret = __io_submit_sqe(ctx, req, s, false, NULL);
Jens Axboedef596e2019-01-09 08:59:42 -07001158 /*
1159 * We can get EAGAIN for polled IO even though we're forcing
1160 * a sync submission from here, since we can't wait for
1161 * request slots on the block side.
1162 */
1163 if (ret != -EAGAIN)
1164 break;
1165 cond_resched();
1166 } while (1);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001167
Jens Axboeedafcce2019-01-09 09:16:05 -07001168 if (needs_user) {
1169 set_fs(old_fs);
1170 unuse_mm(ctx->sqo_mm);
1171 mmput(ctx->sqo_mm);
1172 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001173err:
1174 if (ret) {
1175 io_cqring_add_event(ctx, sqe->user_data, ret, 0);
1176 io_free_req(req);
1177 }
1178
1179 /* async context always use a copy of the sqe */
1180 kfree(sqe);
1181}
1182
Jens Axboe9a56a232019-01-09 09:06:50 -07001183static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
1184 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001185{
1186 struct io_kiocb *req;
1187 ssize_t ret;
1188
1189 /* enforce forwards compatibility on users */
Jens Axboe6b063142019-01-10 22:13:58 -07001190 if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
Jens Axboe2b188cc2019-01-07 10:46:33 -07001191 return -EINVAL;
1192
Jens Axboe2579f912019-01-09 09:10:43 -07001193 req = io_get_req(ctx, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001194 if (unlikely(!req))
1195 return -EAGAIN;
1196
1197 req->rw.ki_filp = NULL;
1198
Jens Axboe9a56a232019-01-09 09:06:50 -07001199 ret = __io_submit_sqe(ctx, req, s, true, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001200 if (ret == -EAGAIN) {
1201 struct io_uring_sqe *sqe_copy;
1202
1203 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
1204 if (sqe_copy) {
1205 memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
1206 s->sqe = sqe_copy;
1207
1208 memcpy(&req->submit, s, sizeof(*s));
1209 INIT_WORK(&req->work, io_sq_wq_submit_work);
1210 queue_work(ctx->sqo_wq, &req->work);
1211 ret = 0;
1212 }
1213 }
1214 if (ret)
1215 io_free_req(req);
1216
1217 return ret;
1218}
1219
Jens Axboe9a56a232019-01-09 09:06:50 -07001220/*
1221 * Batched submission is done, ensure local IO is flushed out.
1222 */
1223static void io_submit_state_end(struct io_submit_state *state)
1224{
1225 blk_finish_plug(&state->plug);
1226 io_file_put(state, NULL);
Jens Axboe2579f912019-01-09 09:10:43 -07001227 if (state->free_reqs)
1228 kmem_cache_free_bulk(req_cachep, state->free_reqs,
1229 &state->reqs[state->cur_req]);
Jens Axboe9a56a232019-01-09 09:06:50 -07001230}
1231
1232/*
1233 * Start submission side cache.
1234 */
1235static void io_submit_state_start(struct io_submit_state *state,
1236 struct io_ring_ctx *ctx, unsigned max_ios)
1237{
1238 blk_start_plug(&state->plug);
Jens Axboe2579f912019-01-09 09:10:43 -07001239 state->free_reqs = 0;
Jens Axboe9a56a232019-01-09 09:06:50 -07001240 state->file = NULL;
1241 state->ios_left = max_ios;
1242}
1243
Jens Axboe2b188cc2019-01-07 10:46:33 -07001244static void io_commit_sqring(struct io_ring_ctx *ctx)
1245{
1246 struct io_sq_ring *ring = ctx->sq_ring;
1247
1248 if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
1249 /*
1250 * Ensure any loads from the SQEs are done at this point,
1251 * since once we write the new head, the application could
1252 * write new data to them.
1253 */
1254 smp_store_release(&ring->r.head, ctx->cached_sq_head);
1255
1256 /*
1257 * write side barrier of head update, app has read side. See
1258 * comment at the top of this file
1259 */
1260 smp_wmb();
1261 }
1262}
1263
1264/*
1265 * Undo last io_get_sqring()
1266 */
1267static void io_drop_sqring(struct io_ring_ctx *ctx)
1268{
1269 ctx->cached_sq_head--;
1270}
1271
1272/*
1273 * Fetch an sqe, if one is available. Note that s->sqe will point to memory
1274 * that is mapped by userspace. This means that care needs to be taken to
1275 * ensure that reads are stable, as we cannot rely on userspace always
1276 * being a good citizen. If members of the sqe are validated and then later
1277 * used, it's important that those reads are done through READ_ONCE() to
1278 * prevent a re-load down the line.
1279 */
1280static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
1281{
1282 struct io_sq_ring *ring = ctx->sq_ring;
1283 unsigned head;
1284
1285 /*
1286 * The cached sq head (or cq tail) serves two purposes:
1287 *
1288 * 1) allows us to batch the cost of updating the user visible
 1289 * ring head.
1290 * 2) allows the kernel side to track the head on its own, even
1291 * though the application is the one updating it.
1292 */
1293 head = ctx->cached_sq_head;
1294 /* See comment at the top of this file */
1295 smp_rmb();
1296 if (head == READ_ONCE(ring->r.tail))
1297 return false;
1298
1299 head = READ_ONCE(ring->array[head & ctx->sq_mask]);
1300 if (head < ctx->sq_entries) {
1301 s->index = head;
1302 s->sqe = &ctx->sq_sqes[head];
1303 ctx->cached_sq_head++;
1304 return true;
1305 }
1306
1307 /* drop invalid entries */
1308 ctx->cached_sq_head++;
1309 ring->dropped++;
1310 /* See comment at the top of this file */
1311 smp_wmb();
1312 return false;
1313}
1314
Jens Axboe6c271ce2019-01-10 11:22:30 -07001315static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
1316 unsigned int nr, bool has_user, bool mm_fault)
1317{
1318 struct io_submit_state state, *statep = NULL;
1319 int ret, i, submitted = 0;
1320
1321 if (nr > IO_PLUG_THRESHOLD) {
1322 io_submit_state_start(&state, ctx, nr);
1323 statep = &state;
1324 }
1325
1326 for (i = 0; i < nr; i++) {
1327 if (unlikely(mm_fault)) {
1328 ret = -EFAULT;
1329 } else {
1330 sqes[i].has_user = has_user;
1331 sqes[i].needs_lock = true;
1332 sqes[i].needs_fixed_file = true;
1333 ret = io_submit_sqe(ctx, &sqes[i], statep);
1334 }
1335 if (!ret) {
1336 submitted++;
1337 continue;
1338 }
1339
1340 io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
1341 }
1342
1343 if (statep)
1344 io_submit_state_end(&state);
1345
1346 return submitted;
1347}
1348
1349static int io_sq_thread(void *data)
1350{
1351 struct sqe_submit sqes[IO_IOPOLL_BATCH];
1352 struct io_ring_ctx *ctx = data;
1353 struct mm_struct *cur_mm = NULL;
1354 mm_segment_t old_fs;
1355 DEFINE_WAIT(wait);
1356 unsigned inflight;
1357 unsigned long timeout;
1358
1359 old_fs = get_fs();
1360 set_fs(USER_DS);
1361
1362 timeout = inflight = 0;
1363 while (!kthread_should_stop() && !ctx->sqo_stop) {
1364 bool all_fixed, mm_fault = false;
1365 int i;
1366
1367 if (inflight) {
1368 unsigned nr_events = 0;
1369
1370 if (ctx->flags & IORING_SETUP_IOPOLL) {
1371 /*
1372 * We disallow the app entering submit/complete
1373 * with polling, but we still need to lock the
1374 * ring to prevent racing with polled issue
1375 * that got punted to a workqueue.
1376 */
1377 mutex_lock(&ctx->uring_lock);
1378 io_iopoll_check(ctx, &nr_events, 0);
1379 mutex_unlock(&ctx->uring_lock);
1380 } else {
1381 /*
1382 * Normal IO, just pretend everything completed.
1383 * We don't have to poll completions for that.
1384 */
1385 nr_events = inflight;
1386 }
1387
1388 inflight -= nr_events;
1389 if (!inflight)
1390 timeout = jiffies + ctx->sq_thread_idle;
1391 }
1392
1393 if (!io_get_sqring(ctx, &sqes[0])) {
1394 /*
1395 * We're polling. If we're within the defined idle
1396 * period, then let us spin without work before going
1397 * to sleep.
1398 */
1399 if (inflight || !time_after(jiffies, timeout)) {
1400 cpu_relax();
1401 continue;
1402 }
1403
1404 /*
1405 * Drop cur_mm before scheduling, we can't hold it for
1406 * long periods (or over schedule()). Do this before
1407 * adding ourselves to the waitqueue, as the unuse/drop
1408 * may sleep.
1409 */
1410 if (cur_mm) {
1411 unuse_mm(cur_mm);
1412 mmput(cur_mm);
1413 cur_mm = NULL;
1414 }
1415
1416 prepare_to_wait(&ctx->sqo_wait, &wait,
1417 TASK_INTERRUPTIBLE);
1418
1419 /* Tell userspace we may need a wakeup call */
1420 ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
1421 smp_wmb();
1422
1423 if (!io_get_sqring(ctx, &sqes[0])) {
1424 if (kthread_should_stop()) {
1425 finish_wait(&ctx->sqo_wait, &wait);
1426 break;
1427 }
1428 if (signal_pending(current))
1429 flush_signals(current);
1430 schedule();
1431 finish_wait(&ctx->sqo_wait, &wait);
1432
1433 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1434 smp_wmb();
1435 continue;
1436 }
1437 finish_wait(&ctx->sqo_wait, &wait);
1438
1439 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1440 smp_wmb();
1441 }
1442
1443 i = 0;
1444 all_fixed = true;
1445 do {
1446 if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
1447 all_fixed = false;
1448
1449 i++;
1450 if (i == ARRAY_SIZE(sqes))
1451 break;
1452 } while (io_get_sqring(ctx, &sqes[i]));
1453
1454 /* Unless all new commands are FIXED regions, grab mm */
1455 if (!all_fixed && !cur_mm) {
1456 mm_fault = !mmget_not_zero(ctx->sqo_mm);
1457 if (!mm_fault) {
1458 use_mm(ctx->sqo_mm);
1459 cur_mm = ctx->sqo_mm;
1460 }
1461 }
1462
1463 inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
1464 mm_fault);
1465
1466 /* Commit SQ ring head once we've consumed all SQEs */
1467 io_commit_sqring(ctx);
1468 }
1469
1470 set_fs(old_fs);
1471 if (cur_mm) {
1472 unuse_mm(cur_mm);
1473 mmput(cur_mm);
1474 }
1475 return 0;
1476}
1477
Jens Axboe2b188cc2019-01-07 10:46:33 -07001478static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
1479{
Jens Axboe9a56a232019-01-09 09:06:50 -07001480 struct io_submit_state state, *statep = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001481 int i, ret = 0, submit = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001482
Jens Axboe9a56a232019-01-09 09:06:50 -07001483 if (to_submit > IO_PLUG_THRESHOLD) {
1484 io_submit_state_start(&state, ctx, to_submit);
1485 statep = &state;
1486 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001487
1488 for (i = 0; i < to_submit; i++) {
1489 struct sqe_submit s;
1490
1491 if (!io_get_sqring(ctx, &s))
1492 break;
1493
1494 s.has_user = true;
Jens Axboedef596e2019-01-09 08:59:42 -07001495 s.needs_lock = false;
Jens Axboe6c271ce2019-01-10 11:22:30 -07001496 s.needs_fixed_file = false;
Jens Axboedef596e2019-01-09 08:59:42 -07001497
Jens Axboe9a56a232019-01-09 09:06:50 -07001498 ret = io_submit_sqe(ctx, &s, statep);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001499 if (ret) {
1500 io_drop_sqring(ctx);
1501 break;
1502 }
1503
1504 submit++;
1505 }
1506 io_commit_sqring(ctx);
1507
Jens Axboe9a56a232019-01-09 09:06:50 -07001508 if (statep)
1509 io_submit_state_end(statep);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001510
1511 return submit ? submit : ret;
1512}
1513
1514static unsigned io_cqring_events(struct io_cq_ring *ring)
1515{
1516 return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
1517}
1518
1519/*
1520 * Wait until events become available, if we don't already have some. The
1521 * application must reap them itself, as they reside on the shared cq ring.
1522 */
1523static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
1524 const sigset_t __user *sig, size_t sigsz)
1525{
1526 struct io_cq_ring *ring = ctx->cq_ring;
1527 sigset_t ksigmask, sigsaved;
1528 DEFINE_WAIT(wait);
1529 int ret;
1530
1531 /* See comment at the top of this file */
1532 smp_rmb();
1533 if (io_cqring_events(ring) >= min_events)
1534 return 0;
1535
1536 if (sig) {
1537 ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
1538 if (ret)
1539 return ret;
1540 }
1541
1542 do {
1543 prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
1544
1545 ret = 0;
1546 /* See comment at the top of this file */
1547 smp_rmb();
1548 if (io_cqring_events(ring) >= min_events)
1549 break;
1550
1551 schedule();
1552
1553 ret = -EINTR;
1554 if (signal_pending(current))
1555 break;
1556 } while (1);
1557
1558 finish_wait(&ctx->wait, &wait);
1559
1560 if (sig)
1561 restore_user_sigmask(sig, &sigsaved);
1562
1563 return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
1564}
1565
Jens Axboe6b063142019-01-10 22:13:58 -07001566static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
1567{
1568#if defined(CONFIG_UNIX)
1569 if (ctx->ring_sock) {
1570 struct sock *sock = ctx->ring_sock->sk;
1571 struct sk_buff *skb;
1572
1573 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
1574 kfree_skb(skb);
1575 }
1576#else
1577 int i;
1578
1579 for (i = 0; i < ctx->nr_user_files; i++)
1580 fput(ctx->user_files[i]);
1581#endif
1582}
1583
1584static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
1585{
1586 if (!ctx->user_files)
1587 return -ENXIO;
1588
1589 __io_sqe_files_unregister(ctx);
1590 kfree(ctx->user_files);
1591 ctx->user_files = NULL;
1592 ctx->nr_user_files = 0;
1593 return 0;
1594}
1595
Jens Axboe6c271ce2019-01-10 11:22:30 -07001596static void io_sq_thread_stop(struct io_ring_ctx *ctx)
1597{
1598 if (ctx->sqo_thread) {
1599 ctx->sqo_stop = 1;
1600 mb();
1601 kthread_stop(ctx->sqo_thread);
1602 ctx->sqo_thread = NULL;
1603 }
1604}
1605
Jens Axboe6b063142019-01-10 22:13:58 -07001606static void io_finish_async(struct io_ring_ctx *ctx)
1607{
Jens Axboe6c271ce2019-01-10 11:22:30 -07001608 io_sq_thread_stop(ctx);
1609
Jens Axboe6b063142019-01-10 22:13:58 -07001610 if (ctx->sqo_wq) {
1611 destroy_workqueue(ctx->sqo_wq);
1612 ctx->sqo_wq = NULL;
1613 }
1614}
1615
1616#if defined(CONFIG_UNIX)
1617static void io_destruct_skb(struct sk_buff *skb)
1618{
1619 struct io_ring_ctx *ctx = skb->sk->sk_user_data;
1620
1621 io_finish_async(ctx);
1622 unix_destruct_scm(skb);
1623}
1624
1625/*
1626 * Ensure the UNIX gc is aware of our file set, so we are certain that
1627 * the io_uring can be safely unregistered on process exit, even if we have
1628 * loops in the file referencing.
1629 */
1630static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
1631{
1632 struct sock *sk = ctx->ring_sock->sk;
1633 struct scm_fp_list *fpl;
1634 struct sk_buff *skb;
1635 int i;
1636
1637 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
1638 unsigned long inflight = ctx->user->unix_inflight + nr;
1639
1640 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
1641 return -EMFILE;
1642 }
1643
1644 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
1645 if (!fpl)
1646 return -ENOMEM;
1647
1648 skb = alloc_skb(0, GFP_KERNEL);
1649 if (!skb) {
1650 kfree(fpl);
1651 return -ENOMEM;
1652 }
1653
1654 skb->sk = sk;
1655 skb->destructor = io_destruct_skb;
1656
1657 fpl->user = get_uid(ctx->user);
1658 for (i = 0; i < nr; i++) {
1659 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
1660 unix_inflight(fpl->user, fpl->fp[i]);
1661 }
1662
1663 fpl->max = fpl->count = nr;
1664 UNIXCB(skb).fp = fpl;
1665 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1666 skb_queue_head(&sk->sk_receive_queue, skb);
1667
1668 for (i = 0; i < nr; i++)
1669 fput(fpl->fp[i]);
1670
1671 return 0;
1672}
1673
1674/*
1675 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
1676 * causes regular reference counting to break down. We rely on the UNIX
1677 * garbage collection to take care of this problem for us.
1678 */
1679static int io_sqe_files_scm(struct io_ring_ctx *ctx)
1680{
1681 unsigned left, total;
1682 int ret = 0;
1683
1684 total = 0;
1685 left = ctx->nr_user_files;
1686 while (left) {
1687 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
1688 int ret;
1689
1690 ret = __io_sqe_files_scm(ctx, this_files, total);
1691 if (ret)
1692 break;
1693 left -= this_files;
1694 total += this_files;
1695 }
1696
1697 if (!ret)
1698 return 0;
1699
1700 while (total < ctx->nr_user_files) {
1701 fput(ctx->user_files[total]);
1702 total++;
1703 }
1704
1705 return ret;
1706}
1707#else
1708static int io_sqe_files_scm(struct io_ring_ctx *ctx)
1709{
1710 return 0;
1711}
1712#endif
1713
1714static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
1715 unsigned nr_args)
1716{
1717 __s32 __user *fds = (__s32 __user *) arg;
1718 int fd, ret = 0;
1719 unsigned i;
1720
1721 if (ctx->user_files)
1722 return -EBUSY;
1723 if (!nr_args)
1724 return -EINVAL;
1725 if (nr_args > IORING_MAX_FIXED_FILES)
1726 return -EMFILE;
1727
1728 ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
1729 if (!ctx->user_files)
1730 return -ENOMEM;
1731
1732 for (i = 0; i < nr_args; i++) {
1733 ret = -EFAULT;
1734 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
1735 break;
1736
1737 ctx->user_files[i] = fget(fd);
1738
1739 ret = -EBADF;
1740 if (!ctx->user_files[i])
1741 break;
1742 /*
1743 * Don't allow io_uring instances to be registered. If UNIX
1744 * isn't enabled, then this causes a reference cycle and this
1745 * instance can never get freed. If UNIX is enabled we'll
1746 * handle it just fine, but there's still no point in allowing
1747 * a ring fd as it doesn't support regular read/write anyway.
1748 */
1749 if (ctx->user_files[i]->f_op == &io_uring_fops) {
1750 fput(ctx->user_files[i]);
1751 break;
1752 }
1753 ctx->nr_user_files++;
1754 ret = 0;
1755 }
1756
1757 if (ret) {
1758 for (i = 0; i < ctx->nr_user_files; i++)
1759 fput(ctx->user_files[i]);
1760
1761 kfree(ctx->user_files);
1762 ctx->nr_user_files = 0;
1763 return ret;
1764 }
1765
1766 ret = io_sqe_files_scm(ctx);
1767 if (ret)
1768 io_sqe_files_unregister(ctx);
1769
1770 return ret;
1771}
1772
Jens Axboe6c271ce2019-01-10 11:22:30 -07001773static int io_sq_offload_start(struct io_ring_ctx *ctx,
1774 struct io_uring_params *p)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001775{
1776 int ret;
1777
Jens Axboe6c271ce2019-01-10 11:22:30 -07001778 init_waitqueue_head(&ctx->sqo_wait);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001779 mmgrab(current->mm);
1780 ctx->sqo_mm = current->mm;
1781
Jens Axboe6c271ce2019-01-10 11:22:30 -07001782 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
1783 if (!ctx->sq_thread_idle)
1784 ctx->sq_thread_idle = HZ;
1785
1786 ret = -EINVAL;
1787 if (!cpu_possible(p->sq_thread_cpu))
1788 goto err;
1789
1790 if (ctx->flags & IORING_SETUP_SQPOLL) {
1791 if (p->flags & IORING_SETUP_SQ_AFF) {
1792 int cpu;
1793
1794 cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
1795 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
1796 ctx, cpu,
1797 "io_uring-sq");
1798 } else {
1799 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
1800 "io_uring-sq");
1801 }
1802 if (IS_ERR(ctx->sqo_thread)) {
1803 ret = PTR_ERR(ctx->sqo_thread);
1804 ctx->sqo_thread = NULL;
1805 goto err;
1806 }
1807 wake_up_process(ctx->sqo_thread);
1808 } else if (p->flags & IORING_SETUP_SQ_AFF) {
1809 /* Can't have SQ_AFF without SQPOLL */
1810 ret = -EINVAL;
1811 goto err;
1812 }
1813
Jens Axboe2b188cc2019-01-07 10:46:33 -07001814 /* Do QD, or 2 * CPUS, whatever is smallest */
1815 ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
1816 min(ctx->sq_entries - 1, 2 * num_online_cpus()));
1817 if (!ctx->sqo_wq) {
1818 ret = -ENOMEM;
1819 goto err;
1820 }
1821
1822 return 0;
1823err:
Jens Axboe6c271ce2019-01-10 11:22:30 -07001824 io_sq_thread_stop(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001825 mmdrop(ctx->sqo_mm);
1826 ctx->sqo_mm = NULL;
1827 return ret;
1828}
1829
1830static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
1831{
1832 atomic_long_sub(nr_pages, &user->locked_vm);
1833}
1834
1835static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
1836{
1837 unsigned long page_limit, cur_pages, new_pages;
1838
1839 /* Don't allow more pages than we can safely lock */
1840 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1841
1842 do {
1843 cur_pages = atomic_long_read(&user->locked_vm);
1844 new_pages = cur_pages + nr_pages;
1845 if (new_pages > page_limit)
1846 return -ENOMEM;
1847 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
1848 new_pages) != cur_pages);
1849
1850 return 0;
1851}
1852
1853static void io_mem_free(void *ptr)
1854{
1855 struct page *page = virt_to_head_page(ptr);
1856
1857 if (put_page_testzero(page))
1858 free_compound_page(page);
1859}
1860
1861static void *io_mem_alloc(size_t size)
1862{
1863 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
1864 __GFP_NORETRY;
1865
1866 return (void *) __get_free_pages(gfp_flags, get_order(size));
1867}
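
/*
 * The rings are allocated as zeroed compound pages (__GFP_COMP) so that
 * io_uring_mmap() below can locate the head page and bounds check the
 * requested mapping length against the compound order before remapping
 * the pfns into userspace.
 */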
1868
1869static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
1870{
1871 struct io_sq_ring *sq_ring;
1872 struct io_cq_ring *cq_ring;
1873 size_t bytes;
1874
1875 bytes = struct_size(sq_ring, array, sq_entries);
1876 bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
1877 bytes += struct_size(cq_ring, cqes, cq_entries);
1878
1879 return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1880}
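
/*
 * Rough sizing example, assuming 4 KB pages and the current layouts
 * (64 byte sqes, 16 byte cqes): sq_entries == 128 and cq_entries == 256
 * comes to about 512 bytes of SQ index array, 8 KB of sqes and 4 KB of
 * cqes plus the two ring headers -- roughly 13 KB, i.e. 4 pages charged
 * against RLIMIT_MEMLOCK.
 */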
1881
Jens Axboeedafcce2019-01-09 09:16:05 -07001882static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
1883{
1884 int i, j;
1885
1886 if (!ctx->user_bufs)
1887 return -ENXIO;
1888
1889 for (i = 0; i < ctx->nr_user_bufs; i++) {
1890 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
1891
1892 for (j = 0; j < imu->nr_bvecs; j++)
1893 put_page(imu->bvec[j].bv_page);
1894
1895 if (ctx->account_mem)
1896 io_unaccount_mem(ctx->user, imu->nr_bvecs);
1897 kfree(imu->bvec);
1898 imu->nr_bvecs = 0;
1899 }
1900
1901 kfree(ctx->user_bufs);
1902 ctx->user_bufs = NULL;
1903 ctx->nr_user_bufs = 0;
1904 return 0;
1905}
1906
1907static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
1908 void __user *arg, unsigned index)
1909{
1910 struct iovec __user *src;
1911
1912#ifdef CONFIG_COMPAT
1913 if (ctx->compat) {
1914 struct compat_iovec __user *ciovs;
1915 struct compat_iovec ciov;
1916
1917 ciovs = (struct compat_iovec __user *) arg;
1918 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
1919 return -EFAULT;
1920
1921 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
1922 dst->iov_len = ciov.iov_len;
1923 return 0;
1924 }
1925#endif
1926 src = (struct iovec __user *) arg;
1927 if (copy_from_user(dst, &src[index], sizeof(*dst)))
1928 return -EFAULT;
1929 return 0;
1930}
1931
1932static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
1933 unsigned nr_args)
1934{
1935 struct vm_area_struct **vmas = NULL;
1936 struct page **pages = NULL;
1937 int i, j, got_pages = 0;
1938 int ret = -EINVAL;
1939
1940 if (ctx->user_bufs)
1941 return -EBUSY;
1942 if (!nr_args || nr_args > UIO_MAXIOV)
1943 return -EINVAL;
1944
1945 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
1946 GFP_KERNEL);
1947 if (!ctx->user_bufs)
1948 return -ENOMEM;
1949
1950 for (i = 0; i < nr_args; i++) {
1951 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
1952 unsigned long off, start, end, ubuf;
1953 int pret, nr_pages;
1954 struct iovec iov;
1955 size_t size;
1956
1957 ret = io_copy_iov(ctx, &iov, arg, i);
1958 if (ret)
1959 break;
1960
1961 /*
1962		 * Don't impose further limits on the size and buffer
1963		 * constraints here; we'll return -EINVAL later, when the IO
1964		 * is submitted, if they are wrong.
1965 */
1966 ret = -EFAULT;
1967 if (!iov.iov_base || !iov.iov_len)
1968 goto err;
1969
1970 /* arbitrary limit, but we need something */
1971 if (iov.iov_len > SZ_1G)
1972 goto err;
1973
1974 ubuf = (unsigned long) iov.iov_base;
1975 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1976 start = ubuf >> PAGE_SHIFT;
1977 nr_pages = end - start;
1978
1979 if (ctx->account_mem) {
1980 ret = io_account_mem(ctx->user, nr_pages);
1981 if (ret)
1982 goto err;
1983 }
1984
1985 ret = 0;
1986 if (!pages || nr_pages > got_pages) {
1987 kfree(vmas);
1988 kfree(pages);
1989 pages = kmalloc_array(nr_pages, sizeof(struct page *),
1990 GFP_KERNEL);
1991 vmas = kmalloc_array(nr_pages,
1992 sizeof(struct vm_area_struct *),
1993 GFP_KERNEL);
1994 if (!pages || !vmas) {
1995 ret = -ENOMEM;
1996 if (ctx->account_mem)
1997 io_unaccount_mem(ctx->user, nr_pages);
1998 goto err;
1999 }
2000 got_pages = nr_pages;
2001 }
2002
2003 imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
2004 GFP_KERNEL);
2005 ret = -ENOMEM;
2006 if (!imu->bvec) {
2007 if (ctx->account_mem)
2008 io_unaccount_mem(ctx->user, nr_pages);
2009 goto err;
2010 }
2011
2012 ret = 0;
2013 down_read(&current->mm->mmap_sem);
2014 pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
2015 pages, vmas);
2016 if (pret == nr_pages) {
2017			/* we don't support file-backed memory */
2018 for (j = 0; j < nr_pages; j++) {
2019 struct vm_area_struct *vma = vmas[j];
2020
2021 if (vma->vm_file &&
2022 !is_file_hugepages(vma->vm_file)) {
2023 ret = -EOPNOTSUPP;
2024 break;
2025 }
2026 }
2027 } else {
2028 ret = pret < 0 ? pret : -EFAULT;
2029 }
2030 up_read(&current->mm->mmap_sem);
2031 if (ret) {
2032 /*
2033			 * If we did a partial map, or found file-backed vmas,
2034			 * release any pages we did get.
2035 */
2036 if (pret > 0) {
2037 for (j = 0; j < pret; j++)
2038 put_page(pages[j]);
2039 }
2040 if (ctx->account_mem)
2041 io_unaccount_mem(ctx->user, nr_pages);
2042 goto err;
2043 }
2044
2045 off = ubuf & ~PAGE_MASK;
2046 size = iov.iov_len;
2047 for (j = 0; j < nr_pages; j++) {
2048 size_t vec_len;
2049
2050 vec_len = min_t(size_t, size, PAGE_SIZE - off);
2051 imu->bvec[j].bv_page = pages[j];
2052 imu->bvec[j].bv_len = vec_len;
2053 imu->bvec[j].bv_offset = off;
2054 off = 0;
2055 size -= vec_len;
2056 }
2057 /* store original address for later verification */
2058 imu->ubuf = ubuf;
2059 imu->len = iov.iov_len;
2060 imu->nr_bvecs = nr_pages;
2061
2062 ctx->nr_user_bufs++;
2063 }
2064 kfree(pages);
2065 kfree(vmas);
2066 return 0;
2067err:
2068 kfree(pages);
2069 kfree(vmas);
2070 io_sqe_buffer_unregister(ctx);
2071 return ret;
2072}
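
/*
 * Userspace sketch of fixed buffers (illustrative only): the memory is
 * described once with iovecs and pinned here, then referenced by index
 * from read/write sqes.
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = 64 * 1024 };
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_BUFFERS, &iov, 1);
 *
 *	sqe->opcode = IORING_OP_READ_FIXED;
 *	sqe->addr = (unsigned long) buf;	// must fall inside the iovec
 *	sqe->len = 4096;
 *	sqe->buf_index = 0;			// which registered buffer
 */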
2073
Jens Axboe2b188cc2019-01-07 10:46:33 -07002074static void io_ring_ctx_free(struct io_ring_ctx *ctx)
2075{
Jens Axboe6b063142019-01-10 22:13:58 -07002076 io_finish_async(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002077 if (ctx->sqo_mm)
2078 mmdrop(ctx->sqo_mm);
Jens Axboedef596e2019-01-09 08:59:42 -07002079
2080 io_iopoll_reap_events(ctx);
Jens Axboeedafcce2019-01-09 09:16:05 -07002081 io_sqe_buffer_unregister(ctx);
Jens Axboe6b063142019-01-10 22:13:58 -07002082 io_sqe_files_unregister(ctx);
Jens Axboedef596e2019-01-09 08:59:42 -07002083
Jens Axboe2b188cc2019-01-07 10:46:33 -07002084#if defined(CONFIG_UNIX)
2085 if (ctx->ring_sock)
2086 sock_release(ctx->ring_sock);
2087#endif
2088
2089 io_mem_free(ctx->sq_ring);
2090 io_mem_free(ctx->sq_sqes);
2091 io_mem_free(ctx->cq_ring);
2092
2093 percpu_ref_exit(&ctx->refs);
2094 if (ctx->account_mem)
2095 io_unaccount_mem(ctx->user,
2096 ring_pages(ctx->sq_entries, ctx->cq_entries));
2097 free_uid(ctx->user);
2098 kfree(ctx);
2099}
2100
2101static __poll_t io_uring_poll(struct file *file, poll_table *wait)
2102{
2103 struct io_ring_ctx *ctx = file->private_data;
2104 __poll_t mask = 0;
2105
2106 poll_wait(file, &ctx->cq_wait, wait);
2107 /* See comment at the top of this file */
2108 smp_rmb();
2109 if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
2110 mask |= EPOLLOUT | EPOLLWRNORM;
2111 if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
2112 mask |= EPOLLIN | EPOLLRDNORM;
2113
2114 return mask;
2115}
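
/*
 * In poll terms: EPOLLOUT means the SQ ring still has room for another
 * sqe, EPOLLIN means completions are waiting in the CQ ring.  A sketch of
 * sleeping until completions arrive (illustrative only):
 *
 *	struct epoll_event ev = { .events = EPOLLIN };
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, ring_fd, &ev);
 *	epoll_wait(epfd, &ev, 1, -1);	// wakes when cqes are available
 */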
2116
2117static int io_uring_fasync(int fd, struct file *file, int on)
2118{
2119 struct io_ring_ctx *ctx = file->private_data;
2120
2121 return fasync_helper(fd, file, on, &ctx->cq_fasync);
2122}
2123
2124static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2125{
2126 mutex_lock(&ctx->uring_lock);
2127 percpu_ref_kill(&ctx->refs);
2128 mutex_unlock(&ctx->uring_lock);
2129
Jens Axboedef596e2019-01-09 08:59:42 -07002130 io_iopoll_reap_events(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002131 wait_for_completion(&ctx->ctx_done);
2132 io_ring_ctx_free(ctx);
2133}
2134
2135static int io_uring_release(struct inode *inode, struct file *file)
2136{
2137 struct io_ring_ctx *ctx = file->private_data;
2138
2139 file->private_data = NULL;
2140 io_ring_ctx_wait_and_kill(ctx);
2141 return 0;
2142}
2143
2144static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2145{
2146 loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
2147 unsigned long sz = vma->vm_end - vma->vm_start;
2148 struct io_ring_ctx *ctx = file->private_data;
2149 unsigned long pfn;
2150 struct page *page;
2151 void *ptr;
2152
2153 switch (offset) {
2154 case IORING_OFF_SQ_RING:
2155 ptr = ctx->sq_ring;
2156 break;
2157 case IORING_OFF_SQES:
2158 ptr = ctx->sq_sqes;
2159 break;
2160 case IORING_OFF_CQ_RING:
2161 ptr = ctx->cq_ring;
2162 break;
2163 default:
2164 return -EINVAL;
2165 }
2166
2167 page = virt_to_head_page(ptr);
2168 if (sz > (PAGE_SIZE << compound_order(page)))
2169 return -EINVAL;
2170
2171 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
2172 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2173}
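
/*
 * The userspace half of these mappings looks roughly like this (it mirrors
 * what liburing does; sizes are derived from the offsets handed back in
 * struct io_uring_params):
 *
 *	sq = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
 *		  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		  ring_fd, IORING_OFF_SQ_RING);
 *	sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
 *		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		    ring_fd, IORING_OFF_SQES);
 *	cq = mmap(NULL, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
 *		  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		  ring_fd, IORING_OFF_CQ_RING);
 */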
2174
2175SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
2176 u32, min_complete, u32, flags, const sigset_t __user *, sig,
2177 size_t, sigsz)
2178{
2179 struct io_ring_ctx *ctx;
2180 long ret = -EBADF;
2181 int submitted = 0;
2182 struct fd f;
2183
Jens Axboe6c271ce2019-01-10 11:22:30 -07002184 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
Jens Axboe2b188cc2019-01-07 10:46:33 -07002185 return -EINVAL;
2186
2187 f = fdget(fd);
2188 if (!f.file)
2189 return -EBADF;
2190
2191 ret = -EOPNOTSUPP;
2192 if (f.file->f_op != &io_uring_fops)
2193 goto out_fput;
2194
2195 ret = -ENXIO;
2196 ctx = f.file->private_data;
2197 if (!percpu_ref_tryget(&ctx->refs))
2198 goto out_fput;
2199
Jens Axboe6c271ce2019-01-10 11:22:30 -07002200 /*
2201 * For SQ polling, the thread will do all submissions and completions.
2202 * Just return the requested submit count, and wake the thread if
2203 * we were asked to.
2204 */
2205 if (ctx->flags & IORING_SETUP_SQPOLL) {
2206 if (flags & IORING_ENTER_SQ_WAKEUP)
2207 wake_up(&ctx->sqo_wait);
2208 submitted = to_submit;
2209 goto out_ctx;
2210 }
2211
Jens Axboe2b188cc2019-01-07 10:46:33 -07002212 ret = 0;
2213 if (to_submit) {
2214 to_submit = min(to_submit, ctx->sq_entries);
2215
2216 mutex_lock(&ctx->uring_lock);
2217 submitted = io_ring_submit(ctx, to_submit);
2218 mutex_unlock(&ctx->uring_lock);
2219
2220 if (submitted < 0)
2221 goto out_ctx;
2222 }
2223 if (flags & IORING_ENTER_GETEVENTS) {
Jens Axboedef596e2019-01-09 08:59:42 -07002224 unsigned nr_events = 0;
2225
Jens Axboe2b188cc2019-01-07 10:46:33 -07002226 min_complete = min(min_complete, ctx->cq_entries);
2227
2228 /*
2229 * The application could have included the 'to_submit' count
2230 * in how many events it wanted to wait for. If we failed to
2231 * submit the desired count, we may need to adjust the number
2232 * of events to poll/wait for.
2233 */
2234 if (submitted < to_submit)
2235 min_complete = min_t(unsigned, submitted, min_complete);
2236
Jens Axboedef596e2019-01-09 08:59:42 -07002237 if (ctx->flags & IORING_SETUP_IOPOLL) {
2238 mutex_lock(&ctx->uring_lock);
2239 ret = io_iopoll_check(ctx, &nr_events, min_complete);
2240 mutex_unlock(&ctx->uring_lock);
2241 } else {
2242 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
2243 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07002244 }
2245
2246out_ctx:
2247 io_ring_drop_ctx_refs(ctx, 1);
2248out_fput:
2249 fdput(f);
2250 return submitted ? submitted : ret;
2251}
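
/*
 * Typical non-SQPOLL use of the syscall above is submit-and-wait in one
 * call (illustrative only; a raw syscall wrapper is assumed):
 *
 *	// after filling N sqes and publishing the new SQ tail
 *	ret = io_uring_enter(ring_fd, N, 1, IORING_ENTER_GETEVENTS, NULL, 0);
 *
 * The return value is how many sqes were consumed; completions are then
 * reaped by walking the cqes between CQ head and tail and advancing the
 * head.
 */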
2252
2253static const struct file_operations io_uring_fops = {
2254 .release = io_uring_release,
2255 .mmap = io_uring_mmap,
2256 .poll = io_uring_poll,
2257 .fasync = io_uring_fasync,
2258};
2259
2260static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
2261 struct io_uring_params *p)
2262{
2263 struct io_sq_ring *sq_ring;
2264 struct io_cq_ring *cq_ring;
2265 size_t size;
2266
2267 sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
2268 if (!sq_ring)
2269 return -ENOMEM;
2270
2271 ctx->sq_ring = sq_ring;
2272 sq_ring->ring_mask = p->sq_entries - 1;
2273 sq_ring->ring_entries = p->sq_entries;
2274 ctx->sq_mask = sq_ring->ring_mask;
2275 ctx->sq_entries = sq_ring->ring_entries;
2276
2277 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
2278 if (size == SIZE_MAX)
2279 return -EOVERFLOW;
2280
2281 ctx->sq_sqes = io_mem_alloc(size);
2282 if (!ctx->sq_sqes) {
2283 io_mem_free(ctx->sq_ring);
2284 return -ENOMEM;
2285 }
2286
2287 cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
2288 if (!cq_ring) {
2289 io_mem_free(ctx->sq_ring);
2290 io_mem_free(ctx->sq_sqes);
2291 return -ENOMEM;
2292 }
2293
2294 ctx->cq_ring = cq_ring;
2295 cq_ring->ring_mask = p->cq_entries - 1;
2296 cq_ring->ring_entries = p->cq_entries;
2297 ctx->cq_mask = cq_ring->ring_mask;
2298 ctx->cq_entries = cq_ring->ring_entries;
2299 return 0;
2300}
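
/*
 * Both ring sizes are powers of two, so head and tail are free-running
 * counters that only get masked when indexing, e.g. on the application
 * side (illustrative):
 *
 *	sq_ring->array[tail & sq_ring->ring_mask] = sqe_index;	// publish
 *	cqe = &cq_ring->cqes[head & cq_ring->ring_mask];	// reap
 */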
2301
2302/*
2303 * Allocate an anonymous fd; this is what constitutes the application-
2304 * visible backing of an io_uring instance. The application mmaps this
2305 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
2306 * we have to tie this fd to a socket for file garbage collection purposes.
2307 */
2308static int io_uring_get_fd(struct io_ring_ctx *ctx)
2309{
2310 struct file *file;
2311 int ret;
2312
2313#if defined(CONFIG_UNIX)
2314 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
2315 &ctx->ring_sock);
2316 if (ret)
2317 return ret;
2318#endif
2319
2320 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2321 if (ret < 0)
2322 goto err;
2323
2324 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
2325 O_RDWR | O_CLOEXEC);
2326 if (IS_ERR(file)) {
2327 put_unused_fd(ret);
2328 ret = PTR_ERR(file);
2329 goto err;
2330 }
2331
2332#if defined(CONFIG_UNIX)
2333 ctx->ring_sock->file = file;
Jens Axboe6b063142019-01-10 22:13:58 -07002334 ctx->ring_sock->sk->sk_user_data = ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002335#endif
2336 fd_install(ret, file);
2337 return ret;
2338err:
2339#if defined(CONFIG_UNIX)
2340 sock_release(ctx->ring_sock);
2341 ctx->ring_sock = NULL;
2342#endif
2343 return ret;
2344}
2345
2346static int io_uring_create(unsigned entries, struct io_uring_params *p)
2347{
2348 struct user_struct *user = NULL;
2349 struct io_ring_ctx *ctx;
2350 bool account_mem;
2351 int ret;
2352
2353 if (!entries || entries > IORING_MAX_ENTRIES)
2354 return -EINVAL;
2355
2356 /*
2357 * Use twice as many entries for the CQ ring. It's possible for the
2358 * application to drive a higher depth than the size of the SQ ring,
2359	 * since the sqes are only used at submission time. This allows some
2360	 * flexibility in overcommitting.
2361 */
2362 p->sq_entries = roundup_pow_of_two(entries);
2363 p->cq_entries = 2 * p->sq_entries;
2364
2365 user = get_uid(current_user());
2366 account_mem = !capable(CAP_IPC_LOCK);
2367
2368 if (account_mem) {
2369 ret = io_account_mem(user,
2370 ring_pages(p->sq_entries, p->cq_entries));
2371 if (ret) {
2372 free_uid(user);
2373 return ret;
2374 }
2375 }
2376
2377 ctx = io_ring_ctx_alloc(p);
2378 if (!ctx) {
2379 if (account_mem)
2380 io_unaccount_mem(user, ring_pages(p->sq_entries,
2381 p->cq_entries));
2382 free_uid(user);
2383 return -ENOMEM;
2384 }
2385 ctx->compat = in_compat_syscall();
2386 ctx->account_mem = account_mem;
2387 ctx->user = user;
2388
2389 ret = io_allocate_scq_urings(ctx, p);
2390 if (ret)
2391 goto err;
2392
Jens Axboe6c271ce2019-01-10 11:22:30 -07002393 ret = io_sq_offload_start(ctx, p);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002394 if (ret)
2395 goto err;
2396
2397 ret = io_uring_get_fd(ctx);
2398 if (ret < 0)
2399 goto err;
2400
2401 memset(&p->sq_off, 0, sizeof(p->sq_off));
2402 p->sq_off.head = offsetof(struct io_sq_ring, r.head);
2403 p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
2404 p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
2405 p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
2406 p->sq_off.flags = offsetof(struct io_sq_ring, flags);
2407 p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
2408 p->sq_off.array = offsetof(struct io_sq_ring, array);
2409
2410 memset(&p->cq_off, 0, sizeof(p->cq_off));
2411 p->cq_off.head = offsetof(struct io_cq_ring, r.head);
2412 p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
2413 p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
2414 p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
2415 p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
2416 p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
2417 return ret;
2418err:
2419 io_ring_ctx_wait_and_kill(ctx);
2420 return ret;
2421}
2422
2423/*
2424 * Sets up an io_uring context, and returns the fd. The application asks for
2425 * a ring size; we return the actual sq/cq ring sizes (among other things) in
2426 * the params structure passed in.
2427 */
2428static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
2429{
2430 struct io_uring_params p;
2431 long ret;
2432 int i;
2433
2434 if (copy_from_user(&p, params, sizeof(p)))
2435 return -EFAULT;
2436 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
2437 if (p.resv[i])
2438 return -EINVAL;
2439 }
2440
Jens Axboe6c271ce2019-01-10 11:22:30 -07002441 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
2442 IORING_SETUP_SQ_AFF))
Jens Axboe2b188cc2019-01-07 10:46:33 -07002443 return -EINVAL;
2444
2445 ret = io_uring_create(entries, &p);
2446 if (ret < 0)
2447 return ret;
2448
2449 if (copy_to_user(params, &p, sizeof(p)))
2450 return -EFAULT;
2451
2452 return ret;
2453}
2454
2455SYSCALL_DEFINE2(io_uring_setup, u32, entries,
2456 struct io_uring_params __user *, params)
2457{
2458 return io_uring_setup(entries, params);
2459}
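
/*
 * Minimal setup from the application side (illustrative only; a raw
 * syscall wrapper is assumed):
 *
 *	struct io_uring_params p = { 0 };
 *	int ring_fd = io_uring_setup(128, &p);
 *
 * On return, p.sq_entries/p.cq_entries hold the actual (rounded up) ring
 * sizes and p.sq_off/p.cq_off the offsets needed for the mmap() calls
 * shown further up; the reserved fields must be zero or setup fails.
 */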
2460
Jens Axboeedafcce2019-01-09 09:16:05 -07002461static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
2462 void __user *arg, unsigned nr_args)
2463{
2464 int ret;
2465
2466 percpu_ref_kill(&ctx->refs);
2467 wait_for_completion(&ctx->ctx_done);
2468
2469 switch (opcode) {
2470 case IORING_REGISTER_BUFFERS:
2471 ret = io_sqe_buffer_register(ctx, arg, nr_args);
2472 break;
2473 case IORING_UNREGISTER_BUFFERS:
2474 ret = -EINVAL;
2475 if (arg || nr_args)
2476 break;
2477 ret = io_sqe_buffer_unregister(ctx);
2478 break;
Jens Axboe6b063142019-01-10 22:13:58 -07002479 case IORING_REGISTER_FILES:
2480 ret = io_sqe_files_register(ctx, arg, nr_args);
2481 break;
2482 case IORING_UNREGISTER_FILES:
2483 ret = -EINVAL;
2484 if (arg || nr_args)
2485 break;
2486 ret = io_sqe_files_unregister(ctx);
2487 break;
Jens Axboeedafcce2019-01-09 09:16:05 -07002488 default:
2489 ret = -EINVAL;
2490 break;
2491 }
2492
2493 /* bring the ctx back to life */
2494 reinit_completion(&ctx->ctx_done);
2495 percpu_ref_reinit(&ctx->refs);
2496 return ret;
2497}
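
/*
 * Note on the quiesce above: registration operates on a drained ring.
 * Killing the percpu ref and waiting for ctx_done ensures no requests are
 * in flight while buffers or files are swapped, and the uring_lock taken
 * by the syscall below serializes concurrent registration calls.
 */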
2498
2499SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
2500 void __user *, arg, unsigned int, nr_args)
2501{
2502 struct io_ring_ctx *ctx;
2503 long ret = -EBADF;
2504 struct fd f;
2505
2506 f = fdget(fd);
2507 if (!f.file)
2508 return -EBADF;
2509
2510 ret = -EOPNOTSUPP;
2511 if (f.file->f_op != &io_uring_fops)
2512 goto out_fput;
2513
2514 ctx = f.file->private_data;
2515
2516 mutex_lock(&ctx->uring_lock);
2517 ret = __io_uring_register(ctx, opcode, arg, nr_args);
2518 mutex_unlock(&ctx->uring_lock);
2519out_fput:
2520 fdput(f);
2521 return ret;
2522}
2523
Jens Axboe2b188cc2019-01-07 10:46:33 -07002524static int __init io_uring_init(void)
2525{
2526 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
2527 return 0;
2528};
2529__initcall(io_uring_init);