Blame - fs/io_uring.c - yocto/kernel/common

blob: c0c0f68568b5a5ab344b65101b49c6457dce7869 [file] [log] [blame]

Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Shared application/kernel submission and completion ring pairs, for
				4	* supporting fast/efficient IO.
				5	*
				6	* A note on the read/write ordering memory barriers that are matched between
				7	* the application and kernel side. When the application reads the CQ ring
				8	* tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
				9	* the kernel uses after writing the tail. Failure to do so could cause a
				10	* delay in when the application notices that completion events are available.
				11	* This isn't a fatal condition. Likewise, the application must use an
				12	* appropriate smp_wmb() both before writing the SQ tail, and after writing
				13	* the SQ tail. The first one orders the sqe writes with the tail write, and
				14	* the latter is paired with the smp_rmb() the kernel will issue before
				15	* reading the SQ tail on submission.
				16	*
				17	* Also see the examples in the liburing library:
				18	*
				19	* git://git.kernel.dk/liburing
				20	*
				21	* io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
				22	* from data shared between the kernel and application. This is done both
				23	* for ordering purposes, but also to ensure that once a value is loaded from
				24	* data that the application could potentially modify, it remains stable.
				25	*
				26	* Copyright (C) 2018-2019 Jens Axboe
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	27	* Copyright (c) 2018-2019 Christoph Hellwig
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	28	*/
				29	#include <linux/kernel.h>
				30	#include <linux/init.h>
				31	#include <linux/errno.h>
				32	#include <linux/syscalls.h>
				33	#include <linux/compat.h>
				34	#include <linux/refcount.h>
				35	#include <linux/uio.h>
				36
				37	#include <linux/sched/signal.h>
				38	#include <linux/fs.h>
				39	#include <linux/file.h>
				40	#include <linux/fdtable.h>
				41	#include <linux/mm.h>
				42	#include <linux/mman.h>
				43	#include <linux/mmu_context.h>
				44	#include <linux/percpu.h>
				45	#include <linux/slab.h>
				46	#include <linux/workqueue.h>
				47	#include <linux/blkdev.h>
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	48	#include <linux/bvec.h>
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	49	#include <linux/net.h>
				50	#include <net/sock.h>
				51	#include <net/af_unix.h>
				52	#include <linux/anon_inodes.h>
				53	#include <linux/sched/mm.h>
				54	#include <linux/uaccess.h>
				55	#include <linux/nospec.h>
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	56	#include <linux/sizes.h>
				57	#include <linux/hugetlb.h>
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	58
				59	#include <uapi/linux/io_uring.h>
				60
				61	#include "internal.h"
				62
				63	#define IORING_MAX_ENTRIES 4096
				64
				65	struct io_uring {
				66	u32 head ____cacheline_aligned_in_smp;
				67	u32 tail ____cacheline_aligned_in_smp;
				68	};
				69
				70	struct io_sq_ring {
				71	struct io_uring r;
				72	u32 ring_mask;
				73	u32 ring_entries;
				74	u32 dropped;
				75	u32 flags;
				76	u32 array[];
				77	};
				78
				79	struct io_cq_ring {
				80	struct io_uring r;
				81	u32 ring_mask;
				82	u32 ring_entries;
				83	u32 overflow;
				84	struct io_uring_cqe cqes[];
				85	};
				86
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	87	struct io_mapped_ubuf {
				88	u64 ubuf;
				89	size_t len;
				90	struct bio_vec *bvec;
				91	unsigned int nr_bvecs;
				92	};
				93
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	94	struct io_ring_ctx {
				95	struct {
				96	struct percpu_ref refs;
				97	} ____cacheline_aligned_in_smp;
				98
				99	struct {
				100	unsigned int flags;
				101	bool compat;
				102	bool account_mem;
				103
				104	/* SQ ring */
				105	struct io_sq_ring *sq_ring;
				106	unsigned cached_sq_head;
				107	unsigned sq_entries;
				108	unsigned sq_mask;
				109	struct io_uring_sqe *sq_sqes;
				110	} ____cacheline_aligned_in_smp;
				111
				112	/* IO offload */
				113	struct workqueue_struct *sqo_wq;
				114	struct mm_struct *sqo_mm;
				115
				116	struct {
				117	/* CQ ring */
				118	struct io_cq_ring *cq_ring;
				119	unsigned cached_cq_tail;
				120	unsigned cq_entries;
				121	unsigned cq_mask;
				122	struct wait_queue_head cq_wait;
				123	struct fasync_struct *cq_fasync;
				124	} ____cacheline_aligned_in_smp;
				125
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	126	/* if used, fixed mapped user buffers */
				127	unsigned nr_user_bufs;
				128	struct io_mapped_ubuf *user_bufs;
				129
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	130	struct user_struct *user;
				131
				132	struct completion ctx_done;
				133
				134	struct {
				135	struct mutex uring_lock;
				136	wait_queue_head_t wait;
				137	} ____cacheline_aligned_in_smp;
				138
				139	struct {
				140	spinlock_t completion_lock;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	141	bool poll_multi_file;
				142	/*
				143	* ->poll_list is protected by the ctx->uring_lock for
				144	* io_uring instances that don't use IORING_SETUP_SQPOLL.
				145	* For SQPOLL, only the single threaded io_sq_thread() will
				146	* manipulate the list, hence no extra locking is needed there.
				147	*/
				148	struct list_head poll_list;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	149	} ____cacheline_aligned_in_smp;
				150
				151	#if defined(CONFIG_UNIX)
				152	struct socket *ring_sock;
				153	#endif
				154	};
				155
				156	struct sqe_submit {
				157	const struct io_uring_sqe *sqe;
				158	unsigned short index;
				159	bool has_user;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	160	bool needs_lock;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	161	};
				162
				163	struct io_kiocb {
				164	struct kiocb rw;
				165
				166	struct sqe_submit submit;
				167
				168	struct io_ring_ctx *ctx;
				169	struct list_head list;
				170	unsigned int flags;
				171	#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	172	#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	173	u64 user_data;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	174	u64 error;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	175
				176	struct work_struct work;
				177	};
				178
				179	#define IO_PLUG_THRESHOLD 2
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	180	#define IO_IOPOLL_BATCH 8
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	181
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	182	struct io_submit_state {
				183	struct blk_plug plug;
				184
				185	/*
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	186	* io_kiocb alloc cache
				187	*/
				188	void *reqs[IO_IOPOLL_BATCH];
				189	unsigned int free_reqs;
				190	unsigned int cur_req;
				191
				192	/*
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	193	* File reference cache
				194	*/
				195	struct file *file;
				196	unsigned int fd;
				197	unsigned int has_refs;
				198	unsigned int used_refs;
				199	unsigned int ios_left;
				200	};
				201
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	202	static struct kmem_cache *req_cachep;
				203
				204	static const struct file_operations io_uring_fops;
				205
				206	struct sock io_uring_get_socket(struct file file)
				207	{
				208	#if defined(CONFIG_UNIX)
				209	if (file->f_op == &io_uring_fops) {
				210	struct io_ring_ctx *ctx = file->private_data;
				211
				212	return ctx->ring_sock->sk;
				213	}
				214	#endif
				215	return NULL;
				216	}
				217	EXPORT_SYMBOL(io_uring_get_socket);
				218
				219	static void io_ring_ctx_ref_free(struct percpu_ref *ref)
				220	{
				221	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
				222
				223	complete(&ctx->ctx_done);
				224	}
				225
				226	static struct io_ring_ctx io_ring_ctx_alloc(struct io_uring_params p)
				227	{
				228	struct io_ring_ctx *ctx;
				229
				230	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
				231	if (!ctx)
				232	return NULL;
				233
				234	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
				235	kfree(ctx);
				236	return NULL;
				237	}
				238
				239	ctx->flags = p->flags;
				240	init_waitqueue_head(&ctx->cq_wait);
				241	init_completion(&ctx->ctx_done);
				242	mutex_init(&ctx->uring_lock);
				243	init_waitqueue_head(&ctx->wait);
				244	spin_lock_init(&ctx->completion_lock);
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	245	INIT_LIST_HEAD(&ctx->poll_list);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	246	return ctx;
				247	}
				248
				249	static void io_commit_cqring(struct io_ring_ctx *ctx)
				250	{
				251	struct io_cq_ring *ring = ctx->cq_ring;
				252
				253	if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
				254	/* order cqe stores with ring update */
				255	smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
				256
				257	/*
				258	* Write sider barrier of tail update, app has read side. See
				259	* comment at the top of this file.
				260	*/
				261	smp_wmb();
				262
				263	if (wq_has_sleeper(&ctx->cq_wait)) {
				264	wake_up_interruptible(&ctx->cq_wait);
				265	kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
				266	}
				267	}
				268	}
				269
				270	static struct io_uring_cqe io_get_cqring(struct io_ring_ctx ctx)
				271	{
				272	struct io_cq_ring *ring = ctx->cq_ring;
				273	unsigned tail;
				274
				275	tail = ctx->cached_cq_tail;
				276	/* See comment at the top of the file */
				277	smp_rmb();
				278	if (tail + 1 == READ_ONCE(ring->r.head))
				279	return NULL;
				280
				281	ctx->cached_cq_tail++;
				282	return &ring->cqes[tail & ctx->cq_mask];
				283	}
				284
				285	static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
				286	long res, unsigned ev_flags)
				287	{
				288	struct io_uring_cqe *cqe;
				289
				290	/*
				291	* If we can't get a cq entry, userspace overflowed the
				292	* submission (by quite a lot). Increment the overflow count in
				293	* the ring.
				294	*/
				295	cqe = io_get_cqring(ctx);
				296	if (cqe) {
				297	WRITE_ONCE(cqe->user_data, ki_user_data);
				298	WRITE_ONCE(cqe->res, res);
				299	WRITE_ONCE(cqe->flags, ev_flags);
				300	} else {
				301	unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
				302
				303	WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
				304	}
				305	}
				306
				307	static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
				308	long res, unsigned ev_flags)
				309	{
				310	unsigned long flags;
				311
				312	spin_lock_irqsave(&ctx->completion_lock, flags);
				313	io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
				314	io_commit_cqring(ctx);
				315	spin_unlock_irqrestore(&ctx->completion_lock, flags);
				316
				317	if (waitqueue_active(&ctx->wait))
				318	wake_up(&ctx->wait);
				319	}
				320
				321	static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
				322	{
				323	percpu_ref_put_many(&ctx->refs, refs);
				324
				325	if (waitqueue_active(&ctx->wait))
				326	wake_up(&ctx->wait);
				327	}
				328
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	329	static struct io_kiocb io_get_req(struct io_ring_ctx ctx,
				330	struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	331	{
				332	struct io_kiocb *req;
				333
				334	if (!percpu_ref_tryget(&ctx->refs))
				335	return NULL;
				336
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	337	if (!state) {
				338	req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
				339	if (unlikely(!req))
				340	goto out;
				341	} else if (!state->free_reqs) {
				342	size_t sz;
				343	int ret;
				344
				345	sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
				346	ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
				347	state->reqs);
				348	if (unlikely(ret <= 0))
				349	goto out;
				350	state->free_reqs = ret - 1;
				351	state->cur_req = 1;
				352	req = state->reqs[0];
				353	} else {
				354	req = state->reqs[state->cur_req];
				355	state->free_reqs--;
				356	state->cur_req++;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	357	}
				358
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	359	req->ctx = ctx;
				360	req->flags = 0;
				361	return req;
				362	out:
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	363	io_ring_drop_ctx_refs(ctx, 1);
				364	return NULL;
				365	}
				366
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	367	static void io_free_req_many(struct io_ring_ctx ctx, void reqs, int nr)
				368	{
				369	if (*nr) {
				370	kmem_cache_free_bulk(req_cachep, *nr, reqs);
				371	io_ring_drop_ctx_refs(ctx, *nr);
				372	*nr = 0;
				373	}
				374	}
				375
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	376	static void io_free_req(struct io_kiocb *req)
				377	{
				378	io_ring_drop_ctx_refs(req->ctx, 1);
				379	kmem_cache_free(req_cachep, req);
				380	}
				381
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	382	/*
				383	* Find and free completed poll iocbs
				384	*/
				385	static void io_iopoll_complete(struct io_ring_ctx ctx, unsigned int nr_events,
				386	struct list_head *done)
				387	{
				388	void *reqs[IO_IOPOLL_BATCH];
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	389	int file_count, to_free;
				390	struct file *file = NULL;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	391	struct io_kiocb *req;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	392
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	393	file_count = to_free = 0;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	394	while (!list_empty(done)) {
				395	req = list_first_entry(done, struct io_kiocb, list);
				396	list_del(&req->list);
				397
				398	io_cqring_fill_event(ctx, req->user_data, req->error, 0);
				399
				400	reqs[to_free++] = req;
				401	(*nr_events)++;
				402
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	403	/*
				404	* Batched puts of the same file, to avoid dirtying the
				405	* file usage count multiple times, if avoidable.
				406	*/
				407	if (!file) {
				408	file = req->rw.ki_filp;
				409	file_count = 1;
				410	} else if (file == req->rw.ki_filp) {
				411	file_count++;
				412	} else {
				413	fput_many(file, file_count);
				414	file = req->rw.ki_filp;
				415	file_count = 1;
				416	}
				417
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	418	if (to_free == ARRAY_SIZE(reqs))
				419	io_free_req_many(ctx, reqs, &to_free);
				420	}
				421	io_commit_cqring(ctx);
				422
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	423	if (file)
				424	fput_many(file, file_count);
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	425	io_free_req_many(ctx, reqs, &to_free);
				426	}
				427
				428	static int io_do_iopoll(struct io_ring_ctx ctx, unsigned int nr_events,
				429	long min)
				430	{
				431	struct io_kiocb req, tmp;
				432	LIST_HEAD(done);
				433	bool spin;
				434	int ret;
				435
				436	/*
				437	* Only spin for completions if we don't have multiple devices hanging
				438	* off our complete list, and we're under the requested amount.
				439	*/
				440	spin = !ctx->poll_multi_file && *nr_events < min;
				441
				442	ret = 0;
				443	list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
				444	struct kiocb *kiocb = &req->rw;
				445
				446	/*
				447	* Move completed entries to our local list. If we find a
				448	* request that requires polling, break out and complete
				449	* the done list first, if we have entries there.
				450	*/
				451	if (req->flags & REQ_F_IOPOLL_COMPLETED) {
				452	list_move_tail(&req->list, &done);
				453	continue;
				454	}
				455	if (!list_empty(&done))
				456	break;
				457
				458	ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
				459	if (ret < 0)
				460	break;
				461
				462	if (ret && spin)
				463	spin = false;
				464	ret = 0;
				465	}
				466
				467	if (!list_empty(&done))
				468	io_iopoll_complete(ctx, nr_events, &done);
				469
				470	return ret;
				471	}
				472
				473	/*
				474	* Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
				475	* non-spinning poll check - we'll still enter the driver poll loop, but only
				476	* as a non-spinning completion check.
				477	*/
				478	static int io_iopoll_getevents(struct io_ring_ctx ctx, unsigned int nr_events,
				479	long min)
				480	{
				481	while (!list_empty(&ctx->poll_list)) {
				482	int ret;
				483
				484	ret = io_do_iopoll(ctx, nr_events, min);
				485	if (ret < 0)
				486	return ret;
				487	if (!min \|\| *nr_events >= min)
				488	return 0;
				489	}
				490
				491	return 1;
				492	}
				493
				494	/*
				495	* We can't just wait for polled events to come to us, we have to actively
				496	* find and complete them.
				497	*/
				498	static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
				499	{
				500	if (!(ctx->flags & IORING_SETUP_IOPOLL))
				501	return;
				502
				503	mutex_lock(&ctx->uring_lock);
				504	while (!list_empty(&ctx->poll_list)) {
				505	unsigned int nr_events = 0;
				506
				507	io_iopoll_getevents(ctx, &nr_events, 1);
				508	}
				509	mutex_unlock(&ctx->uring_lock);
				510	}
				511
				512	static int io_iopoll_check(struct io_ring_ctx ctx, unsigned nr_events,
				513	long min)
				514	{
				515	int ret = 0;
				516
				517	do {
				518	int tmin = 0;
				519
				520	if (*nr_events < min)
				521	tmin = min - *nr_events;
				522
				523	ret = io_iopoll_getevents(ctx, nr_events, tmin);
				524	if (ret <= 0)
				525	break;
				526	ret = 0;
				527	} while (min && !*nr_events && !need_resched());
				528
				529	return ret;
				530	}
				531
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	532	static void kiocb_end_write(struct kiocb *kiocb)
				533	{
				534	if (kiocb->ki_flags & IOCB_WRITE) {
				535	struct inode *inode = file_inode(kiocb->ki_filp);
				536
				537	/*
				538	* Tell lockdep we inherited freeze protection from submission
				539	* thread.
				540	*/
				541	if (S_ISREG(inode->i_mode))
				542	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
				543	file_end_write(kiocb->ki_filp);
				544	}
				545	}
				546
				547	static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
				548	{
				549	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
				550
				551	kiocb_end_write(kiocb);
				552
				553	fput(kiocb->ki_filp);
				554	io_cqring_add_event(req->ctx, req->user_data, res, 0);
				555	io_free_req(req);
				556	}
				557
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	558	static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
				559	{
				560	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
				561
				562	kiocb_end_write(kiocb);
				563
				564	req->error = res;
				565	if (res != -EAGAIN)
				566	req->flags \|= REQ_F_IOPOLL_COMPLETED;
				567	}
				568
				569	/*
				570	* After the iocb has been issued, it's safe to be found on the poll list.
				571	* Adding the kiocb to the list AFTER submission ensures that we don't
				572	* find it from a io_iopoll_getevents() thread before the issuer is done
				573	* accessing the kiocb cookie.
				574	*/
				575	static void io_iopoll_req_issued(struct io_kiocb *req)
				576	{
				577	struct io_ring_ctx *ctx = req->ctx;
				578
				579	/*
				580	* Track whether we have multiple files in our lists. This will impact
				581	* how we do polling eventually, not spinning if we're on potentially
				582	* different devices.
				583	*/
				584	if (list_empty(&ctx->poll_list)) {
				585	ctx->poll_multi_file = false;
				586	} else if (!ctx->poll_multi_file) {
				587	struct io_kiocb *list_req;
				588
				589	list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
				590	list);
				591	if (list_req->rw.ki_filp != req->rw.ki_filp)
				592	ctx->poll_multi_file = true;
				593	}
				594
				595	/*
				596	* For fast devices, IO may have already completed. If it has, add
				597	* it to the front so we find it first.
				598	*/
				599	if (req->flags & REQ_F_IOPOLL_COMPLETED)
				600	list_add(&req->list, &ctx->poll_list);
				601	else
				602	list_add_tail(&req->list, &ctx->poll_list);
				603	}
				604
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	605	static void io_file_put(struct io_submit_state state, struct file file)
				606	{
				607	if (!state) {
				608	fput(file);
				609	} else if (state->file) {
				610	int diff = state->has_refs - state->used_refs;
				611
				612	if (diff)
				613	fput_many(state->file, diff);
				614	state->file = NULL;
				615	}
				616	}
				617
				618	/*
				619	* Get as many references to a file as we have IOs left in this submission,
				620	* assuming most submissions are for one file, or at least that each file
				621	* has more than one submission.
				622	*/
				623	static struct file io_file_get(struct io_submit_state state, int fd)
				624	{
				625	if (!state)
				626	return fget(fd);
				627
				628	if (state->file) {
				629	if (state->fd == fd) {
				630	state->used_refs++;
				631	state->ios_left--;
				632	return state->file;
				633	}
				634	io_file_put(state, NULL);
				635	}
				636	state->file = fget_many(fd, state->ios_left);
				637	if (!state->file)
				638	return NULL;
				639
				640	state->fd = fd;
				641	state->has_refs = state->ios_left;
				642	state->used_refs = 1;
				643	state->ios_left--;
				644	return state->file;
				645	}
				646
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	647	/*
				648	* If we tracked the file through the SCM inflight mechanism, we could support
				649	* any file. For now, just ensure that anything potentially problematic is done
				650	* inline.
				651	*/
				652	static bool io_file_supports_async(struct file *file)
				653	{
				654	umode_t mode = file_inode(file)->i_mode;
				655
				656	if (S_ISBLK(mode) \|\| S_ISCHR(mode))
				657	return true;
				658	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
				659	return true;
				660
				661	return false;
				662	}
				663
				664	static int io_prep_rw(struct io_kiocb req, const struct io_uring_sqe sqe,
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	665	bool force_nonblock, struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	666	{
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	667	struct io_ring_ctx *ctx = req->ctx;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	668	struct kiocb *kiocb = &req->rw;
				669	unsigned ioprio;
				670	int fd, ret;
				671
				672	/* For -EAGAIN retry, everything is already prepped */
				673	if (kiocb->ki_filp)
				674	return 0;
				675
				676	fd = READ_ONCE(sqe->fd);
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	677	kiocb->ki_filp = io_file_get(state, fd);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	678	if (unlikely(!kiocb->ki_filp))
				679	return -EBADF;
				680	if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
				681	force_nonblock = false;
				682	kiocb->ki_pos = READ_ONCE(sqe->off);
				683	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
				684	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
				685
				686	ioprio = READ_ONCE(sqe->ioprio);
				687	if (ioprio) {
				688	ret = ioprio_check_cap(ioprio);
				689	if (ret)
				690	goto out_fput;
				691
				692	kiocb->ki_ioprio = ioprio;
				693	} else
				694	kiocb->ki_ioprio = get_current_ioprio();
				695
				696	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
				697	if (unlikely(ret))
				698	goto out_fput;
				699	if (force_nonblock) {
				700	kiocb->ki_flags \|= IOCB_NOWAIT;
				701	req->flags \|= REQ_F_FORCE_NONBLOCK;
				702	}
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	703	if (ctx->flags & IORING_SETUP_IOPOLL) {
				704	ret = -EOPNOTSUPP;
				705	if (!(kiocb->ki_flags & IOCB_DIRECT) \|\|
				706	!kiocb->ki_filp->f_op->iopoll)
				707	goto out_fput;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	708
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	709	req->error = 0;
				710	kiocb->ki_flags \|= IOCB_HIPRI;
				711	kiocb->ki_complete = io_complete_rw_iopoll;
				712	} else {
				713	if (kiocb->ki_flags & IOCB_HIPRI) {
				714	ret = -EINVAL;
				715	goto out_fput;
				716	}
				717	kiocb->ki_complete = io_complete_rw;
				718	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	719	return 0;
				720	out_fput:
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	721	/* in case of error, we didn't use this file reference. drop it. */
				722	if (state)
				723	state->used_refs--;
				724	io_file_put(state, kiocb->ki_filp);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	725	return ret;
				726	}
				727
				728	static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
				729	{
				730	switch (ret) {
				731	case -EIOCBQUEUED:
				732	break;
				733	case -ERESTARTSYS:
				734	case -ERESTARTNOINTR:
				735	case -ERESTARTNOHAND:
				736	case -ERESTART_RESTARTBLOCK:
				737	/*
				738	* We can't just restart the syscall, since previously
				739	* submitted sqes may already be in progress. Just fail this
				740	* IO with EINTR.
				741	*/
				742	ret = -EINTR;
				743	/* fall through */
				744	default:
				745	kiocb->ki_complete(kiocb, ret, 0);
				746	}
				747	}
				748
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	749	static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
				750	const struct io_uring_sqe *sqe,
				751	struct iov_iter *iter)
				752	{
				753	size_t len = READ_ONCE(sqe->len);
				754	struct io_mapped_ubuf *imu;
				755	unsigned index, buf_index;
				756	size_t offset;
				757	u64 buf_addr;
				758
				759	/* attempt to use fixed buffers without having provided iovecs */
				760	if (unlikely(!ctx->user_bufs))
				761	return -EFAULT;
				762
				763	buf_index = READ_ONCE(sqe->buf_index);
				764	if (unlikely(buf_index >= ctx->nr_user_bufs))
				765	return -EFAULT;
				766
				767	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
				768	imu = &ctx->user_bufs[index];
				769	buf_addr = READ_ONCE(sqe->addr);
				770
				771	/* overflow */
				772	if (buf_addr + len < buf_addr)
				773	return -EFAULT;
				774	/* not inside the mapped region */
				775	if (buf_addr < imu->ubuf \|\| buf_addr + len > imu->ubuf + imu->len)
				776	return -EFAULT;
				777
				778	/*
				779	* May not be a start of buffer, set size appropriately
				780	* and advance us to the beginning.
				781	*/
				782	offset = buf_addr - imu->ubuf;
				783	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
				784	if (offset)
				785	iov_iter_advance(iter, offset);
				786	return 0;
				787	}
				788
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	789	static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
				790	const struct sqe_submit s, struct iovec *iovec,
				791	struct iov_iter *iter)
				792	{
				793	const struct io_uring_sqe *sqe = s->sqe;
				794	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
				795	size_t sqe_len = READ_ONCE(sqe->len);
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	796	u8 opcode;
				797
				798	/*
				799	* We're reading ->opcode for the second time, but the first read
				800	* doesn't care whether it's _FIXED or not, so it doesn't matter
				801	* whether ->opcode changes concurrently. The first read does care
				802	* about whether it is a READ or a WRITE, so we don't trust this read
				803	* for that purpose and instead let the caller pass in the read/write
				804	* flag.
				805	*/
				806	opcode = READ_ONCE(sqe->opcode);
				807	if (opcode == IORING_OP_READ_FIXED \|\|
				808	opcode == IORING_OP_WRITE_FIXED) {
				809	ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
				810	*iovec = NULL;
				811	return ret;
				812	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	813
				814	if (!s->has_user)
				815	return -EFAULT;
				816
				817	#ifdef CONFIG_COMPAT
				818	if (ctx->compat)
				819	return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
				820	iovec, iter);
				821	#endif
				822
				823	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
				824	}
				825
				826	static ssize_t io_read(struct io_kiocb req, const struct sqe_submit s,
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	827	bool force_nonblock, struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	828	{
				829	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
				830	struct kiocb *kiocb = &req->rw;
				831	struct iov_iter iter;
				832	struct file *file;
				833	ssize_t ret;
				834
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	835	ret = io_prep_rw(req, s->sqe, force_nonblock, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	836	if (ret)
				837	return ret;
				838	file = kiocb->ki_filp;
				839
				840	ret = -EBADF;
				841	if (unlikely(!(file->f_mode & FMODE_READ)))
				842	goto out_fput;
				843	ret = -EINVAL;
				844	if (unlikely(!file->f_op->read_iter))
				845	goto out_fput;
				846
				847	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
				848	if (ret)
				849	goto out_fput;
				850
				851	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
				852	if (!ret) {
				853	ssize_t ret2;
				854
				855	/* Catch -EAGAIN return for forced non-blocking submission */
				856	ret2 = call_read_iter(file, kiocb, &iter);
				857	if (!force_nonblock \|\| ret2 != -EAGAIN)
				858	io_rw_done(kiocb, ret2);
				859	else
				860	ret = -EAGAIN;
				861	}
				862	kfree(iovec);
				863	out_fput:
				864	/* Hold on to the file for -EAGAIN */
				865	if (unlikely(ret && ret != -EAGAIN))
				866	fput(file);
				867	return ret;
				868	}
				869
				870	static ssize_t io_write(struct io_kiocb req, const struct sqe_submit s,
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	871	bool force_nonblock, struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	872	{
				873	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
				874	struct kiocb *kiocb = &req->rw;
				875	struct iov_iter iter;
				876	struct file *file;
				877	ssize_t ret;
				878
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	879	ret = io_prep_rw(req, s->sqe, force_nonblock, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	880	if (ret)
				881	return ret;
				882	/* Hold on to the file for -EAGAIN */
				883	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
				884	return -EAGAIN;
				885
				886	ret = -EBADF;
				887	file = kiocb->ki_filp;
				888	if (unlikely(!(file->f_mode & FMODE_WRITE)))
				889	goto out_fput;
				890	ret = -EINVAL;
				891	if (unlikely(!file->f_op->write_iter))
				892	goto out_fput;
				893
				894	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
				895	if (ret)
				896	goto out_fput;
				897
				898	ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
				899	iov_iter_count(&iter));
				900	if (!ret) {
				901	/*
				902	* Open-code file_start_write here to grab freeze protection,
				903	* which will be released by another thread in
				904	* io_complete_rw(). Fool lockdep by telling it the lock got
				905	* released so that it doesn't complain about the held lock when
				906	* we return to userspace.
				907	*/
				908	if (S_ISREG(file_inode(file)->i_mode)) {
				909	__sb_start_write(file_inode(file)->i_sb,
				910	SB_FREEZE_WRITE, true);
				911	__sb_writers_release(file_inode(file)->i_sb,
				912	SB_FREEZE_WRITE);
				913	}
				914	kiocb->ki_flags \|= IOCB_WRITE;
				915	io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
				916	}
				917	kfree(iovec);
				918	out_fput:
				919	if (unlikely(ret))
				920	fput(file);
				921	return ret;
				922	}
				923
				924	/*
				925	* IORING_OP_NOP just posts a completion event, nothing else.
				926	*/
				927	static int io_nop(struct io_kiocb *req, u64 user_data)
				928	{
				929	struct io_ring_ctx *ctx = req->ctx;
				930	long err = 0;
				931
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	932	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
				933	return -EINVAL;
				934
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	935	/*
				936	* Twilight zone - it's possible that someone issued an opcode that
				937	* has a file attached, then got -EAGAIN on submission, and changed
				938	* the sqe before we retried it from async context. Avoid dropping
				939	* a file reference for this malicious case, and flag the error.
				940	*/
				941	if (req->rw.ki_filp) {
				942	err = -EBADF;
				943	fput(req->rw.ki_filp);
				944	}
				945	io_cqring_add_event(ctx, user_data, err, 0);
				946	io_free_req(req);
				947	return 0;
				948	}
				949
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	950	static int io_prep_fsync(struct io_kiocb req, const struct io_uring_sqe sqe)
				951	{
				952	int fd;
				953
				954	/* Prep already done */
				955	if (req->rw.ki_filp)
				956	return 0;
				957
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	958	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				959	return -EINVAL;
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	960	if (unlikely(sqe->addr \|\| sqe->ioprio \|\| sqe->buf_index))
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	961	return -EINVAL;
				962
				963	fd = READ_ONCE(sqe->fd);
				964	req->rw.ki_filp = fget(fd);
				965	if (unlikely(!req->rw.ki_filp))
				966	return -EBADF;
				967
				968	return 0;
				969	}
				970
				971	static int io_fsync(struct io_kiocb req, const struct io_uring_sqe sqe,
				972	bool force_nonblock)
				973	{
				974	loff_t sqe_off = READ_ONCE(sqe->off);
				975	loff_t sqe_len = READ_ONCE(sqe->len);
				976	loff_t end = sqe_off + sqe_len;
				977	unsigned fsync_flags;
				978	int ret;
				979
				980	fsync_flags = READ_ONCE(sqe->fsync_flags);
				981	if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
				982	return -EINVAL;
				983
				984	ret = io_prep_fsync(req, sqe);
				985	if (ret)
				986	return ret;
				987
				988	/* fsync always requires a blocking context */
				989	if (force_nonblock)
				990	return -EAGAIN;
				991
				992	ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
				993	end > 0 ? end : LLONG_MAX,
				994	fsync_flags & IORING_FSYNC_DATASYNC);
				995
				996	fput(req->rw.ki_filp);
				997	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
				998	io_free_req(req);
				999	return 0;
				1000	}
				1001
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1002	static int __io_submit_sqe(struct io_ring_ctx ctx, struct io_kiocb req,
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1003	const struct sqe_submit *s, bool force_nonblock,
				1004	struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1005	{
				1006	ssize_t ret;
				1007	int opcode;
				1008
				1009	if (unlikely(s->index >= ctx->sq_entries))
				1010	return -EINVAL;
				1011	req->user_data = READ_ONCE(s->sqe->user_data);
				1012
				1013	opcode = READ_ONCE(s->sqe->opcode);
				1014	switch (opcode) {
				1015	case IORING_OP_NOP:
				1016	ret = io_nop(req, req->user_data);
				1017	break;
				1018	case IORING_OP_READV:
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	1019	if (unlikely(s->sqe->buf_index))
				1020	return -EINVAL;
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1021	ret = io_read(req, s, force_nonblock, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1022	break;
				1023	case IORING_OP_WRITEV:
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	1024	if (unlikely(s->sqe->buf_index))
				1025	return -EINVAL;
				1026	ret = io_write(req, s, force_nonblock, state);
				1027	break;
				1028	case IORING_OP_READ_FIXED:
				1029	ret = io_read(req, s, force_nonblock, state);
				1030	break;
				1031	case IORING_OP_WRITE_FIXED:
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1032	ret = io_write(req, s, force_nonblock, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1033	break;
Christoph Hellwig	c992fe2	2019-01-11 09:43:02 -0700	[diff] [blame]	1034	case IORING_OP_FSYNC:
				1035	ret = io_fsync(req, s->sqe, force_nonblock);
				1036	break;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1037	default:
				1038	ret = -EINVAL;
				1039	break;
				1040	}
				1041
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1042	if (ret)
				1043	return ret;
				1044
				1045	if (ctx->flags & IORING_SETUP_IOPOLL) {
				1046	if (req->error == -EAGAIN)
				1047	return -EAGAIN;
				1048
				1049	/* workqueue context doesn't hold uring_lock, grab it now */
				1050	if (s->needs_lock)
				1051	mutex_lock(&ctx->uring_lock);
				1052	io_iopoll_req_issued(req);
				1053	if (s->needs_lock)
				1054	mutex_unlock(&ctx->uring_lock);
				1055	}
				1056
				1057	return 0;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1058	}
				1059
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	1060	static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
				1061	{
				1062	u8 opcode = READ_ONCE(sqe->opcode);
				1063
				1064	return !(opcode == IORING_OP_READ_FIXED \|\|
				1065	opcode == IORING_OP_WRITE_FIXED);
				1066	}
				1067
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1068	static void io_sq_wq_submit_work(struct work_struct *work)
				1069	{
				1070	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
				1071	struct sqe_submit *s = &req->submit;
				1072	const struct io_uring_sqe *sqe = s->sqe;
				1073	struct io_ring_ctx *ctx = req->ctx;
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	1074	mm_segment_t old_fs;
				1075	bool needs_user;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1076	int ret;
				1077
				1078	/* Ensure we clear previously set forced non-block flag */
				1079	req->flags &= ~REQ_F_FORCE_NONBLOCK;
				1080	req->rw.ki_flags &= ~IOCB_NOWAIT;
				1081
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1082	s->needs_lock = true;
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	1083	s->has_user = false;
				1084
				1085	/*
				1086	* If we're doing IO to fixed buffers, we don't need to get/set
				1087	* user context
				1088	*/
				1089	needs_user = io_sqe_needs_user(s->sqe);
				1090	if (needs_user) {
				1091	if (!mmget_not_zero(ctx->sqo_mm)) {
				1092	ret = -EFAULT;
				1093	goto err;
				1094	}
				1095	use_mm(ctx->sqo_mm);
				1096	old_fs = get_fs();
				1097	set_fs(USER_DS);
				1098	s->has_user = true;
				1099	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1100
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1101	do {
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1102	ret = __io_submit_sqe(ctx, req, s, false, NULL);
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1103	/*
				1104	* We can get EAGAIN for polled IO even though we're forcing
				1105	* a sync submission from here, since we can't wait for
				1106	* request slots on the block side.
				1107	*/
				1108	if (ret != -EAGAIN)
				1109	break;
				1110	cond_resched();
				1111	} while (1);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1112
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	1113	if (needs_user) {
				1114	set_fs(old_fs);
				1115	unuse_mm(ctx->sqo_mm);
				1116	mmput(ctx->sqo_mm);
				1117	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1118	err:
				1119	if (ret) {
				1120	io_cqring_add_event(ctx, sqe->user_data, ret, 0);
				1121	io_free_req(req);
				1122	}
				1123
				1124	/* async context always use a copy of the sqe */
				1125	kfree(sqe);
				1126	}
				1127
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1128	static int io_submit_sqe(struct io_ring_ctx ctx, struct sqe_submit s,
				1129	struct io_submit_state *state)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1130	{
				1131	struct io_kiocb *req;
				1132	ssize_t ret;
				1133
				1134	/* enforce forwards compatibility on users */
				1135	if (unlikely(s->sqe->flags))
				1136	return -EINVAL;
				1137
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	1138	req = io_get_req(ctx, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1139	if (unlikely(!req))
				1140	return -EAGAIN;
				1141
				1142	req->rw.ki_filp = NULL;
				1143
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1144	ret = __io_submit_sqe(ctx, req, s, true, state);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1145	if (ret == -EAGAIN) {
				1146	struct io_uring_sqe *sqe_copy;
				1147
				1148	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
				1149	if (sqe_copy) {
				1150	memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
				1151	s->sqe = sqe_copy;
				1152
				1153	memcpy(&req->submit, s, sizeof(*s));
				1154	INIT_WORK(&req->work, io_sq_wq_submit_work);
				1155	queue_work(ctx->sqo_wq, &req->work);
				1156	ret = 0;
				1157	}
				1158	}
				1159	if (ret)
				1160	io_free_req(req);
				1161
				1162	return ret;
				1163	}
				1164
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1165	/*
				1166	* Batched submission is done, ensure local IO is flushed out.
				1167	*/
				1168	static void io_submit_state_end(struct io_submit_state *state)
				1169	{
				1170	blk_finish_plug(&state->plug);
				1171	io_file_put(state, NULL);
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	1172	if (state->free_reqs)
				1173	kmem_cache_free_bulk(req_cachep, state->free_reqs,
				1174	&state->reqs[state->cur_req]);
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1175	}
				1176
				1177	/*
				1178	* Start submission side cache.
				1179	*/
				1180	static void io_submit_state_start(struct io_submit_state *state,
				1181	struct io_ring_ctx *ctx, unsigned max_ios)
				1182	{
				1183	blk_start_plug(&state->plug);
Jens Axboe	2579f91	2019-01-09 09:10:43 -0700	[diff] [blame]	1184	state->free_reqs = 0;
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1185	state->file = NULL;
				1186	state->ios_left = max_ios;
				1187	}
				1188
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1189	static void io_commit_sqring(struct io_ring_ctx *ctx)
				1190	{
				1191	struct io_sq_ring *ring = ctx->sq_ring;
				1192
				1193	if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
				1194	/*
				1195	* Ensure any loads from the SQEs are done at this point,
				1196	* since once we write the new head, the application could
				1197	* write new data to them.
				1198	*/
				1199	smp_store_release(&ring->r.head, ctx->cached_sq_head);
				1200
				1201	/*
				1202	* write side barrier of head update, app has read side. See
				1203	* comment at the top of this file
				1204	*/
				1205	smp_wmb();
				1206	}
				1207	}
				1208
				1209	/*
				1210	* Undo last io_get_sqring()
				1211	*/
				1212	static void io_drop_sqring(struct io_ring_ctx *ctx)
				1213	{
				1214	ctx->cached_sq_head--;
				1215	}
				1216
				1217	/*
				1218	* Fetch an sqe, if one is available. Note that s->sqe will point to memory
				1219	* that is mapped by userspace. This means that care needs to be taken to
				1220	* ensure that reads are stable, as we cannot rely on userspace always
				1221	* being a good citizen. If members of the sqe are validated and then later
				1222	* used, it's important that those reads are done through READ_ONCE() to
				1223	* prevent a re-load down the line.
				1224	*/
				1225	static bool io_get_sqring(struct io_ring_ctx ctx, struct sqe_submit s)
				1226	{
				1227	struct io_sq_ring *ring = ctx->sq_ring;
				1228	unsigned head;
				1229
				1230	/*
				1231	* The cached sq head (or cq tail) serves two purposes:
				1232	*
				1233	* 1) allows us to batch the cost of updating the user visible
				1234	* head updates.
				1235	* 2) allows the kernel side to track the head on its own, even
				1236	* though the application is the one updating it.
				1237	*/
				1238	head = ctx->cached_sq_head;
				1239	/* See comment at the top of this file */
				1240	smp_rmb();
				1241	if (head == READ_ONCE(ring->r.tail))
				1242	return false;
				1243
				1244	head = READ_ONCE(ring->array[head & ctx->sq_mask]);
				1245	if (head < ctx->sq_entries) {
				1246	s->index = head;
				1247	s->sqe = &ctx->sq_sqes[head];
				1248	ctx->cached_sq_head++;
				1249	return true;
				1250	}
				1251
				1252	/* drop invalid entries */
				1253	ctx->cached_sq_head++;
				1254	ring->dropped++;
				1255	/* See comment at the top of this file */
				1256	smp_wmb();
				1257	return false;
				1258	}
				1259
				1260	static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
				1261	{
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1262	struct io_submit_state state, *statep = NULL;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1263	int i, ret = 0, submit = 0;
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1264
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1265	if (to_submit > IO_PLUG_THRESHOLD) {
				1266	io_submit_state_start(&state, ctx, to_submit);
				1267	statep = &state;
				1268	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1269
				1270	for (i = 0; i < to_submit; i++) {
				1271	struct sqe_submit s;
				1272
				1273	if (!io_get_sqring(ctx, &s))
				1274	break;
				1275
				1276	s.has_user = true;
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1277	s.needs_lock = false;
				1278
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1279	ret = io_submit_sqe(ctx, &s, statep);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1280	if (ret) {
				1281	io_drop_sqring(ctx);
				1282	break;
				1283	}
				1284
				1285	submit++;
				1286	}
				1287	io_commit_sqring(ctx);
				1288
Jens Axboe	9a56a23	2019-01-09 09:06:50 -0700	[diff] [blame]	1289	if (statep)
				1290	io_submit_state_end(statep);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1291
				1292	return submit ? submit : ret;
				1293	}
				1294
				1295	static unsigned io_cqring_events(struct io_cq_ring *ring)
				1296	{
				1297	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
				1298	}
				1299
				1300	/*
				1301	* Wait until events become available, if we don't already have some. The
				1302	* application must reap them itself, as they reside on the shared cq ring.
				1303	*/
				1304	static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
				1305	const sigset_t __user *sig, size_t sigsz)
				1306	{
				1307	struct io_cq_ring *ring = ctx->cq_ring;
				1308	sigset_t ksigmask, sigsaved;
				1309	DEFINE_WAIT(wait);
				1310	int ret;
				1311
				1312	/* See comment at the top of this file */
				1313	smp_rmb();
				1314	if (io_cqring_events(ring) >= min_events)
				1315	return 0;
				1316
				1317	if (sig) {
				1318	ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
				1319	if (ret)
				1320	return ret;
				1321	}
				1322
				1323	do {
				1324	prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
				1325
				1326	ret = 0;
				1327	/* See comment at the top of this file */
				1328	smp_rmb();
				1329	if (io_cqring_events(ring) >= min_events)
				1330	break;
				1331
				1332	schedule();
				1333
				1334	ret = -EINTR;
				1335	if (signal_pending(current))
				1336	break;
				1337	} while (1);
				1338
				1339	finish_wait(&ctx->wait, &wait);
				1340
				1341	if (sig)
				1342	restore_user_sigmask(sig, &sigsaved);
				1343
				1344	return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
				1345	}
				1346
				1347	static int io_sq_offload_start(struct io_ring_ctx *ctx)
				1348	{
				1349	int ret;
				1350
				1351	mmgrab(current->mm);
				1352	ctx->sqo_mm = current->mm;
				1353
				1354	/* Do QD, or 2 * CPUS, whatever is smallest */
				1355	ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND \| WQ_FREEZABLE,
				1356	min(ctx->sq_entries - 1, 2 * num_online_cpus()));
				1357	if (!ctx->sqo_wq) {
				1358	ret = -ENOMEM;
				1359	goto err;
				1360	}
				1361
				1362	return 0;
				1363	err:
				1364	mmdrop(ctx->sqo_mm);
				1365	ctx->sqo_mm = NULL;
				1366	return ret;
				1367	}
				1368
				1369	static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
				1370	{
				1371	atomic_long_sub(nr_pages, &user->locked_vm);
				1372	}
				1373
				1374	static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
				1375	{
				1376	unsigned long page_limit, cur_pages, new_pages;
				1377
				1378	/* Don't allow more pages than we can safely lock */
				1379	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
				1380
				1381	do {
				1382	cur_pages = atomic_long_read(&user->locked_vm);
				1383	new_pages = cur_pages + nr_pages;
				1384	if (new_pages > page_limit)
				1385	return -ENOMEM;
				1386	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
				1387	new_pages) != cur_pages);
				1388
				1389	return 0;
				1390	}
				1391
				1392	static void io_mem_free(void *ptr)
				1393	{
				1394	struct page *page = virt_to_head_page(ptr);
				1395
				1396	if (put_page_testzero(page))
				1397	free_compound_page(page);
				1398	}
				1399
				1400	static void *io_mem_alloc(size_t size)
				1401	{
				1402	gfp_t gfp_flags = GFP_KERNEL \| __GFP_ZERO \| __GFP_NOWARN \| __GFP_COMP \|
				1403	__GFP_NORETRY;
				1404
				1405	return (void *) __get_free_pages(gfp_flags, get_order(size));
				1406	}
				1407
				1408	static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
				1409	{
				1410	struct io_sq_ring *sq_ring;
				1411	struct io_cq_ring *cq_ring;
				1412	size_t bytes;
				1413
				1414	bytes = struct_size(sq_ring, array, sq_entries);
				1415	bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
				1416	bytes += struct_size(cq_ring, cqes, cq_entries);
				1417
				1418	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
				1419	}
				1420
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	1421	static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
				1422	{
				1423	int i, j;
				1424
				1425	if (!ctx->user_bufs)
				1426	return -ENXIO;
				1427
				1428	for (i = 0; i < ctx->nr_user_bufs; i++) {
				1429	struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
				1430
				1431	for (j = 0; j < imu->nr_bvecs; j++)
				1432	put_page(imu->bvec[j].bv_page);
				1433
				1434	if (ctx->account_mem)
				1435	io_unaccount_mem(ctx->user, imu->nr_bvecs);
				1436	kfree(imu->bvec);
				1437	imu->nr_bvecs = 0;
				1438	}
				1439
				1440	kfree(ctx->user_bufs);
				1441	ctx->user_bufs = NULL;
				1442	ctx->nr_user_bufs = 0;
				1443	return 0;
				1444	}
				1445
				1446	static int io_copy_iov(struct io_ring_ctx ctx, struct iovec dst,
				1447	void __user *arg, unsigned index)
				1448	{
				1449	struct iovec __user *src;
				1450
				1451	#ifdef CONFIG_COMPAT
				1452	if (ctx->compat) {
				1453	struct compat_iovec __user *ciovs;
				1454	struct compat_iovec ciov;
				1455
				1456	ciovs = (struct compat_iovec __user *) arg;
				1457	if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
				1458	return -EFAULT;
				1459
				1460	dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
				1461	dst->iov_len = ciov.iov_len;
				1462	return 0;
				1463	}
				1464	#endif
				1465	src = (struct iovec __user *) arg;
				1466	if (copy_from_user(dst, &src[index], sizeof(*dst)))
				1467	return -EFAULT;
				1468	return 0;
				1469	}
				1470
				1471	static int io_sqe_buffer_register(struct io_ring_ctx ctx, void __user arg,
				1472	unsigned nr_args)
				1473	{
				1474	struct vm_area_struct **vmas = NULL;
				1475	struct page **pages = NULL;
				1476	int i, j, got_pages = 0;
				1477	int ret = -EINVAL;
				1478
				1479	if (ctx->user_bufs)
				1480	return -EBUSY;
				1481	if (!nr_args \|\| nr_args > UIO_MAXIOV)
				1482	return -EINVAL;
				1483
				1484	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
				1485	GFP_KERNEL);
				1486	if (!ctx->user_bufs)
				1487	return -ENOMEM;
				1488
				1489	for (i = 0; i < nr_args; i++) {
				1490	struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
				1491	unsigned long off, start, end, ubuf;
				1492	int pret, nr_pages;
				1493	struct iovec iov;
				1494	size_t size;
				1495
				1496	ret = io_copy_iov(ctx, &iov, arg, i);
				1497	if (ret)
				1498	break;
				1499
				1500	/*
				1501	* Don't impose further limits on the size and buffer
				1502	* constraints here, we'll -EINVAL later when IO is
				1503	* submitted if they are wrong.
				1504	*/
				1505	ret = -EFAULT;
				1506	if (!iov.iov_base \|\| !iov.iov_len)
				1507	goto err;
				1508
				1509	/* arbitrary limit, but we need something */
				1510	if (iov.iov_len > SZ_1G)
				1511	goto err;
				1512
				1513	ubuf = (unsigned long) iov.iov_base;
				1514	end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				1515	start = ubuf >> PAGE_SHIFT;
				1516	nr_pages = end - start;
				1517
				1518	if (ctx->account_mem) {
				1519	ret = io_account_mem(ctx->user, nr_pages);
				1520	if (ret)
				1521	goto err;
				1522	}
				1523
				1524	ret = 0;
				1525	if (!pages \|\| nr_pages > got_pages) {
				1526	kfree(vmas);
				1527	kfree(pages);
				1528	pages = kmalloc_array(nr_pages, sizeof(struct page *),
				1529	GFP_KERNEL);
				1530	vmas = kmalloc_array(nr_pages,
				1531	sizeof(struct vm_area_struct *),
				1532	GFP_KERNEL);
				1533	if (!pages \|\| !vmas) {
				1534	ret = -ENOMEM;
				1535	if (ctx->account_mem)
				1536	io_unaccount_mem(ctx->user, nr_pages);
				1537	goto err;
				1538	}
				1539	got_pages = nr_pages;
				1540	}
				1541
				1542	imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
				1543	GFP_KERNEL);
				1544	ret = -ENOMEM;
				1545	if (!imu->bvec) {
				1546	if (ctx->account_mem)
				1547	io_unaccount_mem(ctx->user, nr_pages);
				1548	goto err;
				1549	}
				1550
				1551	ret = 0;
				1552	down_read(&current->mm->mmap_sem);
				1553	pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
				1554	pages, vmas);
				1555	if (pret == nr_pages) {
				1556	/* don't support file backed memory */
				1557	for (j = 0; j < nr_pages; j++) {
				1558	struct vm_area_struct *vma = vmas[j];
				1559
				1560	if (vma->vm_file &&
				1561	!is_file_hugepages(vma->vm_file)) {
				1562	ret = -EOPNOTSUPP;
				1563	break;
				1564	}
				1565	}
				1566	} else {
				1567	ret = pret < 0 ? pret : -EFAULT;
				1568	}
				1569	up_read(&current->mm->mmap_sem);
				1570	if (ret) {
				1571	/*
				1572	* if we did partial map, or found file backed vmas,
				1573	* release any pages we did get
				1574	*/
				1575	if (pret > 0) {
				1576	for (j = 0; j < pret; j++)
				1577	put_page(pages[j]);
				1578	}
				1579	if (ctx->account_mem)
				1580	io_unaccount_mem(ctx->user, nr_pages);
				1581	goto err;
				1582	}
				1583
				1584	off = ubuf & ~PAGE_MASK;
				1585	size = iov.iov_len;
				1586	for (j = 0; j < nr_pages; j++) {
				1587	size_t vec_len;
				1588
				1589	vec_len = min_t(size_t, size, PAGE_SIZE - off);
				1590	imu->bvec[j].bv_page = pages[j];
				1591	imu->bvec[j].bv_len = vec_len;
				1592	imu->bvec[j].bv_offset = off;
				1593	off = 0;
				1594	size -= vec_len;
				1595	}
				1596	/* store original address for later verification */
				1597	imu->ubuf = ubuf;
				1598	imu->len = iov.iov_len;
				1599	imu->nr_bvecs = nr_pages;
				1600
				1601	ctx->nr_user_bufs++;
				1602	}
				1603	kfree(pages);
				1604	kfree(vmas);
				1605	return 0;
				1606	err:
				1607	kfree(pages);
				1608	kfree(vmas);
				1609	io_sqe_buffer_unregister(ctx);
				1610	return ret;
				1611	}
				1612
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1613	static void io_ring_ctx_free(struct io_ring_ctx *ctx)
				1614	{
				1615	if (ctx->sqo_wq)
				1616	destroy_workqueue(ctx->sqo_wq);
				1617	if (ctx->sqo_mm)
				1618	mmdrop(ctx->sqo_mm);
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1619
				1620	io_iopoll_reap_events(ctx);
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	1621	io_sqe_buffer_unregister(ctx);
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1622
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1623	#if defined(CONFIG_UNIX)
				1624	if (ctx->ring_sock)
				1625	sock_release(ctx->ring_sock);
				1626	#endif
				1627
				1628	io_mem_free(ctx->sq_ring);
				1629	io_mem_free(ctx->sq_sqes);
				1630	io_mem_free(ctx->cq_ring);
				1631
				1632	percpu_ref_exit(&ctx->refs);
				1633	if (ctx->account_mem)
				1634	io_unaccount_mem(ctx->user,
				1635	ring_pages(ctx->sq_entries, ctx->cq_entries));
				1636	free_uid(ctx->user);
				1637	kfree(ctx);
				1638	}
				1639
				1640	static __poll_t io_uring_poll(struct file file, poll_table wait)
				1641	{
				1642	struct io_ring_ctx *ctx = file->private_data;
				1643	__poll_t mask = 0;
				1644
				1645	poll_wait(file, &ctx->cq_wait, wait);
				1646	/* See comment at the top of this file */
				1647	smp_rmb();
				1648	if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
				1649	mask \|= EPOLLOUT \| EPOLLWRNORM;
				1650	if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
				1651	mask \|= EPOLLIN \| EPOLLRDNORM;
				1652
				1653	return mask;
				1654	}
				1655
				1656	static int io_uring_fasync(int fd, struct file *file, int on)
				1657	{
				1658	struct io_ring_ctx *ctx = file->private_data;
				1659
				1660	return fasync_helper(fd, file, on, &ctx->cq_fasync);
				1661	}
				1662
				1663	static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
				1664	{
				1665	mutex_lock(&ctx->uring_lock);
				1666	percpu_ref_kill(&ctx->refs);
				1667	mutex_unlock(&ctx->uring_lock);
				1668
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1669	io_iopoll_reap_events(ctx);
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1670	wait_for_completion(&ctx->ctx_done);
				1671	io_ring_ctx_free(ctx);
				1672	}
				1673
				1674	static int io_uring_release(struct inode inode, struct file file)
				1675	{
				1676	struct io_ring_ctx *ctx = file->private_data;
				1677
				1678	file->private_data = NULL;
				1679	io_ring_ctx_wait_and_kill(ctx);
				1680	return 0;
				1681	}
				1682
				1683	static int io_uring_mmap(struct file file, struct vm_area_struct vma)
				1684	{
				1685	loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
				1686	unsigned long sz = vma->vm_end - vma->vm_start;
				1687	struct io_ring_ctx *ctx = file->private_data;
				1688	unsigned long pfn;
				1689	struct page *page;
				1690	void *ptr;
				1691
				1692	switch (offset) {
				1693	case IORING_OFF_SQ_RING:
				1694	ptr = ctx->sq_ring;
				1695	break;
				1696	case IORING_OFF_SQES:
				1697	ptr = ctx->sq_sqes;
				1698	break;
				1699	case IORING_OFF_CQ_RING:
				1700	ptr = ctx->cq_ring;
				1701	break;
				1702	default:
				1703	return -EINVAL;
				1704	}
				1705
				1706	page = virt_to_head_page(ptr);
				1707	if (sz > (PAGE_SIZE << compound_order(page)))
				1708	return -EINVAL;
				1709
				1710	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
				1711	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
				1712	}
				1713
				1714	SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
				1715	u32, min_complete, u32, flags, const sigset_t __user *, sig,
				1716	size_t, sigsz)
				1717	{
				1718	struct io_ring_ctx *ctx;
				1719	long ret = -EBADF;
				1720	int submitted = 0;
				1721	struct fd f;
				1722
				1723	if (flags & ~IORING_ENTER_GETEVENTS)
				1724	return -EINVAL;
				1725
				1726	f = fdget(fd);
				1727	if (!f.file)
				1728	return -EBADF;
				1729
				1730	ret = -EOPNOTSUPP;
				1731	if (f.file->f_op != &io_uring_fops)
				1732	goto out_fput;
				1733
				1734	ret = -ENXIO;
				1735	ctx = f.file->private_data;
				1736	if (!percpu_ref_tryget(&ctx->refs))
				1737	goto out_fput;
				1738
				1739	ret = 0;
				1740	if (to_submit) {
				1741	to_submit = min(to_submit, ctx->sq_entries);
				1742
				1743	mutex_lock(&ctx->uring_lock);
				1744	submitted = io_ring_submit(ctx, to_submit);
				1745	mutex_unlock(&ctx->uring_lock);
				1746
				1747	if (submitted < 0)
				1748	goto out_ctx;
				1749	}
				1750	if (flags & IORING_ENTER_GETEVENTS) {
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1751	unsigned nr_events = 0;
				1752
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1753	min_complete = min(min_complete, ctx->cq_entries);
				1754
				1755	/*
				1756	* The application could have included the 'to_submit' count
				1757	* in how many events it wanted to wait for. If we failed to
				1758	* submit the desired count, we may need to adjust the number
				1759	* of events to poll/wait for.
				1760	*/
				1761	if (submitted < to_submit)
				1762	min_complete = min_t(unsigned, submitted, min_complete);
				1763
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1764	if (ctx->flags & IORING_SETUP_IOPOLL) {
				1765	mutex_lock(&ctx->uring_lock);
				1766	ret = io_iopoll_check(ctx, &nr_events, min_complete);
				1767	mutex_unlock(&ctx->uring_lock);
				1768	} else {
				1769	ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
				1770	}
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1771	}
				1772
				1773	out_ctx:
				1774	io_ring_drop_ctx_refs(ctx, 1);
				1775	out_fput:
				1776	fdput(f);
				1777	return submitted ? submitted : ret;
				1778	}
				1779
				1780	static const struct file_operations io_uring_fops = {
				1781	.release = io_uring_release,
				1782	.mmap = io_uring_mmap,
				1783	.poll = io_uring_poll,
				1784	.fasync = io_uring_fasync,
				1785	};
				1786
				1787	static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
				1788	struct io_uring_params *p)
				1789	{
				1790	struct io_sq_ring *sq_ring;
				1791	struct io_cq_ring *cq_ring;
				1792	size_t size;
				1793
				1794	sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
				1795	if (!sq_ring)
				1796	return -ENOMEM;
				1797
				1798	ctx->sq_ring = sq_ring;
				1799	sq_ring->ring_mask = p->sq_entries - 1;
				1800	sq_ring->ring_entries = p->sq_entries;
				1801	ctx->sq_mask = sq_ring->ring_mask;
				1802	ctx->sq_entries = sq_ring->ring_entries;
				1803
				1804	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
				1805	if (size == SIZE_MAX)
				1806	return -EOVERFLOW;
				1807
				1808	ctx->sq_sqes = io_mem_alloc(size);
				1809	if (!ctx->sq_sqes) {
				1810	io_mem_free(ctx->sq_ring);
				1811	return -ENOMEM;
				1812	}
				1813
				1814	cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
				1815	if (!cq_ring) {
				1816	io_mem_free(ctx->sq_ring);
				1817	io_mem_free(ctx->sq_sqes);
				1818	return -ENOMEM;
				1819	}
				1820
				1821	ctx->cq_ring = cq_ring;
				1822	cq_ring->ring_mask = p->cq_entries - 1;
				1823	cq_ring->ring_entries = p->cq_entries;
				1824	ctx->cq_mask = cq_ring->ring_mask;
				1825	ctx->cq_entries = cq_ring->ring_entries;
				1826	return 0;
				1827	}
				1828
				1829	/*
				1830	* Allocate an anonymous fd, this is what constitutes the application
				1831	* visible backing of an io_uring instance. The application mmaps this
				1832	* fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
				1833	* we have to tie this fd to a socket for file garbage collection purposes.
				1834	*/
				1835	static int io_uring_get_fd(struct io_ring_ctx *ctx)
				1836	{
				1837	struct file *file;
				1838	int ret;
				1839
				1840	#if defined(CONFIG_UNIX)
				1841	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
				1842	&ctx->ring_sock);
				1843	if (ret)
				1844	return ret;
				1845	#endif
				1846
				1847	ret = get_unused_fd_flags(O_RDWR \| O_CLOEXEC);
				1848	if (ret < 0)
				1849	goto err;
				1850
				1851	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
				1852	O_RDWR \| O_CLOEXEC);
				1853	if (IS_ERR(file)) {
				1854	put_unused_fd(ret);
				1855	ret = PTR_ERR(file);
				1856	goto err;
				1857	}
				1858
				1859	#if defined(CONFIG_UNIX)
				1860	ctx->ring_sock->file = file;
				1861	#endif
				1862	fd_install(ret, file);
				1863	return ret;
				1864	err:
				1865	#if defined(CONFIG_UNIX)
				1866	sock_release(ctx->ring_sock);
				1867	ctx->ring_sock = NULL;
				1868	#endif
				1869	return ret;
				1870	}
				1871
				1872	static int io_uring_create(unsigned entries, struct io_uring_params *p)
				1873	{
				1874	struct user_struct *user = NULL;
				1875	struct io_ring_ctx *ctx;
				1876	bool account_mem;
				1877	int ret;
				1878
				1879	if (!entries \|\| entries > IORING_MAX_ENTRIES)
				1880	return -EINVAL;
				1881
				1882	/*
				1883	* Use twice as many entries for the CQ ring. It's possible for the
				1884	* application to drive a higher depth than the size of the SQ ring,
				1885	* since the sqes are only used at submission time. This allows for
				1886	* some flexibility in overcommitting a bit.
				1887	*/
				1888	p->sq_entries = roundup_pow_of_two(entries);
				1889	p->cq_entries = 2 * p->sq_entries;
				1890
				1891	user = get_uid(current_user());
				1892	account_mem = !capable(CAP_IPC_LOCK);
				1893
				1894	if (account_mem) {
				1895	ret = io_account_mem(user,
				1896	ring_pages(p->sq_entries, p->cq_entries));
				1897	if (ret) {
				1898	free_uid(user);
				1899	return ret;
				1900	}
				1901	}
				1902
				1903	ctx = io_ring_ctx_alloc(p);
				1904	if (!ctx) {
				1905	if (account_mem)
				1906	io_unaccount_mem(user, ring_pages(p->sq_entries,
				1907	p->cq_entries));
				1908	free_uid(user);
				1909	return -ENOMEM;
				1910	}
				1911	ctx->compat = in_compat_syscall();
				1912	ctx->account_mem = account_mem;
				1913	ctx->user = user;
				1914
				1915	ret = io_allocate_scq_urings(ctx, p);
				1916	if (ret)
				1917	goto err;
				1918
				1919	ret = io_sq_offload_start(ctx);
				1920	if (ret)
				1921	goto err;
				1922
				1923	ret = io_uring_get_fd(ctx);
				1924	if (ret < 0)
				1925	goto err;
				1926
				1927	memset(&p->sq_off, 0, sizeof(p->sq_off));
				1928	p->sq_off.head = offsetof(struct io_sq_ring, r.head);
				1929	p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
				1930	p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
				1931	p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
				1932	p->sq_off.flags = offsetof(struct io_sq_ring, flags);
				1933	p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
				1934	p->sq_off.array = offsetof(struct io_sq_ring, array);
				1935
				1936	memset(&p->cq_off, 0, sizeof(p->cq_off));
				1937	p->cq_off.head = offsetof(struct io_cq_ring, r.head);
				1938	p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
				1939	p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
				1940	p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
				1941	p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
				1942	p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
				1943	return ret;
				1944	err:
				1945	io_ring_ctx_wait_and_kill(ctx);
				1946	return ret;
				1947	}
				1948
				1949	/*
				1950	* Sets up an aio uring context, and returns the fd. Applications asks for a
				1951	* ring size, we return the actual sq/cq ring sizes (among other things) in the
				1952	* params structure passed in.
				1953	*/
				1954	static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
				1955	{
				1956	struct io_uring_params p;
				1957	long ret;
				1958	int i;
				1959
				1960	if (copy_from_user(&p, params, sizeof(p)))
				1961	return -EFAULT;
				1962	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
				1963	if (p.resv[i])
				1964	return -EINVAL;
				1965	}
				1966
Jens Axboe	def596e	2019-01-09 08:59:42 -0700	[diff] [blame]	1967	if (p.flags & ~IORING_SETUP_IOPOLL)
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	1968	return -EINVAL;
				1969
				1970	ret = io_uring_create(entries, &p);
				1971	if (ret < 0)
				1972	return ret;
				1973
				1974	if (copy_to_user(params, &p, sizeof(p)))
				1975	return -EFAULT;
				1976
				1977	return ret;
				1978	}
				1979
				1980	SYSCALL_DEFINE2(io_uring_setup, u32, entries,
				1981	struct io_uring_params __user *, params)
				1982	{
				1983	return io_uring_setup(entries, params);
				1984	}
				1985
Jens Axboe	edafcce	2019-01-09 09:16:05 -0700	[diff] [blame^]	1986	static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
				1987	void __user *arg, unsigned nr_args)
				1988	{
				1989	int ret;
				1990
				1991	percpu_ref_kill(&ctx->refs);
				1992	wait_for_completion(&ctx->ctx_done);
				1993
				1994	switch (opcode) {
				1995	case IORING_REGISTER_BUFFERS:
				1996	ret = io_sqe_buffer_register(ctx, arg, nr_args);
				1997	break;
				1998	case IORING_UNREGISTER_BUFFERS:
				1999	ret = -EINVAL;
				2000	if (arg \|\| nr_args)
				2001	break;
				2002	ret = io_sqe_buffer_unregister(ctx);
				2003	break;
				2004	default:
				2005	ret = -EINVAL;
				2006	break;
				2007	}
				2008
				2009	/* bring the ctx back to life */
				2010	reinit_completion(&ctx->ctx_done);
				2011	percpu_ref_reinit(&ctx->refs);
				2012	return ret;
				2013	}
				2014
				2015	SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
				2016	void __user *, arg, unsigned int, nr_args)
				2017	{
				2018	struct io_ring_ctx *ctx;
				2019	long ret = -EBADF;
				2020	struct fd f;
				2021
				2022	f = fdget(fd);
				2023	if (!f.file)
				2024	return -EBADF;
				2025
				2026	ret = -EOPNOTSUPP;
				2027	if (f.file->f_op != &io_uring_fops)
				2028	goto out_fput;
				2029
				2030	ctx = f.file->private_data;
				2031
				2032	mutex_lock(&ctx->uring_lock);
				2033	ret = __io_uring_register(ctx, opcode, arg, nr_args);
				2034	mutex_unlock(&ctx->uring_lock);
				2035	out_fput:
				2036	fdput(f);
				2037	return ret;
				2038	}
				2039
Jens Axboe	2b188cc	2019-01-07 10:46:33 -0700	[diff] [blame]	2040	static int __init io_uring_init(void)
				2041	{
				2042	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN \| SLAB_PANIC);
				2043	return 0;
				2044	};
				2045	__initcall(io_uring_init);