blob: 85b914bd823c3235ac5b092e0a9d6dc73aa8ed0f
Jens Axboe2b188cc2019-01-07 10:46:33 -07001// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side. When the application reads the CQ ring
8 * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
9 * the kernel uses after writing the tail. Failure to do so could cause a
 10 * delay in when the application notices that completion events are available.
11 * This isn't a fatal condition. Likewise, the application must use an
12 * appropriate smp_wmb() both before writing the SQ tail, and after writing
13 * the SQ tail. The first one orders the sqe writes with the tail write, and
14 * the latter is paired with the smp_rmb() the kernel will issue before
15 * reading the SQ tail on submission.
16 *
17 * Also see the examples in the liburing library:
18 *
19 * git://git.kernel.dk/liburing
20 *
21 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
22 * from data shared between the kernel and application. This is done both
 23 * for ordering purposes and to ensure that once a value is loaded from
24 * data that the application could potentially modify, it remains stable.
25 *
26 * Copyright (C) 2018-2019 Jens Axboe
Christoph Hellwigc992fe22019-01-11 09:43:02 -070027 * Copyright (c) 2018-2019 Christoph Hellwig
Jens Axboe2b188cc2019-01-07 10:46:33 -070028 */
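/*
 * Illustrative sketch (not part of this file): what the pairing above might
 * look like on the application side, assuming the rings have been mmap()ed
 * and GCC/Clang __atomic builtins provide the acquire/release ordering. The
 * names cq_head, cq_tail, cq_mask, cqes, sq_tail, sq_mask, sq_array and
 * sqe_index are placeholders, not a real API.
 *
 *	// Reap one completion: acquire-load the tail the kernel publishes.
 *	unsigned head = *cq_head;
 *	unsigned tail = __atomic_load_n(cq_tail, __ATOMIC_ACQUIRE);
 *	if (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_mask];
 *		// ... consume cqe->user_data, cqe->res ...
 *		__atomic_store_n(cq_head, head + 1, __ATOMIC_RELEASE);
 *	}
 *
 *	// Submit one sqe: fill the sqe and the array slot first, then
 *	// publish the new tail so the kernel only sees fully written data.
 *	unsigned stail = *sq_tail;
 *	sq_array[stail & *sq_mask] = sqe_index;
 *	__atomic_store_n(sq_tail, stail + 1, __ATOMIC_RELEASE);
 */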
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/errno.h>
32#include <linux/syscalls.h>
33#include <linux/compat.h>
34#include <linux/refcount.h>
35#include <linux/uio.h>
36
37#include <linux/sched/signal.h>
38#include <linux/fs.h>
39#include <linux/file.h>
40#include <linux/fdtable.h>
41#include <linux/mm.h>
42#include <linux/mman.h>
43#include <linux/mmu_context.h>
44#include <linux/percpu.h>
45#include <linux/slab.h>
46#include <linux/workqueue.h>
Jens Axboe6c271ce2019-01-10 11:22:30 -070047#include <linux/kthread.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070048#include <linux/blkdev.h>
Jens Axboeedafcce2019-01-09 09:16:05 -070049#include <linux/bvec.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070050#include <linux/net.h>
51#include <net/sock.h>
52#include <net/af_unix.h>
Jens Axboe6b063142019-01-10 22:13:58 -070053#include <net/scm.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070054#include <linux/anon_inodes.h>
55#include <linux/sched/mm.h>
56#include <linux/uaccess.h>
57#include <linux/nospec.h>
Jens Axboeedafcce2019-01-09 09:16:05 -070058#include <linux/sizes.h>
59#include <linux/hugetlb.h>
Jens Axboe2b188cc2019-01-07 10:46:33 -070060
61#include <uapi/linux/io_uring.h>
62
63#include "internal.h"
64
65#define IORING_MAX_ENTRIES 4096
Jens Axboe6b063142019-01-10 22:13:58 -070066#define IORING_MAX_FIXED_FILES 1024
Jens Axboe2b188cc2019-01-07 10:46:33 -070067
68struct io_uring {
69 u32 head ____cacheline_aligned_in_smp;
70 u32 tail ____cacheline_aligned_in_smp;
71};
72
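/*
 * Submission ring, shared with the application via mmap(): the application
 * produces entries at r.tail, the kernel consumes them at r.head. array[]
 * holds indices into the separately mapped sqe array; both sides mask
 * indices with ring_mask. dropped counts invalid sqe indices the kernel
 * skipped.
 */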
73struct io_sq_ring {
74 struct io_uring r;
75 u32 ring_mask;
76 u32 ring_entries;
77 u32 dropped;
78 u32 flags;
79 u32 array[];
80};
81
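/*
 * Completion ring, shared with the application via mmap(): the kernel
 * produces cqes[] entries at r.tail, the application consumes them at
 * r.head. overflow counts completions dropped because no cqe was available.
 */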
82struct io_cq_ring {
83 struct io_uring r;
84 u32 ring_mask;
85 u32 ring_entries;
86 u32 overflow;
87 struct io_uring_cqe cqes[];
88};
89
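/*
 * A fixed (pre-registered) user buffer: ubuf/len describe the user address
 * range, bvec/nr_bvecs the pinned pages backing it, so READ_FIXED and
 * WRITE_FIXED requests can build their iterator without walking user page
 * tables at IO time (see io_import_fixed()).
 */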
Jens Axboeedafcce2019-01-09 09:16:05 -070090struct io_mapped_ubuf {
91 u64 ubuf;
92 size_t len;
93 struct bio_vec *bvec;
94 unsigned int nr_bvecs;
95};
96
Jens Axboe2b188cc2019-01-07 10:46:33 -070097struct io_ring_ctx {
98 struct {
99 struct percpu_ref refs;
100 } ____cacheline_aligned_in_smp;
101
102 struct {
103 unsigned int flags;
104 bool compat;
105 bool account_mem;
106
107 /* SQ ring */
108 struct io_sq_ring *sq_ring;
109 unsigned cached_sq_head;
110 unsigned sq_entries;
111 unsigned sq_mask;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700112 unsigned sq_thread_idle;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700113 struct io_uring_sqe *sq_sqes;
114 } ____cacheline_aligned_in_smp;
115
116 /* IO offload */
117 struct workqueue_struct *sqo_wq;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700118 struct task_struct *sqo_thread; /* if using sq thread polling */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700119 struct mm_struct *sqo_mm;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700120 wait_queue_head_t sqo_wait;
121 unsigned sqo_stop;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700122
123 struct {
124 /* CQ ring */
125 struct io_cq_ring *cq_ring;
126 unsigned cached_cq_tail;
127 unsigned cq_entries;
128 unsigned cq_mask;
129 struct wait_queue_head cq_wait;
130 struct fasync_struct *cq_fasync;
131 } ____cacheline_aligned_in_smp;
132
Jens Axboe6b063142019-01-10 22:13:58 -0700133 /*
134 * If used, fixed file set. Writers must ensure that ->refs is dead,
135 * readers must ensure that ->refs is alive as long as the file* is
136 * used. Only updated through io_uring_register(2).
137 */
138 struct file **user_files;
139 unsigned nr_user_files;
140
Jens Axboeedafcce2019-01-09 09:16:05 -0700141 /* if used, fixed mapped user buffers */
142 unsigned nr_user_bufs;
143 struct io_mapped_ubuf *user_bufs;
144
Jens Axboe2b188cc2019-01-07 10:46:33 -0700145 struct user_struct *user;
146
147 struct completion ctx_done;
148
149 struct {
150 struct mutex uring_lock;
151 wait_queue_head_t wait;
152 } ____cacheline_aligned_in_smp;
153
154 struct {
155 spinlock_t completion_lock;
Jens Axboedef596e2019-01-09 08:59:42 -0700156 bool poll_multi_file;
157 /*
158 * ->poll_list is protected by the ctx->uring_lock for
159 * io_uring instances that don't use IORING_SETUP_SQPOLL.
160 * For SQPOLL, only the single threaded io_sq_thread() will
161 * manipulate the list, hence no extra locking is needed there.
162 */
163 struct list_head poll_list;
Jens Axboe221c5eb2019-01-17 09:41:58 -0700164 struct list_head cancel_list;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700165 } ____cacheline_aligned_in_smp;
166
167#if defined(CONFIG_UNIX)
168 struct socket *ring_sock;
169#endif
170};
171
172struct sqe_submit {
173 const struct io_uring_sqe *sqe;
174 unsigned short index;
175 bool has_user;
Jens Axboedef596e2019-01-09 08:59:42 -0700176 bool needs_lock;
Jens Axboe6c271ce2019-01-10 11:22:30 -0700177 bool needs_fixed_file;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700178};
179
Jens Axboe221c5eb2019-01-17 09:41:58 -0700180struct io_poll_iocb {
181 struct file *file;
182 struct wait_queue_head *head;
183 __poll_t events;
184 bool woken;
185 bool canceled;
186 struct wait_queue_entry wait;
187};
188
Jens Axboe2b188cc2019-01-07 10:46:33 -0700189struct io_kiocb {
Jens Axboe221c5eb2019-01-17 09:41:58 -0700190 union {
191 struct kiocb rw;
192 struct io_poll_iocb poll;
193 };
Jens Axboe2b188cc2019-01-07 10:46:33 -0700194
195 struct sqe_submit submit;
196
197 struct io_ring_ctx *ctx;
198 struct list_head list;
199 unsigned int flags;
Jens Axboec16361c2019-01-17 08:39:48 -0700200 refcount_t refs;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700201#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
Jens Axboedef596e2019-01-09 08:59:42 -0700202#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
Jens Axboe6b063142019-01-10 22:13:58 -0700203#define REQ_F_FIXED_FILE 4 /* ctx owns file */
Jens Axboe2b188cc2019-01-07 10:46:33 -0700204 u64 user_data;
Jens Axboedef596e2019-01-09 08:59:42 -0700205 u64 error;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700206
207 struct work_struct work;
208};
209
210#define IO_PLUG_THRESHOLD 2
Jens Axboedef596e2019-01-09 08:59:42 -0700211#define IO_IOPOLL_BATCH 8
Jens Axboe2b188cc2019-01-07 10:46:33 -0700212
Jens Axboe9a56a232019-01-09 09:06:50 -0700213struct io_submit_state {
214 struct blk_plug plug;
215
216 /*
Jens Axboe2579f912019-01-09 09:10:43 -0700217 * io_kiocb alloc cache
218 */
219 void *reqs[IO_IOPOLL_BATCH];
220 unsigned int free_reqs;
221 unsigned int cur_req;
222
223 /*
Jens Axboe9a56a232019-01-09 09:06:50 -0700224 * File reference cache
225 */
226 struct file *file;
227 unsigned int fd;
228 unsigned int has_refs;
229 unsigned int used_refs;
230 unsigned int ios_left;
231};
232
Jens Axboe2b188cc2019-01-07 10:46:33 -0700233static struct kmem_cache *req_cachep;
234
235static const struct file_operations io_uring_fops;
236
237struct sock *io_uring_get_socket(struct file *file)
238{
239#if defined(CONFIG_UNIX)
240 if (file->f_op == &io_uring_fops) {
241 struct io_ring_ctx *ctx = file->private_data;
242
243 return ctx->ring_sock->sk;
244 }
245#endif
246 return NULL;
247}
248EXPORT_SYMBOL(io_uring_get_socket);
249
250static void io_ring_ctx_ref_free(struct percpu_ref *ref)
251{
252 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
253
254 complete(&ctx->ctx_done);
255}
256
257static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
258{
259 struct io_ring_ctx *ctx;
260
261 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
262 if (!ctx)
263 return NULL;
264
265 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
266 kfree(ctx);
267 return NULL;
268 }
269
270 ctx->flags = p->flags;
271 init_waitqueue_head(&ctx->cq_wait);
272 init_completion(&ctx->ctx_done);
273 mutex_init(&ctx->uring_lock);
274 init_waitqueue_head(&ctx->wait);
275 spin_lock_init(&ctx->completion_lock);
Jens Axboedef596e2019-01-09 08:59:42 -0700276 INIT_LIST_HEAD(&ctx->poll_list);
Jens Axboe221c5eb2019-01-17 09:41:58 -0700277 INIT_LIST_HEAD(&ctx->cancel_list);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700278 return ctx;
279}
280
281static void io_commit_cqring(struct io_ring_ctx *ctx)
282{
283 struct io_cq_ring *ring = ctx->cq_ring;
284
285 if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
286 /* order cqe stores with ring update */
287 smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
288
289 /*
 290 * Write side barrier of tail update, app has read side. See
291 * comment at the top of this file.
292 */
293 smp_wmb();
294
295 if (wq_has_sleeper(&ctx->cq_wait)) {
296 wake_up_interruptible(&ctx->cq_wait);
297 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
298 }
299 }
300}
301
302static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
303{
304 struct io_cq_ring *ring = ctx->cq_ring;
305 unsigned tail;
306
307 tail = ctx->cached_cq_tail;
308 /* See comment at the top of the file */
309 smp_rmb();
310 if (tail + 1 == READ_ONCE(ring->r.head))
311 return NULL;
312
313 ctx->cached_cq_tail++;
314 return &ring->cqes[tail & ctx->cq_mask];
315}
316
317static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
318 long res, unsigned ev_flags)
319{
320 struct io_uring_cqe *cqe;
321
322 /*
323 * If we can't get a cq entry, userspace overflowed the
324 * submission (by quite a lot). Increment the overflow count in
325 * the ring.
326 */
327 cqe = io_get_cqring(ctx);
328 if (cqe) {
329 WRITE_ONCE(cqe->user_data, ki_user_data);
330 WRITE_ONCE(cqe->res, res);
331 WRITE_ONCE(cqe->flags, ev_flags);
332 } else {
333 unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
334
335 WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
336 }
337}
338
339static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
340 long res, unsigned ev_flags)
341{
342 unsigned long flags;
343
344 spin_lock_irqsave(&ctx->completion_lock, flags);
345 io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
346 io_commit_cqring(ctx);
347 spin_unlock_irqrestore(&ctx->completion_lock, flags);
348
349 if (waitqueue_active(&ctx->wait))
350 wake_up(&ctx->wait);
Jens Axboe6c271ce2019-01-10 11:22:30 -0700351 if (waitqueue_active(&ctx->sqo_wait))
352 wake_up(&ctx->sqo_wait);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700353}
354
355static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
356{
357 percpu_ref_put_many(&ctx->refs, refs);
358
359 if (waitqueue_active(&ctx->wait))
360 wake_up(&ctx->wait);
361}
362
Jens Axboe2579f912019-01-09 09:10:43 -0700363static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
364 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700365{
366 struct io_kiocb *req;
367
368 if (!percpu_ref_tryget(&ctx->refs))
369 return NULL;
370
Jens Axboe2579f912019-01-09 09:10:43 -0700371 if (!state) {
372 req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
373 if (unlikely(!req))
374 goto out;
375 } else if (!state->free_reqs) {
376 size_t sz;
377 int ret;
378
379 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
380 ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz,
381 state->reqs);
382 if (unlikely(ret <= 0))
383 goto out;
384 state->free_reqs = ret - 1;
385 state->cur_req = 1;
386 req = state->reqs[0];
387 } else {
388 req = state->reqs[state->cur_req];
389 state->free_reqs--;
390 state->cur_req++;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700391 }
392
Jens Axboe2579f912019-01-09 09:10:43 -0700393 req->ctx = ctx;
394 req->flags = 0;
Jens Axboec16361c2019-01-17 08:39:48 -0700395 refcount_set(&req->refs, 0);
Jens Axboe2579f912019-01-09 09:10:43 -0700396 return req;
397out:
Jens Axboe2b188cc2019-01-07 10:46:33 -0700398 io_ring_drop_ctx_refs(ctx, 1);
399 return NULL;
400}
401
Jens Axboedef596e2019-01-09 08:59:42 -0700402static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
403{
404 if (*nr) {
405 kmem_cache_free_bulk(req_cachep, *nr, reqs);
406 io_ring_drop_ctx_refs(ctx, *nr);
407 *nr = 0;
408 }
409}
410
Jens Axboe2b188cc2019-01-07 10:46:33 -0700411static void io_free_req(struct io_kiocb *req)
412{
Jens Axboec16361c2019-01-17 08:39:48 -0700413 if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
414 io_ring_drop_ctx_refs(req->ctx, 1);
415 kmem_cache_free(req_cachep, req);
416 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700417}
418
Jens Axboedef596e2019-01-09 08:59:42 -0700419/*
420 * Find and free completed poll iocbs
421 */
422static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
423 struct list_head *done)
424{
425 void *reqs[IO_IOPOLL_BATCH];
Jens Axboe9a56a232019-01-09 09:06:50 -0700426 int file_count, to_free;
427 struct file *file = NULL;
Jens Axboedef596e2019-01-09 08:59:42 -0700428 struct io_kiocb *req;
Jens Axboedef596e2019-01-09 08:59:42 -0700429
Jens Axboe9a56a232019-01-09 09:06:50 -0700430 file_count = to_free = 0;
Jens Axboedef596e2019-01-09 08:59:42 -0700431 while (!list_empty(done)) {
432 req = list_first_entry(done, struct io_kiocb, list);
433 list_del(&req->list);
434
435 io_cqring_fill_event(ctx, req->user_data, req->error, 0);
436
437 reqs[to_free++] = req;
438 (*nr_events)++;
439
Jens Axboe9a56a232019-01-09 09:06:50 -0700440 /*
441 * Batched puts of the same file, to avoid dirtying the
442 * file usage count multiple times, if avoidable.
443 */
Jens Axboe6b063142019-01-10 22:13:58 -0700444 if (!(req->flags & REQ_F_FIXED_FILE)) {
445 if (!file) {
446 file = req->rw.ki_filp;
447 file_count = 1;
448 } else if (file == req->rw.ki_filp) {
449 file_count++;
450 } else {
451 fput_many(file, file_count);
452 file = req->rw.ki_filp;
453 file_count = 1;
454 }
Jens Axboe9a56a232019-01-09 09:06:50 -0700455 }
456
Jens Axboedef596e2019-01-09 08:59:42 -0700457 if (to_free == ARRAY_SIZE(reqs))
458 io_free_req_many(ctx, reqs, &to_free);
459 }
460 io_commit_cqring(ctx);
461
Jens Axboe9a56a232019-01-09 09:06:50 -0700462 if (file)
463 fput_many(file, file_count);
Jens Axboedef596e2019-01-09 08:59:42 -0700464 io_free_req_many(ctx, reqs, &to_free);
465}
466
467static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
468 long min)
469{
470 struct io_kiocb *req, *tmp;
471 LIST_HEAD(done);
472 bool spin;
473 int ret;
474
475 /*
476 * Only spin for completions if we don't have multiple devices hanging
477 * off our complete list, and we're under the requested amount.
478 */
479 spin = !ctx->poll_multi_file && *nr_events < min;
480
481 ret = 0;
482 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
483 struct kiocb *kiocb = &req->rw;
484
485 /*
486 * Move completed entries to our local list. If we find a
487 * request that requires polling, break out and complete
488 * the done list first, if we have entries there.
489 */
490 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
491 list_move_tail(&req->list, &done);
492 continue;
493 }
494 if (!list_empty(&done))
495 break;
496
497 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
498 if (ret < 0)
499 break;
500
501 if (ret && spin)
502 spin = false;
503 ret = 0;
504 }
505
506 if (!list_empty(&done))
507 io_iopoll_complete(ctx, nr_events, &done);
508
509 return ret;
510}
511
512/*
 513 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
514 * non-spinning poll check - we'll still enter the driver poll loop, but only
515 * as a non-spinning completion check.
516 */
517static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
518 long min)
519{
520 while (!list_empty(&ctx->poll_list)) {
521 int ret;
522
523 ret = io_do_iopoll(ctx, nr_events, min);
524 if (ret < 0)
525 return ret;
526 if (!min || *nr_events >= min)
527 return 0;
528 }
529
530 return 1;
531}
532
533/*
 534 * We can't just wait for polled events to come to us; we have to actively
535 * find and complete them.
536 */
537static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
538{
539 if (!(ctx->flags & IORING_SETUP_IOPOLL))
540 return;
541
542 mutex_lock(&ctx->uring_lock);
543 while (!list_empty(&ctx->poll_list)) {
544 unsigned int nr_events = 0;
545
546 io_iopoll_getevents(ctx, &nr_events, 1);
547 }
548 mutex_unlock(&ctx->uring_lock);
549}
550
551static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
552 long min)
553{
554 int ret = 0;
555
556 do {
557 int tmin = 0;
558
559 if (*nr_events < min)
560 tmin = min - *nr_events;
561
562 ret = io_iopoll_getevents(ctx, nr_events, tmin);
563 if (ret <= 0)
564 break;
565 ret = 0;
566 } while (min && !*nr_events && !need_resched());
567
568 return ret;
569}
570
Jens Axboe2b188cc2019-01-07 10:46:33 -0700571static void kiocb_end_write(struct kiocb *kiocb)
572{
573 if (kiocb->ki_flags & IOCB_WRITE) {
574 struct inode *inode = file_inode(kiocb->ki_filp);
575
576 /*
577 * Tell lockdep we inherited freeze protection from submission
578 * thread.
579 */
580 if (S_ISREG(inode->i_mode))
581 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
582 file_end_write(kiocb->ki_filp);
583 }
584}
585
Jens Axboe6b063142019-01-10 22:13:58 -0700586static void io_fput(struct io_kiocb *req)
587{
588 if (!(req->flags & REQ_F_FIXED_FILE))
589 fput(req->rw.ki_filp);
590}
591
Jens Axboe2b188cc2019-01-07 10:46:33 -0700592static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
593{
594 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
595
596 kiocb_end_write(kiocb);
597
Jens Axboe6b063142019-01-10 22:13:58 -0700598 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700599 io_cqring_add_event(req->ctx, req->user_data, res, 0);
600 io_free_req(req);
601}
602
Jens Axboedef596e2019-01-09 08:59:42 -0700603static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
604{
605 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
606
607 kiocb_end_write(kiocb);
608
609 req->error = res;
610 if (res != -EAGAIN)
611 req->flags |= REQ_F_IOPOLL_COMPLETED;
612}
613
614/*
615 * After the iocb has been issued, it's safe to be found on the poll list.
616 * Adding the kiocb to the list AFTER submission ensures that we don't
 617 * find it from an io_iopoll_getevents() thread before the issuer is done
618 * accessing the kiocb cookie.
619 */
620static void io_iopoll_req_issued(struct io_kiocb *req)
621{
622 struct io_ring_ctx *ctx = req->ctx;
623
624 /*
625 * Track whether we have multiple files in our lists. This will impact
626 * how we do polling eventually, not spinning if we're on potentially
627 * different devices.
628 */
629 if (list_empty(&ctx->poll_list)) {
630 ctx->poll_multi_file = false;
631 } else if (!ctx->poll_multi_file) {
632 struct io_kiocb *list_req;
633
634 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
635 list);
636 if (list_req->rw.ki_filp != req->rw.ki_filp)
637 ctx->poll_multi_file = true;
638 }
639
640 /*
641 * For fast devices, IO may have already completed. If it has, add
642 * it to the front so we find it first.
643 */
644 if (req->flags & REQ_F_IOPOLL_COMPLETED)
645 list_add(&req->list, &ctx->poll_list);
646 else
647 list_add_tail(&req->list, &ctx->poll_list);
648}
649
Jens Axboe9a56a232019-01-09 09:06:50 -0700650static void io_file_put(struct io_submit_state *state, struct file *file)
651{
652 if (!state) {
653 fput(file);
654 } else if (state->file) {
655 int diff = state->has_refs - state->used_refs;
656
657 if (diff)
658 fput_many(state->file, diff);
659 state->file = NULL;
660 }
661}
662
663/*
664 * Get as many references to a file as we have IOs left in this submission,
665 * assuming most submissions are for one file, or at least that each file
666 * has more than one submission.
667 */
668static struct file *io_file_get(struct io_submit_state *state, int fd)
669{
670 if (!state)
671 return fget(fd);
672
673 if (state->file) {
674 if (state->fd == fd) {
675 state->used_refs++;
676 state->ios_left--;
677 return state->file;
678 }
679 io_file_put(state, NULL);
680 }
681 state->file = fget_many(fd, state->ios_left);
682 if (!state->file)
683 return NULL;
684
685 state->fd = fd;
686 state->has_refs = state->ios_left;
687 state->used_refs = 1;
688 state->ios_left--;
689 return state->file;
690}
691
Jens Axboe2b188cc2019-01-07 10:46:33 -0700692/*
693 * If we tracked the file through the SCM inflight mechanism, we could support
694 * any file. For now, just ensure that anything potentially problematic is done
695 * inline.
696 */
697static bool io_file_supports_async(struct file *file)
698{
699 umode_t mode = file_inode(file)->i_mode;
700
701 if (S_ISBLK(mode) || S_ISCHR(mode))
702 return true;
703 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
704 return true;
705
706 return false;
707}
708
Jens Axboe6c271ce2019-01-10 11:22:30 -0700709static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
Jens Axboe9a56a232019-01-09 09:06:50 -0700710 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700711{
Jens Axboe6c271ce2019-01-10 11:22:30 -0700712 const struct io_uring_sqe *sqe = s->sqe;
Jens Axboedef596e2019-01-09 08:59:42 -0700713 struct io_ring_ctx *ctx = req->ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700714 struct kiocb *kiocb = &req->rw;
Jens Axboe6b063142019-01-10 22:13:58 -0700715 unsigned ioprio, flags;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700716 int fd, ret;
717
718 /* For -EAGAIN retry, everything is already prepped */
719 if (kiocb->ki_filp)
720 return 0;
721
Jens Axboe6b063142019-01-10 22:13:58 -0700722 flags = READ_ONCE(sqe->flags);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700723 fd = READ_ONCE(sqe->fd);
Jens Axboe6b063142019-01-10 22:13:58 -0700724
725 if (flags & IOSQE_FIXED_FILE) {
726 if (unlikely(!ctx->user_files ||
727 (unsigned) fd >= ctx->nr_user_files))
728 return -EBADF;
729 kiocb->ki_filp = ctx->user_files[fd];
730 req->flags |= REQ_F_FIXED_FILE;
731 } else {
Jens Axboe6c271ce2019-01-10 11:22:30 -0700732 if (s->needs_fixed_file)
733 return -EBADF;
Jens Axboe6b063142019-01-10 22:13:58 -0700734 kiocb->ki_filp = io_file_get(state, fd);
735 if (unlikely(!kiocb->ki_filp))
736 return -EBADF;
737 if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
738 force_nonblock = false;
739 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700740 kiocb->ki_pos = READ_ONCE(sqe->off);
741 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
742 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
743
744 ioprio = READ_ONCE(sqe->ioprio);
745 if (ioprio) {
746 ret = ioprio_check_cap(ioprio);
747 if (ret)
748 goto out_fput;
749
750 kiocb->ki_ioprio = ioprio;
751 } else
752 kiocb->ki_ioprio = get_current_ioprio();
753
754 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
755 if (unlikely(ret))
756 goto out_fput;
757 if (force_nonblock) {
758 kiocb->ki_flags |= IOCB_NOWAIT;
759 req->flags |= REQ_F_FORCE_NONBLOCK;
760 }
Jens Axboedef596e2019-01-09 08:59:42 -0700761 if (ctx->flags & IORING_SETUP_IOPOLL) {
762 ret = -EOPNOTSUPP;
763 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
764 !kiocb->ki_filp->f_op->iopoll)
765 goto out_fput;
Jens Axboe2b188cc2019-01-07 10:46:33 -0700766
Jens Axboedef596e2019-01-09 08:59:42 -0700767 req->error = 0;
768 kiocb->ki_flags |= IOCB_HIPRI;
769 kiocb->ki_complete = io_complete_rw_iopoll;
770 } else {
771 if (kiocb->ki_flags & IOCB_HIPRI) {
772 ret = -EINVAL;
773 goto out_fput;
774 }
775 kiocb->ki_complete = io_complete_rw;
776 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700777 return 0;
778out_fput:
Jens Axboe6b063142019-01-10 22:13:58 -0700779 if (!(flags & IOSQE_FIXED_FILE)) {
780 /*
781 * in case of error, we didn't use this file reference. drop it.
782 */
783 if (state)
784 state->used_refs--;
785 io_file_put(state, kiocb->ki_filp);
786 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700787 return ret;
788}
789
790static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
791{
792 switch (ret) {
793 case -EIOCBQUEUED:
794 break;
795 case -ERESTARTSYS:
796 case -ERESTARTNOINTR:
797 case -ERESTARTNOHAND:
798 case -ERESTART_RESTARTBLOCK:
799 /*
800 * We can't just restart the syscall, since previously
801 * submitted sqes may already be in progress. Just fail this
802 * IO with EINTR.
803 */
804 ret = -EINTR;
805 /* fall through */
806 default:
807 kiocb->ki_complete(kiocb, ret, 0);
808 }
809}
810
Jens Axboeedafcce2019-01-09 09:16:05 -0700811static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
812 const struct io_uring_sqe *sqe,
813 struct iov_iter *iter)
814{
815 size_t len = READ_ONCE(sqe->len);
816 struct io_mapped_ubuf *imu;
817 unsigned index, buf_index;
818 size_t offset;
819 u64 buf_addr;
820
821 /* attempt to use fixed buffers without having provided iovecs */
822 if (unlikely(!ctx->user_bufs))
823 return -EFAULT;
824
825 buf_index = READ_ONCE(sqe->buf_index);
826 if (unlikely(buf_index >= ctx->nr_user_bufs))
827 return -EFAULT;
828
829 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
830 imu = &ctx->user_bufs[index];
831 buf_addr = READ_ONCE(sqe->addr);
832
833 /* overflow */
834 if (buf_addr + len < buf_addr)
835 return -EFAULT;
836 /* not inside the mapped region */
837 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
838 return -EFAULT;
839
840 /*
841 * May not be a start of buffer, set size appropriately
842 * and advance us to the beginning.
843 */
844 offset = buf_addr - imu->ubuf;
845 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
846 if (offset)
847 iov_iter_advance(iter, offset);
848 return 0;
849}
850
Jens Axboe2b188cc2019-01-07 10:46:33 -0700851static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
852 const struct sqe_submit *s, struct iovec **iovec,
853 struct iov_iter *iter)
854{
855 const struct io_uring_sqe *sqe = s->sqe;
856 void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
857 size_t sqe_len = READ_ONCE(sqe->len);
Jens Axboeedafcce2019-01-09 09:16:05 -0700858 u8 opcode;
859
860 /*
861 * We're reading ->opcode for the second time, but the first read
862 * doesn't care whether it's _FIXED or not, so it doesn't matter
863 * whether ->opcode changes concurrently. The first read does care
864 * about whether it is a READ or a WRITE, so we don't trust this read
865 * for that purpose and instead let the caller pass in the read/write
866 * flag.
867 */
868 opcode = READ_ONCE(sqe->opcode);
869 if (opcode == IORING_OP_READ_FIXED ||
870 opcode == IORING_OP_WRITE_FIXED) {
871 ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
872 *iovec = NULL;
873 return ret;
874 }
Jens Axboe2b188cc2019-01-07 10:46:33 -0700875
876 if (!s->has_user)
877 return -EFAULT;
878
879#ifdef CONFIG_COMPAT
880 if (ctx->compat)
881 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
882 iovec, iter);
883#endif
884
885 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
886}
887
888static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
Jens Axboe9a56a232019-01-09 09:06:50 -0700889 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700890{
891 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
892 struct kiocb *kiocb = &req->rw;
893 struct iov_iter iter;
894 struct file *file;
895 ssize_t ret;
896
Jens Axboe6c271ce2019-01-10 11:22:30 -0700897 ret = io_prep_rw(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700898 if (ret)
899 return ret;
900 file = kiocb->ki_filp;
901
902 ret = -EBADF;
903 if (unlikely(!(file->f_mode & FMODE_READ)))
904 goto out_fput;
905 ret = -EINVAL;
906 if (unlikely(!file->f_op->read_iter))
907 goto out_fput;
908
909 ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
910 if (ret)
911 goto out_fput;
912
913 ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
914 if (!ret) {
915 ssize_t ret2;
916
917 /* Catch -EAGAIN return for forced non-blocking submission */
918 ret2 = call_read_iter(file, kiocb, &iter);
919 if (!force_nonblock || ret2 != -EAGAIN)
920 io_rw_done(kiocb, ret2);
921 else
922 ret = -EAGAIN;
923 }
924 kfree(iovec);
925out_fput:
926 /* Hold on to the file for -EAGAIN */
927 if (unlikely(ret && ret != -EAGAIN))
Jens Axboe6b063142019-01-10 22:13:58 -0700928 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700929 return ret;
930}
931
932static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
Jens Axboe9a56a232019-01-09 09:06:50 -0700933 bool force_nonblock, struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -0700934{
935 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
936 struct kiocb *kiocb = &req->rw;
937 struct iov_iter iter;
938 struct file *file;
939 ssize_t ret;
940
Jens Axboe6c271ce2019-01-10 11:22:30 -0700941 ret = io_prep_rw(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700942 if (ret)
943 return ret;
944 /* Hold on to the file for -EAGAIN */
945 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
946 return -EAGAIN;
947
948 ret = -EBADF;
949 file = kiocb->ki_filp;
950 if (unlikely(!(file->f_mode & FMODE_WRITE)))
951 goto out_fput;
952 ret = -EINVAL;
953 if (unlikely(!file->f_op->write_iter))
954 goto out_fput;
955
956 ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
957 if (ret)
958 goto out_fput;
959
960 ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
961 iov_iter_count(&iter));
962 if (!ret) {
963 /*
964 * Open-code file_start_write here to grab freeze protection,
965 * which will be released by another thread in
966 * io_complete_rw(). Fool lockdep by telling it the lock got
967 * released so that it doesn't complain about the held lock when
968 * we return to userspace.
969 */
970 if (S_ISREG(file_inode(file)->i_mode)) {
971 __sb_start_write(file_inode(file)->i_sb,
972 SB_FREEZE_WRITE, true);
973 __sb_writers_release(file_inode(file)->i_sb,
974 SB_FREEZE_WRITE);
975 }
976 kiocb->ki_flags |= IOCB_WRITE;
977 io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
978 }
979 kfree(iovec);
980out_fput:
981 if (unlikely(ret))
Jens Axboe6b063142019-01-10 22:13:58 -0700982 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -0700983 return ret;
984}
985
986/*
987 * IORING_OP_NOP just posts a completion event, nothing else.
988 */
989static int io_nop(struct io_kiocb *req, u64 user_data)
990{
991 struct io_ring_ctx *ctx = req->ctx;
992 long err = 0;
993
Jens Axboedef596e2019-01-09 08:59:42 -0700994 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
995 return -EINVAL;
996
Jens Axboe2b188cc2019-01-07 10:46:33 -0700997 /*
998 * Twilight zone - it's possible that someone issued an opcode that
999 * has a file attached, then got -EAGAIN on submission, and changed
1000 * the sqe before we retried it from async context. Avoid dropping
1001 * a file reference for this malicious case, and flag the error.
1002 */
1003 if (req->rw.ki_filp) {
1004 err = -EBADF;
Jens Axboe6b063142019-01-10 22:13:58 -07001005 io_fput(req);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001006 }
1007 io_cqring_add_event(ctx, user_data, err, 0);
1008 io_free_req(req);
1009 return 0;
1010}
1011
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001012static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1013{
Jens Axboe6b063142019-01-10 22:13:58 -07001014 struct io_ring_ctx *ctx = req->ctx;
1015 unsigned flags;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001016 int fd;
1017
1018 /* Prep already done */
1019 if (req->rw.ki_filp)
1020 return 0;
1021
Jens Axboe6b063142019-01-10 22:13:58 -07001022 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
Jens Axboedef596e2019-01-09 08:59:42 -07001023 return -EINVAL;
Jens Axboeedafcce2019-01-09 09:16:05 -07001024 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001025 return -EINVAL;
1026
1027 fd = READ_ONCE(sqe->fd);
Jens Axboe6b063142019-01-10 22:13:58 -07001028 flags = READ_ONCE(sqe->flags);
1029
1030 if (flags & IOSQE_FIXED_FILE) {
1031 if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
1032 return -EBADF;
1033 req->rw.ki_filp = ctx->user_files[fd];
1034 req->flags |= REQ_F_FIXED_FILE;
1035 } else {
1036 req->rw.ki_filp = fget(fd);
1037 if (unlikely(!req->rw.ki_filp))
1038 return -EBADF;
1039 }
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001040
1041 return 0;
1042}
1043
1044static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1045 bool force_nonblock)
1046{
1047 loff_t sqe_off = READ_ONCE(sqe->off);
1048 loff_t sqe_len = READ_ONCE(sqe->len);
1049 loff_t end = sqe_off + sqe_len;
1050 unsigned fsync_flags;
1051 int ret;
1052
1053 fsync_flags = READ_ONCE(sqe->fsync_flags);
1054 if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1055 return -EINVAL;
1056
1057 ret = io_prep_fsync(req, sqe);
1058 if (ret)
1059 return ret;
1060
1061 /* fsync always requires a blocking context */
1062 if (force_nonblock)
1063 return -EAGAIN;
1064
1065 ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1066 end > 0 ? end : LLONG_MAX,
1067 fsync_flags & IORING_FSYNC_DATASYNC);
1068
Jens Axboe6b063142019-01-10 22:13:58 -07001069 io_fput(req);
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001070 io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
1071 io_free_req(req);
1072 return 0;
1073}
1074
Jens Axboe221c5eb2019-01-17 09:41:58 -07001075static void io_poll_remove_one(struct io_kiocb *req)
1076{
1077 struct io_poll_iocb *poll = &req->poll;
1078
1079 spin_lock(&poll->head->lock);
1080 WRITE_ONCE(poll->canceled, true);
1081 if (!list_empty(&poll->wait.entry)) {
1082 list_del_init(&poll->wait.entry);
1083 queue_work(req->ctx->sqo_wq, &req->work);
1084 }
1085 spin_unlock(&poll->head->lock);
1086
1087 list_del_init(&req->list);
1088}
1089
1090static void io_poll_remove_all(struct io_ring_ctx *ctx)
1091{
1092 struct io_kiocb *req;
1093
1094 spin_lock_irq(&ctx->completion_lock);
1095 while (!list_empty(&ctx->cancel_list)) {
 1096 req = list_first_entry(&ctx->cancel_list, struct io_kiocb, list);
1097 io_poll_remove_one(req);
1098 }
1099 spin_unlock_irq(&ctx->completion_lock);
1100}
1101
1102/*
1103 * Find a running poll command that matches one specified in sqe->addr,
1104 * and remove it if found.
1105 */
1106static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1107{
1108 struct io_ring_ctx *ctx = req->ctx;
1109 struct io_kiocb *poll_req, *next;
1110 int ret = -ENOENT;
1111
1112 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1113 return -EINVAL;
1114 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
1115 sqe->poll_events)
1116 return -EINVAL;
1117
1118 spin_lock_irq(&ctx->completion_lock);
1119 list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
1120 if (READ_ONCE(sqe->addr) == poll_req->user_data) {
1121 io_poll_remove_one(poll_req);
1122 ret = 0;
1123 break;
1124 }
1125 }
1126 spin_unlock_irq(&ctx->completion_lock);
1127
1128 io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
1129 io_free_req(req);
1130 return 0;
1131}
1132
1133static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
1134{
1135 io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
1136 io_fput(req);
1137 io_free_req(req);
1138}
1139
1140static void io_poll_complete_work(struct work_struct *work)
1141{
1142 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1143 struct io_poll_iocb *poll = &req->poll;
1144 struct poll_table_struct pt = { ._key = poll->events };
1145 struct io_ring_ctx *ctx = req->ctx;
1146 __poll_t mask = 0;
1147
1148 if (!READ_ONCE(poll->canceled))
1149 mask = vfs_poll(poll->file, &pt) & poll->events;
1150
1151 /*
1152 * Note that ->ki_cancel callers also delete iocb from active_reqs after
1153 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
1154 * synchronize with them. In the cancellation case the list_del_init
1155 * itself is not actually needed, but harmless so we keep it in to
1156 * avoid further branches in the fast path.
1157 */
1158 spin_lock_irq(&ctx->completion_lock);
1159 if (!mask && !READ_ONCE(poll->canceled)) {
1160 add_wait_queue(poll->head, &poll->wait);
1161 spin_unlock_irq(&ctx->completion_lock);
1162 return;
1163 }
1164 list_del_init(&req->list);
1165 spin_unlock_irq(&ctx->completion_lock);
1166
1167 io_poll_complete(req, mask);
1168}
1169
1170static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
1171 void *key)
1172{
1173 struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
1174 wait);
1175 struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
1176 struct io_ring_ctx *ctx = req->ctx;
1177 __poll_t mask = key_to_poll(key);
1178
1179 poll->woken = true;
1180
1181 /* for instances that support it check for an event match first: */
1182 if (mask) {
1183 unsigned long flags;
1184
1185 if (!(mask & poll->events))
1186 return 0;
1187
1188 /* try to complete the iocb inline if we can: */
1189 if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
1190 list_del(&req->list);
1191 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1192
1193 list_del_init(&poll->wait.entry);
1194 io_poll_complete(req, mask);
1195 return 1;
1196 }
1197 }
1198
1199 list_del_init(&poll->wait.entry);
1200 queue_work(ctx->sqo_wq, &req->work);
1201 return 1;
1202}
1203
1204struct io_poll_table {
1205 struct poll_table_struct pt;
1206 struct io_kiocb *req;
1207 int error;
1208};
1209
1210static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
1211 struct poll_table_struct *p)
1212{
1213 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
1214
1215 if (unlikely(pt->req->poll.head)) {
1216 pt->error = -EINVAL;
1217 return;
1218 }
1219
1220 pt->error = 0;
1221 pt->req->poll.head = head;
1222 add_wait_queue(head, &pt->req->poll.wait);
1223}
1224
1225static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1226{
1227 struct io_poll_iocb *poll = &req->poll;
1228 struct io_ring_ctx *ctx = req->ctx;
1229 struct io_poll_table ipt;
1230 unsigned flags;
1231 __poll_t mask;
1232 u16 events;
1233 int fd;
1234
1235 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1236 return -EINVAL;
1237 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
1238 return -EINVAL;
1239
1240 INIT_WORK(&req->work, io_poll_complete_work);
1241 events = READ_ONCE(sqe->poll_events);
1242 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
1243
1244 flags = READ_ONCE(sqe->flags);
1245 fd = READ_ONCE(sqe->fd);
1246
1247 if (flags & IOSQE_FIXED_FILE) {
1248 if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
1249 return -EBADF;
1250 poll->file = ctx->user_files[fd];
1251 req->flags |= REQ_F_FIXED_FILE;
1252 } else {
1253 poll->file = fget(fd);
1254 }
1255 if (unlikely(!poll->file))
1256 return -EBADF;
1257
1258 poll->head = NULL;
1259 poll->woken = false;
1260 poll->canceled = false;
1261
1262 ipt.pt._qproc = io_poll_queue_proc;
1263 ipt.pt._key = poll->events;
1264 ipt.req = req;
1265 ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
1266
 1267 /* initialize the list so that we can do list_empty checks */
1268 INIT_LIST_HEAD(&poll->wait.entry);
1269 init_waitqueue_func_entry(&poll->wait, io_poll_wake);
1270
1271 /* one for removal from waitqueue, one for this function */
1272 refcount_set(&req->refs, 2);
1273
1274 mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
1275 if (unlikely(!poll->head)) {
1276 /* we did not manage to set up a waitqueue, done */
1277 goto out;
1278 }
1279
1280 spin_lock_irq(&ctx->completion_lock);
1281 spin_lock(&poll->head->lock);
1282 if (poll->woken) {
1283 /* wake_up context handles the rest */
1284 mask = 0;
1285 ipt.error = 0;
1286 } else if (mask || ipt.error) {
1287 /* if we get an error or a mask we are done */
1288 WARN_ON_ONCE(list_empty(&poll->wait.entry));
1289 list_del_init(&poll->wait.entry);
1290 } else {
1291 /* actually waiting for an event */
1292 list_add_tail(&req->list, &ctx->cancel_list);
1293 }
1294 spin_unlock(&poll->head->lock);
1295 spin_unlock_irq(&ctx->completion_lock);
1296
1297out:
1298 if (unlikely(ipt.error)) {
1299 if (!(flags & IOSQE_FIXED_FILE))
1300 fput(poll->file);
1301 /*
1302 * Drop one of our refs to this req, __io_submit_sqe() will
1303 * drop the other one since we're returning an error.
1304 */
1305 io_free_req(req);
1306 return ipt.error;
1307 }
1308
1309 if (mask)
1310 io_poll_complete(req, mask);
1311 io_free_req(req);
1312 return 0;
1313}
1314
Jens Axboe2b188cc2019-01-07 10:46:33 -07001315static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
Jens Axboe9a56a232019-01-09 09:06:50 -07001316 const struct sqe_submit *s, bool force_nonblock,
1317 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001318{
1319 ssize_t ret;
1320 int opcode;
1321
1322 if (unlikely(s->index >= ctx->sq_entries))
1323 return -EINVAL;
1324 req->user_data = READ_ONCE(s->sqe->user_data);
1325
1326 opcode = READ_ONCE(s->sqe->opcode);
1327 switch (opcode) {
1328 case IORING_OP_NOP:
1329 ret = io_nop(req, req->user_data);
1330 break;
1331 case IORING_OP_READV:
Jens Axboeedafcce2019-01-09 09:16:05 -07001332 if (unlikely(s->sqe->buf_index))
1333 return -EINVAL;
Jens Axboe9a56a232019-01-09 09:06:50 -07001334 ret = io_read(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001335 break;
1336 case IORING_OP_WRITEV:
Jens Axboeedafcce2019-01-09 09:16:05 -07001337 if (unlikely(s->sqe->buf_index))
1338 return -EINVAL;
1339 ret = io_write(req, s, force_nonblock, state);
1340 break;
1341 case IORING_OP_READ_FIXED:
1342 ret = io_read(req, s, force_nonblock, state);
1343 break;
1344 case IORING_OP_WRITE_FIXED:
Jens Axboe9a56a232019-01-09 09:06:50 -07001345 ret = io_write(req, s, force_nonblock, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001346 break;
Christoph Hellwigc992fe22019-01-11 09:43:02 -07001347 case IORING_OP_FSYNC:
1348 ret = io_fsync(req, s->sqe, force_nonblock);
1349 break;
Jens Axboe221c5eb2019-01-17 09:41:58 -07001350 case IORING_OP_POLL_ADD:
1351 ret = io_poll_add(req, s->sqe);
1352 break;
1353 case IORING_OP_POLL_REMOVE:
1354 ret = io_poll_remove(req, s->sqe);
1355 break;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001356 default:
1357 ret = -EINVAL;
1358 break;
1359 }
1360
Jens Axboedef596e2019-01-09 08:59:42 -07001361 if (ret)
1362 return ret;
1363
1364 if (ctx->flags & IORING_SETUP_IOPOLL) {
1365 if (req->error == -EAGAIN)
1366 return -EAGAIN;
1367
1368 /* workqueue context doesn't hold uring_lock, grab it now */
1369 if (s->needs_lock)
1370 mutex_lock(&ctx->uring_lock);
1371 io_iopoll_req_issued(req);
1372 if (s->needs_lock)
1373 mutex_unlock(&ctx->uring_lock);
1374 }
1375
1376 return 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001377}
1378
Jens Axboeedafcce2019-01-09 09:16:05 -07001379static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
1380{
1381 u8 opcode = READ_ONCE(sqe->opcode);
1382
1383 return !(opcode == IORING_OP_READ_FIXED ||
1384 opcode == IORING_OP_WRITE_FIXED);
1385}
1386
Jens Axboe2b188cc2019-01-07 10:46:33 -07001387static void io_sq_wq_submit_work(struct work_struct *work)
1388{
1389 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1390 struct sqe_submit *s = &req->submit;
1391 const struct io_uring_sqe *sqe = s->sqe;
1392 struct io_ring_ctx *ctx = req->ctx;
Jens Axboeedafcce2019-01-09 09:16:05 -07001393 mm_segment_t old_fs;
1394 bool needs_user;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001395 int ret;
1396
1397 /* Ensure we clear previously set forced non-block flag */
1398 req->flags &= ~REQ_F_FORCE_NONBLOCK;
1399 req->rw.ki_flags &= ~IOCB_NOWAIT;
1400
Jens Axboedef596e2019-01-09 08:59:42 -07001401 s->needs_lock = true;
Jens Axboeedafcce2019-01-09 09:16:05 -07001402 s->has_user = false;
1403
1404 /*
1405 * If we're doing IO to fixed buffers, we don't need to get/set
1406 * user context
1407 */
1408 needs_user = io_sqe_needs_user(s->sqe);
1409 if (needs_user) {
1410 if (!mmget_not_zero(ctx->sqo_mm)) {
1411 ret = -EFAULT;
1412 goto err;
1413 }
1414 use_mm(ctx->sqo_mm);
1415 old_fs = get_fs();
1416 set_fs(USER_DS);
1417 s->has_user = true;
1418 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001419
Jens Axboedef596e2019-01-09 08:59:42 -07001420 do {
Jens Axboe9a56a232019-01-09 09:06:50 -07001421 ret = __io_submit_sqe(ctx, req, s, false, NULL);
Jens Axboedef596e2019-01-09 08:59:42 -07001422 /*
1423 * We can get EAGAIN for polled IO even though we're forcing
1424 * a sync submission from here, since we can't wait for
1425 * request slots on the block side.
1426 */
1427 if (ret != -EAGAIN)
1428 break;
1429 cond_resched();
1430 } while (1);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001431
Jens Axboeedafcce2019-01-09 09:16:05 -07001432 if (needs_user) {
1433 set_fs(old_fs);
1434 unuse_mm(ctx->sqo_mm);
1435 mmput(ctx->sqo_mm);
1436 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001437err:
1438 if (ret) {
1439 io_cqring_add_event(ctx, sqe->user_data, ret, 0);
1440 io_free_req(req);
1441 }
1442
1443 /* async context always use a copy of the sqe */
1444 kfree(sqe);
1445}
1446
Jens Axboe9a56a232019-01-09 09:06:50 -07001447static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
1448 struct io_submit_state *state)
Jens Axboe2b188cc2019-01-07 10:46:33 -07001449{
1450 struct io_kiocb *req;
1451 ssize_t ret;
1452
1453 /* enforce forwards compatibility on users */
Jens Axboe6b063142019-01-10 22:13:58 -07001454 if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
Jens Axboe2b188cc2019-01-07 10:46:33 -07001455 return -EINVAL;
1456
Jens Axboe2579f912019-01-09 09:10:43 -07001457 req = io_get_req(ctx, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001458 if (unlikely(!req))
1459 return -EAGAIN;
1460
1461 req->rw.ki_filp = NULL;
1462
Jens Axboe9a56a232019-01-09 09:06:50 -07001463 ret = __io_submit_sqe(ctx, req, s, true, state);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001464 if (ret == -EAGAIN) {
1465 struct io_uring_sqe *sqe_copy;
1466
1467 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
1468 if (sqe_copy) {
1469 memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
1470 s->sqe = sqe_copy;
1471
1472 memcpy(&req->submit, s, sizeof(*s));
1473 INIT_WORK(&req->work, io_sq_wq_submit_work);
1474 queue_work(ctx->sqo_wq, &req->work);
1475 ret = 0;
1476 }
1477 }
1478 if (ret)
1479 io_free_req(req);
1480
1481 return ret;
1482}
1483
Jens Axboe9a56a232019-01-09 09:06:50 -07001484/*
1485 * Batched submission is done, ensure local IO is flushed out.
1486 */
1487static void io_submit_state_end(struct io_submit_state *state)
1488{
1489 blk_finish_plug(&state->plug);
1490 io_file_put(state, NULL);
Jens Axboe2579f912019-01-09 09:10:43 -07001491 if (state->free_reqs)
1492 kmem_cache_free_bulk(req_cachep, state->free_reqs,
1493 &state->reqs[state->cur_req]);
Jens Axboe9a56a232019-01-09 09:06:50 -07001494}
1495
1496/*
1497 * Start submission side cache.
1498 */
1499static void io_submit_state_start(struct io_submit_state *state,
1500 struct io_ring_ctx *ctx, unsigned max_ios)
1501{
1502 blk_start_plug(&state->plug);
Jens Axboe2579f912019-01-09 09:10:43 -07001503 state->free_reqs = 0;
Jens Axboe9a56a232019-01-09 09:06:50 -07001504 state->file = NULL;
1505 state->ios_left = max_ios;
1506}
1507
Jens Axboe2b188cc2019-01-07 10:46:33 -07001508static void io_commit_sqring(struct io_ring_ctx *ctx)
1509{
1510 struct io_sq_ring *ring = ctx->sq_ring;
1511
1512 if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
1513 /*
1514 * Ensure any loads from the SQEs are done at this point,
1515 * since once we write the new head, the application could
1516 * write new data to them.
1517 */
1518 smp_store_release(&ring->r.head, ctx->cached_sq_head);
1519
1520 /*
1521 * write side barrier of head update, app has read side. See
1522 * comment at the top of this file
1523 */
1524 smp_wmb();
1525 }
1526}
1527
1528/*
1529 * Undo last io_get_sqring()
1530 */
1531static void io_drop_sqring(struct io_ring_ctx *ctx)
1532{
1533 ctx->cached_sq_head--;
1534}
1535
1536/*
1537 * Fetch an sqe, if one is available. Note that s->sqe will point to memory
1538 * that is mapped by userspace. This means that care needs to be taken to
1539 * ensure that reads are stable, as we cannot rely on userspace always
1540 * being a good citizen. If members of the sqe are validated and then later
1541 * used, it's important that those reads are done through READ_ONCE() to
1542 * prevent a re-load down the line.
1543 */
1544static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
1545{
1546 struct io_sq_ring *ring = ctx->sq_ring;
1547 unsigned head;
1548
1549 /*
1550 * The cached sq head (or cq tail) serves two purposes:
1551 *
 1552 * 1) allows us to batch the cost of the user visible head
 1553 * updates.
1554 * 2) allows the kernel side to track the head on its own, even
1555 * though the application is the one updating it.
1556 */
1557 head = ctx->cached_sq_head;
1558 /* See comment at the top of this file */
1559 smp_rmb();
1560 if (head == READ_ONCE(ring->r.tail))
1561 return false;
1562
1563 head = READ_ONCE(ring->array[head & ctx->sq_mask]);
1564 if (head < ctx->sq_entries) {
1565 s->index = head;
1566 s->sqe = &ctx->sq_sqes[head];
1567 ctx->cached_sq_head++;
1568 return true;
1569 }
1570
1571 /* drop invalid entries */
1572 ctx->cached_sq_head++;
1573 ring->dropped++;
1574 /* See comment at the top of this file */
1575 smp_wmb();
1576 return false;
1577}
1578
Jens Axboe6c271ce2019-01-10 11:22:30 -07001579static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
1580 unsigned int nr, bool has_user, bool mm_fault)
1581{
1582 struct io_submit_state state, *statep = NULL;
1583 int ret, i, submitted = 0;
1584
1585 if (nr > IO_PLUG_THRESHOLD) {
1586 io_submit_state_start(&state, ctx, nr);
1587 statep = &state;
1588 }
1589
1590 for (i = 0; i < nr; i++) {
1591 if (unlikely(mm_fault)) {
1592 ret = -EFAULT;
1593 } else {
1594 sqes[i].has_user = has_user;
1595 sqes[i].needs_lock = true;
1596 sqes[i].needs_fixed_file = true;
1597 ret = io_submit_sqe(ctx, &sqes[i], statep);
1598 }
1599 if (!ret) {
1600 submitted++;
1601 continue;
1602 }
1603
1604 io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0);
1605 }
1606
1607 if (statep)
1608 io_submit_state_end(&state);
1609
1610 return submitted;
1611}
1612
1613static int io_sq_thread(void *data)
1614{
1615 struct sqe_submit sqes[IO_IOPOLL_BATCH];
1616 struct io_ring_ctx *ctx = data;
1617 struct mm_struct *cur_mm = NULL;
1618 mm_segment_t old_fs;
1619 DEFINE_WAIT(wait);
1620 unsigned inflight;
1621 unsigned long timeout;
1622
1623 old_fs = get_fs();
1624 set_fs(USER_DS);
1625
1626 timeout = inflight = 0;
1627 while (!kthread_should_stop() && !ctx->sqo_stop) {
1628 bool all_fixed, mm_fault = false;
1629 int i;
1630
1631 if (inflight) {
1632 unsigned nr_events = 0;
1633
1634 if (ctx->flags & IORING_SETUP_IOPOLL) {
1635 /*
1636 * We disallow the app entering submit/complete
1637 * with polling, but we still need to lock the
1638 * ring to prevent racing with polled issue
1639 * that got punted to a workqueue.
1640 */
1641 mutex_lock(&ctx->uring_lock);
1642 io_iopoll_check(ctx, &nr_events, 0);
1643 mutex_unlock(&ctx->uring_lock);
1644 } else {
1645 /*
1646 * Normal IO, just pretend everything completed.
1647 * We don't have to poll completions for that.
1648 */
1649 nr_events = inflight;
1650 }
1651
1652 inflight -= nr_events;
1653 if (!inflight)
1654 timeout = jiffies + ctx->sq_thread_idle;
1655 }
1656
1657 if (!io_get_sqring(ctx, &sqes[0])) {
1658 /*
1659 * We're polling. If we're within the defined idle
1660 * period, then let us spin without work before going
1661 * to sleep.
1662 */
1663 if (inflight || !time_after(jiffies, timeout)) {
1664 cpu_relax();
1665 continue;
1666 }
1667
1668 /*
1669 * Drop cur_mm before scheduling, we can't hold it for
1670 * long periods (or over schedule()). Do this before
1671 * adding ourselves to the waitqueue, as the unuse/drop
1672 * may sleep.
1673 */
1674 if (cur_mm) {
1675 unuse_mm(cur_mm);
1676 mmput(cur_mm);
1677 cur_mm = NULL;
1678 }
1679
1680 prepare_to_wait(&ctx->sqo_wait, &wait,
1681 TASK_INTERRUPTIBLE);
1682
1683 /* Tell userspace we may need a wakeup call */
1684 ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
1685 smp_wmb();
1686
1687 if (!io_get_sqring(ctx, &sqes[0])) {
1688 if (kthread_should_stop()) {
1689 finish_wait(&ctx->sqo_wait, &wait);
1690 break;
1691 }
1692 if (signal_pending(current))
1693 flush_signals(current);
1694 schedule();
1695 finish_wait(&ctx->sqo_wait, &wait);
1696
1697 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1698 smp_wmb();
1699 continue;
1700 }
1701 finish_wait(&ctx->sqo_wait, &wait);
1702
1703 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
1704 smp_wmb();
1705 }
1706
1707 i = 0;
1708 all_fixed = true;
1709 do {
1710 if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
1711 all_fixed = false;
1712
1713 i++;
1714 if (i == ARRAY_SIZE(sqes))
1715 break;
1716 } while (io_get_sqring(ctx, &sqes[i]));
1717
1718 /* Unless all new commands are FIXED regions, grab mm */
1719 if (!all_fixed && !cur_mm) {
1720 mm_fault = !mmget_not_zero(ctx->sqo_mm);
1721 if (!mm_fault) {
1722 use_mm(ctx->sqo_mm);
1723 cur_mm = ctx->sqo_mm;
1724 }
1725 }
1726
1727 inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
1728 mm_fault);
1729
1730 /* Commit SQ ring head once we've consumed all SQEs */
1731 io_commit_sqring(ctx);
1732 }
1733
1734 set_fs(old_fs);
1735 if (cur_mm) {
1736 unuse_mm(cur_mm);
1737 mmput(cur_mm);
1738 }
1739 return 0;
1740}
1741
Jens Axboe2b188cc2019-01-07 10:46:33 -07001742static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
1743{
Jens Axboe9a56a232019-01-09 09:06:50 -07001744 struct io_submit_state state, *statep = NULL;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001745 int i, ret = 0, submit = 0;
Jens Axboe2b188cc2019-01-07 10:46:33 -07001746
Jens Axboe9a56a232019-01-09 09:06:50 -07001747 if (to_submit > IO_PLUG_THRESHOLD) {
1748 io_submit_state_start(&state, ctx, to_submit);
1749 statep = &state;
1750 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07001751
1752 for (i = 0; i < to_submit; i++) {
1753 struct sqe_submit s;
1754
1755 if (!io_get_sqring(ctx, &s))
1756 break;
1757
1758 s.has_user = true;
Jens Axboedef596e2019-01-09 08:59:42 -07001759 s.needs_lock = false;
Jens Axboe6c271ce2019-01-10 11:22:30 -07001760 s.needs_fixed_file = false;
Jens Axboedef596e2019-01-09 08:59:42 -07001761
Jens Axboe9a56a232019-01-09 09:06:50 -07001762 ret = io_submit_sqe(ctx, &s, statep);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001763 if (ret) {
1764 io_drop_sqring(ctx);
1765 break;
1766 }
1767
1768 submit++;
1769 }
1770 io_commit_sqring(ctx);
1771
Jens Axboe9a56a232019-01-09 09:06:50 -07001772 if (statep)
1773 io_submit_state_end(statep);
Jens Axboe2b188cc2019-01-07 10:46:33 -07001774
1775 return submit ? submit : ret;
1776}
1777
1778static unsigned io_cqring_events(struct io_cq_ring *ring)
1779{
1780 return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
1781}
1782
1783/*
1784 * Wait until events become available, if we don't already have some. The
1785 * application must reap them itself, as they reside on the shared cq ring.
1786 */
1787static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
1788 const sigset_t __user *sig, size_t sigsz)
1789{
1790 struct io_cq_ring *ring = ctx->cq_ring;
1791 sigset_t ksigmask, sigsaved;
1792 DEFINE_WAIT(wait);
1793 int ret;
1794
1795 /* See comment at the top of this file */
1796 smp_rmb();
1797 if (io_cqring_events(ring) >= min_events)
1798 return 0;
1799
1800 if (sig) {
1801 ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
1802 if (ret)
1803 return ret;
1804 }
1805
1806 do {
1807 prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
1808
1809 ret = 0;
1810 /* See comment at the top of this file */
1811 smp_rmb();
1812 if (io_cqring_events(ring) >= min_events)
1813 break;
1814
1815 schedule();
1816
1817 ret = -EINTR;
1818 if (signal_pending(current))
1819 break;
1820 } while (1);
1821
1822 finish_wait(&ctx->wait, &wait);
1823
1824 if (sig)
1825 restore_user_sigmask(sig, &sigsaved);
1826
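	/*
	 * If completions arrived while we waited, report success even if a
	 * signal also came in.
	 */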
1827 return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
1828}
1829
Jens Axboe6b063142019-01-10 22:13:58 -07001830static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
1831{
1832#if defined(CONFIG_UNIX)
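	/*
	 * With UNIX sockets the registered file references are held by skbs
	 * queued on the ring socket; freeing those skbs drops the references
	 * via the SCM destructor, and the UNIX gc handles any cycles.
	 */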
1833 if (ctx->ring_sock) {
1834 struct sock *sock = ctx->ring_sock->sk;
1835 struct sk_buff *skb;
1836
1837 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
1838 kfree_skb(skb);
1839 }
1840#else
1841 int i;
1842
1843 for (i = 0; i < ctx->nr_user_files; i++)
1844 fput(ctx->user_files[i]);
1845#endif
1846}
1847
1848static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
1849{
1850 if (!ctx->user_files)
1851 return -ENXIO;
1852
1853 __io_sqe_files_unregister(ctx);
1854 kfree(ctx->user_files);
1855 ctx->user_files = NULL;
1856 ctx->nr_user_files = 0;
1857 return 0;
1858}
1859
Jens Axboe6c271ce2019-01-10 11:22:30 -07001860static void io_sq_thread_stop(struct io_ring_ctx *ctx)
1861{
1862 if (ctx->sqo_thread) {
1863 ctx->sqo_stop = 1;
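		/* make the ->sqo_stop store visible before we wake the thread */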
1864 mb();
1865 kthread_stop(ctx->sqo_thread);
1866 ctx->sqo_thread = NULL;
1867 }
1868}
1869
Jens Axboe6b063142019-01-10 22:13:58 -07001870static void io_finish_async(struct io_ring_ctx *ctx)
1871{
Jens Axboe6c271ce2019-01-10 11:22:30 -07001872 io_sq_thread_stop(ctx);
1873
Jens Axboe6b063142019-01-10 22:13:58 -07001874 if (ctx->sqo_wq) {
1875 destroy_workqueue(ctx->sqo_wq);
1876 ctx->sqo_wq = NULL;
1877 }
1878}
1879
1880#if defined(CONFIG_UNIX)
1881static void io_destruct_skb(struct sk_buff *skb)
1882{
1883 struct io_ring_ctx *ctx = skb->sk->sk_user_data;
1884
1885 io_finish_async(ctx);
1886 unix_destruct_scm(skb);
1887}
1888
1889/*
 1890 * Ensure the UNIX gc is aware of our file set, so we are certain that
 1891 * the io_uring can be safely unregistered on process exit, even if there
 1892 * are reference loops among the registered files.
1893 */
1894static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
1895{
1896 struct sock *sk = ctx->ring_sock->sk;
1897 struct scm_fp_list *fpl;
1898 struct sk_buff *skb;
1899 int i;
1900
1901 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
1902 unsigned long inflight = ctx->user->unix_inflight + nr;
1903
1904 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
1905 return -EMFILE;
1906 }
1907
1908 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
1909 if (!fpl)
1910 return -ENOMEM;
1911
1912 skb = alloc_skb(0, GFP_KERNEL);
1913 if (!skb) {
1914 kfree(fpl);
1915 return -ENOMEM;
1916 }
1917
1918 skb->sk = sk;
1919 skb->destructor = io_destruct_skb;
1920
1921 fpl->user = get_uid(ctx->user);
1922 for (i = 0; i < nr; i++) {
1923 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
1924 unix_inflight(fpl->user, fpl->fp[i]);
1925 }
1926
1927 fpl->max = fpl->count = nr;
1928 UNIXCB(skb).fp = fpl;
1929 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1930 skb_queue_head(&sk->sk_receive_queue, skb);
1931
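	/*
	 * The skb (via its SCM fp list) now carries a reference to each file,
	 * released when __io_sqe_files_unregister() frees the queued skbs, so
	 * drop the duplicate references taken by get_file() above.
	 */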
1932 for (i = 0; i < nr; i++)
1933 fput(fpl->fp[i]);
1934
1935 return 0;
1936}
1937
1938/*
 1939 * If UNIX sockets are enabled, fd passing can create reference cycles that
 1940 * regular reference counting cannot resolve. We rely on the UNIX garbage
 1941 * collector to take care of this problem for us.
1942 */
1943static int io_sqe_files_scm(struct io_ring_ctx *ctx)
1944{
1945 unsigned left, total;
1946 int ret = 0;
1947
1948 total = 0;
1949 left = ctx->nr_user_files;
1950 while (left) {
1951 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
1952 int ret;
1953
1954 ret = __io_sqe_files_scm(ctx, this_files, total);
1955 if (ret)
1956 break;
1957 left -= this_files;
1958 total += this_files;
1959 }
1960
1961 if (!ret)
1962 return 0;
1963
1964 while (total < ctx->nr_user_files) {
1965 fput(ctx->user_files[total]);
1966 total++;
1967 }
1968
1969 return ret;
1970}
1971#else
1972static int io_sqe_files_scm(struct io_ring_ctx *ctx)
1973{
1974 return 0;
1975}
1976#endif
1977
1978static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
1979 unsigned nr_args)
1980{
1981 __s32 __user *fds = (__s32 __user *) arg;
1982 int fd, ret = 0;
1983 unsigned i;
1984
1985 if (ctx->user_files)
1986 return -EBUSY;
1987 if (!nr_args)
1988 return -EINVAL;
1989 if (nr_args > IORING_MAX_FIXED_FILES)
1990 return -EMFILE;
1991
1992 ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
1993 if (!ctx->user_files)
1994 return -ENOMEM;
1995
1996 for (i = 0; i < nr_args; i++) {
1997 ret = -EFAULT;
1998 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
1999 break;
2000
2001 ctx->user_files[i] = fget(fd);
2002
2003 ret = -EBADF;
2004 if (!ctx->user_files[i])
2005 break;
2006 /*
2007 * Don't allow io_uring instances to be registered. If UNIX
2008 * isn't enabled, then this causes a reference cycle and this
2009 * instance can never get freed. If UNIX is enabled we'll
2010 * handle it just fine, but there's still no point in allowing
2011 * a ring fd as it doesn't support regular read/write anyway.
2012 */
2013 if (ctx->user_files[i]->f_op == &io_uring_fops) {
2014 fput(ctx->user_files[i]);
2015 break;
2016 }
2017 ctx->nr_user_files++;
2018 ret = 0;
2019 }
2020
2021 if (ret) {
2022 for (i = 0; i < ctx->nr_user_files; i++)
2023 fput(ctx->user_files[i]);
2024
 2025		kfree(ctx->user_files);
		/* clear the pointer too, so a later register doesn't see stale state */
		ctx->user_files = NULL;
 2026		ctx->nr_user_files = 0;
2027 return ret;
2028 }
2029
2030 ret = io_sqe_files_scm(ctx);
2031 if (ret)
2032 io_sqe_files_unregister(ctx);
2033
2034 return ret;
2035}
2036
Jens Axboe6c271ce2019-01-10 11:22:30 -07002037static int io_sq_offload_start(struct io_ring_ctx *ctx,
2038 struct io_uring_params *p)
Jens Axboe2b188cc2019-01-07 10:46:33 -07002039{
2040 int ret;
2041
Jens Axboe6c271ce2019-01-10 11:22:30 -07002042 init_waitqueue_head(&ctx->sqo_wait);
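	/*
	 * Pin the submitter's mm; the SQPOLL thread and async workers attach
	 * to it with use_mm() when a request needs userspace memory.
	 */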
Jens Axboe2b188cc2019-01-07 10:46:33 -07002043 mmgrab(current->mm);
2044 ctx->sqo_mm = current->mm;
2045
Jens Axboe6c271ce2019-01-10 11:22:30 -07002046 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
2047 if (!ctx->sq_thread_idle)
2048 ctx->sq_thread_idle = HZ;
2049
2050 ret = -EINVAL;
2051 if (!cpu_possible(p->sq_thread_cpu))
2052 goto err;
2053
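	/*
	 * With SQPOLL a dedicated kernel thread polls the SQ ring for new
	 * submissions; SQ_AFF additionally pins that thread to sq_thread_cpu.
	 */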
2054 if (ctx->flags & IORING_SETUP_SQPOLL) {
2055 if (p->flags & IORING_SETUP_SQ_AFF) {
2056 int cpu;
2057
2058 cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
2059 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
2060 ctx, cpu,
2061 "io_uring-sq");
2062 } else {
2063 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
2064 "io_uring-sq");
2065 }
2066 if (IS_ERR(ctx->sqo_thread)) {
2067 ret = PTR_ERR(ctx->sqo_thread);
2068 ctx->sqo_thread = NULL;
2069 goto err;
2070 }
2071 wake_up_process(ctx->sqo_thread);
2072 } else if (p->flags & IORING_SETUP_SQ_AFF) {
2073 /* Can't have SQ_AFF without SQPOLL */
2074 ret = -EINVAL;
2075 goto err;
2076 }
2077
Jens Axboe2b188cc2019-01-07 10:46:33 -07002078	/* Size the workqueue to queue depth or 2 * online CPUs, whichever is smaller */
2079 ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
2080 min(ctx->sq_entries - 1, 2 * num_online_cpus()));
2081 if (!ctx->sqo_wq) {
2082 ret = -ENOMEM;
2083 goto err;
2084 }
2085
2086 return 0;
2087err:
Jens Axboe6c271ce2019-01-10 11:22:30 -07002088 io_sq_thread_stop(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002089 mmdrop(ctx->sqo_mm);
2090 ctx->sqo_mm = NULL;
2091 return ret;
2092}
2093
2094static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
2095{
2096 atomic_long_sub(nr_pages, &user->locked_vm);
2097}
2098
2099static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
2100{
2101 unsigned long page_limit, cur_pages, new_pages;
2102
2103 /* Don't allow more pages than we can safely lock */
2104 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
2105
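	/* charge the pages against RLIMIT_MEMLOCK with a lockless cmpxchg loop */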
2106 do {
2107 cur_pages = atomic_long_read(&user->locked_vm);
2108 new_pages = cur_pages + nr_pages;
2109 if (new_pages > page_limit)
2110 return -ENOMEM;
2111 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
2112 new_pages) != cur_pages);
2113
2114 return 0;
2115}
2116
2117static void io_mem_free(void *ptr)
2118{
2119 struct page *page = virt_to_head_page(ptr);
2120
2121 if (put_page_testzero(page))
2122 free_compound_page(page);
2123}
2124
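/*
 * Ring memory is allocated as zeroed compound pages so that io_uring_mmap()
 * can hand the whole region to userspace with remap_pfn_range().
 */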
2125static void *io_mem_alloc(size_t size)
2126{
2127 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
2128 __GFP_NORETRY;
2129
2130 return (void *) __get_free_pages(gfp_flags, get_order(size));
2131}
2132
2133static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
2134{
2135 struct io_sq_ring *sq_ring;
2136 struct io_cq_ring *cq_ring;
2137 size_t bytes;
2138
2139 bytes = struct_size(sq_ring, array, sq_entries);
2140 bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
2141 bytes += struct_size(cq_ring, cqes, cq_entries);
2142
2143 return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
2144}
2145
Jens Axboeedafcce2019-01-09 09:16:05 -07002146static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
2147{
2148 int i, j;
2149
2150 if (!ctx->user_bufs)
2151 return -ENXIO;
2152
2153 for (i = 0; i < ctx->nr_user_bufs; i++) {
2154 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
2155
2156 for (j = 0; j < imu->nr_bvecs; j++)
2157 put_page(imu->bvec[j].bv_page);
2158
2159 if (ctx->account_mem)
2160 io_unaccount_mem(ctx->user, imu->nr_bvecs);
2161 kfree(imu->bvec);
2162 imu->nr_bvecs = 0;
2163 }
2164
2165 kfree(ctx->user_bufs);
2166 ctx->user_bufs = NULL;
2167 ctx->nr_user_bufs = 0;
2168 return 0;
2169}
2170
2171static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
2172 void __user *arg, unsigned index)
2173{
2174 struct iovec __user *src;
2175
2176#ifdef CONFIG_COMPAT
2177 if (ctx->compat) {
2178 struct compat_iovec __user *ciovs;
2179 struct compat_iovec ciov;
2180
2181 ciovs = (struct compat_iovec __user *) arg;
2182 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
2183 return -EFAULT;
2184
2185 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
2186 dst->iov_len = ciov.iov_len;
2187 return 0;
2188 }
2189#endif
2190 src = (struct iovec __user *) arg;
2191 if (copy_from_user(dst, &src[index], sizeof(*dst)))
2192 return -EFAULT;
2193 return 0;
2194}
2195
2196static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
2197 unsigned nr_args)
2198{
2199 struct vm_area_struct **vmas = NULL;
2200 struct page **pages = NULL;
2201 int i, j, got_pages = 0;
2202 int ret = -EINVAL;
2203
2204 if (ctx->user_bufs)
2205 return -EBUSY;
2206 if (!nr_args || nr_args > UIO_MAXIOV)
2207 return -EINVAL;
2208
2209 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
2210 GFP_KERNEL);
2211 if (!ctx->user_bufs)
2212 return -ENOMEM;
2213
2214 for (i = 0; i < nr_args; i++) {
2215 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
2216 unsigned long off, start, end, ubuf;
2217 int pret, nr_pages;
2218 struct iovec iov;
2219 size_t size;
2220
2221 ret = io_copy_iov(ctx, &iov, arg, i);
2222 if (ret)
2223 break;
2224
2225 /*
 2226		 * Don't impose further constraints on the size or placement of
 2227		 * the buffer here; we'll return -EINVAL when IO is submitted if
 2228		 * they turn out to be wrong.
2229 */
2230 ret = -EFAULT;
2231 if (!iov.iov_base || !iov.iov_len)
2232 goto err;
2233
2234 /* arbitrary limit, but we need something */
2235 if (iov.iov_len > SZ_1G)
2236 goto err;
2237
2238 ubuf = (unsigned long) iov.iov_base;
2239 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
2240 start = ubuf >> PAGE_SHIFT;
2241 nr_pages = end - start;
2242
2243 if (ctx->account_mem) {
2244 ret = io_account_mem(ctx->user, nr_pages);
2245 if (ret)
2246 goto err;
2247 }
2248
2249 ret = 0;
2250 if (!pages || nr_pages > got_pages) {
2251 kfree(vmas);
2252 kfree(pages);
2253 pages = kmalloc_array(nr_pages, sizeof(struct page *),
2254 GFP_KERNEL);
2255 vmas = kmalloc_array(nr_pages,
2256 sizeof(struct vm_area_struct *),
2257 GFP_KERNEL);
2258 if (!pages || !vmas) {
2259 ret = -ENOMEM;
2260 if (ctx->account_mem)
2261 io_unaccount_mem(ctx->user, nr_pages);
2262 goto err;
2263 }
2264 got_pages = nr_pages;
2265 }
2266
2267 imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
2268 GFP_KERNEL);
2269 ret = -ENOMEM;
2270 if (!imu->bvec) {
2271 if (ctx->account_mem)
2272 io_unaccount_mem(ctx->user, nr_pages);
2273 goto err;
2274 }
2275
2276 ret = 0;
2277 down_read(&current->mm->mmap_sem);
2278 pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
2279 pages, vmas);
2280 if (pret == nr_pages) {
2281 /* don't support file backed memory */
2282 for (j = 0; j < nr_pages; j++) {
2283 struct vm_area_struct *vma = vmas[j];
2284
2285 if (vma->vm_file &&
2286 !is_file_hugepages(vma->vm_file)) {
2287 ret = -EOPNOTSUPP;
2288 break;
2289 }
2290 }
2291 } else {
2292 ret = pret < 0 ? pret : -EFAULT;
2293 }
2294 up_read(&current->mm->mmap_sem);
2295 if (ret) {
2296 /*
 2297		 * If we only got a partial mapping, or found file-backed VMAs,
 2298		 * release any pages we did pin.
2299 */
2300 if (pret > 0) {
2301 for (j = 0; j < pret; j++)
2302 put_page(pages[j]);
2303 }
2304 if (ctx->account_mem)
2305 io_unaccount_mem(ctx->user, nr_pages);
2306 goto err;
2307 }
2308
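		/*
		 * Carve the pinned pages into a bio_vec array; only the first
		 * page may start at a non-zero offset within the page.
		 */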
2309 off = ubuf & ~PAGE_MASK;
2310 size = iov.iov_len;
2311 for (j = 0; j < nr_pages; j++) {
2312 size_t vec_len;
2313
2314 vec_len = min_t(size_t, size, PAGE_SIZE - off);
2315 imu->bvec[j].bv_page = pages[j];
2316 imu->bvec[j].bv_len = vec_len;
2317 imu->bvec[j].bv_offset = off;
2318 off = 0;
2319 size -= vec_len;
2320 }
2321 /* store original address for later verification */
2322 imu->ubuf = ubuf;
2323 imu->len = iov.iov_len;
2324 imu->nr_bvecs = nr_pages;
2325
2326 ctx->nr_user_bufs++;
2327 }
2328 kfree(pages);
2329 kfree(vmas);
2330 return 0;
2331err:
2332 kfree(pages);
2333 kfree(vmas);
2334 io_sqe_buffer_unregister(ctx);
2335 return ret;
2336}
2337
Jens Axboe2b188cc2019-01-07 10:46:33 -07002338static void io_ring_ctx_free(struct io_ring_ctx *ctx)
2339{
Jens Axboe6b063142019-01-10 22:13:58 -07002340 io_finish_async(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002341 if (ctx->sqo_mm)
2342 mmdrop(ctx->sqo_mm);
Jens Axboedef596e2019-01-09 08:59:42 -07002343
2344 io_iopoll_reap_events(ctx);
Jens Axboeedafcce2019-01-09 09:16:05 -07002345 io_sqe_buffer_unregister(ctx);
Jens Axboe6b063142019-01-10 22:13:58 -07002346 io_sqe_files_unregister(ctx);
Jens Axboedef596e2019-01-09 08:59:42 -07002347
Jens Axboe2b188cc2019-01-07 10:46:33 -07002348#if defined(CONFIG_UNIX)
2349 if (ctx->ring_sock)
2350 sock_release(ctx->ring_sock);
2351#endif
2352
2353 io_mem_free(ctx->sq_ring);
2354 io_mem_free(ctx->sq_sqes);
2355 io_mem_free(ctx->cq_ring);
2356
2357 percpu_ref_exit(&ctx->refs);
2358 if (ctx->account_mem)
2359 io_unaccount_mem(ctx->user,
2360 ring_pages(ctx->sq_entries, ctx->cq_entries));
2361 free_uid(ctx->user);
2362 kfree(ctx);
2363}
2364
2365static __poll_t io_uring_poll(struct file *file, poll_table *wait)
2366{
2367 struct io_ring_ctx *ctx = file->private_data;
2368 __poll_t mask = 0;
2369
2370 poll_wait(file, &ctx->cq_wait, wait);
2371 /* See comment at the top of this file */
2372 smp_rmb();
2373 if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
2374 mask |= EPOLLOUT | EPOLLWRNORM;
2375 if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
2376 mask |= EPOLLIN | EPOLLRDNORM;
2377
2378 return mask;
2379}
2380
2381static int io_uring_fasync(int fd, struct file *file, int on)
2382{
2383 struct io_ring_ctx *ctx = file->private_data;
2384
2385 return fasync_helper(fd, file, on, &ctx->cq_fasync);
2386}
2387
2388static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2389{
2390 mutex_lock(&ctx->uring_lock);
2391 percpu_ref_kill(&ctx->refs);
2392 mutex_unlock(&ctx->uring_lock);
2393
Jens Axboe221c5eb2019-01-17 09:41:58 -07002394 io_poll_remove_all(ctx);
Jens Axboedef596e2019-01-09 08:59:42 -07002395 io_iopoll_reap_events(ctx);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002396 wait_for_completion(&ctx->ctx_done);
2397 io_ring_ctx_free(ctx);
2398}
2399
2400static int io_uring_release(struct inode *inode, struct file *file)
2401{
2402 struct io_ring_ctx *ctx = file->private_data;
2403
2404 file->private_data = NULL;
2405 io_ring_ctx_wait_and_kill(ctx);
2406 return 0;
2407}
2408
2409static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2410{
2411 loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
2412 unsigned long sz = vma->vm_end - vma->vm_start;
2413 struct io_ring_ctx *ctx = file->private_data;
2414 unsigned long pfn;
2415 struct page *page;
2416 void *ptr;
2417
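	/*
	 * The mmap offset selects which shared region the application wants:
	 * the SQ ring, the SQE array, or the CQ ring.
	 */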
2418 switch (offset) {
2419 case IORING_OFF_SQ_RING:
2420 ptr = ctx->sq_ring;
2421 break;
2422 case IORING_OFF_SQES:
2423 ptr = ctx->sq_sqes;
2424 break;
2425 case IORING_OFF_CQ_RING:
2426 ptr = ctx->cq_ring;
2427 break;
2428 default:
2429 return -EINVAL;
2430 }
2431
2432 page = virt_to_head_page(ptr);
2433 if (sz > (PAGE_SIZE << compound_order(page)))
2434 return -EINVAL;
2435
2436 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
2437 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2438}
2439
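/*
 * Submit and/or complete IO on an io_uring instance: consume up to to_submit
 * SQEs from the SQ ring and, if IORING_ENTER_GETEVENTS is set, wait for at
 * least min_complete completions (optionally under a temporary signal mask).
 * With IORING_SETUP_SQPOLL the kernel thread handles submission, so this only
 * wakes it up if IORING_ENTER_SQ_WAKEUP is set.
 */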
2440SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
2441 u32, min_complete, u32, flags, const sigset_t __user *, sig,
2442 size_t, sigsz)
2443{
2444 struct io_ring_ctx *ctx;
2445 long ret = -EBADF;
2446 int submitted = 0;
2447 struct fd f;
2448
Jens Axboe6c271ce2019-01-10 11:22:30 -07002449 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
Jens Axboe2b188cc2019-01-07 10:46:33 -07002450 return -EINVAL;
2451
2452 f = fdget(fd);
2453 if (!f.file)
2454 return -EBADF;
2455
2456 ret = -EOPNOTSUPP;
2457 if (f.file->f_op != &io_uring_fops)
2458 goto out_fput;
2459
2460 ret = -ENXIO;
2461 ctx = f.file->private_data;
2462 if (!percpu_ref_tryget(&ctx->refs))
2463 goto out_fput;
2464
Jens Axboe6c271ce2019-01-10 11:22:30 -07002465 /*
2466 * For SQ polling, the thread will do all submissions and completions.
2467 * Just return the requested submit count, and wake the thread if
2468 * we were asked to.
2469 */
2470 if (ctx->flags & IORING_SETUP_SQPOLL) {
2471 if (flags & IORING_ENTER_SQ_WAKEUP)
2472 wake_up(&ctx->sqo_wait);
2473 submitted = to_submit;
2474 goto out_ctx;
2475 }
2476
Jens Axboe2b188cc2019-01-07 10:46:33 -07002477 ret = 0;
2478 if (to_submit) {
2479 to_submit = min(to_submit, ctx->sq_entries);
2480
2481 mutex_lock(&ctx->uring_lock);
2482 submitted = io_ring_submit(ctx, to_submit);
2483 mutex_unlock(&ctx->uring_lock);
2484
2485 if (submitted < 0)
2486 goto out_ctx;
2487 }
2488 if (flags & IORING_ENTER_GETEVENTS) {
Jens Axboedef596e2019-01-09 08:59:42 -07002489 unsigned nr_events = 0;
2490
Jens Axboe2b188cc2019-01-07 10:46:33 -07002491 min_complete = min(min_complete, ctx->cq_entries);
2492
2493 /*
2494 * The application could have included the 'to_submit' count
2495 * in how many events it wanted to wait for. If we failed to
2496 * submit the desired count, we may need to adjust the number
2497 * of events to poll/wait for.
2498 */
2499 if (submitted < to_submit)
2500 min_complete = min_t(unsigned, submitted, min_complete);
2501
Jens Axboedef596e2019-01-09 08:59:42 -07002502 if (ctx->flags & IORING_SETUP_IOPOLL) {
2503 mutex_lock(&ctx->uring_lock);
2504 ret = io_iopoll_check(ctx, &nr_events, min_complete);
2505 mutex_unlock(&ctx->uring_lock);
2506 } else {
2507 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
2508 }
Jens Axboe2b188cc2019-01-07 10:46:33 -07002509 }
2510
2511out_ctx:
2512 io_ring_drop_ctx_refs(ctx, 1);
2513out_fput:
2514 fdput(f);
2515 return submitted ? submitted : ret;
2516}
2517
2518static const struct file_operations io_uring_fops = {
2519 .release = io_uring_release,
2520 .mmap = io_uring_mmap,
2521 .poll = io_uring_poll,
2522 .fasync = io_uring_fasync,
2523};
2524
2525static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
2526 struct io_uring_params *p)
2527{
2528 struct io_sq_ring *sq_ring;
2529 struct io_cq_ring *cq_ring;
2530 size_t size;
2531
2532 sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
2533 if (!sq_ring)
2534 return -ENOMEM;
2535
2536 ctx->sq_ring = sq_ring;
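	/* entries are rounded up to a power of two, so the mask is entries - 1 */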
2537 sq_ring->ring_mask = p->sq_entries - 1;
2538 sq_ring->ring_entries = p->sq_entries;
2539 ctx->sq_mask = sq_ring->ring_mask;
2540 ctx->sq_entries = sq_ring->ring_entries;
2541
2542 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
2543 if (size == SIZE_MAX)
2544 return -EOVERFLOW;
2545
2546 ctx->sq_sqes = io_mem_alloc(size);
2547 if (!ctx->sq_sqes) {
2548 io_mem_free(ctx->sq_ring);
2549 return -ENOMEM;
2550 }
2551
2552 cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
2553 if (!cq_ring) {
2554 io_mem_free(ctx->sq_ring);
2555 io_mem_free(ctx->sq_sqes);
2556 return -ENOMEM;
2557 }
2558
2559 ctx->cq_ring = cq_ring;
2560 cq_ring->ring_mask = p->cq_entries - 1;
2561 cq_ring->ring_entries = p->cq_entries;
2562 ctx->cq_mask = cq_ring->ring_mask;
2563 ctx->cq_entries = cq_ring->ring_entries;
2564 return 0;
2565}
2566
2567/*
 2568 * Allocate an anonymous fd; this is what constitutes the application
2569 * visible backing of an io_uring instance. The application mmaps this
2570 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
2571 * we have to tie this fd to a socket for file garbage collection purposes.
2572 */
2573static int io_uring_get_fd(struct io_ring_ctx *ctx)
2574{
2575 struct file *file;
2576 int ret;
2577
2578#if defined(CONFIG_UNIX)
2579 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
2580 &ctx->ring_sock);
2581 if (ret)
2582 return ret;
2583#endif
2584
2585 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2586 if (ret < 0)
2587 goto err;
2588
2589 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
2590 O_RDWR | O_CLOEXEC);
2591 if (IS_ERR(file)) {
2592 put_unused_fd(ret);
2593 ret = PTR_ERR(file);
2594 goto err;
2595 }
2596
2597#if defined(CONFIG_UNIX)
2598 ctx->ring_sock->file = file;
Jens Axboe6b063142019-01-10 22:13:58 -07002599 ctx->ring_sock->sk->sk_user_data = ctx;
Jens Axboe2b188cc2019-01-07 10:46:33 -07002600#endif
2601 fd_install(ret, file);
2602 return ret;
2603err:
2604#if defined(CONFIG_UNIX)
2605 sock_release(ctx->ring_sock);
2606 ctx->ring_sock = NULL;
2607#endif
2608 return ret;
2609}
2610
2611static int io_uring_create(unsigned entries, struct io_uring_params *p)
2612{
2613 struct user_struct *user = NULL;
2614 struct io_ring_ctx *ctx;
2615 bool account_mem;
2616 int ret;
2617
2618 if (!entries || entries > IORING_MAX_ENTRIES)
2619 return -EINVAL;
2620
2621 /*
2622 * Use twice as many entries for the CQ ring. It's possible for the
2623 * application to drive a higher depth than the size of the SQ ring,
 2624 * since the sqes are only used at submission time. This gives the
 2625 * application some room to overcommit.
2626 */
2627 p->sq_entries = roundup_pow_of_two(entries);
2628 p->cq_entries = 2 * p->sq_entries;
2629
2630 user = get_uid(current_user());
2631 account_mem = !capable(CAP_IPC_LOCK);
2632
2633 if (account_mem) {
2634 ret = io_account_mem(user,
2635 ring_pages(p->sq_entries, p->cq_entries));
2636 if (ret) {
2637 free_uid(user);
2638 return ret;
2639 }
2640 }
2641
2642 ctx = io_ring_ctx_alloc(p);
2643 if (!ctx) {
2644 if (account_mem)
2645 io_unaccount_mem(user, ring_pages(p->sq_entries,
2646 p->cq_entries));
2647 free_uid(user);
2648 return -ENOMEM;
2649 }
2650 ctx->compat = in_compat_syscall();
2651 ctx->account_mem = account_mem;
2652 ctx->user = user;
2653
2654 ret = io_allocate_scq_urings(ctx, p);
2655 if (ret)
2656 goto err;
2657
Jens Axboe6c271ce2019-01-10 11:22:30 -07002658 ret = io_sq_offload_start(ctx, p);
Jens Axboe2b188cc2019-01-07 10:46:33 -07002659 if (ret)
2660 goto err;
2661
2662 ret = io_uring_get_fd(ctx);
2663 if (ret < 0)
2664 goto err;
2665
2666 memset(&p->sq_off, 0, sizeof(p->sq_off));
2667 p->sq_off.head = offsetof(struct io_sq_ring, r.head);
2668 p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
2669 p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
2670 p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
2671 p->sq_off.flags = offsetof(struct io_sq_ring, flags);
2672 p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
2673 p->sq_off.array = offsetof(struct io_sq_ring, array);
2674
2675 memset(&p->cq_off, 0, sizeof(p->cq_off));
2676 p->cq_off.head = offsetof(struct io_cq_ring, r.head);
2677 p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
2678 p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
2679 p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
2680 p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
2681 p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
2682 return ret;
2683err:
2684 io_ring_ctx_wait_and_kill(ctx);
2685 return ret;
2686}
2687
2688/*
 2689 * Sets up an io_uring context and returns the fd. The application asks for a
 2690 * ring size; we return the actual sq/cq ring sizes (among other things) in the
 2691 * params structure passed in.
2692 */
2693static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
2694{
2695 struct io_uring_params p;
2696 long ret;
2697 int i;
2698
2699 if (copy_from_user(&p, params, sizeof(p)))
2700 return -EFAULT;
2701 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
2702 if (p.resv[i])
2703 return -EINVAL;
2704 }
2705
Jens Axboe6c271ce2019-01-10 11:22:30 -07002706 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
2707 IORING_SETUP_SQ_AFF))
Jens Axboe2b188cc2019-01-07 10:46:33 -07002708 return -EINVAL;
2709
2710 ret = io_uring_create(entries, &p);
2711 if (ret < 0)
2712 return ret;
2713
2714 if (copy_to_user(params, &p, sizeof(p)))
2715 return -EFAULT;
2716
2717 return ret;
2718}
2719
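/*
 * Rough sketch of how an application drives this (there is no libc wrapper,
 * so the raw syscall is used):
 *
 *	struct io_uring_params p = { };
 *	int ring_fd = syscall(__NR_io_uring_setup, 128, &p);
 *
 * after which the SQ/CQ rings and the SQE array are mmap()'ed from ring_fd at
 * IORING_OFF_SQ_RING, IORING_OFF_CQ_RING and IORING_OFF_SQES, using the sizes
 * and offsets returned in p.sq_off and p.cq_off.
 */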
2720SYSCALL_DEFINE2(io_uring_setup, u32, entries,
2721 struct io_uring_params __user *, params)
2722{
2723 return io_uring_setup(entries, params);
2724}
2725
Jens Axboeedafcce2019-01-09 09:16:05 -07002726static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
2727 void __user *arg, unsigned nr_args)
2728{
2729 int ret;
2730
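	/*
	 * Quiesce the ctx: kill the percpu ref and wait for in-flight users
	 * to finish, so the registration update can't race with requests.
	 * The ref is revived before returning.
	 */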
2731 percpu_ref_kill(&ctx->refs);
2732 wait_for_completion(&ctx->ctx_done);
2733
2734 switch (opcode) {
2735 case IORING_REGISTER_BUFFERS:
2736 ret = io_sqe_buffer_register(ctx, arg, nr_args);
2737 break;
2738 case IORING_UNREGISTER_BUFFERS:
2739 ret = -EINVAL;
2740 if (arg || nr_args)
2741 break;
2742 ret = io_sqe_buffer_unregister(ctx);
2743 break;
Jens Axboe6b063142019-01-10 22:13:58 -07002744 case IORING_REGISTER_FILES:
2745 ret = io_sqe_files_register(ctx, arg, nr_args);
2746 break;
2747 case IORING_UNREGISTER_FILES:
2748 ret = -EINVAL;
2749 if (arg || nr_args)
2750 break;
2751 ret = io_sqe_files_unregister(ctx);
2752 break;
Jens Axboeedafcce2019-01-09 09:16:05 -07002753 default:
2754 ret = -EINVAL;
2755 break;
2756 }
2757
2758 /* bring the ctx back to life */
2759 reinit_completion(&ctx->ctx_done);
2760 percpu_ref_reinit(&ctx->refs);
2761 return ret;
2762}
2763
2764SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
2765 void __user *, arg, unsigned int, nr_args)
2766{
2767 struct io_ring_ctx *ctx;
2768 long ret = -EBADF;
2769 struct fd f;
2770
2771 f = fdget(fd);
2772 if (!f.file)
2773 return -EBADF;
2774
2775 ret = -EOPNOTSUPP;
2776 if (f.file->f_op != &io_uring_fops)
2777 goto out_fput;
2778
2779 ctx = f.file->private_data;
2780
2781 mutex_lock(&ctx->uring_lock);
2782 ret = __io_uring_register(ctx, opcode, arg, nr_args);
2783 mutex_unlock(&ctx->uring_lock);
2784out_fput:
2785 fdput(f);
2786 return ret;
2787}
2788
Jens Axboe2b188cc2019-01-07 10:46:33 -07002789static int __init io_uring_init(void)
2790{
2791 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
2792 return 0;
2793};
2794__initcall(io_uring_init);