io_uring: batch account ->req_issue and task struct references

Identical to how we handle the ctx reference counts, increase by the
batch we're expecting to submit, and drop any unused residual in the
slow path. The request alloc-and-issue path is very hot, and this makes
a noticeable difference by avoiding two atomic incs for each
individual request.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c409af7..85e8ad9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6429,8 +6429,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
 	req->task = current;
-	get_task_struct(req->task);
-	atomic_long_inc(&req->task->io_uring->req_issue);
 	req->result = 0;
 
 	if (unlikely(req->opcode >= IORING_OP_LAST))
@@ -6488,6 +6486,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	if (!percpu_ref_tryget_many(&ctx->refs, nr))
 		return -EAGAIN;
 
+	atomic_long_add(nr, &current->io_uring->req_issue);
+	refcount_add(nr, &current->usage);
+
 	io_submit_state_start(&state, ctx, nr);
 
 	for (i = 0; i < nr; i++) {
@@ -6530,6 +6531,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
 
 		percpu_ref_put_many(&ctx->refs, nr - ref_used);
+		atomic_long_sub(nr - ref_used, &current->io_uring->req_issue);
+		put_task_struct_many(current, nr - ref_used);
 	}
 	if (link)
 		io_queue_link_head(link, &state.comp);