diff options
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r-- | fs/io_uring.c | 1390 |
1 files changed, 895 insertions, 495 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c index 4c030a92de795..405be10da73d4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -69,6 +69,7 @@ #include <linux/nospec.h> #include <linux/sizes.h> #include <linux/hugetlb.h> +#include <linux/highmem.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -144,7 +145,7 @@ struct io_rings { /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure - * there are not more requests pending thatn there is space in + * there are not more requests pending than there is space in * the completion queue. * * Written by the kernel, shouldn't be modified by the @@ -186,6 +187,7 @@ struct io_ring_ctx { bool compat; bool account_mem; bool cq_overflow_flushed; + bool drain_next; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -236,6 +238,8 @@ struct io_ring_ctx { struct user_struct *user; + const struct cred *creds; + /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ struct completion *completions; @@ -271,23 +275,14 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct list_head poll_list; - struct rb_root cancel_tree; + struct hlist_head *cancel_hash; + unsigned cancel_hash_bits; spinlock_t inflight_lock; struct list_head inflight_list; } ____cacheline_aligned_in_smp; }; -struct sqe_submit { - const struct io_uring_sqe *sqe; - struct file *ring_file; - int ring_fd; - u32 sequence; - bool has_user; - bool in_async; - bool needs_fixed_file; -}; - /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in <linux/fs.h> @@ -298,12 +293,43 @@ struct io_poll_iocb { __poll_t events; bool done; bool canceled; - struct wait_queue_entry wait; + struct wait_queue_entry *wait; }; -struct io_timeout { - struct file *file; +struct io_timeout_data { + struct io_kiocb *req; struct hrtimer timer; + struct timespec64 ts; + enum hrtimer_mode mode; + u32 seq_offset; +}; + +struct io_async_connect { + struct sockaddr_storage address; +}; + +struct io_async_msghdr { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + struct sockaddr __user *uaddr; + struct msghdr msg; +}; + +struct io_async_rw { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + ssize_t nr_segs; + ssize_t size; +}; + +struct io_async_ctx { + struct io_uring_sqe sqe; + union { + struct io_async_rw rw; + struct io_async_msghdr msg; + struct io_async_connect connect; + struct io_timeout_data timeout; + }; }; /* @@ -317,15 +343,20 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; - struct io_timeout timeout; }; - struct sqe_submit submit; + const struct io_uring_sqe *sqe; + struct io_async_ctx *io; + struct file *ring_file; + int ring_fd; + bool has_user; + bool in_async; + bool needs_fixed_file; struct io_ring_ctx *ctx; union { struct list_head list; - struct rb_node rb_node; + struct hlist_node hash_node; }; struct list_head link_list; unsigned int flags; @@ -333,13 +364,13 @@ struct io_kiocb { #define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_SEQ_PREV 8 /* sequential with previous */ +#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ #define REQ_F_IO_DRAINED 32 /* drain done */ #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ +#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */ #define REQ_F_TIMEOUT 1024 /* timeout request */ #define REQ_F_ISREG 2048 /* regular file */ #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ @@ -383,6 +414,9 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res); static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); +static void __io_double_put_req(struct io_kiocb *req); +static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); +static void io_queue_linked_timeout(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -411,6 +445,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -424,6 +459,21 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx->completions) goto err; + /* + * Use 5 bits less than the max cq entries, that should give us around + * 32 entries per hash list if totally full and uniformly spread. + */ + hash_bits = ilog2(p->cq_entries); + hash_bits -= 5; + if (hash_bits <= 0) + hash_bits = 1; + ctx->cancel_hash_bits = hash_bits; + ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), + GFP_KERNEL); + if (!ctx->cancel_hash) + goto err; + __hash_init(ctx->cancel_hash, 1U << hash_bits); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -437,7 +487,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); - ctx->cancel_tree = RB_ROOT; INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -448,6 +497,7 @@ err: if (ctx->fallback_req) kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx->completions); + kfree(ctx->cancel_hash); kfree(ctx); return NULL; } @@ -521,12 +571,13 @@ static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) opcode == IORING_OP_WRITE_FIXED); } -static inline bool io_prep_async_work(struct io_kiocb *req) +static inline bool io_prep_async_work(struct io_kiocb *req, + struct io_kiocb **link) { bool do_hashed = false; - if (req->submit.sqe) { - switch (req->submit.sqe->opcode) { + if (req->sqe) { + switch (req->sqe->opcode) { case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: do_hashed = true; @@ -537,6 +588,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req) case IORING_OP_RECVMSG: case IORING_OP_ACCEPT: case IORING_OP_POLL_ADD: + case IORING_OP_CONNECT: /* * We know REQ_F_ISREG is not set on some of these * opcodes, but this enables us to keep the check in @@ -546,17 +598,21 @@ static inline bool io_prep_async_work(struct io_kiocb *req) req->work.flags |= IO_WQ_WORK_UNBOUND; break; } - if (io_sqe_needs_user(req->submit.sqe)) + if (io_sqe_needs_user(req->sqe)) req->work.flags |= IO_WQ_WORK_NEEDS_USER; } + *link = io_prep_linked_timeout(req); return do_hashed; } static inline void io_queue_async_work(struct io_kiocb *req) { - bool do_hashed = io_prep_async_work(req); struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *link; + bool do_hashed; + + do_hashed = io_prep_async_work(req, &link); trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, req->flags); @@ -566,13 +622,16 @@ static inline void io_queue_async_work(struct io_kiocb *req) io_wq_enqueue_hashed(ctx->io_wq, &req->work, file_inode(req->file)); } + + if (link) + io_queue_linked_timeout(link); } static void io_kill_timeout(struct io_kiocb *req) { int ret; - ret = hrtimer_try_to_cancel(&req->timeout.timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); @@ -601,11 +660,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { - if (req->flags & REQ_F_SHADOW_DRAIN) { - /* Just for drain, free it. */ - __io_free_req(req); - continue; - } req->flags |= REQ_F_IO_DRAINED; io_queue_async_work(req); } @@ -639,7 +693,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +/* Returns true if there are no backlogged entries after the flush */ +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { struct io_rings *rings = ctx->rings; struct io_uring_cqe *cqe; @@ -649,10 +704,10 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (!force) { if (list_empty_careful(&ctx->cq_overflow_list)) - return; + return true; if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)) - return; + return false; } spin_lock_irqsave(&ctx->completion_lock, flags); @@ -661,6 +716,7 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (force) ctx->cq_overflow_flushed = true; + cqe = NULL; while (!list_empty(&ctx->cq_overflow_list)) { cqe = io_get_cqring(ctx); if (!cqe && !force) @@ -688,6 +744,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) list_del(&req->list); io_put_req(req); } + + return cqe != NULL; } static void io_cqring_fill_event(struct io_kiocb *req, long res) @@ -787,6 +845,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, } got_it: + req->io = NULL; + req->ring_file = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -816,6 +876,8 @@ static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + if (req->io) + kfree(req->io); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); if (req->flags & REQ_F_INFLIGHT) { @@ -839,7 +901,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->timeout.timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); @@ -854,42 +916,37 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt; bool wake_ev = false; + /* Already got next link */ + if (req->flags & REQ_F_LINK_NEXT) + return; + /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the * safe side. */ - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - while (nxt) { - list_del_init(&nxt->list); - if (!list_empty(&req->link_list)) { - INIT_LIST_HEAD(&nxt->link_list); - list_splice(&req->link_list, &nxt->link_list); - nxt->flags |= REQ_F_LINK; + while (!list_empty(&req->link_list)) { + struct io_kiocb *nxt = list_first_entry(&req->link_list, + struct io_kiocb, link_list); + + if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && + (nxt->flags & REQ_F_TIMEOUT))) { + list_del_init(&nxt->link_list); + wake_ev |= io_link_cancel_timeout(nxt); + req->flags &= ~REQ_F_LINK_TIMEOUT; + continue; } - /* - * If we're in async work, we can continue processing the chain - * in this context instead of having to queue up new async work. - */ - if (req->flags & REQ_F_LINK_TIMEOUT) { - wake_ev = io_link_cancel_timeout(nxt); - - /* we dropped this link, get next */ - nxt = list_first_entry_or_null(&req->link_list, - struct io_kiocb, list); - } else if (nxtptr && io_wq_current_is_worker()) { - *nxtptr = nxt; - break; - } else { - io_queue_async_work(nxt); - break; - } + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK; + *nxtptr = nxt; + break; } + req->flags |= REQ_F_LINK_NEXT; if (wake_ev) io_cqring_ev_posted(ctx); } @@ -900,24 +957,25 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) static void io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { - link = list_first_entry(&req->link_list, struct io_kiocb, list); - list_del_init(&link->list); + struct io_kiocb *link = list_first_entry(&req->link_list, + struct io_kiocb, link_list); + list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) { + link->sqe->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); - io_double_put_req(link); + __io_double_put_req(link); } + req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); @@ -925,12 +983,10 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) +static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) { - if (likely(!(req->flags & REQ_F_LINK))) { - __io_free_req(req); + if (likely(!(req->flags & REQ_F_LINK))) return; - } /* * If LINK is set, we have dependent requests in this chain. If we @@ -956,32 +1012,30 @@ static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) } else { io_req_link_next(req, nxt); } - - __io_free_req(req); } static void io_free_req(struct io_kiocb *req) { - io_free_req_find_next(req, NULL); + struct io_kiocb *nxt = NULL; + + io_req_find_next(req, &nxt); + __io_free_req(req); + + if (nxt) + io_queue_async_work(nxt); } /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ +__attribute__((nonnull)) static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { - struct io_kiocb *nxt = NULL; + io_req_find_next(req, nxtptr); if (refcount_dec_and_test(&req->refs)) - io_free_req_find_next(req, &nxt); - - if (nxt) { - if (nxtptr) - *nxtptr = nxt; - else - io_queue_async_work(nxt); - } + __io_free_req(req); } static void io_put_req(struct io_kiocb *req) @@ -990,13 +1044,24 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_double_put_req(struct io_kiocb *req) +/* + * Must only be used if we don't need to care about links, usually from + * within the completion handling itself. + */ +static void __io_double_put_req(struct io_kiocb *req) { /* drop both submit and complete references */ if (refcount_sub_and_test(2, &req->refs)) __io_free_req(req); } +static void io_double_put_req(struct io_kiocb *req) +{ + /* drop both submit and complete references */ + if (refcount_sub_and_test(2, &req->refs)) + io_free_req(req); +} + static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { struct io_rings *rings = ctx->rings; @@ -1049,7 +1114,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * file and non-linked commands. */ if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { + REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && + !req->io) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -1366,7 +1432,7 @@ static bool io_file_supports_async(struct file *file) static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) { - const struct io_uring_sqe *sqe = req->submit.sqe; + const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; unsigned ioprio; @@ -1378,15 +1444,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) if (S_ISREG(file_inode(req->file)->i_mode)) req->flags |= REQ_F_ISREG; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; - return -EAGAIN; - } - kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1453,15 +1510,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, bool in_async) { - if (in_async && ret >= 0 && nxt && kiocb->ki_complete == io_complete_rw) + if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else io_rw_done(kiocb, ret); } -static int io_import_fixed(struct io_ring_ctx *ctx, int rw, - const struct io_uring_sqe *sqe, - struct iov_iter *iter) +static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, + const struct io_uring_sqe *sqe, + struct iov_iter *iter) { size_t len = READ_ONCE(sqe->len); struct io_mapped_ubuf *imu; @@ -1533,11 +1590,10 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, return len; } -static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, - const struct sqe_submit *s, struct iovec **iovec, - struct iov_iter *iter) +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter) { - const struct io_uring_sqe *sqe = s->sqe; + const struct io_uring_sqe *sqe = req->sqe; void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); size_t sqe_len = READ_ONCE(sqe->len); u8 opcode; @@ -1551,18 +1607,26 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, * flag. */ opcode = READ_ONCE(sqe->opcode); - if (opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED) { - ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; - return ret; + return io_import_fixed(req->ctx, rw, sqe, iter); + } + + if (req->io) { + struct io_async_rw *iorw = &req->io->rw; + + *iovec = iorw->iov; + iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); + if (iorw->iov == iorw->fast_iov) + *iovec = NULL; + return iorw->size; } - if (!s->has_user) + if (!req->has_user) return -EFAULT; #ifdef CONFIG_COMPAT - if (ctx->compat) + if (req->ctx->compat) return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); #endif @@ -1590,9 +1654,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return -EAGAIN; while (iov_iter_count(iter)) { - struct iovec iovec = iov_iter_iovec(iter); + struct iovec iovec; ssize_t nr; + if (!iov_iter_is_bvec(iter)) { + iovec = iov_iter_iovec(iter); + } else { + /* fixed buffers import bvec */ + iovec.iov_base = kmap(iter->bvec->bv_page) + + iter->iov_offset; + iovec.iov_len = min(iter->count, + iter->bvec->bv_len - iter->iov_offset); + } + if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, iovec.iov_len, &kiocb->ki_pos); @@ -1601,6 +1675,9 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, iovec.iov_len, &kiocb->ki_pos); } + if (iov_iter_is_bvec(iter)) + kunmap(iter->bvec->bv_page); + if (nr < 0) { if (!ret) ret = nr; @@ -1615,6 +1692,50 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } +static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + req->io->rw.nr_segs = iter->nr_segs; + req->io->rw.size = io_size; + req->io->rw.iov = iovec; + if (!req->io->rw.iov) { + req->io->rw.iov = req->io->rw.fast_iov; + memcpy(req->io->rw.iov, fast_iov, + sizeof(struct iovec) * iter->nr_segs); + } +} + +static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + if (req->io) { + io_req_map_io(req, io_size, iovec, fast_iov, iter); + memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); + req->sqe = &req->io->sqe; + return 0; + } + + return -ENOMEM; +} + +static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_READ))) + return -EBADF; + + return io_import_iovec(READ, req, iovec, iter); +} + static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -1623,23 +1744,31 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t read_size, ret; - - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; - file = kiocb->ki_filp; - - if (unlikely(!(file->f_mode & FMODE_READ))) - return -EBADF; + ssize_t io_size, ret; - ret = io_import_iovec(req->ctx, READ, &req->submit, &iovec, &iter); - if (ret < 0) - return ret; + if (!req->io) { + ret = io_read_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(READ, req, &iovec, &iter); + if (ret < 0) + return ret; + } - read_size = ret; + file = req->file; + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = read_size; + req->result = io_size; + + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); @@ -1661,18 +1790,40 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, */ if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && (req->flags & REQ_F_ISREG) && - ret2 > 0 && ret2 < read_size) + ret2 > 0 && ret2 < io_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) - kiocb_done(kiocb, ret2, nxt, req->submit.in_async); - else - ret = -EAGAIN; + if (!force_nonblock || ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, nxt, req->in_async); + } else { +copy_iov: + ret = io_setup_async_io(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; + } } +out_free: kfree(iovec); return ret; } +static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) + return -EBADF; + + return io_import_iovec(WRITE, req, iovec, iter); +} + static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -1681,29 +1832,36 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t ret; + ssize_t ret, io_size; - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + if (!req->io) { + ret = io_write_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(WRITE, req, &iovec, &iter); + if (ret < 0) + return ret; + } file = kiocb->ki_filp; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - return -EBADF; - - ret = io_import_iovec(req->ctx, WRITE, &req->submit, &iovec, &iter); - if (ret < 0) - return ret; - + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = ret; + req->result = io_size; - iov_count = iov_iter_count(&iter); + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } - ret = -EAGAIN; if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) - goto out_free; + goto copy_iov; + iov_count = iov_iter_count(&iter); ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; @@ -1727,10 +1885,16 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, ret2 = call_write_iter(file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); - if (!force_nonblock || ret2 != -EAGAIN) - kiocb_done(kiocb, ret2, nxt, req->submit.in_async); - else - ret = -EAGAIN; + if (!force_nonblock || ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, nxt, req->in_async); + } else { +copy_iov: + ret = io_setup_async_io(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; + } } out_free: kfree(iovec); @@ -1846,12 +2010,25 @@ static int io_sync_file_range(struct io_kiocb *req, return 0; } +static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ #if defined(CONFIG_NET) -static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock, - long (*fn)(struct socket *, struct user_msghdr __user *, - unsigned int)) + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); +#else + return 0; +#endif +} + +static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { +#if defined(CONFIG_NET) struct socket *sock; int ret; @@ -1860,7 +2037,9 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { - struct user_msghdr __user *msg; + struct io_async_ctx io, *copy; + struct sockaddr_storage addr; + struct msghdr *kmsg; unsigned flags; flags = READ_ONCE(sqe->msg_flags); @@ -1869,30 +2048,59 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, else if (force_nonblock) flags |= MSG_DONTWAIT; - msg = (struct user_msghdr __user *) (unsigned long) - READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg.msg; + kmsg->msg_name = &addr; + } else { + kmsg = &io.msg.msg; + kmsg->msg_name = &addr; + io.msg.iov = io.msg.fast_iov; + ret = io_sendmsg_prep(req, &io); + if (ret) + goto out; + } - ret = fn(sock, msg, flags); - if (force_nonblock && ret == -EAGAIN) + ret = __sys_sendmsg_sock(sock, kmsg, flags); + if (force_nonblock && ret == -EAGAIN) { + copy = kmalloc(sizeof(*copy), GFP_KERNEL); + if (!copy) { + ret = -ENOMEM; + goto out; + } + memcpy(©->msg, &io.msg, sizeof(copy->msg)); + req->io = copy; + memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &req->io->sqe; return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; } +out: io_cqring_add_event(req, ret); if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_put_req_find_next(req, nxt); return 0; -} +#else + return -EOPNOTSUPP; #endif +} -static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, nxt, force_nonblock, - __sys_sendmsg_sock); + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, + &io->msg.iov); #else - return -EOPNOTSUPP; + return 0; #endif } @@ -1900,8 +2108,63 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, nxt, force_nonblock, - __sys_recvmsg_sock); + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct user_msghdr __user *msg; + struct io_async_ctx io, *copy; + struct sockaddr_storage addr; + struct msghdr *kmsg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + msg = (struct user_msghdr __user *) (unsigned long) + READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg.msg; + kmsg->msg_name = &addr; + } else { + kmsg = &io.msg.msg; + kmsg->msg_name = &addr; + io.msg.iov = io.msg.fast_iov; + ret = io_recvmsg_prep(req, &io); + if (ret) + goto out; + } + + ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags); + if (force_nonblock && ret == -EAGAIN) { + copy = kmalloc(sizeof(*copy), GFP_KERNEL); + if (!copy) { + ret = -ENOMEM; + goto out; + } + memcpy(copy, &io, sizeof(*copy)); + req->io = copy; + memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &req->io->sqe; + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + +out: + io_cqring_add_event(req, ret); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_put_req_find_next(req, nxt); + return 0; #else return -EOPNOTSUPP; #endif @@ -1918,7 +2181,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); @@ -1943,12 +2206,71 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } -static inline void io_poll_remove_req(struct io_kiocb *req) +static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct sockaddr __user *addr; + int addr_len; + + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr_len = READ_ONCE(sqe->addr2); + return move_addr_to_kernel(addr, addr_len, &io->connect.address); +#else + return 0; +#endif +} + +static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { - if (!RB_EMPTY_NODE(&req->rb_node)) { - rb_erase(&req->rb_node, &req->ctx->cancel_tree); - RB_CLEAR_NODE(&req->rb_node); +#if defined(CONFIG_NET) + struct io_async_ctx __io, *io; + unsigned file_flags; + int addr_len, ret; + + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + addr_len = READ_ONCE(sqe->addr2); + file_flags = force_nonblock ? O_NONBLOCK : 0; + + if (req->io) { + io = req->io; + } else { + ret = io_connect_prep(req, &__io); + if (ret) + goto out; + io = &__io; + } + + ret = __sys_connect_file(req->file, &io->connect.address, addr_len, + file_flags); + if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) { + ret = -ENOMEM; + goto out; + } + memcpy(&io->connect, &__io.connect, sizeof(io->connect)); + req->io = io; + memcpy(&io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &io->sqe; + return -EAGAIN; } + if (ret == -ERESTARTSYS) + ret = -EINTR; +out: + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif } static void io_poll_remove_one(struct io_kiocb *req) @@ -1957,41 +2279,39 @@ static void io_poll_remove_one(struct io_kiocb *req) spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait.entry)) { - list_del_init(&poll->wait.entry); + if (!list_empty(&poll->wait->entry)) { + list_del_init(&poll->wait->entry); io_queue_async_work(req); } spin_unlock(&poll->head->lock); - io_poll_remove_req(req); + hash_del(&req->hash_node); } static void io_poll_remove_all(struct io_ring_ctx *ctx) { - struct rb_node *node; + struct hlist_node *tmp; struct io_kiocb *req; + int i; spin_lock_irq(&ctx->completion_lock); - while ((node = rb_first(&ctx->cancel_tree)) != NULL) { - req = rb_entry(node, struct io_kiocb, rb_node); - io_poll_remove_one(req); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(req, tmp, list, hash_node) + io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) { - struct rb_node *p, *parent = NULL; + struct hlist_head *list; struct io_kiocb *req; - p = ctx->cancel_tree.rb_node; - while (p) { - parent = p; - req = rb_entry(parent, struct io_kiocb, rb_node); - if (sqe_addr < req->user_data) { - p = p->rb_left; - } else if (sqe_addr > req->user_data) { - p = p->rb_right; - } else { + list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + hlist_for_each_entry(req, list, hash_node) { + if (sqe_addr == req->user_data) { io_poll_remove_one(req); return 0; } @@ -2026,12 +2346,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void io_poll_complete(struct io_kiocb *req, __poll_t mask) +static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - io_cqring_fill_event(req, mangle_poll(mask)); + kfree(req->poll.wait); + if (error) + io_cqring_fill_event(req, error); + else + io_cqring_fill_event(req, mangle_poll(mask)); io_commit_cqring(ctx); } @@ -2044,11 +2368,16 @@ static void io_poll_complete_work(struct io_wq_work **workptr) struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt = NULL; __poll_t mask = 0; + int ret = 0; - if (work->flags & IO_WQ_WORK_CANCEL) + if (work->flags & IO_WQ_WORK_CANCEL) { WRITE_ONCE(poll->canceled, true); + ret = -ECANCELED; + } else if (READ_ONCE(poll->canceled)) { + ret = -ECANCELED; + } - if (!READ_ONCE(poll->canceled)) + if (ret != -ECANCELED) mask = vfs_poll(poll->file, &pt) & poll->events; /* @@ -2059,17 +2388,19 @@ static void io_poll_complete_work(struct io_wq_work **workptr) * avoid further branches in the fast path. */ spin_lock_irq(&ctx->completion_lock); - if (!mask && !READ_ONCE(poll->canceled)) { - add_wait_queue(poll->head, &poll->wait); + if (!mask && ret != -ECANCELED) { + add_wait_queue(poll->head, poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } - io_poll_remove_req(req); - io_poll_complete(req, mask); + hash_del(&req->hash_node); + io_poll_complete(req, mask, ret); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); + if (ret < 0 && req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_put_req_find_next(req, &nxt); if (nxt) *workptr = &nxt->work; @@ -2078,8 +2409,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, - wait); + struct io_poll_iocb *poll = wait->private; struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); @@ -2089,7 +2419,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & poll->events)) return 0; - list_del_init(&poll->wait.entry); + list_del_init(&poll->wait->entry); /* * Run completion inline if we can. We're using trylock here because @@ -2098,8 +2428,8 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * for finalizing the request, mark us as having grabbed that already. */ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - io_poll_remove_req(req); - io_poll_complete(req, mask); + hash_del(&req->hash_node); + io_poll_complete(req, mask, 0); req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2130,26 +2460,16 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, pt->error = 0; pt->req->poll.head = head; - add_wait_queue(head, &pt->req->poll.wait); + add_wait_queue(head, pt->req->poll.wait); } static void io_poll_req_insert(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct rb_node **p = &ctx->cancel_tree.rb_node; - struct rb_node *parent = NULL; - struct io_kiocb *tmp; - - while (*p) { - parent = *p; - tmp = rb_entry(parent, struct io_kiocb, rb_node); - if (req->user_data < tmp->user_data) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&req->rb_node, parent, p); - rb_insert_color(&req->rb_node, &ctx->cancel_tree); + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); } static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, @@ -2169,11 +2489,15 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->file) return -EBADF; - req->submit.sqe = NULL; + poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL); + if (!poll->wait) + return -ENOMEM; + + req->io = NULL; INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - RB_CLEAR_NODE(&req->rb_node); + INIT_HLIST_NODE(&req->hash_node); poll->head = NULL; poll->done = false; @@ -2185,8 +2509,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, io_poll_wake); + INIT_LIST_HEAD(&poll->wait->entry); + init_waitqueue_func_entry(poll->wait, io_poll_wake); + poll->wait->private = poll; INIT_LIST_HEAD(&req->list); @@ -2195,14 +2520,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, spin_lock_irq(&ctx->completion_lock); if (likely(poll->head)) { spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { + if (unlikely(list_empty(&poll->wait->entry))) { if (ipt.error) cancel = true; ipt.error = 0; mask = 0; } if (mask || ipt.error) - list_del_init(&poll->wait.entry); + list_del_init(&poll->wait->entry); else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ @@ -2211,7 +2536,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, } if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - io_poll_complete(req, mask); + io_poll_complete(req, mask, 0); } spin_unlock_irq(&ctx->completion_lock); @@ -2224,12 +2549,12 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { - struct io_ring_ctx *ctx; - struct io_kiocb *req; + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - req = container_of(timer, struct io_kiocb, timeout.timer); - ctx = req->ctx; atomic_inc(&ctx->cq_timeouts); spin_lock_irqsave(&ctx->completion_lock, flags); @@ -2279,10 +2604,12 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->timeout.timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret == -1) return -EALREADY; + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; @@ -2319,34 +2646,63 @@ static int io_timeout_remove(struct io_kiocb *req, return 0; } -static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, + bool is_timeout_link) { - unsigned count; - struct io_ring_ctx *ctx = req->ctx; - struct list_head *entry; - enum hrtimer_mode mode; - struct timespec64 ts; - unsigned span = 0; + const struct io_uring_sqe *sqe = req->sqe; + struct io_timeout_data *data; unsigned flags; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1) + if (sqe->ioprio || sqe->buf_index || sqe->len != 1) + return -EINVAL; + if (sqe->off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) + data = &io->timeout; + data->req = req; + req->flags |= REQ_F_TIMEOUT; + + if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; if (flags & IORING_TIMEOUT_ABS) - mode = HRTIMER_MODE_ABS; + data->mode = HRTIMER_MODE_ABS; else - mode = HRTIMER_MODE_REL; + data->mode = HRTIMER_MODE_REL; - hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode); - req->flags |= REQ_F_TIMEOUT; + hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + req->io = io; + return 0; +} + +static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned count; + struct io_ring_ctx *ctx = req->ctx; + struct io_timeout_data *data; + struct io_async_ctx *io; + struct list_head *entry; + unsigned span = 0; + + io = req->io; + if (!io) { + int ret; + + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + return -ENOMEM; + ret = io_timeout_prep(req, io, false); + if (ret) { + kfree(io); + return ret; + } + } + data = &req->io->timeout; /* * sqe->off holds how many events that need to occur for this @@ -2362,8 +2718,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->sequence = ctx->cached_sq_head + count - 1; - /* reuse it to store the count */ - req->submit.sequence = count; + data->seq_offset = count; /* * Insertion sort, ensuring the first entry in the list is always @@ -2374,6 +2729,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); unsigned nxt_sq_head; long long tmp, tmp_nxt; + u32 nxt_offset = nxt->io->timeout.seq_offset; if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) continue; @@ -2383,8 +2739,8 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) * long to store it. */ tmp = (long long)ctx->cached_sq_head + count - 1; - nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1; - tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1; + nxt_sq_head = nxt->sequence - nxt_offset + 1; + tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1; /* * cached_sq_head may overflow, and it will never overflow twice @@ -2406,8 +2762,8 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->sequence -= span; add: list_add(&req->list, entry); - req->timeout.timer.function = io_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); return 0; } @@ -2442,7 +2798,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) static void io_async_find_and_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req, __u64 sqe_addr, - struct io_kiocb **nxt) + struct io_kiocb **nxt, int success_ret) { unsigned long flags; int ret; @@ -2459,6 +2815,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, goto done; ret = io_poll_cancel(ctx, sqe_addr); done: + if (!ret) + ret = success_ret; io_cqring_fill_event(req, ret); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2480,59 +2838,105 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, sqe->cancel_flags) return -EINVAL; - io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), NULL); + io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0); + return 0; +} + +static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct iov_iter iter; + ssize_t ret; + + memcpy(&io->sqe, req->sqe, sizeof(io->sqe)); + req->sqe = &io->sqe; + + switch (io->sqe.opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + ret = io_read_prep(req, &iovec, &iter, true); + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + ret = io_write_prep(req, &iovec, &iter, true); + break; + case IORING_OP_SENDMSG: + ret = io_sendmsg_prep(req, io); + break; + case IORING_OP_RECVMSG: + ret = io_recvmsg_prep(req, io); + break; + case IORING_OP_CONNECT: + ret = io_connect_prep(req, io); + break; + case IORING_OP_TIMEOUT: + return io_timeout_prep(req, io, false); + case IORING_OP_LINK_TIMEOUT: + return io_timeout_prep(req, io, true); + default: + req->io = io; + return 0; + } + + if (ret < 0) + return ret; + + req->io = io; + io_req_map_io(req, ret, iovec, inline_vecs, &iter); return 0; } static int io_req_defer(struct io_kiocb *req) { - const struct io_uring_sqe *sqe = req->submit.sqe; - struct io_uring_sqe *sqe_copy; struct io_ring_ctx *ctx = req->ctx; + struct io_async_ctx *io; + int ret; /* Still need defer if there is pending req in defer list. */ if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) return -EAGAIN; + ret = io_req_defer_prep(req, io); + if (ret < 0) { + kfree(io); + return ret; + } + spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); - kfree(sqe_copy); return 0; } - memcpy(sqe_copy, sqe, sizeof(*sqe_copy)); - req->submit.sqe = sqe_copy; - - trace_io_uring_defer(ctx, req, false); + trace_io_uring_defer(ctx, req, req->user_data); list_add_tail(&req->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; } -static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +__attribute__((nonnull)) +static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { int ret, opcode; - struct sqe_submit *s = &req->submit; struct io_ring_ctx *ctx = req->ctx; - opcode = READ_ONCE(s->sqe->opcode); + opcode = READ_ONCE(req->sqe->opcode); switch (opcode) { case IORING_OP_NOP: ret = io_nop(req); break; case IORING_OP_READV: - if (unlikely(s->sqe->buf_index)) + if (unlikely(req->sqe->buf_index)) return -EINVAL; ret = io_read(req, nxt, force_nonblock); break; case IORING_OP_WRITEV: - if (unlikely(s->sqe->buf_index)) + if (unlikely(req->sqe->buf_index)) return -EINVAL; ret = io_write(req, nxt, force_nonblock); break; @@ -2543,34 +2947,37 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt, ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: - ret = io_fsync(req, s->sqe, nxt, force_nonblock); + ret = io_fsync(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req, s->sqe, nxt); + ret = io_poll_add(req, req->sqe, nxt); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, s->sqe); + ret = io_poll_remove(req, req->sqe); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, s->sqe, nxt, force_nonblock); + ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, s->sqe, nxt, force_nonblock); + ret = io_sendmsg(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, s->sqe, nxt, force_nonblock); + ret = io_recvmsg(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: - ret = io_timeout(req, s->sqe); + ret = io_timeout(req, req->sqe); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, s->sqe); + ret = io_timeout_remove(req, req->sqe); break; case IORING_OP_ACCEPT: - ret = io_accept(req, s->sqe, nxt, force_nonblock); + ret = io_accept(req, req->sqe, nxt, force_nonblock); + break; + case IORING_OP_CONNECT: + ret = io_connect(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, s->sqe, nxt); + ret = io_async_cancel(req, req->sqe, nxt); break; default: ret = -EINVAL; @@ -2585,22 +2992,29 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt, return -EAGAIN; /* workqueue context doesn't hold uring_lock, grab it now */ - if (s->in_async) + if (req->in_async) mutex_lock(&ctx->uring_lock); io_iopoll_req_issued(req); - if (s->in_async) + if (req->in_async) mutex_unlock(&ctx->uring_lock); } return 0; } +static void io_link_work_cb(struct io_wq_work **workptr) +{ + struct io_wq_work *work = *workptr; + struct io_kiocb *link = work->data; + + io_queue_linked_timeout(link); + work->func = io_wq_submit_work; +} + static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct sqe_submit *s = &req->submit; - const struct io_uring_sqe *sqe = s->sqe; struct io_kiocb *nxt = NULL; int ret = 0; @@ -2611,10 +3025,10 @@ static void io_wq_submit_work(struct io_wq_work **workptr) ret = -ECANCELED; if (!ret) { - s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; - s->in_async = true; + req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; + req->in_async = true; do { - ret = __io_submit_sqe(req, &nxt, false); + ret = io_issue_sqe(req, &nxt, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -2636,13 +3050,17 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_put_req(req); } - /* async context always use a copy of the sqe */ - kfree(sqe); - /* if a dependent link is ready, pass it back */ if (!ret && nxt) { - io_prep_async_work(nxt); + struct io_kiocb *link; + + io_prep_async_work(nxt, &link); *workptr = &nxt->work; + if (link) { + nxt->work.flags |= IO_WQ_WORK_CB; + nxt->work.func = io_link_work_cb; + nxt->work.data = link; + } } } @@ -2674,24 +3092,17 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) { - struct sqe_submit *s = &req->submit; struct io_ring_ctx *ctx = req->ctx; unsigned flags; int fd; - flags = READ_ONCE(s->sqe->flags); - fd = READ_ONCE(s->sqe->fd); + flags = READ_ONCE(req->sqe->flags); + fd = READ_ONCE(req->sqe->fd); if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. - */ - req->sequence = s->sequence; - if (!io_op_needs_file(s->sqe)) + if (!io_op_needs_file(req->sqe)) return 0; if (flags & IOSQE_FIXED_FILE) { @@ -2704,7 +3115,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) return -EBADF; req->flags |= REQ_F_FIXED_FILE; } else { - if (s->needs_fixed_file) + if (req->needs_fixed_file) return -EBADF; trace_io_uring_file_get(ctx, fd); req->file = io_file_get(state, fd); @@ -2728,7 +3139,7 @@ static int io_grab_files(struct io_kiocb *req) * the fd has changed since we started down this path, and disallow * this operation if it has. */ - if (fcheck(req->submit.ring_fd) == req->submit.ring_file) { + if (fcheck(req->ring_fd) == req->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; req->work.files = current->files; @@ -2742,8 +3153,9 @@ static int io_grab_files(struct io_kiocb *req) static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { - struct io_kiocb *req = container_of(timer, struct io_kiocb, - timeout.timer); + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *prev = NULL; unsigned long flags; @@ -2754,18 +3166,23 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (!list_empty(&req->list)) { - prev = list_entry(req->list.prev, struct io_kiocb, link_list); - if (refcount_inc_not_zero(&prev->refs)) - list_del_init(&req->list); - else + if (!list_empty(&req->link_list)) { + prev = list_entry(req->link_list.prev, struct io_kiocb, + link_list); + if (refcount_inc_not_zero(&prev->refs)) { + list_del_init(&req->link_list); + prev->flags &= ~REQ_F_LINK_TIMEOUT; + } else prev = NULL; } spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - io_async_find_and_cancel(ctx, req, prev->user_data, NULL); + if (prev->flags & REQ_F_LINK) + prev->flags |= REQ_F_FAIL_LINK; + io_async_find_and_cancel(ctx, req, prev->user_data, NULL, + -ETIME); io_put_req(prev); } else { io_cqring_add_event(req, -ETIME); @@ -2774,8 +3191,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts, - enum hrtimer_mode *mode) +static void io_queue_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -2784,10 +3200,12 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts, * we got a chance to setup the timer */ spin_lock_irq(&ctx->completion_lock); - if (!list_empty(&req->list)) { - req->timeout.timer.function = io_link_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec64_to_ktime(*ts), - *mode); + if (!list_empty(&req->link_list)) { + struct io_timeout_data *data = &req->io->timeout; + + data->timer.function = io_link_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), + data->mode); } spin_unlock_irq(&ctx->completion_lock); @@ -2795,66 +3213,31 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts, io_put_req(req); } -static int io_validate_link_timeout(const struct io_uring_sqe *sqe, - struct timespec64 *ts) -{ - if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off) - return -EINVAL; - if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS) - return -EINVAL; - if (get_timespec64(ts, u64_to_user_ptr(sqe->addr))) - return -EFAULT; - - return 0; -} - -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req, - struct timespec64 *ts, - enum hrtimer_mode *mode) +static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { struct io_kiocb *nxt; - int ret; if (!(req->flags & REQ_F_LINK)) return NULL; - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT) + nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, + link_list); + if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) return NULL; - ret = io_validate_link_timeout(nxt->submit.sqe, ts); - if (ret) { - list_del_init(&nxt->list); - io_cqring_add_event(nxt, ret); - io_double_put_req(nxt); - return ERR_PTR(-ECANCELED); - } - - if (nxt->submit.sqe->timeout_flags & IORING_TIMEOUT_ABS) - *mode = HRTIMER_MODE_ABS; - else - *mode = HRTIMER_MODE_REL; - req->flags |= REQ_F_LINK_TIMEOUT; - hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, *mode); return nxt; } -static int __io_queue_sqe(struct io_kiocb *req) +static void __io_queue_sqe(struct io_kiocb *req) { - enum hrtimer_mode mode; - struct io_kiocb *nxt; - struct timespec64 ts; + struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *nxt = NULL; int ret; - nxt = io_prep_linked_timeout(req, &ts, &mode); - if (IS_ERR(nxt)) { - ret = PTR_ERR(nxt); - nxt = NULL; - goto err; - } - - ret = __io_submit_sqe(req, NULL, true); + ret = io_issue_sqe(req, &nxt, true); + if (nxt) + io_queue_async_work(nxt); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -2862,42 +3245,29 @@ static int __io_queue_sqe(struct io_kiocb *req) */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { - struct sqe_submit *s = &req->submit; - struct io_uring_sqe *sqe_copy; - - sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (sqe_copy) { - s->sqe = sqe_copy; - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { - ret = io_grab_files(req); - if (ret) { - kfree(sqe_copy); - goto err; - } - } - - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req); - - if (nxt) - io_queue_linked_timeout(nxt, &ts, &mode); - - return 0; + if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { + ret = io_grab_files(req); + if (ret) + goto err; } + + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); + return; } err: /* drop submission reference */ io_put_req(req); - if (nxt) { + if (linked_timeout) { if (!ret) - io_queue_linked_timeout(nxt, &ts, &mode); + io_queue_linked_timeout(linked_timeout); else - io_put_req(nxt); + io_put_req(linked_timeout); } /* and drop final reference, if we failed */ @@ -2907,83 +3277,52 @@ err: req->flags |= REQ_F_FAIL_LINK; io_put_req(req); } - - return ret; } -static int io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe(struct io_kiocb *req) { int ret; - ret = io_req_defer(req); - if (ret) { - if (ret != -EIOCBQUEUED) { - io_cqring_add_event(req, ret); - io_double_put_req(req); - } - return 0; + if (unlikely(req->ctx->drain_next)) { + req->flags |= REQ_F_IO_DRAIN; + req->ctx->drain_next = false; } + req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); - return __io_queue_sqe(req); -} - -static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) -{ - int ret; - int need_submit = false; - struct io_ring_ctx *ctx = req->ctx; - - if (!shadow) - return io_queue_sqe(req); - - /* - * Mark the first IO in link list as DRAIN, let all the following - * IOs enter the defer list. all IO needs to be completed before link - * list. - */ - req->flags |= REQ_F_IO_DRAIN; ret = io_req_defer(req); if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_double_put_req(req); - __io_free_req(shadow); - return 0; } - } else { - /* - * If ret == 0 means that all IOs in front of link io are - * running done. let's queue link head. - */ - need_submit = true; - } - - /* Insert shadow req to defer_list, blocking next IOs */ - spin_lock_irq(&ctx->completion_lock); - trace_io_uring_defer(ctx, shadow, true); - list_add_tail(&shadow->list, &ctx->defer_list); - spin_unlock_irq(&ctx->completion_lock); - - if (need_submit) - return __io_queue_sqe(req); + } else + __io_queue_sqe(req); +} - return 0; +static inline void io_queue_link_head(struct io_kiocb *req) +{ + if (unlikely(req->flags & REQ_F_FAIL_LINK)) { + io_cqring_add_event(req, -ECANCELED); + io_double_put_req(req); + } else + io_queue_sqe(req); } + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) -static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, +static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, struct io_kiocb **link) { - struct io_uring_sqe *sqe_copy; - struct sqe_submit *s = &req->submit; struct io_ring_ctx *ctx = req->ctx; int ret; - req->user_data = s->sqe->user_data; + req->user_data = req->sqe->user_data; /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } @@ -2993,7 +3332,7 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, err_req: io_cqring_add_event(req, ret); io_double_put_req(req); - return; + return false; } /* @@ -3005,28 +3344,35 @@ err_req: */ if (*link) { struct io_kiocb *prev = *link; + struct io_async_ctx *io; - sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) { + if (req->sqe->flags & IOSQE_IO_DRAIN) + (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; + + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) { ret = -EAGAIN; goto err_req; } - s->sqe = sqe_copy; + ret = io_req_defer_prep(req, io); + if (ret) { + kfree(io); + prev->flags |= REQ_F_FAIL_LINK; + goto err_req; + } trace_io_uring_link(ctx, req, prev); - list_add_tail(&req->list, &prev->link_list); - } else if (s->sqe->flags & IOSQE_IO_LINK) { + list_add_tail(&req->link_list, &prev->link_list); + } else if (req->sqe->flags & IOSQE_IO_LINK) { req->flags |= REQ_F_LINK; INIT_LIST_HEAD(&req->link_list); *link = req; - } else if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { - /* Only valid as a linked SQE */ - ret = -EINVAL; - goto err_req; } else { io_queue_sqe(req); } + + return true; } /* @@ -3045,7 +3391,7 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, - struct io_ring_ctx *ctx, unsigned max_ios) + unsigned int max_ios) { blk_start_plug(&state->plug); state->free_reqs = 0; @@ -3075,7 +3421,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) +static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; @@ -3091,14 +3437,18 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) */ head = ctx->cached_sq_head; /* make sure SQ entry isn't read before tail */ - if (head == smp_load_acquire(&rings->sq.tail)) + if (unlikely(head == smp_load_acquire(&rings->sq.tail))) return false; head = READ_ONCE(sq_array[head & ctx->sq_mask]); - if (head < ctx->sq_entries) { - s->ring_file = NULL; - s->sqe = &ctx->sq_sqes[head]; - s->sequence = ctx->cached_sq_head; + if (likely(head < ctx->sq_entries)) { + /* + * All io need record the previous position, if LINK vs DARIN, + * it can be used to mark the position of the first IO in the + * link list. + */ + req->sequence = ctx->cached_sq_head; + req->sqe = &ctx->sq_sqes[head]; ctx->cached_sq_head++; return true; } @@ -3116,17 +3466,16 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; - struct io_kiocb *shadow_req = NULL; int i, submitted = 0; bool mm_fault = false; - if (!list_empty(&ctx->cq_overflow_list)) { - io_cqring_overflow_flush(ctx, false); + /* if we have a backlog and couldn't flush it all, return BUSY */ + if (!list_empty(&ctx->cq_overflow_list) && + !io_cqring_overflow_flush(ctx, false)) return -EBUSY; - } if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, ctx, nr); + io_submit_state_start(&state, nr); statep = &state; } @@ -3140,12 +3489,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, &req->submit)) { + if (!io_get_sqring(ctx, req)) { __io_free_req(req); break; } - if (io_sqe_needs_user(req->submit.sqe) && !*mm) { + if (io_sqe_needs_user(req->sqe) && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -3153,43 +3502,30 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - sqe_flags = req->submit.sqe->flags; - - if (link && (sqe_flags & IOSQE_IO_DRAIN)) { - if (!shadow_req) { - shadow_req = io_get_req(ctx, NULL); - if (unlikely(!shadow_req)) - goto out; - shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); - refcount_dec(&shadow_req->refs); - } - shadow_req->sequence = req->submit.sequence; - } - -out: - req->submit.ring_file = ring_file; - req->submit.ring_fd = ring_fd; - req->submit.has_user = *mm != NULL; - req->submit.in_async = async; - req->submit.needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data, - true, async); - io_submit_sqe(req, statep, &link); submitted++; - + sqe_flags = req->sqe->flags; + + req->ring_file = ring_file; + req->ring_fd = ring_fd; + req->has_user = *mm != NULL; + req->in_async = async; + req->needs_fixed_file = async; + trace_io_uring_submit_sqe(ctx, req->sqe->user_data, + true, async); + if (!io_submit_sqe(req, statep, &link)) + break; /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. */ if (!(sqe_flags & IOSQE_IO_LINK) && link) { - io_queue_link_head(link, shadow_req); + io_queue_link_head(link); link = NULL; - shadow_req = NULL; } } if (link) - io_queue_link_head(link, shadow_req); + io_queue_link_head(link); if (statep) io_submit_state_end(&state); @@ -3203,6 +3539,7 @@ static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; struct mm_struct *cur_mm = NULL; + const struct cred *old_cred; mm_segment_t old_fs; DEFINE_WAIT(wait); unsigned inflight; @@ -3213,6 +3550,7 @@ static int io_sq_thread(void *data) old_fs = get_fs(); set_fs(USER_DS); + old_cred = override_creds(ctx->creds); ret = timeout = inflight = 0; while (!kthread_should_park()) { @@ -3319,6 +3657,7 @@ static int io_sq_thread(void *data) unuse_mm(cur_mm); mmput(cur_mm); } + revert_creds(old_cred); kthread_parkme(); @@ -3898,6 +4237,7 @@ static void io_get_work(struct io_wq_work *work) static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct io_wq_data data; unsigned concurrency; int ret; @@ -3942,10 +4282,15 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } + data.mm = ctx->sqo_mm; + data.user = ctx->user; + data.creds = ctx->creds; + data.get_work = io_get_work; + data.put_work = io_put_work; + /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user, - io_get_work, io_put_work); + ctx->io_wq = io_wq_create(concurrency, &data); if (IS_ERR(ctx->io_wq)) { ret = PTR_ERR(ctx->io_wq); ctx->io_wq = NULL; @@ -4294,7 +4639,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_unaccount_mem(ctx->user, ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); + put_cred(ctx->creds); kfree(ctx->completions); + kfree(ctx->cancel_hash); kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); } @@ -4402,12 +4749,11 @@ static int io_uring_flush(struct file *file, void *data) return 0; } -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +static void *io_uring_validate_mmap_request(struct file *file, + loff_t pgoff, size_t sz) { - loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT; - unsigned long sz = vma->vm_end - vma->vm_start; struct io_ring_ctx *ctx = file->private_data; - unsigned long pfn; + loff_t offset = pgoff << PAGE_SHIFT; struct page *page; void *ptr; @@ -4420,17 +4766,59 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) ptr = ctx->sq_sqes; break; default: - return -EINVAL; + return ERR_PTR(-EINVAL); } page = virt_to_head_page(ptr); if (sz > page_size(page)) - return -EINVAL; + return ERR_PTR(-EINVAL); + + return ptr; +} + +#ifdef CONFIG_MMU + +static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t sz = vma->vm_end - vma->vm_start; + unsigned long pfn; + void *ptr; + + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } +#else /* !CONFIG_MMU */ + +static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; +} + +static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; +} + +static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + void *ptr; + + ptr = io_uring_validate_mmap_request(file, pgoff, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + return (unsigned long) ptr; +} + +#endif /* !CONFIG_MMU */ + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const sigset_t __user *, sig, size_t, sigsz) @@ -4501,6 +4889,10 @@ static const struct file_operations io_uring_fops = { .release = io_uring_release, .flush = io_uring_flush, .mmap = io_uring_mmap, +#ifndef CONFIG_MMU + .get_unmapped_area = io_uring_nommu_get_unmapped_area, + .mmap_capabilities = io_uring_nommu_mmap_capabilities, +#endif .poll = io_uring_poll, .fasync = io_uring_fasync, }; @@ -4531,12 +4923,18 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); - if (size == SIZE_MAX) + if (size == SIZE_MAX) { + io_mem_free(ctx->rings); + ctx->rings = NULL; return -EOVERFLOW; + } ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) + if (!ctx->sq_sqes) { + io_mem_free(ctx->rings); + ctx->rings = NULL; return -ENOMEM; + } return 0; } @@ -4640,6 +5038,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ctx->compat = in_compat_syscall(); ctx->account_mem = account_mem; ctx->user = user; + ctx->creds = get_current_cred(); ret = io_allocate_scq_urings(ctx, p); if (ret) @@ -4674,7 +5073,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret < 0) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP; + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | + IORING_FEAT_SUBMIT_STABLE; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: |