| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 18:58:57 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 18:58:57 -0800 |
| commit | 0abcfd8983e3d3d27b8f5f7d01fed4354eb422c4 | |
| tree | 6a8dfbb303a1273da5fc65d09600467196107a5e /io_uring | |
| parent | 8f7aa3d3c7323f4ca2768a9e74ebbe359c4f8f88 | |
| parent | 5d24321e4c159088604512d7a5c5cf634d23e01a | |
Merge tag 'for-6.19/io_uring-20251201' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:
- Unify how task_work cancelations are detected, placing it in the
  task_work running state rather than needing to check the task state
  (a kernel-side sketch of the pattern follows this list)
- Series cleaning up and moving the cancelation code to where it
belongs, in cancel.c
- Cleanup of waitid and futex argument handling
- Add support for mixed-size SQEs. 6.18 added support for mixed-size
  CQEs, improving flexibility and efficiency for workloads that need big
  CQEs. This adds similar support for SQEs, where the occasional need
  for a 128B SQE no longer requires making every SQE 128B in size (see
  the setup sketch after this list)
- Introduce zcrx and SQ/CQ layout queries. The former returns which
  zcrx features are available, and both return ring size information to
  help with allocation size calculations for user-provided rings, as
  used with IORING_SETUP_NO_MMAP and IORING_MEM_REGION_TYPE_USER
- Zcrx updates for 6.19, including a batch of small patches,
  IORING_REGISTER_ZCRX_CTRL and RQ flushing, and David's work on sharing
  zcrx between multiple io_uring instances
- Series cleaning up ring initializations, notably deduplicating ring
  size and offset calculations. It also moves most of the checking
  before doing any allocations, making the code simpler
- Add support for getsockname and getpeername, which is mostly a
  trivial hookup after a bit of refactoring on the networking side (see
  the uring_cmd sketch after this list)
- Various fixes and cleanups
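
Below is a fragment-level sketch (kernel context, not standalone code) of the task_work cancelation change above, modelled on the io_uring.c hunks further down in this diff: the task_work runner now computes the terminate condition once, via io_should_terminate_tw(), and hands it to handlers through the tw token instead of every handler poking at the task state itself. io_example_tw_handler() is a made-up handler name for illustration; the condition and the helper calls are taken from the diff.

```c
/* One place decides whether pending task_work should cancel (from the diff) */
static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx)
{
	/* exiting task, fallback kthread (PF_KTHREAD), or a dying ring */
	return (current->flags & (PF_EXITING | PF_KTHREAD)) ||
	       percpu_ref_is_dying(&ctx->refs);
}

/* Hypothetical handler consuming the cancel state carried by the tw token */
static void io_example_tw_handler(struct io_tw_req tw_req, io_tw_token_t tw)
{
	struct io_kiocb *req = tw_req.req;

	if (tw.cancel) {			/* the runner set this via ts.cancel */
		io_req_set_res(req, -ECANCELED, 0);
		io_req_task_complete(tw_req, tw);
		return;
	}

	/* normal completion path */
	io_req_task_complete(tw_req, tw);
}
```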
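A minimal user-space sketch for the mixed-size SQE item, assuming a 6.19-level uapi header that defines IORING_SETUP_SQE_MIXED (the flag name appears in the fdinfo.c hunk below). It uses the raw io_uring_setup(2) syscall rather than assuming any particular liburing helper.

```c
#include <linux/io_uring.h>	/* struct io_uring_params, IORING_SETUP_* */
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	/*
	 * Assumption based on this series: SQEs stay 64B by default, and an
	 * opcode that needs a 128B entry consumes the following slot as well,
	 * instead of forcing IORING_SETUP_SQE128 on the whole ring.
	 */
	p.flags = IORING_SETUP_SQE_MIXED;

	int ring_fd = syscall(__NR_io_uring_setup, 64, &p);
	if (ring_fd < 0) {
		perror("io_uring_setup");	/* older kernels reject unknown flags */
		return 1;
	}
	close(ring_fd);
	return 0;
}
```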
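And a hedged sketch for the getsockname/getpeername item, filling an IORING_OP_URING_CMD SQE by hand. The field usage (addr = sockaddr buffer, addr3 = pointer to the int length, optlen = 0 for the local name or 1 for the peer, everything else zero) is read off the io_uring_cmd_getsockname() hunk in cmd_net.c below; SOCKET_URING_OP_GETSOCKNAME likewise needs a 6.19 uapi header, and prep_getsockname() is an illustrative helper, not a liburing API.

```c
#include <liburing.h>
#include <sys/socket.h>
#include <string.h>

/* Illustrative helper: prepare a getsockname/getpeername socket command */
static void prep_getsockname(struct io_uring_sqe *sqe, int sockfd,
			     struct sockaddr *addr, int *addrlen, int peer)
{
	memset(sqe, 0, sizeof(*sqe));		/* ioprio/len/rw_flags must be 0 */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = sockfd;
	sqe->cmd_op = SOCKET_URING_OP_GETSOCKNAME;
	sqe->addr = (unsigned long)addr;	/* sockaddr buffer to fill */
	sqe->addr3 = (unsigned long)addrlen;	/* in/out length, as in getsockname(2) */
	sqe->optlen = peer;			/* 0: local name, 1: peer name */
}
```

As with the other SOCKET_URING_OP_* commands, the result is reported in the CQE res field.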
* tag 'for-6.19/io_uring-20251201' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (68 commits)
io_uring: Introduce getsockname io_uring cmd
socket: Split out a getsockname helper for io_uring
socket: Unify getsockname and getpeername implementation
io_uring/query: drop unused io_handle_query_entry() ctx arg
io_uring/kbuf: remove obsolete buf_nr_pages and update comments
io_uring/register: use correct location for io_rings_layout
io_uring/zcrx: share an ifq between rings
io_uring/zcrx: add io_fill_zcrx_offsets()
io_uring/zcrx: export zcrx via a file
io_uring/zcrx: move io_zcrx_scrub() and dependencies up
io_uring/zcrx: count zcrx users
io_uring/zcrx: add sync refill queue flushing
io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL
io_uring/zcrx: elide passing msg flags
io_uring/zcrx: use folio_nr_pages() instead of shift operation
io_uring/zcrx: convert to use netmem_desc
io_uring/query: introduce rings info query
io_uring/query: introduce zcrx query
io_uring: move cq/sq user offset init around
io_uring: pre-calculate scq layout
...
Diffstat (limited to 'io_uring')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | io_uring/cancel.c | 270 |
| -rw-r--r-- | io_uring/cancel.h | 8 |
| -rw-r--r-- | io_uring/cmd_net.c | 22 |
| -rw-r--r-- | io_uring/fdinfo.c | 37 |
| -rw-r--r-- | io_uring/futex.c | 57 |
| -rw-r--r-- | io_uring/io_uring.c | 547 |
| -rw-r--r-- | io_uring/io_uring.h | 63 |
| -rw-r--r-- | io_uring/kbuf.c | 6 |
| -rw-r--r-- | io_uring/kbuf.h | 5 |
| -rw-r--r-- | io_uring/memmap.c | 59 |
| -rw-r--r-- | io_uring/memmap.h | 24 |
| -rw-r--r-- | io_uring/msg_ring.c | 3 |
| -rw-r--r-- | io_uring/net.c | 7 |
| -rw-r--r-- | io_uring/notif.c | 7 |
| -rw-r--r-- | io_uring/opdef.c | 26 |
| -rw-r--r-- | io_uring/opdef.h | 2 |
| -rw-r--r-- | io_uring/poll.c | 13 |
| -rw-r--r-- | io_uring/poll.h | 2 |
| -rw-r--r-- | io_uring/query.c | 55 |
| -rw-r--r-- | io_uring/query.h | 2 |
| -rw-r--r-- | io_uring/register.c | 105 |
| -rw-r--r-- | io_uring/rsrc.c | 30 |
| -rw-r--r-- | io_uring/rsrc.h | 6 |
| -rw-r--r-- | io_uring/rw.c | 12 |
| -rw-r--r-- | io_uring/rw.h | 2 |
| -rw-r--r-- | io_uring/slist.h | 18 |
| -rw-r--r-- | io_uring/sqpoll.c | 1 |
| -rw-r--r-- | io_uring/timeout.c | 20 |
| -rw-r--r-- | io_uring/uring_cmd.c | 34 |
| -rw-r--r-- | io_uring/waitid.c | 48 |
| -rw-r--r-- | io_uring/zcrx.c | 421 |
| -rw-r--r-- | io_uring/zcrx.h | 16 |
32 files changed, 1158 insertions, 770 deletions
diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 64b51e82baa2..ca12ac10c0ae 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -14,6 +14,8 @@ #include "filetable.h" #include "io_uring.h" #include "tctx.h" +#include "sqpoll.h" +#include "uring_cmd.h" #include "poll.h" #include "timeout.h" #include "waitid.h" @@ -384,3 +386,271 @@ int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, io_ring_submit_unlock(ctx, issue_flags); return nr ?: -ENOENT; } + +static bool io_match_linked(struct io_kiocb *head) +{ + struct io_kiocb *req; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +/* + * As io_match_task() but protected against racing with linked timeouts. + * User must not hold timeout_lock. + */ +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, + bool cancel_all) +{ + bool matched; + + if (tctx && head->tctx != tctx) + return false; + if (cancel_all) + return true; + + if (head->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = head->ctx; + + /* protect against races with linked timeouts */ + raw_spin_lock_irq(&ctx->timeout_lock); + matched = io_match_linked(head); + raw_spin_unlock_irq(&ctx->timeout_lock); + } else { + matched = io_match_linked(head); + } + return matched; +} + +void __io_uring_cancel(bool cancel_all) +{ + io_uring_unreg_ringfd(); + io_uring_cancel_generic(cancel_all, NULL); +} + +struct io_task_cancel { + struct io_uring_task *tctx; + bool all; +}; + +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_task_cancel *cancel = data; + + return io_match_task_safe(req, cancel->tctx, cancel->all); +} + +static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, + bool cancel_all) +{ + struct io_defer_entry *de; + LIST_HEAD(list); + + list_for_each_entry_reverse(de, &ctx->defer_list, list) { + if (io_match_task_safe(de->req, tctx, cancel_all)) { + list_cut_position(&list, &ctx->defer_list, &de->list); + break; + } + } + if (list_empty(&list)) + return false; + + while (!list_empty(&list)) { + de = list_first_entry(&list, struct io_defer_entry, list); + list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); + io_req_task_queue_fail(de->req, -ECANCELED); + kfree(de); + } + return true; +} + +__cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + return req->ctx == data; +} + +static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) +{ + struct io_tctx_node *node; + enum io_wq_cancel cret; + bool ret = false; + + mutex_lock(&ctx->uring_lock); + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + /* + * io_wq will stay alive while we hold uring_lock, because it's + * killed after ctx nodes, which requires to take the lock. 
+ */ + if (!tctx || !tctx->io_wq) + continue; + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } + mutex_unlock(&ctx->uring_lock); + + return ret; +} + +__cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, + bool cancel_all, bool is_sqpoll_thread) +{ + struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; + enum io_wq_cancel cret; + bool ret = false; + + /* set it so io_req_local_work_add() would wake us up */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + + /* failed during ring init, it couldn't have issued any requests */ + if (!ctx->rings) + return false; + + if (!tctx) { + ret |= io_uring_try_cancel_iowq(ctx); + } else if (tctx->io_wq) { + /* + * Cancels requests of all rings, not only @ctx, but + * it's fine as the task is in exit/exec. + */ + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, + &cancel, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } + + /* SQPOLL thread does its own polling */ + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || + is_sqpoll_thread) { + while (!wq_list_empty(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; + cond_resched(); + } + } + + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && + io_allowed_defer_tw_run(ctx)) + ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; + mutex_lock(&ctx->uring_lock); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); + ret |= io_poll_remove_all(ctx, tctx, cancel_all); + ret |= io_waitid_remove_all(ctx, tctx, cancel_all); + ret |= io_futex_remove_all(ctx, tctx, cancel_all); + ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); + mutex_unlock(&ctx->uring_lock); + ret |= io_kill_timeouts(ctx, tctx, cancel_all); + if (tctx) + ret |= io_run_task_work() > 0; + else + ret |= flush_delayed_work(&ctx->fallback_work); + return ret; +} + +static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) +{ + if (tracked) + return atomic_read(&tctx->inflight_tracked); + return percpu_counter_sum(&tctx->inflight); +} + +/* + * Find any io_uring ctx that this task has registered or done IO on, and cancel + * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 
+ */ +__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_ring_ctx *ctx; + struct io_tctx_node *node; + unsigned long index; + s64 inflight; + DEFINE_WAIT(wait); + + WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); + + if (!current->io_uring) + return; + if (tctx->io_wq) + io_wq_exit_start(tctx->io_wq); + + atomic_inc(&tctx->in_cancel); + do { + bool loop = false; + + io_uring_drop_tctx_refs(current); + if (!tctx_inflight(tctx, !cancel_all)) + break; + + /* read completions before cancelations */ + inflight = tctx_inflight(tctx, false); + if (!inflight) + break; + + if (!sqd) { + xa_for_each(&tctx->xa, index, node) { + /* sqpoll task will cancel all its requests */ + if (node->ctx->sq_data) + continue; + loop |= io_uring_try_cancel_requests(node->ctx, + current->io_uring, + cancel_all, + false); + } + } else { + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + loop |= io_uring_try_cancel_requests(ctx, + current->io_uring, + cancel_all, + true); + } + + if (loop) { + cond_resched(); + continue; + } + + prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); + io_run_task_work(); + io_uring_drop_tctx_refs(current); + xa_for_each(&tctx->xa, index, node) { + if (io_local_work_pending(node->ctx)) { + WARN_ON_ONCE(node->ctx->submitter_task && + node->ctx->submitter_task != current); + goto end_wait; + } + } + /* + * If we've seen completions, retry without waiting. This + * avoids a race where a completion comes in before we did + * prepare_to_wait(). + */ + if (inflight == tctx_inflight(tctx, !cancel_all)) + schedule(); +end_wait: + finish_wait(&tctx->wait, &wait); + } while (1); + + io_uring_clean_tctx(tctx); + if (cancel_all) { + /* + * We shouldn't run task_works after cancel, so just leave + * ->in_cancel set for normal exit. 
+ */ + atomic_dec(&tctx->in_cancel); + /* for exec all current's requests should be gone, kill tctx */ + __io_uring_free(current); + } +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 43e9bb74e9d1..6783961ede1b 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -23,14 +23,20 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, + bool cancel_all); bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, struct hlist_head *list, bool cancel_all, bool (*cancel)(struct io_kiocb *)); - int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags, struct hlist_head *list, bool (*cancel)(struct io_kiocb *)); +__cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct io_uring_task *tctx, + bool cancel_all, bool is_sqpoll_thread); +__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); +__cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data); static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index 3b75931bd569..19d3ce2bd20a 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -132,6 +132,26 @@ static int io_uring_cmd_timestamp(struct socket *sock, return -EAGAIN; } +static int io_uring_cmd_getsockname(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + struct sockaddr __user *uaddr; + unsigned int peer; + int __user *ulen; + + if (sqe->ioprio || sqe->__pad1 || sqe->len || sqe->rw_flags) + return -EINVAL; + + uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ulen = u64_to_user_ptr(sqe->addr3); + peer = READ_ONCE(sqe->optlen); + if (peer > 1) + return -EINVAL; + return do_getsockname(sock, peer, uaddr, ulen); +} + int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct socket *sock = cmd->file->private_data; @@ -159,6 +179,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) return io_uring_cmd_setsockopt(sock, cmd, issue_flags); case SOCKET_URING_OP_TX_TIMESTAMP: return io_uring_cmd_timestamp(sock, cmd, issue_flags); + case SOCKET_URING_OP_GETSOCKNAME: + return io_uring_cmd_getsockname(sock, cmd, issue_flags); default: return -EOPNOTSUPP; } diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 294c75a8a3bd..a87d4e26eee8 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -5,6 +5,7 @@ #include <linux/file.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/nospec.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> @@ -14,6 +15,7 @@ #include "fdinfo.h" #include "cancel.h" #include "rsrc.h" +#include "opdef.h" #ifdef CONFIG_NET_RX_BUSY_POLL static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, @@ -93,21 +95,46 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) unsigned int entry = i + sq_head; struct io_uring_sqe *sqe; unsigned int sq_idx; + bool sqe128 = false; + u8 opcode; if (ctx->flags & IORING_SETUP_NO_SQARRAY) - break; - sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); + sq_idx = entry & sq_mask; + else + sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); if (sq_idx > sq_mask) continue; + sqe = &ctx->sq_sqes[sq_idx << sq_shift]; 
+ opcode = READ_ONCE(sqe->opcode); + if (opcode >= IORING_OP_LAST) + continue; + opcode = array_index_nospec(opcode, IORING_OP_LAST); + if (sq_shift) { + sqe128 = true; + } else if (io_issue_defs[opcode].is_128) { + if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) { + seq_printf(m, + "%5u: invalid sqe, 128B entry on non-mixed sq\n", + sq_idx); + break; + } + if ((++sq_head & sq_mask) == 0) { + seq_printf(m, + "%5u: corrupted sqe, wrapping 128B entry\n", + sq_idx); + break; + } + sqe128 = true; + } seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " "addr:0x%llx, rw_flags:0x%x, buf_index:%d " "user_data:%llu", - sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd, + sq_idx, io_uring_get_opcode(opcode), sqe->fd, sqe->flags, (unsigned long long) sqe->off, (unsigned long long) sqe->addr, sqe->rw_flags, sqe->buf_index, sqe->user_data); - if (sq_shift) { + if (sqe128) { u64 *sqeb = (void *) (sqe + 1); int size = sizeof(struct io_uring_sqe) / sizeof(u64); int j; @@ -128,7 +155,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) cqe = &r->cqes[(cq_head & cq_mask)]; if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) cqe32 = true; - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", + seq_printf(m, "%5u: user_data:%llu, res:%d, flags:%x", cq_head & cq_mask, cqe->user_data, cqe->res, cqe->flags); if (cqe32) diff --git a/io_uring/futex.c b/io_uring/futex.c index 64f3bd51c84c..11bfff5a80df 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -17,7 +17,6 @@ struct io_futex { void __user *uaddr; unsigned long futex_val; unsigned long futex_mask; - unsigned long futexv_owned; u32 futex_flags; unsigned int futex_nr; bool futexv_unqueued; @@ -28,6 +27,11 @@ struct io_futex_data { struct io_kiocb *req; }; +struct io_futexv_data { + unsigned long owned; + struct futex_vector futexv[]; +}; + #define IO_FUTEX_ALLOC_CACHE_MAX 32 bool io_futex_cache_init(struct io_ring_ctx *ctx) @@ -41,45 +45,46 @@ void io_futex_cache_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->futex_cache, kfree); } -static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) +static void __io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) { - hlist_del_init(&req->hash_node); - io_req_task_complete(req, tw); + hlist_del_init(&tw_req.req->hash_node); + io_req_task_complete(tw_req, tw); } -static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) +static void io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, tw); io_cache_free(&ctx->futex_cache, req->async_data); io_req_async_data_clear(req, 0); - __io_futex_complete(req, tw); + __io_futex_complete(tw_req, tw); } -static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) +static void io_futexv_complete(struct io_tw_req tw_req, io_tw_token_t tw) { + struct io_kiocb *req = tw_req.req; struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); - struct futex_vector *futexv = req->async_data; + struct io_futexv_data *ifd = req->async_data; io_tw_lock(req->ctx, tw); if (!iof->futexv_unqueued) { int res; - res = futex_unqueue_multiple(futexv, iof->futex_nr); + res = futex_unqueue_multiple(ifd->futexv, iof->futex_nr); if (res != -1) io_req_set_res(req, res, 0); } io_req_async_data_free(req); - __io_futex_complete(req, tw); + __io_futex_complete(tw_req, tw); } -static bool io_futexv_claim(struct io_futex *iof) +static bool io_futexv_claim(struct io_futexv_data *ifd) { - if (test_bit(0, 
&iof->futexv_owned) || - test_and_set_bit_lock(0, &iof->futexv_owned)) + if (test_bit(0, &ifd->owned) || test_and_set_bit_lock(0, &ifd->owned)) return false; return true; } @@ -94,9 +99,9 @@ static bool __io_futex_cancel(struct io_kiocb *req) return false; req->io_task_work.func = io_futex_complete; } else { - struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); + struct io_futexv_data *ifd = req->async_data; - if (!io_futexv_claim(iof)) + if (!io_futexv_claim(ifd)) return false; req->io_task_work.func = io_futexv_complete; } @@ -152,9 +157,9 @@ int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) { struct io_kiocb *req = q->wake_data; - struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); + struct io_futexv_data *ifd = req->async_data; - if (!io_futexv_claim(iof)) + if (!io_futexv_claim(ifd)) return; if (unlikely(!__futex_wake_mark(q))) return; @@ -167,7 +172,7 @@ static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); - struct futex_vector *futexv; + struct io_futexv_data *ifd; int ret; /* No flags or mask supported for waitv */ @@ -180,23 +185,23 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!iof->futex_nr || iof->futex_nr > FUTEX_WAITV_MAX) return -EINVAL; - futexv = kcalloc(iof->futex_nr, sizeof(*futexv), GFP_KERNEL); - if (!futexv) + ifd = kzalloc(struct_size_t(struct io_futexv_data, futexv, iof->futex_nr), + GFP_KERNEL); + if (!ifd) return -ENOMEM; - ret = futex_parse_waitv(futexv, iof->uaddr, iof->futex_nr, + ret = futex_parse_waitv(ifd->futexv, iof->uaddr, iof->futex_nr, io_futex_wakev_fn, req); if (ret) { - kfree(futexv); + kfree(ifd); return ret; } /* Mark as inflight, so file exit cancelation will find it */ io_req_track_inflight(req); - iof->futexv_owned = 0; iof->futexv_unqueued = 0; req->flags |= REQ_F_ASYNC_DATA; - req->async_data = futexv; + req->async_data = ifd; return 0; } @@ -216,13 +221,13 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); - struct futex_vector *futexv = req->async_data; + struct io_futexv_data *ifd = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret, woken = -1; io_ring_submit_lock(ctx, issue_flags); - ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken); + ret = futex_wait_multiple_setup(ifd->futexv, iof->futex_nr, &woken); /* * Error case, ret is < 0. Mark the request as failed. 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 02339b74ba8d..5d130c578435 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -124,11 +124,6 @@ #define IO_REQ_ALLOC_BATCH 8 #define IO_LOCAL_TW_DEFAULT_MAX 20 -struct io_defer_entry { - struct list_head list; - struct io_kiocb *req; -}; - /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) @@ -140,11 +135,6 @@ struct io_defer_entry { /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) -static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct io_uring_task *tctx, - bool cancel_all, - bool is_sqpoll_thread); - static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); static void __io_req_caches_free(struct io_ring_ctx *ctx); @@ -207,44 +197,6 @@ static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); } -static bool io_match_linked(struct io_kiocb *head) -{ - struct io_kiocb *req; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - -/* - * As io_match_task() but protected against racing with linked timeouts. - * User must not hold timeout_lock. - */ -bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, - bool cancel_all) -{ - bool matched; - - if (tctx && head->tctx != tctx) - return false; - if (cancel_all) - return true; - - if (head->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = head->ctx; - - /* protect against races with linked timeouts */ - raw_spin_lock_irq(&ctx->timeout_lock); - matched = io_match_linked(head); - raw_spin_unlock_irq(&ctx->timeout_lock); - } else { - matched = io_match_linked(head); - } - return matched; -} - static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); @@ -265,6 +217,20 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } +/* + * Terminate the request if either of these conditions are true: + * + * 1) It's being executed by the original task, but that task is marked + * with PF_EXITING as it's exiting. + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is + * our fallback task_work. + * 3) The ring has been closed and is going away. 
+ */ +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) +{ + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); +} + static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, @@ -275,8 +241,9 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); + ts.cancel = io_should_terminate_tw(ctx); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, ts); + req->io_task_work.func((struct io_tw_req){req}, ts); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -524,9 +491,9 @@ static void io_queue_iowq(struct io_kiocb *req) io_wq_enqueue(tctx->io_wq, &req->work); } -static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) +static void io_req_queue_iowq_tw(struct io_tw_req tw_req, io_tw_token_t tw) { - io_queue_iowq(req); + io_queue_iowq(tw_req.req); } void io_req_queue_iowq(struct io_kiocb *req) @@ -535,7 +502,7 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static unsigned io_linked_nr(struct io_kiocb *req) +unsigned io_linked_nr(struct io_kiocb *req) { struct io_kiocb *tmp; unsigned nr = 0; @@ -706,7 +673,7 @@ void io_task_refs_refill(struct io_uring_task *tctx) tctx->cached_refs += refill; } -static __cold void io_uring_drop_tctx_refs(struct task_struct *task) +__cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; @@ -917,7 +884,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags } /* - * Must be called from inline task_work so we now a flush will happen later, + * Must be called from inline task_work so we know a flush will happen later, * and obviously with ctx->uring_lock held (tw always has that). */ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) @@ -1149,10 +1116,11 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, ctx = req->ctx; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); + ts.cancel = io_should_terminate_tw(ctx); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, ts); + (struct io_tw_req){req}, ts); node = next; (*count)++; if (unlikely(need_resched())) { @@ -1207,11 +1175,6 @@ struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, { struct llist_node *node; - if (unlikely(current->flags & PF_EXITING)) { - io_fallback_tw(tctx, true); - return NULL; - } - node = llist_del_all(&tctx->task_list); if (node) { node = llist_reverse_order(node); @@ -1248,7 +1211,7 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); /* - * We don't know how many reuqests is there in the link and whether + * We don't know how many requests there are in the link and whether * they can even be queued lazily, fall back to non-lazy. */ if (req->flags & IO_REQ_LINK_FLAGS) @@ -1380,7 +1343,7 @@ static int __io_run_local_work_loop(struct llist_node **node, io_task_work.node); INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, tw); + (struct io_tw_req){req}, tw); *node = next; if (++ret >= events) break; @@ -1401,6 +1364,7 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) |
