diff options
Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r-- | io_uring/io_uring.c | 433 |
1 files changed, 236 insertions, 197 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5a1676bab998..73af59863300 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -69,6 +69,7 @@ #include <linux/io_uring/cmd.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/jump_label.h> #include <asm/shmparam.h> #define CREATE_TRACE_POINTS @@ -103,9 +104,6 @@ #include "alloc_cache.h" #include "eventfd.h" -#define IORING_MAX_ENTRIES 32768 -#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) - #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -143,11 +141,13 @@ struct io_defer_entry { #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, + struct io_uring_task *tctx, bool cancel_all); static void io_queue_sqe(struct io_kiocb *req); +static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); + struct kmem_cache *req_cachep; static struct workqueue_struct *iou_wq __ro_after_init; @@ -200,12 +200,12 @@ static bool io_match_linked(struct io_kiocb *head) * As io_match_task() but protected against racing with linked timeouts. * User must not hold timeout_lock. */ -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all) { bool matched; - if (task && head->task != task) + if (tctx && head->tctx != tctx) return false; if (cancel_all) return true; @@ -260,15 +260,23 @@ static __cold void io_fallback_req_func(struct work_struct *work) static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) { - unsigned hash_buckets = 1U << bits; - size_t hash_size = hash_buckets * sizeof(table->hbs[0]); + unsigned int hash_buckets; + int i; - table->hbs = kmalloc(hash_size, GFP_KERNEL); - if (!table->hbs) - return -ENOMEM; + do { + hash_buckets = 1U << bits; + table->hbs = kvmalloc_array(hash_buckets, sizeof(table->hbs[0]), + GFP_KERNEL_ACCOUNT); + if (table->hbs) + break; + if (bits == 1) + return -ENOMEM; + bits--; + } while (1); table->hash_bits = bits; - init_hash_table(table, hash_buckets); + for (i = 0; i < hash_buckets; i++) + INIT_HLIST_HEAD(&table->hbs[i].list); return 0; } @@ -293,21 +301,18 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) hash_bits = clamp(hash_bits, 1, 8); if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; - if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) - goto err; if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) goto err; ctx->flags = p->flags; + ctx->hybrid_poll_time = LLONG_MAX; atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, - sizeof(struct io_rsrc_node)); - ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, + ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, sizeof(struct async_poll)); ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr)); @@ -326,7 +331,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); - init_waitqueue_head(&ctx->rsrc_quiesce_wq); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); @@ -334,7 +338,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); - INIT_LIST_HEAD(&ctx->rsrc_ref_list); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; @@ -346,21 +349,20 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); io_napi_init(ctx); + mutex_init(&ctx->resize_lock); return ctx; free_ref: percpu_ref_exit(&ctx->refs); err: - io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); - kfree(ctx->cancel_table.hbs); - kfree(ctx->cancel_table_locked.hbs); + kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; @@ -404,11 +406,8 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->apoll); req->apoll = NULL; } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } + if (req->flags & REQ_F_INFLIGHT) + atomic_dec(&req->tctx->inflight_tracked); if (req->flags & REQ_F_CREDS) put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { @@ -422,7 +421,7 @@ static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; - atomic_inc(&req->task->io_uring->inflight_tracked); + atomic_inc(&req->tctx->inflight_tracked); } } @@ -511,7 +510,7 @@ static void io_prep_async_link(struct io_kiocb *req) static void io_queue_iowq(struct io_kiocb *req) { struct io_kiocb *link = io_prep_linked_timeout(req); - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -526,7 +525,7 @@ static void io_queue_iowq(struct io_kiocb *req) * procedure rather than attempt to run this request (or create a new * worker for it). */ - if (WARN_ON_ONCE(!same_thread_group(req->task, current))) + if (WARN_ON_ONCE(!same_thread_group(tctx->task, current))) atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); @@ -674,30 +673,19 @@ static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -/* can be called by any task */ -static void io_put_task_remote(struct task_struct *task) -{ - struct io_uring_task *tctx = task->io_uring; - - percpu_counter_sub(&tctx->inflight, 1); - if (unlikely(atomic_read(&tctx->in_cancel))) - wake_up(&tctx->wait); - put_task_struct(task); -} - -/* used by a task to put its own references */ -static void io_put_task_local(struct task_struct *task) -{ - task->io_uring->cached_refs++; -} - /* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task) +static inline void io_put_task(struct io_kiocb *req) { - if (likely(task == current)) - io_put_task_local(task); - else - io_put_task_remote(task); + struct io_uring_task *tctx = req->tctx; + + if (likely(tctx->task == current)) { + tctx->cached_refs++; + } else { + percpu_counter_sub(&tctx->inflight, 1); + if (unlikely(atomic_read(&tctx->in_cancel))) + wake_up(&tctx->wait); + put_task_struct(tctx->task); + } } void io_task_refs_refill(struct io_uring_task *tctx) @@ -819,8 +807,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, * the ring. */ if (likely(io_get_cqe(ctx, &cqe))) { - trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); - WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); @@ -829,6 +815,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } + + trace_io_uring_complete(ctx, NULL, cqe); return true; } return false; @@ -945,6 +933,8 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) { req->ctx = ctx; + req->buf_node = NULL; + req->file_node = NULL; req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ @@ -1075,23 +1065,8 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, return node; } -/** - * io_llist_xchg - swap all entries in a lock-less list - * @head: the head of lock-less list to delete all entries - * @new: new entry as the head of the list - * - * If list is empty, return NULL, otherwise, return the pointer to the first entry. - * The order of entries returned is from the newest to the oldest added one. - */ -static inline struct llist_node *io_llist_xchg(struct llist_head *head, - struct llist_node *new) -{ - return xchg(&head->first, new); -} - -static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) +static __cold void __io_fallback_tw(struct llist_node *node, bool sync) { - struct llist_node *node = llist_del_all(&tctx->task_list); struct io_ring_ctx *last_ctx = NULL; struct io_kiocb *req; @@ -1117,6 +1092,13 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) } } +static void io_fallback_tw(struct io_uring_task *tctx, bool sync) +{ + struct llist_node *node = llist_del_all(&tctx->task_list); + + __io_fallback_tw(node, sync); +} + struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count) @@ -1227,7 +1209,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, static void io_req_normal_work_add(struct io_kiocb *req) { - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; struct io_ring_ctx *ctx = req->ctx; /* task_work already pending, we're done */ @@ -1246,7 +1228,7 @@ static void io_req_normal_work_add(struct io_kiocb *req) return; } - if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) + if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) return; io_fallback_tw(tctx, false); @@ -1270,16 +1252,9 @@ void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { - struct llist_node *node; - - node = llist_del_all(&ctx->work_llist); - while (node) { - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); + struct llist_node *node = llist_del_all(&ctx->work_llist); - node = node->next; - io_req_normal_work_add(req); - } + __io_fallback_tw(node, false); } static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, @@ -1310,7 +1285,7 @@ again: * llists are in reverse order, flip it back the right way before * running the pending items. */ - node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL)); + node = llist_reverse_order(llist_del_all(&ctx->work_llist)); while (node) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, @@ -1363,8 +1338,7 @@ static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) { io_tw_lock(req->ctx, ts); - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) + if (unlikely(io_should_terminate_tw())) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); @@ -1422,8 +1396,8 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, io_clean_op(req); } io_put_file(req); - io_put_rsrc_node(ctx, req->rsrc_node); - io_put_task(req->task); + io_req_put_rsrc_nodes(req); + io_put_task(req); node = req->comp_list.next; io_req_add_to_cache(req, ctx); @@ -1885,20 +1859,16 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *slot; + struct io_rsrc_node *node; struct file *file = NULL; io_ring_submit_lock(ctx, issue_flags); - - if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - goto out; - fd = array_index_nospec(fd, ctx->nr_user_files); - slot = io_fixed_file_slot(&ctx->file_table, fd); - if (!req->rsrc_node) - __io_req_set_rsrc_node(req, ctx); - req->flags |= io_slot_flags(slot); - file = io_slot_file(slot); -out: + node = io_rsrc_node_lookup(&ctx->file_table.data, fd); + if (node) { + io_req_assign_rsrc_node(&req->file_node, node); + req->flags |= io_slot_flags(node); + file = io_slot_file(node); + } io_ring_submit_unlock(ctx, issue_flags); return file; } @@ -2043,8 +2013,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->flags = (__force io_req_flags_t) sqe_flags; req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; - req->rsrc_node = NULL; - req->task = current; + req->tctx = current->io_uring; req->cancel_seq_set = false; if (unlikely(opcode >= IORING_OP_LAST)) { @@ -2262,7 +2231,8 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) unsigned mask = ctx->sq_entries - 1; unsigned head = ctx->cached_sq_head++ & mask; - if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) { + if (static_branch_unlikely(&io_key_has_sqarray) && + (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { /* drop invalid entries */ @@ -2273,6 +2243,7 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) READ_ONCE(ctx->rings->sq_dropped) + 1); return false; } + head = array_index_nospec(head, ctx->sq_entries); } /* @@ -2501,9 +2472,10 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct ext_arg { size_t argsz; - struct __kernel_timespec __user *ts; + struct timespec64 ts; const sigset_t __user *sig; ktime_t min_time; + bool ts_set; }; /* @@ -2541,13 +2513,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, iowq.timeout = KTIME_MAX; start_time = io_get_time(ctx); - if (ext_arg->ts) { - struct timespec64 ts; - - if (get_timespec64(&ts, ext_arg->ts)) - return -EFAULT; - - iowq.timeout = timespec64_to_ktime(ts); + if (ext_arg->ts_set) { + iowq.timeout = timespec64_to_ktime(ext_arg->ts); if (!(flags & IORING_ENTER_ABS_TIMER)) iowq.timeout = ktime_add(iowq.timeout, start_time); } @@ -2671,8 +2638,8 @@ static void io_rings_free(struct io_ring_ctx *ctx) ctx->sq_sqes = NULL; } -static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) +unsigned long rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; size_t off, sq_array_size; @@ -2680,7 +2647,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) return SIZE_MAX; - if (ctx->flags & IORING_SETUP_CQE32) { + if (flags & IORING_SETUP_CQE32) { if (check_shl_overflow(off, 1, &off)) return SIZE_MAX; } @@ -2691,7 +2658,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries return SIZE_MAX; #endif - if (ctx->flags & IORING_SETUP_NO_SQARRAY) { + if (flags & IORING_SETUP_NO_SQARRAY) { *sq_offset = SIZE_MAX; return off; } @@ -2728,15 +2695,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); - /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ - if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list))) - return; mutex_lock(&ctx->uring_lock); - if (ctx->buf_data) - __io_sqe_buffers_unregister(ctx); - if (ctx->file_data) - __io_sqe_files_unregister(ctx); + io_sqe_buffers_unregister(ctx); + io_sqe_files_unregister(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_alloc_cache_free(&ctx->apoll_cache, kfree); @@ -2746,34 +2708,31 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); + io_free_region(ctx, &ctx->param_region); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); if (ctx->submitter_task) put_task_struct(ctx->submitter_task); - /* there are no registered resources left, nobody uses it */ - if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx, ctx->rsrc_node); - - WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); - io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } io_rings_free(ctx); + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + static_branch_dec(&io_key_has_sqarray); + percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); - kfree(ctx->cancel_table.hbs); - kfree(ctx->cancel_table_locked.hbs); + kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } @@ -3012,7 +2971,7 @@ static int io_uring_release(struct inode *inode, struct file *file) } struct io_task_cancel { - struct task_struct *task; + struct io_uring_task *tctx; bool all; }; @@ -3021,11 +2980,11 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_task_cancel *cancel = data; - return io_match_task_safe(req, cancel->task, cancel->all); + return io_match_task_safe(req, cancel->tctx, cancel->all); } static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, + struct io_uring_task *tctx, bool cancel_all) { struct io_defer_entry *de; @@ -3033,7 +2992,7 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task_safe(de->req, task, cancel_all)) { + if (io_match_task_safe(de->req, tctx, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } @@ -3076,11 +3035,10 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) } static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, + struct io_uring_task *tctx, bool cancel_all) { - struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; - struct io_uring_task *tctx = task ? task->io_uring : NULL; + struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; enum io_wq_cancel cret; bool ret = false; @@ -3094,9 +3052,9 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if (!ctx->rings) return false; - if (!task) { + if (!tctx) { ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx && tctx->io_wq) { + } else if (tctx->io_wq) { /* * Cancels requests of all rings, not only @ctx, but * it's fine as the task is in exit/exec. @@ -3119,15 +3077,15 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx, INT_MAX) > 0; - ret |= io_cancel_defer_files(ctx, task, cancel_all); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); - ret |= io_poll_remove_all(ctx, task, cancel_all); - ret |= io_waitid_remove_all(ctx, task, cancel_all); - ret |= io_futex_remove_all(ctx, task, cancel_all); - ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all); + ret |= io_poll_remove_all(ctx, tctx, cancel_all); + ret |= io_waitid_remove_all(ctx, tctx, cancel_all); + ret |= io_futex_remove_all(ctx, tctx, cancel_all); + ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); mutex_unlock(&ctx->uring_lock); - ret |= io_kill_timeouts(ctx, task, cancel_all); - if (task) + ret |= io_kill_timeouts(ctx, tctx, cancel_all); + if (tctx) ret |= io_run_task_work() > 0; else ret |= flush_delayed_work(&ctx->fallback_work); @@ -3180,12 +3138,13 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) if (node->ctx->sq_data) continue; loop |= io_uring_try_cancel_requests(node->ctx, - current, cancel_all); + current->io_uring, + cancel_all); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) loop |= io_uring_try_cancel_requests(ctx, - current, + current->io_uring, cancel_all); } @@ -3232,22 +3191,44 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } -static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) +static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, + const struct io_uring_getevents_arg __user *uarg) { - if (flags & IORING_ENTER_EXT_ARG) { - struct io_uring_getevents_arg arg; + unsigned long size = sizeof(struct io_uring_reg_wait); + unsigned long offset = (uintptr_t)uarg; + unsigned long end; - if (argsz != sizeof(arg)) - return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) - return -EFAULT; - } + if (unlikely(offset % sizeof(long))) + return ERR_PTR(-EFAULT); + + /* also protects from NULL ->cq_wait_arg as the size would be 0 */ + if (unlikely(check_add_overflow(offset, size, &end) || + end > ctx->cq_wait_size)) + return ERR_PTR(-EFAULT); + + return ctx->cq_wait_arg + offset; +} + +static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags, + const void __user *argp, size_t argsz) +{ + struct io_uring_getevents_arg arg; + + if (!(flags & IORING_ENTER_EXT_ARG)) + return 0; + if (flags & IORING_ENTER_EXT_ARG_REG) + return -EINVAL; + if (argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, - struct ext_arg *ext_arg) +static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, + const void __user *argp, struct ext_arg *ext_arg) { + const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; /* @@ -3256,7 +3237,28 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, */ if (!(flags & IORING_ENTER_EXT_ARG)) { ext_arg->sig = (const sigset_t __user *) argp; - ext_arg->ts = NULL; + return 0; + } + + if (flags & IORING_ENTER_EXT_ARG_REG) { + struct io_uring_reg_wait *w; + + if (ext_arg->argsz != sizeof(struct io_uring_reg_wait)) + return -EINVAL; + w = io_get_ext_arg_reg(ctx, argp); + if (IS_ERR(w)) + return PTR_ERR(w); + + if (w->flags & ~IORING_REG_WAIT_TS) + return -EINVAL; + ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC; + ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask)); + ext_arg->argsz = READ_ONCE(w->sigmask_sz); + if (w->flags & IORING_REG_WAIT_TS) { + ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec); + ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec); + ext_arg->ts_set = true; + } return 0; } @@ -3266,13 +3268,32 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, */ if (ext_arg->argsz != sizeof(arg)) return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) +#ifdef CONFIG_64BIT + if (!user_access_begin(uarg, sizeof(*uarg))) return -EFAULT; + unsafe_get_user(arg.sigmask, &uarg->sigmask, uaccess_end); + unsafe_get_user(arg.sigmask_sz, &uarg->sigmask_sz, uaccess_end); + unsafe_get_user(arg.min_wait_usec, &uarg->min_wait_usec, uaccess_end); + unsafe_get_user(arg.ts, &uarg->ts, uaccess_end); + user_access_end(); +#else + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; +#endif ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; ext_arg->sig = u64_to_user_ptr(arg.sigmask); ext_arg->argsz = arg.sigmask_sz; - ext_arg->ts = u64_to_user_ptr(arg.ts); + if (arg.ts) { + if (get_timespec64(&ext_arg->ts, u64_to_user_ptr(arg.ts))) + return -EFAULT; + ext_arg->ts_set = true; + } return 0; +#ifdef CONFIG_64BIT +uaccess_end: + user_access_end(); + return -EFAULT; +#endif } SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, @@ -3286,7 +3307,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | IORING_ENTER_REGISTERED_RING | - IORING_ENTER_ABS_TIMER))) + IORING_ENTER_ABS_TIMER | + IORING_ENTER_EXT_ARG_REG))) return -EINVAL; /* @@ -3369,7 +3391,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ mutex_lock(&ctx->uring_lock); iopoll_locked: - ret2 = io_validate_ext_arg(flags, argp, argsz); + ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); @@ -3379,7 +3401,7 @@ iopoll_locked: } else { struct ext_arg ext_arg = { .argsz = argsz }; - ret2 = io_get_ext_arg(flags, argp, &ext_arg); + ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); @@ -3436,7 +3458,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); + size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, + &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -3502,14 +3525,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) O_RDWR | O_CLOEXEC, NULL); } -static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) +int io_uring_fill_params(unsigned entries, struct io_uring_params *p) { - struct io_ring_ctx *ctx; - struct io_uring_task *tctx; - struct file *file; - int ret; - if (!entries) return -EINVAL; if (entries > IORING_MAX_ENTRIES) { @@ -3551,6 +3568,42 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_entries = 2 * p->sq_entries; } + p->sq_off.head = offsetof(struct io_rings, sq.head); + p->sq_off.tail = offsetof(struct io_rings, sq.tail); + p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); + p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); + p->sq_off.flags = offsetof(struct io_rings, sq_flags); + p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); + p->sq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->sq_off.user_addr = 0; + + p->cq_off.head = offsetof(struct io_rings, cq.head); + p->cq_off.tail = offsetof(struct io_rings, cq.tail); + p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); + p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); + p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); + p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->cq_off.flags = offsetof(struct io_rings, cq_flags); + p->cq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->cq_off.user_addr = 0; + + return 0; +} + +static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) +{ + struct io_ring_ctx *ctx; + struct io_uring_task *tctx; + struct file *file; + int ret; + + ret = io_uring_fill_params(entries, p); + if (unlikely(ret)) + return ret; + ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; @@ -3558,6 +3611,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->clockid = CLOCK_MONOTONIC; ctx->clock_offset = 0; + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + static_branch_inc(&io_key_has_sqarray); + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_SQPOLL)) @@ -3608,6 +3664,11 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->notify_method = TWA_SIGNAL; } + /* HYBRID_IOPOLL only valid with IOPOLL */ + if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) == + IORING_SETUP_HYBRID_IOPOLL) + goto err; + /* * For DEFER_TASKRUN we require the completion task to be the same as the * submission task. This implies that there is only one submitter, so enforce @@ -3631,37 +3692,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - ret = io_sq_offload_create(ctx, p); - if (ret) - goto err; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - ret = io_rsrc_init(ctx); + ret = io_sq_offload_create(ctx, p); if (ret) goto err; - p->sq_off.head = offsetof(struct io_rings, sq.head); - p->sq_off.tail = offsetof(struct io_rings, sq.tail); - p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); - p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); - p->sq_off.flags = offsetof(struct io_rings, sq_flags); - p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); - if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - p->sq_off.resv1 = 0; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - p->sq_off.user_addr = 0; - - p->cq_off.head = offsetof(struct io_rings, cq.head); - p->cq_off.tail = offsetof(struct io_rings, cq.tail); - p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); - p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); - p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); - p->cq_off.cqes = offsetof(struct io_rings, cqes); - p->cq_off.flags = offsetof(struct io_rings, cq_flags); - p->cq_off.resv1 = 0; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - p->cq_off.user_addr = 0; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | @@ -3737,7 +3774,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY)) + IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -3775,6 +3812,8 @@ static int __init io_uring_init(void) struct kmem_cache_args kmem_args = { .useroffset = offsetof(struct io_kiocb, cmd.data), .usersize = sizeof_field(struct io_kiocb, cmd.data), + .freeptr_offset = offsetof(struct io_kiocb, work), + .use_freeptr_offset = true, }; #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ |