author    | Jens Axboe <axboe@kernel.dk> | 2024-03-13 03:24:21 +0100
committer | Jens Axboe <axboe@kernel.dk> | 2024-04-15 16:10:26 +0200
commit    | 87585b05757dc70545efb434669708d276125559 (patch)
tree      | d3020002d23692f431baf61797e51d439312950f /io_uring/kbuf.c
parent    | io_uring/kbuf: vmap pinned buffer ring (diff)
download  | linux-87585b05757dc70545efb434669708d276125559.tar.xz
          | linux-87585b05757dc70545efb434669708d276125559.zip
io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring
Rather than use remap_pfn_range() for this and manually free the memory
later, switch to using vm_insert_pages() and have it Just Work.
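
As a rough illustration (not code from this tree), a vm_insert_pages() based
mmap helper can look like the sketch below. The helper name mmap_kernel_pages()
is made up; it assumes the ring pages have already been allocated into a
struct page array, which is what the in-tree io_uring_mmap_pages() helper used
further down operates on.

#include <linux/mm.h>

/*
 * Sketch: back an entire VMA with pre-allocated kernel pages. Each PTE
 * created by vm_insert_pages() takes its own reference on the page, so
 * the mapping stays valid even after the allocator's own reference to
 * the pages is dropped, which is what lets the deferred-free machinery
 * that remap_pfn_range() required go away.
 */
static int mmap_kernel_pages(struct vm_area_struct *vma,
			     struct page **pages, unsigned long nr_pages)
{
	unsigned long len = vma->vm_end - vma->vm_start;
	unsigned long nr;

	/* the VMA must not extend past the allocated region */
	if (len > nr_pages << PAGE_SHIFT)
		return -EINVAL;

	nr = len >> PAGE_SHIFT;
	vm_flags_set(vma, VM_DONTEXPAND);
	return vm_insert_pages(vma, vma->vm_start, pages, &nr);
}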
This requires a bit of effort on the mmap lookup side, as the ctx
uring_lock isn't held there. That lock otherwise protects buffer_lists
from being torn down, but it's not safe to grab it from mmap context, as
that would introduce an ABBA deadlock between the mmap lock and the ctx
uring_lock. Instead, look up the buffer_list under RCU, as the list is
already RCU freed. Use the existing reference count to determine whether
it's safe to grab a reference to it (i.e. whether the count isn't already
zero), and drop that reference when done with the mapping. If the mmap
reference is the last one, the buffer_list and the associated memory can
go away, since the vma insertion holds references to the inserted pages
at that point.
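
The lookup described above, as a minimal sketch (the real implementation is
io_pbuf_get_bl() in io_uring/kbuf.c; the helper name here is illustrative, and
the atomic_t refs field and the is_mmap flag are taken as given from the
surrounding code):

/*
 * From mmap context the ctx uring_lock cannot be taken, so find the
 * buffer_list under RCU and only use it if a reference can still be
 * grabbed, i.e. the refcount hasn't already dropped to zero. The caller
 * drops the reference again with io_put_bl() once the pages have been
 * inserted into the vma.
 */
static struct io_buffer_list *pbuf_get_bl_rcu(struct io_ring_ctx *ctx,
					      unsigned long bgid)
{
	struct io_buffer_list *bl;
	bool got_ref = false;

	rcu_read_lock();
	bl = xa_load(&ctx->io_bl_xa, bgid);
	/* only a kernel-allocated, mmap'able buffer ring qualifies */
	if (bl && bl->is_mmap)
		got_ref = atomic_inc_not_zero(&bl->refs);
	rcu_read_unlock();

	return got_ref ? bl : ERR_PTR(-EINVAL);
}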
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'io_uring/kbuf.c')
-rw-r--r-- | io_uring/kbuf.c | 134
1 file changed, 26 insertions(+), 108 deletions(-)
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 4289f4a92693..820ac599d003 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -32,25 +32,12 @@ struct io_provide_buf {
 	__u16				bid;
 };
 
-struct io_buf_free {
-	struct hlist_node		list;
-	void				*mem;
-	size_t				size;
-	int				inuse;
-};
-
-static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
-							   unsigned int bgid)
-{
-	return xa_load(&ctx->io_bl_xa, bgid);
-}
-
 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							 unsigned int bgid)
 {
 	lockdep_assert_held(&ctx->uring_lock);
 
-	return __io_buffer_get_list(ctx, bgid);
+	return xa_load(&ctx->io_bl_xa, bgid);
 }
 
 static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -191,24 +178,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 	return ret;
 }
 
-/*
- * Mark the given mapped range as free for reuse
- */
-static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
-{
-	struct io_buf_free *ibf;
-
-	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
-		if (bl->buf_ring == ibf->mem) {
-			ibf->inuse = 0;
-			return;
-		}
-	}
-
-	/* can't happen... */
-	WARN_ON_ONCE(1);
-}
-
 static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
 {
@@ -220,23 +189,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 
 	if (bl->is_buf_ring) {
 		i = bl->buf_ring->tail - bl->head;
-		if (bl->is_mmap) {
-			/*
-			 * io_kbuf_list_free() will free the page(s) at
-			 * ->release() time.
-			 */
-			io_kbuf_mark_free(ctx, bl);
-			bl->buf_ring = NULL;
-			bl->is_mmap = 0;
-		} else if (bl->buf_nr_pages) {
+		if (bl->buf_nr_pages) {
 			int j;
 
-			for (j = 0; j < bl->buf_nr_pages; j++)
-				unpin_user_page(bl->buf_pages[j]);
-			kvfree(bl->buf_pages);
-			vunmap(bl->buf_ring);
-			bl->buf_pages = NULL;
-			bl->buf_nr_pages = 0;
+			if (!bl->is_mmap) {
+				for (j = 0; j < bl->buf_nr_pages; j++)
+					unpin_user_page(bl->buf_pages[j]);
+			}
+			io_pages_unmap(bl->buf_ring, &bl->buf_pages,
+					&bl->buf_nr_pages, bl->is_mmap);
+			bl->is_mmap = 0;
 		}
 		/* make sure it's seen as empty */
 		INIT_LIST_HEAD(&bl->buf_list);
@@ -537,63 +499,18 @@ error_unpin:
 	return ret;
 }
 
-/*
- * See if we have a suitable region that we can reuse, rather than allocate
- * both a new io_buf_free and mem region again. We leave it on the list as
- * even a reused entry will need freeing at ring release.
- */
-static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
-						    size_t ring_size)
-{
-	struct io_buf_free *ibf, *best = NULL;
-	size_t best_dist;
-
-	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
-		size_t dist;
-
-		if (ibf->inuse || ibf->size < ring_size)
-			continue;
-		dist = ibf->size - ring_size;
-		if (!best || dist < best_dist) {
-			best = ibf;
-			if (!dist)
-				break;
-			best_dist = dist;
-		}
-	}
-
-	return best;
-}
-
 static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
			      struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
 {
-	struct io_buf_free *ibf;
 	size_t ring_size;
-	void *ptr;
 
 	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
 
-	/* Reuse existing entry, if we can */
-	ibf = io_lookup_buf_free_entry(ctx, ring_size);
-	if (!ibf) {
-		ptr = io_mem_alloc(ring_size);
-		if (IS_ERR(ptr))
-			return PTR_ERR(ptr);
-
-		/* Allocate and store deferred free entry */
-		ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
-		if (!ibf) {
-			io_mem_free(ptr);
-			return -ENOMEM;
-		}
-		ibf->mem = ptr;
-		ibf->size = ring_size;
-		hlist_add_head(&ibf->list, &ctx->io_buf_list);
-	}
-	ibf->inuse = 1;
-	bl->buf_ring = ibf->mem;
+	bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
+	if (!bl->buf_ring)
+		return -ENOMEM;
+
 	bl->is_buf_ring = 1;
 	bl->is_mmap = 1;
 	return 0;
@@ -741,18 +658,19 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
 	return ERR_PTR(-EINVAL);
 }
 
-/*
- * Called at or after ->release(), free the mmap'ed buffers that we used
- * for memory mapped provided buffer rings.
- */
-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct io_buf_free *ibf;
-	struct hlist_node *tmp;
+	struct io_ring_ctx *ctx = file->private_data;
+	loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
+	struct io_buffer_list *bl;
+	int bgid, ret;
 
-	hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
-		hlist_del(&ibf->list);
-		io_mem_free(ibf->mem);
-		kfree(ibf);
-	}
+	bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+	bl = io_pbuf_get_bl(ctx, bgid);
+	if (IS_ERR(bl))
+		return PTR_ERR(bl);
+
+	ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
+	io_put_bl(ctx, bl);
+	return ret;
 }
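
For context, the userspace path that reaches io_pbuf_mmap() looks roughly like
the sketch below (error handling trimmed; the helper name map_pbuf_ring() is
made up and ring_fd is assumed to be an already set up io_uring fd). A buffer
ring is registered with IOU_PBUF_RING_MMAP so the kernel allocates the ring
memory, and the ring is then mmap'ed back at the IORING_OFF_PBUF_RING offset
with the buffer group id encoded in the offset, which is what the new
io_pbuf_mmap() decodes.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static struct io_uring_buf_ring *map_pbuf_ring(int ring_fd, __u32 entries,
					       __u16 bgid)
{
	struct io_uring_buf_reg reg;
	size_t ring_size = entries * sizeof(struct io_uring_buf);
	void *ring;

	memset(&reg, 0, sizeof(reg));
	reg.ring_entries = entries;	/* must be a power of two */
	reg.bgid = bgid;
	reg.flags = IOU_PBUF_RING_MMAP;	/* kernel allocates the ring memory */

	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PBUF_RING,
		    &reg, 1) < 0)
		return NULL;

	/* the buffer group id is encoded in the mmap offset */
	ring = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    ring_fd, IORING_OFF_PBUF_RING |
			     ((__u64)bgid << IORING_OFF_PBUF_SHIFT));
	return ring == MAP_FAILED ? NULL : ring;
}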