 include/uapi/linux/io_uring.h |  8
 io_uring/fdinfo.c             | 34
 io_uring/io_uring.c           | 37
 io_uring/io_uring.h           | 14
 io_uring/opdef.c              | 26
 io_uring/opdef.h              |  2
 io_uring/register.c           |  2
 io_uring/uring_cmd.c          | 17
 8 files changed, 114 insertions(+), 26 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 263bed13473e..04797a9b76bc 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -231,6 +231,12 @@ enum io_uring_sqe_flags_bit {
  */
 #define IORING_SETUP_CQE_MIXED	(1U << 18)
 
+/*
+ * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have
+ * a 128b opcode.
+ */
+#define IORING_SETUP_SQE_MIXED	(1U << 19)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
@@ -295,6 +301,8 @@ enum io_uring_op {
 	IORING_OP_READV_FIXED,
 	IORING_OP_WRITEV_FIXED,
 	IORING_OP_PIPE,
+	IORING_OP_NOP128,
+	IORING_OP_URING_CMD128,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ff3364531c77..1a806ad16840 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -14,6 +14,7 @@
 #include "fdinfo.h"
 #include "cancel.h"
 #include "rsrc.h"
+#include "opdef.h"
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
@@ -66,7 +67,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	unsigned int cq_head = READ_ONCE(r->cq.head);
 	unsigned int cq_tail = READ_ONCE(r->cq.tail);
 	unsigned int sq_shift = 0;
-	unsigned int sq_entries;
 	int sq_pid = -1, sq_cpu = -1;
 	u64 sq_total_time = 0, sq_work_time = 0;
 	unsigned int i;
@@ -89,26 +89,45 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	seq_printf(m, "CqTail:\t%u\n", cq_tail);
 	seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail));
 	seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
-	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
-	for (i = 0; i < sq_entries; i++) {
-		unsigned int entry = i + sq_head;
+	while (sq_head < sq_tail) {
 		struct io_uring_sqe *sqe;
 		unsigned int sq_idx;
+		bool sqe128 = false;
+		u8 opcode;
 
 		if (ctx->flags & IORING_SETUP_NO_SQARRAY)
 			break;
-		sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+		sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]);
 		if (sq_idx > sq_mask)
 			continue;
+
 		sqe = &ctx->sq_sqes[sq_idx << sq_shift];
+		opcode = READ_ONCE(sqe->opcode);
+		if (sq_shift) {
+			sqe128 = true;
+		} else if (io_issue_defs[opcode].is_128) {
+			if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) {
+				seq_printf(m, "%5u: invalid sqe, 128B entry on non-mixed sq\n",
+					   sq_idx);
+				break;
+			}
+			if ((++sq_head & sq_mask) == 0) {
+				seq_printf(m, "%5u: corrupted sqe, wrapping 128B entry\n",
+					   sq_idx);
+				break;
+			}
+			sqe128 = true;
+		}
 		seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
 			      "addr:0x%llx, rw_flags:0x%x, buf_index:%d "
 			      "user_data:%llu",
-			   sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
+			   sq_idx, io_uring_get_opcode(opcode), sqe->fd,
 			   sqe->flags, (unsigned long long) sqe->off,
 			   (unsigned long long) sqe->addr, sqe->rw_flags,
 			   sqe->buf_index, sqe->user_data);
-		if (sq_shift) {
+		if (sqe128) {
 			u64 *sqeb = (void *) (sqe + 1);
 			int size = sizeof(struct io_uring_sqe) / sizeof(u64);
 			int j;
@@ -120,6 +139,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 			}
 		}
 		seq_printf(m, "\n");
+		sq_head++;
 	}
 	seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
 	while (cq_head < cq_tail) {
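For reference, userspace opts in to the new mode by passing IORING_SETUP_SQE_MIXED at setup time, then posting 64b SQEs as usual and using the *128 opcodes when a big SQE is needed. A minimal sketch, assuming a kernel with this patch and the updated uapi header; liburing has no helpers for this yet, so the raw io_uring_setup() syscall is used and error handling is elided:

	#include <linux/io_uring.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int setup_mixed_ring(unsigned int entries, struct io_uring_params *p)
	{
		memset(p, 0, sizeof(*p));
		/* both 64b and 128b SQEs may be posted on this ring */
		p->flags = IORING_SETUP_SQE_MIXED;
		return syscall(__NR_io_uring_setup, entries, p);
	}
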
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e4ede0bad36f..be44d636fe1f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2164,7 +2164,7 @@ static __cold int io_init_fail_req(struct io_kiocb *req, int err)
 }
 
 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
-		       const struct io_uring_sqe *sqe)
+		       const struct io_uring_sqe *sqe, unsigned int *left)
 	__must_hold(&ctx->uring_lock)
 {
 	const struct io_issue_def *def;
@@ -2190,6 +2190,24 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	opcode = array_index_nospec(opcode, IORING_OP_LAST);
 	def = &io_issue_defs[opcode];
+	if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) {
+		/*
+		 * A 128b op on a non-128b SQ requires mixed SQE support as
+		 * well as 2 contiguous entries.
+		 */
+		if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
+		    !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
+			return io_init_fail_req(req, -EINVAL);
+		/*
+		 * A 128b operation on a mixed SQ uses two entries, so we have
+		 * to increment the head and cached refs, and decrement what's
+		 * left.
+		 */
+		current->io_uring->cached_refs++;
+		ctx->cached_sq_head++;
+		(*left)--;
+	}
+
 	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
 		/* enforce forwards compatibility on users */
 		if (sqe_flags & ~SQE_VALID_FLAGS)
@@ -2299,13 +2317,13 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
 }
 
 static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-				const struct io_uring_sqe *sqe)
+				const struct io_uring_sqe *sqe, unsigned int *left)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_submit_link *link = &ctx->submit_state.link;
 	int ret;
 
-	ret = io_init_req(ctx, req, sqe);
+	ret = io_init_req(ctx, req, sqe, left);
 	if (unlikely(ret))
 		return io_submit_fail_init(sqe, req, ret);
 
@@ -2457,7 +2475,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		 * Continue submitting even for sqe failure if the
 		 * ring was setup with IORING_SETUP_SUBMIT_ALL
 		 */
-		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
+		if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) &&
 		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
 			left--;
 			break;
@@ -2802,6 +2820,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
 		if (cq_entries < 2)
 			return SIZE_MAX;
 	}
+	if (flags & IORING_SETUP_SQE_MIXED) {
+		if (sq_entries < 2)
+			return SIZE_MAX;
+	}
 
 #ifdef CONFIG_SMP
 	off = ALIGN(off, SMP_CACHE_BYTES);
@@ -3726,6 +3748,13 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
 	if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
 	    (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
 		return -EINVAL;
+	/*
+	 * Nonsensical to ask for SQE128 and mixed SQE support, it's not
+	 * supported to post 64b SQEs on a ring setup with SQE128.
+	 */
+	if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) ==
+	    (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED))
+		return -EINVAL;
 
 	return 0;
 }
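The io_init_req() check above encodes the submission contract: a 128b op on a mixed SQ needs two contiguous entries, and the pair may not straddle the end of the ring (cached_sq_head has already been advanced past the first slot at this point, so a masked head of zero means the second half would wrap to slot 0). A hypothetical userspace-side equivalent of the same rule, for illustration only; sq_can_take_128() is not kernel or liburing API:

	#include <stdbool.h>

	/* tail: slot the 128b SQE would start at; entries: SQ size, power of 2 */
	static bool sq_can_take_128(unsigned int tail, unsigned int entries,
				    unsigned int free_slots)
	{
		unsigned int mask = entries - 1;

		/* both halves must be available */
		if (free_slots < 2)
			return false;
		/* starting in the last slot would wrap the second half to slot 0 */
		return (tail & mask) != mask;
	}
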
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 78777bf1ea4b..44b8091c7fcd 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -54,7 +54,8 @@
 			IORING_SETUP_REGISTERED_FD_ONLY |\
 			IORING_SETUP_NO_SQARRAY |\
 			IORING_SETUP_HYBRID_IOPOLL |\
-			IORING_SETUP_CQE_MIXED)
+			IORING_SETUP_CQE_MIXED |\
+			IORING_SETUP_SQE_MIXED)
 
 #define IORING_ENTER_FLAGS	(IORING_ENTER_GETEVENTS |\
 				 IORING_ENTER_SQ_WAKEUP |\
@@ -565,17 +566,6 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
 	io_req_task_work_add(req);
 }
 
-/*
- * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
- * slot.
- */
-static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
-{
-	if (ctx->flags & IORING_SETUP_SQE128)
-		return 2 * sizeof(struct io_uring_sqe);
-	return sizeof(struct io_uring_sqe);
-}
-
 static inline bool io_file_can_poll(struct io_kiocb *req)
 {
 	if (req->flags & REQ_F_CAN_POLL)
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 932319633eac..df52d760240e 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -575,6 +575,24 @@ const struct io_issue_def io_issue_defs[] = {
 		.prep = io_pipe_prep,
 		.issue = io_pipe,
 	},
+	[IORING_OP_NOP128] = {
+		.audit_skip = 1,
+		.iopoll = 1,
+		.is_128 = 1,
+		.prep = io_nop_prep,
+		.issue = io_nop,
+	},
+	[IORING_OP_URING_CMD128] = {
+		.buffer_select = 1,
+		.needs_file = 1,
+		.plug = 1,
+		.iopoll = 1,
+		.iopoll_queue = 1,
+		.is_128 = 1,
+		.async_size = sizeof(struct io_async_cmd),
+		.prep = io_uring_cmd_prep,
+		.issue = io_uring_cmd,
+	},
 };
 
 const struct io_cold_def io_cold_defs[] = {
@@ -825,6 +843,14 @@ const struct io_cold_def io_cold_defs[] = {
 	[IORING_OP_PIPE] = {
 		.name = "PIPE",
 	},
+	[IORING_OP_NOP128] = {
+		.name = "NOP128",
+	},
+	[IORING_OP_URING_CMD128] = {
+		.name = "URING_CMD128",
+		.sqe_copy = io_uring_cmd_sqe_copy,
+		.cleanup = io_uring_cmd_cleanup,
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index c2f0907ed78c..aa37846880ff 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -27,6 +27,8 @@ struct io_issue_def {
 	unsigned iopoll_queue : 1;
 	/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
 	unsigned vectored : 1;
+	/* set to 1 if this opcode uses 128b sqes in a mixed sq */
+	unsigned is_128 : 1;
 
 	/* size of async data needed, if any */
 	unsigned short async_size;
diff --git a/io_uring/register.c b/io_uring/register.c
index 43eb02004824..1a3e05be6e7b 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -394,7 +394,7 @@ static void io_register_free_rings(struct io_ring_ctx *ctx,
 #define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
 #define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
 			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
-			 IORING_SETUP_CQE_MIXED)
+			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
 
 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 {
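With the opcode table entries above, the kernel knows which opcodes imply a big SQE, so userspace signals one simply by writing a *128 opcode and leaving the following slot as the SQE's second half. A sketch of queueing IORING_OP_NOP128 on a mixed ring, assuming the caller already verified the two-slot/no-wrap rule; sqes, tail and mask stand in for the mmap'ed SQ state and are illustrative names:

	#include <linux/io_uring.h>
	#include <string.h>

	static void queue_nop128(struct io_uring_sqe *sqes, unsigned int *tail,
				 unsigned int mask)
	{
		struct io_uring_sqe *sqe = &sqes[*tail & mask];

		memset(sqe, 0, 2 * sizeof(*sqe));	/* clear both halves */
		sqe->opcode = IORING_OP_NOP128;		/* 128b opcode implies a big SQE */
		*tail += 2;				/* consumes two ring slots */
	}
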
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 1225f8124e4b..9d67a2a721aa 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -216,6 +216,18 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
+/*
+ * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
+ * slot.
+ */
+static inline size_t uring_sqe_size(struct io_kiocb *req)
+{
+	if (req->ctx->flags & IORING_SETUP_SQE128 ||
+	    req->opcode == IORING_OP_URING_CMD128)
+		return 2 * sizeof(struct io_uring_sqe);
+	return sizeof(struct io_uring_sqe);
+}
+
 void io_uring_cmd_sqe_copy(struct io_kiocb *req)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -224,7 +236,7 @@ void io_uring_cmd_sqe_copy(struct io_kiocb *req)
 	/* Should not happen, as REQ_F_SQE_COPIED covers this */
 	if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes))
 		return;
-	memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
+	memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req));
 	ioucmd->sqe = ac->sqes;
 }
 
@@ -242,7 +254,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	if (ret)
 		return ret;
 
-	if (ctx->flags & IORING_SETUP_SQE128)
+	if (ctx->flags & IORING_SETUP_SQE128 ||
+	    req->opcode == IORING_OP_URING_CMD128)
 		issue_flags |= IO_URING_F_SQE128;
 	if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED))
 		issue_flags |= IO_URING_F_CQE32;
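On the issue side, IO_URING_F_SQE128 is now set either when the ring was created with IORING_SETUP_SQE128 or when the request is an IORING_OP_URING_CMD128, so existing ->uring_cmd() implementations that key off that flag keep working unchanged. A sketch of a hypothetical handler ("mydrv" is an illustrative name, not from this patch) that requires the big-SQE payload:

	#include <linux/io_uring/cmd.h>

	static int mydrv_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
	{
		const void *payload;

		/* the 80-byte command area only exists on a 128b SQE */
		if (!(issue_flags & IO_URING_F_SQE128))
			return -EOPNOTSUPP;

		payload = io_uring_sqe_cmd(cmd->sqe);
		/* ... decode and execute the device command from payload ... */
		(void)payload;
		return 0;
	}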