summary | refs | log | tree | commit | diff
path: root/drivers/nvme
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme')
-rw-r--r-- drivers/nvme/host/apple.c | 58
-rw-r--r-- drivers/nvme/host/core.c | 14
-rw-r--r-- drivers/nvme/host/fc.c | 67
-rw-r--r-- drivers/nvme/host/ioctl.c | 15
-rw-r--r-- drivers/nvme/host/pci.c | 41
-rw-r--r-- drivers/nvme/host/tcp.c | 85
-rw-r--r-- drivers/nvme/target/core.c | 40
-rw-r--r-- drivers/nvme/target/nvmet.h | 15
-rw-r--r-- drivers/nvme/target/pci-epf.c | 67
-rw-r--r-- drivers/nvme/target/rdma.c | 33
-rw-r--r-- drivers/nvme/target/tcp.c | 15
11 files changed, 273 insertions, 177 deletions
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index 1de11b722f04..8971aca41e63 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -599,7 +599,8 @@ static inline void apple_nvme_handle_cqe(struct apple_nvme_queue *q,
}
if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
- !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
+ !blk_mq_add_to_batch(req, iob,
+ nvme_req(req)->status != NVME_SC_SUCCESS,
apple_nvme_complete_batch))
apple_nvme_complete_rq(req);
}
@@ -1011,25 +1012,37 @@ static void apple_nvme_reset_work(struct work_struct *work)
ret = apple_rtkit_shutdown(anv->rtk);
if (ret)
goto out;
+
+ writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL);
}
- writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL);
+ /*
+ * Only do the soft-reset if the CPU is not running, which means either we
+ * or the previous stage shut it down cleanly.
+ */
+ if (!(readl(anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL) &
+ APPLE_ANS_COPROC_CPU_CONTROL_RUN)) {
- ret = reset_control_assert(anv->reset);
- if (ret)
- goto out;
+ ret = reset_control_assert(anv->reset);
+ if (ret)
+ goto out;
- ret = apple_rtkit_reinit(anv->rtk);
- if (ret)
- goto out;
+ ret = apple_rtkit_reinit(anv->rtk);
+ if (ret)
+ goto out;
- ret = reset_control_deassert(anv->reset);
- if (ret)
- goto out;
+ ret = reset_control_deassert(anv->reset);
+ if (ret)
+ goto out;
+
+ writel(APPLE_ANS_COPROC_CPU_CONTROL_RUN,
+ anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL);
+
+ ret = apple_rtkit_boot(anv->rtk);
+ } else {
+ ret = apple_rtkit_wake(anv->rtk);
+ }
- writel(APPLE_ANS_COPROC_CPU_CONTROL_RUN,
- anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL);
- ret = apple_rtkit_boot(anv->rtk);
if (ret) {
dev_err(anv->dev, "ANS did not boot");
goto out;
@@ -1516,6 +1529,7 @@ static struct apple_nvme *apple_nvme_alloc(struct platform_device *pdev)
return anv;
put_dev:
+ apple_nvme_detach_genpd(anv);
put_device(anv->dev);
return ERR_PTR(ret);
}
@@ -1549,6 +1563,7 @@ out_uninit_ctrl:
nvme_uninit_ctrl(&anv->ctrl);
out_put_ctrl:
nvme_put_ctrl(&anv->ctrl);
+ apple_nvme_detach_genpd(anv);
return ret;
}
@@ -1563,9 +1578,12 @@ static void apple_nvme_remove(struct platform_device *pdev)
apple_nvme_disable(anv, true);
nvme_uninit_ctrl(&anv->ctrl);
- if (apple_rtkit_is_running(anv->rtk))
+ if (apple_rtkit_is_running(anv->rtk)) {
apple_rtkit_shutdown(anv->rtk);
+ writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL);
+ }
+
apple_nvme_detach_genpd(anv);
}
@@ -1574,8 +1592,11 @@ static void apple_nvme_shutdown(struct platform_device *pdev)
struct apple_nvme *anv = platform_get_drvdata(pdev);
apple_nvme_disable(anv, true);
- if (apple_rtkit_is_running(anv->rtk))
+ if (apple_rtkit_is_running(anv->rtk)) {
apple_rtkit_shutdown(anv->rtk);
+
+ writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL);
+ }
}
static int apple_nvme_resume(struct device *dev)
@@ -1592,10 +1613,11 @@ static int apple_nvme_suspend(struct device *dev)
apple_nvme_disable(anv, true);
- if (apple_rtkit_is_running(anv->rtk))
+ if (apple_rtkit_is_running(anv->rtk)) {
ret = apple_rtkit_shutdown(anv->rtk);
- writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL);
+ writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL);
+ }
return ret;
}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 818d4e49aab5..8359d0aa0e44 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -431,6 +431,12 @@ static inline void nvme_end_req_zoned(struct request *req)
static inline void __nvme_end_req(struct request *req)
{
+ if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
+ if (blk_rq_is_passthrough(req))
+ nvme_log_err_passthru(req);
+ else
+ nvme_log_error(req);
+ }
nvme_end_req_zoned(req);
nvme_trace_bio_complete(req);
if (req->cmd_flags & REQ_NVME_MPATH)
@@ -441,12 +447,6 @@ void nvme_end_req(struct request *req)
{
blk_status_t status = nvme_error_status(nvme_req(req)->status);
- if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
- if (blk_rq_is_passthrough(req))
- nvme_log_err_passthru(req);
- else
- nvme_log_error(req);
- }
__nvme_end_req(req);
blk_mq_end_request(req, status);
}
@@ -564,8 +564,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (new_state) {
case NVME_CTRL_LIVE:
switch (old_state) {
- case NVME_CTRL_NEW:
- case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
changed = true;
fallthrough;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index f4f1866fbd5b..b9929a5a7f4e 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -781,61 +781,12 @@ restart:
static void
nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
{
- enum nvme_ctrl_state state;
- unsigned long flags;
-
dev_info(ctrl->ctrl.device,
"NVME-FC{%d}: controller connectivity lost. Awaiting "
"Reconnect", ctrl->cnum);
- spin_lock_irqsave(&ctrl->lock, flags);
set_bit(ASSOC_FAILED, &ctrl->flags);
- state = nvme_ctrl_state(&ctrl->ctrl);
- spin_unlock_irqrestore(&ctrl->lock, flags);
-
- switch (state) {
- case NVME_CTRL_NEW:
- case NVME_CTRL_LIVE:
- /*
- * Schedule a controller reset. The reset will terminate the
- * association and schedule the reconnect timer. Reconnects
- * will be attempted until either the ctlr_loss_tmo
- * (max_retries * connect_delay) expires or the remoteport's
- * dev_loss_tmo expires.
- */
- if (nvme_reset_ctrl(&ctrl->ctrl)) {
- dev_warn(ctrl->ctrl.device,
- "NVME-FC{%d}: Couldn't schedule reset.\n",
- ctrl->cnum);
- nvme_delete_ctrl(&ctrl->ctrl);
- }
- break;
-
- case NVME_CTRL_CONNECTING:
- /*
- * The association has already been terminated and the
- * controller is attempting reconnects. No need to do anything
- * futher. Reconnects will be attempted until either the
- * ctlr_loss_tmo (max_retries * connect_delay) expires or the
- * remoteport's dev_loss_tmo expires.
- */
- break;
-
- case NVME_CTRL_RESETTING:
- /*
- * Controller is already in the process of terminating the
- * association. No need to do anything further. The reconnect
- * step will kick in naturally after the association is
- * terminated.
- */
- break;
-
- case NVME_CTRL_DELETING:
- case NVME_CTRL_DELETING_NOIO:
- default:
- /* no action to take - let it delete */
- break;
- }
+ nvme_reset_ctrl(&ctrl->ctrl);
}
/**
@@ -3071,7 +3022,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
struct nvmefc_ls_rcv_op *disls = NULL;
unsigned long flags;
int ret;
- bool changed;
++ctrl->ctrl.nr_reconnects;
@@ -3177,23 +3127,18 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
else
ret = nvme_fc_recreate_io_queues(ctrl);
}
+ if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags))
+ ret = -EIO;
if (ret)
goto out_term_aen_ops;
- spin_lock_irqsave(&ctrl->lock, flags);
- if (!test_bit(ASSOC_FAILED, &ctrl->flags))
- changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
- else
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) {
ret = -EIO;
- spin_unlock_irqrestore(&ctrl->lock, flags);
-
- if (ret)
goto out_term_aen_ops;
+ }
ctrl->ctrl.nr_reconnects = 0;
-
- if (changed)
- nvme_start_ctrl(&ctrl->ctrl);
+ nvme_start_ctrl(&ctrl->ctrl);
return 0; /* Success */
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index e8930146847a..24e2c702da7a 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -128,8 +128,10 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
if (!nvme_ctrl_sgl_supported(ctrl))
dev_warn_once(ctrl->device, "using unchecked data buffer\n");
if (has_metadata) {
- if (!supports_metadata)
- return -EINVAL;
+ if (!supports_metadata) {
+ ret = -EINVAL;
+ goto out;
+ }
if (!nvme_ctrl_meta_sgl_supported(ctrl))
dev_warn_once(ctrl->device,
"using unchecked metadata buffer\n");
@@ -139,8 +141,10 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
struct iov_iter iter;
/* fixedbufs is only for non-vectored io */
- if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC))
- return -EINVAL;
+ if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) {
+ ret = -EINVAL;
+ goto out;
+ }
ret = io_uring_cmd_import_fixed(ubuffer, bufflen,
rq_data_dir(req), &iter, ioucmd);
if (ret < 0)
@@ -283,8 +287,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
{
if (ns && nsid != ns->head->ns_id) {
dev_err(ctrl->device,
- "%s: nsid (%u) in cmd does not match nsid (%u)"
- "of namespace\n",
+ "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n",
current->comm, nsid, ns->head->ns_id);
return false;
}
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9197a5b173fd..3ad7f197c808 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1130,8 +1130,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
- !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
- nvme_pci_complete_batch))
+ !blk_mq_add_to_batch(req, iob,
+ nvme_req(req)->status != NVME_SC_SUCCESS,
+ nvme_pci_complete_batch))
nvme_pci_complete_rq(req);
}
@@ -1411,9 +1412,20 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
struct nvme_dev *dev = nvmeq->dev;
struct request *abort_req;
struct nvme_command cmd = { };
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
u32 csts = readl(dev->bar + NVME_REG_CSTS);
u8 opcode;
+ /*
+ * Shutdown the device immediately if we see it is disconnected. This
+ * unblocks PCIe error handling if the nvme driver is waiting in
+ * error_resume for a device that has been removed. We can't unbind the
+ * driver while the driver's error callback is waiting to complete, so
+ * we're relying on a timeout to break that deadlock if a removal
+ * occurs while reset work is running.
+ */
+ if (pci_dev_is_disconnected(pdev))
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
if (nvme_state_terminal(&dev->ctrl))
goto disable;
@@ -1421,7 +1433,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
* the recovery mechanism will surely fail.
*/
mb();
- if (pci_channel_offline(to_pci_dev(dev->dev)))
+ if (pci_channel_offline(pdev))
return BLK_EH_RESET_TIMER;
/*
@@ -1983,6 +1995,18 @@ static void nvme_map_cmb(struct nvme_dev *dev)
return;
/*
+ * Controllers may support a CMB size larger than their BAR, for
+ * example, due to being behind a bridge. Reduce the CMB to the
+ * reported size of the BAR
+ */
+ size = min(size, bar_size - offset);
+
+ if (!IS_ALIGNED(size, memremap_compat_align()) ||
+ !IS_ALIGNED(pci_resource_start(pdev, bar),
+ memremap_compat_align()))
+ return;
+
+ /*
* Tell the controller about the host side address mapping the CMB,
* and enable CMB decoding for the NVMe 1.4+ scheme:
*/
@@ -1992,17 +2016,10 @@ static void nvme_map_cmb(struct nvme_dev *dev)
dev->bar + NVME_REG_CMBMSC);
}
- /*
- * Controllers may support a CMB size larger than their BAR,
- * for example, due to being behind a bridge. Reduce the CMB to
- * the reported size of the BAR
- */
- if (size > bar_size - offset)
- size = bar_size - offset;
-
if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
dev_warn(dev->ctrl.device,
"failed to register the CMB\n");
+ hi_lo_writeq(0, dev->bar + NVME_REG_CMBMSC);
return;
}
@@ -3706,6 +3723,8 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1cc1, 0x5350), /* ADATA XPG GAMMIX S50 */
.driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1dbe, 0x5216), /* Acer/INNOGRIT FA100/5216 NVMe SSD */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1dbe, 0x5236), /* ADATA XPG GAMMIX S70 */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1e49, 0x0021), /* ZHITAI TiPro5000 NVMe SSD */
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 841238f38fdd..327f3f2f5399 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -217,6 +217,19 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
return queue - queue->ctrl->queues;
}
+/*
+ * Report whether @type is a PDU type that nvme_tcp_recv_pdu() knows how to
+ * process: returns true for nvme_tcp_c2h_term, nvme_tcp_c2h_data,
+ * nvme_tcp_r2t and nvme_tcp_rsp, and false for every other value.
+ *
+ * nvme_tcp_recv_pdu() calls this when the header length is not the expected
+ * one, so that an unknown PDU type is reported as unsupported rather than
+ * as a known type carrying a bad hlen.
+ */
+static inline bool nvme_tcp_recv_pdu_supported(enum nvme_tcp_pdu_type type)
+{
+ switch (type) {
+ case nvme_tcp_c2h_term:
+ case nvme_tcp_c2h_data:
+ case nvme_tcp_r2t:
+ case nvme_tcp_rsp:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* Check if the queue is TLS encrypted
*/
@@ -763,6 +776,40 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
return 0;
}
+/*
+ * Log the reason carried by a received C2HTermReq PDU.
+ *
+ * The PDU length from the header is checked against
+ * NVME_TCP_MIN_C2HTERM_PLEN and NVME_TCP_MAX_C2HTERM_PLEN first; a PDU
+ * outside that range is reported as malformed and its FES field is not
+ * read. Otherwise the FES value is translated to text through msg_table
+ * and logged.
+ *
+ * Nothing is returned: the caller in nvme_tcp_recv_pdu() fails the receive
+ * with -EINVAL once this helper has logged.
+ */
+static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue,
+ struct nvme_tcp_term_pdu *pdu)
+{
+ u16 fes;
+ const char *msg;
+ u32 plen = le32_to_cpu(pdu->hdr.plen);
+
+ /* Text for each known FES code, indexed by the code itself. */
+ static const char * const msg_table[] = {
+ [NVME_TCP_FES_INVALID_PDU_HDR] = "Invalid PDU Header Field",
+ [NVME_TCP_FES_PDU_SEQ_ERR] = "PDU Sequence Error",
+ [NVME_TCP_FES_HDR_DIGEST_ERR] = "Header Digest Error",
+ [NVME_TCP_FES_DATA_OUT_OF_RANGE] = "Data Transfer Out Of Range",
+ [NVME_TCP_FES_DATA_LIMIT_EXCEEDED] = "Data Transfer Limit Exceeded",
+ [NVME_TCP_FES_UNSUPPORTED_PARAM] = "Unsupported Parameter",
+ };
+
+ if (plen < NVME_TCP_MIN_C2HTERM_PLEN ||
+ plen > NVME_TCP_MAX_C2HTERM_PLEN) {
+ dev_err(queue->ctrl->ctrl.device,
+ "Received a malformed C2HTermReq PDU (plen = %u)\n",
+ plen);
+ return;
+ }
+
+ /*
+ * FES 0 and codes past the end of the table are logged as "Unknown".
+ * NOTE(review): assumes the NVME_TCP_FES_* values leave no gap inside
+ * the table, otherwise msg could be NULL here - confirm against the
+ * enum definition.
+ */
+ fes = le16_to_cpu(pdu->fes);
+ if (fes && fes < ARRAY_SIZE(msg_table))
+ msg = msg_table[fes];
+ else
+ msg = "Unknown";
+
+ dev_err(queue->ctrl->ctrl.device,
+ "Received C2HTermReq (FES = %s)\n", msg);
+}
+
static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
unsigned int *offset, size_t *len)
{
@@ -784,6 +831,25 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
return 0;
hdr = queue->pdu;
+ if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) {
+ if (!nvme_tcp_recv_pdu_supported(hdr->type))
+ goto unsupported_pdu;
+
+ dev_err(queue->ctrl->ctrl.device,
+ "pdu type %d has unexpected header length (%d)\n",
+ hdr->type, hdr->hlen);
+ return -EPROTO;
+ }
+
+ if (unlikely(hdr->type == nvme_tcp_c2h_term)) {
+ /*
+ * C2HTermReq never includes Header or Data digests.
+ * Skip the checks.
+ */
+ nvme_tcp_handle_c2h_term(queue, (void *)queue->pdu);
+ return -EINVAL;
+ }
+
if (queue->hdr_digest) {
ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
if (unlikely(ret))
@@ -807,10 +873,13 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
nvme_tcp_init_recv_ctx(queue);
return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
default:
- dev_err(queue->ctrl->ctrl.device,
- "unsupported pdu type (%d)\n", hdr->type);
- return -EINVAL;
+ goto unsupported_pdu;
}
+
+unsupported_pdu:
+ dev_err(queue->ctrl->ctrl.device,
+ "unsupported pdu type (%d)\n", hdr->type);
+ return -EINVAL;
}
static inline void nvme_tcp_end_request(struct request *rq, u16 status)
@@ -1449,8 +1518,11 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
msg.msg_control = cbuf;
msg.msg_controllen = sizeof(cbuf);
}
+ msg.msg_flags = MSG_WAITALL;
ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
iov.iov_len, msg.msg_flags);
+ if (ret >= 0 && ret < sizeof(*icresp))
+ ret = -ECONNRESET;
if (ret < 0) {
pr_warn("queue %d: failed to receive icresp, error %d\n",
nvme_tcp_queue_id(queue), ret);
@@ -1565,7 +1637,7 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
ctrl->io_queues[HCTX_TYPE_POLL];
}
-/**
+/*
* Track the number of queues assigned to each cpu using a global per-cpu
* counter and select the least used cpu from the mq_map. Our goal is to spread
* different controllers I/O threads across different cpu cores.
@@ -2653,6 +2725,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct nvme_tcp_queue *queue = hctx->driver_data;
struct sock *sk = queue->sock->sk;
+ int ret;
if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
return 0;
@@ -2660,9 +2733,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
set_bit(NVME_TCP_Q_POLLING, &queue->flags);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
- nvme_tcp_try_recv(queue);
+ ret = nvme_tcp_try_recv(queue);
clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
- return queue->nr_cqe;
+ return ret < 0 ? ret : queue->nr_cqe;
}
static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index cdc4a09a6e8a..2e741696f371 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -606,6 +606,9 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
goto out_dev_put;
}
+ if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL))
+ goto out_pr_exit;
+
nvmet_ns_changed(subsys, ns->nsid);
ns->enabled = true;
xa_set_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED);
@@ -613,6 +616,9 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
out_unlock:
mutex_unlock(&subsys->lock);
return ret;
+out_pr_exit:
+ if (ns->pr.enable)
+ nvmet_pr_exit_ns(ns);
out_dev_put:
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
@@ -638,6 +644,19 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
mutex_unlock(&subsys->lock);
+ /*
+ * Now that we removed the namespaces from the lookup list, we
+ * can kill the per_cpu ref and wait for any remaining references
+ * to be dropped, as well as a RCU grace period for anyone only
+ * using the namepace under rcu_read_lock(). Note that we can't
+ * use call_rcu here as we need to ensure the namespaces have
+ * been fully destroyed before unloading the module.
+ */
+ percpu_ref_kill(&ns->ref);
+ synchronize_rcu();
+ wait_for_completion(&ns->disable_done);
+ percpu_ref_exit(&ns->ref);
+
if (ns->pr.enable)
nvmet_pr_exit_ns(ns);
@@ -660,22 +679,6 @@ void nvmet_ns_free(struct nvmet_ns *ns)
if (ns->nsid == subsys->max_nsid)
subsys->max_nsid = nvmet_max_nsid(subsys);
- mutex_unlock(&subsys->lock);
-
- /*
- * Now that we removed the namespaces from the lookup list, we
- * can kill the per_cpu ref and wait for any remaining references
- * to be dropped, as well as a RCU grace period for anyone only
- * using the namepace under rcu_read_lock(). Note that we can't
- * use call_rcu here as we need to ensure the namespaces have
- * been fully destroyed before unloading the module.
- */
- percpu_ref_kill(&ns->ref);
- synchronize_rcu();
- wait_for_completion(&ns->disable_done);
- percpu_ref_exit(&ns->ref);
-
- mutex_lock(&subsys->lock);
subsys->nr_namespaces--;
mutex_unlock(&subsys->lock);
@@ -705,9 +708,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
ns->nsid = nsid;
ns->subsys = subsys;
- if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL))
- goto out_free;
-
if (ns->nsid > subsys->max_nsid)
subsys->max_nsid = nsid;
@@ -730,8 +730,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
return ns;
out_exit:
subsys->max_nsid = nvmet_max_nsid(subsys);
- percpu_ref_exit(&ns->ref);
-out_free:
kfree(ns);
out_unlock:
mutex_unlock(&subsys->lock);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 4be8d22d2d8d..fcf4f460dc9a 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -647,7 +647,6 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
struct nvmet_host *host);
void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
u8 event_info, u8 log_page);
-bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid);
#define NVMET_MIN_QUEUE_SIZE 16
#define NVMET_MAX_QUEUE_SIZE 1024
@@ -784,37 +783,37 @@ u16 nvmet_report_invalid_opcode(struct nvmet_req *req);
static inline bool nvmet_cc_en(u32 cc)
{
- return (cc >> NVME_CC_EN_SHIFT) & 0x1;
+ return (cc & NVME_CC_ENABLE) >> NVME_CC_EN_SHIFT;
}
static inline u8 nvmet_cc_css(u32 cc)
{
- return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
+ return (cc & NVME_CC_CSS_MASK) >> NVME_CC_CSS_SHIFT;
}
static inline u8 nvmet_cc_mps(u32 cc)
{
- return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
+ return (cc & NVME_CC_MPS_MASK) >> NVME_CC_MPS_SHIFT;
}
static inline u8 nvmet_cc_ams(u32 cc)
{
- return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
+ return (cc & NVME_CC_AMS_MASK) >> NVME_CC_AMS_SHIFT;
}
static inline u8 nvmet_cc_shn(u32 cc)
{
- return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
+ return (cc & NVME_CC_SHN_MASK) >> NVME_CC_SHN_SHIFT;
}
static inline u8 nvmet_cc_iosqes(u32 cc)
{
- return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
+ return (cc & NVME_CC_IOSQES_MASK) >> NVME_CC_IOSQES_SHIFT;
}
static inline u8 nvmet_cc_iocqes(u32 cc)
{
- return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
+ return (cc & NVME_CC_IOCQES_MASK) >> NVME_CC_IOCQES_SHIFT;
}
/* Convert a 32-bit number to a 16-bit 0's based number */
diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c
index ac30b42cc622..b1e31483f157 100644
--- a/drivers/nvme/target/pci-epf.c
+++ b/drivers/nvme/target/pci-epf.c
@@ -46,7 +46,7 @@ static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex);
/*
* BAR CC register and SQ polling intervals.
*/
-#define NVMET_PCI_EPF_CC_POLL_INTERVAL msecs_to_jiffies(5)
+#define NVMET_PCI_EPF_CC_POLL_INTERVAL msecs_to_jiffies(10)
#define NVMET_PCI_EPF_SQ_POLL_INTERVAL msecs_to_jiffies(5)
#define NVMET_PCI_EPF_SQ_POLL_IDLE msecs_to_jiffies(5000)
@@ -1265,15 +1265,12 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl,
struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
u16 status;
- if (test_and_set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
+ if (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
if (!(flags & NVME_QUEUE_PHYS_CONTIG))
return NVME_SC_INVALID_QUEUE | NVME_STATUS_DNR;
- if (flags & NVME_CQ_IRQ_ENABLED)
- set_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags);
-
cq->pci_addr = pci_addr;
cq->qid = cqid;
cq->depth = qsize + 1;
@@ -1290,24 +1287,27 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl,
cq->qes = ctrl->io_cqes;
cq->pci_size = cq->qes * cq->depth;
- cq->iv = nvmet_pci_epf_add_irq_vector(ctrl, vector);
- if (!cq->iv) {
- status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
- goto err;
+ if (flags & NVME_CQ_IRQ_ENABLED) {
+ cq->iv = nvmet_pci_epf_add_irq_vector(ctrl, vector);
+ if (!cq->iv)
+ return NVME_SC_INTERNAL | NVME_STATUS_DNR;
+ set_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags);
}
status = nvmet_cq_create(tctrl, &cq->nvme_cq, cqid, cq->depth);
if (status != NVME_SC_SUCCESS)
goto err;
+ set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags);
+
dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n",
cqid, qsize, cq->qes, cq->vector);
return NVME_SC_SUCCESS;
err:
- clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags);
- clear_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags);
+ if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
+ nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
return status;
}
@@ -1333,7 +1333,7 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
u16 status;
- if (test_and_set_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
+ if (test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
if (!(flags & NVME_QUEUE_PHYS_CONTIG))
@@ -1355,7 +1355,7 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth);
if (status != NVME_SC_SUCCESS)
- goto out_clear_bit;
+ return status;
sq->iod_wq = alloc_workqueue("sq%d_wq", WQ_UNBOUND,
min_t(int, sq->depth, WQ_MAX_ACTIVE), sqid);
@@ -1365,6 +1365,8 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
goto out_destroy_sq;
}
+ set_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags);
+
dev_dbg(ctrl->dev, "SQ[%u]: %u entries of %zu B\n",
sqid, qsize, sq->qes);
@@ -1372,8 +1374,6 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
out_destroy_sq:
nvmet_sq_destroy(&sq->nvme_sq);
-out_clear_bit:
- clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags);
return status;
}
@@ -1694,6 +1694,7 @@ static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work)
struct nvmet_pci_epf_ctrl *ctrl =
container_of(work, struct nvmet_pci_epf_ctrl, poll_sqs.work);
struct nvmet_pci_epf_queue *sq;
+ unsigned long limit = jiffies;
unsigned long last = 0;
int i, nr_sqs;
@@ -1708,6 +1709,16 @@ static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work)
nr_sqs++;
}
+ /*
+ * If we have been running for a while, reschedule to let other
+ * tasks run and to avoid RCU stalls.
+ */
+ if (time_is_before_jiffies(limit + secs_to_jiffies(1))) {
+ cond_resched();
+ limit = jiffies;
+ continue;
+ }
+
if (nr_sqs) {
last = jiffies;
continue;
@@ -1822,14 +1833,14 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
if (ctrl->io_sqes < sizeof(struct nvme_command)) {
dev_err(ctrl->dev, "Unsupported I/O SQES %zu (need %zu)\n",
ctrl->io_sqes, sizeof(struct nvme_command));
- return -EINVAL;
+ goto err;
}
ctrl->io_cqes = 1UL << nvmet_cc_iocqes(ctrl->cc);
if (ctrl->io_cqes < sizeof(struct nvme_completion)) {
dev_err(ctrl->dev, "Unsupported I/O CQES %zu (need %zu)\n",
ctrl->io_sqes, sizeof(struct nvme_completion));
- return -EINVAL;
+ goto err;
}
/* Create the admin queue. */
@@ -1844,7 +1855,7 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
qsize, pci_addr, 0);
if (status != NVME_SC_SUCCESS) {
dev_err(ctrl->dev, "Failed to create admin completion queue\n");
- return -EINVAL;
+ goto err;
}
qsize = aqa & 0x00000fff;
@@ -1854,17 +1865,22 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
if (status != NVME_SC_SUCCESS) {
dev_err(ctrl->dev, "Failed to create admin submission queue\n");
nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);
- return -EINVAL;
+ goto err;
}
ctrl->sq_ab = NVMET_PCI_EPF_SQ_AB;
ctrl->irq_vector_threshold = NVMET_PCI_EPF_IV_THRESHOLD;
ctrl->enabled = true;
+ ctrl->csts = NVME_CSTS_RDY;
/* Start polling the controller SQs. */
schedule_delayed_work(&ctrl->poll_sqs, 0);
return 0;
+
+err:
+ ctrl->csts = 0;
+ return -EINVAL;
}
static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
@@ -1889,6 +1905,8 @@ static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
/* Delete the admin queue last. */
nvmet_pci_epf_delete_sq(ctrl->tctrl, 0);
nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);
+
+ ctrl->csts &= ~NVME_CSTS_RDY;
}
static void nvmet_pci_epf_poll_cc_work(struct work_struct *work)
@@ -1903,19 +1921,19 @@ static void nvmet_pci_epf_poll_cc_work(struct work_struct *work)
old_cc = ctrl->cc;
new_cc = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_CC);
+ if (new_cc == old_cc)
+ goto reschedule_work;
+
ctrl->cc = new_cc;
if (nvmet_cc_en(new_cc) && !nvmet_cc_en(old_cc)) {
ret = nvmet_pci_epf_enable_ctrl(ctrl);
if (ret)
- return;
- ctrl->csts |= NVME_CSTS_RDY;
+ goto reschedule_work;
}
- if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc)) {
+ if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc))
nvmet_pci_epf_disable_ctrl(ctrl);
- ctrl->csts &= ~NVME_CSTS_RDY;
- }
if (nvmet_cc_shn(new_cc) && !nvmet_cc_shn(old_cc)) {
nvmet_pci_epf_disable_ctrl(ctrl);
@@ -1928,6 +1946,7 @@ static void nvmet_pci_epf_poll_cc_work(struct work_struct *work)
nvmet_update_cc(ctrl->tctrl, ctrl->cc);
nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts);
+reschedule_work:
schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL);
}
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 1afd93026f9b..2a4536ef6184 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -996,6 +996,27 @@ out_err:
nvmet_req_complete(&cmd->req, status);
}
+/*
+ * Slow path of nvmet_rdma_recv_done() for a queue that did not look live.
+ *
+ * Re-reads queue->state with state_lock held. Returns false if the queue
+ * turns out to be live, in which case the caller goes on to handle the
+ * command itself. Returns true once @rsp has been dealt with here: it is
+ * added to rsp_wait_list while the queue is still connecting, or released
+ * with nvmet_rdma_put_rsp() in any other state.
+ */
+static bool nvmet_rdma_recv_not_live(struct nvmet_rdma_queue *queue,
+ struct nvmet_rdma_rsp *rsp)
+{
+ unsigned long flags;
+ bool ret = true;
+
+ spin_lock_irqsave(&queue->state_lock, flags);
+ /*
+ * recheck queue state is not live to prevent a race condition
+ * with RDMA_CM_EVENT_ESTABLISHED handler.
+ */
+ if (queue->state == NVMET_RDMA_Q_LIVE)
+ ret = false;
+ else if (queue->state == NVMET_RDMA_Q_CONNECTING)
+ list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
+ else
+ nvmet_rdma_put_rsp(rsp);
+ spin_unlock_irqrestore(&queue->state_lock, flags);
+ return ret;
+}
+
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvmet_rdma_cmd *cmd =
@@ -1038,17 +1059,9 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
rsp->n_rdma = 0;
rsp->invalidate_rkey = 0;
- if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
- unsigned long flags;
-
- spin_lock_irqsave(&queue->state_lock, flags);
- if (queue->state == NVMET_RDMA_Q_CONNECTING)
- list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
- else
- nvmet_rdma_put_rsp(rsp);
- spin_unlock_irqrestore(&queue->state_lock, flags);
+ if (unlikely(queue->state != NVMET_RDMA_Q_LIVE) &&
+ nvmet_rdma_recv_not_live(queue, rsp))
return;
- }
nvmet_rdma_handle_command(queue, rsp);
}
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 7c51c2a8c109..4f9cac8a5abe 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -571,10 +571,16 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req)
struct nvmet_tcp_cmd *cmd =
container_of(req, struct nvmet_tcp_cmd, req);
struct nvmet_tcp_queue *queue = cmd->queue;
+ enum nvmet_tcp_recv_state queue_state;
+ struct nvmet_tcp_cmd *queue_cmd;
struct nvme_sgl_desc *sgl;
u32 len;
- if (unlikely(cmd == queue->cmd)) {
+ /* Pairs with store_release in nvmet_prepare_receive_pdu() */
+ queue_state = smp_load_acquire(&queue->rcv_state);
+ queue_cmd = READ_ONCE(queue->cmd);
+
+ if (unlikely(cmd == queue_cmd)) {
sgl = &cmd->req.cmd->common.dptr.sgl;
len = le32_to_cpu(sgl->length);
@@ -583,7 +589,7 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req)
* Avoid using helpers, this might happen before
* nvmet_req_init is completed.
*/
- if (queue->rcv_state == NVMET_TCP_RECV_PDU &&
+ if (queue_state == NVMET_TCP_RECV_PDU &&
len && len <= cmd->req.port->inline_data_size &&
nvme_is_write(cmd->req.cmd))
return;
@@ -847,8 +853,9 @@ static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
{
queue->offset = 0;
queue->left = sizeof(struct nvme_tcp_hdr);
- queue->cmd = NULL;
- queue->rcv_state = NVMET_TCP_RECV_PDU;
+ WRITE_ONCE(queue->cmd, NULL);
+ /* Ensure rcv_state is visible only after queue->cmd is set */
+ smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU);
}
static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)