Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r--  drivers/nvme/host/core.c | 209
1 file changed, 166 insertions(+), 43 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 559d567693b8..08f2c92602f4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -97,7 +97,6 @@ static dev_t nvme_chr_devt;
 static struct class *nvme_class;
 static struct class *nvme_subsys_class;
 
-static void nvme_ns_remove(struct nvme_ns *ns);
 static int nvme_revalidate_disk(struct gendisk *disk);
 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -245,12 +244,31 @@ static inline bool nvme_req_needs_retry(struct request *req)
 	return true;
 }
 
+static void nvme_retry_req(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	unsigned long delay = 0;
+	u16 crd;
+
+	/* The mask and shift result must be <= 3 */
+	crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
+	if (ns && crd)
+		delay = ns->ctrl->crdt[crd - 1] * 100;
+
+	nvme_req(req)->retries++;
+	blk_mq_requeue_request(req, false);
+	blk_mq_delay_kick_requeue_list(req->q, delay);
+}
+
 void nvme_complete_rq(struct request *req)
 {
 	blk_status_t status = nvme_error_status(req);
 
 	trace_nvme_complete_rq(req);
 
+	if (nvme_req(req)->ctrl->kas)
+		nvme_req(req)->ctrl->comp_seen = true;
+
 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
 		if ((req->cmd_flags & REQ_NVME_MPATH) &&
 		    blk_path_error(status)) {
@@ -259,8 +277,7 @@ void nvme_complete_rq(struct request *req)
 		}
 
 		if (!blk_queue_dying(req->q)) {
-			nvme_req(req)->retries++;
-			blk_mq_requeue_request(req, true);
+			nvme_retry_req(req);
 			return;
 		}
 	}
@@ -268,14 +285,14 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
-void nvme_cancel_request(struct request *req, void *data, bool reserved)
+bool nvme_cancel_request(struct request *req, void *data, bool reserved)
 {
 	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
 				"Cancelling I/O %d", req->tag);
 
 	nvme_req(req)->status = NVME_SC_ABORT_REQ;
 	blk_mq_complete_request(req);
-
+	return true;
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_request);
 
@@ -536,7 +553,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
 static inline void nvme_setup_flush(struct nvme_ns *ns,
 		struct nvme_command *cmnd)
 {
-	memset(cmnd, 0, sizeof(*cmnd));
 	cmnd->common.opcode = nvme_cmd_flush;
 	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
 }
@@ -548,9 +564,19 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	struct nvme_dsm_range *range;
 	struct bio *bio;
 
-	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
-	if (!range)
-		return BLK_STS_RESOURCE;
+	range = kmalloc_array(segments, sizeof(*range),
+				GFP_ATOMIC | __GFP_NOWARN);
+	if (!range) {
+		/*
+		 * If we fail allocation our range, fallback to the controller
+		 * discard page. If that's also busy, it's safe to return
+		 * busy, as we know we can make progress once that's freed.
+		 */
+		if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
+			return BLK_STS_RESOURCE;
+
+		range = page_address(ns->ctrl->discard_page);
+	}
 
 	__rq_for_each_bio(bio, req) {
 		u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
@@ -565,11 +591,13 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	}
 
 	if (WARN_ON_ONCE(n != segments)) {
-		kfree(range);
+		if (virt_to_page(range) == ns->ctrl->discard_page)
+			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+		else
+			kfree(range);
 		return BLK_STS_IOERR;
 	}
 
-	memset(cmnd, 0, sizeof(*cmnd));
 	cmnd->dsm.opcode = nvme_cmd_dsm;
 	cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
 	cmnd->dsm.nr = cpu_to_le32(segments - 1);
@@ -598,7 +626,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	if (req->cmd_flags & REQ_RAHEAD)
 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
-	memset(cmnd, 0, sizeof(*cmnd));
 	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
 	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
@@ -650,8 +677,13 @@ void nvme_cleanup_cmd(struct request *req)
 				blk_rq_bytes(req) >> ns->lba_shift);
 	}
 	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-		kfree(page_address(req->special_vec.bv_page) +
-		      req->special_vec.bv_offset);
+		struct nvme_ns *ns = req->rq_disk->private_data;
+		struct page *page = req->special_vec.bv_page;
+
+		if (page == ns->ctrl->discard_page)
+			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+		else
+			kfree(page_address(page) + req->special_vec.bv_offset);
 	}
 }
 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
@@ -663,6 +695,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 
 	nvme_clear_nvme_request(req);
 
+	memset(cmd, 0, sizeof(*cmd));
 	switch (req_op(req)) {
 	case REQ_OP_DRV_IN:
 	case REQ_OP_DRV_OUT:
@@ -691,6 +724,31 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 }
 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
 
+static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
+{
+	struct completion *waiting = rq->end_io_data;
+
+	rq->end_io_data = NULL;
+	complete(waiting);
+}
+
+static void nvme_execute_rq_polled(struct request_queue *q,
+		struct gendisk *bd_disk, struct request *rq, int at_head)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
+
+	rq->cmd_flags |= REQ_HIPRI;
+	rq->end_io_data = &wait;
+	blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
+
+	while (!completion_done(&wait)) {
+		blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
+		cond_resched();
+	}
+}
+
 /*
  * Returns 0 on success.  If the result is negative, it's a Linux error code;
  * if the result is positive, it's an NVM Express status code
@@ -698,7 +756,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd);
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		union nvme_result *result, void *buffer, unsigned bufflen,
 		unsigned timeout, int qid, int at_head,
-		blk_mq_req_flags_t flags)
+		blk_mq_req_flags_t flags, bool poll)
 {
 	struct request *req;
 	int ret;
@@ -715,7 +773,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		goto out;
 	}
 
-	blk_execute_rq(req->q, NULL, req, at_head);
+	if (poll)
+		nvme_execute_rq_polled(req->q, NULL, req, at_head);
+	else
+		blk_execute_rq(req->q, NULL, req, at_head);
 	if (result)
 		*result = nvme_req(req)->result;
 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -732,7 +793,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		void *buffer, unsigned bufflen)
 {
 	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
-			NVME_QID_ANY, 0, 0);
+			NVME_QID_ANY, 0, 0, false);
 }
 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
 
@@ -831,6 +892,8 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
 {
 	struct nvme_ctrl *ctrl = rq->end_io_data;
+	unsigned long flags;
+	bool startka = false;
 
 	blk_mq_free_request(rq);
 
@@ -841,7 +904,14 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
 		return;
 	}
 
-	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+	ctrl->comp_seen = false;
+	spin_lock_irqsave(&ctrl->lock, flags);
+	if (ctrl->state == NVME_CTRL_LIVE ||
+	    ctrl->state == NVME_CTRL_CONNECTING)
+		startka = true;
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+	if (startka)
+		schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 }
 
 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
@@ -865,6 +935,15 @@ static void nvme_keep_alive_work(struct work_struct *work)
 {
 	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
 			struct nvme_ctrl, ka_work);
+	bool comp_seen = ctrl->comp_seen;
+
+	if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
+		dev_dbg(ctrl->device,
+			"reschedule traffic based keep-alive timer\n");
+		ctrl->comp_seen = false;
+		schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+		return;
+	}
 
 	if (nvme_keep_alive(ctrl)) {
 		/* allocation failure, reset the controller */
@@ -1033,7 +1112,7 @@ static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword
 	c.features.dword11 = cpu_to_le32(dword11);
 
 	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
-			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
+			buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
 	if (ret >= 0 && result)
 		*result = le32_to_cpu(res.u32);
 	return ret;
@@ -1232,12 +1311,12 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	c.common.nsid = cpu_to_le32(cmd.nsid);
 	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
 	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
-	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
-	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
-	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
-	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
-	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
-	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
+	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
+	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
+	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
+	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
+	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
 
 	if (cmd.timeout_ms)
 		timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -1516,8 +1595,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	if (ns->noiob)
 		nvme_set_chunk_size(ns);
 	nvme_update_disk_info(disk, ns, id);
-	if (ns->ndev)
-		nvme_nvm_update_nvm_info(ns);
 #ifdef CONFIG_NVME_MULTIPATH
 	if (ns->head->disk) {
 		nvme_update_disk_info(ns->head->disk, ns, id);
@@ -1600,7 +1677,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = op;
 	c.common.nsid = cpu_to_le32(ns->head->ns_id);
-	c.common.cdw10[0] = cpu_to_le32(cdw10);
+	c.common.cdw10 = cpu_to_le32(cdw10);
 
 	ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
 	nvme_put_ns_from_disk(head, srcu_idx);
@@ -1674,11 +1751,11 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 	else
 		cmd.common.opcode = nvme_admin_security_recv;
 	cmd.common.nsid = 0;
-	cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
-	cmd.common.cdw10[1] = cpu_to_le32(len);
+	cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
+	cmd.common.cdw11 = cpu_to_le32(len);
 
 	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
-				      ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
+				      ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
 }
 EXPORT_SYMBOL_GPL(nvme_sec_submit);
 #endif /* CONFIG_BLK_SED_OPAL */
@@ -1873,6 +1950,26 @@ static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
 	return ret;
 }
 
+static int nvme_configure_acre(struct nvme_ctrl *ctrl)
+{
+	struct nvme_feat_host_behavior *host;
+	int ret;
+
+	/* Don't bother enabling the feature if retry delay is not reported */
+	if (!ctrl->crdt[0])
+		return 0;
+
+	host = kzalloc(sizeof(*host), GFP_KERNEL);
+	if (!host)
+		return 0;
+
+	host->acre = NVME_ENABLE_ACRE;
+	ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
+				host, sizeof(*host), NULL);
+	kfree(host);
+	return ret;
+}
+
 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
 {
 	/*
@@ -2394,6 +2491,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 			ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
 	}
 
+	ctrl->crdt[0] = le16_to_cpu(id->crdt1);
+	ctrl->crdt[1] = le16_to_cpu(id->crdt2);
+	ctrl->crdt[2] = le16_to_cpu(id->crdt3);
+
 	ctrl->oacs = le16_to_cpu(id->oacs);
 	ctrl->oncs = le16_to_cpup(&id->oncs);
 	ctrl->oaes = le32_to_cpu(id->oaes);
@@ -2411,6 +2512,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	ctrl->sgls = le32_to_cpu(id->sgls);
 	ctrl->kas = le16_to_cpu(id->kas);
 	ctrl->max_namespaces = le32_to_cpu(id->mnan);
+	ctrl->ctratt = le32_to_cpu(id->ctratt);
 
 	if (id->rtd3e) {
 		/* us -> s */
@@ -2493,6 +2595,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	if (ret < 0)
 		return ret;
 
+	ret = nvme_configure_acre(ctrl);
+	if (ret < 0)
+		return ret;
+
 	ctrl->identified = true;
 
 	return 0;
@@ -2768,6 +2874,7 @@ static ssize_t  field##_show(struct device *dev,			\
 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
 
 nvme_show_int_function(cntlid);
+nvme_show_int_function(numa_node);
 
 static ssize_t nvme_sysfs_delete(struct device *dev,
 				struct device_attribute *attr, const char *buf,
@@ -2847,6 +2954,7 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_subsysnqn.attr,
 	&dev_attr_address.attr,
 	&dev_attr_state.attr,
+	&dev_attr_numa_node.attr,
 	NULL
 };
 
@@ -3057,7 +3165,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	struct gendisk *disk;
 	struct nvme_id_ns *id;
 	char disk_name[DISK_NAME_LEN];
-	int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+	int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT;
 
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
@@ -3092,13 +3200,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	nvme_setup_streams_ns(ctrl, ns);
 	nvme_set_disk_name(disk_name, ns, ctrl, &flags);
 
-	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
-		if (nvme_nvm_register(ns, disk_name, node)) {
-			dev_warn(ctrl->device, "LightNVM init failure\n");
-			goto out_unlink_ns;
-		}
-	}
-
 	disk = alloc_disk_node(0, node);
 	if (!disk)
 		goto out_unlink_ns;
@@ -3112,6 +3213,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	__nvme_revalidate_disk(disk, id);
 
+	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
+		if (nvme_nvm_register(ns, disk_name, node)) {
+			dev_warn(ctrl->device, "LightNVM init failure\n");
+			goto out_put_disk;
+		}
+	}
+
 	down_write(&ctrl->namespaces_rwsem);
 	list_add_tail(&ns->list, &ctrl->namespaces);
 	up_write(&ctrl->namespaces_rwsem);
@@ -3125,6 +3233,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	kfree(id);
 
 	return;
+ out_put_disk:
+	put_disk(ns->disk);
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
@@ -3314,6 +3424,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(ns_list);
 
+	/* prevent racing with ns scanning */
+	flush_work(&ctrl->scan_work);
+
 	/*
 	 * The dead states indicates the controller was not gracefully
 	 * disconnected. In that case, we won't be able to flush any data while
@@ -3476,7 +3589,6 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 	nvme_mpath_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
 	flush_work(&ctrl->async_event_work);
-	flush_work(&ctrl->scan_work);
 	cancel_work_sync(&ctrl->fw_act_work);
 	if (ctrl->ops->stop_ctrl)
 		ctrl->ops->stop_ctrl(ctrl);
@@ -3512,6 +3624,7 @@ static void nvme_free_ctrl(struct device *dev)
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 	kfree(ctrl->effects);
 	nvme_mpath_uninit(ctrl);
+	__free_page(ctrl->discard_page);
 
 	if (subsys) {
 		mutex_lock(&subsys->lock);
@@ -3552,6 +3665,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 
+	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
+			PAGE_SIZE);
+	ctrl->discard_page = alloc_page(GFP_KERNEL);
+	if (!ctrl->discard_page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
 	if (ret < 0)
 		goto out;
@@ -3585,10 +3706,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 
 	return 0;
 out_free_name:
-	kfree_const(dev->kobj.name);
+	kfree_const(ctrl->device->kobj.name);
out_release_instance:
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 out:
+	if (ctrl->discard_page)
+		__free_page(ctrl->discard_page);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -3607,7 +3730,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 	down_read(&ctrl->namespaces_rwsem);
 
 	/* Forcibly unquiesce queues to avoid blocking dispatch */
-	if (ctrl->admin_q)
+	if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
 		blk_mq_unquiesce_queue(ctrl->admin_q);
 
 	list_for_each_entry(ns, &ctrl->namespaces, list)
@@ -3736,7 +3859,7 @@ out:
 	return result;
 }
 
-void nvme_core_exit(void)
+void __exit nvme_core_exit(void)
 {
 	ida_destroy(&nvme_subsystems_ida);
 	class_destroy(nvme_subsys_class);
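
A note on the nvme_retry_req() hunk: the Command Retry Delay (CRD) field sits in bits 12:11 of the NVMe completion status word, which is why the masked-and-shifted value is always <= 3. A nonzero CRD selects one of the three CRDT values from Identify Controller (crdt1..crdt3, stored above as ctrl->crdt[0..2]), each expressed in units of 100 milliseconds, hence the multiply by 100 to get the requeue delay. A minimal standalone sketch of the decoding, assuming the kernel's NVME_SC_CRD mask value of 0x1800 (the helper name is hypothetical):

    #include <stdint.h>

    #define NVME_SC_CRD 0x1800  /* bits 12:11 of the completion status word */

    /*
     * Hypothetical helper mirroring nvme_retry_req(): map a completion
     * status and the controller's CRDT values (100ms units, from
     * Identify Controller) to a requeue delay in milliseconds.
     */
    static unsigned long nvme_crd_delay_ms(uint16_t status,
                                           const uint16_t crdt[3])
    {
        uint16_t crd = (status & NVME_SC_CRD) >> 11;  /* always 0..3 */

        return crd ? crdt[crd - 1] * 100UL : 0;
    }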
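
The discard changes implement a reserve-buffer pattern: a single page allocated in nvme_init_ctrl() (whose BUILD_BUG_ON proves NVME_DSM_MAX_RANGES ranges fit in one page) stands in when the GFP_ATOMIC allocation of the range array fails, and the bit lock taken with test_and_set_bit_lock() guarantees at most one request borrows it at a time. A second contender simply gets BLK_STS_RESOURCE, which is safe because progress is possible as soon as the page is released in nvme_cleanup_cmd(). Condensed into an acquire/release pair, with hypothetical helper names rather than kernel API:

    /* Sketch of the nvme_setup_discard()/nvme_cleanup_cmd() pairing. */
    static struct nvme_dsm_range *dsm_range_get(struct nvme_ctrl *ctrl,
                                                int segments)
    {
        struct nvme_dsm_range *range;

        range = kmalloc_array(segments, sizeof(*range),
                              GFP_ATOMIC | __GFP_NOWARN);
        if (range)
            return range;
        /* Fall back to the reserved page; NULL means "busy, caller retries". */
        if (test_and_set_bit_lock(0, &ctrl->discard_page_busy))
            return NULL;
        return page_address(ctrl->discard_page);
    }

    static void dsm_range_put(struct nvme_ctrl *ctrl,
                              struct nvme_dsm_range *range)
    {
        if (virt_to_page(range) == ctrl->discard_page)
            clear_bit_unlock(0, &ctrl->discard_page_busy);
        else
            kfree(range);
    }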
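
The new poll argument to __nvme_submit_sync_cmd() routes the request through nvme_execute_rq_polled(): the command is flagged REQ_HIPRI, issued without blocking via blk_execute_rq_nowait(), and the submitter then spins in blk_poll() until the on-stack completion fires, so no completion interrupt is needed. This appears aimed at queues serviced by polling rather than interrupts (for example, connecting fabrics poll queues); every existing caller updated in this diff passes false. A hedged example of a caller opting in, with the command setup elided and q assumed to be a poll-capable queue:

    /*
     * Hypothetical caller: q must have QUEUE_FLAG_POLL set, as enforced
     * by the WARN_ON_ONCE() in nvme_execute_rq_polled().
     */
    ret = __nvme_submit_sync_cmd(q, &cmd, NULL, NULL, 0,
                                 0, NVME_QID_ANY, 0, 0, true /* poll */);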