diff options
Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r-- | drivers/nvme/host/core.c | 257 |
1 files changed, 250 insertions, 7 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 44a1a257e0b5..25ec4e585220 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -26,6 +26,7 @@ #include <linux/ptrace.h> #include <linux/nvme_ioctl.h> #include <linux/t10-pi.h> +#include <linux/pm_qos.h> #include <scsi/sg.h> #include <asm/unaligned.h> @@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries); static int nvme_char_major; module_param(nvme_char_major, int, 0); +static unsigned long default_ps_max_latency_us = 25000; +module_param(default_ps_max_latency_us, ulong, 0644); +MODULE_PARM_DESC(default_ps_max_latency_us, + "max power saving latency for new devices; use PM QOS to change per device"); + static LIST_HEAD(nvme_ctrl_list); static DEFINE_SPINLOCK(dev_list_lock); @@ -560,7 +566,7 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ c.identify.opcode = nvme_admin_identify; - c.identify.cns = cpu_to_le32(NVME_ID_CNS_CTRL); + c.identify.cns = NVME_ID_CNS_CTRL; *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); if (!*id) @@ -578,7 +584,7 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n struct nvme_command c = { }; c.identify.opcode = nvme_admin_identify; - c.identify.cns = cpu_to_le32(NVME_ID_CNS_NS_ACTIVE_LIST); + c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST; c.identify.nsid = cpu_to_le32(nsid); return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); } @@ -590,8 +596,9 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, int error; /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify, - c.identify.nsid = cpu_to_le32(nsid), + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.cns = NVME_ID_CNS_NS; *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); if (!*id) @@ -1251,6 +1258,176 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_write_cache(q, vwc, vwc); } +static void nvme_configure_apst(struct nvme_ctrl *ctrl) +{ + /* + * APST (Autonomous Power State Transition) lets us program a + * table of power state transitions that the controller will + * perform automatically. We configure it with a simple + * heuristic: we are willing to spend at most 2% of the time + * transitioning between power states. Therefore, when running + * in any given state, we will enter the next lower-power + * non-operational state after waiting 100 * (enlat + exlat) + * microseconds, as long as that state's total latency is under + * the requested maximum latency. + * + * We will not autonomously enter any non-operational state for + * which the total latency exceeds ps_max_latency_us. Users + * can set ps_max_latency_us to zero to turn off APST. + */ + + unsigned apste; + struct nvme_feat_auto_pst *table; + int ret; + + /* + * If APST isn't supported or if we haven't been initialized yet, + * then don't do anything. + */ + if (!ctrl->apsta) + return; + + if (ctrl->npss > 31) { + dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); + return; + } + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return; + + if (ctrl->ps_max_latency_us == 0) { + /* Turn off APST. */ + apste = 0; + } else { + __le64 target = cpu_to_le64(0); + int state; + + /* + * Walk through all states from lowest- to highest-power. + * According to the spec, lower-numbered states use more + * power. NPSS, despite the name, is the index of the + * lowest-power state, not the number of states. + */ + for (state = (int)ctrl->npss; state >= 0; state--) { + u64 total_latency_us, transition_ms; + + if (target) + table->entries[state] = target; + + /* + * Is this state a useful non-operational state for + * higher-power states to autonomously transition to? + */ + if (!(ctrl->psd[state].flags & + NVME_PS_FLAGS_NON_OP_STATE)) + continue; + + total_latency_us = + (u64)le32_to_cpu(ctrl->psd[state].entry_lat) + + + le32_to_cpu(ctrl->psd[state].exit_lat); + if (total_latency_us > ctrl->ps_max_latency_us) + continue; + + /* + * This state is good. Use it as the APST idle + * target for higher power states. + */ + transition_ms = total_latency_us + 19; + do_div(transition_ms, 20); + if (transition_ms > (1 << 24) - 1) + transition_ms = (1 << 24) - 1; + + target = cpu_to_le64((state << 3) | + (transition_ms << 8)); + } + + apste = 1; + } + + ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, + table, sizeof(*table), NULL); + if (ret) + dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); + + kfree(table); +} + +static void nvme_set_latency_tolerance(struct device *dev, s32 val) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + u64 latency; + + switch (val) { + case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT: + case PM_QOS_LATENCY_ANY: + latency = U64_MAX; + break; + + default: + latency = val; + } + + if (ctrl->ps_max_latency_us != latency) { + ctrl->ps_max_latency_us = latency; + nvme_configure_apst(ctrl); + } +} + +struct nvme_core_quirk_entry { + /* + * NVMe model and firmware strings are padded with spaces. For + * simplicity, strings in the quirk table are padded with NULLs + * instead. + */ + u16 vid; + const char *mn; + const char *fr; + unsigned long quirks; +}; + +static const struct nvme_core_quirk_entry core_quirks[] = { + /* + * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes + * the controller to go out to lunch. It dies when the watchdog + * timer reads CSTS and gets 0xffffffff. + */ + { + .vid = 0x144d, + .fr = "BXW75D0Q", + .quirks = NVME_QUIRK_NO_APST, + }, +}; + +/* match is null-terminated but idstr is space-padded. */ +static bool string_matches(const char *idstr, const char *match, size_t len) +{ + size_t matchlen; + + if (!match) + return true; + + matchlen = strlen(match); + WARN_ON_ONCE(matchlen > len); + + if (memcmp(idstr, match, matchlen)) + return false; + + for (; matchlen < len; matchlen++) + if (idstr[matchlen] != ' ') + return false; + + return true; +} + +static bool quirk_matches(const struct nvme_id_ctrl *id, + const struct nvme_core_quirk_entry *q) +{ + return q->vid == le16_to_cpu(id->vid) && + string_matches(id->mn, q->mn, sizeof(id->mn)) && + string_matches(id->fr, q->fr, sizeof(id->fr)); +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -1262,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) u64 cap; int ret, page_shift; u32 max_hw_sectors; + u8 prev_apsta; ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); if (ret) { @@ -1285,6 +1463,24 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + if (!ctrl->identified) { + /* + * Check for quirks. Quirk can depend on firmware version, + * so, in principle, the set of quirks present can change + * across a reset. As a possible future enhancement, we + * could re-scan for quirks every time we reinitialize + * the device, but we'd have to make sure that the driver + * behaves intelligently if the quirks change. + */ + + int i; + + for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { + if (quirk_matches(id, &core_quirks[i])) + ctrl->quirks |= core_quirks[i].quirks; + } + } + ctrl->oacs = le16_to_cpu(id->oacs); ctrl->vid = le16_to_cpu(id->vid); ctrl->oncs = le16_to_cpup(&id->oncs); @@ -1305,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); + ctrl->npss = id->npss; + prev_apsta = ctrl->apsta; + ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta; + memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); + if (ctrl->ops->is_fabrics) { ctrl->icdoff = le16_to_cpu(id->icdoff); ctrl->ioccsz = le32_to_cpu(id->ioccsz); @@ -1328,6 +1529,16 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } kfree(id); + + if (ctrl->apsta && !prev_apsta) + dev_pm_qos_expose_latency_tolerance(ctrl->device); + else if (!ctrl->apsta && prev_apsta) + dev_pm_qos_hide_latency_tolerance(ctrl->device); + + nvme_configure_apst(ctrl); + + ctrl->identified = true; + return ret; } EXPORT_SYMBOL_GPL(nvme_init_identify); @@ -1577,6 +1788,29 @@ static ssize_t nvme_sysfs_show_transport(struct device *dev, } static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); +static ssize_t nvme_sysfs_show_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + static const char *const state_name[] = { + [NVME_CTRL_NEW] = "new", + [NVME_CTRL_LIVE] = "live", + [NVME_CTRL_RESETTING] = "resetting", + [NVME_CTRL_RECONNECTING]= "reconnecting", + [NVME_CTRL_DELETING] = "deleting", + [NVME_CTRL_DEAD] = "dead", + }; + + if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && + state_name[ctrl->state]) + return sprintf(buf, "%s\n", state_name[ctrl->state]); + + return sprintf(buf, "unknown state\n"); +} + +static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); + static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, struct device_attribute *attr, char *buf) @@ -1609,6 +1843,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_transport.attr, &dev_attr_subsysnqn.attr, &dev_attr_address.attr, + &dev_attr_state.attr, NULL }; @@ -2065,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, list_add_tail(&ctrl->node, &nvme_ctrl_list); spin_unlock(&dev_list_lock); + /* + * Initialize latency tolerance controls. The sysfs files won't + * be visible to userspace unless the device actually supports APST. + */ + ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance; + dev_pm_qos_update_user_latency_tolerance(ctrl->device, + min(default_ps_max_latency_us, (unsigned long)S32_MAX)); + return 0; out_release_instance: nvme_release_instance(ctrl); @@ -2090,9 +2333,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) * Revalidating a dead namespace sets capacity to 0. This will * end buffered writers dirtying pages that can't be synced. */ - if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - revalidate_disk(ns->disk); - + if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + continue; + revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); blk_mq_abort_requeue_list(ns->queue); blk_mq_start_stopped_hw_queues(ns->queue, true); |