Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 521
1 file changed, 430 insertions, 91 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 163445baa4fc..303fbb6a48b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -152,8 +152,9 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
 
 static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
 {
-	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct ras_err_data err_data;
 	struct eeprom_table_record err_rec;
+	int ret;
 
 	if ((address >= adev->gmc.mc_vram_size) ||
 	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
@@ -170,6 +171,10 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
 		return 0;
 	}
 
+	ret = amdgpu_ras_error_data_init(&err_data);
+	if (ret)
+		return ret;
+
 	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
 	err_data.err_addr = &err_rec;
 	amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);
@@ -180,6 +185,8 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
 		amdgpu_ras_save_bad_pages(adev, NULL);
 	}
 
+	amdgpu_ras_error_data_fini(&err_data);
+
 	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
 	dev_warn(adev->dev, "Clear EEPROM:\n");
 	dev_warn(adev->dev, "	echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
@@ -201,8 +208,8 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
 		return -EINVAL;
 
 	/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
-	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
-	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+	if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
+	    amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
 		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
 			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
 	}
@@ -611,8 +618,8 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
 	if (amdgpu_ras_query_error_status(obj->adev, &info))
 		return -EINVAL;
 
-	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
-	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+	if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
+	    amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
 		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
 			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
 	}
@@ -628,8 +635,11 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
 
 static inline void put_obj(struct ras_manager *obj)
 {
-	if (obj && (--obj->use == 0))
+	if (obj && (--obj->use == 0)) {
 		list_del(&obj->node);
+		amdgpu_ras_error_data_fini(&obj->err_data);
+	}
+
 	if (obj && (obj->use < 0))
 		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
 }
@@ -659,6 +669,9 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
 	if (alive_obj(obj))
 		return NULL;
 
+	if (amdgpu_ras_error_data_init(&obj->err_data))
+		return NULL;
+
 	obj->head = *head;
 	obj->adev = adev;
 	list_add(&obj->node, &con->head);
@@ -769,9 +782,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 	if (!con)
 		return -EINVAL;
 
-	/* Do not enable ras feature if it is not allowed */
-	if (enable &&
-	    head->block != AMDGPU_RAS_BLOCK__GFX &&
+	/* For non-gfx ip, do not enable ras feature if it is not allowed */
+	/* For gfx ip, regardless of feature support status, */
+	/* Force issue enable or disable ras feature commands */
+	if (head->block != AMDGPU_RAS_BLOCK__GFX &&
 	    !amdgpu_ras_is_feature_allowed(adev, head))
 		return 0;
 
@@ -1014,17 +1028,159 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
 	}
 }
 
+static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
+					      struct ras_manager *ras_mgr,
+					      struct ras_err_data *err_data,
+					      const char *blk_name,
+					      bool is_ue)
+{
+	struct amdgpu_smuio_mcm_config_info *mcm_info;
+	struct ras_err_node *err_node;
+	struct ras_err_info *err_info;
+
+	if (is_ue) {
+		for_each_ras_error(err_node, err_data) {
+			err_info = &err_node->err_info;
+			mcm_info = &err_info->mcm_info;
+			if (err_info->ue_count) {
+				dev_info(adev->dev, "socket: %d, die: %d, "
+					 "%lld new uncorrectable hardware errors detected in %s block\n",
+					 mcm_info->socket_id,
+					 mcm_info->die_id,
+					 err_info->ue_count,
+					 blk_name);
+			}
+		}
+
+		for_each_ras_error(err_node, &ras_mgr->err_data) {
+			err_info = &err_node->err_info;
+			mcm_info = &err_info->mcm_info;
+			dev_info(adev->dev, "socket: %d, die: %d, "
+				 "%lld uncorrectable hardware errors detected in total in %s block\n",
+				 mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
+		}
+
+	} else {
+		for_each_ras_error(err_node, err_data) {
+			err_info = &err_node->err_info;
+			mcm_info = &err_info->mcm_info;
+			if (err_info->ce_count) {
+				dev_info(adev->dev, "socket: %d, die: %d, "
+					 "%lld new correctable hardware errors detected in %s block, "
+					 "no user action is needed\n",
+					 mcm_info->socket_id,
+					 mcm_info->die_id,
+					 err_info->ce_count,
+					 blk_name);
+			}
+		}
+
+		for_each_ras_error(err_node, &ras_mgr->err_data) {
+			err_info = &err_node->err_info;
+			mcm_info = &err_info->mcm_info;
+			dev_info(adev->dev, "socket: %d, die: %d, "
+				 "%lld correctable hardware errors detected in total in %s block, "
+				 "no user action is needed\n",
+				 mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name);
+		}
+	}
+}
+
+static inline bool err_data_has_source_info(struct ras_err_data *data)
+{
+	return !list_empty(&data->err_node_list);
+}
+
+static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
+					     struct ras_query_if *query_if,
+					     struct ras_err_data *err_data)
+{
+	struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
+	const char *blk_name = get_ras_block_str(&query_if->head);
+
+	if (err_data->ce_count) {
+		if (err_data_has_source_info(err_data)) {
+			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, false);
+		} else if (!adev->aid_mask &&
+			   adev->smuio.funcs &&
+			   adev->smuio.funcs->get_socket_id &&
+			   adev->smuio.funcs->get_die_id) {
+			dev_info(adev->dev, "socket: %d, die: %d "
+				 "%ld correctable hardware errors "
+				 "detected in %s block, no user "
+				 "action is needed.\n",
+				 adev->smuio.funcs->get_socket_id(adev),
+				 adev->smuio.funcs->get_die_id(adev),
+				 ras_mgr->err_data.ce_count,
+				 blk_name);
+		} else {
+			dev_info(adev->dev, "%ld correctable hardware errors "
+				 "detected in %s block, no user "
+				 "action is needed.\n",
+				 ras_mgr->err_data.ce_count,
+				 blk_name);
+		}
+	}
+
+	if (err_data->ue_count) {
+		if (err_data_has_source_info(err_data)) {
+			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, true);
+		} else if (!adev->aid_mask &&
+			   adev->smuio.funcs &&
+			   adev->smuio.funcs->get_socket_id &&
+			   adev->smuio.funcs->get_die_id) {
+			dev_info(adev->dev, "socket: %d, die: %d "
+				 "%ld uncorrectable hardware errors "
+				 "detected in %s block\n",
+				 adev->smuio.funcs->get_socket_id(adev),
+				 adev->smuio.funcs->get_die_id(adev),
+				 ras_mgr->err_data.ue_count,
+				 blk_name);
+		} else {
+			dev_info(adev->dev, "%ld uncorrectable hardware errors "
+				 "detected in %s block\n",
+				 ras_mgr->err_data.ue_count,
+				 blk_name);
+		}
+	}
+
+}
+
+static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
+{
+	struct ras_err_node *err_node;
+	struct ras_err_info *err_info;
+
+	if (err_data_has_source_info(err_data)) {
+		for_each_ras_error(err_node, err_data) {
+			err_info = &err_node->err_info;
+
+			amdgpu_ras_error_statistic_ce_count(&obj->err_data, &err_info->mcm_info, err_info->ce_count);
+			amdgpu_ras_error_statistic_ue_count(&obj->err_data, &err_info->mcm_info, err_info->ue_count);
+		}
+	} else {
+		/* for legacy asic path which doesn't has error source info */
+		obj->err_data.ue_count += err_data->ue_count;
+		obj->err_data.ce_count += err_data->ce_count;
+	}
+}
+
 /* query/inject/cure begin */
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 				  struct ras_query_if *info)
 {
 	struct amdgpu_ras_block_object *block_obj = NULL;
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
-	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct ras_err_data err_data;
+	int ret;
 
 	if (!obj)
 		return -EINVAL;
 
+	ret = amdgpu_ras_error_data_init(&err_data);
+	if (ret)
+		return ret;
+
 	if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
 		amdgpu_ras_get_ecc_info(adev, &err_data);
 	} else {
@@ -1032,7 +1188,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 		if (!block_obj || !block_obj->hw_ops) {
 			dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
 				     get_ras_block_str(&info->head));
-			return -EINVAL;
+			ret = -EINVAL;
+			goto out_fini_err_data;
 		}
 
 		if (block_obj->hw_ops->query_ras_error_count)
@@ -1046,73 +1203,55 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 		}
 	}
 
-	obj->err_data.ue_count += err_data.ue_count;
-	obj->err_data.ce_count += err_data.ce_count;
+	amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
 
 	info->ue_count = obj->err_data.ue_count;
 	info->ce_count = obj->err_data.ce_count;
 
-	if (err_data.ce_count) {
-		if (!adev->aid_mask &&
-		    adev->smuio.funcs &&
-		    adev->smuio.funcs->get_socket_id &&
-		    adev->smuio.funcs->get_die_id) {
-			dev_info(adev->dev, "socket: %d, die: %d "
-				 "%ld correctable hardware errors "
-				 "detected in %s block, no user "
-				 "action is needed.\n",
-				 adev->smuio.funcs->get_socket_id(adev),
-				 adev->smuio.funcs->get_die_id(adev),
-				 obj->err_data.ce_count,
-				 get_ras_block_str(&info->head));
-		} else {
-			dev_info(adev->dev, "%ld correctable hardware errors "
-				 "detected in %s block, no user "
-				 "action is needed.\n",
-				 obj->err_data.ce_count,
-				 get_ras_block_str(&info->head));
-		}
-	}
-	if (err_data.ue_count) {
-		if (!adev->aid_mask &&
-		    adev->smuio.funcs &&
-		    adev->smuio.funcs->get_socket_id &&
-		    adev->smuio.funcs->get_die_id) {
-			dev_info(adev->dev, "socket: %d, die: %d "
-				 "%ld uncorrectable hardware errors "
-				 "detected in %s block\n",
-				 adev->smuio.funcs->get_socket_id(adev),
-				 adev->smuio.funcs->get_die_id(adev),
-				 obj->err_data.ue_count,
-				 get_ras_block_str(&info->head));
-		} else {
-			dev_info(adev->dev, "%ld uncorrectable hardware errors "
-				 "detected in %s block\n",
-				 obj->err_data.ue_count,
-				 get_ras_block_str(&info->head));
-		}
-	}
+	amdgpu_ras_error_generate_report(adev, info, &err_data);
 
-	return 0;
+out_fini_err_data:
+	amdgpu_ras_error_data_fini(&err_data);
+
+	return ret;
 }
 
-int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
 		enum amdgpu_ras_block block)
 {
 	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
 
-	if (!amdgpu_ras_is_supported(adev, block))
-		return -EINVAL;
-
-	if (!block_obj || !block_obj->hw_ops) {
+	if (!block_obj || !block_obj->hw_ops) {
 		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
-				ras_block_str(block));
-		return -EINVAL;
+			     ras_block_str(block));
+		return -EOPNOTSUPP;
 	}
 
+	/* skip ras error reset in gpu reset */
+	if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) &&
+	    mca_funcs && mca_funcs->mca_set_debug_mode)
+		return -EOPNOTSUPP;
+
+	if (!amdgpu_ras_is_supported(adev, block) ||
+	    !amdgpu_ras_get_mca_debug_mode(adev))
+		return -EOPNOTSUPP;
+
 	if (block_obj->hw_ops->reset_ras_error_count)
 		block_obj->hw_ops->reset_ras_error_count(adev);
 
+	return 0;
+}
+
+int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+		enum amdgpu_ras_block block)
+{
+	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
+
+	if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP)
+		return 0;
+
 	if ((block == AMDGPU_RAS_BLOCK__GFX) ||
 	    (block == AMDGPU_RAS_BLOCK__MMHUB)) {
 		if (block_obj->hw_ops->reset_ras_error_status)
@@ -1208,8 +1347,8 @@ static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
 	/* some hardware/IP supports read to clear
 	 * no need to explictly reset the err status after the query call
 	 */
-	if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
-	    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+	if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
+	    amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
 		if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
 			dev_warn(adev->dev,
 				 "Failed to reset error counter and error status\n");
@@ -1369,6 +1508,22 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
 	return sysfs_emit(buf, "feature mask: 0x%x\n", con->features);
 }
 
+static ssize_t amdgpu_ras_sysfs_version_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct amdgpu_ras *con =
+		container_of(attr, struct amdgpu_ras, version_attr);
+	return sysfs_emit(buf, "table version: 0x%x\n", con->eeprom_control.tbl_hdr.version);
+}
+
+static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct amdgpu_ras *con =
+		container_of(attr, struct amdgpu_ras, schema_attr);
+	return sysfs_emit(buf, "schema: 0x%x\n", con->schema);
+}
+
 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1378,11 +1533,13 @@ static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
 			RAS_FS_NAME);
 }
 
-static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
+static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct attribute *attrs[] = {
 		&con->features_attr.attr,
+		&con->version_attr.attr,
+		&con->schema_attr.attr,
 		NULL
 	};
 	struct attribute_group group = {
@@ -1458,7 +1615,7 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
 	if (amdgpu_bad_page_threshold != 0)
 		amdgpu_ras_sysfs_remove_bad_page_node(adev);
 
-	amdgpu_ras_sysfs_remove_feature_node(adev);
+	amdgpu_ras_sysfs_remove_dev_attr_node(adev);
 
 	return 0;
 }
@@ -1570,6 +1727,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
 			amdgpu_ras_debugfs_create(adev, &fs_info, dir);
 		}
 	}
+
+	amdgpu_mca_smu_debugfs_init(adev, dir);
 }
 
 /* debugfs end */
@@ -1579,6 +1738,10 @@ static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
 		amdgpu_ras_sysfs_badpages_read, NULL, 0);
 static DEVICE_ATTR(features, S_IRUGO,
 		amdgpu_ras_sysfs_features_read, NULL);
+static DEVICE_ATTR(version, 0444,
+		amdgpu_ras_sysfs_version_show, NULL);
+static DEVICE_ATTR(schema, 0444,
+		amdgpu_ras_sysfs_schema_show, NULL);
 static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1587,6 +1750,8 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 	};
 	struct attribute *attrs[] = {
 		&con->features_attr.attr,
+		&con->version_attr.attr,
+		&con->schema_attr.attr,
 		NULL
 	};
 	struct bin_attribute *bin_attrs[] = {
@@ -1595,11 +1760,20 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 	};
 	int r;
 
+	group.attrs = attrs;
+
 	/* add features entry */
 	con->features_attr = dev_attr_features;
-	group.attrs = attrs;
 	sysfs_attr_init(attrs[0]);
 
+	/* add version entry */
+	con->version_attr = dev_attr_version;
+	sysfs_attr_init(attrs[1]);
+
+	/* add schema entry */
+	con->schema_attr = dev_attr_schema;
+	sysfs_attr_init(attrs[2]);
+
 	if (amdgpu_bad_page_threshold != 0) {
 		/* add bad_page_features entry */
 		bin_attr_gpu_vram_bad_pages.private = NULL;
@@ -1708,12 +1882,16 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
 				struct amdgpu_iv_entry *entry)
 {
 	struct ras_ih_data *data = &obj->ih_data;
-	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct ras_err_data err_data;
 	int ret;
 
 	if (!data->cb)
 		return;
 
+	ret = amdgpu_ras_error_data_init(&err_data);
+	if (ret)
+		return;
+
 	/* Let IP handle its data, maybe we need get the output
 	 * from the callback to update the error type/count, etc
 	 */
@@ -1730,6 +1908,8 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
 		obj->err_data.ue_count += err_data.ue_count;
 		obj->err_data.ce_count += err_data.ce_count;
 	}
+
+	amdgpu_ras_error_data_fini(&err_data);
 }
 
 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
@@ -1905,14 +2085,18 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 		 * should be removed until smu fix handle ecc_info table.
 		 */
 		if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
-		    (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
+		    (amdgpu_ip_version(adev, MP1_HWIP, 0) ==
+		     IP_VERSION(13, 0, 2)))
 			continue;
 
 		amdgpu_ras_query_error_status(adev, &info);
 
-		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
-		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) &&
-		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) {
+		if (amdgpu_ip_version(adev, MP0_HWIP, 0) !=
+			    IP_VERSION(11, 0, 2) &&
+		    amdgpu_ip_version(adev, MP0_HWIP, 0) !=
+			    IP_VERSION(11, 0, 4) &&
+		    amdgpu_ip_version(adev, MP0_HWIP, 0) !=
+			    IP_VERSION(13, 0, 0)) {
 			if (amdgpu_ras_reset_error_status(adev, info.head.block))
 				dev_warn(adev->dev, "Failed to reset error counter and error status");
 		}
@@ -2021,9 +2205,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	struct amdgpu_device *remote_adev = NULL;
 	struct amdgpu_device *adev = ras->adev;
 	struct list_head device_list, *device_list_handle = NULL;
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
+	if (hive)
+		atomic_set(&hive->ras_recovery, 1);
 	if (!ras->disable_ras_err_cnt_harvest) {
-		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
 		/* Build list of devices to query RAS related errors */
 		if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -2040,7 +2226,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 			amdgpu_ras_log_on_err_counter(remote_adev);
 		}
 
-		amdgpu_put_xgmi_hive(hive);
 	}
 
 	if (amdgpu_device_should_recover_gpu(ras->adev)) {
@@ -2075,6 +2260,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
 	}
 	atomic_set(&ras->in_recovery, 0);
+	if (hive) {
+		atomic_set(&hive->ras_recovery, 0);
+		amdgpu_put_xgmi_hive(hive);
+	}
 }
 
 /* alloc/realloc bps array */
@@ -2400,7 +2589,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
 {
 	if (amdgpu_sriov_vf(adev)) {
-		switch (adev->ip_versions[MP0_HWIP][0]) {
+		switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
 		case IP_VERSION(13, 0, 2):
 		case IP_VERSION(13, 0, 6):
 			return true;
@@ -2410,7 +2599,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
 	}
 
 	if (adev->asic_type == CHIP_IP_DISCOVERY) {
-		switch (adev->ip_versions[MP0_HWIP][0]) {
+		switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
 		case IP_VERSION(13, 0, 0):
 		case IP_VERSION(13, 0, 6):
 		case IP_VERSION(13, 0, 10):
@@ -2484,8 +2673,12 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 		/* VCN/JPEG RAS can be supported on both bare metal and
 		 * SRIOV environment
 		 */
-		if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) ||
-		    adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))
+		if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
+			    IP_VERSION(2, 6, 0) ||
+		    amdgpu_ip_version(adev, VCN_HWIP, 0) ==
+			    IP_VERSION(4, 0, 0) ||
+		    amdgpu_ip_version(adev, VCN_HWIP, 0) ==
+			    IP_VERSION(4, 0, 3))
 			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
 						 1 << AMDGPU_RAS_BLOCK__JPEG);
 		else
@@ -2514,18 +2707,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 
 	/* hw_supported needs to be aligned with RAS block mask. */
 	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
-
-	/*
-	 * Disable ras feature for aqua vanjaram
-	 * by default on apu platform.
-	 */
-	if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6) &&
-	    adev->gmc.is_app_apu)
-		adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 :
-			adev->ras_hw_enabled & amdgpu_ras_mask;
-	else
-		adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
-			adev->ras_hw_enabled & amdgpu_ras_mask;
+	adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
+		adev->ras_hw_enabled & amdgpu_ras_mask;
 }
 
 static void amdgpu_ras_counte_dw(struct work_struct *work)
@@ -2563,7 +2746,8 @@ static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
 		return;
 
 	/* Init poison supported flag, the default value is false */
-	if (adev->gmc.xgmi.connected_to_cpu) {
+	if (adev->gmc.xgmi.connected_to_cpu ||
+	    adev->gmc.is_app_apu) {
 		/* enabled by default when GPU is connected to CPU */
 		con->poison_supported = true;
 	} else if (adev->df.funcs &&
@@ -2585,6 +2769,14 @@ static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
 	}
 }
 
+static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
+{
+	return amdgpu_ras_is_poison_mode_supported(adev) ? AMDGPU_RAS_ERROR__POISON : 0 |
+			AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE |
+			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE |
+			AMDGPU_RAS_ERROR__PARITY;
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -2627,6 +2819,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 
 	con->update_channel_flag = false;
 	con->features = 0;
+	con->schema = 0;
 	INIT_LIST_HEAD(&con->head);
 	/* Might need get this flag from vbios. */
 	con->flags = RAS_DEFAULT_FLAGS;
@@ -2634,7 +2827,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	/* initialize nbio ras function ahead of any other
 	 * ras functions so hardware fatal error interrupt
 	 * can be enabled as early as possible */
-	switch (adev->ip_versions[NBIO_HWIP][0]) {
+	switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {
 	case IP_VERSION(7, 4, 0):
 	case IP_VERSION(7, 4, 1):
 	case IP_VERSION(7, 4, 4):
@@ -2682,6 +2875,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	amdgpu_ras_query_poison_mode(adev);
 
+	/* Get RAS schema for particular SOC */
+	con->schema = amdgpu_get_ras_schema(adev);
+
 	if (amdgpu_ras_fs_init(adev)) {
 		r = -EINVAL;
 		goto release_con;
 	}
@@ -3170,6 +3366,27 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 	return 0;
 }
 
+void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+	if (con)
+		con->is_mca_debug_mode = enable;
+}
+
+bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+	if (!con)
+		return false;
+
+	if (mca_funcs && mca_funcs->mca_set_debug_mode)
+		return con->is_mca_debug_mode;
+	else
+		return true;
+}
 /* Register each ip ras block into amdgpu ras */
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
@@ -3329,3 +3546,125 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
 		WREG32(err_status_hi_offset, 0);
 	}
 }
+
+int amdgpu_ras_error_data_init(struct ras_err_data *err_data)
+{
+	memset(err_data, 0, sizeof(*err_data));
+
+	INIT_LIST_HEAD(&err_data->err_node_list);
+
+	return 0;
+}
+
+static void amdgpu_ras_error_node_release(struct ras_err_node *err_node)
+{
+	if (!err_node)
+		return;
+
+	list_del(&err_node->node);
+	kvfree(err_node);
+}
+
+void amdgpu_ras_error_data_fini(struct ras_err_data *err_data)
+{
+	struct ras_err_node *err_node, *tmp;
+
+	list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node)
+		amdgpu_ras_error_node_release(err_node);
+}
+
+static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data,
+							     struct amdgpu_smuio_mcm_config_info *mcm_info)
+{
+	struct ras_err_node *err_node;
+	struct amdgpu_smuio_mcm_config_info *ref_id;
+
+	if (!err_data || !mcm_info)
+		return NULL;
+
+	for_each_ras_error(err_node, err_data) {
+		ref_id = &err_node->err_info.mcm_info;
+
+		if (mcm_info->socket_id == ref_id->socket_id &&
+		    mcm_info->die_id == ref_id->die_id)
+			return err_node;
+	}
+
+	return NULL;
+}
+
+static struct ras_err_node *amdgpu_ras_error_node_new(void)
+{
+	struct ras_err_node *err_node;
+
+	err_node = kvzalloc(sizeof(*err_node), GFP_KERNEL);
+	if (!err_node)
+		return NULL;
+
+	INIT_LIST_HEAD(&err_node->node);
+
+	return err_node;
+}
+
+static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
+						      struct amdgpu_smuio_mcm_config_info *mcm_info)
+{
+	struct ras_err_node *err_node;
+
+	err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info);
+	if (err_node)
+		return &err_node->err_info;
+
+	err_node = amdgpu_ras_error_node_new();
+	if (!err_node)
+		return NULL;
+
+	memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
+
+	err_data->err_list_count++;
+	list_add_tail(&err_node->node, &err_data->err_node_list);
+
+	return &err_node->err_info;
+}
+
+int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
+					struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
+{
+	struct ras_err_info *err_info;
+
+	if (!err_data || !mcm_info)
+		return -EINVAL;
+
+	if (!count)
+		return 0;
+
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+	if (!err_info)
+		return -EINVAL;
+
+	err_info->ue_count += count;
+	err_data->ue_count += count;
+
+	return 0;
+}
+
+int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
+					struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
+{
+	struct ras_err_info *err_info;
+
+	if (!err_data || !mcm_info)
+		return -EINVAL;
+
+	if (!count)
+		return 0;
+
+	err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
+	if (!err_info)
+		return -EINVAL;
+
+	err_info->ce_count += count;
+	err_data->ce_count += count;
+
+	return 0;
+}