diff options
author | Dennis Li <Dennis.Li@amd.com> | 2021-03-05 22:30:54 +0100 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2021-03-24 03:59:52 +0100 |
commit | 88f8575bca5fc70ba8608cfc49811f9b4d1eb6f9 (patch) | |
tree | 18987aeeed92f54b8cba5258eb2542641f667489 /drivers | |
parent | drm/amdgpu: refine ras codes for GC utc of aldebaran (diff) | |
download | linux-88f8575bca5fc70ba8608cfc49811f9b4d1eb6f9.tar.xz linux-88f8575bca5fc70ba8608cfc49811f9b4d1eb6f9.zip |
drm/amdgpu: enable watchdog feature for SQ of aldebaran
SQ's watchdog timer monitors forward progress, a mask of which waves
caused the watchdog timeout is recorded into ras status registers and
then trigger a system fatal error event.
v2:
1. change *query_timeout_status to *query_sq_timeout_status.
2. move query_sq_timeout_status into amdgpu_ras_do_recovery.
3. add module parameters to enable/disable fatal error event and modify
the watchdog timer.
v3:
1. remove unused parameters of *enable_watchdog_timer
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 18 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 5 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 106 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h | 3 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/soc15_common.h | 30 |
8 files changed, 174 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bdaaba42bda4..951a2a19c19e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -126,6 +126,12 @@ struct amdgpu_mgpu_info uint32_t num_apu; }; +struct amdgpu_watchdog_timer +{ + bool timeout_fatal_disable; + uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */ +}; + #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH 256 /* @@ -187,6 +193,7 @@ extern struct amdgpu_mgpu_info mgpu_info; extern int amdgpu_ras_enable; extern uint amdgpu_ras_mask; extern int amdgpu_bad_page_threshold; +extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer; extern int amdgpu_async_gfx_ring; extern int amdgpu_mcbp; extern int amdgpu_discovery; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 5179d5f032ee..e39d81b68169 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -175,6 +175,10 @@ struct amdgpu_mgpu_info mgpu_info = { int amdgpu_ras_enable = -1; uint amdgpu_ras_mask = 0xffffffff; int amdgpu_bad_page_threshold = 100; +struct amdgpu_watchdog_timer amdgpu_watchdog_timer = { + .timeout_fatal_disable = false, + .period = 0x3f, /* about 8s */ +}; /** * DOC: vramlimit (int) @@ -531,6 +535,20 @@ MODULE_PARM_DESC(ras_mask, "Mask of RAS features to enable (default 0xffffffff), module_param_named(ras_mask, amdgpu_ras_mask, uint, 0444); /** + * DOC: timeout_fatal_disable (bool) + * Disable Watchdog timeout fatal error event + */ +MODULE_PARM_DESC(timeout_fatal_disable, "disable watchdog timeout fatal error (false = default)"); +module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_disable, bool, 0644); + +/** + * DOC: timeout_period (uint) + * Modify the watchdog timeout max_cycles as (1 << period) + */ +MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0x1F = default), timeout maxCycles = (1 << period)"); +module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644); + +/** * DOC: si_support (int) * Set SI support driver. This parameter works after set config CONFIG_DRM_AMDGPU_SI. For SI asic, when radeon driver is enabled, * set value 0 to use radeon driver, while set value 1 to use amdgpu driver. The default is using radeon driver when it available, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 1ab9632282d4..d92f0f14cbeb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -226,6 +226,8 @@ struct amdgpu_gfx_funcs { void (*init_spm_golden)(struct amdgpu_device *adev); void (*query_ras_error_status) (struct amdgpu_device *adev); void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable); + void (*enable_watchdog_timer)(struct amdgpu_device *adev); + void (*query_sq_timeout_status)(struct amdgpu_device *adev); }; struct sq_work { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c669435ccc74..c1516d871881 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1467,6 +1467,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__GFX: if (adev->gfx.funcs->query_ras_error_status) adev->gfx.funcs->query_ras_error_status(adev); + + if (adev->gfx.funcs->query_sq_timeout_status) + adev->gfx.funcs->query_sq_timeout_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: if (adev->mmhub.funcs->query_ras_error_status) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 6af4ecf8e9f0..8b6ba1594f41 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2124,6 +2124,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = { .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count, .reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count, .query_ras_error_status = &gfx_v9_4_2_query_ras_error_status, + .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer, + .query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status, }; static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) @@ -3968,6 +3970,9 @@ static int gfx_v9_0_hw_init(void *handle) if (adev->asic_type == CHIP_ALDEBARAN) gfx_v9_4_2_set_power_brake_sequence(adev); + if (adev->gfx.funcs->enable_watchdog_timer) + adev->gfx.funcs->enable_watchdog_timer(adev); + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index b2e2026c3ec7..1faeae14ead9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -1129,3 +1129,109 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev) gfx_v9_4_2_query_ea_err_status(adev); gfx_v9_4_2_query_utc_err_status(adev); } + +void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev) +{ + uint32_t i; + uint32_t data; + + data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE, + amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : + 0); + data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL, + amdgpu_watchdog_timer.period); + + mutex_lock(&adev->grbm_idx_mutex); + for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { + gfx_v9_4_2_select_se_sh(adev, i, 0xffffffff, 0xffffffff); + WREG32_SOC15(GC, 0, regSQ_TIMEOUT_CONFIG, data); + } + gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); + mutex_unlock(&adev->grbm_idx_mutex); +} + +static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address) +{ + WREG32_SOC15_RLC_EX(reg, GC, 0, regSQ_IND_INDEX, + (wave << SQ_IND_INDEX__WAVE_ID__SHIFT) | + (simd << SQ_IND_INDEX__SIMD_ID__SHIFT) | + (address << SQ_IND_INDEX__INDEX__SHIFT) | + (SQ_IND_INDEX__FORCE_READ_MASK)); + return RREG32_SOC15(GC, 0, regSQ_IND_DATA); +} + +static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev, + uint32_t status) +{ + struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info; + uint32_t i, simd, wave; + uint32_t wave_status; + uint32_t wave_pc_lo, wave_pc_hi; + uint32_t wave_exec_lo, wave_exec_hi; + uint32_t wave_inst_dw0, wave_inst_dw1; + uint32_t wave_ib_sts; + + for (i = 0; i < 32; i++) { + if (!((i << 1) & status)) + continue; + + simd = i / cu_info->max_waves_per_simd; + wave = i % cu_info->max_waves_per_simd; + + wave_status = wave_read_ind(adev, simd, wave, ixSQ_WAVE_STATUS); + wave_pc_lo = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_LO); + wave_pc_hi = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_HI); + wave_exec_lo = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_LO); + wave_exec_hi = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_HI); + wave_inst_dw0 = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW0); + wave_inst_dw1 = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW1); + wave_ib_sts = wave_read_ind(adev, simd, wave, ixSQ_WAVE_IB_STS); + + dev_info( + adev->dev, + "\t SIMD %d, Wave %d: status 0x%x, pc 0x%llx, exec 0x%llx, inst 0x%llx, ib_sts 0x%x\n", + simd, wave, wave_status, + ((uint64_t)wave_pc_hi << 32 | wave_pc_lo), + ((uint64_t)wave_exec_hi << 32 | wave_exec_lo), + ((uint64_t)wave_inst_dw1 << 32 | wave_inst_dw0), + wave_ib_sts); + } +} + +void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev) +{ + uint32_t se_idx, sh_idx, cu_idx; + uint32_t status; + + mutex_lock(&adev->grbm_idx_mutex); + for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines; + se_idx++) { + for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se; + sh_idx++) { + for (cu_idx = 0; + cu_idx < adev->gfx.config.max_cu_per_sh; + cu_idx++) { + gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx, + cu_idx); + status = RREG32_SOC15(GC, 0, + regSQ_TIMEOUT_STATUS); + if (status != 0) { + dev_info( + adev->dev, + "GFX Watchdog Timeout: SE %d, SH %d, CU %d\n", + se_idx, sh_idx, cu_idx); + gfx_v9_4_2_log_cu_timeout_status( + adev, status); + } + /* clear old status */ + WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0); + } + } + } + gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); + mutex_unlock(&adev->grbm_idx_mutex); +}
\ No newline at end of file diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h index d7e3041947e8..e01fa6afa8e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h @@ -35,4 +35,7 @@ int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if); void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev); int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status); + +void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev); +void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev); #endif /* __GFX_V9_4_2_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h b/drivers/gpu/drm/amd/amdgpu/soc15_common.h index 52ffbea63a4f..8cdf5d1685cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h @@ -100,6 +100,30 @@ } \ } while (0) +#define WREG32_RLC_EX(prefix, reg, value) \ + do { \ + if (amdgpu_sriov_fullaccess(adev)) { \ + uint32_t i = 0; \ + uint32_t retries = 50000; \ + uint32_t r0 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG0_BASE_IDX] + prefix##SCRATCH_REG0; \ + uint32_t r1 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG1_BASE_IDX] + prefix##SCRATCH_REG1; \ + uint32_t spare_int = adev->reg_offset[GC_HWIP][0][prefix##RLC_SPARE_INT_BASE_IDX] + prefix##RLC_SPARE_INT; \ + WREG32(r0, value); \ + WREG32(r1, (reg | 0x80000000)); \ + WREG32(spare_int, 0x1); \ + for (i = 0; i < retries; i++) { \ + u32 tmp = RREG32(r1); \ + if (!(tmp & 0x80000000)) \ + break; \ + udelay(10); \ + } \ + if (i >= retries) \ + pr_err("timeout: rlcg program reg:0x%05x failed !\n", reg); \ + } else { \ + WREG32(reg, value); \ + } \ + } while (0) + #define WREG32_SOC15_RLC_SHADOW(ip, inst, reg, value) \ do { \ uint32_t target_reg = adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg;\ @@ -142,6 +166,12 @@ WREG32_RLC(target_reg, value); \ } while (0) +#define WREG32_SOC15_RLC_EX(prefix, ip, inst, reg, value) \ + do { \ + uint32_t target_reg = adev->reg_offset[GC_HWIP][0][reg##_BASE_IDX] + reg;\ + WREG32_RLC_EX(prefix, target_reg, value); \ + } while (0) + #define WREG32_FIELD15_RLC(ip, idx, reg, field, val) \ WREG32_RLC((adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg), \ (RREG32(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg) \ |