// SPDX-License-Identifier: MIT /* * Copyright 2025 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * */ #include "ras.h" #include "ras_cmd.h" #define RAS_CMD_MAJOR_VERSION 6 #define RAS_CMD_MINOR_VERSION 0 #define RAS_CMD_VERSION (((RAS_CMD_MAJOR_VERSION) << 10) | (RAS_CMD_MINOR_VERSION)) static int ras_cmd_add_device(struct ras_core_context *ras_core) { INIT_LIST_HEAD(&ras_core->ras_cmd.head); ras_core->ras_cmd.ras_core = ras_core; ras_core->ras_cmd.dev_handle = (uintptr_t)ras_core ^ RAS_CMD_DEV_HANDLE_MAGIC; return 0; } static int ras_cmd_remove_device(struct ras_core_context *ras_core) { memset(&ras_core->ras_cmd, 0, sizeof(ras_core->ras_cmd)); return 0; } static int ras_get_block_ecc_info(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) { struct ras_cmd_block_ecc_info_req *input_data = (struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw; struct ras_cmd_block_ecc_info_rsp *output_data = (struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw; struct ras_ecc_count err_data; int ret; if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req)) return RAS_CMD__ERROR_INVALID_INPUT_SIZE; memset(&err_data, 0, sizeof(err_data)); ret = ras_aca_get_block_ecc_count(ras_core, input_data->block_id, &err_data); if (ret) return RAS_CMD__ERROR_GENERIC; output_data->ce_count = err_data.total_ce_count; output_data->ue_count = err_data.total_ue_count; output_data->de_count = err_data.total_de_count; cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp); return RAS_CMD__SUCCESS; } static void ras_cmd_update_bad_page_info(struct ras_cmd_bad_page_record *ras_cmd_record, struct eeprom_umc_record *record) { ras_cmd_record->retired_page = record->cur_nps_retired_row_pfn; ras_cmd_record->ts = record->ts; ras_cmd_record->err_type = record->err_type; ras_cmd_record->mem_channel = record->mem_channel; ras_cmd_record->mcumc_id = record->mcumc_id; ras_cmd_record->address = record->address; ras_cmd_record->bank = record->bank; ras_cmd_record->valid = 1; } static int ras_cmd_get_group_bad_pages(struct ras_core_context *ras_core, uint32_t group_index, struct ras_cmd_bad_pages_info_rsp *output_data) { struct eeprom_umc_record record; struct ras_cmd_bad_page_record *ras_cmd_record; uint32_t i = 0, bp_cnt = 0, group_cnt = 0; output_data->bp_in_group = 0; output_data->group_index = 0; bp_cnt = ras_umc_get_badpage_count(ras_core); if (bp_cnt) { output_data->group_index = group_index; group_cnt = bp_cnt / RAS_CMD_MAX_BAD_PAGES_PER_GROUP + ((bp_cnt % RAS_CMD_MAX_BAD_PAGES_PER_GROUP) ? 1 : 0); if (group_index >= group_cnt) return RAS_CMD__ERROR_INVALID_INPUT_DATA; i = group_index * RAS_CMD_MAX_BAD_PAGES_PER_GROUP; for (; i < bp_cnt && output_data->bp_in_group < RAS_CMD_MAX_BAD_PAGES_PER_GROUP; i++) { if (ras_umc_get_badpage_record(ras_core, i, &record)) return RAS_CMD__ERROR_GENERIC; ras_cmd_record = &output_data->records[i % RAS_CMD_MAX_BAD_PAGES_PER_GROUP]; memset(ras_cmd_record, 0, sizeof(*ras_cmd_record)); ras_cmd_update_bad_page_info(ras_cmd_record, &record); output_data->bp_in_group++; } } output_data->bp_total_cnt = bp_cnt; return RAS_CMD__SUCCESS; } static int ras_cmd_get_bad_pages(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) { struct ras_cmd_bad_pages_info_req *input_data = (struct ras_cmd_bad_pages_info_req *)cmd->input_buff_raw; struct ras_cmd_bad_pages_info_rsp *output_data = (struct ras_cmd_bad_pages_info_rsp *)cmd->output_buff_raw; int ret; if (cmd->input_size != sizeof(struct ras_cmd_bad_pages_info_req)) return RAS_CMD__ERROR_INVALID_INPUT_SIZE; ret = ras_cmd_get_group_bad_pages(ras_core, input_data->group_index, output_data); if (ret) return RAS_CMD__ERROR_GENERIC; output_data->version = 0; cmd->output_size = sizeof(struct ras_cmd_bad_pages_info_rsp); return RAS_CMD__SUCCESS; } static int ras_cmd_clear_bad_page_info(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) { if (cmd->input_size != sizeof(struct ras_cmd_dev_handle)) return RAS_CMD__ERROR_INVALID_INPUT_SIZE; if (ras_eeprom_reset_table(ras_core)) return RAS_CMD__ERROR_GENERIC; if (ras_umc_clean_badpage_data(ras_core)) return RAS_CMD__ERROR_GENERIC; return RAS_CMD__SUCCESS; } static int ras_cmd_reset_all_error_counts(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) { if (cmd->input_size != sizeof(struct ras_cmd_dev_handle)) return RAS_CMD__ERROR_INVALID_INPUT_SIZE; if (ras_aca_clear_all_blocks_ecc_count(ras_core)) return RAS_CMD__ERROR_GENERIC; if (ras_umc_clear_logged_ecc(ras_core)) return RAS_CMD__ERROR_GENERIC; return RAS_CMD__SUCCESS; } static int ras_cmd_get_cper_snapshot(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) { struct ras_cmd_cper_snapshot_rsp *output_data = (struct ras_cmd_cper_snapshot_rsp *)cmd->output_buff_raw; struct ras_log_batch_overview overview; if (cmd->input_size != sizeof(struct ras_cmd_cper_snapshot_req)) return RAS_CMD__ERROR_INVALID_INPUT_SIZE; ras_log_ring_get_batch_overview(ras_core, &overview); output_data->total_cper_num = overview.logged_batch_count; output_data->start_cper_id = overview.first_batch_id; output_data->latest_cper_id = overview.last_batch_id; output_data->version = 0; cmd->output_size = sizeof(struct ras_cmd_cper_snapshot_rsp); return RAS_CMD__SUCCESS; } static int ras_cmd_get_cper_records(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) { struct ras_cmd_cper_record_req *req = (struct ras_cmd_cper_record_req *)cmd->input_buff_raw; struct ras_cmd_cper_record_rsp *rsp = (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw; struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0}; struct ras_log_batch_overview overview; uint32_t offset = 0, real_data_len = 0; uint64_t batch_id; uint8_t *buffer; int ret = 0, i, count; if (cmd->input_size != sizeof(struct ras_cmd_cper_record_req)) return RAS_CMD__ERROR_INVALID_INPUT_SIZE; if (!req->buf_size || !req->buf_ptr || !req->cper_num) return RAS_CMD__ERROR_INVALID_INPUT_DATA; buffer = kzalloc(req->buf_size, GFP_KERNEL); if (!buffer) return RAS_CMD__ERROR_GENERIC; ras_log_ring_get_batch_overview(ras_core, &overview); for (i = 0; i < req->cper_num; i++) { batch_id = req->cper_start_id + i; if (batch_id >= overview.last_batch_id) break; count = ras_log_ring_get_batch_records(ras_core, batch_id, trace, ARRAY_SIZE(trace)); if (count > 0) { ret = ras_cper_generate_cper(ras_core, trace, count, &buffer[offset], req->buf_size - offset, &real_data_len); if (ret) break; offset += real_data_len; } } if ((ret && (ret != -ENOMEM)) || copy_to_user(u64_to_user_ptr(req->buf_ptr), buffer, offset)) { kfree(buffer); return RAS_CMD__ERROR_GENERIC; } rsp->real_data_size = offset; rsp->real_cper_num = i; rsp->remain_num = (ret == -ENOMEM) ? (req->cper_num - i) : 0; rsp->version = 0; cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp); kfree(buffer); return RAS_CMD__SUCCESS; } static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) { struct ras_cmd_batch_trace_snapshot_rsp *rsp = (struct ras_cmd_batch_trace_snapshot_rsp *)cmd->output_buff_raw; struct ras_log_batch_overview overview; if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_snapshot_req)) return RAS_CMD__ERROR_INVALID_INPUT_SIZE; ras_log_ring_get_batch_overview(ras_core, &overview); rsp->total_batch_num = overview.logged_batch_count; rsp->start_batch_id = overview.first_batch_id; rsp->latest_batch_id = overview.last_batch_id; rsp->version = 0; cmd->output_size = sizeof(struct ras_cmd_batch_trace_snapshot_rsp); return RAS_CMD__SUCCESS; } static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) { struct ras_cmd_batch_trace_record_req *input_data = (struct ras_cmd_batch_trace_record_req *)cmd->input_buff_raw; struct ras_cmd_batch_trace_record_rsp *output_data = (struct ras_cmd_batch_trace_record_rsp *)cmd->output_buff_raw; struct ras_log_batch_overview overview; struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0}; struct ras_log_info *record; int i, j, count = 0, offset = 0; uint64_t id; bool completed = false; if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_record_req)) return RAS_CMD__ERROR_INVALID_INPUT_SIZE; if ((!input_data->batch_num) || (input_data->batch_num > RAS_CMD_MAX_BATCH_NUM)) return RAS_CMD__ERROR_INVALID_INPUT_DATA; ras_log_ring_get_batch_overview(ras_core, &overview); if ((input_data->start_batch_id < overview.first_batch_id) || (input_data->start_batch_id >= overview.last_batch_id)) return RAS_CMD__ERROR_INVALID_INPUT_SIZE; for (i = 0; i < input_data->batch_num; i++) { id = input_data->start_batch_id + i; if (id >= overview.last_batch_id) { completed = true; break; } count = ras_log_ring_get_batch_records(ras_core, id, trace_arry, ARRAY_SIZE(trace_arry)); if (count > 0) { if ((offset + count) > RAS_CMD_MAX_TRACE_NUM) break; for (j = 0; j < count; j++) { record = &output_data->records[offset + j]; record->seqno = trace_arry[j]->seqno; record->timestamp = trace_arry[j]->timestamp; record->event = trace_arry[j]->event; memcpy(&record->aca_reg, &trace_arry[j]->aca_reg, sizeof(trace_arry[j]->aca_reg)); } } else { count = 0; } output_data->batchs[i].batch_id = id; output_data->batchs[i].offset = offset; output_data->batchs[i].trace_num = count; offset += count; } output_data->start_batch_id = input_data->start_batch_id; output_data->real_batch_num = i; output_data->remain_num = completed ? 0 : (input_data->batch_num - i); output_data->version = 0; cmd->output_size = sizeof(struct ras_cmd_batch_trace_record_rsp); return RAS_CMD__SUCCESS; } static enum ras_ta_block __get_ras_ta_block(enum ras_block_id block) { switch (block) { case RAS_BLOCK_ID__UMC: return RAS_TA_BLOCK__UMC; case RAS_BLOCK_ID__SDMA: return RAS_TA_BLOCK__SDMA; case RAS_BLOCK_ID__GFX: return RAS_TA_BLOCK__GFX; case RAS_BLOCK_ID__MMHUB: return RAS_TA_BLOCK__MMHUB; case RAS_BLOCK_ID__ATHUB: return RAS_TA_BLOCK__ATHUB; case RAS_BLOCK_ID__PCIE_BIF: return RAS_TA_BLOCK__PCIE_BIF; case RAS_BLOCK_ID__HDP: return RAS_TA_BLOCK__HDP; case RAS_BLOCK_ID__XGMI_WAFL: return RAS_TA_BLOCK__XGMI_WAFL; case RAS_BLOCK_ID__DF: return RAS_TA_BLOCK__DF; case RAS_BLOCK_ID__SMN: return RAS_TA_BLOCK__SMN; case RAS_BLOCK_ID__SEM: return RAS_TA_BLOCK__SEM; case RAS_BLOCK_ID__MP0: return RAS_TA_BLOCK__MP0; case RAS_BLOCK_ID__MP1: return RAS_TA_BLOCK__MP1; case RAS_BLOCK_ID__FUSE: return RAS_TA_BLOCK__FUSE; case RAS_BLOCK_ID__MCA: return RAS_TA_BLOCK__MCA; case RAS_BLOCK_ID__VCN: return RAS_TA_BLOCK__VCN; case RAS_BLOCK_ID__JPEG: return RAS_TA_BLOCK__JPEG; default: return RAS_TA_BLOCK__UMC; } } static enum ras_ta_error_type __get_ras_ta_err_type(enum ras_ecc_err_type error) { switch (error) { case RAS_ECC_ERR__NONE: return RAS_TA_ERROR__NONE; case RAS_ECC_ERR__PARITY: return RAS_TA_ERROR__PARITY; case RAS_ECC_ERR__SINGLE_CORRECTABLE: return RAS_TA_ERROR__SINGLE_CORRECTABLE; case RAS_ECC_ERR__MULTI_UNCORRECTABLE: return RAS_TA_ERROR__MULTI_UNCORRECTABLE; case RAS_ECC_ERR__POISON: return RAS_TA_ERROR__POISON; default: return RAS_TA_ERROR__NONE; } } static int ras_cmd_inject_error(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) { struct ras_cmd_inject_error_req *req = (struct ras_cmd_inject_error_req *)cmd->input_buff_raw; struct ras_cmd_inject_error_rsp *output_data = (struct ras_cmd_inject_error_rsp *)cmd->output_buff_raw; int ret = 0; struct ras_ta_trigger_error_input block_info = { .block_id = __get_ras_ta_block(req->block_id), .sub_block_index = req->subblock_id, .inject_error_type = __get_ras_ta_err_type(req->error_type), .address = req->address, .value = req->method, }; ret = ras_psp_trigger_error(ras_core, &block_info, req->instance_mask); if (!ret) { output_data->version = 0; output_data->address = block_info.address; cmd->output_size = sizeof(struct ras_cmd_inject_error_rsp); } else { RAS_DEV_ERR(ras_core->dev, "ras inject block %u failed %d\n", req->block_id, ret); ret = RAS_CMD__ERROR_ACCESS_DENIED; } return ret; } static struct ras_cmd_func_map ras_cmd_maps[] = { {RAS_CMD__INJECT_ERROR, ras_cmd_inject_error}, {RAS_CMD__GET_BLOCK_ECC_STATUS, ras_get_block_ecc_info}, {RAS_CMD__GET_BAD_PAGES, ras_cmd_get_bad_pages}, {RAS_CMD__CLEAR_BAD_PAGE_INFO, ras_cmd_clear_bad_page_info}, {RAS_CMD__RESET_ALL_ERROR_COUNTS, ras_cmd_reset_all_error_counts}, {RAS_CMD__GET_CPER_SNAPSHOT, ras_cmd_get_cper_snapshot}, {RAS_CMD__GET_CPER_RECORD, ras_cmd_get_cper_records}, {RAS_CMD__GET_BATCH_TRACE_SNAPSHOT, ras_cmd_get_batch_trace_snapshot}, {RAS_CMD__GET_BATCH_TRACE_RECORD, ras_cmd_get_batch_trace_records}, }; int rascore_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) { struct ras_cmd_func_map *ras_cmd = NULL; int i; for (i = 0; i < ARRAY_SIZE(ras_cmd_maps); i++) { if (cmd->cmd_id == ras_cmd_maps[i].cmd_id) { ras_cmd = &ras_cmd_maps[i]; break; } } if (!ras_cmd) return RAS_CMD__ERROR_UKNOWN_CMD; return ras_cmd->func(ras_core, cmd, data); } int ras_cmd_init(struct ras_core_context *ras_core) { return ras_cmd_add_device(ras_core); } int ras_cmd_fini(struct ras_core_context *ras_core) { ras_cmd_remove_device(ras_core); return 0; } int ras_cmd_query_interface_info(struct ras_core_context *ras_core, struct ras_query_interface_info_rsp *rsp) { rsp->ras_cmd_major_ver = RAS_CMD_MAJOR_VERSION; rsp->ras_cmd_minor_ver = RAS_CMD_MINOR_VERSION; return 0; } int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core, uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr) { struct umc_bank_addr umc_bank = {0}; int ret; ret = ras_umc_translate_soc_pa_and_bank(ras_core, &soc_pa, &umc_bank, false); if (ret) return RAS_CMD__ERROR_GENERIC; bank_addr->stack_id = umc_bank.stack_id; bank_addr->bank_group = umc_bank.bank_group; bank_addr->bank = umc_bank.bank; bank_addr->row = umc_bank.row; bank_addr->column = umc_bank.column; bank_addr->channel = umc_bank.channel; bank_addr->subchannel = umc_bank.subchannel; return 0; } int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core, struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa) { struct umc_bank_addr umc_bank = {0}; umc_bank.stack_id = bank_addr.stack_id; umc_bank.bank_group = bank_addr.bank_group; umc_bank.bank = bank_addr.bank; umc_bank.row = bank_addr.row; umc_bank.column = bank_addr.column; umc_bank.channel = bank_addr.channel; umc_bank.subchannel = bank_addr.subchannel; return ras_umc_translate_soc_pa_and_bank(ras_core, soc_pa, &umc_bank, true); } uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core) { return ras_core->ras_cmd.dev_handle; }