bluefield_edac.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Bluefield-specific EDAC driver.
  4. *
  5. * Copyright (c) 2019 Mellanox Technologies.
  6. */
  7. #include <linux/acpi.h>
  8. #include <linux/arm-smccc.h>
  9. #include <linux/bitfield.h>
  10. #include <linux/edac.h>
  11. #include <linux/io.h>
  12. #include <linux/module.h>
  13. #include <linux/platform_device.h>
  14. #include "edac_module.h"
  #define DRIVER_NAME			"bluefield-edac"

  /*
   * Register map of the Mellanox BlueField EMI (External Memory Interface)
   * as used by this driver. Offsets are relative to the EMI base address.
   */

  /* ECC error counters: single errors in bits 15:0, double errors in 31:16. */
  #define MLXBF_ECC_CNT			0x340
  #define MLXBF_ECC_CNT__SERR_CNT	GENMASK(15, 0)
  #define MLXBF_ECC_CNT__DERR_CNT	GENMASK(31, 16)

  /* Written by the driver to clear the errors it has reported. */
  #define MLXBF_ECC_ERR			0x348
  #define MLXBF_ECC_ERR__SECC		BIT(0)
  #define MLXBF_ECC_ERR__DECC		BIT(16)

  /* START asks the EMI to latch information about the last ECC error. */
  #define MLXBF_ECC_LATCH_SEL		0x354
  #define MLXBF_ECC_LATCH_SEL__START	BIT(24)

  /* Low (ADDR_0) and high (ADDR_1) 32 bits of the latched error address. */
  #define MLXBF_ERR_ADDR_0		0x358
  #define MLXBF_ERR_ADDR_1		0x37c

  /* Error type flags and syndrome of the latched error. */
  #define MLXBF_SYNDROM			0x35c
  #define MLXBF_SYNDROM__DERR		BIT(0)
  #define MLXBF_SYNDROM__SERR		BIT(1)
  #define MLXBF_SYNDROM__SYN		GENMASK(25, 16)

  /* Additional error info; ERR_PRANK is used to pick the failing DIMM. */
  #define MLXBF_ADD_INFO			0x364
  #define MLXBF_ADD_INFO__ERR_PRANK	GENMASK(9, 8)

  /* Driver limits: DIMM slots per memory controller and reported grain. */
  #define MLXBF_EDAC_MAX_DIMM_PER_MC	2
  #define MLXBF_EDAC_ERROR_GRAIN		8

  /* SMC function IDs used to reach the EMI registers through firmware. */
  #define MLXBF_WRITE_REG_32		(0x82000009)
  #define MLXBF_READ_REG_32		(0x8200000A)
  #define MLXBF_SIP_SVC_VERSION		(0x8200ff03)

  /* Status returned by the register access SMCs when access is refused. */
  #define MLXBF_SMCCC_ACCESS_VIOLATION	(-4)

  /* SiP service version checked before the register access SMCs are used. */
  #define MLXBF_SVC_REQ_MAJOR		0
  #define MLXBF_SVC_REQ_MINOR		3

  /*
   * Request MLXBF_SIP_GET_DIMM_INFO
   *
   * Retrieve information about the DIMM in a given slot.
   *
   * Call register usage:
   * a0: MLXBF_SIP_GET_DIMM_INFO
   * a1: (Memory controller index) << 16 | (DIMM index in memory controller)
   * a2-7: not used.
   *
   * Return status:
   * a0: MLXBF_DIMM_INFO word (fields defined below) describing the DIMM.
   * a1-3: not used.
   */
  #define MLXBF_SIP_GET_DIMM_INFO	0x82000008

  /* Layout of the DIMM information word returned by the SMC. */
  #define MLXBF_DIMM_INFO__SIZE_GB	GENMASK_ULL(15, 0)
  #define MLXBF_DIMM_INFO__IS_RDIMM	BIT(16)
  #define MLXBF_DIMM_INFO__IS_LRDIMM	BIT(17)
  #define MLXBF_DIMM_INFO__IS_NVDIMM	BIT(18)
  #define MLXBF_DIMM_INFO__RANKS		GENMASK_ULL(23, 21)
  #define MLXBF_DIMM_INFO__PACKAGE_X	GENMASK_ULL(31, 24)
  65. struct bluefield_edac_priv {
  66. /* pointer to device structure */
  67. struct device *dev;
  68. int dimm_ranks[MLXBF_EDAC_MAX_DIMM_PER_MC];
  69. void __iomem *emi_base;
  70. int dimm_per_mc;
  71. /* access to secure regs supported */
  72. bool svc_sreg_support;
  73. /* SMC table# for secure regs access */
  74. u32 sreg_tbl;
  75. };
  76. static u64 smc_call1(u64 smc_op, u64 smc_arg)
  77. {
  78. struct arm_smccc_res res;
  79. arm_smccc_smc(smc_op, smc_arg, 0, 0, 0, 0, 0, 0, &res);
  80. return res.a0;
  81. }
  82. static int secure_readl(void __iomem *addr, u32 *result, u32 sreg_tbl)
  83. {
  84. struct arm_smccc_res res;
  85. int status;
  86. arm_smccc_smc(MLXBF_READ_REG_32, sreg_tbl, (uintptr_t)addr,
  87. 0, 0, 0, 0, 0, &res);
  88. status = res.a0;
  89. if (status == SMCCC_RET_NOT_SUPPORTED ||
  90. status == MLXBF_SMCCC_ACCESS_VIOLATION)
  91. return -1;
  92. *result = (u32)res.a1;
  93. return 0;
  94. }
  95. static int secure_writel(void __iomem *addr, u32 data, u32 sreg_tbl)
  96. {
  97. struct arm_smccc_res res;
  98. int status;
  99. arm_smccc_smc(MLXBF_WRITE_REG_32, sreg_tbl, data, (uintptr_t)addr,
  100. 0, 0, 0, 0, &res);
  101. status = res.a0;
  102. if (status == SMCCC_RET_NOT_SUPPORTED ||
  103. status == MLXBF_SMCCC_ACCESS_VIOLATION)
  104. return -1;
  105. else
  106. return 0;
  107. }
  108. static int bluefield_edac_readl(struct bluefield_edac_priv *priv, u32 offset, u32 *result)
  109. {
  110. void __iomem *addr;
  111. int err = 0;
  112. addr = priv->emi_base + offset;
  113. if (priv->svc_sreg_support)
  114. err = secure_readl(addr, result, priv->sreg_tbl);
  115. else
  116. *result = readl(addr);
  117. return err;
  118. }
  119. static int bluefield_edac_writel(struct bluefield_edac_priv *priv, u32 offset, u32 data)
  120. {
  121. void __iomem *addr;
  122. int err = 0;
  123. addr = priv->emi_base + offset;
  124. if (priv->svc_sreg_support)
  125. err = secure_writel(addr, data, priv->sreg_tbl);
  126. else
  127. writel(data, addr);
  128. return err;
  129. }
  130. /*
  131. * Gather the ECC information from the External Memory Interface registers
  132. * and report it to the edac handler.
  133. */
  134. static void bluefield_gather_report_ecc(struct mem_ctl_info *mci,
  135. int error_cnt,
  136. int is_single_ecc)
  137. {
  138. struct bluefield_edac_priv *priv = mci->pvt_info;
  139. u32 dram_additional_info, err_prank, edea0, edea1;
  140. u32 ecc_latch_select, dram_syndrom, serr, derr, syndrom;
  141. enum hw_event_mc_err_type ecc_type;
  142. u64 ecc_dimm_addr;
  143. int ecc_dimm, err;
  144. ecc_type = is_single_ecc ? HW_EVENT_ERR_CORRECTED :
  145. HW_EVENT_ERR_UNCORRECTED;
  146. /*
  147. * Tell the External Memory Interface to populate the relevant
  148. * registers with information about the last ECC error occurrence.
  149. */
  150. ecc_latch_select = MLXBF_ECC_LATCH_SEL__START;
  151. err = bluefield_edac_writel(priv, MLXBF_ECC_LATCH_SEL, ecc_latch_select);
  152. if (err)
  153. dev_err(priv->dev, "ECC latch select write failed.\n");
  154. /*
  155. * Verify that the ECC reported info in the registers is of the
  156. * same type as the one asked to report. If not, just report the
  157. * error without the detailed information.
  158. */
  159. err = bluefield_edac_readl(priv, MLXBF_SYNDROM, &dram_syndrom);
  160. if (err) {
  161. dev_err(priv->dev, "DRAM syndrom read failed.\n");
  162. return;
  163. }
  164. serr = FIELD_GET(MLXBF_SYNDROM__SERR, dram_syndrom);
  165. derr = FIELD_GET(MLXBF_SYNDROM__DERR, dram_syndrom);
  166. syndrom = FIELD_GET(MLXBF_SYNDROM__SYN, dram_syndrom);
  167. if ((is_single_ecc && !serr) || (!is_single_ecc && !derr)) {
  168. edac_mc_handle_error(ecc_type, mci, error_cnt, 0, 0, 0,
  169. 0, 0, -1, mci->ctl_name, "");
  170. return;
  171. }
  172. err = bluefield_edac_readl(priv, MLXBF_ADD_INFO, &dram_additional_info);
  173. if (err) {
  174. dev_err(priv->dev, "DRAM additional info read failed.\n");
  175. return;
  176. }
  177. err_prank = FIELD_GET(MLXBF_ADD_INFO__ERR_PRANK, dram_additional_info);
  178. ecc_dimm = (err_prank >= 2 && priv->dimm_ranks[0] <= 2) ? 1 : 0;
  179. err = bluefield_edac_readl(priv, MLXBF_ERR_ADDR_0, &edea0);
  180. if (err) {
  181. dev_err(priv->dev, "Error addr 0 read failed.\n");
  182. return;
  183. }
  184. err = bluefield_edac_readl(priv, MLXBF_ERR_ADDR_1, &edea1);
  185. if (err) {
  186. dev_err(priv->dev, "Error addr 1 read failed.\n");
  187. return;
  188. }
  189. ecc_dimm_addr = ((u64)edea1 << 32) | edea0;
  190. edac_mc_handle_error(ecc_type, mci, error_cnt,
  191. PFN_DOWN(ecc_dimm_addr),
  192. offset_in_page(ecc_dimm_addr),
  193. syndrom, ecc_dimm, 0, 0, mci->ctl_name, "");
  194. }
  195. static void bluefield_edac_check(struct mem_ctl_info *mci)
  196. {
  197. struct bluefield_edac_priv *priv = mci->pvt_info;
  198. u32 ecc_count, single_error_count, double_error_count, ecc_error = 0;
  199. int err;
  200. /*
  201. * The memory controller might not be initialized by the firmware
  202. * when there isn't memory, which may lead to bad register readings.
  203. */
  204. if (mci->edac_cap == EDAC_FLAG_NONE)
  205. return;
  206. err = bluefield_edac_readl(priv, MLXBF_ECC_CNT, &ecc_count);
  207. if (err) {
  208. dev_err(priv->dev, "ECC count read failed.\n");
  209. return;
  210. }
  211. single_error_count = FIELD_GET(MLXBF_ECC_CNT__SERR_CNT, ecc_count);
  212. double_error_count = FIELD_GET(MLXBF_ECC_CNT__DERR_CNT, ecc_count);
  213. if (single_error_count) {
  214. ecc_error |= MLXBF_ECC_ERR__SECC;
  215. bluefield_gather_report_ecc(mci, single_error_count, 1);
  216. }
  217. if (double_error_count) {
  218. ecc_error |= MLXBF_ECC_ERR__DECC;
  219. bluefield_gather_report_ecc(mci, double_error_count, 0);
  220. }
  221. /* Write to clear reported errors. */
  222. if (ecc_count) {
  223. err = bluefield_edac_writel(priv, MLXBF_ECC_ERR, ecc_error);
  224. if (err)
  225. dev_err(priv->dev, "ECC Error write failed.\n");
  226. }
  227. }
  228. /* Initialize the DIMMs information for the given memory controller. */
  229. static void bluefield_edac_init_dimms(struct mem_ctl_info *mci)
  230. {
  231. struct bluefield_edac_priv *priv = mci->pvt_info;
  232. u64 mem_ctrl_idx = mci->mc_idx;
  233. struct dimm_info *dimm;
  234. u64 smc_info, smc_arg;
  235. int is_empty = 1, i;
  236. for (i = 0; i < priv->dimm_per_mc; i++) {
  237. dimm = mci->dimms[i];
  238. smc_arg = mem_ctrl_idx << 16 | i;
  239. smc_info = smc_call1(MLXBF_SIP_GET_DIMM_INFO, smc_arg);
  240. if (!FIELD_GET(MLXBF_DIMM_INFO__SIZE_GB, smc_info)) {
  241. dimm->mtype = MEM_EMPTY;
  242. continue;
  243. }
  244. is_empty = 0;
  245. dimm->edac_mode = EDAC_SECDED;
  246. if (FIELD_GET(MLXBF_DIMM_INFO__IS_NVDIMM, smc_info))
  247. dimm->mtype = MEM_NVDIMM;
  248. else if (FIELD_GET(MLXBF_DIMM_INFO__IS_LRDIMM, smc_info))
  249. dimm->mtype = MEM_LRDDR4;
  250. else if (FIELD_GET(MLXBF_DIMM_INFO__IS_RDIMM, smc_info))
  251. dimm->mtype = MEM_RDDR4;
  252. else
  253. dimm->mtype = MEM_DDR4;
  254. dimm->nr_pages =
  255. FIELD_GET(MLXBF_DIMM_INFO__SIZE_GB, smc_info) *
  256. (SZ_1G / PAGE_SIZE);
  257. dimm->grain = MLXBF_EDAC_ERROR_GRAIN;
  258. /* Mem controller for BlueField only supports x4, x8 and x16 */
  259. switch (FIELD_GET(MLXBF_DIMM_INFO__PACKAGE_X, smc_info)) {
  260. case 4:
  261. dimm->dtype = DEV_X4;
  262. break;
  263. case 8:
  264. dimm->dtype = DEV_X8;
  265. break;
  266. case 16:
  267. dimm->dtype = DEV_X16;
  268. break;
  269. default:
  270. dimm->dtype = DEV_UNKNOWN;
  271. }
  272. priv->dimm_ranks[i] =
  273. FIELD_GET(MLXBF_DIMM_INFO__RANKS, smc_info);
  274. }
  275. if (is_empty)
  276. mci->edac_cap = EDAC_FLAG_NONE;
  277. else
  278. mci->edac_cap = EDAC_FLAG_SECDED;
  279. }
  280. static int bluefield_edac_mc_probe(struct platform_device *pdev)
  281. {
  282. struct bluefield_edac_priv *priv;
  283. struct device *dev = &pdev->dev;
  284. struct edac_mc_layer layers[1];
  285. struct arm_smccc_res res;
  286. struct mem_ctl_info *mci;
  287. struct resource *emi_res;
  288. unsigned int mc_idx, dimm_count;
  289. int rc, ret;
  290. /* Read the MSS (Memory SubSystem) index from ACPI table. */
  291. if (device_property_read_u32(dev, "mss_number", &mc_idx)) {
  292. dev_warn(dev, "bf_edac: MSS number unknown\n");
  293. return -EINVAL;
  294. }
  295. /* Read the DIMMs per MC from ACPI table. */
  296. if (device_property_read_u32(dev, "dimm_per_mc", &dimm_count)) {
  297. dev_warn(dev, "bf_edac: DIMMs per MC unknown\n");
  298. return -EINVAL;
  299. }
  300. if (dimm_count > MLXBF_EDAC_MAX_DIMM_PER_MC) {
  301. dev_warn(dev, "bf_edac: DIMMs per MC not valid\n");
  302. return -EINVAL;
  303. }
  304. emi_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
  305. if (!emi_res)
  306. return -EINVAL;
  307. layers[0].type = EDAC_MC_LAYER_SLOT;
  308. layers[0].size = dimm_count;
  309. layers[0].is_virt_csrow = true;
  310. mci = edac_mc_alloc(mc_idx, ARRAY_SIZE(layers), layers, sizeof(*priv));
  311. if (!mci)
  312. return -ENOMEM;
  313. priv = mci->pvt_info;
  314. priv->dev = dev;
  315. /*
  316. * The "sec_reg_block" property in the ACPI table determines the method
  317. * the driver uses to access the EMI registers:
  318. * a) property is not present - directly access registers via readl/writel
  319. * b) property is present - indirectly access registers via SMC calls
  320. * (assuming required Silicon Provider service version found)
  321. */
  322. if (device_property_read_u32(dev, "sec_reg_block", &priv->sreg_tbl)) {
  323. priv->svc_sreg_support = false;
  324. } else {
  325. /*
  326. * Check for minimum required Arm Silicon Provider (SiP) service
  327. * version, ensuring support of required SMC function IDs.
  328. */
  329. arm_smccc_smc(MLXBF_SIP_SVC_VERSION, 0, 0, 0, 0, 0, 0, 0, &res);
  330. if (res.a0 == MLXBF_SVC_REQ_MAJOR &&
  331. res.a1 >= MLXBF_SVC_REQ_MINOR) {
  332. priv->svc_sreg_support = true;
  333. } else {
  334. dev_err(dev, "Required SMCs are not supported.\n");
  335. ret = -EINVAL;
  336. goto err;
  337. }
  338. }
  339. priv->dimm_per_mc = dimm_count;
  340. if (!priv->svc_sreg_support) {
  341. priv->emi_base = devm_ioremap_resource(dev, emi_res);
  342. if (IS_ERR(priv->emi_base)) {
  343. dev_err(dev, "failed to map EMI IO resource\n");
  344. ret = PTR_ERR(priv->emi_base);
  345. goto err;
  346. }
  347. } else {
  348. priv->emi_base = (void __iomem *)emi_res->start;
  349. }
  350. mci->pdev = dev;
  351. mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_RDDR4 |
  352. MEM_FLAG_LRDDR4 | MEM_FLAG_NVDIMM;
  353. mci->edac_ctl_cap = EDAC_FLAG_SECDED;
  354. mci->mod_name = DRIVER_NAME;
  355. mci->ctl_name = "BlueField_Memory_Controller";
  356. mci->dev_name = dev_name(dev);
  357. mci->edac_check = bluefield_edac_check;
  358. /* Initialize mci with the actual populated DIMM information. */
  359. bluefield_edac_init_dimms(mci);
  360. platform_set_drvdata(pdev, mci);
  361. /* Register with EDAC core */
  362. rc = edac_mc_add_mc(mci);
  363. if (rc) {
  364. dev_err(dev, "failed to register with EDAC core\n");
  365. ret = rc;
  366. goto err;
  367. }
  368. /* Only POLL mode supported so far. */
  369. edac_op_state = EDAC_OPSTATE_POLL;
  370. return 0;
  371. err:
  372. edac_mc_free(mci);
  373. return ret;
  374. }
  375. static void bluefield_edac_mc_remove(struct platform_device *pdev)
  376. {
  377. struct mem_ctl_info *mci = platform_get_drvdata(pdev);
  378. edac_mc_del_mc(&pdev->dev);
  379. edac_mc_free(mci);
  380. }
  381. static const struct acpi_device_id bluefield_mc_acpi_ids[] = {
  382. {"MLNXBF08", 0},
  383. {}
  384. };
  385. MODULE_DEVICE_TABLE(acpi, bluefield_mc_acpi_ids);
  386. static struct platform_driver bluefield_edac_mc_driver = {
  387. .driver = {
  388. .name = DRIVER_NAME,
  389. .acpi_match_table = bluefield_mc_acpi_ids,
  390. },
  391. .probe = bluefield_edac_mc_probe,
  392. .remove = bluefield_edac_mc_remove,
  393. };
  394. module_platform_driver(bluefield_edac_mc_driver);
  395. MODULE_DESCRIPTION("Mellanox BlueField memory edac driver");
  396. MODULE_AUTHOR("Mellanox Technologies");
  397. MODULE_LICENSE("GPL v2");