skx_common.c 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. *
  4. * Shared code by both skx_edac and i10nm_edac. Originally split out
  5. * from the skx_edac driver.
  6. *
  7. * This file is linked into both skx_edac and i10nm_edac drivers. In
  8. * order to avoid link errors, this file must be like a pure library
  9. * without including symbols and defines which would otherwise conflict,
  10. * when linked once into a module and into a built-in object, at the
  11. * same time. For example, __this_module symbol references when that
  12. * file is being linked into a built-in object.
  13. *
  14. * Copyright (c) 2018, Intel Corporation.
  15. */
  16. #include <linux/topology.h>
  17. #include <linux/acpi.h>
  18. #include <linux/dmi.h>
  19. #include <linux/adxl.h>
  20. #include <linux/overflow.h>
  21. #include <acpi/nfit.h>
  22. #include <asm/mce.h>
  23. #include <asm/uv/uv.h>
  24. #include "edac_module.h"
  25. #include "skx_common.h"
  26. static const char * const component_names[] = {
  27. [INDEX_SOCKET] = "ProcessorSocketId",
  28. [INDEX_MEMCTRL] = "MemoryControllerId",
  29. [INDEX_CHANNEL] = "ChannelId",
  30. [INDEX_DIMM] = "DimmSlotId",
  31. [INDEX_CS] = "ChipSelect",
  32. [INDEX_NM_MEMCTRL] = "NmMemoryControllerId",
  33. [INDEX_NM_CHANNEL] = "NmChannelId",
  34. [INDEX_NM_DIMM] = "NmDimmSlotId",
  35. [INDEX_NM_CS] = "NmChipSelect",
  36. };
  37. static int component_indices[ARRAY_SIZE(component_names)];
  38. static int adxl_component_count;
  39. static const char * const *adxl_component_names;
  40. static u64 *adxl_values;
  41. static char *adxl_msg;
  42. static unsigned long adxl_nm_bitmap;
  43. static char skx_msg[MSG_SIZE];
  44. static skx_decode_f driver_decode;
  45. static skx_show_retry_log_f skx_show_retry_rd_err_log;
  46. static u64 skx_tolm, skx_tohm;
  47. static LIST_HEAD(dev_edac_list);
  48. static bool skx_mem_cfg_2lm;
  49. static struct res_config *skx_res_cfg;
  50. int skx_adxl_get(void)
  51. {
  52. const char * const *names;
  53. int i, j;
  54. names = adxl_get_component_names();
  55. if (!names) {
  56. skx_printk(KERN_NOTICE, "No firmware support for address translation.\n");
  57. return -ENODEV;
  58. }
  59. for (i = 0; i < INDEX_MAX; i++) {
  60. for (j = 0; names[j]; j++) {
  61. if (!strcmp(component_names[i], names[j])) {
  62. component_indices[i] = j;
  63. if (i >= INDEX_NM_FIRST)
  64. adxl_nm_bitmap |= 1 << i;
  65. break;
  66. }
  67. }
  68. if (!names[j] && i < INDEX_NM_FIRST)
  69. goto err;
  70. }
  71. if (skx_mem_cfg_2lm) {
  72. if (!adxl_nm_bitmap)
  73. skx_printk(KERN_NOTICE, "Not enough ADXL components for 2-level memory.\n");
  74. else
  75. edac_dbg(2, "adxl_nm_bitmap: 0x%lx\n", adxl_nm_bitmap);
  76. }
  77. adxl_component_names = names;
  78. while (*names++)
  79. adxl_component_count++;
  80. adxl_values = kcalloc(adxl_component_count, sizeof(*adxl_values),
  81. GFP_KERNEL);
  82. if (!adxl_values) {
  83. adxl_component_count = 0;
  84. return -ENOMEM;
  85. }
  86. adxl_msg = kzalloc(MSG_SIZE, GFP_KERNEL);
  87. if (!adxl_msg) {
  88. adxl_component_count = 0;
  89. kfree(adxl_values);
  90. return -ENOMEM;
  91. }
  92. return 0;
  93. err:
  94. skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ",
  95. component_names[i]);
  96. for (j = 0; names[j]; j++)
  97. skx_printk(KERN_CONT, "%s ", names[j]);
  98. skx_printk(KERN_CONT, "\n");
  99. return -ENODEV;
  100. }
  101. EXPORT_SYMBOL_GPL(skx_adxl_get);
  102. void skx_adxl_put(void)
  103. {
  104. adxl_component_count = 0;
  105. kfree(adxl_values);
  106. kfree(adxl_msg);
  107. }
  108. EXPORT_SYMBOL_GPL(skx_adxl_put);
  109. void skx_init_mc_mapping(struct skx_dev *d)
  110. {
  111. /*
  112. * By default, the BIOS presents all memory controllers within each
  113. * socket to the EDAC driver. The physical indices are the same as
  114. * the logical indices of the memory controllers enumerated by the
  115. * EDAC driver.
  116. */
  117. for (int i = 0; i < d->num_imc; i++)
  118. d->imc[i].mc_mapping = i;
  119. }
  120. EXPORT_SYMBOL_GPL(skx_init_mc_mapping);
  121. void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc)
  122. {
  123. edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n",
  124. pmc, lmc);
  125. d->imc[lmc].mc_mapping = pmc;
  126. }
  127. EXPORT_SYMBOL_GPL(skx_set_mc_mapping);
  128. static int skx_get_mc_mapping(struct skx_dev *d, u8 pmc)
  129. {
  130. for (int lmc = 0; lmc < d->num_imc; lmc++) {
  131. if (d->imc[lmc].mc_mapping == pmc) {
  132. edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n",
  133. pmc, lmc);
  134. return lmc;
  135. }
  136. }
  137. return -1;
  138. }
  139. static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
  140. {
  141. int i, lmc, len = 0;
  142. struct skx_dev *d;
  143. if (res->addr >= skx_tohm || (res->addr >= skx_tolm &&
  144. res->addr < BIT_ULL(32))) {
  145. edac_dbg(0, "Address 0x%llx out of range\n", res->addr);
  146. return false;
  147. }
  148. if (adxl_decode(res->addr, adxl_values)) {
  149. edac_dbg(0, "Failed to decode 0x%llx\n", res->addr);
  150. return false;
  151. }
  152. /*
  153. * GNR with a Flat2LM memory configuration may mistakenly classify
  154. * a near-memory error(DDR5) as a far-memory error(CXL), resulting
  155. * in the incorrect selection of decoded ADXL components.
  156. * To address this, prefetch the decoded far-memory controller ID
  157. * and adjust the error source to near-memory if the far-memory
  158. * controller ID is invalid.
  159. */
  160. if (skx_res_cfg && skx_res_cfg->type == GNR && err_src == ERR_SRC_2LM_FM) {
  161. res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
  162. if (res->imc == -1) {
  163. err_src = ERR_SRC_2LM_NM;
  164. edac_dbg(0, "Adjust the error source to near-memory.\n");
  165. }
  166. }
  167. res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]];
  168. if (err_src == ERR_SRC_2LM_NM) {
  169. res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ?
  170. (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1;
  171. res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ?
  172. (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1;
  173. res->dimm = (adxl_nm_bitmap & BIT_NM_DIMM) ?
  174. (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1;
  175. res->cs = (adxl_nm_bitmap & BIT_NM_CS) ?
  176. (int)adxl_values[component_indices[INDEX_NM_CS]] : -1;
  177. } else {
  178. res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
  179. res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
  180. res->dimm = (int)adxl_values[component_indices[INDEX_DIMM]];
  181. res->cs = (int)adxl_values[component_indices[INDEX_CS]];
  182. }
  183. if (res->imc < 0) {
  184. skx_printk(KERN_ERR, "Bad imc %d\n", res->imc);
  185. return false;
  186. }
  187. list_for_each_entry(d, &dev_edac_list, list) {
  188. if (d->imc[0].src_id == res->socket) {
  189. res->dev = d;
  190. break;
  191. }
  192. }
  193. if (!res->dev) {
  194. skx_printk(KERN_ERR, "No device for src_id %d imc %d\n",
  195. res->socket, res->imc);
  196. return false;
  197. }
  198. lmc = skx_get_mc_mapping(d, res->imc);
  199. if (lmc < 0) {
  200. skx_printk(KERN_ERR, "No lmc for imc %d\n", res->imc);
  201. return false;
  202. }
  203. res->imc = lmc;
  204. for (i = 0; i < adxl_component_count; i++) {
  205. if (adxl_values[i] == ~0x0ull)
  206. continue;
  207. len += snprintf(adxl_msg + len, MSG_SIZE - len, " %s:0x%llx",
  208. adxl_component_names[i], adxl_values[i]);
  209. if (MSG_SIZE - len <= 0)
  210. break;
  211. }
  212. res->decoded_by_adxl = true;
  213. return true;
  214. }
  215. void skx_set_mem_cfg(bool mem_cfg_2lm)
  216. {
  217. skx_mem_cfg_2lm = mem_cfg_2lm;
  218. }
  219. EXPORT_SYMBOL_GPL(skx_set_mem_cfg);
  220. void skx_set_res_cfg(struct res_config *cfg)
  221. {
  222. skx_res_cfg = cfg;
  223. }
  224. EXPORT_SYMBOL_GPL(skx_set_res_cfg);
  225. void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log)
  226. {
  227. driver_decode = decode;
  228. skx_show_retry_rd_err_log = show_retry_log;
  229. }
  230. EXPORT_SYMBOL_GPL(skx_set_decode);
  231. static int skx_get_pkg_id(struct skx_dev *d, u8 *id)
  232. {
  233. int node;
  234. int cpu;
  235. node = pcibus_to_node(d->util_all->bus);
  236. if (numa_valid_node(node)) {
  237. for_each_cpu(cpu, cpumask_of_pcibus(d->util_all->bus)) {
  238. struct cpuinfo_x86 *c = &cpu_data(cpu);
  239. if (c->initialized && cpu_to_node(cpu) == node) {
  240. *id = topology_physical_package_id(cpu);
  241. return 0;
  242. }
  243. }
  244. }
  245. skx_printk(KERN_ERR, "Failed to get package ID from NUMA information\n");
  246. return -ENODEV;
  247. }
  248. int skx_get_src_id(struct skx_dev *d, int off, u8 *id)
  249. {
  250. u32 reg;
  251. /*
  252. * The 3-bit source IDs in PCI configuration space registers are limited
  253. * to 8 unique IDs, and each ID is local to a UPI/QPI domain.
  254. *
  255. * Source IDs cannot be used to map devices to sockets on UV systems
  256. * because they can exceed 8 sockets and have multiple UPI/QPI domains
  257. * with identical, repeating source IDs.
  258. */
  259. if (is_uv_system())
  260. return skx_get_pkg_id(d, id);
  261. if (pci_read_config_dword(d->util_all, off, &reg)) {
  262. skx_printk(KERN_ERR, "Failed to read src id\n");
  263. return -ENODEV;
  264. }
  265. *id = GET_BITFIELD(reg, 12, 14);
  266. return 0;
  267. }
  268. EXPORT_SYMBOL_GPL(skx_get_src_id);
  269. static int get_width(u32 mtr)
  270. {
  271. switch (GET_BITFIELD(mtr, 8, 9)) {
  272. case 0:
  273. return DEV_X4;
  274. case 1:
  275. return DEV_X8;
  276. case 2:
  277. return DEV_X16;
  278. }
  279. return DEV_UNKNOWN;
  280. }
  281. /*
  282. * We use the per-socket device @cfg->did to count how many sockets are present,
  283. * and to detemine which PCI buses are associated with each socket. Allocate
  284. * and build the full list of all the skx_dev structures that we need here.
  285. */
  286. int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list)
  287. {
  288. int ndev = 0, imc_num = cfg->ddr_imc_num + cfg->hbm_imc_num;
  289. struct pci_dev *pdev, *prev;
  290. struct skx_dev *d;
  291. u32 reg;
  292. prev = NULL;
  293. for (;;) {
  294. pdev = pci_get_device(PCI_VENDOR_ID_INTEL, cfg->decs_did, prev);
  295. if (!pdev)
  296. break;
  297. ndev++;
  298. d = kzalloc_flex(*d, imc, imc_num);
  299. if (!d) {
  300. pci_dev_put(pdev);
  301. return -ENOMEM;
  302. }
  303. if (pci_read_config_dword(pdev, cfg->busno_cfg_offset, &reg)) {
  304. kfree(d);
  305. pci_dev_put(pdev);
  306. skx_printk(KERN_ERR, "Failed to read bus idx\n");
  307. return -ENODEV;
  308. }
  309. d->bus[0] = GET_BITFIELD(reg, 0, 7);
  310. d->bus[1] = GET_BITFIELD(reg, 8, 15);
  311. if (cfg->type == SKX) {
  312. d->seg = pci_domain_nr(pdev->bus);
  313. d->bus[2] = GET_BITFIELD(reg, 16, 23);
  314. d->bus[3] = GET_BITFIELD(reg, 24, 31);
  315. } else {
  316. d->seg = GET_BITFIELD(reg, 16, 23);
  317. }
  318. d->num_imc = imc_num;
  319. edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x, imcs %d\n",
  320. d->bus[0], d->bus[1], d->bus[2], d->bus[3], imc_num);
  321. list_add_tail(&d->list, &dev_edac_list);
  322. prev = pdev;
  323. skx_init_mc_mapping(d);
  324. }
  325. if (list)
  326. *list = &dev_edac_list;
  327. return ndev;
  328. }
  329. EXPORT_SYMBOL_GPL(skx_get_all_bus_mappings);
  330. struct list_head *skx_get_edac_list(void)
  331. {
  332. return &dev_edac_list;
  333. }
  334. EXPORT_SYMBOL_GPL(skx_get_edac_list);
  335. int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm)
  336. {
  337. struct pci_dev *pdev;
  338. u32 reg;
  339. pdev = pci_get_device(PCI_VENDOR_ID_INTEL, did, NULL);
  340. if (!pdev) {
  341. edac_dbg(2, "Can't get tolm/tohm\n");
  342. return -ENODEV;
  343. }
  344. if (pci_read_config_dword(pdev, off[0], &reg)) {
  345. skx_printk(KERN_ERR, "Failed to read tolm\n");
  346. goto fail;
  347. }
  348. skx_tolm = reg;
  349. if (pci_read_config_dword(pdev, off[1], &reg)) {
  350. skx_printk(KERN_ERR, "Failed to read lower tohm\n");
  351. goto fail;
  352. }
  353. skx_tohm = reg;
  354. if (pci_read_config_dword(pdev, off[2], &reg)) {
  355. skx_printk(KERN_ERR, "Failed to read upper tohm\n");
  356. goto fail;
  357. }
  358. skx_tohm |= (u64)reg << 32;
  359. pci_dev_put(pdev);
  360. *tolm = skx_tolm;
  361. *tohm = skx_tohm;
  362. edac_dbg(2, "tolm = 0x%llx tohm = 0x%llx\n", skx_tolm, skx_tohm);
  363. return 0;
  364. fail:
  365. pci_dev_put(pdev);
  366. return -ENODEV;
  367. }
  368. EXPORT_SYMBOL_GPL(skx_get_hi_lo);
  369. void skx_set_hi_lo(u64 tolm, u64 tohm)
  370. {
  371. skx_tolm = tolm;
  372. skx_tohm = tohm;
  373. }
  374. EXPORT_SYMBOL_GPL(skx_set_hi_lo);
  375. static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add,
  376. int minval, int maxval, const char *name)
  377. {
  378. u32 val = GET_BITFIELD(reg, lobit, hibit);
  379. if (val < minval || val > maxval) {
  380. edac_dbg(2, "bad %s = %d (raw=0x%x)\n", name, val, reg);
  381. return -EINVAL;
  382. }
  383. return val + add;
  384. }
  385. #define numrank(reg) skx_get_dimm_attr(reg, 12, 13, 0, 0, 2, "ranks")
  386. #define numrow(reg) skx_get_dimm_attr(reg, 2, 4, 12, 1, 7, "rows")
  387. #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols")
  388. int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
  389. struct skx_imc *imc, int chan, int dimmno,
  390. struct res_config *cfg)
  391. {
  392. int banks, ranks, rows, cols, npages;
  393. enum mem_type mtype;
  394. u64 size;
  395. ranks = numrank(mtr);
  396. rows = numrow(mtr);
  397. cols = imc->hbm_mc ? 6 : numcol(mtr);
  398. if (imc->hbm_mc) {
  399. banks = 32;
  400. mtype = MEM_HBM2;
  401. } else if (cfg->support_ddr5) {
  402. banks = 32;
  403. mtype = MEM_DDR5;
  404. } else {
  405. banks = 16;
  406. mtype = MEM_DDR4;
  407. }
  408. /*
  409. * Compute size in 8-byte (2^3) words, then shift to MiB (2^20)
  410. */
  411. size = ((1ull << (rows + cols + ranks)) * banks) >> (20 - 3);
  412. npages = MiB_TO_PAGES(size);
  413. edac_dbg(0, "mc#%d: channel %d, dimm %d, %lld MiB (%d pages) bank: %d, rank: %d, row: 0x%x, col: 0x%x\n",
  414. imc->mc, chan, dimmno, size, npages,
  415. banks, 1 << ranks, rows, cols);
  416. imc->chan[chan].dimms[dimmno].close_pg = GET_BITFIELD(mcmtr, 0, 0);
  417. imc->chan[chan].dimms[dimmno].bank_xor_enable = GET_BITFIELD(mcmtr, 9, 9);
  418. imc->chan[chan].dimms[dimmno].fine_grain_bank = GET_BITFIELD(amap, 0, 0);
  419. imc->chan[chan].dimms[dimmno].rowbits = rows;
  420. imc->chan[chan].dimms[dimmno].colbits = cols;
  421. dimm->nr_pages = npages;
  422. dimm->grain = 32;
  423. dimm->dtype = get_width(mtr);
  424. dimm->mtype = mtype;
  425. dimm->edac_mode = EDAC_SECDED; /* likely better than this */
  426. if (imc->hbm_mc)
  427. snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_HBMC#%u_Chan#%u",
  428. imc->src_id, imc->lmc, chan);
  429. else
  430. snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
  431. imc->src_id, imc->lmc, chan, dimmno);
  432. return 1;
  433. }
  434. EXPORT_SYMBOL_GPL(skx_get_dimm_info);
  435. int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
  436. int chan, int dimmno, const char *mod_str)
  437. {
  438. int smbios_handle;
  439. u32 dev_handle;
  440. u16 flags;
  441. u64 size = 0;
  442. dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc,
  443. imc->src_id, 0);
  444. smbios_handle = nfit_get_smbios_id(dev_handle, &flags);
  445. if (smbios_handle == -EOPNOTSUPP) {
  446. pr_warn_once("%s: Can't find size of NVDIMM. Try enabling CONFIG_ACPI_NFIT\n", mod_str);
  447. goto unknown_size;
  448. }
  449. if (smbios_handle < 0) {
  450. skx_printk(KERN_ERR, "Can't find handle for NVDIMM ADR=0x%x\n", dev_handle);
  451. goto unknown_size;
  452. }
  453. if (flags & ACPI_NFIT_MEM_MAP_FAILED) {
  454. skx_printk(KERN_ERR, "NVDIMM ADR=0x%x is not mapped\n", dev_handle);
  455. goto unknown_size;
  456. }
  457. size = dmi_memdev_size(smbios_handle);
  458. if (size == ~0ull)
  459. skx_printk(KERN_ERR, "Can't find size for NVDIMM ADR=0x%x/SMBIOS=0x%x\n",
  460. dev_handle, smbios_handle);
  461. unknown_size:
  462. dimm->nr_pages = size >> PAGE_SHIFT;
  463. dimm->grain = 32;
  464. dimm->dtype = DEV_UNKNOWN;
  465. dimm->mtype = MEM_NVDIMM;
  466. dimm->edac_mode = EDAC_SECDED; /* likely better than this */
  467. edac_dbg(0, "mc#%d: channel %d, dimm %d, %llu MiB (%u pages)\n",
  468. imc->mc, chan, dimmno, size >> 20, dimm->nr_pages);
  469. snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
  470. imc->src_id, imc->lmc, chan, dimmno);
  471. return (size == 0 || size == ~0ull) ? 0 : 1;
  472. }
  473. EXPORT_SYMBOL_GPL(skx_get_nvdimm_info);
  474. int skx_register_mci(struct skx_imc *imc, struct device *dev,
  475. const char *dev_name, const char *ctl_name,
  476. const char *mod_str, get_dimm_config_f get_dimm_config,
  477. struct res_config *cfg)
  478. {
  479. struct mem_ctl_info *mci;
  480. struct edac_mc_layer layers[2];
  481. struct skx_pvt *pvt;
  482. int rc;
  483. /* Allocate a new MC control structure */
  484. layers[0].type = EDAC_MC_LAYER_CHANNEL;
  485. layers[0].size = imc->num_channels;
  486. layers[0].is_virt_csrow = false;
  487. layers[1].type = EDAC_MC_LAYER_SLOT;
  488. layers[1].size = imc->num_dimms;
  489. layers[1].is_virt_csrow = true;
  490. mci = edac_mc_alloc(imc->mc, ARRAY_SIZE(layers), layers,
  491. sizeof(struct skx_pvt));
  492. if (unlikely(!mci))
  493. return -ENOMEM;
  494. edac_dbg(0, "MC#%d: mci = %p\n", imc->mc, mci);
  495. /* Associate skx_dev and mci for future usage */
  496. imc->mci = mci;
  497. pvt = mci->pvt_info;
  498. pvt->imc = imc;
  499. mci->ctl_name = kasprintf(GFP_KERNEL, "%s#%d IMC#%d", ctl_name,
  500. imc->src_id, imc->lmc);
  501. if (!mci->ctl_name) {
  502. rc = -ENOMEM;
  503. goto fail0;
  504. }
  505. mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_NVDIMM;
  506. if (cfg->support_ddr5)
  507. mci->mtype_cap |= MEM_FLAG_DDR5;
  508. mci->edac_ctl_cap = EDAC_FLAG_NONE;
  509. mci->edac_cap = EDAC_FLAG_NONE;
  510. mci->mod_name = mod_str;
  511. mci->dev_name = dev_name;
  512. mci->ctl_page_to_phys = NULL;
  513. rc = get_dimm_config(mci, cfg);
  514. if (rc < 0)
  515. goto fail;
  516. /* Record ptr to the generic device */
  517. mci->pdev = dev;
  518. /* Add this new MC control structure to EDAC's list of MCs */
  519. if (unlikely(edac_mc_add_mc(mci))) {
  520. edac_dbg(0, "MC: failed edac_mc_add_mc()\n");
  521. rc = -EINVAL;
  522. goto fail;
  523. }
  524. return 0;
  525. fail:
  526. kfree(mci->ctl_name);
  527. fail0:
  528. edac_mc_free(mci);
  529. imc->mci = NULL;
  530. return rc;
  531. }
  532. EXPORT_SYMBOL_GPL(skx_register_mci);
  533. static void skx_unregister_mci(struct skx_imc *imc)
  534. {
  535. struct mem_ctl_info *mci = imc->mci;
  536. if (!mci)
  537. return;
  538. edac_dbg(0, "MC%d: mci = %p\n", imc->mc, mci);
  539. /* Remove MC sysfs nodes */
  540. edac_mc_del_mc(mci->pdev);
  541. edac_dbg(1, "%s: free mci struct\n", mci->ctl_name);
  542. kfree(mci->ctl_name);
  543. edac_mc_free(mci);
  544. }
  545. static void skx_mce_output_error(struct mem_ctl_info *mci,
  546. const struct mce *m,
  547. struct decoded_addr *res)
  548. {
  549. enum hw_event_mc_err_type tp_event;
  550. char *optype;
  551. bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
  552. bool overflow = GET_BITFIELD(m->status, 62, 62);
  553. bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
  554. bool scrub_err = false;
  555. bool recoverable;
  556. int len;
  557. u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52);
  558. u32 mscod = GET_BITFIELD(m->status, 16, 31);
  559. u32 errcode = GET_BITFIELD(m->status, 0, 15);
  560. u32 optypenum = GET_BITFIELD(m->status, 4, 6);
  561. recoverable = GET_BITFIELD(m->status, 56, 56);
  562. if (uncorrected_error) {
  563. core_err_cnt = 1;
  564. if (ripv) {
  565. tp_event = HW_EVENT_ERR_UNCORRECTED;
  566. } else {
  567. tp_event = HW_EVENT_ERR_FATAL;
  568. }
  569. } else {
  570. tp_event = HW_EVENT_ERR_CORRECTED;
  571. }
  572. switch (optypenum) {
  573. case 0:
  574. optype = "generic undef request error";
  575. break;
  576. case 1:
  577. optype = "memory read error";
  578. break;
  579. case 2:
  580. optype = "memory write error";
  581. break;
  582. case 3:
  583. optype = "addr/cmd error";
  584. break;
  585. case 4:
  586. optype = "memory scrubbing error";
  587. scrub_err = true;
  588. break;
  589. default:
  590. optype = "reserved";
  591. break;
  592. }
  593. if (res->decoded_by_adxl) {
  594. len = scnprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s",
  595. overflow ? " OVERFLOW" : "",
  596. (uncorrected_error && recoverable) ? " recoverable" : "",
  597. mscod, errcode, adxl_msg);
  598. } else {
  599. len = scnprintf(skx_msg, MSG_SIZE,
  600. "%s%s err_code:0x%04x:0x%04x ProcessorSocketId:0x%x MemoryControllerId:0x%x PhysicalRankId:0x%x Row:0x%x Column:0x%x Bank:0x%x BankGroup:0x%x",
  601. overflow ? " OVERFLOW" : "",
  602. (uncorrected_error && recoverable) ? " recoverable" : "",
  603. mscod, errcode,
  604. res->socket, res->imc, res->rank,
  605. res->row, res->column, res->bank_address, res->bank_group);
  606. }
  607. if (skx_show_retry_rd_err_log)
  608. skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err);
  609. edac_dbg(0, "%s\n", skx_msg);
  610. /* Call the helper to output message */
  611. edac_mc_handle_error(tp_event, mci, core_err_cnt,
  612. m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
  613. res->channel, res->dimm, -1,
  614. optype, skx_msg);
  615. }
  616. static enum error_source skx_error_source(const struct mce *m)
  617. {
  618. u32 errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
  619. if (errcode != MCACOD_MEM_CTL_ERR && errcode != MCACOD_EXT_MEM_ERR)
  620. return ERR_SRC_NOT_MEMORY;
  621. if (!skx_mem_cfg_2lm)
  622. return ERR_SRC_1LM;
  623. if (errcode == MCACOD_EXT_MEM_ERR)
  624. return ERR_SRC_2LM_NM;
  625. return ERR_SRC_2LM_FM;
  626. }
  627. int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
  628. void *data)
  629. {
  630. struct mce *mce = (struct mce *)data;
  631. enum error_source err_src;
  632. struct decoded_addr res;
  633. struct mem_ctl_info *mci;
  634. char *type;
  635. if (mce->kflags & MCE_HANDLED_CEC)
  636. return NOTIFY_DONE;
  637. err_src = skx_error_source(mce);
  638. /* Ignore unless this is memory related with an address */
  639. if (err_src == ERR_SRC_NOT_MEMORY || !(mce->status & MCI_STATUS_ADDRV))
  640. return NOTIFY_DONE;
  641. memset(&res, 0, sizeof(res));
  642. res.mce = mce;
  643. res.addr = mce->addr & MCI_ADDR_PHYSADDR;
  644. if (!pfn_to_online_page(res.addr >> PAGE_SHIFT) && !arch_is_platform_page(res.addr)) {
  645. pr_err("Invalid address 0x%llx in IA32_MC%d_ADDR\n", mce->addr, mce->bank);
  646. return NOTIFY_DONE;
  647. }
  648. /* Try driver decoder first */
  649. if (!(driver_decode && driver_decode(&res))) {
  650. /* Then try firmware decoder (ACPI DSM methods) */
  651. if (!(adxl_component_count && skx_adxl_decode(&res, err_src)))
  652. return NOTIFY_DONE;
  653. }
  654. mci = res.dev->imc[res.imc].mci;
  655. if (!mci)
  656. return NOTIFY_DONE;
  657. if (mce->mcgstatus & MCG_STATUS_MCIP)
  658. type = "Exception";
  659. else
  660. type = "Event";
  661. skx_mc_printk(mci, KERN_DEBUG, "HANDLING MCE MEMORY ERROR\n");
  662. skx_mc_printk(mci, KERN_DEBUG, "CPU %d: Machine Check %s: 0x%llx "
  663. "Bank %d: 0x%llx\n", mce->extcpu, type,
  664. mce->mcgstatus, mce->bank, mce->status);
  665. skx_mc_printk(mci, KERN_DEBUG, "TSC 0x%llx ", mce->tsc);
  666. skx_mc_printk(mci, KERN_DEBUG, "ADDR 0x%llx ", mce->addr);
  667. skx_mc_printk(mci, KERN_DEBUG, "MISC 0x%llx ", mce->misc);
  668. skx_mc_printk(mci, KERN_DEBUG, "PROCESSOR %u:0x%x TIME %llu SOCKET "
  669. "%u APIC 0x%x\n", mce->cpuvendor, mce->cpuid,
  670. mce->time, mce->socketid, mce->apicid);
  671. skx_mce_output_error(mci, mce, &res);
  672. mce->kflags |= MCE_HANDLED_EDAC;
  673. return NOTIFY_DONE;
  674. }
  675. EXPORT_SYMBOL_GPL(skx_mce_check_error);
  676. void skx_remove(void)
  677. {
  678. int i, j;
  679. struct skx_dev *d, *tmp;
  680. edac_dbg(0, "\n");
  681. list_for_each_entry_safe(d, tmp, &dev_edac_list, list) {
  682. list_del(&d->list);
  683. for (i = 0; i < d->num_imc; i++) {
  684. if (d->imc[i].mci)
  685. skx_unregister_mci(&d->imc[i]);
  686. if (d->imc[i].mdev)
  687. pci_dev_put(d->imc[i].mdev);
  688. if (d->imc[i].mbase)
  689. iounmap(d->imc[i].mbase);
  690. if (d->imc[i].dev)
  691. put_device(d->imc[i].dev);
  692. for (j = 0; j < d->imc[i].num_channels; j++) {
  693. if (d->imc[i].chan[j].cdev)
  694. pci_dev_put(d->imc[i].chan[j].cdev);
  695. }
  696. }
  697. if (d->util_all)
  698. pci_dev_put(d->util_all);
  699. if (d->pcu_cr3)
  700. pci_dev_put(d->pcu_cr3);
  701. if (d->sad_all)
  702. pci_dev_put(d->sad_all);
  703. if (d->uracu)
  704. pci_dev_put(d->uracu);
  705. kfree(d);
  706. }
  707. }
  708. EXPORT_SYMBOL_GPL(skx_remove);
  709. #ifdef CONFIG_EDAC_DEBUG
  710. /*
  711. * Debug feature.
  712. * Exercise the address decode logic by writing an address to
  713. * /sys/kernel/debug/edac/{skx,i10nm,imh}_test/addr.
  714. */
  715. static struct dentry *skx_test;
  716. static int debugfs_u64_set(void *data, u64 val)
  717. {
  718. struct mce m;
  719. pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val);
  720. memset(&m, 0, sizeof(m));
  721. /* ADDRV + MemRd + Unknown channel */
  722. m.status = MCI_STATUS_ADDRV + 0x90;
  723. /* One corrected error */
  724. m.status |= BIT_ULL(MCI_STATUS_CEC_SHIFT);
  725. m.addr = val;
  726. skx_mce_check_error(NULL, 0, &m);
  727. return 0;
  728. }
  729. DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
  730. void skx_setup_debug(const char *name)
  731. {
  732. skx_test = edac_debugfs_create_dir(name);
  733. if (!skx_test)
  734. return;
  735. if (!edac_debugfs_create_file("addr", 0200, skx_test,
  736. NULL, &fops_u64_wo)) {
  737. debugfs_remove(skx_test);
  738. skx_test = NULL;
  739. }
  740. }
  741. EXPORT_SYMBOL_GPL(skx_setup_debug);
  742. void skx_teardown_debug(void)
  743. {
  744. debugfs_remove_recursive(skx_test);
  745. }
  746. EXPORT_SYMBOL_GPL(skx_teardown_debug);
  747. #endif /*CONFIG_EDAC_DEBUG*/
  748. MODULE_LICENSE("GPL v2");
  749. MODULE_AUTHOR("Tony Luck");
  750. MODULE_DESCRIPTION("MC Driver for Intel server processors");