aie2_error.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
  4. */
  5. #include <drm/drm_cache.h>
  6. #include <drm/drm_device.h>
  7. #include <drm/drm_print.h>
  8. #include <drm/gpu_scheduler.h>
  9. #include <linux/dma-mapping.h>
  10. #include <linux/kthread.h>
  11. #include <linux/kernel.h>
  12. #include "aie2_msg_priv.h"
  13. #include "aie2_pci.h"
  14. #include "amdxdna_error.h"
  15. #include "amdxdna_mailbox.h"
  16. #include "amdxdna_pci_drv.h"
  17. struct async_event {
  18. struct amdxdna_dev_hdl *ndev;
  19. struct async_event_msg_resp resp;
  20. struct workqueue_struct *wq;
  21. struct work_struct work;
  22. u8 *buf;
  23. dma_addr_t addr;
  24. u32 size;
  25. };
  26. struct async_events {
  27. struct workqueue_struct *wq;
  28. u8 *buf;
  29. dma_addr_t addr;
  30. u32 size;
  31. u32 event_cnt;
  32. struct async_event event[] __counted_by(event_cnt);
  33. };
  /*
   * The enums, structs and lookup tables below are ported from the XAIE util
   * header file.
   *
   * This data is defined by the AIE device and is used to decode error
   * messages from the device.
   */
  /*
   * Module type reported in an AIE error entry. Values at or above
   * AIE_UNKNOWN_MOD are treated as unknown by the decoder.
   */
  enum aie_module_type {
  	AIE_MEM_MOD	= 0,
  	AIE_CORE_MOD	= 1,
  	AIE_PL_MOD	= 2,
  	AIE_UNKNOWN_MOD	= 3,
  };
  /*
   * Error categories an AIE event can be decoded into. The numeric values
   * index aie_cat_err_num_map, so they must stay dense and start at zero.
   */
  enum aie_error_category {
  	AIE_ERROR_SATURATION	= 0,
  	AIE_ERROR_FP		= 1,
  	AIE_ERROR_STREAM	= 2,
  	AIE_ERROR_ACCESS	= 3,
  	AIE_ERROR_BUS		= 4,
  	AIE_ERROR_INSTRUCTION	= 5,
  	AIE_ERROR_ECC		= 6,
  	AIE_ERROR_LOCK		= 7,
  	AIE_ERROR_DMA		= 8,
  	AIE_ERROR_MEM_PARITY	= 9,
  	/* Not part of XAIE; added here as a catch-all category */
  	AIE_ERROR_UNKNOWN	= 10,
  };
  60. /* Don't pack, unless XAIE side changed */
  61. struct aie_error {
  62. __u8 row;
  63. __u8 col;
  64. __u32 mod_type;
  65. __u8 event_id;
  66. };
  67. struct aie_err_info {
  68. u32 err_cnt;
  69. u32 ret_code;
  70. u32 rsvd;
  71. struct aie_error payload[] __counted_by(err_cnt);
  72. };
  73. struct aie_event_category {
  74. u8 event_id;
  75. enum aie_error_category category;
  76. };
  77. #define EVENT_CATEGORY(id, cat) { id, cat }
  78. static const struct aie_event_category aie_ml_mem_event_cat[] = {
  79. EVENT_CATEGORY(88U, AIE_ERROR_ECC),
  80. EVENT_CATEGORY(90U, AIE_ERROR_ECC),
  81. EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
  82. EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
  83. EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
  84. EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
  85. EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
  86. EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
  87. EVENT_CATEGORY(97U, AIE_ERROR_DMA),
  88. EVENT_CATEGORY(98U, AIE_ERROR_DMA),
  89. EVENT_CATEGORY(99U, AIE_ERROR_DMA),
  90. EVENT_CATEGORY(100U, AIE_ERROR_DMA),
  91. EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
  92. };
  93. static const struct aie_event_category aie_ml_core_event_cat[] = {
  94. EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
  95. EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
  96. EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
  97. EVENT_CATEGORY(58U, AIE_ERROR_BUS),
  98. EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
  99. EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
  100. EVENT_CATEGORY(62U, AIE_ERROR_ECC),
  101. EVENT_CATEGORY(64U, AIE_ERROR_ECC),
  102. EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
  103. EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
  104. EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
  105. EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
  106. EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
  107. EVENT_CATEGORY(72U, AIE_ERROR_BUS),
  108. };
  109. static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
  110. EVENT_CATEGORY(130U, AIE_ERROR_ECC),
  111. EVENT_CATEGORY(132U, AIE_ERROR_ECC),
  112. EVENT_CATEGORY(133U, AIE_ERROR_DMA),
  113. EVENT_CATEGORY(134U, AIE_ERROR_DMA),
  114. EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
  115. EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
  116. EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
  117. EVENT_CATEGORY(138U, AIE_ERROR_BUS),
  118. EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
  119. };
  120. static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
  121. EVENT_CATEGORY(64U, AIE_ERROR_BUS),
  122. EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
  123. EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
  124. EVENT_CATEGORY(67U, AIE_ERROR_BUS),
  125. EVENT_CATEGORY(68U, AIE_ERROR_BUS),
  126. EVENT_CATEGORY(69U, AIE_ERROR_BUS),
  127. EVENT_CATEGORY(70U, AIE_ERROR_BUS),
  128. EVENT_CATEGORY(71U, AIE_ERROR_BUS),
  129. EVENT_CATEGORY(72U, AIE_ERROR_DMA),
  130. EVENT_CATEGORY(73U, AIE_ERROR_DMA),
  131. EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
  132. };
  133. static const enum amdxdna_error_num aie_cat_err_num_map[] = {
  134. [AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION,
  135. [AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP,
  136. [AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM,
  137. [AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS,
  138. [AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS,
  139. [AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
  140. [AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC,
  141. [AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK,
  142. [AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA,
  143. [AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
  144. [AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN,
  145. };
  146. static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1);
  147. static const enum amdxdna_error_module aie_err_mod_map[] = {
  148. [AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY,
  149. [AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE,
  150. [AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL,
  151. [AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN,
  152. };
  153. static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1);
  154. static enum aie_error_category
  155. aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
  156. {
  157. const struct aie_event_category *lut;
  158. int num_entry;
  159. int i;
  160. switch (mod_type) {
  161. case AIE_PL_MOD:
  162. lut = aie_ml_shim_tile_event_cat;
  163. num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
  164. break;
  165. case AIE_CORE_MOD:
  166. lut = aie_ml_core_event_cat;
  167. num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
  168. break;
  169. case AIE_MEM_MOD:
  170. if (row == 1) {
  171. lut = aie_ml_mem_tile_event_cat;
  172. num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
  173. } else {
  174. lut = aie_ml_mem_event_cat;
  175. num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
  176. }
  177. break;
  178. default:
  179. return AIE_ERROR_UNKNOWN;
  180. }
  181. for (i = 0; i < num_entry; i++) {
  182. if (event_id != lut[i].event_id)
  183. continue;
  184. if (lut[i].category > AIE_ERROR_UNKNOWN)
  185. return AIE_ERROR_UNKNOWN;
  186. return lut[i].category;
  187. }
  188. return AIE_ERROR_UNKNOWN;
  189. }
  190. static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
  191. {
  192. struct aie_error *errs = err_info;
  193. enum amdxdna_error_module err_mod;
  194. enum aie_error_category aie_err;
  195. enum amdxdna_error_num err_num;
  196. struct aie_error *last_err;
  197. last_err = &errs[num_err - 1];
  198. if (last_err->mod_type >= AIE_UNKNOWN_MOD) {
  199. err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN];
  200. err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD];
  201. } else {
  202. aie_err = aie_get_error_category(last_err->row,
  203. last_err->event_id,
  204. last_err->mod_type);
  205. err_num = aie_cat_err_num_map[aie_err];
  206. err_mod = aie_err_mod_map[last_err->mod_type];
  207. }
  208. ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod);
  209. ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real());
  210. ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col);
  211. }
  212. static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
  213. {
  214. struct aie_error *errs = err_info;
  215. u32 err_col = 0; /* assume that AIE has less than 32 columns */
  216. int i;
  217. /* Get err column bitmap */
  218. for (i = 0; i < num_err; i++) {
  219. struct aie_error *err = &errs[i];
  220. enum aie_error_category cat;
  221. cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
  222. XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
  223. err->row, err->col, err->mod_type,
  224. err->event_id, cat);
  225. if (err->col >= 32) {
  226. XDNA_WARN(ndev->xdna, "Invalid column number");
  227. break;
  228. }
  229. err_col |= (1 << err->col);
  230. }
  231. return err_col;
  232. }
  233. static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size)
  234. {
  235. struct async_event *e = handle;
  236. if (data) {
  237. e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type));
  238. wmb(); /* Update status in the end, so that no lock for here */
  239. e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status));
  240. }
  241. queue_work(e->wq, &e->work);
  242. return 0;
  243. }
  244. static int aie2_error_event_send(struct async_event *e)
  245. {
  246. drm_clflush_virt_range(e->buf, e->size); /* device can access */
  247. return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
  248. aie2_error_async_cb);
  249. }
  250. static void aie2_error_worker(struct work_struct *err_work)
  251. {
  252. struct aie_err_info *info;
  253. struct amdxdna_dev *xdna;
  254. struct async_event *e;
  255. u32 max_err;
  256. u32 err_col;
  257. e = container_of(err_work, struct async_event, work);
  258. xdna = e->ndev->xdna;
  259. if (e->resp.status == MAX_AIE2_STATUS_CODE)
  260. return;
  261. e->resp.status = MAX_AIE2_STATUS_CODE;
  262. print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
  263. e->buf, 0x100, false);
  264. info = (struct aie_err_info *)e->buf;
  265. XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);
  266. max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
  267. if (unlikely(info->err_cnt > max_err)) {
  268. WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
  269. return;
  270. }
  271. err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
  272. if (!err_col) {
  273. XDNA_WARN(xdna, "Did not get error column");
  274. return;
  275. }
  276. mutex_lock(&xdna->dev_lock);
  277. aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt);
  278. /* Re-sent this event to firmware */
  279. if (aie2_error_event_send(e))
  280. XDNA_WARN(xdna, "Unable to register async event");
  281. mutex_unlock(&xdna->dev_lock);
  282. }
  283. void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
  284. {
  285. struct amdxdna_dev *xdna = ndev->xdna;
  286. struct async_events *events;
  287. events = ndev->async_events;
  288. mutex_unlock(&xdna->dev_lock);
  289. destroy_workqueue(events->wq);
  290. mutex_lock(&xdna->dev_lock);
  291. aie2_free_msg_buffer(ndev, events->size, events->buf, events->addr);
  292. kfree(events);
  293. }
  294. int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
  295. {
  296. struct amdxdna_dev *xdna = ndev->xdna;
  297. u32 total_col = ndev->total_col;
  298. u32 total_size = ASYNC_BUF_SIZE * total_col;
  299. struct async_events *events;
  300. int i, ret;
  301. events = kzalloc_flex(*events, event, total_col);
  302. if (!events)
  303. return -ENOMEM;
  304. events->buf = aie2_alloc_msg_buffer(ndev, &total_size, &events->addr);
  305. if (!events->buf) {
  306. ret = -ENOMEM;
  307. goto free_events;
  308. }
  309. events->size = total_size;
  310. events->event_cnt = total_col;
  311. events->wq = alloc_ordered_workqueue("async_wq", 0);
  312. if (!events->wq) {
  313. ret = -ENOMEM;
  314. goto free_buf;
  315. }
  316. for (i = 0; i < events->event_cnt; i++) {
  317. struct async_event *e = &events->event[i];
  318. u32 offset = i * ASYNC_BUF_SIZE;
  319. e->ndev = ndev;
  320. e->wq = events->wq;
  321. e->buf = &events->buf[offset];
  322. e->addr = events->addr + offset;
  323. e->size = ASYNC_BUF_SIZE;
  324. e->resp.status = MAX_AIE2_STATUS_CODE;
  325. INIT_WORK(&e->work, aie2_error_worker);
  326. ret = aie2_error_event_send(e);
  327. if (ret)
  328. goto free_wq;
  329. }
  330. ndev->async_events = events;
  331. XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
  332. events->event_cnt, events->size);
  333. return 0;
  334. free_wq:
  335. destroy_workqueue(events->wq);
  336. free_buf:
  337. aie2_free_msg_buffer(ndev, events->size, events->buf, events->addr);
  338. free_events:
  339. kfree(events);
  340. return ret;
  341. }
  342. int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args)
  343. {
  344. struct amdxdna_dev *xdna = ndev->xdna;
  345. drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
  346. args->num_element = 1;
  347. args->element_size = sizeof(ndev->last_async_err);
  348. if (copy_to_user(u64_to_user_ptr(args->buffer),
  349. &ndev->last_async_err, args->element_size))
  350. return -EFAULT;
  351. return 0;
  352. }