i915_gpu_error.h 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. /*
  2. * SPDX-License-Identifier: MIT
  3. *
  4. * Copyright © 2008-2018 Intel Corporation
  5. */
  6. #ifndef _I915_GPU_ERROR_H_
  7. #define _I915_GPU_ERROR_H_
  8. #include <linux/atomic.h>
  9. #include <linux/kref.h>
  10. #include <linux/ktime.h>
  11. #include <linux/sched.h>
  12. #include <drm/drm_mm.h>
  13. #include "gt/intel_engine.h"
  14. #include "gt/intel_engine_types.h"
  15. #include "gt/intel_gt_types.h"
  16. #include "gt/uc/intel_uc_fw.h"
  17. #include "intel_device_info.h"
  18. #include "i915_gem.h"
  19. #include "i915_gem_gtt.h"
  20. #include "i915_params.h"
  21. #include "i915_scheduler.h"
  /*
   * Types that this header only ever refers to through pointers, so an
   * incomplete declaration is sufficient here.
   */
  struct drm_i915_private;
  struct i915_vma_compress;
  struct intel_display_snapshot;
  struct intel_engine_capture_vma;
  26. struct i915_vma_coredump {
  27. struct i915_vma_coredump *next;
  28. char name[20];
  29. u64 gtt_offset;
  30. u64 gtt_size;
  31. u32 gtt_page_sizes;
  32. int unused;
  33. struct list_head page_list;
  34. };
  35. struct i915_request_coredump {
  36. unsigned long flags;
  37. pid_t pid;
  38. u32 context;
  39. u32 seqno;
  40. u32 head;
  41. u32 tail;
  42. struct i915_sched_attr sched_attr;
  43. };
  44. struct __guc_capture_parsed_output;
  45. struct intel_engine_coredump {
  46. const struct intel_engine_cs *engine;
  47. bool hung;
  48. bool simulated;
  49. u32 reset_count;
  50. /* position of active request inside the ring */
  51. u32 rq_head, rq_post, rq_tail;
  52. /* Register state */
  53. u32 ccid;
  54. u32 start;
  55. u32 tail;
  56. u32 head;
  57. u32 ctl;
  58. u32 mode;
  59. u32 hws;
  60. u32 ipeir;
  61. u32 ipehr;
  62. u32 esr;
  63. u32 bbstate;
  64. u32 instpm;
  65. u32 instps;
  66. u64 bbaddr;
  67. u64 acthd;
  68. u32 fault_reg;
  69. u64 faddr;
  70. u32 rc_psmi; /* sleep state */
  71. u32 nopid;
  72. u32 excc;
  73. u32 cmd_cctl;
  74. u32 cscmdop;
  75. u32 ctx_sr_ctl;
  76. u32 dma_faddr_hi;
  77. u32 dma_faddr_lo;
  78. struct intel_instdone instdone;
  79. /* GuC matched capture-lists info */
  80. struct intel_guc_state_capture *guc_capture;
  81. struct __guc_capture_parsed_output *guc_capture_node;
  82. struct i915_gem_context_coredump {
  83. char comm[TASK_COMM_LEN];
  84. u64 total_runtime;
  85. u64 avg_runtime;
  86. pid_t pid;
  87. int active;
  88. int guilty;
  89. struct i915_sched_attr sched_attr;
  90. u32 hwsp_seqno;
  91. } context;
  92. struct i915_vma_coredump *vma;
  93. struct i915_request_coredump execlist[EXECLIST_MAX_PORTS];
  94. unsigned int num_ports;
  95. struct {
  96. u32 gfx_mode;
  97. union {
  98. u64 pdp[4];
  99. u32 pp_dir_base;
  100. };
  101. } vm_info;
  102. struct intel_engine_coredump *next;
  103. };
  104. struct intel_ctb_coredump {
  105. u32 raw_head, head;
  106. u32 raw_tail, tail;
  107. u32 raw_status;
  108. u32 desc_offset;
  109. u32 cmds_offset;
  110. u32 size;
  111. };
  112. struct intel_gt_coredump {
  113. const struct intel_gt *_gt;
  114. bool awake;
  115. bool simulated;
  116. struct intel_gt_info info;
  117. /* Generic register state */
  118. u32 eir;
  119. u32 pgtbl_er;
  120. u32 gtier[6], ngtier;
  121. u32 forcewake;
  122. u32 error; /* gen6+ */
  123. u32 err_int; /* gen7 */
  124. u32 fault_data0; /* gen8, gen9 */
  125. u32 fault_data1; /* gen8, gen9 */
  126. u32 done_reg;
  127. u32 gac_eco;
  128. u32 gam_ecochk;
  129. u32 gab_ctl;
  130. u32 gfx_mode;
  131. u32 gtt_cache;
  132. u32 aux_err; /* gen12 */
  133. u32 gam_done; /* gen12 */
  134. u32 clock_frequency;
  135. u32 clock_period_ns;
  136. u32 sfc_done[I915_MAX_SFC]; /* gen12 */
  137. u32 nfence;
  138. u64 fence[I915_MAX_NUM_FENCES];
  139. struct intel_engine_coredump *engine;
  140. struct intel_uc_coredump {
  141. struct intel_uc_fw guc_fw;
  142. struct intel_uc_fw huc_fw;
  143. struct guc_info {
  144. struct intel_ctb_coredump ctb[2];
  145. struct i915_vma_coredump *vma_ctb;
  146. struct i915_vma_coredump *vma_log;
  147. u32 *hw_state;
  148. u32 timestamp;
  149. u16 last_fence;
  150. bool is_guc_capture;
  151. } guc;
  152. } *uc;
  153. struct intel_gt_coredump *next;
  154. };
  155. struct i915_gpu_coredump {
  156. struct kref ref;
  157. ktime_t time;
  158. ktime_t boottime;
  159. ktime_t uptime;
  160. unsigned long capture;
  161. struct drm_i915_private *i915;
  162. struct intel_gt_coredump *gt;
  163. char error_msg[128];
  164. bool simulated;
  165. bool wakelock;
  166. bool suspended;
  167. int iommu;
  168. u32 reset_count;
  169. u32 suspend_count;
  170. struct intel_device_info device_info;
  171. struct intel_runtime_info runtime_info;
  172. struct intel_driver_caps driver_caps;
  173. struct i915_params params;
  174. struct scatterlist *sgl, *fit;
  175. struct intel_display_snapshot *display_snapshot;
  176. };
  177. struct i915_gpu_error {
  178. /* For reset and error_state handling. */
  179. spinlock_t lock;
  180. /* Protected by the above dev->gpu_error.lock. */
  181. struct i915_gpu_coredump *first_error;
  182. /** Number of times the device has been reset (global) */
  183. atomic_t reset_count;
  184. /** Number of times an engine has been reset */
  185. atomic_t reset_engine_count[MAX_ENGINE_CLASS];
  186. };
  187. struct drm_i915_error_state_buf {
  188. struct drm_i915_private *i915;
  189. struct scatterlist *sgl, *cur, *end;
  190. char *buf;
  191. size_t bytes;
  192. size_t size;
  193. loff_t iter;
  194. int err;
  195. };
  196. static inline u32 i915_reset_count(struct i915_gpu_error *error)
  197. {
  198. return atomic_read(&error->reset_count);
  199. }
  200. static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
  201. const struct intel_engine_cs *engine)
  202. {
  203. return atomic_read(&error->reset_engine_count[engine->class]);
  204. }
  205. static inline void
  206. i915_increase_reset_engine_count(struct i915_gpu_error *error,
  207. const struct intel_engine_cs *engine)
  208. {
  209. atomic_inc(&error->reset_engine_count[engine->class]);
  210. }
  /*
   * Coredump flag bits. NOTE(review): presumably the values accepted by the
   * u32 dump_flags parameters declared below - confirm against callers.
   */
  #define CORE_DUMP_FLAG_NONE             0x0
  #define CORE_DUMP_FLAG_IS_GUC_CAPTURE   BIT(0)
  213. #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) && IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
  214. void intel_klog_error_capture(struct intel_gt *gt,
  215. intel_engine_mask_t engine_mask);
  216. #else
  217. static inline void intel_klog_error_capture(struct intel_gt *gt,
  218. intel_engine_mask_t engine_mask)
  219. {
  220. }
  221. #endif
  222. #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
  223. __printf(2, 3)
  224. void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
  225. void i915_capture_error_state(struct intel_gt *gt,
  226. intel_engine_mask_t engine_mask, u32 dump_flags);
  227. struct i915_gpu_coredump *
  228. i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);
  229. struct intel_gt_coredump *
  230. intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags);
  231. struct intel_engine_coredump *
  232. intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags);
  233. struct intel_engine_capture_vma *
  234. intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
  235. struct i915_request *rq,
  236. gfp_t gfp);
  237. void intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
  238. struct intel_engine_capture_vma *capture,
  239. struct i915_vma_compress *compress);
  240. struct i915_vma_compress *
  241. i915_vma_capture_prepare(struct intel_gt_coredump *gt);
  242. void i915_vma_capture_finish(struct intel_gt_coredump *gt,
  243. struct i915_vma_compress *compress);
  244. void i915_error_state_store(struct i915_gpu_coredump *error);
  245. static inline struct i915_gpu_coredump *
  246. i915_gpu_coredump_get(struct i915_gpu_coredump *gpu)
  247. {
  248. kref_get(&gpu->ref);
  249. return gpu;
  250. }
  251. ssize_t
  252. i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
  253. char *buf, loff_t offset, size_t count);
  254. void __i915_gpu_coredump_free(struct kref *kref);
  255. static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
  256. {
  257. if (gpu)
  258. kref_put(&gpu->ref, __i915_gpu_coredump_free);
  259. }
  /* Error-state management entry points; defined elsewhere. */
  void i915_reset_error_state(struct drm_i915_private *i915);
  void i915_disable_error_state(struct drm_i915_private *i915, int err);

  /* debugfs / sysfs hookup entry points; defined elsewhere. */
  void i915_gpu_error_debugfs_register(struct drm_i915_private *i915);
  void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915);
  void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915);
  265. #else
  266. __printf(2, 3)
  267. static inline void
  268. i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
  269. {
  270. }
  271. static inline void
  272. i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
  273. {
  274. }
  275. static inline struct i915_gpu_coredump *
  276. i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
  277. {
  278. return NULL;
  279. }
  280. static inline struct intel_gt_coredump *
  281. intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags)
  282. {
  283. return NULL;
  284. }
  285. static inline struct intel_engine_coredump *
  286. intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags)
  287. {
  288. return NULL;
  289. }
  290. static inline struct intel_engine_capture_vma *
  291. intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
  292. struct i915_request *rq,
  293. gfp_t gfp)
  294. {
  295. return NULL;
  296. }
  /* No-op version used when CONFIG_DRM_I915_CAPTURE_ERROR is disabled. */
  static inline void
  intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
                                struct intel_engine_capture_vma *capture,
                                struct i915_vma_compress *compress)
  {
      /* Intentionally empty. */
  }
  303. static inline struct i915_vma_compress *
  304. i915_vma_capture_prepare(struct intel_gt_coredump *gt)
  305. {
  306. return NULL;
  307. }
  /* No-op version used when CONFIG_DRM_I915_CAPTURE_ERROR is disabled. */
  static inline void i915_vma_capture_finish(struct intel_gt_coredump *gt,
                                             struct i915_vma_compress *compress)
  {
      /* Intentionally empty. */
  }
  /* No-op version used when CONFIG_DRM_I915_CAPTURE_ERROR is disabled. */
  static inline void i915_error_state_store(struct i915_gpu_coredump *error)
  {
      /* Intentionally empty. */
  }
  /*
   * No-op version used when CONFIG_DRM_I915_CAPTURE_ERROR is disabled; @gpu
   * is never touched.
   */
  static inline void
  i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
  {
      /* Intentionally empty. */
  }
  /* No-op version used when CONFIG_DRM_I915_CAPTURE_ERROR is disabled. */
  static inline void
  i915_reset_error_state(struct drm_i915_private *i915)
  {
      /* Intentionally empty. */
  }
  /* No-op version used when CONFIG_DRM_I915_CAPTURE_ERROR is disabled. */
  static inline void
  i915_disable_error_state(struct drm_i915_private *i915, int err)
  {
      /* Intentionally empty. */
  }
  /* No-op version used when CONFIG_DRM_I915_CAPTURE_ERROR is disabled. */
  static inline void
  i915_gpu_error_debugfs_register(struct drm_i915_private *i915)
  {
      /* Intentionally empty. */
  }
  /* No-op version used when CONFIG_DRM_I915_CAPTURE_ERROR is disabled. */
  static inline void
  i915_gpu_error_sysfs_setup(struct drm_i915_private *i915)
  {
      /* Intentionally empty. */
  }
  /* No-op version used when CONFIG_DRM_I915_CAPTURE_ERROR is disabled. */
  static inline void
  i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915)
  {
      /* Intentionally empty. */
  }
  336. #endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */
  337. #endif /* _I915_GPU_ERROR_H_ */