i915_gpu_error.c 65 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628
  1. /*
  2. * Copyright (c) 2008 Intel Corporation
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a
  5. * copy of this software and associated documentation files (the "Software"),
  6. * to deal in the Software without restriction, including without limitation
  7. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. * and/or sell copies of the Software, and to permit persons to whom the
  9. * Software is furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice (including the next
  12. * paragraph) shall be included in all copies or substantial portions of the
  13. * Software.
  14. *
  15. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21. * IN THE SOFTWARE.
  22. *
  23. * Authors:
  24. * Eric Anholt <eric@anholt.net>
  25. * Keith Packard <keithp@keithp.com>
  26. * Mika Kuoppala <mika.kuoppala@intel.com>
  27. *
  28. */
  29. #include <linux/ascii85.h>
  30. #include <linux/debugfs.h>
  31. #include <linux/highmem.h>
  32. #include <linux/nmi.h>
  33. #include <linux/pagevec.h>
  34. #include <linux/scatterlist.h>
  35. #include <linux/string_helpers.h>
  36. #include <linux/utsname.h>
  37. #include <linux/zlib.h>
  38. #include <drm/drm_cache.h>
  39. #include <drm/drm_print.h>
  40. #include "display/intel_display_snapshot.h"
  41. #include "gem/i915_gem_context.h"
  42. #include "gem/i915_gem_lmem.h"
  43. #include "gt/intel_engine_regs.h"
  44. #include "gt/intel_gt.h"
  45. #include "gt/intel_gt_mcr.h"
  46. #include "gt/intel_gt_pm.h"
  47. #include "gt/intel_gt_regs.h"
  48. #include "gt/uc/intel_guc_capture.h"
  49. #include "i915_driver.h"
  50. #include "i915_drv.h"
  51. #include "i915_gpu_error.h"
  52. #include "i915_memcpy.h"
  53. #include "i915_reg.h"
  54. #include "i915_scatterlist.h"
  55. #include "i915_sysfs.h"
  56. #include "i915_utils.h"
  57. #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
  58. #define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
  59. static void __sg_set_buf(struct scatterlist *sg,
  60. void *addr, unsigned int len, loff_t it)
  61. {
  62. sg->page_link = (unsigned long)virt_to_page(addr);
  63. sg->offset = offset_in_page(addr);
  64. sg->length = len;
  65. sg->dma_address = it;
  66. }
  67. static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
  68. {
  69. if (!len)
  70. return false;
  71. if (e->bytes + len + 1 <= e->size)
  72. return true;
  73. if (e->bytes) {
  74. __sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
  75. e->iter += e->bytes;
  76. e->buf = NULL;
  77. e->bytes = 0;
  78. }
  79. if (e->cur == e->end) {
  80. struct scatterlist *sgl;
  81. sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
  82. if (!sgl) {
  83. e->err = -ENOMEM;
  84. return false;
  85. }
  86. if (e->cur) {
  87. e->cur->offset = 0;
  88. e->cur->length = 0;
  89. e->cur->page_link =
  90. (unsigned long)sgl | SG_CHAIN;
  91. } else {
  92. e->sgl = sgl;
  93. }
  94. e->cur = sgl;
  95. e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
  96. }
  97. e->size = ALIGN(len + 1, SZ_64K);
  98. e->buf = kmalloc(e->size, ALLOW_FAIL);
  99. if (!e->buf) {
  100. e->size = PAGE_ALIGN(len + 1);
  101. e->buf = kmalloc(e->size, GFP_KERNEL);
  102. }
  103. if (!e->buf) {
  104. e->err = -ENOMEM;
  105. return false;
  106. }
  107. return true;
  108. }
  109. __printf(2, 0)
  110. static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
  111. const char *fmt, va_list args)
  112. {
  113. va_list ap;
  114. int len;
  115. if (e->err)
  116. return;
  117. va_copy(ap, args);
  118. len = vsnprintf(NULL, 0, fmt, ap);
  119. va_end(ap);
  120. if (len <= 0) {
  121. e->err = len;
  122. return;
  123. }
  124. if (!__i915_error_grow(e, len))
  125. return;
  126. GEM_BUG_ON(e->bytes >= e->size);
  127. len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
  128. if (len < 0) {
  129. e->err = len;
  130. return;
  131. }
  132. e->bytes += len;
  133. }
  134. static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
  135. {
  136. unsigned len;
  137. if (e->err || !str)
  138. return;
  139. len = strlen(str);
  140. if (!__i915_error_grow(e, len))
  141. return;
  142. GEM_BUG_ON(e->bytes + len > e->size);
  143. memcpy(e->buf + e->bytes, str, len);
  144. e->bytes += len;
  145. }
  146. #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
  147. #define err_puts(e, s) i915_error_puts(e, s)
  148. static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
  149. {
  150. i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
  151. }
  152. static inline struct drm_printer
  153. i915_error_printer(struct drm_i915_error_state_buf *e)
  154. {
  155. struct drm_printer p = {
  156. .printfn = __i915_printfn_error,
  157. .arg = e,
  158. };
  159. return p;
  160. }
  161. /* single threaded page allocator with a reserved stash for emergencies */
  162. static void pool_fini(struct folio_batch *fbatch)
  163. {
  164. folio_batch_release(fbatch);
  165. }
  166. static int pool_refill(struct folio_batch *fbatch, gfp_t gfp)
  167. {
  168. while (folio_batch_space(fbatch)) {
  169. struct folio *folio;
  170. folio = folio_alloc(gfp, 0);
  171. if (!folio)
  172. return -ENOMEM;
  173. folio_batch_add(fbatch, folio);
  174. }
  175. return 0;
  176. }
  177. static int pool_init(struct folio_batch *fbatch, gfp_t gfp)
  178. {
  179. int err;
  180. folio_batch_init(fbatch);
  181. err = pool_refill(fbatch, gfp);
  182. if (err)
  183. pool_fini(fbatch);
  184. return err;
  185. }
  186. static void *pool_alloc(struct folio_batch *fbatch, gfp_t gfp)
  187. {
  188. struct folio *folio;
  189. folio = folio_alloc(gfp, 0);
  190. if (!folio && folio_batch_count(fbatch))
  191. folio = fbatch->folios[--fbatch->nr];
  192. return folio ? folio_address(folio) : NULL;
  193. }
  194. static void pool_free(struct folio_batch *fbatch, void *addr)
  195. {
  196. struct folio *folio = virt_to_folio(addr);
  197. if (folio_batch_space(fbatch))
  198. folio_batch_add(fbatch, folio);
  199. else
  200. folio_put(folio);
  201. }
  202. #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
  203. struct i915_vma_compress {
  204. struct folio_batch pool;
  205. struct z_stream_s zstream;
  206. void *tmp;
  207. };
  208. static bool compress_init(struct i915_vma_compress *c)
  209. {
  210. struct z_stream_s *zstream = &c->zstream;
  211. if (pool_init(&c->pool, ALLOW_FAIL))
  212. return false;
  213. zstream->workspace =
  214. kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
  215. ALLOW_FAIL);
  216. if (!zstream->workspace) {
  217. pool_fini(&c->pool);
  218. return false;
  219. }
  220. c->tmp = NULL;
  221. if (i915_has_memcpy_from_wc())
  222. c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);
  223. return true;
  224. }
  225. static bool compress_start(struct i915_vma_compress *c)
  226. {
  227. struct z_stream_s *zstream = &c->zstream;
  228. void *workspace = zstream->workspace;
  229. memset(zstream, 0, sizeof(*zstream));
  230. zstream->workspace = workspace;
  231. return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
  232. }
  233. static void *compress_next_page(struct i915_vma_compress *c,
  234. struct i915_vma_coredump *dst)
  235. {
  236. void *page_addr;
  237. struct page *page;
  238. page_addr = pool_alloc(&c->pool, ALLOW_FAIL);
  239. if (!page_addr)
  240. return ERR_PTR(-ENOMEM);
  241. page = virt_to_page(page_addr);
  242. list_add_tail(&page->lru, &dst->page_list);
  243. return page_addr;
  244. }
  245. static int compress_page(struct i915_vma_compress *c,
  246. void *src,
  247. struct i915_vma_coredump *dst,
  248. bool wc)
  249. {
  250. struct z_stream_s *zstream = &c->zstream;
  251. zstream->next_in = src;
  252. if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
  253. zstream->next_in = c->tmp;
  254. zstream->avail_in = PAGE_SIZE;
  255. do {
  256. if (zstream->avail_out == 0) {
  257. zstream->next_out = compress_next_page(c, dst);
  258. if (IS_ERR(zstream->next_out))
  259. return PTR_ERR(zstream->next_out);
  260. zstream->avail_out = PAGE_SIZE;
  261. }
  262. if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
  263. return -EIO;
  264. cond_resched();
  265. } while (zstream->avail_in);
  266. /* Fallback to uncompressed if we increase size? */
  267. if (0 && zstream->total_out > zstream->total_in)
  268. return -E2BIG;
  269. return 0;
  270. }
  271. static int compress_flush(struct i915_vma_compress *c,
  272. struct i915_vma_coredump *dst)
  273. {
  274. struct z_stream_s *zstream = &c->zstream;
  275. do {
  276. switch (zlib_deflate(zstream, Z_FINISH)) {
  277. case Z_OK: /* more space requested */
  278. zstream->next_out = compress_next_page(c, dst);
  279. if (IS_ERR(zstream->next_out))
  280. return PTR_ERR(zstream->next_out);
  281. zstream->avail_out = PAGE_SIZE;
  282. break;
  283. case Z_STREAM_END:
  284. goto end;
  285. default: /* any error */
  286. return -EIO;
  287. }
  288. } while (1);
  289. end:
  290. memset(zstream->next_out, 0, zstream->avail_out);
  291. dst->unused = zstream->avail_out;
  292. return 0;
  293. }
  294. static void compress_finish(struct i915_vma_compress *c)
  295. {
  296. zlib_deflateEnd(&c->zstream);
  297. }
  298. static void compress_fini(struct i915_vma_compress *c)
  299. {
  300. kfree(c->zstream.workspace);
  301. if (c->tmp)
  302. pool_free(&c->pool, c->tmp);
  303. pool_fini(&c->pool);
  304. }
  305. static void err_compression_marker(struct drm_i915_error_state_buf *m)
  306. {
  307. err_puts(m, ":");
  308. }
  309. #else
  310. struct i915_vma_compress {
  311. struct folio_batch pool;
  312. };
  313. static bool compress_init(struct i915_vma_compress *c)
  314. {
  315. return pool_init(&c->pool, ALLOW_FAIL) == 0;
  316. }
  317. static bool compress_start(struct i915_vma_compress *c)
  318. {
  319. return true;
  320. }
  321. static int compress_page(struct i915_vma_compress *c,
  322. void *src,
  323. struct i915_vma_coredump *dst,
  324. bool wc)
  325. {
  326. void *ptr;
  327. ptr = pool_alloc(&c->pool, ALLOW_FAIL);
  328. if (!ptr)
  329. return -ENOMEM;
  330. if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
  331. memcpy(ptr, src, PAGE_SIZE);
  332. list_add_tail(&virt_to_page(ptr)->lru, &dst->page_list);
  333. cond_resched();
  334. return 0;
  335. }
  336. static int compress_flush(struct i915_vma_compress *c,
  337. struct i915_vma_coredump *dst)
  338. {
  339. return 0;
  340. }
  341. static void compress_finish(struct i915_vma_compress *c)
  342. {
  343. }
  344. static void compress_fini(struct i915_vma_compress *c)
  345. {
  346. pool_fini(&c->pool);
  347. }
  348. static void err_compression_marker(struct drm_i915_error_state_buf *m)
  349. {
  350. err_puts(m, "~");
  351. }
  352. #endif
  353. static void error_print_instdone(struct drm_i915_error_state_buf *m,
  354. const struct intel_engine_coredump *ee)
  355. {
  356. int slice;
  357. int subslice;
  358. int iter;
  359. err_printf(m, " INSTDONE: 0x%08x\n",
  360. ee->instdone.instdone);
  361. if (ee->engine->class != RENDER_CLASS || GRAPHICS_VER(m->i915) <= 3)
  362. return;
  363. err_printf(m, " SC_INSTDONE: 0x%08x\n",
  364. ee->instdone.slice_common);
  365. if (GRAPHICS_VER(m->i915) <= 6)
  366. return;
  367. for_each_ss_steering(iter, ee->engine->gt, slice, subslice)
  368. err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
  369. slice, subslice,
  370. ee->instdone.sampler[slice][subslice]);
  371. for_each_ss_steering(iter, ee->engine->gt, slice, subslice)
  372. err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n",
  373. slice, subslice,
  374. ee->instdone.row[slice][subslice]);
  375. if (GRAPHICS_VER(m->i915) < 12)
  376. return;
  377. if (GRAPHICS_VER_FULL(m->i915) >= IP_VER(12, 55)) {
  378. for_each_ss_steering(iter, ee->engine->gt, slice, subslice)
  379. err_printf(m, " GEOM_SVGUNIT_INSTDONE[%d][%d]: 0x%08x\n",
  380. slice, subslice,
  381. ee->instdone.geom_svg[slice][subslice]);
  382. }
  383. err_printf(m, " SC_INSTDONE_EXTRA: 0x%08x\n",
  384. ee->instdone.slice_common_extra[0]);
  385. err_printf(m, " SC_INSTDONE_EXTRA2: 0x%08x\n",
  386. ee->instdone.slice_common_extra[1]);
  387. }
  388. static void error_print_request(struct drm_i915_error_state_buf *m,
  389. const char *prefix,
  390. const struct i915_request_coredump *erq)
  391. {
  392. if (!erq->seqno)
  393. return;
  394. err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, head %08x, tail %08x\n",
  395. prefix, erq->pid, erq->context, erq->seqno,
  396. test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
  397. &erq->flags) ? "!" : "",
  398. test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
  399. &erq->flags) ? "+" : "",
  400. erq->sched_attr.priority,
  401. erq->head, erq->tail);
  402. }
  403. static void error_print_context(struct drm_i915_error_state_buf *m,
  404. const char *header,
  405. const struct i915_gem_context_coredump *ctx)
  406. {
  407. err_printf(m, "%s%s[%d] prio %d, guilty %d active %d, runtime total %lluns, avg %lluns\n",
  408. header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
  409. ctx->guilty, ctx->active,
  410. ctx->total_runtime, ctx->avg_runtime);
  411. err_printf(m, " context timeline seqno %u\n", ctx->hwsp_seqno);
  412. }
  413. static struct i915_vma_coredump *
  414. __find_vma(struct i915_vma_coredump *vma, const char *name)
  415. {
  416. while (vma) {
  417. if (strcmp(vma->name, name) == 0)
  418. return vma;
  419. vma = vma->next;
  420. }
  421. return NULL;
  422. }
  423. static struct i915_vma_coredump *
  424. intel_gpu_error_find_batch(const struct intel_engine_coredump *ee)
  425. {
  426. return __find_vma(ee->vma, "batch");
  427. }
  428. static void error_print_engine(struct drm_i915_error_state_buf *m,
  429. const struct intel_engine_coredump *ee)
  430. {
  431. struct i915_vma_coredump *batch;
  432. int n;
  433. err_printf(m, "%s command stream:\n", ee->engine->name);
  434. err_printf(m, " CCID: 0x%08x\n", ee->ccid);
  435. err_printf(m, " START: 0x%08x\n", ee->start);
  436. err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head);
  437. err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n",
  438. ee->tail, ee->rq_post, ee->rq_tail);
  439. err_printf(m, " CTL: 0x%08x\n", ee->ctl);
  440. err_printf(m, " MODE: 0x%08x\n", ee->mode);
  441. err_printf(m, " HWS: 0x%08x\n", ee->hws);
  442. err_printf(m, " ACTHD: 0x%08x %08x\n",
  443. (u32)(ee->acthd>>32), (u32)ee->acthd);
  444. err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir);
  445. err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr);
  446. err_printf(m, " ESR: 0x%08x\n", ee->esr);
  447. error_print_instdone(m, ee);
  448. batch = intel_gpu_error_find_batch(ee);
  449. if (batch) {
  450. u64 start = batch->gtt_offset;
  451. u64 end = start + batch->gtt_size;
  452. err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n",
  453. upper_32_bits(start), lower_32_bits(start),
  454. upper_32_bits(end), lower_32_bits(end));
  455. }
  456. if (GRAPHICS_VER(m->i915) >= 4) {
  457. err_printf(m, " BBADDR: 0x%08x_%08x\n",
  458. (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
  459. err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate);
  460. err_printf(m, " INSTPS: 0x%08x\n", ee->instps);
  461. }
  462. err_printf(m, " INSTPM: 0x%08x\n", ee->instpm);
  463. err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
  464. lower_32_bits(ee->faddr));
  465. if (GRAPHICS_VER(m->i915) >= 6) {
  466. err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi);
  467. err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg);
  468. }
  469. if (GRAPHICS_VER(m->i915) >= 11) {
  470. err_printf(m, " NOPID: 0x%08x\n", ee->nopid);
  471. err_printf(m, " EXCC: 0x%08x\n", ee->excc);
  472. err_printf(m, " CMD_CCTL: 0x%08x\n", ee->cmd_cctl);
  473. err_printf(m, " CSCMDOP: 0x%08x\n", ee->cscmdop);
  474. err_printf(m, " CTX_SR_CTL: 0x%08x\n", ee->ctx_sr_ctl);
  475. err_printf(m, " DMA_FADDR_HI: 0x%08x\n", ee->dma_faddr_hi);
  476. err_printf(m, " DMA_FADDR_LO: 0x%08x\n", ee->dma_faddr_lo);
  477. }
  478. if (HAS_PPGTT(m->i915)) {
  479. err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);
  480. if (GRAPHICS_VER(m->i915) >= 8) {
  481. int i;
  482. for (i = 0; i < 4; i++)
  483. err_printf(m, " PDP%d: 0x%016llx\n",
  484. i, ee->vm_info.pdp[i]);
  485. } else {
  486. err_printf(m, " PP_DIR_BASE: 0x%08x\n",
  487. ee->vm_info.pp_dir_base);
  488. }
  489. }
  490. for (n = 0; n < ee->num_ports; n++) {
  491. err_printf(m, " ELSP[%d]:", n);
  492. error_print_request(m, " ", &ee->execlist[n]);
  493. }
  494. }
  495. void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
  496. {
  497. va_list args;
  498. va_start(args, f);
  499. i915_error_vprintf(e, f, args);
  500. va_end(args);
  501. }
  502. static void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
  503. const struct intel_engine_cs *engine,
  504. const struct i915_vma_coredump *vma)
  505. {
  506. char out[ASCII85_BUFSZ];
  507. struct page *page;
  508. if (!vma)
  509. return;
  510. err_printf(m, "%s --- %s = 0x%08x %08x\n",
  511. engine ? engine->name : "global", vma->name,
  512. upper_32_bits(vma->gtt_offset),
  513. lower_32_bits(vma->gtt_offset));
  514. if (vma->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
  515. err_printf(m, "gtt_page_sizes = 0x%08x\n", vma->gtt_page_sizes);
  516. err_compression_marker(m);
  517. list_for_each_entry(page, &vma->page_list, lru) {
  518. int i, len;
  519. const u32 *addr = page_address(page);
  520. len = PAGE_SIZE;
  521. if (page == list_last_entry(&vma->page_list, typeof(*page), lru))
  522. len -= vma->unused;
  523. len = ascii85_encode_len(len);
  524. for (i = 0; i < len; i++)
  525. err_puts(m, ascii85_encode(addr[i], out));
  526. }
  527. err_puts(m, "\n");
  528. }
  529. static void err_print_capabilities(struct drm_i915_error_state_buf *m,
  530. struct i915_gpu_coredump *error)
  531. {
  532. struct drm_printer p = i915_error_printer(m);
  533. intel_device_info_print(&error->device_info, &error->runtime_info, &p);
  534. intel_driver_caps_print(&error->driver_caps, &p);
  535. }
  536. static void err_print_params(struct drm_i915_error_state_buf *m,
  537. const struct i915_params *params)
  538. {
  539. struct drm_printer p = i915_error_printer(m);
  540. i915_params_dump(params, &p);
  541. }
  542. static void err_print_pciid(struct drm_i915_error_state_buf *m,
  543. struct drm_i915_private *i915)
  544. {
  545. struct pci_dev *pdev = to_pci_dev(i915->drm.dev);
  546. err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
  547. err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
  548. err_printf(m, "PCI Subsystem: %04x:%04x\n",
  549. pdev->subsystem_vendor,
  550. pdev->subsystem_device);
  551. }
  552. static void err_print_guc_ctb(struct drm_i915_error_state_buf *m,
  553. const char *name,
  554. const struct intel_ctb_coredump *ctb)
  555. {
  556. if (!ctb->size)
  557. return;
  558. err_printf(m, "GuC %s CTB: raw: 0x%08X, 0x%08X/%08X, cached: 0x%08X/%08X, desc = 0x%08X, buf = 0x%08X x 0x%08X\n",
  559. name, ctb->raw_status, ctb->raw_head, ctb->raw_tail,
  560. ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size);
  561. }
  562. /* This list includes registers that are useful in debugging GuC hangs. */
  563. static const struct {
  564. u32 start;
  565. u32 count;
  566. } guc_hw_reg_state[] = {
  567. { 0xc0b0, 2 },
  568. { 0xc000, 65 },
  569. { 0xc140, 1 },
  570. { 0xc180, 16 },
  571. { 0xc1dc, 10 },
  572. { 0xc300, 79 },
  573. { 0xc4b4, 47 },
  574. { 0xc574, 1 },
  575. { 0xc57c, 1 },
  576. { 0xc584, 11 },
  577. { 0xc5c0, 8 },
  578. { 0xc5e4, 1 },
  579. { 0xc5ec, 103 },
  580. { 0xc7c0, 1 },
  581. { 0xc0b0, 2 }
  582. };
  583. static u32 print_range_line(struct drm_i915_error_state_buf *m, u32 start, u32 *dump, u32 count)
  584. {
  585. if (count >= 8) {
  586. err_printf(m, "[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n",
  587. start, dump[0], dump[1], dump[2], dump[3],
  588. dump[4], dump[5], dump[6], dump[7]);
  589. return 8;
  590. } else if (count >= 4) {
  591. err_printf(m, "[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x\n",
  592. start, dump[0], dump[1], dump[2], dump[3]);
  593. return 4;
  594. } else if (count >= 2) {
  595. err_printf(m, "[0x%04x] 0x%08x 0x%08x\n", start, dump[0], dump[1]);
  596. return 2;
  597. }
  598. err_printf(m, "[0x%04x] 0x%08x\n", start, dump[0]);
  599. return 1;
  600. }
  601. static void err_print_guc_hw_state(struct drm_i915_error_state_buf *m, u32 *hw_state)
  602. {
  603. u32 total = 0;
  604. int i;
  605. if (!hw_state)
  606. return;
  607. err_printf(m, "GuC Register State:\n");
  608. for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++) {
  609. u32 entry = 0;
  610. while (entry < guc_hw_reg_state[i].count) {
  611. u32 start = guc_hw_reg_state[i].start + entry * sizeof(u32);
  612. u32 count = guc_hw_reg_state[i].count - entry;
  613. u32 *values = hw_state + total + entry;
  614. entry += print_range_line(m, start, values, count);
  615. }
  616. GEM_BUG_ON(entry != guc_hw_reg_state[i].count);
  617. total += entry;
  618. }
  619. }
  620. static void err_print_uc(struct drm_i915_error_state_buf *m,
  621. const struct intel_uc_coredump *error_uc)
  622. {
  623. struct drm_printer p = i915_error_printer(m);
  624. intel_uc_fw_dump(&error_uc->guc_fw, &p);
  625. intel_uc_fw_dump(&error_uc->huc_fw, &p);
  626. err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp);
  627. err_print_guc_hw_state(m, error_uc->guc.hw_state);
  628. intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log);
  629. err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence);
  630. err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0);
  631. err_print_guc_ctb(m, "Recv", error_uc->guc.ctb + 1);
  632. intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_ctb);
  633. }
  634. static void err_free_sgl(struct scatterlist *sgl)
  635. {
  636. while (sgl) {
  637. struct scatterlist *sg;
  638. for (sg = sgl; !sg_is_chain(sg); sg++) {
  639. kfree(sg_virt(sg));
  640. if (sg_is_last(sg))
  641. break;
  642. }
  643. sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
  644. free_page((unsigned long)sgl);
  645. sgl = sg;
  646. }
  647. }
  648. static void err_print_gt_info(struct drm_i915_error_state_buf *m,
  649. struct intel_gt_coredump *gt)
  650. {
  651. struct drm_printer p = i915_error_printer(m);
  652. intel_gt_info_print(&gt->info, &p);
  653. intel_sseu_print_topology(gt->_gt->i915, &gt->info.sseu, &p);
  654. }
  655. static void err_print_gt_global_nonguc(struct drm_i915_error_state_buf *m,
  656. struct intel_gt_coredump *gt)
  657. {
  658. int i;
  659. err_printf(m, "GT awake: %s\n", str_yes_no(gt->awake));
  660. err_printf(m, "CS timestamp frequency: %u Hz, %d ns\n",
  661. gt->clock_frequency, gt->clock_period_ns);
  662. err_printf(m, "EIR: 0x%08x\n", gt->eir);
  663. err_printf(m, "PGTBL_ER: 0x%08x\n", gt->pgtbl_er);
  664. for (i = 0; i < gt->ngtier; i++)
  665. err_printf(m, "GTIER[%d]: 0x%08x\n", i, gt->gtier[i]);
  666. }
  667. static void err_print_gt_global(struct drm_i915_error_state_buf *m,
  668. struct intel_gt_coredump *gt)
  669. {
  670. err_printf(m, "FORCEWAKE: 0x%08x\n", gt->forcewake);
  671. if (IS_GRAPHICS_VER(m->i915, 6, 11)) {
  672. err_printf(m, "ERROR: 0x%08x\n", gt->error);
  673. err_printf(m, "DONE_REG: 0x%08x\n", gt->done_reg);
  674. }
  675. if (GRAPHICS_VER(m->i915) >= 8)
  676. err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
  677. gt->fault_data1, gt->fault_data0);
  678. if (GRAPHICS_VER(m->i915) == 7)
  679. err_printf(m, "ERR_INT: 0x%08x\n", gt->err_int);
  680. if (IS_GRAPHICS_VER(m->i915, 8, 11))
  681. err_printf(m, "GTT_CACHE_EN: 0x%08x\n", gt->gtt_cache);
  682. if (GRAPHICS_VER(m->i915) == 12)
  683. err_printf(m, "AUX_ERR_DBG: 0x%08x\n", gt->aux_err);
  684. if (GRAPHICS_VER(m->i915) >= 12) {
  685. int i;
  686. for (i = 0; i < I915_MAX_SFC; i++) {
  687. /*
  688. * SFC_DONE resides in the VD forcewake domain, so it
  689. * only exists if the corresponding VCS engine is
  690. * present.
  691. */
  692. if ((gt->_gt->info.sfc_mask & BIT(i)) == 0 ||
  693. !HAS_ENGINE(gt->_gt, _VCS(i * 2)))
  694. continue;
  695. err_printf(m, " SFC_DONE[%d]: 0x%08x\n", i,
  696. gt->sfc_done[i]);
  697. }
  698. err_printf(m, " GAM_DONE: 0x%08x\n", gt->gam_done);
  699. }
  700. }
  701. static void err_print_gt_fences(struct drm_i915_error_state_buf *m,
  702. struct intel_gt_coredump *gt)
  703. {
  704. int i;
  705. for (i = 0; i < gt->nfence; i++)
  706. err_printf(m, " fence[%d] = %08llx\n", i, gt->fence[i]);
  707. }
  708. static void err_print_gt_engines(struct drm_i915_error_state_buf *m,
  709. struct intel_gt_coredump *gt)
  710. {
  711. const struct intel_engine_coredump *ee;
  712. for (ee = gt->engine; ee; ee = ee->next) {
  713. const struct i915_vma_coredump *vma;
  714. if (gt->uc && gt->uc->guc.is_guc_capture) {
  715. if (ee->guc_capture_node)
  716. intel_guc_capture_print_engine_node(m, ee);
  717. else
  718. err_printf(m, " Missing GuC capture node for %s\n",
  719. ee->engine->name);
  720. } else {
  721. error_print_engine(m, ee);
  722. }
  723. err_printf(m, " hung: %u\n", ee->hung);
  724. err_printf(m, " engine reset count: %u\n", ee->reset_count);
  725. error_print_context(m, " Active context: ", &ee->context);
  726. for (vma = ee->vma; vma; vma = vma->next)
  727. intel_gpu_error_print_vma(m, ee->engine, vma);
  728. }
  729. }
  730. static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
  731. struct i915_gpu_coredump *error)
  732. {
  733. struct drm_printer p = i915_error_printer(m);
  734. const struct intel_engine_coredump *ee;
  735. struct timespec64 ts;
  736. if (*error->error_msg)
  737. err_printf(m, "%s\n", error->error_msg);
  738. err_printf(m, "Kernel: %s %s\n",
  739. init_utsname()->release,
  740. init_utsname()->machine);
  741. ts = ktime_to_timespec64(error->time);
  742. err_printf(m, "Time: %lld s %ld us\n",
  743. (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
  744. ts = ktime_to_timespec64(error->boottime);
  745. err_printf(m, "Boottime: %lld s %ld us\n",
  746. (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
  747. ts = ktime_to_timespec64(error->uptime);
  748. err_printf(m, "Uptime: %lld s %ld us\n",
  749. (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
  750. err_printf(m, "Capture: %lu jiffies; %d ms ago\n",
  751. error->capture, jiffies_to_msecs(jiffies - error->capture));
  752. for (ee = error->gt ? error->gt->engine : NULL; ee; ee = ee->next)
  753. err_printf(m, "Active process (on ring %s): %s [%d]\n",
  754. ee->engine->name,
  755. ee->context.comm,
  756. ee->context.pid);
  757. err_printf(m, "Reset count: %u\n", error->reset_count);
  758. err_printf(m, "Suspend count: %u\n", error->suspend_count);
  759. err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
  760. err_printf(m, "Subplatform: 0x%x\n",
  761. intel_subplatform(&error->runtime_info,
  762. error->device_info.platform));
  763. err_print_pciid(m, m->i915);
  764. err_printf(m, "IOMMU enabled?: %d\n", error->iommu);
  765. err_printf(m, "RPM wakelock: %s\n", str_yes_no(error->wakelock));
  766. err_printf(m, "PM suspended: %s\n", str_yes_no(error->suspended));
  767. if (error->gt) {
  768. bool print_guc_capture = false;
  769. if (error->gt->uc && error->gt->uc->guc.is_guc_capture)
  770. print_guc_capture = true;
  771. err_print_gt_global_nonguc(m, error->gt);
  772. err_print_gt_fences(m, error->gt);
  773. /*
  774. * GuC dumped global, eng-class and eng-instance registers together
  775. * as part of engine state dump so we print in err_print_gt_engines
  776. */
  777. if (!print_guc_capture)
  778. err_print_gt_global(m, error->gt);
  779. err_print_gt_engines(m, error->gt);
  780. if (error->gt->uc)
  781. err_print_uc(m, error->gt->uc);
  782. err_print_gt_info(m, error->gt);
  783. }
  784. err_print_capabilities(m, error);
  785. err_print_params(m, &error->params);
  786. intel_display_snapshot_print(error->display_snapshot, &p);
  787. }
  788. static int err_print_to_sgl(struct i915_gpu_coredump *error)
  789. {
  790. struct drm_i915_error_state_buf m;
  791. if (IS_ERR(error))
  792. return PTR_ERR(error);
  793. if (READ_ONCE(error->sgl))
  794. return 0;
  795. memset(&m, 0, sizeof(m));
  796. m.i915 = error->i915;
  797. __err_print_to_sgl(&m, error);
  798. if (m.buf) {
  799. __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
  800. m.bytes = 0;
  801. m.buf = NULL;
  802. }
  803. if (m.cur) {
  804. GEM_BUG_ON(m.end < m.cur);
  805. sg_mark_end(m.cur - 1);
  806. }
  807. GEM_BUG_ON(m.sgl && !m.cur);
  808. if (m.err) {
  809. err_free_sgl(m.sgl);
  810. return m.err;
  811. }
  812. if (cmpxchg(&error->sgl, NULL, m.sgl))
  813. err_free_sgl(m.sgl);
  814. return 0;
  815. }
  816. ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
  817. char *buf, loff_t off, size_t rem)
  818. {
  819. struct scatterlist *sg;
  820. size_t count;
  821. loff_t pos;
  822. int err;
  823. if (!error || !rem)
  824. return 0;
  825. err = err_print_to_sgl(error);
  826. if (err)
  827. return err;
  828. sg = READ_ONCE(error->fit);
  829. if (!sg || off < sg->dma_address)
  830. sg = error->sgl;
  831. if (!sg)
  832. return 0;
  833. pos = sg->dma_address;
  834. count = 0;
  835. do {
  836. size_t len, start;
  837. if (sg_is_chain(sg)) {
  838. sg = sg_chain_ptr(sg);
  839. GEM_BUG_ON(sg_is_chain(sg));
  840. }
  841. len = sg->length;
  842. if (pos + len <= off) {
  843. pos += len;
  844. continue;
  845. }
  846. start = sg->offset;
  847. if (pos < off) {
  848. GEM_BUG_ON(off - pos > len);
  849. len -= off - pos;
  850. start += off - pos;
  851. pos = off;
  852. }
  853. len = min(len, rem);
  854. GEM_BUG_ON(!len || len > sg->length);
  855. memcpy(buf, page_address(sg_page(sg)) + start, len);
  856. count += len;
  857. pos += len;
  858. buf += len;
  859. rem -= len;
  860. if (!rem) {
  861. WRITE_ONCE(error->fit, sg);
  862. break;
  863. }
  864. } while (!sg_is_last(sg++));
  865. return count;
  866. }
  867. static void i915_vma_coredump_free(struct i915_vma_coredump *vma)
  868. {
  869. while (vma) {
  870. struct i915_vma_coredump *next = vma->next;
  871. struct page *page, *n;
  872. list_for_each_entry_safe(page, n, &vma->page_list, lru) {
  873. list_del_init(&page->lru);
  874. __free_page(page);
  875. }
  876. kfree(vma);
  877. vma = next;
  878. }
  879. }
  880. static void cleanup_params(struct i915_gpu_coredump *error)
  881. {
  882. i915_params_free(&error->params);
  883. }
  884. static void cleanup_uc(struct intel_uc_coredump *uc)
  885. {
  886. kfree(uc->guc_fw.file_selected.path);
  887. kfree(uc->huc_fw.file_selected.path);
  888. kfree(uc->guc_fw.file_wanted.path);
  889. kfree(uc->huc_fw.file_wanted.path);
  890. i915_vma_coredump_free(uc->guc.vma_log);
  891. i915_vma_coredump_free(uc->guc.vma_ctb);
  892. kfree(uc->guc.hw_state);
  893. kfree(uc);
  894. }
  895. static void cleanup_gt(struct intel_gt_coredump *gt)
  896. {
  897. while (gt->engine) {
  898. struct intel_engine_coredump *ee = gt->engine;
  899. gt->engine = ee->next;
  900. i915_vma_coredump_free(ee->vma);
  901. intel_guc_capture_free_node(ee);
  902. kfree(ee);
  903. }
  904. if (gt->uc)
  905. cleanup_uc(gt->uc);
  906. kfree(gt);
  907. }
  908. void __i915_gpu_coredump_free(struct kref *error_ref)
  909. {
  910. struct i915_gpu_coredump *error =
  911. container_of(error_ref, typeof(*error), ref);
  912. while (error->gt) {
  913. struct intel_gt_coredump *gt = error->gt;
  914. error->gt = gt->next;
  915. cleanup_gt(gt);
  916. }
  917. intel_display_snapshot_free(error->display_snapshot);
  918. cleanup_params(error);
  919. err_free_sgl(error->sgl);
  920. kfree(error);
  921. }
  /*
   * Capture the contents of the buffer described by @vma_res into a newly
   * allocated coredump, feeding it page by page through @compress.
   *
   * One of three read paths is used:
   *  - if the GGTT error-capture node is allocated, each page is bound to
   *    that slot and read through a write-combining mapping of the GGTT;
   *  - otherwise, if the resource is marked lmem, each page is read through
   *    a write-combining mapping of the memory region;
   *  - otherwise each struct page is mapped with kmap_local_page().
   *
   * Returns the coredump labelled @name, or NULL when there is nothing to
   * capture (@vma_res, its pages or @compress missing), when allocation or
   * compress_start() fails, or when compressing/flushing fails.
   */
  static struct i915_vma_coredump *
  i915_vma_coredump_create(const struct intel_gt *gt,
                           const struct i915_vma_resource *vma_res,
                           struct i915_vma_compress *compress,
                           const char *name)
  {
      struct i915_ggtt *ggtt = gt->ggtt;
      const u64 slot = ggtt->error_capture.start;
      struct i915_vma_coredump *dst;
      struct sgt_iter iter;
      int ret;

      might_sleep();

      if (!vma_res || !vma_res->bi.pages || !compress)
          return NULL;

      dst = kmalloc_obj(*dst, ALLOW_FAIL);
      if (!dst)
          return NULL;

      if (!compress_start(compress)) {
          kfree(dst);
          return NULL;
      }

      INIT_LIST_HEAD(&dst->page_list);
      strscpy(dst->name, name);
      dst->next = NULL;

      dst->gtt_offset = vma_res->start;
      dst->gtt_size = vma_res->node_size;
      dst->gtt_page_sizes = vma_res->page_sizes_gtt;
      dst->unused = 0;

      /* Stays -EINVAL if the loops below never process a page. */
      ret = -EINVAL;
      if (drm_mm_node_allocated(&ggtt->error_capture)) {
          void __iomem *s;
          dma_addr_t dma;

          for_each_sgt_daddr(dma, iter, vma_res->bi.pages) {
              /* The capture slot is used under error_mutex. */
              mutex_lock(&ggtt->error_mutex);
              if (ggtt->vm.raw_insert_page)
                  ggtt->vm.raw_insert_page(&ggtt->vm, dma, slot,
                                           i915_gem_get_pat_index(gt->i915,
                                                                  I915_CACHE_NONE),
                                           0);
              else
                  ggtt->vm.insert_page(&ggtt->vm, dma, slot,
                                       i915_gem_get_pat_index(gt->i915,
                                                              I915_CACHE_NONE),
                                       0);
              mb();

              s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
              ret = compress_page(compress,
                                  (void __force *)s, dst,
                                  true);
              io_mapping_unmap(s);

              mb();
              /* Unbind the page from the slot before dropping the lock. */
              ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
              mutex_unlock(&ggtt->error_mutex);
              if (ret)
                  break;
          }
      } else if (vma_res->bi.lmem) {
          struct intel_memory_region *mem = vma_res->mr;
          dma_addr_t dma;

          for_each_sgt_daddr(dma, iter, vma_res->bi.pages) {
              dma_addr_t offset = dma - mem->region.start;
              void __iomem *s;

              /* Refuse pages that extend past the region's io resource. */
              if (offset + PAGE_SIZE > resource_size(&mem->io)) {
                  ret = -EINVAL;
                  break;
              }

              s = io_mapping_map_wc(&mem->iomap, offset, PAGE_SIZE);
              ret = compress_page(compress,
                                  (void __force *)s, dst,
                                  true);
              io_mapping_unmap(s);
              if (ret)
                  break;
          }
      } else {
          struct page *page;

          for_each_sgt_page(page, iter, vma_res->bi.pages) {
              void *s;

              /* Flush the page's cachelines before reading it... */
              drm_clflush_pages(&page, 1);

              s = kmap_local_page(page);
              ret = compress_page(compress, s, dst, false);
              kunmap_local(s);

              /* ...and again once the copy is done. */
              drm_clflush_pages(&page, 1);

              if (ret)
                  break;
          }
      }

      if (ret || compress_flush(compress, dst)) {
          struct page *page, *n;

          /* On failure, hand the pages collected so far back to the pool. */
          list_for_each_entry_safe_reverse(page, n, &dst->page_list, lru) {
              list_del_init(&page->lru);
              pool_free(&compress->pool, page_address(page));
          }

          kfree(dst);
          dst = NULL;
      }
      compress_finish(compress);

      return dst;
  }
  1021. static void gt_record_fences(struct intel_gt_coredump *gt)
  1022. {
  1023. struct i915_ggtt *ggtt = gt->_gt->ggtt;
  1024. struct intel_uncore *uncore = gt->_gt->uncore;
  1025. int i;
  1026. if (GRAPHICS_VER(uncore->i915) >= 6) {
  1027. for (i = 0; i < ggtt->num_fences; i++)
  1028. gt->fence[i] =
  1029. intel_uncore_read64(uncore,
  1030. FENCE_REG_GEN6_LO(i));
  1031. } else if (GRAPHICS_VER(uncore->i915) >= 4) {
  1032. for (i = 0; i < ggtt->num_fences; i++)
  1033. gt->fence[i] =
  1034. intel_uncore_read64(uncore,
  1035. FENCE_REG_965_LO(i));
  1036. } else {
  1037. for (i = 0; i < ggtt->num_fences; i++)
  1038. gt->fence[i] =
  1039. intel_uncore_read(uncore, FENCE_REG(i));
  1040. }
  1041. gt->nfence = i;
  1042. }
  /*
   * Read the engine's registers into @ee. Which registers are read, and
   * from which offsets, depends on the graphics version of the device.
   */
  static void engine_record_registers(struct intel_engine_coredump *ee)
  {
      const struct intel_engine_cs *engine = ee->engine;
      struct drm_i915_private *i915 = engine->i915;

      if (GRAPHICS_VER(i915) >= 6) {
          ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);

          /*
           * For the media GT, this ring fault register is not replicated,
           * so don't do multicast/replicated register read/write
           * operation on it.
           */
          if (MEDIA_VER(i915) >= 13 && engine->gt->type == GT_MEDIA)
              ee->fault_reg = intel_uncore_read(engine->uncore,
                                                XELPMP_RING_FAULT_REG);
          else if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55))
              ee->fault_reg = intel_gt_mcr_read_any(engine->gt,
                                                    XEHP_RING_FAULT_REG);
          else if (GRAPHICS_VER(i915) >= 12)
              ee->fault_reg = intel_uncore_read(engine->uncore,
                                                GEN12_RING_FAULT_REG);
          else if (GRAPHICS_VER(i915) >= 8)
              ee->fault_reg = intel_uncore_read(engine->uncore,
                                                GEN8_RING_FAULT_REG);
          else
              ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
      }

      if (GRAPHICS_VER(i915) >= 4) {
          ee->esr = ENGINE_READ(engine, RING_ESR);
          ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
          ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
          ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
          ee->instps = ENGINE_READ(engine, RING_INSTPS);
          ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
          ee->ccid = ENGINE_READ(engine, CCID);
          /* From version 8 on, the upper dwords are merged in as bits 63:32. */
          if (GRAPHICS_VER(i915) >= 8) {
              ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
              ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
          }
          ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
      } else {
          /* Versions below 4 use a different set of registers. */
          ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
          ee->ipeir = ENGINE_READ(engine, IPEIR);
          ee->ipehr = ENGINE_READ(engine, IPEHR);
      }

      if (GRAPHICS_VER(i915) >= 11) {
          ee->cmd_cctl = ENGINE_READ(engine, RING_CMD_CCTL);
          ee->cscmdop = ENGINE_READ(engine, RING_CSCMDOP);
          ee->ctx_sr_ctl = ENGINE_READ(engine, RING_CTX_SR_CTL);
          ee->dma_faddr_hi = ENGINE_READ(engine, RING_DMA_FADD_UDW);
          ee->dma_faddr_lo = ENGINE_READ(engine, RING_DMA_FADD);
          ee->nopid = ENGINE_READ(engine, RING_NOPID);
          ee->excc = ENGINE_READ(engine, RING_EXCC);
      }

      intel_engine_get_instdone(engine, &ee->instdone);

      ee->instpm = ENGINE_READ(engine, RING_INSTPM);
      ee->acthd = intel_engine_get_active_head(engine);
      ee->start = ENGINE_READ(engine, RING_START);
      ee->head = ENGINE_READ(engine, RING_HEAD);
      ee->tail = ENGINE_READ(engine, RING_TAIL);
      ee->ctl = ENGINE_READ(engine, RING_CTL);
      if (GRAPHICS_VER(i915) > 2)
          ee->mode = ENGINE_READ(engine, RING_MI_MODE);

      /* ee->hws is only read when HWS_NEEDS_PHYSICAL() is false. */
      if (!HWS_NEEDS_PHYSICAL(i915)) {
          i915_reg_t mmio;

          if (GRAPHICS_VER(i915) == 7) {
              /* Version 7 has a dedicated register per engine. */
              switch (engine->id) {
              default:
                  /* Unknown engine: report it, then use the RCS0 register. */
                  MISSING_CASE(engine->id);
                  fallthrough;
              case RCS0:
                  mmio = RENDER_HWS_PGA_GEN7;
                  break;
              case BCS0:
                  mmio = BLT_HWS_PGA_GEN7;
                  break;
              case VCS0:
                  mmio = BSD_HWS_PGA_GEN7;
                  break;
              case VECS0:
                  mmio = VEBOX_HWS_PGA_GEN7;
                  break;
              }
          } else if (GRAPHICS_VER(engine->i915) == 6) {
              mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
          } else {
              /* XXX: gen8 returns to sanity */
              mmio = RING_HWS_PGA(engine->mmio_base);
          }

          ee->hws = intel_uncore_read(engine->uncore, mmio);
      }

      ee->reset_count = i915_reset_engine_count(&i915->gpu_error, engine);

      if (HAS_PPGTT(i915)) {
          int i;

          ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);

          if (GRAPHICS_VER(i915) == 6) {
              ee->vm_info.pp_dir_base =
                  ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
          } else if (GRAPHICS_VER(i915) == 7) {
              ee->vm_info.pp_dir_base =
                  ENGINE_READ(engine, RING_PP_DIR_BASE);
          } else if (GRAPHICS_VER(i915) >= 8) {
              u32 base = engine->mmio_base;

              /* Build each 64-bit PDP value from its upper and lower dwords. */
              for (i = 0; i < 4; i++) {
                  ee->vm_info.pdp[i] =
                      intel_uncore_read(engine->uncore,
                                        GEN8_RING_PDP_UDW(base, i));
                  ee->vm_info.pdp[i] <<= 32;
                  ee->vm_info.pdp[i] |=
                      intel_uncore_read(engine->uncore,
                                        GEN8_RING_PDP_LDW(base, i));
              }
          }
      }
  }
  1157. static void record_request(const struct i915_request *request,
  1158. struct i915_request_coredump *erq)
  1159. {
  1160. erq->flags = request->fence.flags;
  1161. erq->context = request->fence.context;
  1162. erq->seqno = request->fence.seqno;
  1163. erq->sched_attr = request->sched.attr;
  1164. erq->head = request->head;
  1165. erq->tail = request->tail;
  1166. erq->pid = 0;
  1167. rcu_read_lock();
  1168. if (!intel_context_is_closed(request->context)) {
  1169. const struct i915_gem_context *ctx;
  1170. ctx = rcu_dereference(request->context->gem_context);
  1171. if (ctx)
  1172. erq->pid = pid_nr(ctx->pid);
  1173. }
  1174. rcu_read_unlock();
  1175. }
  1176. static void engine_record_execlists(struct intel_engine_coredump *ee)
  1177. {
  1178. const struct intel_engine_execlists * const el = &ee->engine->execlists;
  1179. struct i915_request * const *port = el->active;
  1180. unsigned int n = 0;
  1181. while (*port)
  1182. record_request(*port++, &ee->execlist[n++]);
  1183. ee->num_ports = n;
  1184. }
  1185. static bool record_context(struct i915_gem_context_coredump *e,
  1186. struct intel_context *ce)
  1187. {
  1188. struct i915_gem_context *ctx;
  1189. struct task_struct *task;
  1190. bool simulated;
  1191. rcu_read_lock();
  1192. ctx = rcu_dereference(ce->gem_context);
  1193. if (ctx && !kref_get_unless_zero(&ctx->ref))
  1194. ctx = NULL;
  1195. rcu_read_unlock();
  1196. if (!ctx)
  1197. return true;
  1198. rcu_read_lock();
  1199. task = pid_task(ctx->pid, PIDTYPE_PID);
  1200. if (task) {
  1201. strscpy(e->comm, task->comm);
  1202. e->pid = task->pid;
  1203. }
  1204. rcu_read_unlock();
  1205. e->sched_attr = ctx->sched;
  1206. e->guilty = atomic_read(&ctx->guilty_count);
  1207. e->active = atomic_read(&ctx->active_count);
  1208. e->hwsp_seqno = (ce->timeline && ce->timeline->hwsp_seqno) ?
  1209. *ce->timeline->hwsp_seqno : ~0U;
  1210. e->total_runtime = intel_context_get_total_runtime_ns(ce);
  1211. e->avg_runtime = intel_context_get_avg_runtime_ns(ce);
  1212. simulated = i915_gem_context_no_error_capture(ctx);
  1213. i915_gem_context_put(ctx);
  1214. return simulated;
  1215. }
  /*
   * One entry in the singly linked list of buffers queued for capture.
   * Entries are created by capture_vma_snapshot() and consumed and freed by
   * intel_engine_coredump_add_vma().
   */
  struct intel_engine_capture_vma {
      /* Next queued entry, or NULL at the end of the list. */
      struct intel_engine_capture_vma *next;
      /* Resource to capture; held and referenced until it is consumed. */
      struct i915_vma_resource *vma_res;
      /* Label copied into the resulting coredump. */
      char name[16];
      /* Set by i915_vma_resource_hold(), passed back to ..._unhold(). */
      bool lockdep_cookie;
  };
  1222. static struct intel_engine_capture_vma *
  1223. capture_vma_snapshot(struct intel_engine_capture_vma *next,
  1224. struct i915_vma_resource *vma_res,
  1225. gfp_t gfp, const char *name)
  1226. {
  1227. struct intel_engine_capture_vma *c;
  1228. if (!vma_res)
  1229. return next;
  1230. c = kmalloc_obj(*c, gfp);
  1231. if (!c)
  1232. return next;
  1233. if (!i915_vma_resource_hold(vma_res, &c->lockdep_cookie)) {
  1234. kfree(c);
  1235. return next;
  1236. }
  1237. strscpy(c->name, name);
  1238. c->vma_res = i915_vma_resource_get(vma_res);
  1239. c->next = next;
  1240. return c;
  1241. }
  1242. static struct intel_engine_capture_vma *
  1243. capture_vma(struct intel_engine_capture_vma *next,
  1244. struct i915_vma *vma,
  1245. const char *name,
  1246. gfp_t gfp)
  1247. {
  1248. if (!vma)
  1249. return next;
  1250. /*
  1251. * If the vma isn't pinned, then the vma should be snapshotted
  1252. * to a struct i915_vma_snapshot at command submission time.
  1253. * Not here.
  1254. */
  1255. if (GEM_WARN_ON(!i915_vma_is_pinned(vma)))
  1256. return next;
  1257. next = capture_vma_snapshot(next, vma->resource, gfp, name);
  1258. return next;
  1259. }
  1260. static struct intel_engine_capture_vma *
  1261. capture_user(struct intel_engine_capture_vma *capture,
  1262. const struct i915_request *rq,
  1263. gfp_t gfp)
  1264. {
  1265. struct i915_capture_list *c;
  1266. for (c = rq->capture_list; c; c = c->next)
  1267. capture = capture_vma_snapshot(capture, c->vma_res, gfp,
  1268. "user");
  1269. return capture;
  1270. }
  1271. static void add_vma(struct intel_engine_coredump *ee,
  1272. struct i915_vma_coredump *vma)
  1273. {
  1274. if (vma) {
  1275. vma->next = ee->vma;
  1276. ee->vma = vma;
  1277. }
  1278. }
  1279. static struct i915_vma_coredump *
  1280. create_vma_coredump(const struct intel_gt *gt, struct i915_vma *vma,
  1281. const char *name, struct i915_vma_compress *compress)
  1282. {
  1283. struct i915_vma_coredump *ret = NULL;
  1284. struct i915_vma_resource *vma_res;
  1285. bool lockdep_cookie;
  1286. if (!vma)
  1287. return NULL;
  1288. vma_res = vma->resource;
  1289. if (i915_vma_resource_hold(vma_res, &lockdep_cookie)) {
  1290. ret = i915_vma_coredump_create(gt, vma_res, compress, name);
  1291. i915_vma_resource_unhold(vma_res, lockdep_cookie);
  1292. }
  1293. return ret;
  1294. }
  /* Capture @vma under the label @name and attach the result, if any, to @ee. */
  static void add_vma_coredump(struct intel_engine_coredump *ee,
                               const struct intel_gt *gt,
                               struct i915_vma *vma,
                               const char *name,
                               struct i915_vma_compress *compress)
  {
      struct i915_vma_coredump *dump;

      dump = create_vma_coredump(gt, vma, name, compress);
      add_vma(ee, dump);
  }
  1303. struct intel_engine_coredump *
  1304. intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags)
  1305. {
  1306. struct intel_engine_coredump *ee;
  1307. ee = kzalloc_obj(*ee, gfp);
  1308. if (!ee)
  1309. return NULL;
  1310. ee->engine = engine;
  1311. if (!(dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)) {
  1312. engine_record_registers(ee);
  1313. engine_record_execlists(ee);
  1314. }
  1315. return ee;
  1316. }
  1317. static struct intel_engine_capture_vma *
  1318. engine_coredump_add_context(struct intel_engine_coredump *ee,
  1319. struct intel_context *ce,
  1320. gfp_t gfp)
  1321. {
  1322. struct intel_engine_capture_vma *vma = NULL;
  1323. ee->simulated |= record_context(&ee->context, ce);
  1324. if (ee->simulated)
  1325. return NULL;
  1326. /*
  1327. * We need to copy these to an anonymous buffer
  1328. * as the simplest method to avoid being overwritten
  1329. * by userspace.
  1330. */
  1331. vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
  1332. vma = capture_vma(vma, ce->state, "HW context", gfp);
  1333. return vma;
  1334. }
  1335. struct intel_engine_capture_vma *
  1336. intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
  1337. struct i915_request *rq,
  1338. gfp_t gfp)
  1339. {
  1340. struct intel_engine_capture_vma *vma;
  1341. vma = engine_coredump_add_context(ee, rq->context, gfp);
  1342. if (!vma)
  1343. return NULL;
  1344. /*
  1345. * We need to copy these to an anonymous buffer
  1346. * as the simplest method to avoid being overwritten
  1347. * by userspace.
  1348. */
  1349. vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
  1350. vma = capture_user(vma, rq, gfp);
  1351. ee->rq_head = rq->head;
  1352. ee->rq_post = rq->postfix;
  1353. ee->rq_tail = rq->tail;
  1354. return vma;
  1355. }
  1356. void
  1357. intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
  1358. struct intel_engine_capture_vma *capture,
  1359. struct i915_vma_compress *compress)
  1360. {
  1361. const struct intel_engine_cs *engine = ee->engine;
  1362. while (capture) {
  1363. struct intel_engine_capture_vma *this = capture;
  1364. struct i915_vma_resource *vma_res = this->vma_res;
  1365. add_vma(ee,
  1366. i915_vma_coredump_create(engine->gt, vma_res,
  1367. compress, this->name));
  1368. i915_vma_resource_unhold(vma_res, this->lockdep_cookie);
  1369. i915_vma_resource_put(vma_res);
  1370. capture = this->next;
  1371. kfree(this);
  1372. }
  1373. add_vma_coredump(ee, engine->gt, engine->status_page.vma,
  1374. "HW Status", compress);
  1375. add_vma_coredump(ee, engine->gt, engine->wa_ctx.vma,
  1376. "WA context", compress);
  1377. }
  1378. static struct intel_engine_coredump *
  1379. capture_engine(struct intel_engine_cs *engine,
  1380. struct i915_vma_compress *compress,
  1381. u32 dump_flags)
  1382. {
  1383. struct intel_engine_capture_vma *capture = NULL;
  1384. struct intel_engine_coredump *ee;
  1385. struct intel_context *ce = NULL;
  1386. struct i915_request *rq = NULL;
  1387. ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags);
  1388. if (!ee)
  1389. return NULL;
  1390. intel_engine_get_hung_entity(engine, &ce, &rq);
  1391. if (rq && !i915_request_started(rq)) {
  1392. /*
  1393. * We want to know also what is the guc_id of the context,
  1394. * but if we don't have the context reference, then skip
  1395. * printing it.
  1396. */
  1397. if (ce)
  1398. drm_info(&engine->gt->i915->drm,
  1399. "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n",
  1400. engine->name, rq->fence.context, rq->fence.seqno, ce->guc_id.id);
  1401. else
  1402. drm_info(&engine->gt->i915->drm,
  1403. "Got hung context on %s with active request %lld:%lld not yet started\n",
  1404. engine->name, rq->fence.context, rq->fence.seqno);
  1405. }
  1406. if (rq) {
  1407. capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
  1408. i915_request_put(rq);
  1409. } else if (ce) {
  1410. capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL);
  1411. }
  1412. if (capture) {
  1413. intel_engine_coredump_add_vma(ee, capture, compress);
  1414. if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
  1415. intel_guc_capture_get_matching_node(engine->gt, ee, ce);
  1416. } else {
  1417. kfree(ee);
  1418. ee = NULL;
  1419. }
  1420. return ee;
  1421. }
  1422. static void
  1423. gt_record_engines(struct intel_gt_coredump *gt,
  1424. intel_engine_mask_t engine_mask,
  1425. struct i915_vma_compress *compress,
  1426. u32 dump_flags)
  1427. {
  1428. struct intel_engine_cs *engine;
  1429. enum intel_engine_id id;
  1430. for_each_engine(engine, gt->_gt, id) {
  1431. struct intel_engine_coredump *ee;
  1432. /* Refill our page pool before entering atomic section */
  1433. pool_refill(&compress->pool, ALLOW_FAIL);
  1434. ee = capture_engine(engine, compress, dump_flags);
  1435. if (!ee)
  1436. continue;
  1437. ee->hung = engine->mask & engine_mask;
  1438. gt->simulated |= ee->simulated;
  1439. if (ee->simulated) {
  1440. if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
  1441. intel_guc_capture_free_node(ee);
  1442. kfree(ee);
  1443. continue;
  1444. }
  1445. ee->next = gt->engine;
  1446. gt->engine = ee;
  1447. }
  1448. }
  1449. static void gt_record_guc_ctb(struct intel_ctb_coredump *saved,
  1450. const struct intel_guc_ct_buffer *ctb,
  1451. const void *blob_ptr, struct intel_guc *guc)
  1452. {
  1453. if (!ctb || !ctb->desc)
  1454. return;
  1455. saved->raw_status = ctb->desc->status;
  1456. saved->raw_head = ctb->desc->head;
  1457. saved->raw_tail = ctb->desc->tail;
  1458. saved->head = ctb->head;
  1459. saved->tail = ctb->tail;
  1460. saved->size = ctb->size;
  1461. saved->desc_offset = ((void *)ctb->desc) - blob_ptr;
  1462. saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr;
  1463. }
  1464. static u32 read_guc_state_reg(struct intel_uncore *uncore, int range, int count)
  1465. {
  1466. GEM_BUG_ON(range >= ARRAY_SIZE(guc_hw_reg_state));
  1467. GEM_BUG_ON(count >= guc_hw_reg_state[range].count);
  1468. return intel_uncore_read(uncore,
  1469. _MMIO(guc_hw_reg_state[range].start + count * sizeof(u32)));
  1470. }
  1471. static void gt_record_guc_hw_state(struct intel_uncore *uncore,
  1472. struct intel_uc_coredump *error_uc)
  1473. {
  1474. u32 *hw_state;
  1475. u32 count = 0;
  1476. int i, j;
  1477. for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++)
  1478. count += guc_hw_reg_state[i].count;
  1479. hw_state = kcalloc(count, sizeof(u32), ALLOW_FAIL);
  1480. if (!hw_state)
  1481. return;
  1482. count = 0;
  1483. for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++)
  1484. for (j = 0; j < guc_hw_reg_state[i].count; j++)
  1485. hw_state[count++] = read_guc_state_reg(uncore, i, j);
  1486. error_uc->guc.hw_state = hw_state;
  1487. }
  1488. static struct intel_uc_coredump *
  1489. gt_record_uc(struct intel_gt_coredump *gt,
  1490. struct i915_vma_compress *compress)
  1491. {
  1492. const struct intel_uc *uc = &gt->_gt->uc;
  1493. struct intel_uc_coredump *error_uc;
  1494. error_uc = kzalloc_obj(*error_uc, ALLOW_FAIL);
  1495. if (!error_uc)
  1496. return NULL;
  1497. memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw));
  1498. memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw));
  1499. error_uc->guc_fw.file_selected.path = kstrdup(uc->guc.fw.file_selected.path, ALLOW_FAIL);
  1500. error_uc->huc_fw.file_selected.path = kstrdup(uc->huc.fw.file_selected.path, ALLOW_FAIL);
  1501. error_uc->guc_fw.file_wanted.path = kstrdup(uc->guc.fw.file_wanted.path, ALLOW_FAIL);
  1502. error_uc->huc_fw.file_wanted.path = kstrdup(uc->huc.fw.file_wanted.path, ALLOW_FAIL);
  1503. /*
  1504. * Save the GuC log and include a timestamp reference for converting the
  1505. * log times to system times (in conjunction with the error->boottime and
  1506. * gt->clock_frequency fields saved elsewhere).
  1507. */
  1508. error_uc->guc.timestamp = intel_uncore_read(gt->_gt->uncore, GUCPMTIMESTAMP);
  1509. error_uc->guc.vma_log = create_vma_coredump(gt->_gt, uc->guc.log.vma,
  1510. "GuC log buffer", compress);
  1511. error_uc->guc.vma_ctb = create_vma_coredump(gt->_gt, uc->guc.ct.vma,
  1512. "GuC CT buffer", compress);
  1513. error_uc->guc.last_fence = uc->guc.ct.requests.last_fence;
  1514. gt_record_guc_ctb(error_uc->guc.ctb + 0, &uc->guc.ct.ctbs.send,
  1515. uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
  1516. gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv,
  1517. uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
  1518. gt_record_guc_hw_state(gt->_gt->uncore, error_uc);
  1519. return error_uc;
  1520. }
  1521. /* Capture all other registers that GuC doesn't capture. */
  1522. static void gt_record_global_nonguc_regs(struct intel_gt_coredump *gt)
  1523. {
  1524. struct intel_uncore *uncore = gt->_gt->uncore;
  1525. struct drm_i915_private *i915 = uncore->i915;
  1526. int i;
  1527. if (IS_VALLEYVIEW(i915)) {
  1528. gt->gtier[0] = intel_uncore_read(uncore, GTIER);
  1529. gt->ngtier = 1;
  1530. } else if (GRAPHICS_VER(i915) >= 11) {
  1531. gt->gtier[0] =
  1532. intel_uncore_read(uncore,
  1533. GEN11_RENDER_COPY_INTR_ENABLE);
  1534. gt->gtier[1] =
  1535. intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
  1536. gt->gtier[2] =
  1537. intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
  1538. gt->gtier[3] =
  1539. intel_uncore_read(uncore,
  1540. GEN11_GPM_WGBOXPERF_INTR_ENABLE);
  1541. gt->gtier[4] =
  1542. intel_uncore_read(uncore,
  1543. GEN11_CRYPTO_RSVD_INTR_ENABLE);
  1544. gt->gtier[5] =
  1545. intel_uncore_read(uncore,
  1546. GEN11_GUNIT_CSME_INTR_ENABLE);
  1547. gt->ngtier = 6;
  1548. } else if (GRAPHICS_VER(i915) >= 8) {
  1549. for (i = 0; i < 4; i++)
  1550. gt->gtier[i] =
  1551. intel_uncore_read(uncore, GEN8_GT_IER(i));
  1552. gt->ngtier = 4;
  1553. } else if (GRAPHICS_VER(i915) >= 5) {
  1554. gt->gtier[0] = intel_uncore_read(uncore, GTIER);
  1555. gt->ngtier = 1;
  1556. } else {
  1557. gt->gtier[0] = intel_uncore_read(uncore, GEN2_IER);
  1558. gt->ngtier = 1;
  1559. }
  1560. gt->eir = intel_uncore_read(uncore, EIR);
  1561. gt->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
  1562. }
  1563. /*
  1564. * Capture all registers that relate to workload submission.
  1565. * NOTE: In GuC submission, when GuC resets an engine, it can dump these for us
  1566. */
  1567. static void gt_record_global_regs(struct intel_gt_coredump *gt)
  1568. {
  1569. struct intel_uncore *uncore = gt->_gt->uncore;
  1570. struct drm_i915_private *i915 = uncore->i915;
  1571. int i;
  1572. /*
  1573. * General organization
  1574. * 1. Registers specific to a single generation
  1575. * 2. Registers which belong to multiple generations
  1576. * 3. Feature specific registers.
  1577. * 4. Everything else
  1578. * Please try to follow the order.
  1579. */
  1580. /* 1: Registers specific to a single generation */
  1581. if (IS_VALLEYVIEW(i915))
  1582. gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
  1583. if (GRAPHICS_VER(i915) == 7)
  1584. gt->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);
  1585. if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55)) {
  1586. gt->fault_data0 = intel_gt_mcr_read_any((struct intel_gt *)gt->_gt,
  1587. XEHP_FAULT_TLB_DATA0);
  1588. gt->fault_data1 = intel_gt_mcr_read_any((struct intel_gt *)gt->_gt,
  1589. XEHP_FAULT_TLB_DATA1);
  1590. } else if (GRAPHICS_VER(i915) >= 12) {
  1591. gt->fault_data0 = intel_uncore_read(uncore,
  1592. GEN12_FAULT_TLB_DATA0);
  1593. gt->fault_data1 = intel_uncore_read(uncore,
  1594. GEN12_FAULT_TLB_DATA1);
  1595. } else if (GRAPHICS_VER(i915) >= 8) {
  1596. gt->fault_data0 = intel_uncore_read(uncore,
  1597. GEN8_FAULT_TLB_DATA0);
  1598. gt->fault_data1 = intel_uncore_read(uncore,
  1599. GEN8_FAULT_TLB_DATA1);
  1600. }
  1601. if (GRAPHICS_VER(i915) == 6) {
  1602. gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
  1603. gt->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
  1604. gt->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
  1605. }
  1606. /* 2: Registers which belong to multiple generations */
  1607. if (GRAPHICS_VER(i915) >= 7)
  1608. gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);
  1609. if (GRAPHICS_VER(i915) >= 6) {
  1610. if (GRAPHICS_VER(i915) < 12) {
  1611. gt->error = intel_uncore_read(uncore, ERROR_GEN6);
  1612. gt->done_reg = intel_uncore_read(uncore, DONE_REG);
  1613. }
  1614. }
  1615. /* 3: Feature specific registers */
  1616. if (IS_GRAPHICS_VER(i915, 6, 7)) {
  1617. gt->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
  1618. gt->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
  1619. }
  1620. if (IS_GRAPHICS_VER(i915, 8, 11))
  1621. gt->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);
  1622. if (GRAPHICS_VER(i915) == 12)
  1623. gt->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG);
  1624. if (GRAPHICS_VER(i915) >= 12) {
  1625. for (i = 0; i < I915_MAX_SFC; i++) {
  1626. /*
  1627. * SFC_DONE resides in the VD forcewake domain, so it
  1628. * only exists if the corresponding VCS engine is
  1629. * present.
  1630. */
  1631. if ((gt->_gt->info.sfc_mask & BIT(i)) == 0 ||
  1632. !HAS_ENGINE(gt->_gt, _VCS(i * 2)))
  1633. continue;
  1634. gt->sfc_done[i] =
  1635. intel_uncore_read(uncore, GEN12_SFC_DONE(i));
  1636. }
  1637. gt->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE);
  1638. }
  1639. }
  1640. static void gt_record_info(struct intel_gt_coredump *gt)
  1641. {
  1642. memcpy(&gt->info, &gt->_gt->info, sizeof(struct intel_gt_info));
  1643. gt->clock_frequency = gt->_gt->clock_frequency;
  1644. gt->clock_period_ns = gt->_gt->clock_period_ns;
  1645. }
  1646. /*
  1647. * Generate a semi-unique error code. The code is not meant to have meaning, The
  1648. * code's only purpose is to try to prevent false duplicated bug reports by
  1649. * grossly estimating a GPU error state.
  1650. *
  1651. * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
  1652. * the hang if we could strip the GTT offset information from it.
  1653. *
  1654. * It's only a small step better than a random number in its current form.
  1655. */
  1656. static u32 generate_ecode(const struct intel_engine_coredump *ee)
  1657. {
  1658. /*
  1659. * IPEHR would be an ideal way to detect errors, as it's the gross
  1660. * measure of "the command that hung." However, has some very common
  1661. * synchronization commands which almost always appear in the case
  1662. * strictly a client bug. Use instdone to differentiate those some.
  1663. */
  1664. return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
  1665. }
  1666. static const char *error_msg(struct i915_gpu_coredump *error)
  1667. {
  1668. struct intel_engine_coredump *first = NULL;
  1669. unsigned int hung_classes = 0;
  1670. struct intel_gt_coredump *gt;
  1671. int len;
  1672. for (gt = error->gt; gt; gt = gt->next) {
  1673. struct intel_engine_coredump *cs;
  1674. for (cs = gt->engine; cs; cs = cs->next) {
  1675. if (cs->hung) {
  1676. hung_classes |= BIT(cs->engine->uabi_class);
  1677. if (!first)
  1678. first = cs;
  1679. }
  1680. }
  1681. }
  1682. len = scnprintf(error->error_msg, sizeof(error->error_msg),
  1683. "GPU HANG: ecode %d:%x:%08x",
  1684. GRAPHICS_VER(error->i915), hung_classes,
  1685. generate_ecode(first));
  1686. if (first && first->context.pid) {
  1687. /* Just show the first executing process, more is confusing */
  1688. len += scnprintf(error->error_msg + len,
  1689. sizeof(error->error_msg) - len,
  1690. ", in %s [%d]",
  1691. first->context.comm, first->context.pid);
  1692. }
  1693. return error->error_msg;
  1694. }
  1695. static void capture_gen(struct i915_gpu_coredump *error)
  1696. {
  1697. struct drm_i915_private *i915 = error->i915;
  1698. error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
  1699. error->suspended = pm_runtime_suspended(i915->drm.dev);
  1700. error->iommu = i915_vtd_active(i915);
  1701. error->reset_count = i915_reset_count(&i915->gpu_error);
  1702. error->suspend_count = i915->suspend_count;
  1703. i915_params_copy(&error->params, &i915->params);
  1704. memcpy(&error->device_info,
  1705. INTEL_INFO(i915),
  1706. sizeof(error->device_info));
  1707. memcpy(&error->runtime_info,
  1708. RUNTIME_INFO(i915),
  1709. sizeof(error->runtime_info));
  1710. error->driver_caps = i915->caps;
  1711. }
  1712. struct i915_gpu_coredump *
  1713. i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
  1714. {
  1715. struct i915_gpu_coredump *error;
  1716. if (!i915->params.error_capture)
  1717. return NULL;
  1718. error = kzalloc_obj(*error, gfp);
  1719. if (!error)
  1720. return NULL;
  1721. kref_init(&error->ref);
  1722. error->i915 = i915;
  1723. error->time = ktime_get_real();
  1724. error->boottime = ktime_get_boottime();
  1725. error->uptime = ktime_sub(ktime_get(), to_gt(i915)->last_init_time);
  1726. error->capture = jiffies;
  1727. capture_gen(error);
  1728. return error;
  1729. }
  1730. #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
  1731. struct intel_gt_coredump *
  1732. intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags)
  1733. {
  1734. struct intel_gt_coredump *gc;
  1735. gc = kzalloc_obj(*gc, gfp);
  1736. if (!gc)
  1737. return NULL;
  1738. gc->_gt = gt;
  1739. gc->awake = intel_gt_pm_is_awake(gt);
  1740. gt_record_global_nonguc_regs(gc);
  1741. /*
  1742. * GuC dumps global, eng-class and eng-instance registers
  1743. * (that can change as part of engine state during execution)
  1744. * before an engine is reset due to a hung context.
  1745. * GuC captures and reports all three groups of registers
  1746. * together as a single set before the engine is reset.
  1747. * Thus, if GuC triggered the context reset we retrieve
  1748. * the register values as part of gt_record_engines.
  1749. */
  1750. if (!(dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE))
  1751. gt_record_global_regs(gc);
  1752. gt_record_fences(gc);
  1753. return gc;
  1754. }
  1755. struct i915_vma_compress *
  1756. i915_vma_capture_prepare(struct intel_gt_coredump *gt)
  1757. {
  1758. struct i915_vma_compress *compress;
  1759. compress = kmalloc_obj(*compress, ALLOW_FAIL);
  1760. if (!compress)
  1761. return NULL;
  1762. if (!compress_init(compress)) {
  1763. kfree(compress);
  1764. return NULL;
  1765. }
  1766. return compress;
  1767. }
  /*
   * Tear down and free a compression context obtained from
   * i915_vma_capture_prepare(). A NULL @compress is a no-op; @gt is not
   * used by this function.
   */
  void i915_vma_capture_finish(struct intel_gt_coredump *gt,
  			     struct i915_vma_compress *compress)
  {
  	if (compress) {
  		compress_fini(compress);
  		kfree(compress);
  	}
  }
  1776. static struct i915_gpu_coredump *
  1777. __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
  1778. {
  1779. struct drm_i915_private *i915 = gt->i915;
  1780. struct intel_display *display = i915->display;
  1781. struct i915_gpu_coredump *error;
  1782. /* Check if GPU capture has been disabled */
  1783. error = READ_ONCE(i915->gpu_error.first_error);
  1784. if (IS_ERR(error))
  1785. return error;
  1786. error = i915_gpu_coredump_alloc(i915, ALLOW_FAIL);
  1787. if (!error)
  1788. return ERR_PTR(-ENOMEM);
  1789. error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL, dump_flags);
  1790. if (error->gt) {
  1791. struct i915_vma_compress *compress;
  1792. compress = i915_vma_capture_prepare(error->gt);
  1793. if (!compress) {
  1794. kfree(error->gt);
  1795. kfree(error);
  1796. return ERR_PTR(-ENOMEM);
  1797. }
  1798. if (INTEL_INFO(i915)->has_gt_uc) {
  1799. error->gt->uc = gt_record_uc(error->gt, compress);
  1800. if (error->gt->uc) {
  1801. if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
  1802. error->gt->uc->guc.is_guc_capture = true;
  1803. else
  1804. GEM_BUG_ON(error->gt->uc->guc.is_guc_capture);
  1805. }
  1806. }
  1807. gt_record_info(error->gt);
  1808. gt_record_engines(error->gt, engine_mask, compress, dump_flags);
  1809. i915_vma_capture_finish(error->gt, compress);
  1810. error->simulated |= error->gt->simulated;
  1811. }
  1812. error->display_snapshot = intel_display_snapshot_capture(display);
  1813. return error;
  1814. }
  1815. static struct i915_gpu_coredump *
  1816. i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
  1817. {
  1818. static DEFINE_MUTEX(capture_mutex);
  1819. int ret = mutex_lock_interruptible(&capture_mutex);
  1820. struct i915_gpu_coredump *dump;
  1821. if (ret)
  1822. return ERR_PTR(ret);
  1823. dump = __i915_gpu_coredump(gt, engine_mask, dump_flags);
  1824. mutex_unlock(&capture_mutex);
  1825. return dump;
  1826. }
  1827. void i915_error_state_store(struct i915_gpu_coredump *error)
  1828. {
  1829. struct drm_i915_private *i915;
  1830. if (IS_ERR_OR_NULL(error))
  1831. return;
  1832. i915 = error->i915;
  1833. drm_info(&i915->drm, "%s\n", error_msg(error));
  1834. if (error->simulated ||
  1835. cmpxchg(&i915->gpu_error.first_error, NULL, error))
  1836. return;
  1837. i915_gpu_coredump_get(error);
  1838. drm_info(&i915->drm, "GPU error state saved to /sys/class/drm/card%d/error\n",
  1839. i915->drm.primary->index);
  1840. }
  1841. /**
  1842. * i915_capture_error_state - capture an error record for later analysis
  1843. * @gt: intel_gt which originated the hang
  1844. * @engine_mask: hung engines
  1845. * @dump_flags: dump flags
  1846. *
  1847. * Should be called when an error is detected (either a hang or an error
  1848. * interrupt) to capture error state from the time of the error. Fills
  1849. * out a structure which becomes available in debugfs for user level tools
  1850. * to pick up.
  1851. */
  1852. void i915_capture_error_state(struct intel_gt *gt,
  1853. intel_engine_mask_t engine_mask, u32 dump_flags)
  1854. {
  1855. struct i915_gpu_coredump *error;
  1856. error = i915_gpu_coredump(gt, engine_mask, dump_flags);
  1857. if (IS_ERR(error)) {
  1858. cmpxchg(&gt->i915->gpu_error.first_error, NULL, error);
  1859. return;
  1860. }
  1861. i915_error_state_store(error);
  1862. i915_gpu_coredump_put(error);
  1863. }
  1864. static struct i915_gpu_coredump *
  1865. i915_first_error_state(struct drm_i915_private *i915)
  1866. {
  1867. struct i915_gpu_coredump *error;
  1868. spin_lock_irq(&i915->gpu_error.lock);
  1869. error = i915->gpu_error.first_error;
  1870. if (!IS_ERR_OR_NULL(error))
  1871. i915_gpu_coredump_get(error);
  1872. spin_unlock_irq(&i915->gpu_error.lock);
  1873. return error;
  1874. }
  1875. void i915_reset_error_state(struct drm_i915_private *i915)
  1876. {
  1877. struct i915_gpu_coredump *error;
  1878. spin_lock_irq(&i915->gpu_error.lock);
  1879. error = i915->gpu_error.first_error;
  1880. if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
  1881. i915->gpu_error.first_error = NULL;
  1882. spin_unlock_irq(&i915->gpu_error.lock);
  1883. if (!IS_ERR_OR_NULL(error))
  1884. i915_gpu_coredump_put(error);
  1885. }
  1886. void i915_disable_error_state(struct drm_i915_private *i915, int err)
  1887. {
  1888. spin_lock_irq(&i915->gpu_error.lock);
  1889. if (!i915->gpu_error.first_error)
  1890. i915->gpu_error.first_error = ERR_PTR(err);
  1891. spin_unlock_irq(&i915->gpu_error.lock);
  1892. }
  1893. #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
  1894. void intel_klog_error_capture(struct intel_gt *gt,
  1895. intel_engine_mask_t engine_mask)
  1896. {
  1897. static int g_count;
  1898. struct drm_i915_private *i915 = gt->i915;
  1899. struct i915_gpu_coredump *error;
  1900. intel_wakeref_t wakeref;
  1901. size_t buf_size = PAGE_SIZE * 128;
  1902. size_t pos_err;
  1903. char *buf, *ptr, *next;
  1904. int l_count = g_count++;
  1905. int line = 0;
  1906. /* Can't allocate memory during a reset */
  1907. if (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
  1908. drm_err(&gt->i915->drm, "[Capture/%d.%d] Inside GT reset, skipping error capture :(\n",
  1909. l_count, line++);
  1910. return;
  1911. }
  1912. error = READ_ONCE(i915->gpu_error.first_error);
  1913. if (error) {
  1914. drm_err(&i915->drm, "[Capture/%d.%d] Clearing existing error capture first...\n",
  1915. l_count, line++);
  1916. i915_reset_error_state(i915);
  1917. }
  1918. with_intel_runtime_pm(&i915->runtime_pm, wakeref)
  1919. error = i915_gpu_coredump(gt, engine_mask, CORE_DUMP_FLAG_NONE);
  1920. if (IS_ERR(error)) {
  1921. drm_err(&i915->drm, "[Capture/%d.%d] Failed to capture error capture: %ld!\n",
  1922. l_count, line++, PTR_ERR(error));
  1923. return;
  1924. }
  1925. buf = kvmalloc(buf_size, GFP_KERNEL);
  1926. if (!buf) {
  1927. drm_err(&i915->drm, "[Capture/%d.%d] Failed to allocate buffer for error capture!\n",
  1928. l_count, line++);
  1929. i915_gpu_coredump_put(error);
  1930. return;
  1931. }
  1932. drm_info(&i915->drm, "[Capture/%d.%d] Dumping i915 error capture for %ps...\n",
  1933. l_count, line++, __builtin_return_address(0));
  1934. /* Largest string length safe to print via dmesg */
  1935. # define MAX_CHUNK 800
  1936. pos_err = 0;
  1937. while (1) {
  1938. ssize_t got = i915_gpu_coredump_copy_to_buffer(error, buf, pos_err, buf_size - 1);
  1939. if (got <= 0)
  1940. break;
  1941. buf[got] = 0;
  1942. pos_err += got;
  1943. ptr = buf;
  1944. while (got > 0) {
  1945. size_t count;
  1946. char tag[2];
  1947. next = strnchr(ptr, got, '\n');
  1948. if (next) {
  1949. count = next - ptr;
  1950. *next = 0;
  1951. tag[0] = '>';
  1952. tag[1] = '<';
  1953. } else {
  1954. count = got;
  1955. tag[0] = '}';
  1956. tag[1] = '{';
  1957. }
  1958. if (count > MAX_CHUNK) {
  1959. size_t pos;
  1960. char *ptr2 = ptr;
  1961. for (pos = MAX_CHUNK; pos < count; pos += MAX_CHUNK) {
  1962. char chr = ptr[pos];
  1963. ptr[pos] = 0;
  1964. drm_info(&i915->drm, "[Capture/%d.%d] }%s{\n",
  1965. l_count, line++, ptr2);
  1966. ptr[pos] = chr;
  1967. ptr2 = ptr + pos;
  1968. /*
  1969. * If spewing large amounts of data via a serial console,
  1970. * this can be a very slow process. So be friendly and try
  1971. * not to cause 'softlockup on CPU' problems.
  1972. */
  1973. cond_resched();
  1974. }
  1975. if (ptr2 < (ptr + count))
  1976. drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
  1977. l_count, line++, tag[0], ptr2, tag[1]);
  1978. else if (tag[0] == '>')
  1979. drm_info(&i915->drm, "[Capture/%d.%d] ><\n",
  1980. l_count, line++);
  1981. } else {
  1982. drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
  1983. l_count, line++, tag[0], ptr, tag[1]);
  1984. }
  1985. ptr = next;
  1986. got -= count;
  1987. if (next) {
  1988. ptr++;
  1989. got--;
  1990. }
  1991. /* As above. */
  1992. cond_resched();
  1993. }
  1994. if (got)
  1995. drm_info(&i915->drm, "[Capture/%d.%d] Got %zd bytes remaining!\n",
  1996. l_count, line++, got);
  1997. }
  1998. kvfree(buf);
  1999. drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n", l_count, line++, pos_err);
  2000. }
  2001. #endif
  2002. static ssize_t gpu_state_read(struct file *file, char __user *ubuf,
  2003. size_t count, loff_t *pos)
  2004. {
  2005. struct i915_gpu_coredump *error;
  2006. ssize_t ret;
  2007. void *buf;
  2008. error = file->private_data;
  2009. if (!error)
  2010. return 0;
  2011. /* Bounce buffer required because of kernfs __user API convenience. */
  2012. buf = kmalloc(count, GFP_KERNEL);
  2013. if (!buf)
  2014. return -ENOMEM;
  2015. ret = i915_gpu_coredump_copy_to_buffer(error, buf, *pos, count);
  2016. if (ret <= 0)
  2017. goto out;
  2018. if (!copy_to_user(ubuf, buf, ret))
  2019. *pos += ret;
  2020. else
  2021. ret = -EFAULT;
  2022. out:
  2023. kfree(buf);
  2024. return ret;
  2025. }
  2026. static int gpu_state_release(struct inode *inode, struct file *file)
  2027. {
  2028. i915_gpu_coredump_put(file->private_data);
  2029. return 0;
  2030. }
  2031. static int i915_gpu_info_open(struct inode *inode, struct file *file)
  2032. {
  2033. struct drm_i915_private *i915 = inode->i_private;
  2034. struct i915_gpu_coredump *gpu;
  2035. intel_wakeref_t wakeref;
  2036. gpu = NULL;
  2037. with_intel_runtime_pm(&i915->runtime_pm, wakeref)
  2038. gpu = i915_gpu_coredump(to_gt(i915), ALL_ENGINES, CORE_DUMP_FLAG_NONE);
  2039. if (IS_ERR(gpu))
  2040. return PTR_ERR(gpu);
  2041. file->private_data = gpu;
  2042. return 0;
  2043. }
  2044. static const struct file_operations i915_gpu_info_fops = {
  2045. .owner = THIS_MODULE,
  2046. .open = i915_gpu_info_open,
  2047. .read = gpu_state_read,
  2048. .llseek = default_llseek,
  2049. .release = gpu_state_release,
  2050. };
  2051. static ssize_t
  2052. i915_error_state_write(struct file *filp,
  2053. const char __user *ubuf,
  2054. size_t cnt,
  2055. loff_t *ppos)
  2056. {
  2057. struct i915_gpu_coredump *error = filp->private_data;
  2058. if (!error)
  2059. return 0;
  2060. drm_dbg(&error->i915->drm, "Resetting error state\n");
  2061. i915_reset_error_state(error->i915);
  2062. return cnt;
  2063. }
  2064. static int i915_error_state_open(struct inode *inode, struct file *file)
  2065. {
  2066. struct i915_gpu_coredump *error;
  2067. error = i915_first_error_state(inode->i_private);
  2068. if (IS_ERR(error))
  2069. return PTR_ERR(error);
  2070. file->private_data = error;
  2071. return 0;
  2072. }
  2073. static const struct file_operations i915_error_state_fops = {
  2074. .owner = THIS_MODULE,
  2075. .open = i915_error_state_open,
  2076. .read = gpu_state_read,
  2077. .write = i915_error_state_write,
  2078. .llseek = default_llseek,
  2079. .release = gpu_state_release,
  2080. };
  2081. void i915_gpu_error_debugfs_register(struct drm_i915_private *i915)
  2082. {
  2083. struct dentry *debugfs_root = i915->drm.debugfs_root;
  2084. debugfs_create_file("i915_error_state", 0644, debugfs_root, i915,
  2085. &i915_error_state_fops);
  2086. debugfs_create_file("i915_gpu_info", 0644, debugfs_root, i915,
  2087. &i915_gpu_info_fops);
  2088. }
  2089. static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
  2090. const struct bin_attribute *attr, char *buf,
  2091. loff_t off, size_t count)
  2092. {
  2093. struct device *kdev = kobj_to_dev(kobj);
  2094. struct drm_i915_private *i915 = kdev_minor_to_i915(kdev);
  2095. struct i915_gpu_coredump *gpu;
  2096. ssize_t ret = 0;
  2097. /*
  2098. * FIXME: Concurrent clients triggering resets and reading + clearing
  2099. * dumps can cause inconsistent sysfs reads when a user calls in with a
  2100. * non-zero offset to complete a prior partial read but the
  2101. * gpu_coredump has been cleared or replaced.
  2102. */
  2103. gpu = i915_first_error_state(i915);
  2104. if (IS_ERR(gpu)) {
  2105. ret = PTR_ERR(gpu);
  2106. } else if (gpu) {
  2107. ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count);
  2108. i915_gpu_coredump_put(gpu);
  2109. } else {
  2110. const char *str = "No error state collected\n";
  2111. size_t len = strlen(str);
  2112. if (off < len) {
  2113. ret = min_t(size_t, count, len - off);
  2114. memcpy(buf, str + off, ret);
  2115. }
  2116. }
  2117. return ret;
  2118. }
  2119. static ssize_t error_state_write(struct file *file, struct kobject *kobj,
  2120. const struct bin_attribute *attr, char *buf,
  2121. loff_t off, size_t count)
  2122. {
  2123. struct device *kdev = kobj_to_dev(kobj);
  2124. struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev);
  2125. drm_dbg(&dev_priv->drm, "Resetting error state\n");
  2126. i915_reset_error_state(dev_priv);
  2127. return count;
  2128. }
  2129. static const struct bin_attribute error_state_attr = {
  2130. .attr.name = "error",
  2131. .attr.mode = S_IRUSR | S_IWUSR,
  2132. .size = 0,
  2133. .read = error_state_read,
  2134. .write = error_state_write,
  2135. };
  2136. void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915)
  2137. {
  2138. struct device *kdev = i915->drm.primary->kdev;
  2139. if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr))
  2140. drm_err(&i915->drm, "error_state sysfs setup failed\n");
  2141. }
  2142. void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915)
  2143. {
  2144. struct device *kdev = i915->drm.primary->kdev;
  2145. sysfs_remove_bin_file(&kdev->kobj, &error_state_attr);
  2146. }