intel_engine_heartbeat.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. // SPDX-License-Identifier: MIT
  2. /*
  3. * Copyright © 2019 Intel Corporation
  4. */
  5. #include <drm/drm_print.h>
  6. #include "i915_drv.h"
  7. #include "i915_jiffies.h"
  8. #include "i915_request.h"
  9. #include "intel_context.h"
  10. #include "intel_engine_heartbeat.h"
  11. #include "intel_engine_pm.h"
  12. #include "intel_engine.h"
  13. #include "intel_gt.h"
  14. #include "intel_reset.h"
  15. /*
  16. * While the engine is active, we send a periodic pulse along the engine
  17. * to check on its health and to flush any idle-barriers. If that request
  18. * is stuck, and we fail to preempt it, we declare the engine hung and
  19. * issue a reset -- in the hope that restores progress.
  20. */
  21. static bool next_heartbeat(struct intel_engine_cs *engine)
  22. {
  23. struct i915_request *rq;
  24. long delay;
  25. delay = READ_ONCE(engine->props.heartbeat_interval_ms);
  26. rq = engine->heartbeat.systole;
  27. /*
  28. * FIXME: The final period extension is disabled if the period has been
  29. * modified from the default. This is to prevent issues with certain
  30. * selftests which override the value and expect specific behaviour.
  31. * Once the selftests have been updated to either cope with variable
  32. * heartbeat periods (or to override the pre-emption timeout as well,
  33. * or just to add a selftest specific override of the extension), the
  34. * generic override can be removed.
  35. */
  36. if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
  37. delay == engine->defaults.heartbeat_interval_ms) {
  38. long longer;
  39. /*
  40. * The final try is at the highest priority possible. Up until now
  41. * a pre-emption might not even have been attempted. So make sure
  42. * this last attempt allows enough time for a pre-emption to occur.
  43. */
  44. longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
  45. longer = intel_clamp_heartbeat_interval_ms(engine, longer);
  46. if (longer > delay)
  47. delay = longer;
  48. }
  49. if (!delay)
  50. return false;
  51. delay = msecs_to_jiffies_timeout(delay);
  52. if (delay >= HZ)
  53. delay = round_jiffies_up_relative(delay);
  54. mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);
  55. return true;
  56. }
  57. static struct i915_request *
  58. heartbeat_create(struct intel_context *ce, gfp_t gfp)
  59. {
  60. struct i915_request *rq;
  61. intel_context_enter(ce);
  62. rq = __i915_request_create(ce, gfp);
  63. intel_context_exit(ce);
  64. return rq;
  65. }
  66. static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
  67. {
  68. engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
  69. i915_request_add_active_barriers(rq);
  70. if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
  71. engine->heartbeat.systole = i915_request_get(rq);
  72. }
  73. static void heartbeat_commit(struct i915_request *rq,
  74. const struct i915_sched_attr *attr)
  75. {
  76. idle_pulse(rq->engine, rq);
  77. __i915_request_commit(rq);
  78. __i915_request_queue(rq, attr);
  79. }
  80. static void show_heartbeat(const struct i915_request *rq,
  81. struct intel_engine_cs *engine)
  82. {
  83. struct drm_printer p =
  84. drm_dbg_printer(&engine->i915->drm, DRM_UT_DRIVER, "heartbeat");
  85. if (!rq) {
  86. intel_engine_dump(engine, &p,
  87. "%s heartbeat not ticking\n",
  88. engine->name);
  89. } else {
  90. intel_engine_dump(engine, &p,
  91. "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
  92. engine->name,
  93. rq->fence.context,
  94. rq->fence.seqno,
  95. rq->sched.attr.priority);
  96. }
  97. }
  98. static void
  99. reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
  100. {
  101. if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
  102. show_heartbeat(rq, engine);
  103. if (intel_engine_uses_guc(engine))
  104. /*
  105. * GuC itself is toast or GuC's hang detection
  106. * is disabled. Either way, need to find the
  107. * hang culprit manually.
  108. */
  109. intel_guc_find_hung_context(engine);
  110. intel_gt_handle_error(engine->gt, engine->mask,
  111. I915_ERROR_CAPTURE,
  112. "stopped heartbeat on %s",
  113. engine->name);
  114. }
  115. static void heartbeat(struct work_struct *wrk)
  116. {
  117. struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
  118. struct intel_engine_cs *engine =
  119. container_of(wrk, typeof(*engine), heartbeat.work.work);
  120. struct intel_context *ce = engine->kernel_context;
  121. struct i915_request *rq;
  122. unsigned long serial;
  123. /* Just in case everything has gone horribly wrong, give it a kick */
  124. intel_engine_flush_submission(engine);
  125. rq = xchg(&engine->heartbeat.systole, NULL);
  126. if (rq) {
  127. if (i915_request_completed(rq))
  128. i915_request_put(rq);
  129. else
  130. engine->heartbeat.systole = rq;
  131. }
  132. if (!intel_engine_pm_get_if_awake(engine))
  133. return;
  134. if (intel_gt_is_wedged(engine->gt))
  135. goto out;
  136. if (i915_sched_engine_disabled(engine->sched_engine)) {
  137. reset_engine(engine, engine->heartbeat.systole);
  138. goto out;
  139. }
  140. if (engine->heartbeat.systole) {
  141. long delay = READ_ONCE(engine->props.heartbeat_interval_ms);
  142. /* Safeguard against too-fast worker invocations */
  143. if (!time_after(jiffies,
  144. rq->emitted_jiffies + msecs_to_jiffies(delay)))
  145. goto out;
  146. if (!i915_sw_fence_signaled(&rq->submit)) {
  147. /*
  148. * Not yet submitted, system is stalled.
  149. *
  150. * This more often happens for ring submission,
  151. * where all contexts are funnelled into a common
  152. * ringbuffer. If one context is blocked on an
  153. * external fence, not only is it not submitted,
  154. * but all other contexts, including the kernel
  155. * context are stuck waiting for the signal.
  156. */
  157. } else if (engine->sched_engine->schedule &&
  158. rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
  159. /*
  160. * Gradually raise the priority of the heartbeat to
  161. * give high priority work [which presumably desires
  162. * low latency and no jitter] the chance to naturally
  163. * complete before being preempted.
  164. */
  165. attr.priority = I915_PRIORITY_NORMAL;
  166. if (rq->sched.attr.priority >= attr.priority)
  167. attr.priority = I915_PRIORITY_HEARTBEAT;
  168. if (rq->sched.attr.priority >= attr.priority)
  169. attr.priority = I915_PRIORITY_BARRIER;
  170. local_bh_disable();
  171. engine->sched_engine->schedule(rq, &attr);
  172. local_bh_enable();
  173. } else {
  174. reset_engine(engine, rq);
  175. }
  176. rq->emitted_jiffies = jiffies;
  177. goto out;
  178. }
  179. serial = READ_ONCE(engine->serial);
  180. if (engine->wakeref_serial == serial)
  181. goto out;
  182. if (!mutex_trylock(&ce->timeline->mutex)) {
  183. /* Unable to lock the kernel timeline, is the engine stuck? */
  184. if (xchg(&engine->heartbeat.blocked, serial) == serial)
  185. intel_gt_handle_error(engine->gt, engine->mask,
  186. I915_ERROR_CAPTURE,
  187. "no heartbeat on %s",
  188. engine->name);
  189. goto out;
  190. }
  191. rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
  192. if (IS_ERR(rq))
  193. goto unlock;
  194. heartbeat_commit(rq, &attr);
  195. unlock:
  196. mutex_unlock(&ce->timeline->mutex);
  197. out:
  198. if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine)) {
  199. rq = xchg(&engine->heartbeat.systole, NULL);
  200. if (rq)
  201. i915_request_put(rq);
  202. }
  203. intel_engine_pm_put(engine);
  204. }
  205. void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
  206. {
  207. if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
  208. return;
  209. next_heartbeat(engine);
  210. }
  211. void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
  212. {
  213. if (cancel_delayed_work(&engine->heartbeat.work)) {
  214. struct i915_request *rq;
  215. rq = xchg(&engine->heartbeat.systole, NULL);
  216. if (rq)
  217. i915_request_put(rq);
  218. }
  219. }
  220. void intel_gt_unpark_heartbeats(struct intel_gt *gt)
  221. {
  222. struct intel_engine_cs *engine;
  223. enum intel_engine_id id;
  224. for_each_engine(engine, gt, id)
  225. if (intel_engine_pm_is_awake(engine))
  226. intel_engine_unpark_heartbeat(engine);
  227. }
  228. void intel_gt_park_heartbeats(struct intel_gt *gt)
  229. {
  230. struct intel_engine_cs *engine;
  231. enum intel_engine_id id;
  232. for_each_engine(engine, gt, id)
  233. intel_engine_park_heartbeat(engine);
  234. }
  235. void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
  236. {
  237. INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
  238. }
  239. static int __intel_engine_pulse(struct intel_engine_cs *engine)
  240. {
  241. struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
  242. struct intel_context *ce = engine->kernel_context;
  243. struct i915_request *rq;
  244. lockdep_assert_held(&ce->timeline->mutex);
  245. GEM_BUG_ON(!intel_engine_has_preemption(engine));
  246. GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
  247. rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
  248. if (IS_ERR(rq))
  249. return PTR_ERR(rq);
  250. __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
  251. heartbeat_commit(rq, &attr);
  252. GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
  253. /* Ensure the forced pulse gets a full period to execute */
  254. next_heartbeat(engine);
  255. return 0;
  256. }
  257. static unsigned long set_heartbeat(struct intel_engine_cs *engine,
  258. unsigned long delay)
  259. {
  260. unsigned long old;
  261. old = xchg(&engine->props.heartbeat_interval_ms, delay);
  262. if (delay)
  263. intel_engine_unpark_heartbeat(engine);
  264. else
  265. intel_engine_park_heartbeat(engine);
  266. return old;
  267. }
  268. int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
  269. unsigned long delay)
  270. {
  271. struct intel_context *ce = engine->kernel_context;
  272. int err = 0;
  273. if (!delay && !intel_engine_has_preempt_reset(engine))
  274. return -ENODEV;
  275. /* FIXME: Remove together with equally marked hack in next_heartbeat. */
  276. if (delay != engine->defaults.heartbeat_interval_ms &&
  277. delay < 2 * engine->props.preempt_timeout_ms) {
  278. if (intel_engine_uses_guc(engine))
  279. drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
  280. engine->name);
  281. else
  282. drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
  283. engine->name);
  284. }
  285. intel_engine_pm_get(engine);
  286. err = mutex_lock_interruptible(&ce->timeline->mutex);
  287. if (err)
  288. goto out_rpm;
  289. if (delay != engine->props.heartbeat_interval_ms) {
  290. unsigned long saved = set_heartbeat(engine, delay);
  291. /* recheck current execution */
  292. if (intel_engine_has_preemption(engine)) {
  293. err = __intel_engine_pulse(engine);
  294. if (err)
  295. set_heartbeat(engine, saved);
  296. }
  297. }
  298. mutex_unlock(&ce->timeline->mutex);
  299. out_rpm:
  300. intel_engine_pm_put(engine);
  301. return err;
  302. }
  303. int intel_engine_pulse(struct intel_engine_cs *engine)
  304. {
  305. struct intel_context *ce = engine->kernel_context;
  306. int err;
  307. if (!intel_engine_has_preemption(engine))
  308. return -ENODEV;
  309. if (!intel_engine_pm_get_if_awake(engine))
  310. return 0;
  311. err = -EINTR;
  312. if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
  313. err = __intel_engine_pulse(engine);
  314. mutex_unlock(&ce->timeline->mutex);
  315. }
  316. intel_engine_flush_submission(engine);
  317. intel_engine_pm_put(engine);
  318. return err;
  319. }
  320. int intel_engine_flush_barriers(struct intel_engine_cs *engine)
  321. {
  322. struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
  323. struct intel_context *ce = engine->kernel_context;
  324. struct i915_request *rq;
  325. int err;
  326. if (llist_empty(&engine->barrier_tasks))
  327. return 0;
  328. if (!intel_engine_pm_get_if_awake(engine))
  329. return 0;
  330. if (mutex_lock_interruptible(&ce->timeline->mutex)) {
  331. err = -EINTR;
  332. goto out_rpm;
  333. }
  334. rq = heartbeat_create(ce, GFP_KERNEL);
  335. if (IS_ERR(rq)) {
  336. err = PTR_ERR(rq);
  337. goto out_unlock;
  338. }
  339. heartbeat_commit(rq, &attr);
  340. err = 0;
  341. out_unlock:
  342. mutex_unlock(&ce->timeline->mutex);
  343. out_rpm:
  344. intel_engine_pm_put(engine);
  345. return err;
  346. }
  347. #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
  348. #include "selftest_engine_heartbeat.c"
  349. #endif