ivpu_pm.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 2020-2024 Intel Corporation
  4. */
  5. #include <linux/highmem.h>
  6. #include <linux/moduleparam.h>
  7. #include <linux/pci.h>
  8. #include <linux/pm_runtime.h>
  9. #include <linux/reboot.h>
  10. #include "ivpu_coredump.h"
  11. #include "ivpu_drv.h"
  12. #include "ivpu_fw.h"
  13. #include "ivpu_fw_log.h"
  14. #include "ivpu_hw.h"
  15. #include "ivpu_ipc.h"
  16. #include "ivpu_job.h"
  17. #include "ivpu_jsm_msg.h"
  18. #include "ivpu_mmu.h"
  19. #include "ivpu_ms.h"
  20. #include "ivpu_pm.h"
  21. #include "ivpu_trace.h"
  22. #include "vpu_boot_api.h"
  /*
   * When set, ivpu_pm_trigger_recovery() only logs and returns, and
   * ivpu_pm_init() programs an autosuspend delay of -1. The module parameter is
   * only registered when CONFIG_DRM_ACCEL_IVPU_DEBUG is enabled.
   */
  23. static bool ivpu_disable_recovery;
  24. #if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG)
  25. module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
  26. MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");
  27. #endif
  /* Override for vdev->timeout.tdr in the job timeout code; 0 keeps the device default. */
  28. static unsigned long ivpu_tdr_timeout_ms;
  29. module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
  30. MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
  /* Override for vdev->timeout.inference in ivpu_job_timeout_work(); 0 keeps the device default. */
  31. static unsigned long ivpu_inference_timeout_ms;
  32. module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
  33. MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");
  /* NOTE(review): not referenced anywhere in the visible part of this file - confirm it is still needed. */
  34. #define PM_RESCHEDULE_LIMIT 5
  35. static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
  36. {
  37. struct ivpu_fw_info *fw = vdev->fw;
  38. ivpu_cmdq_reset_all_contexts(vdev);
  39. ivpu_ipc_reset(vdev);
  40. ivpu_fw_log_reset(vdev);
  41. ivpu_fw_load(vdev);
  42. fw->last_heartbeat = 0;
  43. ivpu_dbg(vdev, FW_BOOT, "Cold boot entry point 0x%llx", vdev->fw->cold_boot_entry_point);
  44. fw->next_boot_mode = VPU_BOOT_TYPE_COLDBOOT;
  45. }
  46. static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
  47. {
  48. struct ivpu_fw_info *fw = vdev->fw;
  49. struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem_bp);
  50. fw->warm_boot_entry_point = bp->save_restore_ret_address;
  51. if (!fw->warm_boot_entry_point) {
  52. ivpu_pm_prepare_cold_boot(vdev);
  53. return;
  54. }
  55. ivpu_dbg(vdev, FW_BOOT, "Warm boot entry point 0x%llx", fw->warm_boot_entry_point);
  56. fw->next_boot_mode = VPU_BOOT_TYPE_WARMBOOT;
  57. }
  /*
   * Common suspend step: run ivpu_prepare_for_reset() and then shut the device
   * down with ivpu_shutdown().
   *
   * Returns 0 on success or the error reported by ivpu_shutdown(), which is
   * also logged.
   */
  static int ivpu_suspend(struct ivpu_device *vdev)
  {
      int err;

      ivpu_prepare_for_reset(vdev);

      err = ivpu_shutdown(vdev);
      if (!err)
          return 0;

      ivpu_err(vdev, "Failed to shutdown NPU: %d\n", err);
      return err;
  }
  67. static int ivpu_resume(struct ivpu_device *vdev)
  68. {
  69. int ret;
  70. retry:
  71. pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
  72. pci_restore_state(to_pci_dev(vdev->drm.dev));
  73. ret = ivpu_hw_power_up(vdev);
  74. if (ret) {
  75. ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
  76. goto err_power_down;
  77. }
  78. ret = ivpu_mmu_enable(vdev);
  79. if (ret) {
  80. ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
  81. goto err_power_down;
  82. }
  83. ret = ivpu_boot(vdev);
  84. if (ret)
  85. goto err_mmu_disable;
  86. return 0;
  87. err_mmu_disable:
  88. ivpu_mmu_disable(vdev);
  89. err_power_down:
  90. ivpu_hw_power_down(vdev);
  91. pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);
  92. if (ivpu_fw_is_warm_boot(vdev)) {
  93. ivpu_pm_prepare_cold_boot(vdev);
  94. goto retry;
  95. } else {
  96. ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
  97. }
  98. return ret;
  99. }
  100. static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
  101. {
  102. pm_runtime_disable(vdev->drm.dev);
  103. atomic_inc(&vdev->pm->reset_counter);
  104. atomic_set(&vdev->pm->reset_pending, 1);
  105. down_write(&vdev->pm->reset_lock);
  106. }
  107. static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
  108. {
  109. int ret;
  110. ivpu_pm_prepare_cold_boot(vdev);
  111. ivpu_jobs_abort_all(vdev);
  112. ivpu_ms_cleanup_all(vdev);
  113. ret = ivpu_resume(vdev);
  114. if (ret) {
  115. ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
  116. pm_runtime_set_suspended(vdev->drm.dev);
  117. } else {
  118. pm_runtime_set_active(vdev->drm.dev);
  119. }
  120. up_write(&vdev->pm->reset_lock);
  121. atomic_set(&vdev->pm->reset_pending, 0);
  122. pm_runtime_mark_last_busy(vdev->drm.dev);
  123. pm_runtime_enable(vdev->drm.dev);
  124. }
  125. static void ivpu_pm_recovery_work(struct work_struct *work)
  126. {
  127. struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
  128. struct ivpu_device *vdev = pm->vdev;
  129. char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};
  130. ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));
  131. ivpu_pm_reset_begin(vdev);
  132. if (!pm_runtime_status_suspended(vdev->drm.dev)) {
  133. ivpu_jsm_state_dump(vdev);
  134. ivpu_dev_coredump(vdev);
  135. ivpu_suspend(vdev);
  136. }
  137. ivpu_pm_reset_complete(vdev);
  138. kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
  139. }
  140. void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
  141. {
  142. ivpu_err(vdev, "Recovery triggered by %s\n", reason);
  143. if (ivpu_disable_recovery) {
  144. ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
  145. return;
  146. }
  147. /* Trigger recovery if it's not in progress */
  148. if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
  149. ivpu_hw_diagnose_failure(vdev);
  150. ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
  151. queue_work(system_dfl_wq, &vdev->pm->recovery_work);
  152. }
  153. }
  154. static void ivpu_job_timeout_work(struct work_struct *work)
  155. {
  156. struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
  157. struct ivpu_device *vdev = pm->vdev;
  158. unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
  159. unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
  160. vdev->timeout.inference;
  161. u64 inference_max_retries;
  162. u64 heartbeat;
  163. if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
  164. ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
  165. goto recovery;
  166. }
  167. inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
  168. if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
  169. ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
  170. inference_max_retries);
  171. goto recovery;
  172. }
  173. vdev->fw->last_heartbeat = heartbeat;
  174. ivpu_start_job_timeout_detection(vdev);
  175. return;
  176. recovery:
  177. atomic_set(&vdev->job_timeout_counter, 0);
  178. ivpu_pm_trigger_recovery(vdev, "TDR");
  179. }
  180. void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
  181. {
  182. unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
  183. /* No-op if already queued */
  184. queue_delayed_work(system_percpu_wq, &vdev->pm->job_timeout_work,
  185. msecs_to_jiffies(timeout_ms));
  186. }
  187. void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
  188. {
  189. cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
  190. atomic_set(&vdev->job_timeout_counter, 0);
  191. }
  192. int ivpu_pm_suspend_cb(struct device *dev)
  193. {
  194. struct drm_device *drm = dev_get_drvdata(dev);
  195. struct ivpu_device *vdev = to_ivpu_device(drm);
  196. unsigned long timeout;
  197. trace_pm("suspend");
  198. ivpu_dbg(vdev, PM, "Suspend..\n");
  199. timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
  200. while (!ivpu_hw_is_idle(vdev)) {
  201. cond_resched();
  202. if (time_after_eq(jiffies, timeout)) {
  203. ivpu_err(vdev, "Failed to enter idle on system suspend\n");
  204. return -EBUSY;
  205. }
  206. }
  207. ivpu_jsm_pwr_d0i3_enter(vdev);
  208. ivpu_suspend(vdev);
  209. ivpu_pm_prepare_warm_boot(vdev);
  210. ivpu_dbg(vdev, PM, "Suspend done.\n");
  211. trace_pm("suspend done");
  212. return 0;
  213. }
  214. int ivpu_pm_resume_cb(struct device *dev)
  215. {
  216. struct drm_device *drm = dev_get_drvdata(dev);
  217. struct ivpu_device *vdev = to_ivpu_device(drm);
  218. int ret;
  219. trace_pm("resume");
  220. ivpu_dbg(vdev, PM, "Resume..\n");
  221. ret = ivpu_resume(vdev);
  222. if (ret)
  223. ivpu_err(vdev, "Failed to resume: %d\n", ret);
  224. ivpu_dbg(vdev, PM, "Resume done.\n");
  225. trace_pm("resume done");
  226. return ret;
  227. }
  228. int ivpu_pm_runtime_suspend_cb(struct device *dev)
  229. {
  230. struct drm_device *drm = dev_get_drvdata(dev);
  231. struct ivpu_device *vdev = to_ivpu_device(drm);
  232. int ret, ret_d0i3;
  233. bool is_idle;
  234. drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
  235. drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));
  236. trace_pm("runtime suspend");
  237. ivpu_dbg(vdev, PM, "Runtime suspend..\n");
  238. ivpu_mmu_disable(vdev);
  239. is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
  240. if (!is_idle)
  241. ivpu_err(vdev, "NPU is not idle before autosuspend\n");
  242. ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
  243. if (ret_d0i3)
  244. ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);
  245. ret = ivpu_suspend(vdev);
  246. if (ret)
  247. ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);
  248. if (!is_idle || ret_d0i3) {
  249. ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
  250. atomic_inc(&vdev->pm->reset_counter);
  251. ivpu_dev_coredump(vdev);
  252. ivpu_pm_prepare_cold_boot(vdev);
  253. } else {
  254. ivpu_pm_prepare_warm_boot(vdev);
  255. }
  256. ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
  257. trace_pm("runtime suspend done");
  258. return 0;
  259. }
  260. int ivpu_pm_runtime_resume_cb(struct device *dev)
  261. {
  262. struct drm_device *drm = dev_get_drvdata(dev);
  263. struct ivpu_device *vdev = to_ivpu_device(drm);
  264. int ret;
  265. trace_pm("runtime resume");
  266. ivpu_dbg(vdev, PM, "Runtime resume..\n");
  267. ret = ivpu_resume(vdev);
  268. if (ret)
  269. ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);
  270. ivpu_dbg(vdev, PM, "Runtime resume done.\n");
  271. trace_pm("runtime resume done");
  272. return ret;
  273. }
  274. int ivpu_rpm_get(struct ivpu_device *vdev)
  275. {
  276. int ret;
  277. ret = pm_runtime_resume_and_get(vdev->drm.dev);
  278. if (ret < 0) {
  279. ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
  280. pm_runtime_set_suspended(vdev->drm.dev);
  281. }
  282. return ret;
  283. }
  284. void ivpu_rpm_put(struct ivpu_device *vdev)
  285. {
  286. pm_runtime_put_autosuspend(vdev->drm.dev);
  287. }
  288. void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
  289. {
  290. struct ivpu_device *vdev = pci_get_drvdata(pdev);
  291. ivpu_dbg(vdev, PM, "Pre-reset..\n");
  292. ivpu_pm_reset_begin(vdev);
  293. if (!pm_runtime_status_suspended(vdev->drm.dev)) {
  294. ivpu_prepare_for_reset(vdev);
  295. ivpu_hw_reset(vdev);
  296. }
  297. ivpu_dbg(vdev, PM, "Pre-reset done.\n");
  298. }
  299. void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
  300. {
  301. struct ivpu_device *vdev = pci_get_drvdata(pdev);
  302. ivpu_dbg(vdev, PM, "Post-reset..\n");
  303. ivpu_pm_reset_complete(vdev);
  304. ivpu_dbg(vdev, PM, "Post-reset done.\n");
  305. }
  306. void ivpu_pm_init(struct ivpu_device *vdev)
  307. {
  308. struct device *dev = vdev->drm.dev;
  309. struct ivpu_pm_info *pm = vdev->pm;
  310. int delay;
  311. pm->vdev = vdev;
  312. init_rwsem(&pm->reset_lock);
  313. atomic_set(&pm->reset_pending, 0);
  314. atomic_set(&pm->reset_counter, 0);
  315. INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
  316. INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);
  317. if (ivpu_disable_recovery)
  318. delay = -1;
  319. else
  320. delay = vdev->timeout.autosuspend;
  321. pm_runtime_use_autosuspend(dev);
  322. pm_runtime_set_autosuspend_delay(dev, delay);
  323. pm_runtime_set_active(dev);
  324. ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
  325. }
  326. void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
  327. {
  328. drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
  329. disable_work_sync(&vdev->pm->recovery_work);
  330. }
  331. void ivpu_pm_enable(struct ivpu_device *vdev)
  332. {
  333. struct device *dev = vdev->drm.dev;
  334. pm_runtime_allow(dev);
  335. pm_runtime_put_autosuspend(dev);
  336. }
  337. void ivpu_pm_disable(struct ivpu_device *vdev)
  338. {
  339. pm_runtime_get_noresume(vdev->drm.dev);
  340. pm_runtime_forbid(vdev->drm.dev);
  341. }
  342. int ivpu_pm_dct_init(struct ivpu_device *vdev)
  343. {
  344. if (vdev->pm->dct_active_percent)
  345. return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);
  346. return 0;
  347. }
  348. int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
  349. {
  350. u32 active_us, inactive_us;
  351. int ret;
  352. if (active_percent == 0 || active_percent > 100)
  353. return -EINVAL;
  354. active_us = (DCT_PERIOD_US * active_percent) / 100;
  355. inactive_us = DCT_PERIOD_US - active_us;
  356. vdev->pm->dct_active_percent = active_percent;
  357. ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
  358. active_percent, active_us, inactive_us);
  359. ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
  360. if (ret) {
  361. ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
  362. return ret;
  363. }
  364. return 0;
  365. }
  366. int ivpu_pm_dct_disable(struct ivpu_device *vdev)
  367. {
  368. int ret;
  369. vdev->pm->dct_active_percent = 0;
  370. ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");
  371. ret = ivpu_jsm_dct_disable(vdev);
  372. if (ret) {
  373. ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
  374. return ret;
  375. }
  376. return 0;
  377. }
  378. void ivpu_pm_irq_dct_work_fn(struct work_struct *work)
  379. {
  380. struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work);
  381. bool enable;
  382. int ret;
  383. if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
  384. return;
  385. if (enable)
  386. ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
  387. else
  388. ret = ivpu_pm_dct_disable(vdev);
  389. if (!ret) {
  390. /* Convert percent to U1.7 format */
  391. u8 val = DIV_ROUND_CLOSEST(vdev->pm->dct_active_percent * 128, 100);
  392. ivpu_hw_btrs_dct_set_status(vdev, enable, val);
  393. }
  394. }