x86_pkg_temp_thermal.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * x86_pkg_temp_thermal driver
  4. * Copyright (c) 2013, Intel Corporation.
  5. */
  6. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  7. #include <linux/module.h>
  8. #include <linux/init.h>
  9. #include <linux/intel_tcc.h>
  10. #include <linux/err.h>
  11. #include <linux/param.h>
  12. #include <linux/device.h>
  13. #include <linux/platform_device.h>
  14. #include <linux/cpu.h>
  15. #include <linux/smp.h>
  16. #include <linux/slab.h>
  17. #include <linux/pm.h>
  18. #include <linux/thermal.h>
  19. #include <linux/debugfs.h>
  20. #include <asm/cpu_device_id.h>
  21. #include <asm/msr.h>
  22. #include "thermal_interrupt.h"
  23. /*
  24. * Rate control delay: Idea is to introduce denounce effect
  25. * This should be long enough to avoid reduce events, when
  26. * threshold is set to a temperature, which is constantly
  27. * violated, but at the short enough to take any action.
  28. * The action can be remove threshold or change it to next
  29. * interesting setting. Based on experiments, in around
  30. * every 5 seconds under load will give us a significant
  31. * temperature change.
  32. */
  33. #define PKG_TEMP_THERMAL_NOTIFY_DELAY 5000
  34. static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY;
  35. module_param(notify_delay_ms, int, 0644);
  36. MODULE_PARM_DESC(notify_delay_ms,
  37. "User space notification delay in milli seconds.");
  38. /* Number of trip points in thermal zone. Currently it can't
  39. * be more than 2. MSR can allow setting and getting notifications
  40. * for only 2 thresholds. This define enforces this, if there
  41. * is some wrong values returned by cpuid for number of thresholds.
  42. */
  43. #define MAX_NUMBER_OF_TRIPS 2
  44. struct zone_device {
  45. int cpu;
  46. bool work_scheduled;
  47. u32 msr_pkg_therm_low;
  48. u32 msr_pkg_therm_high;
  49. struct delayed_work work;
  50. struct thermal_zone_device *tzone;
  51. struct cpumask cpumask;
  52. };
  53. static struct thermal_zone_params pkg_temp_tz_params = {
  54. .no_hwmon = true,
  55. };
  56. /* Keep track of how many zone pointers we allocated in init() */
  57. static int max_id __read_mostly;
  58. /* Array of zone pointers */
  59. static struct zone_device **zones;
  60. /* Serializes interrupt notification, work and hotplug */
  61. static DEFINE_RAW_SPINLOCK(pkg_temp_lock);
  62. /* Protects zone operation in the work function against hotplug removal */
  63. static DEFINE_MUTEX(thermal_zone_mutex);
  64. /* The dynamically assigned cpu hotplug state for module_exit() */
  65. static enum cpuhp_state pkg_thermal_hp_state __read_mostly;
  66. /* Debug counters to show using debugfs */
  67. static struct dentry *debugfs;
  68. static unsigned int pkg_interrupt_cnt;
  69. static unsigned int pkg_work_cnt;
  70. static void pkg_temp_debugfs_init(void)
  71. {
  72. debugfs = debugfs_create_dir("pkg_temp_thermal", NULL);
  73. debugfs_create_u32("pkg_thres_interrupt", S_IRUGO, debugfs,
  74. &pkg_interrupt_cnt);
  75. debugfs_create_u32("pkg_thres_work", S_IRUGO, debugfs,
  76. &pkg_work_cnt);
  77. }
  78. /*
  79. * Protection:
  80. *
  81. * - cpu hotplug: Read serialized by cpu hotplug lock
  82. * Write must hold pkg_temp_lock
  83. *
  84. * - Other callsites: Must hold pkg_temp_lock
  85. */
  86. static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu)
  87. {
  88. int id = topology_logical_die_id(cpu);
  89. if (id >= 0 && id < max_id)
  90. return zones[id];
  91. return NULL;
  92. }
  93. static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
  94. {
  95. struct zone_device *zonedev = thermal_zone_device_priv(tzd);
  96. int val, ret;
  97. ret = intel_tcc_get_temp(zonedev->cpu, &val, true);
  98. if (ret < 0)
  99. return ret;
  100. *temp = val * 1000;
  101. pr_debug("sys_get_curr_temp %d\n", *temp);
  102. return 0;
  103. }
  104. static int
  105. sys_set_trip_temp(struct thermal_zone_device *tzd,
  106. const struct thermal_trip *trip, int temp)
  107. {
  108. struct zone_device *zonedev = thermal_zone_device_priv(tzd);
  109. unsigned int trip_index = THERMAL_TRIP_PRIV_TO_INT(trip->priv);
  110. u32 l, h, mask, shift, intr;
  111. int tj_max, val, ret;
  112. if (temp == THERMAL_TEMP_INVALID)
  113. temp = 0;
  114. tj_max = intel_tcc_get_tjmax(zonedev->cpu);
  115. if (tj_max < 0)
  116. return tj_max;
  117. tj_max *= 1000;
  118. val = (tj_max - temp)/1000;
  119. if (trip_index >= MAX_NUMBER_OF_TRIPS || val < 0 || val > 0x7f)
  120. return -EINVAL;
  121. ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
  122. &l, &h);
  123. if (ret < 0)
  124. return ret;
  125. if (trip_index) {
  126. mask = THERM_MASK_THRESHOLD1;
  127. shift = THERM_SHIFT_THRESHOLD1;
  128. intr = THERM_INT_THRESHOLD1_ENABLE;
  129. } else {
  130. mask = THERM_MASK_THRESHOLD0;
  131. shift = THERM_SHIFT_THRESHOLD0;
  132. intr = THERM_INT_THRESHOLD0_ENABLE;
  133. }
  134. l &= ~mask;
  135. /*
  136. * When users space sets a trip temperature == 0, which is indication
  137. * that, it is no longer interested in receiving notifications.
  138. */
  139. if (!temp) {
  140. l &= ~intr;
  141. } else {
  142. l |= val << shift;
  143. l |= intr;
  144. }
  145. return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
  146. l, h);
  147. }
  148. /* Thermal zone callback registry */
  149. static const struct thermal_zone_device_ops tzone_ops = {
  150. .get_temp = sys_get_curr_temp,
  151. .set_trip_temp = sys_set_trip_temp,
  152. };
  153. static bool pkg_thermal_rate_control(void)
  154. {
  155. return true;
  156. }
  157. /* Enable threshold interrupt on local package/cpu */
  158. static inline void enable_pkg_thres_interrupt(void)
  159. {
  160. u8 thres_0, thres_1;
  161. u32 l, h;
  162. rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
  163. /* only enable/disable if it had valid threshold value */
  164. thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0;
  165. thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1;
  166. if (thres_0)
  167. l |= THERM_INT_THRESHOLD0_ENABLE;
  168. if (thres_1)
  169. l |= THERM_INT_THRESHOLD1_ENABLE;
  170. wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
  171. }
  172. /* Disable threshold interrupt on local package/cpu */
  173. static inline void disable_pkg_thres_interrupt(void)
  174. {
  175. u32 l, h;
  176. rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
  177. l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE);
  178. wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
  179. }
  180. static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
  181. {
  182. struct thermal_zone_device *tzone = NULL;
  183. int cpu = smp_processor_id();
  184. struct zone_device *zonedev;
  185. mutex_lock(&thermal_zone_mutex);
  186. raw_spin_lock_irq(&pkg_temp_lock);
  187. ++pkg_work_cnt;
  188. zonedev = pkg_temp_thermal_get_dev(cpu);
  189. if (!zonedev) {
  190. raw_spin_unlock_irq(&pkg_temp_lock);
  191. mutex_unlock(&thermal_zone_mutex);
  192. return;
  193. }
  194. zonedev->work_scheduled = false;
  195. thermal_clear_package_intr_status(PACKAGE_LEVEL, THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1);
  196. tzone = zonedev->tzone;
  197. enable_pkg_thres_interrupt();
  198. raw_spin_unlock_irq(&pkg_temp_lock);
  199. /*
  200. * If tzone is not NULL, then thermal_zone_mutex will prevent the
  201. * concurrent removal in the cpu offline callback.
  202. */
  203. if (tzone)
  204. thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED);
  205. mutex_unlock(&thermal_zone_mutex);
  206. }
  207. static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work)
  208. {
  209. unsigned long ms = msecs_to_jiffies(notify_delay_ms);
  210. schedule_delayed_work_on(cpu, work, ms);
  211. }
  212. static int pkg_thermal_notify(u64 msr_val)
  213. {
  214. int cpu = smp_processor_id();
  215. struct zone_device *zonedev;
  216. unsigned long flags;
  217. raw_spin_lock_irqsave(&pkg_temp_lock, flags);
  218. ++pkg_interrupt_cnt;
  219. disable_pkg_thres_interrupt();
  220. /* Work is per package, so scheduling it once is enough. */
  221. zonedev = pkg_temp_thermal_get_dev(cpu);
  222. if (zonedev && !zonedev->work_scheduled) {
  223. zonedev->work_scheduled = true;
  224. pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
  225. }
  226. raw_spin_unlock_irqrestore(&pkg_temp_lock, flags);
  227. return 0;
  228. }
  229. static int pkg_temp_thermal_trips_init(int cpu, int tj_max,
  230. struct thermal_trip *trips, int num_trips)
  231. {
  232. unsigned long thres_reg_value;
  233. u32 mask, shift, eax, edx;
  234. int ret, i;
  235. for (i = 0; i < num_trips; i++) {
  236. if (i) {
  237. mask = THERM_MASK_THRESHOLD1;
  238. shift = THERM_SHIFT_THRESHOLD1;
  239. } else {
  240. mask = THERM_MASK_THRESHOLD0;
  241. shift = THERM_SHIFT_THRESHOLD0;
  242. }
  243. ret = rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
  244. &eax, &edx);
  245. if (ret < 0)
  246. return ret;
  247. thres_reg_value = (eax & mask) >> shift;
  248. trips[i].temperature = thres_reg_value ?
  249. tj_max - thres_reg_value * 1000 : THERMAL_TEMP_INVALID;
  250. trips[i].type = THERMAL_TRIP_PASSIVE;
  251. trips[i].flags |= THERMAL_TRIP_FLAG_RW_TEMP;
  252. trips[i].priv = THERMAL_INT_TO_TRIP_PRIV(i);
  253. pr_debug("%s: cpu=%d, trip=%d, temp=%d\n",
  254. __func__, cpu, i, trips[i].temperature);
  255. }
  256. return 0;
  257. }
  258. static int pkg_temp_thermal_device_add(unsigned int cpu)
  259. {
  260. struct thermal_trip trips[MAX_NUMBER_OF_TRIPS] = { 0 };
  261. int id = topology_logical_die_id(cpu);
  262. u32 eax, ebx, ecx, edx;
  263. struct zone_device *zonedev;
  264. int thres_count, err;
  265. int tj_max;
  266. if (id >= max_id)
  267. return -ENOMEM;
  268. cpuid(6, &eax, &ebx, &ecx, &edx);
  269. thres_count = ebx & 0x07;
  270. if (!thres_count)
  271. return -ENODEV;
  272. thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS);
  273. tj_max = intel_tcc_get_tjmax(cpu);
  274. if (tj_max < 0)
  275. return tj_max;
  276. tj_max *= 1000;
  277. zonedev = kzalloc_obj(*zonedev);
  278. if (!zonedev)
  279. return -ENOMEM;
  280. err = pkg_temp_thermal_trips_init(cpu, tj_max, trips, thres_count);
  281. if (err)
  282. goto out_kfree_zonedev;
  283. INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn);
  284. zonedev->cpu = cpu;
  285. zonedev->tzone = thermal_zone_device_register_with_trips("x86_pkg_temp",
  286. trips, thres_count,
  287. zonedev, &tzone_ops, &pkg_temp_tz_params, 0, 0);
  288. if (IS_ERR(zonedev->tzone)) {
  289. err = PTR_ERR(zonedev->tzone);
  290. goto out_kfree_zonedev;
  291. }
  292. err = thermal_zone_device_enable(zonedev->tzone);
  293. if (err)
  294. goto out_unregister_tz;
  295. /* Store MSR value for package thermal interrupt, to restore at exit */
  296. rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low,
  297. zonedev->msr_pkg_therm_high);
  298. cpumask_set_cpu(cpu, &zonedev->cpumask);
  299. raw_spin_lock_irq(&pkg_temp_lock);
  300. zones[id] = zonedev;
  301. raw_spin_unlock_irq(&pkg_temp_lock);
  302. return 0;
  303. out_unregister_tz:
  304. thermal_zone_device_unregister(zonedev->tzone);
  305. out_kfree_zonedev:
  306. kfree(zonedev);
  307. return err;
  308. }
  309. static int pkg_thermal_cpu_offline(unsigned int cpu)
  310. {
  311. struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
  312. bool lastcpu, was_target;
  313. int target;
  314. if (!zonedev)
  315. return 0;
  316. target = cpumask_any_but(&zonedev->cpumask, cpu);
  317. cpumask_clear_cpu(cpu, &zonedev->cpumask);
  318. lastcpu = target >= nr_cpu_ids;
  319. /*
  320. * Remove the sysfs files, if this is the last cpu in the package
  321. * before doing further cleanups.
  322. */
  323. if (lastcpu) {
  324. struct thermal_zone_device *tzone = zonedev->tzone;
  325. /*
  326. * We must protect against a work function calling
  327. * thermal_zone_update, after/while unregister. We null out
  328. * the pointer under the zone mutex, so the worker function
  329. * won't try to call.
  330. */
  331. mutex_lock(&thermal_zone_mutex);
  332. zonedev->tzone = NULL;
  333. mutex_unlock(&thermal_zone_mutex);
  334. thermal_zone_device_unregister(tzone);
  335. }
  336. /* Protect against work and interrupts */
  337. raw_spin_lock_irq(&pkg_temp_lock);
  338. /*
  339. * Check whether this cpu was the current target and store the new
  340. * one. When we drop the lock, then the interrupt notify function
  341. * will see the new target.
  342. */
  343. was_target = zonedev->cpu == cpu;
  344. zonedev->cpu = target;
  345. /*
  346. * If this is the last CPU in the package remove the package
  347. * reference from the array and restore the interrupt MSR. When we
  348. * drop the lock neither the interrupt notify function nor the
  349. * worker will see the package anymore.
  350. */
  351. if (lastcpu) {
  352. zones[topology_logical_die_id(cpu)] = NULL;
  353. /* After this point nothing touches the MSR anymore. */
  354. wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
  355. zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high);
  356. }
  357. /*
  358. * Check whether there is work scheduled and whether the work is
  359. * targeted at the outgoing CPU.
  360. */
  361. if (zonedev->work_scheduled && was_target) {
  362. /*
  363. * To cancel the work we need to drop the lock, otherwise
  364. * we might deadlock if the work needs to be flushed.
  365. */
  366. raw_spin_unlock_irq(&pkg_temp_lock);
  367. cancel_delayed_work_sync(&zonedev->work);
  368. raw_spin_lock_irq(&pkg_temp_lock);
  369. /*
  370. * If this is not the last cpu in the package and the work
  371. * did not run after we dropped the lock above, then we
  372. * need to reschedule the work, otherwise the interrupt
  373. * stays disabled forever.
  374. */
  375. if (!lastcpu && zonedev->work_scheduled)
  376. pkg_thermal_schedule_work(target, &zonedev->work);
  377. }
  378. raw_spin_unlock_irq(&pkg_temp_lock);
  379. /* Final cleanup if this is the last cpu */
  380. if (lastcpu)
  381. kfree(zonedev);
  382. return 0;
  383. }
  384. static int pkg_thermal_cpu_online(unsigned int cpu)
  385. {
  386. struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu);
  387. struct cpuinfo_x86 *c = &cpu_data(cpu);
  388. /* Paranoia check */
  389. if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS))
  390. return -ENODEV;
  391. /* If the package exists, nothing to do */
  392. if (zonedev) {
  393. cpumask_set_cpu(cpu, &zonedev->cpumask);
  394. return 0;
  395. }
  396. return pkg_temp_thermal_device_add(cpu);
  397. }
  398. static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = {
  399. X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_PTS, NULL),
  400. {}
  401. };
  402. MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids);
  403. static int __init pkg_temp_thermal_init(void)
  404. {
  405. int ret;
  406. if (!x86_match_cpu(pkg_temp_thermal_ids))
  407. return -ENODEV;
  408. max_id = topology_max_packages() * topology_max_dies_per_package();
  409. zones = kzalloc_objs(struct zone_device *, max_id);
  410. if (!zones)
  411. return -ENOMEM;
  412. ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online",
  413. pkg_thermal_cpu_online, pkg_thermal_cpu_offline);
  414. if (ret < 0)
  415. goto err;
  416. /* Store the state for module exit */
  417. pkg_thermal_hp_state = ret;
  418. platform_thermal_package_notify = pkg_thermal_notify;
  419. platform_thermal_package_rate_control = pkg_thermal_rate_control;
  420. /* Don't care if it fails */
  421. pkg_temp_debugfs_init();
  422. return 0;
  423. err:
  424. kfree(zones);
  425. return ret;
  426. }
  427. module_init(pkg_temp_thermal_init)
  428. static void __exit pkg_temp_thermal_exit(void)
  429. {
  430. platform_thermal_package_notify = NULL;
  431. platform_thermal_package_rate_control = NULL;
  432. cpuhp_remove_state(pkg_thermal_hp_state);
  433. debugfs_remove_recursive(debugfs);
  434. kfree(zones);
  435. }
  436. module_exit(pkg_temp_thermal_exit)
  437. MODULE_IMPORT_NS("INTEL_TCC");
  438. MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver");
  439. MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
  440. MODULE_LICENSE("GPL v2");