teo.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Timer events oriented CPU idle governor
  4. *
  5. * Copyright (C) 2018 - 2021 Intel Corporation
  6. * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
  7. */
  8. /**
  9. * DOC: teo-description
  10. *
  11. * The idea of this governor is based on the observation that on many systems
  12. * timer interrupts are two or more orders of magnitude more frequent than any
  13. * other interrupt types, so they are likely to dominate CPU wakeup patterns.
  14. * Moreover, in principle, the time when the next timer event is going to occur
  15. * can be determined at the idle state selection time, although doing that may
  16. * be costly, so it can be regarded as the most reliable source of information
  17. * for idle state selection.
  18. *
  19. * Of course, non-timer wakeup sources are more important in some use cases,
  20. * but even then it is generally unnecessary to consider idle duration values
  21. * greater than the time till the next timer event, referred as the sleep
  22. * length in what follows, because the closest timer will ultimately wake up the
  23. * CPU anyway unless it is woken up earlier.
  24. *
  25. * However, since obtaining the sleep length may be costly, the governor first
  26. * checks if it can select a shallow idle state using wakeup pattern information
  27. * from recent times, in which case it can do without knowing the sleep length
  28. * at all. For this purpose, it counts CPU wakeup events and looks for an idle
  29. * state whose target residency has not exceeded the idle duration (measured
  30. * after wakeup) in the majority of relevant recent cases. If the target
  31. * residency of that state is small enough, it may be used right away and the
  32. * sleep length need not be determined.
  33. *
  34. * The computations carried out by this governor are based on using bins whose
  35. * boundaries are aligned with the target residency parameter values of the CPU
  36. * idle states provided by the %CPUIdle driver in the ascending order. That is,
  37. * the first bin spans from 0 up to, but not including, the target residency of
  38. * the second idle state (idle state 1), the second bin spans from the target
  39. * residency of idle state 1 up to, but not including, the target residency of
  40. * idle state 2, the third bin spans from the target residency of idle state 2
  41. * up to, but not including, the target residency of idle state 3 and so on.
  42. * The last bin spans from the target residency of the deepest idle state
  43. * supplied by the driver to infinity.
  44. *
  45. * Two metrics called "hits" and "intercepts" are associated with each bin.
  46. * They are updated every time before selecting an idle state for the given CPU
  47. * in accordance with what happened last time.
  48. *
  49. * The "hits" metric reflects the relative frequency of situations in which the
  50. * sleep length and the idle duration measured after CPU wakeup are close enough
  51. * (that is, the CPU appears to wake up "on time" relative to the sleep length).
  52. * In turn, the "intercepts" metric reflects the relative frequency of non-timer
  53. * wakeup events for which the measured idle duration is significantly different
  54. * from the sleep length (these events are also referred to as "intercepts"
  55. * below).
  56. *
  57. * The governor also counts "intercepts" with the measured idle duration below
  58. * the tick period length and uses this information when deciding whether or not
  59. * to stop the scheduler tick.
  60. *
  61. * In order to select an idle state for a CPU, the governor takes the following
  62. * steps (modulo the possible latency constraint that must be taken into account
  63. * too):
  64. *
  65. * 1. Find the deepest enabled CPU idle state (the candidate idle state) and
  66. * compute 2 sums as follows:
  67. *
  68. * - The sum of the "hits" metric for all of the idle states shallower than
  69. * the candidate one (it represents the cases in which the CPU was likely
  70. * woken up by a timer).
  71. *
  72. * - The sum of the "intercepts" metric for all of the idle states shallower
  73. * than the candidate one (it represents the cases in which the CPU was
  74. * likely woken up by a non-timer wakeup source).
  75. *
  76. * Also find the idle state with the maximum intercepts metric (if there are
  77. * multiple states with the maximum intercepts metric, choose the one with
  78. * the highest index).
  79. *
  80. * 2. If the second sum computed in step 1 is greater than a half of the sum of
  81. * both metrics for the candidate state bin and all subsequent bins (if any),
  82. * a shallower idle state is likely to be more suitable, so look for it.
  83. *
  84. * - Traverse the enabled idle states shallower than the candidate one in the
  85. * descending order, starting at the state with the maximum intercepts
  86. * metric found in step 1.
  87. *
  88. * - For each of them compute the sum of the "intercepts" metrics over all
  89. * of the idle states between it and the candidate one (including the
  90. * former and excluding the latter).
  91. *
  92. * - If this sum is greater than a half of the second sum computed in step 1,
  93. * use the given idle state as the new candidate one.
  94. *
  95. * 3. If the current candidate state is state 0 or its target residency is short
  96. * enough, return it and prevent the scheduler tick from being stopped.
  97. *
  98. * 4. Obtain the sleep length value and check if it is below the target
  99. * residency of the current candidate state, in which case a new shallower
  100. * candidate state needs to be found, so look for it.
  101. */
  102. #include <linux/cpuidle.h>
  103. #include <linux/jiffies.h>
  104. #include <linux/kernel.h>
  105. #include <linux/sched/clock.h>
  106. #include <linux/tick.h>
  107. #include "gov.h"
  108. /*
  109. * Idle state exit latency threshold used for deciding whether or not to check
  110. * the time till the closest expected timer event.
  111. */
  112. #define LATENCY_THRESHOLD_NS (RESIDENCY_THRESHOLD_NS / 2)
  113. /*
  114. * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
  115. * is used for decreasing metrics on a regular basis.
  116. */
  117. #define PULSE 1024
  118. #define DECAY_SHIFT 3
  119. /**
  120. * struct teo_bin - Metrics used by the TEO cpuidle governor.
  121. * @intercepts: The "intercepts" metric.
  122. * @hits: The "hits" metric.
  123. */
  124. struct teo_bin {
  125. unsigned int intercepts;
  126. unsigned int hits;
  127. };
  128. /**
  129. * struct teo_cpu - CPU data used by the TEO cpuidle governor.
  130. * @sleep_length_ns: Time till the closest timer event (at the selection time).
  131. * @state_bins: Idle state data bins for this CPU.
  132. * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
  133. * @total_tick: Wakeups by the scheduler tick.
  134. * @tick_intercepts: "Intercepts" before TICK_NSEC.
  135. * @short_idles: Wakeups after short idle periods.
  136. * @tick_wakeup: Set if the last wakeup was by the scheduler tick.
  137. */
  138. struct teo_cpu {
  139. s64 sleep_length_ns;
  140. struct teo_bin state_bins[CPUIDLE_STATE_MAX];
  141. unsigned int total;
  142. unsigned int total_tick;
  143. unsigned int tick_intercepts;
  144. unsigned int short_idles;
  145. bool tick_wakeup;
  146. };
  147. static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
  148. static void teo_decay(unsigned int *metric)
  149. {
  150. unsigned int delta = *metric >> DECAY_SHIFT;
  151. if (delta)
  152. *metric -= delta;
  153. else
  154. *metric = 0;
  155. }
  156. /**
  157. * teo_update - Update CPU metrics after wakeup.
  158. * @drv: cpuidle driver containing state data.
  159. * @dev: Target CPU.
  160. */
  161. static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
  162. {
  163. s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
  164. struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
  165. int i, idx_timer = 0, idx_duration = 0;
  166. s64 target_residency_ns, measured_ns;
  167. unsigned int total = 0;
  168. teo_decay(&cpu_data->short_idles);
  169. if (dev->poll_time_limit) {
  170. dev->poll_time_limit = false;
  171. /*
  172. * Polling state timeout has triggered, so assume that this
  173. * might have been a long sleep.
  174. */
  175. measured_ns = S64_MAX;
  176. } else {
  177. measured_ns = dev->last_residency_ns;
  178. /*
  179. * The delay between the wakeup and the first instruction
  180. * executed by the CPU is not likely to be worst-case every
  181. * time, so take 1/2 of the exit latency as a very rough
  182. * approximation of the average of it.
  183. */
  184. if (measured_ns >= lat_ns) {
  185. measured_ns -= lat_ns / 2;
  186. if (measured_ns < RESIDENCY_THRESHOLD_NS)
  187. cpu_data->short_idles += PULSE;
  188. } else {
  189. measured_ns /= 2;
  190. cpu_data->short_idles += PULSE;
  191. }
  192. }
  193. /*
  194. * Decay the "hits" and "intercepts" metrics for all of the bins and
  195. * find the bins that the sleep length and the measured idle duration
  196. * fall into.
  197. */
  198. for (i = 0; i < drv->state_count; i++) {
  199. struct teo_bin *bin = &cpu_data->state_bins[i];
  200. teo_decay(&bin->hits);
  201. total += bin->hits;
  202. teo_decay(&bin->intercepts);
  203. total += bin->intercepts;
  204. target_residency_ns = drv->states[i].target_residency_ns;
  205. if (target_residency_ns <= cpu_data->sleep_length_ns) {
  206. idx_timer = i;
  207. if (target_residency_ns <= measured_ns)
  208. idx_duration = i;
  209. }
  210. }
  211. cpu_data->total = total + PULSE;
  212. teo_decay(&cpu_data->tick_intercepts);
  213. teo_decay(&cpu_data->total_tick);
  214. if (cpu_data->tick_wakeup) {
  215. cpu_data->total_tick += PULSE;
  216. /*
  217. * If tick wakeups dominate the wakeup pattern, count this one
  218. * as a hit on the deepest available idle state to increase the
  219. * likelihood of stopping the tick.
  220. */
  221. if (3 * cpu_data->total_tick > 2 * cpu_data->total) {
  222. cpu_data->state_bins[drv->state_count-1].hits += PULSE;
  223. return;
  224. }
  225. /*
  226. * If intercepts within the tick period range are not frequent
  227. * enough, count this wakeup as a hit, since it is likely that
  228. * the tick has woken up the CPU because an expected intercept
  229. * was not there. Otherwise, one of the intercepts may have
  230. * been incidentally preceded by the tick wakeup.
  231. */
  232. if (3 * cpu_data->tick_intercepts < 2 * total) {
  233. cpu_data->state_bins[idx_timer].hits += PULSE;
  234. return;
  235. }
  236. }
  237. /*
  238. * If the measured idle duration (adjusted for the entered state exit
  239. * latency) falls into the same bin as the sleep length and the latter
  240. * is less than the "raw" measured idle duration (so the wakeup appears
  241. * to have occurred after the anticipated timer event), this is a "hit",
  242. * so update the "hits" metric for that bin.
  243. *
  244. * Otherwise, update the "intercepts" metric for the bin fallen into by
  245. * the measured idle duration.
  246. */
  247. if (idx_timer == idx_duration &&
  248. cpu_data->sleep_length_ns - measured_ns < lat_ns / 2) {
  249. cpu_data->state_bins[idx_timer].hits += PULSE;
  250. } else {
  251. cpu_data->state_bins[idx_duration].intercepts += PULSE;
  252. if (measured_ns <= TICK_NSEC)
  253. cpu_data->tick_intercepts += PULSE;
  254. }
  255. }
  256. /**
  257. * teo_find_shallower_state - Find shallower idle state matching given duration.
  258. * @drv: cpuidle driver containing state data.
  259. * @dev: Target CPU.
  260. * @state_idx: Index of the capping idle state.
  261. * @duration_ns: Idle duration value to match.
  262. */
  263. static int teo_find_shallower_state(struct cpuidle_driver *drv,
  264. struct cpuidle_device *dev, int state_idx,
  265. s64 duration_ns)
  266. {
  267. int i;
  268. for (i = state_idx - 1; i >= 0; i--) {
  269. if (dev->states_usage[i].disable)
  270. continue;
  271. state_idx = i;
  272. if (drv->states[i].target_residency_ns <= duration_ns)
  273. break;
  274. }
  275. return state_idx;
  276. }
  277. /**
  278. * teo_select - Selects the next idle state to enter.
  279. * @drv: cpuidle driver containing state data.
  280. * @dev: Target CPU.
  281. * @stop_tick: Indication on whether or not to stop the scheduler tick.
  282. */
  283. static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
  284. bool *stop_tick)
  285. {
  286. struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
  287. s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
  288. ktime_t delta_tick = TICK_NSEC / 2;
  289. unsigned int idx_intercept_sum = 0;
  290. unsigned int intercept_sum = 0;
  291. unsigned int intercept_max = 0;
  292. unsigned int idx_hit_sum = 0;
  293. unsigned int hit_sum = 0;
  294. int intercept_max_idx = -1;
  295. int constraint_idx = 0;
  296. int idx0 = 0, idx = -1;
  297. s64 duration_ns;
  298. int i;
  299. if (dev->last_state_idx >= 0) {
  300. teo_update(drv, dev);
  301. dev->last_state_idx = -1;
  302. }
  303. /*
  304. * Set the sleep length to infinity in case the invocation of
  305. * tick_nohz_get_sleep_length() below is skipped, in which case it won't
  306. * be known whether or not the subsequent wakeup is caused by a timer.
  307. * It is generally fine to count the wakeup as an intercept then, except
  308. * for the cases when the CPU is mostly woken up by timers and there may
  309. * be opportunities to ask for a deeper idle state when no imminent
  310. * timers are scheduled which may be missed.
  311. */
  312. cpu_data->sleep_length_ns = KTIME_MAX;
  313. if (!dev->states_usage[0].disable)
  314. idx = 0;
  315. /*
  316. * Compute the sums of metrics for early wakeup pattern detection and
  317. * look for the state bin with the maximum intercepts metric below the
  318. * deepest enabled one (if there are multiple states with the maximum
  319. * intercepts metric, choose the one with the highest index).
  320. */
  321. for (i = 1; i < drv->state_count; i++) {
  322. struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
  323. unsigned int prev_intercepts = prev_bin->intercepts;
  324. struct cpuidle_state *s = &drv->states[i];
  325. /*
  326. * Update the sums of idle state metrics for all of the states
  327. * shallower than the current one.
  328. */
  329. hit_sum += prev_bin->hits;
  330. intercept_sum += prev_intercepts;
  331. /*
  332. * Check if this is the bin with the maximum number of
  333. * intercepts so far and in that case update the index of
  334. * the state with the maximum intercepts metric.
  335. */
  336. if (prev_intercepts >= intercept_max) {
  337. intercept_max = prev_intercepts;
  338. intercept_max_idx = i - 1;
  339. }
  340. if (dev->states_usage[i].disable)
  341. continue;
  342. if (idx < 0)
  343. idx0 = i; /* first enabled state */
  344. idx = i;
  345. if (s->exit_latency_ns <= latency_req)
  346. constraint_idx = i;
  347. /* Save the sums for the current state. */
  348. idx_intercept_sum = intercept_sum;
  349. idx_hit_sum = hit_sum;
  350. }
  351. /* Avoid unnecessary overhead. */
  352. if (idx < 0) {
  353. idx = 0; /* No states enabled, must use 0. */
  354. goto out_tick;
  355. }
  356. if (idx == idx0) {
  357. /*
  358. * Only one idle state is enabled, so use it, but do not
  359. * allow the tick to be stopped it is shallow enough.
  360. */
  361. duration_ns = drv->states[idx].target_residency_ns;
  362. goto end;
  363. }
  364. /*
  365. * If the sum of the intercepts metric for all of the idle states
  366. * shallower than the current candidate one (idx) is greater than the
  367. * sum of the intercepts and hits metrics for the candidate state and
  368. * all of the deeper states, a shallower idle state is likely to be a
  369. * better choice.
  370. */
  371. if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
  372. int min_idx = idx0;
  373. if (tick_nohz_tick_stopped()) {
  374. /*
  375. * Look for the shallowest idle state below the current
  376. * candidate one whose target residency is at least
  377. * equal to the tick period length.
  378. */
  379. while (min_idx < idx &&
  380. drv->states[min_idx].target_residency_ns < TICK_NSEC)
  381. min_idx++;
  382. /*
  383. * Avoid selecting a state with a lower index, but with
  384. * the same target residency as the current candidate
  385. * one.
  386. */
  387. if (drv->states[min_idx].target_residency_ns ==
  388. drv->states[idx].target_residency_ns)
  389. goto constraint;
  390. }
  391. /*
  392. * If the minimum state index is greater than or equal to the
  393. * index of the state with the maximum intercepts metric and
  394. * the corresponding state is enabled, there is no need to look
  395. * at the deeper states.
  396. */
  397. if (min_idx >= intercept_max_idx &&
  398. !dev->states_usage[min_idx].disable) {
  399. idx = min_idx;
  400. goto constraint;
  401. }
  402. /*
  403. * Look for the deepest enabled idle state, at most as deep as
  404. * the one with the maximum intercepts metric, whose target
  405. * residency had not been greater than the idle duration in over
  406. * a half of the relevant cases in the past.
  407. *
  408. * Take the possible duration limitation present if the tick
  409. * has been stopped already into account.
  410. */
  411. for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) {
  412. intercept_sum += cpu_data->state_bins[i].intercepts;
  413. if (dev->states_usage[i].disable)
  414. continue;
  415. idx = i;
  416. if (2 * intercept_sum > idx_intercept_sum &&
  417. i <= intercept_max_idx)
  418. break;
  419. }
  420. }
  421. constraint:
  422. /*
  423. * If there is a latency constraint, it may be necessary to select an
  424. * idle state shallower than the current candidate one.
  425. */
  426. if (idx > constraint_idx)
  427. idx = constraint_idx;
  428. /*
  429. * If either the candidate state is state 0 or its target residency is
  430. * low enough, there is basically nothing more to do, but if the sleep
  431. * length is not updated, the subsequent wakeup will be counted as an
  432. * "intercept" which may be problematic in the cases when timer wakeups
  433. * are dominant. Namely, it may effectively prevent deeper idle states
  434. * from being selected at one point even if no imminent timers are
  435. * scheduled.
  436. *
  437. * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one
  438. * CPU are unlikely (user space has a default 50 us slack value for
  439. * hrtimers and there are relatively few timers with a lower deadline
  440. * value in the kernel), and even if they did happen, the potential
  441. * benefit from using a deep idle state in that case would be
  442. * questionable anyway for latency reasons. Thus if the measured idle
  443. * duration falls into that range in the majority of cases, assume
  444. * non-timer wakeups to be dominant and skip updating the sleep length
  445. * to reduce latency.
  446. *
  447. * Also, if the latency constraint is sufficiently low, it will force
  448. * shallow idle states regardless of the wakeup type, so the sleep
  449. * length need not be known in that case.
  450. */
  451. if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
  452. (2 * cpu_data->short_idles >= cpu_data->total ||
  453. latency_req < LATENCY_THRESHOLD_NS))
  454. goto out_tick;
  455. duration_ns = tick_nohz_get_sleep_length(&delta_tick);
  456. cpu_data->sleep_length_ns = duration_ns;
  457. if (!idx)
  458. goto out_tick;
  459. /*
  460. * If the closest expected timer is before the target residency of the
  461. * candidate state, a shallower one needs to be found.
  462. */
  463. if (drv->states[idx].target_residency_ns > duration_ns)
  464. idx = teo_find_shallower_state(drv, dev, idx, duration_ns);
  465. /*
  466. * If the selected state's target residency is below the tick length
  467. * and intercepts occurring before the tick length are the majority of
  468. * total wakeup events, do not stop the tick.
  469. */
  470. if (drv->states[idx].target_residency_ns < TICK_NSEC &&
  471. 3 * cpu_data->tick_intercepts >= 2 * cpu_data->total)
  472. duration_ns = TICK_NSEC / 2;
  473. end:
  474. /*
  475. * Allow the tick to be stopped unless the selected state is a polling
  476. * one or the expected idle duration is shorter than the tick period
  477. * length.
  478. */
  479. if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
  480. duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
  481. return idx;
  482. /*
  483. * The tick is not going to be stopped, so if the target residency of
  484. * the state to be returned is not within the time till the closest
  485. * timer including the tick, try to correct that.
  486. */
  487. if (idx > idx0 &&
  488. drv->states[idx].target_residency_ns > delta_tick)
  489. idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
  490. out_tick:
  491. *stop_tick = false;
  492. return idx;
  493. }
  494. /**
  495. * teo_reflect - Note that governor data for the CPU need to be updated.
  496. * @dev: Target CPU.
  497. * @state: Entered state.
  498. */
  499. static void teo_reflect(struct cpuidle_device *dev, int state)
  500. {
  501. struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
  502. cpu_data->tick_wakeup = tick_nohz_idle_got_tick();
  503. dev->last_state_idx = state;
  504. }
  505. /**
  506. * teo_enable_device - Initialize the governor's data for the target CPU.
  507. * @drv: cpuidle driver (not used).
  508. * @dev: Target CPU.
  509. */
  510. static int teo_enable_device(struct cpuidle_driver *drv,
  511. struct cpuidle_device *dev)
  512. {
  513. struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
  514. memset(cpu_data, 0, sizeof(*cpu_data));
  515. return 0;
  516. }
  517. static struct cpuidle_governor teo_governor = {
  518. .name = "teo",
  519. .rating = 19,
  520. .enable = teo_enable_device,
  521. .select = teo_select,
  522. .reflect = teo_reflect,
  523. };
  524. static int __init teo_governor_init(void)
  525. {
  526. return cpuidle_register_governor(&teo_governor);
  527. }
  528. postcore_initcall(teo_governor_init);