menu.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * menu.c - the menu idle governor
  4. *
  5. * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
  6. * Copyright (C) 2009 Intel Corporation
  7. * Author:
  8. * Arjan van de Ven <arjan@linux.intel.com>
  9. */
  10. #include <linux/kernel.h>
  11. #include <linux/cpuidle.h>
  12. #include <linux/time.h>
  13. #include <linux/ktime.h>
  14. #include <linux/hrtimer.h>
  15. #include <linux/tick.h>
  16. #include <linux/sched/stat.h>
  17. #include <linux/math64.h>
  18. #include "gov.h"
  19. #define BUCKETS 6
  20. #define INTERVAL_SHIFT 3
  21. #define INTERVALS (1UL << INTERVAL_SHIFT)
  22. #define RESOLUTION 1024
  23. #define DECAY 8
  24. #define MAX_INTERESTING (50000 * NSEC_PER_USEC)
  25. /*
  26. * Concepts and ideas behind the menu governor
  27. *
  28. * For the menu governor, there are 2 decision factors for picking a C
  29. * state:
  30. * 1) Energy break even point
  31. * 2) Latency tolerance (from pmqos infrastructure)
  32. * These two factors are treated independently.
  33. *
  34. * Energy break even point
  35. * -----------------------
  36. * C state entry and exit have an energy cost, and a certain amount of time in
  37. * the C state is required to actually break even on this cost. CPUIDLE
  38. * provides us this duration in the "target_residency" field. So all that we
  39. * need is a good prediction of how long we'll be idle. Like the traditional
  40. * menu governor, we take the actual known "next timer event" time.
  41. *
  42. * Since there are other source of wakeups (interrupts for example) than
  43. * the next timer event, this estimation is rather optimistic. To get a
  44. * more realistic estimate, a correction factor is applied to the estimate,
  45. * that is based on historic behavior. For example, if in the past the actual
  46. * duration always was 50% of the next timer tick, the correction factor will
  47. * be 0.5.
  48. *
  49. * menu uses a running average for this correction factor, but it uses a set of
  50. * factors, not just a single factor. This stems from the realization that the
  51. * ratio is dependent on the order of magnitude of the expected duration; if we
  52. * expect 500 milliseconds of idle time the likelihood of getting an interrupt
  53. * very early is much higher than if we expect 50 micro seconds of idle time.
  54. * For this reason, menu keeps an array of 6 independent factors, that gets
  55. * indexed based on the magnitude of the expected duration.
  56. *
  57. * Repeatable-interval-detector
  58. * ----------------------------
  59. * There are some cases where "next timer" is a completely unusable predictor:
  60. * Those cases where the interval is fixed, for example due to hardware
  61. * interrupt mitigation, but also due to fixed transfer rate devices like mice.
  62. * For this, we use a different predictor: We track the duration of the last 8
  63. * intervals and use them to estimate the duration of the next one.
  64. */
  65. struct menu_device {
  66. int needs_update;
  67. int tick_wakeup;
  68. u64 next_timer_ns;
  69. unsigned int bucket;
  70. unsigned int correction_factor[BUCKETS];
  71. unsigned int intervals[INTERVALS];
  72. int interval_ptr;
  73. };
  74. static inline int which_bucket(u64 duration_ns)
  75. {
  76. int bucket = 0;
  77. if (duration_ns < 10ULL * NSEC_PER_USEC)
  78. return bucket;
  79. if (duration_ns < 100ULL * NSEC_PER_USEC)
  80. return bucket + 1;
  81. if (duration_ns < 1000ULL * NSEC_PER_USEC)
  82. return bucket + 2;
  83. if (duration_ns < 10000ULL * NSEC_PER_USEC)
  84. return bucket + 3;
  85. if (duration_ns < 100000ULL * NSEC_PER_USEC)
  86. return bucket + 4;
  87. return bucket + 5;
  88. }
  89. static DEFINE_PER_CPU(struct menu_device, menu_devices);
  90. static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
  91. {
  92. /* Update the repeating-pattern data. */
  93. data->intervals[data->interval_ptr++] = interval_us;
  94. if (data->interval_ptr >= INTERVALS)
  95. data->interval_ptr = 0;
  96. }
  97. static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
  98. /*
  99. * Try detecting repeating patterns by keeping track of the last 8
  100. * intervals, and checking if the standard deviation of that set
  101. * of points is below a threshold. If it is... then use the
  102. * average of these 8 points as the estimated value.
  103. */
  104. static unsigned int get_typical_interval(struct menu_device *data)
  105. {
  106. s64 value, min_thresh = -1, max_thresh = UINT_MAX;
  107. unsigned int max, min, divisor;
  108. u64 avg, variance, avg_sq;
  109. int i;
  110. again:
  111. /* Compute the average and variance of past intervals. */
  112. max = 0;
  113. min = UINT_MAX;
  114. avg = 0;
  115. variance = 0;
  116. divisor = 0;
  117. for (i = 0; i < INTERVALS; i++) {
  118. value = data->intervals[i];
  119. /*
  120. * Discard the samples outside the interval between the min and
  121. * max thresholds.
  122. */
  123. if (value <= min_thresh || value >= max_thresh)
  124. continue;
  125. divisor++;
  126. avg += value;
  127. variance += value * value;
  128. if (value > max)
  129. max = value;
  130. if (value < min)
  131. min = value;
  132. }
  133. if (!max)
  134. return UINT_MAX;
  135. if (divisor == INTERVALS) {
  136. avg >>= INTERVAL_SHIFT;
  137. variance >>= INTERVAL_SHIFT;
  138. } else {
  139. do_div(avg, divisor);
  140. do_div(variance, divisor);
  141. }
  142. avg_sq = avg * avg;
  143. variance -= avg_sq;
  144. /*
  145. * The typical interval is obtained when standard deviation is
  146. * small (stddev <= 20 us, variance <= 400 us^2) or standard
  147. * deviation is small compared to the average interval (avg >
  148. * 6*stddev, avg^2 > 36*variance). The average is smaller than
  149. * UINT_MAX aka U32_MAX, so computing its square does not
  150. * overflow a u64. We simply reject this candidate average if
  151. * the standard deviation is greater than 715 s (which is
  152. * rather unlikely).
  153. *
  154. * Use this result only if there is no timer to wake us up sooner.
  155. */
  156. if (likely(variance <= U64_MAX/36)) {
  157. if ((avg_sq > variance * 36 && divisor * 4 >= INTERVALS * 3) ||
  158. variance <= 400)
  159. return avg;
  160. }
  161. /*
  162. * If there are outliers, discard them by setting thresholds to exclude
  163. * data points at a large enough distance from the average, then
  164. * calculate the average and standard deviation again. Once we get
  165. * down to the last 3/4 of our samples, stop excluding samples.
  166. *
  167. * This can deal with workloads that have long pauses interspersed
  168. * with sporadic activity with a bunch of short pauses.
  169. *
  170. * However, if the number of remaining samples is too small to exclude
  171. * any more outliers, allow the deepest available idle state to be
  172. * selected because there are systems where the time spent by CPUs in
  173. * deep idle states is correlated to the maximum frequency the CPUs
  174. * can get to. On those systems, shallow idle states should be avoided
  175. * unless there is a clear indication that the given CPU is most likley
  176. * going to be woken up shortly.
  177. */
  178. if (divisor * 4 <= INTERVALS * 3)
  179. return UINT_MAX;
  180. /* Update the thresholds for the next round. */
  181. if (avg - min > max - avg)
  182. min_thresh = min;
  183. else
  184. max_thresh = max;
  185. goto again;
  186. }
  187. /**
  188. * menu_select - selects the next idle state to enter
  189. * @drv: cpuidle driver containing state data
  190. * @dev: the CPU
  191. * @stop_tick: indication on whether or not to stop the tick
  192. */
  193. static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
  194. bool *stop_tick)
  195. {
  196. struct menu_device *data = this_cpu_ptr(&menu_devices);
  197. s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
  198. u64 predicted_ns;
  199. ktime_t delta, delta_tick;
  200. int i, idx;
  201. if (data->needs_update) {
  202. menu_update(drv, dev);
  203. data->needs_update = 0;
  204. } else if (!dev->last_residency_ns) {
  205. /*
  206. * This happens when the driver rejects the previously selected
  207. * idle state and returns an error, so update the recent
  208. * intervals table to prevent invalid information from being
  209. * used going forward.
  210. */
  211. menu_update_intervals(data, UINT_MAX);
  212. }
  213. /* Find the shortest expected idle interval. */
  214. predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
  215. if (predicted_ns > RESIDENCY_THRESHOLD_NS || tick_nohz_tick_stopped()) {
  216. unsigned int timer_us;
  217. /* Determine the time till the closest timer. */
  218. delta = tick_nohz_get_sleep_length(&delta_tick);
  219. if (unlikely(delta < 0)) {
  220. delta = 0;
  221. delta_tick = 0;
  222. }
  223. data->next_timer_ns = delta;
  224. data->bucket = which_bucket(data->next_timer_ns);
  225. /* Round up the result for half microseconds. */
  226. timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
  227. data->next_timer_ns *
  228. data->correction_factor[data->bucket],
  229. RESOLUTION * DECAY * NSEC_PER_USEC);
  230. /* Use the lowest expected idle interval to pick the idle state. */
  231. predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
  232. /*
  233. * If the tick is already stopped, the cost of possible short
  234. * idle duration misprediction is much higher, because the CPU
  235. * may be stuck in a shallow idle state for a long time as a
  236. * result of it. In that case, say we might mispredict and use
  237. * the known time till the closest timer event for the idle
  238. * state selection.
  239. */
  240. if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC)
  241. predicted_ns = data->next_timer_ns;
  242. } else {
  243. /*
  244. * Because the next timer event is not going to be determined
  245. * in this case, assume that without the tick the closest timer
  246. * will be in distant future and that the closest tick will occur
  247. * after 1/2 of the tick period.
  248. */
  249. data->next_timer_ns = KTIME_MAX;
  250. delta_tick = TICK_NSEC / 2;
  251. data->bucket = BUCKETS - 1;
  252. }
  253. if (latency_req == 0 ||
  254. ((data->next_timer_ns < drv->states[1].target_residency_ns ||
  255. latency_req < drv->states[1].exit_latency_ns) &&
  256. !dev->states_usage[0].disable)) {
  257. /*
  258. * In this case state[0] will be used no matter what, so return
  259. * it right away and keep the tick running if state[0] is a
  260. * polling one.
  261. */
  262. *stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING);
  263. return 0;
  264. }
  265. /*
  266. * Find the idle state with the lowest power while satisfying
  267. * our constraints.
  268. */
  269. idx = -1;
  270. for (i = 0; i < drv->state_count; i++) {
  271. struct cpuidle_state *s = &drv->states[i];
  272. if (dev->states_usage[i].disable)
  273. continue;
  274. if (idx == -1)
  275. idx = i; /* first enabled state */
  276. if (s->exit_latency_ns > latency_req)
  277. break;
  278. if (s->target_residency_ns <= predicted_ns) {
  279. idx = i;
  280. continue;
  281. }
  282. /*
  283. * Use a physical idle state instead of busy polling so long as
  284. * its target residency is below the residency threshold, its
  285. * exit latency is not greater than the predicted idle duration,
  286. * and the next timer doesn't expire soon.
  287. */
  288. if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
  289. s->target_residency_ns < RESIDENCY_THRESHOLD_NS &&
  290. s->target_residency_ns <= data->next_timer_ns &&
  291. s->exit_latency_ns <= predicted_ns) {
  292. predicted_ns = s->target_residency_ns;
  293. idx = i;
  294. break;
  295. }
  296. if (predicted_ns < TICK_NSEC)
  297. break;
  298. if (!tick_nohz_tick_stopped()) {
  299. /*
  300. * If the state selected so far is shallow, waking up
  301. * early won't hurt, so retain the tick in that case and
  302. * let the governor run again in the next iteration of
  303. * the idle loop.
  304. */
  305. predicted_ns = drv->states[idx].target_residency_ns;
  306. break;
  307. }
  308. /*
  309. * If the state selected so far is shallow and this state's
  310. * target residency matches the time till the closest timer
  311. * event, select this one to avoid getting stuck in the shallow
  312. * one for too long.
  313. */
  314. if (drv->states[idx].target_residency_ns < TICK_NSEC &&
  315. s->target_residency_ns <= delta_tick)
  316. idx = i;
  317. return idx;
  318. }
  319. if (idx == -1)
  320. idx = 0; /* No states enabled. Must use 0. */
  321. /*
  322. * Don't stop the tick if the selected state is a polling one or if the
  323. * expected idle duration is shorter than the tick period length.
  324. */
  325. if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
  326. predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
  327. *stop_tick = false;
  328. if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
  329. /*
  330. * The tick is not going to be stopped and the target
  331. * residency of the state to be returned is not within
  332. * the time until the next timer event including the
  333. * tick, so try to correct that.
  334. */
  335. for (i = idx - 1; i >= 0; i--) {
  336. if (dev->states_usage[i].disable)
  337. continue;
  338. idx = i;
  339. if (drv->states[i].target_residency_ns <= delta_tick)
  340. break;
  341. }
  342. }
  343. }
  344. return idx;
  345. }
  346. /**
  347. * menu_reflect - records that data structures need update
  348. * @dev: the CPU
  349. * @index: the index of actual entered state
  350. *
  351. * NOTE: it's important to be fast here because this operation will add to
  352. * the overall exit latency.
  353. */
  354. static void menu_reflect(struct cpuidle_device *dev, int index)
  355. {
  356. struct menu_device *data = this_cpu_ptr(&menu_devices);
  357. dev->last_state_idx = index;
  358. data->needs_update = 1;
  359. data->tick_wakeup = tick_nohz_idle_got_tick();
  360. }
  361. /**
  362. * menu_update - attempts to guess what happened after entry
  363. * @drv: cpuidle driver containing state data
  364. * @dev: the CPU
  365. */
  366. static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
  367. {
  368. struct menu_device *data = this_cpu_ptr(&menu_devices);
  369. int last_idx = dev->last_state_idx;
  370. struct cpuidle_state *target = &drv->states[last_idx];
  371. u64 measured_ns;
  372. unsigned int new_factor;
  373. /*
  374. * Try to figure out how much time passed between entry to low
  375. * power state and occurrence of the wakeup event.
  376. *
  377. * If the entered idle state didn't support residency measurements,
  378. * we use them anyway if they are short, and if long,
  379. * truncate to the whole expected time.
  380. *
  381. * Any measured amount of time will include the exit latency.
  382. * Since we are interested in when the wakeup begun, not when it
  383. * was completed, we must subtract the exit latency. However, if
  384. * the measured amount of time is less than the exit latency,
  385. * assume the state was never reached and the exit latency is 0.
  386. */
  387. if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) {
  388. /*
  389. * The nohz code said that there wouldn't be any events within
  390. * the tick boundary (if the tick was stopped), but the idle
  391. * duration predictor had a differing opinion. Since the CPU
  392. * was woken up by a tick (that wasn't stopped after all), the
  393. * predictor was not quite right, so assume that the CPU could
  394. * have been idle long (but not forever) to help the idle
  395. * duration predictor do a better job next time.
  396. */
  397. measured_ns = 9 * MAX_INTERESTING / 10;
  398. } else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) &&
  399. dev->poll_time_limit) {
  400. /*
  401. * The CPU exited the "polling" state due to a time limit, so
  402. * the idle duration prediction leading to the selection of that
  403. * state was inaccurate. If a better prediction had been made,
  404. * the CPU might have been woken up from idle by the next timer.
  405. * Assume that to be the case.
  406. */
  407. measured_ns = data->next_timer_ns;
  408. } else {
  409. /* measured value */
  410. measured_ns = dev->last_residency_ns;
  411. /* Deduct exit latency */
  412. if (measured_ns > 2 * target->exit_latency_ns)
  413. measured_ns -= target->exit_latency_ns;
  414. else
  415. measured_ns /= 2;
  416. }
  417. /* Make sure our coefficients do not exceed unity */
  418. if (measured_ns > data->next_timer_ns)
  419. measured_ns = data->next_timer_ns;
  420. /* Update our correction ratio */
  421. new_factor = data->correction_factor[data->bucket];
  422. new_factor -= new_factor / DECAY;
  423. if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING)
  424. new_factor += div64_u64(RESOLUTION * measured_ns,
  425. data->next_timer_ns);
  426. else
  427. /*
  428. * we were idle so long that we count it as a perfect
  429. * prediction
  430. */
  431. new_factor += RESOLUTION;
  432. /*
  433. * We don't want 0 as factor; we always want at least
  434. * a tiny bit of estimated time. Fortunately, due to rounding,
  435. * new_factor will stay nonzero regardless of measured_us values
  436. * and the compiler can eliminate this test as long as DECAY > 1.
  437. */
  438. if (DECAY == 1 && unlikely(new_factor == 0))
  439. new_factor = 1;
  440. data->correction_factor[data->bucket] = new_factor;
  441. menu_update_intervals(data, ktime_to_us(measured_ns));
  442. }
  443. /**
  444. * menu_enable_device - scans a CPU's states and does setup
  445. * @drv: cpuidle driver
  446. * @dev: the CPU
  447. */
  448. static int menu_enable_device(struct cpuidle_driver *drv,
  449. struct cpuidle_device *dev)
  450. {
  451. struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
  452. int i;
  453. memset(data, 0, sizeof(struct menu_device));
  454. /*
  455. * if the correction factor is 0 (eg first time init or cpu hotplug
  456. * etc), we actually want to start out with a unity factor.
  457. */
  458. for(i = 0; i < BUCKETS; i++)
  459. data->correction_factor[i] = RESOLUTION * DECAY;
  460. return 0;
  461. }
  462. static struct cpuidle_governor menu_governor = {
  463. .name = "menu",
  464. .rating = 20,
  465. .enable = menu_enable_device,
  466. .select = menu_select,
  467. .reflect = menu_reflect,
  468. };
  469. /**
  470. * init_menu - initializes the governor
  471. */
  472. static int __init init_menu(void)
  473. {
  474. return cpuidle_register_governor(&menu_governor);
  475. }
  476. postcore_initcall(init_menu);