blk-iocost.c 99 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551
  1. /* SPDX-License-Identifier: GPL-2.0
  2. *
  3. * IO cost model based controller.
  4. *
  5. * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
  6. * Copyright (C) 2019 Andy Newell <newella@fb.com>
  7. * Copyright (C) 2019 Facebook
  8. *
  9. * One challenge of controlling IO resources is the lack of trivially
  10. * observable cost metric. This is distinguished from CPU and memory where
  11. * wallclock time and the number of bytes can serve as accurate enough
  12. * approximations.
  13. *
  14. * Bandwidth and iops are the most commonly used metrics for IO devices but
  15. * depending on the type and specifics of the device, different IO patterns
  16. * easily lead to multiple orders of magnitude variations rendering them
  17. * useless for the purpose of IO capacity distribution. While on-device
  18. * time, with a lot of crutches, could serve as a useful approximation for
  19. * non-queued rotational devices, this is no longer viable with modern
  20. * devices, even the rotational ones.
  21. *
  22. * While there is no cost metric we can trivially observe, it isn't a
  23. * complete mystery. For example, on a rotational device, seek cost
  24. * dominates while a contiguous transfer contributes a smaller amount
  25. * proportional to the size. If we can characterize at least the relative
  26. * costs of these different types of IOs, it should be possible to
  27. * implement a reasonable work-conserving proportional IO resource
  28. * distribution.
  29. *
  30. * 1. IO Cost Model
  31. *
  32. * IO cost model estimates the cost of an IO given its basic parameters and
  33. * history (e.g. the end sector of the last IO). The cost is measured in
  34. * device time. If a given IO is estimated to cost 10ms, the device should
  35. * be able to process ~100 of those IOs in a second.
  36. *
  37. * Currently, there's only one builtin cost model - linear. Each IO is
  38. * classified as sequential or random and given a base cost accordingly.
  39. * On top of that, a size cost proportional to the length of the IO is
  40. * added. While simple, this model captures the operational
  41. * characteristics of a wide variety of devices well enough. Default
  42. * parameters for several different classes of devices are provided and the
  43. * parameters can be configured from userspace via
  44. * /sys/fs/cgroup/io.cost.model.
  45. *
  46. * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47. * device-specific coefficients.
  48. *
  49. * 2. Control Strategy
  50. *
  51. * The device virtual time (vtime) is used as the primary control metric.
  52. * The control strategy is composed of the following three parts.
  53. *
  54. * 2-1. Vtime Distribution
  55. *
  56. * When a cgroup becomes active in terms of IOs, its hierarchical share is
  57. * calculated. Please consider the following hierarchy where the numbers
  58. * inside parentheses denote the configured weights.
  59. *
  60. * root
  61. * / \
  62. * A (w:100) B (w:300)
  63. * / \
  64. * A0 (w:100) A1 (w:100)
  65. *
  66. * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  67. * of equal weight, each gets 50% share. If then B starts issuing IOs, B
  68. * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
  69. * 12.5% each. The distribution mechanism only cares about these flattened
  70. * shares. They're called hweights (hierarchical weights) and always add
  71. * up to 1 (WEIGHT_ONE).
  72. *
  73. * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  74. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  75. * against the device vtime - an IO which takes 10ms on the underlying
  76. * device is considered to take 80ms on A0.
  77. *
  78. * This constitutes the basis of IO capacity distribution. Each cgroup's
  79. * vtime is running at a rate determined by its hweight. A cgroup tracks
  80. * the vtime consumed by past IOs and can issue a new IO if doing so
  81. * wouldn't outrun the current device vtime. Otherwise, the IO is
  82. * suspended until the vtime has progressed enough to cover it.
  83. *
  84. * 2-2. Vrate Adjustment
  85. *
  86. * It's unrealistic to expect the cost model to be perfect. There are too
  87. * many devices and even on the same device the overall performance
  88. * fluctuates depending on numerous factors such as IO mixture and device
  89. * internal garbage collection. The controller needs to adapt dynamically.
  90. *
  91. * This is achieved by adjusting the overall IO rate according to how busy
  92. * the device is. If the device becomes overloaded, we're sending down too
  93. * many IOs and should generally slow down. If there are waiting issuers
  94. * but the device isn't saturated, we're issuing too few and should
  95. * generally speed up.
  96. *
  97. * To slow down, we lower the vrate - the rate at which the device vtime
  98. * passes compared to the wall clock. For example, if the vtime is running
  99. * at the vrate of 75%, all cgroups added up would only be able to issue
  100. * 750ms worth of IOs per second, and vice-versa for speeding up.
  101. *
  102. * Device busyness is determined using two criteria - rq wait and
  103. * completion latencies.
  104. *
  105. * When a device gets saturated, the on-device and then the request queues
  106. * fill up and a bio which is ready to be issued has to wait for a request
  107. * to become available. When this delay becomes noticeable, it's a clear
  108. * indication that the device is saturated and we lower the vrate. This
  109. * saturation signal is fairly conservative as it only triggers when both
  110. * hardware and software queues are filled up, and is used as the default
  111. * busy signal.
  112. *
  113. * As devices can have deep queues and be unfair in how the queued commands
  114. * are executed, solely depending on rq wait may not result in satisfactory
  115. * control quality. For a better control quality, completion latency QoS
  116. * parameters can be configured so that the device is considered saturated
  117. * if N'th percentile completion latency rises above the set point.
  118. *
  119. * The completion latency requirements are a function of both the
  120. * underlying device characteristics and the desired IO latency quality of
  121. * service. There is an inherent trade-off - the tighter the latency QoS,
  122. * the higher the bandwidth lossage. Latency QoS is disabled by default
  123. * and can be set through /sys/fs/cgroup/io.cost.qos.
  124. *
  125. * 2-3. Work Conservation
  126. *
  127. * Imagine two cgroups A and B with equal weights. A is issuing a small IO
  128. * periodically while B is sending out enough parallel IOs to saturate the
  129. * device on its own. Let's say A's usage amounts to 100ms worth of IO
  130. * cost per second, i.e., 10% of the device capacity. The naive
  131. * distribution of half and half would lead to 60% utilization of the
  132. * device, a significant reduction in the total amount of work done
  133. * compared to free-for-all competition. This is too high a cost to pay
  134. * for IO control.
  135. *
  136. * To conserve the total amount of work done, we keep track of how much
  137. * each active cgroup is actually using and yield part of its weight if
  138. * there are other cgroups which can make use of it. In the above case,
  139. * A's weight will be lowered so that it hovers above the actual usage and
  140. * B would be able to use the rest.
  141. *
  142. * As we don't want to penalize a cgroup for donating its weight, the
  143. * surplus weight adjustment factors in a margin and has an immediate
  144. * snapback mechanism in case the cgroup needs more IO vtime for itself.
  145. *
  146. * Note that adjusting down surplus weights has the same effects as
  147. * accelerating vtime for other cgroups and work conservation can also be
  148. * implemented by adjusting vrate dynamically. However, squaring who can
  149. * donate and should take back how much requires hweight propagations
  150. * anyway making it easier to implement and understand as a separate
  151. * mechanism.
  152. *
  153. * 3. Monitoring
  154. *
  155. * Instead of debugfs or other clumsy monitoring mechanisms, this
  156. * controller uses a drgn based monitoring script -
  157. * tools/cgroup/iocost_monitor.py. For details on drgn, please see
  158. * https://github.com/osandov/drgn. The output looks like the following.
  159. *
  160. * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
  161. * active weight hweight% inflt% dbt delay usages%
  162. * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
  163. * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
  164. *
  165. * - per : Timer period
  166. * - cur_per : Internal wall and device vtime clock
  167. * - vrate : Device virtual time rate against wall clock
  168. * - weight : Surplus-adjusted and configured weights
  169. * - hweight : Surplus-adjusted and configured hierarchical weights
  170. * - inflt : The percentage of in-flight IO cost at the end of last period
  171. * - delay : Deferred issuer delay induction level and duration
  172. * - usages : Usage history
  173. */
  174. #include <linux/kernel.h>
  175. #include <linux/module.h>
  176. #include <linux/timer.h>
  177. #include <linux/time64.h>
  178. #include <linux/parser.h>
  179. #include <linux/sched/signal.h>
  180. #include <asm/local.h>
  181. #include <asm/local64.h>
  182. #include "blk-rq-qos.h"
  183. #include "blk-stat.h"
  184. #include "blk-wbt.h"
  185. #include "blk-cgroup.h"
  #ifdef CONFIG_TRACEPOINTS

  /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
  #define TRACE_IOCG_PATH_LEN 1024
  static DEFINE_SPINLOCK(trace_iocg_path_lock);
  static char trace_iocg_path[TRACE_IOCG_PATH_LEN];

  /*
   * TRACE_IOCG_PATH - fire the iocost_<type> tracepoint with @iocg's cgroup path
   * @type: tracepoint name suffix, pasted into trace_iocost_<type>()
   * @iocg: iocg whose blkcg cgroup path is reported
   * @...: extra arguments forwarded to the tracepoint unchanged
   *
   * Nothing happens unless trace_iocost_<type>_enabled() is true. The path is
   * formatted into the shared trace_iocg_path buffer, so both the formatting
   * and the tracepoint call run with trace_iocg_path_lock held and the IRQ
   * state saved/restored around them.
   */
  #define TRACE_IOCG_PATH(type, iocg, ...) \
  	do { \
  		unsigned long flags; \
  		if (!trace_iocost_##type##_enabled()) \
  			break; \
  		spin_lock_irqsave(&trace_iocg_path_lock, flags); \
  		cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
  			    trace_iocg_path, TRACE_IOCG_PATH_LEN); \
  		trace_iocost_##type(iocg, trace_iocg_path, \
  				    ##__VA_ARGS__); \
  		spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
  	} while (0)

  #else	/* !CONFIG_TRACEPOINTS */

  /* tracepoints compiled out: expands to an empty statement */
  #define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)

  #endif	/* CONFIG_TRACEPOINTS */
  206. enum {
  207. MILLION = 1000000,
  208. /* timer period is calculated from latency requirements, bound it */
  209. MIN_PERIOD = USEC_PER_MSEC,
  210. MAX_PERIOD = USEC_PER_SEC,
  211. /*
  212. * iocg->vtime is targeted at 50% behind the device vtime, which
  213. * serves as its IO credit buffer. Surplus weight adjustment is
  214. * immediately canceled if the vtime margin runs below 10%.
  215. */
  216. MARGIN_MIN_PCT = 10,
  217. MARGIN_LOW_PCT = 20,
  218. MARGIN_TARGET_PCT = 50,
  219. INUSE_ADJ_STEP_PCT = 25,
  220. /* Have some play in timer operations */
  221. TIMER_SLACK_PCT = 1,
  222. /* 1/64k is granular enough and can easily be handled w/ u32 */
  223. WEIGHT_ONE = 1 << 16,
  224. };
  225. enum {
  226. /*
  227. * As vtime is used to calculate the cost of each IO, it needs to
  228. * be fairly high precision. For example, it should be able to
  229. * represent the cost of a single page worth of discard with
  230. * suffificient accuracy. At the same time, it should be able to
  231. * represent reasonably long enough durations to be useful and
  232. * convenient during operation.
  233. *
  234. * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
  235. * granularity and days of wrap-around time even at extreme vrates.
  236. */
  237. VTIME_PER_SEC_SHIFT = 37,
  238. VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
  239. VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
  240. VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
  241. /* bound vrate adjustments within two orders of magnitude */
  242. VRATE_MIN_PPM = 10000, /* 1% */
  243. VRATE_MAX_PPM = 100000000, /* 10000% */
  244. VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
  245. VRATE_CLAMP_ADJ_PCT = 4,
  246. /* switch iff the conditions are met for longer than this */
  247. AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
  248. };
  249. enum {
  250. /* if IOs end up waiting for requests, issue less */
  251. RQ_WAIT_BUSY_PCT = 5,
  252. /* unbusy hysterisis */
  253. UNBUSY_THR_PCT = 75,
  254. /*
  255. * The effect of delay is indirect and non-linear and a huge amount of
  256. * future debt can accumulate abruptly while unthrottled. Linearly scale
  257. * up delay as debt is going up and then let it decay exponentially.
  258. * This gives us quick ramp ups while delay is accumulating and long
  259. * tails which can help reducing the frequency of debt explosions on
  260. * unthrottle. The parameters are experimentally determined.
  261. *
  262. * The delay mechanism provides adequate protection and behavior in many
  263. * cases. However, this is far from ideal and falls shorts on both
  264. * fronts. The debtors are often throttled too harshly costing a
  265. * significant level of fairness and possibly total work while the
  266. * protection against their impacts on the system can be choppy and
  267. * unreliable.
  268. *
  269. * The shortcoming primarily stems from the fact that, unlike for page
  270. * cache, the kernel doesn't have well-defined back-pressure propagation
  271. * mechanism and policies for anonymous memory. Fully addressing this
  272. * issue will likely require substantial improvements in the area.
  273. */
  274. MIN_DELAY_THR_PCT = 500,
  275. MAX_DELAY_THR_PCT = 25000,
  276. MIN_DELAY = 250,
  277. MAX_DELAY = 250 * USEC_PER_MSEC,
  278. /* halve debts if avg usage over 100ms is under 50% */
  279. DFGV_USAGE_PCT = 50,
  280. DFGV_PERIOD = 100 * USEC_PER_MSEC,
  281. /* don't let cmds which take a very long time pin lagging for too long */
  282. MAX_LAGGING_PERIODS = 10,
  283. /*
  284. * Count IO size in 4k pages. The 12bit shift helps keeping
  285. * size-proportional components of cost calculation in closer
  286. * numbers of digits to per-IO cost components.
  287. */
  288. IOC_PAGE_SHIFT = 12,
  289. IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
  290. IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
  291. /* if apart further than 16M, consider randio for linear model */
  292. LCOEF_RANDIO_PAGES = 4096,
  293. };
  /* values held by ioc->running */
  enum ioc_running {
  	IOC_IDLE = 0,
  	IOC_RUNNING = 1,
  	IOC_STOP = 2,
  };
  /* io.cost.qos controls, which include the per-dev enable of the whole controller */
  enum {
  	QOS_ENABLE = 0,
  	QOS_CTRL = 1,
  	NR_QOS_CTRL_PARAMS,	/* number of controls above */
  };
  /* io.cost.qos params; these index ioc_params.qos[] */
  enum {
  	QOS_RPPM = 0,
  	QOS_RLAT = 1,
  	QOS_WPPM = 2,
  	QOS_WLAT = 3,
  	QOS_MIN = 4,
  	QOS_MAX = 5,
  	NR_QOS_PARAMS,		/* number of params above */
  };
  /* io.cost.model controls */
  enum {
  	COST_CTRL = 0,
  	COST_MODEL = 1,
  	NR_COST_CTRL_PARAMS,	/* number of controls above */
  };
  /* builtin linear cost model coefficients; these index ioc_params.i_lcoefs[] */
  enum {
  	I_LCOEF_RBPS = 0,
  	I_LCOEF_RSEQIOPS = 1,
  	I_LCOEF_RRANDIOPS = 2,
  	I_LCOEF_WBPS = 3,
  	I_LCOEF_WSEQIOPS = 4,
  	I_LCOEF_WRANDIOPS = 5,
  	NR_I_LCOEFS,		/* number of coefficients above */
  };
  /* indices into ioc_params.lcoefs[] */
  enum {
  	LCOEF_RPAGE = 0,
  	LCOEF_RSEQIO = 1,
  	LCOEF_RRANDIO = 2,
  	LCOEF_WPAGE = 3,
  	LCOEF_WSEQIO = 4,
  	LCOEF_WRANDIO = 5,
  	NR_LCOEFS,		/* number of coefficients above */
  };
  /* NOTE(review): presumably the values stored in ioc->autop_idx - confirm */
  enum {
  	AUTOP_INVALID = 0,
  	AUTOP_HDD = 1,
  	AUTOP_SSD_QD1 = 2,
  	AUTOP_SSD_DFL = 3,
  	AUTOP_SSD_FAST = 4,
  };
  347. struct ioc_params {
  348. u32 qos[NR_QOS_PARAMS];
  349. u64 i_lcoefs[NR_I_LCOEFS];
  350. u64 lcoefs[NR_LCOEFS];
  351. u32 too_fast_vrate_pct;
  352. u32 too_slow_vrate_pct;
  353. };
  354. struct ioc_margins {
  355. s64 min;
  356. s64 low;
  357. s64 target;
  358. };
  359. struct ioc_missed {
  360. local_t nr_met;
  361. local_t nr_missed;
  362. u32 last_met;
  363. u32 last_missed;
  364. };
  365. struct ioc_pcpu_stat {
  366. struct ioc_missed missed[2];
  367. local64_t rq_wait_ns;
  368. u64 last_rq_wait_ns;
  369. };
  370. /* per device */
  371. struct ioc {
  372. struct rq_qos rqos;
  373. bool enabled;
  374. struct ioc_params params;
  375. struct ioc_margins margins;
  376. u32 period_us;
  377. u32 timer_slack_ns;
  378. u64 vrate_min;
  379. u64 vrate_max;
  380. spinlock_t lock;
  381. struct timer_list timer;
  382. struct list_head active_iocgs; /* active cgroups */
  383. struct ioc_pcpu_stat __percpu *pcpu_stat;
  384. enum ioc_running running;
  385. atomic64_t vtime_rate;
  386. u64 vtime_base_rate;
  387. s64 vtime_err;
  388. seqcount_spinlock_t period_seqcount;
  389. u64 period_at; /* wallclock starttime */
  390. u64 period_at_vtime; /* vtime starttime */
  391. atomic64_t cur_period; /* inc'd each period */
  392. int busy_level; /* saturation history */
  393. bool weights_updated;
  394. atomic_t hweight_gen; /* for lazy hweights */
  395. /* debt forgivness */
  396. u64 dfgv_period_at;
  397. u64 dfgv_period_rem;
  398. u64 dfgv_usage_us_sum;
  399. u64 autop_too_fast_at;
  400. u64 autop_too_slow_at;
  401. int autop_idx;
  402. bool user_qos_params:1;
  403. bool user_cost_model:1;
  404. };
  405. struct iocg_pcpu_stat {
  406. local64_t abs_vusage;
  407. };
  408. struct iocg_stat {
  409. u64 usage_us;
  410. u64 wait_us;
  411. u64 indebt_us;
  412. u64 indelay_us;
  413. };
/* per device-cgroup pair */
struct ioc_gq {
	struct blkg_policy_data		pd;
	struct ioc			*ioc;

	/*
	 * An iocg can get its weight from two sources - an explicit
	 * per-device-cgroup configuration or the default weight of the
	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
	 * configuration.  `weight` is the effective weight considering
	 * both sources.
	 *
	 * When an idle cgroup becomes active its `active` goes from 0 to
	 * `weight`.  `inuse` is the surplus adjusted active weight.
	 * `active` and `inuse` are used to calculate `hweight_active` and
	 * `hweight_inuse`.
	 *
	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
	 * surplus adjustments.
	 *
	 * `inuse` may be adjusted dynamically during period. `saved_*` are used
	 * to determine and track adjustments.
	 */
	u32				cfg_weight;
	u32				weight;
	u32				active;
	u32				inuse;

	u32				last_inuse;
	s64				saved_margin;

	sector_t			cursor;		/* to detect randio */

	/*
	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
	 * issued.  If lagging behind device vtime, the delta represents
	 * the currently available IO budget.  If running ahead, the
	 * overage.
	 *
	 * `done_vtime` is the same but progressed on completion rather
	 * than issue.  The delta behind `vtime` represents the cost of
	 * currently in-flight IOs.
	 */
	atomic64_t			vtime;
	atomic64_t			done_vtime;
	/* outstanding debt in absolute cost, see iocg_incur_debt() */
	u64				abs_vdebt;

	/* current delay in effect and when it started */
	u64				delay;
	u64				delay_at;

	/*
	 * The period this iocg was last active in.  Used for deactivation
	 * and invalidating `vtime`.
	 */
	atomic64_t			active_period;
	/* linked on ioc->active_iocgs while active, see iocg_activate() */
	struct list_head		active_list;

	/* see __propagate_weights() and current_hweight() for details */
	u64				child_active_sum;
	u64				child_inuse_sum;
	u64				child_adjusted_sum;
	/* ioc->hweight_gen value the cached hweights below were computed at */
	int				hweight_gen;
	u32				hweight_active;
	u32				hweight_inuse;
	u32				hweight_donating;
	u32				hweight_after_donation;

	struct list_head		walk_list;
	struct list_head		surplus_list;

	/* throttled bios wait here; waitq.lock also guards debt/delay state */
	struct wait_queue_head		waitq;
	struct hrtimer			waitq_timer;

	/* timestamp at the latest activation */
	u64				activated_at;

	/* statistics */
	struct iocg_pcpu_stat __percpu	*pcpu_stat;
	struct iocg_stat		stat;
	struct iocg_stat		last_stat;
	u64				last_stat_abs_vusage;
	u64				usage_delta_us;
	/* start stamps of the current wait/debt/delay stretch, 0 if none */
	u64				wait_since;
	u64				indebt_since;
	u64				indelay_since;

	/* this iocg's depth in the hierarchy and ancestors including self */
	int				level;
	struct ioc_gq			*ancestors[];
};
/* per cgroup */
struct ioc_cgrp {
	struct blkcg_policy_data	cpd;
	/* fallback weight used by weight_updated() when cfg_weight is 0 */
	unsigned int			dfl_weight;
};
/* snapshot of the current time and device vtime, filled in by ioc_now() */
struct ioc_now {
	u64				now_ns;		/* blk_time_get_ns() */
	u64				now;		/* now_ns converted to usecs */
	u64				vnow;		/* device vtime at `now` */
};
/* a bio waiting on iocg->waitq for vtime budget, see iocg_wake_fn() */
struct iocg_wait {
	struct wait_queue_entry		wait;
	struct bio			*bio;
	u64				abs_cost;	/* absolute cost of @bio */
	bool				committed;	/* set once the cost was charged */
};
/* wake-up key handed to iocg_wake_fn() through __wake_up_locked_key() */
struct iocg_wake_ctx {
	struct ioc_gq			*iocg;
	u32				hw_inuse;	/* used to scale abs_cost to cost */
	s64				vbudget;	/* remaining budget, consumed per waiter */
};
/*
 * Built-in parameter sets indexed by AUTOP_*.  ioc_autop_idx() selects the
 * entry and ioc_refresh_params_disk() copies its QoS and cost-model values
 * into ioc->params unless the user has overridden them.
 *
 * A non-zero too_fast_vrate_pct / too_slow_vrate_pct lets ioc_autop_idx()
 * move to the next / previous entry when the base vrate stays at or beyond
 * that percentage.
 */
static const struct ioc_params autop[] = {
	[AUTOP_HDD] = {
		.qos				= {
			[QOS_RLAT]		=        250000, /* 250ms */
			[QOS_WLAT]		=        250000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=     174019176,
			[I_LCOEF_RSEQIOPS]	=         41708,
			[I_LCOEF_RRANDIOPS]	=           370,
			[I_LCOEF_WBPS]		=     178075866,
			[I_LCOEF_WSEQIOPS]	=         42705,
			[I_LCOEF_WRANDIOPS]	=           378,
		},
	},
	[AUTOP_SSD_QD1] = {
		.qos				= {
			[QOS_RLAT]		=         25000, /* 25ms */
			[QOS_WLAT]		=         25000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=     245855193,
			[I_LCOEF_RSEQIOPS]	=         61575,
			[I_LCOEF_RRANDIOPS]	=          6946,
			[I_LCOEF_WBPS]		=     141365009,
			[I_LCOEF_WSEQIOPS]	=         33716,
			[I_LCOEF_WRANDIOPS]	=         26796,
		},
	},
	[AUTOP_SSD_DFL] = {
		.qos				= {
			[QOS_RLAT]		=         25000, /* 25ms */
			[QOS_WLAT]		=         25000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=     488636629,
			[I_LCOEF_RSEQIOPS]	=          8932,
			[I_LCOEF_RRANDIOPS]	=          8518,
			[I_LCOEF_WBPS]		=     427891549,
			[I_LCOEF_WSEQIOPS]	=         28755,
			[I_LCOEF_WRANDIOPS]	=         21940,
		},
		.too_fast_vrate_pct		=           500,
	},
	[AUTOP_SSD_FAST] = {
		.qos				= {
			[QOS_RLAT]		=          5000, /* 5ms */
			[QOS_WLAT]		=          5000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=    3102524156LLU,
			[I_LCOEF_RSEQIOPS]	=        724816,
			[I_LCOEF_RRANDIOPS]	=        778122,
			[I_LCOEF_WBPS]		=    1742780862LLU,
			[I_LCOEF_WSEQIOPS]	=        425702,
			[I_LCOEF_WRANDIOPS]	=        443193,
		},
		.too_slow_vrate_pct		=            10,
	},
};
/*
 * vrate adjust percentages indexed by the magnitude of ioc->busy_level.  We
 * adjust up on vtime credit shortage and down on device saturation.
 *
 * ioc_adjust_base_vrate() clamps the index to the last entry, so any
 * magnitude past the end of the table uses the final (largest) percentage.
 */
static const u32 vrate_adj_pct[] =
	{ 0, 0, 0, 0,
	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
/*
 * Declared ahead of its initializer (not in this part of the file) so that
 * blkg_to_iocg() and blkcg_to_iocc() can take its address.
 */
static struct blkcg_policy blkcg_policy_iocost;
  592. /* accessors and helpers */
  593. static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
  594. {
  595. return container_of(rqos, struct ioc, rqos);
  596. }
  597. static struct ioc *q_to_ioc(struct request_queue *q)
  598. {
  599. return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
  600. }
  601. static const char __maybe_unused *ioc_name(struct ioc *ioc)
  602. {
  603. struct gendisk *disk = ioc->rqos.disk;
  604. if (!disk)
  605. return "<unknown>";
  606. return disk->disk_name;
  607. }
  608. static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
  609. {
  610. return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
  611. }
  612. static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
  613. {
  614. return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
  615. }
  616. static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
  617. {
  618. return pd_to_blkg(&iocg->pd);
  619. }
  620. static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
  621. {
  622. return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
  623. struct ioc_cgrp, cpd);
  624. }
  625. /*
  626. * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
  627. * weight, the more expensive each IO. Must round up.
  628. */
  629. static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
  630. {
  631. return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
  632. }
  633. /*
  634. * The inverse of abs_cost_to_cost(). Must round up.
  635. */
  636. static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
  637. {
  638. return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
  639. }
  640. static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
  641. u64 abs_cost, u64 cost)
  642. {
  643. struct iocg_pcpu_stat *gcs;
  644. bio->bi_iocost_cost = cost;
  645. atomic64_add(cost, &iocg->vtime);
  646. gcs = get_cpu_ptr(iocg->pcpu_stat);
  647. local64_add(abs_cost, &gcs->abs_vusage);
  648. put_cpu_ptr(gcs);
  649. }
  650. static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
  651. {
  652. if (lock_ioc) {
  653. spin_lock_irqsave(&iocg->ioc->lock, *flags);
  654. spin_lock(&iocg->waitq.lock);
  655. } else {
  656. spin_lock_irqsave(&iocg->waitq.lock, *flags);
  657. }
  658. }
  659. static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
  660. {
  661. if (unlock_ioc) {
  662. spin_unlock(&iocg->waitq.lock);
  663. spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
  664. } else {
  665. spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
  666. }
  667. }
  668. #define CREATE_TRACE_POINTS
  669. #include <trace/events/iocost.h>
  670. static void ioc_refresh_margins(struct ioc *ioc)
  671. {
  672. struct ioc_margins *margins = &ioc->margins;
  673. u32 period_us = ioc->period_us;
  674. u64 vrate = ioc->vtime_base_rate;
  675. margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
  676. margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
  677. margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
  678. }
  679. /* latency Qos params changed, update period_us and all the dependent params */
  680. static void ioc_refresh_period_us(struct ioc *ioc)
  681. {
  682. u32 ppm, lat, multi, period_us;
  683. lockdep_assert_held(&ioc->lock);
  684. /* pick the higher latency target */
  685. if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
  686. ppm = ioc->params.qos[QOS_RPPM];
  687. lat = ioc->params.qos[QOS_RLAT];
  688. } else {
  689. ppm = ioc->params.qos[QOS_WPPM];
  690. lat = ioc->params.qos[QOS_WLAT];
  691. }
  692. /*
  693. * We want the period to be long enough to contain a healthy number
  694. * of IOs while short enough for granular control. Define it as a
  695. * multiple of the latency target. Ideally, the multiplier should
  696. * be scaled according to the percentile so that it would nominally
  697. * contain a certain number of requests. Let's be simpler and
  698. * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
  699. */
  700. if (ppm)
  701. multi = max_t(u32, (MILLION - ppm) / 50000, 2);
  702. else
  703. multi = 2;
  704. period_us = multi * lat;
  705. period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
  706. /* calculate dependent params */
  707. ioc->period_us = period_us;
  708. ioc->timer_slack_ns = div64_u64(
  709. (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
  710. 100);
  711. ioc_refresh_margins(ioc);
  712. }
/*
 * Pick the autop[] parameter set to use for @disk.
 *
 * Rotational devices and queue-depth-1 devices map to fixed entries.
 * Otherwise one of the SSD sets is used: the current index is kept if the
 * user overrode QoS or cost-model params, and it steps up / down by one
 * once the base vrate has stayed at or beyond the current entry's
 * too_fast / too_slow percentage for at least AUTOP_CYCLE_NSEC.
 *
 * Besides returning the index, this updates ioc->autop_too_fast_at and
 * ioc->autop_too_slow_at, which remember when each condition was first seen.
 *
 * ioc->rqos.disk isn't initialized when this function is called from
 * the init path, hence the explicit @disk.
 */
static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
{
	int idx = ioc->autop_idx;
	const struct ioc_params *p = &autop[idx];
	u32 vrate_pct;
	u64 now_ns;

	/* rotational? */
	if (blk_queue_rot(disk->queue))
		return AUTOP_HDD;

	/* handle SATA SSDs w/ broken NCQ */
	if (blk_queue_depth(disk->queue) == 1)
		return AUTOP_SSD_QD1;

	/* use one of the normal ssd sets */
	if (idx < AUTOP_SSD_DFL)
		return AUTOP_SSD_DFL;

	/* if user is overriding anything, maintain what was there */
	if (ioc->user_qos_params || ioc->user_cost_model)
		return idx;

	/* step up/down based on the vrate */
	vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
	now_ns = blk_time_get_ns();

	/* a zero threshold disables the check for this entry */
	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
		if (!ioc->autop_too_fast_at)
			ioc->autop_too_fast_at = now_ns;
		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
			return idx + 1;
	} else {
		ioc->autop_too_fast_at = 0;
	}

	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
		if (!ioc->autop_too_slow_at)
			ioc->autop_too_slow_at = now_ns;
		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
			return idx - 1;
	} else {
		ioc->autop_too_slow_at = 0;
	}

	return idx;
}
  756. /*
  757. * Take the followings as input
  758. *
  759. * @bps maximum sequential throughput
  760. * @seqiops maximum sequential 4k iops
  761. * @randiops maximum random 4k iops
  762. *
  763. * and calculate the linear model cost coefficients.
  764. *
  765. * *@page per-page cost 1s / (@bps / 4096)
  766. * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
  767. * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0)
  768. */
  769. static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
  770. u64 *page, u64 *seqio, u64 *randio)
  771. {
  772. u64 v;
  773. *page = *seqio = *randio = 0;
  774. if (bps) {
  775. u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
  776. if (bps_pages)
  777. *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
  778. else
  779. *page = 1;
  780. }
  781. if (seqiops) {
  782. v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
  783. if (v > *page)
  784. *seqio = v - *page;
  785. }
  786. if (randiops) {
  787. v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
  788. if (v > *page)
  789. *randio = v - *page;
  790. }
  791. }
  792. static void ioc_refresh_lcoefs(struct ioc *ioc)
  793. {
  794. u64 *u = ioc->params.i_lcoefs;
  795. u64 *c = ioc->params.lcoefs;
  796. calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
  797. &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
  798. calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
  799. &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
  800. }
  801. /*
  802. * struct gendisk is required as an argument because ioc->rqos.disk
  803. * is not properly initialized when called from the init path.
  804. */
  805. static bool ioc_refresh_params_disk(struct ioc *ioc, bool force,
  806. struct gendisk *disk)
  807. {
  808. const struct ioc_params *p;
  809. int idx;
  810. lockdep_assert_held(&ioc->lock);
  811. idx = ioc_autop_idx(ioc, disk);
  812. p = &autop[idx];
  813. if (idx == ioc->autop_idx && !force)
  814. return false;
  815. if (idx != ioc->autop_idx) {
  816. atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
  817. ioc->vtime_base_rate = VTIME_PER_USEC;
  818. }
  819. ioc->autop_idx = idx;
  820. ioc->autop_too_fast_at = 0;
  821. ioc->autop_too_slow_at = 0;
  822. if (!ioc->user_qos_params)
  823. memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
  824. if (!ioc->user_cost_model)
  825. memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
  826. ioc_refresh_period_us(ioc);
  827. ioc_refresh_lcoefs(ioc);
  828. ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
  829. VTIME_PER_USEC, MILLION);
  830. ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
  831. VTIME_PER_USEC, MILLION);
  832. return true;
  833. }
  834. static bool ioc_refresh_params(struct ioc *ioc, bool force)
  835. {
  836. return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk);
  837. }
  838. /*
  839. * When an iocg accumulates too much vtime or gets deactivated, we throw away
  840. * some vtime, which lowers the overall device utilization. As the exact amount
  841. * which is being thrown away is known, we can compensate by accelerating the
  842. * vrate accordingly so that the extra vtime generated in the current period
  843. * matches what got lost.
  844. */
  845. static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
  846. {
  847. s64 pleft = ioc->period_at + ioc->period_us - now->now;
  848. s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
  849. s64 vcomp, vcomp_min, vcomp_max;
  850. lockdep_assert_held(&ioc->lock);
  851. /* we need some time left in this period */
  852. if (pleft <= 0)
  853. goto done;
  854. /*
  855. * Calculate how much vrate should be adjusted to offset the error.
  856. * Limit the amount of adjustment and deduct the adjusted amount from
  857. * the error.
  858. */
  859. vcomp = -div64_s64(ioc->vtime_err, pleft);
  860. vcomp_min = -(ioc->vtime_base_rate >> 1);
  861. vcomp_max = ioc->vtime_base_rate;
  862. vcomp = clamp(vcomp, vcomp_min, vcomp_max);
  863. ioc->vtime_err += vcomp * pleft;
  864. atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
  865. done:
  866. /* bound how much error can accumulate */
  867. ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
  868. }
/*
 * Adjust ioc->vtime_base_rate according to ioc->busy_level and refresh the
 * margins that depend on it.
 *
 * Nothing is changed when busy_level is 0, or when it is negative while
 * @nr_lagging is non-zero.  Otherwise a vrate outside [vrate_min, vrate_max]
 * is moved towards the violated bound by VRATE_CLAMP_ADJ_PCT percent, and a
 * vrate inside the range is scaled by the vrate_adj_pct[] entry for the
 * magnitude of busy_level - down if busy_level is positive, up if negative.
 *
 * @rq_wait_pct, @nr_shortages and @missed_ppm are only forwarded to the
 * tracepoint; @prev_busy_level only decides whether the no-change case is
 * traced.
 */
static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
				  int nr_lagging, int nr_shortages,
				  int prev_busy_level, u32 *missed_ppm)
{
	u64 vrate = ioc->vtime_base_rate;
	u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;

	if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
		if (ioc->busy_level != prev_busy_level || nr_lagging)
			trace_iocost_ioc_vrate_adj(ioc, vrate,
						   missed_ppm, rq_wait_pct,
						   nr_lagging, nr_shortages);

		return;
	}

	/*
	 * If vrate is out of bounds, apply clamp gradually as the
	 * bounds can change abruptly.  Otherwise, apply busy_level
	 * based adjustment.
	 */
	if (vrate < vrate_min) {
		vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
		vrate = min(vrate, vrate_min);
	} else if (vrate > vrate_max) {
		vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
		vrate = max(vrate, vrate_max);
	} else {
		/* magnitudes past the end of the table use the last entry */
		int idx = min_t(int, abs(ioc->busy_level),
				ARRAY_SIZE(vrate_adj_pct) - 1);
		u32 adj_pct = vrate_adj_pct[idx];

		if (ioc->busy_level > 0)
			adj_pct = 100 - adj_pct;
		else
			adj_pct = 100 + adj_pct;

		vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
			      vrate_min, vrate_max);
	}

	trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
				   nr_lagging, nr_shortages);

	ioc->vtime_base_rate = vrate;
	ioc_refresh_margins(ioc);
}
/*
 * Take a snapshot of the current [v]time and vrate.
 *
 * Fills @now with the current time in nsecs and usecs and with the device
 * vtime computed from the period start and the current vtime_rate.
 */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
	unsigned seq;
	u64 vrate;

	now->now_ns = blk_time_get_ns();
	now->now = ktime_to_us(now->now_ns);
	vrate = atomic64_read(&ioc->vtime_rate);

	/*
	 * The current vtime is
	 *
	 *   vtime at period start + (wallclock time since the start) * vrate
	 *
	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
	 * needed, they're seqcount protected.
	 */
	do {
		/* retried if ioc_start_period() updated the pair meanwhile */
		seq = read_seqcount_begin(&ioc->period_seqcount);
		now->vnow = ioc->period_at_vtime +
			(now->now - ioc->period_at) * vrate;
	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
}
/*
 * Start a new period at @now: record the period start time and vtime and
 * arm ioc->timer to expire one period_us later.  Warns once if the ioc is
 * not in the IOC_RUNNING state.
 */
static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
{
	WARN_ON_ONCE(ioc->running != IOC_RUNNING);

	/* the pair is read locklessly by ioc_now(), update under seqcount */
	write_seqcount_begin(&ioc->period_seqcount);
	ioc->period_at = now->now;
	ioc->period_at_vtime = now->vnow;
	write_seqcount_end(&ioc->period_seqcount);

	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
	add_timer(&ioc->timer);
}
/*
 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
 * weight sums and propagate upwards accordingly. If @save, the current margin
 * is saved to be used as reference for later inuse in-period adjustments.
 *
 * Caller must hold ioc->lock.  When anything changed, ioc->weights_updated
 * is set so that a following commit_weights() bumps the hweight generation.
 */
static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
				bool save, struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	int lvl;

	lockdep_assert_held(&ioc->lock);

	/*
	 * For an active leaf node, its inuse shouldn't be zero or exceed
	 * @active. An active internal node's inuse is solely determined by the
	 * inuse to active ratio of its children regardless of @inuse.
	 */
	if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
		inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
					   iocg->child_active_sum);
	} else {
		/*
		 * It may be tempting to turn this into a clamp expression with
		 * a lower limit of 1 but active may be 0, which cannot be used
		 * as an upper limit in that situation. This expression allows
		 * active to clamp inuse unless it is 0, in which case inuse
		 * becomes 1.
		 */
		inuse = min(inuse, active) ?: 1;
	}

	iocg->last_inuse = iocg->inuse;
	if (save)
		iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);

	/* nothing to propagate if neither value changes */
	if (active == iocg->active && inuse == iocg->inuse)
		return;

	/* walk from @iocg towards the root, one parent/child pair at a time */
	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
		struct ioc_gq *parent = iocg->ancestors[lvl];
		struct ioc_gq *child = iocg->ancestors[lvl + 1];
		u32 parent_active = 0, parent_inuse = 0;

		/* update the level sums */
		parent->child_active_sum += (s32)(active - child->active);
		parent->child_inuse_sum += (s32)(inuse - child->inuse);
		/* apply the updates */
		child->active = active;
		child->inuse = inuse;

		/*
		 * The delta between inuse and active sums indicates that
		 * much of weight is being given away.  Parent's inuse
		 * and active should reflect the ratio.
		 */
		if (parent->child_active_sum) {
			parent_active = parent->weight;
			parent_inuse = DIV64_U64_ROUND_UP(
				parent_active * parent->child_inuse_sum,
				parent->child_active_sum);
		}

		/* do we need to keep walking up? */
		if (parent_active == parent->active &&
		    parent_inuse == parent->inuse)
			break;

		active = parent_active;
		inuse = parent_inuse;
	}

	ioc->weights_updated = true;
}
  1005. static void commit_weights(struct ioc *ioc)
  1006. {
  1007. lockdep_assert_held(&ioc->lock);
  1008. if (ioc->weights_updated) {
  1009. /* paired with rmb in current_hweight(), see there */
  1010. smp_wmb();
  1011. atomic_inc(&ioc->hweight_gen);
  1012. ioc->weights_updated = false;
  1013. }
  1014. }
  1015. static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
  1016. bool save, struct ioc_now *now)
  1017. {
  1018. __propagate_weights(iocg, active, inuse, save, now);
  1019. commit_weights(iocg->ioc);
  1020. }
/*
 * Return @iocg's hierarchical active and inuse weights through @hw_activep
 * and @hw_inusep, either of which may be NULL.
 *
 * The values are cached in @iocg and recomputed only when ioc->hweight_gen
 * differs from the generation they were computed at.  Each result is the
 * product, over every level from the root down to @iocg, of the child's
 * share of its parent's child sum, scaled to WEIGHT_ONE and never below 1.
 */
static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
{
	struct ioc *ioc = iocg->ioc;
	int lvl;
	u32 hwa, hwi;
	int ioc_gen;

	/* hot path - if uptodate, use cached */
	ioc_gen = atomic_read(&ioc->hweight_gen);
	if (ioc_gen == iocg->hweight_gen)
		goto out;

	/*
	 * Paired with wmb in commit_weights(). If we saw the updated
	 * hweight_gen, all the weight updates from __propagate_weights() are
	 * visible too.
	 *
	 * We can race with weight updates during calculation and get it
	 * wrong.  However, hweight_gen would have changed and a future
	 * reader will recalculate and we're guaranteed to discard the
	 * wrong result soon.
	 */
	smp_rmb();

	hwa = hwi = WEIGHT_ONE;
	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
		struct ioc_gq *parent = iocg->ancestors[lvl];
		struct ioc_gq *child = iocg->ancestors[lvl + 1];
		u64 active_sum = READ_ONCE(parent->child_active_sum);
		u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
		u32 active = READ_ONCE(child->active);
		u32 inuse = READ_ONCE(child->inuse);

		/* we can race with deactivations and either may read as zero */
		if (!active_sum || !inuse_sum)
			continue;

		/* a racy read may see child > sum; keep the ratio at most 1 */
		active_sum = max_t(u64, active, active_sum);
		hwa = div64_u64((u64)hwa * active, active_sum);

		inuse_sum = max_t(u64, inuse, inuse_sum);
		hwi = div64_u64((u64)hwi * inuse, inuse_sum);
	}

	iocg->hweight_active = max_t(u32, hwa, 1);
	iocg->hweight_inuse = max_t(u32, hwi, 1);
	iocg->hweight_gen = ioc_gen;
out:
	if (hw_activep)
		*hw_activep = iocg->hweight_active;
	if (hw_inusep)
		*hw_inusep = iocg->hweight_inuse;
}
/*
 * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
 * other weights stay unchanged.
 *
 * "Max inuse" means inuse == active for @iocg.  Caller must hold ioc->lock.
 * The result is never below 1.
 */
static u32 current_hweight_max(struct ioc_gq *iocg)
{
	u32 hwm = WEIGHT_ONE;
	u32 inuse = iocg->active;
	u64 child_inuse_sum;
	int lvl;

	lockdep_assert_held(&iocg->ioc->lock);

	/* walk from @iocg towards the root */
	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
		struct ioc_gq *parent = iocg->ancestors[lvl];
		struct ioc_gq *child = iocg->ancestors[lvl + 1];

		/* parent's inuse sum with child's inuse replaced by @inuse */
		child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
		hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
		/* inuse the parent would then have, used at the next level up */
		inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
					   parent->child_active_sum);
	}

	return max_t(u32, hwm, 1);
}
  1088. static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
  1089. {
  1090. struct ioc *ioc = iocg->ioc;
  1091. struct blkcg_gq *blkg = iocg_to_blkg(iocg);
  1092. struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
  1093. u32 weight;
  1094. lockdep_assert_held(&ioc->lock);
  1095. weight = iocg->cfg_weight ?: iocc->dfl_weight;
  1096. if (weight != iocg->weight && iocg->active)
  1097. propagate_weights(iocg, weight, iocg->inuse, true, now);
  1098. iocg->weight = weight;
  1099. }
/*
 * Make sure @iocg is marked active for the current period, activating it if
 * it was idle.  @now is refreshed via ioc_now() on every path except the
 * early bail-out for internal nodes.
 *
 * Returns true if @iocg is active on return.  Returns false if activation is
 * refused: @iocg has active children, or one of its ancestors other than the
 * root is itself active (only leaves may be active).
 *
 * On a fresh activation the iocg starts with the target budget, its weight
 * is propagated and the period timer is started if the ioc was idle.
 * Takes ioc->lock with spin_lock_irq() on the slow path.
 */
static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	u64 __maybe_unused last_period, cur_period;
	u64 vtime, vtarget;
	int i;

	/*
	 * If it seems to be already active, just update the stamp to tell the
	 * timer that we're still active.  We don't mind occasional races.
	 */
	if (!list_empty(&iocg->active_list)) {
		ioc_now(ioc, now);
		cur_period = atomic64_read(&ioc->cur_period);
		if (atomic64_read(&iocg->active_period) != cur_period)
			atomic64_set(&iocg->active_period, cur_period);
		return true;
	}

	/* racy check on internal node IOs, treat as root level IOs */
	if (iocg->child_active_sum)
		return false;

	spin_lock_irq(&ioc->lock);

	ioc_now(ioc, now);

	/* update period */
	cur_period = atomic64_read(&ioc->cur_period);
	last_period = atomic64_read(&iocg->active_period);
	atomic64_set(&iocg->active_period, cur_period);

	/* already activated or breaking leaf-only constraint? */
	if (!list_empty(&iocg->active_list))
		goto succeed_unlock;
	/* ancestors[0] (the root) and @iocg itself are not checked here */
	for (i = iocg->level - 1; i > 0; i--)
		if (!list_empty(&iocg->ancestors[i]->active_list))
			goto fail_unlock;

	if (iocg->child_active_sum)
		goto fail_unlock;

	/*
	 * Always start with the target budget. On deactivation, we throw away
	 * anything above it.
	 */
	vtarget = now->vnow - ioc->margins.target;
	vtime = atomic64_read(&iocg->vtime);

	/* shift vtime and done_vtime by the same delta so vtime == vtarget */
	atomic64_add(vtarget - vtime, &iocg->vtime);
	atomic64_add(vtarget - vtime, &iocg->done_vtime);
	vtime = vtarget;

	/*
	 * Activate, propagate weight and start period timer if not
	 * running.  Reset hweight_gen to avoid accidental match from
	 * wrapping.
	 */
	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
	list_add(&iocg->active_list, &ioc->active_iocgs);

	propagate_weights(iocg, iocg->weight,
			  iocg->last_inuse ?: iocg->weight, true, now);

	TRACE_IOCG_PATH(iocg_activate, iocg, now,
			last_period, cur_period, vtime);

	iocg->activated_at = now->now;

	if (ioc->running == IOC_IDLE) {
		ioc->running = IOC_RUNNING;
		ioc->dfgv_period_at = now->now;
		ioc->dfgv_period_rem = 0;
		ioc_start_period(ioc, now);
	}

succeed_unlock:
	spin_unlock_irq(&ioc->lock);
	return true;

fail_unlock:
	spin_unlock_irq(&ioc->lock);
	return false;
}
/*
 * Recompute and apply the issue delay of @iocg's blkg.
 *
 * The delay currently in effect decays by half every second since
 * iocg->delay_at.  A new delay is derived from how far vtime plus the
 * outstanding debt is ahead of @now->vnow, as a percentage of one period's
 * worth of vtime, mapped linearly from MIN_DELAY to MAX_DELAY between
 * MIN_DELAY_THR_PCT and MAX_DELAY_THR_PCT.  The larger of the two wins.
 *
 * Returns true if a delay of at least MIN_DELAY was set on the blkg; false
 * if the delay was cleared or @now predates iocg->delay_at.  Also maintains
 * the indelay_us statistic.  Caller must hold iocg->waitq.lock.
 */
static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
{
	struct ioc *ioc = iocg->ioc;
	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
	u64 tdelta, delay, new_delay, shift;
	s64 vover, vover_pct;
	u32 hwa;

	lockdep_assert_held(&iocg->waitq.lock);

	/*
	 * If the delay is set by another CPU, we may be in the past. No need to
	 * change anything if so. This avoids decay calculation underflow.
	 */
	if (time_before64(now->now, iocg->delay_at))
		return false;

	/* calculate the current delay in effect - 1/2 every second */
	tdelta = now->now - iocg->delay_at;
	shift = div64_u64(tdelta, USEC_PER_SEC);
	if (iocg->delay && shift < BITS_PER_LONG)
		delay = iocg->delay >> shift;
	else
		delay = 0;

	/* calculate the new delay from the debt amount */
	current_hweight(iocg, &hwa, NULL);
	vover = atomic64_read(&iocg->vtime) +
		abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
	vover_pct = div64_s64(100 * vover,
			      ioc->period_us * ioc->vtime_base_rate);

	if (vover_pct <= MIN_DELAY_THR_PCT)
		new_delay = 0;
	else if (vover_pct >= MAX_DELAY_THR_PCT)
		new_delay = MAX_DELAY;
	else
		new_delay = MIN_DELAY +
			div_u64((MAX_DELAY - MIN_DELAY) *
				(vover_pct - MIN_DELAY_THR_PCT),
				MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);

	/* pick the higher one and apply */
	if (new_delay > delay) {
		iocg->delay = new_delay;
		iocg->delay_at = now->now;
		delay = new_delay;
	}

	if (delay >= MIN_DELAY) {
		if (!iocg->indelay_since)
			iocg->indelay_since = now->now;
		/* delay is in usecs, blkcg_set_delay() is handed nsecs */
		blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
		return true;
	} else {
		/* close the current in-delay stretch, if any */
		if (iocg->indelay_since) {
			iocg->stat.indelay_us += now->now - iocg->indelay_since;
			iocg->indelay_since = 0;
		}
		iocg->delay = 0;
		blkcg_clear_delay(blkg);
		return false;
	}
}
/*
 * Add @abs_cost to @iocg's debt and to this CPU's abs_vusage counter.
 *
 * When the iocg goes from no debt to being in debt, the start time is
 * recorded and its inuse is dropped to the minimum.  Caller must hold both
 * ioc->lock and iocg->waitq.lock; @iocg is expected to be active.
 */
static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
			    struct ioc_now *now)
{
	struct iocg_pcpu_stat *gcs;

	lockdep_assert_held(&iocg->ioc->lock);
	lockdep_assert_held(&iocg->waitq.lock);
	WARN_ON_ONCE(list_empty(&iocg->active_list));

	/*
	 * Once in debt, debt handling owns inuse. @iocg stays at the minimum
	 * inuse donating all of its share to others until its debt is paid off.
	 */
	if (!iocg->abs_vdebt && abs_cost) {
		iocg->indebt_since = now->now;
		/* an inuse of 0 is raised to 1 by __propagate_weights() */
		propagate_weights(iocg, iocg->active, 0, false, now);
	}

	iocg->abs_vdebt += abs_cost;

	gcs = get_cpu_ptr(iocg->pcpu_stat);
	local64_add(abs_cost, &gcs->abs_vusage);
	put_cpu_ptr(gcs);
}
/*
 * Reduce @iocg's debt by @abs_vpay, never below zero.  When the debt reaches
 * zero, the time spent in debt is added to stat.indebt_us and inuse is
 * restored to last_inuse.  Caller must hold both ioc->lock and
 * iocg->waitq.lock.
 */
static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
			  struct ioc_now *now)
{
	lockdep_assert_held(&iocg->ioc->lock);
	lockdep_assert_held(&iocg->waitq.lock);

	/*
	 * make sure that nobody messed with @iocg. Check iocg->pd.online
	 * to avoid warn when removing blkcg or disk.
	 */
	WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online);
	/* an iocg in debt is expected to sit at the minimum inuse */
	WARN_ON_ONCE(iocg->inuse > 1);

	iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);

	/* if debt is paid in full, restore inuse */
	if (!iocg->abs_vdebt) {
		iocg->stat.indebt_us += now->now - iocg->indebt_since;
		iocg->indebt_since = 0;

		propagate_weights(iocg, iocg->active, iocg->last_inuse,
				  false, now);
	}
}
/*
 * Wake function for entries on iocg->waitq.  @key is the iocg_wake_ctx
 * passed to __wake_up_locked_key().
 *
 * The waiter's cost is deducted from ctx->vbudget.  If the budget goes
 * negative, nothing else is done and -1 is returned (presumably stopping the
 * wake-up walk - confirm against the waitqueue core).  Otherwise the bio is
 * charged, the waiter is marked committed, woken up and removed from the
 * queue, and 0 is returned.
 */
static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
			int flags, void *key)
{
	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
	struct iocg_wake_ctx *ctx = key;
	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);

	/* the deduction sticks even when this waiter can't be afforded */
	ctx->vbudget -= cost;

	if (ctx->vbudget < 0)
		return -1;

	iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
	wait->committed = true;

	/*
	 * autoremove_wake_function() removes the wait entry only when it
	 * actually changed the task state. We want the wait always removed.
	 * Remove explicitly and use default_wake_function(). Note that the
	 * order of operations is important as finish_wait() tests whether
	 * @wq_entry is removed without grabbing the lock.
	 */
	default_wake_function(wq_entry, mode, flags, key);
	list_del_init_careful(&wq_entry->entry);
	return 0;
}
  1287. /*
  1288. * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
  1289. * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
  1290. * addition to iocg->waitq.lock.
  1291. */
  1292. static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
  1293. struct ioc_now *now)
  1294. {
  1295. struct ioc *ioc = iocg->ioc;
  1296. struct iocg_wake_ctx ctx = { .iocg = iocg };
  1297. u64 vshortage, expires, oexpires;
  1298. s64 vbudget;
  1299. u32 hwa;
  1300. lockdep_assert_held(&iocg->waitq.lock);
  1301. current_hweight(iocg, &hwa, NULL);
  1302. vbudget = now->vnow - atomic64_read(&iocg->vtime);
  1303. /* pay off debt */
  1304. if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
  1305. u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
  1306. u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
  1307. u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
  1308. lockdep_assert_held(&ioc->lock);
  1309. atomic64_add(vpay, &iocg->vtime);
  1310. atomic64_add(vpay, &iocg->done_vtime);
  1311. iocg_pay_debt(iocg, abs_vpay, now);
  1312. vbudget -= vpay;
  1313. }
  1314. if (iocg->abs_vdebt || iocg->delay)
  1315. iocg_kick_delay(iocg, now);
  1316. /*
  1317. * Debt can still be outstanding if we haven't paid all yet or the
  1318. * caller raced and called without @pay_debt. Shouldn't wake up waiters
  1319. * under debt. Make sure @vbudget reflects the outstanding amount and is
  1320. * not positive.
  1321. */
  1322. if (iocg->abs_vdebt) {
  1323. s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
  1324. vbudget = min_t(s64, 0, vbudget - vdebt);
  1325. }
  1326. /*
  1327. * Wake up the ones which are due and see how much vtime we'll need for
  1328. * the next one. As paying off debt restores hw_inuse, it must be read
  1329. * after the above debt payment.
  1330. */
  1331. ctx.vbudget = vbudget;
  1332. current_hweight(iocg, NULL, &ctx.hw_inuse);
  1333. __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
  1334. if (!waitqueue_active(&iocg->waitq)) {
  1335. if (iocg->wait_since) {
  1336. iocg->stat.wait_us += now->now - iocg->wait_since;
  1337. iocg->wait_since = 0;
  1338. }
  1339. return;
  1340. }
  1341. if (!iocg->wait_since)
  1342. iocg->wait_since = now->now;
  1343. if (WARN_ON_ONCE(ctx.vbudget >= 0))
  1344. return;
  1345. /* determine next wakeup, add a timer margin to guarantee chunking */
  1346. vshortage = -ctx.vbudget;
  1347. expires = now->now_ns +
  1348. DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
  1349. NSEC_PER_USEC;
  1350. expires += ioc->timer_slack_ns;
  1351. /* if already active and close enough, don't bother */
  1352. oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
  1353. if (hrtimer_is_queued(&iocg->waitq_timer) &&
  1354. abs(oexpires - expires) <= ioc->timer_slack_ns)
  1355. return;
  1356. hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
  1357. ioc->timer_slack_ns, HRTIMER_MODE_ABS);
  1358. }
  1359. static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
  1360. {
  1361. struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
  1362. bool pay_debt = READ_ONCE(iocg->abs_vdebt);
  1363. struct ioc_now now;
  1364. unsigned long flags;
  1365. ioc_now(iocg->ioc, &now);
  1366. iocg_lock(iocg, pay_debt, &flags);
  1367. iocg_kick_waitq(iocg, pay_debt, &now);
  1368. iocg_unlock(iocg, pay_debt, &flags);
  1369. return HRTIMER_NORESTART;
  1370. }
  1371. static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
  1372. {
  1373. u32 nr_met[2] = { };
  1374. u32 nr_missed[2] = { };
  1375. u64 rq_wait_ns = 0;
  1376. int cpu, rw;
  1377. for_each_online_cpu(cpu) {
  1378. struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
  1379. u64 this_rq_wait_ns;
  1380. for (rw = READ; rw <= WRITE; rw++) {
  1381. u32 this_met = local_read(&stat->missed[rw].nr_met);
  1382. u32 this_missed = local_read(&stat->missed[rw].nr_missed);
  1383. nr_met[rw] += this_met - stat->missed[rw].last_met;
  1384. nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
  1385. stat->missed[rw].last_met = this_met;
  1386. stat->missed[rw].last_missed = this_missed;
  1387. }
  1388. this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
  1389. rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
  1390. stat->last_rq_wait_ns = this_rq_wait_ns;
  1391. }
  1392. for (rw = READ; rw <= WRITE; rw++) {
  1393. if (nr_met[rw] + nr_missed[rw])
  1394. missed_ppm_ar[rw] =
  1395. DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
  1396. nr_met[rw] + nr_missed[rw]);
  1397. else
  1398. missed_ppm_ar[rw] = 0;
  1399. }
  1400. *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
  1401. ioc->period_us * NSEC_PER_USEC);
  1402. }
  1403. /* was iocg idle this period? */
  1404. static bool iocg_is_idle(struct ioc_gq *iocg)
  1405. {
  1406. struct ioc *ioc = iocg->ioc;
  1407. /* did something get issued this period? */
  1408. if (atomic64_read(&iocg->active_period) ==
  1409. atomic64_read(&ioc->cur_period))
  1410. return false;
  1411. /* is something in flight? */
  1412. if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
  1413. return false;
  1414. return true;
  1415. }
  1416. /*
  1417. * Call this function on the target leaf @iocg's to build pre-order traversal
  1418. * list of all the ancestors in @inner_walk. The inner nodes are linked through
  1419. * ->walk_list and the caller is responsible for dissolving the list after use.
  1420. */
  1421. static void iocg_build_inner_walk(struct ioc_gq *iocg,
  1422. struct list_head *inner_walk)
  1423. {
  1424. int lvl;
  1425. WARN_ON_ONCE(!list_empty(&iocg->walk_list));
  1426. /* find the first ancestor which hasn't been visited yet */
  1427. for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
  1428. if (!list_empty(&iocg->ancestors[lvl]->walk_list))
  1429. break;
  1430. }
  1431. /* walk down and visit the inner nodes to get pre-order traversal */
  1432. while (++lvl <= iocg->level - 1) {
  1433. struct ioc_gq *inner = iocg->ancestors[lvl];
  1434. /* record traversal order */
  1435. list_add_tail(&inner->walk_list, inner_walk);
  1436. }
  1437. }
  1438. /* propagate the deltas to the parent */
  1439. static void iocg_flush_stat_upward(struct ioc_gq *iocg)
  1440. {
  1441. if (iocg->level > 0) {
  1442. struct iocg_stat *parent_stat =
  1443. &iocg->ancestors[iocg->level - 1]->stat;
  1444. parent_stat->usage_us +=
  1445. iocg->stat.usage_us - iocg->last_stat.usage_us;
  1446. parent_stat->wait_us +=
  1447. iocg->stat.wait_us - iocg->last_stat.wait_us;
  1448. parent_stat->indebt_us +=
  1449. iocg->stat.indebt_us - iocg->last_stat.indebt_us;
  1450. parent_stat->indelay_us +=
  1451. iocg->stat.indelay_us - iocg->last_stat.indelay_us;
  1452. }
  1453. iocg->last_stat = iocg->stat;
  1454. }
  1455. /* collect per-cpu counters and propagate the deltas to the parent */
  1456. static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
  1457. {
  1458. struct ioc *ioc = iocg->ioc;
  1459. u64 abs_vusage = 0;
  1460. u64 vusage_delta;
  1461. int cpu;
  1462. lockdep_assert_held(&iocg->ioc->lock);
  1463. /* collect per-cpu counters */
  1464. for_each_possible_cpu(cpu) {
  1465. abs_vusage += local64_read(
  1466. per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
  1467. }
  1468. vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
  1469. iocg->last_stat_abs_vusage = abs_vusage;
  1470. iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
  1471. iocg->stat.usage_us += iocg->usage_delta_us;
  1472. iocg_flush_stat_upward(iocg);
  1473. }
  1474. /* get stat counters ready for reading on all active iocgs */
  1475. static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
  1476. {
  1477. LIST_HEAD(inner_walk);
  1478. struct ioc_gq *iocg, *tiocg;
  1479. /* flush leaves and build inner node walk list */
  1480. list_for_each_entry(iocg, target_iocgs, active_list) {
  1481. iocg_flush_stat_leaf(iocg, now);
  1482. iocg_build_inner_walk(iocg, &inner_walk);
  1483. }
  1484. /* keep flushing upwards by walking the inner list backwards */
  1485. list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
  1486. iocg_flush_stat_upward(iocg);
  1487. list_del_init(&iocg->walk_list);
  1488. }
  1489. }
  1490. /*
  1491. * Determine what @iocg's hweight_inuse should be after donating unused
  1492. * capacity. @hwm is the upper bound and used to signal no donation. This
  1493. * function also throws away @iocg's excess budget.
  1494. */
  1495. static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
  1496. u32 usage, struct ioc_now *now)
  1497. {
  1498. struct ioc *ioc = iocg->ioc;
  1499. u64 vtime = atomic64_read(&iocg->vtime);
  1500. s64 excess, delta, target, new_hwi;
  1501. /* debt handling owns inuse for debtors */
  1502. if (iocg->abs_vdebt)
  1503. return 1;
  1504. /* see whether minimum margin requirement is met */
  1505. if (waitqueue_active(&iocg->waitq) ||
  1506. time_after64(vtime, now->vnow - ioc->margins.min))
  1507. return hwm;
  1508. /* throw away excess above target */
  1509. excess = now->vnow - vtime - ioc->margins.target;
  1510. if (excess > 0) {
  1511. atomic64_add(excess, &iocg->vtime);
  1512. atomic64_add(excess, &iocg->done_vtime);
  1513. vtime += excess;
  1514. ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
  1515. }
  1516. /*
  1517. * Let's say the distance between iocg's and device's vtimes as a
  1518. * fraction of period duration is delta. Assuming that the iocg will
  1519. * consume the usage determined above, we want to determine new_hwi so
  1520. * that delta equals MARGIN_TARGET at the end of the next period.
  1521. *
  1522. * We need to execute usage worth of IOs while spending the sum of the
  1523. * new budget (1 - MARGIN_TARGET) and the leftover from the last period
  1524. * (delta):
  1525. *
  1526. * usage = (1 - MARGIN_TARGET + delta) * new_hwi
  1527. *
  1528. * Therefore, the new_hwi is:
  1529. *
  1530. * new_hwi = usage / (1 - MARGIN_TARGET + delta)
  1531. */
  1532. delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
  1533. now->vnow - ioc->period_at_vtime);
  1534. target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
  1535. new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
  1536. return clamp_t(s64, new_hwi, 1, hwm);
  1537. }
  1538. /*
  1539. * For work-conservation, an iocg which isn't using all of its share should
  1540. * donate the leftover to other iocgs. There are two ways to achieve this - 1.
  1541. * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
  1542. *
  1543. * #1 is mathematically simpler but has the drawback of requiring synchronous
  1544. * global hweight_inuse updates when idle iocg's get activated or inuse weights
  1545. * change due to donation snapbacks as it has the possibility of grossly
  1546. * overshooting what's allowed by the model and vrate.
  1547. *
  1548. * #2 is inherently safe with local operations. The donating iocg can easily
  1549. * snap back to higher weights when needed without worrying about impacts on
  1550. * other nodes as the impacts will be inherently correct. This also makes idle
  1551. * iocg activations safe. The only effect activations have is decreasing
  1552. * hweight_inuse of others, the right solution to which is for those iocgs to
  1553. * snap back to higher weights.
  1554. *
  1555. * So, we go with #2. The challenge is calculating how each donating iocg's
  1556. * inuse should be adjusted to achieve the target donation amounts. This is done
  1557. * using Andy's method described in the following pdf.
  1558. *
  1559. * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
  1560. *
  1561. * Given the weights and target after-donation hweight_inuse values, Andy's
  1562. * method determines how the proportional distribution should look like at each
  1563. * sibling level to maintain the relative relationship between all non-donating
  1564. * pairs. To roughly summarize, it divides the tree into donating and
  1565. * non-donating parts, calculates global donation rate which is used to
  1566. * determine the target hweight_inuse for each node, and then derives per-level
  1567. * proportions.
  1568. *
  1569. * The following pdf shows that global distribution calculated this way can be
  1570. * achieved by scaling inuse weights of donating leaves and propagating the
  1571. * adjustments upwards proportionally.
  1572. *
  1573. * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
  1574. *
  1575. * Combining the above two, we can determine how each leaf iocg's inuse should
  1576. * be adjusted to achieve the target donation.
  1577. *
  1578. * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
  1579. *
  1580. * The inline comments use symbols from the last pdf.
  1581. *
  1582. * b is the sum of the absolute budgets in the subtree. 1 for the root node.
  1583. * f is the sum of the absolute budgets of non-donating nodes in the subtree.
  1584. * t is the sum of the absolute budgets of donating nodes in the subtree.
  1585. * w is the weight of the node. w = w_f + w_t
  1586. * w_f is the non-donating portion of w. w_f = w * f / b
  1587. * w_b is the donating portion of w. w_t = w * t / b
  1588. * s is the sum of all sibling weights. s = Sum(w) for siblings
  1589. * s_f and s_t are the non-donating and donating portions of s.
  1590. *
  1591. * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
  1592. * w_pt is the donating portion of the parent's weight and w'_pt the same value
  1593. * after adjustments. Subscript r denotes the root node's values.
  1594. */
  1595. static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
  1596. {
  1597. LIST_HEAD(over_hwa);
  1598. LIST_HEAD(inner_walk);
  1599. struct ioc_gq *iocg, *tiocg, *root_iocg;
  1600. u32 after_sum, over_sum, over_target, gamma;
  1601. /*
  1602. * It's pretty unlikely but possible for the total sum of
  1603. * hweight_after_donation's to be higher than WEIGHT_ONE, which will
  1604. * confuse the following calculations. If such condition is detected,
  1605. * scale down everyone over its full share equally to keep the sum below
  1606. * WEIGHT_ONE.
  1607. */
  1608. after_sum = 0;
  1609. over_sum = 0;
  1610. list_for_each_entry(iocg, surpluses, surplus_list) {
  1611. u32 hwa;
  1612. current_hweight(iocg, &hwa, NULL);
  1613. after_sum += iocg->hweight_after_donation;
  1614. if (iocg->hweight_after_donation > hwa) {
  1615. over_sum += iocg->hweight_after_donation;
  1616. list_add(&iocg->walk_list, &over_hwa);
  1617. }
  1618. }
  1619. if (after_sum >= WEIGHT_ONE) {
  1620. /*
  1621. * The delta should be deducted from the over_sum, calculate
  1622. * target over_sum value.
  1623. */
  1624. u32 over_delta = after_sum - (WEIGHT_ONE - 1);
  1625. WARN_ON_ONCE(over_sum <= over_delta);
  1626. over_target = over_sum - over_delta;
  1627. } else {
  1628. over_target = 0;
  1629. }
  1630. list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
  1631. if (over_target)
  1632. iocg->hweight_after_donation =
  1633. div_u64((u64)iocg->hweight_after_donation *
  1634. over_target, over_sum);
  1635. list_del_init(&iocg->walk_list);
  1636. }
  1637. /*
  1638. * Build pre-order inner node walk list and prepare for donation
  1639. * adjustment calculations.
  1640. */
  1641. list_for_each_entry(iocg, surpluses, surplus_list) {
  1642. iocg_build_inner_walk(iocg, &inner_walk);
  1643. }
  1644. root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
  1645. WARN_ON_ONCE(root_iocg->level > 0);
  1646. list_for_each_entry(iocg, &inner_walk, walk_list) {
  1647. iocg->child_adjusted_sum = 0;
  1648. iocg->hweight_donating = 0;
  1649. iocg->hweight_after_donation = 0;
  1650. }
  1651. /*
  1652. * Propagate the donating budget (b_t) and after donation budget (b'_t)
  1653. * up the hierarchy.
  1654. */
  1655. list_for_each_entry(iocg, surpluses, surplus_list) {
  1656. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1657. parent->hweight_donating += iocg->hweight_donating;
  1658. parent->hweight_after_donation += iocg->hweight_after_donation;
  1659. }
  1660. list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
  1661. if (iocg->level > 0) {
  1662. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1663. parent->hweight_donating += iocg->hweight_donating;
  1664. parent->hweight_after_donation += iocg->hweight_after_donation;
  1665. }
  1666. }
  1667. /*
  1668. * Calculate inner hwa's (b) and make sure the donation values are
  1669. * within the accepted ranges as we're doing low res calculations with
  1670. * roundups.
  1671. */
  1672. list_for_each_entry(iocg, &inner_walk, walk_list) {
  1673. if (iocg->level) {
  1674. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1675. iocg->hweight_active = DIV64_U64_ROUND_UP(
  1676. (u64)parent->hweight_active * iocg->active,
  1677. parent->child_active_sum);
  1678. }
  1679. iocg->hweight_donating = min(iocg->hweight_donating,
  1680. iocg->hweight_active);
  1681. iocg->hweight_after_donation = min(iocg->hweight_after_donation,
  1682. iocg->hweight_donating - 1);
  1683. if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
  1684. iocg->hweight_donating <= 1 ||
  1685. iocg->hweight_after_donation == 0)) {
  1686. pr_warn("iocg: invalid donation weights in ");
  1687. pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
  1688. pr_cont(": active=%u donating=%u after=%u\n",
  1689. iocg->hweight_active, iocg->hweight_donating,
  1690. iocg->hweight_after_donation);
  1691. }
  1692. }
  1693. /*
  1694. * Calculate the global donation rate (gamma) - the rate to adjust
  1695. * non-donating budgets by.
  1696. *
  1697. * No need to use 64bit multiplication here as the first operand is
  1698. * guaranteed to be smaller than WEIGHT_ONE (1<<16).
  1699. *
  1700. * We know that there are beneficiary nodes and the sum of the donating
  1701. * hweights can't be whole; however, due to the round-ups during hweight
  1702. * calculations, root_iocg->hweight_donating might still end up equal to
  1703. * or greater than whole. Limit the range when calculating the divider.
  1704. *
  1705. * gamma = (1 - t_r') / (1 - t_r)
  1706. */
  1707. gamma = DIV_ROUND_UP(
  1708. (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
  1709. WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
  1710. /*
  1711. * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
  1712. * nodes.
  1713. */
  1714. list_for_each_entry(iocg, &inner_walk, walk_list) {
  1715. struct ioc_gq *parent;
  1716. u32 inuse, wpt, wptp;
  1717. u64 st, sf;
  1718. if (iocg->level == 0) {
  1719. /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
  1720. iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
  1721. iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
  1722. WEIGHT_ONE - iocg->hweight_after_donation);
  1723. continue;
  1724. }
  1725. parent = iocg->ancestors[iocg->level - 1];
  1726. /* b' = gamma * b_f + b_t' */
  1727. iocg->hweight_inuse = DIV64_U64_ROUND_UP(
  1728. (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
  1729. WEIGHT_ONE) + iocg->hweight_after_donation;
  1730. /* w' = s' * b' / b'_p */
  1731. inuse = DIV64_U64_ROUND_UP(
  1732. (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
  1733. parent->hweight_inuse);
  1734. /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
  1735. st = DIV64_U64_ROUND_UP(
  1736. iocg->child_active_sum * iocg->hweight_donating,
  1737. iocg->hweight_active);
  1738. sf = iocg->child_active_sum - st;
  1739. wpt = DIV64_U64_ROUND_UP(
  1740. (u64)iocg->active * iocg->hweight_donating,
  1741. iocg->hweight_active);
  1742. wptp = DIV64_U64_ROUND_UP(
  1743. (u64)inuse * iocg->hweight_after_donation,
  1744. iocg->hweight_inuse);
  1745. iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
  1746. }
  1747. /*
  1748. * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
  1749. * we can finally determine leaf adjustments.
  1750. */
  1751. list_for_each_entry(iocg, surpluses, surplus_list) {
  1752. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1753. u32 inuse;
  1754. /*
  1755. * In-debt iocgs participated in the donation calculation with
  1756. * the minimum target hweight_inuse. Configuring inuse
  1757. * accordingly would work fine but debt handling expects
  1758. * @iocg->inuse stay at the minimum and we don't wanna
  1759. * interfere.
  1760. */
  1761. if (iocg->abs_vdebt) {
  1762. WARN_ON_ONCE(iocg->inuse > 1);
  1763. continue;
  1764. }
  1765. /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
  1766. inuse = DIV64_U64_ROUND_UP(
  1767. parent->child_adjusted_sum * iocg->hweight_after_donation,
  1768. parent->hweight_inuse);
  1769. TRACE_IOCG_PATH(inuse_transfer, iocg, now,
  1770. iocg->inuse, inuse,
  1771. iocg->hweight_inuse,
  1772. iocg->hweight_after_donation);
  1773. __propagate_weights(iocg, iocg->active, inuse, true, now);
  1774. }
  1775. /* walk list should be dissolved after use */
  1776. list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
  1777. list_del_init(&iocg->walk_list);
  1778. }
  1779. /*
  1780. * A low weight iocg can amass a large amount of debt, for example, when
  1781. * anonymous memory gets reclaimed aggressively. If the system has a lot of
  1782. * memory paired with a slow IO device, the debt can span multiple seconds or
  1783. * more. If there are no other subsequent IO issuers, the in-debt iocg may end
  1784. * up blocked paying its debt while the IO device is idle.
  1785. *
  1786. * The following protects against such cases. If the device has been
  1787. * sufficiently idle for a while, the debts are halved and delays are
  1788. * recalculated.
  1789. */
  1790. static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
  1791. struct ioc_now *now)
  1792. {
  1793. struct ioc_gq *iocg;
  1794. u64 dur, usage_pct, nr_cycles, nr_cycles_shift;
  1795. /* if no debtor, reset the cycle */
  1796. if (!nr_debtors) {
  1797. ioc->dfgv_period_at = now->now;
  1798. ioc->dfgv_period_rem = 0;
  1799. ioc->dfgv_usage_us_sum = 0;
  1800. return;
  1801. }
  1802. /*
  1803. * Debtors can pass through a lot of writes choking the device and we
  1804. * don't want to be forgiving debts while the device is struggling from
  1805. * write bursts. If we're missing latency targets, consider the device
  1806. * fully utilized.
  1807. */
  1808. if (ioc->busy_level > 0)
  1809. usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
  1810. ioc->dfgv_usage_us_sum += usage_us_sum;
  1811. if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
  1812. return;
  1813. /*
  1814. * At least DFGV_PERIOD has passed since the last period. Calculate the
  1815. * average usage and reset the period counters.
  1816. */
  1817. dur = now->now - ioc->dfgv_period_at;
  1818. usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
  1819. ioc->dfgv_period_at = now->now;
  1820. ioc->dfgv_usage_us_sum = 0;
  1821. /* if was too busy, reset everything */
  1822. if (usage_pct > DFGV_USAGE_PCT) {
  1823. ioc->dfgv_period_rem = 0;
  1824. return;
  1825. }
  1826. /*
  1827. * Usage is lower than threshold. Let's forgive some debts. Debt
  1828. * forgiveness runs off of the usual ioc timer but its period usually
  1829. * doesn't match ioc's. Compensate the difference by performing the
  1830. * reduction as many times as would fit in the duration since the last
  1831. * run and carrying over the left-over duration in @ioc->dfgv_period_rem
  1832. * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
  1833. * reductions is doubled.
  1834. */
  1835. nr_cycles = dur + ioc->dfgv_period_rem;
  1836. ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
  1837. list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
  1838. u64 __maybe_unused old_debt, __maybe_unused old_delay;
  1839. if (!iocg->abs_vdebt && !iocg->delay)
  1840. continue;
  1841. spin_lock(&iocg->waitq.lock);
  1842. old_debt = iocg->abs_vdebt;
  1843. old_delay = iocg->delay;
  1844. nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1);
  1845. if (iocg->abs_vdebt)
  1846. iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1;
  1847. if (iocg->delay)
  1848. iocg->delay = iocg->delay >> nr_cycles_shift ?: 1;
  1849. iocg_kick_waitq(iocg, true, now);
  1850. TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
  1851. old_debt, iocg->abs_vdebt,
  1852. old_delay, iocg->delay);
  1853. spin_unlock(&iocg->waitq.lock);
  1854. }
  1855. }
  1856. /*
  1857. * Check the active iocgs' state to avoid oversleeping and deactive
  1858. * idle iocgs.
  1859. *
  1860. * Since waiters determine the sleep durations based on the vrate
  1861. * they saw at the time of sleep, if vrate has increased, some
  1862. * waiters could be sleeping for too long. Wake up tardy waiters
  1863. * which should have woken up in the last period and expire idle
  1864. * iocgs.
  1865. */
  1866. static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
  1867. {
  1868. int nr_debtors = 0;
  1869. struct ioc_gq *iocg, *tiocg;
  1870. list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
  1871. if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
  1872. !iocg->delay && !iocg_is_idle(iocg))
  1873. continue;
  1874. spin_lock(&iocg->waitq.lock);
  1875. /* flush wait and indebt stat deltas */
  1876. if (iocg->wait_since) {
  1877. iocg->stat.wait_us += now->now - iocg->wait_since;
  1878. iocg->wait_since = now->now;
  1879. }
  1880. if (iocg->indebt_since) {
  1881. iocg->stat.indebt_us +=
  1882. now->now - iocg->indebt_since;
  1883. iocg->indebt_since = now->now;
  1884. }
  1885. if (iocg->indelay_since) {
  1886. iocg->stat.indelay_us +=
  1887. now->now - iocg->indelay_since;
  1888. iocg->indelay_since = now->now;
  1889. }
  1890. if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
  1891. iocg->delay) {
  1892. /* might be oversleeping vtime / hweight changes, kick */
  1893. iocg_kick_waitq(iocg, true, now);
  1894. if (iocg->abs_vdebt || iocg->delay)
  1895. nr_debtors++;
  1896. } else if (iocg_is_idle(iocg)) {
  1897. /* no waiter and idle, deactivate */
  1898. u64 vtime = atomic64_read(&iocg->vtime);
  1899. s64 excess;
  1900. /*
  1901. * @iocg has been inactive for a full duration and will
  1902. * have a high budget. Account anything above target as
  1903. * error and throw away. On reactivation, it'll start
  1904. * with the target budget.
  1905. */
  1906. excess = now->vnow - vtime - ioc->margins.target;
  1907. if (excess > 0) {
  1908. u32 old_hwi;
  1909. current_hweight(iocg, NULL, &old_hwi);
  1910. ioc->vtime_err -= div64_u64(excess * old_hwi,
  1911. WEIGHT_ONE);
  1912. }
  1913. TRACE_IOCG_PATH(iocg_idle, iocg, now,
  1914. atomic64_read(&iocg->active_period),
  1915. atomic64_read(&ioc->cur_period), vtime);
  1916. __propagate_weights(iocg, 0, 0, false, now);
  1917. list_del_init(&iocg->active_list);
  1918. }
  1919. spin_unlock(&iocg->waitq.lock);
  1920. }
  1921. commit_weights(ioc);
  1922. return nr_debtors;
  1923. }
  1924. static void ioc_timer_fn(struct timer_list *timer)
  1925. {
  1926. struct ioc *ioc = container_of(timer, struct ioc, timer);
  1927. struct ioc_gq *iocg, *tiocg;
  1928. struct ioc_now now;
  1929. LIST_HEAD(surpluses);
  1930. int nr_debtors, nr_shortages = 0, nr_lagging = 0;
  1931. u64 usage_us_sum = 0;
  1932. u32 ppm_rthr;
  1933. u32 ppm_wthr;
  1934. u32 missed_ppm[2], rq_wait_pct;
  1935. u64 period_vtime;
  1936. int prev_busy_level;
  1937. /* how were the latencies during the period? */
  1938. ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
  1939. /* take care of active iocgs */
  1940. spin_lock_irq(&ioc->lock);
  1941. ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
  1942. ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
  1943. ioc_now(ioc, &now);
  1944. period_vtime = now.vnow - ioc->period_at_vtime;
  1945. if (WARN_ON_ONCE(!period_vtime)) {
  1946. spin_unlock_irq(&ioc->lock);
  1947. return;
  1948. }
  1949. nr_debtors = ioc_check_iocgs(ioc, &now);
  1950. /*
  1951. * Wait and indebt stat are flushed above and the donation calculation
  1952. * below needs updated usage stat. Let's bring stat up-to-date.
  1953. */
  1954. iocg_flush_stat(&ioc->active_iocgs, &now);
  1955. /* calc usage and see whether some weights need to be moved around */
  1956. list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
  1957. u64 vdone, vtime, usage_us;
  1958. u32 hw_active, hw_inuse;
  1959. /*
  1960. * Collect unused and wind vtime closer to vnow to prevent
  1961. * iocgs from accumulating a large amount of budget.
  1962. */
  1963. vdone = atomic64_read(&iocg->done_vtime);
  1964. vtime = atomic64_read(&iocg->vtime);
  1965. current_hweight(iocg, &hw_active, &hw_inuse);
  1966. /*
  1967. * Latency QoS detection doesn't account for IOs which are
  1968. * in-flight for longer than a period. Detect them by
  1969. * comparing vdone against period start. If lagging behind
  1970. * IOs from past periods, don't increase vrate.
  1971. */
  1972. if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
  1973. !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
  1974. time_after64(vtime, vdone) &&
  1975. time_after64(vtime, now.vnow -
  1976. MAX_LAGGING_PERIODS * period_vtime) &&
  1977. time_before64(vdone, now.vnow - period_vtime))
  1978. nr_lagging++;
  1979. /*
  1980. * Determine absolute usage factoring in in-flight IOs to avoid
  1981. * high-latency completions appearing as idle.
  1982. */
  1983. usage_us = iocg->usage_delta_us;
  1984. usage_us_sum += usage_us;
  1985. /* see whether there's surplus vtime */
  1986. WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
  1987. if (hw_inuse < hw_active ||
  1988. (!waitqueue_active(&iocg->waitq) &&
  1989. time_before64(vtime, now.vnow - ioc->margins.low))) {
  1990. u32 hwa, old_hwi, hwm, new_hwi, usage;
  1991. u64 usage_dur;
  1992. if (vdone != vtime) {
  1993. u64 inflight_us = DIV64_U64_ROUND_UP(
  1994. cost_to_abs_cost(vtime - vdone, hw_inuse),
  1995. ioc->vtime_base_rate);
  1996. usage_us = max(usage_us, inflight_us);
  1997. }
  1998. /* convert to hweight based usage ratio */
  1999. if (time_after64(iocg->activated_at, ioc->period_at))
  2000. usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
  2001. else
  2002. usage_dur = max_t(u64, now.now - ioc->period_at, 1);
  2003. usage = clamp(DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, usage_dur),
  2004. 1, WEIGHT_ONE);
  2005. /*
  2006. * Already donating or accumulated enough to start.
  2007. * Determine the donation amount.
  2008. */
  2009. current_hweight(iocg, &hwa, &old_hwi);
  2010. hwm = current_hweight_max(iocg);
  2011. new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
  2012. usage, &now);
  2013. /*
  2014. * Donation calculation assumes hweight_after_donation
  2015. * to be positive, a condition that a donor w/ hwa < 2
  2016. * can't meet. Don't bother with donation if hwa is
  2017. * below 2. It's not gonna make a meaningful difference
  2018. * anyway.
  2019. */
  2020. if (new_hwi < hwm && hwa >= 2) {
  2021. iocg->hweight_donating = hwa;
  2022. iocg->hweight_after_donation = new_hwi;
  2023. list_add(&iocg->surplus_list, &surpluses);
  2024. } else if (!iocg->abs_vdebt) {
  2025. /*
  2026. * @iocg doesn't have enough to donate. Reset
  2027. * its inuse to active.
  2028. *
  2029. * Don't reset debtors as their inuse's are
  2030. * owned by debt handling. This shouldn't affect
  2031. * donation calculuation in any meaningful way
  2032. * as @iocg doesn't have a meaningful amount of
  2033. * share anyway.
  2034. */
  2035. TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
  2036. iocg->inuse, iocg->active,
  2037. iocg->hweight_inuse, new_hwi);
  2038. __propagate_weights(iocg, iocg->active,
  2039. iocg->active, true, &now);
  2040. nr_shortages++;
  2041. }
  2042. } else {
  2043. /* genuinely short on vtime */
  2044. nr_shortages++;
  2045. }
  2046. }
  2047. if (!list_empty(&surpluses) && nr_shortages)
  2048. transfer_surpluses(&surpluses, &now);
  2049. commit_weights(ioc);
  2050. /* surplus list should be dissolved after use */
  2051. list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
  2052. list_del_init(&iocg->surplus_list);
  2053. /*
  2054. * If q is getting clogged or we're missing too much, we're issuing
  2055. * too much IO and should lower vtime rate. If we're not missing
  2056. * and experiencing shortages but not surpluses, we're too stingy
  2057. * and should increase vtime rate.
  2058. */
  2059. prev_busy_level = ioc->busy_level;
  2060. if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
  2061. missed_ppm[READ] > ppm_rthr ||
  2062. missed_ppm[WRITE] > ppm_wthr) {
  2063. /* clearly missing QoS targets, slow down vrate */
  2064. ioc->busy_level = max(ioc->busy_level, 0);
  2065. ioc->busy_level++;
  2066. } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
  2067. missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
  2068. missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
  2069. /* QoS targets are being met with >25% margin */
  2070. if (nr_shortages) {
  2071. /*
  2072. * We're throttling while the device has spare
  2073. * capacity. If vrate was being slowed down, stop.
  2074. */
  2075. ioc->busy_level = min(ioc->busy_level, 0);
  2076. /*
  2077. * If there are IOs spanning multiple periods, wait
  2078. * them out before pushing the device harder.
  2079. */
  2080. if (!nr_lagging)
  2081. ioc->busy_level--;
  2082. } else {
  2083. /*
  2084. * Nobody is being throttled and the users aren't
  2085. * issuing enough IOs to saturate the device. We
  2086. * simply don't know how close the device is to
  2087. * saturation. Coast.
  2088. */
  2089. ioc->busy_level = 0;
  2090. }
  2091. } else {
  2092. /* inside the hysteresis margin, we're good */
  2093. ioc->busy_level = 0;
  2094. }
  2095. ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
  2096. ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
  2097. prev_busy_level, missed_ppm);
  2098. ioc_refresh_params(ioc, false);
  2099. ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
  2100. /*
  2101. * This period is done. Move onto the next one. If nothing's
  2102. * going on with the device, stop the timer.
  2103. */
  2104. atomic64_inc(&ioc->cur_period);
  2105. if (ioc->running != IOC_STOP) {
  2106. if (!list_empty(&ioc->active_iocgs)) {
  2107. ioc_start_period(ioc, &now);
  2108. } else {
  2109. ioc->busy_level = 0;
  2110. ioc->vtime_err = 0;
  2111. ioc->running = IOC_IDLE;
  2112. }
  2113. ioc_refresh_vrate(ioc, &now);
  2114. }
  2115. spin_unlock_irq(&ioc->lock);
  2116. }
  2117. static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
  2118. u64 abs_cost, struct ioc_now *now)
  2119. {
  2120. struct ioc *ioc = iocg->ioc;
  2121. struct ioc_margins *margins = &ioc->margins;
  2122. u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
  2123. u32 hwi, adj_step;
  2124. s64 margin;
  2125. u64 cost, new_inuse;
  2126. unsigned long flags;
  2127. current_hweight(iocg, NULL, &hwi);
  2128. old_hwi = hwi;
  2129. cost = abs_cost_to_cost(abs_cost, hwi);
  2130. margin = now->vnow - vtime - cost;
  2131. /* debt handling owns inuse for debtors */
  2132. if (iocg->abs_vdebt)
  2133. return cost;
  2134. /*
  2135. * We only increase inuse during period and do so if the margin has
  2136. * deteriorated since the previous adjustment.
  2137. */
  2138. if (margin >= iocg->saved_margin || margin >= margins->low ||
  2139. iocg->inuse == iocg->active)
  2140. return cost;
  2141. spin_lock_irqsave(&ioc->lock, flags);
  2142. /* we own inuse only when @iocg is in the normal active state */
  2143. if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
  2144. spin_unlock_irqrestore(&ioc->lock, flags);
  2145. return cost;
  2146. }
  2147. /*
  2148. * Bump up inuse till @abs_cost fits in the existing budget.
  2149. * adj_step must be determined after acquiring ioc->lock - we might
  2150. * have raced and lost to another thread for activation and could
  2151. * be reading 0 iocg->active before ioc->lock which will lead to
  2152. * infinite loop.
  2153. */
  2154. new_inuse = iocg->inuse;
  2155. adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
  2156. do {
  2157. new_inuse = new_inuse + adj_step;
  2158. propagate_weights(iocg, iocg->active, new_inuse, true, now);
  2159. current_hweight(iocg, NULL, &hwi);
  2160. cost = abs_cost_to_cost(abs_cost, hwi);
  2161. } while (time_after64(vtime + cost, now->vnow) &&
  2162. iocg->inuse != iocg->active);
  2163. spin_unlock_irqrestore(&ioc->lock, flags);
  2164. TRACE_IOCG_PATH(inuse_adjust, iocg, now,
  2165. old_inuse, iocg->inuse, old_hwi, hwi);
  2166. return cost;
  2167. }
  2168. static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
  2169. bool is_merge, u64 *costp)
  2170. {
  2171. struct ioc *ioc = iocg->ioc;
  2172. u64 coef_seqio, coef_randio, coef_page;
  2173. u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
  2174. u64 seek_pages = 0;
  2175. u64 cost = 0;
  2176. /* Can't calculate cost for empty bio */
  2177. if (!bio->bi_iter.bi_size)
  2178. goto out;
  2179. switch (bio_op(bio)) {
  2180. case REQ_OP_READ:
  2181. coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
  2182. coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
  2183. coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
  2184. break;
  2185. case REQ_OP_WRITE:
  2186. coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
  2187. coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
  2188. coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
  2189. break;
  2190. default:
  2191. goto out;
  2192. }
  2193. if (iocg->cursor) {
  2194. seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
  2195. seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
  2196. }
  2197. if (!is_merge) {
  2198. if (seek_pages > LCOEF_RANDIO_PAGES) {
  2199. cost += coef_randio;
  2200. } else {
  2201. cost += coef_seqio;
  2202. }
  2203. }
  2204. cost += pages * coef_page;
  2205. out:
  2206. *costp = cost;
  2207. }
  2208. static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
  2209. {
  2210. u64 cost;
  2211. calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
  2212. return cost;
  2213. }
  2214. static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
  2215. u64 *costp)
  2216. {
  2217. unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
  2218. switch (req_op(rq)) {
  2219. case REQ_OP_READ:
  2220. *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
  2221. break;
  2222. case REQ_OP_WRITE:
  2223. *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
  2224. break;
  2225. default:
  2226. *costp = 0;
  2227. }
  2228. }
  2229. static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
  2230. {
  2231. u64 cost;
  2232. calc_size_vtime_cost_builtin(rq, ioc, &cost);
  2233. return cost;
  2234. }
  2235. static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
  2236. {
  2237. struct blkcg_gq *blkg = bio->bi_blkg;
  2238. struct ioc *ioc = rqos_to_ioc(rqos);
  2239. struct ioc_gq *iocg = blkg_to_iocg(blkg);
  2240. struct ioc_now now;
  2241. struct iocg_wait wait;
  2242. u64 abs_cost, cost, vtime;
  2243. bool use_debt, ioc_locked;
  2244. unsigned long flags;
  2245. /* bypass IOs if disabled, still initializing, or for root cgroup */
  2246. if (!ioc->enabled || !iocg || !iocg->level)
  2247. return;
  2248. /* calculate the absolute vtime cost */
  2249. abs_cost = calc_vtime_cost(bio, iocg, false);
  2250. if (!abs_cost)
  2251. return;
  2252. if (!iocg_activate(iocg, &now))
  2253. return;
  2254. iocg->cursor = bio_end_sector(bio);
  2255. vtime = atomic64_read(&iocg->vtime);
  2256. cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
  2257. /*
  2258. * If no one's waiting and within budget, issue right away. The
  2259. * tests are racy but the races aren't systemic - we only miss once
  2260. * in a while which is fine.
  2261. */
  2262. if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
  2263. time_before_eq64(vtime + cost, now.vnow)) {
  2264. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2265. return;
  2266. }
  2267. /*
  2268. * We're over budget. This can be handled in two ways. IOs which may
  2269. * cause priority inversions are punted to @ioc->aux_iocg and charged as
  2270. * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
  2271. * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
  2272. * whether debt handling is needed and acquire locks accordingly.
  2273. */
  2274. use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
  2275. ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
  2276. retry_lock:
  2277. iocg_lock(iocg, ioc_locked, &flags);
  2278. /*
  2279. * @iocg must stay activated for debt and waitq handling. Deactivation
  2280. * is synchronized against both ioc->lock and waitq.lock and we won't
  2281. * get deactivated as long as we're waiting or has debt, so we're good
  2282. * if we're activated here. In the unlikely cases that we aren't, just
  2283. * issue the IO.
  2284. */
  2285. if (unlikely(list_empty(&iocg->active_list))) {
  2286. iocg_unlock(iocg, ioc_locked, &flags);
  2287. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2288. return;
  2289. }
  2290. /*
  2291. * We're over budget. If @bio has to be issued regardless, remember
  2292. * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
  2293. * off the debt before waking more IOs.
  2294. *
  2295. * This way, the debt is continuously paid off each period with the
  2296. * actual budget available to the cgroup. If we just wound vtime, we
  2297. * would incorrectly use the current hw_inuse for the entire amount
  2298. * which, for example, can lead to the cgroup staying blocked for a
  2299. * long time even with substantially raised hw_inuse.
  2300. *
  2301. * An iocg with vdebt should stay online so that the timer can keep
  2302. * deducting its vdebt and [de]activate use_delay mechanism
  2303. * accordingly. We don't want to race against the timer trying to
  2304. * clear them and leave @iocg inactive w/ dangling use_delay heavily
  2305. * penalizing the cgroup and its descendants.
  2306. */
  2307. if (use_debt) {
  2308. iocg_incur_debt(iocg, abs_cost, &now);
  2309. if (iocg_kick_delay(iocg, &now))
  2310. blkcg_schedule_throttle(rqos->disk,
  2311. (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
  2312. iocg_unlock(iocg, ioc_locked, &flags);
  2313. return;
  2314. }
  2315. /* guarantee that iocgs w/ waiters have maximum inuse */
  2316. if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
  2317. if (!ioc_locked) {
  2318. iocg_unlock(iocg, false, &flags);
  2319. ioc_locked = true;
  2320. goto retry_lock;
  2321. }
  2322. propagate_weights(iocg, iocg->active, iocg->active, true,
  2323. &now);
  2324. }
  2325. /*
  2326. * Append self to the waitq and schedule the wakeup timer if we're
  2327. * the first waiter. The timer duration is calculated based on the
  2328. * current vrate. vtime and hweight changes can make it too short
  2329. * or too long. Each wait entry records the absolute cost it's
  2330. * waiting for to allow re-evaluation using a custom wait entry.
  2331. *
  2332. * If too short, the timer simply reschedules itself. If too long,
  2333. * the period timer will notice and trigger wakeups.
  2334. *
  2335. * All waiters are on iocg->waitq and the wait states are
  2336. * synchronized using waitq.lock.
  2337. */
  2338. init_wait_func(&wait.wait, iocg_wake_fn);
  2339. wait.bio = bio;
  2340. wait.abs_cost = abs_cost;
  2341. wait.committed = false; /* will be set true by waker */
  2342. __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
  2343. iocg_kick_waitq(iocg, ioc_locked, &now);
  2344. iocg_unlock(iocg, ioc_locked, &flags);
  2345. while (true) {
  2346. set_current_state(TASK_UNINTERRUPTIBLE);
  2347. if (wait.committed)
  2348. break;
  2349. io_schedule();
  2350. }
  2351. /* waker already committed us, proceed */
  2352. finish_wait(&iocg->waitq, &wait.wait);
  2353. }
  2354. static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
  2355. struct bio *bio)
  2356. {
  2357. struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
  2358. struct ioc *ioc = rqos_to_ioc(rqos);
  2359. sector_t bio_end = bio_end_sector(bio);
  2360. struct ioc_now now;
  2361. u64 vtime, abs_cost, cost;
  2362. unsigned long flags;
  2363. /* bypass if disabled, still initializing, or for root cgroup */
  2364. if (!ioc->enabled || !iocg || !iocg->level)
  2365. return;
  2366. abs_cost = calc_vtime_cost(bio, iocg, true);
  2367. if (!abs_cost)
  2368. return;
  2369. ioc_now(ioc, &now);
  2370. vtime = atomic64_read(&iocg->vtime);
  2371. cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
  2372. /* update cursor if backmerging into the request at the cursor */
  2373. if (blk_rq_pos(rq) < bio_end &&
  2374. blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
  2375. iocg->cursor = bio_end;
  2376. /*
  2377. * Charge if there's enough vtime budget and the existing request has
  2378. * cost assigned.
  2379. */
  2380. if (rq->bio && rq->bio->bi_iocost_cost &&
  2381. time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
  2382. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2383. return;
  2384. }
  2385. /*
  2386. * Otherwise, account it as debt if @iocg is online, which it should
  2387. * be for the vast majority of cases. See debt handling in
  2388. * ioc_rqos_throttle() for details.
  2389. */
  2390. spin_lock_irqsave(&ioc->lock, flags);
  2391. spin_lock(&iocg->waitq.lock);
  2392. if (likely(!list_empty(&iocg->active_list))) {
  2393. iocg_incur_debt(iocg, abs_cost, &now);
  2394. if (iocg_kick_delay(iocg, &now))
  2395. blkcg_schedule_throttle(rqos->disk,
  2396. (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
  2397. } else {
  2398. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2399. }
  2400. spin_unlock(&iocg->waitq.lock);
  2401. spin_unlock_irqrestore(&ioc->lock, flags);
  2402. }
  2403. static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
  2404. {
  2405. struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
  2406. if (iocg && bio->bi_iocost_cost)
  2407. atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
  2408. }
  2409. static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
  2410. {
  2411. struct ioc *ioc = rqos_to_ioc(rqos);
  2412. struct ioc_pcpu_stat *ccs;
  2413. u64 on_q_ns, rq_wait_ns, size_nsec;
  2414. int pidx, rw;
  2415. if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
  2416. return;
  2417. switch (req_op(rq)) {
  2418. case REQ_OP_READ:
  2419. pidx = QOS_RLAT;
  2420. rw = READ;
  2421. break;
  2422. case REQ_OP_WRITE:
  2423. pidx = QOS_WLAT;
  2424. rw = WRITE;
  2425. break;
  2426. default:
  2427. return;
  2428. }
  2429. on_q_ns = blk_time_get_ns() - rq->alloc_time_ns;
  2430. rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
  2431. size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
  2432. ccs = get_cpu_ptr(ioc->pcpu_stat);
  2433. if (on_q_ns <= size_nsec ||
  2434. on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
  2435. local_inc(&ccs->missed[rw].nr_met);
  2436. else
  2437. local_inc(&ccs->missed[rw].nr_missed);
  2438. local64_add(rq_wait_ns, &ccs->rq_wait_ns);
  2439. put_cpu_ptr(ccs);
  2440. }
  2441. static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
  2442. {
  2443. struct ioc *ioc = rqos_to_ioc(rqos);
  2444. spin_lock_irq(&ioc->lock);
  2445. ioc_refresh_params(ioc, false);
  2446. spin_unlock_irq(&ioc->lock);
  2447. }
  2448. static void ioc_rqos_exit(struct rq_qos *rqos)
  2449. {
  2450. struct ioc *ioc = rqos_to_ioc(rqos);
  2451. blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost);
  2452. spin_lock_irq(&ioc->lock);
  2453. ioc->running = IOC_STOP;
  2454. spin_unlock_irq(&ioc->lock);
  2455. timer_shutdown_sync(&ioc->timer);
  2456. free_percpu(ioc->pcpu_stat);
  2457. kfree(ioc);
  2458. }
  2459. static const struct rq_qos_ops ioc_rqos_ops = {
  2460. .throttle = ioc_rqos_throttle,
  2461. .merge = ioc_rqos_merge,
  2462. .done_bio = ioc_rqos_done_bio,
  2463. .done = ioc_rqos_done,
  2464. .queue_depth_changed = ioc_rqos_queue_depth_changed,
  2465. .exit = ioc_rqos_exit,
  2466. };
  2467. static int blk_iocost_init(struct gendisk *disk)
  2468. {
  2469. struct ioc *ioc;
  2470. int i, cpu, ret;
  2471. ioc = kzalloc_obj(*ioc);
  2472. if (!ioc)
  2473. return -ENOMEM;
  2474. ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
  2475. if (!ioc->pcpu_stat) {
  2476. kfree(ioc);
  2477. return -ENOMEM;
  2478. }
  2479. for_each_possible_cpu(cpu) {
  2480. struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
  2481. for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
  2482. local_set(&ccs->missed[i].nr_met, 0);
  2483. local_set(&ccs->missed[i].nr_missed, 0);
  2484. }
  2485. local64_set(&ccs->rq_wait_ns, 0);
  2486. }
  2487. spin_lock_init(&ioc->lock);
  2488. timer_setup(&ioc->timer, ioc_timer_fn, 0);
  2489. INIT_LIST_HEAD(&ioc->active_iocgs);
  2490. ioc->running = IOC_IDLE;
  2491. ioc->vtime_base_rate = VTIME_PER_USEC;
  2492. atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
  2493. seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
  2494. ioc->period_at = ktime_to_us(blk_time_get());
  2495. atomic64_set(&ioc->cur_period, 0);
  2496. atomic_set(&ioc->hweight_gen, 0);
  2497. spin_lock_irq(&ioc->lock);
  2498. ioc->autop_idx = AUTOP_INVALID;
  2499. ioc_refresh_params_disk(ioc, true, disk);
  2500. spin_unlock_irq(&ioc->lock);
  2501. /*
  2502. * rqos must be added before activation to allow ioc_pd_init() to
  2503. * lookup the ioc from q. This means that the rqos methods may get
  2504. * called before policy activation completion, can't assume that the
  2505. * target bio has an iocg associated and need to test for NULL iocg.
  2506. */
  2507. ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
  2508. if (ret)
  2509. goto err_free_ioc;
  2510. ret = blkcg_activate_policy(disk, &blkcg_policy_iocost);
  2511. if (ret)
  2512. goto err_del_qos;
  2513. return 0;
  2514. err_del_qos:
  2515. rq_qos_del(&ioc->rqos);
  2516. err_free_ioc:
  2517. free_percpu(ioc->pcpu_stat);
  2518. kfree(ioc);
  2519. return ret;
  2520. }
  2521. static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
  2522. {
  2523. struct ioc_cgrp *iocc;
  2524. iocc = kzalloc_obj(struct ioc_cgrp, gfp);
  2525. if (!iocc)
  2526. return NULL;
  2527. iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
  2528. return &iocc->cpd;
  2529. }
  2530. static void ioc_cpd_free(struct blkcg_policy_data *cpd)
  2531. {
  2532. kfree(container_of(cpd, struct ioc_cgrp, cpd));
  2533. }
  2534. static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk,
  2535. struct blkcg *blkcg, gfp_t gfp)
  2536. {
  2537. int levels = blkcg->css.cgroup->level + 1;
  2538. struct ioc_gq *iocg;
  2539. iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp,
  2540. disk->node_id);
  2541. if (!iocg)
  2542. return NULL;
  2543. iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
  2544. if (!iocg->pcpu_stat) {
  2545. kfree(iocg);
  2546. return NULL;
  2547. }
  2548. return &iocg->pd;
  2549. }
  2550. static void ioc_pd_init(struct blkg_policy_data *pd)
  2551. {
  2552. struct ioc_gq *iocg = pd_to_iocg(pd);
  2553. struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
  2554. struct ioc *ioc = q_to_ioc(blkg->q);
  2555. struct ioc_now now;
  2556. struct blkcg_gq *tblkg;
  2557. unsigned long flags;
  2558. ioc_now(ioc, &now);
  2559. iocg->ioc = ioc;
  2560. atomic64_set(&iocg->vtime, now.vnow);
  2561. atomic64_set(&iocg->done_vtime, now.vnow);
  2562. atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
  2563. INIT_LIST_HEAD(&iocg->active_list);
  2564. INIT_LIST_HEAD(&iocg->walk_list);
  2565. INIT_LIST_HEAD(&iocg->surplus_list);
  2566. iocg->hweight_active = WEIGHT_ONE;
  2567. iocg->hweight_inuse = WEIGHT_ONE;
  2568. init_waitqueue_head(&iocg->waitq);
  2569. hrtimer_setup(&iocg->waitq_timer, iocg_waitq_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
  2570. iocg->level = blkg->blkcg->css.cgroup->level;
  2571. for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
  2572. struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
  2573. iocg->ancestors[tiocg->level] = tiocg;
  2574. }
  2575. spin_lock_irqsave(&ioc->lock, flags);
  2576. weight_updated(iocg, &now);
  2577. spin_unlock_irqrestore(&ioc->lock, flags);
  2578. }
  2579. static void ioc_pd_free(struct blkg_policy_data *pd)
  2580. {
  2581. struct ioc_gq *iocg = pd_to_iocg(pd);
  2582. struct ioc *ioc = iocg->ioc;
  2583. unsigned long flags;
  2584. if (ioc) {
  2585. spin_lock_irqsave(&ioc->lock, flags);
  2586. if (!list_empty(&iocg->active_list)) {
  2587. struct ioc_now now;
  2588. ioc_now(ioc, &now);
  2589. propagate_weights(iocg, 0, 0, false, &now);
  2590. list_del_init(&iocg->active_list);
  2591. }
  2592. WARN_ON_ONCE(!list_empty(&iocg->walk_list));
  2593. WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
  2594. spin_unlock_irqrestore(&ioc->lock, flags);
  2595. hrtimer_cancel(&iocg->waitq_timer);
  2596. }
  2597. free_percpu(iocg->pcpu_stat);
  2598. kfree(iocg);
  2599. }
  2600. static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
  2601. {
  2602. struct ioc_gq *iocg = pd_to_iocg(pd);
  2603. struct ioc *ioc = iocg->ioc;
  2604. if (!ioc->enabled)
  2605. return;
  2606. if (iocg->level == 0) {
  2607. unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
  2608. ioc->vtime_base_rate * 10000,
  2609. VTIME_PER_USEC);
  2610. seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
  2611. }
  2612. seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
  2613. if (blkcg_debug_stats)
  2614. seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
  2615. iocg->last_stat.wait_us,
  2616. iocg->last_stat.indebt_us,
  2617. iocg->last_stat.indelay_us);
  2618. }
  2619. static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
  2620. int off)
  2621. {
  2622. const char *dname = blkg_dev_name(pd->blkg);
  2623. struct ioc_gq *iocg = pd_to_iocg(pd);
  2624. if (dname && iocg->cfg_weight)
  2625. seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
  2626. return 0;
  2627. }
  2628. static int ioc_weight_show(struct seq_file *sf, void *v)
  2629. {
  2630. struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
  2631. struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
  2632. seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
  2633. blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
  2634. &blkcg_policy_iocost, seq_cft(sf)->private, false);
  2635. return 0;
  2636. }
  2637. static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
  2638. size_t nbytes, loff_t off)
  2639. {
  2640. struct blkcg *blkcg = css_to_blkcg(of_css(of));
  2641. struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
  2642. struct blkg_conf_ctx ctx;
  2643. struct ioc_now now;
  2644. struct ioc_gq *iocg;
  2645. u32 v;
  2646. int ret;
  2647. if (!strchr(buf, ':')) {
  2648. struct blkcg_gq *blkg;
  2649. if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
  2650. return -EINVAL;
  2651. if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
  2652. return -EINVAL;
  2653. spin_lock_irq(&blkcg->lock);
  2654. iocc->dfl_weight = v * WEIGHT_ONE;
  2655. hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
  2656. struct ioc_gq *iocg = blkg_to_iocg(blkg);
  2657. if (iocg) {
  2658. spin_lock(&iocg->ioc->lock);
  2659. ioc_now(iocg->ioc, &now);
  2660. weight_updated(iocg, &now);
  2661. spin_unlock(&iocg->ioc->lock);
  2662. }
  2663. }
  2664. spin_unlock_irq(&blkcg->lock);
  2665. return nbytes;
  2666. }
  2667. blkg_conf_init(&ctx, buf);
  2668. ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
  2669. if (ret)
  2670. goto err;
  2671. iocg = blkg_to_iocg(ctx.blkg);
  2672. if (!strncmp(ctx.body, "default", 7)) {
  2673. v = 0;
  2674. } else {
  2675. if (!sscanf(ctx.body, "%u", &v))
  2676. goto einval;
  2677. if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
  2678. goto einval;
  2679. }
  2680. spin_lock(&iocg->ioc->lock);
  2681. iocg->cfg_weight = v * WEIGHT_ONE;
  2682. ioc_now(iocg->ioc, &now);
  2683. weight_updated(iocg, &now);
  2684. spin_unlock(&iocg->ioc->lock);
  2685. blkg_conf_exit(&ctx);
  2686. return nbytes;
  2687. einval:
  2688. ret = -EINVAL;
  2689. err:
  2690. blkg_conf_exit(&ctx);
  2691. return ret;
  2692. }
  2693. static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
  2694. int off)
  2695. {
  2696. const char *dname = blkg_dev_name(pd->blkg);
  2697. struct ioc *ioc = pd_to_iocg(pd)->ioc;
  2698. if (!dname)
  2699. return 0;
  2700. spin_lock(&ioc->lock);
  2701. seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
  2702. dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
  2703. ioc->params.qos[QOS_RPPM] / 10000,
  2704. ioc->params.qos[QOS_RPPM] % 10000 / 100,
  2705. ioc->params.qos[QOS_RLAT],
  2706. ioc->params.qos[QOS_WPPM] / 10000,
  2707. ioc->params.qos[QOS_WPPM] % 10000 / 100,
  2708. ioc->params.qos[QOS_WLAT],
  2709. ioc->params.qos[QOS_MIN] / 10000,
  2710. ioc->params.qos[QOS_MIN] % 10000 / 100,
  2711. ioc->params.qos[QOS_MAX] / 10000,
  2712. ioc->params.qos[QOS_MAX] % 10000 / 100);
  2713. spin_unlock(&ioc->lock);
  2714. return 0;
  2715. }
  2716. static int ioc_qos_show(struct seq_file *sf, void *v)
  2717. {
  2718. struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
  2719. blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
  2720. &blkcg_policy_iocost, seq_cft(sf)->private, false);
  2721. return 0;
  2722. }
  2723. static const match_table_t qos_ctrl_tokens = {
  2724. { QOS_ENABLE, "enable=%u" },
  2725. { QOS_CTRL, "ctrl=%s" },
  2726. { NR_QOS_CTRL_PARAMS, NULL },
  2727. };
  2728. static const match_table_t qos_tokens = {
  2729. { QOS_RPPM, "rpct=%s" },
  2730. { QOS_RLAT, "rlat=%u" },
  2731. { QOS_WPPM, "wpct=%s" },
  2732. { QOS_WLAT, "wlat=%u" },
  2733. { QOS_MIN, "min=%s" },
  2734. { QOS_MAX, "max=%s" },
  2735. { NR_QOS_PARAMS, NULL },
  2736. };
  2737. static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
  2738. size_t nbytes, loff_t off)
  2739. {
  2740. struct blkg_conf_ctx ctx;
  2741. struct gendisk *disk;
  2742. struct ioc *ioc;
  2743. u32 qos[NR_QOS_PARAMS];
  2744. bool enable, user;
  2745. char *body, *p;
  2746. unsigned long memflags;
  2747. int ret;
  2748. blkg_conf_init(&ctx, input);
  2749. memflags = blkg_conf_open_bdev_frozen(&ctx);
  2750. if (IS_ERR_VALUE(memflags)) {
  2751. ret = memflags;
  2752. goto err;
  2753. }
  2754. body = ctx.body;
  2755. disk = ctx.bdev->bd_disk;
  2756. if (!queue_is_mq(disk->queue)) {
  2757. ret = -EOPNOTSUPP;
  2758. goto err;
  2759. }
  2760. ioc = q_to_ioc(disk->queue);
  2761. if (!ioc) {
  2762. ret = blk_iocost_init(disk);
  2763. if (ret)
  2764. goto err;
  2765. ioc = q_to_ioc(disk->queue);
  2766. }
  2767. blk_mq_quiesce_queue(disk->queue);
  2768. spin_lock_irq(&ioc->lock);
  2769. memcpy(qos, ioc->params.qos, sizeof(qos));
  2770. enable = ioc->enabled;
  2771. user = ioc->user_qos_params;
  2772. while ((p = strsep(&body, " \t\n"))) {
  2773. substring_t args[MAX_OPT_ARGS];
  2774. char buf[32];
  2775. int tok;
  2776. s64 v;
  2777. if (!*p)
  2778. continue;
  2779. switch (match_token(p, qos_ctrl_tokens, args)) {
  2780. case QOS_ENABLE:
  2781. if (match_u64(&args[0], &v))
  2782. goto einval;
  2783. enable = v;
  2784. continue;
  2785. case QOS_CTRL:
  2786. match_strlcpy(buf, &args[0], sizeof(buf));
  2787. if (!strcmp(buf, "auto"))
  2788. user = false;
  2789. else if (!strcmp(buf, "user"))
  2790. user = true;
  2791. else
  2792. goto einval;
  2793. continue;
  2794. }
  2795. tok = match_token(p, qos_tokens, args);
  2796. switch (tok) {
  2797. case QOS_RPPM:
  2798. case QOS_WPPM:
  2799. if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
  2800. sizeof(buf))
  2801. goto einval;
  2802. if (cgroup_parse_float(buf, 2, &v))
  2803. goto einval;
  2804. if (v < 0 || v > 10000)
  2805. goto einval;
  2806. qos[tok] = v * 100;
  2807. break;
  2808. case QOS_RLAT:
  2809. case QOS_WLAT:
  2810. if (match_u64(&args[0], &v))
  2811. goto einval;
  2812. qos[tok] = v;
  2813. break;
  2814. case QOS_MIN:
  2815. case QOS_MAX:
  2816. if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
  2817. sizeof(buf))
  2818. goto einval;
  2819. if (cgroup_parse_float(buf, 2, &v))
  2820. goto einval;
  2821. if (v < 0)
  2822. goto einval;
  2823. qos[tok] = clamp_t(s64, v * 100,
  2824. VRATE_MIN_PPM, VRATE_MAX_PPM);
  2825. break;
  2826. default:
  2827. goto einval;
  2828. }
  2829. user = true;
  2830. }
  2831. if (qos[QOS_MIN] > qos[QOS_MAX])
  2832. goto einval;
  2833. if (enable && !ioc->enabled) {
  2834. blk_stat_enable_accounting(disk->queue);
  2835. blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
  2836. ioc->enabled = true;
  2837. } else if (!enable && ioc->enabled) {
  2838. blk_stat_disable_accounting(disk->queue);
  2839. blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
  2840. ioc->enabled = false;
  2841. }
  2842. if (user) {
  2843. memcpy(ioc->params.qos, qos, sizeof(qos));
  2844. ioc->user_qos_params = true;
  2845. } else {
  2846. ioc->user_qos_params = false;
  2847. }
  2848. ioc_refresh_params(ioc, true);
  2849. spin_unlock_irq(&ioc->lock);
  2850. if (enable)
  2851. wbt_disable_default(disk);
  2852. else
  2853. wbt_enable_default(disk);
  2854. blk_mq_unquiesce_queue(disk->queue);
  2855. blkg_conf_exit_frozen(&ctx, memflags);
  2856. return nbytes;
  2857. einval:
  2858. spin_unlock_irq(&ioc->lock);
  2859. blk_mq_unquiesce_queue(disk->queue);
  2860. ret = -EINVAL;
  2861. err:
  2862. blkg_conf_exit_frozen(&ctx, memflags);
  2863. return ret;
  2864. }
  2865. static u64 ioc_cost_model_prfill(struct seq_file *sf,
  2866. struct blkg_policy_data *pd, int off)
  2867. {
  2868. const char *dname = blkg_dev_name(pd->blkg);
  2869. struct ioc *ioc = pd_to_iocg(pd)->ioc;
  2870. u64 *u = ioc->params.i_lcoefs;
  2871. if (!dname)
  2872. return 0;
  2873. spin_lock(&ioc->lock);
  2874. seq_printf(sf, "%s ctrl=%s model=linear "
  2875. "rbps=%llu rseqiops=%llu rrandiops=%llu "
  2876. "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
  2877. dname, ioc->user_cost_model ? "user" : "auto",
  2878. u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
  2879. u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
  2880. spin_unlock(&ioc->lock);
  2881. return 0;
  2882. }
  2883. static int ioc_cost_model_show(struct seq_file *sf, void *v)
  2884. {
  2885. struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
  2886. blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
  2887. &blkcg_policy_iocost, seq_cft(sf)->private, false);
  2888. return 0;
  2889. }
  2890. static const match_table_t cost_ctrl_tokens = {
  2891. { COST_CTRL, "ctrl=%s" },
  2892. { COST_MODEL, "model=%s" },
  2893. { NR_COST_CTRL_PARAMS, NULL },
  2894. };
  2895. static const match_table_t i_lcoef_tokens = {
  2896. { I_LCOEF_RBPS, "rbps=%u" },
  2897. { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
  2898. { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
  2899. { I_LCOEF_WBPS, "wbps=%u" },
  2900. { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
  2901. { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
  2902. { NR_I_LCOEFS, NULL },
  2903. };
  2904. static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
  2905. size_t nbytes, loff_t off)
  2906. {
  2907. struct blkg_conf_ctx ctx;
  2908. struct request_queue *q;
  2909. unsigned int memflags;
  2910. struct ioc *ioc;
  2911. u64 u[NR_I_LCOEFS];
  2912. bool user;
  2913. char *body, *p;
  2914. int ret;
  2915. blkg_conf_init(&ctx, input);
  2916. ret = blkg_conf_open_bdev(&ctx);
  2917. if (ret)
  2918. goto err;
  2919. body = ctx.body;
  2920. q = bdev_get_queue(ctx.bdev);
  2921. if (!queue_is_mq(q)) {
  2922. ret = -EOPNOTSUPP;
  2923. goto err;
  2924. }
  2925. ioc = q_to_ioc(q);
  2926. if (!ioc) {
  2927. ret = blk_iocost_init(ctx.bdev->bd_disk);
  2928. if (ret)
  2929. goto err;
  2930. ioc = q_to_ioc(q);
  2931. }
  2932. memflags = blk_mq_freeze_queue(q);
  2933. blk_mq_quiesce_queue(q);
  2934. spin_lock_irq(&ioc->lock);
  2935. memcpy(u, ioc->params.i_lcoefs, sizeof(u));
  2936. user = ioc->user_cost_model;
  2937. while ((p = strsep(&body, " \t\n"))) {
  2938. substring_t args[MAX_OPT_ARGS];
  2939. char buf[32];
  2940. int tok;
  2941. u64 v;
  2942. if (!*p)
  2943. continue;
  2944. switch (match_token(p, cost_ctrl_tokens, args)) {
  2945. case COST_CTRL:
  2946. match_strlcpy(buf, &args[0], sizeof(buf));
  2947. if (!strcmp(buf, "auto"))
  2948. user = false;
  2949. else if (!strcmp(buf, "user"))
  2950. user = true;
  2951. else
  2952. goto einval;
  2953. continue;
  2954. case COST_MODEL:
  2955. match_strlcpy(buf, &args[0], sizeof(buf));
  2956. if (strcmp(buf, "linear"))
  2957. goto einval;
  2958. continue;
  2959. }
  2960. tok = match_token(p, i_lcoef_tokens, args);
  2961. if (tok == NR_I_LCOEFS)
  2962. goto einval;
  2963. if (match_u64(&args[0], &v))
  2964. goto einval;
  2965. u[tok] = v;
  2966. user = true;
  2967. }
  2968. if (user) {
  2969. memcpy(ioc->params.i_lcoefs, u, sizeof(u));
  2970. ioc->user_cost_model = true;
  2971. } else {
  2972. ioc->user_cost_model = false;
  2973. }
  2974. ioc_refresh_params(ioc, true);
  2975. spin_unlock_irq(&ioc->lock);
  2976. blk_mq_unquiesce_queue(q);
  2977. blk_mq_unfreeze_queue(q, memflags);
  2978. blkg_conf_exit(&ctx);
  2979. return nbytes;
  2980. einval:
  2981. spin_unlock_irq(&ioc->lock);
  2982. blk_mq_unquiesce_queue(q);
  2983. blk_mq_unfreeze_queue(q, memflags);
  2984. ret = -EINVAL;
  2985. err:
  2986. blkg_conf_exit(&ctx);
  2987. return ret;
  2988. }
  2989. static struct cftype ioc_files[] = {
  2990. {
  2991. .name = "weight",
  2992. .flags = CFTYPE_NOT_ON_ROOT,
  2993. .seq_show = ioc_weight_show,
  2994. .write = ioc_weight_write,
  2995. },
  2996. {
  2997. .name = "cost.qos",
  2998. .flags = CFTYPE_ONLY_ON_ROOT,
  2999. .seq_show = ioc_qos_show,
  3000. .write = ioc_qos_write,
  3001. },
  3002. {
  3003. .name = "cost.model",
  3004. .flags = CFTYPE_ONLY_ON_ROOT,
  3005. .seq_show = ioc_cost_model_show,
  3006. .write = ioc_cost_model_write,
  3007. },
  3008. {}
  3009. };
  3010. static struct blkcg_policy blkcg_policy_iocost = {
  3011. .dfl_cftypes = ioc_files,
  3012. .cpd_alloc_fn = ioc_cpd_alloc,
  3013. .cpd_free_fn = ioc_cpd_free,
  3014. .pd_alloc_fn = ioc_pd_alloc,
  3015. .pd_init_fn = ioc_pd_init,
  3016. .pd_free_fn = ioc_pd_free,
  3017. .pd_stat_fn = ioc_pd_stat,
  3018. };
  3019. static int __init ioc_init(void)
  3020. {
  3021. return blkcg_policy_register(&blkcg_policy_iocost);
  3022. }
  3023. static void __exit ioc_exit(void)
  3024. {
  3025. blkcg_policy_unregister(&blkcg_policy_iocost);
  3026. }
  /* Hook the policy registration into module load and unload. */
  module_init(ioc_init);
  module_exit(ioc_exit);