dedupe.c 100 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright 2023 Red Hat
  4. */
  5. /**
  6. * DOC:
  7. *
  8. * Hash Locks:
  9. *
  10. * A hash_lock controls and coordinates writing, index access, and dedupe among groups of data_vios
  11. * concurrently writing identical blocks, allowing them to deduplicate not only against advice but
  12. * also against each other. This saves on index queries and allows those data_vios to concurrently
  13. * deduplicate against a single block instead of being serialized through a PBN read lock. Only one
  14. * index query is needed for each hash_lock, instead of one for every data_vio.
  15. *
  16. * Hash_locks are assigned to hash_zones by computing a modulus on the hash itself. Each hash_zone
  17. * has a single dedicated queue and thread for performing all operations on the hash_locks assigned
  18. * to that zone. The concurrency guarantees of this single-threaded model allow the code to omit
  19. * more fine-grained locking for the hash_lock structures.
  20. *
  21. * A hash_lock acts like a state machine perhaps more than as a lock. Other than the starting and
  22. * ending states INITIALIZING and BYPASSING, every state represents and is held for the duration of
  23. * an asynchronous operation. All state transitions are performed on the thread of the hash_zone
  24. * containing the lock. An asynchronous operation is almost always performed upon entering a state,
  25. * and the callback from that operation triggers exiting the state and entering a new state.
  26. *
  27. * In all states except DEDUPING, there is a single data_vio, called the lock agent, performing the
  28. * asynchronous operations on behalf of the lock. The agent will change during the lifetime of the
  29. * lock if the lock is shared by more than one data_vio. data_vios waiting to deduplicate are kept
  30. * on a wait queue. Viewed a different way, the agent holds the lock exclusively until the lock
  31. * enters the DEDUPING state, at which point it becomes a shared lock that all the waiters (and any
  32. * new data_vios that arrive) use to share a PBN lock. In state DEDUPING, there is no agent. When
  33. * the last data_vio in the lock calls back in DEDUPING, it becomes the agent and the lock becomes
  34. * exclusive again. New data_vios that arrive in the lock will also go on the wait queue.
  35. *
  36. * The existence of lock waiters is a key factor controlling which state the lock transitions to
  37. * next. When the lock is new or has waiters, it will always try to reach DEDUPING, and when it
  38. * doesn't, it will try to clean up and exit.
  39. *
  40. * Deduping requires holding a PBN lock on a block that is known to contain data identical to the
  41. * data_vios in the lock, so the lock will send the agent to the duplicate zone to acquire the PBN
  42. * lock (LOCKING), to the kernel I/O threads to read and verify the data (VERIFYING), or to write a
  43. * new copy of the data to a full data block or a slot in a compressed block (WRITING).
  44. *
  45. * Cleaning up consists of updating the index when the data location is different from the initial
  46. * index query (UPDATING, triggered by stale advice, compression, and rollover), releasing the PBN
  47. * lock on the duplicate block (UNLOCKING), and if the agent is the last data_vio referencing the
  48. * lock, releasing the hash_lock itself back to the hash zone (BYPASSING).
  49. *
  50. * The shortest sequence of states is for non-concurrent writes of new data:
  51. * INITIALIZING -> QUERYING -> WRITING -> BYPASSING
  52. * This sequence is short because no PBN read lock or index update is needed.
  53. *
  54. * Non-concurrent, finding valid advice looks like this (endpoints elided):
  55. * -> QUERYING -> LOCKING -> VERIFYING -> DEDUPING -> UNLOCKING ->
  56. * Or with stale advice (endpoints elided):
  57. * -> QUERYING -> LOCKING -> VERIFYING -> UNLOCKING -> WRITING -> UPDATING ->
  58. *
  59. * When there are not enough reference count increments available on a PBN for a data_vio
  60. * to deduplicate, a new lock is forked and the excess waiters roll over to the new lock (which
  61. * goes directly to WRITING). The new lock takes the place of the old lock in the lock map so new
  62. * data_vios will be directed to it. The two locks will proceed independently, but only the new
  63. * lock will have the right to update the index (unless it also forks).
  64. *
  65. * Since rollover happens in a lock instance, once a valid data location has been selected, it will
  66. * not change. QUERYING and WRITING are only performed once per lock lifetime. All other
  67. * non-endpoint states can be re-entered.
  68. *
  69. * The function names in this module follow a convention referencing the states and transitions in
  70. * the state machine. For example, for the LOCKING state, there are start_locking() and
  71. * finish_locking() functions. start_locking() is invoked by the finish function of the state (or
  72. * states) that transition to LOCKING. It performs the actual lock state change and must be invoked
  73. * on the hash zone thread. finish_locking() is called by (or continued via callback from) the
  74. * code actually obtaining the lock. It does any bookkeeping or decision-making required and
  75. * invokes the appropriate start function of the state being transitioned to after LOCKING.
  76. *
  77. * ----------------------------------------------------------------------
  78. *
  79. * Index Queries:
  80. *
  81. * A query to the UDS index is handled asynchronously by the index's threads. When the query is
  82. * complete, a callback supplied with the query will be called from one of those threads. Under
  83. * heavy system load, the index may be slower to respond than is desirable for reasonable I/O
  84. * throughput. Since deduplication of writes is not necessary for correct operation of a VDO
  85. * device, it is acceptable to time out slow index queries and proceed to fulfill a write
  86. * request without deduplicating. However, because the uds_request struct itself is supplied by the
  87. * caller, we can not simply reuse a uds_request object which we have chosen to time out. Hence,
  88. * each hash_zone maintains a pool of dedupe_contexts which each contain a uds_request along with a
  89. * reference to the data_vio on behalf of which they are performing a query.
  90. *
  91. * When a hash_lock needs to query the index, it attempts to acquire an unused dedupe_context from
  92. * its hash_zone's pool. If one is available, that context is prepared, associated with the
  93. * hash_lock's agent, added to the list of pending contexts, and then sent to the index. The
  94. * context's state will be transitioned from DEDUPE_CONTEXT_IDLE to DEDUPE_CONTEXT_PENDING. If all
  95. * goes well, the dedupe callback will be called by the index which will change the context's state
  96. * to DEDUPE_CONTEXT_COMPLETE, and the associated data_vio will be enqueued to run back in the hash
  97. * zone where the query results will be processed and the context will be put back in the idle
  98. * state and returned to the hash_zone's available list.
  99. *
  100. * The first time an index query is launched from a given hash_zone, a timer is started. When the
  101. * timer fires, the hash_zone's completion is enqueued to run in the hash_zone where the zone's
  102. * pending list will be searched for any contexts in the pending state which have been running for
  103. * too long. Those contexts are transitioned to the DEDUPE_CONTEXT_TIMED_OUT state and moved to the
  104. * zone's timed_out list (where they won't be examined again if there is a subsequent time out). The
  105. * data_vios associated with timed out contexts are sent to continue processing their write
  106. * operation without deduplicating. The timer is also restarted.
  107. *
  108. * When the dedupe callback is run for a context which is in the timed out state, that context is
  109. * moved to the DEDUPE_CONTEXT_TIMED_OUT_COMPLETE state. No other action need be taken as the
  110. * associated data_vios have already been dispatched.
  111. *
  112. * If a hash_lock needs a dedupe context, and the available list is empty, the timed_out list will
  113. * be searched for any contexts which are timed out and complete. One of these will be used
  114. * immediately, and the rest will be returned to the available list and marked idle.
  115. */
  116. #include "dedupe.h"
  117. #include <linux/atomic.h>
  118. #include <linux/jiffies.h>
  119. #include <linux/kernel.h>
  120. #include <linux/list.h>
  121. #include <linux/ratelimit.h>
  122. #include <linux/spinlock.h>
  123. #include <linux/timer.h>
  124. #include "logger.h"
  125. #include "memory-alloc.h"
  126. #include "numeric.h"
  127. #include "permassert.h"
  128. #include "string-utils.h"
  129. #include "indexer.h"
  130. #include "action-manager.h"
  131. #include "admin-state.h"
  132. #include "completion.h"
  133. #include "constants.h"
  134. #include "data-vio.h"
  135. #include "int-map.h"
  136. #include "io-submitter.h"
  137. #include "packer.h"
  138. #include "physical-zone.h"
  139. #include "slab-depot.h"
  140. #include "statistics.h"
  141. #include "types.h"
  142. #include "vdo.h"
  143. #include "wait-queue.h"
  /*
   * States of the dedupe query timer. NOTE(review): presumably these are the values that
   * change_timer_state() swaps into a hash_zone's timer_state -- confirm against its callers.
   */
  #define DEDUPE_QUERY_TIMER_IDLE 0
  #define DEDUPE_QUERY_TIMER_RUNNING 1
  #define DEDUPE_QUERY_TIMER_FIRED 2
  /*
   * The states of a dedupe_context. The transitions are described in the "Index Queries" section
   * of the DOC comment at the top of this file.
   */
  enum dedupe_context_state {
          /* Not in use; available to carry a new index query. */
          DEDUPE_CONTEXT_IDLE,
          /* Sent to the index; the dedupe callback has not yet run. */
          DEDUPE_CONTEXT_PENDING,
          /* Was pending for too long; its data_vio has continued without deduplicating. */
          DEDUPE_CONTEXT_TIMED_OUT,
          /* The dedupe callback ran before any timeout. */
          DEDUPE_CONTEXT_COMPLETE,
          /* The dedupe callback ran after the context had already timed out. */
          DEDUPE_CONTEXT_TIMED_OUT_COMPLETE,
  };
  /* Possible index states: closed, opened, or transitioning between those two. */
  enum index_state {
          IS_CLOSED,
          IS_CHANGING,
          IS_OPENED,
  };
  /*
   * Human-readable state names. NOTE(review): presumably these describe the dedupe index state
   * reported to users; the code consuming them is outside this section of the file.
   *
   * Both the characters and the pointers are const, so a name can never be retargeted by accident.
   */
  static const char * const CLOSED = "closed";
  static const char * const CLOSING = "closing";
  static const char * const ERROR = "error";
  static const char * const OFFLINE = "offline";
  static const char * const ONLINE = "online";
  static const char * const OPENING = "opening";
  static const char * const SUSPENDED = "suspended";
  static const char * const UNKNOWN = "unknown";
  /* Version 2 uses the kernel space UDS index and is limited to 16 bytes */
  #define UDS_ADVICE_VERSION 2
  /* version byte + state byte + 64-bit little-endian PBN */
  #define UDS_ADVICE_SIZE (1 + 1 + sizeof(u64))
  /*
   * The states of the hash_lock state machine; see the "Hash Locks" section of the DOC comment at
   * the top of this file for the transitions between them.
   */
  enum hash_lock_state {
          /* State for locks that are not in use or are being initialized. */
          VDO_HASH_LOCK_INITIALIZING,

          /* This is the sequence of states typically used on the non-dedupe path. */
          VDO_HASH_LOCK_QUERYING,
          VDO_HASH_LOCK_WRITING,
          VDO_HASH_LOCK_UPDATING,

          /* The remaining states are typically used on the dedupe path in this order. */
          VDO_HASH_LOCK_LOCKING,
          VDO_HASH_LOCK_VERIFYING,
          VDO_HASH_LOCK_DEDUPING,
          VDO_HASH_LOCK_UNLOCKING,

          /*
           * Terminal state for locks returning to the pool. Must be last both because it's the
           * final state, and also because it's used to count the states.
           */
          VDO_HASH_LOCK_BYPASSING,
  };
  190. static const char * const LOCK_STATE_NAMES[] = {
  191. [VDO_HASH_LOCK_BYPASSING] = "BYPASSING",
  192. [VDO_HASH_LOCK_DEDUPING] = "DEDUPING",
  193. [VDO_HASH_LOCK_INITIALIZING] = "INITIALIZING",
  194. [VDO_HASH_LOCK_LOCKING] = "LOCKING",
  195. [VDO_HASH_LOCK_QUERYING] = "QUERYING",
  196. [VDO_HASH_LOCK_UNLOCKING] = "UNLOCKING",
  197. [VDO_HASH_LOCK_UPDATING] = "UPDATING",
  198. [VDO_HASH_LOCK_VERIFYING] = "VERIFYING",
  199. [VDO_HASH_LOCK_WRITING] = "WRITING",
  200. };
  201. struct hash_lock {
  202. /* The block hash covered by this lock */
  203. struct uds_record_name hash;
  204. /* When the lock is unused, this list entry allows the lock to be pooled */
  205. struct list_head pool_node;
  206. /*
  207. * A list containing the data VIOs sharing this lock, all having the same record name and
  208. * data block contents, linked by their hash_lock_node fields.
  209. */
  210. struct list_head duplicate_vios;
  211. /* The number of data_vios sharing this lock instance */
  212. data_vio_count_t reference_count;
  213. /* The maximum value of reference_count in the lifetime of this lock */
  214. data_vio_count_t max_references;
  215. /* The current state of this lock */
  216. enum hash_lock_state state;
  217. /* True if the UDS index should be updated with new advice */
  218. bool update_advice;
  219. /* True if the advice has been verified to be a true duplicate */
  220. bool verified;
  221. /* True if the lock has already accounted for an initial verification */
  222. bool verify_counted;
  223. /* True if this lock is registered in the lock map (cleared on rollover) */
  224. bool registered;
  225. /*
  226. * If verified is false, this is the location of a possible duplicate. If verified is true,
  227. * it is the verified location of a true duplicate.
  228. */
  229. struct zoned_pbn duplicate;
  230. /* The PBN lock on the block containing the duplicate data */
  231. struct pbn_lock *duplicate_lock;
  232. /* The data_vio designated to act on behalf of the lock */
  233. struct data_vio *agent;
  234. /*
  235. * Other data_vios with data identical to the agent who are currently waiting for the agent
  236. * to get the information they all need to deduplicate--either against each other, or
  237. * against an existing duplicate on disk.
  238. */
  239. struct vdo_wait_queue waiters;
  240. };
  /* The hash lock pool capacity is tied to MAXIMUM_VDO_USER_VIOS. */
  #define LOCK_POOL_CAPACITY MAXIMUM_VDO_USER_VIOS
  242. struct hash_zones {
  243. struct action_manager *manager;
  244. struct uds_parameters parameters;
  245. struct uds_index_session *index_session;
  246. struct ratelimit_state ratelimiter;
  247. atomic64_t timeouts;
  248. atomic64_t dedupe_context_busy;
  249. /* This spinlock protects the state fields and the starting of dedupe requests. */
  250. spinlock_t lock;
  251. /* The fields in the next block are all protected by the lock */
  252. struct vdo_completion completion;
  253. enum index_state index_state;
  254. enum index_state index_target;
  255. struct admin_state state;
  256. bool changing;
  257. bool create_flag;
  258. bool dedupe_flag;
  259. bool error_flag;
  260. u64 reported_timeouts;
  261. /* The number of zones */
  262. zone_count_t zone_count;
  263. /* The hash zones themselves */
  264. struct hash_zone zones[];
  265. };
  266. /* These are in milliseconds. */
  267. unsigned int vdo_dedupe_index_timeout_interval = 5000;
  268. unsigned int vdo_dedupe_index_min_timer_interval = 100;
  269. /* Same two variables, in jiffies for easier consumption. */
  270. static u64 vdo_dedupe_index_timeout_jiffies;
  271. static u64 vdo_dedupe_index_min_timer_jiffies;
  272. static inline struct hash_zone *as_hash_zone(struct vdo_completion *completion)
  273. {
  274. vdo_assert_completion_type(completion, VDO_HASH_ZONE_COMPLETION);
  275. return container_of(completion, struct hash_zone, completion);
  276. }
  277. static inline struct hash_zones *as_hash_zones(struct vdo_completion *completion)
  278. {
  279. vdo_assert_completion_type(completion, VDO_HASH_ZONES_COMPLETION);
  280. return container_of(completion, struct hash_zones, completion);
  281. }
  282. static inline void assert_in_hash_zone(struct hash_zone *zone, const char *name)
  283. {
  284. VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id),
  285. "%s called on hash zone thread", name);
  286. }
  287. static inline bool change_context_state(struct dedupe_context *context, int old, int new)
  288. {
  289. return (atomic_cmpxchg(&context->state, old, new) == old);
  290. }
  291. static inline bool change_timer_state(struct hash_zone *zone, int old, int new)
  292. {
  293. return (atomic_cmpxchg(&zone->timer_state, old, new) == old);
  294. }
  295. /**
  296. * return_hash_lock_to_pool() - (Re)initialize a hash lock and return it to its pool.
  297. * @zone: The zone from which the lock was borrowed.
  298. * @lock: The lock that is no longer in use.
  299. */
  300. static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *lock)
  301. {
  302. memset(lock, 0, sizeof(*lock));
  303. INIT_LIST_HEAD(&lock->pool_node);
  304. INIT_LIST_HEAD(&lock->duplicate_vios);
  305. vdo_waitq_init(&lock->waiters);
  306. list_add_tail(&lock->pool_node, &zone->lock_pool);
  307. }
  308. /**
  309. * vdo_get_duplicate_lock() - Get the PBN lock on the duplicate data location for a data_vio from
  310. * the hash_lock the data_vio holds (if there is one).
  311. * @data_vio: The data_vio to query.
  312. *
  313. * Return: The PBN lock on the data_vio's duplicate location.
  314. */
  315. struct pbn_lock *vdo_get_duplicate_lock(struct data_vio *data_vio)
  316. {
  317. if (data_vio->hash_lock == NULL)
  318. return NULL;
  319. return data_vio->hash_lock->duplicate_lock;
  320. }
  321. /**
  322. * hash_lock_key() - Return hash_lock's record name as a hash code.
  323. * @lock: The hash lock.
  324. *
  325. * Return: The key to use for the int map.
  326. */
  327. static inline u64 hash_lock_key(struct hash_lock *lock)
  328. {
  329. return get_unaligned_le64(&lock->hash.name);
  330. }
  331. /**
  332. * get_hash_lock_state_name() - Get the string representation of a hash lock state.
  333. * @state: The hash lock state.
  334. *
  335. * Return: The short string representing the state
  336. */
  337. static const char *get_hash_lock_state_name(enum hash_lock_state state)
  338. {
  339. /* Catch if a state has been added without updating the name array. */
  340. BUILD_BUG_ON((VDO_HASH_LOCK_BYPASSING + 1) != ARRAY_SIZE(LOCK_STATE_NAMES));
  341. return (state < ARRAY_SIZE(LOCK_STATE_NAMES)) ? LOCK_STATE_NAMES[state] : "INVALID";
  342. }
  343. /**
  344. * assert_hash_lock_agent() - Assert that a data_vio is the agent of its hash lock, and that this
  345. * is being called in the hash zone.
  346. * @data_vio: The data_vio expected to be the lock agent.
  347. * @where: A string describing the function making the assertion.
  348. */
  349. static void assert_hash_lock_agent(struct data_vio *data_vio, const char *where)
  350. {
  351. /* Not safe to access the agent field except from the hash zone. */
  352. assert_data_vio_in_hash_zone(data_vio);
  353. VDO_ASSERT_LOG_ONLY(data_vio == data_vio->hash_lock->agent,
  354. "%s must be for the hash lock agent", where);
  355. }
  356. /**
  357. * set_duplicate_lock() - Set the duplicate lock held by a hash lock. May only be called in the
  358. * physical zone of the PBN lock.
  359. * @hash_lock: The hash lock to update.
  360. * @pbn_lock: The PBN read lock to use as the duplicate lock.
  361. */
  362. static void set_duplicate_lock(struct hash_lock *hash_lock, struct pbn_lock *pbn_lock)
  363. {
  364. VDO_ASSERT_LOG_ONLY((hash_lock->duplicate_lock == NULL),
  365. "hash lock must not already hold a duplicate lock");
  366. pbn_lock->holder_count += 1;
  367. hash_lock->duplicate_lock = pbn_lock;
  368. }
  369. /**
  370. * dequeue_lock_waiter() - Remove the first data_vio from the lock's waitq and return it.
  371. * @lock: The lock containing the wait queue.
  372. *
  373. * Return: The first (oldest) waiter in the queue, or NULL if the queue is empty.
  374. */
  375. static inline struct data_vio *dequeue_lock_waiter(struct hash_lock *lock)
  376. {
  377. return vdo_waiter_as_data_vio(vdo_waitq_dequeue_waiter(&lock->waiters));
  378. }
  379. /**
  380. * set_hash_lock() - Set, change, or clear the hash lock a data_vio is using.
  381. * @data_vio: The data_vio to update.
  382. * @new_lock: The hash lock the data_vio is joining.
  383. *
  384. * Updates the hash lock (or locks) to reflect the change in membership.
  385. */
  386. static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock)
  387. {
  388. struct hash_lock *old_lock = data_vio->hash_lock;
  389. if (old_lock != NULL) {
  390. VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL,
  391. "must have a hash zone when holding a hash lock");
  392. VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry),
  393. "must be on a hash lock list when holding a hash lock");
  394. VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0,
  395. "hash lock reference must be counted");
  396. if ((old_lock->state != VDO_HASH_LOCK_BYPASSING) &&
  397. (old_lock->state != VDO_HASH_LOCK_UNLOCKING)) {
  398. /*
  399. * If the reference count goes to zero in a non-terminal state, we're most
  400. * likely leaking this lock.
  401. */
  402. VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 1,
  403. "hash locks should only become unreferenced in a terminal state, not state %s",
  404. get_hash_lock_state_name(old_lock->state));
  405. }
  406. list_del_init(&data_vio->hash_lock_entry);
  407. old_lock->reference_count -= 1;
  408. data_vio->hash_lock = NULL;
  409. }
  410. if (new_lock != NULL) {
  411. /*
  412. * Keep all data_vios sharing the lock on a list since they can complete in any
  413. * order and we'll always need a pointer to one to compare data.
  414. */
  415. list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios);
  416. new_lock->reference_count += 1;
  417. if (new_lock->max_references < new_lock->reference_count)
  418. new_lock->max_references = new_lock->reference_count;
  419. data_vio->hash_lock = new_lock;
  420. }
  421. }
/* The state diagram contains loops, so some forward declarations are needed. */
  423. static void start_deduping(struct hash_lock *lock, struct data_vio *agent,
  424. bool agent_is_done);
  425. static void start_locking(struct hash_lock *lock, struct data_vio *agent);
  426. static void start_writing(struct hash_lock *lock, struct data_vio *agent);
  427. static void unlock_duplicate_pbn(struct vdo_completion *completion);
  428. static void transfer_allocation_lock(struct data_vio *data_vio);
  429. /**
  430. * exit_hash_lock() - Bottleneck for data_vios that have written or deduplicated and that are no
  431. * longer needed to be an agent for the hash lock.
  432. * @data_vio: The data_vio to complete and send to be cleaned up.
  433. */
  434. static void exit_hash_lock(struct data_vio *data_vio)
  435. {
  436. /* Release the hash lock now, saving a thread transition in cleanup. */
  437. vdo_release_hash_lock(data_vio);
  438. /* Complete the data_vio and start the clean-up path to release any locks it still holds. */
  439. data_vio->vio.completion.callback = complete_data_vio;
  440. continue_data_vio(data_vio);
  441. }
  442. /**
  443. * set_duplicate_location() - Set the location of the duplicate block for data_vio, updating the
  444. * is_duplicate and duplicate fields from a zoned_pbn.
  445. * @data_vio: The data_vio to modify.
  446. * @source: The location of the duplicate.
  447. */
  448. static void set_duplicate_location(struct data_vio *data_vio,
  449. const struct zoned_pbn source)
  450. {
  451. data_vio->is_duplicate = (source.pbn != VDO_ZERO_BLOCK);
  452. data_vio->duplicate = source;
  453. }
  454. /**
  455. * retire_lock_agent() - Retire the active lock agent, replacing it with the first lock waiter, and
  456. * make the retired agent exit the hash lock.
  457. * @lock: The hash lock to update.
  458. *
  459. * Return: The new lock agent (which will be NULL if there was no waiter)
  460. */
  461. static struct data_vio *retire_lock_agent(struct hash_lock *lock)
  462. {
  463. struct data_vio *old_agent = lock->agent;
  464. struct data_vio *new_agent = dequeue_lock_waiter(lock);
  465. lock->agent = new_agent;
  466. exit_hash_lock(old_agent);
  467. if (new_agent != NULL)
  468. set_duplicate_location(new_agent, lock->duplicate);
  469. return new_agent;
  470. }
  471. /**
  472. * wait_on_hash_lock() - Add a data_vio to the lock's queue of waiters.
  473. * @lock: The hash lock on which to wait.
  474. * @data_vio: The data_vio to add to the queue.
  475. */
  476. static void wait_on_hash_lock(struct hash_lock *lock, struct data_vio *data_vio)
  477. {
  478. vdo_waitq_enqueue_waiter(&lock->waiters, &data_vio->waiter);
  479. /*
  480. * Make sure the agent doesn't block indefinitely in the packer since it now has at least
  481. * one other data_vio waiting on it.
  482. */
  483. if ((lock->state != VDO_HASH_LOCK_WRITING) || !cancel_data_vio_compression(lock->agent))
  484. return;
  485. /*
  486. * Even though we're waiting, we also have to send ourselves as a one-way message to the
  487. * packer to ensure the agent continues executing. This is safe because
  488. * cancel_vio_compression() guarantees the agent won't continue executing until this
  489. * message arrives in the packer, and because the wait queue link isn't used for sending
  490. * the message.
  491. */
  492. data_vio->compression.lock_holder = lock->agent;
  493. launch_data_vio_packer_callback(data_vio, vdo_remove_lock_holder_from_packer);
  494. }
  495. /**
  496. * abort_waiter() - waiter_callback_fn function that shunts waiters to write their blocks without
  497. * optimization.
  498. * @waiter: The data_vio's waiter link.
  499. * @context: Not used.
  500. */
  501. static void abort_waiter(struct vdo_waiter *waiter, void __always_unused *context)
  502. {
  503. write_data_vio(vdo_waiter_as_data_vio(waiter));
  504. }
  505. /**
  506. * start_bypassing() - Stop using the hash lock.
  507. * @lock: The hash lock.
  508. * @agent: The data_vio acting as the agent for the lock.
  509. *
  510. * Stops using the hash lock. This is the final transition for hash locks which did not get an
  511. * error.
  512. */
  513. static void start_bypassing(struct hash_lock *lock, struct data_vio *agent)
  514. {
  515. lock->state = VDO_HASH_LOCK_BYPASSING;
  516. exit_hash_lock(agent);
  517. }
  518. void vdo_clean_failed_hash_lock(struct data_vio *data_vio)
  519. {
  520. struct hash_lock *lock = data_vio->hash_lock;
  521. if (lock->state == VDO_HASH_LOCK_BYPASSING) {
  522. exit_hash_lock(data_vio);
  523. return;
  524. }
  525. if (lock->agent == NULL) {
  526. lock->agent = data_vio;
  527. } else if (data_vio != lock->agent) {
  528. exit_hash_lock(data_vio);
  529. return;
  530. }
  531. lock->state = VDO_HASH_LOCK_BYPASSING;
  532. /* Ensure we don't attempt to update advice when cleaning up. */
  533. lock->update_advice = false;
  534. vdo_waitq_notify_all_waiters(&lock->waiters, abort_waiter, NULL);
  535. if (lock->duplicate_lock != NULL) {
  536. /* The agent must reference the duplicate zone to launch it. */
  537. data_vio->duplicate = lock->duplicate;
  538. launch_data_vio_duplicate_zone_callback(data_vio, unlock_duplicate_pbn);
  539. return;
  540. }
  541. lock->agent = NULL;
  542. data_vio->is_duplicate = false;
  543. exit_hash_lock(data_vio);
  544. }
  545. /**
  546. * finish_unlocking() - Handle the result of the agent for the lock releasing a read lock on
  547. * duplicate candidate.
  548. * @completion: The completion of the data_vio acting as the lock's agent.
  549. *
  550. * This continuation is registered in unlock_duplicate_pbn().
  551. */
  552. static void finish_unlocking(struct vdo_completion *completion)
  553. {
  554. struct data_vio *agent = as_data_vio(completion);
  555. struct hash_lock *lock = agent->hash_lock;
  556. assert_hash_lock_agent(agent, __func__);
  557. VDO_ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
  558. "must have released the duplicate lock for the hash lock");
  559. if (!lock->verified) {
  560. /*
  561. * UNLOCKING -> WRITING transition: The lock we released was on an unverified
  562. * block, so it must have been a lock on advice we were verifying, not on a
  563. * location that was used for deduplication. Go write (or compress) the block to
  564. * get a location to dedupe against.
  565. */
  566. start_writing(lock, agent);
  567. return;
  568. }
  569. /*
  570. * With the lock released, the verified duplicate block may already have changed and will
  571. * need to be re-verified if a waiter arrived.
  572. */
  573. lock->verified = false;
  574. if (vdo_waitq_has_waiters(&lock->waiters)) {
  575. /*
  576. * UNLOCKING -> LOCKING transition: A new data_vio entered the hash lock while the
  577. * agent was releasing the PBN lock. The current agent exits and the waiter has to
  578. * re-lock and re-verify the duplicate location.
  579. *
  580. * TODO: If we used the current agent to re-acquire the PBN lock we wouldn't need
  581. * to re-verify.
  582. */
  583. agent = retire_lock_agent(lock);
  584. start_locking(lock, agent);
  585. return;
  586. }
  587. /*
  588. * UNLOCKING -> BYPASSING transition: The agent is done with the lock and no other
  589. * data_vios reference it, so remove it from the lock map and return it to the pool.
  590. */
  591. start_bypassing(lock, agent);
  592. }
  593. /**
  594. * unlock_duplicate_pbn() - Release a read lock on the PBN of the block that may or may not have
  595. * contained duplicate data.
  596. * @completion: The completion of the data_vio acting as the lock's agent.
  597. *
  598. * This continuation is launched by start_unlocking(), and calls back to finish_unlocking() on the
  599. * hash zone thread.
  600. */
  601. static void unlock_duplicate_pbn(struct vdo_completion *completion)
  602. {
  603. struct data_vio *agent = as_data_vio(completion);
  604. struct hash_lock *lock = agent->hash_lock;
  605. assert_data_vio_in_duplicate_zone(agent);
  606. VDO_ASSERT_LOG_ONLY(lock->duplicate_lock != NULL,
  607. "must have a duplicate lock to release");
  608. vdo_release_physical_zone_pbn_lock(agent->duplicate.zone, agent->duplicate.pbn,
  609. vdo_forget(lock->duplicate_lock));
  610. if (lock->state == VDO_HASH_LOCK_BYPASSING) {
  611. complete_data_vio(completion);
  612. return;
  613. }
  614. launch_data_vio_hash_zone_callback(agent, finish_unlocking);
  615. }
  616. /**
  617. * start_unlocking() - Release a read lock on the PBN of the block that may or may not have
  618. * contained duplicate data.
  619. * @lock: The hash lock.
  620. * @agent: The data_vio currently acting as the agent for the lock.
  621. */
  622. static void start_unlocking(struct hash_lock *lock, struct data_vio *agent)
  623. {
  624. lock->state = VDO_HASH_LOCK_UNLOCKING;
  625. launch_data_vio_duplicate_zone_callback(agent, unlock_duplicate_pbn);
  626. }
  627. static void release_context(struct dedupe_context *context)
  628. {
  629. struct hash_zone *zone = context->zone;
  630. WRITE_ONCE(zone->active, zone->active - 1);
  631. list_move(&context->list_entry, &zone->available);
  632. }
  633. static void process_update_result(struct data_vio *agent)
  634. {
  635. struct dedupe_context *context = agent->dedupe_context;
  636. if ((context == NULL) ||
  637. !change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE))
  638. return;
  639. agent->dedupe_context = NULL;
  640. release_context(context);
  641. }
  642. /**
  643. * finish_updating() - Process the result of a UDS update performed by the agent for the lock.
  644. * @completion: The completion of the data_vio that performed the update
  645. *
  646. * This continuation is registered in start_querying().
  647. */
  648. static void finish_updating(struct vdo_completion *completion)
  649. {
  650. struct data_vio *agent = as_data_vio(completion);
  651. struct hash_lock *lock = agent->hash_lock;
  652. assert_hash_lock_agent(agent, __func__);
  653. process_update_result(agent);
  654. /*
  655. * UDS was updated successfully, so don't update again unless the duplicate location
  656. * changes due to rollover.
  657. */
  658. lock->update_advice = false;
  659. if (vdo_waitq_has_waiters(&lock->waiters)) {
  660. /*
  661. * UPDATING -> DEDUPING transition: A new data_vio arrived during the UDS update.
  662. * Send it on the verified dedupe path. The agent is done with the lock, but the
  663. * lock may still need to use it to clean up after rollover.
  664. */
  665. start_deduping(lock, agent, true);
  666. return;
  667. }
  668. if (lock->duplicate_lock != NULL) {
  669. /*
  670. * UPDATING -> UNLOCKING transition: No one is waiting to dedupe, but we hold a
  671. * duplicate PBN lock, so go release it.
  672. */
  673. start_unlocking(lock, agent);
  674. return;
  675. }
  676. /*
  677. * UPDATING -> BYPASSING transition: No one is waiting to dedupe and there's no lock to
  678. * release.
  679. */
  680. start_bypassing(lock, agent);
  681. }
  682. static void query_index(struct data_vio *data_vio, enum uds_request_type operation);
  683. /**
  684. * start_updating() - Continue deduplication with the last step, updating UDS with the location of
  685. * the duplicate that should be returned as advice in the future.
  686. * @lock: The hash lock.
  687. * @agent: The data_vio currently acting as the agent for the lock.
  688. */
  689. static void start_updating(struct hash_lock *lock, struct data_vio *agent)
  690. {
  691. lock->state = VDO_HASH_LOCK_UPDATING;
  692. VDO_ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified");
  693. VDO_ASSERT_LOG_ONLY(lock->update_advice, "should only update advice if needed");
  694. agent->last_async_operation = VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX;
  695. set_data_vio_hash_zone_callback(agent, finish_updating);
  696. query_index(agent, UDS_UPDATE);
  697. }
  698. /**
  699. * finish_deduping() - Handle a data_vio that has finished deduplicating against the block locked
  700. * by the hash lock.
  701. * @lock: The hash lock.
  702. * @data_vio: The lock holder that has finished deduplicating.
  703. *
  704. * If there are other data_vios still sharing the lock, this will just release the data_vio's share
  705. * of the lock and finish processing the data_vio. If this is the last data_vio holding the lock,
  706. * this makes the data_vio the lock agent and uses it to advance the state of the lock so it can
  707. * eventually be released.
  708. */
  709. static void finish_deduping(struct hash_lock *lock, struct data_vio *data_vio)
  710. {
  711. struct data_vio *agent = data_vio;
  712. VDO_ASSERT_LOG_ONLY(lock->agent == NULL, "shouldn't have an agent in DEDUPING");
  713. VDO_ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&lock->waiters),
  714. "shouldn't have any lock waiters in DEDUPING");
  715. /* Just release the lock reference if other data_vios are still deduping. */
  716. if (lock->reference_count > 1) {
  717. exit_hash_lock(data_vio);
  718. return;
  719. }
  720. /* The hash lock must have an agent for all other lock states. */
  721. lock->agent = agent;
  722. if (lock->update_advice) {
  723. /*
  724. * DEDUPING -> UPDATING transition: The location of the duplicate block changed
  725. * since the initial UDS query because of compression, rollover, or because the
  726. * query agent didn't have an allocation. The UDS update was delayed in case there
  727. * was another change in location, but with only this data_vio using the hash lock,
  728. * it's time to update the advice.
  729. */
  730. start_updating(lock, agent);
  731. } else {
  732. /*
  733. * DEDUPING -> UNLOCKING transition: Release the PBN read lock on the duplicate
  734. * location so the hash lock itself can be released (contingent on no new data_vios
  735. * arriving in the lock before the agent returns).
  736. */
  737. start_unlocking(lock, agent);
  738. }
  739. }
  740. /**
  741. * acquire_lock() - Get the lock for a record name.
  742. * @zone: The zone responsible for the hash.
  743. * @hash: The hash to lock.
  744. * @replace_lock: If non-NULL, the lock already registered for the hash which should be replaced by
  745. * the new lock.
  746. * @lock_ptr: A pointer to receive the hash lock.
  747. *
  748. * Gets the lock for the hash (record name) of the data in a data_vio, or if one does not exist (or
  749. * if we are explicitly rolling over), initialize a new lock for the hash and register it in the
  750. * zone. This must only be called in the correct thread for the zone.
  751. *
  752. * Return: VDO_SUCCESS or an error code.
  753. */
  754. static int __must_check acquire_lock(struct hash_zone *zone,
  755. const struct uds_record_name *hash,
  756. struct hash_lock *replace_lock,
  757. struct hash_lock **lock_ptr)
  758. {
  759. struct hash_lock *lock, *new_lock;
  760. int result;
  761. /*
  762. * Borrow and prepare a lock from the pool so we don't have to do two int_map accesses
  763. * in the common case of no lock contention.
  764. */
  765. result = VDO_ASSERT(!list_empty(&zone->lock_pool),
  766. "never need to wait for a free hash lock");
  767. if (result != VDO_SUCCESS)
  768. return result;
  769. new_lock = list_entry(zone->lock_pool.prev, struct hash_lock, pool_node);
  770. list_del_init(&new_lock->pool_node);
  771. /*
  772. * Fill in the hash of the new lock so we can map it, since we have to use the hash as the
  773. * map key.
  774. */
  775. new_lock->hash = *hash;
  776. result = vdo_int_map_put(zone->hash_lock_map, hash_lock_key(new_lock),
  777. new_lock, (replace_lock != NULL), (void **) &lock);
  778. if (result != VDO_SUCCESS) {
  779. return_hash_lock_to_pool(zone, vdo_forget(new_lock));
  780. return result;
  781. }
  782. if (replace_lock != NULL) {
  783. /* On mismatch put the old lock back and return a severe error */
  784. VDO_ASSERT_LOG_ONLY(lock == replace_lock,
  785. "old lock must have been in the lock map");
  786. /* TODO: Check earlier and bail out? */
  787. VDO_ASSERT_LOG_ONLY(replace_lock->registered,
  788. "old lock must have been marked registered");
  789. replace_lock->registered = false;
  790. }
  791. if (lock == replace_lock) {
  792. lock = new_lock;
  793. lock->registered = true;
  794. } else {
  795. /* There's already a lock for the hash, so we don't need the borrowed lock. */
  796. return_hash_lock_to_pool(zone, vdo_forget(new_lock));
  797. }
  798. *lock_ptr = lock;
  799. return VDO_SUCCESS;
  800. }
  801. /**
  802. * enter_forked_lock() - Bind the data_vio to a new hash lock.
  803. * @waiter: The data_vio's waiter link.
  804. * @context: The new hash lock.
  805. *
  806. * Implements waiter_callback_fn. Binds the data_vio that was waiting to a new hash lock and waits
  807. * on that lock.
  808. */
  809. static void enter_forked_lock(struct vdo_waiter *waiter, void *context)
  810. {
  811. struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
  812. struct hash_lock *new_lock = context;
  813. set_hash_lock(data_vio, new_lock);
  814. wait_on_hash_lock(new_lock, data_vio);
  815. }
  816. /**
  817. * fork_hash_lock() - Fork a hash lock because it has run out of increments on the duplicate PBN.
  818. * @old_lock: The hash lock to fork.
  819. * @new_agent: The data_vio that will be the agent for the new lock.
  820. *
  821. * Transfers the new agent and any lock waiters to a new hash lock instance which takes the place
  822. * of the old lock in the lock map. The old lock remains active, but will not update advice.
  823. */
  824. static void fork_hash_lock(struct hash_lock *old_lock, struct data_vio *new_agent)
  825. {
  826. struct hash_lock *new_lock;
  827. int result;
  828. result = acquire_lock(new_agent->hash_zone, &new_agent->record_name, old_lock,
  829. &new_lock);
  830. if (result != VDO_SUCCESS) {
  831. continue_data_vio_with_error(new_agent, result);
  832. return;
  833. }
  834. /*
  835. * Only one of the two locks should update UDS. The old lock is out of references, so it
  836. * would be poor dedupe advice in the short term.
  837. */
  838. old_lock->update_advice = false;
  839. new_lock->update_advice = true;
  840. set_hash_lock(new_agent, new_lock);
  841. new_lock->agent = new_agent;
  842. vdo_waitq_notify_all_waiters(&old_lock->waiters, enter_forked_lock, new_lock);
  843. new_agent->is_duplicate = false;
  844. start_writing(new_lock, new_agent);
  845. }
  846. /**
  847. * launch_dedupe() - Reserve a reference count increment for a data_vio and launch it on the dedupe
  848. * path.
  849. * @lock: The hash lock.
  850. * @data_vio: The data_vio to deduplicate using the hash lock.
  851. * @has_claim: True if the data_vio already has claimed an increment from the duplicate lock.
  852. *
  853. * If no increments are available, this will roll over to a new hash lock and launch the data_vio
  854. * as the writing agent for that lock.
  855. */
  856. static void launch_dedupe(struct hash_lock *lock, struct data_vio *data_vio,
  857. bool has_claim)
  858. {
  859. if (!has_claim && !vdo_claim_pbn_lock_increment(lock->duplicate_lock)) {
  860. /* Out of increments, so must roll over to a new lock. */
  861. fork_hash_lock(lock, data_vio);
  862. return;
  863. }
  864. /* Deduplicate against the lock's verified location. */
  865. set_duplicate_location(data_vio, lock->duplicate);
  866. data_vio->new_mapped = data_vio->duplicate;
  867. update_metadata_for_data_vio_write(data_vio, lock->duplicate_lock);
  868. }
  869. /**
  870. * start_deduping() - Enter the hash lock state where data_vios deduplicate in parallel against a
  871. * true copy of their data on disk.
  872. * @lock: The hash lock.
  873. * @agent: The data_vio acting as the agent for the lock.
  874. * @agent_is_done: True only if the agent has already written or deduplicated against its data.
  875. *
  876. * If the agent itself needs to deduplicate, an increment for it must already have been claimed
  877. * from the duplicate lock, ensuring the hash lock will still have a data_vio holding it.
  878. */
  879. static void start_deduping(struct hash_lock *lock, struct data_vio *agent,
  880. bool agent_is_done)
  881. {
  882. lock->state = VDO_HASH_LOCK_DEDUPING;
  883. /*
  884. * We don't take the downgraded allocation lock from the agent unless we actually need to
  885. * deduplicate against it.
  886. */
  887. if (lock->duplicate_lock == NULL) {
  888. VDO_ASSERT_LOG_ONLY(!vdo_is_state_compressed(agent->new_mapped.state),
  889. "compression must have shared a lock");
  890. VDO_ASSERT_LOG_ONLY(agent_is_done,
  891. "agent must have written the new duplicate");
  892. transfer_allocation_lock(agent);
  893. }
  894. VDO_ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(lock->duplicate_lock),
  895. "duplicate_lock must be a PBN read lock");
  896. /*
  897. * This state is not like any of the other states. There is no designated agent--the agent
  898. * transitioning to this state and all the waiters will be launched to deduplicate in
  899. * parallel.
  900. */
  901. lock->agent = NULL;
  902. /*
  903. * Launch the agent (if not already deduplicated) and as many lock waiters as we have
  904. * available increments for on the dedupe path. If we run out of increments, rollover will
  905. * be triggered and the remaining waiters will be transferred to the new lock.
  906. */
  907. if (!agent_is_done) {
  908. launch_dedupe(lock, agent, true);
  909. agent = NULL;
  910. }
  911. while (vdo_waitq_has_waiters(&lock->waiters))
  912. launch_dedupe(lock, dequeue_lock_waiter(lock), false);
  913. if (agent_is_done) {
  914. /*
  915. * In the degenerate case where all the waiters rolled over to a new lock, this
  916. * will continue to use the old agent to clean up this lock, and otherwise it just
  917. * lets the agent exit the lock.
  918. */
  919. finish_deduping(lock, agent);
  920. }
  921. }
  922. /**
  923. * increment_stat() - Increment a statistic counter in a non-atomic yet thread-safe manner.
  924. * @stat: The statistic field to increment.
  925. */
  926. static inline void increment_stat(u64 *stat)
  927. {
  928. /*
  929. * Must only be mutated on the hash zone thread. Prevents any compiler shenanigans from
  930. * affecting other threads reading stats.
  931. */
  932. WRITE_ONCE(*stat, *stat + 1);
  933. }
  934. /**
  935. * finish_verifying() - Handle the result of the agent for the lock comparing its data to the
  936. * duplicate candidate.
  937. * @completion: The completion of the data_vio used to verify dedupe
  938. *
  939. * This continuation is registered in start_verifying().
  940. */
  941. static void finish_verifying(struct vdo_completion *completion)
  942. {
  943. struct data_vio *agent = as_data_vio(completion);
  944. struct hash_lock *lock = agent->hash_lock;
  945. assert_hash_lock_agent(agent, __func__);
  946. lock->verified = agent->is_duplicate;
  947. /*
  948. * Only count the result of the initial verification of the advice as valid or stale, and
  949. * not any re-verifications due to PBN lock releases.
  950. */
  951. if (!lock->verify_counted) {
  952. lock->verify_counted = true;
  953. if (lock->verified)
  954. increment_stat(&agent->hash_zone->statistics.dedupe_advice_valid);
  955. else
  956. increment_stat(&agent->hash_zone->statistics.dedupe_advice_stale);
  957. }
  958. /*
  959. * Even if the block is a verified duplicate, we can't start to deduplicate unless we can
  960. * claim a reference count increment for the agent.
  961. */
  962. if (lock->verified && !vdo_claim_pbn_lock_increment(lock->duplicate_lock)) {
  963. agent->is_duplicate = false;
  964. lock->verified = false;
  965. }
  966. if (lock->verified) {
  967. /*
  968. * VERIFYING -> DEDUPING transition: The advice is for a true duplicate, so start
  969. * deduplicating against it, if references are available.
  970. */
  971. start_deduping(lock, agent, false);
  972. } else {
  973. /*
  974. * VERIFYING -> UNLOCKING transition: Either the verify failed or we'd try to
  975. * dedupe and roll over immediately, which would fail because it would leave the
  976. * lock without an agent to release the PBN lock. In both cases, the data will have
  977. * to be written or compressed, but first the advice PBN must be unlocked by the
  978. * VERIFYING agent.
  979. */
  980. lock->update_advice = true;
  981. start_unlocking(lock, agent);
  982. }
  983. }
  984. static bool blocks_equal(char *block1, char *block2)
  985. {
  986. int i;
  987. for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64)) {
  988. if (*((u64 *) &block1[i]) != *((u64 *) &block2[i]))
  989. return false;
  990. }
  991. return true;
  992. }
  993. static void verify_callback(struct vdo_completion *completion)
  994. {
  995. struct data_vio *agent = as_data_vio(completion);
  996. agent->is_duplicate = blocks_equal(agent->vio.data, agent->scratch_block);
  997. launch_data_vio_hash_zone_callback(agent, finish_verifying);
  998. }
  999. static void uncompress_and_verify(struct vdo_completion *completion)
  1000. {
  1001. struct data_vio *agent = as_data_vio(completion);
  1002. int result;
  1003. result = uncompress_data_vio(agent, agent->duplicate.state,
  1004. agent->scratch_block);
  1005. if (result == VDO_SUCCESS) {
  1006. verify_callback(completion);
  1007. return;
  1008. }
  1009. agent->is_duplicate = false;
  1010. launch_data_vio_hash_zone_callback(agent, finish_verifying);
  1011. }
  1012. static void verify_endio(struct bio *bio)
  1013. {
  1014. struct data_vio *agent = vio_as_data_vio(bio->bi_private);
  1015. int result = blk_status_to_errno(bio->bi_status);
  1016. vdo_count_completed_bios(bio);
  1017. if (result != VDO_SUCCESS) {
  1018. agent->is_duplicate = false;
  1019. launch_data_vio_hash_zone_callback(agent, finish_verifying);
  1020. return;
  1021. }
  1022. if (vdo_is_state_compressed(agent->duplicate.state)) {
  1023. launch_data_vio_cpu_callback(agent, uncompress_and_verify,
  1024. CPU_Q_COMPRESS_BLOCK_PRIORITY);
  1025. return;
  1026. }
  1027. launch_data_vio_cpu_callback(agent, verify_callback,
  1028. CPU_Q_COMPLETE_READ_PRIORITY);
  1029. }
  1030. /**
  1031. * start_verifying() - Begin the data verification phase.
  1032. * @lock: The hash lock (must be LOCKING).
  1033. * @agent: The data_vio to use to read and compare candidate data.
  1034. *
  1035. * Continue the deduplication path for a hash lock by using the agent to read (and possibly
  1036. * decompress) the data at the candidate duplicate location, comparing it to the data in the agent
  1037. * to verify that the candidate is identical to all the data_vios sharing the hash. If so, it can
  1038. * be deduplicated against, otherwise a data_vio allocation will have to be written to and used for
  1039. * dedupe.
  1040. */
  1041. static void start_verifying(struct hash_lock *lock, struct data_vio *agent)
  1042. {
  1043. int result;
  1044. struct vio *vio = &agent->vio;
  1045. char *buffer = (vdo_is_state_compressed(agent->duplicate.state) ?
  1046. (char *) agent->compression.block :
  1047. agent->scratch_block);
  1048. lock->state = VDO_HASH_LOCK_VERIFYING;
  1049. VDO_ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once");
  1050. agent->last_async_operation = VIO_ASYNC_OP_VERIFY_DUPLICATION;
  1051. result = vio_reset_bio(vio, buffer, verify_endio, REQ_OP_READ,
  1052. agent->duplicate.pbn);
  1053. if (result != VDO_SUCCESS) {
  1054. set_data_vio_hash_zone_callback(agent, finish_verifying);
  1055. continue_data_vio_with_error(agent, result);
  1056. return;
  1057. }
  1058. set_data_vio_bio_zone_callback(agent, vdo_submit_vio);
  1059. vdo_launch_completion_with_priority(&vio->completion, BIO_Q_VERIFY_PRIORITY);
  1060. }
  1061. /**
  1062. * finish_locking() - Handle the result of the agent for the lock attempting to obtain a PBN read
  1063. * lock on the candidate duplicate block.
  1064. * @completion: The completion of the data_vio that attempted to get the read lock.
  1065. *
  1066. * This continuation is registered in lock_duplicate_pbn().
  1067. */
  1068. static void finish_locking(struct vdo_completion *completion)
  1069. {
  1070. struct data_vio *agent = as_data_vio(completion);
  1071. struct hash_lock *lock = agent->hash_lock;
  1072. assert_hash_lock_agent(agent, __func__);
  1073. if (!agent->is_duplicate) {
  1074. VDO_ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
  1075. "must not hold duplicate_lock if not flagged as a duplicate");
  1076. /*
  1077. * LOCKING -> WRITING transition: The advice block is being modified or has no
  1078. * available references, so try to write or compress the data, remembering to
  1079. * update UDS later with the new advice.
  1080. */
  1081. increment_stat(&agent->hash_zone->statistics.dedupe_advice_stale);
  1082. lock->update_advice = true;
  1083. start_writing(lock, agent);
  1084. return;
  1085. }
  1086. VDO_ASSERT_LOG_ONLY(lock->duplicate_lock != NULL,
  1087. "must hold duplicate_lock if flagged as a duplicate");
  1088. if (!lock->verified) {
  1089. /*
  1090. * LOCKING -> VERIFYING transition: Continue on the unverified dedupe path, reading
  1091. * the candidate duplicate and comparing it to the agent's data to decide whether
  1092. * it is a true duplicate or stale advice.
  1093. */
  1094. start_verifying(lock, agent);
  1095. return;
  1096. }
  1097. if (!vdo_claim_pbn_lock_increment(lock->duplicate_lock)) {
  1098. /*
  1099. * LOCKING -> UNLOCKING transition: The verified block was re-locked, but has no
  1100. * available increments left. Must first release the useless PBN read lock before
  1101. * rolling over to a new copy of the block.
  1102. */
  1103. agent->is_duplicate = false;
  1104. lock->verified = false;
  1105. lock->update_advice = true;
  1106. start_unlocking(lock, agent);
  1107. return;
  1108. }
  1109. /*
  1110. * LOCKING -> DEDUPING transition: Continue on the verified dedupe path, deduplicating
  1111. * against a location that was previously verified or written to.
  1112. */
  1113. start_deduping(lock, agent, false);
  1114. }
  1115. static bool acquire_provisional_reference(struct data_vio *agent, struct pbn_lock *lock,
  1116. struct slab_depot *depot)
  1117. {
  1118. /* Ensure that the newly-locked block is referenced. */
  1119. struct vdo_slab *slab = vdo_get_slab(depot, agent->duplicate.pbn);
  1120. int result = vdo_acquire_provisional_reference(slab, agent->duplicate.pbn, lock);
  1121. if (result == VDO_SUCCESS)
  1122. return true;
  1123. vdo_log_warning_strerror(result,
  1124. "Error acquiring provisional reference for dedupe candidate; aborting dedupe");
  1125. agent->is_duplicate = false;
  1126. vdo_release_physical_zone_pbn_lock(agent->duplicate.zone,
  1127. agent->duplicate.pbn, lock);
  1128. continue_data_vio_with_error(agent, result);
  1129. return false;
  1130. }
  1131. /**
  1132. * lock_duplicate_pbn() - Acquire a read lock on the PBN of the block containing candidate
  1133. * duplicate data (compressed or uncompressed).
  1134. * @completion: The completion of the data_vio attempting to acquire the physical block lock on
  1135. * behalf of its hash lock.
  1136. *
  1137. * If the PBN is already locked for writing, the lock attempt is abandoned and is_duplicate will be
  1138. * cleared before calling back. This continuation is launched from start_locking(), and calls back
  1139. * to finish_locking() on the hash zone thread.
  1140. */
  1141. static void lock_duplicate_pbn(struct vdo_completion *completion)
  1142. {
  1143. unsigned int increment_limit;
  1144. struct pbn_lock *lock;
  1145. int result;
  1146. struct data_vio *agent = as_data_vio(completion);
  1147. struct slab_depot *depot = vdo_from_data_vio(agent)->depot;
  1148. struct physical_zone *zone = agent->duplicate.zone;
  1149. assert_data_vio_in_duplicate_zone(agent);
  1150. set_data_vio_hash_zone_callback(agent, finish_locking);
  1151. /*
  1152. * While in the zone that owns it, find out how many additional references can be made to
  1153. * the block if it turns out to truly be a duplicate.
  1154. */
  1155. increment_limit = vdo_get_increment_limit(depot, agent->duplicate.pbn);
  1156. if (increment_limit == 0) {
  1157. /*
  1158. * We could deduplicate against it later if a reference happened to be released
  1159. * during verification, but it's probably better to bail out now.
  1160. */
  1161. agent->is_duplicate = false;
  1162. continue_data_vio(agent);
  1163. return;
  1164. }
  1165. result = vdo_attempt_physical_zone_pbn_lock(zone, agent->duplicate.pbn,
  1166. VIO_READ_LOCK, &lock);
  1167. if (result != VDO_SUCCESS) {
  1168. continue_data_vio_with_error(agent, result);
  1169. return;
  1170. }
  1171. if (!vdo_is_pbn_read_lock(lock)) {
  1172. /*
  1173. * There are three cases of write locks: uncompressed data block writes, compressed
  1174. * (packed) block writes, and block map page writes. In all three cases, we give up
  1175. * on trying to verify the advice and don't bother to try deduplicate against the
  1176. * data in the write lock holder.
  1177. *
  1178. * 1) We don't ever want to try to deduplicate against a block map page.
  1179. *
  1180. * 2a) It's very unlikely we'd deduplicate against an entire packed block, both
  1181. * because of the chance of matching it, and because we don't record advice for it,
  1182. * but for the uncompressed representation of all the fragments it contains. The
  1183. * only way we'd be getting lock contention is if we've written the same
  1184. * representation coincidentally before, had it become unreferenced, and it just
  1185. * happened to be packed together from compressed writes when we go to verify the
  1186. * lucky advice. Giving up is a minuscule loss of potential dedupe.
  1187. *
  1188. * 2b) If the advice is for a slot of a compressed block, it's about to get
  1189. * smashed, and the write smashing it cannot contain our data--it would have to be
  1190. * writing on behalf of our hash lock, but that's impossible since we're the lock
  1191. * agent.
  1192. *
  1193. * 3a) If the lock is held by a data_vio with different data, the advice is already
  1194. * stale or is about to become stale.
  1195. *
  1196. * 3b) If the lock is held by a data_vio that matches us, we may as well either
  1197. * write it ourselves (or reference the copy we already wrote) instead of
  1198. * potentially having many duplicates wait for the lock holder to write, journal,
  1199. * hash, and finally arrive in the hash lock. We lose a chance to avoid a UDS
  1200. * update in the very rare case of advice for a free block that just happened to be
  1201. * allocated to a data_vio with the same hash. There's also a chance to save on a
  1202. * block write, at the cost of a block verify. Saving on a full block compare in
  1203. * all stale advice cases almost certainly outweighs saving a UDS update and
  1204. * trading a write for a read in a lucky case where advice would have been saved
  1205. * from becoming stale.
  1206. */
  1207. agent->is_duplicate = false;
  1208. continue_data_vio(agent);
  1209. return;
  1210. }
  1211. if (lock->holder_count == 0) {
  1212. if (!acquire_provisional_reference(agent, lock, depot))
  1213. return;
  1214. /*
  1215. * The increment limit we grabbed earlier is still valid. The lock now holds the
  1216. * rights to acquire all those references. Those rights will be claimed by hash
  1217. * locks sharing this read lock.
  1218. */
  1219. lock->increment_limit = increment_limit;
  1220. }
  1221. /*
  1222. * We've successfully acquired a read lock on behalf of the hash lock, so mark it as such.
  1223. */
  1224. set_duplicate_lock(agent->hash_lock, lock);
  1225. /*
  1226. * TODO: Optimization: We could directly launch the block verify, then switch to a hash
  1227. * thread.
  1228. */
  1229. continue_data_vio(agent);
  1230. }
  1231. /**
  1232. * start_locking() - Continue deduplication for a hash lock that has obtained valid advice of a
  1233. * potential duplicate through its agent.
  1234. * @lock: The hash lock (currently must be QUERYING).
  1235. * @agent: The data_vio bearing the dedupe advice.
  1236. */
  1237. static void start_locking(struct hash_lock *lock, struct data_vio *agent)
  1238. {
  1239. VDO_ASSERT_LOG_ONLY(lock->duplicate_lock == NULL,
  1240. "must not acquire a duplicate lock when already holding it");
  1241. lock->state = VDO_HASH_LOCK_LOCKING;
  1242. /*
  1243. * TODO: Optimization: If we arrange to continue on the duplicate zone thread when
  1244. * accepting the advice, and don't explicitly change lock states (or use an agent-local
  1245. * state, or an atomic), we can avoid a thread transition here.
  1246. */
  1247. agent->last_async_operation = VIO_ASYNC_OP_LOCK_DUPLICATE_PBN;
  1248. launch_data_vio_duplicate_zone_callback(agent, lock_duplicate_pbn);
  1249. }
  1250. /**
  1251. * finish_writing() - Re-entry point for the lock agent after it has finished writing or
  1252. * compressing its copy of the data block.
  1253. * @lock: The hash lock, which must be in state WRITING.
  1254. * @agent: The data_vio that wrote its data for the lock.
  1255. *
  1256. * The agent will never need to dedupe against anything, so it's done with the lock, but the lock
  1257. * may not be finished with it, as a UDS update might still be needed.
  1258. *
  1259. * If there are other lock holders, the agent will hand the job to one of them and exit, leaving
  1260. * the lock to deduplicate against the just-written block. If there are no other lock holders, the
  1261. * agent either exits (and later tears down the hash lock), or it remains the agent and updates
  1262. * UDS.
  1263. */
  1264. static void finish_writing(struct hash_lock *lock, struct data_vio *agent)
  1265. {
  1266. /*
  1267. * Dedupe against the data block or compressed block slot the agent wrote. Since we know
  1268. * the write succeeded, there's no need to verify it.
  1269. */
  1270. lock->duplicate = agent->new_mapped;
  1271. lock->verified = true;
  1272. if (vdo_is_state_compressed(lock->duplicate.state) && lock->registered) {
  1273. /*
  1274. * Compression means the location we gave in the UDS query is not the location
  1275. * we're using to deduplicate.
  1276. */
  1277. lock->update_advice = true;
  1278. }
  1279. /* If there are any waiters, we need to start deduping them. */
  1280. if (vdo_waitq_has_waiters(&lock->waiters)) {
  1281. /*
  1282. * WRITING -> DEDUPING transition: an asynchronously-written block failed to
  1283. * compress, so the PBN lock on the written copy was already transferred. The agent
  1284. * is done with the lock, but the lock may still need to use it to clean up after
  1285. * rollover.
  1286. */
  1287. start_deduping(lock, agent, true);
  1288. return;
  1289. }
  1290. /*
  1291. * There are no waiters and the agent has successfully written, so take a step towards
  1292. * being able to release the hash lock (or just release it).
  1293. */
  1294. if (lock->update_advice) {
  1295. /*
  1296. * WRITING -> UPDATING transition: There's no waiter and a UDS update is needed, so
  1297. * retain the WRITING agent and use it to launch the update. The happens on
  1298. * compression, rollover, or the QUERYING agent not having an allocation.
  1299. */
  1300. start_updating(lock, agent);
  1301. } else if (lock->duplicate_lock != NULL) {
  1302. /*
  1303. * WRITING -> UNLOCKING transition: There's no waiter and no update needed, but the
  1304. * compressed write gave us a shared duplicate lock that we must release.
  1305. */
  1306. set_duplicate_location(agent, lock->duplicate);
  1307. start_unlocking(lock, agent);
  1308. } else {
  1309. /*
  1310. * WRITING -> BYPASSING transition: There's no waiter, no update needed, and no
  1311. * duplicate lock held, so both the agent and lock have no more work to do. The
  1312. * agent will release its allocation lock in cleanup.
  1313. */
  1314. start_bypassing(lock, agent);
  1315. }
  1316. }
  1317. /**
  1318. * select_writing_agent() - Search through the lock waiters for a data_vio that has an allocation.
  1319. * @lock: The hash lock to modify.
  1320. *
  1321. * If an allocation is found, swap agents, put the old agent at the head of the wait queue, then
  1322. * return the new agent. Otherwise, just return the current agent.
  1323. */
  1324. static struct data_vio *select_writing_agent(struct hash_lock *lock)
  1325. {
  1326. struct vdo_wait_queue temp_queue;
  1327. struct data_vio *data_vio;
  1328. vdo_waitq_init(&temp_queue);
  1329. /*
  1330. * Move waiters to the temp queue one-by-one until we find an allocation. Not ideal to
  1331. * search, but it only happens when nearly out of space.
  1332. */
  1333. while (((data_vio = dequeue_lock_waiter(lock)) != NULL) &&
  1334. !data_vio_has_allocation(data_vio)) {
  1335. /* Use the lower-level enqueue since we're just moving waiters around. */
  1336. vdo_waitq_enqueue_waiter(&temp_queue, &data_vio->waiter);
  1337. }
  1338. if (data_vio != NULL) {
  1339. /*
  1340. * Move the rest of the waiters over to the temp queue, preserving the order they
  1341. * arrived at the lock.
  1342. */
  1343. vdo_waitq_transfer_all_waiters(&lock->waiters, &temp_queue);
  1344. /*
  1345. * The current agent is being replaced and will have to wait to dedupe; make it the
  1346. * first waiter since it was the first to reach the lock.
  1347. */
  1348. vdo_waitq_enqueue_waiter(&lock->waiters, &lock->agent->waiter);
  1349. lock->agent = data_vio;
  1350. } else {
  1351. /* No one has an allocation, so keep the current agent. */
  1352. data_vio = lock->agent;
  1353. }
  1354. /* Swap all the waiters back onto the lock's queue. */
  1355. vdo_waitq_transfer_all_waiters(&temp_queue, &lock->waiters);
  1356. return data_vio;
  1357. }
  1358. /**
  1359. * start_writing() - Begin the non-duplicate write path.
  1360. * @lock: The hash lock (currently must be QUERYING).
  1361. * @agent: The data_vio currently acting as the agent for the lock.
  1362. *
  1363. * Begins the non-duplicate write path for a hash lock that had no advice, selecting a data_vio
  1364. * with an allocation as a new agent, if necessary, then resuming the agent on the data_vio write
  1365. * path.
  1366. */
  1367. static void start_writing(struct hash_lock *lock, struct data_vio *agent)
  1368. {
  1369. lock->state = VDO_HASH_LOCK_WRITING;
  1370. /*
  1371. * The agent might not have received an allocation and so can't be used for writing, but
  1372. * it's entirely possible that one of the waiters did.
  1373. */
  1374. if (!data_vio_has_allocation(agent)) {
  1375. agent = select_writing_agent(lock);
  1376. /* If none of the waiters had an allocation, the writes all have to fail. */
  1377. if (!data_vio_has_allocation(agent)) {
  1378. /*
  1379. * TODO: Should we keep a variant of BYPASSING that causes new arrivals to
  1380. * fail immediately if they don't have an allocation? It might be possible
  1381. * that on some path there would be non-waiters still referencing the lock,
  1382. * so it would remain in the map as everything is currently spelled, even
  1383. * if the agent and all waiters release.
  1384. */
  1385. continue_data_vio_with_error(agent, VDO_NO_SPACE);
  1386. return;
  1387. }
  1388. }
  1389. /*
  1390. * If the agent compresses, it might wait indefinitely in the packer, which would be bad if
  1391. * there are any other data_vios waiting.
  1392. */
  1393. if (vdo_waitq_has_waiters(&lock->waiters))
  1394. cancel_data_vio_compression(agent);
  1395. /*
  1396. * Send the agent to the compress/pack/write path in vioWrite. If it succeeds, it will
  1397. * return to the hash lock via vdo_continue_hash_lock() and call finish_writing().
  1398. */
  1399. launch_compress_data_vio(agent);
  1400. }
  1401. /*
  1402. * Decode VDO duplicate advice from the old_metadata field of a UDS request.
  1403. * Returns true if valid advice was found and decoded
  1404. */
  1405. static bool decode_uds_advice(struct dedupe_context *context)
  1406. {
  1407. const struct uds_request *request = &context->request;
  1408. struct data_vio *data_vio = context->requestor;
  1409. size_t offset = 0;
  1410. const struct uds_record_data *encoding = &request->old_metadata;
  1411. struct vdo *vdo = vdo_from_data_vio(data_vio);
  1412. struct zoned_pbn *advice = &data_vio->duplicate;
  1413. u8 version;
  1414. int result;
  1415. if ((request->status != UDS_SUCCESS) || !request->found)
  1416. return false;
  1417. version = encoding->data[offset++];
  1418. if (version != UDS_ADVICE_VERSION) {
  1419. vdo_log_error("invalid UDS advice version code %u", version);
  1420. return false;
  1421. }
  1422. advice->state = encoding->data[offset++];
  1423. advice->pbn = get_unaligned_le64(&encoding->data[offset]);
  1424. offset += sizeof(u64);
  1425. BUG_ON(offset != UDS_ADVICE_SIZE);
  1426. /* Don't use advice that's clearly meaningless. */
  1427. if ((advice->state == VDO_MAPPING_STATE_UNMAPPED) || (advice->pbn == VDO_ZERO_BLOCK)) {
  1428. vdo_log_debug("Invalid advice from deduplication server: pbn %llu, state %u. Giving up on deduplication of logical block %llu",
  1429. (unsigned long long) advice->pbn, advice->state,
  1430. (unsigned long long) data_vio->logical.lbn);
  1431. atomic64_inc(&vdo->stats.invalid_advice_pbn_count);
  1432. return false;
  1433. }
  1434. result = vdo_get_physical_zone(vdo, advice->pbn, &advice->zone);
  1435. if ((result != VDO_SUCCESS) || (advice->zone == NULL)) {
  1436. vdo_log_debug("Invalid physical block number from deduplication server: %llu, giving up on deduplication of logical block %llu",
  1437. (unsigned long long) advice->pbn,
  1438. (unsigned long long) data_vio->logical.lbn);
  1439. atomic64_inc(&vdo->stats.invalid_advice_pbn_count);
  1440. return false;
  1441. }
  1442. return true;
  1443. }
  1444. static void process_query_result(struct data_vio *agent)
  1445. {
  1446. struct dedupe_context *context = agent->dedupe_context;
  1447. if (context == NULL)
  1448. return;
  1449. if (change_context_state(context, DEDUPE_CONTEXT_COMPLETE, DEDUPE_CONTEXT_IDLE)) {
  1450. agent->is_duplicate = decode_uds_advice(context);
  1451. agent->dedupe_context = NULL;
  1452. release_context(context);
  1453. }
  1454. }
  1455. /**
  1456. * finish_querying() - Process the result of a UDS query performed by the agent for the lock.
  1457. * @completion: The completion of the data_vio that performed the query.
  1458. *
  1459. * This continuation is registered in start_querying().
  1460. */
  1461. static void finish_querying(struct vdo_completion *completion)
  1462. {
  1463. struct data_vio *agent = as_data_vio(completion);
  1464. struct hash_lock *lock = agent->hash_lock;
  1465. assert_hash_lock_agent(agent, __func__);
  1466. process_query_result(agent);
  1467. if (agent->is_duplicate) {
  1468. lock->duplicate = agent->duplicate;
  1469. /*
  1470. * QUERYING -> LOCKING transition: Valid advice was obtained from UDS. Use the
  1471. * QUERYING agent to start the hash lock on the unverified dedupe path, verifying
  1472. * that the advice can be used.
  1473. */
  1474. start_locking(lock, agent);
  1475. } else {
  1476. /*
  1477. * The agent will be used as the duplicate if has an allocation; if it does, that
  1478. * location was posted to UDS, so no update will be needed.
  1479. */
  1480. lock->update_advice = !data_vio_has_allocation(agent);
  1481. /*
  1482. * QUERYING -> WRITING transition: There was no advice or the advice wasn't valid,
  1483. * so try to write or compress the data.
  1484. */
  1485. start_writing(lock, agent);
  1486. }
  1487. }
  1488. /**
  1489. * start_querying() - Start deduplication for a hash lock.
  1490. * @lock: The initialized hash lock.
  1491. * @data_vio: The data_vio that has just obtained the new lock.
  1492. *
  1493. * Starts deduplication for a hash lock that has finished initializing by making the data_vio that
  1494. * requested it the agent, entering the QUERYING state, and using the agent to perform the UDS
  1495. * query on behalf of the lock.
  1496. */
  1497. static void start_querying(struct hash_lock *lock, struct data_vio *data_vio)
  1498. {
  1499. lock->agent = data_vio;
  1500. lock->state = VDO_HASH_LOCK_QUERYING;
  1501. data_vio->last_async_operation = VIO_ASYNC_OP_CHECK_FOR_DUPLICATION;
  1502. set_data_vio_hash_zone_callback(data_vio, finish_querying);
  1503. query_index(data_vio,
  1504. (data_vio_has_allocation(data_vio) ? UDS_POST : UDS_QUERY));
  1505. }
  1506. /**
  1507. * report_bogus_lock_state() - Complain that a data_vio has entered a hash_lock that is in an
  1508. * unimplemented or unusable state and continue the data_vio with an
  1509. * error.
  1510. * @lock: The hash lock.
  1511. * @data_vio: The data_vio attempting to enter the lock.
  1512. */
  1513. static void report_bogus_lock_state(struct hash_lock *lock, struct data_vio *data_vio)
  1514. {
  1515. VDO_ASSERT_LOG_ONLY(false, "hash lock must not be in unimplemented state %s",
  1516. get_hash_lock_state_name(lock->state));
  1517. continue_data_vio_with_error(data_vio, VDO_LOCK_ERROR);
  1518. }
  1519. /**
  1520. * vdo_continue_hash_lock() - Continue the processing state after writing, compressing, or
  1521. * deduplicating.
  1522. * @completion: The data_vio completion to continue processing in its hash lock.
  1523. *
  1524. * Asynchronously continue processing a data_vio in its hash lock after it has finished writing,
  1525. * compressing, or deduplicating, so it can share the result with any data_vios waiting in the hash
  1526. * lock, or update the UDS index, or simply release its share of the lock.
  1527. *
  1528. * Context: This must only be called in the correct thread for the hash zone.
  1529. */
  1530. void vdo_continue_hash_lock(struct vdo_completion *completion)
  1531. {
  1532. struct data_vio *data_vio = as_data_vio(completion);
  1533. struct hash_lock *lock = data_vio->hash_lock;
  1534. switch (lock->state) {
  1535. case VDO_HASH_LOCK_WRITING:
  1536. VDO_ASSERT_LOG_ONLY(data_vio == lock->agent,
  1537. "only the lock agent may continue the lock");
  1538. finish_writing(lock, data_vio);
  1539. break;
  1540. case VDO_HASH_LOCK_DEDUPING:
  1541. finish_deduping(lock, data_vio);
  1542. break;
  1543. case VDO_HASH_LOCK_BYPASSING:
  1544. /* This data_vio has finished the write path and the lock doesn't need it. */
  1545. exit_hash_lock(data_vio);
  1546. break;
  1547. case VDO_HASH_LOCK_INITIALIZING:
  1548. case VDO_HASH_LOCK_QUERYING:
  1549. case VDO_HASH_LOCK_UPDATING:
  1550. case VDO_HASH_LOCK_LOCKING:
  1551. case VDO_HASH_LOCK_VERIFYING:
  1552. case VDO_HASH_LOCK_UNLOCKING:
  1553. /* A lock in this state should never be re-entered. */
  1554. report_bogus_lock_state(lock, data_vio);
  1555. break;
  1556. default:
  1557. report_bogus_lock_state(lock, data_vio);
  1558. }
  1559. }
  1560. /**
  1561. * is_hash_collision() - Check to see if a hash collision has occurred.
  1562. * @lock: The lock to check.
  1563. * @candidate: The data_vio seeking to share the lock.
  1564. *
  1565. * Check whether the data in data_vios sharing a lock is different than in a data_vio seeking to
  1566. * share the lock, which should only be possible in the extremely unlikely case of a hash
  1567. * collision.
  1568. *
  1569. * Return: true if the given data_vio must not share the lock because it doesn't have the same data
  1570. * as the lock holders.
  1571. */
  1572. static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate)
  1573. {
  1574. struct data_vio *lock_holder;
  1575. struct hash_zone *zone;
  1576. bool collides;
  1577. if (list_empty(&lock->duplicate_vios))
  1578. return false;
  1579. lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio,
  1580. hash_lock_entry);
  1581. zone = candidate->hash_zone;
  1582. collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data);
  1583. if (collides)
  1584. increment_stat(&zone->statistics.concurrent_hash_collisions);
  1585. else
  1586. increment_stat(&zone->statistics.concurrent_data_matches);
  1587. return collides;
  1588. }
  1589. static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio)
  1590. {
  1591. int result;
  1592. /* FIXME: BUG_ON() and/or enter read-only mode? */
  1593. result = VDO_ASSERT(data_vio->hash_lock == NULL,
  1594. "must not already hold a hash lock");
  1595. if (result != VDO_SUCCESS)
  1596. return result;
  1597. result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry),
  1598. "must not already be a member of a hash lock list");
  1599. if (result != VDO_SUCCESS)
  1600. return result;
  1601. return VDO_ASSERT(data_vio->recovery_sequence_number == 0,
  1602. "must not hold a recovery lock when getting a hash lock");
  1603. }
  1604. /**
  1605. * vdo_acquire_hash_lock() - Acquire or share a lock on a record name.
  1606. * @completion: The data_vio completion acquiring a lock on its record name.
  1607. *
  1608. * Acquire or share a lock on the hash (record name) of the data in a data_vio, updating the
  1609. * data_vio to reference the lock. This must only be called in the correct thread for the zone. In
  1610. * the unlikely case of a hash collision, this function will succeed, but the data_vio will not get
  1611. * a lock reference.
  1612. */
  1613. void vdo_acquire_hash_lock(struct vdo_completion *completion)
  1614. {
  1615. struct data_vio *data_vio = as_data_vio(completion);
  1616. struct hash_lock *lock;
  1617. int result;
  1618. assert_data_vio_in_hash_zone(data_vio);
  1619. result = assert_hash_lock_preconditions(data_vio);
  1620. if (result != VDO_SUCCESS) {
  1621. continue_data_vio_with_error(data_vio, result);
  1622. return;
  1623. }
  1624. result = acquire_lock(data_vio->hash_zone, &data_vio->record_name, NULL, &lock);
  1625. if (result != VDO_SUCCESS) {
  1626. continue_data_vio_with_error(data_vio, result);
  1627. return;
  1628. }
  1629. if (is_hash_collision(lock, data_vio)) {
  1630. /*
  1631. * Hash collisions are extremely unlikely, but the bogus dedupe would be a data
  1632. * corruption. Bypass optimization entirely. We can't compress a data_vio without
  1633. * a hash_lock as the compressed write depends on the hash_lock to manage the
  1634. * references for the compressed block.
  1635. */
  1636. write_data_vio(data_vio);
  1637. return;
  1638. }
  1639. set_hash_lock(data_vio, lock);
  1640. switch (lock->state) {
  1641. case VDO_HASH_LOCK_INITIALIZING:
  1642. start_querying(lock, data_vio);
  1643. return;
  1644. case VDO_HASH_LOCK_QUERYING:
  1645. case VDO_HASH_LOCK_WRITING:
  1646. case VDO_HASH_LOCK_UPDATING:
  1647. case VDO_HASH_LOCK_LOCKING:
  1648. case VDO_HASH_LOCK_VERIFYING:
  1649. case VDO_HASH_LOCK_UNLOCKING:
  1650. /* The lock is busy, and can't be shared yet. */
  1651. wait_on_hash_lock(lock, data_vio);
  1652. return;
  1653. case VDO_HASH_LOCK_BYPASSING:
  1654. /* We can't use this lock, so bypass optimization entirely. */
  1655. vdo_release_hash_lock(data_vio);
  1656. write_data_vio(data_vio);
  1657. return;
  1658. case VDO_HASH_LOCK_DEDUPING:
  1659. launch_dedupe(lock, data_vio, false);
  1660. return;
  1661. default:
  1662. /* A lock in this state should not be acquired by new VIOs. */
  1663. report_bogus_lock_state(lock, data_vio);
  1664. }
  1665. }
  1666. /**
  1667. * vdo_release_hash_lock() - Release a data_vio's share of a hash lock, if held, and null out the
  1668. * data_vio's reference to it.
  1669. * @data_vio: The data_vio releasing its hash lock.
  1670. *
  1671. * If the data_vio is the only one holding the lock, this also releases any resources or locks used
  1672. * by the hash lock (such as a PBN read lock on a block containing data with the same hash) and
  1673. * returns the lock to the hash zone's lock pool.
  1674. *
  1675. * Context: This must only be called in the correct thread for the hash zone.
  1676. */
  1677. void vdo_release_hash_lock(struct data_vio *data_vio)
  1678. {
  1679. u64 lock_key;
  1680. struct hash_lock *lock = data_vio->hash_lock;
  1681. struct hash_zone *zone = data_vio->hash_zone;
  1682. if (lock == NULL)
  1683. return;
  1684. set_hash_lock(data_vio, NULL);
  1685. if (lock->reference_count > 0) {
  1686. /* The lock is still in use by other data_vios. */
  1687. return;
  1688. }
  1689. lock_key = hash_lock_key(lock);
  1690. if (lock->registered) {
  1691. struct hash_lock *removed;
  1692. removed = vdo_int_map_remove(zone->hash_lock_map, lock_key);
  1693. VDO_ASSERT_LOG_ONLY(lock == removed,
  1694. "hash lock being released must have been mapped");
  1695. } else {
  1696. VDO_ASSERT_LOG_ONLY(lock != vdo_int_map_get(zone->hash_lock_map, lock_key),
  1697. "unregistered hash lock must not be in the lock map");
  1698. }
  1699. VDO_ASSERT_LOG_ONLY(!vdo_waitq_has_waiters(&lock->waiters),
  1700. "hash lock returned to zone must have no waiters");
  1701. VDO_ASSERT_LOG_ONLY((lock->duplicate_lock == NULL),
  1702. "hash lock returned to zone must not reference a PBN lock");
  1703. VDO_ASSERT_LOG_ONLY((lock->state == VDO_HASH_LOCK_BYPASSING),
  1704. "returned hash lock must not be in use with state %s",
  1705. get_hash_lock_state_name(lock->state));
  1706. VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node),
  1707. "hash lock returned to zone must not be in a pool list");
  1708. VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios),
  1709. "hash lock returned to zone must not reference DataVIOs");
  1710. return_hash_lock_to_pool(zone, lock);
  1711. }
  1712. /**
  1713. * transfer_allocation_lock() - Transfer a data_vio's downgraded allocation PBN lock to the
  1714. * data_vio's hash lock, converting it to a duplicate PBN lock.
  1715. * @data_vio: The data_vio holding the allocation lock to transfer.
  1716. */
  1717. static void transfer_allocation_lock(struct data_vio *data_vio)
  1718. {
  1719. struct allocation *allocation = &data_vio->allocation;
  1720. struct hash_lock *hash_lock = data_vio->hash_lock;
  1721. VDO_ASSERT_LOG_ONLY(data_vio->new_mapped.pbn == allocation->pbn,
  1722. "transferred lock must be for the block written");
  1723. allocation->pbn = VDO_ZERO_BLOCK;
  1724. VDO_ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(allocation->lock),
  1725. "must have downgraded the allocation lock before transfer");
  1726. hash_lock->duplicate = data_vio->new_mapped;
  1727. data_vio->duplicate = data_vio->new_mapped;
  1728. /*
  1729. * Since the lock is being transferred, the holder count doesn't change (and isn't even
  1730. * safe to examine on this thread).
  1731. */
  1732. hash_lock->duplicate_lock = vdo_forget(allocation->lock);
  1733. }
  1734. /**
  1735. * vdo_share_compressed_write_lock() - Make a data_vio's hash lock a shared holder of the PBN lock
  1736. * on the compressed block to which its data was just written.
  1737. * @data_vio: The data_vio which was just compressed.
  1738. * @pbn_lock: The PBN lock on the compressed block.
  1739. *
  1740. * If the lock is still a write lock (as it will be for the first share), it will be converted to a
  1741. * read lock. This also reserves a reference count increment for the data_vio.
  1742. */
  1743. void vdo_share_compressed_write_lock(struct data_vio *data_vio,
  1744. struct pbn_lock *pbn_lock)
  1745. {
  1746. bool claimed;
  1747. VDO_ASSERT_LOG_ONLY(vdo_get_duplicate_lock(data_vio) == NULL,
  1748. "a duplicate PBN lock should not exist when writing");
  1749. VDO_ASSERT_LOG_ONLY(vdo_is_state_compressed(data_vio->new_mapped.state),
  1750. "lock transfer must be for a compressed write");
  1751. assert_data_vio_in_new_mapped_zone(data_vio);
  1752. /* First sharer downgrades the lock. */
  1753. if (!vdo_is_pbn_read_lock(pbn_lock))
  1754. vdo_downgrade_pbn_write_lock(pbn_lock, true);
  1755. /*
  1756. * Get a share of the PBN lock, ensuring it cannot be released until after this data_vio
  1757. * has had a chance to journal a reference.
  1758. */
  1759. data_vio->duplicate = data_vio->new_mapped;
  1760. data_vio->hash_lock->duplicate = data_vio->new_mapped;
  1761. set_duplicate_lock(data_vio->hash_lock, pbn_lock);
  1762. /*
  1763. * Claim a reference for this data_vio. Necessary since another hash_lock might start
  1764. * deduplicating against it before our incRef.
  1765. */
  1766. claimed = vdo_claim_pbn_lock_increment(pbn_lock);
  1767. VDO_ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment");
  1768. }
  1769. static void start_uds_queue(void *ptr)
  1770. {
  1771. /*
  1772. * Allow the UDS dedupe worker thread to do memory allocations. It will only do allocations
  1773. * during the UDS calls that open or close an index, but those allocations can safely sleep
  1774. * while reserving a large amount of memory. We could use an allocations_allowed boolean
  1775. * (like the base threads do), but it would be an unnecessary embellishment.
  1776. */
  1777. struct vdo_thread *thread = vdo_get_work_queue_owner(vdo_get_current_work_queue());
  1778. vdo_register_allocating_thread(&thread->allocating_thread, NULL);
  1779. }
  1780. static void finish_uds_queue(void *ptr __always_unused)
  1781. {
  1782. vdo_unregister_allocating_thread();
  1783. }
  1784. static void close_index(struct hash_zones *zones)
  1785. __must_hold(&zones->lock)
  1786. {
  1787. int result;
  1788. /*
  1789. * Change the index state so that get_index_statistics() will not try to use the index
  1790. * session we are closing.
  1791. */
  1792. zones->index_state = IS_CHANGING;
  1793. /* Close the index session, while not holding the lock. */
  1794. spin_unlock(&zones->lock);
  1795. result = uds_close_index(zones->index_session);
  1796. if (result != UDS_SUCCESS)
  1797. vdo_log_error_strerror(result, "Error closing index");
  1798. spin_lock(&zones->lock);
  1799. zones->index_state = IS_CLOSED;
  1800. zones->error_flag |= result != UDS_SUCCESS;
  1801. /* ASSERTION: We leave in IS_CLOSED state. */
  1802. }
  1803. static void open_index(struct hash_zones *zones)
  1804. __must_hold(&zones->lock)
  1805. {
  1806. /* ASSERTION: We enter in IS_CLOSED state. */
  1807. int result;
  1808. bool create_flag = zones->create_flag;
  1809. zones->create_flag = false;
  1810. /*
  1811. * Change the index state so that the it will be reported to the outside world as
  1812. * "opening".
  1813. */
  1814. zones->index_state = IS_CHANGING;
  1815. zones->error_flag = false;
  1816. /* Open the index session, while not holding the lock */
  1817. spin_unlock(&zones->lock);
  1818. result = uds_open_index(create_flag ? UDS_CREATE : UDS_LOAD,
  1819. &zones->parameters, zones->index_session);
  1820. if (result != UDS_SUCCESS)
  1821. vdo_log_error_strerror(result, "Error opening index");
  1822. spin_lock(&zones->lock);
  1823. if (!create_flag) {
  1824. switch (result) {
  1825. case -ENOENT:
  1826. /*
  1827. * Either there is no index, or there is no way we can recover the index.
  1828. * We will be called again and try to create a new index.
  1829. */
  1830. zones->index_state = IS_CLOSED;
  1831. zones->create_flag = true;
  1832. return;
  1833. default:
  1834. break;
  1835. }
  1836. }
  1837. if (result == UDS_SUCCESS) {
  1838. zones->index_state = IS_OPENED;
  1839. } else {
  1840. zones->index_state = IS_CLOSED;
  1841. zones->index_target = IS_CLOSED;
  1842. zones->error_flag = true;
  1843. spin_unlock(&zones->lock);
  1844. vdo_log_info("Setting UDS index target state to error");
  1845. spin_lock(&zones->lock);
  1846. }
  1847. /*
  1848. * ASSERTION: On success, we leave in IS_OPENED state.
  1849. * ASSERTION: On failure, we leave in IS_CLOSED state.
  1850. */
  1851. }
  1852. static void change_dedupe_state(struct vdo_completion *completion)
  1853. {
  1854. struct hash_zones *zones = as_hash_zones(completion);
  1855. spin_lock(&zones->lock);
  1856. /* Loop until the index is in the target state and the create flag is clear. */
  1857. while (vdo_is_state_normal(&zones->state) &&
  1858. ((zones->index_state != zones->index_target) || zones->create_flag)) {
  1859. if (zones->index_state == IS_OPENED)
  1860. close_index(zones);
  1861. else
  1862. open_index(zones);
  1863. }
  1864. zones->changing = false;
  1865. spin_unlock(&zones->lock);
  1866. }
  1867. static void start_expiration_timer(struct dedupe_context *context)
  1868. {
  1869. u64 start_time = context->submission_jiffies;
  1870. u64 end_time;
  1871. if (!change_timer_state(context->zone, DEDUPE_QUERY_TIMER_IDLE,
  1872. DEDUPE_QUERY_TIMER_RUNNING))
  1873. return;
  1874. end_time = max(start_time + vdo_dedupe_index_timeout_jiffies,
  1875. jiffies + vdo_dedupe_index_min_timer_jiffies);
  1876. mod_timer(&context->zone->timer, end_time);
  1877. }
  1878. /**
  1879. * report_dedupe_timeouts() - Record and eventually report that some dedupe requests reached their
  1880. * expiration time without getting answers, so we timed them out.
  1881. * @zones: The hash zones.
  1882. * @timeouts: The number of newly timed out requests.
  1883. */
  1884. static void report_dedupe_timeouts(struct hash_zones *zones, unsigned int timeouts)
  1885. {
  1886. atomic64_add(timeouts, &zones->timeouts);
  1887. spin_lock(&zones->lock);
  1888. if (__ratelimit(&zones->ratelimiter)) {
  1889. u64 unreported = atomic64_read(&zones->timeouts);
  1890. unreported -= zones->reported_timeouts;
  1891. vdo_log_debug("UDS index timeout on %llu requests",
  1892. (unsigned long long) unreported);
  1893. zones->reported_timeouts += unreported;
  1894. }
  1895. spin_unlock(&zones->lock);
  1896. }
  1897. static int initialize_index(struct vdo *vdo, struct hash_zones *zones)
  1898. {
  1899. int result;
  1900. off_t uds_offset;
  1901. struct volume_geometry geometry = vdo->geometry;
  1902. static const struct vdo_work_queue_type uds_queue_type = {
  1903. .start = start_uds_queue,
  1904. .finish = finish_uds_queue,
  1905. .max_priority = UDS_Q_MAX_PRIORITY,
  1906. .default_priority = UDS_Q_PRIORITY,
  1907. };
  1908. vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval);
  1909. vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval);
  1910. spin_lock_init(&zones->lock);
  1911. /*
  1912. * Since we will save up the timeouts that would have been reported but were ratelimited,
  1913. * we don't need to report ratelimiting.
  1914. */
  1915. ratelimit_default_init(&zones->ratelimiter);
  1916. ratelimit_set_flags(&zones->ratelimiter, RATELIMIT_MSG_ON_RELEASE);
  1917. uds_offset = ((vdo_get_index_region_start(geometry) -
  1918. geometry.bio_offset) * VDO_BLOCK_SIZE);
  1919. zones->parameters = (struct uds_parameters) {
  1920. .bdev = vdo->device_config->owned_device->bdev,
  1921. .offset = uds_offset,
  1922. .size = (vdo_get_index_region_size(geometry) * VDO_BLOCK_SIZE),
  1923. .memory_size = geometry.index_config.mem,
  1924. .sparse = geometry.index_config.sparse,
  1925. .nonce = (u64) geometry.nonce,
  1926. };
  1927. result = uds_create_index_session(&zones->index_session);
  1928. if (result != UDS_SUCCESS)
  1929. return result;
  1930. result = vdo_make_thread(vdo, vdo->thread_config.dedupe_thread, &uds_queue_type,
  1931. 1, NULL);
  1932. if (result != VDO_SUCCESS) {
  1933. uds_destroy_index_session(vdo_forget(zones->index_session));
  1934. vdo_log_error("UDS index queue initialization failed (%d)", result);
  1935. return result;
  1936. }
  1937. vdo_initialize_completion(&zones->completion, vdo, VDO_HASH_ZONES_COMPLETION);
  1938. vdo_set_completion_callback(&zones->completion, change_dedupe_state,
  1939. vdo->thread_config.dedupe_thread);
  1940. return VDO_SUCCESS;
  1941. }
  1942. /**
  1943. * finish_index_operation() - This is the UDS callback for index queries.
  1944. * @request: The uds request which has just completed.
  1945. */
  1946. static void finish_index_operation(struct uds_request *request)
  1947. {
  1948. struct dedupe_context *context = container_of(request, struct dedupe_context,
  1949. request);
  1950. if (change_context_state(context, DEDUPE_CONTEXT_PENDING,
  1951. DEDUPE_CONTEXT_COMPLETE)) {
  1952. /*
  1953. * This query has not timed out, so send its data_vio back to its hash zone to
  1954. * process the results.
  1955. */
  1956. continue_data_vio(context->requestor);
  1957. return;
  1958. }
  1959. /*
  1960. * This query has timed out, so try to mark it complete and hence eligible for reuse. Its
  1961. * data_vio has already moved on.
  1962. */
  1963. if (!change_context_state(context, DEDUPE_CONTEXT_TIMED_OUT,
  1964. DEDUPE_CONTEXT_TIMED_OUT_COMPLETE)) {
  1965. VDO_ASSERT_LOG_ONLY(false, "uds request was timed out (state %d)",
  1966. atomic_read(&context->state));
  1967. }
  1968. vdo_funnel_queue_put(context->zone->timed_out_complete, &context->queue_entry);
  1969. }
  1970. /**
  1971. * check_for_drain_complete() - Check whether this zone has drained.
  1972. * @zone: The zone to check.
  1973. */
  1974. static void check_for_drain_complete(struct hash_zone *zone)
  1975. {
  1976. data_vio_count_t recycled = 0;
  1977. if (!vdo_is_state_draining(&zone->state))
  1978. return;
  1979. if ((atomic_read(&zone->timer_state) == DEDUPE_QUERY_TIMER_IDLE) ||
  1980. change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING,
  1981. DEDUPE_QUERY_TIMER_IDLE)) {
  1982. timer_delete_sync(&zone->timer);
  1983. } else {
  1984. /*
  1985. * There is an in flight time-out, which must get processed before we can continue.
  1986. */
  1987. return;
  1988. }
  1989. for (;;) {
  1990. struct dedupe_context *context;
  1991. struct funnel_queue_entry *entry;
  1992. entry = vdo_funnel_queue_poll(zone->timed_out_complete);
  1993. if (entry == NULL)
  1994. break;
  1995. context = container_of(entry, struct dedupe_context, queue_entry);
  1996. atomic_set(&context->state, DEDUPE_CONTEXT_IDLE);
  1997. list_add(&context->list_entry, &zone->available);
  1998. recycled++;
  1999. }
  2000. if (recycled > 0)
  2001. WRITE_ONCE(zone->active, zone->active - recycled);
  2002. VDO_ASSERT_LOG_ONLY(READ_ONCE(zone->active) == 0, "all contexts inactive");
  2003. vdo_finish_draining(&zone->state);
  2004. }
  2005. static void timeout_index_operations_callback(struct vdo_completion *completion)
  2006. {
  2007. struct dedupe_context *context, *tmp;
  2008. struct hash_zone *zone = as_hash_zone(completion);
  2009. u64 timeout_jiffies = msecs_to_jiffies(vdo_dedupe_index_timeout_interval);
  2010. unsigned long cutoff = jiffies - timeout_jiffies;
  2011. unsigned int timed_out = 0;
  2012. atomic_set(&zone->timer_state, DEDUPE_QUERY_TIMER_IDLE);
  2013. list_for_each_entry_safe(context, tmp, &zone->pending, list_entry) {
  2014. if (cutoff <= context->submission_jiffies) {
  2015. /*
  2016. * We have reached the oldest query which has not timed out yet, so restart
  2017. * the timer.
  2018. */
  2019. start_expiration_timer(context);
  2020. break;
  2021. }
  2022. if (!change_context_state(context, DEDUPE_CONTEXT_PENDING,
  2023. DEDUPE_CONTEXT_TIMED_OUT)) {
  2024. /*
  2025. * This context completed between the time the timeout fired, and now. We
  2026. * can treat it as a successful query, its requestor is already enqueued
  2027. * to process it.
  2028. */
  2029. continue;
  2030. }
  2031. /*
  2032. * Remove this context from the pending list so we won't look at it again on a
  2033. * subsequent timeout. Once the index completes it, it will be reused. Meanwhile,
  2034. * send its requestor on its way.
  2035. */
  2036. list_del_init(&context->list_entry);
  2037. context->requestor->dedupe_context = NULL;
  2038. continue_data_vio(context->requestor);
  2039. timed_out++;
  2040. }
  2041. if (timed_out > 0)
  2042. report_dedupe_timeouts(completion->vdo->hash_zones, timed_out);
  2043. check_for_drain_complete(zone);
  2044. }
  2045. static void timeout_index_operations(struct timer_list *t)
  2046. {
  2047. struct hash_zone *zone = timer_container_of(zone, t, timer);
  2048. if (change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING,
  2049. DEDUPE_QUERY_TIMER_FIRED))
  2050. vdo_launch_completion(&zone->completion);
  2051. }
  2052. static int __must_check initialize_zone(struct vdo *vdo, struct hash_zones *zones,
  2053. zone_count_t zone_number)
  2054. {
  2055. int result;
  2056. data_vio_count_t i;
  2057. struct hash_zone *zone = &zones->zones[zone_number];
  2058. result = vdo_int_map_create(VDO_LOCK_MAP_CAPACITY, &zone->hash_lock_map);
  2059. if (result != VDO_SUCCESS)
  2060. return result;
  2061. vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
  2062. zone->zone_number = zone_number;
  2063. zone->thread_id = vdo->thread_config.hash_zone_threads[zone_number];
  2064. vdo_initialize_completion(&zone->completion, vdo, VDO_HASH_ZONE_COMPLETION);
  2065. vdo_set_completion_callback(&zone->completion, timeout_index_operations_callback,
  2066. zone->thread_id);
  2067. INIT_LIST_HEAD(&zone->lock_pool);
  2068. result = vdo_allocate(LOCK_POOL_CAPACITY, struct hash_lock, "hash_lock array",
  2069. &zone->lock_array);
  2070. if (result != VDO_SUCCESS)
  2071. return result;
  2072. for (i = 0; i < LOCK_POOL_CAPACITY; i++)
  2073. return_hash_lock_to_pool(zone, &zone->lock_array[i]);
  2074. INIT_LIST_HEAD(&zone->available);
  2075. INIT_LIST_HEAD(&zone->pending);
  2076. result = vdo_make_funnel_queue(&zone->timed_out_complete);
  2077. if (result != VDO_SUCCESS)
  2078. return result;
  2079. timer_setup(&zone->timer, timeout_index_operations, 0);
  2080. for (i = 0; i < MAXIMUM_VDO_USER_VIOS; i++) {
  2081. struct dedupe_context *context = &zone->contexts[i];
  2082. context->zone = zone;
  2083. context->request.callback = finish_index_operation;
  2084. context->request.session = zones->index_session;
  2085. list_add(&context->list_entry, &zone->available);
  2086. }
  2087. return vdo_make_default_thread(vdo, zone->thread_id);
  2088. }
  2089. /** get_thread_id_for_zone() - Implements vdo_zone_thread_getter_fn. */
  2090. static thread_id_t get_thread_id_for_zone(void *context, zone_count_t zone_number)
  2091. {
  2092. struct hash_zones *zones = context;
  2093. return zones->zones[zone_number].thread_id;
  2094. }
  2095. /**
  2096. * vdo_make_hash_zones() - Create the hash zones.
  2097. *
  2098. * @vdo: The vdo to which the zone will belong.
  2099. * @zones_ptr: A pointer to hold the zones.
  2100. *
  2101. * Return: VDO_SUCCESS or an error code.
  2102. */
  2103. int vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr)
  2104. {
  2105. int result;
  2106. struct hash_zones *zones;
  2107. zone_count_t z;
  2108. zone_count_t zone_count = vdo->thread_config.hash_zone_count;
  2109. if (zone_count == 0)
  2110. return VDO_SUCCESS;
  2111. result = vdo_allocate_extended(struct hash_zones, zone_count, struct hash_zone,
  2112. __func__, &zones);
  2113. if (result != VDO_SUCCESS)
  2114. return result;
  2115. result = initialize_index(vdo, zones);
  2116. if (result != VDO_SUCCESS) {
  2117. vdo_free(zones);
  2118. return result;
  2119. }
  2120. vdo_set_admin_state_code(&zones->state, VDO_ADMIN_STATE_NEW);
  2121. zones->zone_count = zone_count;
  2122. for (z = 0; z < zone_count; z++) {
  2123. result = initialize_zone(vdo, zones, z);
  2124. if (result != VDO_SUCCESS) {
  2125. vdo_free_hash_zones(zones);
  2126. return result;
  2127. }
  2128. }
  2129. result = vdo_make_action_manager(zones->zone_count, get_thread_id_for_zone,
  2130. vdo->thread_config.admin_thread, zones, NULL,
  2131. vdo, &zones->manager);
  2132. if (result != VDO_SUCCESS) {
  2133. vdo_free_hash_zones(zones);
  2134. return result;
  2135. }
  2136. *zones_ptr = zones;
  2137. return VDO_SUCCESS;
  2138. }
  2139. void vdo_finish_dedupe_index(struct hash_zones *zones)
  2140. {
  2141. if (zones == NULL)
  2142. return;
  2143. uds_destroy_index_session(vdo_forget(zones->index_session));
  2144. }
  2145. /**
  2146. * vdo_free_hash_zones() - Free the hash zones.
  2147. * @zones: The zone to free.
  2148. */
  2149. void vdo_free_hash_zones(struct hash_zones *zones)
  2150. {
  2151. zone_count_t i;
  2152. if (zones == NULL)
  2153. return;
  2154. vdo_free(vdo_forget(zones->manager));
  2155. for (i = 0; i < zones->zone_count; i++) {
  2156. struct hash_zone *zone = &zones->zones[i];
  2157. vdo_free_funnel_queue(vdo_forget(zone->timed_out_complete));
  2158. vdo_int_map_free(vdo_forget(zone->hash_lock_map));
  2159. vdo_free(vdo_forget(zone->lock_array));
  2160. }
  2161. if (zones->index_session != NULL)
  2162. vdo_finish_dedupe_index(zones);
  2163. ratelimit_state_exit(&zones->ratelimiter);
  2164. vdo_free(zones);
  2165. }
  2166. static void initiate_suspend_index(struct admin_state *state)
  2167. {
  2168. struct hash_zones *zones = container_of(state, struct hash_zones, state);
  2169. enum index_state index_state;
  2170. spin_lock(&zones->lock);
  2171. index_state = zones->index_state;
  2172. spin_unlock(&zones->lock);
  2173. if (index_state != IS_CLOSED) {
  2174. bool save = vdo_is_state_saving(&zones->state);
  2175. int result;
  2176. result = uds_suspend_index_session(zones->index_session, save);
  2177. if (result != UDS_SUCCESS)
  2178. vdo_log_error_strerror(result, "Error suspending dedupe index");
  2179. }
  2180. vdo_finish_draining(state);
  2181. }
  2182. /**
  2183. * suspend_index() - Suspend the UDS index prior to draining hash zones.
  2184. * @context: Not used.
  2185. * @completion: The completion for the suspend operation.
  2186. *
  2187. * Implements vdo_action_preamble_fn
  2188. */
  2189. static void suspend_index(void *context, struct vdo_completion *completion)
  2190. {
  2191. struct hash_zones *zones = context;
  2192. vdo_start_draining(&zones->state,
  2193. vdo_get_current_manager_operation(zones->manager), completion,
  2194. initiate_suspend_index);
  2195. }
  2196. /** Implements vdo_admin_initiator_fn. */
  2197. static void initiate_drain(struct admin_state *state)
  2198. {
  2199. check_for_drain_complete(container_of(state, struct hash_zone, state));
  2200. }
  2201. /** Implements vdo_zone_action_fn. */
  2202. static void drain_hash_zone(void *context, zone_count_t zone_number,
  2203. struct vdo_completion *parent)
  2204. {
  2205. struct hash_zones *zones = context;
  2206. vdo_start_draining(&zones->zones[zone_number].state,
  2207. vdo_get_current_manager_operation(zones->manager), parent,
  2208. initiate_drain);
  2209. }
  2210. /** vdo_drain_hash_zones() - Drain all hash zones. */
  2211. void vdo_drain_hash_zones(struct hash_zones *zones, struct vdo_completion *parent)
  2212. {
  2213. vdo_schedule_operation(zones->manager, parent->vdo->suspend_type, suspend_index,
  2214. drain_hash_zone, NULL, parent);
  2215. }
  2216. static void launch_dedupe_state_change(struct hash_zones *zones)
  2217. __must_hold(&zones->lock)
  2218. {
  2219. /* ASSERTION: We enter with the lock held. */
  2220. if (zones->changing || !vdo_is_state_normal(&zones->state))
  2221. /* Either a change is already in progress, or changes are not allowed. */
  2222. return;
  2223. if (zones->create_flag || (zones->index_state != zones->index_target)) {
  2224. zones->changing = true;
  2225. vdo_launch_completion(&zones->completion);
  2226. return;
  2227. }
  2228. /* ASSERTION: We exit with the lock held. */
  2229. }
  2230. /**
  2231. * resume_index() - Resume the UDS index prior to resuming hash zones.
  2232. * @context: Not used.
  2233. * @parent: The completion for the resume operation.
  2234. *
  2235. * Implements vdo_action_preamble_fn
  2236. */
  2237. static void resume_index(void *context, struct vdo_completion *parent)
  2238. {
  2239. struct hash_zones *zones = context;
  2240. struct device_config *config = parent->vdo->device_config;
  2241. int result;
  2242. zones->parameters.bdev = config->owned_device->bdev;
  2243. result = uds_resume_index_session(zones->index_session, zones->parameters.bdev);
  2244. if (result != UDS_SUCCESS)
  2245. vdo_log_error_strerror(result, "Error resuming dedupe index");
  2246. spin_lock(&zones->lock);
  2247. vdo_resume_if_quiescent(&zones->state);
  2248. if (config->deduplication) {
  2249. zones->index_target = IS_OPENED;
  2250. WRITE_ONCE(zones->dedupe_flag, true);
  2251. } else {
  2252. zones->index_target = IS_CLOSED;
  2253. }
  2254. launch_dedupe_state_change(zones);
  2255. spin_unlock(&zones->lock);
  2256. vdo_finish_completion(parent);
  2257. }
  2258. /** Implements vdo_zone_action_fn. */
  2259. static void resume_hash_zone(void *context, zone_count_t zone_number,
  2260. struct vdo_completion *parent)
  2261. {
  2262. struct hash_zone *zone = &(((struct hash_zones *) context)->zones[zone_number]);
  2263. vdo_fail_completion(parent, vdo_resume_if_quiescent(&zone->state));
  2264. }
  2265. /**
  2266. * vdo_resume_hash_zones() - Resume a set of hash zones.
  2267. * @zones: The hash zones to resume.
  2268. * @parent: The object to notify when the zones have resumed.
  2269. */
  2270. void vdo_resume_hash_zones(struct hash_zones *zones, struct vdo_completion *parent)
  2271. {
  2272. if (vdo_is_read_only(parent->vdo)) {
  2273. vdo_launch_completion(parent);
  2274. return;
  2275. }
  2276. vdo_schedule_operation(zones->manager, VDO_ADMIN_STATE_RESUMING, resume_index,
  2277. resume_hash_zone, NULL, parent);
  2278. }
  2279. /**
  2280. * get_hash_zone_statistics() - Add the statistics for this hash zone to the tally for all zones.
  2281. * @zone: The hash zone to query.
  2282. * @tally: The tally.
  2283. */
  2284. static void get_hash_zone_statistics(const struct hash_zone *zone,
  2285. struct hash_lock_statistics *tally)
  2286. {
  2287. const struct hash_lock_statistics *stats = &zone->statistics;
  2288. tally->dedupe_advice_valid += READ_ONCE(stats->dedupe_advice_valid);
  2289. tally->dedupe_advice_stale += READ_ONCE(stats->dedupe_advice_stale);
  2290. tally->concurrent_data_matches += READ_ONCE(stats->concurrent_data_matches);
  2291. tally->concurrent_hash_collisions += READ_ONCE(stats->concurrent_hash_collisions);
  2292. tally->curr_dedupe_queries += READ_ONCE(zone->active);
  2293. }
  2294. static void get_index_statistics(struct hash_zones *zones,
  2295. struct index_statistics *stats)
  2296. {
  2297. enum index_state state;
  2298. struct uds_index_stats index_stats;
  2299. int result;
  2300. spin_lock(&zones->lock);
  2301. state = zones->index_state;
  2302. spin_unlock(&zones->lock);
  2303. if (state != IS_OPENED)
  2304. return;
  2305. result = uds_get_index_session_stats(zones->index_session, &index_stats);
  2306. if (result != UDS_SUCCESS) {
  2307. vdo_log_error_strerror(result, "Error reading index stats");
  2308. return;
  2309. }
  2310. stats->entries_indexed = index_stats.entries_indexed;
  2311. stats->posts_found = index_stats.posts_found;
  2312. stats->posts_not_found = index_stats.posts_not_found;
  2313. stats->queries_found = index_stats.queries_found;
  2314. stats->queries_not_found = index_stats.queries_not_found;
  2315. stats->updates_found = index_stats.updates_found;
  2316. stats->updates_not_found = index_stats.updates_not_found;
  2317. stats->entries_discarded = index_stats.entries_discarded;
  2318. }
  2319. /**
  2320. * vdo_get_dedupe_statistics() - Tally the statistics from all the hash zones and the UDS index.
  2321. * @zones: The hash zones to query.
  2322. * @stats: A structure to store the statistics.
  2323. *
  2324. * Return: The sum of the hash lock statistics from all hash zones plus the statistics from the UDS
  2325. * index
  2326. */
  2327. void vdo_get_dedupe_statistics(struct hash_zones *zones, struct vdo_statistics *stats)
  2328. {
  2329. zone_count_t zone;
  2330. for (zone = 0; zone < zones->zone_count; zone++)
  2331. get_hash_zone_statistics(&zones->zones[zone], &stats->hash_lock);
  2332. get_index_statistics(zones, &stats->index);
  2333. /*
  2334. * zones->timeouts gives the number of timeouts, and dedupe_context_busy gives the number
  2335. * of queries not made because of earlier timeouts.
  2336. */
  2337. stats->dedupe_advice_timeouts =
  2338. (atomic64_read(&zones->timeouts) + atomic64_read(&zones->dedupe_context_busy));
  2339. }
  2340. /**
  2341. * vdo_select_hash_zone() - Select the hash zone responsible for locking a given record name.
  2342. * @zones: The hash_zones from which to select.
  2343. * @name: The record name.
  2344. *
  2345. * Return: The hash zone responsible for the record name.
  2346. */
  2347. struct hash_zone *vdo_select_hash_zone(struct hash_zones *zones,
  2348. const struct uds_record_name *name)
  2349. {
  2350. /*
  2351. * Use a fragment of the record name as a hash code. Eight bits of hash should suffice
  2352. * since the number of hash zones is small.
  2353. * TODO: Verify that the first byte is independent enough.
  2354. */
  2355. u32 hash = name->name[0];
  2356. /*
  2357. * Scale the 8-bit hash fragment to a zone index by treating it as a binary fraction and
  2358. * multiplying that by the zone count. If the hash is uniformly distributed over [0 ..
  2359. * 2^8-1], then (hash * count / 2^8) should be uniformly distributed over [0 .. count-1].
  2360. * The multiply and shift is much faster than a divide (modulus) on X86 CPUs.
  2361. */
  2362. hash = (hash * zones->zone_count) >> 8;
  2363. return &zones->zones[hash];
  2364. }
  2365. /**
  2366. * dump_hash_lock() - Dump a compact description of hash_lock to the log if the lock is not on the
  2367. * free list.
  2368. * @lock: The hash lock to dump.
  2369. */
  2370. static void dump_hash_lock(const struct hash_lock *lock)
  2371. {
  2372. const char *state;
  2373. if (!list_empty(&lock->pool_node)) {
  2374. /* This lock is on the free list. */
  2375. return;
  2376. }
  2377. /*
  2378. * Necessarily cryptic since we can log a lot of these. First three chars of state is
  2379. * unambiguous. 'U' indicates a lock not registered in the map.
  2380. */
  2381. state = get_hash_lock_state_name(lock->state);
  2382. vdo_log_info(" hl %px: %3.3s %c%llu/%u rc=%u wc=%zu agt=%px",
  2383. lock, state, (lock->registered ? 'D' : 'U'),
  2384. (unsigned long long) lock->duplicate.pbn,
  2385. lock->duplicate.state, lock->reference_count,
  2386. vdo_waitq_num_waiters(&lock->waiters), lock->agent);
  2387. }
  2388. static const char *index_state_to_string(struct hash_zones *zones,
  2389. enum index_state state)
  2390. {
  2391. if (!vdo_is_state_normal(&zones->state))
  2392. return SUSPENDED;
  2393. switch (state) {
  2394. case IS_CLOSED:
  2395. return zones->error_flag ? ERROR : CLOSED;
  2396. case IS_CHANGING:
  2397. return zones->index_target == IS_OPENED ? OPENING : CLOSING;
  2398. case IS_OPENED:
  2399. return READ_ONCE(zones->dedupe_flag) ? ONLINE : OFFLINE;
  2400. default:
  2401. return UNKNOWN;
  2402. }
  2403. }
  2404. /**
  2405. * dump_hash_zone() - Dump information about a hash zone to the log for debugging.
  2406. * @zone: The zone to dump.
  2407. */
  2408. static void dump_hash_zone(const struct hash_zone *zone)
  2409. {
  2410. data_vio_count_t i;
  2411. if (zone->hash_lock_map == NULL) {
  2412. vdo_log_info("struct hash_zone %u: NULL map", zone->zone_number);
  2413. return;
  2414. }
  2415. vdo_log_info("struct hash_zone %u: mapSize=%zu",
  2416. zone->zone_number, vdo_int_map_size(zone->hash_lock_map));
  2417. for (i = 0; i < LOCK_POOL_CAPACITY; i++)
  2418. dump_hash_lock(&zone->lock_array[i]);
  2419. }
  2420. /**
  2421. * vdo_dump_hash_zones() - Dump information about the hash zones to the log for debugging.
  2422. * @zones: The zones to dump.
  2423. */
  2424. void vdo_dump_hash_zones(struct hash_zones *zones)
  2425. {
  2426. const char *state, *target;
  2427. zone_count_t zone;
  2428. spin_lock(&zones->lock);
  2429. state = index_state_to_string(zones, zones->index_state);
  2430. target = (zones->changing ? index_state_to_string(zones, zones->index_target) : NULL);
  2431. spin_unlock(&zones->lock);
  2432. vdo_log_info("UDS index: state: %s", state);
  2433. if (target != NULL)
  2434. vdo_log_info("UDS index: changing to state: %s", target);
  2435. for (zone = 0; zone < zones->zone_count; zone++)
  2436. dump_hash_zone(&zones->zones[zone]);
  2437. }
  2438. void vdo_set_dedupe_index_timeout_interval(unsigned int value)
  2439. {
  2440. u64 alb_jiffies;
  2441. /* Arbitrary maximum value is two minutes */
  2442. if (value > 120000)
  2443. value = 120000;
  2444. /* Arbitrary minimum value is 2 jiffies */
  2445. alb_jiffies = msecs_to_jiffies(value);
  2446. if (alb_jiffies < 2) {
  2447. alb_jiffies = 2;
  2448. value = jiffies_to_msecs(alb_jiffies);
  2449. }
  2450. vdo_dedupe_index_timeout_interval = value;
  2451. vdo_dedupe_index_timeout_jiffies = alb_jiffies;
  2452. }
  2453. void vdo_set_dedupe_index_min_timer_interval(unsigned int value)
  2454. {
  2455. u64 min_jiffies;
  2456. /* Arbitrary maximum value is one second */
  2457. if (value > 1000)
  2458. value = 1000;
  2459. /* Arbitrary minimum value is 2 jiffies */
  2460. min_jiffies = msecs_to_jiffies(value);
  2461. if (min_jiffies < 2) {
  2462. min_jiffies = 2;
  2463. value = jiffies_to_msecs(min_jiffies);
  2464. }
  2465. vdo_dedupe_index_min_timer_interval = value;
  2466. vdo_dedupe_index_min_timer_jiffies = min_jiffies;
  2467. }
  2468. /**
  2469. * acquire_context() - Acquire a dedupe context from a hash_zone if any are available.
  2470. * @zone: The hash zone.
  2471. *
  2472. * Return: A dedupe_context or NULL if none are available.
  2473. */
  2474. static struct dedupe_context * __must_check acquire_context(struct hash_zone *zone)
  2475. {
  2476. struct dedupe_context *context;
  2477. struct funnel_queue_entry *entry;
  2478. assert_in_hash_zone(zone, __func__);
  2479. if (!list_empty(&zone->available)) {
  2480. WRITE_ONCE(zone->active, zone->active + 1);
  2481. context = list_first_entry(&zone->available, struct dedupe_context,
  2482. list_entry);
  2483. list_del_init(&context->list_entry);
  2484. return context;
  2485. }
  2486. entry = vdo_funnel_queue_poll(zone->timed_out_complete);
  2487. return ((entry == NULL) ?
  2488. NULL : container_of(entry, struct dedupe_context, queue_entry));
  2489. }
  2490. static void prepare_uds_request(struct uds_request *request, struct data_vio *data_vio,
  2491. enum uds_request_type operation)
  2492. {
  2493. request->record_name = data_vio->record_name;
  2494. request->type = operation;
  2495. if ((operation == UDS_POST) || (operation == UDS_UPDATE)) {
  2496. size_t offset = 0;
  2497. struct uds_record_data *encoding = &request->new_metadata;
  2498. encoding->data[offset++] = UDS_ADVICE_VERSION;
  2499. encoding->data[offset++] = data_vio->new_mapped.state;
  2500. put_unaligned_le64(data_vio->new_mapped.pbn, &encoding->data[offset]);
  2501. offset += sizeof(u64);
  2502. BUG_ON(offset != UDS_ADVICE_SIZE);
  2503. }
  2504. }
  2505. /*
  2506. * The index operation will inquire about data_vio.record_name, providing (if the operation is
  2507. * appropriate) advice from the data_vio's new_mapped fields. The advice found in the index (or
  2508. * NULL if none) will be returned via receive_data_vio_dedupe_advice(). dedupe_context.status is
  2509. * set to the return status code of any asynchronous index processing.
  2510. */
  2511. static void query_index(struct data_vio *data_vio, enum uds_request_type operation)
  2512. {
  2513. int result;
  2514. struct dedupe_context *context;
  2515. struct vdo *vdo = vdo_from_data_vio(data_vio);
  2516. struct hash_zone *zone = data_vio->hash_zone;
  2517. assert_data_vio_in_hash_zone(data_vio);
  2518. if (!READ_ONCE(vdo->hash_zones->dedupe_flag)) {
  2519. continue_data_vio(data_vio);
  2520. return;
  2521. }
  2522. context = acquire_context(zone);
  2523. if (context == NULL) {
  2524. atomic64_inc(&vdo->hash_zones->dedupe_context_busy);
  2525. continue_data_vio(data_vio);
  2526. return;
  2527. }
  2528. data_vio->dedupe_context = context;
  2529. context->requestor = data_vio;
  2530. context->submission_jiffies = jiffies;
  2531. prepare_uds_request(&context->request, data_vio, operation);
  2532. atomic_set(&context->state, DEDUPE_CONTEXT_PENDING);
  2533. list_add_tail(&context->list_entry, &zone->pending);
  2534. start_expiration_timer(context);
  2535. result = uds_launch_request(&context->request);
  2536. if (result != UDS_SUCCESS) {
  2537. context->request.status = result;
  2538. finish_index_operation(&context->request);
  2539. }
  2540. }
  2541. static void set_target_state(struct hash_zones *zones, enum index_state target,
  2542. bool change_dedupe, bool dedupe, bool set_create)
  2543. {
  2544. const char *old_state, *new_state;
  2545. spin_lock(&zones->lock);
  2546. old_state = index_state_to_string(zones, zones->index_target);
  2547. if (change_dedupe)
  2548. WRITE_ONCE(zones->dedupe_flag, dedupe);
  2549. if (set_create)
  2550. zones->create_flag = true;
  2551. zones->index_target = target;
  2552. launch_dedupe_state_change(zones);
  2553. new_state = index_state_to_string(zones, zones->index_target);
  2554. spin_unlock(&zones->lock);
  2555. if (old_state != new_state)
  2556. vdo_log_info("Setting UDS index target state to %s", new_state);
  2557. }
  2558. const char *vdo_get_dedupe_index_state_name(struct hash_zones *zones)
  2559. {
  2560. const char *state;
  2561. spin_lock(&zones->lock);
  2562. state = index_state_to_string(zones, zones->index_state);
  2563. spin_unlock(&zones->lock);
  2564. return state;
  2565. }
  2566. /* Handle a dmsetup message relevant to the index. */
  2567. int vdo_message_dedupe_index(struct hash_zones *zones, const char *name)
  2568. {
  2569. if (strcasecmp(name, "index-close") == 0) {
  2570. set_target_state(zones, IS_CLOSED, false, false, false);
  2571. return 0;
  2572. } else if (strcasecmp(name, "index-create") == 0) {
  2573. set_target_state(zones, IS_OPENED, false, false, true);
  2574. return 0;
  2575. } else if (strcasecmp(name, "index-disable") == 0) {
  2576. set_target_state(zones, IS_OPENED, true, false, false);
  2577. return 0;
  2578. } else if (strcasecmp(name, "index-enable") == 0) {
  2579. set_target_state(zones, IS_OPENED, true, true, false);
  2580. return 0;
  2581. }
  2582. return -EINVAL;
  2583. }
  2584. void vdo_set_dedupe_state_normal(struct hash_zones *zones)
  2585. {
  2586. vdo_set_admin_state_code(&zones->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
  2587. }
  2588. /* If create_flag, create a new index without first attempting to load an existing index. */
  2589. void vdo_start_dedupe_index(struct hash_zones *zones, bool create_flag)
  2590. {
  2591. set_target_state(zones, IS_OPENED, true, true, create_flag);
  2592. }