ctree.c 133 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2007,2008 Oracle. All rights reserved.
  4. */
  5. #include <linux/sched.h>
  6. #include <linux/slab.h>
  7. #include <linux/rbtree.h>
  8. #include <linux/mm.h>
  9. #include <linux/error-injection.h>
  10. #include "messages.h"
  11. #include "ctree.h"
  12. #include "disk-io.h"
  13. #include "transaction.h"
  14. #include "print-tree.h"
  15. #include "locking.h"
  16. #include "volumes.h"
  17. #include "qgroup.h"
  18. #include "tree-mod-log.h"
  19. #include "tree-checker.h"
  20. #include "fs.h"
  21. #include "accessors.h"
  22. #include "extent-tree.h"
  23. #include "relocation.h"
  24. #include "file-item.h"
  25. static struct kmem_cache *btrfs_path_cachep;
  26. static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
  27. *root, struct btrfs_path *path, int level);
  28. static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  29. const struct btrfs_key *ins_key, struct btrfs_path *path,
  30. int data_size, bool extend);
  31. static int push_node_left(struct btrfs_trans_handle *trans,
  32. struct extent_buffer *dst,
  33. struct extent_buffer *src, bool empty);
  34. static int balance_node_right(struct btrfs_trans_handle *trans,
  35. struct extent_buffer *dst_buf,
  36. struct extent_buffer *src_buf);
  37. /*
  38. * The leaf data grows from end-to-front in the node. this returns the address
  39. * of the start of the last item, which is the stop of the leaf data stack.
  40. */
  41. static unsigned int leaf_data_end(const struct extent_buffer *leaf)
  42. {
  43. u32 nr = btrfs_header_nritems(leaf);
  44. if (nr == 0)
  45. return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
  46. return btrfs_item_offset(leaf, nr - 1);
  47. }
  /*
   * Move item data within @leaf (memmove semantics, overlapping ranges are
   * fine).
   *
   * @leaf:       leaf whose item data is being moved
   * @dst_offset: item data offset we're moving to
   * @src_offset: item data offset we're moving from
   * @len:        number of bytes to move
   *
   * Item data offsets are counted from the end of the leaf header rather than
   * from the start of the extent buffer. This wrapper around
   * memmove_extent_buffer() adds the header adjustment to both offsets so the
   * callers don't have to.
   */
  static inline void memmove_leaf_data(const struct extent_buffer *leaf,
  				     unsigned long dst_offset,
  				     unsigned long src_offset,
  				     unsigned long len)
  {
  	/* Offset of item slot 0, i.e. the first byte after the header. */
  	const unsigned long base = btrfs_item_nr_offset(leaf, 0);

  	memmove_extent_buffer(leaf, base + dst_offset, base + src_offset, len);
  }
  /*
   * Copy item data from leaf @src into leaf @dst.
   *
   * @dst:        destination leaf that we're copying into
   * @src:        source leaf that we're copying from
   * @dst_offset: item data offset in @dst we're copying to
   * @src_offset: item data offset in @src we're copying from
   * @len:        number of bytes to copy
   *
   * Item data offsets are counted from the end of the leaf header rather than
   * from the start of the extent buffer. This wrapper around
   * copy_extent_buffer() adds the header adjustment for each leaf so the
   * callers don't have to.
   */
  static inline void copy_leaf_data(const struct extent_buffer *dst,
  				  const struct extent_buffer *src,
  				  unsigned long dst_offset,
  				  unsigned long src_offset, unsigned long len)
  {
  	/* Offset of item slot 0 in each leaf, i.e. right after the header. */
  	const unsigned long dst_base = btrfs_item_nr_offset(dst, 0);
  	const unsigned long src_base = btrfs_item_nr_offset(src, 0);

  	copy_extent_buffer(dst, src, dst_base + dst_offset,
  			   src_base + src_offset, len);
  }
  91. /*
  92. * Move items in a @leaf (using memmove).
  93. *
  94. * @dst: destination leaf for the items
  95. * @dst_item: the item nr we're copying into
  96. * @src_item: the item nr we're copying from
  97. * @nr_items: the number of items to copy
  98. *
  99. * Wrapper around memmove_extent_buffer() that does the math to get the
  100. * appropriate offsets into the leaf from the item numbers.
  101. */
  102. static inline void memmove_leaf_items(const struct extent_buffer *leaf,
  103. int dst_item, int src_item, int nr_items)
  104. {
  105. memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
  106. btrfs_item_nr_offset(leaf, src_item),
  107. nr_items * sizeof(struct btrfs_item));
  108. }
  109. /*
  110. * Copy items from @src into @dst at the given @offset.
  111. *
  112. * @dst: destination leaf for the items
  113. * @src: source leaf for the items
  114. * @dst_item: the item nr we're copying into
  115. * @src_item: the item nr we're copying from
  116. * @nr_items: the number of items to copy
  117. *
  118. * Wrapper around copy_extent_buffer() that does the math to get the
  119. * appropriate offsets into the leaf from the item numbers.
  120. */
  121. static inline void copy_leaf_items(const struct extent_buffer *dst,
  122. const struct extent_buffer *src,
  123. int dst_item, int src_item, int nr_items)
  124. {
  125. copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
  126. btrfs_item_nr_offset(src, src_item),
  127. nr_items * sizeof(struct btrfs_item));
  128. }
  129. struct btrfs_path *btrfs_alloc_path(void)
  130. {
  131. might_sleep();
  132. return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  133. }
  134. /* this also releases the path */
  135. void btrfs_free_path(struct btrfs_path *p)
  136. {
  137. if (!p)
  138. return;
  139. btrfs_release_path(p);
  140. kmem_cache_free(btrfs_path_cachep, p);
  141. }
  142. /*
  143. * path release drops references on the extent buffers in the path
  144. * and it drops any locks held by this path
  145. *
  146. * It is safe to call this on paths that no locks or extent buffers held.
  147. */
  148. noinline void btrfs_release_path(struct btrfs_path *p)
  149. {
  150. int i;
  151. for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  152. p->slots[i] = 0;
  153. if (!p->nodes[i])
  154. continue;
  155. if (p->locks[i]) {
  156. btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
  157. p->locks[i] = 0;
  158. }
  159. free_extent_buffer(p->nodes[i]);
  160. p->nodes[i] = NULL;
  161. }
  162. }
  163. /*
  164. * safely gets a reference on the root node of a tree. A lock
  165. * is not taken, so a concurrent writer may put a different node
  166. * at the root of the tree. See btrfs_lock_root_node for the
  167. * looping required.
  168. *
  169. * The extent buffer returned by this has a reference taken, so
  170. * it won't disappear. It may stop being the root of the tree
  171. * at any time because there are no locks held.
  172. */
  173. struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
  174. {
  175. struct extent_buffer *eb;
  176. while (1) {
  177. rcu_read_lock();
  178. eb = rcu_dereference(root->node);
  179. /*
  180. * RCU really hurts here, we could free up the root node because
  181. * it was COWed but we may not get the new root node yet so do
  182. * the inc_not_zero dance and if it doesn't work then
  183. * synchronize_rcu and try again.
  184. */
  185. if (refcount_inc_not_zero(&eb->refs)) {
  186. rcu_read_unlock();
  187. break;
  188. }
  189. rcu_read_unlock();
  190. synchronize_rcu();
  191. }
  192. return eb;
  193. }
  194. /*
  195. * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
  196. * just get put onto a simple dirty list. Transaction walks this list to make
  197. * sure they get properly updated on disk.
  198. */
  199. static void add_root_to_dirty_list(struct btrfs_root *root)
  200. {
  201. struct btrfs_fs_info *fs_info = root->fs_info;
  202. if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
  203. !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
  204. return;
  205. spin_lock(&fs_info->trans_lock);
  206. if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
  207. /* Want the extent tree to be the last on the list */
  208. if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
  209. list_move_tail(&root->dirty_list,
  210. &fs_info->dirty_cowonly_roots);
  211. else
  212. list_move(&root->dirty_list,
  213. &fs_info->dirty_cowonly_roots);
  214. }
  215. spin_unlock(&fs_info->trans_lock);
  216. }
  217. /*
  218. * used by snapshot creation to make a copy of a root for a tree with
  219. * a given objectid. The buffer with the new root node is returned in
  220. * cow_ret, and this func returns zero on success or a negative error code.
  221. */
  222. int btrfs_copy_root(struct btrfs_trans_handle *trans,
  223. struct btrfs_root *root,
  224. struct extent_buffer *buf,
  225. struct extent_buffer **cow_ret, u64 new_root_objectid)
  226. {
  227. struct btrfs_fs_info *fs_info = root->fs_info;
  228. struct extent_buffer *cow;
  229. int ret = 0;
  230. int level;
  231. struct btrfs_disk_key disk_key;
  232. const bool is_reloc_root = (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID);
  233. u64 reloc_src_root = 0;
  234. WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
  235. trans->transid != fs_info->running_transaction->transid);
  236. WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
  237. trans->transid != btrfs_get_root_last_trans(root));
  238. level = btrfs_header_level(buf);
  239. if (level == 0)
  240. btrfs_item_key(buf, &disk_key, 0);
  241. else
  242. btrfs_node_key(buf, &disk_key, 0);
  243. if (is_reloc_root)
  244. reloc_src_root = btrfs_header_owner(buf);
  245. cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
  246. &disk_key, level, buf->start, 0,
  247. reloc_src_root, BTRFS_NESTING_NEW_ROOT);
  248. if (IS_ERR(cow))
  249. return PTR_ERR(cow);
  250. copy_extent_buffer_full(cow, buf);
  251. btrfs_set_header_bytenr(cow, cow->start);
  252. btrfs_set_header_generation(cow, trans->transid);
  253. btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
  254. btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
  255. BTRFS_HEADER_FLAG_RELOC);
  256. if (is_reloc_root)
  257. btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
  258. else
  259. btrfs_set_header_owner(cow, new_root_objectid);
  260. write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
  261. if (unlikely(btrfs_header_generation(buf) > trans->transid)) {
  262. btrfs_tree_unlock(cow);
  263. free_extent_buffer(cow);
  264. ret = -EUCLEAN;
  265. btrfs_abort_transaction(trans, ret);
  266. return ret;
  267. }
  268. ret = btrfs_inc_ref(trans, root, cow, is_reloc_root);
  269. if (unlikely(ret)) {
  270. btrfs_abort_transaction(trans, ret);
  271. btrfs_tree_unlock(cow);
  272. free_extent_buffer(cow);
  273. return ret;
  274. }
  275. btrfs_mark_buffer_dirty(trans, cow);
  276. *cow_ret = cow;
  277. return 0;
  278. }
  279. /*
  280. * check if the tree block can be shared by multiple trees
  281. */
  282. bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
  283. const struct btrfs_root *root,
  284. const struct extent_buffer *buf)
  285. {
  286. const u64 buf_gen = btrfs_header_generation(buf);
  287. /*
  288. * Tree blocks not in shareable trees and tree roots are never shared.
  289. * If a block was allocated after the last snapshot and the block was
  290. * not allocated by tree relocation, we know the block is not shared.
  291. */
  292. if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
  293. return false;
  294. if (buf == root->node)
  295. return false;
  296. if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
  297. !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
  298. return false;
  299. if (buf != root->commit_root)
  300. return true;
  301. /*
  302. * An extent buffer that used to be the commit root may still be shared
  303. * because the tree height may have increased and it became a child of a
  304. * higher level root. This can happen when snapshotting a subvolume
  305. * created in the current transaction.
  306. */
  307. if (buf_gen == trans->transid)
  308. return true;
  309. return false;
  310. }
  /*
   * Update the extent references affected by COWing @buf into @cow.
   *
   * @trans:    transaction handle, aborted here when an inconsistency is found
   * @root:     root the COW is performed for
   * @buf:      the block being COWed
   * @cow:      the new copy of @buf
   * @last_ref: set to 1 when @buf is not referenced more than once, otherwise
   *            left as the caller initialized it
   *
   * Return: 0 on success, -EUCLEAN (after aborting the transaction) if the
   * looked up reference count is zero or a block owned by the relocation tree
   * lacks the full backref flag, or the error returned by a failed lookup or
   * reference update.
   */
  static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
  					 struct btrfs_root *root,
  					 struct extent_buffer *buf,
  					 struct extent_buffer *cow,
  					 int *last_ref)
  {
  	struct btrfs_fs_info *fs_info = root->fs_info;
  	u64 refs;
  	u64 owner;
  	u64 flags;
  	int ret;
  	const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);

  	/*
  	 * Backrefs update rules:
  	 *
  	 * Always use full backrefs for extent pointers in a tree block
  	 * allocated by tree relocation.
  	 *
  	 * If a shared tree block is no longer referenced by its owner
  	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
  	 * use full backrefs for extent pointers in the tree block.
  	 *
  	 * If a tree block is being relocated
  	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
  	 * use full backrefs for extent pointers in the tree block.
  	 * The reason for this is that some operations (such as drop tree)
  	 * are only allowed for blocks that use full backrefs.
  	 */

  	if (btrfs_block_can_be_shared(trans, root, buf)) {
  		/* Possibly shared: fetch the real reference count and flags. */
  		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
  					       btrfs_header_level(buf), 1,
  					       &refs, &flags, NULL);
  		if (ret)
  			return ret;
  		if (unlikely(refs == 0)) {
  			btrfs_crit(fs_info,
  		"found 0 references for tree block at bytenr %llu level %d root %llu",
  				   buf->start, btrfs_header_level(buf),
  				   btrfs_root_id(root));
  			ret = -EUCLEAN;
  			btrfs_abort_transaction(trans, ret);
  			return ret;
  		}
  	} else {
  		/* Not shared: a single reference, flags derived locally. */
  		refs = 1;
  		if (is_reloc_root || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
  			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
  		else
  			flags = 0;
  	}

  	owner = btrfs_header_owner(buf);
  	/* A block owned by the relocation tree must carry the full backref flag. */
  	if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID &&
  		     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) {
  		btrfs_crit(fs_info,
  "found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set",
  			   buf->start, btrfs_header_level(buf),
  			   btrfs_root_id(root), refs, flags);
  		ret = -EUCLEAN;
  		btrfs_abort_transaction(trans, ret);
  		return ret;
  	}

  	if (refs > 1) {
  		/* @buf stays referenced by somebody else after the COW. */
  		if ((owner == btrfs_root_id(root) || is_reloc_root) &&
  		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
  			/* Convert @buf to full backrefs (see the rules above). */
  			ret = btrfs_inc_ref(trans, root, buf, true);
  			if (ret)
  				return ret;

  			if (is_reloc_root) {
  				ret = btrfs_dec_ref(trans, root, buf, false);
  				if (ret)
  					return ret;
  				ret = btrfs_inc_ref(trans, root, cow, true);
  				if (ret)
  					return ret;
  			}
  			ret = btrfs_set_disk_extent_flags(trans, buf,
  						  BTRFS_BLOCK_FLAG_FULL_BACKREF);
  			if (ret)
  				return ret;
  		} else {
  			ret = btrfs_inc_ref(trans, root, cow, is_reloc_root);
  			if (ret)
  				return ret;
  		}
  	} else {
  		/* Single reference: report it to the caller through @last_ref. */
  		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
  			ret = btrfs_inc_ref(trans, root, cow, is_reloc_root);
  			if (ret)
  				return ret;
  			ret = btrfs_dec_ref(trans, root, buf, true);
  			if (ret)
  				return ret;
  		}
  		btrfs_clear_buffer_dirty(trans, buf);
  		*last_ref = 1;
  	}
  	return 0;
  }
  /*
   * Does the dirty work in cow of a single block. The parent block (if
   * supplied) is updated to point to the new cow copy. The new buffer is marked
   * dirty and returned locked. If you modify the block it needs to be marked
   * dirty again.
   *
   * @buf must be write locked by the caller. If *@cow_ret equals @buf on entry,
   * @buf is unlocked before a successful return. On success *@cow_ret is set to
   * the new block.
   *
   * search_start -- an allocation hint for the new block
   *
   * empty_size -- a hint that you plan on doing more cow. This is the size in
   * bytes the allocator should try to find free next to the block it returns.
   * This is just a hint and may be ignored by the allocator.
   *
   * Returns 0 on success or a negative errno. When a step fails after the new
   * block has been allocated, the transaction is aborted and the new block is
   * unlocked and released; @buf is not unlocked on that path.
   */
  int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
  			    struct btrfs_root *root,
  			    struct extent_buffer *buf,
  			    struct extent_buffer *parent, int parent_slot,
  			    struct extent_buffer **cow_ret,
  			    u64 search_start, u64 empty_size,
  			    enum btrfs_lock_nesting nest)
  {
  	struct btrfs_fs_info *fs_info = root->fs_info;
  	struct btrfs_disk_key disk_key;
  	struct extent_buffer *cow;
  	int level, ret;
  	int last_ref = 0;
  	int unlock_orig = 0;
  	u64 parent_start = 0;
  	u64 reloc_src_root = 0;

  	/* Remember whether the caller wants @buf unlocked once it is replaced. */
  	if (*cow_ret == buf)
  		unlock_orig = 1;

  	btrfs_assert_tree_write_locked(buf);

  	/* For shareable roots the handle must belong to the running transaction. */
  	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
  		trans->transid != fs_info->running_transaction->transid);
  	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
  		trans->transid != btrfs_get_root_last_trans(root));

  	level = btrfs_header_level(buf);

  	/* The first key of @buf is passed to the allocator for the new block. */
  	if (level == 0)
  		btrfs_item_key(buf, &disk_key, 0);
  	else
  		btrfs_node_key(buf, &disk_key, 0);

  	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
  		if (parent)
  			parent_start = parent->start;
  		reloc_src_root = btrfs_header_owner(buf);
  	}
  	cow = btrfs_alloc_tree_block(trans, root, parent_start,
  				     btrfs_root_id(root), &disk_key, level,
  				     search_start, empty_size, reloc_src_root, nest);
  	if (IS_ERR(cow))
  		return PTR_ERR(cow);

  	/* cow is set to blocking by btrfs_init_new_buffer */

  	/* Start from a full copy of @buf, then fix up the header fields. */
  	copy_extent_buffer_full(cow, buf);
  	btrfs_set_header_bytenr(cow, cow->start);
  	btrfs_set_header_generation(cow, trans->transid);
  	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
  	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
  				     BTRFS_HEADER_FLAG_RELOC);
  	/* Relocation tree copies get the RELOC flag; others get the new owner. */
  	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
  		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
  	else
  		btrfs_set_header_owner(cow, btrfs_root_id(root));

  	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

  	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
  	if (unlikely(ret)) {
  		btrfs_abort_transaction(trans, ret);
  		goto error_unlock_cow;
  	}

  	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
  		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
  		if (unlikely(ret)) {
  			btrfs_abort_transaction(trans, ret);
  			goto error_unlock_cow;
  		}
  	}

  	if (buf == root->node) {
  		/* COWing the root node: the new block becomes root->node. */
  		WARN_ON(parent && parent != buf);
  		if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
  		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
  			parent_start = buf->start;

  		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
  		if (unlikely(ret < 0)) {
  			btrfs_abort_transaction(trans, ret);
  			goto error_unlock_cow;
  		}
  		/* Extra reference on the new block, taken before publishing it. */
  		refcount_inc(&cow->refs);
  		rcu_assign_pointer(root->node, cow);

  		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
  					    parent_start, last_ref);
  		free_extent_buffer(buf);
  		add_root_to_dirty_list(root);
  		if (unlikely(ret < 0)) {
  			btrfs_abort_transaction(trans, ret);
  			goto error_unlock_cow;
  		}
  	} else {
  		/* COWing a non-root block: repoint the parent slot at the copy. */
  		WARN_ON(trans->transid != btrfs_header_generation(parent));
  		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
  						    BTRFS_MOD_LOG_KEY_REPLACE);
  		if (unlikely(ret)) {
  			btrfs_abort_transaction(trans, ret);
  			goto error_unlock_cow;
  		}
  		btrfs_set_node_blockptr(parent, parent_slot,
  					cow->start);
  		btrfs_set_node_ptr_generation(parent, parent_slot,
  					      trans->transid);
  		btrfs_mark_buffer_dirty(trans, parent);
  		if (last_ref) {
  			ret = btrfs_tree_mod_log_free_eb(buf);
  			if (unlikely(ret)) {
  				btrfs_abort_transaction(trans, ret);
  				goto error_unlock_cow;
  			}
  		}
  		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
  					    parent_start, last_ref);
  		if (unlikely(ret < 0)) {
  			btrfs_abort_transaction(trans, ret);
  			goto error_unlock_cow;
  		}
  	}

  	trace_btrfs_cow_block(root, buf, cow);
  	if (unlock_orig)
  		btrfs_tree_unlock(buf);
  	free_extent_buffer_stale(buf);
  	btrfs_mark_buffer_dirty(trans, cow);
  	*cow_ret = cow;
  	return 0;

  error_unlock_cow:
  	/* Drop the lock and the reference held on the new block. */
  	btrfs_tree_unlock(cow);
  	free_extent_buffer(cow);
  	return ret;
  }
  542. static inline bool should_cow_block(const struct btrfs_trans_handle *trans,
  543. const struct btrfs_root *root,
  544. const struct extent_buffer *buf)
  545. {
  546. if (btrfs_is_testing(root->fs_info))
  547. return false;
  548. /*
  549. * We do not need to cow a block if
  550. * 1) this block is not created or changed in this transaction;
  551. * 2) this block does not belong to TREE_RELOC tree;
  552. * 3) the root is not forced COW.
  553. *
  554. * What is forced COW:
  555. * when we create snapshot during committing the transaction,
  556. * after we've finished copying src root, we must COW the shared
  557. * block to ensure the metadata consistency.
  558. */
  559. if (btrfs_header_generation(buf) != trans->transid)
  560. return true;
  561. if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
  562. return true;
  563. /* Ensure we can see the FORCE_COW bit. */
  564. smp_mb__before_atomic();
  565. if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
  566. return true;
  567. if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
  568. return false;
  569. if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
  570. return true;
  571. return false;
  572. }
  573. /*
  574. * COWs a single block, see btrfs_force_cow_block() for the real work.
  575. * This version of it has extra checks so that a block isn't COWed more than
  576. * once per transaction, as long as it hasn't been written yet
  577. */
  578. int btrfs_cow_block(struct btrfs_trans_handle *trans,
  579. struct btrfs_root *root, struct extent_buffer *buf,
  580. struct extent_buffer *parent, int parent_slot,
  581. struct extent_buffer **cow_ret,
  582. enum btrfs_lock_nesting nest)
  583. {
  584. struct btrfs_fs_info *fs_info = root->fs_info;
  585. u64 search_start;
  586. if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
  587. btrfs_abort_transaction(trans, -EUCLEAN);
  588. btrfs_crit(fs_info,
  589. "attempt to COW block %llu on root %llu that is being deleted",
  590. buf->start, btrfs_root_id(root));
  591. return -EUCLEAN;
  592. }
  593. /*
  594. * COWing must happen through a running transaction, which always
  595. * matches the current fs generation (it's a transaction with a state
  596. * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
  597. * into error state to prevent the commit of any transaction.
  598. */
  599. if (unlikely(trans->transaction != fs_info->running_transaction ||
  600. trans->transid != fs_info->generation)) {
  601. btrfs_abort_transaction(trans, -EUCLEAN);
  602. btrfs_crit(fs_info,
  603. "unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
  604. buf->start, btrfs_root_id(root), trans->transid,
  605. fs_info->running_transaction->transid,
  606. fs_info->generation);
  607. return -EUCLEAN;
  608. }
  609. if (!should_cow_block(trans, root, buf)) {
  610. *cow_ret = buf;
  611. return 0;
  612. }
  613. search_start = round_down(buf->start, SZ_1G);
  614. /*
  615. * Before CoWing this block for later modification, check if it's
  616. * the subtree root and do the delayed subtree trace if needed.
  617. *
  618. * Also We don't care about the error, as it's handled internally.
  619. */
  620. btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
  621. return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
  622. cow_ret, search_start, 0, nest);
  623. }
  624. ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
  625. /*
  626. * same as comp_keys only with two btrfs_key's
  627. */
  628. int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
  629. {
  630. if (k1->objectid > k2->objectid)
  631. return 1;
  632. if (k1->objectid < k2->objectid)
  633. return -1;
  634. if (k1->type > k2->type)
  635. return 1;
  636. if (k1->type < k2->type)
  637. return -1;
  638. if (k1->offset > k2->offset)
  639. return 1;
  640. if (k1->offset < k2->offset)
  641. return -1;
  642. return 0;
  643. }
  644. /*
  645. * Search for a key in the given extent_buffer.
  646. *
  647. * The lower boundary for the search is specified by the slot number @first_slot.
  648. * Use a value of 0 to search over the whole extent buffer. Works for both
  649. * leaves and nodes.
  650. *
  651. * The slot in the extent buffer is returned via @slot. If the key exists in the
  652. * extent buffer, then @slot will point to the slot where the key is, otherwise
  653. * it points to the slot where you would insert the key.
  654. *
  655. * Slot may point to the total number of items (i.e. one position beyond the last
  656. * key) if the key is bigger than the last key in the extent buffer.
  657. */
  658. int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
  659. const struct btrfs_key *key, int *slot)
  660. {
  661. unsigned long p;
  662. int item_size;
  663. /*
  664. * Use unsigned types for the low and high slots, so that we get a more
  665. * efficient division in the search loop below.
  666. */
  667. u32 low = first_slot;
  668. u32 high = btrfs_header_nritems(eb);
  669. int ret;
  670. const int key_size = sizeof(struct btrfs_disk_key);
  671. if (unlikely(low > high)) {
  672. btrfs_err(eb->fs_info,
  673. "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
  674. __func__, low, high, eb->start,
  675. btrfs_header_owner(eb), btrfs_header_level(eb));
  676. return -EINVAL;
  677. }
  678. if (btrfs_header_level(eb) == 0) {
  679. p = offsetof(struct btrfs_leaf, items);
  680. item_size = sizeof(struct btrfs_item);
  681. } else {
  682. p = offsetof(struct btrfs_node, ptrs);
  683. item_size = sizeof(struct btrfs_key_ptr);
  684. }
  685. while (low < high) {
  686. const int unit_size = eb->folio_size;
  687. unsigned long oil;
  688. unsigned long offset;
  689. struct btrfs_disk_key *tmp;
  690. struct btrfs_disk_key unaligned;
  691. int mid;
  692. mid = (low + high) / 2;
  693. offset = p + mid * item_size;
  694. oil = get_eb_offset_in_folio(eb, offset);
  695. if (oil + key_size <= unit_size) {
  696. const unsigned long idx = get_eb_folio_index(eb, offset);
  697. char *kaddr = folio_address(eb->folios[idx]);
  698. oil = get_eb_offset_in_folio(eb, offset);
  699. tmp = (struct btrfs_disk_key *)(kaddr + oil);
  700. } else {
  701. read_extent_buffer(eb, &unaligned, offset, key_size);
  702. tmp = &unaligned;
  703. }
  704. ret = btrfs_comp_keys(tmp, key);
  705. if (ret < 0)
  706. low = mid + 1;
  707. else if (ret > 0)
  708. high = mid;
  709. else {
  710. *slot = mid;
  711. return 0;
  712. }
  713. }
  714. *slot = low;
  715. return 1;
  716. }
  717. static void root_add_used_bytes(struct btrfs_root *root)
  718. {
  719. spin_lock(&root->accounting_lock);
  720. btrfs_set_root_used(&root->root_item,
  721. btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
  722. spin_unlock(&root->accounting_lock);
  723. }
  724. static void root_sub_used_bytes(struct btrfs_root *root)
  725. {
  726. spin_lock(&root->accounting_lock);
  727. btrfs_set_root_used(&root->root_item,
  728. btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
  729. spin_unlock(&root->accounting_lock);
  730. }
  731. /* given a node and slot number, this reads the blocks it points to. The
  732. * extent buffer is returned with a reference taken (but unlocked).
  733. */
  734. struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
  735. int slot)
  736. {
  737. int level = btrfs_header_level(parent);
  738. struct btrfs_tree_parent_check check = { 0 };
  739. struct extent_buffer *eb;
  740. if (slot < 0 || slot >= btrfs_header_nritems(parent))
  741. return ERR_PTR(-ENOENT);
  742. ASSERT(level);
  743. check.level = level - 1;
  744. check.transid = btrfs_node_ptr_generation(parent, slot);
  745. check.owner_root = btrfs_header_owner(parent);
  746. check.has_first_key = true;
  747. btrfs_node_key_to_cpu(parent, &check.first_key, slot);
  748. eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
  749. &check);
  750. if (IS_ERR(eb))
  751. return eb;
  752. if (unlikely(!extent_buffer_uptodate(eb))) {
  753. free_extent_buffer(eb);
  754. return ERR_PTR(-EIO);
  755. }
  756. return eb;
  757. }
  /*
   * Promote a child node to become the new tree root.
   *
   * @trans:	Transaction handle
   * @root:	Tree root structure to update
   * @path:	Path holding nodes and locks
   * @level:	Level of the parent (old root)
   * @parent:	The parent (old root) with exactly one item
   *
   * This helper is called during rebalancing when the root node contains only
   * a single item (nritems == 1). We can reduce the tree height by promoting
   * that child to become the new root and freeing the old root node. The path
   * locks and references are updated accordingly.
   *
   * Return: 0 on success, negative errno on failure. The transaction is aborted
   * on critical errors.
   */
  static int promote_child_to_root(struct btrfs_trans_handle *trans,
  				   struct btrfs_root *root, struct btrfs_path *path,
  				   int level, struct extent_buffer *parent)
  {
  	struct extent_buffer *child;
  	int ret;

  	ASSERT(btrfs_header_nritems(parent) == 1);

  	/* Read, lock and COW the only child of the old root. */
  	child = btrfs_read_node_slot(parent, 0);
  	if (IS_ERR(child))
  		return PTR_ERR(child);

  	btrfs_tree_lock(child);
  	ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW);
  	if (ret) {
  		btrfs_tree_unlock(child);
  		free_extent_buffer(child);
  		return ret;
  	}

  	/* Log the root replacement, then publish the child as the new root. */
  	ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
  	if (unlikely(ret < 0)) {
  		btrfs_tree_unlock(child);
  		free_extent_buffer(child);
  		btrfs_abort_transaction(trans, ret);
  		return ret;
  	}
  	rcu_assign_pointer(root->node, child);

  	add_root_to_dirty_list(root);
  	btrfs_tree_unlock(child);

  	/* The old root leaves the path: clear its slot and drop its lock. */
  	path->locks[level] = 0;
  	path->nodes[level] = NULL;
  	btrfs_clear_buffer_dirty(trans, parent);
  	btrfs_tree_unlock(parent);
  	/* Once for the path. */
  	free_extent_buffer(parent);

  	/* One tree block less in use by this root. */
  	root_sub_used_bytes(root);
  	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1);
  	/* Once for the root ptr. */
  	free_extent_buffer_stale(parent);
  	if (unlikely(ret < 0)) {
  		btrfs_abort_transaction(trans, ret);
  		return ret;
  	}

  	return 0;
  }
  /*
   * Node level balancing, used to make sure nodes are in proper order for
   * item deletion. We balance from the top down, so we have to make sure
   * that a deletion won't leave a node completely empty later on.
   *
   * The node balanced ("mid") is path->nodes[@level]; @level must be above 0.
   * If mid has no parent in the path, the only action taken is to promote its
   * single child to be the root when mid holds exactly one pointer. Otherwise
   * nothing is done unless mid holds at most a quarter of
   * BTRFS_NODEPTRS_PER_BLOCK() pointers; in that case the left and right
   * siblings are read, locked and COWed, pointers are moved between the three
   * nodes, emptied nodes are removed from the parent and freed, and the path
   * is updated to keep pointing at the original block pointer.
   *
   * Returns the final value of ret: 0 when no step failed, otherwise the error
   * of a failed step.
   *
   * NOTE(review): an error stored by "ret = wret" after one of the
   * push_node_left() calls can be overwritten by a later assignment to ret
   * (btrfs_del_ptr(), btrfs_tree_mod_log_insert_key()) before the function
   * returns - confirm this is intended.
   */
  static noinline int balance_level(struct btrfs_trans_handle *trans,
  				    struct btrfs_root *root,
  				    struct btrfs_path *path, int level)
  {
  	struct btrfs_fs_info *fs_info = root->fs_info;
  	struct extent_buffer *right = NULL;
  	struct extent_buffer *mid;
  	struct extent_buffer *left = NULL;
  	struct extent_buffer *parent = NULL;
  	int ret = 0;
  	int wret;
  	int pslot;
  	int orig_slot = path->slots[level];
  	u64 orig_ptr;

  	ASSERT(level > 0);

  	mid = path->nodes[level];

  	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
  	WARN_ON(btrfs_header_generation(mid) != trans->transid);

  	/* Remembered so the path can be verified after the rebalancing. */
  	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

  	/* pslot is only assigned, and only used, when there is a parent. */
  	if (level < BTRFS_MAX_LEVEL - 1) {
  		parent = path->nodes[level + 1];
  		pslot = path->slots[level + 1];
  	}

  	/*
  	 * deal with the case where there is only one pointer in the root
  	 * by promoting the node below to a root
  	 */
  	if (!parent) {
  		if (btrfs_header_nritems(mid) != 1)
  			return 0;

  		return promote_child_to_root(trans, root, path, level, mid);
  	}
  	/* Nothing to do while mid is more than a quarter full. */
  	if (btrfs_header_nritems(mid) >
  	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
  		return 0;

  	/* Read, lock and COW the left sibling, if there is one. */
  	if (pslot) {
  		left = btrfs_read_node_slot(parent, pslot - 1);
  		if (IS_ERR(left)) {
  			ret = PTR_ERR(left);
  			left = NULL;
  			goto out;
  		}

  		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
  		wret = btrfs_cow_block(trans, root, left,
  				       parent, pslot - 1, &left,
  				       BTRFS_NESTING_LEFT_COW);
  		if (wret) {
  			ret = wret;
  			goto out;
  		}
  	}

  	/* Same for the right sibling. */
  	if (pslot + 1 < btrfs_header_nritems(parent)) {
  		right = btrfs_read_node_slot(parent, pslot + 1);
  		if (IS_ERR(right)) {
  			ret = PTR_ERR(right);
  			right = NULL;
  			goto out;
  		}

  		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
  		wret = btrfs_cow_block(trans, root, right,
  				       parent, pslot + 1, &right,
  				       BTRFS_NESTING_RIGHT_COW);
  		if (wret) {
  			ret = wret;
  			goto out;
  		}
  	}

  	/* first, try to make some room in the middle buffer */
  	if (left) {
  		/* Express orig_slot relative to left and mid taken together. */
  		orig_slot += btrfs_header_nritems(left);
  		wret = push_node_left(trans, left, mid, 1);
  		if (wret < 0)
  			ret = wret;
  	}

  	/*
  	 * then try to empty the right most buffer into the middle
  	 */
  	if (right) {
  		wret = push_node_left(trans, mid, right, 1);
  		if (wret < 0 && wret != -ENOSPC)
  			ret = wret;
  		if (btrfs_header_nritems(right) == 0) {
  			/* right is empty now: remove it from the parent and free it. */
  			btrfs_clear_buffer_dirty(trans, right);
  			btrfs_tree_unlock(right);
  			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
  			if (ret < 0) {
  				free_extent_buffer_stale(right);
  				right = NULL;
  				goto out;
  			}
  			root_sub_used_bytes(root);
  			ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
  						    right, 0, 1);
  			free_extent_buffer_stale(right);
  			right = NULL;
  			if (unlikely(ret < 0)) {
  				btrfs_abort_transaction(trans, ret);
  				goto out;
  			}
  		} else {
  			/* right still has pointers: refresh its key in the parent. */
  			struct btrfs_disk_key right_key;
  			btrfs_node_key(right, &right_key, 0);
  			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
  					BTRFS_MOD_LOG_KEY_REPLACE);
  			if (unlikely(ret < 0)) {
  				btrfs_abort_transaction(trans, ret);
  				goto out;
  			}
  			btrfs_set_node_key(parent, &right_key, pslot + 1);
  			btrfs_mark_buffer_dirty(trans, parent);
  		}
  	}
  	if (btrfs_header_nritems(mid) == 1) {
  		/*
  		 * we're not allowed to leave a node with one item in the
  		 * tree during a delete.  A deletion from lower in the tree
  		 * could try to delete the only pointer in this node.
  		 * So, pull some keys from the left.
  		 * There has to be a left pointer at this point because
  		 * otherwise we would have pulled some pointers from the
  		 * right
  		 */
  		if (unlikely(!left)) {
  			btrfs_crit(fs_info,
  "missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
  				   parent->start, btrfs_header_level(parent),
  				   mid->start, btrfs_root_id(root));
  			ret = -EUCLEAN;
  			btrfs_abort_transaction(trans, ret);
  			goto out;
  		}
  		wret = balance_node_right(trans, mid, left);
  		if (wret < 0) {
  			ret = wret;
  			goto out;
  		}
  		/* If that returned 1, push mid's pointers into left instead. */
  		if (wret == 1) {
  			wret = push_node_left(trans, left, mid, 1);
  			if (wret < 0)
  				ret = wret;
  		}
  		BUG_ON(wret == 1);
  	}
  	if (btrfs_header_nritems(mid) == 0) {
  		/* mid is empty now: remove it from the parent and free it. */
  		btrfs_clear_buffer_dirty(trans, mid);
  		btrfs_tree_unlock(mid);
  		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
  		if (ret < 0) {
  			free_extent_buffer_stale(mid);
  			mid = NULL;
  			goto out;
  		}
  		root_sub_used_bytes(root);
  		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
  		free_extent_buffer_stale(mid);
  		mid = NULL;
  		if (unlikely(ret < 0)) {
  			btrfs_abort_transaction(trans, ret);
  			goto out;
  		}
  	} else {
  		/* update the parent key to reflect our changes */
  		struct btrfs_disk_key mid_key;
  		btrfs_node_key(mid, &mid_key, 0);
  		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
  						    BTRFS_MOD_LOG_KEY_REPLACE);
  		if (unlikely(ret < 0)) {
  			btrfs_abort_transaction(trans, ret);
  			goto out;
  		}
  		btrfs_set_node_key(parent, &mid_key, pslot);
  		btrfs_mark_buffer_dirty(trans, parent);
  	}

  	/* update the path */
  	if (left) {
  		if (btrfs_header_nritems(left) > orig_slot) {
  			/* The original pointer now lives in left. */
  			/* left was locked after cow */
  			path->nodes[level] = left;
  			path->slots[level + 1] -= 1;
  			path->slots[level] = orig_slot;
  			/* Left is now owned by path. */
  			left = NULL;
  			if (mid) {
  				btrfs_tree_unlock(mid);
  				free_extent_buffer(mid);
  			}
  		} else {
  			/* Still in mid: make the slot relative to mid again. */
  			orig_slot -= btrfs_header_nritems(left);
  			path->slots[level] = orig_slot;
  		}
  	}
  	/* double check we haven't messed things up */
  	if (orig_ptr !=
  	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
  		BUG();
  out:
  	/* Release whichever siblings are still held here. */
  	if (right) {
  		btrfs_tree_unlock(right);
  		free_extent_buffer(right);
  	}
  	if (left) {
  		btrfs_tree_unlock(left);
  		free_extent_buffer(left);
  	}
  	return ret;
  }
  /*
   * Node balancing for insertion. Here we only split or push nodes around
   * when they are completely full. This is also done top down, so we
   * have to be pessimistic.
   *
   * The node worked on ("mid") is path->nodes[@level]. The left sibling is
   * tried first, then the right one; a sibling that already holds
   * BTRFS_NODEPTRS_PER_BLOCK() - 1 or more pointers is not used.
   *
   * Returns 0 if pointers were moved into a sibling (the parent key and the
   * path have been updated), 1 if nothing was moved (level 0, no parent, or
   * neither sibling could be used), or a negative errno if reading a sibling
   * or logging the parent key replacement failed.
   *
   * NOTE(review): a negative wret from push_node_left() or
   * balance_node_right() is stored in ret, and a failing btrfs_cow_block()
   * is turned into wret = 1, but no later statement returns those values -
   * the function falls through and ends up returning 1. Confirm this is
   * intended.
   */
  static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
  					    struct btrfs_root *root,
  					    struct btrfs_path *path, int level)
  {
  	struct btrfs_fs_info *fs_info = root->fs_info;
  	struct extent_buffer *right = NULL;
  	struct extent_buffer *mid;
  	struct extent_buffer *left = NULL;
  	struct extent_buffer *parent = NULL;
  	int ret = 0;
  	int wret;
  	int pslot;
  	int orig_slot = path->slots[level];

  	if (level == 0)
  		return 1;

  	mid = path->nodes[level];
  	WARN_ON(btrfs_header_generation(mid) != trans->transid);

  	/* pslot is only assigned, and only used, when there is a parent. */
  	if (level < BTRFS_MAX_LEVEL - 1) {
  		parent = path->nodes[level + 1];
  		pslot = path->slots[level + 1];
  	}

  	if (!parent)
  		return 1;

  	/* first, try to make some room in the middle buffer */
  	if (pslot) {
  		u32 left_nr;

  		left = btrfs_read_node_slot(parent, pslot - 1);
  		if (IS_ERR(left))
  			return PTR_ERR(left);

  		btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);

  		left_nr = btrfs_header_nritems(left);
  		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
  			/* left is (almost) full: do not push into it. */
  			wret = 1;
  		} else {
  			ret = btrfs_cow_block(trans, root, left, parent,
  					      pslot - 1, &left,
  					      BTRFS_NESTING_LEFT_COW);
  			if (ret)
  				wret = 1;
  			else {
  				wret = push_node_left(trans, left, mid, 0);
  			}
  		}
  		if (wret < 0)
  			ret = wret;
  		if (wret == 0) {
  			struct btrfs_disk_key disk_key;
  			/* Express orig_slot relative to left and mid taken together. */
  			orig_slot += left_nr;
  			/* mid's first key changed: refresh it in the parent. */
  			btrfs_node_key(mid, &disk_key, 0);
  			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
  					BTRFS_MOD_LOG_KEY_REPLACE);
  			if (unlikely(ret < 0)) {
  				btrfs_tree_unlock(left);
  				free_extent_buffer(left);
  				btrfs_abort_transaction(trans, ret);
  				return ret;
  			}
  			btrfs_set_node_key(parent, &disk_key, pslot);
  			btrfs_mark_buffer_dirty(trans, parent);
  			if (btrfs_header_nritems(left) > orig_slot) {
  				/* The slot moved into left: left replaces mid in the path. */
  				path->nodes[level] = left;
  				path->slots[level + 1] -= 1;
  				path->slots[level] = orig_slot;
  				btrfs_tree_unlock(mid);
  				free_extent_buffer(mid);
  			} else {
  				/* The slot stayed in mid: make it relative to mid again. */
  				orig_slot -=
  					btrfs_header_nritems(left);
  				path->slots[level] = orig_slot;
  				btrfs_tree_unlock(left);
  				free_extent_buffer(left);
  			}
  			return 0;
  		}
  		btrfs_tree_unlock(left);
  		free_extent_buffer(left);
  	}

  	/*
  	 * then try to empty the right most buffer into the middle
  	 */
  	if (pslot + 1 < btrfs_header_nritems(parent)) {
  		u32 right_nr;

  		right = btrfs_read_node_slot(parent, pslot + 1);
  		if (IS_ERR(right))
  			return PTR_ERR(right);

  		btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);

  		right_nr = btrfs_header_nritems(right);
  		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
  			/* right is (almost) full: do not push into it. */
  			wret = 1;
  		} else {
  			ret = btrfs_cow_block(trans, root, right,
  					      parent, pslot + 1,
  					      &right, BTRFS_NESTING_RIGHT_COW);
  			if (ret)
  				wret = 1;
  			else {
  				wret = balance_node_right(trans, right, mid);
  			}
  		}
  		if (wret < 0)
  			ret = wret;
  		if (wret == 0) {
  			struct btrfs_disk_key disk_key;

  			/* right's first key changed: refresh it in the parent. */
  			btrfs_node_key(right, &disk_key, 0);
  			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
  					BTRFS_MOD_LOG_KEY_REPLACE);
  			if (unlikely(ret < 0)) {
  				btrfs_tree_unlock(right);
  				free_extent_buffer(right);
  				btrfs_abort_transaction(trans, ret);
  				return ret;
  			}
  			btrfs_set_node_key(parent, &disk_key, pslot + 1);
  			btrfs_mark_buffer_dirty(trans, parent);

  			if (btrfs_header_nritems(mid) <= orig_slot) {
  				/* The slot moved into right: right replaces mid in the path. */
  				path->nodes[level] = right;
  				path->slots[level + 1] += 1;
  				path->slots[level] = orig_slot -
  					btrfs_header_nritems(mid);
  				btrfs_tree_unlock(mid);
  				free_extent_buffer(mid);
  			} else {
  				btrfs_tree_unlock(right);
  				free_extent_buffer(right);
  			}
  			return 0;
  		}
  		btrfs_tree_unlock(right);
  		free_extent_buffer(right);
  	}
  	return 1;
  }
  1165. /*
  1166. * readahead one full node of leaves, finding things that are close
  1167. * to the block in 'slot', and triggering ra on them.
  1168. */
  1169. static void reada_for_search(struct btrfs_fs_info *fs_info,
  1170. const struct btrfs_path *path,
  1171. int level, int slot, u64 objectid)
  1172. {
  1173. struct extent_buffer *node;
  1174. struct btrfs_disk_key disk_key;
  1175. u32 nritems;
  1176. u64 search;
  1177. u64 target;
  1178. u64 nread = 0;
  1179. u64 nread_max;
  1180. u32 nr;
  1181. u32 blocksize;
  1182. u32 nscan = 0;
  1183. if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
  1184. return;
  1185. if (!path->nodes[level])
  1186. return;
  1187. node = path->nodes[level];
  1188. /*
  1189. * Since the time between visiting leaves is much shorter than the time
  1190. * between visiting nodes, limit read ahead of nodes to 1, to avoid too
  1191. * much IO at once (possibly random).
  1192. */
  1193. if (path->reada == READA_FORWARD_ALWAYS) {
  1194. if (level > 1)
  1195. nread_max = node->fs_info->nodesize;
  1196. else
  1197. nread_max = SZ_128K;
  1198. } else {
  1199. nread_max = SZ_64K;
  1200. }
  1201. search = btrfs_node_blockptr(node, slot);
  1202. blocksize = fs_info->nodesize;
  1203. if (path->reada != READA_FORWARD_ALWAYS) {
  1204. struct extent_buffer *eb;
  1205. eb = find_extent_buffer(fs_info, search);
  1206. if (eb) {
  1207. free_extent_buffer(eb);
  1208. return;
  1209. }
  1210. }
  1211. target = search;
  1212. nritems = btrfs_header_nritems(node);
  1213. nr = slot;
  1214. while (1) {
  1215. if (path->reada == READA_BACK) {
  1216. if (nr == 0)
  1217. break;
  1218. nr--;
  1219. } else if (path->reada == READA_FORWARD ||
  1220. path->reada == READA_FORWARD_ALWAYS) {
  1221. nr++;
  1222. if (nr >= nritems)
  1223. break;
  1224. }
  1225. if (path->reada == READA_BACK && objectid) {
  1226. btrfs_node_key(node, &disk_key, nr);
  1227. if (btrfs_disk_key_objectid(&disk_key) != objectid)
  1228. break;
  1229. }
  1230. search = btrfs_node_blockptr(node, nr);
  1231. if (path->reada == READA_FORWARD_ALWAYS ||
  1232. (search <= target && target - search <= 65536) ||
  1233. (search > target && search - target <= 65536)) {
  1234. btrfs_readahead_node_child(node, nr);
  1235. nread += blocksize;
  1236. }
  1237. nscan++;
  1238. if (nread > nread_max || nscan > 32)
  1239. break;
  1240. }
  1241. }
  1242. static noinline void reada_for_balance(const struct btrfs_path *path, int level)
  1243. {
  1244. struct extent_buffer *parent;
  1245. int slot;
  1246. int nritems;
  1247. parent = path->nodes[level + 1];
  1248. if (!parent)
  1249. return;
  1250. nritems = btrfs_header_nritems(parent);
  1251. slot = path->slots[level + 1];
  1252. if (slot > 0)
  1253. btrfs_readahead_node_child(parent, slot - 1);
  1254. if (slot + 1 < nritems)
  1255. btrfs_readahead_node_child(parent, slot + 1);
  1256. }
  1257. /*
  1258. * when we walk down the tree, it is usually safe to unlock the higher layers
  1259. * in the tree. The exceptions are when our path goes through slot 0, because
  1260. * operations on the tree might require changing key pointers higher up in the
  1261. * tree.
  1262. *
  1263. * callers might also have set path->keep_locks, which tells this code to keep
  1264. * the lock if the path points to the last slot in the block. This is part of
  1265. * walking through the tree, and selecting the next slot in the higher block.
  1266. *
  1267. * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
  1268. * if lowest_unlock is 1, level 0 won't be unlocked
  1269. */
  1270. static noinline void unlock_up(struct btrfs_path *path, int level,
  1271. int lowest_unlock, int min_write_lock_level,
  1272. int *write_lock_level)
  1273. {
  1274. int i;
  1275. int skip_level = level;
  1276. bool check_skip = true;
  1277. for (i = level; i < BTRFS_MAX_LEVEL; i++) {
  1278. if (!path->nodes[i])
  1279. break;
  1280. if (!path->locks[i])
  1281. break;
  1282. if (check_skip) {
  1283. if (path->slots[i] == 0) {
  1284. skip_level = i + 1;
  1285. continue;
  1286. }
  1287. if (path->keep_locks) {
  1288. u32 nritems;
  1289. nritems = btrfs_header_nritems(path->nodes[i]);
  1290. if (nritems < 1 || path->slots[i] >= nritems - 1) {
  1291. skip_level = i + 1;
  1292. continue;
  1293. }
  1294. }
  1295. }
  1296. if (i >= lowest_unlock && i > skip_level) {
  1297. btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
  1298. check_skip = false;
  1299. path->locks[i] = 0;
  1300. if (write_lock_level &&
  1301. i > min_write_lock_level &&
  1302. i <= *write_lock_level) {
  1303. *write_lock_level = i - 1;
  1304. }
  1305. }
  1306. }
  1307. }
  1308. /*
  1309. * Helper function for btrfs_search_slot() and other functions that do a search
  1310. * on a btree. The goal is to find a tree block in the cache (the radix tree at
  1311. * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
  1312. * its pages from disk.
  1313. *
  1314. * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
  1315. * whole btree search, starting again from the current root node.
  1316. */
  1317. static int
  1318. read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
  1319. struct extent_buffer **eb_ret, int slot,
  1320. const struct btrfs_key *key)
  1321. {
  1322. struct btrfs_fs_info *fs_info = root->fs_info;
  1323. struct btrfs_tree_parent_check check = { 0 };
  1324. u64 blocknr;
  1325. struct extent_buffer *tmp = NULL;
  1326. int ret = 0;
  1327. int ret2;
  1328. int parent_level;
  1329. bool read_tmp = false;
  1330. bool tmp_locked = false;
  1331. bool path_released = false;
  1332. blocknr = btrfs_node_blockptr(*eb_ret, slot);
  1333. parent_level = btrfs_header_level(*eb_ret);
  1334. btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
  1335. check.has_first_key = true;
  1336. check.level = parent_level - 1;
  1337. check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
  1338. check.owner_root = btrfs_root_id(root);
  1339. /*
  1340. * If we need to read an extent buffer from disk and we are holding locks
  1341. * on upper level nodes, we unlock all the upper nodes before reading the
  1342. * extent buffer, and then return -EAGAIN to the caller as it needs to
  1343. * restart the search. We don't release the lock on the current level
  1344. * because we need to walk this node to figure out which blocks to read.
  1345. */
  1346. tmp = find_extent_buffer(fs_info, blocknr);
  1347. if (tmp) {
  1348. if (p->reada == READA_FORWARD_ALWAYS)
  1349. reada_for_search(fs_info, p, parent_level, slot, key->objectid);
  1350. /* first we do an atomic uptodate check */
  1351. if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) {
  1352. /*
  1353. * Do extra check for first_key, eb can be stale due to
  1354. * being cached, read from scrub, or have multiple
  1355. * parents (shared tree blocks).
  1356. */
  1357. if (unlikely(btrfs_verify_level_key(tmp, &check))) {
  1358. ret = -EUCLEAN;
  1359. goto out;
  1360. }
  1361. *eb_ret = tmp;
  1362. tmp = NULL;
  1363. ret = 0;
  1364. goto out;
  1365. }
  1366. if (p->nowait) {
  1367. ret = -EAGAIN;
  1368. goto out;
  1369. }
  1370. if (!p->skip_locking) {
  1371. btrfs_unlock_up_safe(p, parent_level + 1);
  1372. btrfs_maybe_reset_lockdep_class(root, tmp);
  1373. tmp_locked = true;
  1374. btrfs_tree_read_lock(tmp);
  1375. btrfs_release_path(p);
  1376. ret = -EAGAIN;
  1377. path_released = true;
  1378. }
  1379. /* Now we're allowed to do a blocking uptodate check. */
  1380. ret2 = btrfs_read_extent_buffer(tmp, &check);
  1381. if (ret2) {
  1382. ret = ret2;
  1383. goto out;
  1384. }
  1385. if (ret == 0) {
  1386. ASSERT(!tmp_locked);
  1387. *eb_ret = tmp;
  1388. tmp = NULL;
  1389. }
  1390. goto out;
  1391. } else if (p->nowait) {
  1392. ret = -EAGAIN;
  1393. goto out;
  1394. }
  1395. if (!p->skip_locking) {
  1396. btrfs_unlock_up_safe(p, parent_level + 1);
  1397. ret = -EAGAIN;
  1398. }
  1399. if (p->reada != READA_NONE)
  1400. reada_for_search(fs_info, p, parent_level, slot, key->objectid);
  1401. tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
  1402. if (IS_ERR(tmp)) {
  1403. ret = PTR_ERR(tmp);
  1404. tmp = NULL;
  1405. goto out;
  1406. }
  1407. read_tmp = true;
  1408. if (!p->skip_locking) {
  1409. ASSERT(ret == -EAGAIN);
  1410. btrfs_maybe_reset_lockdep_class(root, tmp);
  1411. tmp_locked = true;
  1412. btrfs_tree_read_lock(tmp);
  1413. btrfs_release_path(p);
  1414. path_released = true;
  1415. }
  1416. /* Now we're allowed to do a blocking uptodate check. */
  1417. ret2 = btrfs_read_extent_buffer(tmp, &check);
  1418. if (ret2) {
  1419. ret = ret2;
  1420. goto out;
  1421. }
  1422. /*
  1423. * If the read above didn't mark this buffer up to date,
  1424. * it will never end up being up to date. Set ret to EIO now
  1425. * and give up so that our caller doesn't loop forever
  1426. * on our EAGAINs.
  1427. */
  1428. if (unlikely(!extent_buffer_uptodate(tmp))) {
  1429. ret = -EIO;
  1430. goto out;
  1431. }
  1432. if (ret == 0) {
  1433. ASSERT(!tmp_locked);
  1434. *eb_ret = tmp;
  1435. tmp = NULL;
  1436. }
  1437. out:
  1438. if (tmp) {
  1439. if (tmp_locked)
  1440. btrfs_tree_read_unlock(tmp);
  1441. if (read_tmp && ret && ret != -EAGAIN)
  1442. free_extent_buffer_stale(tmp);
  1443. else
  1444. free_extent_buffer(tmp);
  1445. }
  1446. if (ret && !path_released)
  1447. btrfs_release_path(p);
  1448. return ret;
  1449. }
  1450. /*
  1451. * helper function for btrfs_search_slot. This does all of the checks
  1452. * for node-level blocks and does any balancing required based on
  1453. * the ins_len.
  1454. *
  1455. * If no extra work was required, zero is returned. If we had to
  1456. * drop the path, -EAGAIN is returned and btrfs_search_slot must
  1457. * start over
  1458. */
  1459. static int
  1460. setup_nodes_for_search(struct btrfs_trans_handle *trans,
  1461. struct btrfs_root *root, struct btrfs_path *p,
  1462. struct extent_buffer *b, int level, int ins_len,
  1463. int *write_lock_level)
  1464. {
  1465. struct btrfs_fs_info *fs_info = root->fs_info;
  1466. int ret = 0;
  1467. if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
  1468. BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
  1469. if (*write_lock_level < level + 1) {
  1470. *write_lock_level = level + 1;
  1471. btrfs_release_path(p);
  1472. return -EAGAIN;
  1473. }
  1474. reada_for_balance(p, level);
  1475. ret = split_node(trans, root, p, level);
  1476. b = p->nodes[level];
  1477. } else if (ins_len < 0 && btrfs_header_nritems(b) <
  1478. BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
  1479. if (*write_lock_level < level + 1) {
  1480. *write_lock_level = level + 1;
  1481. btrfs_release_path(p);
  1482. return -EAGAIN;
  1483. }
  1484. reada_for_balance(p, level);
  1485. ret = balance_level(trans, root, p, level);
  1486. if (ret)
  1487. return ret;
  1488. b = p->nodes[level];
  1489. if (!b) {
  1490. btrfs_release_path(p);
  1491. return -EAGAIN;
  1492. }
  1493. BUG_ON(btrfs_header_nritems(b) == 1);
  1494. }
  1495. return ret;
  1496. }
  1497. int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
  1498. u64 iobjectid, u64 ioff, u8 key_type,
  1499. struct btrfs_key *found_key)
  1500. {
  1501. int ret;
  1502. struct btrfs_key key;
  1503. struct extent_buffer *eb;
  1504. ASSERT(path);
  1505. ASSERT(found_key);
  1506. key.type = key_type;
  1507. key.objectid = iobjectid;
  1508. key.offset = ioff;
  1509. ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
  1510. if (ret < 0)
  1511. return ret;
  1512. eb = path->nodes[0];
  1513. if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
  1514. ret = btrfs_next_leaf(fs_root, path);
  1515. if (ret)
  1516. return ret;
  1517. eb = path->nodes[0];
  1518. }
  1519. btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
  1520. if (found_key->type != key.type ||
  1521. found_key->objectid != key.objectid)
  1522. return 1;
  1523. return 0;
  1524. }
  1525. static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
  1526. struct btrfs_path *p,
  1527. int write_lock_level)
  1528. {
  1529. struct extent_buffer *b;
  1530. int root_lock = 0;
  1531. int level = 0;
  1532. if (p->search_commit_root) {
  1533. b = root->commit_root;
  1534. refcount_inc(&b->refs);
  1535. level = btrfs_header_level(b);
  1536. /*
  1537. * Ensure that all callers have set skip_locking when
  1538. * p->search_commit_root is true.
  1539. */
  1540. ASSERT(p->skip_locking);
  1541. goto out;
  1542. }
  1543. if (p->skip_locking) {
  1544. b = btrfs_root_node(root);
  1545. level = btrfs_header_level(b);
  1546. goto out;
  1547. }
  1548. /* We try very hard to do read locks on the root */
  1549. root_lock = BTRFS_READ_LOCK;
  1550. /*
  1551. * If the level is set to maximum, we can skip trying to get the read
  1552. * lock.
  1553. */
  1554. if (write_lock_level < BTRFS_MAX_LEVEL) {
  1555. /*
  1556. * We don't know the level of the root node until we actually
  1557. * have it read locked
  1558. */
  1559. if (p->nowait) {
  1560. b = btrfs_try_read_lock_root_node(root);
  1561. if (IS_ERR(b))
  1562. return b;
  1563. } else {
  1564. b = btrfs_read_lock_root_node(root);
  1565. }
  1566. level = btrfs_header_level(b);
  1567. if (level > write_lock_level)
  1568. goto out;
  1569. /* Whoops, must trade for write lock */
  1570. btrfs_tree_read_unlock(b);
  1571. free_extent_buffer(b);
  1572. }
  1573. b = btrfs_lock_root_node(root);
  1574. root_lock = BTRFS_WRITE_LOCK;
  1575. /* The level might have changed, check again */
  1576. level = btrfs_header_level(b);
  1577. out:
  1578. /*
  1579. * The root may have failed to write out at some point, and thus is no
  1580. * longer valid, return an error in this case.
  1581. */
  1582. if (unlikely(!extent_buffer_uptodate(b))) {
  1583. if (root_lock)
  1584. btrfs_tree_unlock_rw(b, root_lock);
  1585. free_extent_buffer(b);
  1586. return ERR_PTR(-EIO);
  1587. }
  1588. p->nodes[level] = b;
  1589. if (!p->skip_locking)
  1590. p->locks[level] = root_lock;
  1591. /*
  1592. * Callers are responsible for dropping b's references.
  1593. */
  1594. return b;
  1595. }
  1596. /*
  1597. * Replace the extent buffer at the lowest level of the path with a cloned
  1598. * version. The purpose is to be able to use it safely, after releasing the
  1599. * commit root semaphore, even if relocation is happening in parallel, the
  1600. * transaction used for relocation is committed and the extent buffer is
  1601. * reallocated in the next transaction.
  1602. *
  1603. * This is used in a context where the caller does not prevent transaction
  1604. * commits from happening, either by holding a transaction handle or holding
  1605. * some lock, while it's doing searches through a commit root.
  1606. * At the moment it's only used for send operations.
  1607. */
  1608. static int finish_need_commit_sem_search(struct btrfs_path *path)
  1609. {
  1610. const int i = path->lowest_level;
  1611. const int slot = path->slots[i];
  1612. struct extent_buffer *lowest = path->nodes[i];
  1613. struct extent_buffer *clone;
  1614. ASSERT(path->need_commit_sem);
  1615. if (!lowest)
  1616. return 0;
  1617. lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
  1618. clone = btrfs_clone_extent_buffer(lowest);
  1619. if (!clone)
  1620. return -ENOMEM;
  1621. btrfs_release_path(path);
  1622. path->nodes[i] = clone;
  1623. path->slots[i] = slot;
  1624. return 0;
  1625. }
  1626. static inline int search_for_key_slot(const struct extent_buffer *eb,
  1627. int search_low_slot,
  1628. const struct btrfs_key *key,
  1629. int prev_cmp,
  1630. int *slot)
  1631. {
  1632. /*
  1633. * If a previous call to btrfs_bin_search() on a parent node returned an
  1634. * exact match (prev_cmp == 0), we can safely assume the target key will
  1635. * always be at slot 0 on lower levels, since each key pointer
  1636. * (struct btrfs_key_ptr) refers to the lowest key accessible from the
  1637. * subtree it points to. Thus we can skip searching lower levels.
  1638. */
  1639. if (prev_cmp == 0) {
  1640. *slot = 0;
  1641. return 0;
  1642. }
  1643. return btrfs_bin_search(eb, search_low_slot, key, slot);
  1644. }
  1645. static int search_leaf(struct btrfs_trans_handle *trans,
  1646. struct btrfs_root *root,
  1647. const struct btrfs_key *key,
  1648. struct btrfs_path *path,
  1649. int ins_len,
  1650. int prev_cmp)
  1651. {
  1652. struct extent_buffer *leaf = path->nodes[0];
  1653. int leaf_free_space = -1;
  1654. int search_low_slot = 0;
  1655. int ret;
  1656. bool do_bin_search = true;
  1657. /*
  1658. * If we are doing an insertion, the leaf has enough free space and the
  1659. * destination slot for the key is not slot 0, then we can unlock our
  1660. * write lock on the parent, and any other upper nodes, before doing the
  1661. * binary search on the leaf (with search_for_key_slot()), allowing other
  1662. * tasks to lock the parent and any other upper nodes.
  1663. */
  1664. if (ins_len > 0) {
  1665. /*
  1666. * Cache the leaf free space, since we will need it later and it
  1667. * will not change until then.
  1668. */
  1669. leaf_free_space = btrfs_leaf_free_space(leaf);
  1670. /*
  1671. * !path->locks[1] means we have a single node tree, the leaf is
  1672. * the root of the tree.
  1673. */
  1674. if (path->locks[1] && leaf_free_space >= ins_len) {
  1675. struct btrfs_disk_key first_key;
  1676. ASSERT(btrfs_header_nritems(leaf) > 0);
  1677. btrfs_item_key(leaf, &first_key, 0);
  1678. /*
  1679. * Doing the extra comparison with the first key is cheap,
  1680. * taking into account that the first key is very likely
  1681. * already in a cache line because it immediately follows
  1682. * the extent buffer's header and we have recently accessed
  1683. * the header's level field.
  1684. */
  1685. ret = btrfs_comp_keys(&first_key, key);
  1686. if (ret < 0) {
  1687. /*
  1688. * The first key is smaller than the key we want
  1689. * to insert, so we are safe to unlock all upper
  1690. * nodes and we have to do the binary search.
  1691. *
  1692. * We do use btrfs_unlock_up_safe() and not
  1693. * unlock_up() because the later does not unlock
  1694. * nodes with a slot of 0 - we can safely unlock
  1695. * any node even if its slot is 0 since in this
  1696. * case the key does not end up at slot 0 of the
  1697. * leaf and there's no need to split the leaf.
  1698. */
  1699. btrfs_unlock_up_safe(path, 1);
  1700. search_low_slot = 1;
  1701. } else {
  1702. /*
  1703. * The first key is >= then the key we want to
  1704. * insert, so we can skip the binary search as
  1705. * the target key will be at slot 0.
  1706. *
  1707. * We can not unlock upper nodes when the key is
  1708. * less than the first key, because we will need
  1709. * to update the key at slot 0 of the parent node
  1710. * and possibly of other upper nodes too.
  1711. * If the key matches the first key, then we can
  1712. * unlock all the upper nodes, using
  1713. * btrfs_unlock_up_safe() instead of unlock_up()
  1714. * as stated above.
  1715. */
  1716. if (ret == 0)
  1717. btrfs_unlock_up_safe(path, 1);
  1718. /*
  1719. * ret is already 0 or 1, matching the result of
  1720. * a btrfs_bin_search() call, so there is no need
  1721. * to adjust it.
  1722. */
  1723. do_bin_search = false;
  1724. path->slots[0] = 0;
  1725. }
  1726. }
  1727. }
  1728. if (do_bin_search) {
  1729. ret = search_for_key_slot(leaf, search_low_slot, key,
  1730. prev_cmp, &path->slots[0]);
  1731. if (ret < 0)
  1732. return ret;
  1733. }
  1734. if (ins_len > 0) {
  1735. /*
  1736. * Item key already exists. In this case, if we are allowed to
  1737. * insert the item (for example, in dir_item case, item key
  1738. * collision is allowed), it will be merged with the original
  1739. * item. Only the item size grows, no new btrfs item will be
  1740. * added. If search_for_extension is not set, ins_len already
  1741. * accounts the size btrfs_item, deduct it here so leaf space
  1742. * check will be correct.
  1743. */
  1744. if (ret == 0 && !path->search_for_extension) {
  1745. ASSERT(ins_len >= sizeof(struct btrfs_item));
  1746. ins_len -= sizeof(struct btrfs_item);
  1747. }
  1748. ASSERT(leaf_free_space >= 0);
  1749. if (leaf_free_space < ins_len) {
  1750. int ret2;
  1751. ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0));
  1752. ASSERT(ret2 <= 0);
  1753. if (WARN_ON(ret2 > 0))
  1754. ret2 = -EUCLEAN;
  1755. if (ret2)
  1756. ret = ret2;
  1757. }
  1758. }
  1759. return ret;
  1760. }
  1761. /*
  1762. * Look for a key in a tree and perform necessary modifications to preserve
  1763. * tree invariants.
  1764. *
  1765. * @trans: Handle of transaction, used when modifying the tree
  1766. * @p: Holds all btree nodes along the search path
  1767. * @root: The root node of the tree
  1768. * @key: The key we are looking for
  1769. * @ins_len: Indicates purpose of search:
  1770. * >0 for inserts it's size of item inserted (*)
  1771. * <0 for deletions
  1772. * 0 for plain searches, not modifying the tree
  1773. *
  1774. * (*) If size of item inserted doesn't include
  1775. * sizeof(struct btrfs_item), then p->search_for_extension must
  1776. * be set.
  1777. * @cow: boolean should CoW operations be performed. Must always be 1
  1778. * when modifying the tree.
  1779. *
  1780. * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
  1781. * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
  1782. *
  1783. * If @key is found, 0 is returned and you can find the item in the leaf level
  1784. * of the path (level 0)
  1785. *
  1786. * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
  1787. * points to the slot where it should be inserted
  1788. *
  1789. * If an error is encountered while searching the tree a negative error number
  1790. * is returned
  1791. */
  1792. int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  1793. const struct btrfs_key *key, struct btrfs_path *p,
  1794. int ins_len, int cow)
  1795. {
  1796. struct btrfs_fs_info *fs_info;
  1797. struct extent_buffer *b;
  1798. int slot;
  1799. int ret;
  1800. int level;
  1801. int lowest_unlock = 1;
  1802. /* everything at write_lock_level or lower must be write locked */
  1803. int write_lock_level = 0;
  1804. u8 lowest_level = 0;
  1805. int min_write_lock_level;
  1806. int prev_cmp;
  1807. if (!root)
  1808. return -EINVAL;
  1809. fs_info = root->fs_info;
  1810. might_sleep();
  1811. lowest_level = p->lowest_level;
  1812. WARN_ON(lowest_level && ins_len > 0);
  1813. WARN_ON(p->nodes[0] != NULL);
  1814. BUG_ON(!cow && ins_len);
  1815. /*
  1816. * For now only allow nowait for read only operations. There's no
  1817. * strict reason why we can't, we just only need it for reads so it's
  1818. * only implemented for reads.
  1819. */
  1820. ASSERT(!p->nowait || !cow);
  1821. if (ins_len < 0) {
  1822. lowest_unlock = 2;
  1823. /* when we are removing items, we might have to go up to level
  1824. * two as we update tree pointers Make sure we keep write
  1825. * for those levels as well
  1826. */
  1827. write_lock_level = 2;
  1828. } else if (ins_len > 0) {
  1829. /*
  1830. * for inserting items, make sure we have a write lock on
  1831. * level 1 so we can update keys
  1832. */
  1833. write_lock_level = 1;
  1834. }
  1835. if (!cow)
  1836. write_lock_level = -1;
  1837. if (cow && (p->keep_locks || p->lowest_level))
  1838. write_lock_level = BTRFS_MAX_LEVEL;
  1839. min_write_lock_level = write_lock_level;
  1840. if (p->need_commit_sem) {
  1841. ASSERT(p->search_commit_root);
  1842. if (p->nowait) {
  1843. if (!down_read_trylock(&fs_info->commit_root_sem))
  1844. return -EAGAIN;
  1845. } else {
  1846. down_read(&fs_info->commit_root_sem);
  1847. }
  1848. }
  1849. again:
  1850. prev_cmp = -1;
  1851. b = btrfs_search_slot_get_root(root, p, write_lock_level);
  1852. if (IS_ERR(b)) {
  1853. ret = PTR_ERR(b);
  1854. goto done;
  1855. }
  1856. while (b) {
  1857. int dec = 0;
  1858. int ret2;
  1859. level = btrfs_header_level(b);
  1860. if (cow) {
  1861. bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
  1862. /*
  1863. * if we don't really need to cow this block
  1864. * then we don't want to set the path blocking,
  1865. * so we test it here
  1866. */
  1867. if (!should_cow_block(trans, root, b))
  1868. goto cow_done;
  1869. /*
  1870. * must have write locks on this node and the
  1871. * parent
  1872. */
  1873. if (level > write_lock_level ||
  1874. (level + 1 > write_lock_level &&
  1875. level + 1 < BTRFS_MAX_LEVEL &&
  1876. p->nodes[level + 1])) {
  1877. write_lock_level = level + 1;
  1878. btrfs_release_path(p);
  1879. goto again;
  1880. }
  1881. if (last_level)
  1882. ret2 = btrfs_cow_block(trans, root, b, NULL, 0,
  1883. &b, BTRFS_NESTING_COW);
  1884. else
  1885. ret2 = btrfs_cow_block(trans, root, b,
  1886. p->nodes[level + 1],
  1887. p->slots[level + 1], &b,
  1888. BTRFS_NESTING_COW);
  1889. if (ret2) {
  1890. ret = ret2;
  1891. goto done;
  1892. }
  1893. }
  1894. cow_done:
  1895. p->nodes[level] = b;
  1896. /*
  1897. * we have a lock on b and as long as we aren't changing
  1898. * the tree, there is no way to for the items in b to change.
  1899. * It is safe to drop the lock on our parent before we
  1900. * go through the expensive btree search on b.
  1901. *
  1902. * If we're inserting or deleting (ins_len != 0), then we might
  1903. * be changing slot zero, which may require changing the parent.
  1904. * So, we can't drop the lock until after we know which slot
  1905. * we're operating on.
  1906. */
  1907. if (!ins_len && !p->keep_locks) {
  1908. int u = level + 1;
  1909. if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
  1910. btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
  1911. p->locks[u] = 0;
  1912. }
  1913. }
  1914. if (level == 0) {
  1915. if (ins_len > 0)
  1916. ASSERT(write_lock_level >= 1);
  1917. ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
  1918. if (!p->search_for_split)
  1919. unlock_up(p, level, lowest_unlock,
  1920. min_write_lock_level, NULL);
  1921. goto done;
  1922. }
  1923. ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
  1924. if (ret < 0)
  1925. goto done;
  1926. prev_cmp = ret;
  1927. if (ret && slot > 0) {
  1928. dec = 1;
  1929. slot--;
  1930. }
  1931. p->slots[level] = slot;
  1932. ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len,
  1933. &write_lock_level);
  1934. if (ret2 == -EAGAIN)
  1935. goto again;
  1936. if (ret2) {
  1937. ret = ret2;
  1938. goto done;
  1939. }
  1940. b = p->nodes[level];
  1941. slot = p->slots[level];
  1942. /*
  1943. * Slot 0 is special, if we change the key we have to update
  1944. * the parent pointer which means we must have a write lock on
  1945. * the parent
  1946. */
  1947. if (slot == 0 && ins_len && write_lock_level < level + 1) {
  1948. write_lock_level = level + 1;
  1949. btrfs_release_path(p);
  1950. goto again;
  1951. }
  1952. unlock_up(p, level, lowest_unlock, min_write_lock_level,
  1953. &write_lock_level);
  1954. if (level == lowest_level) {
  1955. if (dec)
  1956. p->slots[level]++;
  1957. goto done;
  1958. }
  1959. ret2 = read_block_for_search(root, p, &b, slot, key);
  1960. if (ret2 == -EAGAIN && !p->nowait)
  1961. goto again;
  1962. if (ret2) {
  1963. ret = ret2;
  1964. goto done;
  1965. }
  1966. if (!p->skip_locking) {
  1967. level = btrfs_header_level(b);
  1968. btrfs_maybe_reset_lockdep_class(root, b);
  1969. if (level <= write_lock_level) {
  1970. btrfs_tree_lock(b);
  1971. p->locks[level] = BTRFS_WRITE_LOCK;
  1972. } else {
  1973. if (p->nowait) {
  1974. if (!btrfs_try_tree_read_lock(b)) {
  1975. free_extent_buffer(b);
  1976. ret = -EAGAIN;
  1977. goto done;
  1978. }
  1979. } else {
  1980. btrfs_tree_read_lock(b);
  1981. }
  1982. p->locks[level] = BTRFS_READ_LOCK;
  1983. }
  1984. p->nodes[level] = b;
  1985. }
  1986. }
  1987. ret = 1;
  1988. done:
  1989. if (ret < 0 && !p->skip_release_on_error)
  1990. btrfs_release_path(p);
  1991. if (p->need_commit_sem) {
  1992. int ret2;
  1993. ret2 = finish_need_commit_sem_search(p);
  1994. up_read(&fs_info->commit_root_sem);
  1995. if (ret2)
  1996. ret = ret2;
  1997. }
  1998. return ret;
  1999. }
  2000. ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
  2001. /*
  2002. * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
  2003. * current state of the tree together with the operations recorded in the tree
  2004. * modification log to search for the key in a previous version of this tree, as
  2005. * denoted by the time_seq parameter.
  2006. *
  2007. * Naturally, there is no support for insert, delete or cow operations.
  2008. *
  2009. * The resulting path and return value will be set up as if we called
  2010. * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
  2011. */
  2012. int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
  2013. struct btrfs_path *p, u64 time_seq)
  2014. {
  2015. struct btrfs_fs_info *fs_info = root->fs_info;
  2016. struct extent_buffer *b;
  2017. int slot;
  2018. int ret;
  2019. int level;
  2020. int lowest_unlock = 1;
  2021. u8 lowest_level = 0;
  2022. lowest_level = p->lowest_level;
  2023. WARN_ON(p->nodes[0] != NULL);
  2024. ASSERT(!p->nowait);
  2025. if (p->search_commit_root) {
  2026. BUG_ON(time_seq);
  2027. return btrfs_search_slot(NULL, root, key, p, 0, 0);
  2028. }
  2029. again:
  2030. b = btrfs_get_old_root(root, time_seq);
  2031. if (unlikely(!b)) {
  2032. ret = -EIO;
  2033. goto done;
  2034. }
  2035. level = btrfs_header_level(b);
  2036. p->locks[level] = BTRFS_READ_LOCK;
  2037. while (b) {
  2038. int dec = 0;
  2039. int ret2;
  2040. level = btrfs_header_level(b);
  2041. p->nodes[level] = b;
  2042. /*
  2043. * we have a lock on b and as long as we aren't changing
  2044. * the tree, there is no way to for the items in b to change.
  2045. * It is safe to drop the lock on our parent before we
  2046. * go through the expensive btree search on b.
  2047. */
  2048. btrfs_unlock_up_safe(p, level + 1);
  2049. ret = btrfs_bin_search(b, 0, key, &slot);
  2050. if (ret < 0)
  2051. goto done;
  2052. if (level == 0) {
  2053. p->slots[level] = slot;
  2054. unlock_up(p, level, lowest_unlock, 0, NULL);
  2055. goto done;
  2056. }
  2057. if (ret && slot > 0) {
  2058. dec = 1;
  2059. slot--;
  2060. }
  2061. p->slots[level] = slot;
  2062. unlock_up(p, level, lowest_unlock, 0, NULL);
  2063. if (level == lowest_level) {
  2064. if (dec)
  2065. p->slots[level]++;
  2066. goto done;
  2067. }
  2068. ret2 = read_block_for_search(root, p, &b, slot, key);
  2069. if (ret2 == -EAGAIN && !p->nowait)
  2070. goto again;
  2071. if (ret2) {
  2072. ret = ret2;
  2073. goto done;
  2074. }
  2075. level = btrfs_header_level(b);
  2076. btrfs_tree_read_lock(b);
  2077. b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq);
  2078. if (!b) {
  2079. ret = -ENOMEM;
  2080. goto done;
  2081. }
  2082. p->locks[level] = BTRFS_READ_LOCK;
  2083. p->nodes[level] = b;
  2084. }
  2085. ret = 1;
  2086. done:
  2087. if (ret < 0)
  2088. btrfs_release_path(p);
  2089. return ret;
  2090. }
  2091. /*
  2092. * Search the tree again to find a leaf with smaller keys.
  2093. * Returns 0 if it found something.
  2094. * Returns 1 if there are no smaller keys.
  2095. * Returns < 0 on error.
  2096. *
  2097. * This may release the path, and so you may lose any locks held at the
  2098. * time you call it.
  2099. */
  2100. static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  2101. {
  2102. struct btrfs_key key;
  2103. struct btrfs_key orig_key;
  2104. struct btrfs_disk_key found_key;
  2105. int ret;
  2106. btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
  2107. orig_key = key;
  2108. if (key.offset > 0) {
  2109. key.offset--;
  2110. } else if (key.type > 0) {
  2111. key.type--;
  2112. key.offset = (u64)-1;
  2113. } else if (key.objectid > 0) {
  2114. key.objectid--;
  2115. key.type = (u8)-1;
  2116. key.offset = (u64)-1;
  2117. } else {
  2118. return 1;
  2119. }
  2120. btrfs_release_path(path);
  2121. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  2122. if (ret <= 0)
  2123. return ret;
  2124. /*
  2125. * Previous key not found. Even if we were at slot 0 of the leaf we had
  2126. * before releasing the path and calling btrfs_search_slot(), we now may
  2127. * be in a slot pointing to the same original key - this can happen if
  2128. * after we released the path, one of more items were moved from a
  2129. * sibling leaf into the front of the leaf we had due to an insertion
  2130. * (see push_leaf_right()).
  2131. * If we hit this case and our slot is > 0 and just decrement the slot
  2132. * so that the caller does not process the same key again, which may or
  2133. * may not break the caller, depending on its logic.
  2134. */
  2135. if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
  2136. btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
  2137. ret = btrfs_comp_keys(&found_key, &orig_key);
  2138. if (ret == 0) {
  2139. if (path->slots[0] > 0) {
  2140. path->slots[0]--;
  2141. return 0;
  2142. }
  2143. /*
  2144. * At slot 0, same key as before, it means orig_key is
  2145. * the lowest, leftmost, key in the tree. We're done.
  2146. */
  2147. return 1;
  2148. }
  2149. }
  2150. btrfs_item_key(path->nodes[0], &found_key, 0);
  2151. ret = btrfs_comp_keys(&found_key, &key);
  2152. /*
  2153. * We might have had an item with the previous key in the tree right
  2154. * before we released our path. And after we released our path, that
  2155. * item might have been pushed to the first slot (0) of the leaf we
  2156. * were holding due to a tree balance. Alternatively, an item with the
  2157. * previous key can exist as the only element of a leaf (big fat item).
  2158. * Therefore account for these 2 cases, so that our callers (like
  2159. * btrfs_previous_item) don't miss an existing item with a key matching
  2160. * the previous key we computed above.
  2161. */
  2162. if (ret <= 0)
  2163. return 0;
  2164. return 1;
  2165. }
  2166. /*
  2167. * helper to use instead of search slot if no exact match is needed but
  2168. * instead the next or previous item should be returned.
  2169. * When find_higher is true, the next higher item is returned, the next lower
  2170. * otherwise.
  2171. * When return_any and find_higher are both true, and no higher item is found,
  2172. * return the next lower instead.
  2173. * When return_any is true and find_higher is false, and no lower item is found,
  2174. * return the next higher instead.
  2175. * It returns 0 if any item is found, 1 if none is found (tree empty), and
  2176. * < 0 on error
  2177. */
  2178. int btrfs_search_slot_for_read(struct btrfs_root *root,
  2179. const struct btrfs_key *key,
  2180. struct btrfs_path *p, int find_higher,
  2181. int return_any)
  2182. {
  2183. int ret;
  2184. struct extent_buffer *leaf;
  2185. again:
  2186. ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
  2187. if (ret <= 0)
  2188. return ret;
  2189. /*
  2190. * a return value of 1 means the path is at the position where the
  2191. * item should be inserted. Normally this is the next bigger item,
  2192. * but in case the previous item is the last in a leaf, path points
  2193. * to the first free slot in the previous leaf, i.e. at an invalid
  2194. * item.
  2195. */
  2196. leaf = p->nodes[0];
  2197. if (find_higher) {
  2198. if (p->slots[0] >= btrfs_header_nritems(leaf)) {
  2199. ret = btrfs_next_leaf(root, p);
  2200. if (ret <= 0)
  2201. return ret;
  2202. if (!return_any)
  2203. return 1;
  2204. /*
  2205. * no higher item found, return the next
  2206. * lower instead
  2207. */
  2208. return_any = 0;
  2209. find_higher = 0;
  2210. btrfs_release_path(p);
  2211. goto again;
  2212. }
  2213. } else {
  2214. if (p->slots[0] == 0) {
  2215. ret = btrfs_prev_leaf(root, p);
  2216. if (ret < 0)
  2217. return ret;
  2218. if (!ret) {
  2219. leaf = p->nodes[0];
  2220. if (p->slots[0] == btrfs_header_nritems(leaf))
  2221. p->slots[0]--;
  2222. return 0;
  2223. }
  2224. if (!return_any)
  2225. return 1;
  2226. /*
  2227. * no lower item found, return the next
  2228. * higher instead
  2229. */
  2230. return_any = 0;
  2231. find_higher = 1;
  2232. btrfs_release_path(p);
  2233. goto again;
  2234. } else {
  2235. --p->slots[0];
  2236. }
  2237. }
  2238. return 0;
  2239. }
  2240. /*
  2241. * Execute search and call btrfs_previous_item to traverse backwards if the item
  2242. * was not found.
  2243. *
  2244. * Return 0 if found, 1 if not found and < 0 if error.
  2245. */
  2246. int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
  2247. struct btrfs_path *path)
  2248. {
  2249. int ret;
  2250. ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
  2251. if (ret > 0)
  2252. ret = btrfs_previous_item(root, path, key->objectid, key->type);
  2253. if (ret == 0)
  2254. btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
  2255. return ret;
  2256. }
  2257. /*
  2258. * Search for a valid slot for the given path.
  2259. *
  2260. * @root: The root node of the tree.
  2261. * @key: Will contain a valid item if found.
  2262. * @path: The starting point to validate the slot.
  2263. *
  2264. * Return: 0 if the item is valid
  2265. * 1 if not found
  2266. * <0 if error.
  2267. */
  2268. int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
  2269. struct btrfs_path *path)
  2270. {
  2271. if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
  2272. int ret;
  2273. ret = btrfs_next_leaf(root, path);
  2274. if (ret)
  2275. return ret;
  2276. }
  2277. btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
  2278. return 0;
  2279. }
  2280. /*
  2281. * adjust the pointers going up the tree, starting at level
  2282. * making sure the right key of each node is points to 'key'.
  2283. * This is used after shifting pointers to the left, so it stops
  2284. * fixing up pointers when a given leaf/node is not in slot 0 of the
  2285. * higher levels
  2286. *
  2287. */
  2288. static void fixup_low_keys(struct btrfs_trans_handle *trans,
  2289. const struct btrfs_path *path,
  2290. const struct btrfs_disk_key *key, int level)
  2291. {
  2292. int i;
  2293. struct extent_buffer *t;
  2294. int ret;
  2295. for (i = level; i < BTRFS_MAX_LEVEL; i++) {
  2296. int tslot = path->slots[i];
  2297. if (!path->nodes[i])
  2298. break;
  2299. t = path->nodes[i];
  2300. ret = btrfs_tree_mod_log_insert_key(t, tslot,
  2301. BTRFS_MOD_LOG_KEY_REPLACE);
  2302. BUG_ON(ret < 0);
  2303. btrfs_set_node_key(t, key, tslot);
  2304. btrfs_mark_buffer_dirty(trans, path->nodes[i]);
  2305. if (tslot != 0)
  2306. break;
  2307. }
  2308. }
  2309. /*
  2310. * update item key.
  2311. *
  2312. * This function isn't completely safe. It's the caller's responsibility
  2313. * that the new key won't break the order
  2314. */
  2315. void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
  2316. const struct btrfs_path *path,
  2317. const struct btrfs_key *new_key)
  2318. {
  2319. struct btrfs_fs_info *fs_info = trans->fs_info;
  2320. struct btrfs_disk_key disk_key;
  2321. struct extent_buffer *eb;
  2322. int slot;
  2323. eb = path->nodes[0];
  2324. slot = path->slots[0];
  2325. if (slot > 0) {
  2326. btrfs_item_key(eb, &disk_key, slot - 1);
  2327. if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
  2328. btrfs_print_leaf(eb);
  2329. btrfs_crit(fs_info,
  2330. "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT,
  2331. slot, btrfs_disk_key_objectid(&disk_key),
  2332. btrfs_disk_key_type(&disk_key),
  2333. btrfs_disk_key_offset(&disk_key),
  2334. BTRFS_KEY_FMT_VALUE(new_key));
  2335. BUG();
  2336. }
  2337. }
  2338. if (slot < btrfs_header_nritems(eb) - 1) {
  2339. btrfs_item_key(eb, &disk_key, slot + 1);
  2340. if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
  2341. btrfs_print_leaf(eb);
  2342. btrfs_crit(fs_info,
  2343. "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT,
  2344. slot, btrfs_disk_key_objectid(&disk_key),
  2345. btrfs_disk_key_type(&disk_key),
  2346. btrfs_disk_key_offset(&disk_key),
  2347. BTRFS_KEY_FMT_VALUE(new_key));
  2348. BUG();
  2349. }
  2350. }
  2351. btrfs_cpu_key_to_disk(&disk_key, new_key);
  2352. btrfs_set_item_key(eb, &disk_key, slot);
  2353. btrfs_mark_buffer_dirty(trans, eb);
  2354. if (slot == 0)
  2355. fixup_low_keys(trans, path, &disk_key, 1);
  2356. }
  2357. /*
  2358. * Check key order of two sibling extent buffers.
  2359. *
  2360. * Return true if something is wrong.
  2361. * Return false if everything is fine.
  2362. *
  2363. * Tree-checker only works inside one tree block, thus the following
  2364. * corruption can not be detected by tree-checker:
  2365. *
  2366. * Leaf @left | Leaf @right
  2367. * --------------------------------------------------------------
  2368. * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
  2369. *
  2370. * Key f6 in leaf @left itself is valid, but not valid when the next
  2371. * key in leaf @right is 7.
  2372. * This can only be checked at tree block merge time.
  2373. * And since tree checker has ensured all key order in each tree block
  2374. * is correct, we only need to bother the last key of @left and the first
  2375. * key of @right.
  2376. */
  2377. static bool check_sibling_keys(const struct extent_buffer *left,
  2378. const struct extent_buffer *right)
  2379. {
  2380. struct btrfs_key left_last;
  2381. struct btrfs_key right_first;
  2382. int level = btrfs_header_level(left);
  2383. int nr_left = btrfs_header_nritems(left);
  2384. int nr_right = btrfs_header_nritems(right);
  2385. /* No key to check in one of the tree blocks */
  2386. if (!nr_left || !nr_right)
  2387. return false;
  2388. if (level) {
  2389. btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
  2390. btrfs_node_key_to_cpu(right, &right_first, 0);
  2391. } else {
  2392. btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
  2393. btrfs_item_key_to_cpu(right, &right_first, 0);
  2394. }
  2395. if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
  2396. btrfs_crit(left->fs_info, "left extent buffer:");
  2397. btrfs_print_tree(left, false);
  2398. btrfs_crit(left->fs_info, "right extent buffer:");
  2399. btrfs_print_tree(right, false);
  2400. btrfs_crit(left->fs_info,
  2401. "bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT,
  2402. BTRFS_KEY_FMT_VALUE(&left_last),
  2403. BTRFS_KEY_FMT_VALUE(&right_first));
  2404. return true;
  2405. }
  2406. return false;
  2407. }
  2408. /*
  2409. * try to push data from one node into the next node left in the
  2410. * tree.
  2411. *
  2412. * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
  2413. * error, and > 0 if there was no room in the left hand block.
  2414. */
  2415. static int push_node_left(struct btrfs_trans_handle *trans,
  2416. struct extent_buffer *dst,
  2417. struct extent_buffer *src, bool empty)
  2418. {
  2419. struct btrfs_fs_info *fs_info = trans->fs_info;
  2420. int push_items = 0;
  2421. int src_nritems;
  2422. int dst_nritems;
  2423. int ret = 0;
  2424. src_nritems = btrfs_header_nritems(src);
  2425. dst_nritems = btrfs_header_nritems(dst);
  2426. push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
  2427. WARN_ON(btrfs_header_generation(src) != trans->transid);
  2428. WARN_ON(btrfs_header_generation(dst) != trans->transid);
  2429. if (!empty && src_nritems <= 8)
  2430. return 1;
  2431. if (push_items <= 0)
  2432. return 1;
  2433. if (empty) {
  2434. push_items = min(src_nritems, push_items);
  2435. if (push_items < src_nritems) {
  2436. /* leave at least 8 pointers in the node if
  2437. * we aren't going to empty it
  2438. */
  2439. if (src_nritems - push_items < 8) {
  2440. if (push_items <= 8)
  2441. return 1;
  2442. push_items -= 8;
  2443. }
  2444. }
  2445. } else
  2446. push_items = min(src_nritems - 8, push_items);
  2447. /* dst is the left eb, src is the middle eb */
  2448. if (unlikely(check_sibling_keys(dst, src))) {
  2449. ret = -EUCLEAN;
  2450. btrfs_abort_transaction(trans, ret);
  2451. return ret;
  2452. }
  2453. ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
  2454. if (unlikely(ret)) {
  2455. btrfs_abort_transaction(trans, ret);
  2456. return ret;
  2457. }
  2458. copy_extent_buffer(dst, src,
  2459. btrfs_node_key_ptr_offset(dst, dst_nritems),
  2460. btrfs_node_key_ptr_offset(src, 0),
  2461. push_items * sizeof(struct btrfs_key_ptr));
  2462. if (push_items < src_nritems) {
  2463. /*
  2464. * btrfs_tree_mod_log_eb_copy handles logging the move, so we
  2465. * don't need to do an explicit tree mod log operation for it.
  2466. */
  2467. memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
  2468. btrfs_node_key_ptr_offset(src, push_items),
  2469. (src_nritems - push_items) *
  2470. sizeof(struct btrfs_key_ptr));
  2471. }
  2472. btrfs_set_header_nritems(src, src_nritems - push_items);
  2473. btrfs_set_header_nritems(dst, dst_nritems + push_items);
  2474. btrfs_mark_buffer_dirty(trans, src);
  2475. btrfs_mark_buffer_dirty(trans, dst);
  2476. return ret;
  2477. }
  2478. /*
  2479. * try to push data from one node into the next node right in the
  2480. * tree.
  2481. *
  2482. * returns 0 if some ptrs were pushed, < 0 if there was some horrible
  2483. * error, and > 0 if there was no room in the right hand block.
  2484. *
  2485. * this will only push up to 1/2 the contents of the left node over
  2486. */
  2487. static int balance_node_right(struct btrfs_trans_handle *trans,
  2488. struct extent_buffer *dst,
  2489. struct extent_buffer *src)
  2490. {
  2491. struct btrfs_fs_info *fs_info = trans->fs_info;
  2492. int push_items = 0;
  2493. int max_push;
  2494. int src_nritems;
  2495. int dst_nritems;
  2496. int ret = 0;
  2497. WARN_ON(btrfs_header_generation(src) != trans->transid);
  2498. WARN_ON(btrfs_header_generation(dst) != trans->transid);
  2499. src_nritems = btrfs_header_nritems(src);
  2500. dst_nritems = btrfs_header_nritems(dst);
  2501. push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
  2502. if (push_items <= 0)
  2503. return 1;
  2504. if (src_nritems < 4)
  2505. return 1;
  2506. max_push = src_nritems / 2 + 1;
  2507. /* don't try to empty the node */
  2508. if (max_push >= src_nritems)
  2509. return 1;
  2510. if (max_push < push_items)
  2511. push_items = max_push;
  2512. /* dst is the right eb, src is the middle eb */
  2513. if (unlikely(check_sibling_keys(src, dst))) {
  2514. ret = -EUCLEAN;
  2515. btrfs_abort_transaction(trans, ret);
  2516. return ret;
  2517. }
  2518. /*
  2519. * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
  2520. * need to do an explicit tree mod log operation for it.
  2521. */
  2522. memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
  2523. btrfs_node_key_ptr_offset(dst, 0),
  2524. (dst_nritems) *
  2525. sizeof(struct btrfs_key_ptr));
  2526. ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
  2527. push_items);
  2528. if (unlikely(ret)) {
  2529. btrfs_abort_transaction(trans, ret);
  2530. return ret;
  2531. }
  2532. copy_extent_buffer(dst, src,
  2533. btrfs_node_key_ptr_offset(dst, 0),
  2534. btrfs_node_key_ptr_offset(src, src_nritems - push_items),
  2535. push_items * sizeof(struct btrfs_key_ptr));
  2536. btrfs_set_header_nritems(src, src_nritems - push_items);
  2537. btrfs_set_header_nritems(dst, dst_nritems + push_items);
  2538. btrfs_mark_buffer_dirty(trans, src);
  2539. btrfs_mark_buffer_dirty(trans, dst);
  2540. return ret;
  2541. }
  2542. /*
  2543. * helper function to insert a new root level in the tree.
  2544. * A new node is allocated, and a single item is inserted to
  2545. * point to the existing root
  2546. *
  2547. * returns zero on success or < 0 on failure.
  2548. */
  2549. static noinline int insert_new_root(struct btrfs_trans_handle *trans,
  2550. struct btrfs_root *root,
  2551. struct btrfs_path *path, int level)
  2552. {
  2553. u64 lower_gen;
  2554. struct extent_buffer *lower;
  2555. struct extent_buffer *c;
  2556. struct extent_buffer *old;
  2557. struct btrfs_disk_key lower_key;
  2558. int ret;
  2559. BUG_ON(path->nodes[level]);
  2560. BUG_ON(path->nodes[level-1] != root->node);
  2561. lower = path->nodes[level-1];
  2562. if (level == 1)
  2563. btrfs_item_key(lower, &lower_key, 0);
  2564. else
  2565. btrfs_node_key(lower, &lower_key, 0);
  2566. c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
  2567. &lower_key, level, root->node->start, 0,
  2568. 0, BTRFS_NESTING_NEW_ROOT);
  2569. if (IS_ERR(c))
  2570. return PTR_ERR(c);
  2571. root_add_used_bytes(root);
  2572. btrfs_set_header_nritems(c, 1);
  2573. btrfs_set_node_key(c, &lower_key, 0);
  2574. btrfs_set_node_blockptr(c, 0, lower->start);
  2575. lower_gen = btrfs_header_generation(lower);
  2576. WARN_ON(lower_gen != trans->transid);
  2577. btrfs_set_node_ptr_generation(c, 0, lower_gen);
  2578. btrfs_mark_buffer_dirty(trans, c);
  2579. old = root->node;
  2580. ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
  2581. if (ret < 0) {
  2582. int ret2;
  2583. btrfs_clear_buffer_dirty(trans, c);
  2584. ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
  2585. if (unlikely(ret2 < 0))
  2586. btrfs_abort_transaction(trans, ret2);
  2587. btrfs_tree_unlock(c);
  2588. free_extent_buffer(c);
  2589. return ret;
  2590. }
  2591. rcu_assign_pointer(root->node, c);
  2592. /* the super has an extra ref to root->node */
  2593. free_extent_buffer(old);
  2594. add_root_to_dirty_list(root);
  2595. refcount_inc(&c->refs);
  2596. path->nodes[level] = c;
  2597. path->locks[level] = BTRFS_WRITE_LOCK;
  2598. path->slots[level] = 0;
  2599. return 0;
  2600. }
  2601. /*
  2602. * worker function to insert a single pointer in a node.
  2603. * the node should have enough room for the pointer already
  2604. *
  2605. * slot and level indicate where you want the key to go, and
  2606. * blocknr is the block the key points to.
  2607. */
  2608. static int insert_ptr(struct btrfs_trans_handle *trans,
  2609. const struct btrfs_path *path,
  2610. const struct btrfs_disk_key *key, u64 bytenr,
  2611. int slot, int level)
  2612. {
  2613. struct extent_buffer *lower;
  2614. int nritems;
  2615. int ret;
  2616. BUG_ON(!path->nodes[level]);
  2617. btrfs_assert_tree_write_locked(path->nodes[level]);
  2618. lower = path->nodes[level];
  2619. nritems = btrfs_header_nritems(lower);
  2620. BUG_ON(slot > nritems);
  2621. BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
  2622. if (slot != nritems) {
  2623. if (level) {
  2624. ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
  2625. slot, nritems - slot);
  2626. if (unlikely(ret < 0)) {
  2627. btrfs_abort_transaction(trans, ret);
  2628. return ret;
  2629. }
  2630. }
  2631. memmove_extent_buffer(lower,
  2632. btrfs_node_key_ptr_offset(lower, slot + 1),
  2633. btrfs_node_key_ptr_offset(lower, slot),
  2634. (nritems - slot) * sizeof(struct btrfs_key_ptr));
  2635. }
  2636. if (level) {
  2637. ret = btrfs_tree_mod_log_insert_key(lower, slot,
  2638. BTRFS_MOD_LOG_KEY_ADD);
  2639. if (unlikely(ret < 0)) {
  2640. btrfs_abort_transaction(trans, ret);
  2641. return ret;
  2642. }
  2643. }
  2644. btrfs_set_node_key(lower, key, slot);
  2645. btrfs_set_node_blockptr(lower, slot, bytenr);
  2646. WARN_ON(trans->transid == 0);
  2647. btrfs_set_node_ptr_generation(lower, slot, trans->transid);
  2648. btrfs_set_header_nritems(lower, nritems + 1);
  2649. btrfs_mark_buffer_dirty(trans, lower);
  2650. return 0;
  2651. }
  2652. /*
  2653. * split the node at the specified level in path in two.
  2654. * The path is corrected to point to the appropriate node after the split
  2655. *
  2656. * Before splitting this tries to make some room in the node by pushing
  2657. * left and right, if either one works, it returns right away.
  2658. *
  2659. * returns 0 on success and < 0 on failure
  2660. */
  2661. static noinline int split_node(struct btrfs_trans_handle *trans,
  2662. struct btrfs_root *root,
  2663. struct btrfs_path *path, int level)
  2664. {
  2665. struct btrfs_fs_info *fs_info = root->fs_info;
  2666. struct extent_buffer *c;
  2667. struct extent_buffer *split;
  2668. struct btrfs_disk_key disk_key;
  2669. int mid;
  2670. int ret;
  2671. u32 c_nritems;
  2672. c = path->nodes[level];
  2673. WARN_ON(btrfs_header_generation(c) != trans->transid);
  2674. if (c == root->node) {
  2675. /*
  2676. * trying to split the root, lets make a new one
  2677. *
  2678. * tree mod log: We don't log_removal old root in
  2679. * insert_new_root, because that root buffer will be kept as a
  2680. * normal node. We are going to log removal of half of the
  2681. * elements below with btrfs_tree_mod_log_eb_copy(). We're
  2682. * holding a tree lock on the buffer, which is why we cannot
  2683. * race with other tree_mod_log users.
  2684. */
  2685. ret = insert_new_root(trans, root, path, level + 1);
  2686. if (ret)
  2687. return ret;
  2688. } else {
  2689. ret = push_nodes_for_insert(trans, root, path, level);
  2690. c = path->nodes[level];
  2691. if (!ret && btrfs_header_nritems(c) <
  2692. BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
  2693. return 0;
  2694. if (ret < 0)
  2695. return ret;
  2696. }
  2697. c_nritems = btrfs_header_nritems(c);
  2698. mid = (c_nritems + 1) / 2;
  2699. btrfs_node_key(c, &disk_key, mid);
  2700. split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
  2701. &disk_key, level, c->start, 0,
  2702. 0, BTRFS_NESTING_SPLIT);
  2703. if (IS_ERR(split))
  2704. return PTR_ERR(split);
  2705. root_add_used_bytes(root);
  2706. ASSERT(btrfs_header_level(c) == level);
  2707. ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
  2708. if (unlikely(ret)) {
  2709. btrfs_tree_unlock(split);
  2710. free_extent_buffer(split);
  2711. btrfs_abort_transaction(trans, ret);
  2712. return ret;
  2713. }
  2714. copy_extent_buffer(split, c,
  2715. btrfs_node_key_ptr_offset(split, 0),
  2716. btrfs_node_key_ptr_offset(c, mid),
  2717. (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
  2718. btrfs_set_header_nritems(split, c_nritems - mid);
  2719. btrfs_set_header_nritems(c, mid);
  2720. btrfs_mark_buffer_dirty(trans, c);
  2721. btrfs_mark_buffer_dirty(trans, split);
  2722. ret = insert_ptr(trans, path, &disk_key, split->start,
  2723. path->slots[level + 1] + 1, level + 1);
  2724. if (ret < 0) {
  2725. btrfs_tree_unlock(split);
  2726. free_extent_buffer(split);
  2727. return ret;
  2728. }
  2729. if (path->slots[level] >= mid) {
  2730. path->slots[level] -= mid;
  2731. btrfs_tree_unlock(c);
  2732. free_extent_buffer(c);
  2733. path->nodes[level] = split;
  2734. path->slots[level + 1] += 1;
  2735. } else {
  2736. btrfs_tree_unlock(split);
  2737. free_extent_buffer(split);
  2738. }
  2739. return 0;
  2740. }
  2741. /*
  2742. * how many bytes are required to store the items in a leaf. start
  2743. * and nr indicate which items in the leaf to check. This totals up the
  2744. * space used both by the item structs and the item data
  2745. */
  2746. static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
  2747. {
  2748. int data_len;
  2749. int nritems = btrfs_header_nritems(l);
  2750. int end = min(nritems, start + nr) - 1;
  2751. if (!nr)
  2752. return 0;
  2753. data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
  2754. data_len = data_len - btrfs_item_offset(l, end);
  2755. data_len += sizeof(struct btrfs_item) * nr;
  2756. WARN_ON(data_len < 0);
  2757. return data_len;
  2758. }
  2759. /*
  2760. * The space between the end of the leaf items and
  2761. * the start of the leaf data. IOW, how much room
  2762. * the leaf has left for both items and data
  2763. */
  2764. int btrfs_leaf_free_space(const struct extent_buffer *leaf)
  2765. {
  2766. struct btrfs_fs_info *fs_info = leaf->fs_info;
  2767. int nritems = btrfs_header_nritems(leaf);
  2768. int ret;
  2769. ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
  2770. if (unlikely(ret < 0)) {
  2771. btrfs_crit(fs_info,
  2772. "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
  2773. ret,
  2774. (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
  2775. leaf_space_used(leaf, 0, nritems), nritems);
  2776. }
  2777. return ret;
  2778. }
  2779. /*
  2780. * min slot controls the lowest index we're willing to push to the
  2781. * right. We'll push up to and including min_slot, but no lower
  2782. */
  2783. static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
  2784. struct btrfs_path *path,
  2785. int data_size, bool empty,
  2786. struct extent_buffer *right,
  2787. int free_space, u32 left_nritems,
  2788. u32 min_slot)
  2789. {
  2790. struct btrfs_fs_info *fs_info = right->fs_info;
  2791. struct extent_buffer *left = path->nodes[0];
  2792. struct extent_buffer *upper = path->nodes[1];
  2793. struct btrfs_disk_key disk_key;
  2794. int slot;
  2795. u32 i;
  2796. int push_space = 0;
  2797. int push_items = 0;
  2798. u32 nr;
  2799. u32 right_nritems;
  2800. u32 data_end;
  2801. u32 this_item_size;
  2802. if (empty)
  2803. nr = 0;
  2804. else
  2805. nr = max_t(u32, 1, min_slot);
  2806. if (path->slots[0] >= left_nritems)
  2807. push_space += data_size;
  2808. slot = path->slots[1];
  2809. i = left_nritems - 1;
  2810. while (i >= nr) {
  2811. if (!empty && push_items > 0) {
  2812. if (path->slots[0] > i)
  2813. break;
  2814. if (path->slots[0] == i) {
  2815. int space = btrfs_leaf_free_space(left);
  2816. if (space + push_space * 2 > free_space)
  2817. break;
  2818. }
  2819. }
  2820. if (path->slots[0] == i)
  2821. push_space += data_size;
  2822. this_item_size = btrfs_item_size(left, i);
  2823. if (this_item_size + sizeof(struct btrfs_item) +
  2824. push_space > free_space)
  2825. break;
  2826. push_items++;
  2827. push_space += this_item_size + sizeof(struct btrfs_item);
  2828. if (i == 0)
  2829. break;
  2830. i--;
  2831. }
  2832. if (push_items == 0)
  2833. goto out_unlock;
  2834. WARN_ON(!empty && push_items == left_nritems);
  2835. /* push left to right */
  2836. right_nritems = btrfs_header_nritems(right);
  2837. push_space = btrfs_item_data_end(left, left_nritems - push_items);
  2838. push_space -= leaf_data_end(left);
  2839. /* make room in the right data area */
  2840. data_end = leaf_data_end(right);
  2841. memmove_leaf_data(right, data_end - push_space, data_end,
  2842. BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
  2843. /* copy from the left data area */
  2844. copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
  2845. leaf_data_end(left), push_space);
  2846. memmove_leaf_items(right, push_items, 0, right_nritems);
  2847. /* copy the items from left to right */
  2848. copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
  2849. /* update the item pointers */
  2850. right_nritems += push_items;
  2851. btrfs_set_header_nritems(right, right_nritems);
  2852. push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
  2853. for (i = 0; i < right_nritems; i++) {
  2854. push_space -= btrfs_item_size(right, i);
  2855. btrfs_set_item_offset(right, i, push_space);
  2856. }
  2857. left_nritems -= push_items;
  2858. btrfs_set_header_nritems(left, left_nritems);
  2859. if (left_nritems)
  2860. btrfs_mark_buffer_dirty(trans, left);
  2861. else
  2862. btrfs_clear_buffer_dirty(trans, left);
  2863. btrfs_mark_buffer_dirty(trans, right);
  2864. btrfs_item_key(right, &disk_key, 0);
  2865. btrfs_set_node_key(upper, &disk_key, slot + 1);
  2866. btrfs_mark_buffer_dirty(trans, upper);
  2867. /* then fixup the leaf pointer in the path */
  2868. if (path->slots[0] >= left_nritems) {
  2869. path->slots[0] -= left_nritems;
  2870. btrfs_tree_unlock(left);
  2871. free_extent_buffer(left);
  2872. path->nodes[0] = right;
  2873. path->slots[1] += 1;
  2874. } else {
  2875. btrfs_tree_unlock(right);
  2876. free_extent_buffer(right);
  2877. }
  2878. return 0;
  2879. out_unlock:
  2880. btrfs_tree_unlock(right);
  2881. free_extent_buffer(right);
  2882. return 1;
  2883. }
  2884. /*
  2885. * push some data in the path leaf to the right, trying to free up at
  2886. * least data_size bytes. returns zero if the push worked, nonzero otherwise
  2887. *
  2888. * returns 1 if the push failed because the other node didn't have enough
  2889. * room, 0 if everything worked out and < 0 if there were major errors.
  2890. *
  2891. * this will push starting from min_slot to the end of the leaf. It won't
  2892. * push any slot lower than min_slot
  2893. */
  2894. static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
  2895. *root, struct btrfs_path *path,
  2896. int min_data_size, int data_size,
  2897. bool empty, u32 min_slot)
  2898. {
  2899. struct extent_buffer *left = path->nodes[0];
  2900. struct extent_buffer *right;
  2901. struct extent_buffer *upper;
  2902. int slot;
  2903. int free_space;
  2904. u32 left_nritems;
  2905. int ret;
  2906. if (!path->nodes[1])
  2907. return 1;
  2908. slot = path->slots[1];
  2909. upper = path->nodes[1];
  2910. if (slot >= btrfs_header_nritems(upper) - 1)
  2911. return 1;
  2912. btrfs_assert_tree_write_locked(path->nodes[1]);
  2913. right = btrfs_read_node_slot(upper, slot + 1);
  2914. if (IS_ERR(right))
  2915. return PTR_ERR(right);
  2916. btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
  2917. free_space = btrfs_leaf_free_space(right);
  2918. if (free_space < data_size)
  2919. goto out_unlock;
  2920. ret = btrfs_cow_block(trans, root, right, upper,
  2921. slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
  2922. if (ret)
  2923. goto out_unlock;
  2924. left_nritems = btrfs_header_nritems(left);
  2925. if (left_nritems == 0)
  2926. goto out_unlock;
  2927. if (unlikely(check_sibling_keys(left, right))) {
  2928. ret = -EUCLEAN;
  2929. btrfs_abort_transaction(trans, ret);
  2930. btrfs_tree_unlock(right);
  2931. free_extent_buffer(right);
  2932. return ret;
  2933. }
  2934. if (path->slots[0] == left_nritems && !empty) {
  2935. /* Key greater than all keys in the leaf, right neighbor has
  2936. * enough room for it and we're not emptying our leaf to delete
  2937. * it, therefore use right neighbor to insert the new item and
  2938. * no need to touch/dirty our left leaf. */
  2939. btrfs_tree_unlock(left);
  2940. free_extent_buffer(left);
  2941. path->nodes[0] = right;
  2942. path->slots[0] = 0;
  2943. path->slots[1]++;
  2944. return 0;
  2945. }
  2946. return __push_leaf_right(trans, path, min_data_size, empty, right,
  2947. free_space, left_nritems, min_slot);
  2948. out_unlock:
  2949. btrfs_tree_unlock(right);
  2950. free_extent_buffer(right);
  2951. return 1;
  2952. }
  2953. /*
  2954. * push some data in the path leaf to the left, trying to free up at
  2955. * least data_size bytes. returns zero if the push worked, nonzero otherwise
  2956. *
  2957. * max_slot can put a limit on how far into the leaf we'll push items. The
  2958. * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
  2959. * items
  2960. */
  2961. static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
  2962. struct btrfs_path *path, int data_size,
  2963. bool empty, struct extent_buffer *left,
  2964. int free_space, u32 right_nritems,
  2965. u32 max_slot)
  2966. {
  2967. struct btrfs_fs_info *fs_info = left->fs_info;
  2968. struct btrfs_disk_key disk_key;
  2969. struct extent_buffer *right = path->nodes[0];
  2970. int i;
  2971. int push_space = 0;
  2972. int push_items = 0;
  2973. u32 old_left_nritems;
  2974. u32 nr;
  2975. int ret = 0;
  2976. u32 this_item_size;
  2977. u32 old_left_item_size;
  2978. if (empty)
  2979. nr = min(right_nritems, max_slot);
  2980. else
  2981. nr = min(right_nritems - 1, max_slot);
  2982. for (i = 0; i < nr; i++) {
  2983. if (!empty && push_items > 0) {
  2984. if (path->slots[0] < i)
  2985. break;
  2986. if (path->slots[0] == i) {
  2987. int space = btrfs_leaf_free_space(right);
  2988. if (space + push_space * 2 > free_space)
  2989. break;
  2990. }
  2991. }
  2992. if (path->slots[0] == i)
  2993. push_space += data_size;
  2994. this_item_size = btrfs_item_size(right, i);
  2995. if (this_item_size + sizeof(struct btrfs_item) + push_space >
  2996. free_space)
  2997. break;
  2998. push_items++;
  2999. push_space += this_item_size + sizeof(struct btrfs_item);
  3000. }
  3001. if (push_items == 0) {
  3002. ret = 1;
  3003. goto out;
  3004. }
  3005. WARN_ON(!empty && push_items == btrfs_header_nritems(right));
  3006. /* push data from right to left */
  3007. copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
  3008. push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
  3009. btrfs_item_offset(right, push_items - 1);
  3010. copy_leaf_data(left, right, leaf_data_end(left) - push_space,
  3011. btrfs_item_offset(right, push_items - 1), push_space);
  3012. old_left_nritems = btrfs_header_nritems(left);
  3013. BUG_ON(old_left_nritems <= 0);
  3014. old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
  3015. for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
  3016. u32 ioff;
  3017. ioff = btrfs_item_offset(left, i);
  3018. btrfs_set_item_offset(left, i,
  3019. ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
  3020. }
  3021. btrfs_set_header_nritems(left, old_left_nritems + push_items);
  3022. /* fixup right node */
  3023. if (unlikely(push_items > right_nritems)) {
  3024. ret = -EUCLEAN;
  3025. btrfs_abort_transaction(trans, ret);
  3026. btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)",
  3027. push_items, right_nritems);
  3028. goto out;
  3029. }
  3030. if (push_items < right_nritems) {
  3031. push_space = btrfs_item_offset(right, push_items - 1) -
  3032. leaf_data_end(right);
  3033. memmove_leaf_data(right,
  3034. BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
  3035. leaf_data_end(right), push_space);
  3036. memmove_leaf_items(right, 0, push_items,
  3037. btrfs_header_nritems(right) - push_items);
  3038. }
  3039. right_nritems -= push_items;
  3040. btrfs_set_header_nritems(right, right_nritems);
  3041. push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
  3042. for (i = 0; i < right_nritems; i++) {
  3043. push_space = push_space - btrfs_item_size(right, i);
  3044. btrfs_set_item_offset(right, i, push_space);
  3045. }
  3046. btrfs_mark_buffer_dirty(trans, left);
  3047. if (right_nritems)
  3048. btrfs_mark_buffer_dirty(trans, right);
  3049. else
  3050. btrfs_clear_buffer_dirty(trans, right);
  3051. btrfs_item_key(right, &disk_key, 0);
  3052. fixup_low_keys(trans, path, &disk_key, 1);
  3053. /* then fixup the leaf pointer in the path */
  3054. if (path->slots[0] < push_items) {
  3055. path->slots[0] += old_left_nritems;
  3056. btrfs_tree_unlock(right);
  3057. free_extent_buffer(right);
  3058. path->nodes[0] = left;
  3059. path->slots[1] -= 1;
  3060. } else {
  3061. btrfs_tree_unlock(left);
  3062. free_extent_buffer(left);
  3063. path->slots[0] -= push_items;
  3064. }
  3065. BUG_ON(path->slots[0] < 0);
  3066. return ret;
  3067. out:
  3068. btrfs_tree_unlock(left);
  3069. free_extent_buffer(left);
  3070. return ret;
  3071. }
  3072. /*
  3073. * push some data in the path leaf to the left, trying to free up at
  3074. * least data_size bytes. returns zero if the push worked, nonzero otherwise
  3075. *
  3076. * max_slot can put a limit on how far into the leaf we'll push items. The
  3077. * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
  3078. * items
  3079. */
  3080. static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
  3081. *root, struct btrfs_path *path, int min_data_size,
  3082. int data_size, int empty, u32 max_slot)
  3083. {
  3084. struct extent_buffer *right = path->nodes[0];
  3085. struct extent_buffer *left;
  3086. int slot;
  3087. int free_space;
  3088. u32 right_nritems;
  3089. int ret = 0;
  3090. slot = path->slots[1];
  3091. if (slot == 0)
  3092. return 1;
  3093. if (!path->nodes[1])
  3094. return 1;
  3095. right_nritems = btrfs_header_nritems(right);
  3096. if (right_nritems == 0)
  3097. return 1;
  3098. btrfs_assert_tree_write_locked(path->nodes[1]);
  3099. left = btrfs_read_node_slot(path->nodes[1], slot - 1);
  3100. if (IS_ERR(left))
  3101. return PTR_ERR(left);
  3102. btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
  3103. free_space = btrfs_leaf_free_space(left);
  3104. if (free_space < data_size) {
  3105. ret = 1;
  3106. goto out;
  3107. }
  3108. ret = btrfs_cow_block(trans, root, left,
  3109. path->nodes[1], slot - 1, &left,
  3110. BTRFS_NESTING_LEFT_COW);
  3111. if (ret) {
  3112. /* we hit -ENOSPC, but it isn't fatal here */
  3113. if (ret == -ENOSPC)
  3114. ret = 1;
  3115. goto out;
  3116. }
  3117. if (unlikely(check_sibling_keys(left, right))) {
  3118. ret = -EUCLEAN;
  3119. btrfs_abort_transaction(trans, ret);
  3120. goto out;
  3121. }
  3122. return __push_leaf_left(trans, path, min_data_size, empty, left,
  3123. free_space, right_nritems, max_slot);
  3124. out:
  3125. btrfs_tree_unlock(left);
  3126. free_extent_buffer(left);
  3127. return ret;
  3128. }
  3129. /*
  3130. * split the path's leaf in two, making sure there is at least data_size
  3131. * available for the resulting leaf level of the path.
  3132. */
  3133. static noinline int copy_for_split(struct btrfs_trans_handle *trans,
  3134. struct btrfs_path *path,
  3135. struct extent_buffer *l,
  3136. struct extent_buffer *right,
  3137. int slot, int mid, int nritems)
  3138. {
  3139. struct btrfs_fs_info *fs_info = trans->fs_info;
  3140. int data_copy_size;
  3141. int rt_data_off;
  3142. int i;
  3143. int ret;
  3144. struct btrfs_disk_key disk_key;
  3145. nritems = nritems - mid;
  3146. btrfs_set_header_nritems(right, nritems);
  3147. data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
  3148. copy_leaf_items(right, l, 0, mid, nritems);
  3149. copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
  3150. leaf_data_end(l), data_copy_size);
  3151. rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
  3152. for (i = 0; i < nritems; i++) {
  3153. u32 ioff;
  3154. ioff = btrfs_item_offset(right, i);
  3155. btrfs_set_item_offset(right, i, ioff + rt_data_off);
  3156. }
  3157. btrfs_set_header_nritems(l, mid);
  3158. btrfs_item_key(right, &disk_key, 0);
  3159. ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
  3160. if (ret < 0)
  3161. return ret;
  3162. btrfs_mark_buffer_dirty(trans, right);
  3163. btrfs_mark_buffer_dirty(trans, l);
  3164. BUG_ON(path->slots[0] != slot);
  3165. if (mid <= slot) {
  3166. btrfs_tree_unlock(path->nodes[0]);
  3167. free_extent_buffer(path->nodes[0]);
  3168. path->nodes[0] = right;
  3169. path->slots[0] -= mid;
  3170. path->slots[1] += 1;
  3171. } else {
  3172. btrfs_tree_unlock(right);
  3173. free_extent_buffer(right);
  3174. }
  3175. BUG_ON(path->slots[0] < 0);
  3176. return 0;
  3177. }
  3178. /*
  3179. * double splits happen when we need to insert a big item in the middle
  3180. * of a leaf. A double split can leave us with 3 mostly empty leaves:
  3181. * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
  3182. * A B C
  3183. *
  3184. * We avoid this by trying to push the items on either side of our target
  3185. * into the adjacent leaves. If all goes well we can avoid the double split
  3186. * completely.
  3187. */
  3188. static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
  3189. struct btrfs_root *root,
  3190. struct btrfs_path *path,
  3191. int data_size)
  3192. {
  3193. int ret;
  3194. int progress = 0;
  3195. int slot;
  3196. u32 nritems;
  3197. int space_needed = data_size;
  3198. slot = path->slots[0];
  3199. if (slot < btrfs_header_nritems(path->nodes[0]))
  3200. space_needed -= btrfs_leaf_free_space(path->nodes[0]);
  3201. /*
  3202. * try to push all the items after our slot into the
  3203. * right leaf
  3204. */
  3205. ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
  3206. if (ret < 0)
  3207. return ret;
  3208. if (ret == 0)
  3209. progress++;
  3210. nritems = btrfs_header_nritems(path->nodes[0]);
  3211. /*
  3212. * our goal is to get our slot at the start or end of a leaf. If
  3213. * we've done so we're done
  3214. */
  3215. if (path->slots[0] == 0 || path->slots[0] == nritems)
  3216. return 0;
  3217. if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
  3218. return 0;
  3219. /* try to push all the items before our slot into the next leaf */
  3220. slot = path->slots[0];
  3221. space_needed = data_size;
  3222. if (slot > 0)
  3223. space_needed -= btrfs_leaf_free_space(path->nodes[0]);
  3224. ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
  3225. if (ret < 0)
  3226. return ret;
  3227. if (ret == 0)
  3228. progress++;
  3229. if (progress)
  3230. return 0;
  3231. return 1;
  3232. }
  3233. /*
  3234. * split the path's leaf in two, making sure there is at least data_size
  3235. * available for the resulting leaf level of the path.
  3236. *
  3237. * returns 0 if all went well and < 0 on failure.
  3238. */
  3239. static noinline int split_leaf(struct btrfs_trans_handle *trans,
  3240. struct btrfs_root *root,
  3241. const struct btrfs_key *ins_key,
  3242. struct btrfs_path *path, int data_size,
  3243. bool extend)
  3244. {
  3245. struct btrfs_disk_key disk_key;
  3246. struct extent_buffer *l;
  3247. u32 nritems;
  3248. int mid;
  3249. int slot;
  3250. struct extent_buffer *right;
  3251. struct btrfs_fs_info *fs_info = root->fs_info;
  3252. int ret = 0;
  3253. int wret;
  3254. int split;
  3255. int num_doubles = 0;
  3256. int tried_avoid_double = 0;
  3257. l = path->nodes[0];
  3258. slot = path->slots[0];
  3259. if (extend && data_size + btrfs_item_size(l, slot) +
  3260. sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
  3261. return -EOVERFLOW;
  3262. /* first try to make some room by pushing left and right */
  3263. if (data_size && path->nodes[1]) {
  3264. int space_needed = data_size;
  3265. if (slot < btrfs_header_nritems(l))
  3266. space_needed -= btrfs_leaf_free_space(l);
  3267. wret = push_leaf_right(trans, root, path, space_needed,
  3268. space_needed, 0, 0);
  3269. if (wret < 0)
  3270. return wret;
  3271. if (wret) {
  3272. space_needed = data_size;
  3273. if (slot > 0)
  3274. space_needed -= btrfs_leaf_free_space(l);
  3275. wret = push_leaf_left(trans, root, path, space_needed,
  3276. space_needed, 0, (u32)-1);
  3277. if (wret < 0)
  3278. return wret;
  3279. }
  3280. l = path->nodes[0];
  3281. /* did the pushes work? */
  3282. if (btrfs_leaf_free_space(l) >= data_size)
  3283. return 0;
  3284. }
  3285. if (!path->nodes[1]) {
  3286. ret = insert_new_root(trans, root, path, 1);
  3287. if (ret)
  3288. return ret;
  3289. }
  3290. again:
  3291. split = 1;
  3292. l = path->nodes[0];
  3293. slot = path->slots[0];
  3294. nritems = btrfs_header_nritems(l);
  3295. mid = (nritems + 1) / 2;
  3296. if (mid <= slot) {
  3297. if (nritems == 1 ||
  3298. leaf_space_used(l, mid, nritems - mid) + data_size >
  3299. BTRFS_LEAF_DATA_SIZE(fs_info)) {
  3300. if (slot >= nritems) {
  3301. split = 0;
  3302. } else {
  3303. mid = slot;
  3304. if (mid != nritems &&
  3305. leaf_space_used(l, mid, nritems - mid) +
  3306. data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
  3307. if (data_size && !tried_avoid_double)
  3308. goto push_for_double;
  3309. split = 2;
  3310. }
  3311. }
  3312. }
  3313. } else {
  3314. if (leaf_space_used(l, 0, mid) + data_size >
  3315. BTRFS_LEAF_DATA_SIZE(fs_info)) {
  3316. if (!extend && data_size && slot == 0) {
  3317. split = 0;
  3318. } else if ((extend || !data_size) && slot == 0) {
  3319. mid = 1;
  3320. } else {
  3321. mid = slot;
  3322. if (mid != nritems &&
  3323. leaf_space_used(l, mid, nritems - mid) +
  3324. data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
  3325. if (data_size && !tried_avoid_double)
  3326. goto push_for_double;
  3327. split = 2;
  3328. }
  3329. }
  3330. }
  3331. }
  3332. if (split == 0)
  3333. btrfs_cpu_key_to_disk(&disk_key, ins_key);
  3334. else
  3335. btrfs_item_key(l, &disk_key, mid);
  3336. /*
  3337. * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
  3338. * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
  3339. * subclasses, which is 8 at the time of this patch, and we've maxed it
  3340. * out. In the future we could add a
  3341. * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
  3342. * use BTRFS_NESTING_NEW_ROOT.
  3343. */
  3344. right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
  3345. &disk_key, 0, l->start, 0, 0,
  3346. num_doubles ? BTRFS_NESTING_NEW_ROOT :
  3347. BTRFS_NESTING_SPLIT);
  3348. if (IS_ERR(right))
  3349. return PTR_ERR(right);
  3350. root_add_used_bytes(root);
  3351. if (split == 0) {
  3352. if (mid <= slot) {
  3353. btrfs_set_header_nritems(right, 0);
  3354. ret = insert_ptr(trans, path, &disk_key,
  3355. right->start, path->slots[1] + 1, 1);
  3356. if (ret < 0) {
  3357. btrfs_tree_unlock(right);
  3358. free_extent_buffer(right);
  3359. return ret;
  3360. }
  3361. btrfs_tree_unlock(path->nodes[0]);
  3362. free_extent_buffer(path->nodes[0]);
  3363. path->nodes[0] = right;
  3364. path->slots[0] = 0;
  3365. path->slots[1] += 1;
  3366. } else {
  3367. btrfs_set_header_nritems(right, 0);
  3368. ret = insert_ptr(trans, path, &disk_key,
  3369. right->start, path->slots[1], 1);
  3370. if (ret < 0) {
  3371. btrfs_tree_unlock(right);
  3372. free_extent_buffer(right);
  3373. return ret;
  3374. }
  3375. btrfs_tree_unlock(path->nodes[0]);
  3376. free_extent_buffer(path->nodes[0]);
  3377. path->nodes[0] = right;
  3378. path->slots[0] = 0;
  3379. if (path->slots[1] == 0)
  3380. fixup_low_keys(trans, path, &disk_key, 1);
  3381. }
  3382. /*
  3383. * We create a new leaf 'right' for the required ins_len and
  3384. * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
  3385. * the content of ins_len to 'right'.
  3386. */
  3387. return ret;
  3388. }
  3389. ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
  3390. if (ret < 0) {
  3391. btrfs_tree_unlock(right);
  3392. free_extent_buffer(right);
  3393. return ret;
  3394. }
  3395. if (split == 2) {
  3396. BUG_ON(num_doubles != 0);
  3397. num_doubles++;
  3398. goto again;
  3399. }
  3400. return 0;
  3401. push_for_double:
  3402. push_for_double_split(trans, root, path, data_size);
  3403. tried_avoid_double = 1;
  3404. if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
  3405. return 0;
  3406. goto again;
  3407. }
  3408. static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
  3409. struct btrfs_root *root,
  3410. struct btrfs_path *path, int ins_len)
  3411. {
  3412. struct btrfs_key key;
  3413. struct extent_buffer *leaf;
  3414. struct btrfs_file_extent_item *fi;
  3415. u64 extent_len = 0;
  3416. u32 item_size;
  3417. int ret;
  3418. leaf = path->nodes[0];
  3419. btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
  3420. BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
  3421. key.type != BTRFS_RAID_STRIPE_KEY &&
  3422. key.type != BTRFS_EXTENT_CSUM_KEY);
  3423. if (btrfs_leaf_free_space(leaf) >= ins_len)
  3424. return 0;
  3425. item_size = btrfs_item_size(leaf, path->slots[0]);
  3426. if (key.type == BTRFS_EXTENT_DATA_KEY) {
  3427. fi = btrfs_item_ptr(leaf, path->slots[0],
  3428. struct btrfs_file_extent_item);
  3429. extent_len = btrfs_file_extent_num_bytes(leaf, fi);
  3430. }
  3431. btrfs_release_path(path);
  3432. path->keep_locks = true;
  3433. path->search_for_split = true;
  3434. ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
  3435. path->search_for_split = false;
  3436. if (ret > 0)
  3437. ret = -EAGAIN;
  3438. if (ret < 0)
  3439. goto err;
  3440. ret = -EAGAIN;
  3441. leaf = path->nodes[0];
  3442. /* if our item isn't there, return now */
  3443. if (item_size != btrfs_item_size(leaf, path->slots[0]))
  3444. goto err;
  3445. /* the leaf has changed, it now has room. return now */
  3446. if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
  3447. goto err;
  3448. if (key.type == BTRFS_EXTENT_DATA_KEY) {
  3449. fi = btrfs_item_ptr(leaf, path->slots[0],
  3450. struct btrfs_file_extent_item);
  3451. if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
  3452. goto err;
  3453. }
  3454. ret = split_leaf(trans, root, &key, path, ins_len, 1);
  3455. if (ret)
  3456. goto err;
  3457. path->keep_locks = false;
  3458. btrfs_unlock_up_safe(path, 1);
  3459. return 0;
  3460. err:
  3461. path->keep_locks = false;
  3462. return ret;
  3463. }
  3464. static noinline int split_item(struct btrfs_trans_handle *trans,
  3465. struct btrfs_path *path,
  3466. const struct btrfs_key *new_key,
  3467. unsigned long split_offset)
  3468. {
  3469. struct extent_buffer *leaf;
  3470. int orig_slot, slot;
  3471. char *buf;
  3472. u32 nritems;
  3473. u32 item_size;
  3474. u32 orig_offset;
  3475. struct btrfs_disk_key disk_key;
  3476. leaf = path->nodes[0];
  3477. /*
  3478. * Shouldn't happen because the caller must have previously called
  3479. * setup_leaf_for_split() to make room for the new item in the leaf.
  3480. */
  3481. if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
  3482. return -ENOSPC;
  3483. orig_slot = path->slots[0];
  3484. orig_offset = btrfs_item_offset(leaf, path->slots[0]);
  3485. item_size = btrfs_item_size(leaf, path->slots[0]);
  3486. buf = kmalloc(item_size, GFP_NOFS);
  3487. if (!buf)
  3488. return -ENOMEM;
  3489. read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
  3490. path->slots[0]), item_size);
  3491. slot = path->slots[0] + 1;
  3492. nritems = btrfs_header_nritems(leaf);
  3493. if (slot != nritems) {
  3494. /* shift the items */
  3495. memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
  3496. }
  3497. btrfs_cpu_key_to_disk(&disk_key, new_key);
  3498. btrfs_set_item_key(leaf, &disk_key, slot);
  3499. btrfs_set_item_offset(leaf, slot, orig_offset);
  3500. btrfs_set_item_size(leaf, slot, item_size - split_offset);
  3501. btrfs_set_item_offset(leaf, orig_slot,
  3502. orig_offset + item_size - split_offset);
  3503. btrfs_set_item_size(leaf, orig_slot, split_offset);
  3504. btrfs_set_header_nritems(leaf, nritems + 1);
  3505. /* write the data for the start of the original item */
  3506. write_extent_buffer(leaf, buf,
  3507. btrfs_item_ptr_offset(leaf, path->slots[0]),
  3508. split_offset);
  3509. /* write the data for the new item */
  3510. write_extent_buffer(leaf, buf + split_offset,
  3511. btrfs_item_ptr_offset(leaf, slot),
  3512. item_size - split_offset);
  3513. btrfs_mark_buffer_dirty(trans, leaf);
  3514. BUG_ON(btrfs_leaf_free_space(leaf) < 0);
  3515. kfree(buf);
  3516. return 0;
  3517. }
  3518. /*
  3519. * This function splits a single item into two items,
  3520. * giving 'new_key' to the new item and splitting the
  3521. * old one at split_offset (from the start of the item).
  3522. *
  3523. * The path may be released by this operation. After
  3524. * the split, the path is pointing to the old item. The
  3525. * new item is going to be in the same node as the old one.
  3526. *
  3527. * Note, the item being split must be smaller enough to live alone on
  3528. * a tree block with room for one extra struct btrfs_item
  3529. *
  3530. * This allows us to split the item in place, keeping a lock on the
  3531. * leaf the entire time.
  3532. */
  3533. int btrfs_split_item(struct btrfs_trans_handle *trans,
  3534. struct btrfs_root *root,
  3535. struct btrfs_path *path,
  3536. const struct btrfs_key *new_key,
  3537. unsigned long split_offset)
  3538. {
  3539. int ret;
  3540. ret = setup_leaf_for_split(trans, root, path,
  3541. sizeof(struct btrfs_item));
  3542. if (ret)
  3543. return ret;
  3544. return split_item(trans, path, new_key, split_offset);
  3545. }
  3546. /*
  3547. * make the item pointed to by the path smaller. new_size indicates
  3548. * how small to make it, and from_end tells us if we just chop bytes
  3549. * off the end of the item or if we shift the item to chop bytes off
  3550. * the front.
  3551. */
  3552. void btrfs_truncate_item(struct btrfs_trans_handle *trans,
  3553. const struct btrfs_path *path, u32 new_size, int from_end)
  3554. {
  3555. int slot;
  3556. struct extent_buffer *leaf;
  3557. u32 nritems;
  3558. unsigned int data_end;
  3559. unsigned int old_data_start;
  3560. unsigned int old_size;
  3561. unsigned int size_diff;
  3562. int i;
  3563. leaf = path->nodes[0];
  3564. slot = path->slots[0];
  3565. old_size = btrfs_item_size(leaf, slot);
  3566. if (old_size == new_size)
  3567. return;
  3568. nritems = btrfs_header_nritems(leaf);
  3569. data_end = leaf_data_end(leaf);
  3570. old_data_start = btrfs_item_offset(leaf, slot);
  3571. size_diff = old_size - new_size;
  3572. BUG_ON(slot < 0);
  3573. BUG_ON(slot >= nritems);
  3574. /*
  3575. * item0..itemN ... dataN.offset..dataN.size .. data0.size
  3576. */
  3577. /* first correct the data pointers */
  3578. for (i = slot; i < nritems; i++) {
  3579. u32 ioff;
  3580. ioff = btrfs_item_offset(leaf, i);
  3581. btrfs_set_item_offset(leaf, i, ioff + size_diff);
  3582. }
  3583. /* shift the data */
  3584. if (from_end) {
  3585. memmove_leaf_data(leaf, data_end + size_diff, data_end,
  3586. old_data_start + new_size - data_end);
  3587. } else {
  3588. struct btrfs_disk_key disk_key;
  3589. u64 offset;
  3590. btrfs_item_key(leaf, &disk_key, slot);
  3591. if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
  3592. unsigned long ptr;
  3593. struct btrfs_file_extent_item *fi;
  3594. fi = btrfs_item_ptr(leaf, slot,
  3595. struct btrfs_file_extent_item);
  3596. fi = (struct btrfs_file_extent_item *)(
  3597. (unsigned long)fi - size_diff);
  3598. if (btrfs_file_extent_type(leaf, fi) ==
  3599. BTRFS_FILE_EXTENT_INLINE) {
  3600. ptr = btrfs_item_ptr_offset(leaf, slot);
  3601. memmove_extent_buffer(leaf, ptr,
  3602. (unsigned long)fi,
  3603. BTRFS_FILE_EXTENT_INLINE_DATA_START);
  3604. }
  3605. }
  3606. memmove_leaf_data(leaf, data_end + size_diff, data_end,
  3607. old_data_start - data_end);
  3608. offset = btrfs_disk_key_offset(&disk_key);
  3609. btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
  3610. btrfs_set_item_key(leaf, &disk_key, slot);
  3611. if (slot == 0)
  3612. fixup_low_keys(trans, path, &disk_key, 1);
  3613. }
  3614. btrfs_set_item_size(leaf, slot, new_size);
  3615. btrfs_mark_buffer_dirty(trans, leaf);
  3616. if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
  3617. btrfs_print_leaf(leaf);
  3618. BUG();
  3619. }
  3620. }
  3621. /*
  3622. * make the item pointed to by the path bigger, data_size is the added size.
  3623. */
  3624. void btrfs_extend_item(struct btrfs_trans_handle *trans,
  3625. const struct btrfs_path *path, u32 data_size)
  3626. {
  3627. int slot;
  3628. struct extent_buffer *leaf;
  3629. u32 nritems;
  3630. unsigned int data_end;
  3631. unsigned int old_data;
  3632. unsigned int old_size;
  3633. int i;
  3634. leaf = path->nodes[0];
  3635. nritems = btrfs_header_nritems(leaf);
  3636. data_end = leaf_data_end(leaf);
  3637. if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) {
  3638. btrfs_print_leaf(leaf);
  3639. BUG();
  3640. }
  3641. slot = path->slots[0];
  3642. old_data = btrfs_item_data_end(leaf, slot);
  3643. BUG_ON(slot < 0);
  3644. if (unlikely(slot >= nritems)) {
  3645. btrfs_print_leaf(leaf);
  3646. btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
  3647. slot, nritems);
  3648. BUG();
  3649. }
  3650. /*
  3651. * item0..itemN ... dataN.offset..dataN.size .. data0.size
  3652. */
  3653. /* first correct the data pointers */
  3654. for (i = slot; i < nritems; i++) {
  3655. u32 ioff;
  3656. ioff = btrfs_item_offset(leaf, i);
  3657. btrfs_set_item_offset(leaf, i, ioff - data_size);
  3658. }
  3659. /* shift the data */
  3660. memmove_leaf_data(leaf, data_end - data_size, data_end,
  3661. old_data - data_end);
  3662. old_size = btrfs_item_size(leaf, slot);
  3663. btrfs_set_item_size(leaf, slot, old_size + data_size);
  3664. btrfs_mark_buffer_dirty(trans, leaf);
  3665. if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
  3666. btrfs_print_leaf(leaf);
  3667. BUG();
  3668. }
  3669. }
  3670. /*
  3671. * Make space in the node before inserting one or more items.
  3672. *
  3673. * @trans: transaction handle
  3674. * @root: root we are inserting items to
  3675. * @path: points to the leaf/slot where we are going to insert new items
  3676. * @batch: information about the batch of items to insert
  3677. *
  3678. * Main purpose is to save stack depth by doing the bulk of the work in a
  3679. * function that doesn't call btrfs_search_slot
  3680. */
  3681. static void setup_items_for_insert(struct btrfs_trans_handle *trans,
  3682. struct btrfs_root *root, struct btrfs_path *path,
  3683. const struct btrfs_item_batch *batch)
  3684. {
  3685. struct btrfs_fs_info *fs_info = root->fs_info;
  3686. int i;
  3687. u32 nritems;
  3688. unsigned int data_end;
  3689. struct btrfs_disk_key disk_key;
  3690. struct extent_buffer *leaf;
  3691. int slot;
  3692. u32 total_size;
  3693. /*
  3694. * Before anything else, update keys in the parent and other ancestors
  3695. * if needed, then release the write locks on them, so that other tasks
  3696. * can use them while we modify the leaf.
  3697. */
  3698. if (path->slots[0] == 0) {
  3699. btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
  3700. fixup_low_keys(trans, path, &disk_key, 1);
  3701. }
  3702. btrfs_unlock_up_safe(path, 1);
  3703. leaf = path->nodes[0];
  3704. slot = path->slots[0];
  3705. nritems = btrfs_header_nritems(leaf);
  3706. data_end = leaf_data_end(leaf);
  3707. total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
  3708. if (unlikely(btrfs_leaf_free_space(leaf) < total_size)) {
  3709. btrfs_print_leaf(leaf);
  3710. btrfs_crit(fs_info, "not enough freespace need %u have %d",
  3711. total_size, btrfs_leaf_free_space(leaf));
  3712. BUG();
  3713. }
  3714. if (slot != nritems) {
  3715. unsigned int old_data = btrfs_item_data_end(leaf, slot);
  3716. if (unlikely(old_data < data_end)) {
  3717. btrfs_print_leaf(leaf);
  3718. btrfs_crit(fs_info,
  3719. "item at slot %d with data offset %u beyond data end of leaf %u",
  3720. slot, old_data, data_end);
  3721. BUG();
  3722. }
  3723. /*
  3724. * item0..itemN ... dataN.offset..dataN.size .. data0.size
  3725. */
  3726. /* first correct the data pointers */
  3727. for (i = slot; i < nritems; i++) {
  3728. u32 ioff;
  3729. ioff = btrfs_item_offset(leaf, i);
  3730. btrfs_set_item_offset(leaf, i,
  3731. ioff - batch->total_data_size);
  3732. }
  3733. /* shift the items */
  3734. memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
  3735. /* shift the data */
  3736. memmove_leaf_data(leaf, data_end - batch->total_data_size,
  3737. data_end, old_data - data_end);
  3738. data_end = old_data;
  3739. }
  3740. /* setup the item for the new data */
  3741. for (i = 0; i < batch->nr; i++) {
  3742. btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
  3743. btrfs_set_item_key(leaf, &disk_key, slot + i);
  3744. data_end -= batch->data_sizes[i];
  3745. btrfs_set_item_offset(leaf, slot + i, data_end);
  3746. btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]);
  3747. }
  3748. btrfs_set_header_nritems(leaf, nritems + batch->nr);
  3749. btrfs_mark_buffer_dirty(trans, leaf);
  3750. if (unlikely(btrfs_leaf_free_space(leaf) < 0)) {
  3751. btrfs_print_leaf(leaf);
  3752. BUG();
  3753. }
  3754. }
  3755. /*
  3756. * Insert a new item into a leaf.
  3757. *
  3758. * @trans: Transaction handle.
  3759. * @root: The root of the btree.
  3760. * @path: A path pointing to the target leaf and slot.
  3761. * @key: The key of the new item.
  3762. * @data_size: The size of the data associated with the new key.
  3763. */
  3764. void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
  3765. struct btrfs_root *root,
  3766. struct btrfs_path *path,
  3767. const struct btrfs_key *key,
  3768. u32 data_size)
  3769. {
  3770. struct btrfs_item_batch batch;
  3771. batch.keys = key;
  3772. batch.data_sizes = &data_size;
  3773. batch.total_data_size = data_size;
  3774. batch.nr = 1;
  3775. setup_items_for_insert(trans, root, path, &batch);
  3776. }
  3777. /*
  3778. * Given a key and some data, insert items into the tree.
  3779. * This does all the path init required, making room in the tree if needed.
  3780. *
  3781. * Returns: 0 on success
  3782. * -EEXIST if the first key already exists
  3783. * < 0 on other errors
  3784. */
  3785. int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
  3786. struct btrfs_root *root,
  3787. struct btrfs_path *path,
  3788. const struct btrfs_item_batch *batch)
  3789. {
  3790. int ret = 0;
  3791. int slot;
  3792. u32 total_size;
  3793. total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
  3794. ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
  3795. if (ret == 0)
  3796. return -EEXIST;
  3797. if (ret < 0)
  3798. return ret;
  3799. slot = path->slots[0];
  3800. BUG_ON(slot < 0);
  3801. setup_items_for_insert(trans, root, path, batch);
  3802. return 0;
  3803. }
  3804. /*
  3805. * Given a key and some data, insert an item into the tree.
  3806. * This does all the path init required, making room in the tree if needed.
  3807. */
  3808. int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  3809. const struct btrfs_key *cpu_key, void *data,
  3810. u32 data_size)
  3811. {
  3812. int ret = 0;
  3813. BTRFS_PATH_AUTO_FREE(path);
  3814. struct extent_buffer *leaf;
  3815. unsigned long ptr;
  3816. path = btrfs_alloc_path();
  3817. if (!path)
  3818. return -ENOMEM;
  3819. ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
  3820. if (!ret) {
  3821. leaf = path->nodes[0];
  3822. ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
  3823. write_extent_buffer(leaf, data, ptr, data_size);
  3824. btrfs_mark_buffer_dirty(trans, leaf);
  3825. }
  3826. return ret;
  3827. }
  3828. /*
  3829. * This function duplicates an item, giving 'new_key' to the new item.
  3830. * It guarantees both items live in the same tree leaf and the new item is
  3831. * contiguous with the original item.
  3832. *
  3833. * This allows us to split a file extent in place, keeping a lock on the leaf
  3834. * the entire time.
  3835. */
  3836. int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
  3837. struct btrfs_root *root,
  3838. struct btrfs_path *path,
  3839. const struct btrfs_key *new_key)
  3840. {
  3841. struct extent_buffer *leaf;
  3842. int ret;
  3843. u32 item_size;
  3844. leaf = path->nodes[0];
  3845. item_size = btrfs_item_size(leaf, path->slots[0]);
  3846. ret = setup_leaf_for_split(trans, root, path,
  3847. item_size + sizeof(struct btrfs_item));
  3848. if (ret)
  3849. return ret;
  3850. path->slots[0]++;
  3851. btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
  3852. leaf = path->nodes[0];
  3853. memcpy_extent_buffer(leaf,
  3854. btrfs_item_ptr_offset(leaf, path->slots[0]),
  3855. btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
  3856. item_size);
  3857. return 0;
  3858. }
  3859. /*
  3860. * delete the pointer from a given node.
  3861. *
  3862. * the tree should have been previously balanced so the deletion does not
  3863. * empty a node.
  3864. *
  3865. * This is exported for use inside btrfs-progs, don't un-export it.
  3866. */
  3867. int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  3868. struct btrfs_path *path, int level, int slot)
  3869. {
  3870. struct extent_buffer *parent = path->nodes[level];
  3871. u32 nritems;
  3872. int ret;
  3873. nritems = btrfs_header_nritems(parent);
  3874. if (slot != nritems - 1) {
  3875. if (level) {
  3876. ret = btrfs_tree_mod_log_insert_move(parent, slot,
  3877. slot + 1, nritems - slot - 1);
  3878. if (unlikely(ret < 0)) {
  3879. btrfs_abort_transaction(trans, ret);
  3880. return ret;
  3881. }
  3882. }
  3883. memmove_extent_buffer(parent,
  3884. btrfs_node_key_ptr_offset(parent, slot),
  3885. btrfs_node_key_ptr_offset(parent, slot + 1),
  3886. sizeof(struct btrfs_key_ptr) *
  3887. (nritems - slot - 1));
  3888. } else if (level) {
  3889. ret = btrfs_tree_mod_log_insert_key(parent, slot,
  3890. BTRFS_MOD_LOG_KEY_REMOVE);
  3891. if (unlikely(ret < 0)) {
  3892. btrfs_abort_transaction(trans, ret);
  3893. return ret;
  3894. }
  3895. }
  3896. nritems--;
  3897. btrfs_set_header_nritems(parent, nritems);
  3898. if (nritems == 0 && parent == root->node) {
  3899. BUG_ON(btrfs_header_level(root->node) != 1);
  3900. /* just turn the root into a leaf and break */
  3901. btrfs_set_header_level(root->node, 0);
  3902. } else if (slot == 0) {
  3903. struct btrfs_disk_key disk_key;
  3904. btrfs_node_key(parent, &disk_key, 0);
  3905. fixup_low_keys(trans, path, &disk_key, level + 1);
  3906. }
  3907. btrfs_mark_buffer_dirty(trans, parent);
  3908. return 0;
  3909. }
  3910. /*
  3911. * a helper function to delete the leaf pointed to by path->slots[1] and
  3912. * path->nodes[1].
  3913. *
  3914. * This deletes the pointer in path->nodes[1] and frees the leaf
  3915. * block extent. zero is returned if it all worked out, < 0 otherwise.
  3916. *
  3917. * The path must have already been setup for deleting the leaf, including
  3918. * all the proper balancing. path->nodes[1] must be locked.
  3919. */
  3920. static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
  3921. struct btrfs_root *root,
  3922. struct btrfs_path *path,
  3923. struct extent_buffer *leaf)
  3924. {
  3925. int ret;
  3926. WARN_ON(btrfs_header_generation(leaf) != trans->transid);
  3927. ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
  3928. if (ret < 0)
  3929. return ret;
  3930. /*
  3931. * btrfs_free_extent is expensive, we want to make sure we
  3932. * aren't holding any locks when we call it
  3933. */
  3934. btrfs_unlock_up_safe(path, 0);
  3935. root_sub_used_bytes(root);
  3936. refcount_inc(&leaf->refs);
  3937. ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
  3938. free_extent_buffer_stale(leaf);
  3939. if (ret < 0)
  3940. btrfs_abort_transaction(trans, ret);
  3941. return ret;
  3942. }
  3943. /*
  3944. * delete the item at the leaf level in path. If that empties
  3945. * the leaf, remove it from the tree
  3946. */
  3947. int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  3948. struct btrfs_path *path, int slot, int nr)
  3949. {
  3950. struct btrfs_fs_info *fs_info = root->fs_info;
  3951. struct extent_buffer *leaf;
  3952. int ret = 0;
  3953. int wret;
  3954. u32 nritems;
  3955. leaf = path->nodes[0];
  3956. nritems = btrfs_header_nritems(leaf);
  3957. if (slot + nr != nritems) {
  3958. const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
  3959. const int data_end = leaf_data_end(leaf);
  3960. u32 dsize = 0;
  3961. int i;
  3962. for (i = 0; i < nr; i++)
  3963. dsize += btrfs_item_size(leaf, slot + i);
  3964. memmove_leaf_data(leaf, data_end + dsize, data_end,
  3965. last_off - data_end);
  3966. for (i = slot + nr; i < nritems; i++) {
  3967. u32 ioff;
  3968. ioff = btrfs_item_offset(leaf, i);
  3969. btrfs_set_item_offset(leaf, i, ioff + dsize);
  3970. }
  3971. memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
  3972. }
  3973. btrfs_set_header_nritems(leaf, nritems - nr);
  3974. nritems -= nr;
  3975. /* delete the leaf if we've emptied it */
  3976. if (nritems == 0) {
  3977. if (leaf != root->node) {
  3978. btrfs_clear_buffer_dirty(trans, leaf);
  3979. ret = btrfs_del_leaf(trans, root, path, leaf);
  3980. if (ret < 0)
  3981. return ret;
  3982. }
  3983. } else {
  3984. int used = leaf_space_used(leaf, 0, nritems);
  3985. if (slot == 0) {
  3986. struct btrfs_disk_key disk_key;
  3987. btrfs_item_key(leaf, &disk_key, 0);
  3988. fixup_low_keys(trans, path, &disk_key, 1);
  3989. }
  3990. /*
  3991. * Try to delete the leaf if it is mostly empty. We do this by
  3992. * trying to move all its items into its left and right neighbours.
  3993. * If we can't move all the items, then we don't delete it - it's
  3994. * not ideal, but future insertions might fill the leaf with more
  3995. * items, or items from other leaves might be moved later into our
  3996. * leaf due to deletions on those leaves.
  3997. */
  3998. if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
  3999. u32 min_push_space;
  4000. /* push_leaf_left fixes the path.
  4001. * make sure the path still points to our leaf
  4002. * for possible call to btrfs_del_ptr below
  4003. */
  4004. slot = path->slots[1];
  4005. refcount_inc(&leaf->refs);
  4006. /*
  4007. * We want to be able to at least push one item to the
  4008. * left neighbour leaf, and that's the first item.
  4009. */
  4010. min_push_space = sizeof(struct btrfs_item) +
  4011. btrfs_item_size(leaf, 0);
  4012. wret = push_leaf_left(trans, root, path, 0,
  4013. min_push_space, 1, (u32)-1);
  4014. if (wret < 0 && wret != -ENOSPC)
  4015. ret = wret;
  4016. if (path->nodes[0] == leaf &&
  4017. btrfs_header_nritems(leaf)) {
  4018. /*
  4019. * If we were not able to push all items from our
  4020. * leaf to its left neighbour, then attempt to
  4021. * either push all the remaining items to the
  4022. * right neighbour or none. There's no advantage
  4023. * in pushing only some items, instead of all, as
  4024. * it's pointless to end up with a leaf having
  4025. * too few items while the neighbours can be full
  4026. * or nearly full.
  4027. */
  4028. nritems = btrfs_header_nritems(leaf);
  4029. min_push_space = leaf_space_used(leaf, 0, nritems);
  4030. wret = push_leaf_right(trans, root, path, 0,
  4031. min_push_space, 1, 0);
  4032. if (wret < 0 && wret != -ENOSPC)
  4033. ret = wret;
  4034. }
  4035. if (btrfs_header_nritems(leaf) == 0) {
  4036. path->slots[1] = slot;
  4037. ret = btrfs_del_leaf(trans, root, path, leaf);
  4038. free_extent_buffer(leaf);
  4039. if (ret < 0)
  4040. return ret;
  4041. } else {
  4042. /* if we're still in the path, make sure
  4043. * we're dirty. Otherwise, one of the
  4044. * push_leaf functions must have already
  4045. * dirtied this buffer
  4046. */
  4047. if (path->nodes[0] == leaf)
  4048. btrfs_mark_buffer_dirty(trans, leaf);
  4049. free_extent_buffer(leaf);
  4050. }
  4051. } else {
  4052. btrfs_mark_buffer_dirty(trans, leaf);
  4053. }
  4054. }
  4055. return ret;
  4056. }
  4057. /*
  4058. * A helper function to walk down the tree starting at min_key, and looking
  4059. * for leaves that have a minimum transaction id.
  4060. * This is used by the btree defrag code, and tree logging
  4061. *
  4062. * This does not cow, but it does stuff the starting key it finds back
  4063. * into min_key, so you can call btrfs_search_slot with cow=1 on the
  4064. * key and get a writable path.
  4065. *
  4066. * min_trans indicates the oldest transaction that you are interested
  4067. * in walking through. Any nodes or leaves older than min_trans are
  4068. * skipped over (without reading them).
  4069. *
  4070. * returns zero if something useful was found, < 0 on error and 1 if there
  4071. * was nothing in the tree that matched the search criteria.
  4072. */
  4073. int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
  4074. struct btrfs_path *path,
  4075. u64 min_trans)
  4076. {
  4077. struct extent_buffer *cur;
  4078. int slot;
  4079. int sret;
  4080. u32 nritems;
  4081. int level;
  4082. int ret = 1;
  4083. const bool keep_locks = path->keep_locks;
  4084. ASSERT(!path->nowait);
  4085. ASSERT(path->lowest_level == 0);
  4086. path->keep_locks = true;
  4087. again:
  4088. cur = btrfs_read_lock_root_node(root);
  4089. level = btrfs_header_level(cur);
  4090. WARN_ON(path->nodes[level]);
  4091. path->nodes[level] = cur;
  4092. path->locks[level] = BTRFS_READ_LOCK;
  4093. if (btrfs_header_generation(cur) < min_trans) {
  4094. ret = 1;
  4095. goto out;
  4096. }
  4097. while (1) {
  4098. nritems = btrfs_header_nritems(cur);
  4099. level = btrfs_header_level(cur);
  4100. sret = btrfs_bin_search(cur, 0, min_key, &slot);
  4101. if (sret < 0) {
  4102. ret = sret;
  4103. goto out;
  4104. }
  4105. /* At level 0 we're done, setup the path and exit. */
  4106. if (level == 0) {
  4107. if (slot >= nritems)
  4108. goto find_next_key;
  4109. ret = 0;
  4110. path->slots[level] = slot;
  4111. /* Save our key for returning back. */
  4112. btrfs_item_key_to_cpu(cur, min_key, slot);
  4113. goto out;
  4114. }
  4115. if (sret && slot > 0)
  4116. slot--;
  4117. /*
  4118. * check this node pointer against the min_trans parameters.
  4119. * If it is too old, skip to the next one.
  4120. */
  4121. while (slot < nritems) {
  4122. u64 gen;
  4123. gen = btrfs_node_ptr_generation(cur, slot);
  4124. if (gen < min_trans) {
  4125. slot++;
  4126. continue;
  4127. }
  4128. break;
  4129. }
  4130. find_next_key:
  4131. /*
  4132. * we didn't find a candidate key in this node, walk forward
  4133. * and find another one
  4134. */
  4135. path->slots[level] = slot;
  4136. if (slot >= nritems) {
  4137. sret = btrfs_find_next_key(root, path, min_key, level,
  4138. min_trans);
  4139. if (sret == 0) {
  4140. btrfs_release_path(path);
  4141. goto again;
  4142. } else {
  4143. goto out;
  4144. }
  4145. }
  4146. cur = btrfs_read_node_slot(cur, slot);
  4147. if (IS_ERR(cur)) {
  4148. ret = PTR_ERR(cur);
  4149. goto out;
  4150. }
  4151. btrfs_tree_read_lock(cur);
  4152. path->locks[level - 1] = BTRFS_READ_LOCK;
  4153. path->nodes[level - 1] = cur;
  4154. unlock_up(path, level, 1, 0, NULL);
  4155. }
  4156. out:
  4157. path->keep_locks = keep_locks;
  4158. if (ret == 0)
  4159. btrfs_unlock_up_safe(path, 1);
  4160. return ret;
  4161. }
  4162. /*
  4163. * this is similar to btrfs_next_leaf, but does not try to preserve
  4164. * and fixup the path. It looks for and returns the next key in the
  4165. * tree based on the current path and the min_trans parameters.
  4166. *
  4167. * 0 is returned if another key is found, < 0 if there are any errors
  4168. * and 1 is returned if there are no higher keys in the tree
  4169. *
  4170. * path->keep_locks should be set to true on the search made before
  4171. * calling this function.
  4172. */
  4173. int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
  4174. struct btrfs_key *key, int level, u64 min_trans)
  4175. {
  4176. int slot;
  4177. struct extent_buffer *c;
  4178. WARN_ON(!path->keep_locks && !path->skip_locking);
  4179. while (level < BTRFS_MAX_LEVEL) {
  4180. if (!path->nodes[level])
  4181. return 1;
  4182. slot = path->slots[level] + 1;
  4183. c = path->nodes[level];
  4184. next:
  4185. if (slot >= btrfs_header_nritems(c)) {
  4186. int ret;
  4187. int orig_lowest;
  4188. struct btrfs_key cur_key;
  4189. if (level + 1 >= BTRFS_MAX_LEVEL ||
  4190. !path->nodes[level + 1])
  4191. return 1;
  4192. if (path->locks[level + 1] || path->skip_locking) {
  4193. level++;
  4194. continue;
  4195. }
  4196. slot = btrfs_header_nritems(c) - 1;
  4197. if (level == 0)
  4198. btrfs_item_key_to_cpu(c, &cur_key, slot);
  4199. else
  4200. btrfs_node_key_to_cpu(c, &cur_key, slot);
  4201. orig_lowest = path->lowest_level;
  4202. btrfs_release_path(path);
  4203. path->lowest_level = level;
  4204. ret = btrfs_search_slot(NULL, root, &cur_key, path,
  4205. 0, 0);
  4206. path->lowest_level = orig_lowest;
  4207. if (ret < 0)
  4208. return ret;
  4209. c = path->nodes[level];
  4210. slot = path->slots[level];
  4211. if (ret == 0)
  4212. slot++;
  4213. goto next;
  4214. }
  4215. if (level == 0)
  4216. btrfs_item_key_to_cpu(c, key, slot);
  4217. else {
  4218. u64 gen = btrfs_node_ptr_generation(c, slot);
  4219. if (gen < min_trans) {
  4220. slot++;
  4221. goto next;
  4222. }
  4223. btrfs_node_key_to_cpu(c, key, slot);
  4224. }
  4225. return 0;
  4226. }
  4227. return 1;
  4228. }
  4229. int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
  4230. u64 time_seq)
  4231. {
  4232. int slot;
  4233. int level;
  4234. struct extent_buffer *c;
  4235. struct extent_buffer *next;
  4236. struct btrfs_fs_info *fs_info = root->fs_info;
  4237. struct btrfs_key key;
  4238. bool need_commit_sem = false;
  4239. u32 nritems;
  4240. int ret;
  4241. int i;
  4242. /*
  4243. * The nowait semantics are used only for write paths, where we don't
  4244. * use the tree mod log and sequence numbers.
  4245. */
  4246. if (time_seq)
  4247. ASSERT(!path->nowait);
  4248. nritems = btrfs_header_nritems(path->nodes[0]);
  4249. if (nritems == 0)
  4250. return 1;
  4251. btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
  4252. again:
  4253. level = 1;
  4254. next = NULL;
  4255. btrfs_release_path(path);
  4256. path->keep_locks = true;
  4257. if (time_seq) {
  4258. ret = btrfs_search_old_slot(root, &key, path, time_seq);
  4259. } else {
  4260. if (path->need_commit_sem) {
  4261. path->need_commit_sem = false;
  4262. need_commit_sem = true;
  4263. if (path->nowait) {
  4264. if (!down_read_trylock(&fs_info->commit_root_sem)) {
  4265. ret = -EAGAIN;
  4266. goto done;
  4267. }
  4268. } else {
  4269. down_read(&fs_info->commit_root_sem);
  4270. }
  4271. }
  4272. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  4273. }
  4274. path->keep_locks = false;
  4275. if (ret < 0)
  4276. goto done;
  4277. nritems = btrfs_header_nritems(path->nodes[0]);
  4278. /*
  4279. * By releasing the path above we dropped all our locks. A balance
  4280. * could have happened and
  4281. *
  4282. * 1. added more items after the previous last item
  4283. * 2. deleted the previous last item
  4284. *
  4285. * So, check again here and advance the path if there are now more
  4286. * items available.
  4287. */
  4288. if (nritems > 0 && path->slots[0] <= nritems - 1) {
  4289. if (ret == 0 && path->slots[0] != nritems - 1) {
  4290. path->slots[0]++;
  4291. goto done;
  4292. } else if (ret > 0) {
  4293. ret = 0;
  4294. goto done;
  4295. }
  4296. }
  4297. while (level < BTRFS_MAX_LEVEL) {
  4298. if (!path->nodes[level]) {
  4299. ret = 1;
  4300. goto done;
  4301. }
  4302. slot = path->slots[level] + 1;
  4303. c = path->nodes[level];
  4304. if (slot >= btrfs_header_nritems(c)) {
  4305. level++;
  4306. if (level == BTRFS_MAX_LEVEL) {
  4307. ret = 1;
  4308. goto done;
  4309. }
  4310. continue;
  4311. }
  4312. /*
  4313. * Our current level is where we're going to start from, and to
  4314. * make sure lockdep doesn't complain we need to drop our locks
  4315. * and nodes from 0 to our current level.
  4316. */
  4317. for (i = 0; i < level; i++) {
  4318. if (path->locks[level]) {
  4319. btrfs_tree_read_unlock(path->nodes[i]);
  4320. path->locks[i] = 0;
  4321. }
  4322. free_extent_buffer(path->nodes[i]);
  4323. path->nodes[i] = NULL;
  4324. }
  4325. next = c;
  4326. ret = read_block_for_search(root, path, &next, slot, &key);
  4327. if (ret == -EAGAIN && !path->nowait)
  4328. goto again;
  4329. if (ret < 0) {
  4330. btrfs_release_path(path);
  4331. goto done;
  4332. }
  4333. if (!path->skip_locking) {
  4334. ret = btrfs_try_tree_read_lock(next);
  4335. if (!ret && path->nowait) {
  4336. ret = -EAGAIN;
  4337. goto done;
  4338. }
  4339. if (!ret && time_seq) {
  4340. /*
  4341. * If we don't get the lock, we may be racing
  4342. * with push_leaf_left, holding that lock while
  4343. * itself waiting for the leaf we've currently
  4344. * locked. To solve this situation, we give up
  4345. * on our lock and cycle.
  4346. */
  4347. free_extent_buffer(next);
  4348. btrfs_release_path(path);
  4349. cond_resched();
  4350. goto again;
  4351. }
  4352. if (!ret)
  4353. btrfs_tree_read_lock(next);
  4354. }
  4355. break;
  4356. }
  4357. path->slots[level] = slot;
  4358. while (1) {
  4359. level--;
  4360. path->nodes[level] = next;
  4361. path->slots[level] = 0;
  4362. if (!path->skip_locking)
  4363. path->locks[level] = BTRFS_READ_LOCK;
  4364. if (!level)
  4365. break;
  4366. ret = read_block_for_search(root, path, &next, 0, &key);
  4367. if (ret == -EAGAIN && !path->nowait)
  4368. goto again;
  4369. if (ret < 0) {
  4370. btrfs_release_path(path);
  4371. goto done;
  4372. }
  4373. if (!path->skip_locking) {
  4374. if (path->nowait) {
  4375. if (!btrfs_try_tree_read_lock(next)) {
  4376. ret = -EAGAIN;
  4377. goto done;
  4378. }
  4379. } else {
  4380. btrfs_tree_read_lock(next);
  4381. }
  4382. }
  4383. }
  4384. ret = 0;
  4385. done:
  4386. unlock_up(path, 0, 1, 0, NULL);
  4387. if (need_commit_sem) {
  4388. int ret2;
  4389. path->need_commit_sem = true;
  4390. ret2 = finish_need_commit_sem_search(path);
  4391. up_read(&fs_info->commit_root_sem);
  4392. if (ret2)
  4393. ret = ret2;
  4394. }
  4395. return ret;
  4396. }
  4397. int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
  4398. {
  4399. path->slots[0]++;
  4400. if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
  4401. return btrfs_next_old_leaf(root, path, time_seq);
  4402. return 0;
  4403. }
  4404. /*
  4405. * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
  4406. * searching until it gets past min_objectid or finds an item of 'type'
  4407. *
  4408. * returns 0 if something is found, 1 if nothing was found and < 0 on error
  4409. */
  4410. int btrfs_previous_item(struct btrfs_root *root,
  4411. struct btrfs_path *path, u64 min_objectid,
  4412. int type)
  4413. {
  4414. struct btrfs_key found_key;
  4415. struct extent_buffer *leaf;
  4416. u32 nritems;
  4417. int ret;
  4418. while (1) {
  4419. if (path->slots[0] == 0) {
  4420. ret = btrfs_prev_leaf(root, path);
  4421. if (ret != 0)
  4422. return ret;
  4423. } else {
  4424. path->slots[0]--;
  4425. }
  4426. leaf = path->nodes[0];
  4427. nritems = btrfs_header_nritems(leaf);
  4428. if (nritems == 0)
  4429. return 1;
  4430. if (path->slots[0] == nritems)
  4431. path->slots[0]--;
  4432. btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
  4433. if (found_key.objectid < min_objectid)
  4434. break;
  4435. if (found_key.type == type)
  4436. return 0;
  4437. if (found_key.objectid == min_objectid &&
  4438. found_key.type < type)
  4439. break;
  4440. }
  4441. return 1;
  4442. }
  4443. /*
  4444. * search in extent tree to find a previous Metadata/Data extent item with
  4445. * min objecitd.
  4446. *
  4447. * returns 0 if something is found, 1 if nothing was found and < 0 on error
  4448. */
  4449. int btrfs_previous_extent_item(struct btrfs_root *root,
  4450. struct btrfs_path *path, u64 min_objectid)
  4451. {
  4452. struct btrfs_key found_key;
  4453. struct extent_buffer *leaf;
  4454. u32 nritems;
  4455. int ret;
  4456. while (1) {
  4457. if (path->slots[0] == 0) {
  4458. ret = btrfs_prev_leaf(root, path);
  4459. if (ret != 0)
  4460. return ret;
  4461. } else {
  4462. path->slots[0]--;
  4463. }
  4464. leaf = path->nodes[0];
  4465. nritems = btrfs_header_nritems(leaf);
  4466. if (nritems == 0)
  4467. return 1;
  4468. if (path->slots[0] == nritems)
  4469. path->slots[0]--;
  4470. btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
  4471. if (found_key.objectid < min_objectid)
  4472. break;
  4473. if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
  4474. found_key.type == BTRFS_METADATA_ITEM_KEY)
  4475. return 0;
  4476. if (found_key.objectid == min_objectid &&
  4477. found_key.type < BTRFS_EXTENT_ITEM_KEY)
  4478. break;
  4479. }
  4480. return 1;
  4481. }
  4482. int __init btrfs_ctree_init(void)
  4483. {
  4484. btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);
  4485. if (!btrfs_path_cachep)
  4486. return -ENOMEM;
  4487. return 0;
  4488. }
  4489. void __cold btrfs_ctree_exit(void)
  4490. {
  4491. kmem_cache_destroy(btrfs_path_cachep);
  4492. }