mballoc.c 205 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
7727872797280728172827283728472857286728772887289
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
  4. * Written by Alex Tomas <alex@clusterfs.com>
  5. */
  6. /*
  7. * mballoc.c contains the multiblocks allocation routines
  8. */
  9. #include "ext4_jbd2.h"
  10. #include "mballoc.h"
  11. #include <linux/log2.h>
  12. #include <linux/module.h>
  13. #include <linux/slab.h>
  14. #include <linux/nospec.h>
  15. #include <linux/backing-dev.h>
  16. #include <linux/freezer.h>
  17. #include <trace/events/ext4.h>
  18. #include <kunit/static_stub.h>
  19. /*
  20. * MUSTDO:
  21. * - test ext4_ext_search_left() and ext4_ext_search_right()
  22. * - search for metadata in few groups
  23. *
  24. * TODO v4:
  25. * - normalization should take into account whether file is still open
  26. * - discard preallocations if no free space left (policy?)
  27. * - don't normalize tails
  28. * - quota
  29. * - reservation for superuser
  30. *
  31. * TODO v3:
  32. * - bitmap read-ahead (proposed by Oleg Drokin aka green)
  33. * - track min/max extents in each group for better group selection
  34. * - mb_mark_used() may allocate chunk right after splitting buddy
  35. * - tree of groups sorted by number of free blocks
  36. * - error handling
  37. */
  38. /*
  39. * An allocation request asks for multiple blocks located near the
  40. * specified goal (block) value.
  41. *
  42. * During initialization phase of the allocator we decide to use the
  43. * group preallocation or inode preallocation depending on the size of
  44. * the file. The size of the file could be the resulting file size we
  45. * would have after allocation, or the current file size, whichever
  46. * is larger. If the size is less than sbi->s_mb_stream_request we
  47. * select to use the group preallocation. The default value of
  48. * s_mb_stream_request is 16 blocks. This can also be tuned via
  49. * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
  50. * terms of number of blocks.
  51. *
  52. * The main motivation for having small files use group preallocation is to
  53. * ensure that we have small files closer together on the disk.
  54. *
  55. * In the first stage, the allocator looks at the inode prealloc list,
  56. * ext4_inode_info->i_prealloc_list, which contains list of prealloc
  57. * spaces for this particular inode. The inode prealloc space is
  58. * represented as:
  59. *
  60. * pa_lstart -> the logical start block for this prealloc space
  61. * pa_pstart -> the physical start block for this prealloc space
  62. * pa_len -> length for this prealloc space (in clusters)
  63. * pa_free -> free space available in this prealloc space (in clusters)
  64. *
  65. * The inode preallocation space is selected by looking at the _logical_
  66. * start block. Only if the logical file block falls within the range of the
  67. * prealloc space will we consume that particular prealloc space. This makes
  68. * sure that we have contiguous physical blocks representing the file blocks.
  69. *
  70. * The important thing to be noted in case of inode prealloc space is that
  71. * we don't modify the values associated to inode prealloc space except
  72. * pa_free.
  73. *
  74. * If we are not able to find blocks in the inode prealloc space and if we
  75. * have the group allocation flag set then we look at the locality group
  76. * prealloc space. These are per-CPU prealloc lists, represented as
  77. *
  78. * ext4_sb_info.s_locality_groups[smp_processor_id()]
  79. *
  80. * The reason for having a per cpu locality group is to reduce the contention
  81. * between CPUs. It is possible to get scheduled at this point.
  82. *
  83. * The locality group prealloc space is used looking at whether we have
  84. * enough free space (pa_free) within the prealloc space.
  85. *
  86. * If we can't allocate blocks via inode prealloc and/or locality group
  87. * prealloc then we look at the buddy cache. The buddy cache is represented
  88. * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
  89. * mapped to the buddy and bitmap information regarding different
  90. * groups. The buddy information is attached to buddy cache inode so that
  91. * we can access them through the page cache. The information regarding
  92. * each group is loaded via ext4_mb_load_buddy. The information consists of
  93. * the block bitmap and the buddy information. It is stored in the
  94. * inode as:
  95. *
  96. * { folio }
  97. * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
  98. *
  99. *
  100. * one block each for bitmap and buddy information. So for each group we
  101. * take up 2 blocks. A folio can contain blocks_per_folio (folio_size /
  102. * blocksize) blocks. So it can have information regarding groups_per_folio
  103. * which is blocks_per_folio/2
  104. *
  105. * The buddy cache inode is not stored on disk. The inode is thrown
  106. * away when the filesystem is unmounted.
  107. *
  108. * We look for count number of blocks in the buddy cache. If we were able
  109. * to locate that many free blocks we return with additional information
  110. * regarding rest of the contiguous physical block available
  111. *
  112. * Before allocating blocks via buddy cache we normalize the request
  113. * blocks. This ensures we ask for more blocks than we need. The extra
  114. * blocks that we get after allocation are added to the respective prealloc
  115. * list. In case of inode preallocation we follow a list of heuristics
  116. * based on file size. This can be found in ext4_mb_normalize_request. If
  117. * we are doing a group prealloc we try to normalize the request to
  118. * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
  119. * dependent on the cluster size; for non-bigalloc file systems, it is
  120. * 512 blocks. This can be tuned via
  121. * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
  122. * terms of number of blocks. If we have mounted the file system with -o
  123. * stripe=<value> option the group prealloc request is normalized to the
  124. * smallest multiple of the stripe value (sbi->s_stripe) which is
  125. * greater than the default mb_group_prealloc.
  126. *
  127. * If "mb_optimize_scan" mount option is set, we maintain in memory group info
  128. * structures in two data structures:
  129. *
  130. * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders)
  131. *
  132. * Locking: Writers use xa_lock, readers use rcu_read_lock.
  133. *
  134. * This is an array of xarrays where the index in the array represents the
  135. * largest free order in the buddy bitmap of the participating group infos of
  136. * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total
  137. * number of buddy bitmap orders possible) number of xarrays. Group-infos are
  138. * placed in appropriate xarrays.
  139. *
  140. * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size)
  141. *
  142. * Locking: Writers use xa_lock, readers use rcu_read_lock.
  143. *
  144. * This is an array of xarrays where in the i-th xarray there are groups with
  145. * average fragment size >= 2^i and < 2^(i+1). The average fragment size
  146. * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
  147. * Note that we don't bother with a special xarray for completely empty
  148. * groups so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed
  149. * in appropriate xarrays.
  150. *
  151. * In xarray, the index is the block group number, the value is the block group
  152. * information, and a non-empty value indicates the block group is present in
  153. * the current xarray.
  154. *
  155. * When "mb_optimize_scan" mount option is set, mballoc consults the above data
  156. * structures to decide the order in which groups are to be traversed for
  157. * fulfilling an allocation request.
  158. *
  159. * At CR_POWER2_ALIGNED, we look for groups which have the largest_free_order
  160. * >= the order of the request. We directly look at the largest free order list
  161. * in the data structure (1) above where largest_free_order = order of the
  162. * request. If that list is empty, we look at remaining list in the increasing
  163. * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED
  164. * lookup in O(1) time.
  165. *
  166. * At CR_GOAL_LEN_FAST, we only consider groups where
  167. * average fragment size > request size. So, we lookup a group which has average
  168. * fragment size just above or equal to request size using our average fragment
  169. * size group lists (data structure 2) in O(1) time.
  170. *
  171. * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied
  172. * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
  173. * CR_GOAL_LEN_FAST suggests that there is no BG that has avg
  174. * fragment size > goal length. So before falling to the slower
  175. * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and
  176. * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big
  177. * enough average fragment size. This increases the chances of finding a
  178. * suitable block group in O(1) time and results in faster allocation at the
  179. * cost of reduced size of allocation.
  180. *
  181. * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
  182. * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and
  183. * CR_GOAL_LEN_FAST phase.
  184. *
  185. * The regular allocator (using the buddy cache) supports a few tunables.
  186. *
  187. * /sys/fs/ext4/<partition>/mb_min_to_scan
  188. * /sys/fs/ext4/<partition>/mb_max_to_scan
  189. * /sys/fs/ext4/<partition>/mb_order2_req
  190. * /sys/fs/ext4/<partition>/mb_max_linear_groups
  191. *
  192. * The regular allocator uses buddy scan only if the request len is power of
  193. * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
  194. * value of s_mb_order2_reqs can be tuned via
  195. * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
  196. * stripe size (sbi->s_stripe), we try to search for contiguous block in
  197. * stripe size. This should result in better allocation on RAID setups. If
  198. * not, we search in the specific group using bitmap for best extents. The
  199. * tunable min_to_scan and max_to_scan control the behaviour here.
  200. * min_to_scan indicates how long the mballoc __must__ look for a best
  201. * extent and max_to_scan indicates how long the mballoc __can__ look for a
  202. * best extent in the found extents. Searching for the blocks starts with
  203. * the group specified as the goal value in allocation context via
  204. * ac_g_ex. Each group is first checked based on the criteria whether it
  205. * can be used for allocation. ext4_mb_good_group explains how the groups are
  206. * checked.
  207. *
  208. * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
  209. * get traversed linearly. That may result in subsequent allocations being not
  210. * close to each other. And so, the underlying device may get filled up in a
  211. * non-linear fashion. While that may not matter on non-rotational devices, for
  212. * rotational devices that may result in higher seek times. "mb_max_linear_groups"
  213. * tells mballoc how many groups mballoc should search linearly before
  214. * consulting the above data structures for more efficient lookups. For
  215. * non rotational devices, this value defaults to 0 and for rotational devices
  216. * this is set to MB_DEFAULT_LINEAR_LIMIT.
  217. *
  218. * Both kinds of prealloc space get populated as described above. So for the first
  219. * request we will hit the buddy cache which will result in this prealloc
  220. * space getting filled. The prealloc space is then later used for the
  221. * subsequent request.
  222. */
  223. /*
  224. * mballoc operates on the following data:
  225. * - on-disk bitmap
  226. * - in-core buddy (actually includes buddy and bitmap)
  227. * - preallocation descriptors (PAs)
  228. *
  229. * there are two types of preallocations:
  230. * - inode
  231. * assigned to specific inode and can be used for this inode only.
  232. * it describes part of inode's space preallocated to specific
  233. * physical blocks. any block from that preallocation can be used
  234. * independently. the descriptor just tracks number of blocks left
  235. * unused. so, before taking some block from descriptor, one must
  236. * make sure corresponded logical block isn't allocated yet. this
  237. * also means that freeing any block within descriptor's range
  238. * must discard all preallocated blocks.
  239. * - locality group
  240. * assigned to specific locality group which does not translate to
  241. * permanent set of inodes: inode can join and leave group. space
  242. * from this type of preallocation can be used for any inode. thus
  243. * it's consumed from the beginning to the end.
  244. *
  245. * relation between them can be expressed as:
  246. * in-core buddy = on-disk bitmap + preallocation descriptors
  247. *
  248. * this means the blocks mballoc considers used are:
  249. * - allocated blocks (persistent)
  250. * - preallocated blocks (non-persistent)
  251. *
  252. * consistency in mballoc world means that at any time a block is either
  253. * free or used in ALL structures. notice: "any time" should not be read
  254. * literally -- time is discrete and delimited by locks.
  255. *
  256. * to keep it simple, we don't use block numbers, instead we count number of
  257. * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
  258. *
  259. * all operations can be expressed as:
  260. * - init buddy: buddy = on-disk + PAs
  261. * - new PA: buddy += N; PA = N
  262. * - use inode PA: on-disk += N; PA -= N
  263. * - discard inode PA buddy -= on-disk - PA; PA = 0
  264. * - use locality group PA on-disk += N; PA -= N
  265. * - discard locality group PA buddy -= PA; PA = 0
  266. * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
  267. * is used in real operation because we can't know actual used
  268. * bits from PA, only from on-disk bitmap
  269. *
  270. * if we follow this strict logic, then all operations above should be atomic.
  271. * given some of them can block, we'd have to use something like semaphores
  272. * killing performance on high-end SMP hardware. let's try to relax it using
  273. * the following knowledge:
  274. * 1) if buddy is referenced, it's already initialized
  275. * 2) while block is used in buddy and the buddy is referenced,
  276. * nobody can re-allocate that block
  277. * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
  278. * bit set and PA claims same block, it's OK. IOW, one can set bit in
  279. * on-disk bitmap if buddy has same bit set or/and PA covers corresponded
  280. * block
  281. *
  282. * so, now we're building a concurrency table:
  283. * - init buddy vs.
  284. * - new PA
  285. * blocks for PA are allocated in the buddy, buddy must be referenced
  286. * until PA is linked to allocation group to avoid concurrent buddy init
  287. * - use inode PA
  288. * we need to make sure that either on-disk bitmap or PA has uptodate data
  289. * given (3) we care that PA-=N operation doesn't interfere with init
  290. * - discard inode PA
  291. * the simplest way would be to have buddy initialized by the discard
  292. * - use locality group PA
  293. * again PA-=N must be serialized with init
  294. * - discard locality group PA
  295. * the simplest way would be to have buddy initialized by the discard
  296. * - new PA vs.
  297. * - use inode PA
  298. * i_data_sem serializes them
  299. * - discard inode PA
  300. * discard process must wait until PA isn't used by another process
  301. * - use locality group PA
  302. * some mutex should serialize them
  303. * - discard locality group PA
  304. * discard process must wait until PA isn't used by another process
  305. * - use inode PA
  306. * - use inode PA
  307. * i_data_sem or another mutex should serialize them
  308. * - discard inode PA
  309. * discard process must wait until PA isn't used by another process
  310. * - use locality group PA
  311. * nothing wrong here -- they're different PAs covering different blocks
  312. * - discard locality group PA
  313. * discard process must wait until PA isn't used by another process
  314. *
  315. * now we're ready to draw a few conclusions:
  316. * - PA is referenced and while it is no discard is possible
  317. * - PA is referenced until block isn't marked in on-disk bitmap
  318. * - PA changes only after on-disk bitmap
  319. * - discard must not compete with init. either init is done before
  320. * any discard or they're serialized somehow
  321. * - buddy init as sum of on-disk bitmap and PAs is done atomically
  322. *
  323. * a special case when we've used PA to emptiness. no need to modify buddy
  324. * in this case, but we should care about concurrent init
  325. *
  326. */
  327. /*
  328. * Logic in few words:
  329. *
  330. * - allocation:
  331. * load group
  332. * find blocks
  333. * mark bits in on-disk bitmap
  334. * release group
  335. *
  336. * - use preallocation:
  337. * find proper PA (per-inode or group)
  338. * load group
  339. * mark bits in on-disk bitmap
  340. * release group
  341. * release PA
  342. *
  343. * - free:
  344. * load group
  345. * mark bits in on-disk bitmap
  346. * release group
  347. *
  348. * - discard preallocations in group:
  349. * mark PAs deleted
  350. * move them onto local list
  351. * load on-disk bitmap
  352. * load group
  353. * remove PA from object (inode or locality group)
  354. * mark free blocks in-core
  355. *
  356. * - discard inode's preallocations:
  357. */
  358. /*
  359. * Locking rules
  360. *
  361. * Locks:
  362. * - bitlock on a group (group)
  363. * - object (inode/locality) (object)
  364. * - per-pa lock (pa)
  365. * - cr_power2_aligned lists lock (cr_power2_aligned)
  366. * - cr_goal_len_fast lists lock (cr_goal_len_fast)
  367. *
  368. * Paths:
  369. * - new pa
  370. * object
  371. * group
  372. *
  373. * - find and use pa:
  374. * pa
  375. *
  376. * - release consumed pa:
  377. * pa
  378. * group
  379. * object
  380. *
  381. * - generate in-core bitmap:
  382. * group
  383. * pa
  384. *
  385. * - discard all for given object (inode, locality group):
  386. * object
  387. * pa
  388. * group
  389. *
  390. * - discard all for given group:
  391. * group
  392. * pa
  393. * group
  394. * object
  395. *
  396. * - allocation path (ext4_mb_regular_allocator)
  397. * group
  398. * cr_power2_aligned/cr_goal_len_fast
  399. */
  400. static struct kmem_cache *ext4_pspace_cachep;
  401. static struct kmem_cache *ext4_ac_cachep;
  402. static struct kmem_cache *ext4_free_data_cachep;
  403. /* We create slab caches for groupinfo data structures based on the
  404. * superblock block size. There will be one per mounted filesystem for
  405. * each unique s_blocksize_bits */
  406. #define NR_GRPINFO_CACHES 8
  407. static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
  408. static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
  409. "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
  410. "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
  411. "ext4_groupinfo_64k", "ext4_groupinfo_128k"
  412. };
  413. static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
  414. ext4_group_t group);
  415. static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
  416. static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
  417. ext4_group_t group);
  418. static int ext4_try_to_trim_range(struct super_block *sb,
  419. struct ext4_buddy *e4b, ext4_grpblk_t start,
  420. ext4_grpblk_t max, ext4_grpblk_t minblocks);
  421. /*
  422. * The algorithm using this percpu seq counter goes below:
  423. * 1. We sample the percpu discard_pa_seq counter before trying for block
  424. * allocation in ext4_mb_new_blocks().
  425. * 2. We increment this percpu discard_pa_seq counter when we either allocate
  426. * or free these blocks i.e. while marking those blocks as used/free in
  427. * mb_mark_used()/mb_free_blocks().
  428. * 3. We also increment this percpu seq counter when we successfully identify
  429. * that the bb_prealloc_list is not empty and hence proceed for discarding
  430. * of those PAs inside ext4_mb_discard_group_preallocations().
  431. *
  432. * Now to make sure that the regular fast path of block allocation is not
  433. * affected, as a small optimization we only sample the percpu seq counter
  434. * on that cpu. Only when the block allocation fails and when freed blocks
  435. * found were 0, that is when we sample percpu seq counter for all cpus using
  436. * below function ext4_get_discard_pa_seq_sum(). This happens after making
  437. * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
  438. */
  439. static DEFINE_PER_CPU(u64, discard_pa_seq);
  440. static inline u64 ext4_get_discard_pa_seq_sum(void)
  441. {
  442. int __cpu;
  443. u64 __seq = 0;
  444. for_each_possible_cpu(__cpu)
  445. __seq += per_cpu(discard_pa_seq, __cpu);
  446. return __seq;
  447. }
  /*
   * Align a bitmap address for the unsigned-long based bit operations.
   *
   * @bit:  in/out bit index relative to @addr; on return it is relative to
   *        the returned address (increased by 8 bits for every byte that
   *        @addr was moved back).
   * @addr: possibly unaligned address of the bitmap.
   *
   * Returns @addr rounded down to a multiple of sizeof(unsigned long).
   *
   * The alignment mask is derived from sizeof(unsigned long) rather than
   * being spelled out once per word size, so 32-bit and 64-bit builds
   * share a single code path (mask 3 and 7 respectively).
   */
  static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
  {
  	const unsigned long align_mask = sizeof(unsigned long) - 1;
  	unsigned long raw = (unsigned long) addr;
  	unsigned long byte_off = raw & align_mask;

  	/* The masking trick only works for power-of-two word sizes. */
  	_Static_assert((sizeof(unsigned long) & (sizeof(unsigned long) - 1)) == 0,
  		       "how many bits you are?!");

  	*bit += byte_off << 3;
  	return (void *) (raw - byte_off);
  }
  /*
   * Test bit @bit of the bitmap at @addr and return ext4_test_bit()'s
   * result.
   *
   * @addr is first rounded down to an unsigned-long boundary (and @bit
   * adjusted to match) because ext4_test_bit() needs an unsigned-long
   * aligned address on architectures like powerpc.
   */
  static inline int mb_test_bit(int bit, void *addr)
  {
  	void *aligned = mb_correct_addr_and_bit(&bit, addr);

  	return ext4_test_bit(bit, aligned);
  }
  /*
   * Set bit @bit of the bitmap at @addr, after aligning the address (and
   * shifting the bit index) via mb_correct_addr_and_bit().
   */
  static inline void mb_set_bit(int bit, void *addr)
  {
  	void *aligned = mb_correct_addr_and_bit(&bit, addr);

  	ext4_set_bit(bit, aligned);
  }
  /*
   * Clear bit @bit of the bitmap at @addr, after aligning the address (and
   * shifting the bit index) via mb_correct_addr_and_bit().
   */
  static inline void mb_clear_bit(int bit, void *addr)
  {
  	void *aligned = mb_correct_addr_and_bit(&bit, addr);

  	ext4_clear_bit(bit, aligned);
  }
  /*
   * Clear bit @bit of the bitmap at @addr and return what
   * ext4_test_and_clear_bit() reports. The address is aligned (and the bit
   * index shifted) via mb_correct_addr_and_bit() first.
   */
  static inline int mb_test_and_clear_bit(int bit, void *addr)
  {
  	void *aligned = mb_correct_addr_and_bit(&bit, addr);

  	return ext4_test_and_clear_bit(bit, aligned);
  }
  /*
   * Search the bitmap at @addr for a zero bit, starting at bit @start and
   * looking at no more than @max bits.
   *
   * The address is aligned first; the resulting bit shift is added to both
   * the start and the limit handed to ext4_find_next_zero_bit() and then
   * subtracted from its answer. The result is clamped so that it never
   * exceeds @max.
   */
  static inline int mb_find_next_zero_bit(void *addr, int max, int start)
  {
  	int shift = 0;
  	void *aligned = mb_correct_addr_and_bit(&shift, addr);
  	int found;

  	found = ext4_find_next_zero_bit(aligned, max + shift, start + shift) - shift;

  	return found > max ? max : found;
  }
  /*
   * Search the bitmap at @addr for a set bit, starting at bit @start and
   * looking at no more than @max bits.
   *
   * The address is aligned first; the resulting bit shift is added to both
   * the start and the limit handed to ext4_find_next_bit() and then
   * subtracted from its answer. The result is clamped so that it never
   * exceeds @max.
   */
  static inline int mb_find_next_bit(void *addr, int max, int start)
  {
  	int shift = 0;
  	void *aligned = mb_correct_addr_and_bit(&shift, addr);
  	int found;

  	found = ext4_find_next_bit(aligned, max + shift, start + shift) - shift;

  	return found > max ? max : found;
  }
  507. static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
  508. {
  509. char *bb;
  510. BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
  511. BUG_ON(max == NULL);
  512. if (order > e4b->bd_blkbits + 1) {
  513. *max = 0;
  514. return NULL;
  515. }
  516. /* at order 0 we see each particular block */
  517. if (order == 0) {
  518. *max = 1 << (e4b->bd_blkbits + 3);
  519. return e4b->bd_bitmap;
  520. }
  521. bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
  522. *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
  523. return bb;
  524. }
  525. #ifdef DOUBLE_CHECK
  526. static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
  527. int first, int count)
  528. {
  529. int i;
  530. struct super_block *sb = e4b->bd_sb;
  531. if (unlikely(e4b->bd_info->bb_bitmap == NULL))
  532. return;
  533. assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
  534. for (i = 0; i < count; i++) {
  535. if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
  536. ext4_fsblk_t blocknr;
  537. blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
  538. blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
  539. ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
  540. EXT4_GROUP_INFO_BBITMAP_CORRUPT);
  541. ext4_grp_locked_error(sb, e4b->bd_group,
  542. inode ? inode->i_ino : 0,
  543. blocknr,
  544. "freeing block already freed "
  545. "(bit %u)",
  546. first + i);
  547. }
  548. mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
  549. }
  550. }
  551. static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
  552. {
  553. int i;
  554. if (unlikely(e4b->bd_info->bb_bitmap == NULL))
  555. return;
  556. assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
  557. for (i = 0; i < count; i++) {
  558. BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
  559. mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
  560. }
  561. }
  562. static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
  563. {
  564. if (unlikely(e4b->bd_info->bb_bitmap == NULL))
  565. return;
  566. if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
  567. unsigned char *b1, *b2;
  568. int i;
  569. b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
  570. b2 = (unsigned char *) bitmap;
  571. for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
  572. if (b1[i] != b2[i]) {
  573. ext4_msg(e4b->bd_sb, KERN_ERR,
  574. "corruption in group %u "
  575. "at byte %u(%u): %x in copy != %x "
  576. "on disk/prealloc",
  577. e4b->bd_group, i, i * 8, b1[i], b2[i]);
  578. BUG();
  579. }
  580. }
  581. }
  582. }
  583. static void mb_group_bb_bitmap_alloc(struct super_block *sb,
  584. struct ext4_group_info *grp, ext4_group_t group)
  585. {
  586. struct buffer_head *bh;
  587. grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
  588. if (!grp->bb_bitmap)
  589. return;
  590. bh = ext4_read_block_bitmap(sb, group);
  591. if (IS_ERR_OR_NULL(bh)) {
  592. kfree(grp->bb_bitmap);
  593. grp->bb_bitmap = NULL;
  594. return;
  595. }
  596. memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
  597. put_bh(bh);
  598. }
  599. static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
  600. {
  601. kfree(grp->bb_bitmap);
  602. }
  603. #else
  604. static inline void mb_free_blocks_double(struct inode *inode,
  605. struct ext4_buddy *e4b, int first, int count)
  606. {
  607. return;
  608. }
  609. static inline void mb_mark_used_double(struct ext4_buddy *e4b,
  610. int first, int count)
  611. {
  612. return;
  613. }
  614. static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
  615. {
  616. return;
  617. }
  618. static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
  619. struct ext4_group_info *grp, ext4_group_t group)
  620. {
  621. return;
  622. }
  623. static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
  624. {
  625. return;
  626. }
  627. #endif
  #ifdef AGGRESSIVE_CHECK

  /*
   * Assertion helper local to __mb_check_buddy(). On failure it prints the
   * location held in the enclosing function's 'function', 'file' and 'line'
   * arguments together with the text of the failed expression, then BUG()s.
   */
  #define MB_CHECK_ASSERT(assert)						\
  do {									\
  	if (!(assert)) {						\
  		printk(KERN_EMERG					\
  			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
  			function, file, line, # assert);		\
  		BUG();							\
  	}								\
  } while (0)

  /*
   * Buddy integrity check. Only one call in ten (per group, counted in
   * bb_check_counter) does the work; the others return immediately.
   *
   * In every buddy bitmap a clear bit (0) means "free chunk of this order
   * here" and a set bit (1) means "no free chunk of this order here".
   *
   * 1. Top-down pass, from the highest order down to order 2, comparing
   *    each order with the one below it:
   *    - the two bitmaps must be distinct and the lower one must hold
   *      exactly twice as many bits;
   *    - where the higher-order bit is set, at most one of the two
   *      corresponding lower-order bits may be clear;
   *    - the number of clear bits must equal bb_counters[order].
   *
   * 2. Order-0 (block bitmap) pass:
   *    - no free bit may lie below bb_first_free;
   *    - runs of free bits are counted and must match bb_fragments;
   *    - for every even-aligned pair of bits: if either bit is set, all
   *      corresponding higher-order bits must be set; if both are clear,
   *      exactly one corresponding higher-order bit must be clear.
   *
   * 3. The group must not be flagged as needing initialisation.
   *
   * 4. Preallocation list pass: for every PA of non-zero length on the
   *    group's bb_prealloc_list, pa_pstart must map to this group and all
   *    pa_len bits starting at its offset must be set in the block bitmap.
   */
  static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
  				const char *function, int line)
  {
  	struct super_block *sb = e4b->bd_sb;
  	struct ext4_prealloc_space *pa;
  	struct ext4_group_info *grp;
  	bool in_free_run = false;
  	int fragments = 0;
  	void *buddy2;
  	void *buddy;
  	int order;
  	int count;
  	int max2;
  	int max;
  	int i;
  	int j;
  	int k;

  	if (e4b->bd_info->bb_check_counter++ % 10)
  		return;

  	for (order = e4b->bd_blkbits + 1; order > 1; order--) {
  		buddy = mb_find_buddy(e4b, order, &max);
  		MB_CHECK_ASSERT(buddy);
  		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
  		MB_CHECK_ASSERT(buddy2);
  		MB_CHECK_ASSERT(buddy != buddy2);
  		MB_CHECK_ASSERT(max * 2 == max2);

  		count = 0;
  		for (i = 0; i < max; i++) {
  			if (!mb_test_bit(i, buddy)) {
  				/* free chunk at this order */
  				count++;
  				continue;
  			}

  			/* only single bit in buddy2 may be 0 */
  			if (!mb_test_bit(i << 1, buddy2)) {
  				MB_CHECK_ASSERT(mb_test_bit((i<<1)+1, buddy2));
  			}
  		}
  		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
  	}

  	buddy = mb_find_buddy(e4b, 0, &max);
  	for (i = 0; i < max; i++) {
  		int in_use, zero_bit_count = 0;

  		if (mb_test_bit(i, buddy)) {
  			in_free_run = false;
  		} else {
  			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
  			if (!in_free_run) {
  				/* a new fragment starts here */
  				fragments++;
  				in_free_run = true;
  			}
  		}

  		/* the pair check below runs once per even-aligned pair */
  		if (i & 1)
  			continue;

  		in_use = mb_test_bit(i, buddy) || mb_test_bit(i + 1, buddy);
  		for (j = 1; j < e4b->bd_blkbits + 2; j++) {
  			buddy2 = mb_find_buddy(e4b, j, &max2);
  			k = i >> j;
  			MB_CHECK_ASSERT(k < max2);
  			if (!mb_test_bit(k, buddy2))
  				zero_bit_count++;
  		}
  		MB_CHECK_ASSERT(zero_bit_count == !in_use);
  	}
  	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
  	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);

  	grp = ext4_get_group_info(sb, e4b->bd_group);
  	if (!grp)
  		return;

  	list_for_each_entry(pa, &grp->bb_prealloc_list, pa_group_list) {
  		ext4_group_t groupnr;

  		if (!pa->pa_len)
  			continue;

  		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
  		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
  		for (i = 0; i < pa->pa_len; i++)
  			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
  	}
  }
  #undef MB_CHECK_ASSERT

  /* Run the check, recording the call site for any assertion message. */
  #define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
  					__FILE__, __func__, __LINE__)
  #else
  /* AGGRESSIVE_CHECK is disabled: the check compiles away. */
  #define mb_check_buddy(e4b)
  #endif
  745. /*
  746. * Divide blocks started from @first with length @len into
  747. * smaller chunks with power of 2 blocks.
  748. * Clear the bits in bitmap which the blocks of the chunk(s) covered,
  749. * then increase bb_counters[] for corresponded chunk size.
  750. */
  751. static void ext4_mb_mark_free_simple(struct super_block *sb,
  752. void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
  753. struct ext4_group_info *grp)
  754. {
  755. struct ext4_sb_info *sbi = EXT4_SB(sb);
  756. ext4_grpblk_t min;
  757. ext4_grpblk_t max;
  758. ext4_grpblk_t chunk;
  759. unsigned int border;
  760. BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
  761. border = 2 << sb->s_blocksize_bits;
  762. while (len > 0) {
  763. /* find how many blocks can be covered since this position */
  764. max = ffs(first | border) - 1;
  765. /* find how many blocks of power 2 we need to mark */
  766. min = fls(len) - 1;
  767. if (max < min)
  768. min = max;
  769. chunk = 1 << min;
  770. /* mark multiblock chunks only */
  771. grp->bb_counters[min]++;
  772. if (min > 0)
  773. mb_clear_bit(first >> min,
  774. buddy + sbi->s_mb_offsets[min]);
  775. len -= chunk;
  776. first += chunk;
  777. }
  778. }
  779. static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
  780. {
  781. int order;
  782. /*
  783. * We don't bother with a special lists groups with only 1 block free
  784. * extents and for completely empty groups.
  785. */
  786. order = fls(len) - 2;
  787. if (order < 0)
  788. return 0;
  789. if (order == MB_NUM_ORDERS(sb))
  790. order--;
  791. if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
  792. order = MB_NUM_ORDERS(sb) - 1;
  793. return order;
  794. }
  795. /* Move group to appropriate avg_fragment_size list */
  796. static void
  797. mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
  798. {
  799. struct ext4_sb_info *sbi = EXT4_SB(sb);
  800. int new, old;
  801. if (!test_opt2(sb, MB_OPTIMIZE_SCAN))
  802. return;
  803. old = grp->bb_avg_fragment_size_order;
  804. new = grp->bb_fragments == 0 ? -1 :
  805. mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments);
  806. if (new == old)
  807. return;
  808. if (old >= 0)
  809. xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group);
  810. grp->bb_avg_fragment_size_order = new;
  811. if (new >= 0) {
  812. /*
  813. * Cannot use __GFP_NOFAIL because we hold the group lock.
  814. * Although allocation for insertion may fails, it's not fatal
  815. * as we have linear traversal to fall back on.
  816. */
  817. int err = xa_insert(&sbi->s_mb_avg_fragment_size[new],
  818. grp->bb_group, grp, GFP_ATOMIC);
  819. if (err)
  820. mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d",
  821. grp->bb_group, new, err);
  822. }
  823. }
  824. static ext4_group_t ext4_get_allocation_groups_count(
  825. struct ext4_allocation_context *ac)
  826. {
  827. ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
  828. /* non-extent files are limited to low blocks/groups */
  829. if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
  830. ngroups = EXT4_SB(ac->ac_sb)->s_blockfile_groups;
  831. /* Pairs with smp_wmb() in ext4_update_super() */
  832. smp_rmb();
  833. return ngroups;
  834. }
  835. static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
  836. struct xarray *xa,
  837. ext4_group_t start, ext4_group_t end)
  838. {
  839. struct super_block *sb = ac->ac_sb;
  840. struct ext4_sb_info *sbi = EXT4_SB(sb);
  841. enum criteria cr = ac->ac_criteria;
  842. ext4_group_t ngroups = ext4_get_allocation_groups_count(ac);
  843. unsigned long group = start;
  844. struct ext4_group_info *grp;
  845. if (WARN_ON_ONCE(end > ngroups || start >= end))
  846. return 0;
  847. xa_for_each_range(xa, group, grp, start, end - 1) {
  848. int err;
  849. if (sbi->s_mb_stats)
  850. atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
  851. err = ext4_mb_scan_group(ac, grp->bb_group);
  852. if (err || ac->ac_status != AC_STATUS_CONTINUE)
  853. return err;
  854. cond_resched();
  855. }
  856. return 0;
  857. }
  858. /*
  859. * Find a suitable group of given order from the largest free orders xarray.
  860. */
  861. static inline int
  862. ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac,
  863. int order, ext4_group_t start,
  864. ext4_group_t end)
  865. {
  866. struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order];
  867. if (xa_empty(xa))
  868. return 0;
  869. return ext4_mb_scan_groups_xa_range(ac, xa, start, end);
  870. }
  871. /*
  872. * Choose next group by traversing largest_free_order lists. Updates *new_cr if
  873. * cr level needs an update.
  874. */
  875. static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
  876. ext4_group_t group)
  877. {
  878. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  879. int i;
  880. int ret = 0;
  881. ext4_group_t start, end;
  882. start = group;
  883. end = ext4_get_allocation_groups_count(ac);
  884. wrap_around:
  885. for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
  886. ret = ext4_mb_scan_groups_largest_free_order_range(ac, i,
  887. start, end);
  888. if (ret || ac->ac_status != AC_STATUS_CONTINUE)
  889. return ret;
  890. }
  891. if (start) {
  892. end = start;
  893. start = 0;
  894. goto wrap_around;
  895. }
  896. if (sbi->s_mb_stats)
  897. atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
  898. /* Increment cr and search again if no group is found */
  899. ac->ac_criteria = CR_GOAL_LEN_FAST;
  900. return ret;
  901. }
  902. /*
  903. * Find a suitable group of given order from the average fragments xarray.
  904. */
  905. static int
  906. ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac,
  907. int order, ext4_group_t start,
  908. ext4_group_t end)
  909. {
  910. struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order];
  911. if (xa_empty(xa))
  912. return 0;
  913. return ext4_mb_scan_groups_xa_range(ac, xa, start, end);
  914. }
  915. /*
  916. * Choose next group by traversing average fragment size list of suitable
  917. * order. Updates *new_cr if cr level needs an update.
  918. */
  919. static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
  920. ext4_group_t group)
  921. {
  922. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  923. int i, ret = 0;
  924. ext4_group_t start, end;
  925. start = group;
  926. end = ext4_get_allocation_groups_count(ac);
  927. wrap_around:
  928. i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
  929. for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
  930. ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i,
  931. start, end);
  932. if (ret || ac->ac_status != AC_STATUS_CONTINUE)
  933. return ret;
  934. }
  935. if (start) {
  936. end = start;
  937. start = 0;
  938. goto wrap_around;
  939. }
  940. if (sbi->s_mb_stats)
  941. atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
  942. /*
  943. * CR_BEST_AVAIL_LEN works based on the concept that we have
  944. * a larger normalized goal len request which can be trimmed to
  945. * a smaller goal len such that it can still satisfy original
  946. * request len. However, allocation request for non-regular
  947. * files never gets normalized.
  948. * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA).
  949. */
  950. if (ac->ac_flags & EXT4_MB_HINT_DATA)
  951. ac->ac_criteria = CR_BEST_AVAIL_LEN;
  952. else
  953. ac->ac_criteria = CR_GOAL_LEN_SLOW;
  954. return ret;
  955. }
  956. /*
  957. * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment
  958. * order we have and proactively trim the goal request length to that order to
  959. * find a suitable group faster.
  960. *
  961. * This optimizes allocation speed at the cost of slightly reduced
  962. * preallocations. However, we make sure that we don't trim the request too
  963. * much and fall to CR_GOAL_LEN_SLOW in that case.
  964. */
  965. static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
  966. ext4_group_t group)
  967. {
  968. int ret = 0;
  969. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  970. int i, order, min_order;
  971. unsigned long num_stripe_clusters = 0;
  972. ext4_group_t start, end;
  973. /*
  974. * mb_avg_fragment_size_order() returns order in a way that makes
  975. * retrieving back the length using (1 << order) inaccurate. Hence, use
  976. * fls() instead since we need to know the actual length while modifying
  977. * goal length.
  978. */
  979. order = fls(ac->ac_g_ex.fe_len) - 1;
  980. if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb)))
  981. order = MB_NUM_ORDERS(ac->ac_sb);
  982. min_order = order - sbi->s_mb_best_avail_max_trim_order;
  983. if (min_order < 0)
  984. min_order = 0;
  985. if (sbi->s_stripe > 0) {
  986. /*
  987. * We are assuming that stripe size is always a multiple of
  988. * cluster ratio otherwise __ext4_fill_super exists early.
  989. */
  990. num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
  991. if (1 << min_order < num_stripe_clusters)
  992. /*
  993. * We consider 1 order less because later we round
  994. * up the goal len to num_stripe_clusters
  995. */
  996. min_order = fls(num_stripe_clusters) - 1;
  997. }
  998. if (1 << min_order < ac->ac_o_ex.fe_len)
  999. min_order = fls(ac->ac_o_ex.fe_len);
  1000. start = group;
  1001. end = ext4_get_allocation_groups_count(ac);
  1002. wrap_around:
  1003. for (i = order; i >= min_order; i--) {
  1004. int frag_order;
  1005. /*
  1006. * Scale down goal len to make sure we find something
  1007. * in the free fragments list. Basically, reduce
  1008. * preallocations.
  1009. */
  1010. ac->ac_g_ex.fe_len = 1 << i;
  1011. if (num_stripe_clusters > 0) {
  1012. /*
  1013. * Try to round up the adjusted goal length to
  1014. * stripe size (in cluster units) multiple for
  1015. * efficiency.
  1016. */
  1017. ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
  1018. num_stripe_clusters);
  1019. }
  1020. frag_order = mb_avg_fragment_size_order(ac->ac_sb,
  1021. ac->ac_g_ex.fe_len);
  1022. ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order,
  1023. start, end);
  1024. if (ret || ac->ac_status != AC_STATUS_CONTINUE)
  1025. return ret;
  1026. }
  1027. if (start) {
  1028. end = start;
  1029. start = 0;
  1030. goto wrap_around;
  1031. }
  1032. /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
  1033. ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
  1034. if (sbi->s_mb_stats)
  1035. atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
  1036. ac->ac_criteria = CR_GOAL_LEN_SLOW;
  1037. return ret;
  1038. }
  1039. static inline int should_optimize_scan(struct ext4_allocation_context *ac)
  1040. {
  1041. if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
  1042. return 0;
  1043. if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
  1044. return 0;
  1045. return 1;
  1046. }
  1047. /*
  1048. * next linear group for allocation.
  1049. */
  1050. static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups)
  1051. {
  1052. /*
  1053. * Artificially restricted ngroups for non-extent
  1054. * files makes group > ngroups possible on first loop.
  1055. */
  1056. *group = *group + 1 >= ngroups ? 0 : *group + 1;
  1057. }
  1058. static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac,
  1059. ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count)
  1060. {
  1061. int ret, i;
  1062. enum criteria cr = ac->ac_criteria;
  1063. struct super_block *sb = ac->ac_sb;
  1064. struct ext4_sb_info *sbi = EXT4_SB(sb);
  1065. ext4_group_t group = *start;
  1066. for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) {
  1067. ret = ext4_mb_scan_group(ac, group);
  1068. if (ret || ac->ac_status != AC_STATUS_CONTINUE)
  1069. return ret;
  1070. cond_resched();
  1071. }
  1072. *start = group;
  1073. if (count == ngroups)
  1074. ac->ac_criteria++;
  1075. /* Processed all groups and haven't found blocks */
  1076. if (sbi->s_mb_stats && i == ngroups)
  1077. atomic64_inc(&sbi->s_bal_cX_failed[cr]);
  1078. return 0;
  1079. }
  1080. static int ext4_mb_scan_groups(struct ext4_allocation_context *ac)
  1081. {
  1082. int ret = 0;
  1083. ext4_group_t start;
  1084. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  1085. ext4_group_t ngroups = ext4_get_allocation_groups_count(ac);
  1086. /* searching for the right group start from the goal value specified */
  1087. start = ac->ac_g_ex.fe_group;
  1088. if (start >= ngroups)
  1089. start = 0;
  1090. ac->ac_prefetch_grp = start;
  1091. ac->ac_prefetch_nr = 0;
  1092. if (!should_optimize_scan(ac))
  1093. return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups);
  1094. /*
  1095. * Optimized scanning can return non adjacent groups which can cause
  1096. * seek overhead for rotational disks. So try few linear groups before
  1097. * trying optimized scan.
  1098. */
  1099. if (sbi->s_mb_max_linear_groups)
  1100. ret = ext4_mb_scan_groups_linear(ac, ngroups, &start,
  1101. sbi->s_mb_max_linear_groups);
  1102. if (ret || ac->ac_status != AC_STATUS_CONTINUE)
  1103. return ret;
  1104. switch (ac->ac_criteria) {
  1105. case CR_POWER2_ALIGNED:
  1106. return ext4_mb_scan_groups_p2_aligned(ac, start);
  1107. case CR_GOAL_LEN_FAST:
  1108. return ext4_mb_scan_groups_goal_fast(ac, start);
  1109. case CR_BEST_AVAIL_LEN:
  1110. return ext4_mb_scan_groups_best_avail(ac, start);
  1111. default:
  1112. /*
  1113. * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an
  1114. * rb tree sorted by bb_free. But until that happens, we should
  1115. * never come here.
  1116. */
  1117. WARN_ON(1);
  1118. }
  1119. return 0;
  1120. }
  1121. /*
  1122. * Cache the order of the largest free extent we have available in this block
  1123. * group.
  1124. */
  1125. static void
  1126. mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
  1127. {
  1128. struct ext4_sb_info *sbi = EXT4_SB(sb);
  1129. int new, old = grp->bb_largest_free_order;
  1130. for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--)
  1131. if (grp->bb_counters[new] > 0)
  1132. break;
  1133. /* No need to move between order lists? */
  1134. if (new == old)
  1135. return;
  1136. if (old >= 0) {
  1137. struct xarray *xa = &sbi->s_mb_largest_free_orders[old];
  1138. if (!xa_empty(xa) && xa_load(xa, grp->bb_group))
  1139. xa_erase(xa, grp->bb_group);
  1140. }
  1141. grp->bb_largest_free_order = new;
  1142. if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) {
  1143. /*
  1144. * Cannot use __GFP_NOFAIL because we hold the group lock.
  1145. * Although allocation for insertion may fails, it's not fatal
  1146. * as we have linear traversal to fall back on.
  1147. */
  1148. int err = xa_insert(&sbi->s_mb_largest_free_orders[new],
  1149. grp->bb_group, grp, GFP_ATOMIC);
  1150. if (err)
  1151. mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d",
  1152. grp->bb_group, new, err);
  1153. }
  1154. }
  1155. static noinline_for_stack
  1156. void ext4_mb_generate_buddy(struct super_block *sb,
  1157. void *buddy, void *bitmap, ext4_group_t group,
  1158. struct ext4_group_info *grp)
  1159. {
  1160. struct ext4_sb_info *sbi = EXT4_SB(sb);
  1161. ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
  1162. ext4_grpblk_t i = 0;
  1163. ext4_grpblk_t first;
  1164. ext4_grpblk_t len;
  1165. unsigned free = 0;
  1166. unsigned fragments = 0;
  1167. unsigned long long period = get_cycles();
  1168. /* initialize buddy from bitmap which is aggregation
  1169. * of on-disk bitmap and preallocations */
  1170. i = mb_find_next_zero_bit(bitmap, max, 0);
  1171. grp->bb_first_free = i;
  1172. while (i < max) {
  1173. fragments++;
  1174. first = i;
  1175. i = mb_find_next_bit(bitmap, max, i);
  1176. len = i - first;
  1177. free += len;
  1178. if (len > 1)
  1179. ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
  1180. else
  1181. grp->bb_counters[0]++;
  1182. if (i < max)
  1183. i = mb_find_next_zero_bit(bitmap, max, i);
  1184. }
  1185. grp->bb_fragments = fragments;
  1186. if (free != grp->bb_free) {
  1187. ext4_grp_locked_error(sb, group, 0, 0,
  1188. "block bitmap and bg descriptor "
  1189. "inconsistent: %u vs %u free clusters",
  1190. free, grp->bb_free);
  1191. /*
  1192. * If we intend to continue, we consider group descriptor
  1193. * corrupt and update bb_free using bitmap value
  1194. */
  1195. grp->bb_free = free;
  1196. ext4_mark_group_bitmap_corrupted(sb, group,
  1197. EXT4_GROUP_INFO_BBITMAP_CORRUPT);
  1198. }
  1199. mb_set_largest_free_order(sb, grp);
  1200. mb_update_avg_fragment_size(sb, grp);
  1201. clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
  1202. period = get_cycles() - period;
  1203. atomic_inc(&sbi->s_mb_buddies_generated);
  1204. atomic64_add(period, &sbi->s_mb_generation_time);
  1205. }
  1206. static void mb_regenerate_buddy(struct ext4_buddy *e4b)
  1207. {
  1208. int count;
  1209. int order = 1;
  1210. void *buddy;
  1211. while ((buddy = mb_find_buddy(e4b, order++, &count)))
  1212. mb_set_bits(buddy, 0, count);
  1213. e4b->bd_info->bb_fragments = 0;
  1214. memset(e4b->bd_info->bb_counters, 0,
  1215. sizeof(*e4b->bd_info->bb_counters) *
  1216. (e4b->bd_sb->s_blocksize_bits + 2));
  1217. ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
  1218. e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
  1219. }
  1220. /* The buddy information is attached the buddy cache inode
  1221. * for convenience. The information regarding each group
  1222. * is loaded via ext4_mb_load_buddy. The information involve
  1223. * block bitmap and buddy information. The information are
  1224. * stored in the inode as
  1225. *
  1226. * { folio }
  1227. * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
  1228. *
  1229. *
  1230. * one block each for bitmap and buddy information.
  1231. * So for each group we take up 2 blocks. A folio can
  1232. * contain blocks_per_folio (folio_size / blocksize) blocks.
  1233. * So it can have information regarding groups_per_folio which
  1234. * is blocks_per_folio/2
  1235. *
  1236. * Locking note: This routine takes the block group lock of all groups
  1237. * for this folio; do not hold this lock when calling this routine!
  1238. */
  1239. static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
  1240. {
  1241. ext4_group_t ngroups;
  1242. unsigned int blocksize;
  1243. int blocks_per_folio;
  1244. int groups_per_folio;
  1245. int err = 0;
  1246. int i;
  1247. ext4_group_t first_group, group;
  1248. int first_block;
  1249. struct super_block *sb;
  1250. struct buffer_head *bhs;
  1251. struct buffer_head **bh = NULL;
  1252. struct inode *inode;
  1253. char *data;
  1254. char *bitmap;
  1255. struct ext4_group_info *grinfo;
  1256. inode = folio->mapping->host;
  1257. sb = inode->i_sb;
  1258. ngroups = ext4_get_groups_count(sb);
  1259. blocksize = i_blocksize(inode);
  1260. blocks_per_folio = folio_size(folio) / blocksize;
  1261. WARN_ON_ONCE(!blocks_per_folio);
  1262. groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2);
  1263. mb_debug(sb, "init folio %lu\n", folio->index);
  1264. /* allocate buffer_heads to read bitmaps */
  1265. if (groups_per_folio > 1) {
  1266. i = sizeof(struct buffer_head *) * groups_per_folio;
  1267. bh = kzalloc(i, gfp);
  1268. if (bh == NULL)
  1269. return -ENOMEM;
  1270. } else
  1271. bh = &bhs;
  1272. /* read all groups the folio covers into the cache */
  1273. first_group = EXT4_PG_TO_LBLK(inode, folio->index) / 2;
  1274. for (i = 0, group = first_group; i < groups_per_folio; i++, group++) {
  1275. if (group >= ngroups)
  1276. break;
  1277. grinfo = ext4_get_group_info(sb, group);
  1278. if (!grinfo)
  1279. continue;
  1280. /*
  1281. * If folio is uptodate then we came here after online resize
  1282. * which added some new uninitialized group info structs, so
  1283. * we must skip all initialized uptodate buddies on the folio,
  1284. * which may be currently in use by an allocating task.
  1285. */
  1286. if (folio_test_uptodate(folio) &&
  1287. !EXT4_MB_GRP_NEED_INIT(grinfo)) {
  1288. bh[i] = NULL;
  1289. continue;
  1290. }
  1291. bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
  1292. if (IS_ERR(bh[i])) {
  1293. err = PTR_ERR(bh[i]);
  1294. bh[i] = NULL;
  1295. goto out;
  1296. }
  1297. mb_debug(sb, "read bitmap for group %u\n", group);
  1298. }
  1299. /* wait for I/O completion */
  1300. for (i = 0, group = first_group; i < groups_per_folio; i++, group++) {
  1301. int err2;
  1302. if (!bh[i])
  1303. continue;
  1304. err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
  1305. if (!err)
  1306. err = err2;
  1307. }
  1308. first_block = EXT4_PG_TO_LBLK(inode, folio->index);
  1309. for (i = 0; i < blocks_per_folio; i++) {
  1310. group = (first_block + i) >> 1;
  1311. if (group >= ngroups)
  1312. break;
  1313. if (!bh[group - first_group])
  1314. /* skip initialized uptodate buddy */
  1315. continue;
  1316. if (!buffer_verified(bh[group - first_group]))
  1317. /* Skip faulty bitmaps */
  1318. continue;
  1319. err = 0;
  1320. /*
  1321. * data carry information regarding this
  1322. * particular group in the format specified
  1323. * above
  1324. *
  1325. */
  1326. data = folio_address(folio) + (i * blocksize);
  1327. bitmap = bh[group - first_group]->b_data;
  1328. /*
  1329. * We place the buddy block and bitmap block
  1330. * close together
  1331. */
  1332. grinfo = ext4_get_group_info(sb, group);
  1333. if (!grinfo) {
  1334. err = -EFSCORRUPTED;
  1335. goto out;
  1336. }
  1337. if ((first_block + i) & 1) {
  1338. /* this is block of buddy */
  1339. BUG_ON(incore == NULL);
  1340. mb_debug(sb, "put buddy for group %u in folio %lu/%x\n",
  1341. group, folio->index, i * blocksize);
  1342. trace_ext4_mb_buddy_bitmap_load(sb, group);
  1343. grinfo->bb_fragments = 0;
  1344. memset(grinfo->bb_counters, 0,
  1345. sizeof(*grinfo->bb_counters) *
  1346. (MB_NUM_ORDERS(sb)));
  1347. /*
  1348. * incore got set to the group block bitmap below
  1349. */
  1350. ext4_lock_group(sb, group);
  1351. /* init the buddy */
  1352. memset(data, 0xff, blocksize);
  1353. ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
  1354. ext4_unlock_group(sb, group);
  1355. incore = NULL;
  1356. } else {
  1357. /* this is block of bitmap */
  1358. BUG_ON(incore != NULL);
  1359. mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n",
  1360. group, folio->index, i * blocksize);
  1361. trace_ext4_mb_bitmap_load(sb, group);
  1362. /* see comments in ext4_mb_put_pa() */
  1363. ext4_lock_group(sb, group);
  1364. memcpy(data, bitmap, blocksize);
  1365. /* mark all preallocated blks used in in-core bitmap */
  1366. ext4_mb_generate_from_pa(sb, data, group);
  1367. WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root));
  1368. ext4_unlock_group(sb, group);
  1369. /* set incore so that the buddy information can be
  1370. * generated using this
  1371. */
  1372. incore = data;
  1373. }
  1374. }
  1375. folio_mark_uptodate(folio);
  1376. out:
  1377. if (bh) {
  1378. for (i = 0; i < groups_per_folio; i++)
  1379. brelse(bh[i]);
  1380. if (bh != &bhs)
  1381. kfree(bh);
  1382. }
  1383. return err;
  1384. }
  1385. /*
  1386. * Lock the buddy and bitmap folios. This makes sure other parallel init_group
  1387. * on the same buddy folio doesn't happen while holding the buddy folio lock.
  1388. * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap
  1389. * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0.
  1390. */
  1391. static int ext4_mb_get_buddy_folio_lock(struct super_block *sb,
  1392. ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
  1393. {
  1394. struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
  1395. int block, pnum;
  1396. struct folio *folio;
  1397. e4b->bd_buddy_folio = NULL;
  1398. e4b->bd_bitmap_folio = NULL;
  1399. /*
  1400. * the buddy cache inode stores the block bitmap
  1401. * and buddy information in consecutive blocks.
  1402. * So for each group we need two blocks.
  1403. */
  1404. block = group * 2;
  1405. pnum = EXT4_LBLK_TO_PG(inode, block);
  1406. folio = __filemap_get_folio(inode->i_mapping, pnum,
  1407. FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
  1408. if (IS_ERR(folio))
  1409. return PTR_ERR(folio);
  1410. BUG_ON(folio->mapping != inode->i_mapping);
  1411. WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize);
  1412. e4b->bd_bitmap_folio = folio;
  1413. e4b->bd_bitmap = folio_address(folio) +
  1414. offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
  1415. block++;
  1416. pnum = EXT4_LBLK_TO_PG(inode, block);
  1417. if (folio_contains(folio, pnum)) {
  1418. /* buddy and bitmap are on the same folio */
  1419. return 0;
  1420. }
  1421. /* we need another folio for the buddy */
  1422. folio = __filemap_get_folio(inode->i_mapping, pnum,
  1423. FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
  1424. if (IS_ERR(folio))
  1425. return PTR_ERR(folio);
  1426. BUG_ON(folio->mapping != inode->i_mapping);
  1427. WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize);
  1428. e4b->bd_buddy_folio = folio;
  1429. return 0;
  1430. }
  1431. static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b)
  1432. {
  1433. if (e4b->bd_bitmap_folio) {
  1434. folio_unlock(e4b->bd_bitmap_folio);
  1435. folio_put(e4b->bd_bitmap_folio);
  1436. }
  1437. if (e4b->bd_buddy_folio) {
  1438. folio_unlock(e4b->bd_buddy_folio);
  1439. folio_put(e4b->bd_buddy_folio);
  1440. }
  1441. }
  1442. /*
  1443. * Locking note: This routine calls ext4_mb_init_cache(), which takes the
  1444. * block group lock of all groups for this folio; do not hold the BG lock when
  1445. * calling this routine!
  1446. */
  1447. static noinline_for_stack
  1448. int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
  1449. {
  1450. struct ext4_group_info *this_grp;
  1451. struct ext4_buddy e4b;
  1452. struct folio *folio;
  1453. int ret = 0;
  1454. might_sleep();
  1455. mb_debug(sb, "init group %u\n", group);
  1456. this_grp = ext4_get_group_info(sb, group);
  1457. if (!this_grp)
  1458. return -EFSCORRUPTED;
  1459. /*
  1460. * This ensures that we don't reinit the buddy cache
  1461. * folio which map to the group from which we are already
  1462. * allocating. If we are looking at the buddy cache we would
  1463. * have taken a reference using ext4_mb_load_buddy and that
  1464. * would have pinned buddy folio to page cache.
  1465. * The call to ext4_mb_get_buddy_folio_lock will mark the
  1466. * folio accessed.
  1467. */
  1468. ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp);
  1469. if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
  1470. /*
  1471. * somebody initialized the group
  1472. * return without doing anything
  1473. */
  1474. goto err;
  1475. }
  1476. folio = e4b.bd_bitmap_folio;
  1477. ret = ext4_mb_init_cache(folio, NULL, gfp);
  1478. if (ret)
  1479. goto err;
  1480. if (!folio_test_uptodate(folio)) {
  1481. ret = -EIO;
  1482. goto err;
  1483. }
  1484. if (e4b.bd_buddy_folio == NULL) {
  1485. /*
  1486. * If both the bitmap and buddy are in
  1487. * the same folio we don't need to force
  1488. * init the buddy
  1489. */
  1490. ret = 0;
  1491. goto err;
  1492. }
  1493. /* init buddy cache */
  1494. folio = e4b.bd_buddy_folio;
  1495. ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp);
  1496. if (ret)
  1497. goto err;
  1498. if (!folio_test_uptodate(folio)) {
  1499. ret = -EIO;
  1500. goto err;
  1501. }
  1502. err:
  1503. ext4_mb_put_buddy_folio_lock(&e4b);
  1504. return ret;
  1505. }
  1506. /*
  1507. * Locking note: This routine calls ext4_mb_init_cache(), which takes the
  1508. * block group lock of all groups for this folio; do not hold the BG lock when
  1509. * calling this routine!
  1510. */
  1511. static noinline_for_stack int
  1512. ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
  1513. struct ext4_buddy *e4b, gfp_t gfp)
  1514. {
  1515. int block;
  1516. int pnum;
  1517. struct folio *folio;
  1518. int ret;
  1519. struct ext4_group_info *grp;
  1520. struct ext4_sb_info *sbi = EXT4_SB(sb);
  1521. struct inode *inode = sbi->s_buddy_cache;
  1522. might_sleep();
  1523. mb_debug(sb, "load group %u\n", group);
  1524. grp = ext4_get_group_info(sb, group);
  1525. if (!grp)
  1526. return -EFSCORRUPTED;
  1527. e4b->bd_blkbits = sb->s_blocksize_bits;
  1528. e4b->bd_info = grp;
  1529. e4b->bd_sb = sb;
  1530. e4b->bd_group = group;
  1531. e4b->bd_buddy_folio = NULL;
  1532. e4b->bd_bitmap_folio = NULL;
  1533. if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
  1534. /*
  1535. * we need full data about the group
  1536. * to make a good selection
  1537. */
  1538. ret = ext4_mb_init_group(sb, group, gfp);
  1539. if (ret)
  1540. return ret;
  1541. }
  1542. /*
  1543. * the buddy cache inode stores the block bitmap
  1544. * and buddy information in consecutive blocks.
  1545. * So for each group we need two blocks.
  1546. */
  1547. block = group * 2;
  1548. pnum = EXT4_LBLK_TO_PG(inode, block);
  1549. /* Avoid locking the folio in the fast path ... */
  1550. folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
  1551. if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) {
  1552. /*
  1553. * folio_test_locked is employed to detect ongoing folio
  1554. * migrations, since concurrent migrations can lead to
  1555. * bitmap inconsistency. And if we are not uptodate that
  1556. * implies somebody just created the folio but is yet to
  1557. * initialize it. We can drop the folio reference and
  1558. * try to get the folio with lock in both cases to avoid
  1559. * concurrency.
  1560. */
  1561. if (!IS_ERR(folio))
  1562. folio_put(folio);
  1563. folio = __filemap_get_folio(inode->i_mapping, pnum,
  1564. FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
  1565. if (!IS_ERR(folio)) {
  1566. if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
  1567. "ext4: bitmap's mapping != inode->i_mapping\n")) {
  1568. /* should never happen */
  1569. folio_unlock(folio);
  1570. ret = -EINVAL;
  1571. goto err;
  1572. }
  1573. if (!folio_test_uptodate(folio)) {
  1574. ret = ext4_mb_init_cache(folio, NULL, gfp);
  1575. if (ret) {
  1576. folio_unlock(folio);
  1577. goto err;
  1578. }
  1579. mb_cmp_bitmaps(e4b, folio_address(folio) +
  1580. offset_in_folio(folio,
  1581. EXT4_LBLK_TO_B(inode, block)));
  1582. }
  1583. folio_unlock(folio);
  1584. }
  1585. }
  1586. if (IS_ERR(folio)) {
  1587. ret = PTR_ERR(folio);
  1588. goto err;
  1589. }
  1590. if (!folio_test_uptodate(folio)) {
  1591. ret = -EIO;
  1592. goto err;
  1593. }
  1594. /* Folios marked accessed already */
  1595. e4b->bd_bitmap_folio = folio;
  1596. e4b->bd_bitmap = folio_address(folio) +
  1597. offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
  1598. block++;
  1599. pnum = EXT4_LBLK_TO_PG(inode, block);
  1600. /* buddy and bitmap are on the same folio? */
  1601. if (folio_contains(folio, pnum)) {
  1602. folio_get(folio);
  1603. goto update_buddy;
  1604. }
  1605. /* we need another folio for the buddy */
  1606. folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
  1607. if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) {
  1608. if (!IS_ERR(folio))
  1609. folio_put(folio);
  1610. folio = __filemap_get_folio(inode->i_mapping, pnum,
  1611. FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
  1612. if (!IS_ERR(folio)) {
  1613. if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
  1614. "ext4: buddy bitmap's mapping != inode->i_mapping\n")) {
  1615. /* should never happen */
  1616. folio_unlock(folio);
  1617. ret = -EINVAL;
  1618. goto err;
  1619. }
  1620. if (!folio_test_uptodate(folio)) {
  1621. ret = ext4_mb_init_cache(folio, e4b->bd_bitmap,
  1622. gfp);
  1623. if (ret) {
  1624. folio_unlock(folio);
  1625. goto err;
  1626. }
  1627. }
  1628. folio_unlock(folio);
  1629. }
  1630. }
  1631. if (IS_ERR(folio)) {
  1632. ret = PTR_ERR(folio);
  1633. goto err;
  1634. }
  1635. if (!folio_test_uptodate(folio)) {
  1636. ret = -EIO;
  1637. goto err;
  1638. }
  1639. update_buddy:
  1640. /* Folios marked accessed already */
  1641. e4b->bd_buddy_folio = folio;
  1642. e4b->bd_buddy = folio_address(folio) +
  1643. offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block));
  1644. return 0;
  1645. err:
  1646. if (!IS_ERR_OR_NULL(folio))
  1647. folio_put(folio);
  1648. if (e4b->bd_bitmap_folio)
  1649. folio_put(e4b->bd_bitmap_folio);
  1650. e4b->bd_buddy = NULL;
  1651. e4b->bd_bitmap = NULL;
  1652. return ret;
  1653. }
  1654. static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
  1655. struct ext4_buddy *e4b)
  1656. {
  1657. return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
  1658. }
  1659. static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
  1660. {
  1661. if (e4b->bd_bitmap_folio)
  1662. folio_put(e4b->bd_bitmap_folio);
  1663. if (e4b->bd_buddy_folio)
  1664. folio_put(e4b->bd_buddy_folio);
  1665. }
  1666. static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
  1667. {
  1668. int order = 1, max;
  1669. void *bb;
  1670. BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
  1671. BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
  1672. while (order <= e4b->bd_blkbits + 1) {
  1673. bb = mb_find_buddy(e4b, order, &max);
  1674. if (!mb_test_bit(block >> order, bb)) {
  1675. /* this block is part of buddy of order 'order' */
  1676. return order;
  1677. }
  1678. order++;
  1679. }
  1680. return 0;
  1681. }
  1682. static void mb_clear_bits(void *bm, int cur, int len)
  1683. {
  1684. __u32 *addr;
  1685. len = cur + len;
  1686. while (cur < len) {
  1687. if ((cur & 31) == 0 && (len - cur) >= 32) {
  1688. /* fast path: clear whole word at once */
  1689. addr = bm + (cur >> 3);
  1690. *addr = 0;
  1691. cur += 32;
  1692. continue;
  1693. }
  1694. mb_clear_bit(cur, bm);
  1695. cur++;
  1696. }
  1697. }
  1698. /* clear bits in given range
  1699. * will return first found zero bit if any, -1 otherwise
  1700. */
  1701. static int mb_test_and_clear_bits(void *bm, int cur, int len)
  1702. {
  1703. __u32 *addr;
  1704. int zero_bit = -1;
  1705. len = cur + len;
  1706. while (cur < len) {
  1707. if ((cur & 31) == 0 && (len - cur) >= 32) {
  1708. /* fast path: clear whole word at once */
  1709. addr = bm + (cur >> 3);
  1710. if (*addr != (__u32)(-1) && zero_bit == -1)
  1711. zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
  1712. *addr = 0;
  1713. cur += 32;
  1714. continue;
  1715. }
  1716. if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
  1717. zero_bit = cur;
  1718. cur++;
  1719. }
  1720. return zero_bit;
  1721. }
  1722. void mb_set_bits(void *bm, int cur, int len)
  1723. {
  1724. __u32 *addr;
  1725. len = cur + len;
  1726. while (cur < len) {
  1727. if ((cur & 31) == 0 && (len - cur) >= 32) {
  1728. /* fast path: set whole word at once */
  1729. addr = bm + (cur >> 3);
  1730. *addr = 0xffffffff;
  1731. cur += 32;
  1732. continue;
  1733. }
  1734. mb_set_bit(cur, bm);
  1735. cur++;
  1736. }
  1737. }
  1738. static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
  1739. {
  1740. if (mb_test_bit(*bit + side, bitmap)) {
  1741. mb_clear_bit(*bit, bitmap);
  1742. (*bit) -= side;
  1743. return 1;
  1744. }
  1745. else {
  1746. (*bit) += side;
  1747. mb_set_bit(*bit, bitmap);
  1748. return -1;
  1749. }
  1750. }
  1751. static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
  1752. {
  1753. int max;
  1754. int order = 1;
  1755. void *buddy = mb_find_buddy(e4b, order, &max);
  1756. while (buddy) {
  1757. void *buddy2;
  1758. /* Bits in range [first; last] are known to be set since
  1759. * corresponding blocks were allocated. Bits in range
  1760. * (first; last) will stay set because they form buddies on
  1761. * upper layer. We just deal with borders if they don't
  1762. * align with upper layer and then go up.
  1763. * Releasing entire group is all about clearing
  1764. * single bit of highest order buddy.
  1765. */
  1766. /* Example:
  1767. * ---------------------------------
  1768. * | 1 | 1 | 1 | 1 |
  1769. * ---------------------------------
  1770. * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
  1771. * ---------------------------------
  1772. * 0 1 2 3 4 5 6 7
  1773. * \_____________________/
  1774. *
  1775. * Neither [1] nor [6] is aligned to above layer.
  1776. * Left neighbour [0] is free, so mark it busy,
  1777. * decrease bb_counters and extend range to
  1778. * [0; 6]
  1779. * Right neighbour [7] is busy. It can't be coaleasced with [6], so
  1780. * mark [6] free, increase bb_counters and shrink range to
  1781. * [0; 5].
  1782. * Then shift range to [0; 2], go up and do the same.
  1783. */
  1784. if (first & 1)
  1785. e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
  1786. if (!(last & 1))
  1787. e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
  1788. if (first > last)
  1789. break;
  1790. order++;
  1791. buddy2 = mb_find_buddy(e4b, order, &max);
  1792. if (!buddy2) {
  1793. mb_clear_bits(buddy, first, last - first + 1);
  1794. e4b->bd_info->bb_counters[order - 1] += last - first + 1;
  1795. break;
  1796. }
  1797. first >>= 1;
  1798. last >>= 1;
  1799. buddy = buddy2;
  1800. }
  1801. }
  1802. static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
  1803. int first, int count)
  1804. {
  1805. int left_is_free = 0;
  1806. int right_is_free = 0;
  1807. int block;
  1808. int last = first + count - 1;
  1809. struct super_block *sb = e4b->bd_sb;
  1810. if (WARN_ON(count == 0))
  1811. return;
  1812. BUG_ON(last >= (sb->s_blocksize << 3));
  1813. assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
  1814. /* Don't bother if the block group is corrupt. */
  1815. if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
  1816. return;
  1817. mb_check_buddy(e4b);
  1818. mb_free_blocks_double(inode, e4b, first, count);
  1819. /* access memory sequentially: check left neighbour,
  1820. * clear range and then check right neighbour
  1821. */
  1822. if (first != 0)
  1823. left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
  1824. block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
  1825. if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
  1826. right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
  1827. if (unlikely(block != -1)) {
  1828. struct ext4_sb_info *sbi = EXT4_SB(sb);
  1829. ext4_fsblk_t blocknr;
  1830. /*
  1831. * Fastcommit replay can free already freed blocks which
  1832. * corrupts allocation info. Regenerate it.
  1833. */
  1834. if (sbi->s_mount_state & EXT4_FC_REPLAY) {
  1835. mb_regenerate_buddy(e4b);
  1836. goto check;
  1837. }
  1838. blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
  1839. blocknr += EXT4_C2B(sbi, block);
  1840. ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
  1841. EXT4_GROUP_INFO_BBITMAP_CORRUPT);
  1842. ext4_grp_locked_error(sb, e4b->bd_group,
  1843. inode ? inode->i_ino : 0, blocknr,
  1844. "freeing already freed block (bit %u); block bitmap corrupt.",
  1845. block);
  1846. return;
  1847. }
  1848. this_cpu_inc(discard_pa_seq);
  1849. e4b->bd_info->bb_free += count;
  1850. if (first < e4b->bd_info->bb_first_free)
  1851. e4b->bd_info->bb_first_free = first;
  1852. /* let's maintain fragments counter */
  1853. if (left_is_free && right_is_free)
  1854. e4b->bd_info->bb_fragments--;
  1855. else if (!left_is_free && !right_is_free)
  1856. e4b->bd_info->bb_fragments++;
  1857. /* buddy[0] == bd_bitmap is a special case, so handle
  1858. * it right away and let mb_buddy_mark_free stay free of
  1859. * zero order checks.
  1860. * Check if neighbours are to be coaleasced,
  1861. * adjust bitmap bb_counters and borders appropriately.
  1862. */
  1863. if (first & 1) {
  1864. first += !left_is_free;
  1865. e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
  1866. }
  1867. if (!(last & 1)) {
  1868. last -= !right_is_free;
  1869. e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
  1870. }
  1871. if (first <= last)
  1872. mb_buddy_mark_free(e4b, first >> 1, last >> 1);
  1873. mb_set_largest_free_order(sb, e4b->bd_info);
  1874. mb_update_avg_fragment_size(sb, e4b->bd_info);
  1875. check:
  1876. mb_check_buddy(e4b);
  1877. }
  1878. static int mb_find_extent(struct ext4_buddy *e4b, int block,
  1879. int needed, struct ext4_free_extent *ex)
  1880. {
  1881. int max, order, next;
  1882. void *buddy;
  1883. assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
  1884. BUG_ON(ex == NULL);
  1885. buddy = mb_find_buddy(e4b, 0, &max);
  1886. BUG_ON(buddy == NULL);
  1887. BUG_ON(block >= max);
  1888. if (mb_test_bit(block, buddy)) {
  1889. ex->fe_len = 0;
  1890. ex->fe_start = 0;
  1891. ex->fe_group = 0;
  1892. return 0;
  1893. }
  1894. /* find actual order */
  1895. order = mb_find_order_for_block(e4b, block);
  1896. ex->fe_len = (1 << order) - (block & ((1 << order) - 1));
  1897. ex->fe_start = block;
  1898. ex->fe_group = e4b->bd_group;
  1899. block = block >> order;
  1900. while (needed > ex->fe_len &&
  1901. mb_find_buddy(e4b, order, &max)) {
  1902. if (block + 1 >= max)
  1903. break;
  1904. next = (block + 1) * (1 << order);
  1905. if (mb_test_bit(next, e4b->bd_bitmap))
  1906. break;
  1907. order = mb_find_order_for_block(e4b, next);
  1908. block = next >> order;
  1909. ex->fe_len += 1 << order;
  1910. }
  1911. if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
  1912. /* Should never happen! (but apparently sometimes does?!?) */
  1913. WARN_ON(1);
  1914. ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
  1915. "corruption or bug in mb_find_extent "
  1916. "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
  1917. block, order, needed, ex->fe_group, ex->fe_start,
  1918. ex->fe_len, ex->fe_logical);
  1919. ex->fe_len = 0;
  1920. ex->fe_start = 0;
  1921. ex->fe_group = 0;
  1922. }
  1923. return ex->fe_len;
  1924. }
  1925. static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
  1926. {
  1927. int ord;
  1928. int mlen = 0;
  1929. int max = 0;
  1930. int start = ex->fe_start;
  1931. int len = ex->fe_len;
  1932. unsigned ret = 0;
  1933. int len0 = len;
  1934. void *buddy;
  1935. int ord_start, ord_end;
  1936. BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
  1937. BUG_ON(e4b->bd_group != ex->fe_group);
  1938. assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
  1939. mb_check_buddy(e4b);
  1940. mb_mark_used_double(e4b, start, len);
  1941. this_cpu_inc(discard_pa_seq);
  1942. e4b->bd_info->bb_free -= len;
  1943. if (e4b->bd_info->bb_first_free == start)
  1944. e4b->bd_info->bb_first_free += len;
  1945. /* let's maintain fragments counter */
  1946. if (start != 0)
  1947. mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
  1948. if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
  1949. max = !mb_test_bit(start + len, e4b->bd_bitmap);
  1950. if (mlen && max)
  1951. e4b->bd_info->bb_fragments++;
  1952. else if (!mlen && !max)
  1953. e4b->bd_info->bb_fragments--;
  1954. /* let's maintain buddy itself */
  1955. while (len) {
  1956. ord = mb_find_order_for_block(e4b, start);
  1957. if (((start >> ord) << ord) == start && len >= (1 << ord)) {
  1958. /* the whole chunk may be allocated at once! */
  1959. mlen = 1 << ord;
  1960. buddy = mb_find_buddy(e4b, ord, &max);
  1961. BUG_ON((start >> ord) >= max);
  1962. mb_set_bit(start >> ord, buddy);
  1963. e4b->bd_info->bb_counters[ord]--;
  1964. start += mlen;
  1965. len -= mlen;
  1966. BUG_ON(len < 0);
  1967. continue;
  1968. }
  1969. /* store for history */
  1970. if (ret == 0)
  1971. ret = len | (ord << 16);
  1972. BUG_ON(ord <= 0);
  1973. buddy = mb_find_buddy(e4b, ord, &max);
  1974. mb_set_bit(start >> ord, buddy);
  1975. e4b->bd_info->bb_counters[ord]--;
  1976. ord_start = (start >> ord) << ord;
  1977. ord_end = ord_start + (1 << ord);
  1978. /* first chunk */
  1979. if (start > ord_start)
  1980. ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
  1981. ord_start, start - ord_start,
  1982. e4b->bd_info);
  1983. /* last chunk */
  1984. if (start + len < ord_end) {
  1985. ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
  1986. start + len,
  1987. ord_end - (start + len),
  1988. e4b->bd_info);
  1989. break;
  1990. }
  1991. len = start + len - ord_end;
  1992. start = ord_end;
  1993. }
  1994. mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
  1995. mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
  1996. mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
  1997. mb_check_buddy(e4b);
  1998. return ret;
  1999. }
  2000. /*
  2001. * Must be called under group lock!
  2002. */
  2003. static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
  2004. struct ext4_buddy *e4b)
  2005. {
  2006. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  2007. int ret;
  2008. BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
  2009. BUG_ON(ac->ac_status == AC_STATUS_FOUND);
  2010. ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
  2011. ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
  2012. ret = mb_mark_used(e4b, &ac->ac_b_ex);
  2013. /* preallocation can change ac_b_ex, thus we store actually
  2014. * allocated blocks for history */
  2015. ac->ac_f_ex = ac->ac_b_ex;
  2016. ac->ac_status = AC_STATUS_FOUND;
  2017. ac->ac_tail = ret & 0xffff;
  2018. ac->ac_buddy = ret >> 16;
  2019. /*
  2020. * take the folio reference. We want the folio to be pinned
  2021. * so that we don't get a ext4_mb_init_cache_call for this
  2022. * group until we update the bitmap. That would mean we
  2023. * double allocate blocks. The reference is dropped
  2024. * in ext4_mb_release_context
  2025. */
  2026. ac->ac_bitmap_folio = e4b->bd_bitmap_folio;
  2027. folio_get(ac->ac_bitmap_folio);
  2028. ac->ac_buddy_folio = e4b->bd_buddy_folio;
  2029. folio_get(ac->ac_buddy_folio);
  2030. /* store last allocated for subsequent stream allocation */
  2031. if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
  2032. int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
  2033. WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group);
  2034. }
  2035. /*
  2036. * As we've just preallocated more space than
  2037. * user requested originally, we store allocated
  2038. * space in a special descriptor.
  2039. */
  2040. if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
  2041. ext4_mb_new_preallocation(ac);
  2042. }
  2043. static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
  2044. struct ext4_buddy *e4b,
  2045. int finish_group)
  2046. {
  2047. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  2048. struct ext4_free_extent *bex = &ac->ac_b_ex;
  2049. struct ext4_free_extent *gex = &ac->ac_g_ex;
  2050. if (ac->ac_status == AC_STATUS_FOUND)
  2051. return;
  2052. /*
  2053. * We don't want to scan for a whole year
  2054. */
  2055. if (ac->ac_found > sbi->s_mb_max_to_scan &&
  2056. !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
  2057. ac->ac_status = AC_STATUS_BREAK;
  2058. return;
  2059. }
  2060. /*
  2061. * Haven't found good chunk so far, let's continue
  2062. */
  2063. if (bex->fe_len < gex->fe_len)
  2064. return;
  2065. if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
  2066. ext4_mb_use_best_found(ac, e4b);
  2067. }
  2068. /*
  2069. * The routine checks whether found extent is good enough. If it is,
  2070. * then the extent gets marked used and flag is set to the context
  2071. * to stop scanning. Otherwise, the extent is compared with the
  2072. * previous found extent and if new one is better, then it's stored
  2073. * in the context. Later, the best found extent will be used, if
  2074. * mballoc can't find good enough extent.
  2075. *
  2076. * The algorithm used is roughly as follows:
  2077. *
  2078. * * If free extent found is exactly as big as goal, then
  2079. * stop the scan and use it immediately
  2080. *
  2081. * * If free extent found is smaller than goal, then keep retrying
  2082. * upto a max of sbi->s_mb_max_to_scan times (default 200). After
  2083. * that stop scanning and use whatever we have.
  2084. *
  2085. * * If free extent found is bigger than goal, then keep retrying
  2086. * upto a max of sbi->s_mb_min_to_scan times (default 10) before
  2087. * stopping the scan and using the extent.
  2088. *
  2089. *
  2090. * FIXME: real allocation policy is to be designed yet!
  2091. */
  2092. static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
  2093. struct ext4_free_extent *ex,
  2094. struct ext4_buddy *e4b)
  2095. {
  2096. struct ext4_free_extent *bex = &ac->ac_b_ex;
  2097. struct ext4_free_extent *gex = &ac->ac_g_ex;
  2098. BUG_ON(ex->fe_len <= 0);
  2099. BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
  2100. BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
  2101. BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
  2102. ac->ac_found++;
  2103. ac->ac_cX_found[ac->ac_criteria]++;
  2104. /*
  2105. * The special case - take what you catch first
  2106. */
  2107. if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
  2108. *bex = *ex;
  2109. ext4_mb_use_best_found(ac, e4b);
  2110. return;
  2111. }
  2112. /*
  2113. * Let's check whether the chuck is good enough
  2114. */
  2115. if (ex->fe_len == gex->fe_len) {
  2116. *bex = *ex;
  2117. ext4_mb_use_best_found(ac, e4b);
  2118. return;
  2119. }
  2120. /*
  2121. * If this is first found extent, just store it in the context
  2122. */
  2123. if (bex->fe_len == 0) {
  2124. *bex = *ex;
  2125. return;
  2126. }
  2127. /*
  2128. * If new found extent is better, store it in the context
  2129. */
  2130. if (bex->fe_len < gex->fe_len) {
  2131. /* if the request isn't satisfied, any found extent
  2132. * larger than previous best one is better */
  2133. if (ex->fe_len > bex->fe_len)
  2134. *bex = *ex;
  2135. } else if (ex->fe_len > gex->fe_len) {
  2136. /* if the request is satisfied, then we try to find
  2137. * an extent that still satisfy the request, but is
  2138. * smaller than previous one */
  2139. if (ex->fe_len < bex->fe_len)
  2140. *bex = *ex;
  2141. }
  2142. ext4_mb_check_limits(ac, e4b, 0);
  2143. }
  2144. static noinline_for_stack
  2145. void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
  2146. struct ext4_buddy *e4b)
  2147. {
  2148. struct ext4_free_extent ex = ac->ac_b_ex;
  2149. ext4_group_t group = ex.fe_group;
  2150. int max;
  2151. int err;
  2152. BUG_ON(ex.fe_len <= 0);
  2153. err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
  2154. if (err)
  2155. return;
  2156. ext4_lock_group(ac->ac_sb, group);
  2157. if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
  2158. goto out;
  2159. max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
  2160. if (max > 0) {
  2161. ac->ac_b_ex = ex;
  2162. ext4_mb_use_best_found(ac, e4b);
  2163. }
  2164. out:
  2165. ext4_unlock_group(ac->ac_sb, group);
  2166. ext4_mb_unload_buddy(e4b);
  2167. }
  2168. static noinline_for_stack
  2169. int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
  2170. struct ext4_buddy *e4b)
  2171. {
  2172. ext4_group_t group = ac->ac_g_ex.fe_group;
  2173. int max;
  2174. int err;
  2175. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  2176. struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
  2177. struct ext4_free_extent ex;
  2178. if (!grp)
  2179. return -EFSCORRUPTED;
  2180. if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
  2181. return 0;
  2182. if (grp->bb_free == 0)
  2183. return 0;
  2184. err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
  2185. if (err) {
  2186. if (EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info) &&
  2187. !(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
  2188. return 0;
  2189. return err;
  2190. }
  2191. ext4_lock_group(ac->ac_sb, group);
  2192. if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
  2193. goto out;
  2194. max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
  2195. ac->ac_g_ex.fe_len, &ex);
  2196. ex.fe_logical = 0xDEADFA11; /* debug value */
  2197. if (max >= ac->ac_g_ex.fe_len &&
  2198. ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) {
  2199. ext4_fsblk_t start;
  2200. start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
  2201. /* use do_div to get remainder (would be 64-bit modulo) */
  2202. if (do_div(start, sbi->s_stripe) == 0) {
  2203. ac->ac_found++;
  2204. ac->ac_b_ex = ex;
  2205. ext4_mb_use_best_found(ac, e4b);
  2206. }
  2207. } else if (max >= ac->ac_g_ex.fe_len) {
  2208. BUG_ON(ex.fe_len <= 0);
  2209. BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
  2210. BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
  2211. ac->ac_found++;
  2212. ac->ac_b_ex = ex;
  2213. ext4_mb_use_best_found(ac, e4b);
  2214. } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
  2215. /* Sometimes, caller may want to merge even small
  2216. * number of blocks to an existing extent */
  2217. BUG_ON(ex.fe_len <= 0);
  2218. BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
  2219. BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
  2220. ac->ac_found++;
  2221. ac->ac_b_ex = ex;
  2222. ext4_mb_use_best_found(ac, e4b);
  2223. }
  2224. out:
  2225. ext4_unlock_group(ac->ac_sb, group);
  2226. ext4_mb_unload_buddy(e4b);
  2227. return 0;
  2228. }
  2229. /*
  2230. * The routine scans buddy structures (not bitmap!) from given order
  2231. * to max order and tries to find big enough chunk to satisfy the req
  2232. */
  2233. static noinline_for_stack
  2234. void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
  2235. struct ext4_buddy *e4b)
  2236. {
  2237. struct super_block *sb = ac->ac_sb;
  2238. struct ext4_group_info *grp = e4b->bd_info;
  2239. void *buddy;
  2240. int i;
  2241. int k;
  2242. int max;
  2243. BUG_ON(ac->ac_2order <= 0);
  2244. for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
  2245. if (grp->bb_counters[i] == 0)
  2246. continue;
  2247. buddy = mb_find_buddy(e4b, i, &max);
  2248. if (WARN_RATELIMIT(buddy == NULL,
  2249. "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i))
  2250. continue;
  2251. k = mb_find_next_zero_bit(buddy, max, 0);
  2252. if (k >= max) {
  2253. ext4_mark_group_bitmap_corrupted(ac->ac_sb,
  2254. e4b->bd_group,
  2255. EXT4_GROUP_INFO_BBITMAP_CORRUPT);
  2256. ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
  2257. "%d free clusters of order %d. But found 0",
  2258. grp->bb_counters[i], i);
  2259. break;
  2260. }
  2261. ac->ac_found++;
  2262. ac->ac_cX_found[ac->ac_criteria]++;
  2263. ac->ac_b_ex.fe_len = 1 << i;
  2264. ac->ac_b_ex.fe_start = k << i;
  2265. ac->ac_b_ex.fe_group = e4b->bd_group;
  2266. ext4_mb_use_best_found(ac, e4b);
  2267. BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
  2268. if (EXT4_SB(sb)->s_mb_stats)
  2269. atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
  2270. break;
  2271. }
  2272. }
  2273. /*
  2274. * The routine scans the group and measures all found extents.
  2275. * In order to optimize scanning, caller must pass number of
  2276. * free blocks in the group, so the routine can know upper limit.
  2277. */
  2278. static noinline_for_stack
  2279. void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
  2280. struct ext4_buddy *e4b)
  2281. {
  2282. struct super_block *sb = ac->ac_sb;
  2283. void *bitmap = e4b->bd_bitmap;
  2284. struct ext4_free_extent ex;
  2285. int i, j, freelen;
  2286. int free;
  2287. free = e4b->bd_info->bb_free;
  2288. if (WARN_ON(free <= 0))
  2289. return;
  2290. i = e4b->bd_info->bb_first_free;
  2291. while (free && ac->ac_status == AC_STATUS_CONTINUE) {
  2292. i = mb_find_next_zero_bit(bitmap,
  2293. EXT4_CLUSTERS_PER_GROUP(sb), i);
  2294. if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
  2295. /*
  2296. * IF we have corrupt bitmap, we won't find any
  2297. * free blocks even though group info says we
  2298. * have free blocks
  2299. */
  2300. ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
  2301. EXT4_GROUP_INFO_BBITMAP_CORRUPT);
  2302. ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
  2303. "%d free clusters as per "
  2304. "group info. But bitmap says 0",
  2305. free);
  2306. break;
  2307. }
  2308. if (!ext4_mb_cr_expensive(ac->ac_criteria)) {
  2309. /*
  2310. * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are
  2311. * sure that this group will have a large enough
  2312. * continuous free extent, so skip over the smaller free
  2313. * extents
  2314. */
  2315. j = mb_find_next_bit(bitmap,
  2316. EXT4_CLUSTERS_PER_GROUP(sb), i);
  2317. freelen = j - i;
  2318. if (freelen < ac->ac_g_ex.fe_len) {
  2319. i = j;
  2320. free -= freelen;
  2321. continue;
  2322. }
  2323. }
  2324. mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
  2325. if (WARN_ON(ex.fe_len <= 0))
  2326. break;
  2327. if (free < ex.fe_len) {
  2328. ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
  2329. EXT4_GROUP_INFO_BBITMAP_CORRUPT);
  2330. ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
  2331. "%d free clusters as per "
  2332. "group info. But got %d blocks",
  2333. free, ex.fe_len);
  2334. /*
  2335. * The number of free blocks differs. This mostly
  2336. * indicate that the bitmap is corrupt. So exit
  2337. * without claiming the space.
  2338. */
  2339. break;
  2340. }
  2341. ex.fe_logical = 0xDEADC0DE; /* debug value */
  2342. ext4_mb_measure_extent(ac, &ex, e4b);
  2343. i += ex.fe_len;
  2344. free -= ex.fe_len;
  2345. }
  2346. ext4_mb_check_limits(ac, e4b, 1);
  2347. }
  2348. /*
  2349. * This is a special case for storages like raid5
  2350. * we try to find stripe-aligned chunks for stripe-size-multiple requests
  2351. */
  2352. static noinline_for_stack
  2353. void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
  2354. struct ext4_buddy *e4b)
  2355. {
  2356. struct super_block *sb = ac->ac_sb;
  2357. struct ext4_sb_info *sbi = EXT4_SB(sb);
  2358. void *bitmap = e4b->bd_bitmap;
  2359. struct ext4_free_extent ex;
  2360. ext4_fsblk_t first_group_block;
  2361. ext4_fsblk_t a;
  2362. ext4_grpblk_t i, stripe;
  2363. int max;
  2364. BUG_ON(sbi->s_stripe == 0);
  2365. /* find first stripe-aligned block in group */
  2366. first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
  2367. a = first_group_block + sbi->s_stripe - 1;
  2368. do_div(a, sbi->s_stripe);
  2369. i = (a * sbi->s_stripe) - first_group_block;
  2370. stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe);
  2371. i = EXT4_B2C(sbi, i);
  2372. while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
  2373. if (!mb_test_bit(i, bitmap)) {
  2374. max = mb_find_extent(e4b, i, stripe, &ex);
  2375. if (max >= stripe) {
  2376. ac->ac_found++;
  2377. ac->ac_cX_found[ac->ac_criteria]++;
  2378. ex.fe_logical = 0xDEADF00D; /* debug value */
  2379. ac->ac_b_ex = ex;
  2380. ext4_mb_use_best_found(ac, e4b);
  2381. break;
  2382. }
  2383. }
  2384. i += stripe;
  2385. }
  2386. }
  2387. static void __ext4_mb_scan_group(struct ext4_allocation_context *ac)
  2388. {
  2389. bool is_stripe_aligned;
  2390. struct ext4_sb_info *sbi;
  2391. enum criteria cr = ac->ac_criteria;
  2392. ac->ac_groups_scanned++;
  2393. if (cr == CR_POWER2_ALIGNED)
  2394. return ext4_mb_simple_scan_group(ac, ac->ac_e4b);
  2395. sbi = EXT4_SB(ac->ac_sb);
  2396. is_stripe_aligned = false;
  2397. if ((sbi->s_stripe >= sbi->s_cluster_ratio) &&
  2398. !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe)))
  2399. is_stripe_aligned = true;
  2400. if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) &&
  2401. is_stripe_aligned)
  2402. ext4_mb_scan_aligned(ac, ac->ac_e4b);
  2403. if (ac->ac_status == AC_STATUS_CONTINUE)
  2404. ext4_mb_complex_scan_group(ac, ac->ac_e4b);
  2405. }
  2406. /*
  2407. * This is also called BEFORE we load the buddy bitmap.
  2408. * Returns either 1 or 0 indicating that the group is either suitable
  2409. * for the allocation or not.
  2410. */
  2411. static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
  2412. ext4_group_t group, enum criteria cr)
  2413. {
  2414. ext4_grpblk_t free, fragments;
  2415. int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
  2416. struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
  2417. BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);
  2418. if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
  2419. return false;
  2420. free = grp->bb_free;
  2421. if (free == 0)
  2422. return false;
  2423. fragments = grp->bb_fragments;
  2424. if (fragments == 0)
  2425. return false;
  2426. switch (cr) {
  2427. case CR_POWER2_ALIGNED:
  2428. BUG_ON(ac->ac_2order == 0);
  2429. /* Avoid using the first bg of a flexgroup for data files */
  2430. if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
  2431. (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
  2432. ((group % flex_size) == 0))
  2433. return false;
  2434. if (free < ac->ac_g_ex.fe_len)
  2435. return false;
  2436. if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
  2437. return true;
  2438. if (grp->bb_largest_free_order < ac->ac_2order)
  2439. return false;
  2440. return true;
  2441. case CR_GOAL_LEN_FAST:
  2442. case CR_BEST_AVAIL_LEN:
  2443. if ((free / fragments) >= ac->ac_g_ex.fe_len)
  2444. return true;
  2445. break;
  2446. case CR_GOAL_LEN_SLOW:
  2447. if (free >= ac->ac_g_ex.fe_len)
  2448. return true;
  2449. break;
  2450. case CR_ANY_FREE:
  2451. return true;
  2452. default:
  2453. BUG();
  2454. }
  2455. return false;
  2456. }
  2457. /*
  2458. * This could return negative error code if something goes wrong
  2459. * during ext4_mb_init_group(). This should not be called with
  2460. * ext4_lock_group() held.
  2461. *
  2462. * Note: because we are conditionally operating with the group lock in
  2463. * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
  2464. * function using __acquire and __release. This means we need to be
  2465. * super careful before messing with the error path handling via "goto
  2466. * out"!
  2467. */
  2468. static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
  2469. ext4_group_t group, enum criteria cr)
  2470. {
  2471. struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
  2472. struct super_block *sb = ac->ac_sb;
  2473. struct ext4_sb_info *sbi = EXT4_SB(sb);
  2474. bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
  2475. ext4_grpblk_t free;
  2476. int ret = 0;
  2477. if (!grp)
  2478. return -EFSCORRUPTED;
  2479. if (sbi->s_mb_stats)
  2480. atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
  2481. if (should_lock) {
  2482. ext4_lock_group(sb, group);
  2483. __release(ext4_group_lock_ptr(sb, group));
  2484. }
  2485. free = grp->bb_free;
  2486. if (free == 0)
  2487. goto out;
  2488. /*
  2489. * In all criterias except CR_ANY_FREE we try to avoid groups that
  2490. * can't possibly satisfy the full goal request due to insufficient
  2491. * free blocks.
  2492. */
  2493. if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len)
  2494. goto out;
  2495. if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
  2496. goto out;
  2497. if (should_lock) {
  2498. __acquire(ext4_group_lock_ptr(sb, group));
  2499. ext4_unlock_group(sb, group);
  2500. }
  2501. /* We only do this if the grp has never been initialized */
  2502. if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
  2503. struct ext4_group_desc *gdp =
  2504. ext4_get_group_desc(sb, group, NULL);
  2505. int ret;
  2506. /*
  2507. * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
  2508. * search to find large good chunks almost for free. If buddy
  2509. * data is not ready, then this optimization makes no sense. But
  2510. * we never skip the first block group in a flex_bg, since this
  2511. * gets used for metadata block allocation, and we want to make
  2512. * sure we locate metadata blocks in the first block group in
  2513. * the flex_bg if possible.
  2514. */
  2515. if (!ext4_mb_cr_expensive(cr) &&
  2516. (!sbi->s_log_groups_per_flex ||
  2517. ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
  2518. !(ext4_has_group_desc_csum(sb) &&
  2519. (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
  2520. return 0;
  2521. ret = ext4_mb_init_group(sb, group, GFP_NOFS);
  2522. if (ret)
  2523. return ret;
  2524. }
  2525. if (should_lock) {
  2526. ext4_lock_group(sb, group);
  2527. __release(ext4_group_lock_ptr(sb, group));
  2528. }
  2529. ret = ext4_mb_good_group(ac, group, cr);
  2530. out:
  2531. if (should_lock) {
  2532. __acquire(ext4_group_lock_ptr(sb, group));
  2533. ext4_unlock_group(sb, group);
  2534. }
  2535. return ret;
  2536. }
  2537. /*
  2538. * Start prefetching @nr block bitmaps starting at @group.
  2539. * Return the next group which needs to be prefetched.
  2540. */
  2541. ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
  2542. unsigned int nr, int *cnt)
  2543. {
  2544. ext4_group_t ngroups = ext4_get_groups_count(sb);
  2545. struct buffer_head *bh;
  2546. struct blk_plug plug;
  2547. blk_start_plug(&plug);
  2548. while (nr-- > 0) {
  2549. struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
  2550. NULL);
  2551. struct ext4_group_info *grp = ext4_get_group_info(sb, group);
  2552. /*
  2553. * Prefetch block groups with free blocks; but don't
  2554. * bother if it is marked uninitialized on disk, since
  2555. * it won't require I/O to read. Also only try to
  2556. * prefetch once, so we avoid getblk() call, which can
  2557. * be expensive.
  2558. */
  2559. if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
  2560. EXT4_MB_GRP_NEED_INIT(grp) &&
  2561. ext4_free_group_clusters(sb, gdp) > 0 ) {
  2562. bh = ext4_read_block_bitmap_nowait(sb, group, true);
  2563. if (bh && !IS_ERR(bh)) {
  2564. if (!buffer_uptodate(bh) && cnt)
  2565. (*cnt)++;
  2566. brelse(bh);
  2567. }
  2568. }
  2569. if (++group >= ngroups)
  2570. group = 0;
  2571. }
  2572. blk_finish_plug(&plug);
  2573. return group;
  2574. }
  2575. /*
  2576. * Batch reads of the block allocation bitmaps to get
  2577. * multiple READs in flight; limit prefetching at inexpensive
  2578. * CR, otherwise mballoc can spend a lot of time loading
  2579. * imperfect groups
  2580. */
  2581. static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac,
  2582. ext4_group_t group)
  2583. {
  2584. struct ext4_sb_info *sbi;
  2585. if (ac->ac_prefetch_grp != group)
  2586. return;
  2587. sbi = EXT4_SB(ac->ac_sb);
  2588. if (ext4_mb_cr_expensive(ac->ac_criteria) ||
  2589. ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) {
  2590. unsigned int nr = sbi->s_mb_prefetch;
  2591. if (ext4_has_feature_flex_bg(ac->ac_sb)) {
  2592. nr = 1 << sbi->s_log_groups_per_flex;
  2593. nr -= group & (nr - 1);
  2594. nr = umin(nr, sbi->s_mb_prefetch);
  2595. }
  2596. ac->ac_prefetch_nr = nr;
  2597. ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr,
  2598. &ac->ac_prefetch_ios);
  2599. }
  2600. }
  2601. /*
  2602. * Prefetching reads the block bitmap into the buffer cache; but we
  2603. * need to make sure that the buddy bitmap in the page cache has been
  2604. * initialized. Note that ext4_mb_init_group() will block if the I/O
  2605. * is not yet completed, or indeed if it was not initiated by
  2606. * ext4_mb_prefetch did not start the I/O.
  2607. *
  2608. * TODO: We should actually kick off the buddy bitmap setup in a work
  2609. * queue when the buffer I/O is completed, so that we don't block
  2610. * waiting for the block allocation bitmap read to finish when
  2611. * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
  2612. */
  2613. void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
  2614. unsigned int nr)
  2615. {
  2616. struct ext4_group_desc *gdp;
  2617. struct ext4_group_info *grp;
  2618. while (nr-- > 0) {
  2619. if (!group)
  2620. group = ext4_get_groups_count(sb);
  2621. group--;
  2622. gdp = ext4_get_group_desc(sb, group, NULL);
  2623. grp = ext4_get_group_info(sb, group);
  2624. if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
  2625. ext4_free_group_clusters(sb, gdp) > 0) {
  2626. if (ext4_mb_init_group(sb, group, GFP_NOFS))
  2627. break;
  2628. }
  2629. }
  2630. }
  2631. static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
  2632. ext4_group_t group)
  2633. {
  2634. int ret;
  2635. struct super_block *sb = ac->ac_sb;
  2636. enum criteria cr = ac->ac_criteria;
  2637. ext4_mb_might_prefetch(ac, group);
  2638. /* prevent unnecessary buddy loading. */
  2639. if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group)))
  2640. return 0;
  2641. /* This now checks without needing the buddy folio */
  2642. ret = ext4_mb_good_group_nolock(ac, group, cr);
  2643. if (ret <= 0) {
  2644. if (!ac->ac_first_err)
  2645. ac->ac_first_err = ret;
  2646. return 0;
  2647. }
  2648. ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b);
  2649. if (ret)
  2650. return ret;
  2651. /* skip busy group */
  2652. if (cr >= CR_ANY_FREE)
  2653. ext4_lock_group(sb, group);
  2654. else if (!ext4_try_lock_group(sb, group))
  2655. goto out_unload;
  2656. /* We need to check again after locking the block group. */
  2657. if (unlikely(!ext4_mb_good_group(ac, group, cr)))
  2658. goto out_unlock;
  2659. __ext4_mb_scan_group(ac);
  2660. out_unlock:
  2661. ext4_unlock_group(sb, group);
  2662. out_unload:
  2663. ext4_mb_unload_buddy(ac->ac_e4b);
  2664. return ret;
  2665. }
  2666. static noinline_for_stack int
  2667. ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
  2668. {
  2669. ext4_group_t i;
  2670. int err = 0;
  2671. struct super_block *sb = ac->ac_sb;
  2672. struct ext4_sb_info *sbi = EXT4_SB(sb);
  2673. struct ext4_buddy e4b;
  2674. BUG_ON(ac->ac_status == AC_STATUS_FOUND);
  2675. /* first, try the goal */
  2676. err = ext4_mb_find_by_goal(ac, &e4b);
  2677. if (err || ac->ac_status == AC_STATUS_FOUND)
  2678. goto out;
  2679. if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
  2680. goto out;
  2681. /*
  2682. * ac->ac_2order is set only if the fe_len is a power of 2
  2683. * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
  2684. * so that we try exact allocation using buddy.
  2685. */
  2686. i = fls(ac->ac_g_ex.fe_len);
  2687. ac->ac_2order = 0;
  2688. /*
  2689. * We search using buddy data only if the order of the request
  2690. * is greater than equal to the sbi_s_mb_order2_reqs
  2691. * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
  2692. * We also support searching for power-of-two requests only for
  2693. * requests upto maximum buddy size we have constructed.
  2694. */
  2695. if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
  2696. if (is_power_of_2(ac->ac_g_ex.fe_len))
  2697. ac->ac_2order = array_index_nospec(i - 1,
  2698. MB_NUM_ORDERS(sb));
  2699. }
  2700. /* if stream allocation is enabled, use global goal */
  2701. if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
  2702. int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
  2703. ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
  2704. ac->ac_g_ex.fe_start = -1;
  2705. ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
  2706. }
  2707. /*
  2708. * Let's just scan groups to find more-less suitable blocks We
  2709. * start with CR_GOAL_LEN_FAST, unless it is power of 2
  2710. * aligned, in which case let's do that faster approach first.
  2711. */
  2712. ac->ac_criteria = CR_GOAL_LEN_FAST;
  2713. if (ac->ac_2order)
  2714. ac->ac_criteria = CR_POWER2_ALIGNED;
  2715. ac->ac_e4b = &e4b;
  2716. ac->ac_prefetch_ios = 0;
  2717. ac->ac_first_err = 0;
  2718. repeat:
  2719. while (ac->ac_criteria < EXT4_MB_NUM_CRS) {
  2720. err = ext4_mb_scan_groups(ac);
  2721. if (err)
  2722. goto out;
  2723. if (ac->ac_status != AC_STATUS_CONTINUE)
  2724. break;
  2725. }
  2726. if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
  2727. !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
  2728. /*
  2729. * We've been searching too long. Let's try to allocate
  2730. * the best chunk we've found so far
  2731. */
  2732. ext4_mb_try_best_found(ac, &e4b);
  2733. if (ac->ac_status != AC_STATUS_FOUND) {
  2734. int lost;
  2735. /*
  2736. * Someone more lucky has already allocated it.
  2737. * The only thing we can do is just take first
  2738. * found block(s)
  2739. */
  2740. lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
  2741. mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
  2742. ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
  2743. ac->ac_b_ex.fe_len, lost);
  2744. ac->ac_b_ex.fe_group = 0;
  2745. ac->ac_b_ex.fe_start = 0;
  2746. ac->ac_b_ex.fe_len = 0;
  2747. ac->ac_status = AC_STATUS_CONTINUE;
  2748. ac->ac_flags |= EXT4_MB_HINT_FIRST;
  2749. ac->ac_criteria = CR_ANY_FREE;
  2750. goto repeat;
  2751. }
  2752. }
  2753. if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
  2754. atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
  2755. if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
  2756. ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
  2757. atomic_inc(&sbi->s_bal_stream_goals);
  2758. }
  2759. out:
  2760. if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err)
  2761. err = ac->ac_first_err;
  2762. mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
  2763. ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
  2764. ac->ac_flags, ac->ac_criteria, err);
  2765. if (ac->ac_prefetch_nr)
  2766. ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr);
  2767. return err;
  2768. }
  2769. static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
  2770. {
  2771. struct super_block *sb = pde_data(file_inode(seq->file));
  2772. ext4_group_t group;
  2773. if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
  2774. return NULL;
  2775. group = *pos + 1;
  2776. return (void *) ((unsigned long) group);
  2777. }
  2778. static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
  2779. {
  2780. struct super_block *sb = pde_data(file_inode(seq->file));
  2781. ext4_group_t group;
  2782. ++*pos;
  2783. if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
  2784. return NULL;
  2785. group = *pos + 1;
  2786. return (void *) ((unsigned long) group);
  2787. }
  2788. static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
  2789. {
  2790. struct super_block *sb = pde_data(file_inode(seq->file));
  2791. ext4_group_t group = (ext4_group_t) ((unsigned long) v);
  2792. int i, err;
  2793. char nbuf[16];
  2794. struct ext4_buddy e4b;
  2795. struct ext4_group_info *grinfo;
  2796. unsigned char blocksize_bits = min_t(unsigned char,
  2797. sb->s_blocksize_bits,
  2798. EXT4_MAX_BLOCK_LOG_SIZE);
  2799. DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters,
  2800. EXT4_MAX_BLOCK_LOG_SIZE + 2);
  2801. group--;
  2802. if (group == 0)
  2803. seq_puts(seq, "#group: free frags first ["
  2804. " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
  2805. " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
  2806. i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) +
  2807. sizeof(struct ext4_group_info);
  2808. grinfo = ext4_get_group_info(sb, group);
  2809. if (!grinfo)
  2810. return 0;
  2811. /* Load the group info in memory only if not already loaded. */
  2812. if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
  2813. err = ext4_mb_load_buddy(sb, group, &e4b);
  2814. if (err) {
  2815. seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf));
  2816. return 0;
  2817. }
  2818. ext4_mb_unload_buddy(&e4b);
  2819. }
  2820. /*
  2821. * We care only about free space counters in the group info and
  2822. * these are safe to access even after the buddy has been unloaded
  2823. */
  2824. memcpy(sg, grinfo, i);
  2825. seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free,
  2826. sg->bb_fragments, sg->bb_first_free);
  2827. for (i = 0; i <= 13; i++)
  2828. seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
  2829. sg->bb_counters[i] : 0);
  2830. seq_puts(seq, " ]");
  2831. if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg))
  2832. seq_puts(seq, " Block bitmap corrupted!");
  2833. seq_putc(seq, '\n');
  2834. return 0;
  2835. }
  2836. static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
  2837. {
  2838. }
  2839. const struct seq_operations ext4_mb_seq_groups_ops = {
  2840. .start = ext4_mb_seq_groups_start,
  2841. .next = ext4_mb_seq_groups_next,
  2842. .stop = ext4_mb_seq_groups_stop,
  2843. .show = ext4_mb_seq_groups_show,
  2844. };
  2845. int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
  2846. {
  2847. struct super_block *sb = seq->private;
  2848. struct ext4_sb_info *sbi = EXT4_SB(sb);
  2849. seq_puts(seq, "mballoc:\n");
  2850. if (!sbi->s_mb_stats) {
  2851. seq_puts(seq, "\tmb stats collection turned off.\n");
  2852. seq_puts(
  2853. seq,
  2854. "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
  2855. return 0;
  2856. }
  2857. seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
  2858. seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
  2859. seq_printf(seq, "\tgroups_scanned: %u\n",
  2860. atomic_read(&sbi->s_bal_groups_scanned));
  2861. /* CR_POWER2_ALIGNED stats */
  2862. seq_puts(seq, "\tcr_p2_aligned_stats:\n");
  2863. seq_printf(seq, "\t\thits: %llu\n",
  2864. atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED]));
  2865. seq_printf(
  2866. seq, "\t\tgroups_considered: %llu\n",
  2867. atomic64_read(
  2868. &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]));
  2869. seq_printf(seq, "\t\textents_scanned: %u\n",
  2870. atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
  2871. seq_printf(seq, "\t\tuseless_loops: %llu\n",
  2872. atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
  2873. /* CR_GOAL_LEN_FAST stats */
  2874. seq_puts(seq, "\tcr_goal_fast_stats:\n");
  2875. seq_printf(seq, "\t\thits: %llu\n",
  2876. atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST]));
  2877. seq_printf(seq, "\t\tgroups_considered: %llu\n",
  2878. atomic64_read(
  2879. &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST]));
  2880. seq_printf(seq, "\t\textents_scanned: %u\n",
  2881. atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
  2882. seq_printf(seq, "\t\tuseless_loops: %llu\n",
  2883. atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
  2884. /* CR_BEST_AVAIL_LEN stats */
  2885. seq_puts(seq, "\tcr_best_avail_stats:\n");
  2886. seq_printf(seq, "\t\thits: %llu\n",
  2887. atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN]));
  2888. seq_printf(
  2889. seq, "\t\tgroups_considered: %llu\n",
  2890. atomic64_read(
  2891. &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN]));
  2892. seq_printf(seq, "\t\textents_scanned: %u\n",
  2893. atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
  2894. seq_printf(seq, "\t\tuseless_loops: %llu\n",
  2895. atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
  2896. /* CR_GOAL_LEN_SLOW stats */
  2897. seq_puts(seq, "\tcr_goal_slow_stats:\n");
  2898. seq_printf(seq, "\t\thits: %llu\n",
  2899. atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW]));
  2900. seq_printf(seq, "\t\tgroups_considered: %llu\n",
  2901. atomic64_read(
  2902. &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW]));
  2903. seq_printf(seq, "\t\textents_scanned: %u\n",
  2904. atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW]));
  2905. seq_printf(seq, "\t\tuseless_loops: %llu\n",
  2906. atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW]));
  2907. /* CR_ANY_FREE stats */
  2908. seq_puts(seq, "\tcr_any_free_stats:\n");
  2909. seq_printf(seq, "\t\thits: %llu\n",
  2910. atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE]));
  2911. seq_printf(
  2912. seq, "\t\tgroups_considered: %llu\n",
  2913. atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE]));
  2914. seq_printf(seq, "\t\textents_scanned: %u\n",
  2915. atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE]));
  2916. seq_printf(seq, "\t\tuseless_loops: %llu\n",
  2917. atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE]));
  2918. /* Aggregates */
  2919. seq_printf(seq, "\textents_scanned: %u\n",
  2920. atomic_read(&sbi->s_bal_ex_scanned));
  2921. seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
  2922. seq_printf(seq, "\t\tstream_goal_hits: %u\n",
  2923. atomic_read(&sbi->s_bal_stream_goals));
  2924. seq_printf(seq, "\t\tlen_goal_hits: %u\n",
  2925. atomic_read(&sbi->s_bal_len_goals));
  2926. seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
  2927. seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
  2928. seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
  2929. seq_printf(seq, "\tbuddies_generated: %u/%u\n",
  2930. atomic_read(&sbi->s_mb_buddies_generated),
  2931. ext4_get_groups_count(sb));
  2932. seq_printf(seq, "\tbuddies_time_used: %llu\n",
  2933. atomic64_read(&sbi->s_mb_generation_time));
  2934. seq_printf(seq, "\tpreallocated: %u\n",
  2935. atomic_read(&sbi->s_mb_preallocated));
  2936. seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded));
  2937. return 0;
  2938. }
  2939. static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
  2940. {
  2941. struct super_block *sb = pde_data(file_inode(seq->file));
  2942. unsigned long position;
  2943. if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
  2944. return NULL;
  2945. position = *pos + 1;
  2946. return (void *) ((unsigned long) position);
  2947. }
  2948. static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
  2949. {
  2950. struct super_block *sb = pde_data(file_inode(seq->file));
  2951. unsigned long position;
  2952. ++*pos;
  2953. if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
  2954. return NULL;
  2955. position = *pos + 1;
  2956. return (void *) ((unsigned long) position);
  2957. }
  2958. static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
  2959. {
  2960. struct super_block *sb = pde_data(file_inode(seq->file));
  2961. struct ext4_sb_info *sbi = EXT4_SB(sb);
  2962. unsigned long position = ((unsigned long) v);
  2963. struct ext4_group_info *grp;
  2964. unsigned int count;
  2965. unsigned long idx;
  2966. position--;
  2967. if (position >= MB_NUM_ORDERS(sb)) {
  2968. position -= MB_NUM_ORDERS(sb);
  2969. if (position == 0)
  2970. seq_puts(seq, "avg_fragment_size_lists:\n");
  2971. count = 0;
  2972. xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp)
  2973. count++;
  2974. seq_printf(seq, "\tlist_order_%u_groups: %u\n",
  2975. (unsigned int)position, count);
  2976. return 0;
  2977. }
  2978. if (position == 0) {
  2979. seq_printf(seq, "optimize_scan: %d\n",
  2980. test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
  2981. seq_puts(seq, "max_free_order_lists:\n");
  2982. }
  2983. count = 0;
  2984. xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp)
  2985. count++;
  2986. seq_printf(seq, "\tlist_order_%u_groups: %u\n",
  2987. (unsigned int)position, count);
  2988. return 0;
  2989. }
  2990. static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
  2991. {
  2992. }
  2993. const struct seq_operations ext4_mb_seq_structs_summary_ops = {
  2994. .start = ext4_mb_seq_structs_summary_start,
  2995. .next = ext4_mb_seq_structs_summary_next,
  2996. .stop = ext4_mb_seq_structs_summary_stop,
  2997. .show = ext4_mb_seq_structs_summary_show,
  2998. };
  2999. static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
  3000. {
  3001. int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
  3002. struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
  3003. BUG_ON(!cachep);
  3004. return cachep;
  3005. }
  3006. /*
  3007. * Allocate the top-level s_group_info array for the specified number
  3008. * of groups
  3009. */
  3010. int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
  3011. {
  3012. struct ext4_sb_info *sbi = EXT4_SB(sb);
  3013. unsigned size;
  3014. struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
  3015. size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
  3016. EXT4_DESC_PER_BLOCK_BITS(sb);
  3017. if (size <= sbi->s_group_info_size)
  3018. return 0;
  3019. size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
  3020. new_groupinfo = kvzalloc(size, GFP_KERNEL);
  3021. if (!new_groupinfo) {
  3022. ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
  3023. return -ENOMEM;
  3024. }
  3025. rcu_read_lock();
  3026. old_groupinfo = rcu_dereference(sbi->s_group_info);
  3027. if (old_groupinfo)
  3028. memcpy(new_groupinfo, old_groupinfo,
  3029. sbi->s_group_info_size * sizeof(*sbi->s_group_info));
  3030. rcu_read_unlock();
  3031. rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
  3032. sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
  3033. if (old_groupinfo)
  3034. ext4_kvfree_array_rcu(old_groupinfo);
  3035. ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
  3036. sbi->s_group_info_size);
  3037. return 0;
  3038. }
  3039. /* Create and initialize ext4_group_info data for the given group. */
  3040. int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
  3041. struct ext4_group_desc *desc)
  3042. {
  3043. int i;
  3044. int metalen = 0;
  3045. int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
  3046. struct ext4_sb_info *sbi = EXT4_SB(sb);
  3047. struct ext4_group_info **meta_group_info;
  3048. struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
  3049. /*
  3050. * First check if this group is the first of a reserved block.
  3051. * If it's true, we have to allocate a new table of pointers
  3052. * to ext4_group_info structures
  3053. */
  3054. if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
  3055. metalen = sizeof(*meta_group_info) <<
  3056. EXT4_DESC_PER_BLOCK_BITS(sb);
  3057. meta_group_info = kmalloc(metalen, GFP_NOFS);
  3058. if (meta_group_info == NULL) {
  3059. ext4_msg(sb, KERN_ERR, "can't allocate mem "
  3060. "for a buddy group");
  3061. return -ENOMEM;
  3062. }
  3063. rcu_read_lock();
  3064. rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
  3065. rcu_read_unlock();
  3066. }
  3067. meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
  3068. i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
  3069. meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
  3070. if (meta_group_info[i] == NULL) {
  3071. ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
  3072. goto exit_group_info;
  3073. }
  3074. set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
  3075. &(meta_group_info[i]->bb_state));
  3076. /*
  3077. * initialize bb_free to be able to skip
  3078. * empty groups without initialization
  3079. */
  3080. if (ext4_has_group_desc_csum(sb) &&
  3081. (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
  3082. meta_group_info[i]->bb_free =
  3083. ext4_free_clusters_after_init(sb, group, desc);
  3084. } else {
  3085. meta_group_info[i]->bb_free =
  3086. ext4_free_group_clusters(sb, desc);
  3087. }
  3088. INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
  3089. init_rwsem(&meta_group_info[i]->alloc_sem);
  3090. meta_group_info[i]->bb_free_root = RB_ROOT;
  3091. meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
  3092. meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
  3093. meta_group_info[i]->bb_group = group;
  3094. mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
  3095. return 0;
  3096. exit_group_info:
  3097. /* If a meta_group_info table has been allocated, release it now */
  3098. if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
  3099. struct ext4_group_info ***group_info;
  3100. rcu_read_lock();
  3101. group_info = rcu_dereference(sbi->s_group_info);
  3102. kfree(group_info[idx]);
  3103. group_info[idx] = NULL;
  3104. rcu_read_unlock();
  3105. }
  3106. return -ENOMEM;
  3107. } /* ext4_mb_add_groupinfo */
  3108. static int ext4_mb_init_backend(struct super_block *sb)
  3109. {
  3110. ext4_group_t ngroups = ext4_get_groups_count(sb);
  3111. ext4_group_t i;
  3112. struct ext4_sb_info *sbi = EXT4_SB(sb);
  3113. int err;
  3114. struct ext4_group_desc *desc;
  3115. struct ext4_group_info ***group_info;
  3116. struct kmem_cache *cachep;
  3117. err = ext4_mb_alloc_groupinfo(sb, ngroups);
  3118. if (err)
  3119. return err;
  3120. sbi->s_buddy_cache = new_inode(sb);
  3121. if (sbi->s_buddy_cache == NULL) {
  3122. ext4_msg(sb, KERN_ERR, "can't get new inode");
  3123. goto err_freesgi;
  3124. }
  3125. /* To avoid potentially colliding with an valid on-disk inode number,
  3126. * use EXT4_BAD_INO for the buddy cache inode number. This inode is
  3127. * not in the inode hash, so it should never be found by iget(), but
  3128. * this will avoid confusion if it ever shows up during debugging. */
  3129. sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
  3130. EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
  3131. ext4_set_inode_mapping_order(sbi->s_buddy_cache);
  3132. for (i = 0; i < ngroups; i++) {
  3133. cond_resched();
  3134. desc = ext4_get_group_desc(sb, i, NULL);
  3135. if (desc == NULL) {
  3136. ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
  3137. goto err_freebuddy;
  3138. }
  3139. if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
  3140. goto err_freebuddy;
  3141. }
  3142. if (ext4_has_feature_flex_bg(sb)) {
  3143. /* a single flex group is supposed to be read by a single IO.
  3144. * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
  3145. * unsigned integer, so the maximum shift is 32.
  3146. */
  3147. if (sbi->s_es->s_log_groups_per_flex >= 32) {
  3148. ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
  3149. goto err_freebuddy;
  3150. }
  3151. sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
  3152. BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
  3153. sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
  3154. } else {
  3155. sbi->s_mb_prefetch = 32;
  3156. }
  3157. if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
  3158. sbi->s_mb_prefetch = ext4_get_groups_count(sb);
  3159. /*
  3160. * now many real IOs to prefetch within a single allocation at
  3161. * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related
  3162. * optimization we shouldn't try to load too many groups, at some point
  3163. * we should start to use what we've got in memory.
  3164. * with an average random access time 5ms, it'd take a second to get
  3165. * 200 groups (* N with flex_bg), so let's make this limit 4
  3166. */
  3167. sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
  3168. if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
  3169. sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
  3170. return 0;
  3171. err_freebuddy:
  3172. cachep = get_groupinfo_cache(sb->s_blocksize_bits);
  3173. while (i-- > 0) {
  3174. struct ext4_group_info *grp = ext4_get_group_info(sb, i);
  3175. if (grp)
  3176. kmem_cache_free(cachep, grp);
  3177. }
  3178. i = sbi->s_group_info_size;
  3179. rcu_read_lock();
  3180. group_info = rcu_dereference(sbi->s_group_info);
  3181. while (i-- > 0)
  3182. kfree(group_info[i]);
  3183. rcu_read_unlock();
  3184. iput(sbi->s_buddy_cache);
  3185. err_freesgi:
  3186. kvfree(rcu_access_pointer(sbi->s_group_info));
  3187. return -ENOMEM;
  3188. }
  3189. static void ext4_groupinfo_destroy_slabs(void)
  3190. {
  3191. int i;
  3192. for (i = 0; i < NR_GRPINFO_CACHES; i++) {
  3193. kmem_cache_destroy(ext4_groupinfo_caches[i]);
  3194. ext4_groupinfo_caches[i] = NULL;
  3195. }
  3196. }
  3197. static int ext4_groupinfo_create_slab(size_t size)
  3198. {
  3199. static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
  3200. int slab_size;
  3201. int blocksize_bits = order_base_2(size);
  3202. int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
  3203. struct kmem_cache *cachep;
  3204. if (cache_index >= NR_GRPINFO_CACHES)
  3205. return -EINVAL;
  3206. if (unlikely(cache_index < 0))
  3207. cache_index = 0;
  3208. mutex_lock(&ext4_grpinfo_slab_create_mutex);
  3209. if (ext4_groupinfo_caches[cache_index]) {
  3210. mutex_unlock(&ext4_grpinfo_slab_create_mutex);
  3211. return 0; /* Already created */
  3212. }
  3213. slab_size = offsetof(struct ext4_group_info,
  3214. bb_counters[blocksize_bits + 2]);
  3215. cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
  3216. slab_size, 0, SLAB_RECLAIM_ACCOUNT,
  3217. NULL);
  3218. ext4_groupinfo_caches[cache_index] = cachep;
  3219. mutex_unlock(&ext4_grpinfo_slab_create_mutex);
  3220. if (!cachep) {
  3221. printk(KERN_EMERG
  3222. "EXT4-fs: no memory for groupinfo slab cache\n");
  3223. return -ENOMEM;
  3224. }
  3225. return 0;
  3226. }
  3227. static void ext4_discard_work(struct work_struct *work)
  3228. {
  3229. struct ext4_sb_info *sbi = container_of(work,
  3230. struct ext4_sb_info, s_discard_work);
  3231. struct super_block *sb = sbi->s_sb;
  3232. struct ext4_free_data *fd, *nfd;
  3233. struct ext4_buddy e4b;
  3234. LIST_HEAD(discard_list);
  3235. ext4_group_t grp, load_grp;
  3236. int err = 0;
  3237. spin_lock(&sbi->s_md_lock);
  3238. list_splice_init(&sbi->s_discard_list, &discard_list);
  3239. spin_unlock(&sbi->s_md_lock);
  3240. load_grp = UINT_MAX;
  3241. list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
  3242. /*
  3243. * If filesystem is umounting or no memory or suffering
  3244. * from no space, give up the discard
  3245. */
  3246. if ((sb->s_flags & SB_ACTIVE) && !err &&
  3247. !atomic_read(&sbi->s_retry_alloc_pending)) {
  3248. grp = fd->efd_group;
  3249. if (grp != load_grp) {
  3250. if (load_grp != UINT_MAX)
  3251. ext4_mb_unload_buddy(&e4b);
  3252. err = ext4_mb_load_buddy(sb, grp, &e4b);
  3253. if (err) {
  3254. kmem_cache_free(ext4_free_data_cachep, fd);
  3255. load_grp = UINT_MAX;
  3256. continue;
  3257. } else {
  3258. load_grp = grp;
  3259. }
  3260. }
  3261. ext4_lock_group(sb, grp);
  3262. ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
  3263. fd->efd_start_cluster + fd->efd_count - 1, 1);
  3264. ext4_unlock_group(sb, grp);
  3265. }
  3266. kmem_cache_free(ext4_free_data_cachep, fd);
  3267. }
  3268. if (load_grp != UINT_MAX)
  3269. ext4_mb_unload_buddy(&e4b);
  3270. }
  3271. static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi)
  3272. {
  3273. if (!sbi->s_mb_avg_fragment_size)
  3274. return;
  3275. for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
  3276. xa_destroy(&sbi->s_mb_avg_fragment_size[i]);
  3277. kfree(sbi->s_mb_avg_fragment_size);
  3278. sbi->s_mb_avg_fragment_size = NULL;
  3279. }
  3280. static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi)
  3281. {
  3282. if (!sbi->s_mb_largest_free_orders)
  3283. return;
  3284. for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
  3285. xa_destroy(&sbi->s_mb_largest_free_orders[i]);
  3286. kfree(sbi->s_mb_largest_free_orders);
  3287. sbi->s_mb_largest_free_orders = NULL;
  3288. }
  3289. int ext4_mb_init(struct super_block *sb)
  3290. {
  3291. struct ext4_sb_info *sbi = EXT4_SB(sb);
  3292. unsigned i, j;
  3293. unsigned offset, offset_incr;
  3294. unsigned max;
  3295. int ret;
  3296. i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
  3297. sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
  3298. if (sbi->s_mb_offsets == NULL) {
  3299. ret = -ENOMEM;
  3300. goto out;
  3301. }
  3302. i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
  3303. sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
  3304. if (sbi->s_mb_maxs == NULL) {
  3305. ret = -ENOMEM;
  3306. goto out;
  3307. }
  3308. ret = ext4_groupinfo_create_slab(sb->s_blocksize);
  3309. if (ret < 0)
  3310. goto out;
  3311. /* order 0 is regular bitmap */
  3312. sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
  3313. sbi->s_mb_offsets[0] = 0;
  3314. i = 1;
  3315. offset = 0;
  3316. offset_incr = 1 << (sb->s_blocksize_bits - 1);
  3317. max = sb->s_blocksize << 2;
  3318. do {
  3319. sbi->s_mb_offsets[i] = offset;
  3320. sbi->s_mb_maxs[i] = max;
  3321. offset += offset_incr;
  3322. offset_incr = offset_incr >> 1;
  3323. max = max >> 1;
  3324. i++;
  3325. } while (i < MB_NUM_ORDERS(sb));
  3326. sbi->s_mb_avg_fragment_size =
  3327. kmalloc_objs(struct xarray, MB_NUM_ORDERS(sb));
  3328. if (!sbi->s_mb_avg_fragment_size) {
  3329. ret = -ENOMEM;
  3330. goto out;
  3331. }
  3332. for (i = 0; i < MB_NUM_ORDERS(sb); i++)
  3333. xa_init(&sbi->s_mb_avg_fragment_size[i]);
  3334. sbi->s_mb_largest_free_orders =
  3335. kmalloc_objs(struct xarray, MB_NUM_ORDERS(sb));
  3336. if (!sbi->s_mb_largest_free_orders) {
  3337. ret = -ENOMEM;
  3338. goto out;
  3339. }
  3340. for (i = 0; i < MB_NUM_ORDERS(sb); i++)
  3341. xa_init(&sbi->s_mb_largest_free_orders[i]);
  3342. spin_lock_init(&sbi->s_md_lock);
  3343. atomic_set(&sbi->s_mb_free_pending, 0);
  3344. INIT_LIST_HEAD(&sbi->s_freed_data_list[0]);
  3345. INIT_LIST_HEAD(&sbi->s_freed_data_list[1]);
  3346. INIT_LIST_HEAD(&sbi->s_discard_list);
  3347. INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
  3348. atomic_set(&sbi->s_retry_alloc_pending, 0);
  3349. sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
  3350. sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
  3351. sbi->s_mb_stats = MB_DEFAULT_STATS;
  3352. sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
  3353. sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
  3354. sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;
  3355. /*
  3356. * The default group preallocation is 512, which for 4k block
  3357. * sizes translates to 2 megabytes. However for bigalloc file
  3358. * systems, this is probably too big (i.e, if the cluster size
  3359. * is 1 megabyte, then group preallocation size becomes half a
  3360. * gigabyte!). As a default, we will keep a two megabyte
  3361. * group pralloc size for cluster sizes up to 64k, and after
  3362. * that, we will force a minimum group preallocation size of
  3363. * 32 clusters. This translates to 8 megs when the cluster
  3364. * size is 256k, and 32 megs when the cluster size is 1 meg,
  3365. * which seems reasonable as a default.
  3366. */
  3367. sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
  3368. sbi->s_cluster_bits, 32);
  3369. /*
  3370. * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
  3371. * to the lowest multiple of s_stripe which is bigger than
  3372. * the s_mb_group_prealloc as determined above. We want
  3373. * the preallocation size to be an exact multiple of the
  3374. * RAID stripe size so that preallocations don't fragment
  3375. * the stripes.
  3376. */
  3377. if (sbi->s_stripe > 1) {
  3378. sbi->s_mb_group_prealloc = roundup(
  3379. sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe));
  3380. }
  3381. sbi->s_mb_nr_global_goals = umin(num_possible_cpus(),
  3382. DIV_ROUND_UP(sbi->s_groups_count, 4));
  3383. sbi->s_mb_last_groups = kzalloc_objs(ext4_group_t,
  3384. sbi->s_mb_nr_global_goals);
  3385. if (sbi->s_mb_last_groups == NULL) {
  3386. ret = -ENOMEM;
  3387. goto out;
  3388. }
  3389. sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
  3390. if (sbi->s_locality_groups == NULL) {
  3391. ret = -ENOMEM;
  3392. goto out_free_last_groups;
  3393. }
  3394. for_each_possible_cpu(i) {
  3395. struct ext4_locality_group *lg;
  3396. lg = per_cpu_ptr(sbi->s_locality_groups, i);
  3397. mutex_init(&lg->lg_mutex);
  3398. for (j = 0; j < PREALLOC_TB_SIZE; j++)
  3399. INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
  3400. spin_lock_init(&lg->lg_prealloc_lock);
  3401. }
  3402. if (bdev_nonrot(sb->s_bdev))
  3403. sbi->s_mb_max_linear_groups = 0;
  3404. else
  3405. sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
  3406. /* init file for buddy data */
  3407. ret = ext4_mb_init_backend(sb);
  3408. if (ret != 0)
  3409. goto out_free_locality_groups;
  3410. return 0;
  3411. out_free_locality_groups:
  3412. free_percpu(sbi->s_locality_groups);
  3413. sbi->s_locality_groups = NULL;
  3414. out_free_last_groups:
  3415. kfree(sbi->s_mb_last_groups);
  3416. sbi->s_mb_last_groups = NULL;
  3417. out:
  3418. ext4_mb_avg_fragment_size_destroy(sbi);
  3419. ext4_mb_largest_free_orders_destroy(sbi);
  3420. kfree(sbi->s_mb_offsets);
  3421. sbi->s_mb_offsets = NULL;
  3422. kfree(sbi->s_mb_maxs);
  3423. sbi->s_mb_maxs = NULL;
  3424. return ret;
  3425. }
  3426. /* need to called with the ext4 group lock held */
  3427. static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
  3428. {
  3429. struct ext4_prealloc_space *pa;
  3430. struct list_head *cur, *tmp;
  3431. int count = 0;
  3432. list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
  3433. pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
  3434. list_del(&pa->pa_group_list);
  3435. count++;
  3436. kmem_cache_free(ext4_pspace_cachep, pa);
  3437. }
  3438. return count;
  3439. }
  3440. void ext4_mb_release(struct super_block *sb)
  3441. {
  3442. ext4_group_t ngroups = ext4_get_groups_count(sb);
  3443. ext4_group_t i;
  3444. int num_meta_group_infos;
  3445. struct ext4_group_info *grinfo, ***group_info;
  3446. struct ext4_sb_info *sbi = EXT4_SB(sb);
  3447. struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
  3448. int count;
  3449. /*
  3450. * wait the discard work to drain all of ext4_free_data
  3451. */
  3452. flush_work(&sbi->s_discard_work);
  3453. WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
  3454. group_info = rcu_access_pointer(sbi->s_group_info);
  3455. if (group_info) {
  3456. for (i = 0; i < ngroups; i++) {
  3457. cond_resched();
  3458. grinfo = ext4_get_group_info(sb, i);
  3459. if (!grinfo)
  3460. continue;
  3461. mb_group_bb_bitmap_free(grinfo);
  3462. ext4_lock_group(sb, i);
  3463. count = ext4_mb_cleanup_pa(grinfo);
  3464. if (count)
  3465. mb_debug(sb, "mballoc: %d PAs left\n",
  3466. count);
  3467. ext4_unlock_group(sb, i);
  3468. kmem_cache_free(cachep, grinfo);
  3469. }
  3470. num_meta_group_infos = (ngroups +
  3471. EXT4_DESC_PER_BLOCK(sb) - 1) >>
  3472. EXT4_DESC_PER_BLOCK_BITS(sb);
  3473. for (i = 0; i < num_meta_group_infos; i++)
  3474. kfree(group_info[i]);
  3475. kvfree(group_info);
  3476. }
  3477. ext4_mb_avg_fragment_size_destroy(sbi);
  3478. ext4_mb_largest_free_orders_destroy(sbi);
  3479. kfree(sbi->s_mb_offsets);
  3480. kfree(sbi->s_mb_maxs);
  3481. iput(sbi->s_buddy_cache);
  3482. if (sbi->s_mb_stats) {
  3483. ext4_msg(sb, KERN_INFO,
  3484. "mballoc: %u blocks %u reqs (%u success)",
  3485. atomic_read(&sbi->s_bal_allocated),
  3486. atomic_read(&sbi->s_bal_reqs),
  3487. atomic_read(&sbi->s_bal_success));
  3488. ext4_msg(sb, KERN_INFO,
  3489. "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
  3490. "%u 2^N hits, %u breaks, %u lost",
  3491. atomic_read(&sbi->s_bal_ex_scanned),
  3492. atomic_read(&sbi->s_bal_groups_scanned),
  3493. atomic_read(&sbi->s_bal_goals),
  3494. atomic_read(&sbi->s_bal_2orders),
  3495. atomic_read(&sbi->s_bal_breaks),
  3496. atomic_read(&sbi->s_mb_lost_chunks));
  3497. ext4_msg(sb, KERN_INFO,
  3498. "mballoc: %u generated and it took %llu",
  3499. atomic_read(&sbi->s_mb_buddies_generated),
  3500. atomic64_read(&sbi->s_mb_generation_time));
  3501. ext4_msg(sb, KERN_INFO,
  3502. "mballoc: %u preallocated, %u discarded",
  3503. atomic_read(&sbi->s_mb_preallocated),
  3504. atomic_read(&sbi->s_mb_discarded));
  3505. }
  3506. free_percpu(sbi->s_locality_groups);
  3507. kfree(sbi->s_mb_last_groups);
  3508. }
  3509. static inline int ext4_issue_discard(struct super_block *sb,
  3510. ext4_group_t block_group, ext4_grpblk_t cluster, int count)
  3511. {
  3512. ext4_fsblk_t discard_block;
  3513. discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
  3514. ext4_group_first_block_no(sb, block_group));
  3515. count = EXT4_C2B(EXT4_SB(sb), count);
  3516. trace_ext4_discard_blocks(sb,
  3517. (unsigned long long) discard_block, count);
  3518. return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
  3519. }
  3520. static void ext4_free_data_in_buddy(struct super_block *sb,
  3521. struct ext4_free_data *entry)
  3522. {
  3523. struct ext4_buddy e4b;
  3524. struct ext4_group_info *db;
  3525. int err, count = 0;
  3526. mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
  3527. entry->efd_count, entry->efd_group, entry);
  3528. err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
  3529. /* we expect to find existing buddy because it's pinned */
  3530. BUG_ON(err != 0);
  3531. atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
  3532. db = e4b.bd_info;
  3533. /* there are blocks to put in buddy to make them really free */
  3534. count += entry->efd_count;
  3535. ext4_lock_group(sb, entry->efd_group);
  3536. /* Take it out of per group rb tree */
  3537. rb_erase(&entry->efd_node, &(db->bb_free_root));
  3538. mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
  3539. /*
  3540. * Clear the trimmed flag for the group so that the next
  3541. * ext4_trim_fs can trim it.
  3542. */
  3543. EXT4_MB_GRP_CLEAR_TRIMMED(db);
  3544. if (!db->bb_free_root.rb_node) {
  3545. /* No more items in the per group rb tree
  3546. * balance refcounts from ext4_mb_free_metadata()
  3547. */
  3548. folio_put(e4b.bd_buddy_folio);
  3549. folio_put(e4b.bd_bitmap_folio);
  3550. }
  3551. ext4_unlock_group(sb, entry->efd_group);
  3552. ext4_mb_unload_buddy(&e4b);
  3553. mb_debug(sb, "freed %d blocks in 1 structures\n", count);
  3554. }
  3555. /*
  3556. * This function is called by the jbd2 layer once the commit has finished,
  3557. * so we know we can free the blocks that were released with that commit.
  3558. */
  3559. void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
  3560. {
  3561. struct ext4_sb_info *sbi = EXT4_SB(sb);
  3562. struct ext4_free_data *entry, *tmp;
  3563. LIST_HEAD(freed_data_list);
  3564. struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1];
  3565. bool wake;
  3566. list_replace_init(s_freed_head, &freed_data_list);
  3567. list_for_each_entry(entry, &freed_data_list, efd_list)
  3568. ext4_free_data_in_buddy(sb, entry);
  3569. if (test_opt(sb, DISCARD)) {
  3570. spin_lock(&sbi->s_md_lock);
  3571. wake = list_empty(&sbi->s_discard_list);
  3572. list_splice_tail(&freed_data_list, &sbi->s_discard_list);
  3573. spin_unlock(&sbi->s_md_lock);
  3574. if (wake)
  3575. queue_work(system_dfl_wq, &sbi->s_discard_work);
  3576. } else {
  3577. list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
  3578. kmem_cache_free(ext4_free_data_cachep, entry);
  3579. }
  3580. }
  3581. int __init ext4_init_mballoc(void)
  3582. {
  3583. ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
  3584. SLAB_RECLAIM_ACCOUNT);
  3585. if (ext4_pspace_cachep == NULL)
  3586. goto out;
  3587. ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
  3588. SLAB_RECLAIM_ACCOUNT);
  3589. if (ext4_ac_cachep == NULL)
  3590. goto out_pa_free;
  3591. ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
  3592. SLAB_RECLAIM_ACCOUNT);
  3593. if (ext4_free_data_cachep == NULL)
  3594. goto out_ac_free;
  3595. return 0;
  3596. out_ac_free:
  3597. kmem_cache_destroy(ext4_ac_cachep);
  3598. out_pa_free:
  3599. kmem_cache_destroy(ext4_pspace_cachep);
  3600. out:
  3601. return -ENOMEM;
  3602. }
  3603. void ext4_exit_mballoc(void)
  3604. {
  3605. /*
  3606. * Wait for completion of call_rcu()'s on ext4_pspace_cachep
  3607. * before destroying the slab cache.
  3608. */
  3609. rcu_barrier();
  3610. kmem_cache_destroy(ext4_pspace_cachep);
  3611. kmem_cache_destroy(ext4_ac_cachep);
  3612. kmem_cache_destroy(ext4_free_data_cachep);
  3613. ext4_groupinfo_destroy_slabs();
  3614. }
  3615. #define EXT4_MB_BITMAP_MARKED_CHECK 0x0001
  3616. #define EXT4_MB_SYNC_UPDATE 0x0002
  3617. int
  3618. ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
  3619. ext4_group_t group, ext4_grpblk_t blkoff,
  3620. ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed)
  3621. {
  3622. struct ext4_sb_info *sbi = EXT4_SB(sb);
  3623. struct buffer_head *bitmap_bh = NULL;
  3624. struct ext4_group_desc *gdp;
  3625. struct buffer_head *gdp_bh;
  3626. int err;
  3627. unsigned int i, already, changed = len;
  3628. KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context,
  3629. handle, sb, state, group, blkoff, len,
  3630. flags, ret_changed);
  3631. if (ret_changed)
  3632. *ret_changed = 0;
  3633. bitmap_bh = ext4_read_block_bitmap(sb, group);
  3634. if (IS_ERR(bitmap_bh))
  3635. return PTR_ERR(bitmap_bh);
  3636. if (handle) {
  3637. BUFFER_TRACE(bitmap_bh, "getting write access");
  3638. err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
  3639. EXT4_JTR_NONE);
  3640. if (err)
  3641. goto out_err;
  3642. }
  3643. err = -EIO;
  3644. gdp = ext4_get_group_desc(sb, group, &gdp_bh);
  3645. if (!gdp)
  3646. goto out_err;
  3647. if (handle) {
  3648. BUFFER_TRACE(gdp_bh, "get_write_access");
  3649. err = ext4_journal_get_write_access(handle, sb, gdp_bh,
  3650. EXT4_JTR_NONE);
  3651. if (err)
  3652. goto out_err;
  3653. }
  3654. ext4_lock_group(sb, group);
  3655. if (ext4_has_group_desc_csum(sb) &&
  3656. (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
  3657. gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
  3658. ext4_free_group_clusters_set(sb, gdp,
  3659. ext4_free_clusters_after_init(sb, group, gdp));
  3660. }
  3661. if (flags & EXT4_MB_BITMAP_MARKED_CHECK) {
  3662. already = 0;
  3663. for (i = 0; i < len; i++)
  3664. if (mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
  3665. state)
  3666. already++;
  3667. changed = len - already;
  3668. }
  3669. if (state) {
  3670. mb_set_bits(bitmap_bh->b_data, blkoff, len);
  3671. ext4_free_group_clusters_set(sb, gdp,
  3672. ext4_free_group_clusters(sb, gdp) - changed);
  3673. } else {
  3674. mb_clear_bits(bitmap_bh->b_data, blkoff, len);
  3675. ext4_free_group_clusters_set(sb, gdp,
  3676. ext4_free_group_clusters(sb, gdp) + changed);
  3677. }
  3678. ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
  3679. ext4_group_desc_csum_set(sb, group, gdp);
  3680. ext4_unlock_group(sb, group);
  3681. if (ret_changed)
  3682. *ret_changed = changed;
  3683. if (sbi->s_log_groups_per_flex) {
  3684. ext4_group_t flex_group = ext4_flex_group(sbi, group);
  3685. struct flex_groups *fg = sbi_array_rcu_deref(sbi,
  3686. s_flex_groups, flex_group);
  3687. if (state)
  3688. atomic64_sub(changed, &fg->free_clusters);
  3689. else
  3690. atomic64_add(changed, &fg->free_clusters);
  3691. }
  3692. err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
  3693. if (err)
  3694. goto out_err;
  3695. err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
  3696. if (err)
  3697. goto out_err;
  3698. if (flags & EXT4_MB_SYNC_UPDATE) {
  3699. sync_dirty_buffer(bitmap_bh);
  3700. sync_dirty_buffer(gdp_bh);
  3701. }
  3702. out_err:
  3703. brelse(bitmap_bh);
  3704. return err;
  3705. }
  3706. /*
  3707. * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
  3708. * Returns 0 if success or error code
  3709. */
  3710. static noinline_for_stack int
  3711. ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle)
  3712. {
  3713. struct ext4_group_desc *gdp;
  3714. struct ext4_sb_info *sbi;
  3715. struct super_block *sb;
  3716. ext4_fsblk_t block;
  3717. int err, len;
  3718. int flags = 0;
  3719. ext4_grpblk_t changed;
  3720. BUG_ON(ac->ac_status != AC_STATUS_FOUND);
  3721. BUG_ON(ac->ac_b_ex.fe_len <= 0);
  3722. sb = ac->ac_sb;
  3723. sbi = EXT4_SB(sb);
  3724. gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL);
  3725. if (!gdp)
  3726. return -EIO;
  3727. ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
  3728. ext4_free_group_clusters(sb, gdp));
  3729. block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
  3730. len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
  3731. if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
  3732. ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
  3733. "fs metadata", block, block+len);
  3734. /* File system mounted not to panic on error
  3735. * Fix the bitmap and return EFSCORRUPTED
  3736. * We leak some of the blocks here.
  3737. */
  3738. err = ext4_mb_mark_context(handle, sb, true,
  3739. ac->ac_b_ex.fe_group,
  3740. ac->ac_b_ex.fe_start,
  3741. ac->ac_b_ex.fe_len,
  3742. 0, NULL);
  3743. if (!err)
  3744. err = -EFSCORRUPTED;
  3745. return err;
  3746. }
  3747. #ifdef AGGRESSIVE_CHECK
  3748. flags |= EXT4_MB_BITMAP_MARKED_CHECK;
  3749. #endif
  3750. err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group,
  3751. ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len,
  3752. flags, &changed);
  3753. if (err && changed == 0)
  3754. return err;
  3755. #ifdef AGGRESSIVE_CHECK
  3756. BUG_ON(changed != ac->ac_b_ex.fe_len);
  3757. #endif
  3758. percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
  3759. return err;
  3760. }
  3761. /*
  3762. * Idempotent helper for Ext4 fast commit replay path to set the state of
  3763. * blocks in bitmaps and update counters.
  3764. */
  3765. void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
  3766. int len, bool state)
  3767. {
  3768. struct ext4_sb_info *sbi = EXT4_SB(sb);
  3769. ext4_group_t group;
  3770. ext4_grpblk_t blkoff;
  3771. int err = 0;
  3772. unsigned int clen, thisgrp_len;
  3773. while (len > 0) {
  3774. ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
  3775. /*
  3776. * Check to see if we are freeing blocks across a group
  3777. * boundary.
  3778. * In case of flex_bg, this can happen that (block, len) may
  3779. * span across more than one group. In that case we need to
  3780. * get the corresponding group metadata to work with.
  3781. * For this we have goto again loop.
  3782. */
  3783. thisgrp_len = min(len, EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
  3784. clen = EXT4_NUM_B2C(sbi, thisgrp_len);
  3785. if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
  3786. ext4_error(sb, "Marking blocks in system zone - "
  3787. "Block = %llu, len = %u",
  3788. block, thisgrp_len);
  3789. break;
  3790. }
  3791. err = ext4_mb_mark_context(NULL, sb, state,
  3792. group, blkoff, clen,
  3793. EXT4_MB_BITMAP_MARKED_CHECK |
  3794. EXT4_MB_SYNC_UPDATE,
  3795. NULL);
  3796. if (err)
  3797. break;
  3798. block += thisgrp_len;
  3799. len -= thisgrp_len;
  3800. BUG_ON(len < 0);
  3801. }
  3802. }
  3803. /*
  3804. * here we normalize request for locality group
  3805. * Group request are normalized to s_mb_group_prealloc, which goes to
  3806. * s_strip if we set the same via mount option.
  3807. * s_mb_group_prealloc can be configured via
  3808. * /sys/fs/ext4/<partition>/mb_group_prealloc
  3809. *
  3810. * XXX: should we try to preallocate more than the group has now?
  3811. */
  3812. static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
  3813. {
  3814. struct super_block *sb = ac->ac_sb;
  3815. struct ext4_locality_group *lg = ac->ac_lg;
  3816. BUG_ON(lg == NULL);
  3817. ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
  3818. mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
  3819. }
  3820. /*
  3821. * This function returns the next element to look at during inode
  3822. * PA rbtree walk. We assume that we have held the inode PA rbtree lock
  3823. * (ei->i_prealloc_lock)
  3824. *
  3825. * new_start The start of the range we want to compare
  3826. * cur_start The existing start that we are comparing against
  3827. * node The node of the rb_tree
  3828. */
  3829. static inline struct rb_node*
  3830. ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node)
  3831. {
  3832. if (new_start < cur_start)
  3833. return node->rb_left;
  3834. else
  3835. return node->rb_right;
  3836. }
  3837. static inline void
  3838. ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
  3839. ext4_lblk_t start, loff_t end)
  3840. {
  3841. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  3842. struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
  3843. struct ext4_prealloc_space *tmp_pa;
  3844. ext4_lblk_t tmp_pa_start;
  3845. loff_t tmp_pa_end;
  3846. struct rb_node *iter;
  3847. read_lock(&ei->i_prealloc_lock);
  3848. for (iter = ei->i_prealloc_node.rb_node; iter;
  3849. iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) {
  3850. tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
  3851. pa_node.inode_node);
  3852. tmp_pa_start = tmp_pa->pa_lstart;
  3853. tmp_pa_end = pa_logical_end(sbi, tmp_pa);
  3854. spin_lock(&tmp_pa->pa_lock);
  3855. if (tmp_pa->pa_deleted == 0)
  3856. BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
  3857. spin_unlock(&tmp_pa->pa_lock);
  3858. }
  3859. read_unlock(&ei->i_prealloc_lock);
  3860. }
  3861. /*
  3862. * Given an allocation context "ac" and a range "start", "end", check
  3863. * and adjust boundaries if the range overlaps with any of the existing
  3864. * preallocatoins stored in the corresponding inode of the allocation context.
  3865. *
  3866. * Parameters:
  3867. * ac allocation context
  3868. * start start of the new range
  3869. * end end of the new range
  3870. */
  3871. static inline void
  3872. ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
  3873. ext4_lblk_t *start, loff_t *end)
  3874. {
  3875. struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
  3876. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  3877. struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL;
  3878. struct rb_node *iter;
  3879. ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1;
  3880. loff_t new_end, tmp_pa_end, left_pa_end = -1;
  3881. new_start = *start;
  3882. new_end = *end;
  3883. /*
  3884. * Adjust the normalized range so that it doesn't overlap with any
  3885. * existing preallocated blocks(PAs). Make sure to hold the rbtree lock
  3886. * so it doesn't change underneath us.
  3887. */
  3888. read_lock(&ei->i_prealloc_lock);
  3889. /* Step 1: find any one immediate neighboring PA of the normalized range */
  3890. for (iter = ei->i_prealloc_node.rb_node; iter;
  3891. iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
  3892. tmp_pa_start, iter)) {
  3893. tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
  3894. pa_node.inode_node);
  3895. tmp_pa_start = tmp_pa->pa_lstart;
  3896. tmp_pa_end = pa_logical_end(sbi, tmp_pa);
  3897. /* PA must not overlap original request */
  3898. spin_lock(&tmp_pa->pa_lock);
  3899. if (tmp_pa->pa_deleted == 0)
  3900. BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
  3901. ac->ac_o_ex.fe_logical < tmp_pa_start));
  3902. spin_unlock(&tmp_pa->pa_lock);
  3903. }
  3904. /*
  3905. * Step 2: check if the found PA is left or right neighbor and
  3906. * get the other neighbor
  3907. */
  3908. if (tmp_pa) {
  3909. if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) {
  3910. struct rb_node *tmp;
  3911. left_pa = tmp_pa;
  3912. tmp = rb_next(&left_pa->pa_node.inode_node);
  3913. if (tmp) {
  3914. right_pa = rb_entry(tmp,
  3915. struct ext4_prealloc_space,
  3916. pa_node.inode_node);
  3917. }
  3918. } else {
  3919. struct rb_node *tmp;
  3920. right_pa = tmp_pa;
  3921. tmp = rb_prev(&right_pa->pa_node.inode_node);
  3922. if (tmp) {
  3923. left_pa = rb_entry(tmp,
  3924. struct ext4_prealloc_space,
  3925. pa_node.inode_node);
  3926. }
  3927. }
  3928. }
  3929. /* Step 3: get the non deleted neighbors */
  3930. if (left_pa) {
  3931. for (iter = &left_pa->pa_node.inode_node;;
  3932. iter = rb_prev(iter)) {
  3933. if (!iter) {
  3934. left_pa = NULL;
  3935. break;
  3936. }
  3937. tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
  3938. pa_node.inode_node);
  3939. left_pa = tmp_pa;
  3940. spin_lock(&tmp_pa->pa_lock);
  3941. if (tmp_pa->pa_deleted == 0) {
  3942. spin_unlock(&tmp_pa->pa_lock);
  3943. break;
  3944. }
  3945. spin_unlock(&tmp_pa->pa_lock);
  3946. }
  3947. }
  3948. if (right_pa) {
  3949. for (iter = &right_pa->pa_node.inode_node;;
  3950. iter = rb_next(iter)) {
  3951. if (!iter) {
  3952. right_pa = NULL;
  3953. break;
  3954. }
  3955. tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
  3956. pa_node.inode_node);
  3957. right_pa = tmp_pa;
  3958. spin_lock(&tmp_pa->pa_lock);
  3959. if (tmp_pa->pa_deleted == 0) {
  3960. spin_unlock(&tmp_pa->pa_lock);
  3961. break;
  3962. }
  3963. spin_unlock(&tmp_pa->pa_lock);
  3964. }
  3965. }
  3966. if (left_pa) {
  3967. left_pa_end = pa_logical_end(sbi, left_pa);
  3968. BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical);
  3969. }
  3970. if (right_pa) {
  3971. right_pa_start = right_pa->pa_lstart;
  3972. BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical);
  3973. }
  3974. /* Step 4: trim our normalized range to not overlap with the neighbors */
  3975. if (left_pa) {
  3976. if (left_pa_end > new_start)
  3977. new_start = left_pa_end;
  3978. }
  3979. if (right_pa) {
  3980. if (right_pa_start < new_end)
  3981. new_end = right_pa_start;
  3982. }
  3983. read_unlock(&ei->i_prealloc_lock);
  3984. /* XXX: extra loop to check we really don't overlap preallocations */
  3985. ext4_mb_pa_assert_overlap(ac, new_start, new_end);
  3986. *start = new_start;
  3987. *end = new_end;
  3988. }
  3989. /*
  3990. * Normalization means making request better in terms of
  3991. * size and alignment
  3992. */
  3993. static noinline_for_stack void
  3994. ext4_mb_normalize_request(struct ext4_allocation_context *ac,
  3995. struct ext4_allocation_request *ar)
  3996. {
  3997. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  3998. struct ext4_super_block *es = sbi->s_es;
  3999. int bsbits, max;
  4000. loff_t size, start_off, end;
  4001. loff_t orig_size __maybe_unused;
  4002. ext4_lblk_t start;
  4003. /* do normalize only data requests, metadata requests
  4004. do not need preallocation */
  4005. if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
  4006. return;
  4007. /* sometime caller may want exact blocks */
  4008. if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
  4009. return;
  4010. /* caller may indicate that preallocation isn't
  4011. * required (it's a tail, for example) */
  4012. if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
  4013. return;
  4014. if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
  4015. ext4_mb_normalize_group_request(ac);
  4016. return ;
  4017. }
  4018. bsbits = ac->ac_sb->s_blocksize_bits;
  4019. /* first, let's learn actual file size
  4020. * given current request is allocated */
  4021. size = extent_logical_end(sbi, &ac->ac_o_ex);
  4022. size = size << bsbits;
  4023. if (size < i_size_read(ac->ac_inode))
  4024. size = i_size_read(ac->ac_inode);
  4025. orig_size = size;
  4026. /* max size of free chunks */
  4027. max = 2 << bsbits;
  4028. #define NRL_CHECK_SIZE(req, size, max, chunk_size) \
  4029. (req <= (size) || max <= (chunk_size))
  4030. /* first, try to predict filesize */
  4031. /* XXX: should this table be tunable? */
  4032. start_off = 0;
  4033. if (size <= 16 * 1024) {
  4034. size = 16 * 1024;
  4035. } else if (size <= 32 * 1024) {
  4036. size = 32 * 1024;
  4037. } else if (size <= 64 * 1024) {
  4038. size = 64 * 1024;
  4039. } else if (size <= 128 * 1024) {
  4040. size = 128 * 1024;
  4041. } else if (size <= 256 * 1024) {
  4042. size = 256 * 1024;
  4043. } else if (size <= 512 * 1024) {
  4044. size = 512 * 1024;
  4045. } else if (size <= 1024 * 1024) {
  4046. size = 1024 * 1024;
  4047. } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
  4048. start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
  4049. (21 - bsbits)) << 21;
  4050. size = 2 * 1024 * 1024;
  4051. } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
  4052. start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
  4053. (22 - bsbits)) << 22;
  4054. size = 4 * 1024 * 1024;
  4055. } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
  4056. (8<<20)>>bsbits, max, 8 * 1024)) {
  4057. start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
  4058. (23 - bsbits)) << 23;
  4059. size = 8 * 1024 * 1024;
  4060. } else {
  4061. start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
  4062. size = (loff_t) EXT4_C2B(sbi,
  4063. ac->ac_o_ex.fe_len) << bsbits;
  4064. }
  4065. size = size >> bsbits;
  4066. start = start_off >> bsbits;
  4067. /*
  4068. * For tiny groups (smaller than 8MB) the chosen allocation
  4069. * alignment may be larger than group size. Make sure the
  4070. * alignment does not move allocation to a different group which
  4071. * makes mballoc fail assertions later.
  4072. */
  4073. start = max(start, rounddown(ac->ac_o_ex.fe_logical,
  4074. (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
  4075. /* avoid unnecessary preallocation that may trigger assertions */
  4076. if (start + size > EXT_MAX_BLOCKS)
  4077. size = EXT_MAX_BLOCKS - start;
  4078. /* don't cover already allocated blocks in selected range */
  4079. if (ar->pleft && start <= ar->lleft) {
  4080. size -= ar->lleft + 1 - start;
  4081. start = ar->lleft + 1;
  4082. }
  4083. if (ar->pright && start + size - 1 >= ar->lright)
  4084. size -= start + size - ar->lright;
  4085. /*
  4086. * Trim allocation request for filesystems with artificially small
  4087. * groups.
  4088. */
  4089. if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
  4090. size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
  4091. end = start + size;
  4092. ext4_mb_pa_adjust_overlap(ac, &start, &end);
  4093. size = end - start;
  4094. /*
  4095. * In this function "start" and "size" are normalized for better
  4096. * alignment and length such that we could preallocate more blocks.
  4097. * This normalization is done such that original request of
  4098. * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
  4099. * "size" boundaries.
  4100. * (Note fe_len can be relaxed since FS block allocation API does not
  4101. * provide gurantee on number of contiguous blocks allocation since that
  4102. * depends upon free space left, etc).
  4103. * In case of inode pa, later we use the allocated blocks
  4104. * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
  4105. * range of goal/best blocks [start, size] to put it at the
  4106. * ac_o_ex.fe_logical extent of this inode.
  4107. * (See ext4_mb_use_inode_pa() for more details)
  4108. */
  4109. if (start + size <= ac->ac_o_ex.fe_logical ||
  4110. start > ac->ac_o_ex.fe_logical) {
  4111. ext4_msg(ac->ac_sb, KERN_ERR,
  4112. "start %lu, size %lu, fe_logical %lu",
  4113. (unsigned long) start, (unsigned long) size,
  4114. (unsigned long) ac->ac_o_ex.fe_logical);
  4115. BUG();
  4116. }
  4117. BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
  4118. /* now prepare goal request */
  4119. /* XXX: is it better to align blocks WRT to logical
  4120. * placement or satisfy big request as is */
  4121. ac->ac_g_ex.fe_logical = start;
  4122. ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
  4123. ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
  4124. /* define goal start in order to merge */
  4125. if (ar->pright && (ar->lright == (start + size)) &&
  4126. ar->pright >= size &&
  4127. ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
  4128. /* merge to the right */
  4129. ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
  4130. &ac->ac_g_ex.fe_group,
  4131. &ac->ac_g_ex.fe_start);
  4132. ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
  4133. }
  4134. if (ar->pleft && (ar->lleft + 1 == start) &&
  4135. ar->pleft + 1 < ext4_blocks_count(es)) {
  4136. /* merge to the left */
  4137. ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
  4138. &ac->ac_g_ex.fe_group,
  4139. &ac->ac_g_ex.fe_start);
  4140. ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
  4141. }
  4142. mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
  4143. orig_size, start);
  4144. }
  4145. static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
  4146. {
  4147. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  4148. if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
  4149. atomic_inc(&sbi->s_bal_reqs);
  4150. atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
  4151. if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
  4152. atomic_inc(&sbi->s_bal_success);
  4153. atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
  4154. for (int i=0; i<EXT4_MB_NUM_CRS; i++) {
  4155. atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]);
  4156. }
  4157. atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
  4158. if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
  4159. ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
  4160. atomic_inc(&sbi->s_bal_goals);
  4161. /* did we allocate as much as normalizer originally wanted? */
  4162. if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len)
  4163. atomic_inc(&sbi->s_bal_len_goals);
  4164. if (ac->ac_found > sbi->s_mb_max_to_scan)
  4165. atomic_inc(&sbi->s_bal_breaks);
  4166. }
  4167. if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
  4168. trace_ext4_mballoc_alloc(ac);
  4169. else
  4170. trace_ext4_mballoc_prealloc(ac);
  4171. }
  4172. /*
  4173. * Called on failure; free up any blocks from the inode PA for this
  4174. * context. We don't need this for MB_GROUP_PA because we only change
  4175. * pa_free in ext4_mb_release_context(), but on failure, we've already
  4176. * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
  4177. */
  4178. static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
  4179. {
  4180. struct ext4_prealloc_space *pa = ac->ac_pa;
  4181. struct ext4_buddy e4b;
  4182. int err;
  4183. if (pa == NULL) {
  4184. if (ac->ac_f_ex.fe_len == 0)
  4185. return;
  4186. err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
  4187. if (WARN_RATELIMIT(err,
  4188. "ext4: mb_load_buddy failed (%d)", err))
  4189. /*
  4190. * This should never happen since we pin the
  4191. * folios in the ext4_allocation_context so
  4192. * ext4_mb_load_buddy() should never fail.
  4193. */
  4194. return;
  4195. ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
  4196. mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
  4197. ac->ac_f_ex.fe_len);
  4198. ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
  4199. ext4_mb_unload_buddy(&e4b);
  4200. return;
  4201. }
  4202. if (pa->pa_type == MB_INODE_PA) {
  4203. spin_lock(&pa->pa_lock);
  4204. pa->pa_free += ac->ac_b_ex.fe_len;
  4205. spin_unlock(&pa->pa_lock);
  4206. }
  4207. }
  4208. /*
  4209. * use blocks preallocated to inode
  4210. */
  4211. static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
  4212. struct ext4_prealloc_space *pa)
  4213. {
  4214. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  4215. ext4_fsblk_t start;
  4216. ext4_fsblk_t end;
  4217. int len;
  4218. /* found preallocated blocks, use them */
  4219. start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
  4220. end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
  4221. start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
  4222. len = EXT4_NUM_B2C(sbi, end - start);
  4223. ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
  4224. &ac->ac_b_ex.fe_start);
  4225. ac->ac_b_ex.fe_len = len;
  4226. ac->ac_status = AC_STATUS_FOUND;
  4227. ac->ac_pa = pa;
  4228. BUG_ON(start < pa->pa_pstart);
  4229. BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
  4230. BUG_ON(pa->pa_free < len);
  4231. BUG_ON(ac->ac_b_ex.fe_len <= 0);
  4232. pa->pa_free -= len;
  4233. mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
  4234. }
  4235. /*
  4236. * use blocks preallocated to locality group
  4237. */
  4238. static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
  4239. struct ext4_prealloc_space *pa)
  4240. {
  4241. unsigned int len = ac->ac_o_ex.fe_len;
  4242. ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
  4243. &ac->ac_b_ex.fe_group,
  4244. &ac->ac_b_ex.fe_start);
  4245. ac->ac_b_ex.fe_len = len;
  4246. ac->ac_status = AC_STATUS_FOUND;
  4247. ac->ac_pa = pa;
  4248. /* we don't correct pa_pstart or pa_len here to avoid
  4249. * possible race when the group is being loaded concurrently
  4250. * instead we correct pa later, after blocks are marked
  4251. * in on-disk bitmap -- see ext4_mb_release_context()
  4252. * Other CPUs are prevented from allocating from this pa by lg_mutex
  4253. */
  4254. mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
  4255. pa->pa_lstart, len, pa);
  4256. }
  4257. /*
  4258. * Return the prealloc space that have minimal distance
  4259. * from the goal block. @cpa is the prealloc
  4260. * space that is having currently known minimal distance
  4261. * from the goal block.
  4262. */
  4263. static struct ext4_prealloc_space *
  4264. ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
  4265. struct ext4_prealloc_space *pa,
  4266. struct ext4_prealloc_space *cpa)
  4267. {
  4268. ext4_fsblk_t cur_distance, new_distance;
  4269. if (cpa == NULL) {
  4270. atomic_inc(&pa->pa_count);
  4271. return pa;
  4272. }
  4273. cur_distance = abs(goal_block - cpa->pa_pstart);
  4274. new_distance = abs(goal_block - pa->pa_pstart);
  4275. if (cur_distance <= new_distance)
  4276. return cpa;
  4277. /* drop the previous reference */
  4278. atomic_dec(&cpa->pa_count);
  4279. atomic_inc(&pa->pa_count);
  4280. return pa;
  4281. }
  4282. /*
  4283. * check if found pa meets EXT4_MB_HINT_GOAL_ONLY
  4284. */
  4285. static bool
  4286. ext4_mb_pa_goal_check(struct ext4_allocation_context *ac,
  4287. struct ext4_prealloc_space *pa)
  4288. {
  4289. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  4290. ext4_fsblk_t start;
  4291. if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)))
  4292. return true;
  4293. /*
  4294. * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted
  4295. * in ext4_mb_normalize_request and will keep same with ac_o_ex
  4296. * from ext4_mb_initialize_context. Choose ac_g_ex here to keep
  4297. * consistent with ext4_mb_find_by_goal.
  4298. */
  4299. start = pa->pa_pstart +
  4300. (ac->ac_g_ex.fe_logical - pa->pa_lstart);
  4301. if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start)
  4302. return false;
  4303. if (ac->ac_g_ex.fe_len > pa->pa_len -
  4304. EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart))
  4305. return false;
  4306. return true;
  4307. }
/*
 * search goal blocks in preallocated space
 *
 * Tries to serve the original request (ac->ac_o_ex) from an existing
 * preallocation: first from the inode's pa rbtree and then, when
 * EXT4_MB_HINT_GROUP_ALLOC is set and the context has a locality group,
 * from that locality group's pa lists.
 *
 * Returns true if a pa was selected; in that case a pa_count reference has
 * been taken on it and it has been handed to ext4_mb_use_inode_pa() or
 * ext4_mb_use_group_pa(). Returns false if nothing usable was found.
 */
static noinline_for_stack bool
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int order, i;
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
	struct ext4_locality_group *lg;
	struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
	struct rb_node *iter;
	ext4_fsblk_t goal_block;

	/* only data can be preallocated */
	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return false;

	/*
	 * first, try per-file preallocation by searching the inode pa rbtree.
	 *
	 * Here, we can't do a direct traversal of the tree because
	 * ext4_mb_discard_group_preallocations() can mark the pa deleted in
	 * parallel and that can cause direct traversal to skip some entries.
	 */
	read_lock(&ei->i_prealloc_lock);

	if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
		goto try_group_pa;
	}

	/*
	 * Step 1: Find a pa with logical start immediately adjacent to the
	 * original logical start. This could be on the left or right.
	 *
	 * (tmp_pa->pa_lstart never changes so we can skip locking for it).
	 */
	for (iter = ei->i_prealloc_node.rb_node; iter;
	     iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
					    tmp_pa->pa_lstart, iter)) {
		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
				  pa_node.inode_node);
	}

	/*
	 * Step 2: The adjacent pa might be to the right of logical start, find
	 * the left adjacent pa. After this step we'd have a valid tmp_pa whose
	 * logical start is towards the left of original request's logical start
	 */
	if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
		struct rb_node *tmp;
		tmp = rb_prev(&tmp_pa->pa_node.inode_node);

		if (tmp) {
			tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
					    pa_node.inode_node);
		} else {
			/*
			 * If there is no adjacent pa to the left then finding
			 * an overlapping pa is not possible hence stop searching
			 * inode pa tree
			 */
			goto try_group_pa;
		}
	}

	BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));

	/*
	 * Step 3: If the left adjacent pa is deleted, keep moving left to find
	 * the first non deleted adjacent pa. After this step we should have a
	 * valid tmp_pa which is guaranteed to be non deleted.
	 *
	 * Note that the loop is left via "break" with tmp_pa->pa_lock still
	 * held; every exit path below has to release it.
	 */
	for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
		if (!iter) {
			/*
			 * no non deleted left adjacent pa, so stop searching
			 * inode pa tree
			 */
			goto try_group_pa;
		}
		tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
				  pa_node.inode_node);
		spin_lock(&tmp_pa->pa_lock);
		if (tmp_pa->pa_deleted == 0) {
			/*
			 * We will keep holding the pa_lock from
			 * this point on because we don't want group discard
			 * to delete this pa underneath us. Since group
			 * discard is anyways an ENOSPC operation it
			 * should be okay for it to wait a few more cycles.
			 */
			break;
		} else {
			spin_unlock(&tmp_pa->pa_lock);
		}
	}

	BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
	BUG_ON(tmp_pa->pa_deleted == 1);

	/*
	 * Step 4: We now have the non deleted left adjacent pa. Only this
	 * pa can possibly satisfy the request hence check if it overlaps
	 * original logical start and stop searching if it doesn't.
	 */
	if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) {
		spin_unlock(&tmp_pa->pa_lock);
		goto try_group_pa;
	}

	/* non-extent files can't have physical blocks past 2^32 */
	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
	    (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
	     EXT4_MAX_BLOCK_FILE_PHYS)) {
		/*
		 * Since PAs don't overlap, we won't find any other PA to
		 * satisfy this.
		 */
		spin_unlock(&tmp_pa->pa_lock);
		goto try_group_pa;
	}

	if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
		/* take the reference while pa_lock is still held */
		atomic_inc(&tmp_pa->pa_count);
		ext4_mb_use_inode_pa(ac, tmp_pa);
		spin_unlock(&tmp_pa->pa_lock);
		read_unlock(&ei->i_prealloc_lock);
		return true;
	} else {
		/*
		 * We found a valid overlapping pa but couldn't use it because
		 * it had no free blocks. This should ideally never happen
		 * because:
		 *
		 * 1. When a new inode pa is added to rbtree it must have
		 *    pa_free > 0 since otherwise we won't actually need
		 *    preallocation.
		 *
		 * 2. An inode pa that is in the rbtree can only have it's
		 *    pa_free become zero when another thread calls:
		 *      ext4_mb_new_blocks
		 *       ext4_mb_use_preallocated
		 *        ext4_mb_use_inode_pa
		 *
		 * 3. Further, after the above calls make pa_free == 0, we will
		 *    immediately remove it from the rbtree in:
		 *      ext4_mb_new_blocks
		 *       ext4_mb_release_context
		 *        ext4_mb_put_pa
		 *
		 * 4. Since the pa_free becoming 0 and pa_free getting removed
		 * from tree both happen in ext4_mb_new_blocks, which is always
		 * called with i_data_sem held for data allocations, we can be
		 * sure that another process will never see a pa in rbtree with
		 * pa_free == 0.
		 *
		 * (This branch is also reached when pa_free is non-zero but
		 * the goal check failed; the warning below only fires for the
		 * pa_free == 0 case.)
		 */
		WARN_ON_ONCE(tmp_pa->pa_free == 0);
	}
	spin_unlock(&tmp_pa->pa_lock);
try_group_pa:
	/* every path reaching this label still holds i_prealloc_lock for read */
	read_unlock(&ei->i_prealloc_lock);

	/* can we use group allocation? */
	if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
		return false;

	/* inode may have no locality group for some reason */
	lg = ac->ac_lg;
	if (lg == NULL)
		return false;
	/* start at the list indexed by the highest set bit of the request length */
	order  = fls(ac->ac_o_ex.fe_len) - 1;
	if (order > PREALLOC_TB_SIZE - 1)
		/* The max size of hash table is PREALLOC_TB_SIZE */
		order = PREALLOC_TB_SIZE - 1;

	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
	/*
	 * search for the prealloc space that is having
	 * minimal distance from the goal block.
	 */
	for (i = order; i < PREALLOC_TB_SIZE; i++) {
		rcu_read_lock();
		list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i],
					pa_node.lg_list) {
			spin_lock(&tmp_pa->pa_lock);
			if (tmp_pa->pa_deleted == 0 &&
					tmp_pa->pa_free >= ac->ac_o_ex.fe_len) {

				/* keeps a pa_count reference on the returned pa */
				cpa = ext4_mb_check_group_pa(goal_block,
								tmp_pa, cpa);
			}
			spin_unlock(&tmp_pa->pa_lock);
		}
		rcu_read_unlock();
	}
	if (cpa) {
		ext4_mb_use_group_pa(ac, cpa);
		return true;
	}
	return false;
}
  4494. /*
  4495. * the function goes through all preallocation in this group and marks them
  4496. * used in in-core bitmap. buddy must be generated from this bitmap
  4497. * Need to be called with ext4 group lock held
  4498. */
  4499. static noinline_for_stack
  4500. void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
  4501. ext4_group_t group)
  4502. {
  4503. struct ext4_group_info *grp = ext4_get_group_info(sb, group);
  4504. struct ext4_prealloc_space *pa;
  4505. struct list_head *cur;
  4506. ext4_group_t groupnr;
  4507. ext4_grpblk_t start;
  4508. int preallocated = 0;
  4509. int len;
  4510. if (!grp)
  4511. return;
  4512. /* all form of preallocation discards first load group,
  4513. * so the only competing code is preallocation use.
  4514. * we don't need any locking here
  4515. * notice we do NOT ignore preallocations with pa_deleted
  4516. * otherwise we could leave used blocks available for
  4517. * allocation in buddy when concurrent ext4_mb_put_pa()
  4518. * is dropping preallocation
  4519. */
  4520. list_for_each(cur, &grp->bb_prealloc_list) {
  4521. pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
  4522. spin_lock(&pa->pa_lock);
  4523. ext4_get_group_no_and_offset(sb, pa->pa_pstart,
  4524. &groupnr, &start);
  4525. len = pa->pa_len;
  4526. spin_unlock(&pa->pa_lock);
  4527. if (unlikely(len == 0))
  4528. continue;
  4529. BUG_ON(groupnr != group);
  4530. mb_set_bits(bitmap, start, len);
  4531. preallocated += len;
  4532. }
  4533. mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
  4534. }
  4535. static void ext4_mb_mark_pa_deleted(struct super_block *sb,
  4536. struct ext4_prealloc_space *pa)
  4537. {
  4538. struct ext4_inode_info *ei;
  4539. if (pa->pa_deleted) {
  4540. ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
  4541. pa->pa_type, pa->pa_pstart, pa->pa_lstart,
  4542. pa->pa_len);
  4543. return;
  4544. }
  4545. pa->pa_deleted = 1;
  4546. if (pa->pa_type == MB_INODE_PA) {
  4547. ei = EXT4_I(pa->pa_inode);
  4548. atomic_dec(&ei->i_prealloc_active);
  4549. }
  4550. }
  4551. static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
  4552. {
  4553. BUG_ON(!pa);
  4554. BUG_ON(atomic_read(&pa->pa_count));
  4555. BUG_ON(pa->pa_deleted == 0);
  4556. kmem_cache_free(ext4_pspace_cachep, pa);
  4557. }
  4558. static void ext4_mb_pa_callback(struct rcu_head *head)
  4559. {
  4560. struct ext4_prealloc_space *pa;
  4561. pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
  4562. ext4_mb_pa_free(pa);
  4563. }
/*
 * Drops a reference to the preallocated space descriptor @pa.
 *
 * If this was the last reference and the space is fully consumed
 * (pa_free == 0), the descriptor is marked deleted, unlinked from its
 * group's list and from its owner (the inode's pa rbtree or the locality
 * group's list) and then freed: directly for an inode pa, through an RCU
 * callback for a group pa.
 */
static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
			struct super_block *sb, struct ext4_prealloc_space *pa)
{
	ext4_group_t grp;
	ext4_fsblk_t grp_blk;
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);

	/* in this short window concurrent discard can set pa_deleted */
	spin_lock(&pa->pa_lock);
	/*
	 * The reference is dropped unconditionally; the teardown below only
	 * runs when it was the last one and nothing is left in the pa.
	 */
	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
		spin_unlock(&pa->pa_lock);
		return;
	}

	/* already marked deleted by someone else: nothing more to do here */
	if (pa->pa_deleted == 1) {
		spin_unlock(&pa->pa_lock);
		return;
	}

	ext4_mb_mark_pa_deleted(sb, pa);
	spin_unlock(&pa->pa_lock);

	grp_blk = pa->pa_pstart;
	/*
	 * If doing group-based preallocation, pa_pstart may be in the
	 * next group when pa is used up
	 */
	if (pa->pa_type == MB_GROUP_PA)
		grp_blk--;

	grp = ext4_get_group_number(sb, grp_blk);

	/*
	 * possible race:
	 *
	 *  P1 (buddy init)			P2 (regular allocation)
	 *					find block B in PA
	 *  copy on-disk bitmap to buddy
	 *  					mark B in on-disk bitmap
	 *					drop PA from group
	 *  mark all PAs in buddy
	 *
	 * thus, P1 initializes buddy with B available. to prevent this
	 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
	 * against that pair
	 */
	ext4_lock_group(sb, grp);
	list_del(&pa->pa_group_list);
	ext4_unlock_group(sb, grp);

	if (pa->pa_type == MB_INODE_PA) {
		/* unlink from the inode's rbtree, then free immediately */
		write_lock(pa->pa_node_lock.inode_lock);
		rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
		write_unlock(pa->pa_node_lock.inode_lock);
		ext4_mb_pa_free(pa);
	} else {
		/*
		 * The locality group list is walked under RCU, so the pa is
		 * unlinked with list_del_rcu() and freed from an RCU callback.
		 */
		spin_lock(pa->pa_node_lock.lg_lock);
		list_del_rcu(&pa->pa_node.lg_list);
		spin_unlock(pa->pa_node_lock.lg_lock);
		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
	}
}
  4623. static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new)
  4624. {
  4625. struct rb_node **iter = &root->rb_node, *parent = NULL;
  4626. struct ext4_prealloc_space *iter_pa, *new_pa;
  4627. ext4_lblk_t iter_start, new_start;
  4628. while (*iter) {
  4629. iter_pa = rb_entry(*iter, struct ext4_prealloc_space,
  4630. pa_node.inode_node);
  4631. new_pa = rb_entry(new, struct ext4_prealloc_space,
  4632. pa_node.inode_node);
  4633. iter_start = iter_pa->pa_lstart;
  4634. new_start = new_pa->pa_lstart;
  4635. parent = *iter;
  4636. if (new_start < iter_start)
  4637. iter = &((*iter)->rb_left);
  4638. else
  4639. iter = &((*iter)->rb_right);
  4640. }
  4641. rb_link_node(new, parent, iter);
  4642. rb_insert_color(new, root);
  4643. }
  4644. /*
  4645. * creates new preallocated space for given inode
  4646. */
  4647. static noinline_for_stack void
  4648. ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
  4649. {
  4650. struct super_block *sb = ac->ac_sb;
  4651. struct ext4_sb_info *sbi = EXT4_SB(sb);
  4652. struct ext4_prealloc_space *pa;
  4653. struct ext4_group_info *grp;
  4654. struct ext4_inode_info *ei;
  4655. /* preallocate only when found space is larger then requested */
  4656. BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
  4657. BUG_ON(ac->ac_status != AC_STATUS_FOUND);
  4658. BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
  4659. BUG_ON(ac->ac_pa == NULL);
  4660. pa = ac->ac_pa;
  4661. if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) {
  4662. struct ext4_free_extent ex = {
  4663. .fe_logical = ac->ac_g_ex.fe_logical,
  4664. .fe_len = ac->ac_orig_goal_len,
  4665. };
  4666. loff_t orig_goal_end = extent_logical_end(sbi, &ex);
  4667. loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex);
  4668. /*
  4669. * We can't allocate as much as normalizer wants, so we try
  4670. * to get proper lstart to cover the original request, except
  4671. * when the goal doesn't cover the original request as below:
  4672. *
  4673. * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048
  4674. * best_ex:0/200(200) -> adjusted: 1848/2048(200)
  4675. */
  4676. BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
  4677. BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
  4678. /*
  4679. * Use the below logic for adjusting best extent as it keeps
  4680. * fragmentation in check while ensuring logical range of best
  4681. * extent doesn't overflow out of goal extent:
  4682. *
  4683. * 1. Check if best ex can be kept at end of goal (before
  4684. * cr_best_avail trimmed it) and still cover original start
  4685. * 2. Else, check if best ex can be kept at start of goal and
  4686. * still cover original end
  4687. * 3. Else, keep the best ex at start of original request.
  4688. */
  4689. ex.fe_len = ac->ac_b_ex.fe_len;
  4690. ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len);
  4691. if (ac->ac_o_ex.fe_logical >= ex.fe_logical)
  4692. goto adjust_bex;
  4693. ex.fe_logical = ac->ac_g_ex.fe_logical;
  4694. if (o_ex_end <= extent_logical_end(sbi, &ex))
  4695. goto adjust_bex;
  4696. ex.fe_logical = ac->ac_o_ex.fe_logical;
  4697. adjust_bex:
  4698. ac->ac_b_ex.fe_logical = ex.fe_logical;
  4699. BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
  4700. BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
  4701. }
  4702. pa->pa_lstart = ac->ac_b_ex.fe_logical;
  4703. pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
  4704. pa->pa_len = ac->ac_b_ex.fe_len;
  4705. pa->pa_free = pa->pa_len;
  4706. spin_lock_init(&pa->pa_lock);
  4707. INIT_LIST_HEAD(&pa->pa_group_list);
  4708. pa->pa_deleted = 0;
  4709. pa->pa_type = MB_INODE_PA;
  4710. mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
  4711. pa->pa_len, pa->pa_lstart);
  4712. trace_ext4_mb_new_inode_pa(ac, pa);
  4713. atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
  4714. ext4_mb_use_inode_pa(ac, pa);
  4715. ei = EXT4_I(ac->ac_inode);
  4716. grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
  4717. if (!grp)
  4718. return;
  4719. pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock;
  4720. pa->pa_inode = ac->ac_inode;
  4721. list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
  4722. write_lock(pa->pa_node_lock.inode_lock);
  4723. ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node);
  4724. write_unlock(pa->pa_node_lock.inode_lock);
  4725. atomic_inc(&ei->i_prealloc_active);
  4726. }
  4727. /*
  4728. * creates new preallocated space for locality group inodes belongs to
  4729. */
  4730. static noinline_for_stack void
  4731. ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
  4732. {
  4733. struct super_block *sb = ac->ac_sb;
  4734. struct ext4_locality_group *lg;
  4735. struct ext4_prealloc_space *pa;
  4736. struct ext4_group_info *grp;
  4737. /* preallocate only when found space is larger then requested */
  4738. BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
  4739. BUG_ON(ac->ac_status != AC_STATUS_FOUND);
  4740. BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
  4741. BUG_ON(ac->ac_pa == NULL);
  4742. pa = ac->ac_pa;
  4743. pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
  4744. pa->pa_lstart = pa->pa_pstart;
  4745. pa->pa_len = ac->ac_b_ex.fe_len;
  4746. pa->pa_free = pa->pa_len;
  4747. spin_lock_init(&pa->pa_lock);
  4748. INIT_LIST_HEAD(&pa->pa_node.lg_list);
  4749. INIT_LIST_HEAD(&pa->pa_group_list);
  4750. pa->pa_deleted = 0;
  4751. pa->pa_type = MB_GROUP_PA;
  4752. mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
  4753. pa->pa_len, pa->pa_lstart);
  4754. trace_ext4_mb_new_group_pa(ac, pa);
  4755. ext4_mb_use_group_pa(ac, pa);
  4756. atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
  4757. grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
  4758. if (!grp)
  4759. return;
  4760. lg = ac->ac_lg;
  4761. BUG_ON(lg == NULL);
  4762. pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock;
  4763. pa->pa_inode = NULL;
  4764. list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
  4765. /*
  4766. * We will later add the new pa to the right bucket
  4767. * after updating the pa_free in ext4_mb_release_context
  4768. */
  4769. }
  4770. static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  4771. {
  4772. if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
  4773. ext4_mb_new_group_pa(ac);
  4774. else
  4775. ext4_mb_new_inode_pa(ac);
  4776. }
  4777. /*
  4778. * finds all unused blocks in on-disk bitmap, frees them in
  4779. * in-core bitmap and buddy.
  4780. * @pa must be unlinked from inode and group lists, so that
  4781. * nobody else can find/use it.
  4782. * the caller MUST hold group/inode locks.
  4783. * TODO: optimize the case when there are no in-core structures yet
  4784. */
  4785. static noinline_for_stack void
  4786. ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
  4787. struct ext4_prealloc_space *pa)
  4788. {
  4789. struct super_block *sb = e4b->bd_sb;
  4790. struct ext4_sb_info *sbi = EXT4_SB(sb);
  4791. unsigned int end;
  4792. unsigned int next;
  4793. ext4_group_t group;
  4794. ext4_grpblk_t bit;
  4795. unsigned long long grp_blk_start;
  4796. int free = 0;
  4797. BUG_ON(pa->pa_deleted == 0);
  4798. ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
  4799. grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
  4800. BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
  4801. end = bit + pa->pa_len;
  4802. while (bit < end) {
  4803. bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
  4804. if (bit >= end)
  4805. break;
  4806. next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
  4807. mb_debug(sb, "free preallocated %u/%u in group %u\n",
  4808. (unsigned) ext4_group_first_block_no(sb, group) + bit,
  4809. (unsigned) next - bit, (unsigned) group);
  4810. free += next - bit;
  4811. trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
  4812. trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
  4813. EXT4_C2B(sbi, bit)),
  4814. next - bit);
  4815. mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
  4816. bit = next + 1;
  4817. }
  4818. if (free != pa->pa_free) {
  4819. ext4_msg(e4b->bd_sb, KERN_CRIT,
  4820. "pa %p: logic %lu, phys. %lu, len %d",
  4821. pa, (unsigned long) pa->pa_lstart,
  4822. (unsigned long) pa->pa_pstart,
  4823. pa->pa_len);
  4824. ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
  4825. free, pa->pa_free);
  4826. /*
  4827. * pa is already deleted so we use the value obtained
  4828. * from the bitmap and continue.
  4829. */
  4830. }
  4831. atomic_add(free, &sbi->s_mb_discarded);
  4832. }
  4833. static noinline_for_stack void
  4834. ext4_mb_release_group_pa(struct ext4_buddy *e4b,
  4835. struct ext4_prealloc_space *pa)
  4836. {
  4837. struct super_block *sb = e4b->bd_sb;
  4838. ext4_group_t group;
  4839. ext4_grpblk_t bit;
  4840. trace_ext4_mb_release_group_pa(sb, pa);
  4841. BUG_ON(pa->pa_deleted == 0);
  4842. ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
  4843. if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
  4844. ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
  4845. e4b->bd_group, group, pa->pa_pstart);
  4846. return;
  4847. }
  4848. mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
  4849. atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
  4850. trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
  4851. }
/*
 * releases all preallocations in given group
 *
 * first, we need to decide discard policy:
 * - when do we discard
 *   1) ENOSPC
 * - how many do we discard
 *   1) how many requested
 *
 * Every pa on the group's prealloc list that has no references and is not
 * already deleted is marked deleted, unlinked and released. PAs that are
 * still referenced are skipped and *@busy is set to 1.
 *
 * Returns the sum of pa_free over the PAs that were selected for release;
 * 0 if the group info is missing, the list is empty, or the bitmap/buddy
 * could not be loaded.
 */
static noinline_for_stack int
ext4_mb_discard_group_preallocations(struct super_block *sb,
				     ext4_group_t group, int *busy)
{
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	struct buffer_head *bitmap_bh = NULL;
	struct ext4_prealloc_space *pa, *tmp;
	LIST_HEAD(list);
	struct ext4_buddy e4b;
	struct ext4_inode_info *ei;
	int err;
	int free = 0;

	if (!grp)
		return 0;
	mb_debug(sb, "discard preallocation for group %u\n", group);
	if (list_empty(&grp->bb_prealloc_list))
		goto out_dbg;

	bitmap_bh = ext4_read_block_bitmap(sb, group);
	if (IS_ERR(bitmap_bh)) {
		err = PTR_ERR(bitmap_bh);
		ext4_error_err(sb, -err,
			       "Error %d reading block bitmap for %u",
			       err, group);
		goto out_dbg;
	}

	err = ext4_mb_load_buddy(sb, group, &e4b);
	if (err) {
		ext4_warning(sb, "Error %d loading buddy information for %u",
			     err, group);
		put_bh(bitmap_bh);
		goto out_dbg;
	}

	/* phase 1: under the group lock, pick the PAs that can be released */
	ext4_lock_group(sb, group);
	list_for_each_entry_safe(pa, tmp,
				&grp->bb_prealloc_list, pa_group_list) {
		spin_lock(&pa->pa_lock);
		if (atomic_read(&pa->pa_count)) {
			/* still referenced: leave it and tell the caller */
			spin_unlock(&pa->pa_lock);
			*busy = 1;
			continue;
		}
		if (pa->pa_deleted) {
			/* somebody else is already tearing this one down */
			spin_unlock(&pa->pa_lock);
			continue;
		}

		/* seems this one can be freed ... */
		ext4_mb_mark_pa_deleted(sb, pa);

		/* bump the per-cpu sequence once, on the first pa selected */
		if (!free)
			this_cpu_inc(discard_pa_seq);

		/* we can trust pa_free ... */
		free += pa->pa_free;

		spin_unlock(&pa->pa_lock);

		/* move from the group's list to the private work list */
		list_del(&pa->pa_group_list);
		list_add(&pa->u.pa_tmp_list, &list);
	}

	/* now free all selected PAs */
	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {

		/* remove from object (inode or locality group) */
		if (pa->pa_type == MB_GROUP_PA) {
			spin_lock(pa->pa_node_lock.lg_lock);
			list_del_rcu(&pa->pa_node.lg_list);
			spin_unlock(pa->pa_node_lock.lg_lock);
		} else {
			write_lock(pa->pa_node_lock.inode_lock);
			ei = EXT4_I(pa->pa_inode);
			rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
			write_unlock(pa->pa_node_lock.inode_lock);
		}

		/*
		 * u.pa_tmp_list and u.pa_rcu are members of the same 'u'
		 * object, so the pa is taken off the work list before
		 * u.pa_rcu is handed to call_rcu() below.
		 */
		list_del(&pa->u.pa_tmp_list);

		if (pa->pa_type == MB_GROUP_PA) {
			ext4_mb_release_group_pa(&e4b, pa);
			/* freed from an RCU callback, not directly */
			call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
		} else {
			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
			ext4_mb_pa_free(pa);
		}
	}

	ext4_unlock_group(sb, group);
	ext4_mb_unload_buddy(&e4b);
	put_bh(bitmap_bh);
out_dbg:
	mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
		 free, group, grp->bb_free);
	return free;
}
/*
 * releases all non-used preallocated blocks for given inode
 *
 * It's important to discard preallocations under i_data_sem
 * We don't want another block to be served from the prealloc
 * space when we are discarding the inode prealloc space.
 *
 * Does nothing for inodes that are not regular files, or while
 * EXT4_FC_REPLAY is set in the superblock's mount state.
 *
 * FIXME!! Make sure it is valid at all the call sites
 */
void ext4_discard_preallocations(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct super_block *sb = inode->i_sb;
	struct buffer_head *bitmap_bh = NULL;
	struct ext4_prealloc_space *pa, *tmp;
	ext4_group_t group = 0;
	LIST_HEAD(list);
	struct ext4_buddy e4b;
	struct rb_node *iter;
	int err;

	if (!S_ISREG(inode->i_mode))
		return;

	if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	mb_debug(sb, "discard preallocation for inode %lu\n",
		 inode->i_ino);
	trace_ext4_discard_preallocations(inode,
			atomic_read(&ei->i_prealloc_active));

repeat:
	/* first, collect all pa's in the inode */
	write_lock(&ei->i_prealloc_lock);
	for (iter = rb_first(&ei->i_prealloc_node); iter;
	     iter = rb_next(iter)) {
		pa = rb_entry(iter, struct ext4_prealloc_space,
			      pa_node.inode_node);
		BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock);

		spin_lock(&pa->pa_lock);
		if (atomic_read(&pa->pa_count)) {
			/* this shouldn't happen often - nobody should
			 * use preallocation while we're discarding it */
			spin_unlock(&pa->pa_lock);
			write_unlock(&ei->i_prealloc_lock);
			ext4_msg(sb, KERN_ERR,
				 "uh-oh! used pa while discarding");
			WARN_ON(1);
			/* back off for HZ jiffies, then rescan from the start */
			schedule_timeout_uninterruptible(HZ);
			goto repeat;

		}
		if (pa->pa_deleted == 0) {
			/*
			 * Ours to release: mark it, take it out of the tree
			 * and queue it on the private list. PAs collected
			 * here stay on that list across a later "goto repeat".
			 */
			ext4_mb_mark_pa_deleted(sb, pa);
			spin_unlock(&pa->pa_lock);
			rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
			list_add(&pa->u.pa_tmp_list, &list);
			continue;
		}

		/* someone is deleting pa right now */
		spin_unlock(&pa->pa_lock);
		write_unlock(&ei->i_prealloc_lock);

		/* we have to wait here because pa_deleted
		 * doesn't mean pa is already unlinked from
		 * the list. as we might be called from
		 * ->clear_inode() the inode will get freed
		 * and concurrent thread which is unlinking
		 * pa from inode's list may access already
		 * freed memory, bad-bad-bad */

		/* XXX: if this happens too often, we can
		 * add a flag to force wait only in case
		 * of ->clear_inode(), but not in case of
		 * regular truncate */
		schedule_timeout_uninterruptible(HZ);
		goto repeat;
	}
	write_unlock(&ei->i_prealloc_lock);

	/* second, release every collected pa against its group's buddy */
	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
		BUG_ON(pa->pa_type != MB_INODE_PA);
		group = ext4_get_group_number(sb, pa->pa_pstart);

		err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
					     GFP_NOFS|__GFP_NOFAIL);
		if (err) {
			/* NOTE(review): on this path the pa is neither released nor freed */
			ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
				       err, group);
			continue;
		}

		bitmap_bh = ext4_read_block_bitmap(sb, group);
		if (IS_ERR(bitmap_bh)) {
			err = PTR_ERR(bitmap_bh);
			/* NOTE(review): on this path the pa is neither released nor freed */
			ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
				       err, group);
			ext4_mb_unload_buddy(&e4b);
			continue;
		}

		/* unlink from the group's list and free the unused blocks */
		ext4_lock_group(sb, group);
		list_del(&pa->pa_group_list);
		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
		ext4_unlock_group(sb, group);

		ext4_mb_unload_buddy(&e4b);
		put_bh(bitmap_bh);

		list_del(&pa->u.pa_tmp_list);
		ext4_mb_pa_free(pa);
	}
}
  5047. static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
  5048. {
  5049. struct ext4_prealloc_space *pa;
  5050. BUG_ON(ext4_pspace_cachep == NULL);
  5051. pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
  5052. if (!pa)
  5053. return -ENOMEM;
  5054. atomic_set(&pa->pa_count, 1);
  5055. ac->ac_pa = pa;
  5056. return 0;
  5057. }
  5058. static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
  5059. {
  5060. struct ext4_prealloc_space *pa = ac->ac_pa;
  5061. BUG_ON(!pa);
  5062. ac->ac_pa = NULL;
  5063. WARN_ON(!atomic_dec_and_test(&pa->pa_count));
  5064. /*
  5065. * current function is only called due to an error or due to
  5066. * len of found blocks < len of requested blocks hence the PA has not
  5067. * been added to grp->bb_prealloc_list. So we don't need to lock it
  5068. */
  5069. pa->pa_deleted = 1;
  5070. ext4_mb_pa_free(pa);
  5071. }
  5072. #ifdef CONFIG_EXT4_DEBUG
  5073. static inline void ext4_mb_show_pa(struct super_block *sb)
  5074. {
  5075. ext4_group_t i, ngroups;
  5076. if (ext4_emergency_state(sb))
  5077. return;
  5078. ngroups = ext4_get_groups_count(sb);
  5079. mb_debug(sb, "groups: ");
  5080. for (i = 0; i < ngroups; i++) {
  5081. struct ext4_group_info *grp = ext4_get_group_info(sb, i);
  5082. struct ext4_prealloc_space *pa;
  5083. ext4_grpblk_t start;
  5084. struct list_head *cur;
  5085. if (!grp)
  5086. continue;
  5087. ext4_lock_group(sb, i);
  5088. list_for_each(cur, &grp->bb_prealloc_list) {
  5089. pa = list_entry(cur, struct ext4_prealloc_space,
  5090. pa_group_list);
  5091. spin_lock(&pa->pa_lock);
  5092. ext4_get_group_no_and_offset(sb, pa->pa_pstart,
  5093. NULL, &start);
  5094. spin_unlock(&pa->pa_lock);
  5095. mb_debug(sb, "PA:%u:%d:%d\n", i, start,
  5096. pa->pa_len);
  5097. }
  5098. ext4_unlock_group(sb, i);
  5099. mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
  5100. grp->bb_fragments);
  5101. }
  5102. }
  5103. static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
  5104. {
  5105. struct super_block *sb = ac->ac_sb;
  5106. if (ext4_emergency_state(sb))
  5107. return;
  5108. mb_debug(sb, "Can't allocate:"
  5109. " Allocation context details:");
  5110. mb_debug(sb, "status %u flags 0x%x",
  5111. ac->ac_status, ac->ac_flags);
  5112. mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
  5113. "goal %lu/%lu/%lu@%lu, "
  5114. "best %lu/%lu/%lu@%lu cr %d",
  5115. (unsigned long)ac->ac_o_ex.fe_group,
  5116. (unsigned long)ac->ac_o_ex.fe_start,
  5117. (unsigned long)ac->ac_o_ex.fe_len,
  5118. (unsigned long)ac->ac_o_ex.fe_logical,
  5119. (unsigned long)ac->ac_g_ex.fe_group,
  5120. (unsigned long)ac->ac_g_ex.fe_start,
  5121. (unsigned long)ac->ac_g_ex.fe_len,
  5122. (unsigned long)ac->ac_g_ex.fe_logical,
  5123. (unsigned long)ac->ac_b_ex.fe_group,
  5124. (unsigned long)ac->ac_b_ex.fe_start,
  5125. (unsigned long)ac->ac_b_ex.fe_len,
  5126. (unsigned long)ac->ac_b_ex.fe_logical,
  5127. (int)ac->ac_criteria);
  5128. mb_debug(sb, "%u found", ac->ac_found);
  5129. mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa));
  5130. if (ac->ac_pa)
  5131. mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ?
  5132. "group pa" : "inode pa");
  5133. ext4_mb_show_pa(sb);
  5134. }
  5135. #else
  5136. static inline void ext4_mb_show_pa(struct super_block *sb)
  5137. {
  5138. }
  5139. static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
  5140. {
  5141. ext4_mb_show_pa(ac->ac_sb);
  5142. }
  5143. #endif
  5144. /*
  5145. * We use locality group preallocation for small size file. The size of the
  5146. * file is determined by the current size or the resulting size after
  5147. * allocation which ever is larger
  5148. *
  5149. * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
  5150. */
  5151. static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
  5152. {
  5153. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  5154. int bsbits = ac->ac_sb->s_blocksize_bits;
  5155. loff_t size, isize;
  5156. bool inode_pa_eligible, group_pa_eligible;
  5157. if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
  5158. return;
  5159. if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
  5160. return;
  5161. group_pa_eligible = sbi->s_mb_group_prealloc > 0;
  5162. inode_pa_eligible = true;
  5163. size = extent_logical_end(sbi, &ac->ac_o_ex);
  5164. isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
  5165. >> bsbits;
  5166. /* No point in using inode preallocation for closed files */
  5167. if ((size == isize) && !ext4_fs_is_busy(sbi) &&
  5168. !inode_is_open_for_write(ac->ac_inode))
  5169. inode_pa_eligible = false;
  5170. size = max(size, isize);
  5171. /* Don't use group allocation for large files */
  5172. if (size > sbi->s_mb_stream_request)
  5173. group_pa_eligible = false;
  5174. if (!group_pa_eligible) {
  5175. if (inode_pa_eligible)
  5176. ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
  5177. else
  5178. ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
  5179. return;
  5180. }
  5181. BUG_ON(ac->ac_lg != NULL);
  5182. /*
  5183. * locality group prealloc space are per cpu. The reason for having
  5184. * per cpu locality group is to reduce the contention between block
  5185. * request from multiple CPUs.
  5186. */
  5187. ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
  5188. /* we're going to use group allocation */
  5189. ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
  5190. /* serialize all allocations in the group */
  5191. mutex_lock(&ac->ac_lg->lg_mutex);
  5192. }
  5193. static noinline_for_stack void
  5194. ext4_mb_initialize_context(struct ext4_allocation_context *ac,
  5195. struct ext4_allocation_request *ar)
  5196. {
  5197. struct super_block *sb = ar->inode->i_sb;
  5198. struct ext4_sb_info *sbi = EXT4_SB(sb);
  5199. struct ext4_super_block *es = sbi->s_es;
  5200. ext4_group_t group;
  5201. unsigned int len;
  5202. ext4_fsblk_t goal;
  5203. ext4_grpblk_t block;
  5204. /* we can't allocate > group size */
  5205. len = ar->len;
  5206. /* just a dirty hack to filter too big requests */
  5207. if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
  5208. len = EXT4_CLUSTERS_PER_GROUP(sb);
  5209. /* start searching from the goal */
  5210. goal = ar->goal;
  5211. if (goal < le32_to_cpu(es->s_first_data_block) ||
  5212. goal >= ext4_blocks_count(es))
  5213. goal = le32_to_cpu(es->s_first_data_block);
  5214. ext4_get_group_no_and_offset(sb, goal, &group, &block);
  5215. /* set up allocation goals */
  5216. ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
  5217. ac->ac_status = AC_STATUS_CONTINUE;
  5218. ac->ac_sb = sb;
  5219. ac->ac_inode = ar->inode;
  5220. ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
  5221. ac->ac_o_ex.fe_group = group;
  5222. ac->ac_o_ex.fe_start = block;
  5223. ac->ac_o_ex.fe_len = len;
  5224. ac->ac_g_ex = ac->ac_o_ex;
  5225. ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
  5226. ac->ac_flags = ar->flags;
  5227. /* we have to define context: we'll work with a file or
  5228. * locality group. this is a policy, actually */
  5229. ext4_mb_group_or_file(ac);
  5230. mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
  5231. "left: %u/%u, right %u/%u to %swritable\n",
  5232. (unsigned) ar->len, (unsigned) ar->logical,
  5233. (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
  5234. (unsigned) ar->lleft, (unsigned) ar->pleft,
  5235. (unsigned) ar->lright, (unsigned) ar->pright,
  5236. inode_is_open_for_write(ar->inode) ? "" : "non-");
  5237. }
  5238. static noinline_for_stack void
  5239. ext4_mb_discard_lg_preallocations(struct super_block *sb,
  5240. struct ext4_locality_group *lg,
  5241. int order, int total_entries)
  5242. {
  5243. ext4_group_t group = 0;
  5244. struct ext4_buddy e4b;
  5245. LIST_HEAD(discard_list);
  5246. struct ext4_prealloc_space *pa, *tmp;
  5247. mb_debug(sb, "discard locality group preallocation\n");
  5248. spin_lock(&lg->lg_prealloc_lock);
  5249. list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
  5250. pa_node.lg_list,
  5251. lockdep_is_held(&lg->lg_prealloc_lock)) {
  5252. spin_lock(&pa->pa_lock);
  5253. if (atomic_read(&pa->pa_count)) {
  5254. /*
  5255. * This is the pa that we just used
  5256. * for block allocation. So don't
  5257. * free that
  5258. */
  5259. spin_unlock(&pa->pa_lock);
  5260. continue;
  5261. }
  5262. if (pa->pa_deleted) {
  5263. spin_unlock(&pa->pa_lock);
  5264. continue;
  5265. }
  5266. /* only lg prealloc space */
  5267. BUG_ON(pa->pa_type != MB_GROUP_PA);
  5268. /* seems this one can be freed ... */
  5269. ext4_mb_mark_pa_deleted(sb, pa);
  5270. spin_unlock(&pa->pa_lock);
  5271. list_del_rcu(&pa->pa_node.lg_list);
  5272. list_add(&pa->u.pa_tmp_list, &discard_list);
  5273. total_entries--;
  5274. if (total_entries <= 5) {
  5275. /*
  5276. * we want to keep only 5 entries
  5277. * allowing it to grow to 8. This
  5278. * mak sure we don't call discard
  5279. * soon for this list.
  5280. */
  5281. break;
  5282. }
  5283. }
  5284. spin_unlock(&lg->lg_prealloc_lock);
  5285. list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
  5286. int err;
  5287. group = ext4_get_group_number(sb, pa->pa_pstart);
  5288. err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
  5289. GFP_NOFS|__GFP_NOFAIL);
  5290. if (err) {
  5291. ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
  5292. err, group);
  5293. continue;
  5294. }
  5295. ext4_lock_group(sb, group);
  5296. list_del(&pa->pa_group_list);
  5297. ext4_mb_release_group_pa(&e4b, pa);
  5298. ext4_unlock_group(sb, group);
  5299. ext4_mb_unload_buddy(&e4b);
  5300. list_del(&pa->u.pa_tmp_list);
  5301. call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
  5302. }
  5303. }
  5304. /*
  5305. * We have incremented pa_count. So it cannot be freed at this
  5306. * point. Also we hold lg_mutex. So no parallel allocation is
  5307. * possible from this lg. That means pa_free cannot be updated.
  5308. *
  5309. * A parallel ext4_mb_discard_group_preallocations is possible.
  5310. * which can cause the lg_prealloc_list to be updated.
  5311. */
  5312. static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
  5313. {
  5314. int order, added = 0, lg_prealloc_count = 1;
  5315. struct super_block *sb = ac->ac_sb;
  5316. struct ext4_locality_group *lg = ac->ac_lg;
  5317. struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
  5318. order = fls(pa->pa_free) - 1;
  5319. if (order > PREALLOC_TB_SIZE - 1)
  5320. /* The max size of hash table is PREALLOC_TB_SIZE */
  5321. order = PREALLOC_TB_SIZE - 1;
  5322. /* Add the prealloc space to lg */
  5323. spin_lock(&lg->lg_prealloc_lock);
  5324. list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
  5325. pa_node.lg_list,
  5326. lockdep_is_held(&lg->lg_prealloc_lock)) {
  5327. spin_lock(&tmp_pa->pa_lock);
  5328. if (tmp_pa->pa_deleted) {
  5329. spin_unlock(&tmp_pa->pa_lock);
  5330. continue;
  5331. }
  5332. if (!added && pa->pa_free < tmp_pa->pa_free) {
  5333. /* Add to the tail of the previous entry */
  5334. list_add_tail_rcu(&pa->pa_node.lg_list,
  5335. &tmp_pa->pa_node.lg_list);
  5336. added = 1;
  5337. /*
  5338. * we want to count the total
  5339. * number of entries in the list
  5340. */
  5341. }
  5342. spin_unlock(&tmp_pa->pa_lock);
  5343. lg_prealloc_count++;
  5344. }
  5345. if (!added)
  5346. list_add_tail_rcu(&pa->pa_node.lg_list,
  5347. &lg->lg_prealloc_list[order]);
  5348. spin_unlock(&lg->lg_prealloc_lock);
  5349. /* Now trim the list to be not more than 8 elements */
  5350. if (lg_prealloc_count > 8)
  5351. ext4_mb_discard_lg_preallocations(sb, lg,
  5352. order, lg_prealloc_count);
  5353. }
  5354. /*
  5355. * release all resource we used in allocation
  5356. */
  5357. static void ext4_mb_release_context(struct ext4_allocation_context *ac)
  5358. {
  5359. struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
  5360. struct ext4_prealloc_space *pa = ac->ac_pa;
  5361. if (pa) {
  5362. if (pa->pa_type == MB_GROUP_PA) {
  5363. /* see comment in ext4_mb_use_group_pa() */
  5364. spin_lock(&pa->pa_lock);
  5365. pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
  5366. pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
  5367. pa->pa_free -= ac->ac_b_ex.fe_len;
  5368. pa->pa_len -= ac->ac_b_ex.fe_len;
  5369. spin_unlock(&pa->pa_lock);
  5370. /*
  5371. * We want to add the pa to the right bucket.
  5372. * Remove it from the list and while adding
  5373. * make sure the list to which we are adding
  5374. * doesn't grow big.
  5375. */
  5376. if (likely(pa->pa_free)) {
  5377. spin_lock(pa->pa_node_lock.lg_lock);
  5378. list_del_rcu(&pa->pa_node.lg_list);
  5379. spin_unlock(pa->pa_node_lock.lg_lock);
  5380. ext4_mb_add_n_trim(ac);
  5381. }
  5382. }
  5383. ext4_mb_put_pa(ac, ac->ac_sb, pa);
  5384. }
  5385. if (ac->ac_bitmap_folio)
  5386. folio_put(ac->ac_bitmap_folio);
  5387. if (ac->ac_buddy_folio)
  5388. folio_put(ac->ac_buddy_folio);
  5389. if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
  5390. mutex_unlock(&ac->ac_lg->lg_mutex);
  5391. ext4_mb_collect_stats(ac);
  5392. }
  5393. static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
  5394. {
  5395. ext4_group_t i, ngroups = ext4_get_groups_count(sb);
  5396. int ret;
  5397. int freed = 0, busy = 0;
  5398. int retry = 0;
  5399. trace_ext4_mb_discard_preallocations(sb, needed);
  5400. if (needed == 0)
  5401. needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
  5402. repeat:
  5403. for (i = 0; i < ngroups && needed > 0; i++) {
  5404. ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
  5405. freed += ret;
  5406. needed -= ret;
  5407. cond_resched();
  5408. }
  5409. if (needed > 0 && busy && ++retry < 3) {
  5410. busy = 0;
  5411. goto repeat;
  5412. }
  5413. return freed;
  5414. }
  5415. static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
  5416. struct ext4_allocation_context *ac, u64 *seq)
  5417. {
  5418. int freed;
  5419. u64 seq_retry = 0;
  5420. bool ret = false;
  5421. freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
  5422. if (freed) {
  5423. ret = true;
  5424. goto out_dbg;
  5425. }
  5426. seq_retry = ext4_get_discard_pa_seq_sum();
  5427. if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
  5428. ac->ac_flags |= EXT4_MB_STRICT_CHECK;
  5429. *seq = seq_retry;
  5430. ret = true;
  5431. }
  5432. out_dbg:
  5433. mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret));
  5434. return ret;
  5435. }
  5436. /*
  5437. * Simple allocator for Ext4 fast commit replay path. It searches for blocks
  5438. * linearly starting at the goal block and also excludes the blocks which
  5439. * are going to be in use after fast commit replay.
  5440. */
  5441. static ext4_fsblk_t
  5442. ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
  5443. {
  5444. struct buffer_head *bitmap_bh;
  5445. struct super_block *sb = ar->inode->i_sb;
  5446. struct ext4_sb_info *sbi = EXT4_SB(sb);
  5447. ext4_group_t group, nr;
  5448. ext4_grpblk_t blkoff;
  5449. ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
  5450. ext4_grpblk_t i = 0;
  5451. ext4_fsblk_t goal, block;
  5452. struct ext4_super_block *es = sbi->s_es;
  5453. goal = ar->goal;
  5454. if (goal < le32_to_cpu(es->s_first_data_block) ||
  5455. goal >= ext4_blocks_count(es))
  5456. goal = le32_to_cpu(es->s_first_data_block);
  5457. ar->len = 0;
  5458. ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
  5459. for (nr = ext4_get_groups_count(sb); nr > 0; nr--) {
  5460. bitmap_bh = ext4_read_block_bitmap(sb, group);
  5461. if (IS_ERR(bitmap_bh)) {
  5462. *errp = PTR_ERR(bitmap_bh);
  5463. pr_warn("Failed to read block bitmap\n");
  5464. return 0;
  5465. }
  5466. while (1) {
  5467. i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
  5468. blkoff);
  5469. if (i >= max)
  5470. break;
  5471. if (ext4_fc_replay_check_excluded(sb,
  5472. ext4_group_first_block_no(sb, group) +
  5473. EXT4_C2B(sbi, i))) {
  5474. blkoff = i + 1;
  5475. } else
  5476. break;
  5477. }
  5478. brelse(bitmap_bh);
  5479. if (i < max)
  5480. break;
  5481. if (++group >= ext4_get_groups_count(sb))
  5482. group = 0;
  5483. blkoff = 0;
  5484. }
  5485. if (i >= max) {
  5486. *errp = -ENOSPC;
  5487. return 0;
  5488. }
  5489. block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
  5490. ext4_mb_mark_bb(sb, block, 1, true);
  5491. ar->len = 1;
  5492. *errp = 0;
  5493. return block;
  5494. }
  5495. /*
  5496. * Main entry point into mballoc to allocate blocks
  5497. * it tries to use preallocation first, then falls back
  5498. * to usual allocation
  5499. */
  5500. ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
  5501. struct ext4_allocation_request *ar, int *errp)
  5502. {
  5503. struct ext4_allocation_context *ac = NULL;
  5504. struct ext4_sb_info *sbi;
  5505. struct super_block *sb;
  5506. ext4_fsblk_t block = 0;
  5507. unsigned int inquota = 0;
  5508. unsigned int reserv_clstrs = 0;
  5509. int retries = 0;
  5510. u64 seq;
  5511. might_sleep();
  5512. sb = ar->inode->i_sb;
  5513. sbi = EXT4_SB(sb);
  5514. trace_ext4_request_blocks(ar);
  5515. if (sbi->s_mount_state & EXT4_FC_REPLAY)
  5516. return ext4_mb_new_blocks_simple(ar, errp);
  5517. /* Allow to use superuser reservation for quota file */
  5518. if (ext4_is_quota_file(ar->inode))
  5519. ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
  5520. if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
  5521. /* Without delayed allocation we need to verify
  5522. * there is enough free blocks to do block allocation
  5523. * and verify allocation doesn't exceed the quota limits.
  5524. */
  5525. while (ar->len &&
  5526. ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
  5527. /* let others to free the space */
  5528. cond_resched();
  5529. ar->len = ar->len >> 1;
  5530. }
  5531. if (!ar->len) {
  5532. ext4_mb_show_pa(sb);
  5533. *errp = -ENOSPC;
  5534. return 0;
  5535. }
  5536. reserv_clstrs = ar->len;
  5537. if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
  5538. dquot_alloc_block_nofail(ar->inode,
  5539. EXT4_C2B(sbi, ar->len));
  5540. } else {
  5541. while (ar->len &&
  5542. dquot_alloc_block(ar->inode,
  5543. EXT4_C2B(sbi, ar->len))) {
  5544. ar->flags |= EXT4_MB_HINT_NOPREALLOC;
  5545. ar->len--;
  5546. }
  5547. }
  5548. inquota = ar->len;
  5549. if (ar->len == 0) {
  5550. *errp = -EDQUOT;
  5551. goto out;
  5552. }
  5553. }
  5554. ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
  5555. if (!ac) {
  5556. ar->len = 0;
  5557. *errp = -ENOMEM;
  5558. goto out;
  5559. }
  5560. ext4_mb_initialize_context(ac, ar);
  5561. ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
  5562. seq = this_cpu_read(discard_pa_seq);
  5563. if (!ext4_mb_use_preallocated(ac)) {
  5564. ac->ac_op = EXT4_MB_HISTORY_ALLOC;
  5565. ext4_mb_normalize_request(ac, ar);
  5566. *errp = ext4_mb_pa_alloc(ac);
  5567. if (*errp)
  5568. goto errout;
  5569. repeat:
  5570. /* allocate space in core */
  5571. *errp = ext4_mb_regular_allocator(ac);
  5572. /*
  5573. * pa allocated above is added to grp->bb_prealloc_list only
  5574. * when we were able to allocate some block i.e. when
  5575. * ac->ac_status == AC_STATUS_FOUND.
  5576. * And error from above mean ac->ac_status != AC_STATUS_FOUND
  5577. * So we have to free this pa here itself.
  5578. */
  5579. if (*errp) {
  5580. ext4_mb_pa_put_free(ac);
  5581. ext4_discard_allocated_blocks(ac);
  5582. goto errout;
  5583. }
  5584. if (ac->ac_status == AC_STATUS_FOUND &&
  5585. ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
  5586. ext4_mb_pa_put_free(ac);
  5587. }
  5588. if (likely(ac->ac_status == AC_STATUS_FOUND)) {
  5589. *errp = ext4_mb_mark_diskspace_used(ac, handle);
  5590. if (*errp) {
  5591. ext4_discard_allocated_blocks(ac);
  5592. goto errout;
  5593. } else {
  5594. block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
  5595. ar->len = ac->ac_b_ex.fe_len;
  5596. }
  5597. } else {
  5598. if (++retries < 3 &&
  5599. ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
  5600. goto repeat;
  5601. /*
  5602. * If block allocation fails then the pa allocated above
  5603. * needs to be freed here itself.
  5604. */
  5605. ext4_mb_pa_put_free(ac);
  5606. *errp = -ENOSPC;
  5607. }
  5608. if (*errp) {
  5609. errout:
  5610. ac->ac_b_ex.fe_len = 0;
  5611. ar->len = 0;
  5612. ext4_mb_show_ac(ac);
  5613. }
  5614. ext4_mb_release_context(ac);
  5615. kmem_cache_free(ext4_ac_cachep, ac);
  5616. out:
  5617. if (inquota && ar->len < inquota)
  5618. dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
  5619. /* release any reserved blocks */
  5620. if (reserv_clstrs)
  5621. percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs);
  5622. trace_ext4_allocate_blocks(ar, (unsigned long long)block);
  5623. return block;
  5624. }
  5625. /*
  5626. * We can merge two free data extents only if the physical blocks
  5627. * are contiguous, AND the extents were freed by the same transaction,
  5628. * AND the blocks are associated with the same group.
  5629. */
  5630. static inline bool
  5631. ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1,
  5632. struct ext4_free_data *entry2)
  5633. {
  5634. if (entry1->efd_tid != entry2->efd_tid)
  5635. return false;
  5636. if (entry1->efd_start_cluster + entry1->efd_count !=
  5637. entry2->efd_start_cluster)
  5638. return false;
  5639. if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group))
  5640. return false;
  5641. return true;
  5642. }
  5643. static inline void
  5644. ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root,
  5645. struct ext4_free_data *entry1,
  5646. struct ext4_free_data *entry2)
  5647. {
  5648. entry1->efd_count += entry2->efd_count;
  5649. spin_lock(&sbi->s_md_lock);
  5650. list_del(&entry2->efd_list);
  5651. spin_unlock(&sbi->s_md_lock);
  5652. rb_erase(&entry2->efd_node, root);
  5653. kmem_cache_free(ext4_free_data_cachep, entry2);
  5654. }
  5655. static inline void
  5656. ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root,
  5657. struct ext4_free_data *entry)
  5658. {
  5659. struct ext4_free_data *prev;
  5660. struct rb_node *node;
  5661. node = rb_prev(&entry->efd_node);
  5662. if (!node)
  5663. return;
  5664. prev = rb_entry(node, struct ext4_free_data, efd_node);
  5665. if (ext4_freed_extents_can_be_merged(prev, entry))
  5666. ext4_merge_freed_extents(sbi, root, prev, entry);
  5667. }
  5668. static inline void
  5669. ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root,
  5670. struct ext4_free_data *entry)
  5671. {
  5672. struct ext4_free_data *next;
  5673. struct rb_node *node;
  5674. node = rb_next(&entry->efd_node);
  5675. if (!node)
  5676. return;
  5677. next = rb_entry(node, struct ext4_free_data, efd_node);
  5678. if (ext4_freed_extents_can_be_merged(entry, next))
  5679. ext4_merge_freed_extents(sbi, root, entry, next);
  5680. }
  5681. static noinline_for_stack void
  5682. ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
  5683. struct ext4_free_data *new_entry)
  5684. {
  5685. ext4_group_t group = e4b->bd_group;
  5686. ext4_grpblk_t cluster;
  5687. ext4_grpblk_t clusters = new_entry->efd_count;
  5688. struct ext4_free_data *entry = NULL;
  5689. struct ext4_group_info *db = e4b->bd_info;
  5690. struct super_block *sb = e4b->bd_sb;
  5691. struct ext4_sb_info *sbi = EXT4_SB(sb);
  5692. struct rb_root *root = &db->bb_free_root;
  5693. struct rb_node **n = &root->rb_node;
  5694. struct rb_node *parent = NULL, *new_node;
  5695. BUG_ON(!ext4_handle_valid(handle));
  5696. BUG_ON(e4b->bd_bitmap_folio == NULL);
  5697. BUG_ON(e4b->bd_buddy_folio == NULL);
  5698. new_node = &new_entry->efd_node;
  5699. cluster = new_entry->efd_start_cluster;
  5700. if (!*n) {
  5701. /* first free block exent. We need to
  5702. protect buddy cache from being freed,
  5703. * otherwise we'll refresh it from
  5704. * on-disk bitmap and lose not-yet-available
  5705. * blocks */
  5706. folio_get(e4b->bd_buddy_folio);
  5707. folio_get(e4b->bd_bitmap_folio);
  5708. }
  5709. while (*n) {
  5710. parent = *n;
  5711. entry = rb_entry(parent, struct ext4_free_data, efd_node);
  5712. if (cluster < entry->efd_start_cluster)
  5713. n = &(*n)->rb_left;
  5714. else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
  5715. n = &(*n)->rb_right;
  5716. else {
  5717. ext4_grp_locked_error(sb, group, 0,
  5718. ext4_group_first_block_no(sb, group) +
  5719. EXT4_C2B(sbi, cluster),
  5720. "Block already on to-be-freed list");
  5721. kmem_cache_free(ext4_free_data_cachep, new_entry);
  5722. return;
  5723. }
  5724. }
  5725. atomic_add(clusters, &sbi->s_mb_free_pending);
  5726. if (!entry)
  5727. goto insert;
  5728. /* Now try to see the extent can be merged to prev and next */
  5729. if (ext4_freed_extents_can_be_merged(new_entry, entry)) {
  5730. entry->efd_start_cluster = cluster;
  5731. entry->efd_count += new_entry->efd_count;
  5732. kmem_cache_free(ext4_free_data_cachep, new_entry);
  5733. ext4_try_merge_freed_extent_prev(sbi, root, entry);
  5734. return;
  5735. }
  5736. if (ext4_freed_extents_can_be_merged(entry, new_entry)) {
  5737. entry->efd_count += new_entry->efd_count;
  5738. kmem_cache_free(ext4_free_data_cachep, new_entry);
  5739. ext4_try_merge_freed_extent_next(sbi, root, entry);
  5740. return;
  5741. }
  5742. insert:
  5743. rb_link_node(new_node, parent, n);
  5744. rb_insert_color(new_node, root);
  5745. spin_lock(&sbi->s_md_lock);
  5746. list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
  5747. spin_unlock(&sbi->s_md_lock);
  5748. }
  5749. static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
  5750. unsigned long count)
  5751. {
  5752. struct super_block *sb = inode->i_sb;
  5753. ext4_group_t group;
  5754. ext4_grpblk_t blkoff;
  5755. ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
  5756. ext4_mb_mark_context(NULL, sb, false, group, blkoff, count,
  5757. EXT4_MB_BITMAP_MARKED_CHECK |
  5758. EXT4_MB_SYNC_UPDATE,
  5759. NULL);
  5760. }
  5761. /**
  5762. * ext4_mb_clear_bb() -- helper function for freeing blocks.
  5763. * Used by ext4_free_blocks()
  5764. * @handle: handle for this transaction
  5765. * @inode: inode
  5766. * @block: starting physical block to be freed
  5767. * @count: number of blocks to be freed
  5768. * @flags: flags used by ext4_free_blocks
  5769. */
  5770. static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
  5771. ext4_fsblk_t block, unsigned long count,
  5772. int flags)
  5773. {
  5774. struct super_block *sb = inode->i_sb;
  5775. struct ext4_group_info *grp;
  5776. unsigned int overflow;
  5777. ext4_grpblk_t bit;
  5778. ext4_group_t block_group;
  5779. struct ext4_sb_info *sbi;
  5780. struct ext4_buddy e4b;
  5781. unsigned int count_clusters;
  5782. int err = 0;
  5783. int mark_flags = 0;
  5784. ext4_grpblk_t changed;
  5785. sbi = EXT4_SB(sb);
  5786. if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
  5787. !ext4_inode_block_valid(inode, block, count)) {
  5788. ext4_error(sb, "Freeing blocks in system zone - "
  5789. "Block = %llu, count = %lu", block, count);
  5790. /* err = 0. ext4_std_error should be a no op */
  5791. goto error_out;
  5792. }
  5793. flags |= EXT4_FREE_BLOCKS_VALIDATED;
  5794. do_more:
  5795. overflow = 0;
  5796. ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
  5797. grp = ext4_get_group_info(sb, block_group);
  5798. if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
  5799. return;
  5800. /*
  5801. * Check to see if we are freeing blocks across a group
  5802. * boundary.
  5803. */
  5804. if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
  5805. overflow = EXT4_C2B(sbi, bit) + count -
  5806. EXT4_BLOCKS_PER_GROUP(sb);
  5807. count -= overflow;
  5808. /* The range changed so it's no longer validated */
  5809. flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
  5810. }
  5811. count_clusters = EXT4_NUM_B2C(sbi, count);
  5812. trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
  5813. /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
  5814. err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
  5815. GFP_NOFS|__GFP_NOFAIL);
  5816. if (err)
  5817. goto error_out;
  5818. if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
  5819. !ext4_inode_block_valid(inode, block, count)) {
  5820. ext4_error(sb, "Freeing blocks in system zone - "
  5821. "Block = %llu, count = %lu", block, count);
  5822. /* err = 0. ext4_std_error should be a no op */
  5823. goto error_clean;
  5824. }
  5825. #ifdef AGGRESSIVE_CHECK
  5826. mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK;
  5827. #endif
  5828. err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
  5829. count_clusters, mark_flags, &changed);
  5830. if (err && changed == 0)
  5831. goto error_clean;
  5832. #ifdef AGGRESSIVE_CHECK
  5833. BUG_ON(changed != count_clusters);
  5834. #endif
  5835. /*
  5836. * We need to make sure we don't reuse the freed block until after the
  5837. * transaction is committed. We make an exception if the inode is to be
  5838. * written in writeback mode since writeback mode has weak data
  5839. * consistency guarantees.
  5840. */
  5841. if (ext4_handle_valid(handle) &&
  5842. ((flags & EXT4_FREE_BLOCKS_METADATA) ||
  5843. !ext4_should_writeback_data(inode))) {
  5844. struct ext4_free_data *new_entry;
  5845. /*
  5846. * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
  5847. * to fail.
  5848. */
  5849. new_entry = kmem_cache_alloc(ext4_free_data_cachep,
  5850. GFP_NOFS|__GFP_NOFAIL);
  5851. new_entry->efd_start_cluster = bit;
  5852. new_entry->efd_group = block_group;
  5853. new_entry->efd_count = count_clusters;
  5854. new_entry->efd_tid = handle->h_transaction->t_tid;
  5855. ext4_lock_group(sb, block_group);
  5856. ext4_mb_free_metadata(handle, &e4b, new_entry);
  5857. } else {
  5858. if (test_opt(sb, DISCARD)) {
  5859. err = ext4_issue_discard(sb, block_group, bit,
  5860. count_clusters);
  5861. /*
  5862. * Ignore EOPNOTSUPP error. This is consistent with
  5863. * what happens when using journal.
  5864. */
  5865. if (err == -EOPNOTSUPP)
  5866. err = 0;
  5867. if (err)
  5868. ext4_msg(sb, KERN_WARNING, "discard request in"
  5869. " group:%u block:%d count:%lu failed"
  5870. " with %d", block_group, bit, count,
  5871. err);
  5872. }
  5873. EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
  5874. ext4_lock_group(sb, block_group);
  5875. mb_free_blocks(inode, &e4b, bit, count_clusters);
  5876. }
  5877. ext4_unlock_group(sb, block_group);
  5878. /*
  5879. * on a bigalloc file system, defer the s_freeclusters_counter
  5880. * update to the caller (ext4_remove_space and friends) so they
  5881. * can determine if a cluster freed here should be rereserved
  5882. */
  5883. if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
  5884. if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
  5885. dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
  5886. percpu_counter_add(&sbi->s_freeclusters_counter,
  5887. count_clusters);
  5888. }
  5889. if (overflow && !err) {
  5890. block += count;
  5891. count = overflow;
  5892. ext4_mb_unload_buddy(&e4b);
  5893. /* The range changed so it's no longer validated */
  5894. flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
  5895. goto do_more;
  5896. }
  5897. error_clean:
  5898. ext4_mb_unload_buddy(&e4b);
  5899. error_out:
  5900. ext4_std_error(sb, err);
  5901. }
  5902. /**
  5903. * ext4_free_blocks() -- Free given blocks and update quota
  5904. * @handle: handle for this transaction
  5905. * @inode: inode
  5906. * @bh: optional buffer of the block to be freed
  5907. * @block: starting physical block to be freed
  5908. * @count: number of blocks to be freed
  5909. * @flags: flags used by ext4_free_blocks
  5910. */
  5911. void ext4_free_blocks(handle_t *handle, struct inode *inode,
  5912. struct buffer_head *bh, ext4_fsblk_t block,
  5913. unsigned long count, int flags)
  5914. {
  5915. struct super_block *sb = inode->i_sb;
  5916. unsigned int overflow;
  5917. struct ext4_sb_info *sbi;
  5918. sbi = EXT4_SB(sb);
  5919. if (bh) {
  5920. if (block)
  5921. BUG_ON(block != bh->b_blocknr);
  5922. else
  5923. block = bh->b_blocknr;
  5924. }
  5925. if (sbi->s_mount_state & EXT4_FC_REPLAY) {
  5926. ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
  5927. return;
  5928. }
  5929. might_sleep();
  5930. if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
  5931. !ext4_inode_block_valid(inode, block, count)) {
  5932. ext4_error(sb, "Freeing blocks not in datazone - "
  5933. "block = %llu, count = %lu", block, count);
  5934. return;
  5935. }
  5936. flags |= EXT4_FREE_BLOCKS_VALIDATED;
  5937. ext4_debug("freeing block %llu\n", block);
  5938. trace_ext4_free_blocks(inode, block, count, flags);
  5939. if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
  5940. BUG_ON(count > 1);
  5941. ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
  5942. inode, bh, block);
  5943. }
  5944. /*
  5945. * If the extent to be freed does not begin on a cluster
  5946. * boundary, we need to deal with partial clusters at the
  5947. * beginning and end of the extent. Normally we will free
  5948. * blocks at the beginning or the end unless we are explicitly
  5949. * requested to avoid doing so.
  5950. */
  5951. overflow = EXT4_PBLK_COFF(sbi, block);
  5952. if (overflow) {
  5953. if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
  5954. overflow = sbi->s_cluster_ratio - overflow;
  5955. block += overflow;
  5956. if (count > overflow)
  5957. count -= overflow;
  5958. else
  5959. return;
  5960. } else {
  5961. block -= overflow;
  5962. count += overflow;
  5963. }
  5964. /* The range changed so it's no longer validated */
  5965. flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
  5966. }
  5967. overflow = EXT4_LBLK_COFF(sbi, count);
  5968. if (overflow) {
  5969. if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
  5970. if (count > overflow)
  5971. count -= overflow;
  5972. else
  5973. return;
  5974. } else
  5975. count += sbi->s_cluster_ratio - overflow;
  5976. /* The range changed so it's no longer validated */
  5977. flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
  5978. }
  5979. if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
  5980. int i;
  5981. int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
  5982. for (i = 0; i < count; i++) {
  5983. cond_resched();
  5984. if (is_metadata)
  5985. bh = sb_find_get_block_nonatomic(inode->i_sb,
  5986. block + i);
  5987. ext4_forget(handle, is_metadata, inode, bh, block + i);
  5988. }
  5989. }
  5990. ext4_mb_clear_bb(handle, inode, block, count, flags);
  5991. }
  5992. /**
  5993. * ext4_group_add_blocks() -- Add given blocks to an existing group
  5994. * @handle: handle to this transaction
  5995. * @sb: super block
  5996. * @block: start physical block to add to the block group
  5997. * @count: number of blocks to free
  5998. *
  5999. * This marks the blocks as free in the bitmap and buddy.
  6000. */
  6001. int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
  6002. ext4_fsblk_t block, unsigned long count)
  6003. {
  6004. ext4_group_t block_group;
  6005. ext4_grpblk_t bit;
  6006. struct ext4_sb_info *sbi = EXT4_SB(sb);
  6007. struct ext4_buddy e4b;
  6008. int err = 0;
  6009. ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
  6010. ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
  6011. unsigned long cluster_count = last_cluster - first_cluster + 1;
  6012. ext4_grpblk_t changed;
  6013. ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
  6014. if (cluster_count == 0)
  6015. return 0;
  6016. ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
  6017. /*
  6018. * Check to see if we are freeing blocks across a group
  6019. * boundary.
  6020. */
  6021. if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
  6022. ext4_warning(sb, "too many blocks added to group %u",
  6023. block_group);
  6024. err = -EINVAL;
  6025. goto error_out;
  6026. }
  6027. err = ext4_mb_load_buddy(sb, block_group, &e4b);
  6028. if (err)
  6029. goto error_out;
  6030. if (!ext4_sb_block_valid(sb, NULL, block, count)) {
  6031. ext4_error(sb, "Adding blocks in system zones - "
  6032. "Block = %llu, count = %lu",
  6033. block, count);
  6034. err = -EINVAL;
  6035. goto error_clean;
  6036. }
  6037. err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
  6038. cluster_count, EXT4_MB_BITMAP_MARKED_CHECK,
  6039. &changed);
  6040. if (err && changed == 0)
  6041. goto error_clean;
  6042. if (changed != cluster_count)
  6043. ext4_error(sb, "bit already cleared in group %u", block_group);
  6044. ext4_lock_group(sb, block_group);
  6045. mb_free_blocks(NULL, &e4b, bit, cluster_count);
  6046. ext4_unlock_group(sb, block_group);
  6047. percpu_counter_add(&sbi->s_freeclusters_counter,
  6048. changed);
  6049. error_clean:
  6050. ext4_mb_unload_buddy(&e4b);
  6051. error_out:
  6052. ext4_std_error(sb, err);
  6053. return err;
  6054. }
  6055. /**
  6056. * ext4_trim_extent -- function to TRIM one single free extent in the group
  6057. * @sb: super block for the file system
  6058. * @start: starting block of the free extent in the alloc. group
  6059. * @count: number of blocks to TRIM
  6060. * @e4b: ext4 buddy for the group
  6061. *
  6062. * Trim "count" blocks starting at "start" in the "group". To assure that no
  6063. * one will allocate those blocks, mark it as used in buddy bitmap. This must
  6064. * be called with under the group lock.
  6065. */
  6066. static int ext4_trim_extent(struct super_block *sb,
  6067. int start, int count, struct ext4_buddy *e4b)
  6068. __releases(bitlock)
  6069. __acquires(bitlock)
  6070. {
  6071. struct ext4_free_extent ex;
  6072. ext4_group_t group = e4b->bd_group;
  6073. int ret = 0;
  6074. trace_ext4_trim_extent(sb, group, start, count);
  6075. assert_spin_locked(ext4_group_lock_ptr(sb, group));
  6076. ex.fe_start = start;
  6077. ex.fe_group = group;
  6078. ex.fe_len = count;
  6079. /*
  6080. * Mark blocks used, so no one can reuse them while
  6081. * being trimmed.
  6082. */
  6083. mb_mark_used(e4b, &ex);
  6084. ext4_unlock_group(sb, group);
  6085. ret = ext4_issue_discard(sb, group, start, count);
  6086. ext4_lock_group(sb, group);
  6087. mb_free_blocks(NULL, e4b, start, ex.fe_len);
  6088. return ret;
  6089. }
  6090. static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
  6091. ext4_group_t grp)
  6092. {
  6093. unsigned long nr_clusters_in_group;
  6094. if (grp < (ext4_get_groups_count(sb) - 1))
  6095. nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb);
  6096. else
  6097. nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
  6098. ext4_group_first_block_no(sb, grp))
  6099. >> EXT4_CLUSTER_BITS(sb);
  6100. return nr_clusters_in_group - 1;
  6101. }
  6102. static bool ext4_trim_interrupted(void)
  6103. {
  6104. return fatal_signal_pending(current) || freezing(current);
  6105. }
  6106. static int ext4_try_to_trim_range(struct super_block *sb,
  6107. struct ext4_buddy *e4b, ext4_grpblk_t start,
  6108. ext4_grpblk_t max, ext4_grpblk_t minblocks)
  6109. __acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
  6110. __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
  6111. {
  6112. ext4_grpblk_t next, count, free_count, last, origin_start;
  6113. bool set_trimmed = false;
  6114. void *bitmap;
  6115. if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
  6116. return 0;
  6117. last = ext4_last_grp_cluster(sb, e4b->bd_group);
  6118. bitmap = e4b->bd_bitmap;
  6119. if (start == 0 && max >= last)
  6120. set_trimmed = true;
  6121. origin_start = start;
  6122. start = max(e4b->bd_info->bb_first_free, start);
  6123. count = 0;
  6124. free_count = 0;
  6125. while (start <= max) {
  6126. start = mb_find_next_zero_bit(bitmap, max + 1, start);
  6127. if (start > max)
  6128. break;
  6129. next = mb_find_next_bit(bitmap, last + 1, start);
  6130. if (origin_start == 0 && next >= last)
  6131. set_trimmed = true;
  6132. if ((next - start) >= minblocks) {
  6133. int ret = ext4_trim_extent(sb, start, next - start, e4b);
  6134. if (ret && ret != -EOPNOTSUPP)
  6135. return count;
  6136. count += next - start;
  6137. }
  6138. free_count += next - start;
  6139. start = next + 1;
  6140. if (ext4_trim_interrupted())
  6141. return count;
  6142. if (need_resched()) {
  6143. ext4_unlock_group(sb, e4b->bd_group);
  6144. cond_resched();
  6145. ext4_lock_group(sb, e4b->bd_group);
  6146. }
  6147. if ((e4b->bd_info->bb_free - free_count) < minblocks)
  6148. break;
  6149. }
  6150. if (set_trimmed)
  6151. EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
  6152. return count;
  6153. }
  6154. /**
  6155. * ext4_trim_all_free -- function to trim all free space in alloc. group
  6156. * @sb: super block for file system
  6157. * @group: group to be trimmed
  6158. * @start: first group block to examine
  6159. * @max: last group block to examine
  6160. * @minblocks: minimum extent block count
  6161. *
  6162. * ext4_trim_all_free walks through group's block bitmap searching for free
  6163. * extents. When the free extent is found, mark it as used in group buddy
  6164. * bitmap. Then issue a TRIM command on this extent and free the extent in
  6165. * the group buddy bitmap.
  6166. */
  6167. static ext4_grpblk_t
  6168. ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
  6169. ext4_grpblk_t start, ext4_grpblk_t max,
  6170. ext4_grpblk_t minblocks)
  6171. {
  6172. struct ext4_buddy e4b;
  6173. int ret;
  6174. trace_ext4_trim_all_free(sb, group, start, max);
  6175. ret = ext4_mb_load_buddy(sb, group, &e4b);
  6176. if (ret) {
  6177. ext4_warning(sb, "Error %d loading buddy information for %u",
  6178. ret, group);
  6179. return ret;
  6180. }
  6181. ext4_lock_group(sb, group);
  6182. if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
  6183. minblocks < EXT4_SB(sb)->s_last_trim_minblks)
  6184. ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
  6185. else
  6186. ret = 0;
  6187. ext4_unlock_group(sb, group);
  6188. ext4_mb_unload_buddy(&e4b);
  6189. ext4_debug("trimmed %d blocks in the group %d\n",
  6190. ret, group);
  6191. return ret;
  6192. }
  6193. /**
  6194. * ext4_trim_fs() -- trim ioctl handle function
  6195. * @sb: superblock for filesystem
  6196. * @range: fstrim_range structure
  6197. *
  6198. * start: First Byte to trim
  6199. * len: number of Bytes to trim from start
  6200. * minlen: minimum extent length in Bytes
  6201. * ext4_trim_fs goes through all allocation groups containing Bytes from
  6202. * start to start+len. For each such a group ext4_trim_all_free function
  6203. * is invoked to trim all free space.
  6204. */
  6205. int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
  6206. {
  6207. unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
  6208. struct ext4_group_info *grp;
  6209. ext4_group_t group, first_group, last_group;
  6210. ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
  6211. uint64_t start, end, minlen, trimmed = 0;
  6212. ext4_fsblk_t first_data_blk =
  6213. le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
  6214. ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
  6215. int ret = 0;
  6216. start = range->start >> sb->s_blocksize_bits;
  6217. end = start + (range->len >> sb->s_blocksize_bits) - 1;
  6218. minlen = EXT4_NUM_B2C(EXT4_SB(sb),
  6219. range->minlen >> sb->s_blocksize_bits);
  6220. if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
  6221. start >= max_blks ||
  6222. range->len < sb->s_blocksize)
  6223. return -EINVAL;
  6224. /* No point to try to trim less than discard granularity */
  6225. if (range->minlen < discard_granularity) {
  6226. minlen = EXT4_NUM_B2C(EXT4_SB(sb),
  6227. discard_granularity >> sb->s_blocksize_bits);
  6228. if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
  6229. goto out;
  6230. }
  6231. if (end >= max_blks - 1)
  6232. end = max_blks - 1;
  6233. if (end <= first_data_blk)
  6234. goto out;
  6235. if (start < first_data_blk)
  6236. start = first_data_blk;
  6237. /* Determine first and last group to examine based on start and end */
  6238. ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
  6239. &first_group, &first_cluster);
  6240. ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
  6241. &last_group, &last_cluster);
  6242. /* end now represents the last cluster to discard in this group */
  6243. end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
  6244. for (group = first_group; group <= last_group; group++) {
  6245. if (ext4_trim_interrupted())
  6246. break;
  6247. grp = ext4_get_group_info(sb, group);
  6248. if (!grp)
  6249. continue;
  6250. /* We only do this if the grp has never been initialized */
  6251. if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
  6252. ret = ext4_mb_init_group(sb, group, GFP_NOFS);
  6253. if (ret)
  6254. break;
  6255. }
  6256. /*
  6257. * For all the groups except the last one, last cluster will
  6258. * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
  6259. * change it for the last group, note that last_cluster is
  6260. * already computed earlier by ext4_get_group_no_and_offset()
  6261. */
  6262. if (group == last_group)
  6263. end = last_cluster;
  6264. if (grp->bb_free >= minlen) {
  6265. cnt = ext4_trim_all_free(sb, group, first_cluster,
  6266. end, minlen);
  6267. if (cnt < 0) {
  6268. ret = cnt;
  6269. break;
  6270. }
  6271. trimmed += cnt;
  6272. }
  6273. /*
  6274. * For every group except the first one, we are sure
  6275. * that the first cluster to discard will be cluster #0.
  6276. */
  6277. first_cluster = 0;
  6278. }
  6279. if (!ret)
  6280. EXT4_SB(sb)->s_last_trim_minblks = minlen;
  6281. out:
  6282. range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
  6283. return ret;
  6284. }
  6285. /* Iterate all the free extents in the group. */
  6286. int
  6287. ext4_mballoc_query_range(
  6288. struct super_block *sb,
  6289. ext4_group_t group,
  6290. ext4_grpblk_t first,
  6291. ext4_grpblk_t end,
  6292. ext4_mballoc_query_range_fn meta_formatter,
  6293. ext4_mballoc_query_range_fn formatter,
  6294. void *priv)
  6295. {
  6296. void *bitmap;
  6297. ext4_grpblk_t start, next;
  6298. struct ext4_buddy e4b;
  6299. int error;
  6300. error = ext4_mb_load_buddy(sb, group, &e4b);
  6301. if (error)
  6302. return error;
  6303. bitmap = e4b.bd_bitmap;
  6304. ext4_lock_group(sb, group);
  6305. start = max(e4b.bd_info->bb_first_free, first);
  6306. if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
  6307. end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
  6308. if (meta_formatter && start != first) {
  6309. if (start > end)
  6310. start = end;
  6311. ext4_unlock_group(sb, group);
  6312. error = meta_formatter(sb, group, first, start - first,
  6313. priv);
  6314. if (error)
  6315. goto out_unload;
  6316. ext4_lock_group(sb, group);
  6317. }
  6318. while (start <= end) {
  6319. start = mb_find_next_zero_bit(bitmap, end + 1, start);
  6320. if (start > end)
  6321. break;
  6322. next = mb_find_next_bit(bitmap, end + 1, start);
  6323. ext4_unlock_group(sb, group);
  6324. error = formatter(sb, group, start, next - start, priv);
  6325. if (error)
  6326. goto out_unload;
  6327. ext4_lock_group(sb, group);
  6328. start = next + 1;
  6329. }
  6330. ext4_unlock_group(sb, group);
  6331. out_unload:
  6332. ext4_mb_unload_buddy(&e4b);
  6333. return error;
  6334. }
  6335. #if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
  6336. void mb_clear_bits_test(void *bm, int cur, int len)
  6337. {
  6338. mb_clear_bits(bm, cur, len);
  6339. }
  6340. EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bits_test);
  6341. ext4_fsblk_t
  6342. ext4_mb_new_blocks_simple_test(struct ext4_allocation_request *ar,
  6343. int *errp)
  6344. {
  6345. return ext4_mb_new_blocks_simple(ar, errp);
  6346. }
  6347. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_new_blocks_simple_test);
  6348. int mb_find_next_zero_bit_test(void *addr, int max, int start)
  6349. {
  6350. return mb_find_next_zero_bit(addr, max, start);
  6351. }
  6352. EXPORT_SYMBOL_FOR_EXT4_TEST(mb_find_next_zero_bit_test);
  6353. int mb_find_next_bit_test(void *addr, int max, int start)
  6354. {
  6355. return mb_find_next_bit(addr, max, start);
  6356. }
  6357. EXPORT_SYMBOL_FOR_EXT4_TEST(mb_find_next_bit_test);
  6358. void mb_clear_bit_test(int bit, void *addr)
  6359. {
  6360. mb_clear_bit(bit, addr);
  6361. }
  6362. EXPORT_SYMBOL_FOR_EXT4_TEST(mb_clear_bit_test);
  6363. int mb_test_bit_test(int bit, void *addr)
  6364. {
  6365. return mb_test_bit(bit, addr);
  6366. }
  6367. EXPORT_SYMBOL_FOR_EXT4_TEST(mb_test_bit_test);
  6368. int ext4_mb_mark_diskspace_used_test(struct ext4_allocation_context *ac,
  6369. handle_t *handle)
  6370. {
  6371. return ext4_mb_mark_diskspace_used(ac, handle);
  6372. }
  6373. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_mark_diskspace_used_test);
  6374. int mb_mark_used_test(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
  6375. {
  6376. return mb_mark_used(e4b, ex);
  6377. }
  6378. EXPORT_SYMBOL_FOR_EXT4_TEST(mb_mark_used_test);
  6379. void ext4_mb_generate_buddy_test(struct super_block *sb, void *buddy,
  6380. void *bitmap, ext4_group_t group,
  6381. struct ext4_group_info *grp)
  6382. {
  6383. ext4_mb_generate_buddy(sb, buddy, bitmap, group, grp);
  6384. }
  6385. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_generate_buddy_test);
  6386. int ext4_mb_load_buddy_test(struct super_block *sb, ext4_group_t group,
  6387. struct ext4_buddy *e4b)
  6388. {
  6389. return ext4_mb_load_buddy(sb, group, e4b);
  6390. }
  6391. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_load_buddy_test);
  6392. void ext4_mb_unload_buddy_test(struct ext4_buddy *e4b)
  6393. {
  6394. ext4_mb_unload_buddy(e4b);
  6395. }
  6396. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_unload_buddy_test);
  6397. void mb_free_blocks_test(struct inode *inode, struct ext4_buddy *e4b,
  6398. int first, int count)
  6399. {
  6400. mb_free_blocks(inode, e4b, first, count);
  6401. }
  6402. EXPORT_SYMBOL_FOR_EXT4_TEST(mb_free_blocks_test);
  6403. void ext4_free_blocks_simple_test(struct inode *inode, ext4_fsblk_t block,
  6404. unsigned long count)
  6405. {
  6406. return ext4_free_blocks_simple(inode, block, count);
  6407. }
  6408. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_free_blocks_simple_test);
  6409. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_wait_block_bitmap);
  6410. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_init);
  6411. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_get_group_desc);
  6412. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_count_free_clusters);
  6413. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_get_group_info);
  6414. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_free_group_clusters_set);
  6415. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_release);
  6416. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_read_block_bitmap_nowait);
  6417. EXPORT_SYMBOL_FOR_EXT4_TEST(mb_set_bits);
  6418. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_fc_init_inode);
  6419. EXPORT_SYMBOL_FOR_EXT4_TEST(ext4_mb_mark_context);
  6420. #endif