slub.c 245 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
79278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977
797789779978097819782978397849785978697879788978997909791979297939794979597969797979897999800980198029803980498059806980798089809981098119812981398149815981698179818981998209821982298239824982598269827982898299830983198329833983498359836983798389839
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * SLUB: A slab allocator with low overhead percpu array caches and mostly
  4. * lockless freeing of objects to slabs in the slowpath.
  5. *
  6. * The allocator synchronizes using spin_trylock for percpu arrays in the
  7. * fastpath, and cmpxchg_double (or bit spinlock) for slowpath freeing.
  8. * Uses a centralized lock to manage a pool of partial slabs.
  9. *
  10. * (C) 2007 SGI, Christoph Lameter
  11. * (C) 2011 Linux Foundation, Christoph Lameter
  12. * (C) 2025 SUSE, Vlastimil Babka
  13. */
  14. #include <linux/mm.h>
  15. #include <linux/swap.h> /* mm_account_reclaimed_pages() */
  16. #include <linux/module.h>
  17. #include <linux/bit_spinlock.h>
  18. #include <linux/interrupt.h>
  19. #include <linux/swab.h>
  20. #include <linux/bitops.h>
  21. #include <linux/slab.h>
  22. #include "slab.h"
  23. #include <linux/vmalloc.h>
  24. #include <linux/proc_fs.h>
  25. #include <linux/seq_file.h>
  26. #include <linux/kasan.h>
  27. #include <linux/node.h>
  28. #include <linux/kmsan.h>
  29. #include <linux/cpu.h>
  30. #include <linux/cpuset.h>
  31. #include <linux/mempolicy.h>
  32. #include <linux/ctype.h>
  33. #include <linux/stackdepot.h>
  34. #include <linux/debugobjects.h>
  35. #include <linux/kallsyms.h>
  36. #include <linux/kfence.h>
  37. #include <linux/memory.h>
  38. #include <linux/math64.h>
  39. #include <linux/fault-inject.h>
  40. #include <linux/kmemleak.h>
  41. #include <linux/stacktrace.h>
  42. #include <linux/prefetch.h>
  43. #include <linux/memcontrol.h>
  44. #include <linux/random.h>
  45. #include <linux/prandom.h>
  46. #include <kunit/test.h>
  47. #include <kunit/test-bug.h>
  48. #include <linux/sort.h>
  49. #include <linux/irq_work.h>
  50. #include <linux/kprobes.h>
  51. #include <linux/debugfs.h>
  52. #include <trace/events/kmem.h>
  53. #include "internal.h"
  54. /*
  55. * Lock order:
  56. * 0. cpu_hotplug_lock
  57. * 1. slab_mutex (Global Mutex)
  58. * 2a. kmem_cache->cpu_sheaves->lock (Local trylock)
  59. * 2b. node->barn->lock (Spinlock)
  60. * 2c. node->list_lock (Spinlock)
  61. * 3. slab_lock(slab) (Only on some arches)
  62. * 4. object_map_lock (Only for debugging)
  63. *
  64. * slab_mutex
  65. *
  66. * The role of the slab_mutex is to protect the list of all the slabs
  67. * and to synchronize major metadata changes to slab cache structures.
  68. * Also synchronizes memory hotplug callbacks.
  69. *
  70. * slab_lock
  71. *
  72. * The slab_lock is a wrapper around the page lock, thus it is a bit
  73. * spinlock.
  74. *
  75. * The slab_lock is only used on arches that do not have the ability
  76. * to do a cmpxchg_double. It only protects:
  77. *
  78. * A. slab->freelist -> List of free objects in a slab
  79. * B. slab->inuse -> Number of objects in use
  80. * C. slab->objects -> Number of objects in slab
  81. * D. slab->frozen -> frozen state
  82. *
  83. * SL_partial slabs
  84. *
  85. * Slabs on node partial list have at least one free object. A limited number
  86. * of slabs on the list can be fully free (slab->inuse == 0), until we start
  87. * discarding them. These slabs are marked with SL_partial, and the flag is
  88. * cleared while removing them, usually to grab their freelist afterwards.
  89. * This clearing also exempts them from list management. Please see
  90. * __slab_free() for more details.
  91. *
  92. * Full slabs
  93. *
  94. * For caches without debugging enabled, full slabs (slab->inuse ==
  95. * slab->objects and slab->freelist == NULL) are not placed on any list.
  96. * The __slab_free() freeing the first object from such a slab will place
  97. * it on the partial list. Caches with debugging enabled place such slab
  98. * on the full list and use different allocation and freeing paths.
  99. *
  100. * Frozen slabs
  101. *
  102. * If a slab is frozen then it is exempt from list management. It is used to
  103. * indicate a slab that has failed consistency checks and thus cannot be
  104. * allocated from anymore - it is also marked as full. Any previously
  105. * allocated objects will be simply leaked upon freeing instead of attempting
  106. * to modify the potentially corrupted freelist and metadata.
  107. *
  108. * To sum up, the current scheme is:
  109. * - node partial slab: SL_partial && !full && !frozen
  110. * - taken off partial list: !SL_partial && !full && !frozen
  111. * - full slab, not on any list: !SL_partial && full && !frozen
  112. * - frozen due to inconsistency: !SL_partial && full && frozen
  113. *
  114. * node->list_lock (spinlock)
  115. *
  116. * The list_lock protects the partial and full list on each node and
  117. * the partial slab counter. While it is held, no slabs may be added to or
  118. * removed from the lists, nor may the number of partial slabs be modified.
  119. * (Note that the total number of slabs is an atomic value that may be
  120. * modified without taking the list lock).
  121. *
  122. * The list_lock is a centralized lock and thus we avoid taking it as
  123. * much as possible. As long as SLUB does not have to handle partial
  124. * slabs, operations can continue without any centralized lock.
  125. *
  126. * For debug caches, all allocations are forced to go through a list_lock
  127. * protected region to serialize against concurrent validation.
  128. *
  129. * cpu_sheaves->lock (local_trylock)
  130. *
  131. * This lock protects fastpath operations on the percpu sheaves. On !RT it
  132. * only disables preemption and does no atomic operations. As long as the main
  133. * or spare sheaf can handle the allocation or free, there is no other
  134. * overhead.
  135. *
  136. * node->barn->lock (spinlock)
  137. *
  138. * This lock protects the operations on per-NUMA-node barn. It can quickly
  139. * serve an empty or full sheaf if available, and avoid more expensive refill
  140. * or flush operation.
  141. *
  142. * Lockless freeing
  143. *
  144. * Objects may have to be freed to their slabs when they are from a remote
  145. * node (where we want to avoid filling local sheaves with remote objects)
  146. * or when there are too many full sheaves. On architectures supporting
  147. * cmpxchg_double this is done by a lockless update of slab's freelist and
  148. * counters, otherwise slab_lock is taken. This only needs to take the
  149. * list_lock if it's a first free to a full slab, or when a slab becomes empty
  150. * after the free.
  151. *
  152. * irq, preemption, migration considerations
  153. *
  154. * Interrupts are disabled as part of list_lock or barn lock operations, or
  155. * around the slab_lock operation, in order to make the slab allocator safe
  156. * to use in the context of an irq.
  157. * Preemption is disabled as part of local_trylock operations.
  158. * kmalloc_nolock() and kfree_nolock() are safe in NMI context but see
  159. * their limitations.
  160. *
  161. * SLUB assigns two object arrays called sheaves for caching allocations and
  162. * frees on each cpu, with a NUMA node shared barn for balancing between cpus.
  163. * Allocations and frees are primarily served from these sheaves.
  164. *
  165. * Slabs with free elements are kept on a partial list and during regular
  166. * operations no list for full slabs is used. If an object in a full slab is
  167. * freed then the slab will show up again on the partial lists.
  168. * We track full slabs for debugging purposes though because otherwise we
  169. * cannot scan all objects.
  170. *
  171. * Slabs are freed when they become empty. Teardown and setup is minimal so we
  172. * rely on the page allocators per cpu caches for fast frees and allocs.
  173. *
  174. * SLAB_DEBUG_FLAGS Slab requires special handling due to debug
  175. * options set. This moves slab handling out of
  176. * the fast path and disables lockless freelists.
  177. */
  178. /**
  179. * enum slab_flags - How the slab flags bits are used.
  180. * @SL_locked: Is locked with slab_lock()
  181. * @SL_partial: On the per-node partial list
  182. * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves
  183. *
  184. * The slab flags share space with the page flags but some bits have
  185. * different interpretations. The high bits are used for information
  186. * like zone/node/section.
  *
  * Each enumerator is a bit number (not a mask): it is passed as the bit
  * index to the bit operations on slab->flags.f, e.g. by slab_lock() and
  * the slab_*_pfmemalloc() helpers.
  187. */
  188. enum slab_flags {
  189. SL_locked = PG_locked, /* Bit spinlock taken by slab_lock() */
  190. SL_partial = PG_workingset, /* Historical reasons for this bit */
  191. SL_pfmemalloc = PG_active, /* Historical reasons for this bit */
  192. };
  /*
   * __fastpath_inline expands to __always_inline, except with
   * CONFIG_SLUB_TINY where it expands to nothing.
   */
  193. #ifndef CONFIG_SLUB_TINY
  194. #define __fastpath_inline __always_inline
  195. #else
  196. #define __fastpath_inline
  197. #endif
  /*
   * slub_debug_enabled static key, only defined with CONFIG_SLUB_DEBUG. Its
   * initial state is true with CONFIG_SLUB_DEBUG_ON and false otherwise.
   */
  198. #ifdef CONFIG_SLUB_DEBUG
  199. #ifdef CONFIG_SLUB_DEBUG_ON
  200. DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
  201. #else
  202. DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
  203. #endif
  204. #endif /* CONFIG_SLUB_DEBUG */
  /*
   * strict_numa static key, initially false and only built with CONFIG_NUMA.
   * NOTE(review): what it gates is not visible in this part of the file.
   */
  205. #ifdef CONFIG_NUMA
  206. static DEFINE_STATIC_KEY_FALSE(strict_numa);
  207. #endif
  208. /* Structure holding parameters for get_from_partial() call chain */
  209. struct partial_context {
  210. gfp_t flags; /* presumably the GFP flags of the allocation - confirm at callers */
  211. unsigned int orig_size; /* presumably the originally requested size, cf. set_orig_size() - confirm */
  212. };
  213. /* Structure holding parameters for get_partial_node_bulk() */
  214. struct partial_bulk_context {
  215. gfp_t flags; /* presumably the GFP flags of the allocation - confirm at callers */
  216. unsigned int min_objects; /* NOTE(review): looks like the lower bound on objects to gather - confirm */
  217. unsigned int max_objects; /* NOTE(review): looks like the upper bound on objects to gather - confirm */
  218. struct list_head slabs; /* list head; presumably collects the slabs taken - confirm */
  219. };
  220. static inline bool kmem_cache_debug(struct kmem_cache *s)
  221. {
  222. return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
  223. }
  224. void *fixup_red_left(struct kmem_cache *s, void *p)
  225. {
  226. if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
  227. p += s->red_left_pad;
  228. return p;
  229. }
  230. /*
  231. * Issues still to be resolved:
  232. *
  233. * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
  234. *
  235. * - Variable sizing of the per node arrays
  236. */
  237. /*
  * Enable to log cmpxchg failures: when SLUB_DEBUG_CMPXCHG is defined, the
  * *slab_update_freelist() helpers emit a pr_info() for every failed freelist
  * update. It is explicitly undefined here, so that logging is compiled out.
  */
  238. #undef SLUB_DEBUG_CMPXCHG
  239. #ifndef CONFIG_SLUB_TINY
  240. /*
  241. * Minimum number of partial slabs. These will be left on the partial
  242. * lists even if they are empty. kmem_cache_shrink may reclaim them.
  243. */
  244. #define MIN_PARTIAL 5
  245. /*
  246. * Maximum number of desirable partial slabs.
  247. * The existence of more partial slabs makes kmem_cache_shrink
  248. * sort the partial list by the number of objects in use.
  249. */
  250. #define MAX_PARTIAL 10
  251. #else /* CONFIG_SLUB_TINY: both limits are zero */
  252. #define MIN_PARTIAL 0
  253. #define MAX_PARTIAL 0
  254. #endif /* !CONFIG_SLUB_TINY */
  /*
   * Mask named as the default set of debug options: consistency checks, red
   * zoning, poisoning and user tracking.
   * NOTE(review): where this default is applied is outside this excerpt.
   */
  255. #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
  256. SLAB_POISON | SLAB_STORE_USER)
  257. /*
  258. * These debug flags cannot use CMPXCHG because there might be consistency
  259. * issues when checking or reading debug information
  260. */
  261. #define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
  262. SLAB_TRACE)
  263. /*
  264. * Debugging flags that require metadata to be stored in the slab. These get
  265. * disabled when slab_debug=O is used and a cache's min order increases with
  266. * metadata.
  267. */
  268. #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
  /*
   * Encoding of struct kmem_cache_order_objects: oo_make() stores the page
   * order shifted left by OO_SHIFT and adds the object count; oo_order() and
   * oo_objects() extract the two parts again using OO_SHIFT and OO_MASK.
   */
  269. #define OO_SHIFT 16
  270. #define OO_MASK ((1 << OO_SHIFT) - 1)
  271. #define MAX_OBJS_PER_PAGE 32767 /* since slab.objects is u15 */
  272. /* Internal SLUB flags */
  273. /* Poison object */
  274. #define __OBJECT_POISON __SLAB_FLAG_BIT(_SLAB_OBJECT_POISON)
  275. /*
  * Use cmpxchg_double: tested as (s->flags & __CMPXCHG_DOUBLE) by the
  * freelist update helpers to choose the lockless path over slab_lock().
  */
  276. #ifdef system_has_freelist_aba
  277. #define __CMPXCHG_DOUBLE __SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE)
  278. #else /* no system_has_freelist_aba: the flag maps to __SLAB_FLAG_UNUSED */
  279. #define __CMPXCHG_DOUBLE __SLAB_FLAG_UNUSED
  280. #endif
  281. /*
  282. * Tracking user of a slab.
  *
  * set_orig_size()/get_orig_size() skip sizeof(struct track) * 2 bytes of
  * object metadata, i.e. two of these records are stored per object;
  * presumably one per enum track_item value - confirm.
  283. */
  284. #define TRACK_ADDRS_COUNT 16 /* NOTE(review): not referenced in this excerpt */
  285. struct track {
  286. unsigned long addr; /* Called from address */
  287. #ifdef CONFIG_STACKDEPOT
  288. depot_stack_handle_t handle; /* Stack depot handle, CONFIG_STACKDEPOT only */
  289. #endif
  290. int cpu; /* Was running on cpu */
  291. int pid; /* Pid context */
  292. unsigned long when; /* When did the operation occur */
  293. };
  /* Selects which kind of event a track record describes. */
  294. enum track_item { TRACK_ALLOC, TRACK_FREE };
  /*
   * Forward declaration of sysfs_slab_add(); without SLAB_SUPPORTS_SYSFS it
   * is an inline stub that returns 0.
   */
  295. #ifdef SLAB_SUPPORTS_SYSFS
  296. static int sysfs_slab_add(struct kmem_cache *);
  297. #else
  298. static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
  299. #endif
  /*
   * Forward declaration of debugfs_slab_add(); it is an empty inline stub
   * unless both CONFIG_DEBUG_FS and CONFIG_SLUB_DEBUG are enabled.
   */
  300. #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
  301. static void debugfs_slab_add(struct kmem_cache *);
  302. #else
  303. static inline void debugfs_slab_add(struct kmem_cache *s) { }
  304. #endif
  /*
   * Insertion position selector (head or tail).
   * NOTE(review): its users are outside this excerpt; presumably it selects
   * where a slab is added on a node list - confirm.
   */
  305. enum add_mode {
  306. ADD_TO_HEAD,
  307. ADD_TO_TAIL,
  308. };
  /*
   * Event counters. Each enumerator indexes kmem_cache_stats.stat[] and is
   * updated through stat()/stat_add(), which only do anything with
   * CONFIG_SLUB_STATS.
   */
  309. enum stat_item {
  310. ALLOC_FASTPATH, /* Allocation from percpu sheaves */
  311. ALLOC_SLOWPATH, /* Allocation from partial or new slab */
  312. FREE_RCU_SHEAF, /* Free to rcu_free sheaf */
  313. FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */
  314. FREE_FASTPATH, /* Free to percpu sheaves */
  315. FREE_SLOWPATH, /* Free to a slab */
  316. FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */
  317. FREE_REMOVE_PARTIAL, /* Freeing removes last object */
  318. ALLOC_SLAB, /* New slab acquired from page allocator */
  319. ALLOC_NODE_MISMATCH, /* Requested node different from cpu sheaf */
  320. FREE_SLAB, /* Slab freed to the page allocator */
  321. ORDER_FALLBACK, /* Number of times fallback was necessary */
  322. CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */
  323. SHEAF_FLUSH, /* Objects flushed from a sheaf */
  324. SHEAF_REFILL, /* Objects refilled to a sheaf */
  325. SHEAF_ALLOC, /* Allocation of an empty sheaf */
  326. SHEAF_FREE, /* Freeing of an empty sheaf */
  327. BARN_GET, /* Got full sheaf from barn */
  328. BARN_GET_FAIL, /* Failed to get full sheaf from barn */
  329. BARN_PUT, /* Put full sheaf to barn */
  330. BARN_PUT_FAIL, /* Failed to put full sheaf to barn */
  331. SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */
  332. SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */
  333. SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */
  334. SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */
  335. SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */
  336. NR_SLUB_STAT_ITEMS /* Must stay last: sizes kmem_cache_stats.stat[] */
  337. };
  /*
   * Counter storage for CONFIG_SLUB_STATS, one slot per enum stat_item.
   * stat() and stat_add() update it through s->cpu_stats with per-cpu
   * operations.
   */
  338. #ifdef CONFIG_SLUB_STATS
  339. struct kmem_cache_stats {
  340. unsigned int stat[NR_SLUB_STAT_ITEMS];
  341. };
  342. #endif
  /*
   * Count one occurrence of event @si for cache @s. The body is empty unless
   * CONFIG_SLUB_STATS is set.
   */
  343. static inline void stat(const struct kmem_cache *s, enum stat_item si)
  344. {
  345. #ifdef CONFIG_SLUB_STATS
  346. /*
  347. * The rmw is racy on a preemptible kernel but this is acceptable, so
  348. * avoid this_cpu_add()'s irq-disable overhead.
  349. */
  350. raw_cpu_inc(s->cpu_stats->stat[si]);
  351. #endif
  352. }
  /*
   * Add @v to the counter of event @si for cache @s, using the raw per-cpu
   * add (the same kind of unprotected update as in stat()). The body is empty
   * unless CONFIG_SLUB_STATS is set.
   */
  353. static inline
  354. void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
  355. {
  356. #ifdef CONFIG_SLUB_STATS
  357. raw_cpu_add(s->cpu_stats->stat[si], v);
  358. #endif
  359. }
  /*
   * Limits on the number of full and empty sheaves.
   * NOTE(review): presumably per barn; the enforcement is outside this excerpt.
   */
  360. #define MAX_FULL_SHEAVES 10
  361. #define MAX_EMPTY_SHEAVES 10
  /* Per-NUMA-node barn of sheaves, reached through kmem_cache_node->barn. */
  362. struct node_barn {
  363. spinlock_t lock; /* node->barn->lock, see the lock order at the top of the file */
  364. struct list_head sheaves_full; /* list of full sheaves */
  365. struct list_head sheaves_empty; /* list of empty sheaves */
  366. unsigned int nr_full; /* presumably the length of sheaves_full - confirm */
  367. unsigned int nr_empty; /* presumably the length of sheaves_empty - confirm */
  368. };
  /*
   * A sheaf: a header followed by a flexible array of object pointers. The
   * leading union overlays the rcu_head, the barn list linkage and the
   * prefill-only fields, so they share storage.
   */
  369. struct slab_sheaf {
  370. union {
  371. struct rcu_head rcu_head;
  372. struct list_head barn_list; /* presumably links into node_barn's lists - confirm */
  373. /* only used for prefilled sheafs */
  374. struct {
  375. unsigned int capacity;
  376. bool pfmemalloc;
  377. };
  378. };
  379. struct kmem_cache *cache; /* presumably the cache the sheaf belongs to - confirm */
  380. unsigned int size; /* presumably the number of valid entries in objects[] - confirm */
  381. int node; /* only used for rcu_sheaf */
  382. void *objects[]; /* flexible array of object pointers */
  383. };
  /*
   * Per-cpu sheaf state. @lock is the cpu_sheaves->lock (local_trylock)
   * described in the locking overview at the top of the file.
   */
  384. struct slub_percpu_sheaves {
  385. local_trylock_t lock;
  386. struct slab_sheaf *main; /* never NULL when unlocked */
  387. struct slab_sheaf *spare; /* empty or full, may be NULL */
  388. struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
  389. };
  390. /*
  391. * The slab lists for all objects.
  *
  * One instance per NUMA node with a kmem_cache_node allocated, looked up
  * with get_node().
  392. */
  393. struct kmem_cache_node {
  394. spinlock_t list_lock; /* protects the partial and full lists and nr_partial */
  395. unsigned long nr_partial; /* the partial slab counter */
  396. struct list_head partial; /* list of partial slabs */
  397. #ifdef CONFIG_SLUB_DEBUG
  398. atomic_long_t nr_slabs; /* atomic, may be modified without list_lock */
  399. atomic_long_t total_objects;
  400. struct list_head full; /* list of full slabs, tracked for debugging */
  401. #endif
  402. struct node_barn *barn; /* this node's barn of sheaves */
  403. };
  404. static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
  405. {
  406. return s->node[node];
  407. }
  408. /*
  409. * Get the barn of the current cpu's closest memory node. It may not exist on
  410. * systems with memoryless nodes but without CONFIG_HAVE_MEMORYLESS_NODES
  411. */
  412. static inline struct node_barn *get_barn(struct kmem_cache *s)
  413. {
  414. struct kmem_cache_node *n = get_node(s, numa_mem_id());
  415. if (!n)
  416. return NULL;
  417. return n->barn;
  418. }
  419. /*
  420. * Iterator over all nodes. The body will be executed for each node that has
  421. * a kmem_cache_node structure allocated (which is true for all online nodes)
  *
  * @__node is the loop counter and @__n receives the node's structure. The
  * macro expands to a for statement followed by an unbraced if, so a body
  * that is followed by "else" would bind to that hidden if.
  422. */
  423. #define for_each_kmem_cache_node(__s, __node, __n) \
  424. for (__node = 0; __node < nr_node_ids; __node++) \
  425. if ((__n = get_node(__s, __node)))
  426. /*
  427. * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
  428. * Corresponds to node_state[N_MEMORY], but can temporarily
  429. * differ during memory hotplug/hotremove operations.
  430. * Protected by slab_mutex.
  431. */
  432. static nodemask_t slab_nodes;
  433. /*
  434. * Workqueue used for flushing cpu and kfree_rcu sheaves.
  435. */
  436. static struct workqueue_struct *flushwq;
  /* Flush work item; one instance exists per cpu (slub_flush below). */
  437. struct slub_flush_work {
  438. struct work_struct work;
  439. struct kmem_cache *s; /* presumably the cache to flush - confirm */
  440. bool skip; /* presumably set when this cpu needs no flush - confirm */
  441. };
  /* NOTE(review): flush_lock presumably serializes use of the per-cpu slub_flush items - confirm */
  442. static DEFINE_MUTEX(flush_lock);
  443. static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
  444. /********************************************************************
  445. * Core slab cache functions
  446. *******************************************************************/
  447. /*
  448. * Returns freelist pointer (ptr). With hardening, this is obfuscated
  449. * with an XOR of the address where the pointer is held and a per-cache
  450. * random number.
  451. */
  452. static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s,
  453. void *ptr, unsigned long ptr_addr)
  454. {
  455. unsigned long encoded;
  456. #ifdef CONFIG_SLAB_FREELIST_HARDENED
  457. encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr);
  458. #else
  459. encoded = (unsigned long)ptr;
  460. #endif
  461. return (freeptr_t){.v = encoded};
  462. }
  463. static inline void *freelist_ptr_decode(const struct kmem_cache *s,
  464. freeptr_t ptr, unsigned long ptr_addr)
  465. {
  466. void *decoded;
  467. #ifdef CONFIG_SLAB_FREELIST_HARDENED
  468. decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr));
  469. #else
  470. decoded = (void *)ptr.v;
  471. #endif
  472. return decoded;
  473. }
  474. static inline void *get_freepointer(struct kmem_cache *s, void *object)
  475. {
  476. unsigned long ptr_addr;
  477. freeptr_t p;
  478. object = kasan_reset_tag(object);
  479. ptr_addr = (unsigned long)object + s->offset;
  480. p = *(freeptr_t *)(ptr_addr);
  481. return freelist_ptr_decode(s, p, ptr_addr);
  482. }
  483. static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
  484. {
  485. unsigned long freeptr_addr = (unsigned long)object + s->offset;
  486. #ifdef CONFIG_SLAB_FREELIST_HARDENED
  487. BUG_ON(object == fp); /* naive detection of double free or corruption */
  488. #endif
  489. freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
  490. *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr);
  491. }
  492. /*
  493. * See comment in calculate_sizes().
  494. */
  495. static inline bool freeptr_outside_object(struct kmem_cache *s)
  496. {
  497. return s->offset >= s->inuse;
  498. }
  499. /*
  500. * Return offset of the end of info block which is inuse + free pointer if
  501. * not overlapping with object.
  502. */
  503. static inline unsigned int get_info_end(struct kmem_cache *s)
  504. {
  505. if (freeptr_outside_object(s))
  506. return s->inuse + sizeof(void *);
  507. else
  508. return s->inuse;
  509. }
  510. /*
  * Loop over all objects in a slab
  *
  * @__p starts at fixup_red_left(@__s, @__addr) and advances by (@__s)->size
  * until it reaches @__addr + @__objects * size. @__addr takes part in
  * pointer arithmetic and, like the other arguments, is evaluated on every
  * iteration, so pass side-effect free expressions.
  */
  511. #define for_each_object(__p, __s, __addr, __objects) \
  512. for (__p = fixup_red_left(__s, __addr); \
  513. __p < (__addr) + (__objects) * (__s)->size; \
  514. __p += (__s)->size)
  515. static inline unsigned int order_objects(unsigned int order, unsigned int size)
  516. {
  517. return ((unsigned int)PAGE_SIZE << order) / size;
  518. }
  519. static inline struct kmem_cache_order_objects oo_make(unsigned int order,
  520. unsigned int size)
  521. {
  522. struct kmem_cache_order_objects x = {
  523. (order << OO_SHIFT) + order_objects(order, size)
  524. };
  525. return x;
  526. }
  527. static inline unsigned int oo_order(struct kmem_cache_order_objects x)
  528. {
  529. return x.x >> OO_SHIFT;
  530. }
  531. static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
  532. {
  533. return x.x & OO_MASK;
  534. }
  535. /*
  536. * If network-based swap is enabled, slub must keep track of whether memory
  537. * was allocated from pfmemalloc reserves.
  *
  * Returns true if the SL_pfmemalloc bit is set in @slab's flags.
  538. */
  539. static inline bool slab_test_pfmemalloc(const struct slab *slab)
  540. {
  541. return test_bit(SL_pfmemalloc, &slab->flags.f);
  542. }
  /* Mark @slab as SL_pfmemalloc by setting that bit in its flags with set_bit(). */
  543. static inline void slab_set_pfmemalloc(struct slab *slab)
  544. {
  545. set_bit(SL_pfmemalloc, &slab->flags.f);
  546. }
  /*
   * Clear the SL_pfmemalloc bit of @slab. Note that this uses the
   * double-underscore __clear_bit() variant, unlike the set_bit() used by
   * slab_set_pfmemalloc().
   * NOTE(review): presumably only safe when nothing else updates the flags
   * word concurrently - confirm at callers.
   */
  547. static inline void __slab_clear_pfmemalloc(struct slab *slab)
  548. {
  549. __clear_bit(SL_pfmemalloc, &slab->flags.f);
  550. }
  551. /*
  552. * Per slab locking using the pagelock
  *
  * slab_lock() takes the SL_locked bit of slab->flags.f as a bit spinlock.
  553. */
  554. static __always_inline void slab_lock(struct slab *slab)
  555. {
  556. bit_spin_lock(SL_locked, &slab->flags.f);
  557. }
  /* Release the SL_locked bit spinlock taken by slab_lock(). */
  558. static __always_inline void slab_unlock(struct slab *slab)
  559. {
  560. bit_spin_unlock(SL_locked, &slab->flags.f);
  561. }
  /*
   * Lockless freelist update: try to replace slab->freelist_counters with
   * @new's value via try_cmpxchg_freelist(), expecting @old's value, and
   * return its result. When system_has_freelist_aba is not defined this
   * always returns false.
   */
  562. static inline bool
  563. __update_freelist_fast(struct slab *slab, struct freelist_counters *old,
  564. struct freelist_counters *new)
  565. {
  566. #ifdef system_has_freelist_aba
  567. return try_cmpxchg_freelist(&slab->freelist_counters,
  568. &old->freelist_counters,
  569. new->freelist_counters);
  570. #else
  571. return false;
  572. #endif
  573. }
  /*
   * Locked freelist update: under slab_lock(), compare the slab's freelist
   * and counters with @old and, only if both match, store @new's values.
   * Returns true if the new values were stored, false otherwise.
   */
  574. static inline bool
  575. __update_freelist_slow(struct slab *slab, struct freelist_counters *old,
  576. struct freelist_counters *new)
  577. {
  578. bool ret = false;
  579. slab_lock(slab);
  580. if (slab->freelist == old->freelist &&
  581. slab->counters == old->counters) {
  582. slab->freelist = new->freelist;
  583. /* prevent tearing for the read in get_partial_node_bulk() */
  584. WRITE_ONCE(slab->counters, new->counters);
  585. ret = true;
  586. }
  587. slab_unlock(slab);
  588. return ret;
  589. }
  590. /*
  591. * Interrupts must be disabled (for the fallback code to work right), typically
  592. * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
  593. * part of bit_spin_lock(), is sufficient because the policy is not to allow any
  594. * allocation/ free operation in hardirq context. Therefore nothing can
  595. * interrupt the operation.
  596. */
  597. static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
  598. struct freelist_counters *old, struct freelist_counters *new, const char *n)
  599. {
  600. bool ret;
  601. if (!IS_ENABLED(CONFIG_PREEMPT_RT))
  602. lockdep_assert_irqs_disabled();
  603. if (s->flags & __CMPXCHG_DOUBLE)
  604. ret = __update_freelist_fast(slab, old, new);
  605. else
  606. ret = __update_freelist_slow(slab, old, new);
  607. if (likely(ret))
  608. return true;
  609. cpu_relax();
  610. stat(s, CMPXCHG_DOUBLE_FAIL);
  611. #ifdef SLUB_DEBUG_CMPXCHG
  612. pr_info("%s %s: cmpxchg double redo ", n, s->name);
  613. #endif
  614. return false;
  615. }
  616. static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
  617. struct freelist_counters *old, struct freelist_counters *new, const char *n)
  618. {
  619. bool ret;
  620. if (s->flags & __CMPXCHG_DOUBLE) {
  621. ret = __update_freelist_fast(slab, old, new);
  622. } else {
  623. unsigned long flags;
  624. local_irq_save(flags);
  625. ret = __update_freelist_slow(slab, old, new);
  626. local_irq_restore(flags);
  627. }
  628. if (likely(ret))
  629. return true;
  630. cpu_relax();
  631. stat(s, CMPXCHG_DOUBLE_FAIL);
  632. #ifdef SLUB_DEBUG_CMPXCHG
  633. pr_info("%s %s: cmpxchg double redo ", n, s->name);
  634. #endif
  635. return false;
  636. }
  637. /*
  638. * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API
  639. * family will round up the real request size to these fixed ones, so
  640. * there could be an extra area than what is requested. Save the original
  641. * request size in the meta data area, for better debug and sanity check.
  642. */
  643. static inline void set_orig_size(struct kmem_cache *s,
  644. void *object, unsigned long orig_size)
  645. {
  646. void *p = kasan_reset_tag(object);
  647. if (!slub_debug_orig_size(s))
  648. return;
  649. p += get_info_end(s);
  650. p += sizeof(struct track) * 2;
  651. *(unsigned long *)p = orig_size;
  652. }
  653. static inline unsigned long get_orig_size(struct kmem_cache *s, void *object)
  654. {
  655. void *p = kasan_reset_tag(object);
  656. if (is_kfence_address(object))
  657. return kfence_ksize(object);
  658. if (!slub_debug_orig_size(s))
  659. return s->object_size;
  660. p += get_info_end(s);
  661. p += sizeof(struct track) * 2;
  662. return *(unsigned long *)p;
  663. }
  664. #ifdef CONFIG_SLAB_OBJ_EXT
  665. /*
  666. * Check if memory cgroup or memory allocation profiling is enabled.
  667. * If enabled, SLUB tries to reduce memory overhead of accounting
  668. * slab objects. If neither is enabled when this function is called,
  669. * the optimization is simply skipped to avoid affecting caches that do not
  670. * need slabobj_ext metadata.
  671. *
  672. * However, this may disable optimization when memory cgroup or memory
  673. * allocation profiling is used, but slabs are created too early
  674. * even before those subsystems are initialized.
  675. */
  676. static inline bool need_slab_obj_exts(struct kmem_cache *s)
  677. {
  678. if (s->flags & SLAB_NO_OBJ_EXT)
  679. return false;
  680. if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
  681. return true;
  682. if (mem_alloc_profiling_enabled())
  683. return true;
  684. return false;
  685. }
  686. static inline unsigned int obj_exts_size_in_slab(struct slab *slab)
  687. {
  688. return sizeof(struct slabobj_ext) * slab->objects;
  689. }
  690. static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s,
  691. struct slab *slab)
  692. {
  693. unsigned long objext_offset;
  694. objext_offset = s->size * slab->objects;
  695. objext_offset = ALIGN(objext_offset, sizeof(struct slabobj_ext));
  696. return objext_offset;
  697. }
  /*
   * True when the slabobj_ext array, placed at obj_exts_offset_in_slab(),
   * ends at or before slab_size(slab).
   */
  static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s,
  						     struct slab *slab)
  {
  	unsigned long exts_end = obj_exts_offset_in_slab(s, slab);

  	exts_end += obj_exts_size_in_slab(slab);

  	return exts_end <= slab_size(slab);
  }
  /*
   * True when slab_obj_exts(slab) is non-zero and lies inside the address range
   * [slab_address(slab), slab_address(slab) + slab_size(slab)).
   */
  static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
  {
  	unsigned long exts = slab_obj_exts(slab);
  	unsigned long base;

  	if (!exts)
  		return false;

  	base = (unsigned long)slab_address(slab);
  	if (exts < base)
  		return false;

  	return exts < base + slab_size(slab);
  }
  717. #else
  /* !CONFIG_SLAB_OBJ_EXT stub: slabobj_ext metadata is never needed. */
  static inline bool need_slab_obj_exts(struct kmem_cache *s)
  {
  	return false;
  }
  /* !CONFIG_SLAB_OBJ_EXT stub: the slabobj_ext array occupies no space. */
  static inline unsigned int obj_exts_size_in_slab(struct slab *slab)
  {
  	return 0;
  }
  /* !CONFIG_SLAB_OBJ_EXT stub: always reports offset 0. */
  static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s,
  						    struct slab *slab)
  {
  	return 0;
  }
  /* !CONFIG_SLAB_OBJ_EXT stub: always reports that the array does not fit. */
  static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s,
  						     struct slab *slab)
  {
  	return false;
  }
  /* !CONFIG_SLAB_OBJ_EXT stub: there is never a slabobj_ext array in the slab. */
  static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
  {
  	return false;
  }
  740. #endif
  741. #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
  742. static bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab)
  743. {
  744. /*
  745. * Note we cannot rely on the SLAB_OBJ_EXT_IN_OBJ flag here and need to
  746. * check the stride. A cache can have SLAB_OBJ_EXT_IN_OBJ set, but
  747. * allocations within_slab_leftover are preferred. And those may be
  748. * possible or not depending on the particular slab's size.
  749. */
  750. return obj_exts_in_slab(s, slab) &&
  751. (slab_get_stride(slab) == s->size);
  752. }
  753. static unsigned int obj_exts_offset_in_object(struct kmem_cache *s)
  754. {
  755. unsigned int offset = get_info_end(s);
  756. if (kmem_cache_debug_flags(s, SLAB_STORE_USER))
  757. offset += sizeof(struct track) * 2;
  758. if (slub_debug_orig_size(s))
  759. offset += sizeof(unsigned long);
  760. offset += kasan_metadata_size(s, false);
  761. return offset;
  762. }
  763. #else
  /*
   * Stub used unless both CONFIG_SLAB_OBJ_EXT and CONFIG_64BIT are set:
   * slabobj_ext is never stored inside objects.
   */
  static inline bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab)
  {
  	return false;
  }
  /*
   * Stub used unless both CONFIG_SLAB_OBJ_EXT and CONFIG_64BIT are set:
   * always reports offset 0.
   */
  static inline unsigned int obj_exts_offset_in_object(struct kmem_cache *s)
  {
  	return 0;
  }
  772. #endif
  773. #ifdef CONFIG_SLUB_DEBUG
  /*
   * For debugging contexts where we want to check whether a struct slab
   * pointer appears to be valid: reports PageSlab() of the page behind @slab.
   */
  static inline bool validate_slab_ptr(struct slab *slab)
  {
  	const struct page *page = slab_page(slab);

  	return PageSlab(page);
  }
  /* File-scope bitmap with room for MAX_OBJS_PER_PAGE bits. */
  static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
  /* NOTE(review): presumably serializes users of object_map - confirm at the call sites. */
  static DEFINE_SPINLOCK(object_map_lock);
  784. static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
  785. struct slab *slab)
  786. {
  787. void *addr = slab_address(slab);
  788. void *p;
  789. bitmap_zero(obj_map, slab->objects);
  790. for (p = slab->freelist; p; p = get_freepointer(s, p))
  791. set_bit(__obj_to_index(s, addr, p), obj_map);
  792. }
  793. #if IS_ENABLED(CONFIG_KUNIT)
  794. static bool slab_add_kunit_errors(void)
  795. {
  796. struct kunit_resource *resource;
  797. if (!kunit_get_current_test())
  798. return false;
  799. resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
  800. if (!resource)
  801. return false;
  802. (*(int *)resource->data)++;
  803. kunit_put_resource(resource);
  804. return true;
  805. }
  806. bool slab_in_kunit_test(void)
  807. {
  808. struct kunit_resource *resource;
  809. if (!kunit_get_current_test())
  810. return false;
  811. resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
  812. if (!resource)
  813. return false;
  814. kunit_put_resource(resource);
  815. return true;
  816. }
  817. #else
  /* Stub for builds without CONFIG_KUNIT: no error is ever diverted to a test. */
  static inline bool slab_add_kunit_errors(void) { return false; }
  819. #endif
  820. static inline unsigned int size_from_object(struct kmem_cache *s)
  821. {
  822. if (s->flags & SLAB_RED_ZONE)
  823. return s->size - s->red_left_pad;
  824. return s->size;
  825. }
  826. static inline void *restore_red_left(struct kmem_cache *s, void *p)
  827. {
  828. if (s->flags & SLAB_RED_ZONE)
  829. p -= s->red_left_pad;
  830. return p;
  831. }
  /*
   * Debug settings:
   *
   * slub_debug starts out as DEBUG_DEFAULT_FLAGS when CONFIG_SLUB_DEBUG_ON is
   * defined and as zero otherwise.
   */
  #if defined(CONFIG_SLUB_DEBUG_ON)
  static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
  #else
  static slab_flags_t slub_debug;
  #endif
  /* NOTE(review): presumably the saved slab_debug option string - confirm where it is assigned. */
  static const char *slub_debug_string __ro_after_init;
  /* Set to 1 by parse_slub_debug_flags() when the 'o' option is seen during initial parsing. */
  static int disable_higher_order_debug;
  842. /*
  843. * Object debugging
  844. */
  845. /* Verify that a pointer has an address that is valid within a slab page */
  846. static inline int check_valid_pointer(struct kmem_cache *s,
  847. struct slab *slab, void *object)
  848. {
  849. void *base;
  850. if (!object)
  851. return 1;
  852. base = slab_address(slab);
  853. object = kasan_reset_tag(object);
  854. object = restore_red_left(s, object);
  855. if (object < base || object >= base + slab->objects * s->size ||
  856. (object - base) % s->size) {
  857. return 0;
  858. }
  859. return 1;
  860. }
  861. static void print_section(char *level, char *text, u8 *addr,
  862. unsigned int length)
  863. {
  864. metadata_access_enable();
  865. print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
  866. 16, 1, kasan_reset_tag((void *)addr), length, 1);
  867. metadata_access_disable();
  868. }
  869. static struct track *get_track(struct kmem_cache *s, void *object,
  870. enum track_item alloc)
  871. {
  872. struct track *p;
  873. p = object + get_info_end(s);
  874. return kasan_reset_tag(p + alloc);
  875. }
  876. #ifdef CONFIG_STACKDEPOT
  877. static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
  878. {
  879. depot_stack_handle_t handle;
  880. unsigned long entries[TRACK_ADDRS_COUNT];
  881. unsigned int nr_entries;
  882. nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
  883. handle = stack_depot_save(entries, nr_entries, gfp_flags);
  884. return handle;
  885. }
  886. #else
  /* !CONFIG_STACKDEPOT stub: no stack is recorded, the handle is always 0. */
  static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
  {
  	return 0;
  }
  891. #endif
  892. static void set_track_update(struct kmem_cache *s, void *object,
  893. enum track_item alloc, unsigned long addr,
  894. depot_stack_handle_t handle)
  895. {
  896. struct track *p = get_track(s, object, alloc);
  897. #ifdef CONFIG_STACKDEPOT
  898. p->handle = handle;
  899. #endif
  900. p->addr = addr;
  901. p->cpu = raw_smp_processor_id();
  902. p->pid = current->pid;
  903. p->when = jiffies;
  904. }
  905. static __always_inline void set_track(struct kmem_cache *s, void *object,
  906. enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
  907. {
  908. depot_stack_handle_t handle = set_track_prepare(gfp_flags);
  909. set_track_update(s, object, alloc, addr, handle);
  910. }
  911. static void init_tracking(struct kmem_cache *s, void *object)
  912. {
  913. struct track *p;
  914. if (!(s->flags & SLAB_STORE_USER))
  915. return;
  916. p = get_track(s, object, TRACK_ALLOC);
  917. memset(p, 0, 2*sizeof(struct track));
  918. }
  919. static void print_track(const char *s, struct track *t, unsigned long pr_time)
  920. {
  921. depot_stack_handle_t handle __maybe_unused;
  922. if (!t->addr)
  923. return;
  924. pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
  925. s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
  926. #ifdef CONFIG_STACKDEPOT
  927. handle = READ_ONCE(t->handle);
  928. if (handle)
  929. stack_depot_print(handle);
  930. else
  931. pr_err("object allocation/free stack trace missing\n");
  932. #endif
  933. }
  934. void print_tracking(struct kmem_cache *s, void *object)
  935. {
  936. unsigned long pr_time = jiffies;
  937. if (!(s->flags & SLAB_STORE_USER))
  938. return;
  939. print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
  940. print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
  941. }
  942. static void print_slab_info(const struct slab *slab)
  943. {
  944. pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
  945. slab, slab->objects, slab->inuse, slab->freelist,
  946. &slab->flags.f);
  947. }
  948. void skip_orig_size_check(struct kmem_cache *s, const void *object)
  949. {
  950. set_orig_size(s, (void *)object, s->object_size);
  951. }
  952. static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
  953. {
  954. struct va_format vaf;
  955. va_list args;
  956. va_copy(args, argsp);
  957. vaf.fmt = fmt;
  958. vaf.va = &args;
  959. pr_err("=============================================================================\n");
  960. pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
  961. pr_err("-----------------------------------------------------------------------------\n\n");
  962. va_end(args);
  963. }
  /* Variadic front end of __slab_bug(): print the BUG banner for @s with @fmt. */
  static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
  {
  	va_list ap;

  	va_start(ap, fmt);
  	__slab_bug(s, fmt, ap);
  	va_end(ap);
  }
  971. __printf(2, 3)
  972. static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
  973. {
  974. struct va_format vaf;
  975. va_list args;
  976. if (slab_add_kunit_errors())
  977. return;
  978. va_start(args, fmt);
  979. vaf.fmt = fmt;
  980. vaf.va = &args;
  981. pr_err("FIX %s: %pV\n", s->name, &vaf);
  982. va_end(args);
  983. }
  984. static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
  985. {
  986. unsigned int off; /* Offset of last byte */
  987. u8 *addr = slab_address(slab);
  988. print_tracking(s, p);
  989. print_slab_info(slab);
  990. pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
  991. p, p - addr, get_freepointer(s, p));
  992. if (s->flags & SLAB_RED_ZONE)
  993. print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
  994. s->red_left_pad);
  995. else if (p > addr + 16)
  996. print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
  997. print_section(KERN_ERR, "Object ", p,
  998. min_t(unsigned int, s->object_size, PAGE_SIZE));
  999. if (s->flags & SLAB_RED_ZONE)
  1000. print_section(KERN_ERR, "Redzone ", p + s->object_size,
  1001. s->inuse - s->object_size);
  1002. off = get_info_end(s);
  1003. if (s->flags & SLAB_STORE_USER)
  1004. off += 2 * sizeof(struct track);
  1005. if (slub_debug_orig_size(s))
  1006. off += sizeof(unsigned long);
  1007. off += kasan_metadata_size(s, false);
  1008. if (obj_exts_in_object(s, slab))
  1009. off += sizeof(struct slabobj_ext);
  1010. if (off != size_from_object(s))
  1011. /* Beginning of the filler is the free pointer */
  1012. print_section(KERN_ERR, "Padding ", p + off,
  1013. size_from_object(s) - off);
  1014. }
  1015. static void object_err(struct kmem_cache *s, struct slab *slab,
  1016. u8 *object, const char *reason)
  1017. {
  1018. if (slab_add_kunit_errors())
  1019. return;
  1020. slab_bug(s, reason);
  1021. if (!object || !check_valid_pointer(s, slab, object)) {
  1022. print_slab_info(slab);
  1023. pr_err("Invalid pointer 0x%p\n", object);
  1024. } else {
  1025. print_trailer(s, slab, object);
  1026. }
  1027. add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  1028. WARN_ON(1);
  1029. }
  1030. static void __slab_err(struct slab *slab)
  1031. {
  1032. if (slab_in_kunit_test())
  1033. return;
  1034. print_slab_info(slab);
  1035. add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  1036. WARN_ON(1);
  1037. }
  1038. static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
  1039. const char *fmt, ...)
  1040. {
  1041. va_list args;
  1042. if (slab_add_kunit_errors())
  1043. return;
  1044. va_start(args, fmt);
  1045. __slab_bug(s, fmt, args);
  1046. va_end(args);
  1047. __slab_err(slab);
  1048. }
  1049. static void init_object(struct kmem_cache *s, void *object, u8 val)
  1050. {
  1051. u8 *p = kasan_reset_tag(object);
  1052. unsigned int poison_size = s->object_size;
  1053. if (s->flags & SLAB_RED_ZONE) {
  1054. /*
  1055. * Here and below, avoid overwriting the KMSAN shadow. Keeping
  1056. * the shadow makes it possible to distinguish uninit-value
  1057. * from use-after-free.
  1058. */
  1059. memset_no_sanitize_memory(p - s->red_left_pad, val,
  1060. s->red_left_pad);
  1061. if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
  1062. /*
  1063. * Redzone the extra allocated space by kmalloc than
  1064. * requested, and the poison size will be limited to
  1065. * the original request size accordingly.
  1066. */
  1067. poison_size = get_orig_size(s, object);
  1068. }
  1069. }
  1070. if (s->flags & __OBJECT_POISON) {
  1071. memset_no_sanitize_memory(p, POISON_FREE, poison_size - 1);
  1072. memset_no_sanitize_memory(p + poison_size - 1, POISON_END, 1);
  1073. }
  1074. if (s->flags & SLAB_RED_ZONE)
  1075. memset_no_sanitize_memory(p + poison_size, val,
  1076. s->inuse - poison_size);
  1077. }
  1078. static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
  1079. void *from, void *to)
  1080. {
  1081. slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
  1082. memset(from, data, to - from);
  1083. }
  /*
   * Extra function attributes for the padding/pattern checkers: with
   * CONFIG_KMSAN they are built noinline and with __no_kmsan_checks;
   * otherwise the macro expands to nothing.
   */
  #ifdef CONFIG_KMSAN
  #define pad_check_attributes noinline __no_kmsan_checks
  #else
  #define pad_check_attributes
  #endif
  1089. static pad_check_attributes int
  1090. check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
  1091. u8 *object, const char *what, u8 *start, unsigned int value,
  1092. unsigned int bytes, bool slab_obj_print)
  1093. {
  1094. u8 *fault;
  1095. u8 *end;
  1096. u8 *addr = slab_address(slab);
  1097. metadata_access_enable();
  1098. fault = memchr_inv(kasan_reset_tag(start), value, bytes);
  1099. metadata_access_disable();
  1100. if (!fault)
  1101. return 1;
  1102. end = start + bytes;
  1103. while (end > fault && end[-1] == value)
  1104. end--;
  1105. if (slab_add_kunit_errors())
  1106. goto skip_bug_print;
  1107. pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
  1108. what, fault, end - 1, fault - addr, fault[0], value);
  1109. if (slab_obj_print)
  1110. object_err(s, slab, object, "Object corrupt");
  1111. skip_bug_print:
  1112. restore_bytes(s, what, value, fault, end);
  1113. return 0;
  1114. }
  1115. /*
  1116. * Object field layout:
  1117. *
  1118. * [Left redzone padding] (if SLAB_RED_ZONE)
  1119. * - Field size: s->red_left_pad
  1120. * - Immediately precedes each object when SLAB_RED_ZONE is set.
  1121. * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and
  1122. * 0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE.
  1123. *
  1124. * [Object bytes] (object address starts here)
  1125. * - Field size: s->object_size
  1126. * - Object payload bytes.
  1127. * - If the freepointer may overlap the object, it is stored inside
  1128. * the object (typically near the middle).
  1129. * - Poisoning uses 0x6b (POISON_FREE) and the last byte is
  1130. * 0xa5 (POISON_END) when __OBJECT_POISON is enabled.
  1131. *
  1132. * [Word-align padding] (right redzone when SLAB_RED_ZONE is set)
  1133. * - Field size: s->inuse - s->object_size
  1134. * - If redzoning is enabled and ALIGN(size, sizeof(void *)) adds no
  1135. * padding, explicitly extend by one word so the right redzone is
  1136. * non-empty.
  1137. * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and
  1138. * 0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE.
  1139. *
  1140. * [Metadata starts at object + s->inuse]
  1141. * - A. freelist pointer (if freeptr_outside_object)
  1142. * - B. alloc tracking (SLAB_STORE_USER)
  1143. * - C. free tracking (SLAB_STORE_USER)
  1144. * - D. original request size (SLAB_KMALLOC && SLAB_STORE_USER)
  1145. * - E. KASAN metadata (if enabled)
  1146. *
  1147. * [Mandatory padding] (if CONFIG_SLUB_DEBUG && SLAB_RED_ZONE)
  1148. * - One mandatory debug word to guarantee a minimum poisoned gap
  1149. * between metadata and the next object, independent of alignment.
  1150. * - Filled with 0x5a (POISON_INUSE) when SLAB_POISON is set.
  1151. * [Final alignment padding]
  1152. * - Bytes added by ALIGN(size, s->align) to reach s->size.
  1153. * - When the padding is large enough, it can be used to store
  1154. * struct slabobj_ext for accounting metadata (obj_exts_in_object()).
  1155. * - The remaining bytes (if any) are filled with 0x5a (POISON_INUSE)
  1156. * when SLAB_POISON is set.
  1157. *
  1158. * Notes:
  1159. * - Redzones are filled by init_object() with SLUB_RED_ACTIVE/INACTIVE.
  1160. * - Object contents are poisoned with POISON_FREE/END when __OBJECT_POISON.
  1161. * - The trailing padding is pre-filled with POISON_INUSE by
  1162. * setup_slab_debug() when SLAB_POISON is set, and is validated by
  1163. * check_pad_bytes().
  1164. * - The first object pointer is slab_address(slab) +
  1165. * (s->red_left_pad if redzoning); subsequent objects are reached by
  1166. * adding s->size each time.
  1167. *
  1168. * If a slab cache flag relies on specific metadata to exist at a fixed
  1169. * offset, the flag must be included in SLAB_NEVER_MERGE to prevent merging.
  1170. * Otherwise, the cache would misbehave as s->object_size and s->inuse are
  1171. * adjusted during cache merging (see __kmem_cache_alias()).
  1172. */
  1173. static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
  1174. {
  1175. unsigned long off = get_info_end(s); /* The end of info */
  1176. if (s->flags & SLAB_STORE_USER) {
  1177. /* We also have user information there */
  1178. off += 2 * sizeof(struct track);
  1179. if (s->flags & SLAB_KMALLOC)
  1180. off += sizeof(unsigned long);
  1181. }
  1182. off += kasan_metadata_size(s, false);
  1183. if (obj_exts_in_object(s, slab))
  1184. off += sizeof(struct slabobj_ext);
  1185. if (size_from_object(s) == off)
  1186. return 1;
  1187. return check_bytes_and_report(s, slab, p, "Object padding",
  1188. p + off, POISON_INUSE, size_from_object(s) - off, true);
  1189. }
  1190. /* Check the pad bytes at the end of a slab page */
  1191. static pad_check_attributes void
  1192. slab_pad_check(struct kmem_cache *s, struct slab *slab)
  1193. {
  1194. u8 *start;
  1195. u8 *fault;
  1196. u8 *end;
  1197. u8 *pad;
  1198. int length;
  1199. int remainder;
  1200. if (!(s->flags & SLAB_POISON))
  1201. return;
  1202. start = slab_address(slab);
  1203. length = slab_size(slab);
  1204. end = start + length;
  1205. if (obj_exts_in_slab(s, slab) && !obj_exts_in_object(s, slab)) {
  1206. remainder = length;
  1207. remainder -= obj_exts_offset_in_slab(s, slab);
  1208. remainder -= obj_exts_size_in_slab(slab);
  1209. } else {
  1210. remainder = length % s->size;
  1211. }
  1212. if (!remainder)
  1213. return;
  1214. pad = end - remainder;
  1215. metadata_access_enable();
  1216. fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
  1217. metadata_access_disable();
  1218. if (!fault)
  1219. return;
  1220. while (end > fault && end[-1] == POISON_INUSE)
  1221. end--;
  1222. slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
  1223. fault, end - 1, fault - start);
  1224. print_section(KERN_ERR, "Padding ", pad, remainder);
  1225. __slab_err(slab);
  1226. restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
  1227. }
  /*
   * Run all per-object debug checks on @object of @slab.
   *
   * @val is the red zone value the object is expected to carry
   * (SLUB_RED_ACTIVE or SLUB_RED_INACTIVE in the callers visible here).
   *
   * Checks, depending on cache flags: both red zones, the kmalloc red zone
   * beyond the original request size, alignment padding, free-object poison,
   * trailing pad bytes and the free pointer.
   *
   * Returns 1 when everything is intact and 0 otherwise. @ret is also passed
   * as the "print object" argument of check_bytes_and_report(), so the full
   * object dump is only requested while no earlier check has failed.
   */
  static int check_object(struct kmem_cache *s, struct slab *slab,
  					void *object, u8 val)
  {
  	u8 *p = object;
  	u8 *endobject = object + s->object_size;
  	unsigned int orig_size, kasan_meta_size;
  	int ret = 1;

  	if (s->flags & SLAB_RED_ZONE) {
  		if (!check_bytes_and_report(s, slab, object, "Left Redzone",
  			object - s->red_left_pad, val, s->red_left_pad, ret))
  			ret = 0;

  		if (!check_bytes_and_report(s, slab, object, "Right Redzone",
  			endobject, val, s->inuse - s->object_size, ret))
  			ret = 0;

  		/* Bytes past the original kmalloc request must carry @val too. */
  		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
  			orig_size = get_orig_size(s, object);

  			if (s->object_size > orig_size  &&
  				!check_bytes_and_report(s, slab, object,
  					"kmalloc Redzone", p + orig_size,
  					val, s->object_size - orig_size, ret)) {
  				ret = 0;
  			}
  		}
  	} else {
  		/* Without red zones the alignment gap holds POISON_INUSE. */
  		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
  			if (!check_bytes_and_report(s, slab, p, "Alignment padding",
  				endobject, POISON_INUSE,
  				s->inuse - s->object_size, ret))
  				ret = 0;
  		}
  	}

  	if (s->flags & SLAB_POISON) {
  		/* Payload poison is only checked when @val is not SLUB_RED_ACTIVE. */
  		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) {
  			/*
  			 * KASAN can save its free meta data inside of the
  			 * object at offset 0. Thus, skip checking the part of
  			 * the redzone that overlaps with the meta data.
  			 */
  			kasan_meta_size = kasan_metadata_size(s, true);
  			if (kasan_meta_size < s->object_size - 1 &&
  			    !check_bytes_and_report(s, slab, p, "Poison",
  					p + kasan_meta_size, POISON_FREE,
  					s->object_size - kasan_meta_size - 1, ret))
  				ret = 0;
  			if (kasan_meta_size < s->object_size &&
  			    !check_bytes_and_report(s, slab, p, "End Poison",
  					p + s->object_size - 1, POISON_END, 1, ret))
  				ret = 0;
  		}
  		/*
  		 * check_pad_bytes cleans up on its own.
  		 */
  		if (!check_pad_bytes(s, slab, p))
  			ret = 0;
  	}

  	/*
  	 * Cannot check freepointer while object is allocated if
  	 * object and freepointer overlap.
  	 */
  	if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) &&
  	    !check_valid_pointer(s, slab, get_freepointer(s, p))) {
  		object_err(s, slab, p, "Freepointer corrupt");
  		/*
  		 * No choice but to zap it and thus lose the remainder
  		 * of the free objects in this slab. May cause
  		 * another error because the object count is now wrong.
  		 */
  		set_freepointer(s, p, NULL);
  		ret = 0;
  	}

  	return ret;
  }
  1300. /*
  1301. * Checks if the slab state looks sane. Assumes the struct slab pointer
  1302. * was either obtained in a way that ensures it's valid, or validated
  1303. * by validate_slab_ptr()
  1304. */
  1305. static int check_slab(struct kmem_cache *s, struct slab *slab)
  1306. {
  1307. int maxobj;
  1308. maxobj = order_objects(slab_order(slab), s->size);
  1309. if (slab->objects > maxobj) {
  1310. slab_err(s, slab, "objects %u > max %u",
  1311. slab->objects, maxobj);
  1312. return 0;
  1313. }
  1314. if (slab->inuse > slab->objects) {
  1315. slab_err(s, slab, "inuse %u > max %u",
  1316. slab->inuse, slab->objects);
  1317. return 0;
  1318. }
  1319. if (slab->frozen) {
  1320. slab_err(s, slab, "Slab disabled since SLUB metadata consistency check failed");
  1321. return 0;
  1322. }
  1323. /* Slab_pad_check fixes things up after itself */
  1324. slab_pad_check(s, slab);
  1325. return 1;
  1326. }
  1327. /*
  1328. * Determine if a certain object in a slab is on the freelist. Must hold the
  1329. * slab lock to guarantee that the chains are in a consistent state.
  1330. */
  1331. static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
  1332. {
  1333. int nr = 0;
  1334. void *fp;
  1335. void *object = NULL;
  1336. int max_objects;
  1337. fp = slab->freelist;
  1338. while (fp && nr <= slab->objects) {
  1339. if (fp == search)
  1340. return true;
  1341. if (!check_valid_pointer(s, slab, fp)) {
  1342. if (object) {
  1343. object_err(s, slab, object,
  1344. "Freechain corrupt");
  1345. set_freepointer(s, object, NULL);
  1346. break;
  1347. } else {
  1348. slab_err(s, slab, "Freepointer corrupt");
  1349. slab->freelist = NULL;
  1350. slab->inuse = slab->objects;
  1351. slab_fix(s, "Freelist cleared");
  1352. return false;
  1353. }
  1354. }
  1355. object = fp;
  1356. fp = get_freepointer(s, object);
  1357. nr++;
  1358. }
  1359. if (nr > slab->objects) {
  1360. slab_err(s, slab, "Freelist cycle detected");
  1361. slab->freelist = NULL;
  1362. slab->inuse = slab->objects;
  1363. slab_fix(s, "Freelist cleared");
  1364. return false;
  1365. }
  1366. max_objects = order_objects(slab_order(slab), s->size);
  1367. if (max_objects > MAX_OBJS_PER_PAGE)
  1368. max_objects = MAX_OBJS_PER_PAGE;
  1369. if (slab->objects != max_objects) {
  1370. slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
  1371. slab->objects, max_objects);
  1372. slab->objects = max_objects;
  1373. slab_fix(s, "Number of objects adjusted");
  1374. }
  1375. if (slab->inuse != slab->objects - nr) {
  1376. slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
  1377. slab->inuse, slab->objects - nr);
  1378. slab->inuse = slab->objects - nr;
  1379. slab_fix(s, "Object count adjusted");
  1380. }
  1381. return search == NULL;
  1382. }
  1383. static void trace(struct kmem_cache *s, struct slab *slab, void *object,
  1384. int alloc)
  1385. {
  1386. if (s->flags & SLAB_TRACE) {
  1387. pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
  1388. s->name,
  1389. alloc ? "alloc" : "free",
  1390. object, slab->inuse,
  1391. slab->freelist);
  1392. if (!alloc)
  1393. print_section(KERN_INFO, "Object ", (void *)object,
  1394. s->object_size);
  1395. dump_stack();
  1396. }
  1397. }
  1398. /*
  1399. * Tracking of fully allocated slabs for debugging purposes.
  1400. */
  1401. static void add_full(struct kmem_cache *s,
  1402. struct kmem_cache_node *n, struct slab *slab)
  1403. {
  1404. if (!(s->flags & SLAB_STORE_USER))
  1405. return;
  1406. lockdep_assert_held(&n->list_lock);
  1407. list_add(&slab->slab_list, &n->full);
  1408. }
  1409. static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
  1410. {
  1411. if (!(s->flags & SLAB_STORE_USER))
  1412. return;
  1413. lockdep_assert_held(&n->list_lock);
  1414. list_del(&slab->slab_list);
  1415. }
  1416. static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
  1417. {
  1418. return atomic_long_read(&n->nr_slabs);
  1419. }
  1420. static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
  1421. {
  1422. struct kmem_cache_node *n = get_node(s, node);
  1423. atomic_long_inc(&n->nr_slabs);
  1424. atomic_long_add(objects, &n->total_objects);
  1425. }
  1426. static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
  1427. {
  1428. struct kmem_cache_node *n = get_node(s, node);
  1429. atomic_long_dec(&n->nr_slabs);
  1430. atomic_long_sub(objects, &n->total_objects);
  1431. }
  1432. /* Object debug checks for alloc/free paths */
  1433. static void setup_object_debug(struct kmem_cache *s, void *object)
  1434. {
  1435. if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
  1436. return;
  1437. init_object(s, object, SLUB_RED_INACTIVE);
  1438. init_tracking(s, object);
  1439. }
  1440. static
  1441. void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
  1442. {
  1443. if (!kmem_cache_debug_flags(s, SLAB_POISON))
  1444. return;
  1445. metadata_access_enable();
  1446. memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
  1447. metadata_access_disable();
  1448. }
  1449. static inline int alloc_consistency_checks(struct kmem_cache *s,
  1450. struct slab *slab, void *object)
  1451. {
  1452. if (!check_slab(s, slab))
  1453. return 0;
  1454. if (!check_valid_pointer(s, slab, object)) {
  1455. object_err(s, slab, object, "Freelist Pointer check fails");
  1456. return 0;
  1457. }
  1458. if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
  1459. return 0;
  1460. return 1;
  1461. }
  1462. static noinline bool alloc_debug_processing(struct kmem_cache *s,
  1463. struct slab *slab, void *object, int orig_size)
  1464. {
  1465. if (s->flags & SLAB_CONSISTENCY_CHECKS) {
  1466. if (!alloc_consistency_checks(s, slab, object))
  1467. goto bad;
  1468. }
  1469. /* Success. Perform special debug activities for allocs */
  1470. trace(s, slab, object, 1);
  1471. set_orig_size(s, object, orig_size);
  1472. init_object(s, object, SLUB_RED_ACTIVE);
  1473. return true;
  1474. bad:
  1475. /*
  1476. * Let's do the best we can to avoid issues in the future. Marking all
  1477. * objects as used avoids touching the remaining objects.
  1478. */
  1479. slab_fix(s, "Marking all objects used");
  1480. slab->inuse = slab->objects;
  1481. slab->freelist = NULL;
  1482. slab->frozen = 1; /* mark consistency-failed slab as frozen */
  1483. return false;
  1484. }
  1485. static inline int free_consistency_checks(struct kmem_cache *s,
  1486. struct slab *slab, void *object, unsigned long addr)
  1487. {
  1488. if (!check_valid_pointer(s, slab, object)) {
  1489. slab_err(s, slab, "Invalid object pointer 0x%p", object);
  1490. return 0;
  1491. }
  1492. if (on_freelist(s, slab, object)) {
  1493. object_err(s, slab, object, "Object already free");
  1494. return 0;
  1495. }
  1496. if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
  1497. return 0;
  1498. if (unlikely(s != slab->slab_cache)) {
  1499. if (!slab->slab_cache) {
  1500. slab_err(NULL, slab, "No slab cache for object 0x%p",
  1501. object);
  1502. } else {
  1503. object_err(s, slab, object,
  1504. "page slab pointer corrupt.");
  1505. }
  1506. return 0;
  1507. }
  1508. return 1;
  1509. }
  1510. /*
  1511. * Parse a block of slab_debug options. Blocks are delimited by ';'
  1512. *
  1513. * @str: start of block
  1514. * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
  1515. * @slabs: return start of list of slabs, or NULL when there's no list
  1516. * @init: assume this is initial parsing and not per-kmem-create parsing
  1517. *
  1518. * returns the start of next block if there's any, or NULL
  1519. */
  1520. static const char *
  1521. parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init)
  1522. {
  1523. bool higher_order_disable = false;
  1524. /* Skip any completely empty blocks */
  1525. while (*str && *str == ';')
  1526. str++;
  1527. if (*str == ',') {
  1528. /*
  1529. * No options but restriction on slabs. This means full
  1530. * debugging for slabs matching a pattern.
  1531. */
  1532. *flags = DEBUG_DEFAULT_FLAGS;
  1533. goto check_slabs;
  1534. }
  1535. *flags = 0;
  1536. /* Determine which debug features should be switched on */
  1537. for (; *str && *str != ',' && *str != ';'; str++) {
  1538. switch (tolower(*str)) {
  1539. case '-':
  1540. *flags = 0;
  1541. break;
  1542. case 'f':
  1543. *flags |= SLAB_CONSISTENCY_CHECKS;
  1544. break;
  1545. case 'z':
  1546. *flags |= SLAB_RED_ZONE;
  1547. break;
  1548. case 'p':
  1549. *flags |= SLAB_POISON;
  1550. break;
  1551. case 'u':
  1552. *flags |= SLAB_STORE_USER;
  1553. break;
  1554. case 't':
  1555. *flags |= SLAB_TRACE;
  1556. break;
  1557. case 'a':
  1558. *flags |= SLAB_FAILSLAB;
  1559. break;
  1560. case 'o':
  1561. /*
  1562. * Avoid enabling debugging on caches if its minimum
  1563. * order would increase as a result.
  1564. */
  1565. higher_order_disable = true;
  1566. break;
  1567. default:
  1568. if (init)
  1569. pr_err("slab_debug option '%c' unknown. skipped\n", *str);
  1570. }
  1571. }
  1572. check_slabs:
  1573. if (*str == ',')
  1574. *slabs = ++str;
  1575. else
  1576. *slabs = NULL;
  1577. /* Skip over the slab list */
  1578. while (*str && *str != ';')
  1579. str++;
  1580. /* Skip any completely empty blocks */
  1581. while (*str && *str == ';')
  1582. str++;
  1583. if (init && higher_order_disable)
  1584. disable_higher_order_debug = 1;
  1585. if (*str)
  1586. return str;
  1587. else
  1588. return NULL;
  1589. }
  1590. static int __init setup_slub_debug(const char *str, const struct kernel_param *kp)
  1591. {
  1592. slab_flags_t flags;
  1593. slab_flags_t global_flags;
  1594. const char *saved_str;
  1595. const char *slab_list;
  1596. bool global_slub_debug_changed = false;
  1597. bool slab_list_specified = false;
  1598. global_flags = DEBUG_DEFAULT_FLAGS;
  1599. if (!str || !*str)
  1600. /*
  1601. * No options specified. Switch on full debugging.
  1602. */
  1603. goto out;
  1604. saved_str = str;
  1605. while (str) {
  1606. str = parse_slub_debug_flags(str, &flags, &slab_list, true);
  1607. if (!slab_list) {
  1608. global_flags = flags;
  1609. global_slub_debug_changed = true;
  1610. } else {
  1611. slab_list_specified = true;
  1612. if (flags & SLAB_STORE_USER)
  1613. stack_depot_request_early_init();
  1614. }
  1615. }
  1616. /*
  1617. * For backwards compatibility, a single list of flags with list of
  1618. * slabs means debugging is only changed for those slabs, so the global
  1619. * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
  1620. * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as
  1621. * long as there is no option specifying flags without a slab list.
  1622. */
  1623. if (slab_list_specified) {
  1624. if (!global_slub_debug_changed)
  1625. global_flags = slub_debug;
  1626. slub_debug_string = saved_str;
  1627. }
  1628. out:
  1629. slub_debug = global_flags;
  1630. if (slub_debug & SLAB_STORE_USER)
  1631. stack_depot_request_early_init();
  1632. if (slub_debug != 0 || slub_debug_string)
  1633. static_branch_enable(&slub_debug_enabled);
  1634. else
  1635. static_branch_disable(&slub_debug_enabled);
  1636. if ((static_branch_unlikely(&init_on_alloc) ||
  1637. static_branch_unlikely(&init_on_free)) &&
  1638. (slub_debug & SLAB_POISON))
  1639. pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
  1640. return 0;
  1641. }
/*
 * Parameter operations shared by the "slab_debug" and "slub_debug"
 * parameters: both names are registered below with the same ops and are
 * therefore handled by setup_slub_debug().  KERNEL_PARAM_OPS_FL_NOARG is set;
 * setup_slub_debug() itself treats a missing or empty value as a request for
 * DEBUG_DEFAULT_FLAGS.
 */
static const struct kernel_param_ops param_ops_slab_debug __initconst = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = setup_slub_debug,
};
__core_param_cb(slab_debug, &param_ops_slab_debug, NULL, 0);
__core_param_cb(slub_debug, &param_ops_slab_debug, NULL, 0);
  1648. /*
  1649. * kmem_cache_flags - apply debugging options to the cache
  1650. * @flags: flags to set
  1651. * @name: name of the cache
  1652. *
  1653. * Debug option(s) are applied to @flags. In addition to the debug
  1654. * option(s), if a slab name (or multiple) is specified i.e.
  1655. * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ...
  1656. * then only the select slabs will receive the debug option(s).
  1657. */
  1658. slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
  1659. {
  1660. const char *iter;
  1661. size_t len;
  1662. const char *next_block;
  1663. slab_flags_t block_flags;
  1664. slab_flags_t slub_debug_local = slub_debug;
  1665. if (flags & SLAB_NO_USER_FLAGS)
  1666. return flags;
  1667. /*
  1668. * If the slab cache is for debugging (e.g. kmemleak) then
  1669. * don't store user (stack trace) information by default,
  1670. * but let the user enable it via the command line below.
  1671. */
  1672. if (flags & SLAB_NOLEAKTRACE)
  1673. slub_debug_local &= ~SLAB_STORE_USER;
  1674. len = strlen(name);
  1675. next_block = slub_debug_string;
  1676. /* Go through all blocks of debug options, see if any matches our slab's name */
  1677. while (next_block) {
  1678. next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
  1679. if (!iter)
  1680. continue;
  1681. /* Found a block that has a slab list, search it */
  1682. while (*iter) {
  1683. const char *end, *glob;
  1684. size_t cmplen;
  1685. end = strchrnul(iter, ',');
  1686. if (next_block && next_block < end)
  1687. end = next_block - 1;
  1688. glob = strnchr(iter, end - iter, '*');
  1689. if (glob)
  1690. cmplen = glob - iter;
  1691. else
  1692. cmplen = max_t(size_t, len, (end - iter));
  1693. if (!strncmp(name, iter, cmplen)) {
  1694. flags |= block_flags;
  1695. return flags;
  1696. }
  1697. if (!*end || *end == ';')
  1698. break;
  1699. iter = end + 1;
  1700. }
  1701. }
  1702. return flags | slub_debug_local;
  1703. }
  1704. #else /* !CONFIG_SLUB_DEBUG */
/*
 * CONFIG_SLUB_DEBUG is disabled: the debugging hooks below are stubs. They do
 * nothing; the ones with a return value return a fixed result (true / 1 for
 * the checks, 0 for the stack depot handle).
 */
static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
/* Always reports that allocation-time debug processing succeeded. */
static inline bool alloc_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *object, int orig_size) { return true; }
/* Always reports that free-time debug processing succeeded. */
static inline bool free_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *head, void *tail, int *bulk_cnt,
	unsigned long addr, depot_stack_handle_t handle) { return true; }
static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
/* Always reports the object as valid. */
static inline int check_object(struct kmem_cache *s, struct slab *slab,
	void *object, u8 val) { return 1; }
/* No stack trace is recorded; the returned handle is always 0. */
static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
static inline void set_track(struct kmem_cache *s, void *object,
	enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
	struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
	struct slab *slab) {}
/*
 * Variant used when CONFIG_SLUB_DEBUG is disabled: there are no debug options
 * to apply, so @flags is returned unchanged and @name is ignored.
 */
slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
{
	return flags;
}
  1727. #define slub_debug 0
  1728. #define disable_higher_order_debug 0
/*
 * Stubs for the per-node slab counters when CONFIG_SLUB_DEBUG is disabled:
 * the count is always reported as 0 and updates are ignored.
 */
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
	{ return 0; }
static inline void inc_slabs_node(struct kmem_cache *s, int node,
	int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
	int objects) {}
  1735. #endif /* CONFIG_SLUB_DEBUG */
/*
 * The allocated objcg pointers array is not accounted directly.
 * Moreover, it should not come from DMA buffer and is not readily
 * reclaimable. So those GFP bits should be masked off.
 *
 * NOTE(review): the mask also clears __GFP_NOFAIL, which the explanation
 * above does not cover - presumably because a failed allocation of such
 * auxiliary data can be tolerated by the callers; confirm.
 */
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
			   __GFP_ACCOUNT | __GFP_NOFAIL)
  1743. #ifdef CONFIG_SLAB_OBJ_EXT
  1744. #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
  1745. static inline void mark_obj_codetag_empty(const void *obj)
  1746. {
  1747. struct slab *obj_slab;
  1748. unsigned long slab_exts;
  1749. obj_slab = virt_to_slab(obj);
  1750. slab_exts = slab_obj_exts(obj_slab);
  1751. if (slab_exts) {
  1752. get_slab_obj_exts(slab_exts);
  1753. unsigned int offs = obj_to_index(obj_slab->slab_cache,
  1754. obj_slab, obj);
  1755. struct slabobj_ext *ext = slab_obj_ext(obj_slab,
  1756. slab_exts, offs);
  1757. if (unlikely(is_codetag_empty(&ext->ref))) {
  1758. put_slab_obj_exts(slab_exts);
  1759. return;
  1760. }
  1761. /* codetag should be NULL here */
  1762. WARN_ON(ext->ref.ct);
  1763. set_codetag_empty(&ext->ref);
  1764. put_slab_obj_exts(slab_exts);
  1765. }
  1766. }
  1767. static inline bool mark_failed_objexts_alloc(struct slab *slab)
  1768. {
  1769. return cmpxchg(&slab->obj_exts, 0, OBJEXTS_ALLOC_FAIL) == 0;
  1770. }
  1771. static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
  1772. struct slabobj_ext *vec, unsigned int objects)
  1773. {
  1774. /*
  1775. * If vector previously failed to allocate then we have live
  1776. * objects with no tag reference. Mark all references in this
  1777. * vector as empty to avoid warnings later on.
  1778. */
  1779. if (obj_exts == OBJEXTS_ALLOC_FAIL) {
  1780. unsigned int i;
  1781. for (i = 0; i < objects; i++)
  1782. set_codetag_empty(&vec[i].ref);
  1783. }
  1784. }
  1785. #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
/*
 * CONFIG_MEM_ALLOC_PROFILING_DEBUG is disabled: there are no empty-codetag
 * markers to maintain, so these helpers do nothing.
 * mark_failed_objexts_alloc() always returns false, i.e. no failure mark is
 * ever stored.
 */
static inline void mark_obj_codetag_empty(const void *obj) {}
static inline bool mark_failed_objexts_alloc(struct slab *slab) { return false; }
static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
			struct slabobj_ext *vec, unsigned int objects) {}
  1790. #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
/* Reset slab->obj_exts to 0, i.e. "no extension vector attached". */
static inline void init_slab_obj_exts(struct slab *slab)
{
	slab->obj_exts = 0;
}
  1795. /*
  1796. * Calculate the allocation size for slabobj_ext array.
  1797. *
  1798. * When memory allocation profiling is enabled, the obj_exts array
  1799. * could be allocated from the same slab cache it's being allocated for.
  1800. * This would prevent the slab from ever being freed because it would
  1801. * always contain at least one allocated object (its own obj_exts array).
  1802. *
  1803. * To avoid this, increase the allocation size when we detect the array
  1804. * may come from the same cache, forcing it to use a different cache.
  1805. */
  1806. static inline size_t obj_exts_alloc_size(struct kmem_cache *s,
  1807. struct slab *slab, gfp_t gfp)
  1808. {
  1809. size_t sz = sizeof(struct slabobj_ext) * slab->objects;
  1810. struct kmem_cache *obj_exts_cache;
  1811. if (sz > KMALLOC_MAX_CACHE_SIZE)
  1812. return sz;
  1813. if (!is_kmalloc_normal(s))
  1814. return sz;
  1815. obj_exts_cache = kmalloc_slab(sz, NULL, gfp, 0);
  1816. /*
  1817. * We can't simply compare s with obj_exts_cache, because random kmalloc
  1818. * caches have multiple caches per size, selected by caller address.
  1819. * Since caller address may differ between kmalloc_slab() and actual
  1820. * allocation, bump size when sizes are equal.
  1821. */
  1822. if (s->object_size == obj_exts_cache->object_size)
  1823. return obj_exts_cache->object_size + 1;
  1824. return sz;
  1825. }
/*
 * Allocate a slabobj_ext vector for @slab and publish it in slab->obj_exts.
 *
 * @slab: slab that needs the vector
 * @s: cache used to size the vector (objs_per_slab(s, slab) entries)
 * @gfp: caller's allocation flags; the OBJCGS_CLEAR_MASK bits are dropped and
 *       __GFP_NO_OBJ_EXT is added before the vector is allocated
 * @new_slab: true if nobody else can access @slab's obj_exts yet, so the
 *            vector may be stored without synchronization
 *
 * Returns 0 if the slab has a vector on return - either the one allocated
 * here or one installed concurrently by somebody else - and -ENOMEM if the
 * allocation failed and no vector is present.
 */
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
			gfp_t gfp, bool new_slab)
{
	bool allow_spin = gfpflags_allow_spinning(gfp);
	unsigned int objects = objs_per_slab(s, slab);
	unsigned long new_exts;
	unsigned long old_exts;
	struct slabobj_ext *vec;
	size_t sz;

	gfp &= ~OBJCGS_CLEAR_MASK;
	/* Prevent recursive extension vector allocation */
	gfp |= __GFP_NO_OBJ_EXT;

	sz = obj_exts_alloc_size(s, slab, gfp);

	/*
	 * Note that allow_spin may be false during early boot and its
	 * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting
	 * architectures with cmpxchg16b, early obj_exts will be missing for
	 * very early allocations on those.
	 */
	if (unlikely(!allow_spin))
		vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
				     slab_nid(slab));
	else
		vec = kmalloc_node(sz, gfp | __GFP_ZERO, slab_nid(slab));

	if (!vec) {
		/*
		 * Try to mark vectors which failed to allocate.
		 * If this operation fails, there may be a racing process
		 * that has already completed the allocation.
		 */
		if (!mark_failed_objexts_alloc(slab) &&
		    slab_obj_exts(slab))
			return 0;

		return -ENOMEM;
	}

	/* The vector must not have been allocated from the cache it serves. */
	VM_WARN_ON_ONCE(virt_to_slab(vec) != NULL &&
			virt_to_slab(vec)->slab_cache == s);

	new_exts = (unsigned long)vec;
#ifdef CONFIG_MEMCG
	new_exts |= MEMCG_DATA_OBJEXTS;
#endif
retry:
	old_exts = READ_ONCE(slab->obj_exts);
	/* If an earlier attempt was marked as failed, pre-mark all entries. */
	handle_failed_objexts_alloc(old_exts, vec, objects);
	if (new_slab) {
		/*
		 * If the slab is brand new and nobody can yet access its
		 * obj_exts, no synchronization is required and obj_exts can
		 * be simply assigned.
		 */
		slab->obj_exts = new_exts;
	} else if (old_exts & ~OBJEXTS_FLAGS_MASK) {
		/*
		 * If the slab is already in use, somebody can allocate and
		 * assign slabobj_exts in parallel. In this case the existing
		 * objcg vector should be reused.
		 */
		mark_obj_codetag_empty(vec);
		/* Free with the variant matching how vec was allocated. */
		if (unlikely(!allow_spin))
			kfree_nolock(vec);
		else
			kfree(vec);
		return 0;
	} else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
		/* Retry if a racing thread changed slab->obj_exts from under us. */
		goto retry;
	}

	/* kmemleak is only informed for vectors from the spinning path. */
	if (allow_spin)
		kmemleak_not_leak(vec);
	return 0;
}
  1897. static inline void free_slab_obj_exts(struct slab *slab, bool allow_spin)
  1898. {
  1899. struct slabobj_ext *obj_exts;
  1900. obj_exts = (struct slabobj_ext *)slab_obj_exts(slab);
  1901. if (!obj_exts) {
  1902. /*
  1903. * If obj_exts allocation failed, slab->obj_exts is set to
  1904. * OBJEXTS_ALLOC_FAIL. In this case, we end up here and should
  1905. * clear the flag.
  1906. */
  1907. slab->obj_exts = 0;
  1908. return;
  1909. }
  1910. if (obj_exts_in_slab(slab->slab_cache, slab)) {
  1911. slab->obj_exts = 0;
  1912. return;
  1913. }
  1914. /*
  1915. * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
  1916. * corresponding extension will be NULL. alloc_tag_sub() will throw a
  1917. * warning if slab has extensions but the extension of an object is
  1918. * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that
  1919. * the extension for obj_exts is expected to be NULL.
  1920. */
  1921. mark_obj_codetag_empty(obj_exts);
  1922. if (allow_spin)
  1923. kfree(obj_exts);
  1924. else
  1925. kfree_nolock(obj_exts);
  1926. slab->obj_exts = 0;
  1927. }
/*
 * Try to allocate slabobj_ext array from unused space.
 * This function must be called on a freshly allocated slab to prevent
 * concurrency problems.
 *
 * Two placements are tried, in this order:
 *  - the leftover space of the slab, if the array fits there; the stride
 *    stays sizeof(struct slabobj_ext);
 *  - inside each object, if the cache has SLAB_OBJ_EXT_IN_OBJ set; the
 *    stride then becomes s->size.
 * If neither applies, or the cache does not need obj_exts at all,
 * slab->obj_exts is left untouched.
 */
static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab)
{
	void *addr;
	unsigned long obj_exts;

	/* Initialize stride early to avoid memory ordering issues */
	slab_set_stride(slab, sizeof(struct slabobj_ext));

	if (!need_slab_obj_exts(s))
		return;

	if (obj_exts_fit_within_slab_leftover(s, slab)) {
		/* The array is one contiguous region within the slab. */
		addr = slab_address(slab) + obj_exts_offset_in_slab(s, slab);
		addr = kasan_reset_tag(addr);
		obj_exts = (unsigned long)addr;

		get_slab_obj_exts(obj_exts);
		memset(addr, 0, obj_exts_size_in_slab(slab));
		put_slab_obj_exts(obj_exts);

#ifdef CONFIG_MEMCG
		obj_exts |= MEMCG_DATA_OBJEXTS;
#endif
		slab->obj_exts = obj_exts;
	} else if (s->flags & SLAB_OBJ_EXT_IN_OBJ) {
		unsigned int offset = obj_exts_offset_in_object(s);

		/*
		 * The base address is offset by the left red zone padding and
		 * the in-object offset from the start of the slab.
		 */
		obj_exts = (unsigned long)slab_address(slab);
		obj_exts += s->red_left_pad;
		obj_exts += offset;

		get_slab_obj_exts(obj_exts);
		/* Zero the slabobj_ext embedded in every object. */
		for_each_object(addr, s, slab_address(slab), slab->objects)
			memset(kasan_reset_tag(addr) + offset, 0,
			       sizeof(struct slabobj_ext));
		put_slab_obj_exts(obj_exts);

#ifdef CONFIG_MEMCG
		obj_exts |= MEMCG_DATA_OBJEXTS;
#endif
		slab->obj_exts = obj_exts;
		/* Consecutive entries are one whole object slot apart. */
		slab_set_stride(slab, s->size);
	}
}
  1969. #else /* CONFIG_SLAB_OBJ_EXT */
/*
 * CONFIG_SLAB_OBJ_EXT is disabled: slabs carry no object extension vectors,
 * so the helpers below do nothing. alloc_slab_obj_exts() always returns 0.
 */
static inline void mark_obj_codetag_empty(const void *obj)
{
}
static inline void init_slab_obj_exts(struct slab *slab)
{
}
/* Nothing to allocate; always returns 0. */
static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
			       gfp_t gfp, bool new_slab)
{
	return 0;
}
static inline void free_slab_obj_exts(struct slab *slab, bool allow_spin)
{
}
static inline void alloc_slab_obj_exts_early(struct kmem_cache *s,
					     struct slab *slab)
{
}
  1988. #endif /* CONFIG_SLAB_OBJ_EXT */
  1989. #ifdef CONFIG_MEM_ALLOC_PROFILING
  1990. static inline unsigned long
  1991. prepare_slab_obj_exts_hook(struct kmem_cache *s, struct slab *slab,
  1992. gfp_t flags, void *p)
  1993. {
  1994. if (!slab_obj_exts(slab) &&
  1995. alloc_slab_obj_exts(slab, s, flags, false)) {
  1996. pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
  1997. __func__, s->name);
  1998. return 0;
  1999. }
  2000. return slab_obj_exts(slab);
  2001. }
  2002. /* Should be called only if mem_alloc_profiling_enabled() */
  2003. static noinline void
  2004. __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
  2005. {
  2006. unsigned long obj_exts;
  2007. struct slabobj_ext *obj_ext;
  2008. struct slab *slab;
  2009. if (!object)
  2010. return;
  2011. if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
  2012. return;
  2013. if (flags & __GFP_NO_OBJ_EXT)
  2014. return;
  2015. slab = virt_to_slab(object);
  2016. obj_exts = prepare_slab_obj_exts_hook(s, slab, flags, object);
  2017. /*
  2018. * Currently obj_exts is used only for allocation profiling.
  2019. * If other users appear then mem_alloc_profiling_enabled()
  2020. * check should be added before alloc_tag_add().
  2021. */
  2022. if (obj_exts) {
  2023. unsigned int obj_idx = obj_to_index(s, slab, object);
  2024. get_slab_obj_exts(obj_exts);
  2025. obj_ext = slab_obj_ext(slab, obj_exts, obj_idx);
  2026. alloc_tag_add(&obj_ext->ref, current->alloc_tag, s->size);
  2027. put_slab_obj_exts(obj_exts);
  2028. } else {
  2029. alloc_tag_set_inaccurate(current->alloc_tag);
  2030. }
  2031. }
  2032. static inline void
  2033. alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
  2034. {
  2035. if (mem_alloc_profiling_enabled())
  2036. __alloc_tagging_slab_alloc_hook(s, object, flags);
  2037. }
  2038. /* Should be called only if mem_alloc_profiling_enabled() */
  2039. static noinline void
  2040. __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
  2041. int objects)
  2042. {
  2043. int i;
  2044. unsigned long obj_exts;
  2045. /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
  2046. if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
  2047. return;
  2048. obj_exts = slab_obj_exts(slab);
  2049. if (!obj_exts)
  2050. return;
  2051. get_slab_obj_exts(obj_exts);
  2052. for (i = 0; i < objects; i++) {
  2053. unsigned int off = obj_to_index(s, slab, p[i]);
  2054. alloc_tag_sub(&slab_obj_ext(slab, obj_exts, off)->ref, s->size);
  2055. }
  2056. put_slab_obj_exts(obj_exts);
  2057. }
  2058. static inline void
  2059. alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
  2060. int objects)
  2061. {
  2062. if (mem_alloc_profiling_enabled())
  2063. __alloc_tagging_slab_free_hook(s, slab, p, objects);
  2064. }
  2065. #else /* CONFIG_MEM_ALLOC_PROFILING */
/*
 * CONFIG_MEM_ALLOC_PROFILING is disabled: the allocation tagging hooks are
 * empty.
 */
static inline void
alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
}
static inline void
alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
			     int objects)
{
}
  2075. #endif /* CONFIG_MEM_ALLOC_PROFILING */
  2076. #ifdef CONFIG_MEMCG
  2077. static void memcg_alloc_abort_single(struct kmem_cache *s, void *object);
  2078. static __fastpath_inline
  2079. bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
  2080. gfp_t flags, size_t size, void **p)
  2081. {
  2082. if (likely(!memcg_kmem_online()))
  2083. return true;
  2084. if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
  2085. return true;
  2086. if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p)))
  2087. return true;
  2088. if (likely(size == 1)) {
  2089. memcg_alloc_abort_single(s, *p);
  2090. *p = NULL;
  2091. } else {
  2092. kmem_cache_free_bulk(s, size, p);
  2093. }
  2094. return false;
  2095. }
  2096. static __fastpath_inline
  2097. void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
  2098. int objects)
  2099. {
  2100. unsigned long obj_exts;
  2101. if (!memcg_kmem_online())
  2102. return;
  2103. obj_exts = slab_obj_exts(slab);
  2104. if (likely(!obj_exts))
  2105. return;
  2106. get_slab_obj_exts(obj_exts);
  2107. __memcg_slab_free_hook(s, slab, p, objects, obj_exts);
  2108. put_slab_obj_exts(obj_exts);
  2109. }
  2110. static __fastpath_inline
  2111. bool memcg_slab_post_charge(void *p, gfp_t flags)
  2112. {
  2113. unsigned long obj_exts;
  2114. struct slabobj_ext *obj_ext;
  2115. struct kmem_cache *s;
  2116. struct page *page;
  2117. struct slab *slab;
  2118. unsigned long off;
  2119. page = virt_to_page(p);
  2120. if (PageLargeKmalloc(page)) {
  2121. unsigned int order;
  2122. int size;
  2123. if (PageMemcgKmem(page))
  2124. return true;
  2125. order = large_kmalloc_order(page);
  2126. if (__memcg_kmem_charge_page(page, flags, order))
  2127. return false;
  2128. /*
  2129. * This page has already been accounted in the global stats but
  2130. * not in the memcg stats. So, subtract from the global and use
  2131. * the interface which adds to both global and memcg stats.
  2132. */
  2133. size = PAGE_SIZE << order;
  2134. mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size);
  2135. mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size);
  2136. return true;
  2137. }
  2138. slab = page_slab(page);
  2139. s = slab->slab_cache;
  2140. /*
  2141. * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency
  2142. * of slab_obj_exts being allocated from the same slab and thus the slab
  2143. * becoming effectively unfreeable.
  2144. */
  2145. if (is_kmalloc_normal(s))
  2146. return true;
  2147. /* Ignore already charged objects. */
  2148. obj_exts = slab_obj_exts(slab);
  2149. if (obj_exts) {
  2150. get_slab_obj_exts(obj_exts);
  2151. off = obj_to_index(s, slab, p);
  2152. obj_ext = slab_obj_ext(slab, obj_exts, off);
  2153. if (unlikely(obj_ext->objcg)) {
  2154. put_slab_obj_exts(obj_exts);
  2155. return true;
  2156. }
  2157. put_slab_obj_exts(obj_exts);
  2158. }
  2159. return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p);
  2160. }
  2161. #else /* CONFIG_MEMCG */
/*
 * CONFIG_MEMCG is disabled: there is nothing to charge or uncharge. The
 * hooks with a return value always return true; the free hook is empty.
 */
static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
					      struct list_lru *lru,
					      gfp_t flags, size_t size,
					      void **p)
{
	return true;
}
static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
					void **p, int objects)
{
}
static inline bool memcg_slab_post_charge(void *p, gfp_t flags)
{
	return true;
}
  2177. #endif /* CONFIG_MEMCG */
#ifdef CONFIG_SLUB_RCU_DEBUG
static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);
/*
 * Bookkeeping for an object whose free is deferred through call_rcu() by
 * slab_free_hook(). It is allocated separately from the object, so the
 * rcu_head does not live inside the object being freed.
 */
struct rcu_delayed_free {
	struct rcu_head head;	/* passed to call_rcu() */
	void *object;		/* the object whose free was deferred */
};
#endif
/*
 * Hooks for other subsystems that check memory allocations. In a typical
 * production configuration these hooks all should produce no code at all.
 *
 * @s: cache the object is freed to
 * @x: the object being freed
 * @init: whether the object and its metadata should be zeroed here
 * @after_rcu_delay: if true, the contents of objects from
 *                   SLAB_TYPESAFE_BY_RCU caches are no longer treated as
 *                   still accessible
 *
 * Returns true if freeing of the object can proceed, false if its reuse
 * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned
 * to KFENCE.
 *
 * For objects allocated via kmalloc_nolock(), only a subset of alloc hooks
 * are invoked, so some free hooks must handle asymmetric hook calls.
 *
 * Alloc hooks called for kmalloc_nolock():
 * - kmsan_slab_alloc()
 * - kasan_slab_alloc()
 * - memcg_slab_post_alloc_hook()
 * - alloc_tagging_slab_alloc_hook()
 *
 * Free hooks that must handle missing corresponding alloc hooks:
 * - kmemleak_free_recursive()
 * - kfence_free()
 *
 * Free hooks that have no alloc hook counterpart, and thus safe to call:
 * - debug_check_no_locks_freed()
 * - debug_check_no_obj_freed()
 * - __kcsan_check_access()
 */
static __always_inline
bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
		    bool after_rcu_delay)
{
	/* Are the object contents still accessible? */
	bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay;

	kmemleak_free_recursive(x, s->flags);
	kmsan_slab_free(s, x);

	debug_check_no_locks_freed(x, s->object_size);

	if (!(s->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(x, s->object_size);

	/* Use KCSAN to help debug racy use-after-free. */
	if (!still_accessible)
		__kcsan_check_access(x, s->object_size,
				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);

	/* The object was handed back to KFENCE; nothing more to do here. */
	if (kfence_free(x))
		return false;

	/*
	 * Give KASAN a chance to notice an invalid free operation before we
	 * modify the object.
	 */
	if (kasan_slab_pre_free(s, x))
		return false;

#ifdef CONFIG_SLUB_RCU_DEBUG
	if (still_accessible) {
		struct rcu_delayed_free *delayed_free;

		/*
		 * If this allocation fails, fall through and continue with
		 * the rest of the hook without deferring.
		 */
		delayed_free = kmalloc_obj(*delayed_free, GFP_NOWAIT);
		if (delayed_free) {
			/*
			 * Let KASAN track our call stack as a "related work
			 * creation", just like if the object had been freed
			 * normally via kfree_rcu().
			 * We have to do this manually because the rcu_head is
			 * not located inside the object.
			 */
			kasan_record_aux_stack(x);

			delayed_free->object = x;
			call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
			return false;
		}
	}
#endif /* CONFIG_SLUB_RCU_DEBUG */

	/*
	 * As memory initialization might be integrated into KASAN,
	 * kasan_slab_free and initialization memset's must be
	 * kept together to avoid discrepancies in behavior.
	 *
	 * The initialization memset's clear the object and the metadata,
	 * but don't touch the SLAB redzone.
	 *
	 * The object's freepointer is also avoided if stored outside the
	 * object.
	 */
	if (unlikely(init)) {
		int rsize;
		unsigned int inuse, orig_size;

		inuse = get_info_end(s);
		orig_size = get_orig_size(s, x);
		/* Zero the object itself unless KASAN does the init. */
		if (!kasan_has_integrated_init())
			memset(kasan_reset_tag(x), 0, orig_size);
		rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
		/* Zero from get_info_end() up to s->size, minus rsize. */
		memset((char *)kasan_reset_tag(x) + inuse, 0,
		       s->size - inuse - rsize);
		/*
		 * Restore orig_size, otherwise kmalloc redzone overwritten
		 * would be reported
		 */
		set_orig_size(s, x, orig_size);

	}
	/* KASAN might put x into memory quarantine, delaying its reuse. */
	return !kasan_slab_free(s, x, init, still_accessible, false);
}
  2283. static __fastpath_inline
  2284. bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
  2285. int *cnt)
  2286. {
  2287. void *object;
  2288. void *next = *head;
  2289. void *old_tail = *tail;
  2290. bool init;
  2291. if (is_kfence_address(next)) {
  2292. slab_free_hook(s, next, false, false);
  2293. return false;
  2294. }
  2295. /* Head and tail of the reconstructed freelist */
  2296. *head = NULL;
  2297. *tail = NULL;
  2298. init = slab_want_init_on_free(s);
  2299. do {
  2300. object = next;
  2301. next = get_freepointer(s, object);
  2302. /* If object's reuse doesn't have to be delayed */
  2303. if (likely(slab_free_hook(s, object, init, false))) {
  2304. /* Move object to the new freelist */
  2305. set_freepointer(s, object, *head);
  2306. *head = object;
  2307. if (!*tail)
  2308. *tail = object;
  2309. } else {
  2310. /*
  2311. * Adjust the reconstructed freelist depth
  2312. * accordingly if object's reuse is delayed.
  2313. */
  2314. --(*cnt);
  2315. }
  2316. } while (object != old_tail);
  2317. return *head != NULL;
  2318. }
  2319. static void *setup_object(struct kmem_cache *s, void *object)
  2320. {
  2321. setup_object_debug(s, object);
  2322. object = kasan_init_slab_obj(s, object);
  2323. if (unlikely(s->ctor)) {
  2324. kasan_unpoison_new_object(s, object);
  2325. s->ctor(object);
  2326. kasan_poison_new_object(s, object);
  2327. }
  2328. return object;
  2329. }
  2330. static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp,
  2331. unsigned int capacity)
  2332. {
  2333. struct slab_sheaf *sheaf;
  2334. size_t sheaf_size;
  2335. if (gfp & __GFP_NO_OBJ_EXT)
  2336. return NULL;
  2337. gfp &= ~OBJCGS_CLEAR_MASK;
  2338. /*
  2339. * Prevent recursion to the same cache, or a deep stack of kmallocs of
  2340. * varying sizes (sheaf capacity might differ for each kmalloc size
  2341. * bucket)
  2342. */
  2343. if (s->flags & SLAB_KMALLOC)
  2344. gfp |= __GFP_NO_OBJ_EXT;
  2345. sheaf_size = struct_size(sheaf, objects, capacity);
  2346. sheaf = kzalloc(sheaf_size, gfp);
  2347. if (unlikely(!sheaf))
  2348. return NULL;
  2349. sheaf->cache = s;
  2350. stat(s, SHEAF_ALLOC);
  2351. return sheaf;
  2352. }
  2353. static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s,
  2354. gfp_t gfp)
  2355. {
  2356. return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity);
  2357. }
  2358. static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
  2359. {
  2360. /*
  2361. * If the sheaf was created with __GFP_NO_OBJ_EXT flag then its
  2362. * corresponding extension is NULL and alloc_tag_sub() will throw a
  2363. * warning, therefore replace NULL with CODETAG_EMPTY to indicate
  2364. * that the extension for this sheaf is expected to be NULL.
  2365. */
  2366. if (s->flags & SLAB_KMALLOC)
  2367. mark_obj_codetag_empty(sheaf);
  2368. VM_WARN_ON_ONCE(sheaf->size > 0);
  2369. kfree(sheaf);
  2370. stat(s, SHEAF_FREE);
  2371. }
  2372. static unsigned int
  2373. refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
  2374. unsigned int max);
  2375. static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
  2376. gfp_t gfp)
  2377. {
  2378. int to_fill = s->sheaf_capacity - sheaf->size;
  2379. int filled;
  2380. if (!to_fill)
  2381. return 0;
  2382. filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill,
  2383. to_fill);
  2384. sheaf->size += filled;
  2385. stat_add(s, SHEAF_REFILL, filled);
  2386. if (filled < to_fill)
  2387. return -ENOMEM;
  2388. return 0;
  2389. }
  2390. static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf);
  2391. static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
  2392. {
  2393. struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp);
  2394. if (!sheaf)
  2395. return NULL;
  2396. if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
  2397. sheaf_flush_unused(s, sheaf);
  2398. free_empty_sheaf(s, sheaf);
  2399. return NULL;
  2400. }
  2401. return sheaf;
  2402. }
  2403. /*
  2404. * Maximum number of objects freed during a single flush of main pcs sheaf.
  2405. * Translates directly to an on-stack array size.
  2406. */
  2407. #define PCS_BATCH_MAX 32U
  2408. static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
  2409. /*
  2410. * Free all objects from the main sheaf. In order to perform
  2411. * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where
  2412. * object pointers are moved to a on-stack array under the lock. To bound the
  2413. * stack usage, limit each batch to PCS_BATCH_MAX.
  2414. *
  2415. * Must be called with s->cpu_sheaves->lock locked, returns with the lock
  2416. * unlocked.
  2417. *
  2418. * Returns how many objects are remaining to be flushed
  2419. */
  2420. static unsigned int __sheaf_flush_main_batch(struct kmem_cache *s)
  2421. {
  2422. struct slub_percpu_sheaves *pcs;
  2423. unsigned int batch, remaining;
  2424. void *objects[PCS_BATCH_MAX];
  2425. struct slab_sheaf *sheaf;
  2426. lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
  2427. pcs = this_cpu_ptr(s->cpu_sheaves);
  2428. sheaf = pcs->main;
  2429. batch = min(PCS_BATCH_MAX, sheaf->size);
  2430. sheaf->size -= batch;
  2431. memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *));
  2432. remaining = sheaf->size;
  2433. local_unlock(&s->cpu_sheaves->lock);
  2434. __kmem_cache_free_bulk(s, batch, &objects[0]);
  2435. stat_add(s, SHEAF_FLUSH, batch);
  2436. return remaining;
  2437. }
  2438. static void sheaf_flush_main(struct kmem_cache *s)
  2439. {
  2440. unsigned int remaining;
  2441. do {
  2442. local_lock(&s->cpu_sheaves->lock);
  2443. remaining = __sheaf_flush_main_batch(s);
  2444. } while (remaining);
  2445. }
  2446. /*
  2447. * Returns true if the main sheaf was at least partially flushed.
  2448. */
  2449. static bool sheaf_try_flush_main(struct kmem_cache *s)
  2450. {
  2451. unsigned int remaining;
  2452. bool ret = false;
  2453. do {
  2454. if (!local_trylock(&s->cpu_sheaves->lock))
  2455. return ret;
  2456. ret = true;
  2457. remaining = __sheaf_flush_main_batch(s);
  2458. } while (remaining);
  2459. return ret;
  2460. }
  2461. /*
  2462. * Free all objects from a sheaf that's unused, i.e. not linked to any
  2463. * cpu_sheaves, so we need no locking and batching. The locking is also not
  2464. * necessary when flushing cpu's sheaves (both spare and main) during cpu
  2465. * hotremove as the cpu is not executing anymore.
  2466. */
  2467. static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
  2468. {
  2469. if (!sheaf->size)
  2470. return;
  2471. stat_add(s, SHEAF_FLUSH, sheaf->size);
  2472. __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]);
  2473. sheaf->size = 0;
  2474. }
  2475. static bool __rcu_free_sheaf_prepare(struct kmem_cache *s,
  2476. struct slab_sheaf *sheaf)
  2477. {
  2478. bool init = slab_want_init_on_free(s);
  2479. void **p = &sheaf->objects[0];
  2480. unsigned int i = 0;
  2481. bool pfmemalloc = false;
  2482. while (i < sheaf->size) {
  2483. struct slab *slab = virt_to_slab(p[i]);
  2484. memcg_slab_free_hook(s, slab, p + i, 1);
  2485. alloc_tagging_slab_free_hook(s, slab, p + i, 1);
  2486. if (unlikely(!slab_free_hook(s, p[i], init, true))) {
  2487. p[i] = p[--sheaf->size];
  2488. continue;
  2489. }
  2490. if (slab_test_pfmemalloc(slab))
  2491. pfmemalloc = true;
  2492. i++;
  2493. }
  2494. return pfmemalloc;
  2495. }
  2496. static void rcu_free_sheaf_nobarn(struct rcu_head *head)
  2497. {
  2498. struct slab_sheaf *sheaf;
  2499. struct kmem_cache *s;
  2500. sheaf = container_of(head, struct slab_sheaf, rcu_head);
  2501. s = sheaf->cache;
  2502. __rcu_free_sheaf_prepare(s, sheaf);
  2503. sheaf_flush_unused(s, sheaf);
  2504. free_empty_sheaf(s, sheaf);
  2505. }
  2506. /*
  2507. * Caller needs to make sure migration is disabled in order to fully flush
  2508. * single cpu's sheaves
  2509. *
  2510. * must not be called from an irq
  2511. *
  2512. * flushing operations are rare so let's keep it simple and flush to slabs
  2513. * directly, skipping the barn
  2514. */
  2515. static void pcs_flush_all(struct kmem_cache *s)
  2516. {
  2517. struct slub_percpu_sheaves *pcs;
  2518. struct slab_sheaf *spare, *rcu_free;
  2519. local_lock(&s->cpu_sheaves->lock);
  2520. pcs = this_cpu_ptr(s->cpu_sheaves);
  2521. spare = pcs->spare;
  2522. pcs->spare = NULL;
  2523. rcu_free = pcs->rcu_free;
  2524. pcs->rcu_free = NULL;
  2525. local_unlock(&s->cpu_sheaves->lock);
  2526. if (spare) {
  2527. sheaf_flush_unused(s, spare);
  2528. free_empty_sheaf(s, spare);
  2529. }
  2530. if (rcu_free)
  2531. call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
  2532. sheaf_flush_main(s);
  2533. }
  2534. static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
  2535. {
  2536. struct slub_percpu_sheaves *pcs;
  2537. pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
  2538. /* The cpu is not executing anymore so we don't need pcs->lock */
  2539. sheaf_flush_unused(s, pcs->main);
  2540. if (pcs->spare) {
  2541. sheaf_flush_unused(s, pcs->spare);
  2542. free_empty_sheaf(s, pcs->spare);
  2543. pcs->spare = NULL;
  2544. }
  2545. if (pcs->rcu_free) {
  2546. call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
  2547. pcs->rcu_free = NULL;
  2548. }
  2549. }
  2550. static void pcs_destroy(struct kmem_cache *s)
  2551. {
  2552. int cpu;
  2553. /*
  2554. * We may be unwinding cache creation that failed before or during the
  2555. * allocation of this.
  2556. */
  2557. if (!s->cpu_sheaves)
  2558. return;
  2559. /* pcs->main can only point to the bootstrap sheaf, nothing to free */
  2560. if (!cache_has_sheaves(s))
  2561. goto free_pcs;
  2562. for_each_possible_cpu(cpu) {
  2563. struct slub_percpu_sheaves *pcs;
  2564. pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
  2565. /* This can happen when unwinding failed cache creation. */
  2566. if (!pcs->main)
  2567. continue;
  2568. /*
  2569. * We have already passed __kmem_cache_shutdown() so everything
  2570. * was flushed and there should be no objects allocated from
  2571. * slabs, otherwise kmem_cache_destroy() would have aborted.
  2572. * Therefore something would have to be really wrong if the
  2573. * warnings here trigger, and we should rather leave objects and
  2574. * sheaves to leak in that case.
  2575. */
  2576. WARN_ON(pcs->spare);
  2577. WARN_ON(pcs->rcu_free);
  2578. if (!WARN_ON(pcs->main->size)) {
  2579. free_empty_sheaf(s, pcs->main);
  2580. pcs->main = NULL;
  2581. }
  2582. }
  2583. free_pcs:
  2584. free_percpu(s->cpu_sheaves);
  2585. s->cpu_sheaves = NULL;
  2586. }
  2587. static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn,
  2588. bool allow_spin)
  2589. {
  2590. struct slab_sheaf *empty = NULL;
  2591. unsigned long flags;
  2592. if (!data_race(barn->nr_empty))
  2593. return NULL;
  2594. if (likely(allow_spin))
  2595. spin_lock_irqsave(&barn->lock, flags);
  2596. else if (!spin_trylock_irqsave(&barn->lock, flags))
  2597. return NULL;
  2598. if (likely(barn->nr_empty)) {
  2599. empty = list_first_entry(&barn->sheaves_empty,
  2600. struct slab_sheaf, barn_list);
  2601. list_del(&empty->barn_list);
  2602. barn->nr_empty--;
  2603. }
  2604. spin_unlock_irqrestore(&barn->lock, flags);
  2605. return empty;
  2606. }
  2607. /*
  2608. * The following two functions are used mainly in cases where we have to undo an
  2609. * intended action due to a race or cpu migration. Thus they do not check the
  2610. * empty or full sheaf limits for simplicity.
  2611. */
  2612. static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
  2613. {
  2614. unsigned long flags;
  2615. spin_lock_irqsave(&barn->lock, flags);
  2616. list_add(&sheaf->barn_list, &barn->sheaves_empty);
  2617. barn->nr_empty++;
  2618. spin_unlock_irqrestore(&barn->lock, flags);
  2619. }
  2620. static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
  2621. {
  2622. unsigned long flags;
  2623. spin_lock_irqsave(&barn->lock, flags);
  2624. list_add(&sheaf->barn_list, &barn->sheaves_full);
  2625. barn->nr_full++;
  2626. spin_unlock_irqrestore(&barn->lock, flags);
  2627. }
  2628. static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
  2629. {
  2630. struct slab_sheaf *sheaf = NULL;
  2631. unsigned long flags;
  2632. if (!data_race(barn->nr_full) && !data_race(barn->nr_empty))
  2633. return NULL;
  2634. spin_lock_irqsave(&barn->lock, flags);
  2635. if (barn->nr_full) {
  2636. sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
  2637. barn_list);
  2638. list_del(&sheaf->barn_list);
  2639. barn->nr_full--;
  2640. } else if (barn->nr_empty) {
  2641. sheaf = list_first_entry(&barn->sheaves_empty,
  2642. struct slab_sheaf, barn_list);
  2643. list_del(&sheaf->barn_list);
  2644. barn->nr_empty--;
  2645. }
  2646. spin_unlock_irqrestore(&barn->lock, flags);
  2647. return sheaf;
  2648. }
  2649. /*
  2650. * If a full sheaf is available, return it and put the supplied empty one to
  2651. * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't
  2652. * change.
  2653. */
  2654. static struct slab_sheaf *
  2655. barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty,
  2656. bool allow_spin)
  2657. {
  2658. struct slab_sheaf *full = NULL;
  2659. unsigned long flags;
  2660. if (!data_race(barn->nr_full))
  2661. return NULL;
  2662. if (likely(allow_spin))
  2663. spin_lock_irqsave(&barn->lock, flags);
  2664. else if (!spin_trylock_irqsave(&barn->lock, flags))
  2665. return NULL;
  2666. if (likely(barn->nr_full)) {
  2667. full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
  2668. barn_list);
  2669. list_del(&full->barn_list);
  2670. list_add(&empty->barn_list, &barn->sheaves_empty);
  2671. barn->nr_full--;
  2672. barn->nr_empty++;
  2673. }
  2674. spin_unlock_irqrestore(&barn->lock, flags);
  2675. return full;
  2676. }
  2677. /*
  2678. * If an empty sheaf is available, return it and put the supplied full one to
  2679. * barn. But if there are too many full sheaves, reject this with -E2BIG.
  2680. */
  2681. static struct slab_sheaf *
  2682. barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full,
  2683. bool allow_spin)
  2684. {
  2685. struct slab_sheaf *empty;
  2686. unsigned long flags;
  2687. /* we don't repeat this check under barn->lock as it's not critical */
  2688. if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES)
  2689. return ERR_PTR(-E2BIG);
  2690. if (!data_race(barn->nr_empty))
  2691. return ERR_PTR(-ENOMEM);
  2692. if (likely(allow_spin))
  2693. spin_lock_irqsave(&barn->lock, flags);
  2694. else if (!spin_trylock_irqsave(&barn->lock, flags))
  2695. return ERR_PTR(-EBUSY);
  2696. if (likely(barn->nr_empty)) {
  2697. empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf,
  2698. barn_list);
  2699. list_del(&empty->barn_list);
  2700. list_add(&full->barn_list, &barn->sheaves_full);
  2701. barn->nr_empty--;
  2702. barn->nr_full++;
  2703. } else {
  2704. empty = ERR_PTR(-ENOMEM);
  2705. }
  2706. spin_unlock_irqrestore(&barn->lock, flags);
  2707. return empty;
  2708. }
  2709. static void barn_init(struct node_barn *barn)
  2710. {
  2711. spin_lock_init(&barn->lock);
  2712. INIT_LIST_HEAD(&barn->sheaves_full);
  2713. INIT_LIST_HEAD(&barn->sheaves_empty);
  2714. barn->nr_full = 0;
  2715. barn->nr_empty = 0;
  2716. }
  2717. static void barn_shrink(struct kmem_cache *s, struct node_barn *barn)
  2718. {
  2719. LIST_HEAD(empty_list);
  2720. LIST_HEAD(full_list);
  2721. struct slab_sheaf *sheaf, *sheaf2;
  2722. unsigned long flags;
  2723. spin_lock_irqsave(&barn->lock, flags);
  2724. list_splice_init(&barn->sheaves_full, &full_list);
  2725. barn->nr_full = 0;
  2726. list_splice_init(&barn->sheaves_empty, &empty_list);
  2727. barn->nr_empty = 0;
  2728. spin_unlock_irqrestore(&barn->lock, flags);
  2729. list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) {
  2730. sheaf_flush_unused(s, sheaf);
  2731. free_empty_sheaf(s, sheaf);
  2732. }
  2733. list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list)
  2734. free_empty_sheaf(s, sheaf);
  2735. }
  2736. /*
  2737. * Slab allocation and freeing
  2738. */
  2739. static inline struct slab *alloc_slab_page(gfp_t flags, int node,
  2740. struct kmem_cache_order_objects oo,
  2741. bool allow_spin)
  2742. {
  2743. struct page *page;
  2744. struct slab *slab;
  2745. unsigned int order = oo_order(oo);
  2746. if (unlikely(!allow_spin))
  2747. page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
  2748. node, order);
  2749. else if (node == NUMA_NO_NODE)
  2750. page = alloc_frozen_pages(flags, order);
  2751. else
  2752. page = __alloc_frozen_pages(flags, order, node, NULL);
  2753. if (!page)
  2754. return NULL;
  2755. __SetPageSlab(page);
  2756. slab = page_slab(page);
  2757. if (page_is_pfmemalloc(page))
  2758. slab_set_pfmemalloc(slab);
  2759. return slab;
  2760. }
  2761. #ifdef CONFIG_SLAB_FREELIST_RANDOM
  2762. /* Pre-initialize the random sequence cache */
  2763. static int init_cache_random_seq(struct kmem_cache *s)
  2764. {
  2765. unsigned int count = oo_objects(s->oo);
  2766. int err;
  2767. /* Bailout if already initialised */
  2768. if (s->random_seq)
  2769. return 0;
  2770. err = cache_random_seq_create(s, count, GFP_KERNEL);
  2771. if (err) {
  2772. pr_err("SLUB: Unable to initialize free list for %s\n",
  2773. s->name);
  2774. return err;
  2775. }
  2776. /* Transform to an offset on the set of pages */
  2777. if (s->random_seq) {
  2778. unsigned int i;
  2779. for (i = 0; i < count; i++)
  2780. s->random_seq[i] *= s->size;
  2781. }
  2782. return 0;
  2783. }
  2784. /* Initialize each random sequence freelist per cache */
  2785. static void __init init_freelist_randomization(void)
  2786. {
  2787. struct kmem_cache *s;
  2788. mutex_lock(&slab_mutex);
  2789. list_for_each_entry(s, &slab_caches, list)
  2790. init_cache_random_seq(s);
  2791. mutex_unlock(&slab_mutex);
  2792. }
  2793. /* Get the next entry on the pre-computed freelist randomized */
  2794. static void *next_freelist_entry(struct kmem_cache *s,
  2795. unsigned long *pos, void *start,
  2796. unsigned long page_limit,
  2797. unsigned long freelist_count)
  2798. {
  2799. unsigned int idx;
  2800. /*
  2801. * If the target page allocation failed, the number of objects on the
  2802. * page might be smaller than the usual size defined by the cache.
  2803. */
  2804. do {
  2805. idx = s->random_seq[*pos];
  2806. *pos += 1;
  2807. if (*pos >= freelist_count)
  2808. *pos = 0;
  2809. } while (unlikely(idx >= page_limit));
  2810. return (char *)start + idx;
  2811. }
  2812. static DEFINE_PER_CPU(struct rnd_state, slab_rnd_state);
  2813. /* Shuffle the single linked freelist based on a random pre-computed sequence */
  2814. static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
  2815. bool allow_spin)
  2816. {
  2817. void *start;
  2818. void *cur;
  2819. void *next;
  2820. unsigned long idx, pos, page_limit, freelist_count;
  2821. if (slab->objects < 2 || !s->random_seq)
  2822. return false;
  2823. freelist_count = oo_objects(s->oo);
  2824. if (allow_spin) {
  2825. pos = get_random_u32_below(freelist_count);
  2826. } else {
  2827. struct rnd_state *state;
  2828. /*
  2829. * An interrupt or NMI handler might interrupt and change
  2830. * the state in the middle, but that's safe.
  2831. */
  2832. state = &get_cpu_var(slab_rnd_state);
  2833. pos = prandom_u32_state(state) % freelist_count;
  2834. put_cpu_var(slab_rnd_state);
  2835. }
  2836. page_limit = slab->objects * s->size;
  2837. start = fixup_red_left(s, slab_address(slab));
  2838. /* First entry is used as the base of the freelist */
  2839. cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count);
  2840. cur = setup_object(s, cur);
  2841. slab->freelist = cur;
  2842. for (idx = 1; idx < slab->objects; idx++) {
  2843. next = next_freelist_entry(s, &pos, start, page_limit,
  2844. freelist_count);
  2845. next = setup_object(s, next);
  2846. set_freepointer(s, cur, next);
  2847. cur = next;
  2848. }
  2849. set_freepointer(s, cur, NULL);
  2850. return true;
  2851. }
  2852. #else
/*
 * Stubs used when CONFIG_SLAB_FREELIST_RANDOM is not enabled.
 */

/* No random sequence to set up; always reports success. */
static inline int init_cache_random_seq(struct kmem_cache *s)
{
	return 0;
}
/* Nothing to initialize. */
static inline void init_freelist_randomization(void) { }
/*
 * Never shuffles. Returning false makes allocate_slab() build the freelist
 * sequentially.
 */
static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
				    bool allow_spin)
{
	return false;
}
  2863. #endif /* CONFIG_SLAB_FREELIST_RANDOM */
  2864. static __always_inline void account_slab(struct slab *slab, int order,
  2865. struct kmem_cache *s, gfp_t gfp)
  2866. {
  2867. if (memcg_kmem_online() &&
  2868. (s->flags & SLAB_ACCOUNT) &&
  2869. !slab_obj_exts(slab))
  2870. alloc_slab_obj_exts(slab, s, gfp, true);
  2871. mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
  2872. PAGE_SIZE << order);
  2873. }
  2874. static __always_inline void unaccount_slab(struct slab *slab, int order,
  2875. struct kmem_cache *s, bool allow_spin)
  2876. {
  2877. /*
  2878. * The slab object extensions should now be freed regardless of
  2879. * whether mem_alloc_profiling_enabled() or not because profiling
  2880. * might have been disabled after slab->obj_exts got allocated.
  2881. */
  2882. free_slab_obj_exts(slab, allow_spin);
  2883. mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
  2884. -(PAGE_SIZE << order));
  2885. }
  2886. static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
  2887. {
  2888. bool allow_spin = gfpflags_allow_spinning(flags);
  2889. struct slab *slab;
  2890. struct kmem_cache_order_objects oo = s->oo;
  2891. gfp_t alloc_gfp;
  2892. void *start, *p, *next;
  2893. int idx;
  2894. bool shuffle;
  2895. flags &= gfp_allowed_mask;
  2896. flags |= s->allocflags;
  2897. /*
  2898. * Let the initial higher-order allocation fail under memory pressure
  2899. * so we fall-back to the minimum order allocation.
  2900. */
  2901. alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
  2902. if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
  2903. alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
  2904. /*
  2905. * __GFP_RECLAIM could be cleared on the first allocation attempt,
  2906. * so pass allow_spin flag directly.
  2907. */
  2908. slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
  2909. if (unlikely(!slab)) {
  2910. oo = s->min;
  2911. alloc_gfp = flags;
  2912. /*
  2913. * Allocation may have failed due to fragmentation.
  2914. * Try a lower order alloc if possible
  2915. */
  2916. slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
  2917. if (unlikely(!slab))
  2918. return NULL;
  2919. stat(s, ORDER_FALLBACK);
  2920. }
  2921. slab->objects = oo_objects(oo);
  2922. slab->inuse = 0;
  2923. slab->frozen = 0;
  2924. slab->slab_cache = s;
  2925. kasan_poison_slab(slab);
  2926. start = slab_address(slab);
  2927. setup_slab_debug(s, slab, start);
  2928. init_slab_obj_exts(slab);
  2929. /*
  2930. * Poison the slab before initializing the slabobj_ext array
  2931. * to prevent the array from being overwritten.
  2932. */
  2933. alloc_slab_obj_exts_early(s, slab);
  2934. account_slab(slab, oo_order(oo), s, flags);
  2935. shuffle = shuffle_freelist(s, slab, allow_spin);
  2936. if (!shuffle) {
  2937. start = fixup_red_left(s, start);
  2938. start = setup_object(s, start);
  2939. slab->freelist = start;
  2940. for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
  2941. next = p + s->size;
  2942. next = setup_object(s, next);
  2943. set_freepointer(s, p, next);
  2944. p = next;
  2945. }
  2946. set_freepointer(s, p, NULL);
  2947. }
  2948. return slab;
  2949. }
  2950. static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
  2951. {
  2952. if (unlikely(flags & GFP_SLAB_BUG_MASK))
  2953. flags = kmalloc_fix_flags(flags);
  2954. WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
  2955. return allocate_slab(s,
  2956. flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
  2957. }
  2958. static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin)
  2959. {
  2960. struct page *page = slab_page(slab);
  2961. int order = compound_order(page);
  2962. int pages = 1 << order;
  2963. __slab_clear_pfmemalloc(slab);
  2964. page->mapping = NULL;
  2965. __ClearPageSlab(page);
  2966. mm_account_reclaimed_pages(pages);
  2967. unaccount_slab(slab, order, s, allow_spin);
  2968. if (allow_spin)
  2969. free_frozen_pages(page, order);
  2970. else
  2971. free_frozen_pages_nolock(page, order);
  2972. }
/*
 * Free a slab that was just allocated, using the non-spinning
 * (allow_spin == false) path of __free_slab().
 */
static void free_new_slab_nolock(struct kmem_cache *s, struct slab *slab)
{
	/*
	 * Since it was just allocated, we can skip the actions in
	 * discard_slab() and free_slab().
	 */
	__free_slab(s, slab, false);
}
  2981. static void rcu_free_slab(struct rcu_head *h)
  2982. {
  2983. struct slab *slab = container_of(h, struct slab, rcu_head);
  2984. __free_slab(slab->slab_cache, slab, true);
  2985. }
  2986. static void free_slab(struct kmem_cache *s, struct slab *slab)
  2987. {
  2988. if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
  2989. void *p;
  2990. slab_pad_check(s, slab);
  2991. for_each_object(p, s, slab_address(slab), slab->objects)
  2992. check_object(s, slab, p, SLUB_RED_INACTIVE);
  2993. }
  2994. if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
  2995. call_rcu(&slab->rcu_head, rcu_free_slab);
  2996. else
  2997. __free_slab(s, slab, true);
  2998. }
  2999. static void discard_slab(struct kmem_cache *s, struct slab *slab)
  3000. {
  3001. dec_slabs_node(s, slab_nid(slab), slab->objects);
  3002. free_slab(s, slab);
  3003. }
/*
 * Helpers for the SL_partial bit in slab->flags. The bit is set by
 * __add_partial() and cleared by remove_partial(), i.e. it tracks whether the
 * slab is on a node's partial list.
 */

/* Returns true if SL_partial is set on @slab. */
static inline bool slab_test_node_partial(const struct slab *slab)
{
	return test_bit(SL_partial, &slab->flags.f);
}
/* Set SL_partial on @slab. */
static inline void slab_set_node_partial(struct slab *slab)
{
	set_bit(SL_partial, &slab->flags.f);
}
/* Clear SL_partial on @slab. */
static inline void slab_clear_node_partial(struct slab *slab)
{
	clear_bit(SL_partial, &slab->flags.f);
}
  3016. /*
  3017. * Management of partially allocated slabs.
  3018. */
  3019. static inline void
  3020. __add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode)
  3021. {
  3022. n->nr_partial++;
  3023. if (mode == ADD_TO_TAIL)
  3024. list_add_tail(&slab->slab_list, &n->partial);
  3025. else
  3026. list_add(&slab->slab_list, &n->partial);
  3027. slab_set_node_partial(slab);
  3028. }
/*
 * Add @slab to the partial list of @n according to @mode.
 * Caller must hold n->list_lock (checked by lockdep).
 */
static inline void add_partial(struct kmem_cache_node *n,
				struct slab *slab, enum add_mode mode)
{
	lockdep_assert_held(&n->list_lock);
	__add_partial(n, slab, mode);
}
/*
 * Take @slab off the partial list of @n: unlink it, clear its SL_partial
 * marking and decrement n->nr_partial.
 * Caller must hold n->list_lock (checked by lockdep).
 */
static inline void remove_partial(struct kmem_cache_node *n,
					struct slab *slab)
{
	lockdep_assert_held(&n->list_lock);
	list_del(&slab->slab_list);
	slab_clear_node_partial(slab);
	n->nr_partial--;
}
  3043. /*
  3044. * Called only for kmem_cache_debug() caches instead of remove_partial(), with a
  3045. * slab from the n->partial list. Remove only a single object from the slab, do
  3046. * the alloc_debug_processing() checks and leave the slab on the list, or move
  3047. * it to full list if it was the last free object.
  3048. */
  3049. static void *alloc_single_from_partial(struct kmem_cache *s,
  3050. struct kmem_cache_node *n, struct slab *slab, int orig_size)
  3051. {
  3052. void *object;
  3053. lockdep_assert_held(&n->list_lock);
  3054. #ifdef CONFIG_SLUB_DEBUG
  3055. if (s->flags & SLAB_CONSISTENCY_CHECKS) {
  3056. if (!validate_slab_ptr(slab)) {
  3057. slab_err(s, slab, "Not a valid slab page");
  3058. return NULL;
  3059. }
  3060. }
  3061. #endif
  3062. object = slab->freelist;
  3063. slab->freelist = get_freepointer(s, object);
  3064. slab->inuse++;
  3065. if (!alloc_debug_processing(s, slab, object, orig_size)) {
  3066. remove_partial(n, slab);
  3067. return NULL;
  3068. }
  3069. if (slab->inuse == slab->objects) {
  3070. remove_partial(n, slab);
  3071. add_full(s, n, slab);
  3072. }
  3073. return object;
  3074. }
  3075. /*
  3076. * Called only for kmem_cache_debug() caches to allocate from a freshly
  3077. * allocated slab. Allocate a single object instead of whole freelist
  3078. * and put the slab to the partial (or full) list.
  3079. */
  3080. static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
  3081. int orig_size, gfp_t gfpflags)
  3082. {
  3083. bool allow_spin = gfpflags_allow_spinning(gfpflags);
  3084. int nid = slab_nid(slab);
  3085. struct kmem_cache_node *n = get_node(s, nid);
  3086. unsigned long flags;
  3087. void *object;
  3088. if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) {
  3089. /* Unlucky, discard newly allocated slab. */
  3090. free_new_slab_nolock(s, slab);
  3091. return NULL;
  3092. }
  3093. object = slab->freelist;
  3094. slab->freelist = get_freepointer(s, object);
  3095. slab->inuse = 1;
  3096. if (!alloc_debug_processing(s, slab, object, orig_size)) {
  3097. /*
  3098. * It's not really expected that this would fail on a
  3099. * freshly allocated slab, but a concurrent memory
  3100. * corruption in theory could cause that.
  3101. * Leak memory of allocated slab.
  3102. */
  3103. if (!allow_spin)
  3104. spin_unlock_irqrestore(&n->list_lock, flags);
  3105. return NULL;
  3106. }
  3107. if (allow_spin)
  3108. spin_lock_irqsave(&n->list_lock, flags);
  3109. if (slab->inuse == slab->objects)
  3110. add_full(s, n, slab);
  3111. else
  3112. add_partial(n, slab, ADD_TO_HEAD);
  3113. inc_slabs_node(s, nid, slab->objects);
  3114. spin_unlock_irqrestore(&n->list_lock, flags);
  3115. return object;
  3116. }
  3117. static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
  3118. static bool get_partial_node_bulk(struct kmem_cache *s,
  3119. struct kmem_cache_node *n,
  3120. struct partial_bulk_context *pc,
  3121. bool allow_spin)
  3122. {
  3123. struct slab *slab, *slab2;
  3124. unsigned int total_free = 0;
  3125. unsigned long flags;
  3126. /* Racy check to avoid taking the lock unnecessarily. */
  3127. if (!n || data_race(!n->nr_partial))
  3128. return false;
  3129. INIT_LIST_HEAD(&pc->slabs);
  3130. if (allow_spin)
  3131. spin_lock_irqsave(&n->list_lock, flags);
  3132. else if (!spin_trylock_irqsave(&n->list_lock, flags))
  3133. return false;
  3134. list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
  3135. struct freelist_counters flc;
  3136. unsigned int slab_free;
  3137. if (!pfmemalloc_match(slab, pc->flags))
  3138. continue;
  3139. /*
  3140. * determine the number of free objects in the slab racily
  3141. *
  3142. * slab_free is a lower bound due to possible subsequent
  3143. * concurrent freeing, so the caller may get more objects than
  3144. * requested and must handle that
  3145. */
  3146. flc.counters = data_race(READ_ONCE(slab->counters));
  3147. slab_free = flc.objects - flc.inuse;
  3148. /* we have already min and this would get us over the max */
  3149. if (total_free >= pc->min_objects
  3150. && total_free + slab_free > pc->max_objects)
  3151. break;
  3152. remove_partial(n, slab);
  3153. list_add(&slab->slab_list, &pc->slabs);
  3154. total_free += slab_free;
  3155. if (total_free >= pc->max_objects)
  3156. break;
  3157. }
  3158. spin_unlock_irqrestore(&n->list_lock, flags);
  3159. return total_free > 0;
  3160. }
  3161. /*
  3162. * Try to allocate object from a partial slab on a specific node.
  3163. */
  3164. static void *get_from_partial_node(struct kmem_cache *s,
  3165. struct kmem_cache_node *n,
  3166. struct partial_context *pc)
  3167. {
  3168. struct slab *slab, *slab2;
  3169. unsigned long flags;
  3170. void *object = NULL;
  3171. /*
  3172. * Racy check. If we mistakenly see no partial slabs then we
  3173. * just allocate an empty slab. If we mistakenly try to get a
  3174. * partial slab and there is none available then get_from_partial()
  3175. * will return NULL.
  3176. */
  3177. if (!n || !n->nr_partial)
  3178. return NULL;
  3179. if (gfpflags_allow_spinning(pc->flags))
  3180. spin_lock_irqsave(&n->list_lock, flags);
  3181. else if (!spin_trylock_irqsave(&n->list_lock, flags))
  3182. return NULL;
  3183. list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
  3184. struct freelist_counters old, new;
  3185. if (!pfmemalloc_match(slab, pc->flags))
  3186. continue;
  3187. if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
  3188. object = alloc_single_from_partial(s, n, slab,
  3189. pc->orig_size);
  3190. if (object)
  3191. break;
  3192. continue;
  3193. }
  3194. /*
  3195. * get a single object from the slab. This might race against
  3196. * __slab_free(), which however has to take the list_lock if
  3197. * it's about to make the slab fully free.
  3198. */
  3199. do {
  3200. old.freelist = slab->freelist;
  3201. old.counters = slab->counters;
  3202. new.freelist = get_freepointer(s, old.freelist);
  3203. new.counters = old.counters;
  3204. new.inuse++;
  3205. } while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node"));
  3206. object = old.freelist;
  3207. if (!new.freelist)
  3208. remove_partial(n, slab);
  3209. break;
  3210. }
  3211. spin_unlock_irqrestore(&n->list_lock, flags);
  3212. return object;
  3213. }
  3214. /*
  3215. * Get an object from somewhere. Search in increasing NUMA distances.
  3216. */
  /*
   * get_from_any_partial() - try to take an object from a partial slab on any
   * node of the zonelist selected via mempolicy_slab_node().
   *
   * @s:  cache to allocate from
   * @pc: partial allocation context; pc->flags supplies the gfp flags used for
   *      zone selection, the cpuset check and the "may we spin" decision
   *
   * Without CONFIG_NUMA this always returns NULL. With CONFIG_NUMA the search
   * may be skipped based on s->remote_node_defrag_ratio; otherwise only nodes
   * with more than s->min_partial partial slabs are tried.
   *
   * Returns the object obtained from get_from_partial_node(), or NULL.
   */
  3217. static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc)
  3218. {
  3219. #ifdef CONFIG_NUMA
  3220. struct zonelist *zonelist;
  3221. struct zoneref *z;
  3222. struct zone *zone;
  3223. enum zone_type highest_zoneidx = gfp_zone(pc->flags);
  /* only written and only read when allow_spin is true */
  3224. unsigned int cpuset_mems_cookie;
  3225. bool allow_spin = gfpflags_allow_spinning(pc->flags);
  3226. /*
  3227. * The defrag ratio allows a configuration of the tradeoffs between
  3228. * inter node defragmentation and node local allocations. A lower
  3229. * defrag_ratio increases the tendency to do local allocations
  3230. * instead of attempting to obtain partial slabs from other nodes.
  3231. *
  3232. * If the defrag_ratio is set to 0 then kmalloc() always
  3233. * returns node local objects. If the ratio is higher then kmalloc()
  3234. * may return off node objects because partial slabs are obtained
  3235. * from other nodes and filled up.
  3236. *
  3237. * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
  3238. * (which makes defrag_ratio = 1000) then every (well almost)
  3239. * allocation will first attempt to defrag slab caches on other nodes.
  3240. * This means scanning over all nodes to look for partial slabs which
  3241. * may be expensive if we do it every time we are trying to find a slab
  3242. * with available objects.
  3243. */
  3244. if (!s->remote_node_defrag_ratio ||
  3245. get_cycles() % 1024 > s->remote_node_defrag_ratio)
  3246. return NULL;
  3247. do {
  3248. /*
  3249. * read_mems_allowed_begin() accesses current->mems_allowed_seq,
  3250. * a seqcount_spinlock_t that is not NMI-safe. Do not access
  3251. * current->mems_allowed_seq and avoid retry when GFP flags
  3252. * indicate spinning is not allowed.
  3253. */
  3254. if (allow_spin)
  3255. cpuset_mems_cookie = read_mems_allowed_begin();
  3256. zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
  3257. for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
  3258. struct kmem_cache_node *n;
  3259. n = get_node(s, zone_to_nid(zone));
  /* skip nodes that are absent, disallowed by cpuset, or low on partial slabs */
  3260. if (n && cpuset_zone_allowed(zone, pc->flags) &&
  3261. n->nr_partial > s->min_partial) {
  3262. void *object = get_from_partial_node(s, n, pc);
  3263. if (object) {
  3264. /*
  3265. * Don't check read_mems_allowed_retry()
  3266. * here - if mems_allowed was updated in
  3267. * parallel, that was a harmless race
  3268. * between allocation and the cpuset
  3269. * update
  3270. */
  3271. return object;
  3272. }
  3273. }
  3274. }
  3275. } while (allow_spin && read_mems_allowed_retry(cpuset_mems_cookie));
  3276. #endif /* CONFIG_NUMA */
  3277. return NULL;
  3278. }
  3279. /*
  3280. * Get an object from a partial slab
  3281. */
  3282. static void *get_from_partial(struct kmem_cache *s, int node,
  3283. struct partial_context *pc)
  3284. {
  3285. int searchnode = node;
  3286. void *object;
  3287. if (node == NUMA_NO_NODE)
  3288. searchnode = numa_mem_id();
  3289. object = get_from_partial_node(s, get_node(s, searchnode), pc);
  3290. if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
  3291. return object;
  3292. return get_from_any_partial(s, pc);
  3293. }
  3294. static bool has_pcs_used(int cpu, struct kmem_cache *s)
  3295. {
  3296. struct slub_percpu_sheaves *pcs;
  3297. if (!cache_has_sheaves(s))
  3298. return false;
  3299. pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
  3300. return (pcs->spare || pcs->rcu_free || pcs->main->size);
  3301. }
  3302. /*
  3303. * Flush percpu sheaves
  3304. *
  3305. * Called from CPU work handler with migration disabled.
  3306. */
  3307. static void flush_cpu_sheaves(struct work_struct *w)
  3308. {
  3309. struct kmem_cache *s;
  3310. struct slub_flush_work *sfw;
  3311. sfw = container_of(w, struct slub_flush_work, work);
  3312. s = sfw->s;
  3313. if (cache_has_sheaves(s))
  3314. pcs_flush_all(s);
  3315. }
  3316. static void flush_all_cpus_locked(struct kmem_cache *s)
  3317. {
  3318. struct slub_flush_work *sfw;
  3319. unsigned int cpu;
  3320. lockdep_assert_cpus_held();
  3321. mutex_lock(&flush_lock);
  3322. for_each_online_cpu(cpu) {
  3323. sfw = &per_cpu(slub_flush, cpu);
  3324. if (!has_pcs_used(cpu, s)) {
  3325. sfw->skip = true;
  3326. continue;
  3327. }
  3328. INIT_WORK(&sfw->work, flush_cpu_sheaves);
  3329. sfw->skip = false;
  3330. sfw->s = s;
  3331. queue_work_on(cpu, flushwq, &sfw->work);
  3332. }
  3333. for_each_online_cpu(cpu) {
  3334. sfw = &per_cpu(slub_flush, cpu);
  3335. if (sfw->skip)
  3336. continue;
  3337. flush_work(&sfw->work);
  3338. }
  3339. mutex_unlock(&flush_lock);
  3340. }
  /*
   * Flush the percpu sheaves of @s on all online CPUs, taking the cpus read
   * lock around flush_all_cpus_locked().
   */
  static void flush_all(struct kmem_cache *s)
  {
  	cpus_read_lock();

  	flush_all_cpus_locked(s);

  	cpus_read_unlock();
  }
  3347. static void flush_rcu_sheaf(struct work_struct *w)
  3348. {
  3349. struct slub_percpu_sheaves *pcs;
  3350. struct slab_sheaf *rcu_free;
  3351. struct slub_flush_work *sfw;
  3352. struct kmem_cache *s;
  3353. sfw = container_of(w, struct slub_flush_work, work);
  3354. s = sfw->s;
  3355. local_lock(&s->cpu_sheaves->lock);
  3356. pcs = this_cpu_ptr(s->cpu_sheaves);
  3357. rcu_free = pcs->rcu_free;
  3358. pcs->rcu_free = NULL;
  3359. local_unlock(&s->cpu_sheaves->lock);
  3360. if (rcu_free)
  3361. call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
  3362. }
  3363. /* needed for kvfree_rcu_barrier() */
  3364. void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
  3365. {
  3366. struct slub_flush_work *sfw;
  3367. unsigned int cpu;
  3368. mutex_lock(&flush_lock);
  3369. for_each_online_cpu(cpu) {
  3370. sfw = &per_cpu(slub_flush, cpu);
  3371. /*
  3372. * we don't check if rcu_free sheaf exists - racing
  3373. * __kfree_rcu_sheaf() might have just removed it.
  3374. * by executing flush_rcu_sheaf() on the cpu we make
  3375. * sure the __kfree_rcu_sheaf() finished its call_rcu()
  3376. */
  3377. INIT_WORK(&sfw->work, flush_rcu_sheaf);
  3378. sfw->s = s;
  3379. queue_work_on(cpu, flushwq, &sfw->work);
  3380. }
  3381. for_each_online_cpu(cpu) {
  3382. sfw = &per_cpu(slub_flush, cpu);
  3383. flush_work(&sfw->work);
  3384. }
  3385. mutex_unlock(&flush_lock);
  3386. }
  3387. void flush_all_rcu_sheaves(void)
  3388. {
  3389. struct kmem_cache *s;
  3390. cpus_read_lock();
  3391. mutex_lock(&slab_mutex);
  3392. list_for_each_entry(s, &slab_caches, list) {
  3393. if (!cache_has_sheaves(s))
  3394. continue;
  3395. flush_rcu_sheaves_on_cache(s);
  3396. }
  3397. mutex_unlock(&slab_mutex);
  3398. cpus_read_unlock();
  3399. rcu_barrier();
  3400. }
  3401. /*
  3402. * Use the cpu notifier to insure that the cpu slabs are flushed when
  3403. * necessary.
  3404. */
  3405. static int slub_cpu_dead(unsigned int cpu)
  3406. {
  3407. struct kmem_cache *s;
  3408. mutex_lock(&slab_mutex);
  3409. list_for_each_entry(s, &slab_caches, list) {
  3410. if (cache_has_sheaves(s))
  3411. __pcs_flush_all_cpu(s, cpu);
  3412. }
  3413. mutex_unlock(&slab_mutex);
  3414. return 0;
  3415. }
  3416. #ifdef CONFIG_SLUB_DEBUG
  3417. static int count_free(struct slab *slab)
  3418. {
  3419. return slab->objects - slab->inuse;
  3420. }
  3421. static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
  3422. {
  3423. return atomic_long_read(&n->total_objects);
  3424. }
  3425. /* Supports checking bulk free of a constructed freelist */
  /*
   * free_debug_processing() - run the debug checks for a chain of objects that
   * is about to be freed to @slab.
   *
   * @s:        cache the objects belong to
   * @slab:     slab the objects are freed to
   * @head:     first object of the constructed freelist
   * @tail:     last object of the constructed freelist
   * @bulk_cnt: in: number of objects the caller expects in the chain;
   *            out: overwritten with the number counted by the walk when the
   *            two differ
   * @addr:     passed to free_consistency_checks() and set_track_update()
   * @handle:   stack depot handle passed to set_track_update()
   *
   * Each object from @head to @tail is optionally checked
   * (SLAB_CONSISTENCY_CHECKS), optionally gets its free track updated
   * (SLAB_STORE_USER), is traced, and is re-initialized as SLUB_RED_INACTIVE.
   *
   * Returns true when the walk reached @tail without a failed check (this is
   * still true if fewer than *bulk_cnt objects were found). Returns false when
   * a check failed, when the slab has fewer objects in use than *bulk_cnt, or
   * when the chain is longer than *bulk_cnt; slab_fix() is then reported.
   */
  3426. static inline bool free_debug_processing(struct kmem_cache *s,
  3427. struct slab *slab, void *head, void *tail, int *bulk_cnt,
  3428. unsigned long addr, depot_stack_handle_t handle)
  3429. {
  3430. bool checks_ok = false;
  3431. void *object = head;
  3432. int cnt = 0;
  3433. if (s->flags & SLAB_CONSISTENCY_CHECKS) {
  3434. if (!check_slab(s, slab))
  3435. goto out;
  3436. }
  /* cannot free more objects than the slab currently has allocated */
  3437. if (slab->inuse < *bulk_cnt) {
  3438. slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
  3439. slab->inuse, *bulk_cnt);
  3440. goto out;
  3441. }
  3442. next_object:
  /* chain is longer than announced: stop before touching the extra object */
  3443. if (++cnt > *bulk_cnt)
  3444. goto out_cnt;
  3445. if (s->flags & SLAB_CONSISTENCY_CHECKS) {
  3446. if (!free_consistency_checks(s, slab, object, addr))
  3447. goto out;
  3448. }
  3449. if (s->flags & SLAB_STORE_USER)
  3450. set_track_update(s, object, TRACK_FREE, addr, handle);
  3451. trace(s, slab, object, 0);
  3452. /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
  3453. init_object(s, object, SLUB_RED_INACTIVE);
  3454. /* Reached end of constructed freelist yet? */
  3455. if (object != tail) {
  3456. object = get_freepointer(s, object);
  3457. goto next_object;
  3458. }
  3459. checks_ok = true;
  3460. out_cnt:
  /* reached both on success and on an over-long chain; report any count mismatch */
  3461. if (cnt != *bulk_cnt) {
  3462. slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
  3463. *bulk_cnt, cnt);
  3464. *bulk_cnt = cnt;
  3465. }
  3466. out:
  3467. if (!checks_ok)
  3468. slab_fix(s, "Object at 0x%p not freed", object);
  3469. return checks_ok;
  3470. }
  3471. #endif /* CONFIG_SLUB_DEBUG */
  3472. #if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
  3473. static unsigned long count_partial(struct kmem_cache_node *n,
  3474. int (*get_count)(struct slab *))
  3475. {
  3476. unsigned long flags;
  3477. unsigned long x = 0;
  3478. struct slab *slab;
  3479. spin_lock_irqsave(&n->list_lock, flags);
  3480. list_for_each_entry(slab, &n->partial, slab_list)
  3481. x += get_count(slab);
  3482. spin_unlock_irqrestore(&n->list_lock, flags);
  3483. return x;
  3484. }
  3485. #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
  3486. #ifdef CONFIG_SLUB_DEBUG
  3487. #define MAX_PARTIAL_TO_SCAN 10000
  3488. static unsigned long count_partial_free_approx(struct kmem_cache_node *n)
  3489. {
  3490. unsigned long flags;
  3491. unsigned long x = 0;
  3492. struct slab *slab;
  3493. spin_lock_irqsave(&n->list_lock, flags);
  3494. if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) {
  3495. list_for_each_entry(slab, &n->partial, slab_list)
  3496. x += slab->objects - slab->inuse;
  3497. } else {
  3498. /*
  3499. * For a long list, approximate the total count of objects in
  3500. * it to meet the limit on the number of slabs to scan.
  3501. * Scan from both the list's head and tail for better accuracy.
  3502. */
  3503. unsigned long scanned = 0;
  3504. list_for_each_entry(slab, &n->partial, slab_list) {
  3505. x += slab->objects - slab->inuse;
  3506. if (++scanned == MAX_PARTIAL_TO_SCAN / 2)
  3507. break;
  3508. }
  3509. list_for_each_entry_reverse(slab, &n->partial, slab_list) {
  3510. x += slab->objects - slab->inuse;
  3511. if (++scanned == MAX_PARTIAL_TO_SCAN)
  3512. break;
  3513. }
  3514. x = mult_frac(x, n->nr_partial, scanned);
  3515. x = min(x, node_nr_objs(n));
  3516. }
  3517. spin_unlock_irqrestore(&n->list_lock, flags);
  3518. return x;
  3519. }
  3520. static noinline void
  3521. slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
  3522. {
  3523. static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
  3524. DEFAULT_RATELIMIT_BURST);
  3525. int cpu = raw_smp_processor_id();
  3526. int node;
  3527. struct kmem_cache_node *n;
  3528. if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
  3529. return;
  3530. pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n",
  3531. cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags);
  3532. pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
  3533. s->name, s->object_size, s->size, oo_order(s->oo),
  3534. oo_order(s->min));
  3535. if (oo_order(s->min) > get_order(s->object_size))
  3536. pr_warn(" %s debugging increased min order, use slab_debug=O to disable.\n",
  3537. s->name);
  3538. for_each_kmem_cache_node(s, node, n) {
  3539. unsigned long nr_slabs;
  3540. unsigned long nr_objs;
  3541. unsigned long nr_free;
  3542. nr_free = count_partial_free_approx(n);
  3543. nr_slabs = node_nr_slabs(n);
  3544. nr_objs = node_nr_objs(n);
  3545. pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
  3546. node, nr_slabs, nr_objs, nr_free);
  3547. }
  3548. }
  3549. #else /* CONFIG_SLUB_DEBUG */
  3550. static inline void
  3551. slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
  3552. #endif
  3553. static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
  3554. {
  3555. if (unlikely(slab_test_pfmemalloc(slab)))
  3556. return gfp_pfmemalloc_allowed(gfpflags);
  3557. return true;
  3558. }
  3559. /*
  3560. * Get the slab's freelist and do not freeze it.
  3561. *
  3562. * Assumes the slab is isolated from node partial list and not frozen.
  3563. *
  3564. * Assumes this is performed only for caches without debugging so we
  3565. * don't need to worry about adding the slab to the full list.
  3566. */
  /*
   * @s:    cache owning @slab
   * @slab: slab whose freelist is taken
   *
   * Returns the freelist that was installed in @slab at the moment the update
   * succeeded. Afterwards slab->freelist is NULL and slab->inuse equals
   * slab->objects.
   */
  3567. static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
  3568. {
  3569. struct freelist_counters old, new;
  /* retry until freelist and counters are replaced in one successful update */
  3570. do {
  3571. old.freelist = slab->freelist;
  3572. old.counters = slab->counters;
  3573. new.freelist = NULL;
  /* copy the counters first, then adjust individual fields of the copy */
  3574. new.counters = old.counters;
  3575. VM_WARN_ON_ONCE(new.frozen);
  /* every object is accounted as in use once the freelist is handed out */
  3576. new.inuse = old.objects;
  3577. } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));
  3578. return old.freelist;
  3579. }
  3580. /*
  3581. * If the object has been wiped upon free, make sure it's fully initialized by
  3582. * zeroing out freelist pointer.
  3583. *
  3584. * Note that we also wipe custom freelist pointers.
  3585. */
  3586. static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
  3587. void *obj)
  3588. {
  3589. if (unlikely(slab_want_init_on_free(s)) && obj &&
  3590. !freeptr_outside_object(s))
  3591. memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
  3592. 0, sizeof(void *));
  3593. }
  /*
   * alloc_from_new_slab() - take up to @count objects from a newly allocated
   * slab.
   *
   * @s:          cache owning @slab
   * @slab:       the new slab; slab->inuse is expected to be 0 on entry
   * @p:          array receiving the object pointers
   * @count:      maximum number of objects to take
   * @allow_spin: whether n->list_lock may be taken with a spinning lock; if
   *              not, only a trylock is attempted
   *
   * If objects will remain after taking @count, the slab is added to the head
   * of its node's partial list. The slab is always accounted with
   * inc_slabs_node() unless it was discarded.
   *
   * Returns the number of objects stored in @p. Returns 0 when spinning is not
   * allowed and the trylock failed; the slab is then freed with
   * free_new_slab_nolock().
   */
  3594. static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
  3595. void **p, unsigned int count, bool allow_spin)
  3596. {
  3597. unsigned int allocated = 0;
  /* n and flags are only set, and only used, when needs_add_partial is true */
  3598. struct kmem_cache_node *n;
  3599. bool needs_add_partial;
  3600. unsigned long flags;
  3601. void *object;
  3602. /*
  3603. * Are we going to put the slab on the partial list?
  3604. * Note slab->inuse is 0 on a new slab.
  3605. */
  3606. needs_add_partial = (slab->objects > count);
  /* without spinning, take the lock up front so failure is detected before any object is handed out */
  3607. if (!allow_spin && needs_add_partial) {
  3608. n = get_node(s, slab_nid(slab));
  3609. if (!spin_trylock_irqsave(&n->list_lock, flags)) {
  3610. /* Unlucky, discard newly allocated slab */
  3611. free_new_slab_nolock(s, slab);
  3612. return 0;
  3613. }
  3614. }
  3615. object = slab->freelist;
  3616. while (object && allocated < count) {
  3617. p[allocated] = object;
  /* read the next pointer before it is possibly wiped from the object */
  3618. object = get_freepointer(s, object);
  3619. maybe_wipe_obj_freeptr(s, p[allocated]);
  3620. slab->inuse++;
  3621. allocated++;
  3622. }
  3623. slab->freelist = object;
  3624. if (needs_add_partial) {
  /* in the !allow_spin case the lock is already held from the trylock above */
  3625. if (allow_spin) {
  3626. n = get_node(s, slab_nid(slab));
  3627. spin_lock_irqsave(&n->list_lock, flags);
  3628. }
  3629. add_partial(n, slab, ADD_TO_HEAD);
  3630. spin_unlock_irqrestore(&n->list_lock, flags);
  3631. }
  3632. inc_slabs_node(s, slab_nid(slab), slab->objects);
  3633. return allocated;
  3634. }
  3635. /*
  3636. * Slow path. We failed to allocate via percpu sheaves or they are not available
  3637. * due to bootstrap or debugging enabled or SLUB_TINY.
  3638. *
  3639. * We try to allocate from partial slab lists and fall back to allocating a new
  3640. * slab.
  3641. */
  /*
   * @s:         cache to allocate from
   * @gfpflags:  caller's gfp flags
   * @node:      preferred node, or NUMA_NO_NODE
   * @addr:      caller address, recorded by set_track() for SLAB_STORE_USER
   * @orig_size: originally requested size, passed on via the partial context
   *             and to alloc_single_from_new_slab()
   *
   * Returns an object, or NULL when no slab could be allocated, or when a new
   * slab yielded no object and spinning is not allowed.
   */
  3642. static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
  3643. unsigned long addr, unsigned int orig_size)
  3644. {
  3645. bool allow_spin = gfpflags_allow_spinning(gfpflags);
  3646. void *object;
  3647. struct slab *slab;
  3648. struct partial_context pc;
  /* true until the opportunistic node-local attempt has failed once */
  3649. bool try_thisnode = true;
  3650. stat(s, ALLOC_SLOWPATH);
  3651. new_objects:
  3652. pc.flags = gfpflags;
  3653. /*
  3654. * When a preferred node is indicated but no __GFP_THISNODE
  3655. *
  3656. * 1) try to get a partial slab from target node only by having
  3657. * __GFP_THISNODE in pc.flags for get_from_partial()
  3658. * 2) if 1) failed, try to allocate a new slab from target node with
  3659. * GFP_NOWAIT | __GFP_THISNODE opportunistically
  3660. * 3) if 2) failed, retry with original gfpflags which will allow
  3661. * get_from_partial() try partial lists of other nodes before
  3662. * potentially allocating new page from other nodes
  3663. */
  3664. if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
  3665. && try_thisnode)) {
  3666. if (unlikely(!allow_spin))
  3667. /* Do not upgrade gfp to NOWAIT from more restrictive mode */
  3668. pc.flags = gfpflags | __GFP_THISNODE;
  3669. else
  3670. pc.flags = GFP_NOWAIT | __GFP_THISNODE;
  3671. }
  3672. pc.orig_size = orig_size;
  3673. object = get_from_partial(s, node, &pc);
  3674. if (object)
  3675. goto success;
  /* object is NULL from here on; the new slab is requested with the same pc.flags */
  3676. slab = new_slab(s, pc.flags, node);
  3677. if (unlikely(!slab)) {
  /* the node-local attempt failed: retry once with the caller's original flags */
  3678. if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
  3679. && try_thisnode) {
  3680. try_thisnode = false;
  3681. goto new_objects;
  3682. }
  3683. slab_out_of_memory(s, gfpflags, node);
  3684. return NULL;
  3685. }
  3686. stat(s, ALLOC_SLAB);
  3687. if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
  3688. object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
  3689. if (likely(object))
  3690. goto success;
  3691. } else {
  /* object stays NULL if alloc_from_new_slab() hands out nothing */
  3692. alloc_from_new_slab(s, slab, &object, 1, allow_spin);
  3693. /* we don't need to check SLAB_STORE_USER here */
  3694. if (likely(object))
  3695. return object;
  3696. }
  3697. if (allow_spin)
  3698. goto new_objects;
  3699. /* This could cause an endless loop. Fail instead. */
  3700. return NULL;
  3701. success:
  3702. if (kmem_cache_debug_flags(s, SLAB_STORE_USER))
  3703. set_track(s, object, TRACK_ALLOC, addr, gfpflags);
  3704. return object;
  3705. }
  3706. static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
  3707. gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
  3708. {
  3709. void *object;
  3710. #ifdef CONFIG_NUMA
  3711. if (static_branch_unlikely(&strict_numa) &&
  3712. node == NUMA_NO_NODE) {
  3713. struct mempolicy *mpol = current->mempolicy;
  3714. if (mpol) {
  3715. /*
  3716. * Special BIND rule support. If the local node
  3717. * is in permitted set then do not redirect
  3718. * to a particular node.
  3719. * Otherwise we apply the memory policy to get
  3720. * the node we need to allocate on.
  3721. */
  3722. if (mpol->mode != MPOL_BIND ||
  3723. !node_isset(numa_mem_id(), mpol->nodes))
  3724. node = mempolicy_slab_node();
  3725. }
  3726. }
  3727. #endif
  3728. object = ___slab_alloc(s, gfpflags, node, addr, orig_size);
  3729. return object;
  3730. }
  3731. static __fastpath_inline
  3732. struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
  3733. {
  3734. flags &= gfp_allowed_mask;
  3735. might_alloc(flags);
  3736. if (unlikely(should_failslab(s, flags)))
  3737. return NULL;
  3738. return s;
  3739. }
  /*
   * slab_post_alloc_hook() - run the post-allocation hooks on freshly allocated
   * objects.
   *
   * @s:         cache the objects came from
   * @lru:       passed through to memcg_slab_post_alloc_hook()
   * @flags:     gfp flags of the allocation
   * @size:      number of entries in @p
   * @p:         array of object pointers; entries may be replaced by the value
   *             returned from kasan_slab_alloc()
   * @init:      whether the objects are to be zeroed
   * @orig_size: requested size; used as the zeroing length instead of
   *             s->object_size for SLAB_KMALLOC caches when
   *             kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE)
   *             holds
   *
   * Returns the result of memcg_slab_post_alloc_hook().
   */
  3740. static __fastpath_inline
  3741. bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
  3742. gfp_t flags, size_t size, void **p, bool init,
  3743. unsigned int orig_size)
  3744. {
  3745. unsigned int zero_size = s->object_size;
  3746. bool kasan_init = init;
  3747. size_t i;
  3748. gfp_t init_flags = flags & gfp_allowed_mask;
  3749. /*
  3750. * For kmalloc object, the allocated memory size(object_size) is likely
  3751. * larger than the requested size(orig_size). If redzone check is
  3752. * enabled for the extra space, don't zero it, as it will be redzoned
  3753. * soon. The redzone operation for this extra space could be seen as a
  3754. * replacement of current poisoning under certain debug option, and
  3755. * won't break other sanity checks.
  3756. */
  3757. if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) &&
  3758. (s->flags & SLAB_KMALLOC))
  3759. zero_size = orig_size;
  3760. /*
  3761. * When slab_debug is enabled, avoid memory initialization integrated
  3762. * into KASAN and instead zero out the memory via the memset below with
  3763. * the proper size. Otherwise, KASAN might overwrite SLUB redzones and
  3764. * cause false-positive reports. This does not lead to a performance
  3765. * penalty on production builds, as slab_debug is not intended to be
  3766. * enabled there.
  3767. */
  3768. if (__slub_debug_enabled())
  3769. kasan_init = false;
  3770. /*
  3771. * As memory initialization might be integrated into KASAN,
  3772. * kasan_slab_alloc and initialization memset must be
  3773. * kept together to avoid discrepancies in behavior.
  3774. *
  3775. * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
  3776. */
  3777. for (i = 0; i < size; i++) {
  3778. p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init);
  /* zero here unless KASAN was asked to initialize and does integrated init */
  3779. if (p[i] && init && (!kasan_init ||
  3780. !kasan_has_integrated_init()))
  3781. memset(p[i], 0, zero_size);
  /* the kmemleak hook is skipped when the gfp flags do not allow spinning */
  3782. if (gfpflags_allow_spinning(flags))
  3783. kmemleak_alloc_recursive(p[i], s->object_size, 1,
  3784. s->flags, init_flags);
  3785. kmsan_slab_alloc(s, p[i], init_flags);
  3786. alloc_tagging_slab_alloc_hook(s, p[i], flags);
  3787. }
  3788. return memcg_slab_post_alloc_hook(s, lru, flags, size, p);
  3789. }
  3790. /*
  3791. * Replace the empty main sheaf with a (at least partially) full sheaf.
  3792. *
  3793. * Must be called with the cpu_sheaves local lock locked. If successful, returns
  3794. * the pcs pointer and the local lock locked (possibly on a different cpu than
  3795. * initially called). If not successful, returns NULL and the local lock
  3796. * unlocked.
  3797. */
  /*
   * @s:   cache whose percpu sheaves are being refilled
   * @pcs: this CPU's percpu sheaves, whose main sheaf is empty
   * @gfp: gfp flags of the allocation; decide whether spinning is allowed and
   *       are used when refilling or allocating a sheaf
   *
   * Sources tried in order: a non-empty spare sheaf, a full sheaf from the
   * barn, and (only when spinning is allowed) refilling an empty sheaf or
   * allocating a full one with the local lock dropped.
   */
  3798. static struct slub_percpu_sheaves *
  3799. __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp)
  3800. {
  3801. struct slab_sheaf *empty = NULL;
  3802. struct slab_sheaf *full;
  3803. struct node_barn *barn;
  3804. bool allow_spin;
  3805. lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
  3806. /* Bootstrap or debug cache, back off */
  3807. if (unlikely(!cache_has_sheaves(s))) {
  3808. local_unlock(&s->cpu_sheaves->lock);
  3809. return NULL;
  3810. }
  /* cheapest option: the spare sheaf has objects, just swap it in */
  3811. if (pcs->spare && pcs->spare->size > 0) {
  3812. swap(pcs->main, pcs->spare);
  3813. return pcs;
  3814. }
  3815. barn = get_barn(s);
  3816. if (!barn) {
  3817. local_unlock(&s->cpu_sheaves->lock);
  3818. return NULL;
  3819. }
  3820. allow_spin = gfpflags_allow_spinning(gfp);
  3821. full = barn_replace_empty_sheaf(barn, pcs->main, allow_spin);
  3822. if (full) {
  3823. stat(s, BARN_GET);
  3824. pcs->main = full;
  3825. return pcs;
  3826. }
  3827. stat(s, BARN_GET_FAIL);
  /* pick an empty sheaf to refill while the local lock is still held */
  3828. if (allow_spin) {
  3829. if (pcs->spare) {
  3830. empty = pcs->spare;
  3831. pcs->spare = NULL;
  3832. } else {
  3833. empty = barn_get_empty_sheaf(barn, true);
  3834. }
  3835. }
  3836. local_unlock(&s->cpu_sheaves->lock);
  /* pcs must not be used while the lock is dropped */
  3837. pcs = NULL;
  3838. if (!allow_spin)
  3839. return NULL;
  /* full is NULL here: the barn had no full sheaf to give */
  3840. if (empty) {
  3841. if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
  3842. full = empty;
  3843. } else {
  3844. /*
  3845. * we must be very low on memory so don't bother
  3846. * with the barn
  3847. */
  3848. sheaf_flush_unused(s, empty);
  3849. free_empty_sheaf(s, empty);
  3850. }
  3851. } else {
  3852. full = alloc_full_sheaf(s, gfp);
  3853. }
  3854. if (!full)
  3855. return NULL;
  /* if the lock cannot be retaken, park the sheaf in the barn and return NULL (pcs is NULL) */
  3856. if (!local_trylock(&s->cpu_sheaves->lock))
  3857. goto barn_put;
  3858. pcs = this_cpu_ptr(s->cpu_sheaves);
  3859. /*
  3860. * If we are returning empty sheaf, we either got it from the
  3861. * barn or had to allocate one. If we are returning a full
  3862. * sheaf, it's due to racing or being migrated to a different
  3863. * cpu. Breaching the barn's sheaf limits should be thus rare
  3864. * enough so just ignore them to simplify the recovery.
  3865. */
  3866. if (pcs->main->size == 0) {
  3867. if (!pcs->spare)
  3868. pcs->spare = pcs->main;
  3869. else
  3870. barn_put_empty_sheaf(barn, pcs->main);
  3871. pcs->main = full;
  3872. return pcs;
  3873. }
  3874. if (!pcs->spare) {
  3875. pcs->spare = full;
  3876. return pcs;
  3877. }
  3878. if (pcs->spare->size == 0) {
  3879. barn_put_empty_sheaf(barn, pcs->spare);
  3880. pcs->spare = full;
  3881. return pcs;
  3882. }
  /* also reached by fall-through, with the lock held, when main and spare are both non-empty */
  3883. barn_put:
  3884. barn_put_full_sheaf(barn, full);
  3885. stat(s, BARN_PUT);
  3886. return pcs;
  3887. }
  /*
   * alloc_from_pcs() - try to allocate one object from this CPU's percpu
   * sheaves.
   *
   * @s:    cache to allocate from
   * @gfp:  gfp flags, passed to __pcs_replace_empty_main() when the main sheaf
   *        is empty
   * @node: requested node, or NUMA_NO_NODE
   *
   * Returns the object taken from the top of the main sheaf, or NULL when a
   * different node than the local one was requested, the local lock could not
   * be taken, the main sheaf could not be refilled, or the candidate object
   * does not belong to the requested node. The local lock is not held on
   * return.
   */
  3888. static __fastpath_inline
  3889. void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
  3890. {
  3891. struct slub_percpu_sheaves *pcs;
  3892. bool node_requested;
  3893. void *object;
  3894. #ifdef CONFIG_NUMA
  3895. if (static_branch_unlikely(&strict_numa) &&
  3896. node == NUMA_NO_NODE) {
  3897. struct mempolicy *mpol = current->mempolicy;
  3898. if (mpol) {
  3899. /*
  3900. * Special BIND rule support. If the local node
  3901. * is in permitted set then do not redirect
  3902. * to a particular node.
  3903. * Otherwise we apply the memory policy to get
  3904. * the node we need to allocate on.
  3905. */
  3906. if (mpol->mode != MPOL_BIND ||
  3907. !node_isset(numa_mem_id(), mpol->nodes))
  3908. node = mempolicy_slab_node();
  3909. }
  3910. }
  3911. #endif
  /* always false without CONFIG_NUMA */
  3912. node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE;
  3913. /*
  3914. * We assume the percpu sheaves contain only local objects although it's
  3915. * not completely guaranteed, so we verify later.
  3916. */
  3917. if (unlikely(node_requested && node != numa_mem_id())) {
  3918. stat(s, ALLOC_NODE_MISMATCH);
  3919. return NULL;
  3920. }
  3921. if (!local_trylock(&s->cpu_sheaves->lock))
  3922. return NULL;
  3923. pcs = this_cpu_ptr(s->cpu_sheaves);
  3924. if (unlikely(pcs->main->size == 0)) {
  /* on failure the helper has already dropped the local lock */
  3925. pcs = __pcs_replace_empty_main(s, pcs, gfp);
  3926. if (unlikely(!pcs))
  3927. return NULL;
  3928. }
  /* peek at the top object; it is only removed after the node check below */
  3929. object = pcs->main->objects[pcs->main->size - 1];
  3930. if (unlikely(node_requested)) {
  3931. /*
  3932. * Verify that the object was from the node we want. This could
  3933. * be false because of cpu migration during an unlocked part of
  3934. * the current allocation or previous freeing process.
  3935. */
  3936. if (page_to_nid(virt_to_page(object)) != node) {
  3937. local_unlock(&s->cpu_sheaves->lock);
  3938. stat(s, ALLOC_NODE_MISMATCH);
  3939. return NULL;
  3940. }
  3941. }
  3942. pcs->main->size--;
  3943. local_unlock(&s->cpu_sheaves->lock);
  3944. stat(s, ALLOC_FASTPATH);
  3945. return object;
  3946. }
  /*
   * alloc_from_pcs_bulk() - take up to @size objects from this CPU's percpu
   * sheaves.
   *
   * @s:    cache to allocate from
   * @gfp:  gfp flags; only used to decide whether the barn may be accessed
   *        with spinning
   * @size: number of objects wanted
   * @p:    array receiving the object pointers
   *
   * Objects are copied in batches from the main sheaf. An empty main sheaf is
   * replaced by a non-empty spare or by a full sheaf from the barn; no sheaf
   * is refilled from slabs here.
   *
   * Returns the number of objects stored in @p, which may be less than @size
   * (including 0). The local lock is not held on return.
   */
  3947. static __fastpath_inline
  3948. unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size,
  3949. void **p)
  3950. {
  3951. struct slub_percpu_sheaves *pcs;
  3952. struct slab_sheaf *main;
  3953. unsigned int allocated = 0;
  3954. unsigned int batch;
  3955. next_batch:
  /* the lock is retaken for every batch; give up with what we have if that fails */
  3956. if (!local_trylock(&s->cpu_sheaves->lock))
  3957. return allocated;
  3958. pcs = this_cpu_ptr(s->cpu_sheaves);
  3959. if (unlikely(pcs->main->size == 0)) {
  3960. struct slab_sheaf *full;
  3961. struct node_barn *barn;
  3962. if (unlikely(!cache_has_sheaves(s))) {
  3963. local_unlock(&s->cpu_sheaves->lock);
  3964. return allocated;
  3965. }
  3966. if (pcs->spare && pcs->spare->size > 0) {
  3967. swap(pcs->main, pcs->spare);
  3968. goto do_alloc;
  3969. }
  3970. barn = get_barn(s);
  3971. if (!barn) {
  3972. local_unlock(&s->cpu_sheaves->lock);
  3973. return allocated;
  3974. }
  3975. full = barn_replace_empty_sheaf(barn, pcs->main,
  3976. gfpflags_allow_spinning(gfp));
  3977. if (full) {
  3978. stat(s, BARN_GET);
  3979. pcs->main = full;
  3980. goto do_alloc;
  3981. }
  3982. stat(s, BARN_GET_FAIL);
  3983. local_unlock(&s->cpu_sheaves->lock);
  3984. /*
  3985. * Once full sheaves in barn are depleted, let the bulk
  3986. * allocation continue from slab pages, otherwise we would just
  3987. * be copying arrays of pointers twice.
  3988. */
  3989. return allocated;
  3990. }
  3991. do_alloc:
  3992. main = pcs->main;
  3993. batch = min(size, main->size);
  /* shrink first so that objects + size points at the batch being handed out */
  3994. main->size -= batch;
  3995. memcpy(p, main->objects + main->size, batch * sizeof(void *));
  3996. local_unlock(&s->cpu_sheaves->lock);
  3997. stat_add(s, ALLOC_FASTPATH, batch);
  3998. allocated += batch;
  /* the main sheaf ran dry before the request was satisfied: go around again */
  3999. if (batch < size) {
  4000. p += batch;
  4001. size -= batch;
  4002. goto next_batch;
  4003. }
  4004. return allocated;
  4005. }
  4006. /*
  4007. * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
  4008. * have the fastpath folded into their functions. So no function call
  4009. * overhead for requests that can be satisfied on the fastpath.
  4010. *
  4011. * The fastpath works by first checking if the lockless freelist can be used.
  4012. * If not then __slab_alloc is called for slow processing.
  4013. *
  4014. * Otherwise we can simply pick the next object from the lockless free list.
  4015. */
  4016. static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
  4017. gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
  4018. {
  4019. void *object;
  4020. bool init = false;
  4021. s = slab_pre_alloc_hook(s, gfpflags);
  4022. if (unlikely(!s))
  4023. return NULL;
  4024. object = kfence_alloc(s, orig_size, gfpflags);
  4025. if (unlikely(object))
  4026. goto out;
  4027. object = alloc_from_pcs(s, gfpflags, node);
  4028. if (!object)
  4029. object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
  4030. maybe_wipe_obj_freeptr(s, object);
  4031. init = slab_want_init_on_alloc(gfpflags, s);
  4032. out:
  4033. /*
  4034. * When init equals 'true', like for kzalloc() family, only
  4035. * @orig_size bytes might be zeroed instead of s->object_size
  4036. * In case this fails due to memcg_slab_post_alloc_hook(),
  4037. * object is set to NULL
  4038. */
  4039. slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size);
  4040. return object;
  4041. }
  4042. void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags)
  4043. {
  4044. void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_,
  4045. s->object_size);
  4046. trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
  4047. return ret;
  4048. }
  4049. EXPORT_SYMBOL(kmem_cache_alloc_noprof);
  4050. void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
  4051. gfp_t gfpflags)
  4052. {
  4053. void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_,
  4054. s->object_size);
  4055. trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
  4056. return ret;
  4057. }
  4058. EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
  4059. bool kmem_cache_charge(void *objp, gfp_t gfpflags)
  4060. {
  4061. if (!memcg_kmem_online())
  4062. return true;
  4063. return memcg_slab_post_charge(objp, gfpflags);
  4064. }
  4065. EXPORT_SYMBOL(kmem_cache_charge);
  4066. /**
  4067. * kmem_cache_alloc_node - Allocate an object on the specified node
  4068. * @s: The cache to allocate from.
  4069. * @gfpflags: See kmalloc().
  4070. * @node: node number of the target node.
  4071. *
  4072. * Identical to kmem_cache_alloc but it will allocate memory on the given
  4073. * node, which can improve the performance for cpu bound structures.
  4074. *
  4075. * Fallback to other node is possible if __GFP_THISNODE is not set.
  4076. *
  4077. * Return: pointer to the new object or %NULL in case of error
  4078. */
  4079. void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node)
  4080. {
  4081. void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
  4082. trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);
  4083. return ret;
  4084. }
  4085. EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
  4086. static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
  4087. struct slab_sheaf *sheaf, gfp_t gfp)
  4088. {
  4089. gfp_t gfp_nomemalloc;
  4090. int ret;
  4091. gfp_nomemalloc = gfp | __GFP_NOMEMALLOC;
  4092. if (gfp_pfmemalloc_allowed(gfp))
  4093. gfp_nomemalloc |= __GFP_NOWARN;
  4094. ret = refill_sheaf(s, sheaf, gfp_nomemalloc);
  4095. if (likely(!ret || !gfp_pfmemalloc_allowed(gfp)))
  4096. return ret;
  4097. /*
  4098. * if we are allowed to, refill sheaf with pfmemalloc but then remember
  4099. * it for when it's returned
  4100. */
  4101. ret = refill_sheaf(s, sheaf, gfp);
  4102. sheaf->pfmemalloc = true;
  4103. return ret;
  4104. }
  /* Forward declaration: used by kmem_cache_prefill_sheaf() below for oversize sheaves. */
  4105. static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
  4106. size_t size, void **p);
  4107. /*
  4108. * returns a sheaf that has at least the requested size
  4109. * when prefilling is needed, do so with given gfp flags
  4110. *
  4111. * return NULL if sheaf allocation or prefilling failed
  4112. */
  /*
   * @s:    cache to prefill from
   * @gfp:  gfp flags used for allocating the sheaf and for filling it
   * @size: minimum number of objects the returned sheaf must hold; 0 yields
   *        NULL
   *
   * A request larger than s->sheaf_capacity gets a dedicated sheaf with
   * capacity @size, filled by bulk allocation. Otherwise the percpu spare
   * sheaf is tried first, then a sheaf from the barn, then a newly allocated
   * empty sheaf; the sheaf is refilled if it holds fewer than @size objects.
   */
  4113. struct slab_sheaf *
  4114. kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
  4115. {
  4116. struct slub_percpu_sheaves *pcs;
  4117. struct slab_sheaf *sheaf = NULL;
  4118. struct node_barn *barn;
  4119. if (unlikely(!size))
  4120. return NULL;
  4121. if (unlikely(size > s->sheaf_capacity)) {
  4122. sheaf = kzalloc_flex(*sheaf, objects, size, gfp);
  4123. if (!sheaf)
  4124. return NULL;
  4125. stat(s, SHEAF_PREFILL_OVERSIZE);
  4126. sheaf->cache = s;
  4127. sheaf->capacity = size;
  4128. /*
  4129. * we do not need to care about pfmemalloc here because oversize
  4130. * sheaves are always flushed and freed when returned
  4131. */
  4132. if (!__kmem_cache_alloc_bulk(s, gfp, size,
  4133. &sheaf->objects[0])) {
  4134. kfree(sheaf);
  4135. return NULL;
  4136. }
  4137. sheaf->size = size;
  4138. return sheaf;
  4139. }
  4140. local_lock(&s->cpu_sheaves->lock);
  4141. pcs = this_cpu_ptr(s->cpu_sheaves);
  /* fast case: take over the percpu spare sheaf, whatever its fill level */
  4142. if (pcs->spare) {
  4143. sheaf = pcs->spare;
  4144. pcs->spare = NULL;
  4145. stat(s, SHEAF_PREFILL_FAST);
  4146. } else {
  4147. barn = get_barn(s);
  4148. stat(s, SHEAF_PREFILL_SLOW);
  4149. if (barn)
  4150. sheaf = barn_get_full_or_empty_sheaf(barn);
  /* only a sheaf that already contains objects counts as a barn hit */
  4151. if (sheaf && sheaf->size)
  4152. stat(s, BARN_GET);
  4153. else
  4154. stat(s, BARN_GET_FAIL);
  4155. }
  4156. local_unlock(&s->cpu_sheaves->lock);
  4157. if (!sheaf)
  4158. sheaf = alloc_empty_sheaf(s, gfp);
  4159. if (sheaf) {
  4160. sheaf->capacity = s->sheaf_capacity;
  4161. sheaf->pfmemalloc = false;
  /* on refill failure, release whatever the sheaf holds and report failure */
  4162. if (sheaf->size < size &&
  4163. __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) {
  4164. sheaf_flush_unused(s, sheaf);
  4165. free_empty_sheaf(s, sheaf);
  4166. sheaf = NULL;
  4167. }
  4168. }
  4169. return sheaf;
  4170. }
  4171. /*
  4172. * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
  4173. *
  4174. * If the sheaf cannot simply become the percpu spare sheaf, but there's space
  4175. * for a full sheaf in the barn, we try to refill the sheaf back to the cache's
  4176. * sheaf_capacity to avoid handling partially full sheaves.
  4177. *
  4178. * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the
  4179. * sheaf is instead flushed and freed.
  4180. */
  4181. void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
  4182. struct slab_sheaf *sheaf)
  4183. {
  4184. struct slub_percpu_sheaves *pcs;
  4185. struct node_barn *barn;
  4186. if (unlikely((sheaf->capacity != s->sheaf_capacity)
  4187. || sheaf->pfmemalloc)) {
  4188. sheaf_flush_unused(s, sheaf);
  4189. kfree(sheaf);
  4190. return;
  4191. }
  4192. local_lock(&s->cpu_sheaves->lock);
  4193. pcs = this_cpu_ptr(s->cpu_sheaves);
  4194. barn = get_barn(s);
  4195. if (!pcs->spare) {
  4196. pcs->spare = sheaf;
  4197. sheaf = NULL;
  4198. stat(s, SHEAF_RETURN_FAST);
  4199. }
  4200. local_unlock(&s->cpu_sheaves->lock);
  4201. if (!sheaf)
  4202. return;
  4203. stat(s, SHEAF_RETURN_SLOW);
  4204. /*
  4205. * If the barn has too many full sheaves or we fail to refill the sheaf,
  4206. * simply flush and free it.
  4207. */
  4208. if (!barn || data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
  4209. refill_sheaf(s, sheaf, gfp)) {
  4210. sheaf_flush_unused(s, sheaf);
  4211. free_empty_sheaf(s, sheaf);
  4212. return;
  4213. }
  4214. barn_put_full_sheaf(barn, sheaf);
  4215. stat(s, BARN_PUT);
  4216. }
  4217. /*
  4218. * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least
  4219. * the given size
  4220. *
  4221. * the sheaf might be replaced by a new one when requesting more than
  4222. * s->sheaf_capacity objects if such replacement is necessary, but the refill
  4223. * fails (returning -ENOMEM), the existing sheaf is left intact
  4224. *
  4225. * In practice we always refill to full sheaf's capacity.
  4226. */
  4227. int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
  4228. struct slab_sheaf **sheafp, unsigned int size)
  4229. {
  4230. struct slab_sheaf *sheaf;
  4231. /*
  4232. * TODO: do we want to support *sheaf == NULL to be equivalent of
  4233. * kmem_cache_prefill_sheaf() ?
  4234. */
  4235. if (!sheafp || !(*sheafp))
  4236. return -EINVAL;
  4237. sheaf = *sheafp;
  4238. if (sheaf->size >= size)
  4239. return 0;
  4240. if (likely(sheaf->capacity >= size)) {
  4241. if (likely(sheaf->capacity == s->sheaf_capacity))
  4242. return __prefill_sheaf_pfmemalloc(s, sheaf, gfp);
  4243. if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
  4244. &sheaf->objects[sheaf->size])) {
  4245. return -ENOMEM;
  4246. }
  4247. sheaf->size = sheaf->capacity;
  4248. return 0;
  4249. }
  4250. /*
  4251. * We had a regular sized sheaf and need an oversize one, or we had an
  4252. * oversize one already but need a larger one now.
  4253. * This should be a very rare path so let's not complicate it.
  4254. */
  4255. sheaf = kmem_cache_prefill_sheaf(s, gfp, size);
  4256. if (!sheaf)
  4257. return -ENOMEM;
  4258. kmem_cache_return_sheaf(s, gfp, *sheafp);
  4259. *sheafp = sheaf;
  4260. return 0;
  4261. }
  4262. /*
  4263. * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf()
  4264. *
  4265. * Guaranteed not to fail as many allocations as was the requested size.
  4266. * After the sheaf is emptied, it fails - no fallback to the slab cache itself.
  4267. *
  4268. * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT
  4269. * memcg charging is forced over limit if necessary, to avoid failure.
  4270. *
  4271. * It is possible that the allocation comes from kfence and then the sheaf
  4272. * size is not decreased.
  4273. */
  4274. void *
  4275. kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
  4276. struct slab_sheaf *sheaf)
  4277. {
  4278. void *ret = NULL;
  4279. bool init;
  4280. if (sheaf->size == 0)
  4281. goto out;
  4282. ret = kfence_alloc(s, s->object_size, gfp);
  4283. if (likely(!ret))
  4284. ret = sheaf->objects[--sheaf->size];
  4285. init = slab_want_init_on_alloc(gfp, s);
  4286. /* add __GFP_NOFAIL to force successful memcg charging */
  4287. slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size);
  4288. out:
  4289. trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE);
  4290. return ret;
  4291. }
  4292. unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
  4293. {
  4294. return sheaf->size;
  4295. }
  4296. /*
  4297. * To avoid unnecessary overhead, we pass through large allocation requests
  4298. * directly to the page allocator. We use __GFP_COMP, because we will need to
  4299. * know the allocation order to free the pages properly in kfree.
  4300. */
  4301. static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
  4302. {
  4303. struct page *page;
  4304. void *ptr = NULL;
  4305. unsigned int order = get_order(size);
  4306. if (unlikely(flags & GFP_SLAB_BUG_MASK))
  4307. flags = kmalloc_fix_flags(flags);
  4308. flags |= __GFP_COMP;
  4309. if (node == NUMA_NO_NODE)
  4310. page = alloc_frozen_pages_noprof(flags, order);
  4311. else
  4312. page = __alloc_frozen_pages_noprof(flags, order, node, NULL);
  4313. if (page) {
  4314. ptr = page_address(page);
  4315. mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
  4316. PAGE_SIZE << order);
  4317. __SetPageLargeKmalloc(page);
  4318. }
  4319. ptr = kasan_kmalloc_large(ptr, size, flags);
  4320. /* As ptr might get tagged, call kmemleak hook after KASAN. */
  4321. kmemleak_alloc(ptr, size, 1, flags);
  4322. kmsan_kmalloc_large(ptr, size, flags);
  4323. return ptr;
  4324. }
  4325. void *__kmalloc_large_noprof(size_t size, gfp_t flags)
  4326. {
  4327. void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE);
  4328. trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
  4329. flags, NUMA_NO_NODE);
  4330. return ret;
  4331. }
  4332. EXPORT_SYMBOL(__kmalloc_large_noprof);
  4333. void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
  4334. {
  4335. void *ret = ___kmalloc_large_node(size, flags, node);
  4336. trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
  4337. flags, node);
  4338. return ret;
  4339. }
  4340. EXPORT_SYMBOL(__kmalloc_large_node_noprof);
  4341. static __always_inline
  4342. void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
  4343. unsigned long caller)
  4344. {
  4345. struct kmem_cache *s;
  4346. void *ret;
  4347. if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
  4348. ret = __kmalloc_large_node_noprof(size, flags, node);
  4349. trace_kmalloc(caller, ret, size,
  4350. PAGE_SIZE << get_order(size), flags, node);
  4351. return ret;
  4352. }
  4353. if (unlikely(!size))
  4354. return ZERO_SIZE_PTR;
  4355. s = kmalloc_slab(size, b, flags, caller);
  4356. ret = slab_alloc_node(s, NULL, flags, node, caller, size);
  4357. ret = kasan_kmalloc(s, ret, size, flags);
  4358. trace_kmalloc(caller, ret, size, s->size, flags, node);
  4359. return ret;
  4360. }
  4361. void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
  4362. {
  4363. return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_);
  4364. }
  4365. EXPORT_SYMBOL(__kmalloc_node_noprof);
  4366. void *__kmalloc_noprof(size_t size, gfp_t flags)
  4367. {
  4368. return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_);
  4369. }
  4370. EXPORT_SYMBOL(__kmalloc_noprof);
  4371. /**
  4372. * kmalloc_nolock - Allocate an object of given size from any context.
  4373. * @size: size to allocate
  4374. * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT
  4375. * allowed.
  4376. * @node: node number of the target node.
  4377. *
  4378. * Return: pointer to the new object or NULL in case of error.
  4379. * NULL does not mean EBUSY or EAGAIN. It means ENOMEM.
  4380. * There is no reason to call it again and expect !NULL.
  4381. */
  4382. void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
  4383. {
  4384. gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
  4385. struct kmem_cache *s;
  4386. bool can_retry = true;
  4387. void *ret;
  4388. VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
  4389. __GFP_NO_OBJ_EXT));
  4390. if (unlikely(!size))
  4391. return ZERO_SIZE_PTR;
  4392. /*
  4393. * See the comment for the same check in
  4394. * alloc_frozen_pages_nolock_noprof()
  4395. */
  4396. if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
  4397. return NULL;
  4398. retry:
  4399. if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
  4400. return NULL;
  4401. s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_);
  4402. if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
  4403. /*
  4404. * kmalloc_nolock() is not supported on architectures that
  4405. * don't implement cmpxchg16b and thus need slab_lock()
  4406. * which could be preempted by a nmi.
  4407. * But debug caches don't use that and only rely on
  4408. * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt
  4409. * to allocate from debug caches by
  4410. * spin_trylock_irqsave(&n->list_lock, ...)
  4411. */
  4412. return NULL;
  4413. ret = alloc_from_pcs(s, alloc_gfp, node);
  4414. if (ret)
  4415. goto success;
  4416. /*
  4417. * Do not call slab_alloc_node(), since trylock mode isn't
  4418. * compatible with slab_pre_alloc_hook/should_failslab and
  4419. * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
  4420. * and slab_post_alloc_hook() directly.
  4421. */
  4422. ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
  4423. /*
  4424. * It's possible we failed due to trylock as we preempted someone with
  4425. * the sheaves locked, and the list_lock is also held by another cpu.
  4426. * But it should be rare that multiple kmalloc buckets would have
  4427. * sheaves locked, so try a larger one.
  4428. */
  4429. if (!ret && can_retry) {
  4430. /* pick the next kmalloc bucket */
  4431. size = s->object_size + 1;
  4432. /*
  4433. * Another alternative is to
  4434. * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
  4435. * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
  4436. * to retry from bucket of the same size.
  4437. */
  4438. can_retry = false;
  4439. goto retry;
  4440. }
  4441. success:
  4442. maybe_wipe_obj_freeptr(s, ret);
  4443. slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
  4444. slab_want_init_on_alloc(alloc_gfp, s), size);
  4445. ret = kasan_kmalloc(s, ret, size, alloc_gfp);
  4446. return ret;
  4447. }
  4448. EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
  4449. void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
  4450. int node, unsigned long caller)
  4451. {
  4452. return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller);
  4453. }
  4454. EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof);
  4455. void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size)
  4456. {
  4457. void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE,
  4458. _RET_IP_, size);
  4459. trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
  4460. ret = kasan_kmalloc(s, ret, size, gfpflags);
  4461. return ret;
  4462. }
  4463. EXPORT_SYMBOL(__kmalloc_cache_noprof);
  4464. void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags,
  4465. int node, size_t size)
  4466. {
  4467. void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
  4468. trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
  4469. ret = kasan_kmalloc(s, ret, size, gfpflags);
  4470. return ret;
  4471. }
  4472. EXPORT_SYMBOL(__kmalloc_cache_node_noprof);
  4473. static noinline void free_to_partial_list(
  4474. struct kmem_cache *s, struct slab *slab,
  4475. void *head, void *tail, int bulk_cnt,
  4476. unsigned long addr)
  4477. {
  4478. struct kmem_cache_node *n = get_node(s, slab_nid(slab));
  4479. struct slab *slab_free = NULL;
  4480. int cnt = bulk_cnt;
  4481. unsigned long flags;
  4482. depot_stack_handle_t handle = 0;
  4483. /*
  4484. * We cannot use GFP_NOWAIT as there are callsites where waking up
  4485. * kswapd could deadlock
  4486. */
  4487. if (s->flags & SLAB_STORE_USER)
  4488. handle = set_track_prepare(__GFP_NOWARN);
  4489. spin_lock_irqsave(&n->list_lock, flags);
  4490. if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
  4491. void *prior = slab->freelist;
  4492. /* Perform the actual freeing while we still hold the locks */
  4493. slab->inuse -= cnt;
  4494. set_freepointer(s, tail, prior);
  4495. slab->freelist = head;
  4496. /*
  4497. * If the slab is empty, and node's partial list is full,
  4498. * it should be discarded anyway no matter it's on full or
  4499. * partial list.
  4500. */
  4501. if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
  4502. slab_free = slab;
  4503. if (!prior) {
  4504. /* was on full list */
  4505. remove_full(s, n, slab);
  4506. if (!slab_free) {
  4507. add_partial(n, slab, ADD_TO_TAIL);
  4508. stat(s, FREE_ADD_PARTIAL);
  4509. }
  4510. } else if (slab_free) {
  4511. remove_partial(n, slab);
  4512. stat(s, FREE_REMOVE_PARTIAL);
  4513. }
  4514. }
  4515. if (slab_free) {
  4516. /*
  4517. * Update the counters while still holding n->list_lock to
  4518. * prevent spurious validation warnings
  4519. */
  4520. dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
  4521. }
  4522. spin_unlock_irqrestore(&n->list_lock, flags);
  4523. if (slab_free) {
  4524. stat(s, FREE_SLAB);
  4525. free_slab(s, slab_free);
  4526. }
  4527. }
  4528. /*
  4529. * Slow path handling. This may still be called frequently since objects
  4530. * have a longer lifetime than the cpu slabs in most processing loads.
  4531. *
  4532. * So we still attempt to reduce cache line usage. Just take the slab
  4533. * lock and free the item. If there is no additional partial slab
  4534. * handling required then we can return immediately.
  4535. */
  4536. static void __slab_free(struct kmem_cache *s, struct slab *slab,
  4537. void *head, void *tail, int cnt,
  4538. unsigned long addr)
  4539. {
  4540. bool was_full;
  4541. struct freelist_counters old, new;
  4542. struct kmem_cache_node *n = NULL;
  4543. unsigned long flags;
  4544. bool on_node_partial;
  4545. if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
  4546. free_to_partial_list(s, slab, head, tail, cnt, addr);
  4547. return;
  4548. }
  4549. do {
  4550. if (unlikely(n)) {
  4551. spin_unlock_irqrestore(&n->list_lock, flags);
  4552. n = NULL;
  4553. }
  4554. old.freelist = slab->freelist;
  4555. old.counters = slab->counters;
  4556. was_full = (old.freelist == NULL);
  4557. set_freepointer(s, tail, old.freelist);
  4558. new.freelist = head;
  4559. new.counters = old.counters;
  4560. new.inuse -= cnt;
  4561. /*
  4562. * Might need to be taken off (due to becoming empty) or added
  4563. * to (due to not being full anymore) the partial list.
  4564. * Unless it's frozen.
  4565. */
  4566. if (!new.inuse || was_full) {
  4567. n = get_node(s, slab_nid(slab));
  4568. /*
  4569. * Speculatively acquire the list_lock.
  4570. * If the cmpxchg does not succeed then we may
  4571. * drop the list_lock without any processing.
  4572. *
  4573. * Otherwise the list_lock will synchronize with
  4574. * other processors updating the list of slabs.
  4575. */
  4576. spin_lock_irqsave(&n->list_lock, flags);
  4577. on_node_partial = slab_test_node_partial(slab);
  4578. }
  4579. } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free"));
  4580. if (likely(!n)) {
  4581. /*
  4582. * We didn't take the list_lock because the slab was already on
  4583. * the partial list and will remain there.
  4584. */
  4585. return;
  4586. }
  4587. /*
  4588. * This slab was partially empty but not on the per-node partial list,
  4589. * in which case we shouldn't manipulate its list, just return.
  4590. */
  4591. if (!was_full && !on_node_partial) {
  4592. spin_unlock_irqrestore(&n->list_lock, flags);
  4593. return;
  4594. }
  4595. /*
  4596. * If slab became empty, should we add/keep it on the partial list or we
  4597. * have enough?
  4598. */
  4599. if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
  4600. goto slab_empty;
  4601. /*
  4602. * Objects left in the slab. If it was not on the partial list before
  4603. * then add it.
  4604. */
  4605. if (unlikely(was_full)) {
  4606. add_partial(n, slab, ADD_TO_TAIL);
  4607. stat(s, FREE_ADD_PARTIAL);
  4608. }
  4609. spin_unlock_irqrestore(&n->list_lock, flags);
  4610. return;
  4611. slab_empty:
  4612. /*
  4613. * The slab could have a single object and thus go from full to empty in
  4614. * a single free, but more likely it was on the partial list. Remove it.
  4615. */
  4616. if (likely(!was_full)) {
  4617. remove_partial(n, slab);
  4618. stat(s, FREE_REMOVE_PARTIAL);
  4619. }
  4620. spin_unlock_irqrestore(&n->list_lock, flags);
  4621. stat(s, FREE_SLAB);
  4622. discard_slab(s, slab);
  4623. }
  4624. /*
  4625. * pcs is locked. We should have get rid of the spare sheaf and obtained an
  4626. * empty sheaf, while the main sheaf is full. We want to install the empty sheaf
  4627. * as a main sheaf, and make the current main sheaf a spare sheaf.
  4628. *
  4629. * However due to having relinquished the cpu_sheaves lock when obtaining
  4630. * the empty sheaf, we need to handle some unlikely but possible cases.
  4631. *
  4632. * If we put any sheaf to barn here, it's because we were interrupted or have
  4633. * been migrated to a different cpu, which should be rare enough so just ignore
  4634. * the barn's limits to simplify the handling.
  4635. *
  4636. * An alternative scenario that gets us here is when we fail
  4637. * barn_replace_full_sheaf(), because there's no empty sheaf available in the
  4638. * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the
  4639. * limit on full sheaves was not exceeded, we assume it didn't change and just
  4640. * put the full sheaf there.
  4641. */
  4642. static void __pcs_install_empty_sheaf(struct kmem_cache *s,
  4643. struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty,
  4644. struct node_barn *barn)
  4645. {
  4646. lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
  4647. /* This is what we expect to find if nobody interrupted us. */
  4648. if (likely(!pcs->spare)) {
  4649. pcs->spare = pcs->main;
  4650. pcs->main = empty;
  4651. return;
  4652. }
  4653. /*
  4654. * Unlikely because if the main sheaf had space, we would have just
  4655. * freed to it. Get rid of our empty sheaf.
  4656. */
  4657. if (pcs->main->size < s->sheaf_capacity) {
  4658. barn_put_empty_sheaf(barn, empty);
  4659. return;
  4660. }
  4661. /* Also unlikely for the same reason */
  4662. if (pcs->spare->size < s->sheaf_capacity) {
  4663. swap(pcs->main, pcs->spare);
  4664. barn_put_empty_sheaf(barn, empty);
  4665. return;
  4666. }
  4667. /*
  4668. * We probably failed barn_replace_full_sheaf() due to no empty sheaf
  4669. * available there, but we allocated one, so finish the job.
  4670. */
  4671. barn_put_full_sheaf(barn, pcs->main);
  4672. stat(s, BARN_PUT);
  4673. pcs->main = empty;
  4674. }
  4675. /*
  4676. * Replace the full main sheaf with a (at least partially) empty sheaf.
  4677. *
  4678. * Must be called with the cpu_sheaves local lock locked. If successful, returns
  4679. * the pcs pointer and the local lock locked (possibly on a different cpu than
  4680. * initially called). If not successful, returns NULL and the local lock
  4681. * unlocked.
  4682. */
  4683. static struct slub_percpu_sheaves *
  4684. __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
  4685. bool allow_spin)
  4686. {
  4687. struct slab_sheaf *empty;
  4688. struct node_barn *barn;
  4689. bool put_fail;
  4690. restart:
  4691. lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
  4692. /* Bootstrap or debug cache, back off */
  4693. if (unlikely(!cache_has_sheaves(s))) {
  4694. local_unlock(&s->cpu_sheaves->lock);
  4695. return NULL;
  4696. }
  4697. barn = get_barn(s);
  4698. if (!barn) {
  4699. local_unlock(&s->cpu_sheaves->lock);
  4700. return NULL;
  4701. }
  4702. put_fail = false;
  4703. if (!pcs->spare) {
  4704. empty = barn_get_empty_sheaf(barn, allow_spin);
  4705. if (empty) {
  4706. pcs->spare = pcs->main;
  4707. pcs->main = empty;
  4708. return pcs;
  4709. }
  4710. goto alloc_empty;
  4711. }
  4712. if (pcs->spare->size < s->sheaf_capacity) {
  4713. swap(pcs->main, pcs->spare);
  4714. return pcs;
  4715. }
  4716. empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin);
  4717. if (!IS_ERR(empty)) {
  4718. stat(s, BARN_PUT);
  4719. pcs->main = empty;
  4720. return pcs;
  4721. }
  4722. /* sheaf_flush_unused() doesn't support !allow_spin */
  4723. if (PTR_ERR(empty) == -E2BIG && allow_spin) {
  4724. /* Since we got here, spare exists and is full */
  4725. struct slab_sheaf *to_flush = pcs->spare;
  4726. stat(s, BARN_PUT_FAIL);
  4727. pcs->spare = NULL;
  4728. local_unlock(&s->cpu_sheaves->lock);
  4729. sheaf_flush_unused(s, to_flush);
  4730. empty = to_flush;
  4731. goto got_empty;
  4732. }
  4733. /*
  4734. * We could not replace full sheaf because barn had no empty
  4735. * sheaves. We can still allocate it and put the full sheaf in
  4736. * __pcs_install_empty_sheaf(), but if we fail to allocate it,
  4737. * make sure to count the fail.
  4738. */
  4739. put_fail = true;
  4740. alloc_empty:
  4741. local_unlock(&s->cpu_sheaves->lock);
  4742. /*
  4743. * alloc_empty_sheaf() doesn't support !allow_spin and it's
  4744. * easier to fall back to freeing directly without sheaves
  4745. * than add the support (and to sheaf_flush_unused() above)
  4746. */
  4747. if (!allow_spin)
  4748. return NULL;
  4749. empty = alloc_empty_sheaf(s, GFP_NOWAIT);
  4750. if (empty)
  4751. goto got_empty;
  4752. if (put_fail)
  4753. stat(s, BARN_PUT_FAIL);
  4754. if (!sheaf_try_flush_main(s))
  4755. return NULL;
  4756. if (!local_trylock(&s->cpu_sheaves->lock))
  4757. return NULL;
  4758. pcs = this_cpu_ptr(s->cpu_sheaves);
  4759. /*
  4760. * we flushed the main sheaf so it should be empty now,
  4761. * but in case we got preempted or migrated, we need to
  4762. * check again
  4763. */
  4764. if (pcs->main->size == s->sheaf_capacity)
  4765. goto restart;
  4766. return pcs;
  4767. got_empty:
  4768. if (!local_trylock(&s->cpu_sheaves->lock)) {
  4769. barn_put_empty_sheaf(barn, empty);
  4770. return NULL;
  4771. }
  4772. pcs = this_cpu_ptr(s->cpu_sheaves);
  4773. __pcs_install_empty_sheaf(s, pcs, empty, barn);
  4774. return pcs;
  4775. }
  4776. /*
  4777. * Free an object to the percpu sheaves.
  4778. * The object is expected to have passed slab_free_hook() already.
  4779. */
  4780. static __fastpath_inline
  4781. bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin)
  4782. {
  4783. struct slub_percpu_sheaves *pcs;
  4784. if (!local_trylock(&s->cpu_sheaves->lock))
  4785. return false;
  4786. pcs = this_cpu_ptr(s->cpu_sheaves);
  4787. if (unlikely(pcs->main->size == s->sheaf_capacity)) {
  4788. pcs = __pcs_replace_full_main(s, pcs, allow_spin);
  4789. if (unlikely(!pcs))
  4790. return false;
  4791. }
  4792. pcs->main->objects[pcs->main->size++] = object;
  4793. local_unlock(&s->cpu_sheaves->lock);
  4794. stat(s, FREE_FASTPATH);
  4795. return true;
  4796. }
  4797. static void rcu_free_sheaf(struct rcu_head *head)
  4798. {
  4799. struct kmem_cache_node *n;
  4800. struct slab_sheaf *sheaf;
  4801. struct node_barn *barn = NULL;
  4802. struct kmem_cache *s;
  4803. sheaf = container_of(head, struct slab_sheaf, rcu_head);
  4804. s = sheaf->cache;
  4805. /*
  4806. * This may remove some objects due to slab_free_hook() returning false,
  4807. * so that the sheaf might no longer be completely full. But it's easier
  4808. * to handle it as full (unless it became completely empty), as the code
  4809. * handles it fine. The only downside is that sheaf will serve fewer
  4810. * allocations when reused. It only happens due to debugging, which is a
  4811. * performance hit anyway.
  4812. *
  4813. * If it returns true, there was at least one object from pfmemalloc
  4814. * slab so simply flush everything.
  4815. */
  4816. if (__rcu_free_sheaf_prepare(s, sheaf))
  4817. goto flush;
  4818. n = get_node(s, sheaf->node);
  4819. if (!n)
  4820. goto flush;
  4821. barn = n->barn;
  4822. /* due to slab_free_hook() */
  4823. if (unlikely(sheaf->size == 0))
  4824. goto empty;
  4825. /*
  4826. * Checking nr_full/nr_empty outside lock avoids contention in case the
  4827. * barn is at the respective limit. Due to the race we might go over the
  4828. * limit but that should be rare and harmless.
  4829. */
  4830. if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
  4831. stat(s, BARN_PUT);
  4832. barn_put_full_sheaf(barn, sheaf);
  4833. return;
  4834. }
  4835. flush:
  4836. stat(s, BARN_PUT_FAIL);
  4837. sheaf_flush_unused(s, sheaf);
  4838. empty:
  4839. if (barn && data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
  4840. barn_put_empty_sheaf(barn, sheaf);
  4841. return;
  4842. }
  4843. free_empty_sheaf(s, sheaf);
  4844. }
  4845. /*
  4846. * kvfree_call_rcu() can be called while holding a raw_spinlock_t. Since
  4847. * __kfree_rcu_sheaf() may acquire a spinlock_t (sleeping lock on PREEMPT_RT),
  4848. * this would violate lock nesting rules. Therefore, kvfree_call_rcu() avoids
  4849. * this problem by bypassing the sheaves layer entirely on PREEMPT_RT.
  4850. *
  4851. * However, lockdep still complains that it is invalid to acquire spinlock_t
  4852. * while holding raw_spinlock_t, even on !PREEMPT_RT where spinlock_t is a
  4853. * spinning lock. Tell lockdep that acquiring spinlock_t is valid here
  4854. * by temporarily raising the wait-type to LD_WAIT_CONFIG.
  4855. */
  4856. static DEFINE_WAIT_OVERRIDE_MAP(kfree_rcu_sheaf_map, LD_WAIT_CONFIG);
  4857. bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
  4858. {
  4859. struct slub_percpu_sheaves *pcs;
  4860. struct slab_sheaf *rcu_sheaf;
  4861. if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
  4862. return false;
  4863. lock_map_acquire_try(&kfree_rcu_sheaf_map);
  4864. if (!local_trylock(&s->cpu_sheaves->lock))
  4865. goto fail;
  4866. pcs = this_cpu_ptr(s->cpu_sheaves);
  4867. if (unlikely(!pcs->rcu_free)) {
  4868. struct slab_sheaf *empty;
  4869. struct node_barn *barn;
  4870. /* Bootstrap or debug cache, fall back */
  4871. if (unlikely(!cache_has_sheaves(s))) {
  4872. local_unlock(&s->cpu_sheaves->lock);
  4873. goto fail;
  4874. }
  4875. if (pcs->spare && pcs->spare->size == 0) {
  4876. pcs->rcu_free = pcs->spare;
  4877. pcs->spare = NULL;
  4878. goto do_free;
  4879. }
  4880. barn = get_barn(s);
  4881. if (!barn) {
  4882. local_unlock(&s->cpu_sheaves->lock);
  4883. goto fail;
  4884. }
  4885. empty = barn_get_empty_sheaf(barn, true);
  4886. if (empty) {
  4887. pcs->rcu_free = empty;
  4888. goto do_free;
  4889. }
  4890. local_unlock(&s->cpu_sheaves->lock);
  4891. empty = alloc_empty_sheaf(s, GFP_NOWAIT);
  4892. if (!empty)
  4893. goto fail;
  4894. if (!local_trylock(&s->cpu_sheaves->lock)) {
  4895. barn_put_empty_sheaf(barn, empty);
  4896. goto fail;
  4897. }
  4898. pcs = this_cpu_ptr(s->cpu_sheaves);
  4899. if (unlikely(pcs->rcu_free))
  4900. barn_put_empty_sheaf(barn, empty);
  4901. else
  4902. pcs->rcu_free = empty;
  4903. }
  4904. do_free:
  4905. rcu_sheaf = pcs->rcu_free;
  4906. /*
  4907. * Since we flush immediately when size reaches capacity, we never reach
  4908. * this with size already at capacity, so no OOB write is possible.
  4909. */
  4910. rcu_sheaf->objects[rcu_sheaf->size++] = obj;
  4911. if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
  4912. rcu_sheaf = NULL;
  4913. } else {
  4914. pcs->rcu_free = NULL;
  4915. rcu_sheaf->node = numa_mem_id();
  4916. }
  4917. /*
  4918. * we flush before local_unlock to make sure a racing
  4919. * flush_all_rcu_sheaves() doesn't miss this sheaf
  4920. */
  4921. if (rcu_sheaf)
  4922. call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
  4923. local_unlock(&s->cpu_sheaves->lock);
  4924. stat(s, FREE_RCU_SHEAF);
  4925. lock_map_release(&kfree_rcu_sheaf_map);
  4926. return true;
  4927. fail:
  4928. stat(s, FREE_RCU_SHEAF_FAIL);
  4929. lock_map_release(&kfree_rcu_sheaf_map);
  4930. return false;
  4931. }
  4932. /*
  4933. * Bulk free objects to the percpu sheaves.
  4934. * Unlike free_to_pcs() this includes the calls to all necessary hooks
  4935. * and the fallback to freeing to slab pages.
  4936. */
  4937. static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
  4938. {
  4939. struct slub_percpu_sheaves *pcs;
  4940. struct slab_sheaf *main, *empty;
  4941. bool init = slab_want_init_on_free(s);
  4942. unsigned int batch, i = 0;
  4943. struct node_barn *barn;
  4944. void *remote_objects[PCS_BATCH_MAX];
  4945. unsigned int remote_nr = 0;
  4946. int node = numa_mem_id();
  4947. next_remote_batch:
  4948. while (i < size) {
  4949. struct slab *slab = virt_to_slab(p[i]);
  4950. memcg_slab_free_hook(s, slab, p + i, 1);
  4951. alloc_tagging_slab_free_hook(s, slab, p + i, 1);
  4952. if (unlikely(!slab_free_hook(s, p[i], init, false))) {
  4953. p[i] = p[--size];
  4954. continue;
  4955. }
  4956. if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)
  4957. || slab_test_pfmemalloc(slab))) {
  4958. remote_objects[remote_nr] = p[i];
  4959. p[i] = p[--size];
  4960. if (++remote_nr >= PCS_BATCH_MAX)
  4961. goto flush_remote;
  4962. continue;
  4963. }
  4964. i++;
  4965. }
  4966. if (!size)
  4967. goto flush_remote;
  4968. next_batch:
  4969. if (!local_trylock(&s->cpu_sheaves->lock))
  4970. goto fallback;
  4971. pcs = this_cpu_ptr(s->cpu_sheaves);
  4972. if (likely(pcs->main->size < s->sheaf_capacity))
  4973. goto do_free;
  4974. barn = get_barn(s);
  4975. if (!barn)
  4976. goto no_empty;
  4977. if (!pcs->spare) {
  4978. empty = barn_get_empty_sheaf(barn, true);
  4979. if (!empty)
  4980. goto no_empty;
  4981. pcs->spare = pcs->main;
  4982. pcs->main = empty;
  4983. goto do_free;
  4984. }
  4985. if (pcs->spare->size < s->sheaf_capacity) {
  4986. swap(pcs->main, pcs->spare);
  4987. goto do_free;
  4988. }
  4989. empty = barn_replace_full_sheaf(barn, pcs->main, true);
  4990. if (IS_ERR(empty)) {
  4991. stat(s, BARN_PUT_FAIL);
  4992. goto no_empty;
  4993. }
  4994. stat(s, BARN_PUT);
  4995. pcs->main = empty;
  4996. do_free:
  4997. main = pcs->main;
  4998. batch = min(size, s->sheaf_capacity - main->size);
  4999. memcpy(main->objects + main->size, p, batch * sizeof(void *));
  5000. main->size += batch;
  5001. local_unlock(&s->cpu_sheaves->lock);
  5002. stat_add(s, FREE_FASTPATH, batch);
  5003. if (batch < size) {
  5004. p += batch;
  5005. size -= batch;
  5006. goto next_batch;
  5007. }
  5008. if (remote_nr)
  5009. goto flush_remote;
  5010. return;
  5011. no_empty:
  5012. local_unlock(&s->cpu_sheaves->lock);
  5013. /*
  5014. * if we depleted all empty sheaves in the barn or there are too
  5015. * many full sheaves, free the rest to slab pages
  5016. */
  5017. fallback:
  5018. __kmem_cache_free_bulk(s, size, p);
  5019. stat_add(s, FREE_SLOWPATH, size);
  5020. flush_remote:
  5021. if (remote_nr) {
  5022. __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
  5023. stat_add(s, FREE_SLOWPATH, remote_nr);
  5024. if (i < size) {
  5025. remote_nr = 0;
  5026. goto next_remote_batch;
  5027. }
  5028. }
  5029. }
/*
 * State used to postpone frees to irq_work context: a list of pending
 * objects plus the irq_work item that processes them.
 */
struct defer_free {
	/* Lockless list of objects waiting to be freed */
	struct llist_head objects;
	/* irq_work queued when an object is added to a previously empty list */
	struct irq_work work;
};
/* Declared early because the per-CPU initializer below references it. */
static void free_deferred_objects(struct irq_work *work);

/* One deferred-free list per CPU, drained by free_deferred_objects(). */
static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
	.objects = LLIST_HEAD_INIT(objects),
	.work = IRQ_WORK_INIT(free_deferred_objects),
};
  5039. /*
  5040. * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe
  5041. * to take sleeping spin_locks from __slab_free().
  5042. * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
  5043. */
  5044. static void free_deferred_objects(struct irq_work *work)
  5045. {
  5046. struct defer_free *df = container_of(work, struct defer_free, work);
  5047. struct llist_head *objs = &df->objects;
  5048. struct llist_node *llnode, *pos, *t;
  5049. if (llist_empty(objs))
  5050. return;
  5051. llnode = llist_del_all(objs);
  5052. llist_for_each_safe(pos, t, llnode) {
  5053. struct kmem_cache *s;
  5054. struct slab *slab;
  5055. void *x = pos;
  5056. slab = virt_to_slab(x);
  5057. s = slab->slab_cache;
  5058. /* Point 'x' back to the beginning of allocated object */
  5059. x -= s->offset;
  5060. /*
  5061. * We used freepointer in 'x' to link 'x' into df->objects.
  5062. * Clear it to NULL to avoid false positive detection
  5063. * of "Freepointer corruption".
  5064. */
  5065. set_freepointer(s, x, NULL);
  5066. __slab_free(s, slab, x, x, 1, _THIS_IP_);
  5067. stat(s, FREE_SLOWPATH);
  5068. }
  5069. }
  5070. static void defer_free(struct kmem_cache *s, void *head)
  5071. {
  5072. struct defer_free *df;
  5073. guard(preempt)();
  5074. head = kasan_reset_tag(head);
  5075. df = this_cpu_ptr(&defer_free_objects);
  5076. if (llist_add(head + s->offset, &df->objects))
  5077. irq_work_queue(&df->work);
  5078. }
  5079. void defer_free_barrier(void)
  5080. {
  5081. int cpu;
  5082. for_each_possible_cpu(cpu)
  5083. irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
  5084. }
  5085. static __fastpath_inline
  5086. void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
  5087. unsigned long addr)
  5088. {
  5089. memcg_slab_free_hook(s, slab, &object, 1);
  5090. alloc_tagging_slab_free_hook(s, slab, &object, 1);
  5091. if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
  5092. return;
  5093. if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())
  5094. && likely(!slab_test_pfmemalloc(slab))) {
  5095. if (likely(free_to_pcs(s, object, true)))
  5096. return;
  5097. }
  5098. __slab_free(s, slab, object, object, 1, addr);
  5099. stat(s, FREE_SLOWPATH);
  5100. }
  5101. #ifdef CONFIG_MEMCG
  5102. /* Do not inline the rare memcg charging failed path into the allocation path */
  5103. static noinline
  5104. void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
  5105. {
  5106. struct slab *slab = virt_to_slab(object);
  5107. alloc_tagging_slab_free_hook(s, slab, &object, 1);
  5108. if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
  5109. __slab_free(s, slab, object, object, 1, _RET_IP_);
  5110. }
  5111. #endif
  5112. static __fastpath_inline
  5113. void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
  5114. void *tail, void **p, int cnt, unsigned long addr)
  5115. {
  5116. memcg_slab_free_hook(s, slab, p, cnt);
  5117. alloc_tagging_slab_free_hook(s, slab, p, cnt);
  5118. /*
  5119. * With KASAN enabled slab_free_freelist_hook modifies the freelist
  5120. * to remove objects, whose reuse must be delayed.
  5121. */
  5122. if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) {
  5123. __slab_free(s, slab, head, tail, cnt, addr);
  5124. stat_add(s, FREE_SLOWPATH, cnt);
  5125. }
  5126. }
  5127. #ifdef CONFIG_SLUB_RCU_DEBUG
  5128. static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
  5129. {
  5130. struct rcu_delayed_free *delayed_free =
  5131. container_of(rcu_head, struct rcu_delayed_free, head);
  5132. void *object = delayed_free->object;
  5133. struct slab *slab = virt_to_slab(object);
  5134. struct kmem_cache *s;
  5135. kfree(delayed_free);
  5136. if (WARN_ON(is_kfence_address(object)))
  5137. return;
  5138. /* find the object and the cache again */
  5139. if (WARN_ON(!slab))
  5140. return;
  5141. s = slab->slab_cache;
  5142. if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
  5143. return;
  5144. /* resume freeing */
  5145. if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) {
  5146. __slab_free(s, slab, object, object, 1, _THIS_IP_);
  5147. stat(s, FREE_SLOWPATH);
  5148. }
  5149. }
  5150. #endif /* CONFIG_SLUB_RCU_DEBUG */
  5151. #ifdef CONFIG_KASAN_GENERIC
  5152. void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
  5153. {
  5154. __slab_free(cache, virt_to_slab(x), x, x, 1, addr);
  5155. stat(cache, FREE_SLOWPATH);
  5156. }
  5157. #endif
  5158. static noinline void warn_free_bad_obj(struct kmem_cache *s, void *obj)
  5159. {
  5160. struct kmem_cache *cachep;
  5161. struct slab *slab;
  5162. slab = virt_to_slab(obj);
  5163. if (WARN_ONCE(!slab,
  5164. "kmem_cache_free(%s, %p): object is not in a slab page\n",
  5165. s->name, obj))
  5166. return;
  5167. cachep = slab->slab_cache;
  5168. if (WARN_ONCE(cachep != s,
  5169. "kmem_cache_free(%s, %p): object belongs to different cache %s\n",
  5170. s->name, obj, cachep ? cachep->name : "(NULL)")) {
  5171. if (cachep)
  5172. print_tracking(cachep, obj);
  5173. return;
  5174. }
  5175. }
  5176. /**
  5177. * kmem_cache_free - Deallocate an object
  5178. * @s: The cache the allocation was from.
  5179. * @x: The previously allocated object.
  5180. *
  5181. * Free an object which was previously allocated from this
  5182. * cache.
  5183. */
  5184. void kmem_cache_free(struct kmem_cache *s, void *x)
  5185. {
  5186. struct slab *slab;
  5187. slab = virt_to_slab(x);
  5188. if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) ||
  5189. kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
  5190. /*
  5191. * Intentionally leak the object in these cases, because it
  5192. * would be too dangerous to continue.
  5193. */
  5194. if (unlikely(!slab || (slab->slab_cache != s))) {
  5195. warn_free_bad_obj(s, x);
  5196. return;
  5197. }
  5198. }
  5199. trace_kmem_cache_free(_RET_IP_, x, s);
  5200. slab_free(s, slab, x, _RET_IP_);
  5201. }
  5202. EXPORT_SYMBOL(kmem_cache_free);
  5203. static inline size_t slab_ksize(struct slab *slab)
  5204. {
  5205. struct kmem_cache *s = slab->slab_cache;
  5206. #ifdef CONFIG_SLUB_DEBUG
  5207. /*
  5208. * Debugging requires use of the padding between object
  5209. * and whatever may come after it.
  5210. */
  5211. if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
  5212. return s->object_size;
  5213. #endif
  5214. if (s->flags & SLAB_KASAN)
  5215. return s->object_size;
  5216. /*
  5217. * If we have the need to store the freelist pointer
  5218. * or any other metadata back there then we can
  5219. * only use the space before that information.
  5220. */
  5221. if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
  5222. return s->inuse;
  5223. else if (obj_exts_in_object(s, slab))
  5224. return s->inuse;
  5225. /*
  5226. * Else we can use all the padding etc for the allocation
  5227. */
  5228. return s->size;
  5229. }
  5230. static size_t __ksize(const void *object)
  5231. {
  5232. struct page *page;
  5233. struct slab *slab;
  5234. if (unlikely(object == ZERO_SIZE_PTR))
  5235. return 0;
  5236. page = virt_to_page(object);
  5237. if (unlikely(PageLargeKmalloc(page)))
  5238. return large_kmalloc_size(page);
  5239. slab = page_slab(page);
  5240. /* Delete this after we're sure there are no users */
  5241. if (WARN_ON(!slab))
  5242. return page_size(page);
  5243. #ifdef CONFIG_SLUB_DEBUG
  5244. skip_orig_size_check(slab->slab_cache, object);
  5245. #endif
  5246. return slab_ksize(slab);
  5247. }
  5248. /**
  5249. * ksize -- Report full size of underlying allocation
  5250. * @objp: pointer to the object
  5251. *
  5252. * This should only be used internally to query the true size of allocations.
  5253. * It is not meant to be a way to discover the usable size of an allocation
  5254. * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
  5255. * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
  5256. * and/or FORTIFY_SOURCE.
  5257. *
  5258. * Return: size of the actual memory used by @objp in bytes
  5259. */
  5260. size_t ksize(const void *objp)
  5261. {
  5262. /*
  5263. * We need to first check that the pointer to the object is valid.
  5264. * The KASAN report printed from ksize() is more useful, then when
  5265. * it's printed later when the behaviour could be undefined due to
  5266. * a potential use-after-free or double-free.
  5267. *
  5268. * We use kasan_check_byte(), which is supported for the hardware
  5269. * tag-based KASAN mode, unlike kasan_check_read/write().
  5270. *
  5271. * If the pointed to memory is invalid, we return 0 to avoid users of
  5272. * ksize() writing to and potentially corrupting the memory region.
  5273. *
  5274. * We want to perform the check before __ksize(), to avoid potentially
  5275. * crashing in __ksize() due to accessing invalid metadata.
  5276. */
  5277. if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
  5278. return 0;
  5279. return kfence_ksize(objp) ?: __ksize(objp);
  5280. }
  5281. EXPORT_SYMBOL(ksize);
  5282. static void free_large_kmalloc(struct page *page, void *object)
  5283. {
  5284. unsigned int order = compound_order(page);
  5285. if (WARN_ON_ONCE(!PageLargeKmalloc(page))) {
  5286. dump_page(page, "Not a kmalloc allocation");
  5287. return;
  5288. }
  5289. if (WARN_ON_ONCE(order == 0))
  5290. pr_warn_once("object pointer: 0x%p\n", object);
  5291. kmemleak_free(object);
  5292. kasan_kfree_large(object);
  5293. kmsan_kfree_large(object);
  5294. mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
  5295. -(PAGE_SIZE << order));
  5296. __ClearPageLargeKmalloc(page);
  5297. free_frozen_pages(page, order);
  5298. }
  5299. /*
  5300. * Given an rcu_head embedded within an object obtained from kvmalloc at an
  5301. * offset < 4k, free the object in question.
  5302. */
  5303. void kvfree_rcu_cb(struct rcu_head *head)
  5304. {
  5305. void *obj = head;
  5306. struct page *page;
  5307. struct slab *slab;
  5308. struct kmem_cache *s;
  5309. void *slab_addr;
  5310. if (is_vmalloc_addr(obj)) {
  5311. obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
  5312. vfree(obj);
  5313. return;
  5314. }
  5315. page = virt_to_page(obj);
  5316. slab = page_slab(page);
  5317. if (!slab) {
  5318. /*
  5319. * rcu_head offset can be only less than page size so no need to
  5320. * consider allocation order
  5321. */
  5322. obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
  5323. free_large_kmalloc(page, obj);
  5324. return;
  5325. }
  5326. s = slab->slab_cache;
  5327. slab_addr = slab_address(slab);
  5328. if (is_kfence_address(obj)) {
  5329. obj = kfence_object_start(obj);
  5330. } else {
  5331. unsigned int idx = __obj_to_index(s, slab_addr, obj);
  5332. obj = slab_addr + s->size * idx;
  5333. obj = fixup_red_left(s, obj);
  5334. }
  5335. slab_free(s, slab, obj, _RET_IP_);
  5336. }
  5337. /**
  5338. * kfree - free previously allocated memory
  5339. * @object: pointer returned by kmalloc(), kmalloc_nolock(), or kmem_cache_alloc()
  5340. *
  5341. * If @object is NULL, no operation is performed.
  5342. */
  5343. void kfree(const void *object)
  5344. {
  5345. struct page *page;
  5346. struct slab *slab;
  5347. struct kmem_cache *s;
  5348. void *x = (void *)object;
  5349. trace_kfree(_RET_IP_, object);
  5350. if (unlikely(ZERO_OR_NULL_PTR(object)))
  5351. return;
  5352. page = virt_to_page(object);
  5353. slab = page_slab(page);
  5354. if (!slab) {
  5355. /* kmalloc_nolock() doesn't support large kmalloc */
  5356. free_large_kmalloc(page, (void *)object);
  5357. return;
  5358. }
  5359. s = slab->slab_cache;
  5360. slab_free(s, slab, x, _RET_IP_);
  5361. }
  5362. EXPORT_SYMBOL(kfree);
  5363. /*
  5364. * Can be called while holding raw_spinlock_t or from IRQ and NMI,
  5365. * but ONLY for objects allocated by kmalloc_nolock().
  5366. * Debug checks (like kmemleak and kfence) were skipped on allocation,
  5367. * hence
  5368. * obj = kmalloc(); kfree_nolock(obj);
  5369. * will miss kmemleak/kfence book keeping and will cause false positives.
  5370. * large_kmalloc is not supported either.
  5371. */
  5372. void kfree_nolock(const void *object)
  5373. {
  5374. struct slab *slab;
  5375. struct kmem_cache *s;
  5376. void *x = (void *)object;
  5377. if (unlikely(ZERO_OR_NULL_PTR(object)))
  5378. return;
  5379. slab = virt_to_slab(object);
  5380. if (unlikely(!slab)) {
  5381. WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
  5382. return;
  5383. }
  5384. s = slab->slab_cache;
  5385. memcg_slab_free_hook(s, slab, &x, 1);
  5386. alloc_tagging_slab_free_hook(s, slab, &x, 1);
  5387. /*
  5388. * Unlike slab_free() do NOT call the following:
  5389. * kmemleak_free_recursive(x, s->flags);
  5390. * debug_check_no_locks_freed(x, s->object_size);
  5391. * debug_check_no_obj_freed(x, s->object_size);
  5392. * __kcsan_check_access(x, s->object_size, ..);
  5393. * kfence_free(x);
  5394. * since they take spinlocks or not safe from any context.
  5395. */
  5396. kmsan_slab_free(s, x);
  5397. /*
  5398. * If KASAN finds a kernel bug it will do kasan_report_invalid_free()
  5399. * which will call raw_spin_lock_irqsave() which is technically
  5400. * unsafe from NMI, but take chance and report kernel bug.
  5401. * The sequence of
  5402. * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI
  5403. * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU
  5404. * is double buggy and deserves to deadlock.
  5405. */
  5406. if (kasan_slab_pre_free(s, x))
  5407. return;
  5408. /*
  5409. * memcg, kasan_slab_pre_free are done for 'x'.
  5410. * The only thing left is kasan_poison without quarantine,
  5411. * since kasan quarantine takes locks and not supported from NMI.
  5412. */
  5413. kasan_slab_free(s, x, false, false, /* skip quarantine */true);
  5414. if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) {
  5415. if (likely(free_to_pcs(s, x, false)))
  5416. return;
  5417. }
  5418. /*
  5419. * __slab_free() can locklessly cmpxchg16 into a slab, but then it might
  5420. * need to take spin_lock for further processing.
  5421. * Avoid the complexity and simply add to a deferred list.
  5422. */
  5423. defer_free(s, x);
  5424. }
  5425. EXPORT_SYMBOL_GPL(kfree_nolock);
  5426. static __always_inline __realloc_size(2) void *
  5427. __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid)
  5428. {
  5429. void *ret;
  5430. size_t ks = 0;
  5431. int orig_size = 0;
  5432. struct kmem_cache *s = NULL;
  5433. if (unlikely(ZERO_OR_NULL_PTR(p)))
  5434. goto alloc_new;
  5435. /* Check for double-free. */
  5436. if (!kasan_check_byte(p))
  5437. return NULL;
  5438. /*
  5439. * If reallocation is not necessary (e. g. the new size is less
  5440. * than the current allocated size), the current allocation will be
  5441. * preserved unless __GFP_THISNODE is set. In the latter case a new
  5442. * allocation on the requested node will be attempted.
  5443. */
  5444. if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
  5445. nid != page_to_nid(virt_to_page(p)))
  5446. goto alloc_new;
  5447. if (is_kfence_address(p)) {
  5448. ks = orig_size = kfence_ksize(p);
  5449. } else {
  5450. struct page *page = virt_to_page(p);
  5451. struct slab *slab = page_slab(page);
  5452. if (!slab) {
  5453. /* Big kmalloc object */
  5454. ks = page_size(page);
  5455. WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE);
  5456. WARN_ON(p != page_address(page));
  5457. } else {
  5458. s = slab->slab_cache;
  5459. orig_size = get_orig_size(s, (void *)p);
  5460. ks = s->object_size;
  5461. }
  5462. }
  5463. /* If the old object doesn't fit, allocate a bigger one */
  5464. if (new_size > ks)
  5465. goto alloc_new;
  5466. /* If the old object doesn't satisfy the new alignment, allocate a new one */
  5467. if (!IS_ALIGNED((unsigned long)p, align))
  5468. goto alloc_new;
  5469. /* Zero out spare memory. */
  5470. if (want_init_on_alloc(flags)) {
  5471. kasan_disable_current();
  5472. if (orig_size && orig_size < new_size)
  5473. memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size);
  5474. else
  5475. memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
  5476. kasan_enable_current();
  5477. }
  5478. /* Setup kmalloc redzone when needed */
  5479. if (s && slub_debug_orig_size(s)) {
  5480. set_orig_size(s, (void *)p, new_size);
  5481. if (s->flags & SLAB_RED_ZONE && new_size < ks)
  5482. memset_no_sanitize_memory(kasan_reset_tag(p) + new_size,
  5483. SLUB_RED_ACTIVE, ks - new_size);
  5484. }
  5485. p = kasan_krealloc(p, new_size, flags);
  5486. return (void *)p;
  5487. alloc_new:
  5488. ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_);
  5489. if (ret && p) {
  5490. /* Disable KASAN checks as the object's redzone is accessed. */
  5491. kasan_disable_current();
  5492. memcpy(ret, kasan_reset_tag(p), orig_size ?: ks);
  5493. kasan_enable_current();
  5494. }
  5495. return ret;
  5496. }
  5497. /**
  5498. * krealloc_node_align - reallocate memory. The contents will remain unchanged.
  5499. * @p: object to reallocate memory for.
  5500. * @new_size: how many bytes of memory are required.
  5501. * @align: desired alignment.
  5502. * @flags: the type of memory to allocate.
  5503. * @nid: NUMA node or NUMA_NO_NODE
  5504. *
  5505. * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
  5506. * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
  5507. *
  5508. * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
  5509. * Documentation/core-api/memory-allocation.rst for more details.
  5510. *
  5511. * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
  5512. * initial memory allocation, every subsequent call to this API for the same
  5513. * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
  5514. * __GFP_ZERO is not fully honored by this API.
  5515. *
  5516. * When slub_debug_orig_size() is off, krealloc() only knows about the bucket
  5517. * size of an allocation (but not the exact size it was allocated with) and
  5518. * hence implements the following semantics for shrinking and growing buffers
  5519. * with __GFP_ZERO::
  5520. *
  5521. * new bucket
  5522. * 0 size size
  5523. * |--------|----------------|
  5524. * | keep | zero |
  5525. *
  5526. * Otherwise, the original allocation size 'orig_size' could be used to
  5527. * precisely clear the requested size, and the new size will also be stored
  5528. * as the new 'orig_size'.
  5529. *
  5530. * In any case, the contents of the object pointed to are preserved up to the
  5531. * lesser of the new and old sizes.
  5532. *
  5533. * Return: pointer to the allocated memory or %NULL in case of error
  5534. */
  5535. void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align,
  5536. gfp_t flags, int nid)
  5537. {
  5538. void *ret;
  5539. if (unlikely(!new_size)) {
  5540. kfree(p);
  5541. return ZERO_SIZE_PTR;
  5542. }
  5543. ret = __do_krealloc(p, new_size, align, flags, nid);
  5544. if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
  5545. kfree(p);
  5546. return ret;
  5547. }
  5548. EXPORT_SYMBOL(krealloc_node_align_noprof);
  5549. static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
  5550. {
  5551. /*
  5552. * We want to attempt a large physically contiguous block first because
  5553. * it is less likely to fragment multiple larger blocks and therefore
  5554. * contribute to a long term fragmentation less than vmalloc fallback.
  5555. * However make sure that larger requests are not too disruptive - i.e.
  5556. * do not direct reclaim unless physically continuous memory is preferred
  5557. * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to
  5558. * start working in the background
  5559. */
  5560. if (size > PAGE_SIZE) {
  5561. flags |= __GFP_NOWARN;
  5562. if (!(flags & __GFP_RETRY_MAYFAIL))
  5563. flags &= ~__GFP_DIRECT_RECLAIM;
  5564. /* nofail semantic is implemented by the vmalloc fallback */
  5565. flags &= ~__GFP_NOFAIL;
  5566. }
  5567. return flags;
  5568. }
  5569. /**
  5570. * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
  5571. * failure, fall back to non-contiguous (vmalloc) allocation.
  5572. * @size: size of the request.
  5573. * @b: which set of kmalloc buckets to allocate from.
  5574. * @align: desired alignment.
  5575. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
  5576. * @node: numa node to allocate from
  5577. *
  5578. * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
  5579. * Documentation/core-api/memory-allocation.rst for more details.
  5580. *
  5581. * Uses kmalloc to get the memory but if the allocation fails then falls back
  5582. * to the vmalloc allocator. Use kvfree for freeing the memory.
  5583. *
  5584. * GFP_NOWAIT and GFP_ATOMIC are supported, the __GFP_NORETRY modifier is not.
  5585. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
  5586. * preferable to the vmalloc fallback, due to visible performance drawbacks.
  5587. *
  5588. * Return: pointer to the allocated memory of %NULL in case of failure
  5589. */
  5590. void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
  5591. gfp_t flags, int node)
  5592. {
  5593. bool allow_block;
  5594. void *ret;
  5595. /*
  5596. * It doesn't really make sense to fallback to vmalloc for sub page
  5597. * requests
  5598. */
  5599. ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
  5600. kmalloc_gfp_adjust(flags, size),
  5601. node, _RET_IP_);
  5602. if (ret || size <= PAGE_SIZE)
  5603. return ret;
  5604. /* Don't even allow crazy sizes */
  5605. if (unlikely(size > INT_MAX)) {
  5606. WARN_ON_ONCE(!(flags & __GFP_NOWARN));
  5607. return NULL;
  5608. }
  5609. /*
  5610. * For non-blocking the VM_ALLOW_HUGE_VMAP is not used
  5611. * because the huge-mapping path in vmalloc contains at
  5612. * least one might_sleep() call.
  5613. *
  5614. * TODO: Revise huge-mapping path to support non-blocking
  5615. * flags.
  5616. */
  5617. allow_block = gfpflags_allow_blocking(flags);
  5618. /*
  5619. * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
  5620. * since the callers already cannot assume anything
  5621. * about the resulting pointer, and cannot play
  5622. * protection games.
  5623. */
  5624. return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
  5625. flags, PAGE_KERNEL, allow_block ? VM_ALLOW_HUGE_VMAP:0,
  5626. node, __builtin_return_address(0));
  5627. }
  5628. EXPORT_SYMBOL(__kvmalloc_node_noprof);
  5629. /**
  5630. * kvfree() - Free memory.
  5631. * @addr: Pointer to allocated memory.
  5632. *
  5633. * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
  5634. * It is slightly more efficient to use kfree() or vfree() if you are certain
  5635. * that you know which one to use.
  5636. *
  5637. * Context: Either preemptible task context or not-NMI interrupt.
  5638. */
  5639. void kvfree(const void *addr)
  5640. {
  5641. if (is_vmalloc_addr(addr))
  5642. vfree(addr);
  5643. else
  5644. kfree(addr);
  5645. }
  5646. EXPORT_SYMBOL(kvfree);
  5647. /**
  5648. * kvfree_sensitive - Free a data object containing sensitive information.
  5649. * @addr: address of the data object to be freed.
  5650. * @len: length of the data object.
  5651. *
  5652. * Use the special memzero_explicit() function to clear the content of a
  5653. * kvmalloc'ed object containing sensitive data to make sure that the
  5654. * compiler won't optimize out the data clearing.
  5655. */
  5656. void kvfree_sensitive(const void *addr, size_t len)
  5657. {
  5658. if (likely(!ZERO_OR_NULL_PTR(addr))) {
  5659. memzero_explicit((void *)addr, len);
  5660. kvfree(addr);
  5661. }
  5662. }
  5663. EXPORT_SYMBOL(kvfree_sensitive);
  5664. /**
  5665. * kvrealloc_node_align - reallocate memory; contents remain unchanged
  5666. * @p: object to reallocate memory for
  5667. * @size: the size to reallocate
  5668. * @align: desired alignment
  5669. * @flags: the flags for the page level allocator
  5670. * @nid: NUMA node id
  5671. *
  5672. * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
  5673. * and @p is not a %NULL pointer, the object pointed to is freed.
  5674. *
  5675. * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
  5676. * Documentation/core-api/memory-allocation.rst for more details.
  5677. *
  5678. * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
  5679. * initial memory allocation, every subsequent call to this API for the same
  5680. * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
  5681. * __GFP_ZERO is not fully honored by this API.
  5682. *
  5683. * In any case, the contents of the object pointed to are preserved up to the
  5684. * lesser of the new and old sizes.
  5685. *
  5686. * This function must not be called concurrently with itself or kvfree() for the
  5687. * same memory allocation.
  5688. *
  5689. * Return: pointer to the allocated memory or %NULL in case of error
  5690. */
  5691. void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
  5692. gfp_t flags, int nid)
  5693. {
  5694. void *n;
  5695. if (is_vmalloc_addr(p))
  5696. return vrealloc_node_align_noprof(p, size, align, flags, nid);
  5697. n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid);
  5698. if (!n) {
  5699. /* We failed to krealloc(), fall back to kvmalloc(). */
  5700. n = kvmalloc_node_align_noprof(size, align, flags, nid);
  5701. if (!n)
  5702. return NULL;
  5703. if (p) {
  5704. /* We already know that `p` is not a vmalloc address. */
  5705. kasan_disable_current();
  5706. memcpy(n, kasan_reset_tag(p), ksize(p));
  5707. kasan_enable_current();
  5708. kfree(p);
  5709. }
  5710. }
  5711. return n;
  5712. }
  5713. EXPORT_SYMBOL(kvrealloc_node_align_noprof);
/*
 * Result of build_detached_freelist(): a chain of objects from one slab,
 * linked through their free pointers, ready to be freed in one go.
 */
struct detached_freelist {
	/* Slab all chained objects belong to; NULL if there is nothing to free */
	struct slab *slab;
	/* Last object of the chain (its free pointer is NULL) */
	void *tail;
	/* First object of the chain */
	void *freelist;
	/* Number of objects in the chain */
	int cnt;
	/* Cache the objects are freed to */
	struct kmem_cache *s;
};
/*
 * This function progressively scans the array with free objects (with
 * a limited look ahead) and extracts objects belonging to the same
 * slab.  It builds a detached freelist directly within the given
 * slab/objects.  This can happen without any need for
 * synchronization, because the objects are owned by the running process.
 * The freelist is built up as a singly linked list in the objects.
 * The idea is that this detached freelist can then be bulk
 * transferred to the real freelist(s), but only requiring a single
 * synchronization primitive.  Look ahead in the array is limited due
 * to performance reasons.
 *
 * @s:    cache to free to, or NULL to derive it from each object
 * @size: number of entries in @p
 * @p:    array of objects; scanned from the end and reordered in place so
 *        that the objects put on the freelist end up at p[return value..]
 * @df:   filled in with the freelist; df->slab is NULL when the last object
 *        was a large kmalloc allocation that has been freed right away
 *
 * Return: the number of entries at the start of @p that are still to be
 * processed.
 */
static inline
int build_detached_freelist(struct kmem_cache *s, size_t size,
			    void **p, struct detached_freelist *df)
{
	int lookahead = 3;
	void *object;
	struct page *page;
	struct slab *slab;
	size_t same;

	/* The last array entry seeds the freelist */
	object = p[--size];
	page = virt_to_page(object);
	slab = page_slab(page);

	if (!s) {
		/* Handle kmalloc'ed objects */
		if (!slab) {
			free_large_kmalloc(page, object);
			df->slab = NULL;
			return size;
		}
		/* Derive kmem_cache from object */
		df->slab = slab;
		df->s = slab->slab_cache;
	} else {
		df->slab = slab;
		df->s = s;
	}

	/* Start new detached freelist */
	df->tail = object;
	df->freelist = object;
	df->cnt = 1;

	/* A kfence object is returned as a freelist of its own */
	if (is_kfence_address(object))
		return size;

	set_freepointer(df->s, object, NULL);

	/* p[same..] holds the objects already linked into the freelist */
	same = size;
	while (size) {
		object = p[--size];
		/* df->slab is always set at this point */
		if (df->slab == virt_to_slab(object)) {
			/* Opportunity to build the freelist */
			set_freepointer(df->s, object, df->freelist);
			df->freelist = object;
			df->cnt++;
			same--;
			/*
			 * Move the object next to the ones already taken and
			 * put the skipped object into its old slot.
			 */
			if (size != same)
				swap(p[size], p[same]);
			continue;
		}

		/* Limit look ahead search */
		if (!--lookahead)
			break;
	}

	return same;
}
  5786. /*
  5787. * Internal bulk free of objects that were not initialised by the post alloc
  5788. * hooks and thus should not be processed by the free hooks
  5789. */
  5790. static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
  5791. {
  5792. if (!size)
  5793. return;
  5794. do {
  5795. struct detached_freelist df;
  5796. size = build_detached_freelist(s, size, p, &df);
  5797. if (!df.slab)
  5798. continue;
  5799. if (kfence_free(df.freelist))
  5800. continue;
  5801. __slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt,
  5802. _RET_IP_);
  5803. } while (likely(size));
  5804. }
  5805. /* Note that interrupts must be enabled when calling this function. */
  5806. void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
  5807. {
  5808. if (!size)
  5809. return;
  5810. /*
  5811. * freeing to sheaves is so incompatible with the detached freelist so
  5812. * once we go that way, we have to do everything differently
  5813. */
  5814. if (s && cache_has_sheaves(s)) {
  5815. free_to_pcs_bulk(s, size, p);
  5816. return;
  5817. }
  5818. do {
  5819. struct detached_freelist df;
  5820. size = build_detached_freelist(s, size, p, &df);
  5821. if (!df.slab)
  5822. continue;
  5823. slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size],
  5824. df.cnt, _RET_IP_);
  5825. } while (likely(size));
  5826. }
  5827. EXPORT_SYMBOL(kmem_cache_free_bulk);
  5828. static unsigned int
  5829. __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
  5830. unsigned int max, struct kmem_cache_node *n,
  5831. bool allow_spin)
  5832. {
  5833. struct partial_bulk_context pc;
  5834. struct slab *slab, *slab2;
  5835. unsigned int refilled = 0;
  5836. unsigned long flags;
  5837. void *object;
  5838. pc.flags = gfp;
  5839. pc.min_objects = min;
  5840. pc.max_objects = max;
  5841. if (!get_partial_node_bulk(s, n, &pc, allow_spin))
  5842. return 0;
  5843. list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
  5844. list_del(&slab->slab_list);
  5845. object = get_freelist_nofreeze(s, slab);
  5846. while (object && refilled < max) {
  5847. p[refilled] = object;
  5848. object = get_freepointer(s, object);
  5849. maybe_wipe_obj_freeptr(s, p[refilled]);
  5850. refilled++;
  5851. }
  5852. /*
  5853. * Freelist had more objects than we can accommodate, we need to
  5854. * free them back. We can treat it like a detached freelist, just
  5855. * need to find the tail object.
  5856. */
  5857. if (unlikely(object)) {
  5858. void *head = object;
  5859. void *tail;
  5860. int cnt = 0;
  5861. do {
  5862. tail = object;
  5863. cnt++;
  5864. object = get_freepointer(s, object);
  5865. } while (object);
  5866. __slab_free(s, slab, head, tail, cnt, _RET_IP_);
  5867. }
  5868. if (refilled >= max)
  5869. break;
  5870. }
  5871. if (unlikely(!list_empty(&pc.slabs))) {
  5872. spin_lock_irqsave(&n->list_lock, flags);
  5873. list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
  5874. if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
  5875. continue;
  5876. list_del(&slab->slab_list);
  5877. add_partial(n, slab, ADD_TO_HEAD);
  5878. }
  5879. spin_unlock_irqrestore(&n->list_lock, flags);
  5880. /* any slabs left are completely free and for discard */
  5881. list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
  5882. list_del(&slab->slab_list);
  5883. discard_slab(s, slab);
  5884. }
  5885. }
  5886. return refilled;
  5887. }
  5888. #ifdef CONFIG_NUMA
  5889. static unsigned int
  5890. __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
  5891. unsigned int max)
  5892. {
  5893. struct zonelist *zonelist;
  5894. struct zoneref *z;
  5895. struct zone *zone;
  5896. enum zone_type highest_zoneidx = gfp_zone(gfp);
  5897. unsigned int cpuset_mems_cookie;
  5898. unsigned int refilled = 0;
  5899. /* see get_from_any_partial() for the defrag ratio description */
  5900. if (!s->remote_node_defrag_ratio ||
  5901. get_cycles() % 1024 > s->remote_node_defrag_ratio)
  5902. return 0;
  5903. do {
  5904. cpuset_mems_cookie = read_mems_allowed_begin();
  5905. zonelist = node_zonelist(mempolicy_slab_node(), gfp);
  5906. for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
  5907. struct kmem_cache_node *n;
  5908. unsigned int r;
  5909. n = get_node(s, zone_to_nid(zone));
  5910. if (!n || !cpuset_zone_allowed(zone, gfp) ||
  5911. n->nr_partial <= s->min_partial)
  5912. continue;
  5913. r = __refill_objects_node(s, p, gfp, min, max, n,
  5914. /* allow_spin = */ false);
  5915. refilled += r;
  5916. if (r >= min) {
  5917. /*
  5918. * Don't check read_mems_allowed_retry() here -
  5919. * if mems_allowed was updated in parallel, that
  5920. * was a harmless race between allocation and
  5921. * the cpuset update
  5922. */
  5923. return refilled;
  5924. }
  5925. p += r;
  5926. min -= r;
  5927. max -= r;
  5928. }
  5929. } while (read_mems_allowed_retry(cpuset_mems_cookie));
  5930. return refilled;
  5931. }
  5932. #else
  5933. static inline unsigned int
  5934. __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
  5935. unsigned int max)
  5936. {
  5937. return 0;
  5938. }
  5939. #endif
  5940. static unsigned int
  5941. refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
  5942. unsigned int max)
  5943. {
  5944. int local_node = numa_mem_id();
  5945. unsigned int refilled;
  5946. struct slab *slab;
  5947. if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
  5948. return 0;
  5949. refilled = __refill_objects_node(s, p, gfp, min, max,
  5950. get_node(s, local_node),
  5951. /* allow_spin = */ true);
  5952. if (refilled >= min)
  5953. return refilled;
  5954. refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled,
  5955. max - refilled);
  5956. if (refilled >= min)
  5957. return refilled;
  5958. new_slab:
  5959. slab = new_slab(s, gfp, local_node);
  5960. if (!slab)
  5961. goto out;
  5962. stat(s, ALLOC_SLAB);
  5963. /*
  5964. * TODO: possible optimization - if we know we will consume the whole
  5965. * slab we might skip creating the freelist?
  5966. */
  5967. refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
  5968. /* allow_spin = */ true);
  5969. if (refilled < min)
  5970. goto new_slab;
  5971. out:
  5972. return refilled;
  5973. }
  5974. static inline
  5975. int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
  5976. void **p)
  5977. {
  5978. int i;
  5979. if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
  5980. for (i = 0; i < size; i++) {
  5981. p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_,
  5982. s->object_size);
  5983. if (unlikely(!p[i]))
  5984. goto error;
  5985. maybe_wipe_obj_freeptr(s, p[i]);
  5986. }
  5987. } else {
  5988. i = refill_objects(s, p, flags, size, size);
  5989. if (i < size)
  5990. goto error;
  5991. stat_add(s, ALLOC_SLOWPATH, i);
  5992. }
  5993. return i;
  5994. error:
  5995. __kmem_cache_free_bulk(s, i, p);
  5996. return 0;
  5997. }
  5998. /*
  5999. * Note that interrupts must be enabled when calling this function and gfp
  6000. * flags must allow spinning.
  6001. */
  6002. int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
  6003. void **p)
  6004. {
  6005. unsigned int i = 0;
  6006. void *kfence_obj;
  6007. if (!size)
  6008. return 0;
  6009. s = slab_pre_alloc_hook(s, flags);
  6010. if (unlikely(!s))
  6011. return 0;
  6012. /*
  6013. * to make things simpler, only assume at most once kfence allocated
  6014. * object per bulk allocation and choose its index randomly
  6015. */
  6016. kfence_obj = kfence_alloc(s, s->object_size, flags);
  6017. if (unlikely(kfence_obj)) {
  6018. if (unlikely(size == 1)) {
  6019. p[0] = kfence_obj;
  6020. goto out;
  6021. }
  6022. size--;
  6023. }
  6024. i = alloc_from_pcs_bulk(s, flags, size, p);
  6025. if (i < size) {
  6026. /*
  6027. * If we ran out of memory, don't bother with freeing back to
  6028. * the percpu sheaves, we have bigger problems.
  6029. */
  6030. if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) {
  6031. if (i > 0)
  6032. __kmem_cache_free_bulk(s, i, p);
  6033. if (kfence_obj)
  6034. __kfence_free(kfence_obj);
  6035. return 0;
  6036. }
  6037. }
  6038. if (unlikely(kfence_obj)) {
  6039. int idx = get_random_u32_below(size + 1);
  6040. if (idx != size)
  6041. p[size] = p[idx];
  6042. p[idx] = kfence_obj;
  6043. size++;
  6044. }
  6045. out:
  6046. /*
  6047. * memcg and kmem_cache debug support and memory initialization.
  6048. * Done outside of the IRQ disabled fastpath loop.
  6049. */
  6050. if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p,
  6051. slab_want_init_on_alloc(flags, s), s->object_size))) {
  6052. return 0;
  6053. }
  6054. return size;
  6055. }
  6056. EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
  6057. /*
  6058. * Object placement in a slab is made very easy because we always start at
  6059. * offset 0. If we tune the size of the object to the alignment then we can
  6060. * get the required alignment by putting one properly sized object after
  6061. * another.
  6062. *
  6063. * Notice that the allocation order determines the sizes of the per cpu
  6064. * caches. Each processor has always one slab available for allocations.
  6065. * Increasing the allocation order reduces the number of times that slabs
  6066. * must be moved on and off the partial lists and is therefore a factor in
  6067. * locking overhead.
  6068. */
/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 */
/* Lower bound for the slab order; settable via slab_min_order / slub_min_order. */
static unsigned int slub_min_order;
/* Upper bound for the order search; defaults to 1 with CONFIG_SLUB_TINY. */
static unsigned int slub_max_order =
	IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
/*
 * Desired minimum number of objects per slab. When left at 0,
 * calculate_order() derives a value from the number of CPUs.
 */
static unsigned int slub_min_objects;
  6079. /*
  6080. * Calculate the order of allocation given an slab object size.
  6081. *
  6082. * The order of allocation has significant impact on performance and other
  6083. * system components. Generally order 0 allocations should be preferred since
  6084. * order 0 does not cause fragmentation in the page allocator. Larger objects
  6085. * be problematic to put into order 0 slabs because there may be too much
  6086. * unused space left. We go to a higher order if more than 1/16th of the slab
  6087. * would be wasted.
  6088. *
  6089. * In order to reach satisfactory performance we must ensure that a minimum
  6090. * number of objects is in one slab. Otherwise we may generate too much
  6091. * activity on the partial lists which requires taking the list_lock. This is
  6092. * less a concern for large slabs though which are rarely used.
  6093. *
  6094. * slab_max_order specifies the order where we begin to stop considering the
  6095. * number of objects in a slab as critical. If we reach slab_max_order then
  6096. * we try to keep the page order as low as possible. So we accept more waste
  6097. * of space in favor of a small page order.
  6098. *
  6099. * Higher order allocations also allow the placement of more objects in a
  6100. * slab and thereby reduce object handling overhead. If the user has
  6101. * requested a higher minimum order then we start with that one instead of
  6102. * the smallest order which will fit the object.
  6103. */
  6104. static inline unsigned int calc_slab_order(unsigned int size,
  6105. unsigned int min_order, unsigned int max_order,
  6106. unsigned int fract_leftover)
  6107. {
  6108. unsigned int order;
  6109. for (order = min_order; order <= max_order; order++) {
  6110. unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
  6111. unsigned int rem;
  6112. rem = slab_size % size;
  6113. if (rem <= slab_size / fract_leftover)
  6114. break;
  6115. }
  6116. return order;
  6117. }
  6118. static inline int calculate_order(unsigned int size)
  6119. {
  6120. unsigned int order;
  6121. unsigned int min_objects;
  6122. unsigned int max_objects;
  6123. unsigned int min_order;
  6124. min_objects = slub_min_objects;
  6125. if (!min_objects) {
  6126. /*
  6127. * Some architectures will only update present cpus when
  6128. * onlining them, so don't trust the number if it's just 1. But
  6129. * we also don't want to use nr_cpu_ids always, as on some other
  6130. * architectures, there can be many possible cpus, but never
  6131. * onlined. Here we compromise between trying to avoid too high
  6132. * order on systems that appear larger than they are, and too
  6133. * low order on systems that appear smaller than they are.
  6134. */
  6135. unsigned int nr_cpus = num_present_cpus();
  6136. if (nr_cpus <= 1)
  6137. nr_cpus = nr_cpu_ids;
  6138. min_objects = 4 * (fls(nr_cpus) + 1);
  6139. }
  6140. /* min_objects can't be 0 because get_order(0) is undefined */
  6141. max_objects = max(order_objects(slub_max_order, size), 1U);
  6142. min_objects = min(min_objects, max_objects);
  6143. min_order = max_t(unsigned int, slub_min_order,
  6144. get_order(min_objects * size));
  6145. if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
  6146. return get_order(size * MAX_OBJS_PER_PAGE) - 1;
  6147. /*
  6148. * Attempt to find best configuration for a slab. This works by first
  6149. * attempting to generate a layout with the best possible configuration
  6150. * and backing off gradually.
  6151. *
  6152. * We start with accepting at most 1/16 waste and try to find the
  6153. * smallest order from min_objects-derived/slab_min_order up to
  6154. * slab_max_order that will satisfy the constraint. Note that increasing
  6155. * the order can only result in same or less fractional waste, not more.
  6156. *
  6157. * If that fails, we increase the acceptable fraction of waste and try
  6158. * again. The last iteration with fraction of 1/2 would effectively
  6159. * accept any waste and give us the order determined by min_objects, as
  6160. * long as at least single object fits within slab_max_order.
  6161. */
  6162. for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
  6163. order = calc_slab_order(size, min_order, slub_max_order,
  6164. fraction);
  6165. if (order <= slub_max_order)
  6166. return order;
  6167. }
  6168. /*
  6169. * Doh this slab cannot be placed using slab_max_order.
  6170. */
  6171. order = get_order(size);
  6172. if (order <= MAX_PAGE_ORDER)
  6173. return order;
  6174. return -ENOSYS;
  6175. }
  6176. static void
  6177. init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn)
  6178. {
  6179. n->nr_partial = 0;
  6180. spin_lock_init(&n->list_lock);
  6181. INIT_LIST_HEAD(&n->partial);
  6182. #ifdef CONFIG_SLUB_DEBUG
  6183. atomic_long_set(&n->nr_slabs, 0);
  6184. atomic_long_set(&n->total_objects, 0);
  6185. INIT_LIST_HEAD(&n->full);
  6186. #endif
  6187. n->barn = barn;
  6188. if (barn)
  6189. barn_init(barn);
  6190. }
  6191. #ifdef CONFIG_SLUB_STATS
  6192. static inline int alloc_kmem_cache_stats(struct kmem_cache *s)
  6193. {
  6194. BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
  6195. NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
  6196. sizeof(struct kmem_cache_stats));
  6197. s->cpu_stats = alloc_percpu(struct kmem_cache_stats);
  6198. if (!s->cpu_stats)
  6199. return 0;
  6200. return 1;
  6201. }
  6202. #endif
  6203. static int init_percpu_sheaves(struct kmem_cache *s)
  6204. {
  6205. static struct slab_sheaf bootstrap_sheaf = {};
  6206. int cpu;
  6207. for_each_possible_cpu(cpu) {
  6208. struct slub_percpu_sheaves *pcs;
  6209. pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
  6210. local_trylock_init(&pcs->lock);
  6211. /*
  6212. * Bootstrap sheaf has zero size so fast-path allocation fails.
  6213. * It has also size == s->sheaf_capacity, so fast-path free
  6214. * fails. In the slow paths we recognize the situation by
  6215. * checking s->sheaf_capacity. This allows fast paths to assume
  6216. * s->cpu_sheaves and pcs->main always exists and are valid.
  6217. * It's also safe to share the single static bootstrap_sheaf
  6218. * with zero-sized objects array as it's never modified.
  6219. *
  6220. * Bootstrap_sheaf also has NULL pointer to kmem_cache so we
  6221. * recognize it and not attempt to free it when destroying the
  6222. * cache.
  6223. *
  6224. * We keep bootstrap_sheaf for kmem_cache and kmem_cache_node,
  6225. * caches with debug enabled, and all caches with SLUB_TINY.
  6226. * For kmalloc caches it's used temporarily during the initial
  6227. * bootstrap.
  6228. */
  6229. if (!s->sheaf_capacity)
  6230. pcs->main = &bootstrap_sheaf;
  6231. else
  6232. pcs->main = alloc_empty_sheaf(s, GFP_KERNEL);
  6233. if (!pcs->main)
  6234. return -ENOMEM;
  6235. }
  6236. return 0;
  6237. }
/* Cache that struct kmem_cache_node objects are allocated from and freed to. */
static struct kmem_cache *kmem_cache_node;
  6239. /*
  6240. * No kmalloc_node yet so do it by hand. We know that this is the first
  6241. * slab on the node for this slabcache. There are no concurrent accesses
  6242. * possible.
  6243. *
  6244. * Note that this function only works on the kmem_cache_node
  6245. * when allocating for the kmem_cache_node. This is used for bootstrapping
  6246. * memory on a fresh node that has no slab structures yet.
  6247. */
  6248. static void early_kmem_cache_node_alloc(int node)
  6249. {
  6250. struct slab *slab;
  6251. struct kmem_cache_node *n;
  6252. BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
  6253. slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
  6254. BUG_ON(!slab);
  6255. if (slab_nid(slab) != node) {
  6256. pr_err("SLUB: Unable to allocate memory from node %d\n", node);
  6257. pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
  6258. }
  6259. n = slab->freelist;
  6260. BUG_ON(!n);
  6261. #ifdef CONFIG_SLUB_DEBUG
  6262. init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
  6263. #endif
  6264. n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
  6265. slab->freelist = get_freepointer(kmem_cache_node, n);
  6266. slab->inuse = 1;
  6267. kmem_cache_node->node[node] = n;
  6268. init_kmem_cache_node(n, NULL);
  6269. inc_slabs_node(kmem_cache_node, node, slab->objects);
  6270. /*
  6271. * No locks need to be taken here as it has just been
  6272. * initialized and there is no concurrent access.
  6273. */
  6274. __add_partial(n, slab, ADD_TO_HEAD);
  6275. }
  6276. static void free_kmem_cache_nodes(struct kmem_cache *s)
  6277. {
  6278. int node;
  6279. struct kmem_cache_node *n;
  6280. for_each_kmem_cache_node(s, node, n) {
  6281. if (n->barn) {
  6282. WARN_ON(n->barn->nr_full);
  6283. WARN_ON(n->barn->nr_empty);
  6284. kfree(n->barn);
  6285. n->barn = NULL;
  6286. }
  6287. s->node[node] = NULL;
  6288. kmem_cache_free(kmem_cache_node, n);
  6289. }
  6290. }
/*
 * Release the auxiliary structures of cache @s: the random sequence
 * (cache_random_seq_destroy()), whatever pcs_destroy() tears down
 * (presumably the percpu sheaves - confirm against its definition), the
 * per-cpu statistics when CONFIG_SLUB_STATS is set, and finally the per-node
 * structures.
 */
void __kmem_cache_release(struct kmem_cache *s)
{
	cache_random_seq_destroy(s);
	pcs_destroy(s);
#ifdef CONFIG_SLUB_STATS
	free_percpu(s->cpu_stats);
#endif
	free_kmem_cache_nodes(s);
}
  6300. static int init_kmem_cache_nodes(struct kmem_cache *s)
  6301. {
  6302. int node;
  6303. for_each_node_mask(node, slab_nodes) {
  6304. struct kmem_cache_node *n;
  6305. struct node_barn *barn = NULL;
  6306. if (slab_state == DOWN) {
  6307. early_kmem_cache_node_alloc(node);
  6308. continue;
  6309. }
  6310. if (cache_has_sheaves(s)) {
  6311. barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
  6312. if (!barn)
  6313. return 0;
  6314. }
  6315. n = kmem_cache_alloc_node(kmem_cache_node,
  6316. GFP_KERNEL, node);
  6317. if (!n) {
  6318. kfree(barn);
  6319. return 0;
  6320. }
  6321. init_kmem_cache_node(n, barn);
  6322. s->node[node] = n;
  6323. }
  6324. return 1;
  6325. }
  6326. static unsigned int calculate_sheaf_capacity(struct kmem_cache *s,
  6327. struct kmem_cache_args *args)
  6328. {
  6329. unsigned int capacity;
  6330. size_t size;
  6331. if (IS_ENABLED(CONFIG_SLUB_TINY) || s->flags & SLAB_DEBUG_FLAGS)
  6332. return 0;
  6333. /*
  6334. * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT).
  6335. * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not
  6336. * have sheaves to avoid recursion when sheaf allocation triggers
  6337. * kmemleak tracking.
  6338. */
  6339. if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
  6340. return 0;
  6341. /*
  6342. * For now we use roughly similar formula (divided by two as there are
  6343. * two percpu sheaves) as what was used for percpu partial slabs, which
  6344. * should result in similar lock contention (barn or list_lock)
  6345. */
  6346. if (s->size >= PAGE_SIZE)
  6347. capacity = 4;
  6348. else if (s->size >= 1024)
  6349. capacity = 12;
  6350. else if (s->size >= 256)
  6351. capacity = 26;
  6352. else
  6353. capacity = 60;
  6354. /* Increment capacity to make sheaf exactly a kmalloc size bucket */
  6355. size = struct_size_t(struct slab_sheaf, objects, capacity);
  6356. size = kmalloc_size_roundup(size);
  6357. capacity = (size - struct_size_t(struct slab_sheaf, objects, 0)) / sizeof(void *);
  6358. /*
  6359. * Respect an explicit request for capacity that's typically motivated by
  6360. * expected maximum size of kmem_cache_prefill_sheaf() to not end up
  6361. * using low-performance oversize sheaves
  6362. */
  6363. return max(capacity, args->sheaf_capacity);
  6364. }
/*
 * calculate_sizes() determines the order and the distribution of data within
 * a slab object.
 *
 * Starting from s->object_size it works out s->inuse, s->offset (location of
 * the free pointer), s->red_left_pad, s->size, s->reciprocal_size,
 * s->allocflags, s->sheaf_capacity, s->oo and s->min, and may adjust s->flags.
 *
 * Returns 0 when calculate_order() fails or when no object fits into a slab
 * of the chosen order, non-zero otherwise.
 */
static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
{
	/*
	 * Snapshot of the flags on entry: updates made to s->flags further
	 * down are not reflected in this local copy.
	 */
	slab_flags_t flags = s->flags;
	unsigned int size = s->object_size;
	unsigned int aligned_size;
	unsigned int order;

	/*
	 * Round up object size to the next word boundary. We can only
	 * place the free pointer at word boundaries and this determines
	 * the possible location of the free pointer.
	 */
	size = ALIGN(size, sizeof(void *));

#ifdef CONFIG_SLUB_DEBUG
	/*
	 * Determine if we can poison the object itself. If the user of
	 * the slab may touch the object after free or before allocation
	 * then we should never poison the object itself.
	 */
	if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
			!s->ctor)
		s->flags |= __OBJECT_POISON;
	else
		s->flags &= ~__OBJECT_POISON;

	/*
	 * If we are Redzoning and there is no space between the end of the
	 * object and the following fields, add one word so the right Redzone
	 * is non-empty.
	 */
	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
		size += sizeof(void *);
#endif

	/*
	 * With that we have determined the number of bytes in actual use
	 * by the object and redzoning.
	 */
	s->inuse = size;

	if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) ||
	    (flags & SLAB_POISON) ||
	    (s->ctor && !args->use_freeptr_offset) ||
	    ((flags & SLAB_RED_ZONE) &&
	     (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
		/*
		 * Relocate free pointer after the object if it is not
		 * permitted to overwrite the first word of the object on
		 * kmem_cache_free.
		 *
		 * This is the case if we do RCU, have a constructor, are
		 * poisoning the objects, or are redzoning an object smaller
		 * than sizeof(void *) or are redzoning an object with
		 * slub_debug_orig_size() enabled, in which case the right
		 * redzone may be extended.
		 *
		 * The assumption that s->offset >= s->inuse means free
		 * pointer is outside of the object is used in the
		 * freeptr_outside_object() function. If that is no
		 * longer true, the function needs to be modified.
		 */
		s->offset = size;
		size += sizeof(void *);
	} else if (((flags & SLAB_TYPESAFE_BY_RCU) || s->ctor) &&
			args->use_freeptr_offset) {
		/* The cache creator supplied the free pointer offset. */
		s->offset = args->freeptr_offset;
	} else {
		/*
		 * Store freelist pointer near middle of object to keep
		 * it away from the edges of the object to avoid small
		 * sized over/underflows from neighboring allocations.
		 */
		s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
	}

#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_STORE_USER) {
		/*
		 * Need to store information about allocs and frees after
		 * the object.
		 */
		size += 2 * sizeof(struct track);

		/* Save the original kmalloc request size */
		if (flags & SLAB_KMALLOC)
			size += sizeof(unsigned long);
	}
#endif

	/* KASAN is handed pointers to both the size and s->flags here. */
	kasan_cache_create(s, &size, &s->flags);
#ifdef CONFIG_SLUB_DEBUG
	if (flags & SLAB_RED_ZONE) {
		/*
		 * Add some empty padding so that we can catch
		 * overwrites from earlier objects rather than let
		 * tracking information or the free pointer be
		 * corrupted if a user writes before the start
		 * of the object.
		 */
		size += sizeof(void *);

		/* Left red zone: one word, rounded up to the cache alignment. */
		s->red_left_pad = sizeof(void *);
		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
		size += s->red_left_pad;
	}
#endif

	/*
	 * SLUB stores one object immediately after another beginning from
	 * offset 0. In order to align the objects we have to simply size
	 * each object to conform to the alignment.
	 */
	aligned_size = ALIGN(size, s->align);
#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
	/*
	 * Flag the cache when it is unmergeable and the alignment padding is
	 * at least sizeof(struct slabobj_ext).
	 */
	if (slab_args_unmergeable(args, s->flags) &&
			(aligned_size - size >= sizeof(struct slabobj_ext)))
		s->flags |= SLAB_OBJ_EXT_IN_OBJ;
#endif
	size = aligned_size;

	s->size = size;
	s->reciprocal_size = reciprocal_value(size);
	order = calculate_order(size);

	/* calculate_order() returns a negative errno on failure. */
	if ((int)order < 0)
		return 0;

	s->allocflags = __GFP_COMP;

	if (s->flags & SLAB_CACHE_DMA)
		s->allocflags |= GFP_DMA;

	if (s->flags & SLAB_CACHE_DMA32)
		s->allocflags |= GFP_DMA32;

	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		s->allocflags |= __GFP_RECLAIMABLE;

	/*
	 * For KMALLOC_NORMAL caches we enable sheaves later by
	 * bootstrap_kmalloc_sheaves() to avoid recursion
	 */
	if (!is_kmalloc_normal(s))
		s->sheaf_capacity = calculate_sheaf_capacity(s, args);

	/*
	 * Determine the number of objects per slab
	 */
	s->oo = oo_make(order, size);
	/* s->min uses the smallest order that can hold an object of this size */
	s->min = oo_make(get_order(size), size);

	return !!oo_objects(s->oo);
}
  6504. static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
  6505. {
  6506. #ifdef CONFIG_SLUB_DEBUG
  6507. void *addr = slab_address(slab);
  6508. void *p;
  6509. if (!slab_add_kunit_errors())
  6510. slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
  6511. spin_lock(&object_map_lock);
  6512. __fill_map(object_map, s, slab);
  6513. for_each_object(p, s, addr, slab->objects) {
  6514. if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
  6515. if (slab_add_kunit_errors())
  6516. continue;
  6517. pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
  6518. print_tracking(s, p);
  6519. }
  6520. }
  6521. spin_unlock(&object_map_lock);
  6522. __slab_err(slab);
  6523. #endif
  6524. }
  6525. /*
  6526. * Attempt to free all partial slabs on a node.
  6527. * This is called from __kmem_cache_shutdown(). We must take list_lock
  6528. * because sysfs file might still access partial list after the shutdowning.
  6529. */
  6530. static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
  6531. {
  6532. LIST_HEAD(discard);
  6533. struct slab *slab, *h;
  6534. BUG_ON(irqs_disabled());
  6535. spin_lock_irq(&n->list_lock);
  6536. list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
  6537. if (!slab->inuse) {
  6538. remove_partial(n, slab);
  6539. list_add(&slab->slab_list, &discard);
  6540. } else {
  6541. list_slab_objects(s, slab);
  6542. }
  6543. }
  6544. spin_unlock_irq(&n->list_lock);
  6545. list_for_each_entry_safe(slab, h, &discard, slab_list)
  6546. discard_slab(s, slab);
  6547. }
  6548. bool __kmem_cache_empty(struct kmem_cache *s)
  6549. {
  6550. int node;
  6551. struct kmem_cache_node *n;
  6552. for_each_kmem_cache_node(s, node, n)
  6553. if (n->nr_partial || node_nr_slabs(n))
  6554. return false;
  6555. return true;
  6556. }
  6557. /*
  6558. * Release all resources used by a slab cache.
  6559. */
  6560. int __kmem_cache_shutdown(struct kmem_cache *s)
  6561. {
  6562. int node;
  6563. struct kmem_cache_node *n;
  6564. flush_all_cpus_locked(s);
  6565. /* we might have rcu sheaves in flight */
  6566. if (cache_has_sheaves(s))
  6567. rcu_barrier();
  6568. /* Attempt to free all objects */
  6569. for_each_kmem_cache_node(s, node, n) {
  6570. if (n->barn)
  6571. barn_shrink(s, n->barn);
  6572. free_partial(s, n);
  6573. if (n->nr_partial || node_nr_slabs(n))
  6574. return 1;
  6575. }
  6576. return 0;
  6577. }
#ifdef CONFIG_PRINTK
/*
 * Fill @kpp with what is known about @object, which lives in @slab: the
 * pointer itself, the slab and its cache, the start of the containing object
 * slot and the offset of the data within it.
 *
 * If the cache has SLAB_STORE_USER set (and CONFIG_SLUB_DEBUG is enabled) the
 * recorded allocation return address is added, and with CONFIG_STACKDEPOT also
 * the allocation and free stack traces.
 */
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
	void *base;
	int __maybe_unused i;
	unsigned int objnr;
	void *objp;
	void *objp0;
	struct kmem_cache *s = slab->slab_cache;
	struct track __maybe_unused *trackp;

	kpp->kp_ptr = object;
	kpp->kp_slab = slab;
	kpp->kp_slab_cache = s;
	base = slab_address(slab);
	objp0 = kasan_reset_tag(object);
#ifdef CONFIG_SLUB_DEBUG
	objp = restore_red_left(s, objp0);
#else
	objp = objp0;
#endif
	objnr = obj_to_index(s, slab, objp);
	/* distance between the untagged pointer and the red-zone adjusted one */
	kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
	/* recompute the slot start from the object index */
	objp = base + s->size * objnr;
	kpp->kp_objp = objp;
	/*
	 * Stop here if the slot start is outside the slab or not a multiple of
	 * s->size from its base (warning once), or if the cache keeps no user
	 * tracking data.
	 */
	if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
			 || (objp - base) % s->size) ||
	    !(s->flags & SLAB_STORE_USER))
		return;
#ifdef CONFIG_SLUB_DEBUG
	objp = fixup_red_left(s, objp);
	trackp = get_track(s, objp, TRACK_ALLOC);
	kpp->kp_ret = (void *)trackp->addr;
#ifdef CONFIG_STACKDEPOT
	{
		depot_stack_handle_t handle;
		unsigned long *entries;
		unsigned int nr_entries;

		/* allocation stack, copying at most KS_ADDRS_COUNT entries */
		handle = READ_ONCE(trackp->handle);
		if (handle) {
			nr_entries = stack_depot_fetch(handle, &entries);
			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
				kpp->kp_stack[i] = (void *)entries[i];
		}

		/* free stack, same limit */
		trackp = get_track(s, objp, TRACK_FREE);
		handle = READ_ONCE(trackp->handle);
		if (handle) {
			nr_entries = stack_depot_fetch(handle, &entries);
			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
				kpp->kp_free_stack[i] = (void *)entries[i];
		}
	}
#endif
#endif
}
#endif
  6633. /********************************************************************
  6634. * Kmalloc subsystem
  6635. *******************************************************************/
  6636. static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp)
  6637. {
  6638. int ret;
  6639. ret = kstrtouint(str, 0, &slub_min_order);
  6640. if (ret)
  6641. return ret;
  6642. if (slub_min_order > slub_max_order)
  6643. slub_max_order = slub_min_order;
  6644. return 0;
  6645. }
  6646. static const struct kernel_param_ops param_ops_slab_min_order __initconst = {
  6647. .set = setup_slub_min_order,
  6648. };
  6649. __core_param_cb(slab_min_order, &param_ops_slab_min_order, &slub_min_order, 0);
  6650. __core_param_cb(slub_min_order, &param_ops_slab_min_order, &slub_min_order, 0);
  6651. static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp)
  6652. {
  6653. int ret;
  6654. ret = kstrtouint(str, 0, &slub_max_order);
  6655. if (ret)
  6656. return ret;
  6657. slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER);
  6658. if (slub_min_order > slub_max_order)
  6659. slub_min_order = slub_max_order;
  6660. return 0;
  6661. }
  6662. static const struct kernel_param_ops param_ops_slab_max_order __initconst = {
  6663. .set = setup_slub_max_order,
  6664. };
  6665. __core_param_cb(slab_max_order, &param_ops_slab_max_order, &slub_max_order, 0);
  6666. __core_param_cb(slub_max_order, &param_ops_slab_max_order, &slub_max_order, 0);
/* Both the slab_ and the slub_ spelling set the same slub_min_objects variable. */
core_param(slab_min_objects, slub_min_objects, uint, 0);
core_param(slub_min_objects, slub_min_objects, uint, 0);
  6669. #ifdef CONFIG_NUMA
  6670. static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp)
  6671. {
  6672. if (nr_node_ids > 1) {
  6673. static_branch_enable(&strict_numa);
  6674. pr_info("SLUB: Strict NUMA enabled.\n");
  6675. } else {
  6676. pr_warn("slab_strict_numa parameter set on non NUMA system.\n");
  6677. }
  6678. return 0;
  6679. }
  6680. static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = {
  6681. .flags = KERNEL_PARAM_OPS_FL_NOARG,
  6682. .set = setup_slab_strict_numa,
  6683. };
  6684. __core_param_cb(slab_strict_numa, &param_ops_slab_strict_numa, NULL, 0);
  6685. #endif
  6686. #ifdef CONFIG_HARDENED_USERCOPY
  6687. /*
  6688. * Rejects incorrectly sized objects and objects that are to be copied
  6689. * to/from userspace but do not fall entirely within the containing slab
  6690. * cache's usercopy region.
  6691. *
  6692. * Returns NULL if check passes, otherwise const char * to name of cache
  6693. * to indicate an error.
  6694. */
  6695. void __check_heap_object(const void *ptr, unsigned long n,
  6696. const struct slab *slab, bool to_user)
  6697. {
  6698. struct kmem_cache *s;
  6699. unsigned int offset;
  6700. bool is_kfence = is_kfence_address(ptr);
  6701. ptr = kasan_reset_tag(ptr);
  6702. /* Find object and usable object size. */
  6703. s = slab->slab_cache;
  6704. /* Reject impossible pointers. */
  6705. if (ptr < slab_address(slab))
  6706. usercopy_abort("SLUB object not in SLUB page?!", NULL,
  6707. to_user, 0, n);
  6708. /* Find offset within object. */
  6709. if (is_kfence)
  6710. offset = ptr - kfence_object_start(ptr);
  6711. else
  6712. offset = (ptr - slab_address(slab)) % s->size;
  6713. /* Adjust for redzone and reject if within the redzone. */
  6714. if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
  6715. if (offset < s->red_left_pad)
  6716. usercopy_abort("SLUB object in left red zone",
  6717. s->name, to_user, offset, n);
  6718. offset -= s->red_left_pad;
  6719. }
  6720. /* Allow address range falling entirely within usercopy region. */
  6721. if (offset >= s->useroffset &&
  6722. offset - s->useroffset <= s->usersize &&
  6723. n <= s->useroffset - offset + s->usersize)
  6724. return;
  6725. usercopy_abort("SLUB object", s->name, to_user, offset, n);
  6726. }
  6727. #endif /* CONFIG_HARDENED_USERCOPY */
  6728. #define SHRINK_PROMOTE_MAX 32
  6729. /*
  6730. * kmem_cache_shrink discards empty slabs and promotes the slabs filled
  6731. * up most to the head of the partial lists. New allocations will then
  6732. * fill those up and thus they can be removed from the partial lists.
  6733. *
  6734. * The slabs with the least items are placed last. This results in them
  6735. * being allocated from last increasing the chance that the last objects
  6736. * are freed in them.
  6737. */
  6738. static int __kmem_cache_do_shrink(struct kmem_cache *s)
  6739. {
  6740. int node;
  6741. int i;
  6742. struct kmem_cache_node *n;
  6743. struct slab *slab;
  6744. struct slab *t;
  6745. struct list_head discard;
  6746. struct list_head promote[SHRINK_PROMOTE_MAX];
  6747. unsigned long flags;
  6748. int ret = 0;
  6749. for_each_kmem_cache_node(s, node, n) {
  6750. INIT_LIST_HEAD(&discard);
  6751. for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
  6752. INIT_LIST_HEAD(promote + i);
  6753. if (n->barn)
  6754. barn_shrink(s, n->barn);
  6755. spin_lock_irqsave(&n->list_lock, flags);
  6756. /*
  6757. * Build lists of slabs to discard or promote.
  6758. *
  6759. * Note that concurrent frees may occur while we hold the
  6760. * list_lock. slab->inuse here is the upper limit.
  6761. */
  6762. list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
  6763. int free = slab->objects - slab->inuse;
  6764. /* Do not reread slab->inuse */
  6765. barrier();
  6766. /* We do not keep full slabs on the list */
  6767. BUG_ON(free <= 0);
  6768. if (free == slab->objects) {
  6769. list_move(&slab->slab_list, &discard);
  6770. slab_clear_node_partial(slab);
  6771. n->nr_partial--;
  6772. dec_slabs_node(s, node, slab->objects);
  6773. } else if (free <= SHRINK_PROMOTE_MAX)
  6774. list_move(&slab->slab_list, promote + free - 1);
  6775. }
  6776. /*
  6777. * Promote the slabs filled up most to the head of the
  6778. * partial list.
  6779. */
  6780. for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
  6781. list_splice(promote + i, &n->partial);
  6782. spin_unlock_irqrestore(&n->list_lock, flags);
  6783. /* Release empty slabs */
  6784. list_for_each_entry_safe(slab, t, &discard, slab_list)
  6785. free_slab(s, slab);
  6786. if (node_nr_slabs(n))
  6787. ret = 1;
  6788. }
  6789. return ret;
  6790. }
  /*
   * Flush the cache with flush_all() and then shrink its per-node partial
   * lists. Returns the result of __kmem_cache_do_shrink().
   */
  int __kmem_cache_shrink(struct kmem_cache *s)
  {
  	int ret;

  	flush_all(s);
  	ret = __kmem_cache_do_shrink(s);

  	return ret;
  }
  6796. static int slab_mem_going_offline_callback(void)
  6797. {
  6798. struct kmem_cache *s;
  6799. mutex_lock(&slab_mutex);
  6800. list_for_each_entry(s, &slab_caches, list) {
  6801. flush_all_cpus_locked(s);
  6802. __kmem_cache_do_shrink(s);
  6803. }
  6804. mutex_unlock(&slab_mutex);
  6805. return 0;
  6806. }
  6807. static int slab_mem_going_online_callback(int nid)
  6808. {
  6809. struct kmem_cache_node *n;
  6810. struct kmem_cache *s;
  6811. int ret = 0;
  6812. /*
  6813. * We are bringing a node online. No memory is available yet. We must
  6814. * allocate a kmem_cache_node structure in order to bring the node
  6815. * online.
  6816. */
  6817. mutex_lock(&slab_mutex);
  6818. list_for_each_entry(s, &slab_caches, list) {
  6819. struct node_barn *barn = NULL;
  6820. /*
  6821. * The structure may already exist if the node was previously
  6822. * onlined and offlined.
  6823. */
  6824. if (get_node(s, nid))
  6825. continue;
  6826. if (cache_has_sheaves(s)) {
  6827. barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
  6828. if (!barn) {
  6829. ret = -ENOMEM;
  6830. goto out;
  6831. }
  6832. }
  6833. /*
  6834. * XXX: kmem_cache_alloc_node will fallback to other nodes
  6835. * since memory is not yet available from the node that
  6836. * is brought up.
  6837. */
  6838. n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
  6839. if (!n) {
  6840. kfree(barn);
  6841. ret = -ENOMEM;
  6842. goto out;
  6843. }
  6844. init_kmem_cache_node(n, barn);
  6845. s->node[nid] = n;
  6846. }
  6847. /*
  6848. * Any cache created after this point will also have kmem_cache_node
  6849. * initialized for the new node.
  6850. */
  6851. node_set(nid, slab_nodes);
  6852. out:
  6853. mutex_unlock(&slab_mutex);
  6854. return ret;
  6855. }
  6856. static int slab_memory_callback(struct notifier_block *self,
  6857. unsigned long action, void *arg)
  6858. {
  6859. struct node_notify *nn = arg;
  6860. int nid = nn->nid;
  6861. int ret = 0;
  6862. switch (action) {
  6863. case NODE_ADDING_FIRST_MEMORY:
  6864. ret = slab_mem_going_online_callback(nid);
  6865. break;
  6866. case NODE_REMOVING_LAST_MEMORY:
  6867. ret = slab_mem_going_offline_callback();
  6868. break;
  6869. }
  6870. if (ret)
  6871. ret = notifier_from_errno(ret);
  6872. else
  6873. ret = NOTIFY_OK;
  6874. return ret;
  6875. }
  6876. /********************************************************************
  6877. * Basic setup of slabs
  6878. *******************************************************************/
  6879. /*
  6880. * Used for early kmem_cache structures that were allocated using
  6881. * the page allocator. Allocate them properly then fix up the pointers
  6882. * that may be pointing to the wrong kmem_cache structure.
  6883. */
  6884. static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
  6885. {
  6886. int node;
  6887. struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
  6888. struct kmem_cache_node *n;
  6889. memcpy(s, static_cache, kmem_cache->object_size);
  6890. for_each_kmem_cache_node(s, node, n) {
  6891. struct slab *p;
  6892. list_for_each_entry(p, &n->partial, slab_list)
  6893. p->slab_cache = s;
  6894. #ifdef CONFIG_SLUB_DEBUG
  6895. list_for_each_entry(p, &n->full, slab_list)
  6896. p->slab_cache = s;
  6897. #endif
  6898. }
  6899. list_add(&s->list, &slab_caches);
  6900. return s;
  6901. }
  6902. /*
  6903. * Finish the sheaves initialization done normally by init_percpu_sheaves() and
  6904. * init_kmem_cache_nodes(). For normal kmalloc caches we have to bootstrap it
  6905. * since sheaves and barns are allocated by kmalloc.
  6906. */
  6907. static void __init bootstrap_cache_sheaves(struct kmem_cache *s)
  6908. {
  6909. struct kmem_cache_args empty_args = {};
  6910. unsigned int capacity;
  6911. bool failed = false;
  6912. int node, cpu;
  6913. capacity = calculate_sheaf_capacity(s, &empty_args);
  6914. /* capacity can be 0 due to debugging or SLUB_TINY */
  6915. if (!capacity)
  6916. return;
  6917. for_each_node_mask(node, slab_nodes) {
  6918. struct node_barn *barn;
  6919. barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
  6920. if (!barn) {
  6921. failed = true;
  6922. goto out;
  6923. }
  6924. barn_init(barn);
  6925. get_node(s, node)->barn = barn;
  6926. }
  6927. for_each_possible_cpu(cpu) {
  6928. struct slub_percpu_sheaves *pcs;
  6929. pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
  6930. pcs->main = __alloc_empty_sheaf(s, GFP_KERNEL, capacity);
  6931. if (!pcs->main) {
  6932. failed = true;
  6933. break;
  6934. }
  6935. }
  6936. out:
  6937. /*
  6938. * It's still early in boot so treat this like same as a failure to
  6939. * create the kmalloc cache in the first place
  6940. */
  6941. if (failed)
  6942. panic("Out of memory when creating kmem_cache %s\n", s->name);
  6943. s->sheaf_capacity = capacity;
  6944. }
  6945. static void __init bootstrap_kmalloc_sheaves(void)
  6946. {
  6947. enum kmalloc_cache_type type;
  6948. for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) {
  6949. for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) {
  6950. if (kmalloc_caches[type][idx])
  6951. bootstrap_cache_sheaves(kmalloc_caches[type][idx]);
  6952. }
  6953. }
  6954. }
  6955. void __init kmem_cache_init(void)
  6956. {
  6957. static __initdata struct kmem_cache boot_kmem_cache,
  6958. boot_kmem_cache_node;
  6959. int node;
  6960. if (debug_guardpage_minorder())
  6961. slub_max_order = 0;
  6962. /* Inform pointer hashing choice about slub debugging state. */
  6963. hash_pointers_finalize(__slub_debug_enabled());
  6964. kmem_cache_node = &boot_kmem_cache_node;
  6965. kmem_cache = &boot_kmem_cache;
  6966. /*
  6967. * Initialize the nodemask for which we will allocate per node
  6968. * structures. Here we don't need taking slab_mutex yet.
  6969. */
  6970. for_each_node_state(node, N_MEMORY)
  6971. node_set(node, slab_nodes);
  6972. create_boot_cache(kmem_cache_node, "kmem_cache_node",
  6973. sizeof(struct kmem_cache_node),
  6974. SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
  6975. hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
  6976. /* Able to allocate the per node structures */
  6977. slab_state = PARTIAL;
  6978. create_boot_cache(kmem_cache, "kmem_cache",
  6979. offsetof(struct kmem_cache, node) +
  6980. nr_node_ids * sizeof(struct kmem_cache_node *),
  6981. SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
  6982. kmem_cache = bootstrap(&boot_kmem_cache);
  6983. kmem_cache_node = bootstrap(&boot_kmem_cache_node);
  6984. /* Now we can use the kmem_cache to allocate kmalloc slabs */
  6985. setup_kmalloc_cache_index_table();
  6986. create_kmalloc_caches();
  6987. bootstrap_kmalloc_sheaves();
  6988. /* Setup random freelists for each cache */
  6989. init_freelist_randomization();
  6990. cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
  6991. slub_cpu_dead);
  6992. pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
  6993. cache_line_size(),
  6994. slub_min_order, slub_max_order, slub_min_objects,
  6995. nr_cpu_ids, nr_node_ids);
  6996. }
  6997. void __init kmem_cache_init_late(void)
  6998. {
  6999. flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM | WQ_PERCPU,
  7000. 0);
  7001. WARN_ON(!flushwq);
  7002. #ifdef CONFIG_SLAB_FREELIST_RANDOM
  7003. prandom_init_once(&slab_rnd_state);
  7004. #endif
  7005. }
  7006. int do_kmem_cache_create(struct kmem_cache *s, const char *name,
  7007. unsigned int size, struct kmem_cache_args *args,
  7008. slab_flags_t flags)
  7009. {
  7010. int err = -EINVAL;
  7011. s->name = name;
  7012. s->size = s->object_size = size;
  7013. s->flags = kmem_cache_flags(flags, s->name);
  7014. #ifdef CONFIG_SLAB_FREELIST_HARDENED
  7015. s->random = get_random_long();
  7016. #endif
  7017. s->align = args->align;
  7018. s->ctor = args->ctor;
  7019. #ifdef CONFIG_HARDENED_USERCOPY
  7020. s->useroffset = args->useroffset;
  7021. s->usersize = args->usersize;
  7022. #endif
  7023. if (!calculate_sizes(args, s))
  7024. goto out;
  7025. if (disable_higher_order_debug) {
  7026. /*
  7027. * Disable debugging flags that store metadata if the min slab
  7028. * order increased.
  7029. */
  7030. if (get_order(s->size) > get_order(s->object_size)) {
  7031. s->flags &= ~DEBUG_METADATA_FLAGS;
  7032. s->offset = 0;
  7033. if (!calculate_sizes(args, s))
  7034. goto out;
  7035. }
  7036. }
  7037. #ifdef system_has_freelist_aba
  7038. if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
  7039. /* Enable fast mode */
  7040. s->flags |= __CMPXCHG_DOUBLE;
  7041. }
  7042. #endif
  7043. /*
  7044. * The larger the object size is, the more slabs we want on the partial
  7045. * list to avoid pounding the page allocator excessively.
  7046. */
  7047. s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
  7048. s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
  7049. s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
  7050. if (!s->cpu_sheaves) {
  7051. err = -ENOMEM;
  7052. goto out;
  7053. }
  7054. #ifdef CONFIG_NUMA
  7055. s->remote_node_defrag_ratio = 1000;
  7056. #endif
  7057. /* Initialize the pre-computed randomized freelist if slab is up */
  7058. if (slab_state >= UP) {
  7059. if (init_cache_random_seq(s))
  7060. goto out;
  7061. }
  7062. if (!init_kmem_cache_nodes(s))
  7063. goto out;
  7064. #ifdef CONFIG_SLUB_STATS
  7065. if (!alloc_kmem_cache_stats(s))
  7066. goto out;
  7067. #endif
  7068. err = init_percpu_sheaves(s);
  7069. if (err)
  7070. goto out;
  7071. err = 0;
  7072. /* Mutex is not taken during early boot */
  7073. if (slab_state <= UP)
  7074. goto out;
  7075. /*
  7076. * Failing to create sysfs files is not critical to SLUB functionality.
  7077. * If it fails, proceed with cache creation without these files.
  7078. */
  7079. if (sysfs_slab_add(s))
  7080. pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name);
  7081. if (s->flags & SLAB_STORE_USER)
  7082. debugfs_slab_add(s);
  7083. out:
  7084. if (err)
  7085. __kmem_cache_release(s);
  7086. return err;
  7087. }
  7088. #ifdef SLAB_SUPPORTS_SYSFS
  7089. static int count_inuse(struct slab *slab)
  7090. {
  7091. return slab->inuse;
  7092. }
  7093. static int count_total(struct slab *slab)
  7094. {
  7095. return slab->objects;
  7096. }
  7097. #endif
  7098. #ifdef CONFIG_SLUB_DEBUG
  7099. static void validate_slab(struct kmem_cache *s, struct slab *slab,
  7100. unsigned long *obj_map)
  7101. {
  7102. void *p;
  7103. void *addr = slab_address(slab);
  7104. if (!validate_slab_ptr(slab)) {
  7105. slab_err(s, slab, "Not a valid slab page");
  7106. return;
  7107. }
  7108. if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
  7109. return;
  7110. /* Now we know that a valid freelist exists */
  7111. __fill_map(obj_map, s, slab);
  7112. for_each_object(p, s, addr, slab->objects) {
  7113. u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
  7114. SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
  7115. if (!check_object(s, slab, p, val))
  7116. break;
  7117. }
  7118. }
  7119. static int validate_slab_node(struct kmem_cache *s,
  7120. struct kmem_cache_node *n, unsigned long *obj_map)
  7121. {
  7122. unsigned long count = 0;
  7123. struct slab *slab;
  7124. unsigned long flags;
  7125. spin_lock_irqsave(&n->list_lock, flags);
  7126. list_for_each_entry(slab, &n->partial, slab_list) {
  7127. validate_slab(s, slab, obj_map);
  7128. count++;
  7129. }
  7130. if (count != n->nr_partial) {
  7131. pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
  7132. s->name, count, n->nr_partial);
  7133. slab_add_kunit_errors();
  7134. }
  7135. if (!(s->flags & SLAB_STORE_USER))
  7136. goto out;
  7137. list_for_each_entry(slab, &n->full, slab_list) {
  7138. validate_slab(s, slab, obj_map);
  7139. count++;
  7140. }
  7141. if (count != node_nr_slabs(n)) {
  7142. pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
  7143. s->name, count, node_nr_slabs(n));
  7144. slab_add_kunit_errors();
  7145. }
  7146. out:
  7147. spin_unlock_irqrestore(&n->list_lock, flags);
  7148. return count;
  7149. }
  7150. long validate_slab_cache(struct kmem_cache *s)
  7151. {
  7152. int node;
  7153. unsigned long count = 0;
  7154. struct kmem_cache_node *n;
  7155. unsigned long *obj_map;
  7156. obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
  7157. if (!obj_map)
  7158. return -ENOMEM;
  7159. flush_all(s);
  7160. for_each_kmem_cache_node(s, node, n)
  7161. count += validate_slab_node(s, n, obj_map);
  7162. bitmap_free(obj_map);
  7163. return count;
  7164. }
  7165. EXPORT_SYMBOL(validate_slab_cache);
  7166. #ifdef CONFIG_DEBUG_FS
  7167. /*
  7168. * Generate lists of code addresses where slabcache objects are allocated
  7169. * and freed.
  7170. */
  7171. struct location {
  7172. depot_stack_handle_t handle;
  7173. unsigned long count;
  7174. unsigned long addr;
  7175. unsigned long waste;
  7176. long long sum_time;
  7177. long min_time;
  7178. long max_time;
  7179. long min_pid;
  7180. long max_pid;
  7181. DECLARE_BITMAP(cpus, NR_CPUS);
  7182. nodemask_t nodes;
  7183. };
  7184. struct loc_track {
  7185. unsigned long max;
  7186. unsigned long count;
  7187. struct location *loc;
  7188. loff_t idx;
  7189. };
  7190. static struct dentry *slab_debugfs_root;
  7191. static void free_loc_track(struct loc_track *t)
  7192. {
  7193. if (t->max)
  7194. free_pages((unsigned long)t->loc,
  7195. get_order(sizeof(struct location) * t->max));
  7196. }
  7197. static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
  7198. {
  7199. struct location *l;
  7200. int order;
  7201. order = get_order(sizeof(struct location) * max);
  7202. l = (void *)__get_free_pages(flags, order);
  7203. if (!l)
  7204. return 0;
  7205. if (t->count) {
  7206. memcpy(l, t->loc, sizeof(struct location) * t->count);
  7207. free_loc_track(t);
  7208. }
  7209. t->max = max;
  7210. t->loc = l;
  7211. return 1;
  7212. }
  7213. static int add_location(struct loc_track *t, struct kmem_cache *s,
  7214. const struct track *track,
  7215. unsigned int orig_size)
  7216. {
  7217. long start, end, pos;
  7218. struct location *l;
  7219. unsigned long caddr, chandle, cwaste;
  7220. unsigned long age = jiffies - track->when;
  7221. depot_stack_handle_t handle = 0;
  7222. unsigned int waste = s->object_size - orig_size;
  7223. #ifdef CONFIG_STACKDEPOT
  7224. handle = READ_ONCE(track->handle);
  7225. #endif
  7226. start = -1;
  7227. end = t->count;
  7228. for ( ; ; ) {
  7229. pos = start + (end - start + 1) / 2;
  7230. /*
  7231. * There is nothing at "end". If we end up there
  7232. * we need to add something to before end.
  7233. */
  7234. if (pos == end)
  7235. break;
  7236. l = &t->loc[pos];
  7237. caddr = l->addr;
  7238. chandle = l->handle;
  7239. cwaste = l->waste;
  7240. if ((track->addr == caddr) && (handle == chandle) &&
  7241. (waste == cwaste)) {
  7242. l->count++;
  7243. if (track->when) {
  7244. l->sum_time += age;
  7245. if (age < l->min_time)
  7246. l->min_time = age;
  7247. if (age > l->max_time)
  7248. l->max_time = age;
  7249. if (track->pid < l->min_pid)
  7250. l->min_pid = track->pid;
  7251. if (track->pid > l->max_pid)
  7252. l->max_pid = track->pid;
  7253. cpumask_set_cpu(track->cpu,
  7254. to_cpumask(l->cpus));
  7255. }
  7256. node_set(page_to_nid(virt_to_page(track)), l->nodes);
  7257. return 1;
  7258. }
  7259. if (track->addr < caddr)
  7260. end = pos;
  7261. else if (track->addr == caddr && handle < chandle)
  7262. end = pos;
  7263. else if (track->addr == caddr && handle == chandle &&
  7264. waste < cwaste)
  7265. end = pos;
  7266. else
  7267. start = pos;
  7268. }
  7269. /*
  7270. * Not found. Insert new tracking element.
  7271. */
  7272. if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
  7273. return 0;
  7274. l = t->loc + pos;
  7275. if (pos < t->count)
  7276. memmove(l + 1, l,
  7277. (t->count - pos) * sizeof(struct location));
  7278. t->count++;
  7279. l->count = 1;
  7280. l->addr = track->addr;
  7281. l->sum_time = age;
  7282. l->min_time = age;
  7283. l->max_time = age;
  7284. l->min_pid = track->pid;
  7285. l->max_pid = track->pid;
  7286. l->handle = handle;
  7287. l->waste = waste;
  7288. cpumask_clear(to_cpumask(l->cpus));
  7289. cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
  7290. nodes_clear(l->nodes);
  7291. node_set(page_to_nid(virt_to_page(track)), l->nodes);
  7292. return 1;
  7293. }
  7294. static void process_slab(struct loc_track *t, struct kmem_cache *s,
  7295. struct slab *slab, enum track_item alloc,
  7296. unsigned long *obj_map)
  7297. {
  7298. void *addr = slab_address(slab);
  7299. bool is_alloc = (alloc == TRACK_ALLOC);
  7300. void *p;
  7301. __fill_map(obj_map, s, slab);
  7302. for_each_object(p, s, addr, slab->objects)
  7303. if (!test_bit(__obj_to_index(s, addr, p), obj_map))
  7304. add_location(t, s, get_track(s, p, alloc),
  7305. is_alloc ? get_orig_size(s, p) :
  7306. s->object_size);
  7307. }
  7308. #endif /* CONFIG_DEBUG_FS */
  7309. #endif /* CONFIG_SLUB_DEBUG */
  7310. #ifdef SLAB_SUPPORTS_SYSFS
  /* Selector bits (by position) understood by show_slab_objects(). */
  enum slab_stat_type {
  	SL_ALL,			/* All slabs */
  	SL_PARTIAL,		/* Only partially allocated slabs */
  	SL_CPU,			/* Only slabs used for cpu caches */
  	SL_OBJECTS,		/* Determine allocated objects not slabs */
  	SL_TOTAL		/* Determine object capacity not slabs */
  };

  /* Flag masks derived from the enum positions above. */
  #define SO_ALL		(1 << SL_ALL)
  #define SO_PARTIAL	(1 << SL_PARTIAL)
  #define SO_CPU		(1 << SL_CPU)
  #define SO_OBJECTS	(1 << SL_OBJECTS)
  #define SO_TOTAL	(1 << SL_TOTAL)
  7323. static ssize_t show_slab_objects(struct kmem_cache *s,
  7324. char *buf, unsigned long flags)
  7325. {
  7326. unsigned long total = 0;
  7327. int node;
  7328. int x;
  7329. unsigned long *nodes;
  7330. int len = 0;
  7331. nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
  7332. if (!nodes)
  7333. return -ENOMEM;
  7334. /*
  7335. * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
  7336. * already held which will conflict with an existing lock order:
  7337. *
  7338. * mem_hotplug_lock->slab_mutex->kernfs_mutex
  7339. *
  7340. * We don't really need mem_hotplug_lock (to hold off
  7341. * slab_mem_going_offline_callback) here because slab's memory hot
  7342. * unplug code doesn't destroy the kmem_cache->node[] data.
  7343. */
  7344. #ifdef CONFIG_SLUB_DEBUG
  7345. if (flags & SO_ALL) {
  7346. struct kmem_cache_node *n;
  7347. for_each_kmem_cache_node(s, node, n) {
  7348. if (flags & SO_TOTAL)
  7349. x = node_nr_objs(n);
  7350. else if (flags & SO_OBJECTS)
  7351. x = node_nr_objs(n) - count_partial(n, count_free);
  7352. else
  7353. x = node_nr_slabs(n);
  7354. total += x;
  7355. nodes[node] += x;
  7356. }
  7357. } else
  7358. #endif
  7359. if (flags & SO_PARTIAL) {
  7360. struct kmem_cache_node *n;
  7361. for_each_kmem_cache_node(s, node, n) {
  7362. if (flags & SO_TOTAL)
  7363. x = count_partial(n, count_total);
  7364. else if (flags & SO_OBJECTS)
  7365. x = count_partial(n, count_inuse);
  7366. else
  7367. x = n->nr_partial;
  7368. total += x;
  7369. nodes[node] += x;
  7370. }
  7371. }
  7372. len += sysfs_emit_at(buf, len, "%lu", total);
  7373. #ifdef CONFIG_NUMA
  7374. for (node = 0; node < nr_node_ids; node++) {
  7375. if (nodes[node])
  7376. len += sysfs_emit_at(buf, len, " N%d=%lu",
  7377. node, nodes[node]);
  7378. }
  7379. #endif
  7380. len += sysfs_emit_at(buf, len, "\n");
  7381. kfree(nodes);
  7382. return len;
  7383. }
  7384. #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
  7385. #define to_slab(n) container_of(n, struct kmem_cache, kobj)
  7386. struct slab_attribute {
  7387. struct attribute attr;
  7388. ssize_t (*show)(struct kmem_cache *s, char *buf);
  7389. ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
  7390. };
  7391. #define SLAB_ATTR_RO(_name) \
  7392. static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)
  7393. #define SLAB_ATTR(_name) \
  7394. static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
  7395. static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
  7396. {
  7397. return sysfs_emit(buf, "%u\n", s->size);
  7398. }
  7399. SLAB_ATTR_RO(slab_size);
  7400. static ssize_t align_show(struct kmem_cache *s, char *buf)
  7401. {
  7402. return sysfs_emit(buf, "%u\n", s->align);
  7403. }
  7404. SLAB_ATTR_RO(align);
  7405. static ssize_t object_size_show(struct kmem_cache *s, char *buf)
  7406. {
  7407. return sysfs_emit(buf, "%u\n", s->object_size);
  7408. }
  7409. SLAB_ATTR_RO(object_size);
  7410. static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
  7411. {
  7412. return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
  7413. }
  7414. SLAB_ATTR_RO(objs_per_slab);
  7415. static ssize_t order_show(struct kmem_cache *s, char *buf)
  7416. {
  7417. return sysfs_emit(buf, "%u\n", oo_order(s->oo));
  7418. }
  7419. SLAB_ATTR_RO(order);
  7420. static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf)
  7421. {
  7422. return sysfs_emit(buf, "%u\n", s->sheaf_capacity);
  7423. }
  7424. SLAB_ATTR_RO(sheaf_capacity);
  7425. static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
  7426. {
  7427. return sysfs_emit(buf, "%lu\n", s->min_partial);
  7428. }
  7429. static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
  7430. size_t length)
  7431. {
  7432. unsigned long min;
  7433. int err;
  7434. err = kstrtoul(buf, 10, &min);
  7435. if (err)
  7436. return err;
  7437. s->min_partial = min;
  7438. return length;
  7439. }
  7440. SLAB_ATTR(min_partial);
  7441. static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
  7442. {
  7443. return sysfs_emit(buf, "0\n");
  7444. }
  7445. static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
  7446. size_t length)
  7447. {
  7448. unsigned int objects;
  7449. int err;
  7450. err = kstrtouint(buf, 10, &objects);
  7451. if (err)
  7452. return err;
  7453. if (objects)
  7454. return -EINVAL;
  7455. return length;
  7456. }
  7457. SLAB_ATTR(cpu_partial);
  7458. static ssize_t ctor_show(struct kmem_cache *s, char *buf)
  7459. {
  7460. if (!s->ctor)
  7461. return 0;
  7462. return sysfs_emit(buf, "%pS\n", s->ctor);
  7463. }
  7464. SLAB_ATTR_RO(ctor);
  7465. static ssize_t aliases_show(struct kmem_cache *s, char *buf)
  7466. {
  7467. return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
  7468. }
  7469. SLAB_ATTR_RO(aliases);
  7470. static ssize_t partial_show(struct kmem_cache *s, char *buf)
  7471. {
  7472. return show_slab_objects(s, buf, SO_PARTIAL);
  7473. }
  7474. SLAB_ATTR_RO(partial);
  7475. static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
  7476. {
  7477. return show_slab_objects(s, buf, SO_CPU);
  7478. }
  7479. SLAB_ATTR_RO(cpu_slabs);
  7480. static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
  7481. {
  7482. return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
  7483. }
  7484. SLAB_ATTR_RO(objects_partial);
  7485. static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
  7486. {
  7487. return sysfs_emit(buf, "0(0)\n");
  7488. }
  7489. SLAB_ATTR_RO(slabs_cpu_partial);
  7490. static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
  7491. {
  7492. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
  7493. }
  7494. SLAB_ATTR_RO(reclaim_account);
  7495. static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
  7496. {
  7497. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
  7498. }
  7499. SLAB_ATTR_RO(hwcache_align);
  7500. #ifdef CONFIG_ZONE_DMA
  7501. static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
  7502. {
  7503. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
  7504. }
  7505. SLAB_ATTR_RO(cache_dma);
  7506. #endif
  7507. #ifdef CONFIG_HARDENED_USERCOPY
  7508. static ssize_t usersize_show(struct kmem_cache *s, char *buf)
  7509. {
  7510. return sysfs_emit(buf, "%u\n", s->usersize);
  7511. }
  7512. SLAB_ATTR_RO(usersize);
  7513. #endif
  7514. static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
  7515. {
  7516. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
  7517. }
  7518. SLAB_ATTR_RO(destroy_by_rcu);
  7519. #ifdef CONFIG_SLUB_DEBUG
  7520. static ssize_t slabs_show(struct kmem_cache *s, char *buf)
  7521. {
  7522. return show_slab_objects(s, buf, SO_ALL);
  7523. }
  7524. SLAB_ATTR_RO(slabs);
  7525. static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
  7526. {
  7527. return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
  7528. }
  7529. SLAB_ATTR_RO(total_objects);
  7530. static ssize_t objects_show(struct kmem_cache *s, char *buf)
  7531. {
  7532. return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
  7533. }
  7534. SLAB_ATTR_RO(objects);
  7535. static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
  7536. {
  7537. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
  7538. }
  7539. SLAB_ATTR_RO(sanity_checks);
  7540. static ssize_t trace_show(struct kmem_cache *s, char *buf)
  7541. {
  7542. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
  7543. }
  7544. SLAB_ATTR_RO(trace);
  7545. static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
  7546. {
  7547. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
  7548. }
  7549. SLAB_ATTR_RO(red_zone);
  7550. static ssize_t poison_show(struct kmem_cache *s, char *buf)
  7551. {
  7552. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
  7553. }
  7554. SLAB_ATTR_RO(poison);
  7555. static ssize_t store_user_show(struct kmem_cache *s, char *buf)
  7556. {
  7557. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
  7558. }
  7559. SLAB_ATTR_RO(store_user);
  7560. static ssize_t validate_show(struct kmem_cache *s, char *buf)
  7561. {
  7562. return 0;
  7563. }
  7564. static ssize_t validate_store(struct kmem_cache *s,
  7565. const char *buf, size_t length)
  7566. {
  7567. int ret = -EINVAL;
  7568. if (buf[0] == '1' && kmem_cache_debug(s)) {
  7569. ret = validate_slab_cache(s);
  7570. if (ret >= 0)
  7571. ret = length;
  7572. }
  7573. return ret;
  7574. }
  7575. SLAB_ATTR(validate);
  7576. #endif /* CONFIG_SLUB_DEBUG */
  7577. #ifdef CONFIG_FAILSLAB
  7578. static ssize_t failslab_show(struct kmem_cache *s, char *buf)
  7579. {
  7580. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
  7581. }
  7582. static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
  7583. size_t length)
  7584. {
  7585. if (s->refcount > 1)
  7586. return -EINVAL;
  7587. if (buf[0] == '1')
  7588. WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
  7589. else
  7590. WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);
  7591. return length;
  7592. }
  7593. SLAB_ATTR(failslab);
  7594. #endif
  7595. static ssize_t shrink_show(struct kmem_cache *s, char *buf)
  7596. {
  7597. return 0;
  7598. }
  7599. static ssize_t shrink_store(struct kmem_cache *s,
  7600. const char *buf, size_t length)
  7601. {
  7602. if (buf[0] == '1')
  7603. kmem_cache_shrink(s);
  7604. else
  7605. return -EINVAL;
  7606. return length;
  7607. }
  7608. SLAB_ATTR(shrink);
  7609. #ifdef CONFIG_NUMA
  7610. static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
  7611. {
  7612. return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
  7613. }
  7614. static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
  7615. const char *buf, size_t length)
  7616. {
  7617. unsigned int ratio;
  7618. int err;
  7619. err = kstrtouint(buf, 10, &ratio);
  7620. if (err)
  7621. return err;
  7622. if (ratio > 100)
  7623. return -ERANGE;
  7624. s->remote_node_defrag_ratio = ratio * 10;
  7625. return length;
  7626. }
  7627. SLAB_ATTR(remote_node_defrag_ratio);
  7628. #endif
  7629. #ifdef CONFIG_SLUB_STATS
  7630. static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
  7631. {
  7632. unsigned long sum = 0;
  7633. int cpu;
  7634. int len = 0;
  7635. int *data = kmalloc_objs(int, nr_cpu_ids);
  7636. if (!data)
  7637. return -ENOMEM;
  7638. for_each_online_cpu(cpu) {
  7639. unsigned int x = per_cpu_ptr(s->cpu_stats, cpu)->stat[si];
  7640. data[cpu] = x;
  7641. sum += x;
  7642. }
  7643. len += sysfs_emit_at(buf, len, "%lu", sum);
  7644. #ifdef CONFIG_SMP
  7645. for_each_online_cpu(cpu) {
  7646. if (data[cpu])
  7647. len += sysfs_emit_at(buf, len, " C%d=%u",
  7648. cpu, data[cpu]);
  7649. }
  7650. #endif
  7651. kfree(data);
  7652. len += sysfs_emit_at(buf, len, "\n");
  7653. return len;
  7654. }
  7655. static void clear_stat(struct kmem_cache *s, enum stat_item si)
  7656. {
  7657. int cpu;
  7658. for_each_online_cpu(cpu)
  7659. per_cpu_ptr(s->cpu_stats, cpu)->stat[si] = 0;
  7660. }
  7661. #define STAT_ATTR(si, text) \
  7662. static ssize_t text##_show(struct kmem_cache *s, char *buf) \
  7663. { \
  7664. return show_stat(s, buf, si); \
  7665. } \
  7666. static ssize_t text##_store(struct kmem_cache *s, \
  7667. const char *buf, size_t length) \
  7668. { \
  7669. if (buf[0] != '0') \
  7670. return -EINVAL; \
  7671. clear_stat(s, si); \
  7672. return length; \
  7673. } \
  7674. SLAB_ATTR(text); \
  7675. STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
  7676. STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
  7677. STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
  7678. STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
  7679. STAT_ATTR(FREE_FASTPATH, free_fastpath);
  7680. STAT_ATTR(FREE_SLOWPATH, free_slowpath);
  7681. STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
  7682. STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
  7683. STAT_ATTR(ALLOC_SLAB, alloc_slab);
  7684. STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
  7685. STAT_ATTR(FREE_SLAB, free_slab);
  7686. STAT_ATTR(ORDER_FALLBACK, order_fallback);
  7687. STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
  7688. STAT_ATTR(SHEAF_FLUSH, sheaf_flush);
  7689. STAT_ATTR(SHEAF_REFILL, sheaf_refill);
  7690. STAT_ATTR(SHEAF_ALLOC, sheaf_alloc);
  7691. STAT_ATTR(SHEAF_FREE, sheaf_free);
  7692. STAT_ATTR(BARN_GET, barn_get);
  7693. STAT_ATTR(BARN_GET_FAIL, barn_get_fail);
  7694. STAT_ATTR(BARN_PUT, barn_put);
  7695. STAT_ATTR(BARN_PUT_FAIL, barn_put_fail);
  7696. STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast);
  7697. STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow);
  7698. STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize);
  7699. STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast);
  7700. STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow);
  7701. #endif /* CONFIG_SLUB_STATS */
  7702. #ifdef CONFIG_KFENCE
  7703. static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
  7704. {
  7705. return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
  7706. }
  7707. static ssize_t skip_kfence_store(struct kmem_cache *s,
  7708. const char *buf, size_t length)
  7709. {
  7710. int ret = length;
  7711. if (buf[0] == '0')
  7712. s->flags &= ~SLAB_SKIP_KFENCE;
  7713. else if (buf[0] == '1')
  7714. s->flags |= SLAB_SKIP_KFENCE;
  7715. else
  7716. ret = -EINVAL;
  7717. return ret;
  7718. }
  7719. SLAB_ATTR(skip_kfence);
  7720. #endif
  7721. static struct attribute *slab_attrs[] = {
  7722. &slab_size_attr.attr,
  7723. &object_size_attr.attr,
  7724. &objs_per_slab_attr.attr,
  7725. &order_attr.attr,
  7726. &sheaf_capacity_attr.attr,
  7727. &min_partial_attr.attr,
  7728. &cpu_partial_attr.attr,
  7729. &objects_partial_attr.attr,
  7730. &partial_attr.attr,
  7731. &cpu_slabs_attr.attr,
  7732. &ctor_attr.attr,
  7733. &aliases_attr.attr,
  7734. &align_attr.attr,
  7735. &hwcache_align_attr.attr,
  7736. &reclaim_account_attr.attr,
  7737. &destroy_by_rcu_attr.attr,
  7738. &shrink_attr.attr,
  7739. &slabs_cpu_partial_attr.attr,
  7740. #ifdef CONFIG_SLUB_DEBUG
  7741. &total_objects_attr.attr,
  7742. &objects_attr.attr,
  7743. &slabs_attr.attr,
  7744. &sanity_checks_attr.attr,
  7745. &trace_attr.attr,
  7746. &red_zone_attr.attr,
  7747. &poison_attr.attr,
  7748. &store_user_attr.attr,
  7749. &validate_attr.attr,
  7750. #endif
  7751. #ifdef CONFIG_ZONE_DMA
  7752. &cache_dma_attr.attr,
  7753. #endif
  7754. #ifdef CONFIG_NUMA
  7755. &remote_node_defrag_ratio_attr.attr,
  7756. #endif
  7757. #ifdef CONFIG_SLUB_STATS
  7758. &alloc_fastpath_attr.attr,
  7759. &alloc_slowpath_attr.attr,
  7760. &free_rcu_sheaf_attr.attr,
  7761. &free_rcu_sheaf_fail_attr.attr,
  7762. &free_fastpath_attr.attr,
  7763. &free_slowpath_attr.attr,
  7764. &free_add_partial_attr.attr,
  7765. &free_remove_partial_attr.attr,
  7766. &alloc_slab_attr.attr,
  7767. &alloc_node_mismatch_attr.attr,
  7768. &free_slab_attr.attr,
  7769. &order_fallback_attr.attr,
  7770. &cmpxchg_double_fail_attr.attr,
  7771. &sheaf_flush_attr.attr,
  7772. &sheaf_refill_attr.attr,
  7773. &sheaf_alloc_attr.attr,
  7774. &sheaf_free_attr.attr,
  7775. &barn_get_attr.attr,
  7776. &barn_get_fail_attr.attr,
  7777. &barn_put_attr.attr,
  7778. &barn_put_fail_attr.attr,
  7779. &sheaf_prefill_fast_attr.attr,
  7780. &sheaf_prefill_slow_attr.attr,
  7781. &sheaf_prefill_oversize_attr.attr,
  7782. &sheaf_return_fast_attr.attr,
  7783. &sheaf_return_slow_attr.attr,
  7784. #endif
  7785. #ifdef CONFIG_FAILSLAB
  7786. &failslab_attr.attr,
  7787. #endif
  7788. #ifdef CONFIG_HARDENED_USERCOPY
  7789. &usersize_attr.attr,
  7790. #endif
  7791. #ifdef CONFIG_KFENCE
  7792. &skip_kfence_attr.attr,
  7793. #endif
  7794. NULL
  7795. };
  7796. static const struct attribute_group slab_attr_group = {
  7797. .attrs = slab_attrs,
  7798. };
  7799. static ssize_t slab_attr_show(struct kobject *kobj,
  7800. struct attribute *attr,
  7801. char *buf)
  7802. {
  7803. struct slab_attribute *attribute;
  7804. struct kmem_cache *s;
  7805. attribute = to_slab_attr(attr);
  7806. s = to_slab(kobj);
  7807. if (!attribute->show)
  7808. return -EIO;
  7809. return attribute->show(s, buf);
  7810. }
  7811. static ssize_t slab_attr_store(struct kobject *kobj,
  7812. struct attribute *attr,
  7813. const char *buf, size_t len)
  7814. {
  7815. struct slab_attribute *attribute;
  7816. struct kmem_cache *s;
  7817. attribute = to_slab_attr(attr);
  7818. s = to_slab(kobj);
  7819. if (!attribute->store)
  7820. return -EIO;
  7821. return attribute->store(s, buf, len);
  7822. }
  7823. static void kmem_cache_release(struct kobject *k)
  7824. {
  7825. slab_kmem_cache_release(to_slab(k));
  7826. }
  7827. static const struct sysfs_ops slab_sysfs_ops = {
  7828. .show = slab_attr_show,
  7829. .store = slab_attr_store,
  7830. };
  7831. static const struct kobj_type slab_ktype = {
  7832. .sysfs_ops = &slab_sysfs_ops,
  7833. .release = kmem_cache_release,
  7834. };
  7835. static struct kset *slab_kset;
  7836. static inline struct kset *cache_kset(struct kmem_cache *s)
  7837. {
  7838. return slab_kset;
  7839. }
  7840. #define ID_STR_LENGTH 32
  7841. /* Create a unique string id for a slab cache:
  7842. *
  7843. * Format :[flags-]size
  7844. */
  7845. static char *create_unique_id(struct kmem_cache *s)
  7846. {
  7847. char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
  7848. char *p = name;
  7849. if (!name)
  7850. return ERR_PTR(-ENOMEM);
  7851. *p++ = ':';
  7852. /*
  7853. * First flags affecting slabcache operations. We will only
  7854. * get here for aliasable slabs so we do not need to support
  7855. * too many flags. The flags here must cover all flags that
  7856. * are matched during merging to guarantee that the id is
  7857. * unique.
  7858. */
  7859. if (s->flags & SLAB_CACHE_DMA)
  7860. *p++ = 'd';
  7861. if (s->flags & SLAB_CACHE_DMA32)
  7862. *p++ = 'D';
  7863. if (s->flags & SLAB_RECLAIM_ACCOUNT)
  7864. *p++ = 'a';
  7865. if (s->flags & SLAB_CONSISTENCY_CHECKS)
  7866. *p++ = 'F';
  7867. if (s->flags & SLAB_ACCOUNT)
  7868. *p++ = 'A';
  7869. if (p != name + 1)
  7870. *p++ = '-';
  7871. p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);
  7872. if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
  7873. kfree(name);
  7874. return ERR_PTR(-EINVAL);
  7875. }
  7876. kmsan_unpoison_memory(name, p - name);
  7877. return name;
  7878. }
  7879. static int sysfs_slab_add(struct kmem_cache *s)
  7880. {
  7881. int err;
  7882. const char *name;
  7883. struct kset *kset = cache_kset(s);
  7884. int unmergeable = slab_unmergeable(s);
  7885. if (!unmergeable && disable_higher_order_debug &&
  7886. (slub_debug & DEBUG_METADATA_FLAGS))
  7887. unmergeable = 1;
  7888. if (unmergeable) {
  7889. /*
  7890. * Slabcache can never be merged so we can use the name proper.
  7891. * This is typically the case for debug situations. In that
  7892. * case we can catch duplicate names easily.
  7893. */
  7894. sysfs_remove_link(&slab_kset->kobj, s->name);
  7895. name = s->name;
  7896. } else {
  7897. /*
  7898. * Create a unique name for the slab as a target
  7899. * for the symlinks.
  7900. */
  7901. name = create_unique_id(s);
  7902. if (IS_ERR(name))
  7903. return PTR_ERR(name);
  7904. }
  7905. s->kobj.kset = kset;
  7906. err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
  7907. if (err)
  7908. goto out;
  7909. err = sysfs_create_group(&s->kobj, &slab_attr_group);
  7910. if (err)
  7911. goto out_del_kobj;
  7912. if (!unmergeable) {
  7913. /* Setup first alias */
  7914. sysfs_slab_alias(s, s->name);
  7915. }
  7916. out:
  7917. if (!unmergeable)
  7918. kfree(name);
  7919. return err;
  7920. out_del_kobj:
  7921. kobject_del(&s->kobj);
  7922. goto out;
  7923. }
  7924. void sysfs_slab_unlink(struct kmem_cache *s)
  7925. {
  7926. if (s->kobj.state_in_sysfs)
  7927. kobject_del(&s->kobj);
  7928. }
  7929. void sysfs_slab_release(struct kmem_cache *s)
  7930. {
  7931. kobject_put(&s->kobj);
  7932. }
  7933. /*
  7934. * Need to buffer aliases during bootup until sysfs becomes
  7935. * available lest we lose that information.
  7936. */
  7937. struct saved_alias {
  7938. struct kmem_cache *s;
  7939. const char *name;
  7940. struct saved_alias *next;
  7941. };
  7942. static struct saved_alias *alias_list;
  7943. int sysfs_slab_alias(struct kmem_cache *s, const char *name)
  7944. {
  7945. struct saved_alias *al;
  7946. if (slab_state == FULL) {
  7947. /*
  7948. * If we have a leftover link then remove it.
  7949. */
  7950. sysfs_remove_link(&slab_kset->kobj, name);
  7951. /*
  7952. * The original cache may have failed to generate sysfs file.
  7953. * In that case, sysfs_create_link() returns -ENOENT and
  7954. * symbolic link creation is skipped.
  7955. */
  7956. return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
  7957. }
  7958. al = kmalloc_obj(struct saved_alias);
  7959. if (!al)
  7960. return -ENOMEM;
  7961. al->s = s;
  7962. al->name = name;
  7963. al->next = alias_list;
  7964. alias_list = al;
  7965. kmsan_unpoison_memory(al, sizeof(*al));
  7966. return 0;
  7967. }
  7968. static int __init slab_sysfs_init(void)
  7969. {
  7970. struct kmem_cache *s;
  7971. int err;
  7972. mutex_lock(&slab_mutex);
  7973. slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
  7974. if (!slab_kset) {
  7975. mutex_unlock(&slab_mutex);
  7976. pr_err("Cannot register slab subsystem.\n");
  7977. return -ENOMEM;
  7978. }
  7979. slab_state = FULL;
  7980. list_for_each_entry(s, &slab_caches, list) {
  7981. err = sysfs_slab_add(s);
  7982. if (err)
  7983. pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
  7984. s->name);
  7985. }
  7986. while (alias_list) {
  7987. struct saved_alias *al = alias_list;
  7988. alias_list = alias_list->next;
  7989. err = sysfs_slab_alias(al->s, al->name);
  7990. if (err)
  7991. pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
  7992. al->name);
  7993. kfree(al);
  7994. }
  7995. mutex_unlock(&slab_mutex);
  7996. return 0;
  7997. }
  7998. late_initcall(slab_sysfs_init);
  7999. #endif /* SLAB_SUPPORTS_SYSFS */
  8000. #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
  8001. static int slab_debugfs_show(struct seq_file *seq, void *v)
  8002. {
  8003. struct loc_track *t = seq->private;
  8004. struct location *l;
  8005. unsigned long idx;
  8006. idx = (unsigned long) t->idx;
  8007. if (idx < t->count) {
  8008. l = &t->loc[idx];
  8009. seq_printf(seq, "%7ld ", l->count);
  8010. if (l->addr)
  8011. seq_printf(seq, "%pS", (void *)l->addr);
  8012. else
  8013. seq_puts(seq, "<not-available>");
  8014. if (l->waste)
  8015. seq_printf(seq, " waste=%lu/%lu",
  8016. l->count * l->waste, l->waste);
  8017. if (l->sum_time != l->min_time) {
  8018. seq_printf(seq, " age=%ld/%llu/%ld",
  8019. l->min_time, div_u64(l->sum_time, l->count),
  8020. l->max_time);
  8021. } else
  8022. seq_printf(seq, " age=%ld", l->min_time);
  8023. if (l->min_pid != l->max_pid)
  8024. seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
  8025. else
  8026. seq_printf(seq, " pid=%ld",
  8027. l->min_pid);
  8028. if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
  8029. seq_printf(seq, " cpus=%*pbl",
  8030. cpumask_pr_args(to_cpumask(l->cpus)));
  8031. if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
  8032. seq_printf(seq, " nodes=%*pbl",
  8033. nodemask_pr_args(&l->nodes));
  8034. #ifdef CONFIG_STACKDEPOT
  8035. {
  8036. depot_stack_handle_t handle;
  8037. unsigned long *entries;
  8038. unsigned int nr_entries, j;
  8039. handle = READ_ONCE(l->handle);
  8040. if (handle) {
  8041. nr_entries = stack_depot_fetch(handle, &entries);
  8042. seq_puts(seq, "\n");
  8043. for (j = 0; j < nr_entries; j++)
  8044. seq_printf(seq, " %pS\n", (void *)entries[j]);
  8045. }
  8046. }
  8047. #endif
  8048. seq_puts(seq, "\n");
  8049. }
  8050. if (!idx && !t->count)
  8051. seq_puts(seq, "No data\n");
  8052. return 0;
  8053. }
  8054. static void slab_debugfs_stop(struct seq_file *seq, void *v)
  8055. {
  8056. }
  8057. static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
  8058. {
  8059. struct loc_track *t = seq->private;
  8060. t->idx = ++(*ppos);
  8061. if (*ppos <= t->count)
  8062. return ppos;
  8063. return NULL;
  8064. }
  8065. static int cmp_loc_by_count(const void *a, const void *b)
  8066. {
  8067. struct location *loc1 = (struct location *)a;
  8068. struct location *loc2 = (struct location *)b;
  8069. return cmp_int(loc2->count, loc1->count);
  8070. }
  8071. static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
  8072. {
  8073. struct loc_track *t = seq->private;
  8074. t->idx = *ppos;
  8075. return ppos;
  8076. }
  8077. static const struct seq_operations slab_debugfs_sops = {
  8078. .start = slab_debugfs_start,
  8079. .next = slab_debugfs_next,
  8080. .stop = slab_debugfs_stop,
  8081. .show = slab_debugfs_show,
  8082. };
  8083. static int slab_debug_trace_open(struct inode *inode, struct file *filep)
  8084. {
  8085. struct kmem_cache_node *n;
  8086. enum track_item alloc;
  8087. int node;
  8088. struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
  8089. sizeof(struct loc_track));
  8090. struct kmem_cache *s = file_inode(filep)->i_private;
  8091. unsigned long *obj_map;
  8092. if (!t)
  8093. return -ENOMEM;
  8094. obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
  8095. if (!obj_map) {
  8096. seq_release_private(inode, filep);
  8097. return -ENOMEM;
  8098. }
  8099. alloc = debugfs_get_aux_num(filep);
  8100. if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
  8101. bitmap_free(obj_map);
  8102. seq_release_private(inode, filep);
  8103. return -ENOMEM;
  8104. }
  8105. for_each_kmem_cache_node(s, node, n) {
  8106. unsigned long flags;
  8107. struct slab *slab;
  8108. if (!node_nr_slabs(n))
  8109. continue;
  8110. spin_lock_irqsave(&n->list_lock, flags);
  8111. list_for_each_entry(slab, &n->partial, slab_list)
  8112. process_slab(t, s, slab, alloc, obj_map);
  8113. list_for_each_entry(slab, &n->full, slab_list)
  8114. process_slab(t, s, slab, alloc, obj_map);
  8115. spin_unlock_irqrestore(&n->list_lock, flags);
  8116. }
  8117. /* Sort locations by count */
  8118. sort(t->loc, t->count, sizeof(struct location),
  8119. cmp_loc_by_count, NULL);
  8120. bitmap_free(obj_map);
  8121. return 0;
  8122. }
  8123. static int slab_debug_trace_release(struct inode *inode, struct file *file)
  8124. {
  8125. struct seq_file *seq = file->private_data;
  8126. struct loc_track *t = seq->private;
  8127. free_loc_track(t);
  8128. return seq_release_private(inode, file);
  8129. }
  8130. static const struct file_operations slab_debugfs_fops = {
  8131. .open = slab_debug_trace_open,
  8132. .read = seq_read,
  8133. .llseek = seq_lseek,
  8134. .release = slab_debug_trace_release,
  8135. };
  8136. static void debugfs_slab_add(struct kmem_cache *s)
  8137. {
  8138. struct dentry *slab_cache_dir;
  8139. if (unlikely(!slab_debugfs_root))
  8140. return;
  8141. slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
  8142. debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s,
  8143. TRACK_ALLOC, &slab_debugfs_fops);
  8144. debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s,
  8145. TRACK_FREE, &slab_debugfs_fops);
  8146. }
  8147. void debugfs_slab_release(struct kmem_cache *s)
  8148. {
  8149. debugfs_lookup_and_remove(s->name, slab_debugfs_root);
  8150. }
  8151. static int __init slab_debugfs_init(void)
  8152. {
  8153. struct kmem_cache *s;
  8154. slab_debugfs_root = debugfs_create_dir("slab", NULL);
  8155. list_for_each_entry(s, &slab_caches, list)
  8156. if (s->flags & SLAB_STORE_USER)
  8157. debugfs_slab_add(s);
  8158. return 0;
  8159. }
  8160. __initcall(slab_debugfs_init);
  8161. #endif
  8162. /*
  8163. * The /proc/slabinfo ABI
  8164. */
  8165. #ifdef CONFIG_SLUB_DEBUG
  8166. void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
  8167. {
  8168. unsigned long nr_slabs = 0;
  8169. unsigned long nr_objs = 0;
  8170. unsigned long nr_free = 0;
  8171. int node;
  8172. struct kmem_cache_node *n;
  8173. for_each_kmem_cache_node(s, node, n) {
  8174. nr_slabs += node_nr_slabs(n);
  8175. nr_objs += node_nr_objs(n);
  8176. nr_free += count_partial_free_approx(n);
  8177. }
  8178. sinfo->active_objs = nr_objs - nr_free;
  8179. sinfo->num_objs = nr_objs;
  8180. sinfo->active_slabs = nr_slabs;
  8181. sinfo->num_slabs = nr_slabs;
  8182. sinfo->objects_per_slab = oo_objects(s->oo);
  8183. sinfo->cache_order = oo_order(s->oo);
  8184. }
  8185. #endif /* CONFIG_SLUB_DEBUG */