volumes.c 237 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2007 Oracle. All rights reserved.
  4. */
  5. #include <linux/sched.h>
  6. #include <linux/sched/mm.h>
  7. #include <linux/slab.h>
  8. #include <linux/ratelimit.h>
  9. #include <linux/kthread.h>
  10. #include <linux/semaphore.h>
  11. #include <linux/uuid.h>
  12. #include <linux/list_sort.h>
  13. #include <linux/namei.h>
  14. #include "misc.h"
  15. #include "disk-io.h"
  16. #include "extent-tree.h"
  17. #include "transaction.h"
  18. #include "volumes.h"
  19. #include "raid56.h"
  20. #include "dev-replace.h"
  21. #include "sysfs.h"
  22. #include "tree-checker.h"
  23. #include "space-info.h"
  24. #include "block-group.h"
  25. #include "discard.h"
  26. #include "zoned.h"
  27. #include "fs.h"
  28. #include "accessors.h"
  29. #include "uuid-tree.h"
  30. #include "ioctl.h"
  31. #include "relocation.h"
  32. #include "scrub.h"
  33. #include "super.h"
  34. #include "raid-stripe-tree.h"
  /* Profile bits covered by RAID0, RAID10 and BTRFS_BLOCK_GROUP_RAID56_MASK. */
  #define BTRFS_BLOCK_GROUP_STRIPE_MASK					\
  	(BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |		\
  	 BTRFS_BLOCK_GROUP_RAID56_MASK)
  38. struct btrfs_io_geometry {
  39. u32 stripe_index;
  40. u32 stripe_nr;
  41. int mirror_num;
  42. int num_stripes;
  43. u64 stripe_offset;
  44. u64 raid56_full_stripe_start;
  45. int max_errors;
  46. enum btrfs_map_op op;
  47. bool use_rst;
  48. };
  49. const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
  50. [BTRFS_RAID_RAID10] = {
  51. .sub_stripes = 2,
  52. .dev_stripes = 1,
  53. .devs_max = 0, /* 0 == as many as possible */
  54. .devs_min = 2,
  55. .tolerated_failures = 1,
  56. .devs_increment = 2,
  57. .ncopies = 2,
  58. .nparity = 0,
  59. .raid_name = "raid10",
  60. .bg_flag = BTRFS_BLOCK_GROUP_RAID10,
  61. .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
  62. },
  63. [BTRFS_RAID_RAID1] = {
  64. .sub_stripes = 1,
  65. .dev_stripes = 1,
  66. .devs_max = 2,
  67. .devs_min = 2,
  68. .tolerated_failures = 1,
  69. .devs_increment = 2,
  70. .ncopies = 2,
  71. .nparity = 0,
  72. .raid_name = "raid1",
  73. .bg_flag = BTRFS_BLOCK_GROUP_RAID1,
  74. .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
  75. },
  76. [BTRFS_RAID_RAID1C3] = {
  77. .sub_stripes = 1,
  78. .dev_stripes = 1,
  79. .devs_max = 3,
  80. .devs_min = 3,
  81. .tolerated_failures = 2,
  82. .devs_increment = 3,
  83. .ncopies = 3,
  84. .nparity = 0,
  85. .raid_name = "raid1c3",
  86. .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
  87. .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
  88. },
  89. [BTRFS_RAID_RAID1C4] = {
  90. .sub_stripes = 1,
  91. .dev_stripes = 1,
  92. .devs_max = 4,
  93. .devs_min = 4,
  94. .tolerated_failures = 3,
  95. .devs_increment = 4,
  96. .ncopies = 4,
  97. .nparity = 0,
  98. .raid_name = "raid1c4",
  99. .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
  100. .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
  101. },
  102. [BTRFS_RAID_DUP] = {
  103. .sub_stripes = 1,
  104. .dev_stripes = 2,
  105. .devs_max = 1,
  106. .devs_min = 1,
  107. .tolerated_failures = 0,
  108. .devs_increment = 1,
  109. .ncopies = 2,
  110. .nparity = 0,
  111. .raid_name = "dup",
  112. .bg_flag = BTRFS_BLOCK_GROUP_DUP,
  113. .mindev_error = 0,
  114. },
  115. [BTRFS_RAID_RAID0] = {
  116. .sub_stripes = 1,
  117. .dev_stripes = 1,
  118. .devs_max = 0,
  119. .devs_min = 1,
  120. .tolerated_failures = 0,
  121. .devs_increment = 1,
  122. .ncopies = 1,
  123. .nparity = 0,
  124. .raid_name = "raid0",
  125. .bg_flag = BTRFS_BLOCK_GROUP_RAID0,
  126. .mindev_error = 0,
  127. },
  128. [BTRFS_RAID_SINGLE] = {
  129. .sub_stripes = 1,
  130. .dev_stripes = 1,
  131. .devs_max = 1,
  132. .devs_min = 1,
  133. .tolerated_failures = 0,
  134. .devs_increment = 1,
  135. .ncopies = 1,
  136. .nparity = 0,
  137. .raid_name = "single",
  138. .bg_flag = 0,
  139. .mindev_error = 0,
  140. },
  141. [BTRFS_RAID_RAID5] = {
  142. .sub_stripes = 1,
  143. .dev_stripes = 1,
  144. .devs_max = 0,
  145. .devs_min = 2,
  146. .tolerated_failures = 1,
  147. .devs_increment = 1,
  148. .ncopies = 1,
  149. .nparity = 1,
  150. .raid_name = "raid5",
  151. .bg_flag = BTRFS_BLOCK_GROUP_RAID5,
  152. .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
  153. },
  154. [BTRFS_RAID_RAID6] = {
  155. .sub_stripes = 1,
  156. .dev_stripes = 1,
  157. .devs_max = 0,
  158. .devs_min = 3,
  159. .tolerated_failures = 2,
  160. .devs_increment = 1,
  161. .ncopies = 1,
  162. .nparity = 2,
  163. .raid_name = "raid6",
  164. .bg_flag = BTRFS_BLOCK_GROUP_RAID6,
  165. .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
  166. },
  167. };
  168. /*
  169. * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
  170. * can be used as index to access btrfs_raid_array[].
  171. */
  172. enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
  173. {
  174. const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
  175. if (!profile)
  176. return BTRFS_RAID_SINGLE;
  177. return BTRFS_BG_FLAG_TO_INDEX(profile);
  178. }
  179. const char *btrfs_bg_type_to_raid_name(u64 flags)
  180. {
  181. const int index = btrfs_bg_flags_to_raid_index(flags);
  182. if (index >= BTRFS_NR_RAID_TYPES)
  183. return NULL;
  184. return btrfs_raid_array[index].raid_name;
  185. }
  186. int btrfs_nr_parity_stripes(u64 type)
  187. {
  188. enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);
  189. return btrfs_raid_array[index].nparity;
  190. }
  191. /*
  192. * Fill @buf with textual description of @bg_flags, no more than @size_buf
  193. * bytes including terminating null byte.
  194. */
  195. void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
  196. {
  197. int i;
  198. int ret;
  199. char *bp = buf;
  200. u64 flags = bg_flags;
  201. u32 size_bp = size_buf;
  202. if (!flags)
  203. return;
  204. #define DESCRIBE_FLAG(flag, desc) \
  205. do { \
  206. if (flags & (flag)) { \
  207. ret = snprintf(bp, size_bp, "%s|", (desc)); \
  208. if (ret < 0 || ret >= size_bp) \
  209. goto out_overflow; \
  210. size_bp -= ret; \
  211. bp += ret; \
  212. flags &= ~(flag); \
  213. } \
  214. } while (0)
  215. DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
  216. DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
  217. DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
  218. /* Block groups containing the remap tree. */
  219. DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA_REMAP, "metadata-remap");
  220. /* Block group that has been remapped. */
  221. DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped");
  222. DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
  223. for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
  224. DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
  225. btrfs_raid_array[i].raid_name);
  226. #undef DESCRIBE_FLAG
  227. if (flags) {
  228. ret = snprintf(bp, size_bp, "0x%llx|", flags);
  229. size_bp -= ret;
  230. }
  231. if (size_bp < size_buf)
  232. buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
  233. /*
  234. * The text is trimmed, it's up to the caller to provide sufficiently
  235. * large buffer
  236. */
  237. out_overflow:;
  238. }
  /* Forward declarations of helpers defined later in this file. */
  static int init_first_rw_device(struct btrfs_trans_handle *trans);
  static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
  static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
  242. /*
  243. * Device locking
  244. * ==============
  245. *
  246. * There are several mutexes that protect manipulation of devices and low-level
  247. * structures like chunks but not block groups, extents or files
  248. *
  249. * uuid_mutex (global lock)
  250. * ------------------------
  251. * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
  252. * the SCAN_DEV ioctl registration or from mount either implicitly (the first
  253. * device) or requested by the device= mount option
  254. *
  255. * the mutex can be very coarse and can cover long-running operations
  256. *
  257. * protects: updates to fs_devices counters like missing devices, rw devices,
  258. * seeding, structure cloning, opening/closing devices at mount/umount time
  259. *
  260. * global::fs_devs - add, remove, updates to the global list
  261. *
  262. * does not protect: manipulation of the fs_devices::devices list in general
  263. * but in mount context it could be used to exclude list modifications by eg.
  264. * scan ioctl
  265. *
  266. * btrfs_device::name - renames (write side), read is RCU
  267. *
  268. * fs_devices::device_list_mutex (per-fs, with RCU)
  269. * ------------------------------------------------
  270. * protects updates to fs_devices::devices, ie. adding and deleting
  271. *
  272. * simple list traversal with read-only actions can be done with RCU protection
  273. *
  274. * may be used to exclude some operations from running concurrently without any
  275. * modifications to the list (see write_all_supers)
  276. *
  277. * Is not required at mount and close times, because our device list is
  278. * protected by the uuid_mutex at that point.
  279. *
  280. * balance_mutex
  281. * -------------
  282. * protects balance structures (status, state) and context accessed from
  283. * several places (internally, ioctl)
  284. *
  285. * chunk_mutex
  286. * -----------
  287. * protects chunks, adding or removing during allocation, trim or when a new
  288. * device is added/removed. Additionally it also protects post_commit_list of
  289. * individual devices, since they can be added to the transaction's
  290. * post_commit_list only with chunk_mutex held.
  291. *
  292. * cleaner_mutex
  293. * -------------
  294. * a big lock that is held by the cleaner thread and prevents running subvolume
  295. * cleaning together with relocation or delayed iputs
  296. *
  297. *
  298. * Lock nesting
  299. * ============
  300. *
  301. * uuid_mutex
  302. * device_list_mutex
  303. * chunk_mutex
  304. * balance_mutex
  305. *
  306. *
  307. * Exclusive operations
  308. * ====================
  309. *
  310. * Maintains the exclusivity of the following operations that apply to the
  311. * whole filesystem and cannot run in parallel.
  312. *
  313. * - Balance (*)
  314. * - Device add
  315. * - Device remove
  316. * - Device replace (*)
  317. * - Resize
  318. *
  319. * The device operations (as above) can be in one of the following states:
  320. *
  321. * - Running state
  322. * - Paused state
  323. * - Completed state
  324. *
  325. * Only device operations marked with (*) can go into the Paused state for the
  326. * following reasons:
  327. *
  328. * - ioctl (only Balance can be Paused through ioctl)
  329. * - filesystem remounted as read-only
  330. * - filesystem unmounted and mounted as read-only
  331. * - system power-cycle and filesystem mounted as read-only
  332. * - filesystem or device errors leading to forced read-only
  333. *
  334. * The status of exclusive operation is set and cleared atomically.
  335. * During the course of Paused state, fs_info::exclusive_operation remains set.
  336. * A device operation in Paused or Running state can be canceled or resumed
  337. * either by ioctl (Balance only) or when remounted as read-write.
  338. * The exclusive status is cleared when the device operation is canceled or
  339. * completed.
  340. */
  341. DEFINE_MUTEX(uuid_mutex);
  342. static LIST_HEAD(fs_uuids);
  343. struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
  344. {
  345. return &fs_uuids;
  346. }
  347. /*
  348. * Allocate new btrfs_fs_devices structure identified by a fsid.
  349. *
  350. * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to
  351. * fs_devices::metadata_fsid
  352. *
  353. * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
  354. * The returned struct is not linked onto any lists and can be destroyed with
  355. * kfree() right away.
  356. */
  357. static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
  358. {
  359. struct btrfs_fs_devices *fs_devs;
  360. fs_devs = kzalloc_obj(*fs_devs);
  361. if (!fs_devs)
  362. return ERR_PTR(-ENOMEM);
  363. mutex_init(&fs_devs->device_list_mutex);
  364. INIT_LIST_HEAD(&fs_devs->devices);
  365. INIT_LIST_HEAD(&fs_devs->alloc_list);
  366. INIT_LIST_HEAD(&fs_devs->fs_list);
  367. INIT_LIST_HEAD(&fs_devs->seed_list);
  368. if (fsid) {
  369. memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
  370. memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
  371. }
  372. return fs_devs;
  373. }
  374. static void btrfs_free_device(struct btrfs_device *device)
  375. {
  376. WARN_ON(!list_empty(&device->post_commit_list));
  377. /*
  378. * No need to call kfree_rcu() nor do RCU lock/unlock, nothing is
  379. * reading the device name.
  380. */
  381. kfree(rcu_dereference_raw(device->name));
  382. btrfs_extent_io_tree_release(&device->alloc_state);
  383. btrfs_destroy_dev_zone_info(device);
  384. kfree(device);
  385. }
  386. static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
  387. {
  388. struct btrfs_device *device;
  389. WARN_ON(fs_devices->opened);
  390. WARN_ON(fs_devices->holding);
  391. while (!list_empty(&fs_devices->devices)) {
  392. device = list_first_entry(&fs_devices->devices,
  393. struct btrfs_device, dev_list);
  394. list_del(&device->dev_list);
  395. btrfs_free_device(device);
  396. }
  397. kfree(fs_devices);
  398. }
  399. void __exit btrfs_cleanup_fs_uuids(void)
  400. {
  401. struct btrfs_fs_devices *fs_devices;
  402. while (!list_empty(&fs_uuids)) {
  403. fs_devices = list_first_entry(&fs_uuids, struct btrfs_fs_devices,
  404. fs_list);
  405. list_del(&fs_devices->fs_list);
  406. free_fs_devices(fs_devices);
  407. }
  408. }
  409. static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
  410. const u8 *fsid, const u8 *metadata_fsid)
  411. {
  412. if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0)
  413. return false;
  414. if (!metadata_fsid)
  415. return true;
  416. if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0)
  417. return false;
  418. return true;
  419. }
  420. static noinline struct btrfs_fs_devices *find_fsid(
  421. const u8 *fsid, const u8 *metadata_fsid)
  422. {
  423. struct btrfs_fs_devices *fs_devices;
  424. ASSERT(fsid);
  425. /* Handle non-split brain cases */
  426. list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
  427. if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
  428. return fs_devices;
  429. }
  430. return NULL;
  431. }
  432. static int
  433. btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
  434. int flush, struct file **bdev_file,
  435. struct btrfs_super_block **disk_super)
  436. {
  437. struct block_device *bdev;
  438. int ret;
  439. *bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops);
  440. if (IS_ERR(*bdev_file)) {
  441. ret = PTR_ERR(*bdev_file);
  442. btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d",
  443. device_path, flags, ret);
  444. goto error;
  445. }
  446. bdev = file_bdev(*bdev_file);
  447. if (flush)
  448. sync_blockdev(bdev);
  449. if (holder) {
  450. ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
  451. if (ret) {
  452. bdev_fput(*bdev_file);
  453. goto error;
  454. }
  455. }
  456. invalidate_bdev(bdev);
  457. *disk_super = btrfs_read_disk_super(bdev, 0, false);
  458. if (IS_ERR(*disk_super)) {
  459. ret = PTR_ERR(*disk_super);
  460. bdev_fput(*bdev_file);
  461. goto error;
  462. }
  463. return 0;
  464. error:
  465. *disk_super = NULL;
  466. *bdev_file = NULL;
  467. return ret;
  468. }
  469. /*
  470. * Search and remove all stale devices (which are not mounted). When both
  471. * inputs are NULL, it will search and release all stale devices.
  472. *
  473. * @devt: Optional. When provided will it release all unmounted devices
  474. * matching this devt only.
  475. * @skip_device: Optional. Will skip this device when searching for the stale
  476. * devices.
  477. *
  478. * Return: 0 for success or if @devt is 0.
  479. * -EBUSY if @devt is a mounted device.
  480. * -ENOENT if @devt does not match any device in the list.
  481. */
  482. static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
  483. {
  484. struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
  485. struct btrfs_device *device, *tmp_device;
  486. int ret;
  487. bool freed = false;
  488. lockdep_assert_held(&uuid_mutex);
  489. /* Return good status if there is no instance of devt. */
  490. ret = 0;
  491. list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
  492. mutex_lock(&fs_devices->device_list_mutex);
  493. list_for_each_entry_safe(device, tmp_device,
  494. &fs_devices->devices, dev_list) {
  495. if (skip_device && skip_device == device)
  496. continue;
  497. if (devt && devt != device->devt)
  498. continue;
  499. if (fs_devices->opened || fs_devices->holding) {
  500. if (devt)
  501. ret = -EBUSY;
  502. break;
  503. }
  504. /* delete the stale device */
  505. fs_devices->num_devices--;
  506. list_del(&device->dev_list);
  507. btrfs_free_device(device);
  508. freed = true;
  509. }
  510. mutex_unlock(&fs_devices->device_list_mutex);
  511. if (fs_devices->num_devices == 0) {
  512. btrfs_sysfs_remove_fsid(fs_devices);
  513. list_del(&fs_devices->fs_list);
  514. free_fs_devices(fs_devices);
  515. }
  516. }
  517. /* If there is at least one freed device return 0. */
  518. if (freed)
  519. return 0;
  520. return ret;
  521. }
  522. static struct btrfs_fs_devices *find_fsid_by_device(
  523. struct btrfs_super_block *disk_super,
  524. dev_t devt, bool *same_fsid_diff_dev)
  525. {
  526. struct btrfs_fs_devices *fsid_fs_devices;
  527. struct btrfs_fs_devices *devt_fs_devices;
  528. const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
  529. BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
  530. bool found_by_devt = false;
  531. /* Find the fs_device by the usual method, if found use it. */
  532. fsid_fs_devices = find_fsid(disk_super->fsid,
  533. has_metadata_uuid ? disk_super->metadata_uuid : NULL);
  534. /* The temp_fsid feature is supported only with single device filesystem. */
  535. if (btrfs_super_num_devices(disk_super) != 1)
  536. return fsid_fs_devices;
  537. /*
  538. * A seed device is an integral component of the sprout device, which
  539. * functions as a multi-device filesystem. So, temp-fsid feature is
  540. * not supported.
  541. */
  542. if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
  543. return fsid_fs_devices;
  544. /* Try to find a fs_devices by matching devt. */
  545. list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
  546. struct btrfs_device *device;
  547. list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
  548. if (device->devt == devt) {
  549. found_by_devt = true;
  550. break;
  551. }
  552. }
  553. if (found_by_devt)
  554. break;
  555. }
  556. if (found_by_devt) {
  557. /* Existing device. */
  558. if (fsid_fs_devices == NULL) {
  559. if (devt_fs_devices->opened == 0) {
  560. /* Stale device. */
  561. return NULL;
  562. } else {
  563. /* temp_fsid is mounting a subvol. */
  564. return devt_fs_devices;
  565. }
  566. } else {
  567. /* Regular or temp_fsid device mounting a subvol. */
  568. return devt_fs_devices;
  569. }
  570. } else {
  571. /* New device. */
  572. if (fsid_fs_devices == NULL) {
  573. return NULL;
  574. } else {
  575. /* sb::fsid is already used create a new temp_fsid. */
  576. *same_fsid_diff_dev = true;
  577. return NULL;
  578. }
  579. }
  580. /* Not reached. */
  581. }
  582. /*
  583. * This is only used on mount, and we are protected from competing things
  584. * messing with our fs_devices by the uuid_mutex, thus we do not need the
  585. * fs_devices->device_list_mutex here.
  586. */
  587. static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
  588. struct btrfs_device *device, blk_mode_t flags,
  589. void *holder)
  590. {
  591. struct file *bdev_file;
  592. struct btrfs_super_block *disk_super;
  593. u64 devid;
  594. int ret;
  595. if (device->bdev)
  596. return -EINVAL;
  597. if (!device->name)
  598. return -EINVAL;
  599. ret = btrfs_get_bdev_and_sb(rcu_dereference_raw(device->name), flags, holder, 1,
  600. &bdev_file, &disk_super);
  601. if (ret)
  602. return ret;
  603. devid = btrfs_stack_device_id(&disk_super->dev_item);
  604. if (devid != device->devid)
  605. goto error_free_page;
  606. if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
  607. goto error_free_page;
  608. device->generation = btrfs_super_generation(disk_super);
  609. if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
  610. if (btrfs_super_incompat_flags(disk_super) &
  611. BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
  612. btrfs_err(NULL,
  613. "invalid seeding and uuid-changed device detected");
  614. goto error_free_page;
  615. }
  616. clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
  617. fs_devices->seeding = true;
  618. } else {
  619. if (bdev_read_only(file_bdev(bdev_file)))
  620. clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
  621. else
  622. set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
  623. }
  624. if (!bdev_nonrot(file_bdev(bdev_file)))
  625. fs_devices->rotating = true;
  626. if (bdev_max_discard_sectors(file_bdev(bdev_file)))
  627. fs_devices->discardable = true;
  628. device->bdev_file = bdev_file;
  629. device->bdev = file_bdev(bdev_file);
  630. clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
  631. if (device->devt != device->bdev->bd_dev) {
  632. btrfs_warn(NULL,
  633. "device %s maj:min changed from %d:%d to %d:%d",
  634. rcu_dereference_raw(device->name), MAJOR(device->devt),
  635. MINOR(device->devt), MAJOR(device->bdev->bd_dev),
  636. MINOR(device->bdev->bd_dev));
  637. device->devt = device->bdev->bd_dev;
  638. }
  639. fs_devices->open_devices++;
  640. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
  641. device->devid != BTRFS_DEV_REPLACE_DEVID) {
  642. fs_devices->rw_devices++;
  643. list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
  644. }
  645. btrfs_release_disk_super(disk_super);
  646. return 0;
  647. error_free_page:
  648. btrfs_release_disk_super(disk_super);
  649. bdev_fput(bdev_file);
  650. return -EINVAL;
  651. }
  652. const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
  653. {
  654. bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
  655. BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
  656. return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
  657. }
  658. static bool is_same_device(struct btrfs_device *device, const char *new_path)
  659. {
  660. struct path old = { .mnt = NULL, .dentry = NULL };
  661. struct path new = { .mnt = NULL, .dentry = NULL };
  662. char AUTO_KFREE(old_path);
  663. bool is_same = false;
  664. int ret;
  665. if (!device->name)
  666. goto out;
  667. old_path = kzalloc(PATH_MAX, GFP_NOFS);
  668. if (!old_path)
  669. goto out;
  670. rcu_read_lock();
  671. ret = strscpy(old_path, rcu_dereference(device->name), PATH_MAX);
  672. rcu_read_unlock();
  673. if (ret < 0)
  674. goto out;
  675. ret = kern_path(old_path, LOOKUP_FOLLOW, &old);
  676. if (ret)
  677. goto out;
  678. ret = kern_path(new_path, LOOKUP_FOLLOW, &new);
  679. if (ret)
  680. goto out;
  681. if (path_equal(&old, &new))
  682. is_same = true;
  683. out:
  684. path_put(&old);
  685. path_put(&new);
  686. return is_same;
  687. }
  688. /*
  689. * Add new device to list of registered devices
  690. *
  691. * Returns:
  692. * device pointer which was just added or updated when successful
  693. * error pointer when failed
  694. */
  695. static noinline struct btrfs_device *device_list_add(const char *path,
  696. struct btrfs_super_block *disk_super,
  697. bool *new_device_added)
  698. {
  699. struct btrfs_device *device;
  700. struct btrfs_fs_devices *fs_devices = NULL;
  701. const char *name;
  702. u64 found_transid = btrfs_super_generation(disk_super);
  703. u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
  704. dev_t path_devt;
  705. int ret;
  706. bool same_fsid_diff_dev = false;
  707. bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
  708. BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
  709. if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
  710. btrfs_err(NULL,
  711. "device %s has incomplete metadata_uuid change, please use btrfstune to complete",
  712. path);
  713. return ERR_PTR(-EAGAIN);
  714. }
  715. ret = lookup_bdev(path, &path_devt);
  716. if (ret) {
  717. btrfs_err(NULL, "failed to lookup block device for path %s: %d",
  718. path, ret);
  719. return ERR_PTR(ret);
  720. }
  721. fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
  722. if (!fs_devices) {
  723. fs_devices = alloc_fs_devices(disk_super->fsid);
  724. if (IS_ERR(fs_devices))
  725. return ERR_CAST(fs_devices);
  726. if (has_metadata_uuid)
  727. memcpy(fs_devices->metadata_uuid,
  728. disk_super->metadata_uuid, BTRFS_FSID_SIZE);
  729. if (same_fsid_diff_dev) {
  730. generate_random_uuid(fs_devices->fsid);
  731. fs_devices->temp_fsid = true;
  732. btrfs_info(NULL, "device %s (%d:%d) using temp-fsid %pU",
  733. path, MAJOR(path_devt), MINOR(path_devt),
  734. fs_devices->fsid);
  735. }
  736. mutex_lock(&fs_devices->device_list_mutex);
  737. list_add(&fs_devices->fs_list, &fs_uuids);
  738. device = NULL;
  739. } else {
  740. struct btrfs_dev_lookup_args args = {
  741. .devid = devid,
  742. .uuid = disk_super->dev_item.uuid,
  743. };
  744. mutex_lock(&fs_devices->device_list_mutex);
  745. device = btrfs_find_device(fs_devices, &args);
  746. if (found_transid > fs_devices->latest_generation) {
  747. memcpy(fs_devices->fsid, disk_super->fsid,
  748. BTRFS_FSID_SIZE);
  749. memcpy(fs_devices->metadata_uuid,
  750. btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
  751. }
  752. }
  753. if (!device) {
  754. unsigned int nofs_flag;
  755. if (fs_devices->opened) {
  756. btrfs_err(NULL,
  757. "device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
  758. path, MAJOR(path_devt), MINOR(path_devt),
  759. fs_devices->fsid, current->comm,
  760. task_pid_nr(current));
  761. mutex_unlock(&fs_devices->device_list_mutex);
  762. return ERR_PTR(-EBUSY);
  763. }
  764. nofs_flag = memalloc_nofs_save();
  765. device = btrfs_alloc_device(NULL, &devid,
  766. disk_super->dev_item.uuid, path);
  767. memalloc_nofs_restore(nofs_flag);
  768. if (IS_ERR(device)) {
  769. mutex_unlock(&fs_devices->device_list_mutex);
  770. /* we can safely leave the fs_devices entry around */
  771. return device;
  772. }
  773. device->devt = path_devt;
  774. list_add_rcu(&device->dev_list, &fs_devices->devices);
  775. fs_devices->num_devices++;
  776. device->fs_devices = fs_devices;
  777. *new_device_added = true;
  778. if (disk_super->label[0])
  779. pr_info(
  780. "BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
  781. disk_super->label, devid, found_transid, path,
  782. MAJOR(path_devt), MINOR(path_devt),
  783. current->comm, task_pid_nr(current));
  784. else
  785. pr_info(
  786. "BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
  787. disk_super->fsid, devid, found_transid, path,
  788. MAJOR(path_devt), MINOR(path_devt),
  789. current->comm, task_pid_nr(current));
  790. } else if (!device->name || !is_same_device(device, path)) {
  791. const char *old_name;
  792. /*
  793. * When FS is already mounted.
  794. * 1. If you are here and if the device->name is NULL that
  795. * means this device was missing at time of FS mount.
  796. * 2. If you are here and if the device->name is different
  797. * from 'path' that means either
  798. * a. The same device disappeared and reappeared with
  799. * different name. or
  800. * b. The missing-disk-which-was-replaced, has
  801. * reappeared now.
  802. *
  803. * We must allow 1 and 2a above. But 2b would be a spurious
  804. * and unintentional.
  805. *
  806. * Further in case of 1 and 2a above, the disk at 'path'
  807. * would have missed some transaction when it was away and
  808. * in case of 2a the stale bdev has to be updated as well.
  809. * 2b must not be allowed at all time.
  810. */
  811. /*
  812. * For now, we do allow update to btrfs_fs_device through the
  813. * btrfs dev scan cli after FS has been mounted. We're still
  814. * tracking a problem where systems fail mount by subvolume id
  815. * when we reject replacement on a mounted FS.
  816. */
  817. if (!fs_devices->opened && found_transid < device->generation) {
  818. /*
  819. * That is if the FS is _not_ mounted and if you
  820. * are here, that means there is more than one
  821. * disk with same uuid and devid.We keep the one
  822. * with larger generation number or the last-in if
  823. * generation are equal.
  824. */
  825. mutex_unlock(&fs_devices->device_list_mutex);
  826. btrfs_err(NULL,
  827. "device %s already registered with a higher generation, found %llu expect %llu",
  828. path, found_transid, device->generation);
  829. return ERR_PTR(-EEXIST);
  830. }
  831. /*
  832. * We are going to replace the device path for a given devid,
  833. * make sure it's the same device if the device is mounted
  834. *
  835. * NOTE: the device->fs_info may not be reliable here so pass
  836. * in a NULL to message helpers instead. This avoids a possible
  837. * use-after-free when the fs_info and fs_info->sb are already
  838. * torn down.
  839. */
  840. if (device->bdev) {
  841. if (device->devt != path_devt) {
  842. mutex_unlock(&fs_devices->device_list_mutex);
  843. btrfs_warn(NULL,
  844. "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
  845. path, devid, found_transid,
  846. current->comm,
  847. task_pid_nr(current));
  848. return ERR_PTR(-EEXIST);
  849. }
  850. btrfs_info(NULL,
  851. "devid %llu device path %s changed to %s scanned by %s (%d)",
  852. devid, btrfs_dev_name(device),
  853. path, current->comm,
  854. task_pid_nr(current));
  855. }
  856. name = kstrdup(path, GFP_NOFS);
  857. if (!name) {
  858. mutex_unlock(&fs_devices->device_list_mutex);
  859. return ERR_PTR(-ENOMEM);
  860. }
  861. rcu_read_lock();
  862. old_name = rcu_dereference(device->name);
  863. rcu_read_unlock();
  864. rcu_assign_pointer(device->name, name);
  865. kfree_rcu_mightsleep(old_name);
  866. if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
  867. fs_devices->missing_devices--;
  868. clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
  869. }
  870. device->devt = path_devt;
  871. }
  872. /*
  873. * Unmount does not free the btrfs_device struct but would zero
  874. * generation along with most of the other members. So just update
  875. * it back. We need it to pick the disk with largest generation
  876. * (as above).
  877. */
  878. if (!fs_devices->opened) {
  879. device->generation = found_transid;
  880. fs_devices->latest_generation = max_t(u64, found_transid,
  881. fs_devices->latest_generation);
  882. }
  883. fs_devices->total_devices = btrfs_super_num_devices(disk_super);
  884. mutex_unlock(&fs_devices->device_list_mutex);
  885. return device;
  886. }
  887. static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
  888. {
  889. struct btrfs_fs_devices *fs_devices;
  890. struct btrfs_device *device;
  891. struct btrfs_device *orig_dev;
  892. int ret = 0;
  893. lockdep_assert_held(&uuid_mutex);
  894. fs_devices = alloc_fs_devices(orig->fsid);
  895. if (IS_ERR(fs_devices))
  896. return fs_devices;
  897. fs_devices->total_devices = orig->total_devices;
  898. list_for_each_entry(orig_dev, &orig->devices, dev_list) {
  899. const char *dev_path = NULL;
  900. /*
  901. * This is ok to do without RCU read locked because we hold the
  902. * uuid mutex so nothing we touch in here is going to disappear.
  903. */
  904. if (orig_dev->name)
  905. dev_path = rcu_dereference_raw(orig_dev->name);
  906. device = btrfs_alloc_device(NULL, &orig_dev->devid,
  907. orig_dev->uuid, dev_path);
  908. if (IS_ERR(device)) {
  909. ret = PTR_ERR(device);
  910. goto error;
  911. }
  912. if (orig_dev->zone_info) {
  913. struct btrfs_zoned_device_info *zone_info;
  914. zone_info = btrfs_clone_dev_zone_info(orig_dev);
  915. if (!zone_info) {
  916. btrfs_free_device(device);
  917. ret = -ENOMEM;
  918. goto error;
  919. }
  920. device->zone_info = zone_info;
  921. }
  922. list_add(&device->dev_list, &fs_devices->devices);
  923. device->fs_devices = fs_devices;
  924. fs_devices->num_devices++;
  925. }
  926. return fs_devices;
  927. error:
  928. free_fs_devices(fs_devices);
  929. return ERR_PTR(ret);
  930. }
  931. static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
  932. struct btrfs_device **latest_dev)
  933. {
  934. struct btrfs_device *device, *next;
  935. /* This is the initialized path, it is safe to release the devices. */
  936. list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
  937. if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
  938. if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
  939. &device->dev_state) &&
  940. !test_bit(BTRFS_DEV_STATE_MISSING,
  941. &device->dev_state) &&
  942. (!*latest_dev ||
  943. device->generation > (*latest_dev)->generation)) {
  944. *latest_dev = device;
  945. }
  946. continue;
  947. }
  948. /*
  949. * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
  950. * in btrfs_init_dev_replace() so just continue.
  951. */
  952. if (device->devid == BTRFS_DEV_REPLACE_DEVID)
  953. continue;
  954. if (device->bdev_file) {
  955. bdev_fput(device->bdev_file);
  956. device->bdev = NULL;
  957. device->bdev_file = NULL;
  958. fs_devices->open_devices--;
  959. }
  960. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
  961. list_del_init(&device->dev_alloc_list);
  962. clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
  963. fs_devices->rw_devices--;
  964. }
  965. list_del_init(&device->dev_list);
  966. fs_devices->num_devices--;
  967. btrfs_free_device(device);
  968. }
  969. }
  970. /*
  971. * After we have read the system tree and know devids belonging to this
  972. * filesystem, remove the device which does not belong there.
  973. */
  974. void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
  975. {
  976. struct btrfs_device *latest_dev = NULL;
  977. struct btrfs_fs_devices *seed_dev;
  978. mutex_lock(&uuid_mutex);
  979. __btrfs_free_extra_devids(fs_devices, &latest_dev);
  980. list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
  981. __btrfs_free_extra_devids(seed_dev, &latest_dev);
  982. fs_devices->latest_dev = latest_dev;
  983. mutex_unlock(&uuid_mutex);
  984. }
  985. static void btrfs_close_bdev(struct btrfs_device *device)
  986. {
  987. if (!device->bdev)
  988. return;
  989. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
  990. sync_blockdev(device->bdev);
  991. invalidate_bdev(device->bdev);
  992. }
  993. bdev_fput(device->bdev_file);
  994. }
  995. static void btrfs_close_one_device(struct btrfs_device *device)
  996. {
  997. struct btrfs_fs_devices *fs_devices = device->fs_devices;
  998. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
  999. device->devid != BTRFS_DEV_REPLACE_DEVID) {
  1000. list_del_init(&device->dev_alloc_list);
  1001. fs_devices->rw_devices--;
  1002. }
  1003. if (device->devid == BTRFS_DEV_REPLACE_DEVID)
  1004. clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
  1005. if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
  1006. clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
  1007. fs_devices->missing_devices--;
  1008. }
  1009. btrfs_close_bdev(device);
  1010. if (device->bdev) {
  1011. fs_devices->open_devices--;
  1012. device->bdev = NULL;
  1013. device->bdev_file = NULL;
  1014. }
  1015. clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
  1016. btrfs_destroy_dev_zone_info(device);
  1017. device->fs_info = NULL;
  1018. atomic_set(&device->dev_stats_ccnt, 0);
  1019. btrfs_extent_io_tree_release(&device->alloc_state);
  1020. /*
  1021. * Reset the flush error record. We might have a transient flush error
  1022. * in this mount, and if so we aborted the current transaction and set
  1023. * the fs to an error state, guaranteeing no super blocks can be further
  1024. * committed. However that error might be transient and if we unmount the
  1025. * filesystem and mount it again, we should allow the mount to succeed
  1026. * (btrfs_check_rw_degradable() should not fail) - if after mounting the
  1027. * filesystem again we still get flush errors, then we will again abort
  1028. * any transaction and set the error state, guaranteeing no commits of
  1029. * unsafe super blocks.
  1030. */
  1031. clear_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state);
  1032. /* Verify the device is back in a pristine state */
  1033. WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
  1034. WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
  1035. WARN_ON(!list_empty(&device->dev_alloc_list));
  1036. WARN_ON(!list_empty(&device->post_commit_list));
  1037. }
  1038. static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
  1039. {
  1040. struct btrfs_device *device, *tmp;
  1041. lockdep_assert_held(&uuid_mutex);
  1042. if (--fs_devices->opened > 0)
  1043. return;
  1044. list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
  1045. btrfs_close_one_device(device);
  1046. WARN_ON(fs_devices->open_devices);
  1047. WARN_ON(fs_devices->rw_devices);
  1048. fs_devices->opened = 0;
  1049. fs_devices->seeding = false;
  1050. fs_devices->fs_info = NULL;
  1051. }
  1052. void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
  1053. {
  1054. LIST_HEAD(list);
  1055. struct btrfs_fs_devices *tmp;
  1056. mutex_lock(&uuid_mutex);
  1057. close_fs_devices(fs_devices);
  1058. if (!fs_devices->opened && !fs_devices->holding) {
  1059. list_splice_init(&fs_devices->seed_list, &list);
  1060. /*
  1061. * If the struct btrfs_fs_devices is not assembled with any
  1062. * other device, it can be re-initialized during the next mount
  1063. * without the needing device-scan step. Therefore, it can be
  1064. * fully freed.
  1065. */
  1066. if (fs_devices->num_devices == 1) {
  1067. list_del(&fs_devices->fs_list);
  1068. free_fs_devices(fs_devices);
  1069. }
  1070. }
  1071. list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
  1072. close_fs_devices(fs_devices);
  1073. list_del(&fs_devices->seed_list);
  1074. free_fs_devices(fs_devices);
  1075. }
  1076. mutex_unlock(&uuid_mutex);
  1077. }
  1078. static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
  1079. blk_mode_t flags, void *holder)
  1080. {
  1081. struct btrfs_device *device;
  1082. struct btrfs_device *latest_dev = NULL;
  1083. struct btrfs_device *tmp_device;
  1084. s64 __maybe_unused value = 0;
  1085. int ret = 0;
  1086. list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
  1087. dev_list) {
  1088. int ret2;
  1089. ret2 = btrfs_open_one_device(fs_devices, device, flags, holder);
  1090. if (ret2 == 0 &&
  1091. (!latest_dev || device->generation > latest_dev->generation)) {
  1092. latest_dev = device;
  1093. } else if (ret2 == -ENODATA) {
  1094. fs_devices->num_devices--;
  1095. list_del(&device->dev_list);
  1096. btrfs_free_device(device);
  1097. }
  1098. if (ret == 0 && ret2 != 0)
  1099. ret = ret2;
  1100. }
  1101. if (fs_devices->open_devices == 0) {
  1102. if (ret)
  1103. return ret;
  1104. return -EINVAL;
  1105. }
  1106. fs_devices->opened = 1;
  1107. fs_devices->latest_dev = latest_dev;
  1108. fs_devices->total_rw_bytes = 0;
  1109. fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
  1110. #ifdef CONFIG_BTRFS_EXPERIMENTAL
  1111. fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
  1112. fs_devices->read_devid = latest_dev->devid;
  1113. fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(),
  1114. &value);
  1115. if (fs_devices->read_policy == BTRFS_READ_POLICY_RR)
  1116. fs_devices->collect_fs_stats = true;
  1117. if (value) {
  1118. if (fs_devices->read_policy == BTRFS_READ_POLICY_RR)
  1119. fs_devices->rr_min_contig_read = value;
  1120. if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID)
  1121. fs_devices->read_devid = value;
  1122. }
  1123. #else
  1124. fs_devices->read_policy = BTRFS_READ_POLICY_PID;
  1125. #endif
  1126. return 0;
  1127. }
  1128. static int devid_cmp(void *priv, const struct list_head *a,
  1129. const struct list_head *b)
  1130. {
  1131. const struct btrfs_device *dev1, *dev2;
  1132. dev1 = list_entry(a, struct btrfs_device, dev_list);
  1133. dev2 = list_entry(b, struct btrfs_device, dev_list);
  1134. if (dev1->devid < dev2->devid)
  1135. return -1;
  1136. else if (dev1->devid > dev2->devid)
  1137. return 1;
  1138. return 0;
  1139. }
  1140. int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
  1141. blk_mode_t flags, void *holder)
  1142. {
  1143. int ret;
  1144. lockdep_assert_held(&uuid_mutex);
  1145. /*
  1146. * The device_list_mutex cannot be taken here in case opening the
  1147. * underlying device takes further locks like open_mutex.
  1148. *
  1149. * We also don't need the lock here as this is called during mount and
  1150. * exclusion is provided by uuid_mutex
  1151. */
  1152. if (fs_devices->opened) {
  1153. fs_devices->opened++;
  1154. ret = 0;
  1155. } else {
  1156. list_sort(NULL, &fs_devices->devices, devid_cmp);
  1157. ret = open_fs_devices(fs_devices, flags, holder);
  1158. }
  1159. return ret;
  1160. }
  1161. void btrfs_release_disk_super(struct btrfs_super_block *super)
  1162. {
  1163. struct page *page = virt_to_page(super);
  1164. put_page(page);
  1165. }
  1166. struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
  1167. int copy_num, bool drop_cache)
  1168. {
  1169. struct btrfs_super_block *super;
  1170. struct page *page;
  1171. u64 bytenr, bytenr_orig;
  1172. struct address_space *mapping = bdev->bd_mapping;
  1173. int ret;
  1174. bytenr_orig = btrfs_sb_offset(copy_num);
  1175. ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
  1176. if (ret < 0) {
  1177. if (ret == -ENOENT)
  1178. ret = -EINVAL;
  1179. return ERR_PTR(ret);
  1180. }
  1181. if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
  1182. return ERR_PTR(-EINVAL);
  1183. if (drop_cache) {
  1184. /* This should only be called with the primary sb. */
  1185. ASSERT(copy_num == 0);
  1186. /*
  1187. * Drop the page of the primary superblock, so later read will
  1188. * always read from the device.
  1189. */
  1190. invalidate_inode_pages2_range(mapping, bytenr >> PAGE_SHIFT,
  1191. (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
  1192. }
  1193. filemap_invalidate_lock(mapping);
  1194. page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
  1195. filemap_invalidate_unlock(mapping);
  1196. if (IS_ERR(page))
  1197. return ERR_CAST(page);
  1198. super = page_address(page);
  1199. if (btrfs_super_magic(super) != BTRFS_MAGIC ||
  1200. btrfs_super_bytenr(super) != bytenr_orig) {
  1201. btrfs_release_disk_super(super);
  1202. return ERR_PTR(-EINVAL);
  1203. }
  1204. /*
  1205. * Make sure the last byte of label is properly NUL terminated. We use
  1206. * '%s' to print the label, if not properly NUL terminated we can access
  1207. * beyond the label.
  1208. */
  1209. if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1])
  1210. super->label[BTRFS_LABEL_SIZE - 1] = 0;
  1211. return super;
  1212. }
  1213. int btrfs_forget_devices(dev_t devt)
  1214. {
  1215. int ret;
  1216. mutex_lock(&uuid_mutex);
  1217. ret = btrfs_free_stale_devices(devt, NULL);
  1218. mutex_unlock(&uuid_mutex);
  1219. return ret;
  1220. }
  1221. static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
  1222. const char *path, dev_t devt,
  1223. bool mount_arg_dev)
  1224. {
  1225. struct btrfs_fs_devices *fs_devices;
  1226. /*
  1227. * Do not skip device registration for mounted devices with matching
  1228. * maj:min but different paths. Booting without initrd relies on
  1229. * /dev/root initially, later replaced with the actual root device.
  1230. * A successful scan ensures grub2-probe selects the correct device.
  1231. */
  1232. list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
  1233. struct btrfs_device *device;
  1234. mutex_lock(&fs_devices->device_list_mutex);
  1235. if (!fs_devices->opened) {
  1236. mutex_unlock(&fs_devices->device_list_mutex);
  1237. continue;
  1238. }
  1239. list_for_each_entry(device, &fs_devices->devices, dev_list) {
  1240. if (device->bdev && (device->bdev->bd_dev == devt) &&
  1241. strcmp(rcu_dereference_raw(device->name), path) != 0) {
  1242. mutex_unlock(&fs_devices->device_list_mutex);
  1243. /* Do not skip registration. */
  1244. return false;
  1245. }
  1246. }
  1247. mutex_unlock(&fs_devices->device_list_mutex);
  1248. }
  1249. if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
  1250. !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING))
  1251. return true;
  1252. return false;
  1253. }
  1254. /*
  1255. * Look for a btrfs signature on a device. This may be called out of the mount path
  1256. * and we are not allowed to call set_blocksize during the scan. The superblock
  1257. * is read via pagecache.
  1258. *
  1259. * With @mount_arg_dev it's a scan during mount time that will always register
  1260. * the device or return an error. Multi-device and seeding devices are registered
  1261. * in both cases.
  1262. */
  1263. struct btrfs_device *btrfs_scan_one_device(const char *path,
  1264. bool mount_arg_dev)
  1265. {
  1266. struct btrfs_super_block *disk_super;
  1267. bool new_device_added = false;
  1268. struct btrfs_device *device = NULL;
  1269. struct file *bdev_file;
  1270. dev_t devt;
  1271. lockdep_assert_held(&uuid_mutex);
  1272. /*
  1273. * Avoid an exclusive open here, as the systemd-udev may initiate the
  1274. * device scan which may race with the user's mount or mkfs command,
  1275. * resulting in failure.
  1276. * Since the device scan is solely for reading purposes, there is no
  1277. * need for an exclusive open. Additionally, the devices are read again
  1278. * during the mount process. It is ok to get some inconsistent
  1279. * values temporarily, as the device paths of the fsid are the only
  1280. * required information for assembling the volume.
  1281. */
  1282. bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL);
  1283. if (IS_ERR(bdev_file))
  1284. return ERR_CAST(bdev_file);
  1285. disk_super = btrfs_read_disk_super(file_bdev(bdev_file), 0, false);
  1286. if (IS_ERR(disk_super)) {
  1287. device = ERR_CAST(disk_super);
  1288. goto error_bdev_put;
  1289. }
  1290. devt = file_bdev(bdev_file)->bd_dev;
  1291. if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
  1292. btrfs_debug(NULL, "skip registering single non-seed device %s (%d:%d)",
  1293. path, MAJOR(devt), MINOR(devt));
  1294. btrfs_free_stale_devices(devt, NULL);
  1295. device = NULL;
  1296. goto free_disk_super;
  1297. }
  1298. device = device_list_add(path, disk_super, &new_device_added);
  1299. if (!IS_ERR(device) && new_device_added)
  1300. btrfs_free_stale_devices(device->devt, device);
  1301. free_disk_super:
  1302. btrfs_release_disk_super(disk_super);
  1303. error_bdev_put:
  1304. bdev_fput(bdev_file);
  1305. return device;
  1306. }
  1307. /*
  1308. * Find the first pending extent intersecting a range.
  1309. *
  1310. * @device: the device to search
  1311. * @start: start of the range to check
  1312. * @len: length of the range to check
  1313. * @pending_start: output pointer for the start of the found pending extent
  1314. * @pending_end: output pointer for the end of the found pending extent (inclusive)
  1315. *
  1316. * Search for a pending chunk allocation that intersects the half-open range
  1317. * [start, start + len).
  1318. *
  1319. * Return: true if a pending extent was found, false otherwise.
  1320. * If the return value is true, store the first pending extent in
  1321. * [*pending_start, *pending_end]. Otherwise, the two output variables
  1322. * may still be modified, to something outside the range and should not
  1323. * be used.
  1324. */
  1325. bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len,
  1326. u64 *pending_start, u64 *pending_end)
  1327. {
  1328. lockdep_assert_held(&device->fs_info->chunk_mutex);
  1329. if (btrfs_find_first_extent_bit(&device->alloc_state, start,
  1330. pending_start, pending_end,
  1331. CHUNK_ALLOCATED, NULL)) {
  1332. if (in_range(*pending_start, start, len) ||
  1333. in_range(start, *pending_start, *pending_end + 1 - *pending_start)) {
  1334. return true;
  1335. }
  1336. }
  1337. return false;
  1338. }
  1339. /*
  1340. * Find the first real hole accounting for pending extents.
  1341. *
  1342. * @device: the device containing the candidate hole
  1343. * @start: input/output pointer for the hole start position
  1344. * @len: input/output pointer for the hole length
  1345. * @min_hole_size: the size of hole we are looking for
  1346. *
  1347. * Given a potential hole specified by [*start, *start + *len), check for pending
  1348. * chunk allocations within that range. If pending extents are found, the hole is
  1349. * adjusted to represent the first true free space that is large enough when
  1350. * accounting for pending chunks.
  1351. *
  1352. * Note that this function must handle various cases involving non consecutive
  1353. * pending extents.
  1354. *
  1355. * Returns: true if a suitable hole was found and false otherwise.
  1356. * If the return value is true, then *start and *len are set to represent the hole.
  1357. * If the return value is false, then *start is set to the largest hole we
  1358. * found and *len is set to its length.
  1359. * If there are no holes at all, then *start is set to the end of the range and
  1360. * *len is set to 0.
  1361. */
  1362. bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device, u64 *start,
  1363. u64 *len, u64 min_hole_size)
  1364. {
  1365. u64 pending_start, pending_end;
  1366. u64 end;
  1367. u64 max_hole_start = 0;
  1368. u64 max_hole_len = 0;
  1369. lockdep_assert_held(&device->fs_info->chunk_mutex);
  1370. if (*len == 0)
  1371. return false;
  1372. end = *start + *len - 1;
  1373. /*
  1374. * Loop until we either see a large enough hole or check every pending
  1375. * extent overlapping the candidate hole.
  1376. * At every hole that we observe, record it if it is the new max.
  1377. * At the end of the iteration, set the output variables to the max hole.
  1378. */
  1379. while (true) {
  1380. if (btrfs_first_pending_extent(device, *start, *len, &pending_start, &pending_end)) {
  1381. /*
  1382. * Case 1: the pending extent overlaps the start of
  1383. * candidate hole. That means the true hole is after the
  1384. * pending extent, but we need to find the next pending
  1385. * extent to properly size the hole. In the next loop,
  1386. * we will reduce to case 2 or 3.
  1387. * e.g.,
  1388. *
  1389. * |----pending A----| real hole |----pending B----|
  1390. * | candidate hole |
  1391. * *start end
  1392. */
  1393. if (pending_start <= *start) {
  1394. *start = pending_end + 1;
  1395. goto next;
  1396. }
  1397. /*
  1398. * Case 2: The pending extent starts after *start (and overlaps
  1399. * [*start, end), so the first hole just goes up to the start
  1400. * of the pending extent.
  1401. * e.g.,
  1402. *
  1403. * | real hole |----pending A----|
  1404. * | candidate hole |
  1405. * *start end
  1406. */
  1407. *len = pending_start - *start;
  1408. if (*len > max_hole_len) {
  1409. max_hole_start = *start;
  1410. max_hole_len = *len;
  1411. }
  1412. if (*len >= min_hole_size)
  1413. break;
  1414. /*
  1415. * If the hole wasn't big enough, then we advance past
  1416. * the pending extent and keep looking.
  1417. */
  1418. *start = pending_end + 1;
  1419. goto next;
  1420. } else {
  1421. /*
  1422. * Case 3: There is no pending extent overlapping the
  1423. * range [*start, *start + *len - 1], so the only remaining
  1424. * hole is the remaining range.
  1425. * e.g.,
  1426. *
  1427. * | candidate hole |
  1428. * | real hole |
  1429. * *start end
  1430. */
  1431. if (*len > max_hole_len) {
  1432. max_hole_start = *start;
  1433. max_hole_len = *len;
  1434. }
  1435. break;
  1436. }
  1437. next:
  1438. if (*start > end)
  1439. break;
  1440. *len = end - *start + 1;
  1441. }
  1442. if (max_hole_len) {
  1443. *start = max_hole_start;
  1444. *len = max_hole_len;
  1445. } else {
  1446. *start = end + 1;
  1447. *len = 0;
  1448. }
  1449. return max_hole_len >= min_hole_size;
  1450. }
  1451. static u64 dev_extent_search_start(struct btrfs_device *device)
  1452. {
  1453. switch (device->fs_devices->chunk_alloc_policy) {
  1454. default:
  1455. btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
  1456. fallthrough;
  1457. case BTRFS_CHUNK_ALLOC_REGULAR:
  1458. return BTRFS_DEVICE_RANGE_RESERVED;
  1459. case BTRFS_CHUNK_ALLOC_ZONED:
  1460. /*
  1461. * We don't care about the starting region like regular
  1462. * allocator, because we anyway use/reserve the first two zones
  1463. * for superblock logging.
  1464. */
  1465. return 0;
  1466. }
  1467. }
  1468. static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
  1469. u64 *hole_start, u64 *hole_size,
  1470. u64 num_bytes)
  1471. {
  1472. u64 zone_size = device->zone_info->zone_size;
  1473. u64 pos;
  1474. int ret;
  1475. bool changed = false;
  1476. ASSERT(IS_ALIGNED(*hole_start, zone_size),
  1477. "hole_start=%llu zone_size=%llu", *hole_start, zone_size);
  1478. while (*hole_size > 0) {
  1479. pos = btrfs_find_allocatable_zones(device, *hole_start,
  1480. *hole_start + *hole_size,
  1481. num_bytes);
  1482. if (pos != *hole_start) {
  1483. *hole_size = *hole_start + *hole_size - pos;
  1484. *hole_start = pos;
  1485. changed = true;
  1486. if (*hole_size < num_bytes)
  1487. break;
  1488. }
  1489. ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
  1490. /* Range is ensured to be empty */
  1491. if (!ret)
  1492. return changed;
  1493. /* Given hole range was invalid (outside of device) */
  1494. if (ret == -ERANGE) {
  1495. *hole_start += *hole_size;
  1496. *hole_size = 0;
  1497. return true;
  1498. }
  1499. *hole_start += zone_size;
  1500. *hole_size -= zone_size;
  1501. changed = true;
  1502. }
  1503. return changed;
  1504. }
  1505. /*
  1506. * Validate and adjust a hole for chunk allocation
  1507. *
  1508. * @device: the device containing the candidate hole
  1509. * @hole_start: input/output pointer for the hole start position
  1510. * @hole_size: input/output pointer for the hole size
  1511. * @num_bytes: minimum allocation size required
  1512. *
  1513. * Check if the specified hole is suitable for allocation and adjust it if
  1514. * necessary. The hole may be modified to skip over pending chunk allocations
  1515. * and to satisfy stricter zoned requirements on zoned filesystems.
  1516. *
  1517. * For regular (non-zoned) allocation, if the hole after adjustment is smaller
  1518. * than @num_bytes, the search continues past additional pending extents until
  1519. * either a sufficiently large hole is found or no more pending extents exist.
  1520. *
  1521. * Return: true if a suitable hole was found and false otherwise.
  1522. * If the return value is true, then *hole_start and *hole_size are set to
  1523. * represent the hole we found.
  1524. * If the return value is false, then *hole_start is set to the largest
  1525. * hole we found and *hole_size is set to its length.
  1526. * If there are no holes at all, then *hole_start is set to the end of the range
  1527. * and *hole_size is set to 0.
  1528. */
  1529. static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
  1530. u64 *hole_size, u64 num_bytes)
  1531. {
  1532. bool found = false;
  1533. const u64 hole_end = *hole_start + *hole_size - 1;
  1534. ASSERT(*hole_size > 0);
  1535. again:
  1536. *hole_size = hole_end - *hole_start + 1;
  1537. found = btrfs_find_hole_in_pending_extents(device, hole_start, hole_size, num_bytes);
  1538. if (!found)
  1539. return found;
  1540. ASSERT(*hole_size >= num_bytes);
  1541. switch (device->fs_devices->chunk_alloc_policy) {
  1542. default:
  1543. btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
  1544. fallthrough;
  1545. case BTRFS_CHUNK_ALLOC_REGULAR:
  1546. return found;
  1547. case BTRFS_CHUNK_ALLOC_ZONED:
  1548. if (dev_extent_hole_check_zoned(device, hole_start, hole_size, num_bytes))
  1549. goto again;
  1550. break;
  1551. }
  1552. return found;
  1553. }
  1554. /*
  1555. * Find free space in the specified device.
  1556. *
  1557. * @device: the device which we search the free space in
  1558. * @num_bytes: the size of the free space that we need
  1559. * @search_start: the position from which to begin the search
  1560. * @start: store the start of the free space.
  1561. * @len: the size of the free space. that we find, or the size
  1562. * of the max free space if we don't find suitable free space
  1563. *
  1564. * This does a pretty simple search, the expectation is that it is called very
  1565. * infrequently and that a given device has a small number of extents.
  1566. *
  1567. * @start is used to store the start of the free space if we find. But if we
  1568. * don't find suitable free space, it will be used to store the start position
  1569. * of the max free space.
  1570. *
  1571. * @len is used to store the size of the free space that we find.
  1572. * But if we don't find suitable free space, it is used to store the size of
  1573. * the max free space.
  1574. *
  1575. * NOTE: This function will search *commit* root of device tree, and does extra
  1576. * check to ensure dev extents are not double allocated.
  1577. * This makes the function safe to allocate dev extents but may not report
  1578. * correct usable device space, as device extent freed in current transaction
  1579. * is not reported as available.
  1580. */
  1581. static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
  1582. u64 *start, u64 *len)
  1583. {
  1584. struct btrfs_fs_info *fs_info = device->fs_info;
  1585. struct btrfs_root *root = fs_info->dev_root;
  1586. struct btrfs_key key;
  1587. struct btrfs_dev_extent *dev_extent;
  1588. BTRFS_PATH_AUTO_FREE(path);
  1589. u64 search_start;
  1590. u64 hole_size;
  1591. u64 max_hole_start;
  1592. u64 max_hole_size = 0;
  1593. u64 extent_end;
  1594. u64 search_end = device->total_bytes;
  1595. int ret;
  1596. int slot;
  1597. struct extent_buffer *l;
  1598. search_start = dev_extent_search_start(device);
  1599. max_hole_start = search_start;
  1600. WARN_ON(device->zone_info &&
  1601. !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
  1602. path = btrfs_alloc_path();
  1603. if (!path) {
  1604. ret = -ENOMEM;
  1605. goto out;
  1606. }
  1607. if (search_start >= search_end ||
  1608. test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
  1609. ret = -ENOSPC;
  1610. goto out;
  1611. }
  1612. path->reada = READA_FORWARD;
  1613. path->search_commit_root = true;
  1614. path->skip_locking = true;
  1615. key.objectid = device->devid;
  1616. key.type = BTRFS_DEV_EXTENT_KEY;
  1617. key.offset = search_start;
  1618. ret = btrfs_search_backwards(root, &key, path);
  1619. if (ret < 0)
  1620. goto out;
  1621. while (search_start < search_end) {
  1622. l = path->nodes[0];
  1623. slot = path->slots[0];
  1624. if (slot >= btrfs_header_nritems(l)) {
  1625. ret = btrfs_next_leaf(root, path);
  1626. if (ret == 0)
  1627. continue;
  1628. if (ret < 0)
  1629. goto out;
  1630. break;
  1631. }
  1632. btrfs_item_key_to_cpu(l, &key, slot);
  1633. if (key.objectid < device->devid)
  1634. goto next;
  1635. if (key.objectid > device->devid)
  1636. break;
  1637. if (key.type != BTRFS_DEV_EXTENT_KEY)
  1638. goto next;
  1639. if (key.offset > search_end)
  1640. break;
  1641. if (key.offset > search_start) {
  1642. hole_size = key.offset - search_start;
  1643. dev_extent_hole_check(device, &search_start, &hole_size,
  1644. num_bytes);
  1645. if (hole_size > max_hole_size) {
  1646. max_hole_start = search_start;
  1647. max_hole_size = hole_size;
  1648. }
  1649. /*
  1650. * If this free space is greater than which we need,
  1651. * it must be the max free space that we have found
  1652. * until now, so max_hole_start must point to the start
  1653. * of this free space and the length of this free space
  1654. * is stored in max_hole_size. Thus, we return
  1655. * max_hole_start and max_hole_size and go back to the
  1656. * caller.
  1657. */
  1658. if (hole_size >= num_bytes) {
  1659. ret = 0;
  1660. goto out;
  1661. }
  1662. }
  1663. dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
  1664. extent_end = key.offset + btrfs_dev_extent_length(l,
  1665. dev_extent);
  1666. if (extent_end > search_start)
  1667. search_start = extent_end;
  1668. next:
  1669. path->slots[0]++;
  1670. cond_resched();
  1671. }
  1672. /*
  1673. * At this point, search_start should be the end of
  1674. * allocated dev extents, and when shrinking the device,
  1675. * search_end may be smaller than search_start.
  1676. */
  1677. if (search_end > search_start) {
  1678. hole_size = search_end - search_start;
  1679. dev_extent_hole_check(device, &search_start, &hole_size, num_bytes);
  1680. if (hole_size > max_hole_size) {
  1681. max_hole_start = search_start;
  1682. max_hole_size = hole_size;
  1683. }
  1684. }
  1685. /* See above. */
  1686. if (max_hole_size < num_bytes)
  1687. ret = -ENOSPC;
  1688. else
  1689. ret = 0;
  1690. ASSERT(max_hole_start + max_hole_size <= search_end,
  1691. "max_hole_start=%llu max_hole_size=%llu search_end=%llu",
  1692. max_hole_start, max_hole_size, search_end);
  1693. out:
  1694. *start = max_hole_start;
  1695. if (len)
  1696. *len = max_hole_size;
  1697. return ret;
  1698. }
  1699. static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
  1700. struct btrfs_device *device,
  1701. u64 start, u64 *dev_extent_len)
  1702. {
  1703. struct btrfs_fs_info *fs_info = device->fs_info;
  1704. struct btrfs_root *root = fs_info->dev_root;
  1705. int ret;
  1706. BTRFS_PATH_AUTO_FREE(path);
  1707. struct btrfs_key key;
  1708. struct btrfs_key found_key;
  1709. struct extent_buffer *leaf = NULL;
  1710. struct btrfs_dev_extent *extent = NULL;
  1711. path = btrfs_alloc_path();
  1712. if (!path)
  1713. return -ENOMEM;
  1714. key.objectid = device->devid;
  1715. key.type = BTRFS_DEV_EXTENT_KEY;
  1716. key.offset = start;
  1717. again:
  1718. ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
  1719. if (ret > 0) {
  1720. ret = btrfs_previous_item(root, path, key.objectid,
  1721. BTRFS_DEV_EXTENT_KEY);
  1722. if (ret)
  1723. return ret;
  1724. leaf = path->nodes[0];
  1725. btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
  1726. extent = btrfs_item_ptr(leaf, path->slots[0],
  1727. struct btrfs_dev_extent);
  1728. BUG_ON(found_key.offset > start || found_key.offset +
  1729. btrfs_dev_extent_length(leaf, extent) < start);
  1730. key = found_key;
  1731. btrfs_release_path(path);
  1732. goto again;
  1733. } else if (ret == 0) {
  1734. leaf = path->nodes[0];
  1735. extent = btrfs_item_ptr(leaf, path->slots[0],
  1736. struct btrfs_dev_extent);
  1737. } else {
  1738. return ret;
  1739. }
  1740. *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
  1741. ret = btrfs_del_item(trans, root, path);
  1742. if (ret == 0)
  1743. set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
  1744. return ret;
  1745. }
  1746. static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
  1747. {
  1748. struct rb_node *n;
  1749. u64 ret = 0;
  1750. read_lock(&fs_info->mapping_tree_lock);
  1751. n = rb_last(&fs_info->mapping_tree.rb_root);
  1752. if (n) {
  1753. struct btrfs_chunk_map *map;
  1754. map = rb_entry(n, struct btrfs_chunk_map, rb_node);
  1755. ret = map->start + map->chunk_len;
  1756. }
  1757. read_unlock(&fs_info->mapping_tree_lock);
  1758. return ret;
  1759. }
  1760. static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
  1761. u64 *devid_ret)
  1762. {
  1763. int ret;
  1764. struct btrfs_key key;
  1765. struct btrfs_key found_key;
  1766. BTRFS_PATH_AUTO_FREE(path);
  1767. path = btrfs_alloc_path();
  1768. if (!path)
  1769. return -ENOMEM;
  1770. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  1771. key.type = BTRFS_DEV_ITEM_KEY;
  1772. key.offset = (u64)-1;
  1773. ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
  1774. if (ret < 0)
  1775. return ret;
  1776. if (unlikely(ret == 0)) {
  1777. /* Corruption */
  1778. btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
  1779. return -EUCLEAN;
  1780. }
  1781. ret = btrfs_previous_item(fs_info->chunk_root, path,
  1782. BTRFS_DEV_ITEMS_OBJECTID,
  1783. BTRFS_DEV_ITEM_KEY);
  1784. if (ret) {
  1785. *devid_ret = 1;
  1786. } else {
  1787. btrfs_item_key_to_cpu(path->nodes[0], &found_key,
  1788. path->slots[0]);
  1789. *devid_ret = found_key.offset + 1;
  1790. }
  1791. return 0;
  1792. }
  1793. /*
  1794. * the device information is stored in the chunk root
  1795. * the btrfs_device struct should be fully filled in
  1796. */
  1797. static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
  1798. struct btrfs_device *device)
  1799. {
  1800. int ret;
  1801. BTRFS_PATH_AUTO_FREE(path);
  1802. struct btrfs_dev_item *dev_item;
  1803. struct extent_buffer *leaf;
  1804. struct btrfs_key key;
  1805. unsigned long ptr;
  1806. path = btrfs_alloc_path();
  1807. if (!path)
  1808. return -ENOMEM;
  1809. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  1810. key.type = BTRFS_DEV_ITEM_KEY;
  1811. key.offset = device->devid;
  1812. btrfs_reserve_chunk_metadata(trans, true);
  1813. ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
  1814. &key, sizeof(*dev_item));
  1815. btrfs_trans_release_chunk_metadata(trans);
  1816. if (ret)
  1817. return ret;
  1818. leaf = path->nodes[0];
  1819. dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
  1820. btrfs_set_device_id(leaf, dev_item, device->devid);
  1821. btrfs_set_device_generation(leaf, dev_item, 0);
  1822. btrfs_set_device_type(leaf, dev_item, device->type);
  1823. btrfs_set_device_io_align(leaf, dev_item, device->io_align);
  1824. btrfs_set_device_io_width(leaf, dev_item, device->io_width);
  1825. btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
  1826. btrfs_set_device_total_bytes(leaf, dev_item,
  1827. btrfs_device_get_disk_total_bytes(device));
  1828. btrfs_set_device_bytes_used(leaf, dev_item,
  1829. btrfs_device_get_bytes_used(device));
  1830. btrfs_set_device_group(leaf, dev_item, 0);
  1831. btrfs_set_device_seek_speed(leaf, dev_item, 0);
  1832. btrfs_set_device_bandwidth(leaf, dev_item, 0);
  1833. btrfs_set_device_start_offset(leaf, dev_item, 0);
  1834. ptr = btrfs_device_uuid(dev_item);
  1835. write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
  1836. ptr = btrfs_device_fsid(dev_item);
  1837. write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
  1838. ptr, BTRFS_FSID_SIZE);
  1839. return 0;
  1840. }
  1841. /*
  1842. * Function to update ctime/mtime for a given device path.
  1843. * Mainly used for ctime/mtime based probe like libblkid.
  1844. *
  1845. * We don't care about errors here, this is just to be kind to userspace.
  1846. */
  1847. static void update_dev_time(const char *device_path)
  1848. {
  1849. struct path path;
  1850. if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) {
  1851. vfs_utimes(&path, NULL);
  1852. path_put(&path);
  1853. }
  1854. }
  1855. static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
  1856. struct btrfs_device *device)
  1857. {
  1858. struct btrfs_root *root = device->fs_info->chunk_root;
  1859. int ret;
  1860. BTRFS_PATH_AUTO_FREE(path);
  1861. struct btrfs_key key;
  1862. path = btrfs_alloc_path();
  1863. if (!path)
  1864. return -ENOMEM;
  1865. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  1866. key.type = BTRFS_DEV_ITEM_KEY;
  1867. key.offset = device->devid;
  1868. btrfs_reserve_chunk_metadata(trans, false);
  1869. ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
  1870. btrfs_trans_release_chunk_metadata(trans);
  1871. if (ret > 0)
  1872. return -ENOENT;
  1873. if (ret < 0)
  1874. return ret;
  1875. return btrfs_del_item(trans, root, path);
  1876. }
  1877. /*
  1878. * Verify that @num_devices satisfies the RAID profile constraints in the whole
  1879. * filesystem. It's up to the caller to adjust that number regarding eg. device
  1880. * replace.
  1881. */
  1882. static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
  1883. u64 num_devices)
  1884. {
  1885. u64 all_avail;
  1886. unsigned seq;
  1887. int i;
  1888. do {
  1889. seq = read_seqbegin(&fs_info->profiles_lock);
  1890. all_avail = fs_info->avail_data_alloc_bits |
  1891. fs_info->avail_system_alloc_bits |
  1892. fs_info->avail_metadata_alloc_bits;
  1893. } while (read_seqretry(&fs_info->profiles_lock, seq));
  1894. for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
  1895. if (!(all_avail & btrfs_raid_array[i].bg_flag))
  1896. continue;
  1897. if (num_devices < btrfs_raid_array[i].devs_min)
  1898. return btrfs_raid_array[i].mindev_error;
  1899. }
  1900. return 0;
  1901. }
  1902. static struct btrfs_device * btrfs_find_next_active_device(
  1903. struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
  1904. {
  1905. struct btrfs_device *next_device;
  1906. list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
  1907. if (next_device != device &&
  1908. !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
  1909. && next_device->bdev)
  1910. return next_device;
  1911. }
  1912. return NULL;
  1913. }
  1914. /*
  1915. * Helper function to check if the given device is part of s_bdev / latest_dev
  1916. * and replace it with the provided or the next active device, in the context
  1917. * where this function called, there should be always be another device (or
  1918. * this_dev) which is active.
  1919. */
  1920. void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
  1921. struct btrfs_device *next_device)
  1922. {
  1923. struct btrfs_fs_info *fs_info = device->fs_info;
  1924. if (!next_device)
  1925. next_device = btrfs_find_next_active_device(fs_info->fs_devices,
  1926. device);
  1927. ASSERT(next_device);
  1928. if (fs_info->sb->s_bdev &&
  1929. (fs_info->sb->s_bdev == device->bdev))
  1930. fs_info->sb->s_bdev = next_device->bdev;
  1931. if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
  1932. fs_info->fs_devices->latest_dev = next_device;
  1933. }
  1934. /*
  1935. * Return btrfs_fs_devices::num_devices excluding the device that's being
  1936. * currently replaced.
  1937. */
  1938. static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
  1939. {
  1940. u64 num_devices = fs_info->fs_devices->num_devices;
  1941. down_read(&fs_info->dev_replace.rwsem);
  1942. if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
  1943. ASSERT(num_devices > 1, "num_devices=%llu", num_devices);
  1944. num_devices--;
  1945. }
  1946. up_read(&fs_info->dev_replace.rwsem);
  1947. return num_devices;
  1948. }
  1949. static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
  1950. struct block_device *bdev, int copy_num)
  1951. {
  1952. struct btrfs_super_block *disk_super;
  1953. const size_t len = sizeof(disk_super->magic);
  1954. const u64 bytenr = btrfs_sb_offset(copy_num);
  1955. int ret;
  1956. disk_super = btrfs_read_disk_super(bdev, copy_num, false);
  1957. if (IS_ERR(disk_super))
  1958. return;
  1959. memset(&disk_super->magic, 0, len);
  1960. folio_mark_dirty(virt_to_folio(disk_super));
  1961. btrfs_release_disk_super(disk_super);
  1962. ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1);
  1963. if (ret)
  1964. btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
  1965. copy_num, ret);
  1966. }
  1967. void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device)
  1968. {
  1969. int copy_num;
  1970. struct block_device *bdev = device->bdev;
  1971. if (!bdev)
  1972. return;
  1973. for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
  1974. if (bdev_is_zoned(bdev))
  1975. btrfs_reset_sb_log_zones(bdev, copy_num);
  1976. else
  1977. btrfs_scratch_superblock(fs_info, bdev, copy_num);
  1978. }
  1979. /* Notify udev that device has changed */
  1980. btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
  1981. /* Update ctime/mtime for device path for libblkid */
  1982. update_dev_time(rcu_dereference_raw(device->name));
  1983. }
  1984. int btrfs_rm_device(struct btrfs_fs_info *fs_info,
  1985. struct btrfs_dev_lookup_args *args,
  1986. struct file **bdev_file)
  1987. {
  1988. struct btrfs_trans_handle *trans;
  1989. struct btrfs_device *device;
  1990. struct btrfs_fs_devices *cur_devices;
  1991. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  1992. u64 num_devices;
  1993. int ret = 0;
  1994. if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
  1995. btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
  1996. return -EINVAL;
  1997. }
  1998. /*
  1999. * The device list in fs_devices is accessed without locks (neither
  2000. * uuid_mutex nor device_list_mutex) as it won't change on a mounted
  2001. * filesystem and another device rm cannot run.
  2002. */
  2003. num_devices = btrfs_num_devices(fs_info);
  2004. ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
  2005. if (ret)
  2006. return ret;
  2007. device = btrfs_find_device(fs_info->fs_devices, args);
  2008. if (!device) {
  2009. if (args->missing)
  2010. ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
  2011. else
  2012. ret = -ENOENT;
  2013. return ret;
  2014. }
  2015. if (btrfs_pinned_by_swapfile(fs_info, device)) {
  2016. btrfs_warn(fs_info,
  2017. "cannot remove device %s (devid %llu) due to active swapfile",
  2018. btrfs_dev_name(device), device->devid);
  2019. return -ETXTBSY;
  2020. }
  2021. if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
  2022. return BTRFS_ERROR_DEV_TGT_REPLACE;
  2023. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
  2024. fs_info->fs_devices->rw_devices == 1)
  2025. return BTRFS_ERROR_DEV_ONLY_WRITABLE;
  2026. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
  2027. mutex_lock(&fs_info->chunk_mutex);
  2028. list_del_init(&device->dev_alloc_list);
  2029. device->fs_devices->rw_devices--;
  2030. mutex_unlock(&fs_info->chunk_mutex);
  2031. }
  2032. ret = btrfs_shrink_device(device, 0);
  2033. if (ret)
  2034. goto error_undo;
  2035. trans = btrfs_start_transaction(fs_info->chunk_root, 0);
  2036. if (IS_ERR(trans)) {
  2037. ret = PTR_ERR(trans);
  2038. goto error_undo;
  2039. }
  2040. ret = btrfs_rm_dev_item(trans, device);
  2041. if (unlikely(ret)) {
  2042. /* Any error in dev item removal is critical */
  2043. btrfs_crit(fs_info,
  2044. "failed to remove device item for devid %llu: %d",
  2045. device->devid, ret);
  2046. btrfs_abort_transaction(trans, ret);
  2047. btrfs_end_transaction(trans);
  2048. return ret;
  2049. }
  2050. clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
  2051. btrfs_scrub_cancel_dev(device);
  2052. /*
  2053. * the device list mutex makes sure that we don't change
  2054. * the device list while someone else is writing out all
  2055. * the device supers. Whoever is writing all supers, should
  2056. * lock the device list mutex before getting the number of
  2057. * devices in the super block (super_copy). Conversely,
  2058. * whoever updates the number of devices in the super block
  2059. * (super_copy) should hold the device list mutex.
  2060. */
  2061. /*
  2062. * In normal cases the cur_devices == fs_devices. But in case
  2063. * of deleting a seed device, the cur_devices should point to
  2064. * its own fs_devices listed under the fs_devices->seed_list.
  2065. */
  2066. cur_devices = device->fs_devices;
  2067. mutex_lock(&fs_devices->device_list_mutex);
  2068. list_del_rcu(&device->dev_list);
  2069. cur_devices->num_devices--;
  2070. cur_devices->total_devices--;
  2071. /* Update total_devices of the parent fs_devices if it's seed */
  2072. if (cur_devices != fs_devices)
  2073. fs_devices->total_devices--;
  2074. if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
  2075. cur_devices->missing_devices--;
  2076. btrfs_assign_next_active_device(device, NULL);
  2077. if (device->bdev_file) {
  2078. cur_devices->open_devices--;
  2079. /* remove sysfs entry */
  2080. btrfs_sysfs_remove_device(device);
  2081. }
  2082. num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
  2083. btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
  2084. mutex_unlock(&fs_devices->device_list_mutex);
  2085. /*
  2086. * At this point, the device is zero sized and detached from the
  2087. * devices list. All that's left is to zero out the old supers and
  2088. * free the device.
  2089. *
  2090. * We cannot call btrfs_close_bdev() here because we're holding the sb
  2091. * write lock, and bdev_fput() on the block device will pull in the
  2092. * ->open_mutex on the block device and it's dependencies. Instead
  2093. * just flush the device and let the caller do the final bdev_release.
  2094. */
  2095. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
  2096. btrfs_scratch_superblocks(fs_info, device);
  2097. if (device->bdev) {
  2098. sync_blockdev(device->bdev);
  2099. invalidate_bdev(device->bdev);
  2100. }
  2101. }
  2102. *bdev_file = device->bdev_file;
  2103. synchronize_rcu();
  2104. btrfs_free_device(device);
  2105. /*
  2106. * This can happen if cur_devices is the private seed devices list. We
  2107. * cannot call close_fs_devices() here because it expects the uuid_mutex
  2108. * to be held, but in fact we don't need that for the private
  2109. * seed_devices, we can simply decrement cur_devices->opened and then
  2110. * remove it from our list and free the fs_devices.
  2111. */
  2112. if (cur_devices->num_devices == 0) {
  2113. list_del_init(&cur_devices->seed_list);
  2114. ASSERT(cur_devices->opened == 1, "opened=%d", cur_devices->opened);
  2115. cur_devices->opened--;
  2116. free_fs_devices(cur_devices);
  2117. }
  2118. return btrfs_commit_transaction(trans);
  2119. error_undo:
  2120. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
  2121. mutex_lock(&fs_info->chunk_mutex);
  2122. list_add(&device->dev_alloc_list,
  2123. &fs_devices->alloc_list);
  2124. device->fs_devices->rw_devices++;
  2125. mutex_unlock(&fs_info->chunk_mutex);
  2126. }
  2127. return ret;
  2128. }
  2129. void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
  2130. {
  2131. struct btrfs_fs_devices *fs_devices;
  2132. lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
  2133. /*
  2134. * in case of fs with no seed, srcdev->fs_devices will point
  2135. * to fs_devices of fs_info. However when the dev being replaced is
  2136. * a seed dev it will point to the seed's local fs_devices. In short
  2137. * srcdev will have its correct fs_devices in both the cases.
  2138. */
  2139. fs_devices = srcdev->fs_devices;
  2140. list_del_rcu(&srcdev->dev_list);
  2141. list_del(&srcdev->dev_alloc_list);
  2142. fs_devices->num_devices--;
  2143. if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
  2144. fs_devices->missing_devices--;
  2145. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
  2146. fs_devices->rw_devices--;
  2147. if (srcdev->bdev)
  2148. fs_devices->open_devices--;
  2149. }
  2150. void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
  2151. {
  2152. struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
  2153. mutex_lock(&uuid_mutex);
  2154. btrfs_close_bdev(srcdev);
  2155. synchronize_rcu();
  2156. btrfs_free_device(srcdev);
  2157. /* if this is no devs we rather delete the fs_devices */
  2158. if (!fs_devices->num_devices) {
  2159. /*
  2160. * On a mounted FS, num_devices can't be zero unless it's a
  2161. * seed. In case of a seed device being replaced, the replace
  2162. * target added to the sprout FS, so there will be no more
  2163. * device left under the seed FS.
  2164. */
  2165. ASSERT(fs_devices->seeding);
  2166. list_del_init(&fs_devices->seed_list);
  2167. close_fs_devices(fs_devices);
  2168. free_fs_devices(fs_devices);
  2169. }
  2170. mutex_unlock(&uuid_mutex);
  2171. }
  2172. void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
  2173. {
  2174. struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
  2175. mutex_lock(&fs_devices->device_list_mutex);
  2176. btrfs_sysfs_remove_device(tgtdev);
  2177. if (tgtdev->bdev)
  2178. fs_devices->open_devices--;
  2179. fs_devices->num_devices--;
  2180. btrfs_assign_next_active_device(tgtdev, NULL);
  2181. list_del_rcu(&tgtdev->dev_list);
  2182. mutex_unlock(&fs_devices->device_list_mutex);
  2183. btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev);
  2184. btrfs_close_bdev(tgtdev);
  2185. synchronize_rcu();
  2186. btrfs_free_device(tgtdev);
  2187. }
  2188. /*
  2189. * Populate args from device at path.
  2190. *
  2191. * @fs_info: the filesystem
  2192. * @args: the args to populate
  2193. * @path: the path to the device
  2194. *
  2195. * This will read the super block of the device at @path and populate @args with
  2196. * the devid, fsid, and uuid. This is meant to be used for ioctls that need to
  2197. * lookup a device to operate on, but need to do it before we take any locks.
  2198. * This properly handles the special case of "missing" that a user may pass in,
  2199. * and does some basic sanity checks. The caller must make sure that @path is
  2200. * properly NUL terminated before calling in, and must call
  2201. * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
  2202. * uuid buffers.
  2203. *
  2204. * Return: 0 for success, -errno for failure
  2205. */
  2206. int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
  2207. struct btrfs_dev_lookup_args *args,
  2208. const char *path)
  2209. {
  2210. struct btrfs_super_block *disk_super;
  2211. struct file *bdev_file;
  2212. int ret;
  2213. if (!path || !path[0])
  2214. return -EINVAL;
  2215. if (!strcmp(path, "missing")) {
  2216. args->missing = true;
  2217. return 0;
  2218. }
  2219. args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
  2220. args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
  2221. if (!args->uuid || !args->fsid) {
  2222. btrfs_put_dev_args_from_path(args);
  2223. return -ENOMEM;
  2224. }
  2225. ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
  2226. &bdev_file, &disk_super);
  2227. if (ret) {
  2228. btrfs_put_dev_args_from_path(args);
  2229. return ret;
  2230. }
  2231. args->devid = btrfs_stack_device_id(&disk_super->dev_item);
  2232. memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
  2233. if (btrfs_fs_incompat(fs_info, METADATA_UUID))
  2234. memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
  2235. else
  2236. memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
  2237. btrfs_release_disk_super(disk_super);
  2238. bdev_fput(bdev_file);
  2239. return 0;
  2240. }
  2241. /*
  2242. * Only use this jointly with btrfs_get_dev_args_from_path() because we will
  2243. * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
  2244. * that don't need to be freed.
  2245. */
  2246. void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
  2247. {
  2248. kfree(args->uuid);
  2249. kfree(args->fsid);
  2250. args->uuid = NULL;
  2251. args->fsid = NULL;
  2252. }
  2253. struct btrfs_device *btrfs_find_device_by_devspec(
  2254. struct btrfs_fs_info *fs_info, u64 devid,
  2255. const char *device_path)
  2256. {
  2257. BTRFS_DEV_LOOKUP_ARGS(args);
  2258. struct btrfs_device *device;
  2259. int ret;
  2260. if (devid) {
  2261. args.devid = devid;
  2262. device = btrfs_find_device(fs_info->fs_devices, &args);
  2263. if (!device)
  2264. return ERR_PTR(-ENOENT);
  2265. return device;
  2266. }
  2267. ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
  2268. if (ret)
  2269. return ERR_PTR(ret);
  2270. device = btrfs_find_device(fs_info->fs_devices, &args);
  2271. btrfs_put_dev_args_from_path(&args);
  2272. if (!device)
  2273. return ERR_PTR(-ENOENT);
  2274. return device;
  2275. }
  2276. static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
  2277. {
  2278. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  2279. struct btrfs_fs_devices *old_devices;
  2280. struct btrfs_fs_devices *seed_devices;
  2281. lockdep_assert_held(&uuid_mutex);
  2282. if (!fs_devices->seeding)
  2283. return ERR_PTR(-EINVAL);
  2284. /*
  2285. * Private copy of the seed devices, anchored at
  2286. * fs_info->fs_devices->seed_list
  2287. */
  2288. seed_devices = alloc_fs_devices(NULL);
  2289. if (IS_ERR(seed_devices))
  2290. return seed_devices;
  2291. /*
  2292. * It's necessary to retain a copy of the original seed fs_devices in
  2293. * fs_uuids so that filesystems which have been seeded can successfully
  2294. * reference the seed device from open_seed_devices. This also supports
  2295. * multiple fs seed.
  2296. */
  2297. old_devices = clone_fs_devices(fs_devices);
  2298. if (IS_ERR(old_devices)) {
  2299. kfree(seed_devices);
  2300. return old_devices;
  2301. }
  2302. list_add(&old_devices->fs_list, &fs_uuids);
  2303. memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
  2304. seed_devices->opened = 1;
  2305. INIT_LIST_HEAD(&seed_devices->devices);
  2306. INIT_LIST_HEAD(&seed_devices->alloc_list);
  2307. mutex_init(&seed_devices->device_list_mutex);
  2308. return seed_devices;
  2309. }
  2310. /*
  2311. * Splice seed devices into the sprout fs_devices.
  2312. * Generate a new fsid for the sprouted read-write filesystem.
  2313. */
  2314. static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
  2315. struct btrfs_fs_devices *seed_devices)
  2316. {
  2317. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  2318. struct btrfs_super_block *disk_super = fs_info->super_copy;
  2319. struct btrfs_device *device;
  2320. u64 super_flags;
  2321. /*
  2322. * We are updating the fsid, the thread leading to device_list_add()
  2323. * could race, so uuid_mutex is needed.
  2324. */
  2325. lockdep_assert_held(&uuid_mutex);
  2326. /*
  2327. * The threads listed below may traverse dev_list but can do that without
  2328. * device_list_mutex:
  2329. * - All device ops and balance - as we are in btrfs_exclop_start.
  2330. * - Various dev_list readers - are using RCU.
  2331. * - btrfs_ioctl_fitrim() - is using RCU.
  2332. *
  2333. * For-read threads as below are using device_list_mutex:
  2334. * - Readonly scrub btrfs_scrub_dev()
  2335. * - Readonly scrub btrfs_scrub_progress()
  2336. * - btrfs_get_dev_stats()
  2337. */
  2338. lockdep_assert_held(&fs_devices->device_list_mutex);
  2339. list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
  2340. synchronize_rcu);
  2341. list_for_each_entry(device, &seed_devices->devices, dev_list)
  2342. device->fs_devices = seed_devices;
  2343. fs_devices->seeding = false;
  2344. fs_devices->num_devices = 0;
  2345. fs_devices->open_devices = 0;
  2346. fs_devices->missing_devices = 0;
  2347. fs_devices->rotating = false;
  2348. list_add(&seed_devices->seed_list, &fs_devices->seed_list);
  2349. generate_random_uuid(fs_devices->fsid);
  2350. memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
  2351. memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
  2352. super_flags = btrfs_super_flags(disk_super) &
  2353. ~BTRFS_SUPER_FLAG_SEEDING;
  2354. btrfs_set_super_flags(disk_super, super_flags);
  2355. }
  2356. /*
  2357. * Store the expected generation for seed devices in device items.
  2358. */
  2359. static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
  2360. {
  2361. BTRFS_DEV_LOOKUP_ARGS(args);
  2362. struct btrfs_fs_info *fs_info = trans->fs_info;
  2363. struct btrfs_root *root = fs_info->chunk_root;
  2364. BTRFS_PATH_AUTO_FREE(path);
  2365. struct extent_buffer *leaf;
  2366. struct btrfs_dev_item *dev_item;
  2367. struct btrfs_device *device;
  2368. struct btrfs_key key;
  2369. u8 fs_uuid[BTRFS_FSID_SIZE];
  2370. u8 dev_uuid[BTRFS_UUID_SIZE];
  2371. int ret;
  2372. path = btrfs_alloc_path();
  2373. if (!path)
  2374. return -ENOMEM;
  2375. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  2376. key.type = BTRFS_DEV_ITEM_KEY;
  2377. key.offset = 0;
  2378. while (1) {
  2379. btrfs_reserve_chunk_metadata(trans, false);
  2380. ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
  2381. btrfs_trans_release_chunk_metadata(trans);
  2382. if (ret < 0)
  2383. return ret;
  2384. leaf = path->nodes[0];
  2385. next_slot:
  2386. if (path->slots[0] >= btrfs_header_nritems(leaf)) {
  2387. ret = btrfs_next_leaf(root, path);
  2388. if (ret > 0)
  2389. break;
  2390. if (ret < 0)
  2391. return ret;
  2392. leaf = path->nodes[0];
  2393. btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
  2394. btrfs_release_path(path);
  2395. continue;
  2396. }
  2397. btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
  2398. if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
  2399. key.type != BTRFS_DEV_ITEM_KEY)
  2400. break;
  2401. dev_item = btrfs_item_ptr(leaf, path->slots[0],
  2402. struct btrfs_dev_item);
  2403. args.devid = btrfs_device_id(leaf, dev_item);
  2404. read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
  2405. BTRFS_UUID_SIZE);
  2406. read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
  2407. BTRFS_FSID_SIZE);
  2408. args.uuid = dev_uuid;
  2409. args.fsid = fs_uuid;
  2410. device = btrfs_find_device(fs_info->fs_devices, &args);
  2411. BUG_ON(!device); /* Logic error */
  2412. if (device->fs_devices->seeding)
  2413. btrfs_set_device_generation(leaf, dev_item,
  2414. device->generation);
  2415. path->slots[0]++;
  2416. goto next_slot;
  2417. }
  2418. return 0;
  2419. }
  2420. int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
  2421. {
  2422. struct btrfs_root *root = fs_info->dev_root;
  2423. struct btrfs_trans_handle *trans;
  2424. struct btrfs_device *device;
  2425. struct file *bdev_file;
  2426. struct super_block *sb = fs_info->sb;
  2427. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  2428. struct btrfs_fs_devices *seed_devices = NULL;
  2429. u64 orig_super_total_bytes;
  2430. u64 orig_super_num_devices;
  2431. int ret = 0;
  2432. bool seeding_dev = false;
  2433. bool locked = false;
  2434. if (sb_rdonly(sb) && !fs_devices->seeding)
  2435. return -EROFS;
  2436. bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
  2437. fs_info->sb, &fs_holder_ops);
  2438. if (IS_ERR(bdev_file))
  2439. return PTR_ERR(bdev_file);
  2440. if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
  2441. ret = -EINVAL;
  2442. goto error;
  2443. }
  2444. if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) {
  2445. ret = -EINVAL;
  2446. goto error;
  2447. }
  2448. if (fs_devices->seeding) {
  2449. seeding_dev = true;
  2450. down_write(&sb->s_umount);
  2451. mutex_lock(&uuid_mutex);
  2452. locked = true;
  2453. }
  2454. sync_blockdev(file_bdev(bdev_file));
  2455. rcu_read_lock();
  2456. list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
  2457. if (device->bdev == file_bdev(bdev_file)) {
  2458. ret = -EEXIST;
  2459. rcu_read_unlock();
  2460. goto error;
  2461. }
  2462. }
  2463. rcu_read_unlock();
  2464. device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
  2465. if (IS_ERR(device)) {
  2466. /* we can safely leave the fs_devices entry around */
  2467. ret = PTR_ERR(device);
  2468. goto error;
  2469. }
  2470. device->fs_info = fs_info;
  2471. device->bdev_file = bdev_file;
  2472. device->bdev = file_bdev(bdev_file);
  2473. ret = lookup_bdev(device_path, &device->devt);
  2474. if (ret)
  2475. goto error_free_device;
  2476. ret = btrfs_get_dev_zone_info(device, false);
  2477. if (ret)
  2478. goto error_free_device;
  2479. trans = btrfs_start_transaction(root, 0);
  2480. if (IS_ERR(trans)) {
  2481. ret = PTR_ERR(trans);
  2482. goto error_free_zone;
  2483. }
  2484. set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
  2485. device->generation = trans->transid;
  2486. device->io_width = fs_info->sectorsize;
  2487. device->io_align = fs_info->sectorsize;
  2488. device->sector_size = fs_info->sectorsize;
  2489. device->total_bytes =
  2490. round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
  2491. device->disk_total_bytes = device->total_bytes;
  2492. device->commit_total_bytes = device->total_bytes;
  2493. set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
  2494. clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
  2495. device->dev_stats_valid = 1;
  2496. set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
  2497. if (seeding_dev) {
  2498. /* GFP_KERNEL allocation must not be under device_list_mutex */
  2499. seed_devices = btrfs_init_sprout(fs_info);
  2500. if (IS_ERR(seed_devices)) {
  2501. ret = PTR_ERR(seed_devices);
  2502. btrfs_abort_transaction(trans, ret);
  2503. goto error_trans;
  2504. }
  2505. }
  2506. mutex_lock(&fs_devices->device_list_mutex);
  2507. if (seeding_dev) {
  2508. btrfs_setup_sprout(fs_info, seed_devices);
  2509. btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
  2510. device);
  2511. }
  2512. device->fs_devices = fs_devices;
  2513. mutex_lock(&fs_info->chunk_mutex);
  2514. list_add_rcu(&device->dev_list, &fs_devices->devices);
  2515. list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
  2516. fs_devices->num_devices++;
  2517. fs_devices->open_devices++;
  2518. fs_devices->rw_devices++;
  2519. fs_devices->total_devices++;
  2520. fs_devices->total_rw_bytes += device->total_bytes;
  2521. atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
  2522. if (!bdev_nonrot(device->bdev))
  2523. fs_devices->rotating = true;
  2524. orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
  2525. btrfs_set_super_total_bytes(fs_info->super_copy,
  2526. round_down(orig_super_total_bytes + device->total_bytes,
  2527. fs_info->sectorsize));
  2528. orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
  2529. btrfs_set_super_num_devices(fs_info->super_copy,
  2530. orig_super_num_devices + 1);
  2531. /*
  2532. * we've got more storage, clear any full flags on the space
  2533. * infos
  2534. */
  2535. btrfs_clear_space_info_full(fs_info);
  2536. mutex_unlock(&fs_info->chunk_mutex);
  2537. /* Add sysfs device entry */
  2538. btrfs_sysfs_add_device(device);
  2539. mutex_unlock(&fs_devices->device_list_mutex);
  2540. if (seeding_dev) {
  2541. mutex_lock(&fs_info->chunk_mutex);
  2542. ret = init_first_rw_device(trans);
  2543. mutex_unlock(&fs_info->chunk_mutex);
  2544. if (unlikely(ret)) {
  2545. btrfs_abort_transaction(trans, ret);
  2546. goto error_sysfs;
  2547. }
  2548. }
  2549. ret = btrfs_add_dev_item(trans, device);
  2550. if (unlikely(ret)) {
  2551. btrfs_abort_transaction(trans, ret);
  2552. goto error_sysfs;
  2553. }
  2554. if (seeding_dev) {
  2555. ret = btrfs_finish_sprout(trans);
  2556. if (unlikely(ret)) {
  2557. btrfs_abort_transaction(trans, ret);
  2558. goto error_sysfs;
  2559. }
  2560. /*
  2561. * fs_devices now represents the newly sprouted filesystem and
  2562. * its fsid has been changed by btrfs_sprout_splice().
  2563. */
  2564. btrfs_sysfs_update_sprout_fsid(fs_devices);
  2565. }
  2566. ret = btrfs_commit_transaction(trans);
  2567. if (seeding_dev) {
  2568. mutex_unlock(&uuid_mutex);
  2569. up_write(&sb->s_umount);
  2570. locked = false;
  2571. if (ret) /* transaction commit */
  2572. return ret;
  2573. ret = btrfs_relocate_sys_chunks(fs_info);
  2574. if (ret < 0)
  2575. btrfs_handle_fs_error(fs_info, ret,
  2576. "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
  2577. trans = btrfs_attach_transaction(root);
  2578. if (IS_ERR(trans)) {
  2579. if (PTR_ERR(trans) == -ENOENT)
  2580. return 0;
  2581. ret = PTR_ERR(trans);
  2582. trans = NULL;
  2583. goto error_sysfs;
  2584. }
  2585. ret = btrfs_commit_transaction(trans);
  2586. }
  2587. /*
  2588. * Now that we have written a new super block to this device, check all
  2589. * other fs_devices list if device_path alienates any other scanned
  2590. * device.
  2591. * We can ignore the return value as it typically returns -EINVAL and
  2592. * only succeeds if the device was an alien.
  2593. */
  2594. btrfs_forget_devices(device->devt);
  2595. /* Update ctime/mtime for blkid or udev */
  2596. update_dev_time(device_path);
  2597. return ret;
  2598. error_sysfs:
  2599. btrfs_sysfs_remove_device(device);
  2600. mutex_lock(&fs_info->fs_devices->device_list_mutex);
  2601. mutex_lock(&fs_info->chunk_mutex);
  2602. list_del_rcu(&device->dev_list);
  2603. list_del(&device->dev_alloc_list);
  2604. fs_info->fs_devices->num_devices--;
  2605. fs_info->fs_devices->open_devices--;
  2606. fs_info->fs_devices->rw_devices--;
  2607. fs_info->fs_devices->total_devices--;
  2608. fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
  2609. atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
  2610. btrfs_set_super_total_bytes(fs_info->super_copy,
  2611. orig_super_total_bytes);
  2612. btrfs_set_super_num_devices(fs_info->super_copy,
  2613. orig_super_num_devices);
  2614. mutex_unlock(&fs_info->chunk_mutex);
  2615. mutex_unlock(&fs_info->fs_devices->device_list_mutex);
  2616. error_trans:
  2617. if (trans)
  2618. btrfs_end_transaction(trans);
  2619. error_free_zone:
  2620. btrfs_destroy_dev_zone_info(device);
  2621. error_free_device:
  2622. btrfs_free_device(device);
  2623. error:
  2624. bdev_fput(bdev_file);
  2625. if (locked) {
  2626. mutex_unlock(&uuid_mutex);
  2627. up_write(&sb->s_umount);
  2628. }
  2629. return ret;
  2630. }
  2631. int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device)
  2632. {
  2633. int ret;
  2634. BTRFS_PATH_AUTO_FREE(path);
  2635. struct btrfs_root *root = device->fs_info->chunk_root;
  2636. struct btrfs_dev_item *dev_item;
  2637. struct extent_buffer *leaf;
  2638. struct btrfs_key key;
  2639. path = btrfs_alloc_path();
  2640. if (!path)
  2641. return -ENOMEM;
  2642. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  2643. key.type = BTRFS_DEV_ITEM_KEY;
  2644. key.offset = device->devid;
  2645. ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
  2646. if (ret < 0)
  2647. return ret;
  2648. if (ret > 0)
  2649. return -ENOENT;
  2650. leaf = path->nodes[0];
  2651. dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
  2652. btrfs_set_device_id(leaf, dev_item, device->devid);
  2653. btrfs_set_device_type(leaf, dev_item, device->type);
  2654. btrfs_set_device_io_align(leaf, dev_item, device->io_align);
  2655. btrfs_set_device_io_width(leaf, dev_item, device->io_width);
  2656. btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
  2657. btrfs_set_device_total_bytes(leaf, dev_item,
  2658. btrfs_device_get_disk_total_bytes(device));
  2659. btrfs_set_device_bytes_used(leaf, dev_item,
  2660. btrfs_device_get_bytes_used(device));
  2661. return ret;
  2662. }
  2663. int btrfs_grow_device(struct btrfs_trans_handle *trans,
  2664. struct btrfs_device *device, u64 new_size)
  2665. {
  2666. struct btrfs_fs_info *fs_info = device->fs_info;
  2667. struct btrfs_super_block *super_copy = fs_info->super_copy;
  2668. u64 old_total;
  2669. u64 diff;
  2670. int ret;
  2671. if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
  2672. return -EACCES;
  2673. new_size = round_down(new_size, fs_info->sectorsize);
  2674. mutex_lock(&fs_info->chunk_mutex);
  2675. old_total = btrfs_super_total_bytes(super_copy);
  2676. diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
  2677. if (new_size <= device->total_bytes ||
  2678. test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
  2679. mutex_unlock(&fs_info->chunk_mutex);
  2680. return -EINVAL;
  2681. }
  2682. btrfs_set_super_total_bytes(super_copy,
  2683. round_down(old_total + diff, fs_info->sectorsize));
  2684. device->fs_devices->total_rw_bytes += diff;
  2685. atomic64_add(diff, &fs_info->free_chunk_space);
  2686. btrfs_device_set_total_bytes(device, new_size);
  2687. btrfs_device_set_disk_total_bytes(device, new_size);
  2688. btrfs_clear_space_info_full(device->fs_info);
  2689. if (list_empty(&device->post_commit_list))
  2690. list_add_tail(&device->post_commit_list,
  2691. &trans->transaction->dev_update_list);
  2692. mutex_unlock(&fs_info->chunk_mutex);
  2693. btrfs_reserve_chunk_metadata(trans, false);
  2694. ret = btrfs_update_device(trans, device);
  2695. btrfs_trans_release_chunk_metadata(trans);
  2696. return ret;
  2697. }
  2698. static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
  2699. {
  2700. struct btrfs_fs_info *fs_info = trans->fs_info;
  2701. struct btrfs_root *root = fs_info->chunk_root;
  2702. int ret;
  2703. BTRFS_PATH_AUTO_FREE(path);
  2704. struct btrfs_key key;
  2705. path = btrfs_alloc_path();
  2706. if (!path)
  2707. return -ENOMEM;
  2708. key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
  2709. key.type = BTRFS_CHUNK_ITEM_KEY;
  2710. key.offset = chunk_offset;
  2711. ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
  2712. if (ret < 0)
  2713. return ret;
  2714. if (unlikely(ret > 0)) {
  2715. /* Logic error or corruption */
  2716. btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
  2717. chunk_offset);
  2718. btrfs_abort_transaction(trans, -ENOENT);
  2719. return -EUCLEAN;
  2720. }
  2721. ret = btrfs_del_item(trans, root, path);
  2722. if (unlikely(ret < 0)) {
  2723. btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
  2724. btrfs_abort_transaction(trans, ret);
  2725. return ret;
  2726. }
  2727. return ret;
  2728. }
  2729. static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
  2730. {
  2731. struct btrfs_super_block *super_copy = fs_info->super_copy;
  2732. struct btrfs_disk_key *disk_key;
  2733. struct btrfs_chunk *chunk;
  2734. u8 *ptr;
  2735. int ret = 0;
  2736. u32 num_stripes;
  2737. u32 array_size;
  2738. u32 len = 0;
  2739. u32 cur;
  2740. struct btrfs_key key;
  2741. lockdep_assert_held(&fs_info->chunk_mutex);
  2742. array_size = btrfs_super_sys_array_size(super_copy);
  2743. ptr = super_copy->sys_chunk_array;
  2744. cur = 0;
  2745. while (cur < array_size) {
  2746. disk_key = (struct btrfs_disk_key *)ptr;
  2747. btrfs_disk_key_to_cpu(&key, disk_key);
  2748. len = sizeof(*disk_key);
  2749. if (key.type == BTRFS_CHUNK_ITEM_KEY) {
  2750. chunk = (struct btrfs_chunk *)(ptr + len);
  2751. num_stripes = btrfs_stack_chunk_num_stripes(chunk);
  2752. len += btrfs_chunk_item_size(num_stripes);
  2753. } else {
  2754. ret = -EIO;
  2755. break;
  2756. }
  2757. if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
  2758. key.offset == chunk_offset) {
  2759. memmove(ptr, ptr + len, array_size - (cur + len));
  2760. array_size -= len;
  2761. btrfs_set_super_sys_array_size(super_copy, array_size);
  2762. } else {
  2763. ptr += len;
  2764. cur += len;
  2765. }
  2766. }
  2767. return ret;
  2768. }
  2769. struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
  2770. u64 logical, u64 length)
  2771. {
  2772. struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
  2773. struct rb_node *prev = NULL;
  2774. struct rb_node *orig_prev;
  2775. struct btrfs_chunk_map *map;
  2776. struct btrfs_chunk_map *prev_map = NULL;
  2777. while (node) {
  2778. map = rb_entry(node, struct btrfs_chunk_map, rb_node);
  2779. prev = node;
  2780. prev_map = map;
  2781. if (logical < map->start) {
  2782. node = node->rb_left;
  2783. } else if (logical >= map->start + map->chunk_len) {
  2784. node = node->rb_right;
  2785. } else {
  2786. refcount_inc(&map->refs);
  2787. return map;
  2788. }
  2789. }
  2790. if (!prev)
  2791. return NULL;
  2792. orig_prev = prev;
  2793. while (prev && logical >= prev_map->start + prev_map->chunk_len) {
  2794. prev = rb_next(prev);
  2795. prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
  2796. }
  2797. if (!prev) {
  2798. prev = orig_prev;
  2799. prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
  2800. while (prev && logical < prev_map->start) {
  2801. prev = rb_prev(prev);
  2802. prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
  2803. }
  2804. }
  2805. if (prev) {
  2806. u64 end = logical + length;
  2807. /*
  2808. * Caller can pass a U64_MAX length when it wants to get any
  2809. * chunk starting at an offset of 'logical' or higher, so deal
  2810. * with underflow by resetting the end offset to U64_MAX.
  2811. */
  2812. if (end < logical)
  2813. end = U64_MAX;
  2814. if (end > prev_map->start &&
  2815. logical < prev_map->start + prev_map->chunk_len) {
  2816. refcount_inc(&prev_map->refs);
  2817. return prev_map;
  2818. }
  2819. }
  2820. return NULL;
  2821. }
  2822. struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
  2823. u64 logical, u64 length)
  2824. {
  2825. struct btrfs_chunk_map *map;
  2826. read_lock(&fs_info->mapping_tree_lock);
  2827. map = btrfs_find_chunk_map_nolock(fs_info, logical, length);
  2828. read_unlock(&fs_info->mapping_tree_lock);
  2829. return map;
  2830. }
  2831. /*
  2832. * Find the mapping containing the given logical extent.
  2833. *
  2834. * @logical: Logical block offset in bytes.
  2835. * @length: Length of extent in bytes.
  2836. *
  2837. * Return: Chunk mapping or ERR_PTR.
  2838. */
  2839. struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
  2840. u64 logical, u64 length)
  2841. {
  2842. struct btrfs_chunk_map *map;
  2843. map = btrfs_find_chunk_map(fs_info, logical, length);
  2844. if (unlikely(!map)) {
  2845. btrfs_crit(fs_info,
  2846. "unable to find chunk map for logical %llu length %llu",
  2847. logical, length);
  2848. return ERR_PTR(-EINVAL);
  2849. }
  2850. if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
  2851. btrfs_crit(fs_info,
  2852. "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
  2853. logical, logical + length, map->start,
  2854. map->start + map->chunk_len);
  2855. btrfs_free_chunk_map(map);
  2856. return ERR_PTR(-EINVAL);
  2857. }
  2858. /* Callers are responsible for dropping the reference. */
  2859. return map;
  2860. }
  2861. static int remove_chunk_item(struct btrfs_trans_handle *trans,
  2862. struct btrfs_chunk_map *map, u64 chunk_offset)
  2863. {
  2864. int i;
  2865. /*
  2866. * Removing chunk items and updating the device items in the chunks btree
  2867. * requires holding the chunk_mutex.
  2868. * See the comment at btrfs_chunk_alloc() for the details.
  2869. */
  2870. lockdep_assert_held(&trans->fs_info->chunk_mutex);
  2871. for (i = 0; i < map->num_stripes; i++) {
  2872. int ret;
  2873. ret = btrfs_update_device(trans, map->stripes[i].dev);
  2874. if (ret)
  2875. return ret;
  2876. }
  2877. return btrfs_free_chunk(trans, chunk_offset);
  2878. }
  2879. int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map)
  2880. {
  2881. struct btrfs_fs_info *fs_info = trans->fs_info;
  2882. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  2883. u64 dev_extent_len = 0;
  2884. int i, ret = 0;
  2885. /*
  2886. * First delete the device extent items from the devices btree.
  2887. * We take the device_list_mutex to avoid racing with the finishing phase
  2888. * of a device replace operation. See the comment below before acquiring
  2889. * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
  2890. * because that can result in a deadlock when deleting the device extent
  2891. * items from the devices btree - COWing an extent buffer from the btree
  2892. * may result in allocating a new metadata chunk, which would attempt to
  2893. * lock again fs_info->chunk_mutex.
  2894. */
  2895. mutex_lock(&fs_devices->device_list_mutex);
  2896. for (i = 0; i < map->num_stripes; i++) {
  2897. struct btrfs_device *device = map->stripes[i].dev;
  2898. ret = btrfs_free_dev_extent(trans, device,
  2899. map->stripes[i].physical,
  2900. &dev_extent_len);
  2901. if (unlikely(ret)) {
  2902. mutex_unlock(&fs_devices->device_list_mutex);
  2903. btrfs_abort_transaction(trans, ret);
  2904. return ret;
  2905. }
  2906. if (device->bytes_used > 0) {
  2907. mutex_lock(&fs_info->chunk_mutex);
  2908. btrfs_device_set_bytes_used(device,
  2909. device->bytes_used - dev_extent_len);
  2910. atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
  2911. btrfs_clear_space_info_full(fs_info);
  2912. if (list_empty(&device->post_commit_list)) {
  2913. list_add_tail(&device->post_commit_list,
  2914. &trans->transaction->dev_update_list);
  2915. }
  2916. mutex_unlock(&fs_info->chunk_mutex);
  2917. }
  2918. }
  2919. mutex_unlock(&fs_devices->device_list_mutex);
  2920. return 0;
  2921. }
  2922. int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
  2923. {
  2924. struct btrfs_fs_info *fs_info = trans->fs_info;
  2925. struct btrfs_chunk_map *map;
  2926. int ret;
  2927. map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
  2928. if (IS_ERR(map)) {
  2929. DEBUG_WARN("errr %ld reading chunk map at offset %llu",
  2930. PTR_ERR(map), chunk_offset);
  2931. return PTR_ERR(map);
  2932. }
  2933. ret = btrfs_remove_dev_extents(trans, map);
  2934. if (ret)
  2935. goto out;
  2936. /*
  2937. * We acquire fs_info->chunk_mutex for 2 reasons:
  2938. *
  2939. * 1) Just like with the first phase of the chunk allocation, we must
  2940. * reserve system space, do all chunk btree updates and deletions, and
  2941. * update the system chunk array in the superblock while holding this
  2942. * mutex. This is for similar reasons as explained on the comment at
  2943. * the top of btrfs_chunk_alloc();
  2944. *
  2945. * 2) Prevent races with the final phase of a device replace operation
  2946. * that replaces the device object associated with the map's stripes,
  2947. * because the device object's id can change at any time during that
  2948. * final phase of the device replace operation
  2949. * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
  2950. * replaced device and then see it with an ID of
  2951. * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
  2952. * the device item, which does not exists on the chunk btree.
  2953. * The finishing phase of device replace acquires both the
  2954. * device_list_mutex and the chunk_mutex, in that order, so we are
  2955. * safe by just acquiring the chunk_mutex.
  2956. */
  2957. trans->removing_chunk = true;
  2958. mutex_lock(&fs_info->chunk_mutex);
  2959. check_system_chunk(trans, map->type);
  2960. ret = remove_chunk_item(trans, map, chunk_offset);
  2961. /*
  2962. * Normally we should not get -ENOSPC since we reserved space before
  2963. * through the call to check_system_chunk().
  2964. *
  2965. * Despite our system space_info having enough free space, we may not
  2966. * be able to allocate extents from its block groups, because all have
  2967. * an incompatible profile, which will force us to allocate a new system
  2968. * block group with the right profile, or right after we called
  2969. * check_system_space() above, a scrub turned the only system block group
  2970. * with enough free space into RO mode.
  2971. * This is explained with more detail at do_chunk_alloc().
  2972. *
  2973. * So if we get -ENOSPC, allocate a new system chunk and retry once.
  2974. */
  2975. if (ret == -ENOSPC) {
  2976. const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
  2977. struct btrfs_block_group *sys_bg;
  2978. struct btrfs_space_info *space_info;
  2979. space_info = btrfs_find_space_info(fs_info, sys_flags);
  2980. if (unlikely(!space_info)) {
  2981. ret = -EINVAL;
  2982. btrfs_abort_transaction(trans, ret);
  2983. goto out;
  2984. }
  2985. sys_bg = btrfs_create_chunk(trans, space_info, sys_flags);
  2986. if (IS_ERR(sys_bg)) {
  2987. ret = PTR_ERR(sys_bg);
  2988. btrfs_abort_transaction(trans, ret);
  2989. goto out;
  2990. }
  2991. ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
  2992. if (unlikely(ret)) {
  2993. btrfs_abort_transaction(trans, ret);
  2994. goto out;
  2995. }
  2996. ret = remove_chunk_item(trans, map, chunk_offset);
  2997. if (unlikely(ret)) {
  2998. btrfs_abort_transaction(trans, ret);
  2999. goto out;
  3000. }
  3001. } else if (unlikely(ret)) {
  3002. btrfs_abort_transaction(trans, ret);
  3003. goto out;
  3004. }
  3005. trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);
  3006. if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
  3007. ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
  3008. if (unlikely(ret)) {
  3009. btrfs_abort_transaction(trans, ret);
  3010. goto out;
  3011. }
  3012. }
  3013. mutex_unlock(&fs_info->chunk_mutex);
  3014. trans->removing_chunk = false;
  3015. /*
  3016. * We are done with chunk btree updates and deletions, so release the
  3017. * system space we previously reserved (with check_system_chunk()).
  3018. */
  3019. btrfs_trans_release_chunk_metadata(trans);
  3020. /* On error, btrfs_remove_block_group() aborts the transaction. */
  3021. ret = btrfs_remove_block_group(trans, map);
  3022. if (unlikely(ret))
  3023. ASSERT(BTRFS_FS_ERROR(fs_info) != 0);
  3024. out:
  3025. if (trans->removing_chunk) {
  3026. mutex_unlock(&fs_info->chunk_mutex);
  3027. trans->removing_chunk = false;
  3028. }
  3029. /* once for us */
  3030. btrfs_free_chunk_map(map);
  3031. return ret;
  3032. }
  3033. static int btrfs_relocate_chunk_finish(struct btrfs_fs_info *fs_info,
  3034. struct btrfs_block_group *bg)
  3035. {
  3036. struct btrfs_root *root = fs_info->chunk_root;
  3037. struct btrfs_trans_handle *trans;
  3038. u64 length;
  3039. int ret;
  3040. btrfs_discard_cancel_work(&fs_info->discard_ctl, bg);
  3041. length = bg->length;
  3042. btrfs_put_block_group(bg);
  3043. /*
  3044. * On a zoned file system, discard the whole block group, this will
  3045. * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
  3046. * resetting the zone fails, don't treat it as a fatal problem from the
  3047. * filesystem's point of view.
  3048. */
  3049. if (btrfs_is_zoned(fs_info)) {
  3050. ret = btrfs_discard_extent(fs_info, bg->start, length, NULL, true);
  3051. if (ret)
  3052. btrfs_info(fs_info, "failed to reset zone %llu after relocation",
  3053. bg->start);
  3054. }
  3055. trans = btrfs_start_trans_remove_block_group(root->fs_info, bg->start);
  3056. if (IS_ERR(trans)) {
  3057. ret = PTR_ERR(trans);
  3058. btrfs_handle_fs_error(root->fs_info, ret, NULL);
  3059. return ret;
  3060. }
  3061. /* Step two, delete the device extents and the chunk tree entries. */
  3062. ret = btrfs_remove_chunk(trans, bg->start);
  3063. btrfs_end_transaction(trans);
  3064. return ret;
  3065. }
  3066. int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, bool verbose)
  3067. {
  3068. struct btrfs_block_group *block_group;
  3069. int ret;
  3070. if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
  3071. btrfs_err(fs_info,
  3072. "relocate: not supported on extent tree v2 yet");
  3073. return -EINVAL;
  3074. }
  3075. /*
  3076. * Prevent races with automatic removal of unused block groups.
  3077. * After we relocate and before we remove the chunk with offset
  3078. * chunk_offset, automatic removal of the block group can kick in,
  3079. * resulting in a failure when calling btrfs_remove_chunk() below.
  3080. *
  3081. * Make sure to acquire this mutex before doing a tree search (dev
  3082. * or chunk trees) to find chunks. Otherwise the cleaner kthread might
  3083. * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
  3084. * we release the path used to search the chunk/dev tree and before
  3085. * the current task acquires this mutex and calls us.
  3086. */
  3087. lockdep_assert_held(&fs_info->reclaim_bgs_lock);
  3088. /* step one, relocate all the extents inside this chunk */
  3089. btrfs_scrub_pause(fs_info);
  3090. ret = btrfs_relocate_block_group(fs_info, chunk_offset, verbose);
  3091. btrfs_scrub_continue(fs_info);
  3092. if (ret) {
  3093. /*
  3094. * If we had a transaction abort, stop all running scrubs.
  3095. * See transaction.c:cleanup_transaction() why we do it here.
  3096. */
  3097. if (BTRFS_FS_ERROR(fs_info))
  3098. btrfs_scrub_cancel(fs_info);
  3099. return ret;
  3100. }
  3101. block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
  3102. if (!block_group)
  3103. return -ENOENT;
  3104. if (should_relocate_using_remap_tree(block_group)) {
  3105. /* If we're relocating using the remap tree we're now done. */
  3106. btrfs_put_block_group(block_group);
  3107. ret = 0;
  3108. } else {
  3109. ret = btrfs_relocate_chunk_finish(fs_info, block_group);
  3110. }
  3111. return ret;
  3112. }
  3113. static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
  3114. {
  3115. struct btrfs_root *chunk_root = fs_info->chunk_root;
  3116. BTRFS_PATH_AUTO_FREE(path);
  3117. struct extent_buffer *leaf;
  3118. struct btrfs_chunk *chunk;
  3119. struct btrfs_key key;
  3120. struct btrfs_key found_key;
  3121. u64 chunk_type;
  3122. bool retried = false;
  3123. int failed = 0;
  3124. int ret;
  3125. path = btrfs_alloc_path();
  3126. if (!path)
  3127. return -ENOMEM;
  3128. again:
  3129. key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
  3130. key.type = BTRFS_CHUNK_ITEM_KEY;
  3131. key.offset = (u64)-1;
  3132. while (1) {
  3133. mutex_lock(&fs_info->reclaim_bgs_lock);
  3134. ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
  3135. if (ret < 0) {
  3136. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3137. return ret;
  3138. }
  3139. if (unlikely(ret == 0)) {
  3140. /*
  3141. * On the first search we would find chunk tree with
  3142. * offset -1, which is not possible. On subsequent
  3143. * loops this would find an existing item on an invalid
  3144. * offset (one less than the previous one, wrong
  3145. * alignment and size).
  3146. */
  3147. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3148. return -EUCLEAN;
  3149. }
  3150. ret = btrfs_previous_item(chunk_root, path, key.objectid,
  3151. key.type);
  3152. if (ret)
  3153. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3154. if (ret < 0)
  3155. return ret;
  3156. if (ret > 0)
  3157. break;
  3158. leaf = path->nodes[0];
  3159. btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
  3160. chunk = btrfs_item_ptr(leaf, path->slots[0],
  3161. struct btrfs_chunk);
  3162. chunk_type = btrfs_chunk_type(leaf, chunk);
  3163. btrfs_release_path(path);
  3164. if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
  3165. ret = btrfs_relocate_chunk(fs_info, found_key.offset,
  3166. true);
  3167. if (ret == -ENOSPC)
  3168. failed++;
  3169. else
  3170. BUG_ON(ret);
  3171. }
  3172. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3173. if (found_key.offset == 0)
  3174. break;
  3175. key.offset = found_key.offset - 1;
  3176. }
  3177. ret = 0;
  3178. if (failed && !retried) {
  3179. failed = 0;
  3180. retried = true;
  3181. goto again;
  3182. } else if (WARN_ON(failed && retried)) {
  3183. ret = -ENOSPC;
  3184. }
  3185. return ret;
  3186. }
  3187. /*
  3188. * return 1 : allocate a data chunk successfully,
  3189. * return <0: errors during allocating a data chunk,
  3190. * return 0 : no need to allocate a data chunk.
  3191. */
  3192. static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
  3193. u64 chunk_offset)
  3194. {
  3195. struct btrfs_block_group *cache;
  3196. u64 bytes_used;
  3197. u64 chunk_type;
  3198. cache = btrfs_lookup_block_group(fs_info, chunk_offset);
  3199. ASSERT(cache);
  3200. chunk_type = cache->flags;
  3201. btrfs_put_block_group(cache);
  3202. if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
  3203. return 0;
  3204. spin_lock(&fs_info->data_sinfo->lock);
  3205. bytes_used = fs_info->data_sinfo->bytes_used;
  3206. spin_unlock(&fs_info->data_sinfo->lock);
  3207. if (!bytes_used) {
  3208. struct btrfs_trans_handle *trans;
  3209. int ret;
  3210. trans = btrfs_join_transaction(fs_info->tree_root);
  3211. if (IS_ERR(trans))
  3212. return PTR_ERR(trans);
  3213. ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
  3214. btrfs_end_transaction(trans);
  3215. if (ret < 0)
  3216. return ret;
  3217. return 1;
  3218. }
  3219. return 0;
  3220. }
  3221. static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
  3222. const struct btrfs_disk_balance_args *disk)
  3223. {
  3224. memset(cpu, 0, sizeof(*cpu));
  3225. cpu->profiles = le64_to_cpu(disk->profiles);
  3226. cpu->usage = le64_to_cpu(disk->usage);
  3227. cpu->devid = le64_to_cpu(disk->devid);
  3228. cpu->pstart = le64_to_cpu(disk->pstart);
  3229. cpu->pend = le64_to_cpu(disk->pend);
  3230. cpu->vstart = le64_to_cpu(disk->vstart);
  3231. cpu->vend = le64_to_cpu(disk->vend);
  3232. cpu->target = le64_to_cpu(disk->target);
  3233. cpu->flags = le64_to_cpu(disk->flags);
  3234. cpu->limit = le64_to_cpu(disk->limit);
  3235. cpu->stripes_min = le32_to_cpu(disk->stripes_min);
  3236. cpu->stripes_max = le32_to_cpu(disk->stripes_max);
  3237. }
  3238. static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
  3239. const struct btrfs_balance_args *cpu)
  3240. {
  3241. memset(disk, 0, sizeof(*disk));
  3242. disk->profiles = cpu_to_le64(cpu->profiles);
  3243. disk->usage = cpu_to_le64(cpu->usage);
  3244. disk->devid = cpu_to_le64(cpu->devid);
  3245. disk->pstart = cpu_to_le64(cpu->pstart);
  3246. disk->pend = cpu_to_le64(cpu->pend);
  3247. disk->vstart = cpu_to_le64(cpu->vstart);
  3248. disk->vend = cpu_to_le64(cpu->vend);
  3249. disk->target = cpu_to_le64(cpu->target);
  3250. disk->flags = cpu_to_le64(cpu->flags);
  3251. disk->limit = cpu_to_le64(cpu->limit);
  3252. disk->stripes_min = cpu_to_le32(cpu->stripes_min);
  3253. disk->stripes_max = cpu_to_le32(cpu->stripes_max);
  3254. }
  3255. static int insert_balance_item(struct btrfs_fs_info *fs_info,
  3256. struct btrfs_balance_control *bctl)
  3257. {
  3258. struct btrfs_root *root = fs_info->tree_root;
  3259. struct btrfs_trans_handle *trans;
  3260. struct btrfs_balance_item *item;
  3261. struct btrfs_disk_balance_args disk_bargs;
  3262. struct btrfs_path *path;
  3263. struct extent_buffer *leaf;
  3264. struct btrfs_key key;
  3265. int ret;
  3266. path = btrfs_alloc_path();
  3267. if (!path)
  3268. return -ENOMEM;
  3269. trans = btrfs_start_transaction(root, 0);
  3270. if (IS_ERR(trans)) {
  3271. btrfs_free_path(path);
  3272. return PTR_ERR(trans);
  3273. }
  3274. key.objectid = BTRFS_BALANCE_OBJECTID;
  3275. key.type = BTRFS_TEMPORARY_ITEM_KEY;
  3276. key.offset = 0;
  3277. ret = btrfs_insert_empty_item(trans, root, path, &key,
  3278. sizeof(*item));
  3279. if (ret)
  3280. goto out;
  3281. leaf = path->nodes[0];
  3282. item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
  3283. memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
  3284. btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
  3285. btrfs_set_balance_data(leaf, item, &disk_bargs);
  3286. btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
  3287. btrfs_set_balance_meta(leaf, item, &disk_bargs);
  3288. btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
  3289. btrfs_set_balance_sys(leaf, item, &disk_bargs);
  3290. btrfs_set_balance_flags(leaf, item, bctl->flags);
  3291. out:
  3292. btrfs_free_path(path);
  3293. if (ret == 0)
  3294. ret = btrfs_commit_transaction(trans);
  3295. else
  3296. btrfs_end_transaction(trans);
  3297. return ret;
  3298. }
  3299. static int del_balance_item(struct btrfs_fs_info *fs_info)
  3300. {
  3301. struct btrfs_root *root = fs_info->tree_root;
  3302. struct btrfs_trans_handle *trans;
  3303. struct btrfs_path *path;
  3304. struct btrfs_key key;
  3305. int ret;
  3306. path = btrfs_alloc_path();
  3307. if (!path)
  3308. return -ENOMEM;
  3309. trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
  3310. if (IS_ERR(trans)) {
  3311. btrfs_free_path(path);
  3312. return PTR_ERR(trans);
  3313. }
  3314. key.objectid = BTRFS_BALANCE_OBJECTID;
  3315. key.type = BTRFS_TEMPORARY_ITEM_KEY;
  3316. key.offset = 0;
  3317. ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
  3318. if (ret < 0)
  3319. goto out;
  3320. if (ret > 0) {
  3321. ret = -ENOENT;
  3322. goto out;
  3323. }
  3324. ret = btrfs_del_item(trans, root, path);
  3325. out:
  3326. btrfs_free_path(path);
  3327. if (ret == 0)
  3328. ret = btrfs_commit_transaction(trans);
  3329. else
  3330. btrfs_end_transaction(trans);
  3331. return ret;
  3332. }
  3333. /*
  3334. * This is a heuristic used to reduce the number of chunks balanced on
  3335. * resume after balance was interrupted.
  3336. */
  3337. static void update_balance_args(struct btrfs_balance_control *bctl)
  3338. {
  3339. /*
  3340. * Turn on soft mode for chunk types that were being converted.
  3341. */
  3342. if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
  3343. bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
  3344. if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
  3345. bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
  3346. if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
  3347. bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
  3348. /*
  3349. * Turn on usage filter if is not already used. The idea is
  3350. * that chunks that we have already balanced should be
  3351. * reasonably full. Don't do it for chunks that are being
  3352. * converted - that will keep us from relocating unconverted
  3353. * (albeit full) chunks.
  3354. */
  3355. if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
  3356. !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
  3357. !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
  3358. bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
  3359. bctl->data.usage = 90;
  3360. }
  3361. if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
  3362. !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
  3363. !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
  3364. bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
  3365. bctl->sys.usage = 90;
  3366. }
  3367. if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
  3368. !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
  3369. !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
  3370. bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
  3371. bctl->meta.usage = 90;
  3372. }
  3373. }
  3374. /*
  3375. * Clear the balance status in fs_info and delete the balance item from disk.
  3376. */
  3377. static void reset_balance_state(struct btrfs_fs_info *fs_info)
  3378. {
  3379. struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  3380. int ret;
  3381. ASSERT(fs_info->balance_ctl);
  3382. spin_lock(&fs_info->balance_lock);
  3383. fs_info->balance_ctl = NULL;
  3384. spin_unlock(&fs_info->balance_lock);
  3385. kfree(bctl);
  3386. ret = del_balance_item(fs_info);
  3387. if (ret)
  3388. btrfs_handle_fs_error(fs_info, ret, NULL);
  3389. }
  3390. /*
  3391. * Balance filters. Return 1 if chunk should be filtered out
  3392. * (should not be balanced).
  3393. */
  3394. static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs)
  3395. {
  3396. chunk_type = chunk_to_extended(chunk_type) &
  3397. BTRFS_EXTENDED_PROFILE_MASK;
  3398. if (bargs->profiles & chunk_type)
  3399. return false;
  3400. return true;
  3401. }
  3402. static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
  3403. struct btrfs_balance_args *bargs)
  3404. {
  3405. struct btrfs_block_group *cache;
  3406. u64 chunk_used;
  3407. u64 user_thresh_min;
  3408. u64 user_thresh_max;
  3409. bool ret = true;
  3410. cache = btrfs_lookup_block_group(fs_info, chunk_offset);
  3411. chunk_used = cache->used;
  3412. if (bargs->usage_min == 0)
  3413. user_thresh_min = 0;
  3414. else
  3415. user_thresh_min = mult_perc(cache->length, bargs->usage_min);
  3416. if (bargs->usage_max == 0)
  3417. user_thresh_max = 1;
  3418. else if (bargs->usage_max > 100)
  3419. user_thresh_max = cache->length;
  3420. else
  3421. user_thresh_max = mult_perc(cache->length, bargs->usage_max);
  3422. if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
  3423. ret = false;
  3424. btrfs_put_block_group(cache);
  3425. return ret;
  3426. }
  3427. static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
  3428. struct btrfs_balance_args *bargs)
  3429. {
  3430. struct btrfs_block_group *cache;
  3431. u64 chunk_used, user_thresh;
  3432. bool ret = true;
  3433. cache = btrfs_lookup_block_group(fs_info, chunk_offset);
  3434. chunk_used = cache->used;
  3435. if (bargs->usage_min == 0)
  3436. user_thresh = 1;
  3437. else if (bargs->usage > 100)
  3438. user_thresh = cache->length;
  3439. else
  3440. user_thresh = mult_perc(cache->length, bargs->usage);
  3441. if (chunk_used < user_thresh)
  3442. ret = false;
  3443. btrfs_put_block_group(cache);
  3444. return ret;
  3445. }
  3446. static bool chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
  3447. struct btrfs_balance_args *bargs)
  3448. {
  3449. struct btrfs_stripe *stripe;
  3450. int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
  3451. int i;
  3452. for (i = 0; i < num_stripes; i++) {
  3453. stripe = btrfs_stripe_nr(chunk, i);
  3454. if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
  3455. return false;
  3456. }
  3457. return true;
  3458. }
  3459. static u64 calc_data_stripes(u64 type, int num_stripes)
  3460. {
  3461. const int index = btrfs_bg_flags_to_raid_index(type);
  3462. const int ncopies = btrfs_raid_array[index].ncopies;
  3463. const int nparity = btrfs_raid_array[index].nparity;
  3464. return (num_stripes - nparity) / ncopies;
  3465. }
  3466. /* [pstart, pend) */
  3467. static bool chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
  3468. struct btrfs_balance_args *bargs)
  3469. {
  3470. struct btrfs_stripe *stripe;
  3471. int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
  3472. u64 stripe_offset;
  3473. u64 stripe_length;
  3474. u64 type;
  3475. int factor;
  3476. int i;
  3477. if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
  3478. return false;
  3479. type = btrfs_chunk_type(leaf, chunk);
  3480. factor = calc_data_stripes(type, num_stripes);
  3481. for (i = 0; i < num_stripes; i++) {
  3482. stripe = btrfs_stripe_nr(chunk, i);
  3483. if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
  3484. continue;
  3485. stripe_offset = btrfs_stripe_offset(leaf, stripe);
  3486. stripe_length = btrfs_chunk_length(leaf, chunk);
  3487. stripe_length = div_u64(stripe_length, factor);
  3488. if (stripe_offset < bargs->pend &&
  3489. stripe_offset + stripe_length > bargs->pstart)
  3490. return false;
  3491. }
  3492. return true;
  3493. }
  3494. /* [vstart, vend) */
  3495. static bool chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
  3496. u64 chunk_offset, struct btrfs_balance_args *bargs)
  3497. {
  3498. if (chunk_offset < bargs->vend &&
  3499. chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
  3500. /* at least part of the chunk is inside this vrange */
  3501. return false;
  3502. return true;
  3503. }
  3504. static bool chunk_stripes_range_filter(struct extent_buffer *leaf,
  3505. struct btrfs_chunk *chunk,
  3506. struct btrfs_balance_args *bargs)
  3507. {
  3508. int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
  3509. if (bargs->stripes_min <= num_stripes
  3510. && num_stripes <= bargs->stripes_max)
  3511. return false;
  3512. return true;
  3513. }
  3514. static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs)
  3515. {
  3516. if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
  3517. return false;
  3518. chunk_type = chunk_to_extended(chunk_type) &
  3519. BTRFS_EXTENDED_PROFILE_MASK;
  3520. if (bargs->target == chunk_type)
  3521. return true;
  3522. return false;
  3523. }
  3524. static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
  3525. u64 chunk_offset)
  3526. {
  3527. struct btrfs_fs_info *fs_info = leaf->fs_info;
  3528. struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  3529. struct btrfs_balance_args *bargs = NULL;
  3530. u64 chunk_type = btrfs_chunk_type(leaf, chunk);
  3531. /* Treat METADATA_REMAP chunks as METADATA. */
  3532. if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) {
  3533. chunk_type &= ~BTRFS_BLOCK_GROUP_METADATA_REMAP;
  3534. chunk_type |= BTRFS_BLOCK_GROUP_METADATA;
  3535. }
  3536. /* type filter */
  3537. if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
  3538. (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
  3539. return false;
  3540. }
  3541. if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
  3542. bargs = &bctl->data;
  3543. else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
  3544. bargs = &bctl->sys;
  3545. else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
  3546. bargs = &bctl->meta;
  3547. /* profiles filter */
  3548. if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
  3549. chunk_profiles_filter(chunk_type, bargs)) {
  3550. return false;
  3551. }
  3552. /* usage filter */
  3553. if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
  3554. chunk_usage_filter(fs_info, chunk_offset, bargs)) {
  3555. return false;
  3556. } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
  3557. chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
  3558. return false;
  3559. }
  3560. /* devid filter */
  3561. if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
  3562. chunk_devid_filter(leaf, chunk, bargs)) {
  3563. return false;
  3564. }
  3565. /* drange filter, makes sense only with devid filter */
  3566. if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
  3567. chunk_drange_filter(leaf, chunk, bargs)) {
  3568. return false;
  3569. }
  3570. /* vrange filter */
  3571. if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
  3572. chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
  3573. return false;
  3574. }
  3575. /* stripes filter */
  3576. if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
  3577. chunk_stripes_range_filter(leaf, chunk, bargs)) {
  3578. return false;
  3579. }
  3580. /* soft profile changing mode */
  3581. if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
  3582. chunk_soft_convert_filter(chunk_type, bargs)) {
  3583. return false;
  3584. }
  3585. /*
  3586. * limited by count, must be the last filter
  3587. */
  3588. if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
  3589. if (bargs->limit == 0)
  3590. return false;
  3591. else
  3592. bargs->limit--;
  3593. } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
  3594. /*
  3595. * Same logic as the 'limit' filter; the minimum cannot be
  3596. * determined here because we do not have the global information
  3597. * about the count of all chunks that satisfy the filters.
  3598. */
  3599. if (bargs->limit_max == 0)
  3600. return false;
  3601. else
  3602. bargs->limit_max--;
  3603. }
  3604. return true;
  3605. }
  3606. struct remap_chunk_info {
  3607. struct list_head list;
  3608. u64 offset;
  3609. struct btrfs_block_group *bg;
  3610. bool made_ro;
  3611. };
  3612. static int cow_remap_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path)
  3613. {
  3614. struct btrfs_fs_info *fs_info = trans->fs_info;
  3615. struct btrfs_key key = { 0 };
  3616. int ret;
  3617. ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1);
  3618. if (ret < 0)
  3619. return ret;
  3620. while (true) {
  3621. ret = btrfs_next_leaf(fs_info->remap_root, path);
  3622. if (ret < 0) {
  3623. return ret;
  3624. } else if (ret > 0) {
  3625. ret = 0;
  3626. break;
  3627. }
  3628. btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
  3629. btrfs_release_path(path);
  3630. ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1);
  3631. if (ret < 0)
  3632. break;
  3633. }
  3634. return ret;
  3635. }
  3636. static int balance_remap_chunks(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
  3637. struct list_head *chunks)
  3638. {
  3639. struct remap_chunk_info *rci, *tmp;
  3640. struct btrfs_trans_handle *trans;
  3641. int ret;
  3642. list_for_each_entry_safe(rci, tmp, chunks, list) {
  3643. rci->bg = btrfs_lookup_block_group(fs_info, rci->offset);
  3644. if (!rci->bg) {
  3645. list_del(&rci->list);
  3646. kfree(rci);
  3647. continue;
  3648. }
  3649. ret = btrfs_inc_block_group_ro(rci->bg, false);
  3650. if (ret)
  3651. goto end;
  3652. rci->made_ro = true;
  3653. }
  3654. if (list_empty(chunks))
  3655. return 0;
  3656. trans = btrfs_start_transaction(fs_info->remap_root, 0);
  3657. if (IS_ERR(trans)) {
  3658. ret = PTR_ERR(trans);
  3659. goto end;
  3660. }
  3661. mutex_lock(&fs_info->remap_mutex);
  3662. ret = cow_remap_tree(trans, path);
  3663. mutex_unlock(&fs_info->remap_mutex);
  3664. btrfs_release_path(path);
  3665. btrfs_commit_transaction(trans);
  3666. end:
  3667. while (!list_empty(chunks)) {
  3668. bool is_unused;
  3669. struct btrfs_block_group *bg;
  3670. rci = list_first_entry(chunks, struct remap_chunk_info, list);
  3671. bg = rci->bg;
  3672. if (bg) {
  3673. /*
  3674. * This is a bit racy and the 'used' status can change
  3675. * but this is not a problem as later functions will
  3676. * verify it again.
  3677. */
  3678. spin_lock(&bg->lock);
  3679. is_unused = !btrfs_is_block_group_used(bg);
  3680. spin_unlock(&bg->lock);
  3681. if (is_unused)
  3682. btrfs_mark_bg_unused(bg);
  3683. if (rci->made_ro)
  3684. btrfs_dec_block_group_ro(bg);
  3685. btrfs_put_block_group(bg);
  3686. }
  3687. list_del(&rci->list);
  3688. kfree(rci);
  3689. }
  3690. return ret;
  3691. }
  3692. static int __btrfs_balance(struct btrfs_fs_info *fs_info)
  3693. {
  3694. struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  3695. struct btrfs_root *chunk_root = fs_info->chunk_root;
  3696. u64 chunk_type;
  3697. struct btrfs_chunk *chunk;
  3698. BTRFS_PATH_AUTO_FREE(path);
  3699. struct btrfs_key key;
  3700. struct btrfs_key found_key;
  3701. struct extent_buffer *leaf;
  3702. int slot;
  3703. int ret;
  3704. int enospc_errors = 0;
  3705. bool counting = true;
  3706. /* The single value limit and min/max limits use the same bytes in the */
  3707. u64 limit_data = bctl->data.limit;
  3708. u64 limit_meta = bctl->meta.limit;
  3709. u64 limit_sys = bctl->sys.limit;
  3710. u32 count_data = 0;
  3711. u32 count_meta = 0;
  3712. u32 count_sys = 0;
  3713. int chunk_reserved = 0;
  3714. struct remap_chunk_info *rci;
  3715. unsigned int num_remap_chunks = 0;
  3716. LIST_HEAD(remap_chunks);
  3717. path = btrfs_alloc_path();
  3718. if (!path) {
  3719. ret = -ENOMEM;
  3720. goto error;
  3721. }
  3722. /* zero out stat counters */
  3723. spin_lock(&fs_info->balance_lock);
  3724. memset(&bctl->stat, 0, sizeof(bctl->stat));
  3725. spin_unlock(&fs_info->balance_lock);
  3726. again:
  3727. if (!counting) {
  3728. /*
  3729. * The single value limit and min/max limits use the same bytes
  3730. * in the
  3731. */
  3732. bctl->data.limit = limit_data;
  3733. bctl->meta.limit = limit_meta;
  3734. bctl->sys.limit = limit_sys;
  3735. }
  3736. key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
  3737. key.type = BTRFS_CHUNK_ITEM_KEY;
  3738. key.offset = (u64)-1;
  3739. while (1) {
  3740. if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
  3741. atomic_read(&fs_info->balance_cancel_req)) {
  3742. ret = -ECANCELED;
  3743. goto error;
  3744. }
  3745. mutex_lock(&fs_info->reclaim_bgs_lock);
  3746. ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
  3747. if (ret < 0) {
  3748. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3749. goto error;
  3750. }
  3751. /*
  3752. * this shouldn't happen, it means the last relocate
  3753. * failed
  3754. */
  3755. if (unlikely(ret == 0)) {
  3756. btrfs_err(fs_info,
  3757. "unexpected exact match of CHUNK_ITEM in chunk tree, offset 0x%llx",
  3758. key.offset);
  3759. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3760. ret = -EUCLEAN;
  3761. goto error;
  3762. }
  3763. ret = btrfs_previous_item(chunk_root, path, 0,
  3764. BTRFS_CHUNK_ITEM_KEY);
  3765. if (ret) {
  3766. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3767. ret = 0;
  3768. break;
  3769. }
  3770. leaf = path->nodes[0];
  3771. slot = path->slots[0];
  3772. btrfs_item_key_to_cpu(leaf, &found_key, slot);
  3773. if (found_key.objectid != key.objectid) {
  3774. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3775. break;
  3776. }
  3777. chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
  3778. chunk_type = btrfs_chunk_type(leaf, chunk);
  3779. /* Check if chunk has already been fully relocated. */
  3780. if (chunk_type & BTRFS_BLOCK_GROUP_REMAPPED &&
  3781. btrfs_chunk_num_stripes(leaf, chunk) == 0) {
  3782. btrfs_release_path(path);
  3783. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3784. goto loop;
  3785. }
  3786. if (!counting) {
  3787. spin_lock(&fs_info->balance_lock);
  3788. bctl->stat.considered++;
  3789. spin_unlock(&fs_info->balance_lock);
  3790. }
  3791. ret = should_balance_chunk(leaf, chunk, found_key.offset);
  3792. btrfs_release_path(path);
  3793. if (!ret) {
  3794. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3795. goto loop;
  3796. }
  3797. if (counting) {
  3798. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3799. spin_lock(&fs_info->balance_lock);
  3800. bctl->stat.expected++;
  3801. spin_unlock(&fs_info->balance_lock);
  3802. if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
  3803. count_data++;
  3804. else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
  3805. count_sys++;
  3806. else if (chunk_type & (BTRFS_BLOCK_GROUP_METADATA |
  3807. BTRFS_BLOCK_GROUP_METADATA_REMAP))
  3808. count_meta++;
  3809. goto loop;
  3810. }
  3811. /*
  3812. * Apply limit_min filter, no need to check if the LIMITS
  3813. * filter is used, limit_min is 0 by default
  3814. */
  3815. if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
  3816. count_data < bctl->data.limit_min)
  3817. || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
  3818. count_meta < bctl->meta.limit_min)
  3819. || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
  3820. count_sys < bctl->sys.limit_min)) {
  3821. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3822. goto loop;
  3823. }
  3824. /*
  3825. * Balancing METADATA_REMAP chunks takes place separately - add
  3826. * the details to a list so it can be processed later.
  3827. */
  3828. if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) {
  3829. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3830. rci = kmalloc_obj(struct remap_chunk_info, GFP_NOFS);
  3831. if (!rci) {
  3832. ret = -ENOMEM;
  3833. goto error;
  3834. }
  3835. rci->offset = found_key.offset;
  3836. rci->bg = NULL;
  3837. rci->made_ro = false;
  3838. list_add_tail(&rci->list, &remap_chunks);
  3839. num_remap_chunks++;
  3840. goto loop;
  3841. }
  3842. if (!chunk_reserved) {
  3843. /*
  3844. * We may be relocating the only data chunk we have,
  3845. * which could potentially end up with losing data's
  3846. * raid profile, so lets allocate an empty one in
  3847. * advance.
  3848. */
  3849. ret = btrfs_may_alloc_data_chunk(fs_info,
  3850. found_key.offset);
  3851. if (ret < 0) {
  3852. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3853. goto error;
  3854. } else if (ret == 1) {
  3855. chunk_reserved = 1;
  3856. }
  3857. }
  3858. ret = btrfs_relocate_chunk(fs_info, found_key.offset, true);
  3859. mutex_unlock(&fs_info->reclaim_bgs_lock);
  3860. if (ret == -ENOSPC) {
  3861. enospc_errors++;
  3862. } else if (ret == -ETXTBSY) {
  3863. btrfs_info(fs_info,
  3864. "skipping relocation of block group %llu due to active swapfile",
  3865. found_key.offset);
  3866. ret = 0;
  3867. } else if (ret) {
  3868. goto error;
  3869. } else {
  3870. spin_lock(&fs_info->balance_lock);
  3871. bctl->stat.completed++;
  3872. spin_unlock(&fs_info->balance_lock);
  3873. }
  3874. loop:
  3875. if (found_key.offset == 0)
  3876. break;
  3877. key.offset = found_key.offset - 1;
  3878. }
  3879. btrfs_release_path(path);
  3880. if (counting) {
  3881. counting = false;
  3882. goto again;
  3883. }
  3884. if (!list_empty(&remap_chunks)) {
  3885. ret = balance_remap_chunks(fs_info, path, &remap_chunks);
  3886. if (ret == -ENOSPC)
  3887. enospc_errors++;
  3888. if (!ret) {
  3889. spin_lock(&fs_info->balance_lock);
  3890. bctl->stat.completed += num_remap_chunks;
  3891. spin_unlock(&fs_info->balance_lock);
  3892. }
  3893. }
  3894. error:
  3895. if (enospc_errors) {
  3896. btrfs_info(fs_info, "%d enospc errors during balance",
  3897. enospc_errors);
  3898. if (!ret)
  3899. ret = -ENOSPC;
  3900. }
  3901. return ret;
  3902. }
  3903. /*
  3904. * See if a given profile is valid and reduced.
  3905. *
  3906. * @flags: profile to validate
  3907. * @extended: if true @flags is treated as an extended profile
  3908. */
  3909. static int alloc_profile_is_valid(u64 flags, bool extended)
  3910. {
  3911. u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
  3912. BTRFS_BLOCK_GROUP_PROFILE_MASK);
  3913. flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
  3914. /* 1) check that all other bits are zeroed */
  3915. if (flags & ~mask)
  3916. return 0;
  3917. /* 2) see if profile is reduced */
  3918. if (flags == 0)
  3919. return !extended; /* "0" is valid for usual profiles */
  3920. return has_single_bit_set(flags);
  3921. }
  3922. /*
  3923. * Validate target profile against allowed profiles and return true if it's OK.
  3924. * Otherwise print the error message and return false.
  3925. */
  3926. static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
  3927. const struct btrfs_balance_args *bargs,
  3928. u64 allowed, const char *type)
  3929. {
  3930. if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
  3931. return true;
  3932. /* Profile is valid and does not have bits outside of the allowed set */
  3933. if (alloc_profile_is_valid(bargs->target, 1) &&
  3934. (bargs->target & ~allowed) == 0)
  3935. return true;
  3936. btrfs_err(fs_info, "balance: invalid convert %s profile %s",
  3937. type, btrfs_bg_type_to_raid_name(bargs->target));
  3938. return false;
  3939. }
  3940. /*
  3941. * Fill @buf with textual description of balance filter flags @bargs, up to
  3942. * @size_buf including the terminating null. The output may be trimmed if it
  3943. * does not fit into the provided buffer.
  3944. */
  3945. static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
  3946. u32 size_buf)
  3947. {
  3948. int ret;
  3949. u32 size_bp = size_buf;
  3950. char *bp = buf;
  3951. u64 flags = bargs->flags;
  3952. char tmp_buf[128] = {'\0'};
  3953. if (!flags)
  3954. return;
  3955. #define CHECK_APPEND_NOARG(a) \
  3956. do { \
  3957. ret = snprintf(bp, size_bp, (a)); \
  3958. if (ret < 0 || ret >= size_bp) \
  3959. goto out_overflow; \
  3960. size_bp -= ret; \
  3961. bp += ret; \
  3962. } while (0)
  3963. #define CHECK_APPEND_1ARG(a, v1) \
  3964. do { \
  3965. ret = snprintf(bp, size_bp, (a), (v1)); \
  3966. if (ret < 0 || ret >= size_bp) \
  3967. goto out_overflow; \
  3968. size_bp -= ret; \
  3969. bp += ret; \
  3970. } while (0)
  3971. #define CHECK_APPEND_2ARG(a, v1, v2) \
  3972. do { \
  3973. ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
  3974. if (ret < 0 || ret >= size_bp) \
  3975. goto out_overflow; \
  3976. size_bp -= ret; \
  3977. bp += ret; \
  3978. } while (0)
  3979. if (flags & BTRFS_BALANCE_ARGS_CONVERT)
  3980. CHECK_APPEND_1ARG("convert=%s,",
  3981. btrfs_bg_type_to_raid_name(bargs->target));
  3982. if (flags & BTRFS_BALANCE_ARGS_SOFT)
  3983. CHECK_APPEND_NOARG("soft,");
  3984. if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
  3985. btrfs_describe_block_groups(bargs->profiles, tmp_buf,
  3986. sizeof(tmp_buf));
  3987. CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
  3988. }
  3989. if (flags & BTRFS_BALANCE_ARGS_USAGE)
  3990. CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
  3991. if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
  3992. CHECK_APPEND_2ARG("usage=%u..%u,",
  3993. bargs->usage_min, bargs->usage_max);
  3994. if (flags & BTRFS_BALANCE_ARGS_DEVID)
  3995. CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
  3996. if (flags & BTRFS_BALANCE_ARGS_DRANGE)
  3997. CHECK_APPEND_2ARG("drange=%llu..%llu,",
  3998. bargs->pstart, bargs->pend);
  3999. if (flags & BTRFS_BALANCE_ARGS_VRANGE)
  4000. CHECK_APPEND_2ARG("vrange=%llu..%llu,",
  4001. bargs->vstart, bargs->vend);
  4002. if (flags & BTRFS_BALANCE_ARGS_LIMIT)
  4003. CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
  4004. if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
  4005. CHECK_APPEND_2ARG("limit=%u..%u,",
  4006. bargs->limit_min, bargs->limit_max);
  4007. if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
  4008. CHECK_APPEND_2ARG("stripes=%u..%u,",
  4009. bargs->stripes_min, bargs->stripes_max);
  4010. #undef CHECK_APPEND_2ARG
  4011. #undef CHECK_APPEND_1ARG
  4012. #undef CHECK_APPEND_NOARG
  4013. out_overflow:
  4014. if (size_bp < size_buf)
  4015. buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
  4016. else
  4017. buf[0] = '\0';
  4018. }
  4019. static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
  4020. {
  4021. u32 size_buf = 1024;
  4022. char tmp_buf[192] = {'\0'};
  4023. char AUTO_KFREE(buf);
  4024. char *bp;
  4025. u32 size_bp = size_buf;
  4026. int ret;
  4027. struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  4028. buf = kzalloc(size_buf, GFP_KERNEL);
  4029. if (!buf)
  4030. return;
  4031. bp = buf;
  4032. #define CHECK_APPEND_1ARG(a, v1) \
  4033. do { \
  4034. ret = snprintf(bp, size_bp, (a), (v1)); \
  4035. if (ret < 0 || ret >= size_bp) \
  4036. goto out_overflow; \
  4037. size_bp -= ret; \
  4038. bp += ret; \
  4039. } while (0)
  4040. if (bctl->flags & BTRFS_BALANCE_FORCE)
  4041. CHECK_APPEND_1ARG("%s", "-f ");
  4042. if (bctl->flags & BTRFS_BALANCE_DATA) {
  4043. describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
  4044. CHECK_APPEND_1ARG("-d%s ", tmp_buf);
  4045. }
  4046. if (bctl->flags & BTRFS_BALANCE_METADATA) {
  4047. describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
  4048. CHECK_APPEND_1ARG("-m%s ", tmp_buf);
  4049. }
  4050. if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
  4051. describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
  4052. CHECK_APPEND_1ARG("-s%s ", tmp_buf);
  4053. }
  4054. #undef CHECK_APPEND_1ARG
  4055. out_overflow:
  4056. if (size_bp < size_buf)
  4057. buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
  4058. btrfs_info(fs_info, "balance: %s %s",
  4059. (bctl->flags & BTRFS_BALANCE_RESUME) ?
  4060. "resume" : "start", buf);
  4061. }
  4062. /*
  4063. * Should be called with balance mutex held
  4064. */
  4065. int btrfs_balance(struct btrfs_fs_info *fs_info,
  4066. struct btrfs_balance_control *bctl,
  4067. struct btrfs_ioctl_balance_args *bargs)
  4068. {
  4069. u64 meta_target, data_target;
  4070. u64 allowed;
  4071. int mixed = 0;
  4072. int ret;
  4073. u64 num_devices;
  4074. unsigned seq;
  4075. bool reducing_redundancy;
  4076. bool paused = false;
  4077. int i;
  4078. if (btrfs_fs_closing(fs_info) ||
  4079. atomic_read(&fs_info->balance_pause_req) ||
  4080. btrfs_should_cancel_balance(fs_info)) {
  4081. ret = -EINVAL;
  4082. goto out;
  4083. }
  4084. allowed = btrfs_super_incompat_flags(fs_info->super_copy);
  4085. if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
  4086. mixed = 1;
  4087. /*
  4088. * In case of mixed groups both data and meta should be picked,
  4089. * and identical options should be given for both of them.
  4090. */
  4091. allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
  4092. if (mixed && (bctl->flags & allowed)) {
  4093. if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
  4094. !(bctl->flags & BTRFS_BALANCE_METADATA) ||
  4095. memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
  4096. btrfs_err(fs_info,
  4097. "balance: mixed groups data and metadata options must be the same");
  4098. ret = -EINVAL;
  4099. goto out;
  4100. }
  4101. }
  4102. /*
  4103. * rw_devices will not change at the moment, device add/delete/replace
  4104. * are exclusive
  4105. */
  4106. num_devices = fs_info->fs_devices->rw_devices;
  4107. /*
  4108. * SINGLE profile on-disk has no profile bit, but in-memory we have a
  4109. * special bit for it, to make it easier to distinguish. Thus we need
  4110. * to set it manually, or balance would refuse the profile.
  4111. */
  4112. allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
  4113. for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
  4114. if (num_devices >= btrfs_raid_array[i].devs_min)
  4115. allowed |= btrfs_raid_array[i].bg_flag;
  4116. if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
  4117. !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
  4118. !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
  4119. ret = -EINVAL;
  4120. goto out;
  4121. }
  4122. /*
  4123. * Allow to reduce metadata or system integrity only if force set for
  4124. * profiles with redundancy (copies, parity)
  4125. */
  4126. allowed = 0;
  4127. for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
  4128. if (btrfs_raid_array[i].ncopies >= 2 ||
  4129. btrfs_raid_array[i].tolerated_failures >= 1)
  4130. allowed |= btrfs_raid_array[i].bg_flag;
  4131. }
  4132. do {
  4133. seq = read_seqbegin(&fs_info->profiles_lock);
  4134. if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
  4135. (fs_info->avail_system_alloc_bits & allowed) &&
  4136. !(bctl->sys.target & allowed)) ||
  4137. ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
  4138. (fs_info->avail_metadata_alloc_bits & allowed) &&
  4139. !(bctl->meta.target & allowed)))
  4140. reducing_redundancy = true;
  4141. else
  4142. reducing_redundancy = false;
  4143. /* if we're not converting, the target field is uninitialized */
  4144. meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
  4145. bctl->meta.target : fs_info->avail_metadata_alloc_bits;
  4146. data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
  4147. bctl->data.target : fs_info->avail_data_alloc_bits;
  4148. } while (read_seqretry(&fs_info->profiles_lock, seq));
  4149. if (reducing_redundancy) {
  4150. if (bctl->flags & BTRFS_BALANCE_FORCE) {
  4151. btrfs_info(fs_info,
  4152. "balance: force reducing metadata redundancy");
  4153. } else {
  4154. btrfs_err(fs_info,
  4155. "balance: reduces metadata redundancy, use --force if you want this");
  4156. ret = -EINVAL;
  4157. goto out;
  4158. }
  4159. }
  4160. if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
  4161. btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
  4162. btrfs_warn(fs_info,
  4163. "balance: metadata profile %s has lower redundancy than data profile %s",
  4164. btrfs_bg_type_to_raid_name(meta_target),
  4165. btrfs_bg_type_to_raid_name(data_target));
  4166. }
  4167. ret = insert_balance_item(fs_info, bctl);
  4168. if (ret && ret != -EEXIST)
  4169. goto out;
  4170. if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
  4171. BUG_ON(ret == -EEXIST);
  4172. BUG_ON(fs_info->balance_ctl);
  4173. spin_lock(&fs_info->balance_lock);
  4174. fs_info->balance_ctl = bctl;
  4175. spin_unlock(&fs_info->balance_lock);
  4176. } else {
  4177. BUG_ON(ret != -EEXIST);
  4178. spin_lock(&fs_info->balance_lock);
  4179. update_balance_args(bctl);
  4180. spin_unlock(&fs_info->balance_lock);
  4181. }
  4182. ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
  4183. set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
  4184. describe_balance_start_or_resume(fs_info);
  4185. mutex_unlock(&fs_info->balance_mutex);
  4186. ret = __btrfs_balance(fs_info);
  4187. mutex_lock(&fs_info->balance_mutex);
  4188. if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
  4189. btrfs_info(fs_info, "balance: paused");
  4190. btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
  4191. paused = true;
  4192. }
  4193. /*
  4194. * Balance can be canceled by:
  4195. *
  4196. * - Regular cancel request
  4197. * Then ret == -ECANCELED and balance_cancel_req > 0
  4198. *
  4199. * - Fatal signal to "btrfs" process
  4200. * Either the signal caught by wait_reserve_ticket() and callers
  4201. * got -EINTR, or caught by btrfs_should_cancel_balance() and
  4202. * got -ECANCELED.
  4203. * Either way, in this case balance_cancel_req = 0, and
  4204. * ret == -EINTR or ret == -ECANCELED.
  4205. *
  4206. * So here we only check the return value to catch canceled balance.
  4207. */
  4208. else if (ret == -ECANCELED || ret == -EINTR)
  4209. btrfs_info(fs_info, "balance: canceled");
  4210. else
  4211. btrfs_info(fs_info, "balance: ended with status: %d", ret);
  4212. clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
  4213. if (bargs) {
  4214. memset(bargs, 0, sizeof(*bargs));
  4215. btrfs_update_ioctl_balance_args(fs_info, bargs);
  4216. }
  4217. /* We didn't pause, we can clean everything up. */
  4218. if (!paused) {
  4219. reset_balance_state(fs_info);
  4220. btrfs_exclop_finish(fs_info);
  4221. }
  4222. wake_up(&fs_info->balance_wait_q);
  4223. return ret;
  4224. out:
  4225. if (bctl->flags & BTRFS_BALANCE_RESUME)
  4226. reset_balance_state(fs_info);
  4227. else
  4228. kfree(bctl);
  4229. btrfs_exclop_finish(fs_info);
  4230. return ret;
  4231. }
  4232. static int balance_kthread(void *data)
  4233. {
  4234. struct btrfs_fs_info *fs_info = data;
  4235. int ret = 0;
  4236. guard(super_write)(fs_info->sb);
  4237. mutex_lock(&fs_info->balance_mutex);
  4238. if (fs_info->balance_ctl)
  4239. ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
  4240. mutex_unlock(&fs_info->balance_mutex);
  4241. return ret;
  4242. }
  4243. int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
  4244. {
  4245. struct task_struct *tsk;
  4246. mutex_lock(&fs_info->balance_mutex);
  4247. if (!fs_info->balance_ctl) {
  4248. mutex_unlock(&fs_info->balance_mutex);
  4249. return 0;
  4250. }
  4251. mutex_unlock(&fs_info->balance_mutex);
  4252. if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
  4253. btrfs_info(fs_info, "balance: resume skipped");
  4254. return 0;
  4255. }
  4256. spin_lock(&fs_info->super_lock);
  4257. ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED,
  4258. "exclusive_operation=%d", fs_info->exclusive_operation);
  4259. fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
  4260. spin_unlock(&fs_info->super_lock);
  4261. /*
  4262. * A ro->rw remount sequence should continue with the paused balance
  4263. * regardless of who pauses it, system or the user as of now, so set
  4264. * the resume flag.
  4265. */
  4266. spin_lock(&fs_info->balance_lock);
  4267. fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
  4268. spin_unlock(&fs_info->balance_lock);
  4269. tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
  4270. return PTR_ERR_OR_ZERO(tsk);
  4271. }
  4272. int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
  4273. {
  4274. struct btrfs_balance_control *bctl;
  4275. struct btrfs_balance_item *item;
  4276. struct btrfs_disk_balance_args disk_bargs;
  4277. BTRFS_PATH_AUTO_FREE(path);
  4278. struct extent_buffer *leaf;
  4279. struct btrfs_key key;
  4280. int ret;
  4281. path = btrfs_alloc_path();
  4282. if (!path)
  4283. return -ENOMEM;
  4284. key.objectid = BTRFS_BALANCE_OBJECTID;
  4285. key.type = BTRFS_TEMPORARY_ITEM_KEY;
  4286. key.offset = 0;
  4287. ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
  4288. if (ret < 0)
  4289. return ret;
  4290. if (ret > 0) { /* ret = -ENOENT; */
  4291. return 0;
  4292. }
  4293. bctl = kzalloc_obj(*bctl, GFP_NOFS);
  4294. if (!bctl)
  4295. return -ENOMEM;
  4296. leaf = path->nodes[0];
  4297. item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
  4298. bctl->flags = btrfs_balance_flags(leaf, item);
  4299. bctl->flags |= BTRFS_BALANCE_RESUME;
  4300. btrfs_balance_data(leaf, item, &disk_bargs);
  4301. btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
  4302. btrfs_balance_meta(leaf, item, &disk_bargs);
  4303. btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
  4304. btrfs_balance_sys(leaf, item, &disk_bargs);
  4305. btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
  4306. /*
  4307. * This should never happen, as the paused balance state is recovered
  4308. * during mount without any chance of other exclusive ops to collide.
  4309. *
  4310. * This gives the exclusive op status to balance and keeps in paused
  4311. * state until user intervention (cancel or umount). If the ownership
  4312. * cannot be assigned, show a message but do not fail. The balance
  4313. * is in a paused state and must have fs_info::balance_ctl properly
  4314. * set up.
  4315. */
  4316. if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
  4317. btrfs_warn(fs_info,
  4318. "balance: cannot set exclusive op status, resume manually");
  4319. btrfs_release_path(path);
  4320. mutex_lock(&fs_info->balance_mutex);
  4321. BUG_ON(fs_info->balance_ctl);
  4322. spin_lock(&fs_info->balance_lock);
  4323. fs_info->balance_ctl = bctl;
  4324. spin_unlock(&fs_info->balance_lock);
  4325. mutex_unlock(&fs_info->balance_mutex);
  4326. return ret;
  4327. }
  4328. int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
  4329. {
  4330. int ret = 0;
  4331. mutex_lock(&fs_info->balance_mutex);
  4332. if (!fs_info->balance_ctl) {
  4333. mutex_unlock(&fs_info->balance_mutex);
  4334. return -ENOTCONN;
  4335. }
  4336. if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
  4337. atomic_inc(&fs_info->balance_pause_req);
  4338. mutex_unlock(&fs_info->balance_mutex);
  4339. wait_event(fs_info->balance_wait_q,
  4340. !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
  4341. mutex_lock(&fs_info->balance_mutex);
  4342. /* we are good with balance_ctl ripped off from under us */
  4343. BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
  4344. atomic_dec(&fs_info->balance_pause_req);
  4345. } else {
  4346. ret = -ENOTCONN;
  4347. }
  4348. mutex_unlock(&fs_info->balance_mutex);
  4349. return ret;
  4350. }
  4351. int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
  4352. {
  4353. mutex_lock(&fs_info->balance_mutex);
  4354. if (!fs_info->balance_ctl) {
  4355. mutex_unlock(&fs_info->balance_mutex);
  4356. return -ENOTCONN;
  4357. }
  4358. /*
  4359. * A paused balance with the item stored on disk can be resumed at
  4360. * mount time if the mount is read-write. Otherwise it's still paused
  4361. * and we must not allow cancelling as it deletes the item.
  4362. */
  4363. if (sb_rdonly(fs_info->sb)) {
  4364. mutex_unlock(&fs_info->balance_mutex);
  4365. return -EROFS;
  4366. }
  4367. atomic_inc(&fs_info->balance_cancel_req);
  4368. /*
  4369. * if we are running just wait and return, balance item is
  4370. * deleted in btrfs_balance in this case
  4371. */
  4372. if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
  4373. mutex_unlock(&fs_info->balance_mutex);
  4374. wait_event(fs_info->balance_wait_q,
  4375. !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
  4376. mutex_lock(&fs_info->balance_mutex);
  4377. } else {
  4378. mutex_unlock(&fs_info->balance_mutex);
  4379. /*
  4380. * Lock released to allow other waiters to continue, we'll
  4381. * reexamine the status again.
  4382. */
  4383. mutex_lock(&fs_info->balance_mutex);
  4384. if (fs_info->balance_ctl) {
  4385. reset_balance_state(fs_info);
  4386. btrfs_exclop_finish(fs_info);
  4387. btrfs_info(fs_info, "balance: canceled");
  4388. }
  4389. }
  4390. ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
  4391. atomic_dec(&fs_info->balance_cancel_req);
  4392. mutex_unlock(&fs_info->balance_mutex);
  4393. return 0;
  4394. }
/*
 * Shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent.
 *
 * @device:   device to shrink
 * @new_size: requested size in bytes, rounded down to the sector size
 *
 * The in-memory total size (and, for writeable devices, total_rw_bytes and
 * free_chunk_space) is lowered first; the on-disk size and the super block
 * total are only updated after every chunk beyond @new_size was relocated.
 *
 * Returns 0 on success or a negative errno.  Failures that go through the
 * "done" label restore the in-memory accounting to the old size; the early
 * returns (-EINVAL for a replace target, -ENOMEM, failure to start the first
 * transaction) happen before anything was modified.
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
	/* Number of relocations that hit -ENOSPC in the current pass. */
	int failed = 0;
	/* Set once the whole scan has been restarted after -ENOSPC. */
	bool retried = false;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 old_size = btrfs_device_get_total_bytes(device);
	u64 diff;
	u64 start;
	/* Amount taken from free_chunk_space, remembered for the error path. */
	u64 free_diff = 0;
	u64 pending_start, pending_end;

	new_size = round_down(new_size, fs_info->sectorsize);
	start = new_size;
	diff = round_down(old_size - new_size, fs_info->sectorsize);

	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_BACK;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	mutex_lock(&fs_info->chunk_mutex);

	btrfs_device_set_total_bytes(device, new_size);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		device->fs_devices->total_rw_bytes -= diff;

		/*
		 * The new free_chunk_space is new_size - used, so we have to
		 * subtract the delta of the old free_chunk_space which included
		 * old_size - used.  If used > new_size then just subtract this
		 * entire device's free space.
		 */
		if (device->bytes_used < new_size)
			free_diff = (old_size - device->bytes_used) -
				 (new_size - device->bytes_used);
		else
			free_diff = old_size - device->bytes_used;
		atomic64_sub(free_diff, &fs_info->free_chunk_space);
	}

	/*
	 * Once the device's size has been set to the new size, ensure all
	 * in-memory chunks are synced to disk so that the loop below sees them
	 * and relocates them accordingly.
	 */
	if (btrfs_first_pending_extent(device, start, diff, &pending_start, &pending_end)) {
		mutex_unlock(&fs_info->chunk_mutex);
		ret = btrfs_commit_transaction(trans);
		if (ret)
			goto done;
	} else {
		mutex_unlock(&fs_info->chunk_mutex);
		btrfs_end_transaction(trans);
	}

again:
	/* Start the scan at the highest possible offset for this device. */
	key.objectid = device->devid;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = (u64)-1;

	do {
		/* Held across the search and the relocation of one chunk. */
		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			if (ret < 0)
				goto done;
			/* A positive result is not an error: end the scan. */
			ret = 0;
			btrfs_release_path(path);
			break;
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

		/* The item found belongs to another device: we are done. */
		if (key.objectid != device->devid) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		/* This extent ends within the new size: nothing left to move. */
		if (key.offset + length <= new_size) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			btrfs_release_path(path);
			break;
		}

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
		btrfs_release_path(path);

		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up with losing data's
		 * raid profile, so lets allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto done;
		}

		ret = btrfs_relocate_chunk(fs_info, chunk_offset, true);
		mutex_unlock(&fs_info->reclaim_bgs_lock);
		if (ret == -ENOSPC) {
			/* Count it and carry on; the scan may be retried below. */
			failed++;
		} else if (ret) {
			if (ret == -ETXTBSY) {
				btrfs_warn(fs_info,
		   "could not shrink block group %llu due to active swapfile",
					   chunk_offset);
			}
			goto done;
		}
	} while (key.offset-- > 0);

	/* Relocations that hit -ENOSPC get exactly one more full pass. */
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
	}

	/* Shrinking succeeded, else we would be at "done". */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

	mutex_lock(&fs_info->chunk_mutex);
	/* Clear all state bits beyond the shrunk device size */
	btrfs_clear_extent_bit(&device->alloc_state, new_size, (u64)-1,
			       CHUNK_STATE_MASK, NULL);

	btrfs_device_set_disk_total_bytes(device, new_size);
	/* Queue the device on the transaction's update list, but only once. */
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);

	WARN_ON(diff > old_total);
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
	mutex_unlock(&fs_info->chunk_mutex);

	btrfs_reserve_chunk_metadata(trans, false);
	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
	btrfs_trans_release_chunk_metadata(trans);
	if (unlikely(ret < 0)) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	} else {
		ret = btrfs_commit_transaction(trans);
	}
done:
	btrfs_free_path(path);
	if (ret) {
		/* Undo the in-memory size accounting done at the top. */
		mutex_lock(&fs_info->chunk_mutex);
		btrfs_device_set_total_bytes(device, old_size);
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			device->fs_devices->total_rw_bytes += diff;
			atomic64_add(free_diff, &fs_info->free_chunk_space);
		}
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}
  4577. static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
  4578. struct btrfs_key *key,
  4579. struct btrfs_chunk *chunk, int item_size)
  4580. {
  4581. struct btrfs_super_block *super_copy = fs_info->super_copy;
  4582. struct btrfs_disk_key disk_key;
  4583. u32 array_size;
  4584. u8 *ptr;
  4585. lockdep_assert_held(&fs_info->chunk_mutex);
  4586. array_size = btrfs_super_sys_array_size(super_copy);
  4587. if (array_size + item_size + sizeof(disk_key)
  4588. > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
  4589. return -EFBIG;
  4590. ptr = super_copy->sys_chunk_array + array_size;
  4591. btrfs_cpu_key_to_disk(&disk_key, key);
  4592. memcpy(ptr, &disk_key, sizeof(disk_key));
  4593. ptr += sizeof(disk_key);
  4594. memcpy(ptr, chunk, item_size);
  4595. item_size += sizeof(disk_key);
  4596. btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
  4597. return 0;
  4598. }
  4599. /*
  4600. * sort the devices in descending order by max_avail, total_avail
  4601. */
  4602. static int btrfs_cmp_device_info(const void *a, const void *b)
  4603. {
  4604. const struct btrfs_device_info *di_a = a;
  4605. const struct btrfs_device_info *di_b = b;
  4606. if (di_a->max_avail > di_b->max_avail)
  4607. return -1;
  4608. if (di_a->max_avail < di_b->max_avail)
  4609. return 1;
  4610. if (di_a->total_avail > di_b->total_avail)
  4611. return -1;
  4612. if (di_a->total_avail < di_b->total_avail)
  4613. return 1;
  4614. return 0;
  4615. }
  4616. static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
  4617. {
  4618. if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
  4619. return;
  4620. btrfs_set_fs_incompat(info, RAID56);
  4621. }
  4622. static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
  4623. {
  4624. if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
  4625. return;
  4626. btrfs_set_fs_incompat(info, RAID1C34);
  4627. }
/*
 * Structure used internally for btrfs_create_chunk() function.
 * Wraps needed parameters.
 *
 * The per-profile fields are filled by init_alloc_chunk_ctl(), ndevs by
 * gather_device_info(), and the final sizes by decide_stripe_size().
 */
struct alloc_chunk_ctl {
	/* Start of the new chunk, copied to the chunk map's start */
	u64 start;
	/* Block group type flags of the chunk being allocated */
	u64 type;
	/* Total number of stripes to allocate */
	int num_stripes;
	/* sub_stripes info for map */
	int sub_stripes;
	/* Stripes per device */
	int dev_stripes;
	/* Maximum number of devices to use */
	int devs_max;
	/* Minimum number of devices to use */
	int devs_min;
	/* ndevs has to be a multiple of this */
	int devs_increment;
	/* Number of copies */
	int ncopies;
	/* Number of stripes worth of bytes to store parity information */
	int nparity;
	/* Upper bound for the size of one stripe, set by the policy init */
	u64 max_stripe_size;
	/* Upper bound for the chunk size, set by the policy init */
	u64 max_chunk_size;
	/* Devices offering less free space than this are skipped */
	u64 dev_extent_min;
	/* Size of each stripe, decided in decide_stripe_size_*() */
	u64 stripe_size;
	/* stripe_size multiplied by the number of data stripes */
	u64 chunk_size;
	/* Number of devices the chunk will be placed on */
	int ndevs;
	/* Space_info the block group is going to belong. */
	struct btrfs_space_info *space_info;
};
  4660. static void init_alloc_chunk_ctl_policy_regular(
  4661. struct btrfs_fs_devices *fs_devices,
  4662. struct alloc_chunk_ctl *ctl)
  4663. {
  4664. struct btrfs_space_info *space_info;
  4665. space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
  4666. ASSERT(space_info);
  4667. ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
  4668. ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);
  4669. if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
  4670. ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
  4671. /* We don't want a chunk larger than 10% of writable space */
  4672. ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
  4673. ctl->max_chunk_size);
  4674. ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes);
  4675. }
  4676. static void init_alloc_chunk_ctl_policy_zoned(
  4677. struct btrfs_fs_devices *fs_devices,
  4678. struct alloc_chunk_ctl *ctl)
  4679. {
  4680. u64 zone_size = fs_devices->fs_info->zone_size;
  4681. u64 limit;
  4682. int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
  4683. int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
  4684. u64 min_chunk_size = min_data_stripes * zone_size;
  4685. u64 type = ctl->type;
  4686. ctl->max_stripe_size = zone_size;
  4687. if (type & BTRFS_BLOCK_GROUP_DATA) {
  4688. ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
  4689. zone_size);
  4690. } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
  4691. ctl->max_chunk_size = ctl->max_stripe_size;
  4692. } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
  4693. ctl->max_chunk_size = 2 * ctl->max_stripe_size;
  4694. ctl->devs_max = min_t(int, ctl->devs_max,
  4695. BTRFS_MAX_DEVS_SYS_CHUNK);
  4696. } else {
  4697. BUG();
  4698. }
  4699. /* We don't want a chunk larger than 10% of writable space */
  4700. limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
  4701. zone_size),
  4702. min_chunk_size);
  4703. ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
  4704. ctl->dev_extent_min = zone_size * ctl->dev_stripes;
  4705. }
  4706. static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
  4707. struct alloc_chunk_ctl *ctl)
  4708. {
  4709. int index = btrfs_bg_flags_to_raid_index(ctl->type);
  4710. ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
  4711. ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
  4712. ctl->devs_max = btrfs_raid_array[index].devs_max;
  4713. if (!ctl->devs_max)
  4714. ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
  4715. ctl->devs_min = btrfs_raid_array[index].devs_min;
  4716. ctl->devs_increment = btrfs_raid_array[index].devs_increment;
  4717. ctl->ncopies = btrfs_raid_array[index].ncopies;
  4718. ctl->nparity = btrfs_raid_array[index].nparity;
  4719. ctl->ndevs = 0;
  4720. switch (fs_devices->chunk_alloc_policy) {
  4721. default:
  4722. btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
  4723. fallthrough;
  4724. case BTRFS_CHUNK_ALLOC_REGULAR:
  4725. init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
  4726. break;
  4727. case BTRFS_CHUNK_ALLOC_ZONED:
  4728. init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
  4729. break;
  4730. }
  4731. }
  4732. static int gather_device_info(struct btrfs_fs_devices *fs_devices,
  4733. struct alloc_chunk_ctl *ctl,
  4734. struct btrfs_device_info *devices_info)
  4735. {
  4736. struct btrfs_fs_info *info = fs_devices->fs_info;
  4737. struct btrfs_device *device;
  4738. u64 total_avail;
  4739. u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
  4740. int ret;
  4741. int ndevs = 0;
  4742. u64 max_avail;
  4743. u64 dev_offset;
  4744. /*
  4745. * in the first pass through the devices list, we gather information
  4746. * about the available holes on each device.
  4747. */
  4748. list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
  4749. if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
  4750. WARN(1, KERN_ERR
  4751. "BTRFS: read-only device in alloc_list\n");
  4752. continue;
  4753. }
  4754. if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
  4755. &device->dev_state) ||
  4756. test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
  4757. continue;
  4758. if (device->total_bytes > device->bytes_used)
  4759. total_avail = device->total_bytes - device->bytes_used;
  4760. else
  4761. total_avail = 0;
  4762. /* If there is no space on this device, skip it. */
  4763. if (total_avail < ctl->dev_extent_min)
  4764. continue;
  4765. ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
  4766. &max_avail);
  4767. if (ret && ret != -ENOSPC)
  4768. return ret;
  4769. if (ret == 0)
  4770. max_avail = dev_extent_want;
  4771. if (max_avail < ctl->dev_extent_min) {
  4772. if (btrfs_test_opt(info, ENOSPC_DEBUG))
  4773. btrfs_debug(info,
  4774. "%s: devid %llu has no free space, have=%llu want=%llu",
  4775. __func__, device->devid, max_avail,
  4776. ctl->dev_extent_min);
  4777. continue;
  4778. }
  4779. if (ndevs == fs_devices->rw_devices) {
  4780. WARN(1, "%s: found more than %llu devices\n",
  4781. __func__, fs_devices->rw_devices);
  4782. break;
  4783. }
  4784. devices_info[ndevs].dev_offset = dev_offset;
  4785. devices_info[ndevs].max_avail = max_avail;
  4786. devices_info[ndevs].total_avail = total_avail;
  4787. devices_info[ndevs].dev = device;
  4788. ++ndevs;
  4789. }
  4790. ctl->ndevs = ndevs;
  4791. /*
  4792. * now sort the devices by hole size / available space
  4793. */
  4794. sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
  4795. btrfs_cmp_device_info, NULL);
  4796. return 0;
  4797. }
  4798. static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
  4799. struct btrfs_device_info *devices_info)
  4800. {
  4801. /* Number of stripes that count for block group size */
  4802. int data_stripes;
  4803. /*
  4804. * The primary goal is to maximize the number of stripes, so use as
  4805. * many devices as possible, even if the stripes are not maximum sized.
  4806. *
  4807. * The DUP profile stores more than one stripe per device, the
  4808. * max_avail is the total size so we have to adjust.
  4809. */
  4810. ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
  4811. ctl->dev_stripes);
  4812. ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
  4813. /* This will have to be fixed for RAID1 and RAID10 over more drives */
  4814. data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
  4815. /*
  4816. * Use the number of data stripes to figure out how big this chunk is
  4817. * really going to be in terms of logical address space, and compare
  4818. * that answer with the max chunk size. If it's higher, we try to
  4819. * reduce stripe_size.
  4820. */
  4821. if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
  4822. /*
  4823. * Reduce stripe_size, round it up to a 16MB boundary again and
  4824. * then use it, unless it ends up being even bigger than the
  4825. * previous value we had already.
  4826. */
  4827. ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
  4828. data_stripes), SZ_16M),
  4829. ctl->stripe_size);
  4830. }
  4831. /* Stripe size should not go beyond 1G. */
  4832. ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
  4833. /* Align to BTRFS_STRIPE_LEN */
  4834. ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
  4835. ctl->chunk_size = ctl->stripe_size * data_stripes;
  4836. return 0;
  4837. }
  4838. static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
  4839. struct btrfs_device_info *devices_info)
  4840. {
  4841. u64 zone_size = devices_info[0].dev->zone_info->zone_size;
  4842. /* Number of stripes that count for block group size */
  4843. int data_stripes;
  4844. /*
  4845. * It should hold because:
  4846. * dev_extent_min == dev_extent_want == zone_size * dev_stripes
  4847. */
  4848. ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min,
  4849. "ndevs=%d max_avail=%llu dev_extent_min=%llu", ctl->ndevs,
  4850. devices_info[ctl->ndevs - 1].max_avail, ctl->dev_extent_min);
  4851. ctl->stripe_size = zone_size;
  4852. ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
  4853. data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
  4854. /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
  4855. if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
  4856. ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
  4857. ctl->stripe_size) + ctl->nparity,
  4858. ctl->dev_stripes);
  4859. ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
  4860. data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
  4861. ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size,
  4862. "stripe_size=%llu data_stripes=%d max_chunk_size=%llu",
  4863. ctl->stripe_size, data_stripes, ctl->max_chunk_size);
  4864. }
  4865. ctl->chunk_size = ctl->stripe_size * data_stripes;
  4866. return 0;
  4867. }
  4868. static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
  4869. struct alloc_chunk_ctl *ctl,
  4870. struct btrfs_device_info *devices_info)
  4871. {
  4872. struct btrfs_fs_info *info = fs_devices->fs_info;
  4873. /*
  4874. * Round down to number of usable stripes, devs_increment can be any
  4875. * number so we can't use round_down() that requires power of 2, while
  4876. * rounddown is safe.
  4877. */
  4878. ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
  4879. if (ctl->ndevs < ctl->devs_min) {
  4880. if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
  4881. btrfs_debug(info,
  4882. "%s: not enough devices with free space: have=%d minimum required=%d",
  4883. __func__, ctl->ndevs, ctl->devs_min);
  4884. }
  4885. return -ENOSPC;
  4886. }
  4887. ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
  4888. switch (fs_devices->chunk_alloc_policy) {
  4889. default:
  4890. btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
  4891. fallthrough;
  4892. case BTRFS_CHUNK_ALLOC_REGULAR:
  4893. return decide_stripe_size_regular(ctl, devices_info);
  4894. case BTRFS_CHUNK_ALLOC_ZONED:
  4895. return decide_stripe_size_zoned(ctl, devices_info);
  4896. }
  4897. }
  4898. static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
  4899. {
  4900. for (int i = 0; i < map->num_stripes; i++) {
  4901. struct btrfs_io_stripe *stripe = &map->stripes[i];
  4902. struct btrfs_device *device = stripe->dev;
  4903. btrfs_set_extent_bit(&device->alloc_state, stripe->physical,
  4904. stripe->physical + map->stripe_size - 1,
  4905. bits | EXTENT_NOWAIT, NULL);
  4906. }
  4907. }
  4908. void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
  4909. {
  4910. for (int i = 0; i < map->num_stripes; i++) {
  4911. struct btrfs_io_stripe *stripe = &map->stripes[i];
  4912. struct btrfs_device *device = stripe->dev;
  4913. btrfs_clear_extent_bit(&device->alloc_state, stripe->physical,
  4914. stripe->physical + map->stripe_size - 1,
  4915. bits | EXTENT_NOWAIT, NULL);
  4916. }
  4917. }
  4918. void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
  4919. {
  4920. write_lock(&fs_info->mapping_tree_lock);
  4921. rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
  4922. RB_CLEAR_NODE(&map->rb_node);
  4923. btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
  4924. write_unlock(&fs_info->mapping_tree_lock);
  4925. /* Once for the tree reference. */
  4926. btrfs_free_chunk_map(map);
  4927. }
  4928. static int btrfs_chunk_map_cmp(const struct rb_node *new,
  4929. const struct rb_node *exist)
  4930. {
  4931. const struct btrfs_chunk_map *new_map =
  4932. rb_entry(new, struct btrfs_chunk_map, rb_node);
  4933. const struct btrfs_chunk_map *exist_map =
  4934. rb_entry(exist, struct btrfs_chunk_map, rb_node);
  4935. if (new_map->start == exist_map->start)
  4936. return 0;
  4937. if (new_map->start < exist_map->start)
  4938. return -1;
  4939. return 1;
  4940. }
  4941. EXPORT_FOR_TESTS
  4942. int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
  4943. {
  4944. struct rb_node *exist;
  4945. write_lock(&fs_info->mapping_tree_lock);
  4946. exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree,
  4947. btrfs_chunk_map_cmp);
  4948. if (exist) {
  4949. write_unlock(&fs_info->mapping_tree_lock);
  4950. return -EEXIST;
  4951. }
  4952. chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
  4953. btrfs_chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
  4954. write_unlock(&fs_info->mapping_tree_lock);
  4955. return 0;
  4956. }
  4957. EXPORT_FOR_TESTS
  4958. struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
  4959. {
  4960. struct btrfs_chunk_map *map;
  4961. map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);
  4962. if (!map)
  4963. return NULL;
  4964. refcount_set(&map->refs, 1);
  4965. RB_CLEAR_NODE(&map->rb_node);
  4966. return map;
  4967. }
  4968. static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
  4969. struct alloc_chunk_ctl *ctl,
  4970. struct btrfs_device_info *devices_info)
  4971. {
  4972. struct btrfs_fs_info *info = trans->fs_info;
  4973. struct btrfs_chunk_map *map;
  4974. struct btrfs_block_group *block_group;
  4975. u64 start = ctl->start;
  4976. u64 type = ctl->type;
  4977. int ret;
  4978. map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
  4979. if (!map)
  4980. return ERR_PTR(-ENOMEM);
  4981. map->start = start;
  4982. map->chunk_len = ctl->chunk_size;
  4983. map->stripe_size = ctl->stripe_size;
  4984. map->type = type;
  4985. map->io_align = BTRFS_STRIPE_LEN;
  4986. map->io_width = BTRFS_STRIPE_LEN;
  4987. map->sub_stripes = ctl->sub_stripes;
  4988. map->num_stripes = ctl->num_stripes;
  4989. for (int i = 0; i < ctl->ndevs; i++) {
  4990. for (int j = 0; j < ctl->dev_stripes; j++) {
  4991. int s = i * ctl->dev_stripes + j;
  4992. map->stripes[s].dev = devices_info[i].dev;
  4993. map->stripes[s].physical = devices_info[i].dev_offset +
  4994. j * ctl->stripe_size;
  4995. }
  4996. }
  4997. trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
  4998. ret = btrfs_add_chunk_map(info, map);
  4999. if (ret) {
  5000. btrfs_free_chunk_map(map);
  5001. return ERR_PTR(ret);
  5002. }
  5003. block_group = btrfs_make_block_group(trans, ctl->space_info, type, start,
  5004. ctl->chunk_size);
  5005. if (IS_ERR(block_group)) {
  5006. btrfs_remove_chunk_map(info, map);
  5007. return block_group;
  5008. }
  5009. for (int i = 0; i < map->num_stripes; i++) {
  5010. struct btrfs_device *dev = map->stripes[i].dev;
  5011. btrfs_device_set_bytes_used(dev,
  5012. dev->bytes_used + ctl->stripe_size);
  5013. if (list_empty(&dev->post_commit_list))
  5014. list_add_tail(&dev->post_commit_list,
  5015. &trans->transaction->dev_update_list);
  5016. }
  5017. atomic64_sub(ctl->stripe_size * map->num_stripes,
  5018. &info->free_chunk_space);
  5019. check_raid56_incompat_flag(info, type);
  5020. check_raid1c34_incompat_flag(info, type);
  5021. return block_group;
  5022. }
  5023. struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
  5024. struct btrfs_space_info *space_info,
  5025. u64 type)
  5026. {
  5027. struct btrfs_fs_info *info = trans->fs_info;
  5028. struct btrfs_fs_devices *fs_devices = info->fs_devices;
  5029. struct btrfs_device_info AUTO_KFREE(devices_info);
  5030. struct alloc_chunk_ctl ctl;
  5031. int ret;
  5032. lockdep_assert_held(&info->chunk_mutex);
  5033. if (!alloc_profile_is_valid(type, 0)) {
  5034. DEBUG_WARN("invalid alloc profile for type %llu", type);
  5035. return ERR_PTR(-EINVAL);
  5036. }
  5037. if (list_empty(&fs_devices->alloc_list)) {
  5038. if (btrfs_test_opt(info, ENOSPC_DEBUG))
  5039. btrfs_debug(info, "%s: no writable device", __func__);
  5040. return ERR_PTR(-ENOSPC);
  5041. }
  5042. if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
  5043. btrfs_err(info, "invalid chunk type 0x%llx requested", type);
  5044. DEBUG_WARN();
  5045. return ERR_PTR(-EINVAL);
  5046. }
  5047. ctl.start = find_next_chunk(info);
  5048. ctl.type = type;
  5049. ctl.space_info = space_info;
  5050. init_alloc_chunk_ctl(fs_devices, &ctl);
  5051. devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
  5052. GFP_NOFS);
  5053. if (!devices_info)
  5054. return ERR_PTR(-ENOMEM);
  5055. ret = gather_device_info(fs_devices, &ctl, devices_info);
  5056. if (ret < 0)
  5057. return ERR_PTR(ret);
  5058. ret = decide_stripe_size(fs_devices, &ctl, devices_info);
  5059. if (ret < 0)
  5060. return ERR_PTR(ret);
  5061. return create_chunk(trans, &ctl, devices_info);
  5062. }
  5063. /*
  5064. * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
  5065. * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
  5066. * chunks.
  5067. *
  5068. * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
  5069. * phases.
  5070. */
  5071. int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
  5072. struct btrfs_block_group *bg)
  5073. {
  5074. struct btrfs_fs_info *fs_info = trans->fs_info;
  5075. struct btrfs_root *chunk_root = fs_info->chunk_root;
  5076. struct btrfs_key key;
  5077. struct btrfs_chunk *chunk;
  5078. struct btrfs_stripe *stripe;
  5079. struct btrfs_chunk_map *map;
  5080. size_t item_size;
  5081. int i;
  5082. int ret;
  5083. /*
  5084. * We take the chunk_mutex for 2 reasons:
  5085. *
  5086. * 1) Updates and insertions in the chunk btree must be done while holding
  5087. * the chunk_mutex, as well as updating the system chunk array in the
  5088. * superblock. See the comment on top of btrfs_chunk_alloc() for the
  5089. * details;
  5090. *
  5091. * 2) To prevent races with the final phase of a device replace operation
  5092. * that replaces the device object associated with the map's stripes,
  5093. * because the device object's id can change at any time during that
  5094. * final phase of the device replace operation
  5095. * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
  5096. * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
  5097. * which would cause a failure when updating the device item, which does
  5098. * not exists, or persisting a stripe of the chunk item with such ID.
  5099. * Here we can't use the device_list_mutex because our caller already
  5100. * has locked the chunk_mutex, and the final phase of device replace
  5101. * acquires both mutexes - first the device_list_mutex and then the
  5102. * chunk_mutex. Using any of those two mutexes protects us from a
  5103. * concurrent device replace.
  5104. */
  5105. lockdep_assert_held(&fs_info->chunk_mutex);
  5106. map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
  5107. if (IS_ERR(map)) {
  5108. ret = PTR_ERR(map);
  5109. btrfs_abort_transaction(trans, ret);
  5110. return ret;
  5111. }
  5112. item_size = btrfs_chunk_item_size(map->num_stripes);
  5113. chunk = kzalloc(item_size, GFP_NOFS);
  5114. if (unlikely(!chunk)) {
  5115. ret = -ENOMEM;
  5116. btrfs_abort_transaction(trans, ret);
  5117. goto out;
  5118. }
  5119. for (i = 0; i < map->num_stripes; i++) {
  5120. struct btrfs_device *device = map->stripes[i].dev;
  5121. ret = btrfs_update_device(trans, device);
  5122. if (ret)
  5123. goto out;
  5124. }
  5125. stripe = &chunk->stripe;
  5126. for (i = 0; i < map->num_stripes; i++) {
  5127. struct btrfs_device *device = map->stripes[i].dev;
  5128. const u64 dev_offset = map->stripes[i].physical;
  5129. btrfs_set_stack_stripe_devid(stripe, device->devid);
  5130. btrfs_set_stack_stripe_offset(stripe, dev_offset);
  5131. memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
  5132. stripe++;
  5133. }
  5134. btrfs_set_stack_chunk_length(chunk, bg->length);
  5135. btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
  5136. btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
  5137. btrfs_set_stack_chunk_type(chunk, map->type);
  5138. btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
  5139. btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
  5140. btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
  5141. btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
  5142. btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
  5143. key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
  5144. key.type = BTRFS_CHUNK_ITEM_KEY;
  5145. key.offset = bg->start;
  5146. ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
  5147. if (ret)
  5148. goto out;
  5149. set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
  5150. if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
  5151. ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
  5152. if (ret)
  5153. goto out;
  5154. }
  5155. out:
  5156. kfree(chunk);
  5157. btrfs_free_chunk_map(map);
  5158. return ret;
  5159. }
  5160. static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
  5161. {
  5162. struct btrfs_fs_info *fs_info = trans->fs_info;
  5163. u64 alloc_profile;
  5164. struct btrfs_block_group *meta_bg;
  5165. struct btrfs_space_info *meta_space_info;
  5166. struct btrfs_block_group *sys_bg;
  5167. struct btrfs_space_info *sys_space_info;
  5168. /*
  5169. * When adding a new device for sprouting, the seed device is read-only
  5170. * so we must first allocate a metadata and a system chunk. But before
  5171. * adding the block group items to the extent, device and chunk btrees,
  5172. * we must first:
  5173. *
  5174. * 1) Create both chunks without doing any changes to the btrees, as
  5175. * otherwise we would get -ENOSPC since the block groups from the
  5176. * seed device are read-only;
  5177. *
  5178. * 2) Add the device item for the new sprout device - finishing the setup
  5179. * of a new block group requires updating the device item in the chunk
  5180. * btree, so it must exist when we attempt to do it. The previous step
  5181. * ensures this does not fail with -ENOSPC.
  5182. *
  5183. * After that we can add the block group items to their btrees:
  5184. * update existing device item in the chunk btree, add a new block group
  5185. * item to the extent btree, add a new chunk item to the chunk btree and
  5186. * finally add the new device extent items to the devices btree.
  5187. */
  5188. alloc_profile = btrfs_metadata_alloc_profile(fs_info);
  5189. meta_space_info = btrfs_find_space_info(fs_info, alloc_profile);
  5190. if (!meta_space_info) {
  5191. DEBUG_WARN();
  5192. return -EINVAL;
  5193. }
  5194. meta_bg = btrfs_create_chunk(trans, meta_space_info, alloc_profile);
  5195. if (IS_ERR(meta_bg))
  5196. return PTR_ERR(meta_bg);
  5197. alloc_profile = btrfs_system_alloc_profile(fs_info);
  5198. sys_space_info = btrfs_find_space_info(fs_info, alloc_profile);
  5199. if (!sys_space_info) {
  5200. DEBUG_WARN();
  5201. return -EINVAL;
  5202. }
  5203. sys_bg = btrfs_create_chunk(trans, sys_space_info, alloc_profile);
  5204. if (IS_ERR(sys_bg))
  5205. return PTR_ERR(sys_bg);
  5206. return 0;
  5207. }
  5208. static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
  5209. {
  5210. const int index = btrfs_bg_flags_to_raid_index(map->type);
  5211. return btrfs_raid_array[index].tolerated_failures;
  5212. }
  5213. bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
  5214. {
  5215. struct btrfs_chunk_map *map;
  5216. int miss_ndevs = 0;
  5217. int i;
  5218. bool ret = true;
  5219. map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
  5220. if (IS_ERR(map))
  5221. return false;
  5222. for (i = 0; i < map->num_stripes; i++) {
  5223. if (test_bit(BTRFS_DEV_STATE_MISSING,
  5224. &map->stripes[i].dev->dev_state)) {
  5225. miss_ndevs++;
  5226. continue;
  5227. }
  5228. if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
  5229. &map->stripes[i].dev->dev_state)) {
  5230. ret = false;
  5231. goto end;
  5232. }
  5233. }
  5234. /*
  5235. * If the number of missing devices is larger than max errors, we can
  5236. * not write the data into that chunk successfully.
  5237. */
  5238. if (miss_ndevs > btrfs_chunk_max_errors(map))
  5239. ret = false;
  5240. end:
  5241. btrfs_free_chunk_map(map);
  5242. return ret;
  5243. }
  5244. void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
  5245. {
  5246. write_lock(&fs_info->mapping_tree_lock);
  5247. while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
  5248. struct btrfs_chunk_map *map;
  5249. struct rb_node *node;
  5250. node = rb_first_cached(&fs_info->mapping_tree);
  5251. map = rb_entry(node, struct btrfs_chunk_map, rb_node);
  5252. rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
  5253. RB_CLEAR_NODE(&map->rb_node);
  5254. btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
  5255. /* Once for the tree ref. */
  5256. btrfs_free_chunk_map(map);
  5257. cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
  5258. }
  5259. write_unlock(&fs_info->mapping_tree_lock);
  5260. }
  5261. static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map)
  5262. {
  5263. enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type);
  5264. if (map->type & BTRFS_BLOCK_GROUP_RAID5)
  5265. return 2;
  5266. /*
  5267. * There could be two corrupted data stripes, we need to loop retry in
  5268. * order to rebuild the correct data.
  5269. *
  5270. * Fail a stripe at a time on every retry except the stripe under
  5271. * reconstruction.
  5272. */
  5273. if (map->type & BTRFS_BLOCK_GROUP_RAID6)
  5274. return map->num_stripes;
  5275. /* Non-RAID56, use their ncopies from btrfs_raid_array. */
  5276. return btrfs_raid_array[index].ncopies;
  5277. }
  5278. int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
  5279. {
  5280. struct btrfs_chunk_map *map;
  5281. int ret;
  5282. map = btrfs_get_chunk_map(fs_info, logical, len);
  5283. if (IS_ERR(map))
  5284. /*
  5285. * We could return errors for these cases, but that could get
  5286. * ugly and we'd probably do the same thing which is just not do
  5287. * anything else and exit, so return 1 so the callers don't try
  5288. * to use other copies.
  5289. */
  5290. return 1;
  5291. ret = btrfs_chunk_map_num_copies(map);
  5292. btrfs_free_chunk_map(map);
  5293. return ret;
  5294. }
  5295. unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
  5296. u64 logical)
  5297. {
  5298. struct btrfs_chunk_map *map;
  5299. unsigned long len = fs_info->sectorsize;
  5300. if (!btrfs_fs_incompat(fs_info, RAID56))
  5301. return len;
  5302. map = btrfs_get_chunk_map(fs_info, logical, len);
  5303. if (!WARN_ON(IS_ERR(map))) {
  5304. if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
  5305. len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
  5306. btrfs_free_chunk_map(map);
  5307. }
  5308. return len;
  5309. }
  5310. #ifdef CONFIG_BTRFS_EXPERIMENTAL
  5311. static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes)
  5312. {
  5313. for (int index = first; index < first + num_stripes; index++) {
  5314. const struct btrfs_device *device = map->stripes[index].dev;
  5315. if (device->devid == READ_ONCE(device->fs_devices->read_devid))
  5316. return index;
  5317. }
  5318. /* If no read-preferred device is set use the first stripe. */
  5319. return first;
  5320. }
  5321. struct stripe_mirror {
  5322. u64 devid;
  5323. int num;
  5324. };
  5325. static int btrfs_cmp_devid(const void *a, const void *b)
  5326. {
  5327. const struct stripe_mirror *s1 = (const struct stripe_mirror *)a;
  5328. const struct stripe_mirror *s2 = (const struct stripe_mirror *)b;
  5329. if (s1->devid < s2->devid)
  5330. return -1;
  5331. if (s1->devid > s2->devid)
  5332. return 1;
  5333. return 0;
  5334. }
  5335. /*
  5336. * Select a stripe for reading using the round-robin algorithm.
  5337. *
  5338. * 1. Compute the read cycle as the total sectors read divided by the minimum
  5339. * sectors per device.
  5340. * 2. Determine the stripe number for the current read by taking the modulus
  5341. * of the read cycle with the total number of stripes:
  5342. *
  5343. * stripe index = (total sectors / min sectors per dev) % num stripes
  5344. *
  5345. * The calculated stripe index is then used to select the corresponding device
  5346. * from the list of devices, which is ordered by devid.
  5347. */
  5348. static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes)
  5349. {
  5350. struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 };
  5351. struct btrfs_device *device = map->stripes[first].dev;
  5352. struct btrfs_fs_info *fs_info = device->fs_devices->fs_info;
  5353. unsigned int read_cycle;
  5354. unsigned int total_reads;
  5355. unsigned int min_reads_per_dev;
  5356. total_reads = percpu_counter_sum(&fs_info->stats_read_blocks);
  5357. min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >>
  5358. fs_info->sectorsize_bits;
  5359. for (int index = 0, i = first; i < first + num_stripes; i++) {
  5360. stripes[index].devid = map->stripes[i].dev->devid;
  5361. stripes[index].num = i;
  5362. index++;
  5363. }
  5364. sort(stripes, num_stripes, sizeof(struct stripe_mirror),
  5365. btrfs_cmp_devid, NULL);
  5366. read_cycle = total_reads / min_reads_per_dev;
  5367. return stripes[read_cycle % num_stripes].num;
  5368. }
  5369. #endif
  5370. static int find_live_mirror(struct btrfs_fs_info *fs_info,
  5371. struct btrfs_chunk_map *map, int first,
  5372. bool dev_replace_is_ongoing)
  5373. {
  5374. const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
  5375. int i;
  5376. int num_stripes;
  5377. int preferred_mirror;
  5378. int tolerance;
  5379. struct btrfs_device *srcdev;
  5380. ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)),
  5381. "type=%llu", map->type);
  5382. if (map->type & BTRFS_BLOCK_GROUP_RAID10)
  5383. num_stripes = map->sub_stripes;
  5384. else
  5385. num_stripes = map->num_stripes;
  5386. switch (policy) {
  5387. default:
  5388. /* Shouldn't happen, just warn and use pid instead of failing */
  5389. btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid",
  5390. policy);
  5391. WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID);
  5392. fallthrough;
  5393. case BTRFS_READ_POLICY_PID:
  5394. preferred_mirror = first + (current->pid % num_stripes);
  5395. break;
  5396. #ifdef CONFIG_BTRFS_EXPERIMENTAL
  5397. case BTRFS_READ_POLICY_RR:
  5398. preferred_mirror = btrfs_read_rr(map, first, num_stripes);
  5399. break;
  5400. case BTRFS_READ_POLICY_DEVID:
  5401. preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
  5402. break;
  5403. #endif
  5404. }
  5405. if (dev_replace_is_ongoing &&
  5406. fs_info->dev_replace.cont_reading_from_srcdev_mode ==
  5407. BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
  5408. srcdev = fs_info->dev_replace.srcdev;
  5409. else
  5410. srcdev = NULL;
  5411. /*
  5412. * try to avoid the drive that is the source drive for a
  5413. * dev-replace procedure, only choose it if no other non-missing
  5414. * mirror is available
  5415. */
  5416. for (tolerance = 0; tolerance < 2; tolerance++) {
  5417. if (map->stripes[preferred_mirror].dev->bdev &&
  5418. (tolerance || map->stripes[preferred_mirror].dev != srcdev))
  5419. return preferred_mirror;
  5420. for (i = first; i < first + num_stripes; i++) {
  5421. if (map->stripes[i].dev->bdev &&
  5422. (tolerance || map->stripes[i].dev != srcdev))
  5423. return i;
  5424. }
  5425. }
  5426. /* we couldn't find one that doesn't fail. Just return something
  5427. * and the io error handling code will clean up eventually
  5428. */
  5429. return preferred_mirror;
  5430. }
  5431. EXPORT_FOR_TESTS
  5432. struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
  5433. u64 logical, u16 total_stripes)
  5434. {
  5435. struct btrfs_io_context *bioc;
  5436. bioc = kzalloc_flex(*bioc, stripes, total_stripes, GFP_NOFS);
  5437. if (!bioc)
  5438. return NULL;
  5439. refcount_set(&bioc->refs, 1);
  5440. bioc->fs_info = fs_info;
  5441. bioc->replace_stripe_src = -1;
  5442. bioc->full_stripe_logical = (u64)-1;
  5443. bioc->logical = logical;
  5444. return bioc;
  5445. }
  5446. void btrfs_get_bioc(struct btrfs_io_context *bioc)
  5447. {
  5448. WARN_ON(!refcount_read(&bioc->refs));
  5449. refcount_inc(&bioc->refs);
  5450. }
  5451. void btrfs_put_bioc(struct btrfs_io_context *bioc)
  5452. {
  5453. if (!bioc)
  5454. return;
  5455. if (refcount_dec_and_test(&bioc->refs))
  5456. kfree(bioc);
  5457. }
  5458. /*
  5459. * Please note that, discard won't be sent to target device of device
  5460. * replace.
  5461. */
  5462. struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
  5463. u64 logical, u64 *length_ret,
  5464. u32 *num_stripes, bool do_remap)
  5465. {
  5466. struct btrfs_chunk_map *map;
  5467. struct btrfs_discard_stripe *stripes;
  5468. u64 length = *length_ret;
  5469. u64 offset;
  5470. u32 stripe_nr;
  5471. u32 stripe_nr_end;
  5472. u32 stripe_cnt;
  5473. u64 stripe_end_offset;
  5474. u64 stripe_offset;
  5475. u32 stripe_index;
  5476. u32 factor = 0;
  5477. u32 sub_stripes = 0;
  5478. u32 stripes_per_dev = 0;
  5479. u32 remaining_stripes = 0;
  5480. u32 last_stripe = 0;
  5481. int ret;
  5482. int i;
  5483. map = btrfs_get_chunk_map(fs_info, logical, length);
  5484. if (IS_ERR(map))
  5485. return ERR_CAST(map);
  5486. if (do_remap && (map->type & BTRFS_BLOCK_GROUP_REMAPPED)) {
  5487. u64 new_logical = logical;
  5488. ret = btrfs_translate_remap(fs_info, &new_logical, &length);
  5489. if (ret)
  5490. goto out_free_map;
  5491. if (new_logical != logical) {
  5492. btrfs_free_chunk_map(map);
  5493. map = btrfs_get_chunk_map(fs_info, new_logical, length);
  5494. if (IS_ERR(map))
  5495. return ERR_CAST(map);
  5496. logical = new_logical;
  5497. }
  5498. }
  5499. /* we don't discard raid56 yet */
  5500. if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
  5501. ret = -EOPNOTSUPP;
  5502. goto out_free_map;
  5503. }
  5504. offset = logical - map->start;
  5505. length = min_t(u64, map->start + map->chunk_len - logical, length);
  5506. *length_ret = length;
  5507. /*
  5508. * stripe_nr counts the total number of stripes we have to stride
  5509. * to get to this block
  5510. */
  5511. stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
  5512. /* stripe_offset is the offset of this block in its stripe */
  5513. stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);
  5514. stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
  5515. BTRFS_STRIPE_LEN_SHIFT;
  5516. stripe_cnt = stripe_nr_end - stripe_nr;
  5517. stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) -
  5518. (offset + length);
  5519. /*
  5520. * after this, stripe_nr is the number of stripes on this
  5521. * device we have to walk to find the data, and stripe_index is
  5522. * the number of our device in the stripe array
  5523. */
  5524. *num_stripes = 1;
  5525. stripe_index = 0;
  5526. if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
  5527. BTRFS_BLOCK_GROUP_RAID10)) {
  5528. if (map->type & BTRFS_BLOCK_GROUP_RAID0)
  5529. sub_stripes = 1;
  5530. else
  5531. sub_stripes = map->sub_stripes;
  5532. factor = map->num_stripes / sub_stripes;
  5533. *num_stripes = min_t(u64, map->num_stripes,
  5534. sub_stripes * stripe_cnt);
  5535. stripe_index = stripe_nr % factor;
  5536. stripe_nr /= factor;
  5537. stripe_index *= sub_stripes;
  5538. remaining_stripes = stripe_cnt % factor;
  5539. stripes_per_dev = stripe_cnt / factor;
  5540. last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
  5541. } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
  5542. BTRFS_BLOCK_GROUP_DUP)) {
  5543. *num_stripes = map->num_stripes;
  5544. } else {
  5545. stripe_index = stripe_nr % map->num_stripes;
  5546. stripe_nr /= map->num_stripes;
  5547. }
  5548. stripes = kzalloc_objs(*stripes, *num_stripes, GFP_NOFS);
  5549. if (!stripes) {
  5550. ret = -ENOMEM;
  5551. goto out_free_map;
  5552. }
  5553. for (i = 0; i < *num_stripes; i++) {
  5554. stripes[i].physical =
  5555. map->stripes[stripe_index].physical +
  5556. stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
  5557. stripes[i].dev = map->stripes[stripe_index].dev;
  5558. if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
  5559. BTRFS_BLOCK_GROUP_RAID10)) {
  5560. stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev);
  5561. if (i / sub_stripes < remaining_stripes)
  5562. stripes[i].length += BTRFS_STRIPE_LEN;
  5563. /*
  5564. * Special for the first stripe and
  5565. * the last stripe:
  5566. *
  5567. * |-------|...|-------|
  5568. * |----------|
  5569. * off end_off
  5570. */
  5571. if (i < sub_stripes)
  5572. stripes[i].length -= stripe_offset;
  5573. if (stripe_index >= last_stripe &&
  5574. stripe_index <= (last_stripe +
  5575. sub_stripes - 1))
  5576. stripes[i].length -= stripe_end_offset;
  5577. if (i == sub_stripes - 1)
  5578. stripe_offset = 0;
  5579. } else {
  5580. stripes[i].length = length;
  5581. }
  5582. stripe_index++;
  5583. if (stripe_index == map->num_stripes) {
  5584. stripe_index = 0;
  5585. stripe_nr++;
  5586. }
  5587. }
  5588. btrfs_free_chunk_map(map);
  5589. return stripes;
  5590. out_free_map:
  5591. btrfs_free_chunk_map(map);
  5592. return ERR_PTR(ret);
  5593. }
  5594. static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
  5595. {
  5596. struct btrfs_block_group *cache;
  5597. bool ret;
  5598. /* Non zoned filesystem does not use "to_copy" flag */
  5599. if (!btrfs_is_zoned(fs_info))
  5600. return false;
  5601. cache = btrfs_lookup_block_group(fs_info, logical);
  5602. ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
  5603. btrfs_put_block_group(cache);
  5604. return ret;
  5605. }
  5606. static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
  5607. struct btrfs_dev_replace *dev_replace,
  5608. u64 logical,
  5609. struct btrfs_io_geometry *io_geom)
  5610. {
  5611. u64 srcdev_devid = dev_replace->srcdev->devid;
  5612. /*
  5613. * At this stage, num_stripes is still the real number of stripes,
  5614. * excluding the duplicated stripes.
  5615. */
  5616. int num_stripes = io_geom->num_stripes;
  5617. int max_errors = io_geom->max_errors;
  5618. int nr_extra_stripes = 0;
  5619. int i;
  5620. /*
  5621. * A block group which has "to_copy" set will eventually be copied by
  5622. * the dev-replace process. We can avoid cloning IO here.
  5623. */
  5624. if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
  5625. return;
  5626. /*
  5627. * Duplicate the write operations while the dev-replace procedure is
  5628. * running. Since the copying of the old disk to the new disk takes
  5629. * place at run time while the filesystem is mounted writable, the
  5630. * regular write operations to the old disk have to be duplicated to go
  5631. * to the new disk as well.
  5632. *
  5633. * Note that device->missing is handled by the caller, and that the
  5634. * write to the old disk is already set up in the stripes array.
  5635. */
  5636. for (i = 0; i < num_stripes; i++) {
  5637. struct btrfs_io_stripe *old = &bioc->stripes[i];
  5638. struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];
  5639. if (old->dev->devid != srcdev_devid)
  5640. continue;
  5641. new->physical = old->physical;
  5642. new->dev = dev_replace->tgtdev;
  5643. if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
  5644. bioc->replace_stripe_src = i;
  5645. nr_extra_stripes++;
  5646. }
  5647. /* We can only have at most 2 extra nr_stripes (for DUP). */
  5648. ASSERT(nr_extra_stripes <= 2, "nr_extra_stripes=%d", nr_extra_stripes);
  5649. /*
  5650. * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
  5651. * replace.
  5652. * If we have 2 extra stripes, only choose the one with smaller physical.
  5653. */
  5654. if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
  5655. struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
  5656. struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
  5657. /* Only DUP can have two extra stripes. */
  5658. ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP,
  5659. "map_type=%llu", bioc->map_type);
  5660. /*
  5661. * Swap the last stripe stripes and reduce @nr_extra_stripes.
  5662. * The extra stripe would still be there, but won't be accessed.
  5663. */
  5664. if (first->physical > second->physical) {
  5665. swap(second->physical, first->physical);
  5666. swap(second->dev, first->dev);
  5667. nr_extra_stripes--;
  5668. }
  5669. }
  5670. io_geom->num_stripes = num_stripes + nr_extra_stripes;
  5671. io_geom->max_errors = max_errors + nr_extra_stripes;
  5672. bioc->replace_nr_stripes = nr_extra_stripes;
  5673. }
  5674. static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
  5675. struct btrfs_io_geometry *io_geom)
  5676. {
  5677. /*
  5678. * Stripe_nr is the stripe where this block falls. stripe_offset is
  5679. * the offset of this block in its stripe.
  5680. */
  5681. io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
  5682. io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
  5683. ASSERT(io_geom->stripe_offset < U32_MAX,
  5684. "stripe_offset=%llu", io_geom->stripe_offset);
  5685. if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
  5686. unsigned long full_stripe_len =
  5687. btrfs_stripe_nr_to_offset(nr_data_stripes(map));
  5688. /*
  5689. * For full stripe start, we use previously calculated
  5690. * @stripe_nr. Align it to nr_data_stripes, then multiply with
  5691. * STRIPE_LEN.
  5692. *
  5693. * By this we can avoid u64 division completely. And we have
  5694. * to go rounddown(), not round_down(), as nr_data_stripes is
  5695. * not ensured to be power of 2.
  5696. */
  5697. io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
  5698. rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
  5699. ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset,
  5700. "raid56_full_stripe_start=%llu full_stripe_len=%lu offset=%llu",
  5701. io_geom->raid56_full_stripe_start, full_stripe_len, offset);
  5702. ASSERT(io_geom->raid56_full_stripe_start <= offset,
  5703. "raid56_full_stripe_start=%llu offset=%llu",
  5704. io_geom->raid56_full_stripe_start, offset);
  5705. /*
  5706. * For writes to RAID56, allow to write a full stripe set, but
  5707. * no straddling of stripe sets.
  5708. */
  5709. if (io_geom->op == BTRFS_MAP_WRITE)
  5710. return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
  5711. }
  5712. /*
  5713. * For other RAID types and for RAID56 reads, allow a single stripe (on
  5714. * a single disk).
  5715. */
  5716. if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
  5717. return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
  5718. return U64_MAX;
  5719. }
  5720. static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
  5721. u64 *length, struct btrfs_io_stripe *dst,
  5722. struct btrfs_chunk_map *map,
  5723. struct btrfs_io_geometry *io_geom)
  5724. {
  5725. dst->dev = map->stripes[io_geom->stripe_index].dev;
  5726. if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst)
  5727. return btrfs_get_raid_extent_offset(fs_info, logical, length,
  5728. map->type,
  5729. io_geom->stripe_index, dst);
  5730. dst->physical = map->stripes[io_geom->stripe_index].physical +
  5731. io_geom->stripe_offset +
  5732. btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
  5733. return 0;
  5734. }
  5735. static bool is_single_device_io(struct btrfs_fs_info *fs_info,
  5736. const struct btrfs_io_stripe *smap,
  5737. const struct btrfs_chunk_map *map,
  5738. int num_alloc_stripes,
  5739. struct btrfs_io_geometry *io_geom)
  5740. {
  5741. if (!smap)
  5742. return false;
  5743. if (num_alloc_stripes != 1)
  5744. return false;
  5745. if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ)
  5746. return false;
  5747. if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1)
  5748. return false;
  5749. return true;
  5750. }
  5751. static void map_blocks_raid0(const struct btrfs_chunk_map *map,
  5752. struct btrfs_io_geometry *io_geom)
  5753. {
  5754. io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
  5755. io_geom->stripe_nr /= map->num_stripes;
  5756. if (io_geom->op == BTRFS_MAP_READ)
  5757. io_geom->mirror_num = 1;
  5758. }
  5759. static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
  5760. struct btrfs_chunk_map *map,
  5761. struct btrfs_io_geometry *io_geom,
  5762. bool dev_replace_is_ongoing)
  5763. {
  5764. if (io_geom->op != BTRFS_MAP_READ) {
  5765. io_geom->num_stripes = map->num_stripes;
  5766. return;
  5767. }
  5768. if (io_geom->mirror_num) {
  5769. io_geom->stripe_index = io_geom->mirror_num - 1;
  5770. return;
  5771. }
  5772. io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
  5773. dev_replace_is_ongoing);
  5774. io_geom->mirror_num = io_geom->stripe_index + 1;
  5775. }
  5776. static void map_blocks_dup(const struct btrfs_chunk_map *map,
  5777. struct btrfs_io_geometry *io_geom)
  5778. {
  5779. if (io_geom->op != BTRFS_MAP_READ) {
  5780. io_geom->num_stripes = map->num_stripes;
  5781. return;
  5782. }
  5783. if (io_geom->mirror_num) {
  5784. io_geom->stripe_index = io_geom->mirror_num - 1;
  5785. return;
  5786. }
  5787. io_geom->mirror_num = 1;
  5788. }
  5789. static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
  5790. struct btrfs_chunk_map *map,
  5791. struct btrfs_io_geometry *io_geom,
  5792. bool dev_replace_is_ongoing)
  5793. {
  5794. u32 factor = map->num_stripes / map->sub_stripes;
  5795. int old_stripe_index;
  5796. io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes;
  5797. io_geom->stripe_nr /= factor;
  5798. if (io_geom->op != BTRFS_MAP_READ) {
  5799. io_geom->num_stripes = map->sub_stripes;
  5800. return;
  5801. }
  5802. if (io_geom->mirror_num) {
  5803. io_geom->stripe_index += io_geom->mirror_num - 1;
  5804. return;
  5805. }
  5806. old_stripe_index = io_geom->stripe_index;
  5807. io_geom->stripe_index = find_live_mirror(fs_info, map,
  5808. io_geom->stripe_index,
  5809. dev_replace_is_ongoing);
  5810. io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1;
  5811. }
  5812. static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
  5813. struct btrfs_io_geometry *io_geom,
  5814. u64 logical, u64 *length)
  5815. {
  5816. int data_stripes = nr_data_stripes(map);
  5817. /*
  5818. * Needs full stripe mapping.
  5819. *
  5820. * Push stripe_nr back to the start of the full stripe For those cases
  5821. * needing a full stripe, @stripe_nr is the full stripe number.
  5822. *
  5823. * Originally we go raid56_full_stripe_start / full_stripe_len, but
  5824. * that can be expensive. Here we just divide @stripe_nr with
  5825. * @data_stripes.
  5826. */
  5827. io_geom->stripe_nr /= data_stripes;
  5828. /* RAID[56] write or recovery. Return all stripes */
  5829. io_geom->num_stripes = map->num_stripes;
  5830. io_geom->max_errors = btrfs_chunk_max_errors(map);
  5831. /* Return the length to the full stripe end. */
  5832. *length = min(logical + *length,
  5833. io_geom->raid56_full_stripe_start + map->start +
  5834. btrfs_stripe_nr_to_offset(data_stripes)) -
  5835. logical;
  5836. io_geom->stripe_index = 0;
  5837. io_geom->stripe_offset = 0;
  5838. }
  5839. static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
  5840. struct btrfs_io_geometry *io_geom)
  5841. {
  5842. int data_stripes = nr_data_stripes(map);
  5843. ASSERT(io_geom->mirror_num <= 1, "mirror_num=%d", io_geom->mirror_num);
  5844. /* Just grab the data stripe directly. */
  5845. io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
  5846. io_geom->stripe_nr /= data_stripes;
  5847. /* We distribute the parity blocks across stripes. */
  5848. io_geom->stripe_index =
  5849. (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes;
  5850. if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
  5851. io_geom->mirror_num = 1;
  5852. }
  5853. static void map_blocks_single(const struct btrfs_chunk_map *map,
  5854. struct btrfs_io_geometry *io_geom)
  5855. {
  5856. io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
  5857. io_geom->stripe_nr /= map->num_stripes;
  5858. io_geom->mirror_num = io_geom->stripe_index + 1;
  5859. }
  5860. /*
  5861. * Map one logical range to one or more physical ranges.
  5862. *
  5863. * @length: (Mandatory) mapped length of this run.
  5864. * One logical range can be split into different segments
  5865. * due to factors like zones and RAID0/5/6/10 stripe
  5866. * boundaries.
  5867. *
  5868. * @bioc_ret: (Mandatory) returned btrfs_io_context structure.
  5869. * which has one or more physical ranges (btrfs_io_stripe)
  5870. * recorded inside.
  5871. * Caller should call btrfs_put_bioc() to free it after use.
  5872. *
  5873. * @smap: (Optional) single physical range optimization.
  5874. * If the map request can be fulfilled by one single
  5875. * physical range, and this is parameter is not NULL,
  5876. * then @bioc_ret would be NULL, and @smap would be
  5877. * updated.
  5878. *
  5879. * @mirror_num_ret: (Mandatory) returned mirror number if the original
  5880. * value is 0.
  5881. *
  5882. * Mirror number 0 means to choose any live mirrors.
  5883. *
  5884. * For non-RAID56 profiles, non-zero mirror_num means
  5885. * the Nth mirror. (e.g. mirror_num 1 means the first
  5886. * copy).
  5887. *
  5888. * For RAID56 profile, mirror 1 means rebuild from P and
  5889. * the remaining data stripes.
  5890. *
  5891. * For RAID6 profile, mirror > 2 means mark another
  5892. * data/P stripe error and rebuild from the remaining
  5893. * stripes..
  5894. */
  5895. int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
  5896. u64 logical, u64 *length,
  5897. struct btrfs_io_context **bioc_ret,
  5898. struct btrfs_io_stripe *smap, int *mirror_num_ret)
  5899. {
  5900. struct btrfs_chunk_map *map;
  5901. struct btrfs_io_geometry io_geom = { 0 };
  5902. u64 map_offset;
  5903. int ret = 0;
  5904. int num_copies;
  5905. struct btrfs_io_context *bioc = NULL;
  5906. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  5907. bool dev_replace_is_ongoing = false;
  5908. u16 num_alloc_stripes;
  5909. u64 max_len;
  5910. ASSERT(bioc_ret);
  5911. io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
  5912. io_geom.num_stripes = 1;
  5913. io_geom.stripe_index = 0;
  5914. io_geom.op = op;
  5915. map = btrfs_get_chunk_map(fs_info, logical, *length);
  5916. if (IS_ERR(map))
  5917. return PTR_ERR(map);
  5918. if (map->type & BTRFS_BLOCK_GROUP_REMAPPED) {
  5919. u64 new_logical = logical;
  5920. ret = btrfs_translate_remap(fs_info, &new_logical, length);
  5921. if (ret)
  5922. goto out;
  5923. if (new_logical != logical) {
  5924. btrfs_free_chunk_map(map);
  5925. map = btrfs_get_chunk_map(fs_info, new_logical, *length);
  5926. if (IS_ERR(map))
  5927. return PTR_ERR(map);
  5928. logical = new_logical;
  5929. }
  5930. }
  5931. num_copies = btrfs_chunk_map_num_copies(map);
  5932. if (io_geom.mirror_num > num_copies) {
  5933. ret = -EINVAL;
  5934. goto out;
  5935. }
  5936. map_offset = logical - map->start;
  5937. io_geom.raid56_full_stripe_start = (u64)-1;
  5938. max_len = btrfs_max_io_len(map, map_offset, &io_geom);
  5939. *length = min_t(u64, map->chunk_len - map_offset, max_len);
  5940. io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
  5941. if (dev_replace->replace_task != current)
  5942. down_read(&dev_replace->rwsem);
  5943. dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
  5944. /*
  5945. * Hold the semaphore for read during the whole operation, write is
  5946. * requested at commit time but must wait.
  5947. */
  5948. if (!dev_replace_is_ongoing && dev_replace->replace_task != current)
  5949. up_read(&dev_replace->rwsem);
  5950. switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
  5951. case BTRFS_BLOCK_GROUP_RAID0:
  5952. map_blocks_raid0(map, &io_geom);
  5953. break;
  5954. case BTRFS_BLOCK_GROUP_RAID1:
  5955. case BTRFS_BLOCK_GROUP_RAID1C3:
  5956. case BTRFS_BLOCK_GROUP_RAID1C4:
  5957. map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
  5958. break;
  5959. case BTRFS_BLOCK_GROUP_DUP:
  5960. map_blocks_dup(map, &io_geom);
  5961. break;
  5962. case BTRFS_BLOCK_GROUP_RAID10:
  5963. map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
  5964. break;
  5965. case BTRFS_BLOCK_GROUP_RAID5:
  5966. case BTRFS_BLOCK_GROUP_RAID6:
  5967. if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
  5968. map_blocks_raid56_write(map, &io_geom, logical, length);
  5969. else
  5970. map_blocks_raid56_read(map, &io_geom);
  5971. break;
  5972. default:
  5973. /*
  5974. * After this, stripe_nr is the number of stripes on this
  5975. * device we have to walk to find the data, and stripe_index is
  5976. * the number of our device in the stripe array
  5977. */
  5978. map_blocks_single(map, &io_geom);
  5979. break;
  5980. }
  5981. if (io_geom.stripe_index >= map->num_stripes) {
  5982. btrfs_crit(fs_info,
  5983. "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
  5984. io_geom.stripe_index, map->num_stripes);
  5985. ret = -EINVAL;
  5986. goto out;
  5987. }
  5988. num_alloc_stripes = io_geom.num_stripes;
  5989. if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
  5990. op != BTRFS_MAP_READ)
  5991. /*
  5992. * For replace case, we need to add extra stripes for extra
  5993. * duplicated stripes.
  5994. *
  5995. * For both WRITE and GET_READ_MIRRORS, we may have at most
  5996. * 2 more stripes (DUP types, otherwise 1).
  5997. */
  5998. num_alloc_stripes += 2;
  5999. /*
  6000. * If this I/O maps to a single device, try to return the device and
  6001. * physical block information on the stack instead of allocating an
  6002. * I/O context structure.
  6003. */
  6004. if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) {
  6005. ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
  6006. if (mirror_num_ret)
  6007. *mirror_num_ret = io_geom.mirror_num;
  6008. *bioc_ret = NULL;
  6009. goto out;
  6010. }
  6011. bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
  6012. if (!bioc) {
  6013. ret = -ENOMEM;
  6014. goto out;
  6015. }
  6016. bioc->map_type = map->type;
  6017. bioc->use_rst = io_geom.use_rst;
  6018. /*
  6019. * For RAID56 full map, we need to make sure the stripes[] follows the
  6020. * rule that data stripes are all ordered, then followed with P and Q
  6021. * (if we have).
  6022. *
  6023. * It's still mostly the same as other profiles, just with extra rotation.
  6024. */
  6025. if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
  6026. (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
  6027. /*
  6028. * For RAID56 @stripe_nr is already the number of full stripes
  6029. * before us, which is also the rotation value (needs to modulo
  6030. * with num_stripes).
  6031. *
  6032. * In this case, we just add @stripe_nr with @i, then do the
  6033. * modulo, to reduce one modulo call.
  6034. */
  6035. bioc->full_stripe_logical = map->start +
  6036. btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
  6037. nr_data_stripes(map));
  6038. for (int i = 0; i < io_geom.num_stripes; i++) {
  6039. struct btrfs_io_stripe *dst = &bioc->stripes[i];
  6040. u32 stripe_index;
  6041. stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
  6042. dst->dev = map->stripes[stripe_index].dev;
  6043. dst->physical =
  6044. map->stripes[stripe_index].physical +
  6045. io_geom.stripe_offset +
  6046. btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
  6047. }
  6048. } else {
  6049. /*
  6050. * For all other non-RAID56 profiles, just copy the target
  6051. * stripe into the bioc.
  6052. */
  6053. for (int i = 0; i < io_geom.num_stripes; i++) {
  6054. ret = set_io_stripe(fs_info, logical, length,
  6055. &bioc->stripes[i], map, &io_geom);
  6056. if (ret < 0)
  6057. break;
  6058. io_geom.stripe_index++;
  6059. }
  6060. }
  6061. if (ret) {
  6062. *bioc_ret = NULL;
  6063. btrfs_put_bioc(bioc);
  6064. goto out;
  6065. }
  6066. if (op != BTRFS_MAP_READ)
  6067. io_geom.max_errors = btrfs_chunk_max_errors(map);
  6068. if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
  6069. op != BTRFS_MAP_READ) {
  6070. handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom);
  6071. }
  6072. *bioc_ret = bioc;
  6073. bioc->num_stripes = io_geom.num_stripes;
  6074. bioc->max_errors = io_geom.max_errors;
  6075. bioc->mirror_num = io_geom.mirror_num;
  6076. out:
  6077. if (dev_replace_is_ongoing && dev_replace->replace_task != current) {
  6078. lockdep_assert_held(&dev_replace->rwsem);
  6079. /* Unlock and let waiting writers proceed */
  6080. up_read(&dev_replace->rwsem);
  6081. }
  6082. btrfs_free_chunk_map(map);
  6083. return ret;
  6084. }
  6085. static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
  6086. const struct btrfs_fs_devices *fs_devices)
  6087. {
  6088. if (args->fsid == NULL)
  6089. return true;
  6090. if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
  6091. return true;
  6092. return false;
  6093. }
  6094. static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
  6095. const struct btrfs_device *device)
  6096. {
  6097. if (args->devt)
  6098. return device->devt == args->devt;
  6099. if (args->missing) {
  6100. if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
  6101. !device->bdev)
  6102. return true;
  6103. return false;
  6104. }
  6105. if (device->devid != args->devid)
  6106. return false;
  6107. if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
  6108. return false;
  6109. return true;
  6110. }
  6111. /*
  6112. * Find a device specified by @devid or @uuid in the list of @fs_devices, or
  6113. * return NULL.
  6114. *
  6115. * If devid and uuid are both specified, the match must be exact, otherwise
  6116. * only devid is used.
  6117. */
  6118. struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
  6119. const struct btrfs_dev_lookup_args *args)
  6120. {
  6121. struct btrfs_device *device;
  6122. struct btrfs_fs_devices *seed_devs;
  6123. if (dev_args_match_fs_devices(args, fs_devices)) {
  6124. list_for_each_entry(device, &fs_devices->devices, dev_list) {
  6125. if (dev_args_match_device(args, device))
  6126. return device;
  6127. }
  6128. }
  6129. list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
  6130. if (!dev_args_match_fs_devices(args, seed_devs))
  6131. continue;
  6132. list_for_each_entry(device, &seed_devs->devices, dev_list) {
  6133. if (dev_args_match_device(args, device))
  6134. return device;
  6135. }
  6136. }
  6137. return NULL;
  6138. }
  6139. static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
  6140. u64 devid, u8 *dev_uuid)
  6141. {
  6142. struct btrfs_device *device;
  6143. unsigned int nofs_flag;
  6144. /*
  6145. * We call this under the chunk_mutex, so we want to use NOFS for this
  6146. * allocation, however we don't want to change btrfs_alloc_device() to
  6147. * always do NOFS because we use it in a lot of other GFP_KERNEL safe
  6148. * places.
  6149. */
  6150. nofs_flag = memalloc_nofs_save();
  6151. device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
  6152. memalloc_nofs_restore(nofs_flag);
  6153. if (IS_ERR(device))
  6154. return device;
  6155. list_add(&device->dev_list, &fs_devices->devices);
  6156. device->fs_devices = fs_devices;
  6157. fs_devices->num_devices++;
  6158. set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
  6159. fs_devices->missing_devices++;
  6160. return device;
  6161. }
  6162. /*
  6163. * Allocate new device struct, set up devid and UUID.
  6164. *
  6165. * @fs_info: used only for generating a new devid, can be NULL if
  6166. * devid is provided (i.e. @devid != NULL).
  6167. * @devid: a pointer to devid for this device. If NULL a new devid
  6168. * is generated.
  6169. * @uuid: a pointer to UUID for this device. If NULL a new UUID
  6170. * is generated.
  6171. * @path: a pointer to device path if available, NULL otherwise.
  6172. *
  6173. * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
  6174. * on error. Returned struct is not linked onto any lists and must be
  6175. * destroyed with btrfs_free_device.
  6176. */
  6177. struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
  6178. const u64 *devid, const u8 *uuid,
  6179. const char *path)
  6180. {
  6181. struct btrfs_device *dev;
  6182. u64 tmp;
  6183. if (WARN_ON(!devid && !fs_info))
  6184. return ERR_PTR(-EINVAL);
  6185. dev = kzalloc_obj(*dev);
  6186. if (!dev)
  6187. return ERR_PTR(-ENOMEM);
  6188. INIT_LIST_HEAD(&dev->dev_list);
  6189. INIT_LIST_HEAD(&dev->dev_alloc_list);
  6190. INIT_LIST_HEAD(&dev->post_commit_list);
  6191. atomic_set(&dev->dev_stats_ccnt, 0);
  6192. btrfs_device_data_ordered_init(dev);
  6193. btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
  6194. if (devid)
  6195. tmp = *devid;
  6196. else {
  6197. int ret;
  6198. ret = find_next_devid(fs_info, &tmp);
  6199. if (ret) {
  6200. btrfs_free_device(dev);
  6201. return ERR_PTR(ret);
  6202. }
  6203. }
  6204. dev->devid = tmp;
  6205. if (uuid)
  6206. memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
  6207. else
  6208. generate_random_uuid(dev->uuid);
  6209. if (path) {
  6210. const char *name;
  6211. name = kstrdup(path, GFP_KERNEL);
  6212. if (!name) {
  6213. btrfs_free_device(dev);
  6214. return ERR_PTR(-ENOMEM);
  6215. }
  6216. rcu_assign_pointer(dev->name, name);
  6217. }
  6218. return dev;
  6219. }
  6220. static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
  6221. u64 devid, u8 *uuid, bool error)
  6222. {
  6223. if (error)
  6224. btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
  6225. devid, uuid);
  6226. else
  6227. btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
  6228. devid, uuid);
  6229. }
  6230. u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
  6231. {
  6232. const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
  6233. return div_u64(map->chunk_len, data_stripes);
  6234. }
  #if BITS_PER_LONG == 32
  /*
   * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
   * can't be accessed on 32bit systems.
   *
   * This is the mount time check that rejects the fs if it already has a
   * metadata chunk reaching that limit.
   *
   * Return: 0 if the chunk is fine (or is not a metadata chunk), -EOVERFLOW
   * otherwise.
   */
  static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
                                    u64 logical, u64 length, u64 type)
  {
      if (!(type & BTRFS_BLOCK_GROUP_METADATA) ||
          logical + length < MAX_LFS_FILESIZE)
          return 0;

      btrfs_err_32bit_limit(fs_info);
      return -EOVERFLOW;
  }

  /*
   * Give an early warning for any metadata chunk reaching
   * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
   *
   * The metadata can still be accessed at this point, but that will no
   * longer be possible once the limit is reached.
   */
  static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
                                    u64 logical, u64 length, u64 type)
  {
      if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
          logical + length >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
          btrfs_warn_32bit_limit(fs_info);
  }
  #endif
  6269. static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
  6270. u64 devid, u8 *uuid)
  6271. {
  6272. struct btrfs_device *dev;
  6273. if (!btrfs_test_opt(fs_info, DEGRADED)) {
  6274. btrfs_report_missing_device(fs_info, devid, uuid, true);
  6275. return ERR_PTR(-ENOENT);
  6276. }
  6277. dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
  6278. if (IS_ERR(dev)) {
  6279. btrfs_err(fs_info, "failed to init missing device %llu: %ld",
  6280. devid, PTR_ERR(dev));
  6281. return dev;
  6282. }
  6283. btrfs_report_missing_device(fs_info, devid, uuid, false);
  6284. return dev;
  6285. }
  6286. static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
  6287. struct btrfs_chunk *chunk)
  6288. {
  6289. BTRFS_DEV_LOOKUP_ARGS(args);
  6290. struct btrfs_fs_info *fs_info = leaf->fs_info;
  6291. struct btrfs_chunk_map *map;
  6292. u64 logical;
  6293. u64 length;
  6294. u64 devid;
  6295. u64 type;
  6296. u8 uuid[BTRFS_UUID_SIZE];
  6297. int index;
  6298. int num_stripes;
  6299. int ret;
  6300. int i;
  6301. logical = key->offset;
  6302. length = btrfs_chunk_length(leaf, chunk);
  6303. type = btrfs_chunk_type(leaf, chunk);
  6304. index = btrfs_bg_flags_to_raid_index(type);
  6305. num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
  6306. #if BITS_PER_LONG == 32
  6307. ret = check_32bit_meta_chunk(fs_info, logical, length, type);
  6308. if (ret < 0)
  6309. return ret;
  6310. warn_32bit_meta_chunk(fs_info, logical, length, type);
  6311. #endif
  6312. map = btrfs_find_chunk_map(fs_info, logical, 1);
  6313. /* already mapped? */
  6314. if (map && map->start <= logical && map->start + map->chunk_len > logical) {
  6315. btrfs_free_chunk_map(map);
  6316. return 0;
  6317. } else if (map) {
  6318. btrfs_free_chunk_map(map);
  6319. }
  6320. map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
  6321. if (!map)
  6322. return -ENOMEM;
  6323. map->start = logical;
  6324. map->chunk_len = length;
  6325. map->num_stripes = num_stripes;
  6326. map->io_width = btrfs_chunk_io_width(leaf, chunk);
  6327. map->io_align = btrfs_chunk_io_align(leaf, chunk);
  6328. map->type = type;
  6329. /*
  6330. * We can't use the sub_stripes value, as for profiles other than
  6331. * RAID10, they may have 0 as sub_stripes for filesystems created by
  6332. * older mkfs (<v5.4).
  6333. * In that case, it can cause divide-by-zero errors later.
  6334. * Since currently sub_stripes is fixed for each profile, let's
  6335. * use the trusted value instead.
  6336. */
  6337. map->sub_stripes = btrfs_raid_array[index].sub_stripes;
  6338. map->verified_stripes = 0;
  6339. if (num_stripes > 0)
  6340. map->stripe_size = btrfs_calc_stripe_length(map);
  6341. else
  6342. map->stripe_size = 0;
  6343. for (i = 0; i < num_stripes; i++) {
  6344. map->stripes[i].physical =
  6345. btrfs_stripe_offset_nr(leaf, chunk, i);
  6346. devid = btrfs_stripe_devid_nr(leaf, chunk, i);
  6347. args.devid = devid;
  6348. read_extent_buffer(leaf, uuid, (unsigned long)
  6349. btrfs_stripe_dev_uuid_nr(chunk, i),
  6350. BTRFS_UUID_SIZE);
  6351. args.uuid = uuid;
  6352. map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
  6353. if (!map->stripes[i].dev) {
  6354. map->stripes[i].dev = handle_missing_device(fs_info,
  6355. devid, uuid);
  6356. if (IS_ERR(map->stripes[i].dev)) {
  6357. ret = PTR_ERR(map->stripes[i].dev);
  6358. btrfs_free_chunk_map(map);
  6359. return ret;
  6360. }
  6361. }
  6362. set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
  6363. &(map->stripes[i].dev->dev_state));
  6364. }
  6365. ret = btrfs_add_chunk_map(fs_info, map);
  6366. if (ret < 0) {
  6367. btrfs_err(fs_info,
  6368. "failed to add chunk map, start=%llu len=%llu: %d",
  6369. map->start, map->chunk_len, ret);
  6370. btrfs_free_chunk_map(map);
  6371. }
  6372. return ret;
  6373. }
  6374. static void fill_device_from_item(struct extent_buffer *leaf,
  6375. struct btrfs_dev_item *dev_item,
  6376. struct btrfs_device *device)
  6377. {
  6378. unsigned long ptr;
  6379. device->devid = btrfs_device_id(leaf, dev_item);
  6380. device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
  6381. device->total_bytes = device->disk_total_bytes;
  6382. device->commit_total_bytes = device->disk_total_bytes;
  6383. device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
  6384. device->commit_bytes_used = device->bytes_used;
  6385. device->type = btrfs_device_type(leaf, dev_item);
  6386. device->io_align = btrfs_device_io_align(leaf, dev_item);
  6387. device->io_width = btrfs_device_io_width(leaf, dev_item);
  6388. device->sector_size = btrfs_device_sector_size(leaf, dev_item);
  6389. WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
  6390. clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
  6391. ptr = btrfs_device_uuid(dev_item);
  6392. read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
  6393. }
  6394. static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
  6395. u8 *fsid)
  6396. {
  6397. struct btrfs_fs_devices *fs_devices;
  6398. int ret;
  6399. lockdep_assert_held(&uuid_mutex);
  6400. ASSERT(fsid);
  6401. /* This will match only for multi-device seed fs */
  6402. list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
  6403. if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
  6404. return fs_devices;
  6405. fs_devices = find_fsid(fsid, NULL);
  6406. if (!fs_devices) {
  6407. if (!btrfs_test_opt(fs_info, DEGRADED)) {
  6408. btrfs_err(fs_info,
  6409. "failed to find fsid %pU when attempting to open seed devices",
  6410. fsid);
  6411. return ERR_PTR(-ENOENT);
  6412. }
  6413. fs_devices = alloc_fs_devices(fsid);
  6414. if (IS_ERR(fs_devices))
  6415. return fs_devices;
  6416. fs_devices->seeding = true;
  6417. fs_devices->opened = 1;
  6418. list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
  6419. return fs_devices;
  6420. }
  6421. /*
  6422. * Upon first call for a seed fs fsid, just create a private copy of the
  6423. * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
  6424. */
  6425. fs_devices = clone_fs_devices(fs_devices);
  6426. if (IS_ERR(fs_devices))
  6427. return fs_devices;
  6428. ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->sb);
  6429. if (ret) {
  6430. free_fs_devices(fs_devices);
  6431. return ERR_PTR(ret);
  6432. }
  6433. if (!fs_devices->seeding) {
  6434. close_fs_devices(fs_devices);
  6435. free_fs_devices(fs_devices);
  6436. return ERR_PTR(-EINVAL);
  6437. }
  6438. list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
  6439. return fs_devices;
  6440. }
  6441. static int read_one_dev(struct extent_buffer *leaf,
  6442. struct btrfs_dev_item *dev_item)
  6443. {
  6444. BTRFS_DEV_LOOKUP_ARGS(args);
  6445. struct btrfs_fs_info *fs_info = leaf->fs_info;
  6446. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  6447. struct btrfs_device *device;
  6448. u64 devid;
  6449. u8 fs_uuid[BTRFS_FSID_SIZE];
  6450. u8 dev_uuid[BTRFS_UUID_SIZE];
  6451. devid = btrfs_device_id(leaf, dev_item);
  6452. args.devid = devid;
  6453. read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
  6454. BTRFS_UUID_SIZE);
  6455. read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
  6456. BTRFS_FSID_SIZE);
  6457. args.uuid = dev_uuid;
  6458. args.fsid = fs_uuid;
  6459. if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
  6460. fs_devices = open_seed_devices(fs_info, fs_uuid);
  6461. if (IS_ERR(fs_devices))
  6462. return PTR_ERR(fs_devices);
  6463. }
  6464. device = btrfs_find_device(fs_info->fs_devices, &args);
  6465. if (!device) {
  6466. if (!btrfs_test_opt(fs_info, DEGRADED)) {
  6467. btrfs_report_missing_device(fs_info, devid,
  6468. dev_uuid, true);
  6469. return -ENOENT;
  6470. }
  6471. device = add_missing_dev(fs_devices, devid, dev_uuid);
  6472. if (IS_ERR(device)) {
  6473. btrfs_err(fs_info,
  6474. "failed to add missing dev %llu: %ld",
  6475. devid, PTR_ERR(device));
  6476. return PTR_ERR(device);
  6477. }
  6478. btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
  6479. } else {
  6480. if (!device->bdev) {
  6481. if (!btrfs_test_opt(fs_info, DEGRADED)) {
  6482. btrfs_report_missing_device(fs_info,
  6483. devid, dev_uuid, true);
  6484. return -ENOENT;
  6485. }
  6486. btrfs_report_missing_device(fs_info, devid,
  6487. dev_uuid, false);
  6488. }
  6489. if (!device->bdev &&
  6490. !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
  6491. /*
  6492. * this happens when a device that was properly setup
  6493. * in the device info lists suddenly goes bad.
  6494. * device->bdev is NULL, and so we have to set
  6495. * device->missing to one here
  6496. */
  6497. device->fs_devices->missing_devices++;
  6498. set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
  6499. }
  6500. /* Move the device to its own fs_devices */
  6501. if (device->fs_devices != fs_devices) {
  6502. ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
  6503. &device->dev_state));
  6504. list_move(&device->dev_list, &fs_devices->devices);
  6505. device->fs_devices->num_devices--;
  6506. fs_devices->num_devices++;
  6507. device->fs_devices->missing_devices--;
  6508. fs_devices->missing_devices++;
  6509. device->fs_devices = fs_devices;
  6510. }
  6511. }
  6512. if (device->fs_devices != fs_info->fs_devices) {
  6513. BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
  6514. if (device->generation !=
  6515. btrfs_device_generation(leaf, dev_item))
  6516. return -EINVAL;
  6517. }
  6518. fill_device_from_item(leaf, dev_item, device);
  6519. if (device->bdev) {
  6520. u64 max_total_bytes = bdev_nr_bytes(device->bdev);
  6521. if (device->total_bytes > max_total_bytes) {
  6522. btrfs_err(fs_info,
  6523. "device total_bytes should be at most %llu but found %llu",
  6524. max_total_bytes, device->total_bytes);
  6525. return -EINVAL;
  6526. }
  6527. }
  6528. set_bit(BTRFS_DEV_STATE_ITEM_FOUND, &device->dev_state);
  6529. set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
  6530. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
  6531. !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
  6532. device->fs_devices->total_rw_bytes += device->total_bytes;
  6533. atomic64_add(device->total_bytes - device->bytes_used,
  6534. &fs_info->free_chunk_space);
  6535. }
  6536. return 0;
  6537. }
  6538. int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
  6539. {
  6540. struct btrfs_super_block *super_copy = fs_info->super_copy;
  6541. struct extent_buffer *sb;
  6542. u8 *array_ptr;
  6543. unsigned long sb_array_offset;
  6544. int ret = 0;
  6545. u32 array_size;
  6546. u32 cur_offset;
  6547. struct btrfs_key key;
  6548. ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
  6549. /*
  6550. * We allocated a dummy extent, just to use extent buffer accessors.
  6551. * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
  6552. * that's fine, we will not go beyond system chunk array anyway.
  6553. */
  6554. sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
  6555. if (!sb)
  6556. return -ENOMEM;
  6557. set_extent_buffer_uptodate(sb);
  6558. write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
  6559. array_size = btrfs_super_sys_array_size(super_copy);
  6560. array_ptr = super_copy->sys_chunk_array;
  6561. sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
  6562. cur_offset = 0;
  6563. while (cur_offset < array_size) {
  6564. struct btrfs_chunk *chunk;
  6565. struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr;
  6566. u32 len = sizeof(*disk_key);
  6567. /*
  6568. * The sys_chunk_array has been already verified at super block
  6569. * read time. Only do ASSERT()s for basic checks.
  6570. */
  6571. ASSERT(cur_offset + len <= array_size);
  6572. btrfs_disk_key_to_cpu(&key, disk_key);
  6573. array_ptr += len;
  6574. sb_array_offset += len;
  6575. cur_offset += len;
  6576. ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY);
  6577. chunk = (struct btrfs_chunk *)sb_array_offset;
  6578. ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM);
  6579. len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk));
  6580. ASSERT(cur_offset + len <= array_size);
  6581. ret = read_one_chunk(&key, sb, chunk);
  6582. if (ret)
  6583. break;
  6584. array_ptr += len;
  6585. sb_array_offset += len;
  6586. cur_offset += len;
  6587. }
  6588. clear_extent_buffer_uptodate(sb);
  6589. free_extent_buffer_stale(sb);
  6590. return ret;
  6591. }
  6592. /*
  6593. * Check if all chunks in the fs are OK for read-write degraded mount
  6594. *
  6595. * If the @failing_dev is specified, it's accounted as missing.
  6596. *
  6597. * Return true if all chunks meet the minimal RW mount requirements.
  6598. * Return false if any chunk doesn't meet the minimal RW mount requirements.
  6599. */
  6600. bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
  6601. struct btrfs_device *failing_dev)
  6602. {
  6603. struct btrfs_chunk_map *map;
  6604. u64 next_start;
  6605. bool ret = true;
  6606. map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
  6607. /* No chunk at all? Return false anyway */
  6608. if (!map)
  6609. return false;
  6610. while (map) {
  6611. int missing = 0;
  6612. int max_tolerated;
  6613. int i;
  6614. max_tolerated =
  6615. btrfs_get_num_tolerated_disk_barrier_failures(
  6616. map->type);
  6617. for (i = 0; i < map->num_stripes; i++) {
  6618. struct btrfs_device *dev = map->stripes[i].dev;
  6619. if (!dev || !dev->bdev ||
  6620. test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
  6621. test_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &dev->dev_state))
  6622. missing++;
  6623. else if (failing_dev && failing_dev == dev)
  6624. missing++;
  6625. }
  6626. if (missing > max_tolerated) {
  6627. if (!failing_dev)
  6628. btrfs_warn(fs_info,
  6629. "chunk %llu missing %d devices, max tolerance is %d for writable mount",
  6630. map->start, missing, max_tolerated);
  6631. btrfs_free_chunk_map(map);
  6632. return false;
  6633. }
  6634. next_start = map->start + map->chunk_len;
  6635. btrfs_free_chunk_map(map);
  6636. map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
  6637. }
  6638. return ret;
  6639. }
  6640. static void readahead_tree_node_children(struct extent_buffer *node)
  6641. {
  6642. int i;
  6643. const int nr_items = btrfs_header_nritems(node);
  6644. for (i = 0; i < nr_items; i++)
  6645. btrfs_readahead_node_child(node, i);
  6646. }
  6647. int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
  6648. {
  6649. struct btrfs_root *root = fs_info->chunk_root;
  6650. BTRFS_PATH_AUTO_FREE(path);
  6651. struct extent_buffer *leaf;
  6652. struct btrfs_key key;
  6653. struct btrfs_key found_key;
  6654. int ret;
  6655. int slot;
  6656. int iter_ret = 0;
  6657. u64 total_dev = 0;
  6658. u64 last_ra_node = 0;
  6659. path = btrfs_alloc_path();
  6660. if (!path)
  6661. return -ENOMEM;
  6662. /*
  6663. * uuid_mutex is needed only if we are mounting a sprout FS
  6664. * otherwise we don't need it.
  6665. */
  6666. mutex_lock(&uuid_mutex);
  6667. /*
  6668. * It is possible for mount and umount to race in such a way that
  6669. * we execute this code path, but open_fs_devices failed to clear
  6670. * total_rw_bytes. We certainly want it cleared before reading the
  6671. * device items, so clear it here.
  6672. */
  6673. fs_info->fs_devices->total_rw_bytes = 0;
  6674. /*
  6675. * Lockdep complains about possible circular locking dependency between
  6676. * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
  6677. * used for freeze protection of a fs (struct super_block.s_writers),
  6678. * which we take when starting a transaction, and extent buffers of the
  6679. * chunk tree if we call read_one_dev() while holding a lock on an
  6680. * extent buffer of the chunk tree. Since we are mounting the filesystem
  6681. * and at this point there can't be any concurrent task modifying the
  6682. * chunk tree, to keep it simple, just skip locking on the chunk tree.
  6683. */
  6684. ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
  6685. path->skip_locking = true;
  6686. /*
  6687. * Read all device items, and then all the chunk items. All
  6688. * device items are found before any chunk item (their object id
  6689. * is smaller than the lowest possible object id for a chunk
  6690. * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
  6691. */
  6692. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  6693. key.type = 0;
  6694. key.offset = 0;
  6695. btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
  6696. struct extent_buffer *node = path->nodes[1];
  6697. leaf = path->nodes[0];
  6698. slot = path->slots[0];
  6699. if (node) {
  6700. if (last_ra_node != node->start) {
  6701. readahead_tree_node_children(node);
  6702. last_ra_node = node->start;
  6703. }
  6704. }
  6705. if (found_key.type == BTRFS_DEV_ITEM_KEY) {
  6706. struct btrfs_dev_item *dev_item;
  6707. dev_item = btrfs_item_ptr(leaf, slot,
  6708. struct btrfs_dev_item);
  6709. ret = read_one_dev(leaf, dev_item);
  6710. if (ret)
  6711. goto error;
  6712. total_dev++;
  6713. } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
  6714. struct btrfs_chunk *chunk;
  6715. /*
  6716. * We are only called at mount time, so no need to take
  6717. * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
  6718. * we always lock first fs_info->chunk_mutex before
  6719. * acquiring any locks on the chunk tree. This is a
  6720. * requirement for chunk allocation, see the comment on
  6721. * top of btrfs_chunk_alloc() for details.
  6722. */
  6723. chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
  6724. ret = read_one_chunk(&found_key, leaf, chunk);
  6725. if (ret)
  6726. goto error;
  6727. }
  6728. }
  6729. /* Catch error found during iteration */
  6730. if (iter_ret < 0) {
  6731. ret = iter_ret;
  6732. goto error;
  6733. }
  6734. /*
  6735. * After loading chunk tree, we've got all device information,
  6736. * do another round of validation checks.
  6737. */
  6738. if (total_dev != fs_info->fs_devices->total_devices) {
  6739. btrfs_warn(fs_info,
  6740. "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
  6741. btrfs_super_num_devices(fs_info->super_copy),
  6742. total_dev);
  6743. fs_info->fs_devices->total_devices = total_dev;
  6744. btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
  6745. }
  6746. if (btrfs_super_total_bytes(fs_info->super_copy) <
  6747. fs_info->fs_devices->total_rw_bytes) {
  6748. btrfs_err(fs_info,
  6749. "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
  6750. btrfs_super_total_bytes(fs_info->super_copy),
  6751. fs_info->fs_devices->total_rw_bytes);
  6752. ret = -EINVAL;
  6753. goto error;
  6754. }
  6755. ret = 0;
  6756. error:
  6757. mutex_unlock(&uuid_mutex);
  6758. return ret;
  6759. }
  6760. int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
  6761. {
  6762. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
  6763. struct btrfs_device *device;
  6764. int ret = 0;
  6765. mutex_lock(&fs_devices->device_list_mutex);
  6766. list_for_each_entry(device, &fs_devices->devices, dev_list)
  6767. device->fs_info = fs_info;
  6768. list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
  6769. list_for_each_entry(device, &seed_devs->devices, dev_list) {
  6770. device->fs_info = fs_info;
  6771. ret = btrfs_get_dev_zone_info(device, false);
  6772. if (ret)
  6773. break;
  6774. }
  6775. seed_devs->fs_info = fs_info;
  6776. }
  6777. mutex_unlock(&fs_devices->device_list_mutex);
  6778. return ret;
  6779. }
  6780. static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
  6781. const struct btrfs_dev_stats_item *ptr,
  6782. int index)
  6783. {
  6784. u64 val;
  6785. read_extent_buffer(eb, &val,
  6786. offsetof(struct btrfs_dev_stats_item, values) +
  6787. ((unsigned long)ptr) + (index * sizeof(u64)),
  6788. sizeof(val));
  6789. return val;
  6790. }
  6791. static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
  6792. struct btrfs_dev_stats_item *ptr,
  6793. int index, u64 val)
  6794. {
  6795. write_extent_buffer(eb, &val,
  6796. offsetof(struct btrfs_dev_stats_item, values) +
  6797. ((unsigned long)ptr) + (index * sizeof(u64)),
  6798. sizeof(val));
  6799. }
  6800. static int btrfs_device_init_dev_stats(struct btrfs_device *device,
  6801. struct btrfs_path *path)
  6802. {
  6803. struct btrfs_dev_stats_item *ptr;
  6804. struct extent_buffer *eb;
  6805. struct btrfs_key key;
  6806. int item_size;
  6807. int i, ret, slot;
  6808. if (!device->fs_info->dev_root)
  6809. return 0;
  6810. key.objectid = BTRFS_DEV_STATS_OBJECTID;
  6811. key.type = BTRFS_PERSISTENT_ITEM_KEY;
  6812. key.offset = device->devid;
  6813. ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
  6814. if (ret) {
  6815. for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
  6816. btrfs_dev_stat_set(device, i, 0);
  6817. device->dev_stats_valid = 1;
  6818. btrfs_release_path(path);
  6819. return ret < 0 ? ret : 0;
  6820. }
  6821. slot = path->slots[0];
  6822. eb = path->nodes[0];
  6823. item_size = btrfs_item_size(eb, slot);
  6824. ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
  6825. for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
  6826. if (item_size >= (1 + i) * sizeof(__le64))
  6827. btrfs_dev_stat_set(device, i,
  6828. btrfs_dev_stats_value(eb, ptr, i));
  6829. else
  6830. btrfs_dev_stat_set(device, i, 0);
  6831. }
  6832. device->dev_stats_valid = 1;
  6833. btrfs_dev_stat_print_on_load(device);
  6834. btrfs_release_path(path);
  6835. return 0;
  6836. }
  6837. int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
  6838. {
  6839. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
  6840. struct btrfs_device *device;
  6841. BTRFS_PATH_AUTO_FREE(path);
  6842. int ret = 0;
  6843. path = btrfs_alloc_path();
  6844. if (!path)
  6845. return -ENOMEM;
  6846. mutex_lock(&fs_devices->device_list_mutex);
  6847. list_for_each_entry(device, &fs_devices->devices, dev_list) {
  6848. ret = btrfs_device_init_dev_stats(device, path);
  6849. if (ret)
  6850. goto out;
  6851. }
  6852. list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
  6853. list_for_each_entry(device, &seed_devs->devices, dev_list) {
  6854. ret = btrfs_device_init_dev_stats(device, path);
  6855. if (ret)
  6856. goto out;
  6857. }
  6858. }
  6859. out:
  6860. mutex_unlock(&fs_devices->device_list_mutex);
  6861. return ret;
  6862. }
  6863. static int update_dev_stat_item(struct btrfs_trans_handle *trans,
  6864. struct btrfs_device *device)
  6865. {
  6866. struct btrfs_fs_info *fs_info = trans->fs_info;
  6867. struct btrfs_root *dev_root = fs_info->dev_root;
  6868. BTRFS_PATH_AUTO_FREE(path);
  6869. struct btrfs_key key;
  6870. struct extent_buffer *eb;
  6871. struct btrfs_dev_stats_item *ptr;
  6872. int ret;
  6873. int i;
  6874. key.objectid = BTRFS_DEV_STATS_OBJECTID;
  6875. key.type = BTRFS_PERSISTENT_ITEM_KEY;
  6876. key.offset = device->devid;
  6877. path = btrfs_alloc_path();
  6878. if (!path)
  6879. return -ENOMEM;
  6880. ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
  6881. if (ret < 0) {
  6882. btrfs_warn(fs_info,
  6883. "error %d while searching for dev_stats item for device %s",
  6884. ret, btrfs_dev_name(device));
  6885. return ret;
  6886. }
  6887. if (ret == 0 &&
  6888. btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
  6889. /* need to delete old one and insert a new one */
  6890. ret = btrfs_del_item(trans, dev_root, path);
  6891. if (ret != 0) {
  6892. btrfs_warn(fs_info,
  6893. "delete too small dev_stats item for device %s failed %d",
  6894. btrfs_dev_name(device), ret);
  6895. return ret;
  6896. }
  6897. ret = 1;
  6898. }
  6899. if (ret == 1) {
  6900. /* need to insert a new item */
  6901. btrfs_release_path(path);
  6902. ret = btrfs_insert_empty_item(trans, dev_root, path,
  6903. &key, sizeof(*ptr));
  6904. if (ret < 0) {
  6905. btrfs_warn(fs_info,
  6906. "insert dev_stats item for device %s failed %d",
  6907. btrfs_dev_name(device), ret);
  6908. return ret;
  6909. }
  6910. }
  6911. eb = path->nodes[0];
  6912. ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
  6913. for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
  6914. btrfs_set_dev_stats_value(eb, ptr, i,
  6915. btrfs_dev_stat_read(device, i));
  6916. return ret;
  6917. }
  6918. /*
  6919. * called from commit_transaction. Writes all changed device stats to disk.
  6920. */
  6921. int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
  6922. {
  6923. struct btrfs_fs_info *fs_info = trans->fs_info;
  6924. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  6925. struct btrfs_device *device;
  6926. int stats_cnt;
  6927. int ret = 0;
  6928. mutex_lock(&fs_devices->device_list_mutex);
  6929. list_for_each_entry(device, &fs_devices->devices, dev_list) {
  6930. stats_cnt = atomic_read(&device->dev_stats_ccnt);
  6931. if (!device->dev_stats_valid || stats_cnt == 0)
  6932. continue;
  6933. /*
  6934. * There is a LOAD-LOAD control dependency between the value of
  6935. * dev_stats_ccnt and updating the on-disk values which requires
  6936. * reading the in-memory counters. Such control dependencies
  6937. * require explicit read memory barriers.
  6938. *
  6939. * This memory barriers pairs with smp_mb__before_atomic in
  6940. * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
  6941. * barrier implied by atomic_xchg in
  6942. * btrfs_dev_stats_read_and_reset
  6943. */
  6944. smp_rmb();
  6945. ret = update_dev_stat_item(trans, device);
  6946. if (ret)
  6947. break;
  6948. atomic_sub(stats_cnt, &device->dev_stats_ccnt);
  6949. }
  6950. mutex_unlock(&fs_devices->device_list_mutex);
  6951. return ret;
  6952. }
  6953. void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
  6954. {
  6955. btrfs_dev_stat_inc(dev, index);
  6956. if (!dev->dev_stats_valid)
  6957. return;
  6958. btrfs_err_rl(dev->fs_info,
  6959. "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
  6960. btrfs_dev_name(dev),
  6961. btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
  6962. btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
  6963. btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
  6964. btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
  6965. btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
  6966. }
  6967. static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
  6968. {
  6969. int i;
  6970. for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
  6971. if (btrfs_dev_stat_read(dev, i) != 0)
  6972. break;
  6973. if (i == BTRFS_DEV_STAT_VALUES_MAX)
  6974. return; /* all values == 0, suppress message */
  6975. btrfs_info(dev->fs_info,
  6976. "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
  6977. btrfs_dev_name(dev),
  6978. btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
  6979. btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
  6980. btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
  6981. btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
  6982. btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
  6983. }
  6984. int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
  6985. struct btrfs_ioctl_get_dev_stats *stats)
  6986. {
  6987. BTRFS_DEV_LOOKUP_ARGS(args);
  6988. struct btrfs_device *dev;
  6989. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  6990. int i;
  6991. mutex_lock(&fs_devices->device_list_mutex);
  6992. args.devid = stats->devid;
  6993. dev = btrfs_find_device(fs_info->fs_devices, &args);
  6994. mutex_unlock(&fs_devices->device_list_mutex);
  6995. if (!dev) {
  6996. btrfs_warn(fs_info, "get dev_stats failed, device not found");
  6997. return -ENODEV;
  6998. } else if (!dev->dev_stats_valid) {
  6999. btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
  7000. return -ENODEV;
  7001. } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
  7002. for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
  7003. if (stats->nr_items > i)
  7004. stats->values[i] =
  7005. btrfs_dev_stat_read_and_reset(dev, i);
  7006. else
  7007. btrfs_dev_stat_set(dev, i, 0);
  7008. }
  7009. btrfs_info(fs_info, "device stats zeroed by %s (%d)",
  7010. current->comm, task_pid_nr(current));
  7011. } else {
  7012. for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
  7013. if (stats->nr_items > i)
  7014. stats->values[i] = btrfs_dev_stat_read(dev, i);
  7015. }
  7016. if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
  7017. stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
  7018. return 0;
  7019. }
  7020. /*
  7021. * Update the size and bytes used for each device where it changed. This is
  7022. * delayed since we would otherwise get errors while writing out the
  7023. * superblocks.
  7024. *
  7025. * Must be invoked during transaction commit.
  7026. */
  7027. void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
  7028. {
  7029. struct btrfs_device *curr, *next;
  7030. ASSERT(trans->state == TRANS_STATE_COMMIT_DOING, "state=%d" , trans->state);
  7031. if (list_empty(&trans->dev_update_list))
  7032. return;
  7033. /*
  7034. * We don't need the device_list_mutex here. This list is owned by the
  7035. * transaction and the transaction must complete before the device is
  7036. * released.
  7037. */
  7038. mutex_lock(&trans->fs_info->chunk_mutex);
  7039. list_for_each_entry_safe(curr, next, &trans->dev_update_list,
  7040. post_commit_list) {
  7041. list_del_init(&curr->post_commit_list);
  7042. curr->commit_total_bytes = curr->disk_total_bytes;
  7043. curr->commit_bytes_used = curr->bytes_used;
  7044. }
  7045. mutex_unlock(&trans->fs_info->chunk_mutex);
  7046. }
  7047. /*
  7048. * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
  7049. */
  7050. int btrfs_bg_type_to_factor(u64 flags)
  7051. {
  7052. const int index = btrfs_bg_flags_to_raid_index(flags);
  7053. return btrfs_raid_array[index].ncopies;
  7054. }
  7055. static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
  7056. u64 chunk_offset, u64 devid,
  7057. u64 physical_offset, u64 physical_len)
  7058. {
  7059. struct btrfs_dev_lookup_args args = { .devid = devid };
  7060. struct btrfs_chunk_map *map;
  7061. struct btrfs_device *dev;
  7062. u64 stripe_len;
  7063. bool found = false;
  7064. int ret = 0;
  7065. int i;
  7066. map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
  7067. if (unlikely(!map)) {
  7068. btrfs_err(fs_info,
  7069. "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
  7070. physical_offset, devid);
  7071. ret = -EUCLEAN;
  7072. goto out;
  7073. }
  7074. stripe_len = btrfs_calc_stripe_length(map);
  7075. if (unlikely(physical_len != stripe_len)) {
  7076. btrfs_err(fs_info,
  7077. "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
  7078. physical_offset, devid, map->start, physical_len,
  7079. stripe_len);
  7080. ret = -EUCLEAN;
  7081. goto out;
  7082. }
  7083. /*
  7084. * Very old mkfs.btrfs (before v4.15) will not respect the reserved
  7085. * space. Although kernel can handle it without problem, better to warn
  7086. * the users.
  7087. */
  7088. if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
  7089. btrfs_warn(fs_info,
  7090. "devid %llu physical %llu len %llu inside the reserved space",
  7091. devid, physical_offset, physical_len);
  7092. for (i = 0; i < map->num_stripes; i++) {
  7093. if (unlikely(map->stripes[i].dev->devid == devid &&
  7094. map->stripes[i].physical == physical_offset)) {
  7095. found = true;
  7096. if (map->verified_stripes >= map->num_stripes) {
  7097. btrfs_err(fs_info,
  7098. "too many dev extents for chunk %llu found",
  7099. map->start);
  7100. ret = -EUCLEAN;
  7101. goto out;
  7102. }
  7103. map->verified_stripes++;
  7104. break;
  7105. }
  7106. }
  7107. if (unlikely(!found)) {
  7108. btrfs_err(fs_info,
  7109. "dev extent physical offset %llu devid %llu has no corresponding chunk",
  7110. physical_offset, devid);
  7111. ret = -EUCLEAN;
  7112. }
  7113. /* Make sure no dev extent is beyond device boundary */
  7114. dev = btrfs_find_device(fs_info->fs_devices, &args);
  7115. if (unlikely(!dev)) {
  7116. btrfs_err(fs_info, "failed to find devid %llu", devid);
  7117. ret = -EUCLEAN;
  7118. goto out;
  7119. }
  7120. if (unlikely(physical_offset + physical_len > dev->disk_total_bytes)) {
  7121. btrfs_err(fs_info,
  7122. "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
  7123. devid, physical_offset, physical_len,
  7124. dev->disk_total_bytes);
  7125. ret = -EUCLEAN;
  7126. goto out;
  7127. }
  7128. if (dev->zone_info) {
  7129. u64 zone_size = dev->zone_info->zone_size;
  7130. if (unlikely(!IS_ALIGNED(physical_offset, zone_size) ||
  7131. !IS_ALIGNED(physical_len, zone_size))) {
  7132. btrfs_err(fs_info,
  7133. "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
  7134. devid, physical_offset, physical_len);
  7135. ret = -EUCLEAN;
  7136. goto out;
  7137. }
  7138. }
  7139. out:
  7140. btrfs_free_chunk_map(map);
  7141. return ret;
  7142. }
  7143. static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
  7144. {
  7145. struct rb_node *node;
  7146. int ret = 0;
  7147. read_lock(&fs_info->mapping_tree_lock);
  7148. for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
  7149. struct btrfs_chunk_map *map;
  7150. map = rb_entry(node, struct btrfs_chunk_map, rb_node);
  7151. if (unlikely(map->num_stripes != map->verified_stripes)) {
  7152. btrfs_err(fs_info,
  7153. "chunk %llu has missing dev extent, have %d expect %d",
  7154. map->start, map->verified_stripes, map->num_stripes);
  7155. ret = -EUCLEAN;
  7156. goto out;
  7157. }
  7158. }
  7159. out:
  7160. read_unlock(&fs_info->mapping_tree_lock);
  7161. return ret;
  7162. }
  7163. /*
  7164. * Ensure that all dev extents are mapped to correct chunk, otherwise
  7165. * later chunk allocation/free would cause unexpected behavior.
  7166. *
  7167. * NOTE: This will iterate through the whole device tree, which should be of
  7168. * the same size level as the chunk tree. This slightly increases mount time.
  7169. */
  7170. int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
  7171. {
  7172. BTRFS_PATH_AUTO_FREE(path);
  7173. struct btrfs_root *root = fs_info->dev_root;
  7174. struct btrfs_key key;
  7175. u64 prev_devid = 0;
  7176. u64 prev_dev_ext_end = 0;
  7177. int ret = 0;
  7178. /*
  7179. * We don't have a dev_root because we mounted with ignorebadroots and
  7180. * failed to load the root, so we want to skip the verification in this
  7181. * case for sure.
  7182. *
  7183. * However if the dev root is fine, but the tree itself is corrupted
  7184. * we'd still fail to mount. This verification is only to make sure
  7185. * writes can happen safely, so instead just bypass this check
  7186. * completely in the case of IGNOREBADROOTS.
  7187. */
  7188. if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
  7189. return 0;
  7190. key.objectid = 1;
  7191. key.type = BTRFS_DEV_EXTENT_KEY;
  7192. key.offset = 0;
  7193. path = btrfs_alloc_path();
  7194. if (!path)
  7195. return -ENOMEM;
  7196. path->reada = READA_FORWARD_ALWAYS;
  7197. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  7198. if (ret < 0)
  7199. return ret;
  7200. if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
  7201. ret = btrfs_next_leaf(root, path);
  7202. if (ret < 0)
  7203. return ret;
  7204. /* No dev extents at all? Not good */
  7205. if (unlikely(ret > 0))
  7206. return -EUCLEAN;
  7207. }
  7208. while (1) {
  7209. struct extent_buffer *leaf = path->nodes[0];
  7210. struct btrfs_dev_extent *dext;
  7211. int slot = path->slots[0];
  7212. u64 chunk_offset;
  7213. u64 physical_offset;
  7214. u64 physical_len;
  7215. u64 devid;
  7216. btrfs_item_key_to_cpu(leaf, &key, slot);
  7217. if (key.type != BTRFS_DEV_EXTENT_KEY)
  7218. break;
  7219. devid = key.objectid;
  7220. physical_offset = key.offset;
  7221. dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
  7222. chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
  7223. physical_len = btrfs_dev_extent_length(leaf, dext);
  7224. /* Check if this dev extent overlaps with the previous one */
  7225. if (unlikely(devid == prev_devid && physical_offset < prev_dev_ext_end)) {
  7226. btrfs_err(fs_info,
  7227. "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
  7228. devid, physical_offset, prev_dev_ext_end);
  7229. return -EUCLEAN;
  7230. }
  7231. ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
  7232. physical_offset, physical_len);
  7233. if (ret < 0)
  7234. return ret;
  7235. prev_devid = devid;
  7236. prev_dev_ext_end = physical_offset + physical_len;
  7237. ret = btrfs_next_item(root, path);
  7238. if (ret < 0)
  7239. return ret;
  7240. if (ret > 0) {
  7241. ret = 0;
  7242. break;
  7243. }
  7244. }
  7245. /* Ensure all chunks have corresponding dev extents */
  7246. return verify_chunk_dev_extent_mapping(fs_info);
  7247. }
  7248. /*
  7249. * Ensure that all devices registered in the fs have their device items in the
  7250. * chunk tree.
  7251. *
  7252. * Return true if unexpected device is found.
  7253. * Return false otherwise.
  7254. */
  7255. bool btrfs_verify_dev_items(const struct btrfs_fs_info *fs_info)
  7256. {
  7257. struct btrfs_fs_devices *seed_devs;
  7258. struct btrfs_device *dev;
  7259. bool ret = false;
  7260. mutex_lock(&uuid_mutex);
  7261. list_for_each_entry(dev, &fs_info->fs_devices->devices, dev_list) {
  7262. if (!test_bit(BTRFS_DEV_STATE_ITEM_FOUND, &dev->dev_state)) {
  7263. btrfs_err(fs_info,
  7264. "devid %llu path %s is registered but not found in chunk tree",
  7265. dev->devid, btrfs_dev_name(dev));
  7266. ret = true;
  7267. }
  7268. }
  7269. list_for_each_entry(seed_devs, &fs_info->fs_devices->seed_list, seed_list) {
  7270. list_for_each_entry(dev, &seed_devs->devices, dev_list) {
  7271. if (!test_bit(BTRFS_DEV_STATE_ITEM_FOUND, &dev->dev_state)) {
  7272. btrfs_err(fs_info,
  7273. "devid %llu path %s is registered but not found in chunk tree",
  7274. dev->devid, btrfs_dev_name(dev));
  7275. ret = true;
  7276. }
  7277. }
  7278. }
  7279. mutex_unlock(&uuid_mutex);
  7280. if (ret)
  7281. btrfs_err(fs_info,
  7282. "remove the above devices or use 'btrfs device scan --forget <dev>' to unregister them before mount");
  7283. return ret;
  7284. }
  7285. /*
  7286. * Check whether the given block group or device is pinned by any inode being
  7287. * used as a swapfile.
  7288. */
  7289. bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
  7290. {
  7291. struct btrfs_swapfile_pin *sp;
  7292. struct rb_node *node;
  7293. spin_lock(&fs_info->swapfile_pins_lock);
  7294. node = fs_info->swapfile_pins.rb_node;
  7295. while (node) {
  7296. sp = rb_entry(node, struct btrfs_swapfile_pin, node);
  7297. if (ptr < sp->ptr)
  7298. node = node->rb_left;
  7299. else if (ptr > sp->ptr)
  7300. node = node->rb_right;
  7301. else
  7302. break;
  7303. }
  7304. spin_unlock(&fs_info->swapfile_pins_lock);
  7305. return node != NULL;
  7306. }
  7307. static int relocating_repair_kthread(void *data)
  7308. {
  7309. struct btrfs_block_group *cache = data;
  7310. struct btrfs_fs_info *fs_info = cache->fs_info;
  7311. u64 target;
  7312. int ret = 0;
  7313. target = cache->start;
  7314. btrfs_put_block_group(cache);
  7315. guard(super_write)(fs_info->sb);
  7316. if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
  7317. btrfs_info(fs_info,
  7318. "zoned: skip relocating block group %llu to repair: EBUSY",
  7319. target);
  7320. return -EBUSY;
  7321. }
  7322. mutex_lock(&fs_info->reclaim_bgs_lock);
  7323. /* Ensure block group still exists */
  7324. cache = btrfs_lookup_block_group(fs_info, target);
  7325. if (!cache)
  7326. goto out;
  7327. if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
  7328. goto out;
  7329. ret = btrfs_may_alloc_data_chunk(fs_info, target);
  7330. if (ret < 0)
  7331. goto out;
  7332. btrfs_info(fs_info,
  7333. "zoned: relocating block group %llu to repair IO failure",
  7334. target);
  7335. ret = btrfs_relocate_chunk(fs_info, target, true);
  7336. out:
  7337. if (cache)
  7338. btrfs_put_block_group(cache);
  7339. mutex_unlock(&fs_info->reclaim_bgs_lock);
  7340. btrfs_exclop_finish(fs_info);
  7341. return ret;
  7342. }
  7343. bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
  7344. {
  7345. struct btrfs_block_group *cache;
  7346. if (!btrfs_is_zoned(fs_info))
  7347. return false;
  7348. /* Do not attempt to repair in degraded state */
  7349. if (btrfs_test_opt(fs_info, DEGRADED))
  7350. return true;
  7351. cache = btrfs_lookup_block_group(fs_info, logical);
  7352. if (!cache)
  7353. return true;
  7354. if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
  7355. btrfs_put_block_group(cache);
  7356. return true;
  7357. }
  7358. kthread_run(relocating_repair_kthread, cache,
  7359. "btrfs-relocating-repair");
  7360. return true;
  7361. }
  7362. static void map_raid56_repair_block(struct btrfs_io_context *bioc,
  7363. struct btrfs_io_stripe *smap,
  7364. u64 logical)
  7365. {
  7366. int data_stripes = nr_bioc_data_stripes(bioc);
  7367. int i;
  7368. for (i = 0; i < data_stripes; i++) {
  7369. u64 stripe_start = bioc->full_stripe_logical +
  7370. btrfs_stripe_nr_to_offset(i);
  7371. if (logical >= stripe_start &&
  7372. logical < stripe_start + BTRFS_STRIPE_LEN)
  7373. break;
  7374. }
  7375. ASSERT(i < data_stripes, "i=%d data_stripes=%d", i, data_stripes);
  7376. smap->dev = bioc->stripes[i].dev;
  7377. smap->physical = bioc->stripes[i].physical +
  7378. ((logical - bioc->full_stripe_logical) &
  7379. BTRFS_STRIPE_LEN_MASK);
  7380. }
  7381. /*
  7382. * Map a repair write into a single device.
  7383. *
  7384. * A repair write is triggered by read time repair or scrub, which would only
  7385. * update the contents of a single device.
  7386. * Not update any other mirrors nor go through RMW path.
  7387. *
  7388. * Callers should ensure:
  7389. *
  7390. * - Call btrfs_bio_counter_inc_blocked() first
  7391. * - The range does not cross stripe boundary
  7392. * - Has a valid @mirror_num passed in.
  7393. */
  7394. int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
  7395. struct btrfs_io_stripe *smap, u64 logical,
  7396. u32 length, int mirror_num)
  7397. {
  7398. struct btrfs_io_context *bioc = NULL;
  7399. u64 map_length = length;
  7400. int mirror_ret = mirror_num;
  7401. int ret;
  7402. ASSERT(mirror_num > 0, "mirror_num=%d", mirror_num);
  7403. ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
  7404. &bioc, smap, &mirror_ret);
  7405. if (ret < 0)
  7406. return ret;
  7407. /* The map range should not cross stripe boundary. */
  7408. ASSERT(map_length >= length, "map_length=%llu length=%u", map_length, length);
  7409. /* Already mapped to single stripe. */
  7410. if (!bioc)
  7411. goto out;
  7412. /* Map the RAID56 multi-stripe writes to a single one. */
  7413. if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
  7414. map_raid56_repair_block(bioc, smap, logical);
  7415. goto out;
  7416. }
  7417. ASSERT(mirror_num <= bioc->num_stripes,
  7418. "mirror_num=%d num_stripes=%d", mirror_num, bioc->num_stripes);
  7419. smap->dev = bioc->stripes[mirror_num - 1].dev;
  7420. smap->physical = bioc->stripes[mirror_num - 1].physical;
  7421. out:
  7422. btrfs_put_bioc(bioc);
  7423. ASSERT(smap->dev);
  7424. return 0;
  7425. }