disk-io.c 142 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
7477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2007 Oracle. All rights reserved.
  4. */
  5. #include <linux/fs.h>
  6. #include <linux/blkdev.h>
  7. #include <linux/radix-tree.h>
  8. #include <linux/writeback.h>
  9. #include <linux/workqueue.h>
  10. #include <linux/kthread.h>
  11. #include <linux/slab.h>
  12. #include <linux/migrate.h>
  13. #include <linux/ratelimit.h>
  14. #include <linux/uuid.h>
  15. #include <linux/semaphore.h>
  16. #include <linux/error-injection.h>
  17. #include <linux/crc32c.h>
  18. #include <linux/sched/mm.h>
  19. #include <linux/unaligned.h>
  20. #include "ctree.h"
  21. #include "disk-io.h"
  22. #include "transaction.h"
  23. #include "btrfs_inode.h"
  24. #include "delayed-inode.h"
  25. #include "bio.h"
  26. #include "print-tree.h"
  27. #include "locking.h"
  28. #include "tree-log.h"
  29. #include "free-space-cache.h"
  30. #include "free-space-tree.h"
  31. #include "dev-replace.h"
  32. #include "raid56.h"
  33. #include "sysfs.h"
  34. #include "qgroup.h"
  35. #include "compression.h"
  36. #include "tree-checker.h"
  37. #include "ref-verify.h"
  38. #include "block-group.h"
  39. #include "discard.h"
  40. #include "space-info.h"
  41. #include "zoned.h"
  42. #include "subpage.h"
  43. #include "fs.h"
  44. #include "accessors.h"
  45. #include "extent-tree.h"
  46. #include "root-tree.h"
  47. #include "defrag.h"
  48. #include "uuid-tree.h"
  49. #include "relocation.h"
  50. #include "scrub.h"
  51. #include "super.h"
  52. #include "delayed-inode.h"
  53. #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
  54. BTRFS_HEADER_FLAG_RELOC |\
  55. BTRFS_SUPER_FLAG_ERROR |\
  56. BTRFS_SUPER_FLAG_SEEDING |\
  57. BTRFS_SUPER_FLAG_METADUMP |\
  58. BTRFS_SUPER_FLAG_METADUMP_V2)
  59. static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
  60. static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
  61. /*
  62. * Compute the csum of a btree block and store the result to provided buffer.
  63. */
  64. static void csum_tree_block(struct extent_buffer *buf, u8 *result)
  65. {
  66. struct btrfs_fs_info *fs_info = buf->fs_info;
  67. int num_pages;
  68. u32 first_page_part;
  69. struct btrfs_csum_ctx csum;
  70. char *kaddr;
  71. int i;
  72. btrfs_csum_init(&csum, fs_info->csum_type);
  73. if (buf->addr) {
  74. /* Pages are contiguous, handle them as a big one. */
  75. kaddr = buf->addr;
  76. first_page_part = fs_info->nodesize;
  77. num_pages = 1;
  78. } else {
  79. kaddr = folio_address(buf->folios[0]);
  80. first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
  81. num_pages = num_extent_pages(buf);
  82. }
  83. btrfs_csum_update(&csum, kaddr + BTRFS_CSUM_SIZE,
  84. first_page_part - BTRFS_CSUM_SIZE);
  85. /*
  86. * Multiple single-page folios case would reach here.
  87. *
  88. * nodesize <= PAGE_SIZE and large folio all handled by above
  89. * btrfs_csum_update() already.
  90. */
  91. for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
  92. kaddr = folio_address(buf->folios[i]);
  93. btrfs_csum_update(&csum, kaddr, PAGE_SIZE);
  94. }
  95. memset(result, 0, BTRFS_CSUM_SIZE);
  96. btrfs_csum_final(&csum, result);
  97. }
  98. /*
  99. * we can't consider a given block up to date unless the transid of the
  100. * block matches the transid in the parent node's pointer. This is how we
  101. * detect blocks that either didn't get written at all or got written
  102. * in the wrong place.
  103. */
  104. int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic)
  105. {
  106. if (!extent_buffer_uptodate(eb))
  107. return 0;
  108. if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
  109. return 1;
  110. if (atomic)
  111. return -EAGAIN;
  112. if (!extent_buffer_uptodate(eb) ||
  113. btrfs_header_generation(eb) != parent_transid) {
  114. btrfs_err_rl(eb->fs_info,
  115. "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
  116. eb->start, eb->read_mirror,
  117. parent_transid, btrfs_header_generation(eb));
  118. clear_extent_buffer_uptodate(eb);
  119. return 0;
  120. }
  121. return 1;
  122. }
  123. static bool btrfs_supported_super_csum(u16 csum_type)
  124. {
  125. switch (csum_type) {
  126. case BTRFS_CSUM_TYPE_CRC32:
  127. case BTRFS_CSUM_TYPE_XXHASH:
  128. case BTRFS_CSUM_TYPE_SHA256:
  129. case BTRFS_CSUM_TYPE_BLAKE2:
  130. return true;
  131. default:
  132. return false;
  133. }
  134. }
  135. /*
  136. * Return 0 if the superblock checksum type matches the checksum value of that
  137. * algorithm. Pass the raw disk superblock data.
  138. */
  139. int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
  140. const struct btrfs_super_block *disk_sb)
  141. {
  142. u8 result[BTRFS_CSUM_SIZE];
  143. /*
  144. * The super_block structure does not span the whole
  145. * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
  146. * filled with zeros and is included in the checksum.
  147. */
  148. btrfs_csum(fs_info->csum_type, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
  149. BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
  150. if (memcmp(disk_sb->csum, result, fs_info->csum_size))
  151. return 1;
  152. return 0;
  153. }
  154. static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
  155. int mirror_num)
  156. {
  157. struct btrfs_fs_info *fs_info = eb->fs_info;
  158. const u32 step = min(fs_info->nodesize, PAGE_SIZE);
  159. const u32 nr_steps = eb->len / step;
  160. phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
  161. if (sb_rdonly(fs_info->sb))
  162. return -EROFS;
  163. for (int i = 0; i < num_extent_pages(eb); i++) {
  164. struct folio *folio = eb->folios[i];
  165. /* No large folio support yet. */
  166. ASSERT(folio_order(folio) == 0);
  167. ASSERT(i < nr_steps);
  168. /*
  169. * For nodesize < page size, there is just one paddr, with some
  170. * offset inside the page.
  171. *
  172. * For nodesize >= page size, it's one or more paddrs, and eb->start
  173. * must be aligned to page boundary.
  174. */
  175. paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start);
  176. }
  177. return btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len,
  178. eb->start, paddrs, step, mirror_num);
  179. }
  180. /*
  181. * helper to read a given tree block, doing retries as required when
  182. * the checksums don't match and we have alternate mirrors to try.
  183. *
  184. * @check: expected tree parentness check, see the comments of the
  185. * structure for details.
  186. */
  187. int btrfs_read_extent_buffer(struct extent_buffer *eb,
  188. const struct btrfs_tree_parent_check *check)
  189. {
  190. struct btrfs_fs_info *fs_info = eb->fs_info;
  191. int failed = 0;
  192. int ret;
  193. int num_copies = 0;
  194. int mirror_num = 0;
  195. int failed_mirror = 0;
  196. ASSERT(check);
  197. while (1) {
  198. ret = read_extent_buffer_pages(eb, mirror_num, check);
  199. if (!ret)
  200. break;
  201. num_copies = btrfs_num_copies(fs_info,
  202. eb->start, eb->len);
  203. if (num_copies == 1)
  204. break;
  205. if (!failed_mirror) {
  206. failed = 1;
  207. failed_mirror = eb->read_mirror;
  208. }
  209. mirror_num++;
  210. if (mirror_num == failed_mirror)
  211. mirror_num++;
  212. if (mirror_num > num_copies)
  213. break;
  214. }
  215. if (failed && !ret && failed_mirror)
  216. btrfs_repair_eb_io_failure(eb, failed_mirror);
  217. return ret;
  218. }
  219. /*
  220. * Checksum a dirty tree block before IO.
  221. */
  222. int btree_csum_one_bio(struct btrfs_bio *bbio)
  223. {
  224. struct extent_buffer *eb = bbio->private;
  225. struct btrfs_fs_info *fs_info = eb->fs_info;
  226. u64 found_start = btrfs_header_bytenr(eb);
  227. u64 last_trans;
  228. u8 result[BTRFS_CSUM_SIZE];
  229. int ret;
  230. /* Btree blocks are always contiguous on disk. */
  231. if (WARN_ON_ONCE(bbio->file_offset != eb->start))
  232. return -EIO;
  233. if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
  234. return -EIO;
  235. /*
  236. * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
  237. * checksum it but zero-out its content. This is done to preserve
  238. * ordering of I/O without unnecessarily writing out data.
  239. */
  240. if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
  241. memzero_extent_buffer(eb, 0, eb->len);
  242. return 0;
  243. }
  244. if (WARN_ON_ONCE(found_start != eb->start))
  245. return -EIO;
  246. if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb)))
  247. return -EIO;
  248. ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
  249. offsetof(struct btrfs_header, fsid),
  250. BTRFS_FSID_SIZE) == 0);
  251. csum_tree_block(eb, result);
  252. if (btrfs_header_level(eb))
  253. ret = btrfs_check_node(eb);
  254. else
  255. ret = btrfs_check_leaf(eb);
  256. if (ret < 0)
  257. goto error;
  258. /*
  259. * Also check the generation, the eb reached here must be newer than
  260. * last committed. Or something seriously wrong happened.
  261. */
  262. last_trans = btrfs_get_last_trans_committed(fs_info);
  263. if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
  264. ret = -EUCLEAN;
  265. btrfs_err(fs_info,
  266. "block=%llu bad generation, have %llu expect > %llu",
  267. eb->start, btrfs_header_generation(eb), last_trans);
  268. goto error;
  269. }
  270. write_extent_buffer(eb, result, 0, fs_info->csum_size);
  271. return 0;
  272. error:
  273. btrfs_print_tree(eb, 0);
  274. btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
  275. eb->start);
  276. /*
  277. * Be noisy if this is an extent buffer from a log tree. We don't abort
  278. * a transaction in case there's a bad log tree extent buffer, we just
  279. * fallback to a transaction commit. Still we want to know when there is
  280. * a bad log tree extent buffer, as that may signal a bug somewhere.
  281. */
  282. WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
  283. btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
  284. return ret;
  285. }
  286. static bool check_tree_block_fsid(struct extent_buffer *eb)
  287. {
  288. struct btrfs_fs_info *fs_info = eb->fs_info;
  289. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
  290. u8 fsid[BTRFS_FSID_SIZE];
  291. read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
  292. BTRFS_FSID_SIZE);
  293. /*
  294. * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid.
  295. * This is then overwritten by metadata_uuid if it is present in the
  296. * device_list_add(). The same true for a seed device as well. So use of
  297. * fs_devices::metadata_uuid is appropriate here.
  298. */
  299. if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
  300. return false;
  301. list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
  302. if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
  303. return false;
  304. return true;
  305. }
  306. /* Do basic extent buffer checks at read time */
  307. int btrfs_validate_extent_buffer(struct extent_buffer *eb,
  308. const struct btrfs_tree_parent_check *check)
  309. {
  310. struct btrfs_fs_info *fs_info = eb->fs_info;
  311. u64 found_start;
  312. const u32 csum_size = fs_info->csum_size;
  313. u8 found_level;
  314. u8 result[BTRFS_CSUM_SIZE];
  315. const u8 *header_csum;
  316. int ret = 0;
  317. const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS);
  318. ASSERT(check);
  319. found_start = btrfs_header_bytenr(eb);
  320. if (unlikely(found_start != eb->start)) {
  321. btrfs_err_rl(fs_info,
  322. "bad tree block start, mirror %u want %llu have %llu",
  323. eb->read_mirror, eb->start, found_start);
  324. return -EIO;
  325. }
  326. if (unlikely(check_tree_block_fsid(eb))) {
  327. btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
  328. eb->start, eb->read_mirror);
  329. return -EIO;
  330. }
  331. found_level = btrfs_header_level(eb);
  332. if (unlikely(found_level >= BTRFS_MAX_LEVEL)) {
  333. btrfs_err(fs_info,
  334. "bad tree block level, mirror %u level %d on logical %llu",
  335. eb->read_mirror, btrfs_header_level(eb), eb->start);
  336. return -EIO;
  337. }
  338. csum_tree_block(eb, result);
  339. header_csum = folio_address(eb->folios[0]) +
  340. get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
  341. if (memcmp(result, header_csum, csum_size) != 0) {
  342. btrfs_warn_rl(fs_info,
  343. "checksum verify failed on logical %llu mirror %u wanted " BTRFS_CSUM_FMT " found " BTRFS_CSUM_FMT " level %d%s",
  344. eb->start, eb->read_mirror,
  345. BTRFS_CSUM_FMT_VALUE(csum_size, header_csum),
  346. BTRFS_CSUM_FMT_VALUE(csum_size, result),
  347. btrfs_header_level(eb),
  348. ignore_csum ? ", ignored" : "");
  349. if (unlikely(!ignore_csum))
  350. return -EUCLEAN;
  351. }
  352. if (unlikely(found_level != check->level)) {
  353. btrfs_err(fs_info,
  354. "level verify failed on logical %llu mirror %u wanted %u found %u",
  355. eb->start, eb->read_mirror, check->level, found_level);
  356. return -EIO;
  357. }
  358. if (unlikely(check->transid &&
  359. btrfs_header_generation(eb) != check->transid)) {
  360. btrfs_err_rl(eb->fs_info,
  361. "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
  362. eb->start, eb->read_mirror, check->transid,
  363. btrfs_header_generation(eb));
  364. return -EIO;
  365. }
  366. if (check->has_first_key) {
  367. const struct btrfs_key *expect_key = &check->first_key;
  368. struct btrfs_key found_key;
  369. if (found_level)
  370. btrfs_node_key_to_cpu(eb, &found_key, 0);
  371. else
  372. btrfs_item_key_to_cpu(eb, &found_key, 0);
  373. if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
  374. btrfs_err(fs_info,
  375. "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
  376. eb->start, check->transid,
  377. expect_key->objectid,
  378. expect_key->type, expect_key->offset,
  379. found_key.objectid, found_key.type,
  380. found_key.offset);
  381. return -EUCLEAN;
  382. }
  383. }
  384. if (check->owner_root) {
  385. ret = btrfs_check_eb_owner(eb, check->owner_root);
  386. if (ret < 0)
  387. return ret;
  388. }
  389. /* If this is a leaf block and it is corrupt, just return -EIO. */
  390. if (found_level == 0 && btrfs_check_leaf(eb))
  391. ret = -EIO;
  392. if (found_level > 0 && btrfs_check_node(eb))
  393. ret = -EIO;
  394. if (ret)
  395. btrfs_err(fs_info,
  396. "read time tree block corruption detected on logical %llu mirror %u",
  397. eb->start, eb->read_mirror);
  398. return ret;
  399. }
  400. #ifdef CONFIG_MIGRATION
  401. static int btree_migrate_folio(struct address_space *mapping,
  402. struct folio *dst, struct folio *src, enum migrate_mode mode)
  403. {
  404. /*
  405. * we can't safely write a btree page from here,
  406. * we haven't done the locking hook
  407. */
  408. if (folio_test_dirty(src))
  409. return -EAGAIN;
  410. /*
  411. * Buffers may be managed in a filesystem specific way.
  412. * We must have no buffers or drop them.
  413. */
  414. if (folio_get_private(src) &&
  415. !filemap_release_folio(src, GFP_KERNEL))
  416. return -EAGAIN;
  417. return migrate_folio(mapping, dst, src, mode);
  418. }
  419. #else
  420. #define btree_migrate_folio NULL
  421. #endif
  422. static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
  423. {
  424. if (folio_test_writeback(folio) || folio_test_dirty(folio))
  425. return false;
  426. return try_release_extent_buffer(folio);
  427. }
  428. static void btree_invalidate_folio(struct folio *folio, size_t offset,
  429. size_t length)
  430. {
  431. struct extent_io_tree *tree;
  432. tree = &folio_to_inode(folio)->io_tree;
  433. extent_invalidate_folio(tree, folio, offset);
  434. btree_release_folio(folio, GFP_NOFS);
  435. if (folio_get_private(folio)) {
  436. btrfs_warn(folio_to_fs_info(folio),
  437. "folio private not zero on folio %llu",
  438. (unsigned long long)folio_pos(folio));
  439. folio_detach_private(folio);
  440. }
  441. }
  442. #ifdef DEBUG
  443. static bool btree_dirty_folio(struct address_space *mapping,
  444. struct folio *folio)
  445. {
  446. struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
  447. struct btrfs_subpage_info *spi = fs_info->subpage_info;
  448. struct btrfs_subpage *subpage;
  449. struct extent_buffer *eb;
  450. int cur_bit = 0;
  451. u64 page_start = folio_pos(folio);
  452. if (fs_info->sectorsize == PAGE_SIZE) {
  453. eb = folio_get_private(folio);
  454. BUG_ON(!eb);
  455. BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
  456. BUG_ON(!atomic_read(&eb->refs));
  457. btrfs_assert_tree_write_locked(eb);
  458. return filemap_dirty_folio(mapping, folio);
  459. }
  460. ASSERT(spi);
  461. subpage = folio_get_private(folio);
  462. for (cur_bit = spi->dirty_offset;
  463. cur_bit < spi->dirty_offset + spi->bitmap_nr_bits;
  464. cur_bit++) {
  465. unsigned long flags;
  466. u64 cur;
  467. spin_lock_irqsave(&subpage->lock, flags);
  468. if (!test_bit(cur_bit, subpage->bitmaps)) {
  469. spin_unlock_irqrestore(&subpage->lock, flags);
  470. continue;
  471. }
  472. spin_unlock_irqrestore(&subpage->lock, flags);
  473. cur = page_start + cur_bit * fs_info->sectorsize;
  474. eb = find_extent_buffer(fs_info, cur);
  475. ASSERT(eb);
  476. ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
  477. ASSERT(atomic_read(&eb->refs));
  478. btrfs_assert_tree_write_locked(eb);
  479. free_extent_buffer(eb);
  480. cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1;
  481. }
  482. return filemap_dirty_folio(mapping, folio);
  483. }
  484. #else
  485. #define btree_dirty_folio filemap_dirty_folio
  486. #endif
  487. static const struct address_space_operations btree_aops = {
  488. .writepages = btree_writepages,
  489. .release_folio = btree_release_folio,
  490. .invalidate_folio = btree_invalidate_folio,
  491. .migrate_folio = btree_migrate_folio,
  492. .dirty_folio = btree_dirty_folio,
  493. };
  494. struct extent_buffer *btrfs_find_create_tree_block(
  495. struct btrfs_fs_info *fs_info,
  496. u64 bytenr, u64 owner_root,
  497. int level)
  498. {
  499. if (btrfs_is_testing(fs_info))
  500. return alloc_test_extent_buffer(fs_info, bytenr);
  501. return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
  502. }
  503. /*
  504. * Read tree block at logical address @bytenr and do variant basic but critical
  505. * verification.
  506. *
  507. * @check: expected tree parentness check, see comments of the
  508. * structure for details.
  509. */
  510. struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
  511. struct btrfs_tree_parent_check *check)
  512. {
  513. struct extent_buffer *buf = NULL;
  514. int ret;
  515. ASSERT(check);
  516. buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
  517. check->level);
  518. if (IS_ERR(buf))
  519. return buf;
  520. ret = btrfs_read_extent_buffer(buf, check);
  521. if (ret) {
  522. free_extent_buffer_stale(buf);
  523. return ERR_PTR(ret);
  524. }
  525. return buf;
  526. }
  527. static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
  528. u64 objectid, gfp_t flags)
  529. {
  530. struct btrfs_root *root;
  531. root = kzalloc_obj(*root, flags);
  532. if (!root)
  533. return NULL;
  534. root->fs_info = fs_info;
  535. root->root_key.objectid = objectid;
  536. RB_CLEAR_NODE(&root->rb_node);
  537. xa_init(&root->inodes);
  538. xa_init(&root->delayed_nodes);
  539. btrfs_init_root_block_rsv(root);
  540. INIT_LIST_HEAD(&root->dirty_list);
  541. INIT_LIST_HEAD(&root->root_list);
  542. INIT_LIST_HEAD(&root->delalloc_inodes);
  543. INIT_LIST_HEAD(&root->delalloc_root);
  544. INIT_LIST_HEAD(&root->ordered_extents);
  545. INIT_LIST_HEAD(&root->ordered_root);
  546. INIT_LIST_HEAD(&root->reloc_dirty_list);
  547. spin_lock_init(&root->delalloc_lock);
  548. spin_lock_init(&root->ordered_extent_lock);
  549. spin_lock_init(&root->accounting_lock);
  550. spin_lock_init(&root->qgroup_meta_rsv_lock);
  551. mutex_init(&root->objectid_mutex);
  552. mutex_init(&root->log_mutex);
  553. mutex_init(&root->ordered_extent_mutex);
  554. mutex_init(&root->delalloc_mutex);
  555. init_waitqueue_head(&root->qgroup_flush_wait);
  556. init_waitqueue_head(&root->log_writer_wait);
  557. init_waitqueue_head(&root->log_commit_wait[0]);
  558. init_waitqueue_head(&root->log_commit_wait[1]);
  559. INIT_LIST_HEAD(&root->log_ctxs[0]);
  560. INIT_LIST_HEAD(&root->log_ctxs[1]);
  561. atomic_set(&root->log_commit[0], 0);
  562. atomic_set(&root->log_commit[1], 0);
  563. atomic_set(&root->log_writers, 0);
  564. atomic_set(&root->log_batch, 0);
  565. refcount_set(&root->refs, 1);
  566. atomic_set(&root->snapshot_force_cow, 0);
  567. atomic_set(&root->nr_swapfiles, 0);
  568. root->log_transid_committed = -1;
  569. if (!btrfs_is_testing(fs_info)) {
  570. btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages,
  571. IO_TREE_ROOT_DIRTY_LOG_PAGES);
  572. btrfs_extent_io_tree_init(fs_info, &root->log_csum_range,
  573. IO_TREE_LOG_CSUM_RANGE);
  574. }
  575. spin_lock_init(&root->root_item_lock);
  576. btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
  577. #ifdef CONFIG_BTRFS_DEBUG
  578. INIT_LIST_HEAD(&root->leak_list);
  579. spin_lock(&fs_info->fs_roots_radix_lock);
  580. list_add_tail(&root->leak_list, &fs_info->allocated_roots);
  581. spin_unlock(&fs_info->fs_roots_radix_lock);
  582. #endif
  583. return root;
  584. }
  585. #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
  586. /* Should only be used by the testing infrastructure */
  587. struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
  588. {
  589. struct btrfs_root *root;
  590. if (!fs_info)
  591. return ERR_PTR(-EINVAL);
  592. root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
  593. if (!root)
  594. return ERR_PTR(-ENOMEM);
  595. /* We don't use the stripesize in selftest, set it as sectorsize */
  596. root->alloc_bytenr = 0;
  597. return root;
  598. }
  599. #endif
  600. static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
  601. {
  602. const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
  603. const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
  604. return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
  605. }
  606. static int global_root_key_cmp(const void *k, const struct rb_node *node)
  607. {
  608. const struct btrfs_key *key = k;
  609. const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
  610. return btrfs_comp_cpu_keys(key, &root->root_key);
  611. }
  612. int btrfs_global_root_insert(struct btrfs_root *root)
  613. {
  614. struct btrfs_fs_info *fs_info = root->fs_info;
  615. struct rb_node *tmp;
  616. int ret = 0;
  617. write_lock(&fs_info->global_root_lock);
  618. tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
  619. write_unlock(&fs_info->global_root_lock);
  620. if (tmp) {
  621. ret = -EEXIST;
  622. btrfs_warn(fs_info, "global root %llu %llu already exists",
  623. btrfs_root_id(root), root->root_key.offset);
  624. }
  625. return ret;
  626. }
  627. void btrfs_global_root_delete(struct btrfs_root *root)
  628. {
  629. struct btrfs_fs_info *fs_info = root->fs_info;
  630. write_lock(&fs_info->global_root_lock);
  631. rb_erase(&root->rb_node, &fs_info->global_root_tree);
  632. write_unlock(&fs_info->global_root_lock);
  633. }
  634. struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
  635. struct btrfs_key *key)
  636. {
  637. struct rb_node *node;
  638. struct btrfs_root *root = NULL;
  639. read_lock(&fs_info->global_root_lock);
  640. node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
  641. if (node)
  642. root = container_of(node, struct btrfs_root, rb_node);
  643. read_unlock(&fs_info->global_root_lock);
  644. return root;
  645. }
  646. static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
  647. {
  648. struct btrfs_block_group *block_group;
  649. u64 ret;
  650. if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
  651. return 0;
  652. if (bytenr)
  653. block_group = btrfs_lookup_block_group(fs_info, bytenr);
  654. else
  655. block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
  656. ASSERT(block_group);
  657. if (!block_group)
  658. return 0;
  659. ret = block_group->global_root_id;
  660. btrfs_put_block_group(block_group);
  661. return ret;
  662. }
  663. struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
  664. {
  665. struct btrfs_key key = {
  666. .objectid = BTRFS_CSUM_TREE_OBJECTID,
  667. .type = BTRFS_ROOT_ITEM_KEY,
  668. .offset = btrfs_global_root_id(fs_info, bytenr),
  669. };
  670. return btrfs_global_root(fs_info, &key);
  671. }
  672. struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
  673. {
  674. struct btrfs_key key = {
  675. .objectid = BTRFS_EXTENT_TREE_OBJECTID,
  676. .type = BTRFS_ROOT_ITEM_KEY,
  677. .offset = btrfs_global_root_id(fs_info, bytenr),
  678. };
  679. return btrfs_global_root(fs_info, &key);
  680. }
  681. struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
  682. u64 objectid)
  683. {
  684. struct btrfs_fs_info *fs_info = trans->fs_info;
  685. struct extent_buffer *leaf;
  686. struct btrfs_root *tree_root = fs_info->tree_root;
  687. struct btrfs_root *root;
  688. unsigned int nofs_flag;
  689. int ret = 0;
  690. /*
  691. * We're holding a transaction handle, so use a NOFS memory allocation
  692. * context to avoid deadlock if reclaim happens.
  693. */
  694. nofs_flag = memalloc_nofs_save();
  695. root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
  696. memalloc_nofs_restore(nofs_flag);
  697. if (!root)
  698. return ERR_PTR(-ENOMEM);
  699. root->root_key.objectid = objectid;
  700. root->root_key.type = BTRFS_ROOT_ITEM_KEY;
  701. root->root_key.offset = 0;
  702. leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
  703. 0, BTRFS_NESTING_NORMAL);
  704. if (IS_ERR(leaf)) {
  705. ret = PTR_ERR(leaf);
  706. leaf = NULL;
  707. goto fail;
  708. }
  709. root->node = leaf;
  710. btrfs_mark_buffer_dirty(trans, leaf);
  711. root->commit_root = btrfs_root_node(root);
  712. set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
  713. btrfs_set_root_flags(&root->root_item, 0);
  714. btrfs_set_root_limit(&root->root_item, 0);
  715. btrfs_set_root_bytenr(&root->root_item, leaf->start);
  716. btrfs_set_root_generation(&root->root_item, trans->transid);
  717. btrfs_set_root_level(&root->root_item, 0);
  718. btrfs_set_root_refs(&root->root_item, 1);
  719. btrfs_set_root_used(&root->root_item, leaf->len);
  720. btrfs_set_root_last_snapshot(&root->root_item, 0);
  721. btrfs_set_root_dirid(&root->root_item, 0);
  722. if (btrfs_is_fstree(objectid))
  723. generate_random_guid(root->root_item.uuid);
  724. else
  725. export_guid(root->root_item.uuid, &guid_null);
  726. btrfs_set_root_drop_level(&root->root_item, 0);
  727. btrfs_tree_unlock(leaf);
  728. ret = btrfs_insert_root(trans, tree_root, &root->root_key, &root->root_item);
  729. if (ret)
  730. goto fail;
  731. return root;
  732. fail:
  733. btrfs_put_root(root);
  734. return ERR_PTR(ret);
  735. }
  736. static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info)
  737. {
  738. struct btrfs_root *root;
  739. root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
  740. if (!root)
  741. return ERR_PTR(-ENOMEM);
  742. root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
  743. root->root_key.type = BTRFS_ROOT_ITEM_KEY;
  744. root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
  745. return root;
  746. }
  747. int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
  748. struct btrfs_root *root)
  749. {
  750. struct extent_buffer *leaf;
  751. /*
  752. * DON'T set SHAREABLE bit for log trees.
  753. *
  754. * Log trees are not exposed to user space thus can't be snapshotted,
  755. * and they go away before a real commit is actually done.
  756. *
  757. * They do store pointers to file data extents, and those reference
  758. * counts still get updated (along with back refs to the log tree).
  759. */
  760. leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
  761. NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL);
  762. if (IS_ERR(leaf))
  763. return PTR_ERR(leaf);
  764. root->node = leaf;
  765. btrfs_mark_buffer_dirty(trans, root->node);
  766. btrfs_tree_unlock(root->node);
  767. return 0;
  768. }
  769. int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
  770. struct btrfs_fs_info *fs_info)
  771. {
  772. struct btrfs_root *log_root;
  773. log_root = alloc_log_tree(fs_info);
  774. if (IS_ERR(log_root))
  775. return PTR_ERR(log_root);
  776. if (!btrfs_is_zoned(fs_info)) {
  777. int ret = btrfs_alloc_log_tree_node(trans, log_root);
  778. if (ret) {
  779. btrfs_put_root(log_root);
  780. return ret;
  781. }
  782. }
  783. WARN_ON(fs_info->log_root_tree);
  784. fs_info->log_root_tree = log_root;
  785. return 0;
  786. }
  787. int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
  788. struct btrfs_root *root)
  789. {
  790. struct btrfs_fs_info *fs_info = root->fs_info;
  791. struct btrfs_root *log_root;
  792. struct btrfs_inode_item *inode_item;
  793. int ret;
  794. log_root = alloc_log_tree(fs_info);
  795. if (IS_ERR(log_root))
  796. return PTR_ERR(log_root);
  797. ret = btrfs_alloc_log_tree_node(trans, log_root);
  798. if (ret) {
  799. btrfs_put_root(log_root);
  800. return ret;
  801. }
  802. btrfs_set_root_last_trans(log_root, trans->transid);
  803. log_root->root_key.offset = btrfs_root_id(root);
  804. inode_item = &log_root->root_item.inode;
  805. btrfs_set_stack_inode_generation(inode_item, 1);
  806. btrfs_set_stack_inode_size(inode_item, 3);
  807. btrfs_set_stack_inode_nlink(inode_item, 1);
  808. btrfs_set_stack_inode_nbytes(inode_item,
  809. fs_info->nodesize);
  810. btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
  811. btrfs_set_root_node(&log_root->root_item, log_root->node);
  812. WARN_ON(root->log_root);
  813. root->log_root = log_root;
  814. btrfs_set_root_log_transid(root, 0);
  815. root->log_transid_committed = -1;
  816. btrfs_set_root_last_log_commit(root, 0);
  817. return 0;
  818. }
  819. static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
  820. struct btrfs_path *path,
  821. const struct btrfs_key *key)
  822. {
  823. struct btrfs_root *root;
  824. struct btrfs_tree_parent_check check = { 0 };
  825. struct btrfs_fs_info *fs_info = tree_root->fs_info;
  826. u64 generation;
  827. int ret;
  828. int level;
  829. root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
  830. if (!root)
  831. return ERR_PTR(-ENOMEM);
  832. ret = btrfs_find_root(tree_root, key, path,
  833. &root->root_item, &root->root_key);
  834. if (ret) {
  835. if (ret > 0)
  836. ret = -ENOENT;
  837. goto fail;
  838. }
  839. generation = btrfs_root_generation(&root->root_item);
  840. level = btrfs_root_level(&root->root_item);
  841. check.level = level;
  842. check.transid = generation;
  843. check.owner_root = key->objectid;
  844. root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
  845. &check);
  846. if (IS_ERR(root->node)) {
  847. ret = PTR_ERR(root->node);
  848. root->node = NULL;
  849. goto fail;
  850. }
  851. if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) {
  852. ret = -EIO;
  853. goto fail;
  854. }
  855. /*
  856. * For real fs, and not log/reloc trees, root owner must
  857. * match its root node owner
  858. */
  859. if (unlikely(!btrfs_is_testing(fs_info) &&
  860. btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
  861. btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
  862. btrfs_root_id(root) != btrfs_header_owner(root->node))) {
  863. btrfs_crit(fs_info,
  864. "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
  865. btrfs_root_id(root), root->node->start,
  866. btrfs_header_owner(root->node),
  867. btrfs_root_id(root));
  868. ret = -EUCLEAN;
  869. goto fail;
  870. }
  871. root->commit_root = btrfs_root_node(root);
  872. return root;
  873. fail:
  874. btrfs_put_root(root);
  875. return ERR_PTR(ret);
  876. }
  877. struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
  878. const struct btrfs_key *key)
  879. {
  880. struct btrfs_root *root;
  881. BTRFS_PATH_AUTO_FREE(path);
  882. path = btrfs_alloc_path();
  883. if (!path)
  884. return ERR_PTR(-ENOMEM);
  885. root = read_tree_root_path(tree_root, path, key);
  886. return root;
  887. }
  888. /*
  889. * Initialize subvolume root in-memory structure.
  890. *
  891. * @anon_dev: anonymous device to attach to the root, if zero, allocate new
  892. *
  893. * In case of failure the caller is responsible to call btrfs_free_fs_root()
  894. */
  895. static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
  896. {
  897. int ret;
  898. btrfs_drew_lock_init(&root->snapshot_lock);
  899. if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
  900. !btrfs_is_data_reloc_root(root) &&
  901. btrfs_is_fstree(btrfs_root_id(root))) {
  902. set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
  903. btrfs_check_and_init_root_item(&root->root_item);
  904. }
  905. /*
  906. * Don't assign anonymous block device to roots that are not exposed to
  907. * userspace, the id pool is limited to 1M
  908. */
  909. if (btrfs_is_fstree(btrfs_root_id(root)) &&
  910. btrfs_root_refs(&root->root_item) > 0) {
  911. if (!anon_dev) {
  912. ret = get_anon_bdev(&root->anon_dev);
  913. if (ret)
  914. return ret;
  915. } else {
  916. root->anon_dev = anon_dev;
  917. }
  918. }
  919. mutex_lock(&root->objectid_mutex);
  920. ret = btrfs_init_root_free_objectid(root);
  921. if (ret) {
  922. mutex_unlock(&root->objectid_mutex);
  923. return ret;
  924. }
  925. ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
  926. mutex_unlock(&root->objectid_mutex);
  927. return 0;
  928. }
  929. static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
  930. u64 root_id)
  931. {
  932. struct btrfs_root *root;
  933. spin_lock(&fs_info->fs_roots_radix_lock);
  934. root = radix_tree_lookup(&fs_info->fs_roots_radix,
  935. (unsigned long)root_id);
  936. root = btrfs_grab_root(root);
  937. spin_unlock(&fs_info->fs_roots_radix_lock);
  938. return root;
  939. }
  940. static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
  941. u64 objectid)
  942. {
  943. struct btrfs_key key = {
  944. .objectid = objectid,
  945. .type = BTRFS_ROOT_ITEM_KEY,
  946. .offset = 0,
  947. };
  948. switch (objectid) {
  949. case BTRFS_ROOT_TREE_OBJECTID:
  950. return btrfs_grab_root(fs_info->tree_root);
  951. case BTRFS_EXTENT_TREE_OBJECTID:
  952. return btrfs_grab_root(btrfs_global_root(fs_info, &key));
  953. case BTRFS_CHUNK_TREE_OBJECTID:
  954. return btrfs_grab_root(fs_info->chunk_root);
  955. case BTRFS_DEV_TREE_OBJECTID:
  956. return btrfs_grab_root(fs_info->dev_root);
  957. case BTRFS_CSUM_TREE_OBJECTID:
  958. return btrfs_grab_root(btrfs_global_root(fs_info, &key));
  959. case BTRFS_QUOTA_TREE_OBJECTID:
  960. return btrfs_grab_root(fs_info->quota_root);
  961. case BTRFS_UUID_TREE_OBJECTID:
  962. return btrfs_grab_root(fs_info->uuid_root);
  963. case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
  964. return btrfs_grab_root(fs_info->block_group_root);
  965. case BTRFS_FREE_SPACE_TREE_OBJECTID:
  966. return btrfs_grab_root(btrfs_global_root(fs_info, &key));
  967. case BTRFS_RAID_STRIPE_TREE_OBJECTID:
  968. return btrfs_grab_root(fs_info->stripe_root);
  969. case BTRFS_REMAP_TREE_OBJECTID:
  970. return btrfs_grab_root(fs_info->remap_root);
  971. default:
  972. return NULL;
  973. }
  974. }
  975. int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
  976. struct btrfs_root *root)
  977. {
  978. int ret;
  979. ret = radix_tree_preload(GFP_NOFS);
  980. if (ret)
  981. return ret;
  982. spin_lock(&fs_info->fs_roots_radix_lock);
  983. ret = radix_tree_insert(&fs_info->fs_roots_radix,
  984. (unsigned long)btrfs_root_id(root),
  985. root);
  986. if (ret == 0) {
  987. btrfs_grab_root(root);
  988. set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
  989. }
  990. spin_unlock(&fs_info->fs_roots_radix_lock);
  991. radix_tree_preload_end();
  992. return ret;
  993. }
  994. void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info)
  995. {
  996. #ifdef CONFIG_BTRFS_DEBUG
  997. struct btrfs_root *root;
  998. while (!list_empty(&fs_info->allocated_roots)) {
  999. char buf[BTRFS_ROOT_NAME_BUF_LEN];
  1000. root = list_first_entry(&fs_info->allocated_roots,
  1001. struct btrfs_root, leak_list);
  1002. btrfs_err(fs_info, "leaked root %s refcount %d",
  1003. btrfs_root_name(&root->root_key, buf),
  1004. refcount_read(&root->refs));
  1005. WARN_ON_ONCE(1);
  1006. while (refcount_read(&root->refs) > 1)
  1007. btrfs_put_root(root);
  1008. btrfs_put_root(root);
  1009. }
  1010. #endif
  1011. }
  1012. static void free_global_roots(struct btrfs_fs_info *fs_info)
  1013. {
  1014. struct btrfs_root *root;
  1015. struct rb_node *node;
  1016. while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
  1017. root = rb_entry(node, struct btrfs_root, rb_node);
  1018. rb_erase(&root->rb_node, &fs_info->global_root_tree);
  1019. btrfs_put_root(root);
  1020. }
  1021. }
  1022. void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
  1023. {
  1024. struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
  1025. if (fs_info->fs_devices)
  1026. btrfs_close_devices(fs_info->fs_devices);
  1027. btrfs_free_compress_wsm(fs_info);
  1028. percpu_counter_destroy(&fs_info->stats_read_blocks);
  1029. percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
  1030. percpu_counter_destroy(&fs_info->delalloc_bytes);
  1031. percpu_counter_destroy(&fs_info->ordered_bytes);
  1032. if (percpu_counter_initialized(em_counter))
  1033. ASSERT(percpu_counter_sum_positive(em_counter) == 0);
  1034. percpu_counter_destroy(em_counter);
  1035. percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
  1036. btrfs_free_stripe_hash_table(fs_info);
  1037. btrfs_free_ref_cache(fs_info);
  1038. kfree(fs_info->balance_ctl);
  1039. free_global_roots(fs_info);
  1040. btrfs_put_root(fs_info->tree_root);
  1041. btrfs_put_root(fs_info->chunk_root);
  1042. btrfs_put_root(fs_info->dev_root);
  1043. btrfs_put_root(fs_info->quota_root);
  1044. btrfs_put_root(fs_info->uuid_root);
  1045. btrfs_put_root(fs_info->fs_root);
  1046. btrfs_put_root(fs_info->data_reloc_root);
  1047. btrfs_put_root(fs_info->block_group_root);
  1048. btrfs_put_root(fs_info->stripe_root);
  1049. btrfs_put_root(fs_info->remap_root);
  1050. btrfs_check_leaked_roots(fs_info);
  1051. btrfs_extent_buffer_leak_debug_check(fs_info);
  1052. kfree(fs_info->super_copy);
  1053. kfree(fs_info->super_for_commit);
  1054. kvfree(fs_info);
  1055. }
  1056. /*
  1057. * Get an in-memory reference of a root structure.
  1058. *
  1059. * For essential trees like root/extent tree, we grab it from fs_info directly.
  1060. * For subvolume trees, we check the cached filesystem roots first. If not
  1061. * found, then read it from disk and add it to cached fs roots.
  1062. *
  1063. * Caller should release the root by calling btrfs_put_root() after the usage.
  1064. *
  1065. * NOTE: Reloc and log trees can't be read by this function as they share the
  1066. * same root objectid.
  1067. *
  1068. * @objectid: root id
  1069. * @anon_dev: preallocated anonymous block device number for new roots,
  1070. * pass NULL for a new allocation.
  1071. * @check_ref: whether to check root item references, If true, return -ENOENT
  1072. * for orphan roots
  1073. */
  1074. static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
  1075. u64 objectid, dev_t *anon_dev,
  1076. bool check_ref)
  1077. {
  1078. struct btrfs_root *root;
  1079. struct btrfs_path *path;
  1080. struct btrfs_key key;
  1081. int ret;
  1082. root = btrfs_get_global_root(fs_info, objectid);
  1083. if (root)
  1084. return root;
  1085. /*
  1086. * If we're called for non-subvolume trees, and above function didn't
  1087. * find one, do not try to read it from disk.
  1088. *
  1089. * This is namely for free-space-tree and quota tree, which can change
  1090. * at runtime and should only be grabbed from fs_info.
  1091. */
  1092. if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
  1093. return ERR_PTR(-ENOENT);
  1094. again:
  1095. root = btrfs_lookup_fs_root(fs_info, objectid);
  1096. if (root) {
  1097. /*
  1098. * Some other caller may have read out the newly inserted
  1099. * subvolume already (for things like backref walk etc). Not
  1100. * that common but still possible. In that case, we just need
  1101. * to free the anon_dev.
  1102. */
  1103. if (unlikely(anon_dev && *anon_dev)) {
  1104. free_anon_bdev(*anon_dev);
  1105. *anon_dev = 0;
  1106. }
  1107. if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
  1108. btrfs_put_root(root);
  1109. return ERR_PTR(-ENOENT);
  1110. }
  1111. return root;
  1112. }
  1113. key.objectid = objectid;
  1114. key.type = BTRFS_ROOT_ITEM_KEY;
  1115. key.offset = (u64)-1;
  1116. root = btrfs_read_tree_root(fs_info->tree_root, &key);
  1117. if (IS_ERR(root))
  1118. return root;
  1119. if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
  1120. ret = -ENOENT;
  1121. goto fail;
  1122. }
  1123. ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
  1124. if (ret)
  1125. goto fail;
  1126. path = btrfs_alloc_path();
  1127. if (!path) {
  1128. ret = -ENOMEM;
  1129. goto fail;
  1130. }
  1131. key.objectid = BTRFS_ORPHAN_OBJECTID;
  1132. key.type = BTRFS_ORPHAN_ITEM_KEY;
  1133. key.offset = objectid;
  1134. ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
  1135. btrfs_free_path(path);
  1136. if (ret < 0)
  1137. goto fail;
  1138. if (ret == 0)
  1139. set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
  1140. ret = btrfs_insert_fs_root(fs_info, root);
  1141. if (ret) {
  1142. if (ret == -EEXIST) {
  1143. btrfs_put_root(root);
  1144. goto again;
  1145. }
  1146. goto fail;
  1147. }
  1148. return root;
  1149. fail:
  1150. /*
  1151. * If our caller provided us an anonymous device, then it's his
  1152. * responsibility to free it in case we fail. So we have to set our
  1153. * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
  1154. * and once again by our caller.
  1155. */
  1156. if (anon_dev && *anon_dev)
  1157. root->anon_dev = 0;
  1158. btrfs_put_root(root);
  1159. return ERR_PTR(ret);
  1160. }
  1161. /*
  1162. * Get in-memory reference of a root structure
  1163. *
  1164. * @objectid: tree objectid
  1165. * @check_ref: if set, verify that the tree exists and the item has at least
  1166. * one reference
  1167. */
  1168. struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
  1169. u64 objectid, bool check_ref)
  1170. {
  1171. return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
  1172. }
  1173. /*
  1174. * Get in-memory reference of a root structure, created as new, optionally pass
  1175. * the anonymous block device id
  1176. *
  1177. * @objectid: tree objectid
  1178. * @anon_dev: if NULL, allocate a new anonymous block device or use the
  1179. * parameter value if not NULL
  1180. */
  1181. struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
  1182. u64 objectid, dev_t *anon_dev)
  1183. {
  1184. return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
  1185. }
  1186. /*
  1187. * Return a root for the given objectid.
  1188. *
  1189. * @fs_info: the fs_info
  1190. * @objectid: the objectid we need to lookup
  1191. *
  1192. * This is exclusively used for backref walking, and exists specifically because
  1193. * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
  1194. * creation time, which means we may have to read the tree_root in order to look
  1195. * up a fs root that is not in memory. If the root is not in memory we will
  1196. * read the tree root commit root and look up the fs root from there. This is a
  1197. * temporary root, it will not be inserted into the radix tree as it doesn't
  1198. * have the most uptodate information, it'll simply be discarded once the
  1199. * backref code is finished using the root.
  1200. */
  1201. struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
  1202. struct btrfs_path *path,
  1203. u64 objectid)
  1204. {
  1205. struct btrfs_root *root;
  1206. struct btrfs_key key;
  1207. ASSERT(path->search_commit_root && path->skip_locking);
  1208. /*
  1209. * This can return -ENOENT if we ask for a root that doesn't exist, but
  1210. * since this is called via the backref walking code we won't be looking
  1211. * up a root that doesn't exist, unless there's corruption. So if root
  1212. * != NULL just return it.
  1213. */
  1214. root = btrfs_get_global_root(fs_info, objectid);
  1215. if (root)
  1216. return root;
  1217. root = btrfs_lookup_fs_root(fs_info, objectid);
  1218. if (root)
  1219. return root;
  1220. key.objectid = objectid;
  1221. key.type = BTRFS_ROOT_ITEM_KEY;
  1222. key.offset = (u64)-1;
  1223. root = read_tree_root_path(fs_info->tree_root, path, &key);
  1224. btrfs_release_path(path);
  1225. return root;
  1226. }
  /*
   * Main loop of the cleaner kernel thread.
   *
   * @arg: the struct btrfs_fs_info of the filesystem being cleaned
   *
   * Each iteration sets BTRFS_FS_CLEANER_RUNNING, performs one round of
   * background cleanup (delayed iputs, one deleted snapshot, defrag, unused and
   * reclaimable block groups) and then clears the bit again, waking anybody
   * waiting on it.  The thread sleeps between rounds unless
   * btrfs_clean_one_deleted_snapshot() returned non-zero, in which case the
   * next round starts immediately.
   *
   * Returns 0 once kthread_should_stop() is true.
   */
  static int cleaner_kthread(void *arg)
  {
  	struct btrfs_fs_info *fs_info = arg;
  	/* Non-zero when the loop should run again without sleeping. */
  	int again;

  	while (1) {
  		again = 0;

  		set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);

  		/* Make the cleaner go to sleep early. */
  		if (btrfs_need_cleaner_sleep(fs_info))
  			goto sleep;

  		/*
  		 * Do not do anything if we might cause open_ctree() to block
  		 * before we have finished mounting the filesystem.
  		 */
  		if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
  			goto sleep;

  		/* Somebody else holds the cleaner mutex: skip this round. */
  		if (!mutex_trylock(&fs_info->cleaner_mutex))
  			goto sleep;

  		/*
  		 * Avoid the problem that we change the status of the fs
  		 * during the above check and trylock.
  		 */
  		if (btrfs_need_cleaner_sleep(fs_info)) {
  			mutex_unlock(&fs_info->cleaner_mutex);
  			goto sleep;
  		}

  		if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
  			btrfs_sysfs_feature_update(fs_info);

  		btrfs_run_delayed_iputs(fs_info);

  		again = btrfs_clean_one_deleted_snapshot(fs_info);
  		mutex_unlock(&fs_info->cleaner_mutex);

  		/*
  		 * The defragger has dealt with the R/O remount and umount,
  		 * needn't do anything special here.
  		 */
  		btrfs_run_defrag_inodes(fs_info);

  		/* Only done here when async discard is not enabled. */
  		if (btrfs_fs_incompat(fs_info, REMAP_TREE) &&
  		    !btrfs_test_opt(fs_info, DISCARD_ASYNC))
  			btrfs_handle_fully_remapped_bgs(fs_info);

  		/*
  		 * Acquires fs_info->reclaim_bgs_lock to avoid racing
  		 * with relocation (btrfs_relocate_chunk) and relocation
  		 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
  		 * after acquiring fs_info->reclaim_bgs_lock. So we
  		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
  		 * unused block groups.
  		 */
  		btrfs_delete_unused_bgs(fs_info);

  		/*
  		 * Reclaim block groups in the reclaim_bgs list after we deleted
  		 * all unused block_groups. This possibly gives us some more free
  		 * space.
  		 */
  		btrfs_reclaim_bgs(fs_info);
  sleep:
  		/* Reached on every iteration, including the early bail-outs. */
  		clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
  		if (kthread_should_park())
  			kthread_parkme();
  		if (kthread_should_stop())
  			return 0;
  		if (!again) {
  			set_current_state(TASK_INTERRUPTIBLE);
  			schedule();
  			__set_current_state(TASK_RUNNING);
  		}
  	}
  }
  /*
   * Main loop of the transaction kernel thread.
   *
   * @arg: a struct btrfs_root; its fs_info is the filesystem being served
   *
   * On each pass, under transaction_kthread_mutex, the running transaction is
   * inspected.  It is committed when BTRFS_FS_COMMIT_TRANS was set, when its
   * state is already at or past TRANS_STATE_COMMIT_PREP, or when it is at
   * least commit_interval seconds old.  Otherwise the thread only computes how
   * long to sleep.  Every pass also wakes the cleaner kthread and, if the
   * filesystem is in an error state, runs btrfs_cleanup_transaction().
   *
   * Returns 0 once kthread_should_stop() is true.
   */
  static int transaction_kthread(void *arg)
  {
  	struct btrfs_root *root = arg;
  	struct btrfs_fs_info *fs_info = root->fs_info;
  	struct btrfs_trans_handle *trans;
  	struct btrfs_transaction *cur;
  	u64 transid;
  	/* Age of the running transaction in seconds. */
  	time64_t delta;
  	/* How long to sleep at the end of this pass, in jiffies. */
  	unsigned long delay;
  	/* Set when attaching failed with anything other than -ENOENT. */
  	bool cannot_commit;

  	do {
  		cannot_commit = false;
  		delay = secs_to_jiffies(fs_info->commit_interval);
  		mutex_lock(&fs_info->transaction_kthread_mutex);

  		spin_lock(&fs_info->trans_lock);
  		cur = fs_info->running_transaction;
  		if (!cur) {
  			/* Nothing is running, sleep for a full interval. */
  			spin_unlock(&fs_info->trans_lock);
  			goto sleep;
  		}

  		delta = ktime_get_seconds() - cur->start_time;
  		if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
  		    cur->state < TRANS_STATE_COMMIT_PREP &&
  		    delta < fs_info->commit_interval) {
  			/*
  			 * Too young to commit: shorten the sleep by the time
  			 * already elapsed and cap it at one commit interval.
  			 */
  			spin_unlock(&fs_info->trans_lock);
  			delay -= secs_to_jiffies(delta - 1);
  			delay = min(delay,
  				    secs_to_jiffies(fs_info->commit_interval));
  			goto sleep;
  		}
  		/* Remember which transaction we decided to commit. */
  		transid = cur->transid;
  		spin_unlock(&fs_info->trans_lock);

  		/* If the file system is aborted, this will always fail. */
  		trans = btrfs_attach_transaction(root);
  		if (IS_ERR(trans)) {
  			if (PTR_ERR(trans) != -ENOENT)
  				cannot_commit = true;
  			goto sleep;
  		}
  		/*
  		 * Only commit if we attached to the transaction sampled above;
  		 * otherwise just drop the handle again.
  		 */
  		if (transid == trans->transid) {
  			btrfs_commit_transaction(trans);
  		} else {
  			btrfs_end_transaction(trans);
  		}
  sleep:
  		wake_up_process(fs_info->cleaner_kthread);
  		mutex_unlock(&fs_info->transaction_kthread_mutex);

  		if (BTRFS_FS_ERROR(fs_info))
  			btrfs_cleanup_transaction(fs_info);
  		/*
  		 * Sleep unless we are stopping; when a transaction is blocked
  		 * we only sleep if committing is not possible anyway.
  		 */
  		if (!kthread_should_stop() &&
  				(!btrfs_transaction_blocked(fs_info) ||
  				 cannot_commit))
  			schedule_timeout_interruptible(delay);
  	} while (!kthread_should_stop());
  	return 0;
  }
  1350. /*
  1351. * This will find the highest generation in the array of root backups. The
  1352. * index of the highest array is returned, or -EINVAL if we can't find
  1353. * anything.
  1354. *
  1355. * We check to make sure the array is valid by comparing the
  1356. * generation of the latest root in the array with the generation
  1357. * in the super block. If they don't match we pitch it.
  1358. */
  1359. static int find_newest_super_backup(struct btrfs_fs_info *info)
  1360. {
  1361. const u64 newest_gen = btrfs_super_generation(info->super_copy);
  1362. u64 cur;
  1363. struct btrfs_root_backup *root_backup;
  1364. int i;
  1365. for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
  1366. root_backup = info->super_copy->super_roots + i;
  1367. cur = btrfs_backup_tree_root_gen(root_backup);
  1368. if (cur == newest_gen)
  1369. return i;
  1370. }
  1371. return -EINVAL;
  1372. }
  /*
   * Copy all the root pointers into the super backup array.  This bumps the
   * backup pointer (info->backup_root_index) by one, wrapping around at
   * BTRFS_NUM_BACKUP_ROOTS.
   *
   * @info: filesystem whose current roots are recorded
   *
   * The slot is filled in info->super_for_commit and the whole backup array is
   * then copied to info->super_copy.
   *
   * Returns 0 on success, or -EUCLEAN if the extent or csum root cannot be
   * found on a filesystem without the EXTENT_TREE_V2 feature.  Note that on
   * that error path the slot has already been zeroed and the index advanced.
   */
  static int backup_super_roots(struct btrfs_fs_info *info)
  {
  	const int next_backup = info->backup_root_index;
  	struct btrfs_root_backup *root_backup;

  	root_backup = info->super_for_commit->super_roots + next_backup;

  	/*
  	 * make sure all of our padding and empty slots get zero filled
  	 * regardless of which ones we use today
  	 */
  	memset(root_backup, 0, sizeof(*root_backup));

  	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;

  	/* Tree root: block start, generation and level. */
  	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
  	btrfs_set_backup_tree_root_gen(root_backup,
  			       btrfs_header_generation(info->tree_root->node));

  	btrfs_set_backup_tree_root_level(root_backup,
  			       btrfs_header_level(info->tree_root->node));

  	/* Chunk root. */
  	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
  	btrfs_set_backup_chunk_root_gen(root_backup,
  			       btrfs_header_generation(info->chunk_root->node));
  	btrfs_set_backup_chunk_root_level(root_backup,
  			       btrfs_header_level(info->chunk_root->node));

  	/* Extent and csum roots are only recorded without EXTENT_TREE_V2. */
  	if (!btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
  		struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
  		struct btrfs_root *csum_root = btrfs_csum_root(info, 0);

  		if (unlikely(!extent_root)) {
  			btrfs_err(info, "missing extent root for extent at bytenr 0");
  			return -EUCLEAN;
  		}
  		if (unlikely(!csum_root)) {
  			btrfs_err(info, "missing csum root for extent at bytenr 0");
  			return -EUCLEAN;
  		}

  		btrfs_set_backup_extent_root(root_backup,
  					     extent_root->node->start);
  		btrfs_set_backup_extent_root_gen(root_backup,
  				btrfs_header_generation(extent_root->node));
  		btrfs_set_backup_extent_root_level(root_backup,
  					btrfs_header_level(extent_root->node));

  		btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
  		btrfs_set_backup_csum_root_gen(root_backup,
  					       btrfs_header_generation(csum_root->node));
  		btrfs_set_backup_csum_root_level(root_backup,
  						 btrfs_header_level(csum_root->node));
  	}

  	/*
  	 * we might commit during log recovery, which happens before we set
  	 * the fs_root.  Make sure it is valid before we fill it in.
  	 */
  	if (info->fs_root && info->fs_root->node) {
  		btrfs_set_backup_fs_root(root_backup,
  					 info->fs_root->node->start);
  		btrfs_set_backup_fs_root_gen(root_backup,
  			       btrfs_header_generation(info->fs_root->node));
  		btrfs_set_backup_fs_root_level(root_backup,
  			       btrfs_header_level(info->fs_root->node));
  	}

  	/* Device root. */
  	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
  	btrfs_set_backup_dev_root_gen(root_backup,
  			       btrfs_header_generation(info->dev_root->node));
  	btrfs_set_backup_dev_root_level(root_backup,
  				       btrfs_header_level(info->dev_root->node));

  	/* Global counters are taken from the in-memory super copy. */
  	btrfs_set_backup_total_bytes(root_backup,
  			     btrfs_super_total_bytes(info->super_copy));
  	btrfs_set_backup_bytes_used(root_backup,
  			     btrfs_super_bytes_used(info->super_copy));
  	btrfs_set_backup_num_devices(root_backup,
  			     btrfs_super_num_devices(info->super_copy));

  	/*
  	 * if we don't copy this out to the super_copy, it won't get remembered
  	 * for the next commit
  	 */
  	memcpy(&info->super_copy->super_roots,
  	       &info->super_for_commit->super_roots,
  	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);

  	return 0;
  }
  1454. /*
  1455. * Reads a backup root based on the passed priority. Prio 0 is the newest, prio
  1456. * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
  1457. *
  1458. * @fs_info: filesystem whose backup roots need to be read
  1459. * @priority: priority of backup root required
  1460. *
  1461. * Returns backup root index on success and -EINVAL otherwise.
  1462. */
  1463. static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
  1464. {
  1465. int backup_index = find_newest_super_backup(fs_info);
  1466. struct btrfs_super_block *super = fs_info->super_copy;
  1467. struct btrfs_root_backup *root_backup;
  1468. if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
  1469. if (priority == 0)
  1470. return backup_index;
  1471. backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
  1472. backup_index %= BTRFS_NUM_BACKUP_ROOTS;
  1473. } else {
  1474. return -EINVAL;
  1475. }
  1476. root_backup = super->super_roots + backup_index;
  1477. btrfs_set_super_generation(super,
  1478. btrfs_backup_tree_root_gen(root_backup));
  1479. btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
  1480. btrfs_set_super_root_level(super,
  1481. btrfs_backup_tree_root_level(root_backup));
  1482. btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
  1483. /*
  1484. * Fixme: the total bytes and num_devices need to match or we should
  1485. * need a fsck
  1486. */
  1487. btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
  1488. btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
  1489. return backup_index;
  1490. }
  /*
   * Helper to cleanup workers: destroy every workqueue referenced by fs_info.
   *
   * Queues torn down with the generic destroy_workqueue() are NULL-checked
   * first; queues torn down with btrfs_destroy_workqueue() are passed as-is
   * (NOTE(review): presumably that helper tolerates NULL — confirm).
   *
   * The metadata endio queue is destroyed last, see the comment below.
   */
  static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
  {
  	btrfs_destroy_workqueue(fs_info->fixup_workers);
  	btrfs_destroy_workqueue(fs_info->delalloc_workers);
  	btrfs_destroy_workqueue(fs_info->workers);
  	if (fs_info->endio_workers)
  		destroy_workqueue(fs_info->endio_workers);
  	if (fs_info->rmw_workers)
  		destroy_workqueue(fs_info->rmw_workers);
  	btrfs_destroy_workqueue(fs_info->endio_write_workers);
  	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
  	btrfs_destroy_workqueue(fs_info->delayed_workers);
  	btrfs_destroy_workqueue(fs_info->caching_workers);
  	btrfs_destroy_workqueue(fs_info->flush_workers);
  	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
  	if (fs_info->discard_ctl.discard_workers)
  		destroy_workqueue(fs_info->discard_ctl.discard_workers);
  	/*
  	 * Now that all other work queues are destroyed, we can safely destroy
  	 * the queues used for metadata I/O, since tasks from those other work
  	 * queues can do metadata I/O operations.
  	 */
  	if (fs_info->endio_meta_workers)
  		destroy_workqueue(fs_info->endio_meta_workers);
  }
  1517. static void free_root_extent_buffers(struct btrfs_root *root)
  1518. {
  1519. if (root) {
  1520. free_extent_buffer(root->node);
  1521. free_extent_buffer(root->commit_root);
  1522. root->node = NULL;
  1523. root->commit_root = NULL;
  1524. }
  1525. }
  1526. static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
  1527. {
  1528. struct btrfs_root *root, *tmp;
  1529. rbtree_postorder_for_each_entry_safe(root, tmp,
  1530. &fs_info->global_root_tree,
  1531. rb_node)
  1532. free_root_extent_buffers(root);
  1533. }
  1534. /* helper to cleanup tree roots */
  1535. static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
  1536. {
  1537. free_root_extent_buffers(info->tree_root);
  1538. free_global_root_pointers(info);
  1539. free_root_extent_buffers(info->dev_root);
  1540. free_root_extent_buffers(info->quota_root);
  1541. free_root_extent_buffers(info->uuid_root);
  1542. free_root_extent_buffers(info->fs_root);
  1543. free_root_extent_buffers(info->data_reloc_root);
  1544. free_root_extent_buffers(info->block_group_root);
  1545. free_root_extent_buffers(info->stripe_root);
  1546. free_root_extent_buffers(info->remap_root);
  1547. if (free_chunk_root)
  1548. free_root_extent_buffers(info->chunk_root);
  1549. }
  1550. void btrfs_put_root(struct btrfs_root *root)
  1551. {
  1552. if (!root)
  1553. return;
  1554. if (refcount_dec_and_test(&root->refs)) {
  1555. if (WARN_ON(!xa_empty(&root->inodes)))
  1556. xa_destroy(&root->inodes);
  1557. if (WARN_ON(!xa_empty(&root->delayed_nodes)))
  1558. xa_destroy(&root->delayed_nodes);
  1559. WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
  1560. if (root->anon_dev)
  1561. free_anon_bdev(root->anon_dev);
  1562. free_root_extent_buffers(root);
  1563. #ifdef CONFIG_BTRFS_DEBUG
  1564. spin_lock(&root->fs_info->fs_roots_radix_lock);
  1565. list_del_init(&root->leak_list);
  1566. spin_unlock(&root->fs_info->fs_roots_radix_lock);
  1567. #endif
  1568. kfree(root);
  1569. }
  1570. }
  1571. void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
  1572. {
  1573. int ret;
  1574. struct btrfs_root *gang[8];
  1575. int i;
  1576. while (!list_empty(&fs_info->dead_roots)) {
  1577. gang[0] = list_first_entry(&fs_info->dead_roots,
  1578. struct btrfs_root, root_list);
  1579. list_del(&gang[0]->root_list);
  1580. if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
  1581. btrfs_drop_and_free_fs_root(fs_info, gang[0]);
  1582. btrfs_put_root(gang[0]);
  1583. }
  1584. while (1) {
  1585. ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
  1586. (void **)gang, 0,
  1587. ARRAY_SIZE(gang));
  1588. if (!ret)
  1589. break;
  1590. for (i = 0; i < ret; i++)
  1591. btrfs_drop_and_free_fs_root(fs_info, gang[i]);
  1592. }
  1593. }
  1594. static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
  1595. {
  1596. mutex_init(&fs_info->scrub_lock);
  1597. atomic_set(&fs_info->scrubs_running, 0);
  1598. atomic_set(&fs_info->scrub_pause_req, 0);
  1599. atomic_set(&fs_info->scrubs_paused, 0);
  1600. atomic_set(&fs_info->scrub_cancel_req, 0);
  1601. init_waitqueue_head(&fs_info->scrub_pause_wait);
  1602. refcount_set(&fs_info->scrub_workers_refcnt, 0);
  1603. }
  1604. static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
  1605. {
  1606. spin_lock_init(&fs_info->balance_lock);
  1607. mutex_init(&fs_info->balance_mutex);
  1608. atomic_set(&fs_info->balance_pause_req, 0);
  1609. atomic_set(&fs_info->balance_cancel_req, 0);
  1610. fs_info->balance_ctl = NULL;
  1611. init_waitqueue_head(&fs_info->balance_wait_q);
  1612. atomic_set(&fs_info->reloc_cancel_req, 0);
  1613. }
  /*
   * Allocate and set up the btree inode and store it in fs_info->btree_inode.
   *
   * @sb: super block of the filesystem being set up
   *
   * The inode gets objectid BTRFS_BTREE_INODE_OBJECTID, uses btree_aops for
   * its mapping, is attached to the tree root and flagged BTRFS_INODE_DUMMY.
   *
   * Returns 0 on success or -ENOMEM if the inode cannot be allocated.
   */
  static int btrfs_init_btree_inode(struct super_block *sb)
  {
  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
  	unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
  					      fs_info->tree_root);
  	struct inode *inode;

  	inode = new_inode(sb);
  	if (!inode)
  		return -ENOMEM;

  	btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID);
  	set_nlink(inode, 1);
  	/*
  	 * we set the i_size on the btree inode to the max possible int.
  	 * the real end of the address space is determined by all of
  	 * the devices in the system
  	 */
  	inode->i_size = OFFSET_MAX;
  	inode->i_mapping->a_ops = &btree_aops;
  	/* Page cache allocations for this mapping use GFP_NOFS. */
  	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);

  	btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
  				  IO_TREE_BTREE_INODE_IO);
  	btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree);

  	/* The inode holds its own reference on the tree root. */
  	BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
  	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
  	__insert_inode_hash(inode, hash);
  	set_bit(AS_KERNEL_FILE, &inode->i_mapping->flags);
  	fs_info->btree_inode = inode;

  	return 0;
  }
  1643. static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
  1644. {
  1645. mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
  1646. init_rwsem(&fs_info->dev_replace.rwsem);
  1647. init_waitqueue_head(&fs_info->dev_replace.replace_wait);
  1648. }
  1649. static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
  1650. {
  1651. spin_lock_init(&fs_info->qgroup_lock);
  1652. mutex_init(&fs_info->qgroup_ioctl_lock);
  1653. fs_info->qgroup_tree = RB_ROOT;
  1654. INIT_LIST_HEAD(&fs_info->dirty_qgroups);
  1655. fs_info->qgroup_seq = 1;
  1656. fs_info->qgroup_rescan_running = false;
  1657. fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
  1658. mutex_init(&fs_info->qgroup_rescan_lock);
  1659. }
  1660. static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
  1661. {
  1662. u32 max_active = fs_info->thread_pool_size;
  1663. unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
  1664. unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU;
  1665. fs_info->workers =
  1666. btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
  1667. fs_info->delalloc_workers =
  1668. btrfs_alloc_workqueue(fs_info, "delalloc",
  1669. flags, max_active, 2);
  1670. fs_info->flush_workers =
  1671. btrfs_alloc_workqueue(fs_info, "flush_delalloc",
  1672. flags, max_active, 0);
  1673. fs_info->caching_workers =
  1674. btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
  1675. fs_info->fixup_workers =
  1676. btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags);
  1677. fs_info->endio_workers =
  1678. alloc_workqueue("btrfs-endio", flags, max_active);
  1679. fs_info->endio_meta_workers =
  1680. alloc_workqueue("btrfs-endio-meta", flags, max_active);
  1681. fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
  1682. fs_info->endio_write_workers =
  1683. btrfs_alloc_workqueue(fs_info, "endio-write", flags,
  1684. max_active, 2);
  1685. fs_info->endio_freespace_worker =
  1686. btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
  1687. max_active, 0);
  1688. fs_info->delayed_workers =
  1689. btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
  1690. max_active, 0);
  1691. fs_info->qgroup_rescan_workers =
  1692. btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
  1693. ordered_flags);
  1694. fs_info->discard_ctl.discard_workers =
  1695. alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE);
  1696. if (!(fs_info->workers &&
  1697. fs_info->delalloc_workers && fs_info->flush_workers &&
  1698. fs_info->endio_workers && fs_info->endio_meta_workers &&
  1699. fs_info->endio_write_workers &&
  1700. fs_info->endio_freespace_worker && fs_info->rmw_workers &&
  1701. fs_info->caching_workers && fs_info->fixup_workers &&
  1702. fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
  1703. fs_info->discard_ctl.discard_workers)) {
  1704. return -ENOMEM;
  1705. }
  1706. return 0;
  1707. }
  1708. static void btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
  1709. {
  1710. /* Check if the checksum implementation is a fast accelerated one. */
  1711. switch (csum_type) {
  1712. case BTRFS_CSUM_TYPE_CRC32:
  1713. if (crc32_optimizations() & CRC32C_OPTIMIZATION)
  1714. set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
  1715. break;
  1716. case BTRFS_CSUM_TYPE_XXHASH:
  1717. set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
  1718. break;
  1719. default:
  1720. break;
  1721. }
  1722. btrfs_info(fs_info, "using %s checksum algorithm",
  1723. btrfs_super_csum_name(csum_type));
  1724. }
  /*
   * Read the log tree root recorded in the super block and replay it.
   *
   * @fs_info:    filesystem being mounted
   * @fs_devices: its devices; at least one must be writeable
   *
   * A temporary root with objectid BTRFS_TREE_LOG_OBJECTID is allocated, its
   * node is read from the bytenr/level stored in the super block copy and it
   * is handed to btrfs_recover_log_trees().  If the super block is mounted
   * read-only, the result is committed with btrfs_commit_super().
   *
   * Returns 0 on success, -EIO if there is no writeable device or the log tree
   * block is not uptodate, -ENOMEM if the root cannot be allocated, or the
   * error from reading the block, from log recovery or from the commit.
   */
  static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
  			    struct btrfs_fs_devices *fs_devices)
  {
  	int ret;
  	struct btrfs_tree_parent_check check = { 0 };
  	struct btrfs_root *log_tree_root;
  	struct btrfs_super_block *disk_super = fs_info->super_copy;
  	u64 bytenr = btrfs_super_log_root(disk_super);
  	int level = btrfs_super_log_root_level(disk_super);

  	if (unlikely(fs_devices->rw_devices == 0)) {
  		btrfs_err(fs_info, "log replay required on RO media");
  		return -EIO;
  	}

  	log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
  					 GFP_KERNEL);
  	if (!log_tree_root)
  		return -ENOMEM;

  	/* Expected level, transid and owner of the log tree root block. */
  	check.level = level;
  	check.transid = fs_info->generation + 1;
  	check.owner_root = BTRFS_TREE_LOG_OBJECTID;
  	log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
  	if (IS_ERR(log_tree_root->node)) {
  		ret = PTR_ERR(log_tree_root->node);
  		/* Do not leave an error pointer behind for btrfs_put_root(). */
  		log_tree_root->node = NULL;
  		btrfs_err(fs_info, "failed to read log tree with error: %d", ret);
  		btrfs_put_root(log_tree_root);
  		return ret;
  	}
  	if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) {
  		btrfs_err(fs_info, "failed to read log tree");
  		btrfs_put_root(log_tree_root);
  		return -EIO;
  	}

  	/*
  	 * returns with log_tree_root freed on success
  	 *
  	 * NOTE(review): the reference is dropped unconditionally right below,
  	 * which seems at odds with the sentence above - confirm which one is
  	 * accurate.
  	 */
  	ret = btrfs_recover_log_trees(log_tree_root);
  	btrfs_put_root(log_tree_root);
  	if (unlikely(ret)) {
  		ASSERT(BTRFS_FS_ERROR(fs_info) != 0);
  		btrfs_err(fs_info, "failed to recover log trees with error: %d", ret);
  		return ret;
  	}

  	if (sb_rdonly(fs_info->sb)) {
  		ret = btrfs_commit_super(fs_info);
  		if (ret)
  			return ret;
  	}

  	return 0;
  }
  /*
   * Load every global root with the given objectid from the tree root.
   *
   * @tree_root: root of the tree of tree roots to search
   * @path:      caller provided path, released before returning
   * @objectid:  objectid of the global roots to load
   * @name:      human readable name used in the failure message
   *
   * Root items are looked up starting at key offset 0; every root found is
   * flagged BTRFS_ROOT_TRACK_DIRTY and inserted with
   * btrfs_global_root_insert().  For the extent tree the highest key offset
   * seen determines fs_info->nr_global_roots.
   *
   * Csum roots are skipped entirely (returning 0) when IGNOREDATACSUMS is set,
   * and BTRFS_FS_STATE_NO_DATA_CSUMS is set both in that case and when loading
   * csum roots fails.
   *
   * Returns 0 on success.  If no root was found or an error occurred, an error
   * is logged and the error code (or -ENOENT) is returned, unless
   * IGNOREBADROOTS is set, in which case 0 is returned.
   */
  static int load_global_roots_objectid(struct btrfs_root *tree_root,
  				      struct btrfs_path *path, u64 objectid,
  				      const char *name)
  {
  	struct btrfs_fs_info *fs_info = tree_root->fs_info;
  	struct btrfs_root *root;
  	u64 max_global_id = 0;
  	int ret;
  	struct btrfs_key key = {
  		.objectid = objectid,
  		.type = BTRFS_ROOT_ITEM_KEY,
  		.offset = 0,
  	};
  	bool found = false;

  	/* If we have IGNOREDATACSUMS skip loading these roots. */
  	if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
  	    btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
  		set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
  		return 0;
  	}

  	while (1) {
  		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
  		if (ret < 0)
  			break;

  		/* Landed past the last item of the leaf, move to the next. */
  		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
  			ret = btrfs_next_leaf(tree_root, path);
  			if (ret) {
  				/* Positive means no more leaves, not an error. */
  				if (ret > 0)
  					ret = 0;
  				break;
  			}
  		}
  		ret = 0;

  		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
  		/* Ran past the last item with the wanted objectid. */
  		if (key.objectid != objectid)
  			break;
  		btrfs_release_path(path);

  		/*
  		 * Just worry about this for extent tree, it'll be the same for
  		 * everybody.
  		 */
  		if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
  			max_global_id = max(max_global_id, key.offset);

  		found = true;
  		root = read_tree_root_path(tree_root, path, &key);
  		if (IS_ERR(root)) {
  			ret = PTR_ERR(root);
  			break;
  		}
  		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
  		ret = btrfs_global_root_insert(root);
  		if (ret) {
  			btrfs_put_root(root);
  			break;
  		}
  		/* Continue the search right after the root just loaded. */
  		key.offset++;
  	}
  	btrfs_release_path(path);

  	/* Set even when nothing was found; the count is then 1. */
  	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
  		fs_info->nr_global_roots = max_global_id + 1;

  	if (!found || ret) {
  		if (objectid == BTRFS_CSUM_TREE_OBJECTID)
  			set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);

  		/* IGNOREBADROOTS turns the failure into success. */
  		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
  			ret = ret ? ret : -ENOENT;
  		else
  			ret = 0;
  		btrfs_err(fs_info, "failed to load root %s", name);
  	}
  	return ret;
  }
  1844. static int load_global_roots(struct btrfs_root *tree_root)
  1845. {
  1846. BTRFS_PATH_AUTO_FREE(path);
  1847. int ret;
  1848. path = btrfs_alloc_path();
  1849. if (!path)
  1850. return -ENOMEM;
  1851. ret = load_global_roots_objectid(tree_root, path,
  1852. BTRFS_EXTENT_TREE_OBJECTID, "extent");
  1853. if (ret)
  1854. return ret;
  1855. ret = load_global_roots_objectid(tree_root, path,
  1856. BTRFS_CSUM_TREE_OBJECTID, "csum");
  1857. if (ret)
  1858. return ret;
  1859. if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
  1860. return ret;
  1861. return load_global_roots_objectid(tree_root, path,
  1862. BTRFS_FREE_SPACE_TREE_OBJECTID,
  1863. "free space");
  1864. }
  /*
   * Read the roots referenced from the tree root and store them in fs_info.
   *
   * @fs_info: filesystem being mounted; fs_info->tree_root must be set
   *
   * Loads the global roots first, then the block group root (if the
   * BLOCK_GROUP_TREE feature is set), the device root, either the remap root
   * setup or the data reloc root, the quota root, the uuid root and the raid
   * stripe root (if the RAID_STRIPE_TREE feature is set).  Every root stored
   * is flagged BTRFS_ROOT_TRACK_DIRTY.
   *
   * With IGNOREBADROOTS set, a root that fails to load is skipped.  A missing
   * quota root is never an error, and a uuid root that fails with -ENOENT is
   * tolerated as well.
   *
   * Returns 0 on success or a negative errno; failures taking the 'out' path
   * are logged with the objectid of the root that could not be read.
   */
  static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
  {
  	struct btrfs_root *tree_root = fs_info->tree_root;
  	struct btrfs_root *root;
  	/* Key of the root item currently being looked up. */
  	struct btrfs_key location;
  	int ret;

  	ASSERT(fs_info->tree_root);

  	ret = load_global_roots(tree_root);
  	if (ret)
  		return ret;

  	location.type = BTRFS_ROOT_ITEM_KEY;
  	location.offset = 0;

  	if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
  		location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
  		root = btrfs_read_tree_root(tree_root, &location);
  		if (IS_ERR(root)) {
  			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
  				ret = PTR_ERR(root);
  				goto out;
  			}
  		} else {
  			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
  			fs_info->block_group_root = root;
  		}
  	}

  	location.objectid = BTRFS_DEV_TREE_OBJECTID;
  	root = btrfs_read_tree_root(tree_root, &location);
  	if (IS_ERR(root)) {
  		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
  			ret = PTR_ERR(root);
  			goto out;
  		}
  	} else {
  		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
  		fs_info->dev_root = root;
  	}
  	/* Initialize fs_info for all devices in any case */
  	ret = btrfs_init_devices_late(fs_info);
  	if (ret)
  		goto out;

  	if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
  		/* The remap_root has already been loaded in load_important_roots(). */
  		root = fs_info->remap_root;

  		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);

  		root->root_key.objectid = BTRFS_REMAP_TREE_OBJECTID;
  		root->root_key.type = BTRFS_ROOT_ITEM_KEY;
  		root->root_key.offset = 0;

  		/* Check that data reloc tree doesn't also exist. */
  		location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
  		root = btrfs_read_tree_root(fs_info->tree_root, &location);
  		if (!IS_ERR(root)) {
  			btrfs_err(fs_info, "data reloc tree exists when remap-tree enabled");
  			btrfs_put_root(root);
  			return -EIO;
  		} else if (PTR_ERR(root) != -ENOENT) {
  			/* Any other lookup error is only warned about. */
  			btrfs_warn(fs_info, "error %ld when checking for data reloc tree",
  				   PTR_ERR(root));
  		}
  	} else {
  		/*
  		 * This tree can share blocks with some other fs tree during
  		 * relocation and we need a proper setup by btrfs_get_fs_root().
  		 */
  		root = btrfs_get_fs_root(tree_root->fs_info,
  					 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
  		if (IS_ERR(root)) {
  			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
  				/* Only set so the warning at 'out' names this root. */
  				location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
  				ret = PTR_ERR(root);
  				goto out;
  			}
  		} else {
  			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
  			fs_info->data_reloc_root = root;
  		}
  	}

  	/* The quota root is optional, a failed lookup is ignored. */
  	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
  	root = btrfs_read_tree_root(tree_root, &location);
  	if (!IS_ERR(root)) {
  		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
  		fs_info->quota_root = root;
  	}

  	location.objectid = BTRFS_UUID_TREE_OBJECTID;
  	root = btrfs_read_tree_root(tree_root, &location);
  	if (IS_ERR(root)) {
  		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
  			ret = PTR_ERR(root);
  			/* A missing uuid tree is tolerated. */
  			if (ret != -ENOENT)
  				goto out;
  		}
  	} else {
  		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
  		fs_info->uuid_root = root;
  	}

  	if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
  		location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
  		root = btrfs_read_tree_root(tree_root, &location);
  		if (IS_ERR(root)) {
  			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
  				ret = PTR_ERR(root);
  				goto out;
  			}
  		} else {
  			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
  			fs_info->stripe_root = root;
  		}
  	}

  	return 0;
  out:
  	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
  		   location.objectid, ret);
  	return ret;
  }
  /*
   * Validate the system chunk array embedded in a super block.
   *
   * @fs_info: filesystem, used for messages and passed to the chunk checker
   * @sb:      super block whose sys_chunk_array is checked
   *
   * The array is walked as a sequence of (btrfs_disk_key, btrfs_chunk) pairs.
   * Every key must be of type BTRFS_CHUNK_ITEM_KEY, every chunk must carry the
   * BTRFS_BLOCK_GROUP_SYSTEM type bit and pass btrfs_check_chunk_valid(), and
   * no entry may extend beyond the array size recorded in the super block.
   *
   * Returns 0 if the array is valid, -EUCLEAN for an oversized array, an
   * unexpected key type, a non-system chunk or a truncated entry, or the
   * negative error returned by btrfs_check_chunk_valid().
   */
  static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
  				    const struct btrfs_super_block *sb)
  {
  	unsigned int cur = 0;	/* Offset inside the sys chunk array */
  	/*
  	 * At sb read time, fs_info is not fully initialized. Thus we have
  	 * to use super block sectorsize, which should have been validated.
  	 */
  	const u32 sectorsize = btrfs_super_sectorsize(sb);
  	u32 sys_array_size = btrfs_super_sys_array_size(sb);

  	if (unlikely(sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)) {
  		btrfs_err(fs_info, "system chunk array too big %u > %u",
  			  sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
  		return -EUCLEAN;
  	}

  	while (cur < sys_array_size) {
  		struct btrfs_disk_key *disk_key;
  		struct btrfs_chunk *chunk;
  		struct btrfs_key key;
  		u64 type;
  		u16 num_stripes;
  		u32 len;
  		int ret;

  		/* Each entry starts with an on-disk key. */
  		disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur);
  		len = sizeof(*disk_key);

  		if (unlikely(cur + len > sys_array_size))
  			goto short_read;
  		cur += len;

  		btrfs_disk_key_to_cpu(&key, disk_key);
  		if (unlikely(key.type != BTRFS_CHUNK_ITEM_KEY)) {
  			btrfs_err(fs_info,
  			    "unexpected item type %u in sys_array at offset %u",
  				  key.type, cur);
  			return -EUCLEAN;
  		}
  		/* The chunk item follows; its size depends on the stripe count. */
  		chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur);
  		num_stripes = btrfs_stack_chunk_num_stripes(chunk);
  		if (unlikely(cur + btrfs_chunk_item_size(num_stripes) > sys_array_size))
  			goto short_read;
  		type = btrfs_stack_chunk_type(chunk);
  		if (unlikely(!(type & BTRFS_BLOCK_GROUP_SYSTEM))) {
  			btrfs_err(fs_info,
  			"invalid chunk type %llu in sys_array at offset %u",
  				  type, cur);
  			return -EUCLEAN;
  		}
  		/* key.offset is handed to the checker along with the chunk. */
  		ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset,
  					      sectorsize);
  		if (ret < 0)
  			return ret;
  		cur += btrfs_chunk_item_size(num_stripes);
  	}
  	return 0;
  short_read:
  	btrfs_err(fs_info,
  	"super block sys chunk array short read, cur=%u sys_array_size=%u",
  		  cur, sys_array_size);
  	return -EUCLEAN;
  }
  2037. /*
  2038. * Real super block validation
  2039. * NOTE: super csum type and incompat features will not be checked here.
  2040. *
  2041. * @sb: super block to check
  2042. * @mirror_num: the super block number to check its bytenr:
  2043. * 0 the primary (1st) sb
  2044. * 1, 2 2nd and 3rd backup copy
  2045. * -1 skip bytenr check
  2046. */
  2047. int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
  2048. const struct btrfs_super_block *sb, int mirror_num)
  2049. {
  2050. u64 nodesize = btrfs_super_nodesize(sb);
  2051. u64 sectorsize = btrfs_super_sectorsize(sb);
  2052. int ret = 0;
  2053. const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS);
  2054. if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
  2055. btrfs_err(fs_info, "no valid FS found");
  2056. ret = -EINVAL;
  2057. }
  2058. if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) {
  2059. if (!ignore_flags) {
  2060. btrfs_err(fs_info,
  2061. "unrecognized or unsupported super flag 0x%llx",
  2062. btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
  2063. ret = -EINVAL;
  2064. } else {
  2065. btrfs_info(fs_info,
  2066. "unrecognized or unsupported super flags: 0x%llx, ignored",
  2067. btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
  2068. }
  2069. }
  2070. if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
  2071. btrfs_err(fs_info, "tree_root level too big: %d >= %d",
  2072. btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
  2073. ret = -EINVAL;
  2074. }
  2075. if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
  2076. btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
  2077. btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
  2078. ret = -EINVAL;
  2079. }
  2080. if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
  2081. btrfs_err(fs_info, "log_root level too big: %d >= %d",
  2082. btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
  2083. ret = -EINVAL;
  2084. }
  2085. /*
  2086. * Check sectorsize and nodesize first, other check will need it.
  2087. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
  2088. */
  2089. if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE ||
  2090. sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
  2091. btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
  2092. ret = -EINVAL;
  2093. }
  2094. if (!btrfs_supported_blocksize(sectorsize)) {
  2095. btrfs_err(fs_info,
  2096. "sectorsize %llu not yet supported for page size %lu",
  2097. sectorsize, PAGE_SIZE);
  2098. ret = -EINVAL;
  2099. }
  2100. if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
  2101. nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
  2102. btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
  2103. ret = -EINVAL;
  2104. }
  2105. if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
  2106. btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
  2107. le32_to_cpu(sb->__unused_leafsize), nodesize);
  2108. ret = -EINVAL;
  2109. }
  2110. /* Root alignment check */
  2111. if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
  2112. btrfs_err(fs_info, "tree_root block unaligned: %llu",
  2113. btrfs_super_root(sb));
  2114. ret = -EINVAL;
  2115. }
  2116. if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
  2117. btrfs_err(fs_info, "chunk_root block unaligned: %llu",
  2118. btrfs_super_chunk_root(sb));
  2119. ret = -EINVAL;
  2120. }
  2121. if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
  2122. btrfs_err(fs_info, "log_root block unaligned: %llu",
  2123. btrfs_super_log_root(sb));
  2124. ret = -EINVAL;
  2125. }
  2126. if (!fs_info->fs_devices->temp_fsid &&
  2127. memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
  2128. btrfs_err(fs_info,
  2129. "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
  2130. sb->fsid, fs_info->fs_devices->fsid);
  2131. ret = -EINVAL;
  2132. }
  2133. if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb),
  2134. BTRFS_FSID_SIZE) != 0) {
  2135. btrfs_err(fs_info,
  2136. "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
  2137. btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
  2138. ret = -EINVAL;
  2139. }
  2140. if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
  2141. BTRFS_FSID_SIZE) != 0) {
  2142. btrfs_err(fs_info,
  2143. "dev_item UUID does not match metadata fsid: %pU != %pU",
  2144. fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
  2145. ret = -EINVAL;
  2146. }
  2147. /*
  2148. * Artificial requirement for block-group-tree to force newer features
  2149. * (free-space-tree, no-holes) so the test matrix is smaller.
  2150. */
  2151. if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
  2152. (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
  2153. !btrfs_fs_incompat(fs_info, NO_HOLES))) {
  2154. btrfs_err(fs_info,
  2155. "block-group-tree feature requires free-space-tree and no-holes");
  2156. ret = -EINVAL;
  2157. }
  2158. if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
  2159. /*
  2160. * Reduce test matrix for remap tree by requiring block-group-tree
  2161. * and no-holes. Free-space-tree is a hard requirement.
  2162. */
  2163. if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
  2164. !btrfs_fs_incompat(fs_info, NO_HOLES) ||
  2165. !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
  2166. btrfs_err(fs_info,
  2167. "remap-tree feature requires free-space-tree, no-holes, and block-group-tree");
  2168. ret = -EINVAL;
  2169. }
  2170. if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
  2171. btrfs_err(fs_info, "remap-tree not supported with mixed-bg");
  2172. ret = -EINVAL;
  2173. }
  2174. if (btrfs_fs_incompat(fs_info, ZONED)) {
  2175. btrfs_err(fs_info, "remap-tree not supported with zoned devices");
  2176. ret = -EINVAL;
  2177. }
  2178. if (sectorsize > PAGE_SIZE) {
  2179. btrfs_err(fs_info, "remap-tree not supported when block size > page size");
  2180. ret = -EINVAL;
  2181. }
  2182. }
  2183. /*
  2184. * Hint to catch really bogus numbers, bitflips or so, more exact checks are
  2185. * done later
  2186. */
  2187. if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
  2188. btrfs_err(fs_info, "bytes_used is too small %llu",
  2189. btrfs_super_bytes_used(sb));
  2190. ret = -EINVAL;
  2191. }
  2192. if (!is_power_of_2(btrfs_super_stripesize(sb))) {
  2193. btrfs_err(fs_info, "invalid stripesize %u",
  2194. btrfs_super_stripesize(sb));
  2195. ret = -EINVAL;
  2196. }
  2197. if (btrfs_super_num_devices(sb) > (1UL << 31))
  2198. btrfs_warn(fs_info, "suspicious number of devices: %llu",
  2199. btrfs_super_num_devices(sb));
  2200. if (btrfs_super_num_devices(sb) == 0) {
  2201. btrfs_err(fs_info, "number of devices is 0");
  2202. ret = -EINVAL;
  2203. }
  2204. if (mirror_num >= 0 &&
  2205. btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
  2206. btrfs_err(fs_info, "super offset mismatch %llu != %llu",
  2207. btrfs_super_bytenr(sb), btrfs_sb_offset(mirror_num));
  2208. ret = -EINVAL;
  2209. }
  2210. if (ret)
  2211. return ret;
  2212. ret = validate_sys_chunk_array(fs_info, sb);
  2213. /*
  2214. * Obvious sys_chunk_array corruptions, it must hold at least one key
  2215. * and one chunk
  2216. */
  2217. if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
  2218. btrfs_err(fs_info, "system chunk array too big %u > %u",
  2219. btrfs_super_sys_array_size(sb),
  2220. BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
  2221. ret = -EINVAL;
  2222. }
  2223. if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
  2224. + sizeof(struct btrfs_chunk)) {
  2225. btrfs_err(fs_info, "system chunk array too small %u < %zu",
  2226. btrfs_super_sys_array_size(sb),
  2227. sizeof(struct btrfs_disk_key)
  2228. + sizeof(struct btrfs_chunk));
  2229. ret = -EINVAL;
  2230. }
  2231. /*
  2232. * The generation is a global counter, we'll trust it more than the others
  2233. * but it's still possible that it's the one that's wrong.
  2234. */
  2235. if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
  2236. btrfs_warn(fs_info,
  2237. "suspicious: generation < chunk_root_generation: %llu < %llu",
  2238. btrfs_super_generation(sb),
  2239. btrfs_super_chunk_root_generation(sb));
  2240. if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
  2241. && btrfs_super_cache_generation(sb) != (u64)-1)
  2242. btrfs_warn(fs_info,
  2243. "suspicious: generation < cache_generation: %llu < %llu",
  2244. btrfs_super_generation(sb),
  2245. btrfs_super_cache_generation(sb));
  2246. return ret;
  2247. }
  2248. /*
  2249. * Validation of super block at mount time.
  2250. * Some checks already done early at mount time, like csum type and incompat
  2251. * flags will be skipped.
  2252. */
  2253. static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
  2254. {
  2255. return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
  2256. }
  2257. /*
  2258. * Validation of super block at write time.
  2259. * Some checks like bytenr check will be skipped as their values will be
  2260. * overwritten soon.
  2261. * Extra checks like csum type and incompat flags will be done here.
  2262. */
  2263. static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
  2264. struct btrfs_super_block *sb)
  2265. {
  2266. int ret;
  2267. ret = btrfs_validate_super(fs_info, sb, -1);
  2268. if (ret < 0)
  2269. goto out;
  2270. if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) {
  2271. ret = -EUCLEAN;
  2272. btrfs_err(fs_info, "invalid csum type, has %u want %u",
  2273. btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
  2274. goto out;
  2275. }
  2276. if (unlikely(btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP)) {
  2277. ret = -EUCLEAN;
  2278. btrfs_err(fs_info,
  2279. "invalid incompat flags, has 0x%llx valid mask 0x%llx",
  2280. btrfs_super_incompat_flags(sb),
  2281. (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
  2282. goto out;
  2283. }
  2284. out:
  2285. if (ret < 0)
  2286. btrfs_err(fs_info,
  2287. "super block corruption detected before writing it to disk");
  2288. return ret;
  2289. }
  2290. static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
  2291. {
  2292. struct btrfs_tree_parent_check check = {
  2293. .level = level,
  2294. .transid = gen,
  2295. .owner_root = btrfs_root_id(root)
  2296. };
  2297. int ret = 0;
  2298. root->node = read_tree_block(root->fs_info, bytenr, &check);
  2299. if (IS_ERR(root->node)) {
  2300. ret = PTR_ERR(root->node);
  2301. root->node = NULL;
  2302. return ret;
  2303. }
  2304. if (unlikely(!extent_buffer_uptodate(root->node))) {
  2305. free_extent_buffer(root->node);
  2306. root->node = NULL;
  2307. return -EIO;
  2308. }
  2309. btrfs_set_root_node(&root->root_item, root->node);
  2310. root->commit_root = btrfs_root_node(root);
  2311. btrfs_set_root_refs(&root->root_item, 1);
  2312. return ret;
  2313. }
  2314. static int load_important_roots(struct btrfs_fs_info *fs_info)
  2315. {
  2316. struct btrfs_super_block *sb = fs_info->super_copy;
  2317. u64 gen, bytenr;
  2318. int level, ret;
  2319. bytenr = btrfs_super_root(sb);
  2320. gen = btrfs_super_generation(sb);
  2321. level = btrfs_super_root_level(sb);
  2322. ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
  2323. if (ret) {
  2324. btrfs_warn(fs_info, "couldn't read tree root");
  2325. return ret;
  2326. }
  2327. if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
  2328. bytenr = btrfs_super_remap_root(sb);
  2329. gen = btrfs_super_remap_root_generation(sb);
  2330. level = btrfs_super_remap_root_level(sb);
  2331. ret = load_super_root(fs_info->remap_root, bytenr, gen, level);
  2332. if (ret) {
  2333. btrfs_warn(fs_info, "couldn't read remap root");
  2334. return ret;
  2335. }
  2336. }
  2337. return 0;
  2338. }
  2339. static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
  2340. {
  2341. int backup_index = find_newest_super_backup(fs_info);
  2342. struct btrfs_super_block *sb = fs_info->super_copy;
  2343. struct btrfs_root *tree_root = fs_info->tree_root;
  2344. bool handle_error = false;
  2345. int ret = 0;
  2346. int i;
  2347. for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
  2348. if (handle_error) {
  2349. if (!IS_ERR(tree_root->node))
  2350. free_extent_buffer(tree_root->node);
  2351. tree_root->node = NULL;
  2352. if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
  2353. break;
  2354. free_root_pointers(fs_info, 0);
  2355. /*
  2356. * Don't use the log in recovery mode, it won't be
  2357. * valid
  2358. */
  2359. btrfs_set_super_log_root(sb, 0);
  2360. btrfs_warn(fs_info, "try to load backup roots slot %d", i);
  2361. ret = read_backup_root(fs_info, i);
  2362. backup_index = ret;
  2363. if (ret < 0)
  2364. return ret;
  2365. }
  2366. ret = load_important_roots(fs_info);
  2367. if (ret) {
  2368. handle_error = true;
  2369. continue;
  2370. }
  2371. /*
  2372. * No need to hold btrfs_root::objectid_mutex since the fs
  2373. * hasn't been fully initialised and we are the only user
  2374. */
  2375. ret = btrfs_init_root_free_objectid(tree_root);
  2376. if (ret < 0) {
  2377. handle_error = true;
  2378. continue;
  2379. }
  2380. ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
  2381. ret = btrfs_read_roots(fs_info);
  2382. if (ret < 0) {
  2383. handle_error = true;
  2384. continue;
  2385. }
  2386. /* All successful */
  2387. fs_info->generation = btrfs_header_generation(tree_root->node);
  2388. btrfs_set_last_trans_committed(fs_info, fs_info->generation);
  2389. fs_info->last_reloc_trans = 0;
  2390. /* Always begin writing backup roots after the one being used */
  2391. if (backup_index < 0) {
  2392. fs_info->backup_root_index = 0;
  2393. } else {
  2394. fs_info->backup_root_index = backup_index + 1;
  2395. fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
  2396. }
  2397. break;
  2398. }
  2399. return ret;
  2400. }
  2401. /*
  2402. * Lockdep gets confused between our buffer_tree which requires IRQ locking because
  2403. * we modify marks in the IRQ context, and our delayed inode xarray which doesn't
  2404. * have these requirements. Use a class key so lockdep doesn't get them mixed up.
  2405. */
  2406. static struct lock_class_key buffer_xa_class;
  2407. void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
  2408. {
  2409. INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
  2410. /* Use the same flags as mapping->i_pages. */
  2411. xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
  2412. lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
  2413. INIT_LIST_HEAD(&fs_info->trans_list);
  2414. INIT_LIST_HEAD(&fs_info->dead_roots);
  2415. INIT_LIST_HEAD(&fs_info->delayed_iputs);
  2416. INIT_LIST_HEAD(&fs_info->delalloc_roots);
  2417. INIT_LIST_HEAD(&fs_info->caching_block_groups);
  2418. spin_lock_init(&fs_info->delalloc_root_lock);
  2419. spin_lock_init(&fs_info->trans_lock);
  2420. spin_lock_init(&fs_info->fs_roots_radix_lock);
  2421. spin_lock_init(&fs_info->delayed_iput_lock);
  2422. spin_lock_init(&fs_info->defrag_inodes_lock);
  2423. spin_lock_init(&fs_info->super_lock);
  2424. spin_lock_init(&fs_info->unused_bgs_lock);
  2425. spin_lock_init(&fs_info->treelog_bg_lock);
  2426. spin_lock_init(&fs_info->zone_active_bgs_lock);
  2427. spin_lock_init(&fs_info->relocation_bg_lock);
  2428. rwlock_init(&fs_info->tree_mod_log_lock);
  2429. rwlock_init(&fs_info->global_root_lock);
  2430. mutex_init(&fs_info->unused_bg_unpin_mutex);
  2431. mutex_init(&fs_info->reclaim_bgs_lock);
  2432. mutex_init(&fs_info->reloc_mutex);
  2433. mutex_init(&fs_info->delalloc_root_mutex);
  2434. mutex_init(&fs_info->zoned_meta_io_lock);
  2435. mutex_init(&fs_info->zoned_data_reloc_io_lock);
  2436. seqlock_init(&fs_info->profiles_lock);
  2437. btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
  2438. btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
  2439. btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
  2440. btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
  2441. btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep,
  2442. BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
  2443. btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
  2444. BTRFS_LOCKDEP_TRANS_UNBLOCKED);
  2445. btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
  2446. BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
  2447. btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
  2448. BTRFS_LOCKDEP_TRANS_COMPLETED);
  2449. INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
  2450. INIT_LIST_HEAD(&fs_info->space_info);
  2451. INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
  2452. INIT_LIST_HEAD(&fs_info->unused_bgs);
  2453. INIT_LIST_HEAD(&fs_info->reclaim_bgs);
  2454. INIT_LIST_HEAD(&fs_info->fully_remapped_bgs);
  2455. INIT_LIST_HEAD(&fs_info->zone_active_bgs);
  2456. #ifdef CONFIG_BTRFS_DEBUG
  2457. INIT_LIST_HEAD(&fs_info->allocated_roots);
  2458. INIT_LIST_HEAD(&fs_info->allocated_ebs);
  2459. spin_lock_init(&fs_info->eb_leak_lock);
  2460. #endif
  2461. fs_info->mapping_tree = RB_ROOT_CACHED;
  2462. rwlock_init(&fs_info->mapping_tree_lock);
  2463. btrfs_init_block_rsv(&fs_info->global_block_rsv,
  2464. BTRFS_BLOCK_RSV_GLOBAL);
  2465. btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
  2466. btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
  2467. btrfs_init_block_rsv(&fs_info->remap_block_rsv, BTRFS_BLOCK_RSV_REMAP);
  2468. btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
  2469. btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
  2470. btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
  2471. BTRFS_BLOCK_RSV_DELOPS);
  2472. btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
  2473. BTRFS_BLOCK_RSV_DELREFS);
  2474. atomic_set(&fs_info->async_delalloc_pages, 0);
  2475. atomic_set(&fs_info->defrag_running, 0);
  2476. atomic_set(&fs_info->nr_delayed_iputs, 0);
  2477. atomic64_set(&fs_info->tree_mod_seq, 0);
  2478. fs_info->global_root_tree = RB_ROOT;
  2479. fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
  2480. fs_info->metadata_ratio = 0;
  2481. fs_info->defrag_inodes = RB_ROOT;
  2482. atomic64_set(&fs_info->free_chunk_space, 0);
  2483. fs_info->tree_mod_log = RB_ROOT;
  2484. fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
  2485. btrfs_init_ref_verify(fs_info);
  2486. fs_info->thread_pool_size = min_t(unsigned long,
  2487. num_online_cpus() + 2, 8);
  2488. INIT_LIST_HEAD(&fs_info->ordered_roots);
  2489. spin_lock_init(&fs_info->ordered_root_lock);
  2490. btrfs_init_scrub(fs_info);
  2491. btrfs_init_balance(fs_info);
  2492. btrfs_init_async_reclaim_work(fs_info);
  2493. btrfs_init_extent_map_shrinker_work(fs_info);
  2494. rwlock_init(&fs_info->block_group_cache_lock);
  2495. fs_info->block_group_cache_tree = RB_ROOT_CACHED;
  2496. btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents,
  2497. IO_TREE_FS_EXCLUDED_EXTENTS);
  2498. mutex_init(&fs_info->ordered_operations_mutex);
  2499. mutex_init(&fs_info->tree_log_mutex);
  2500. mutex_init(&fs_info->chunk_mutex);
  2501. mutex_init(&fs_info->transaction_kthread_mutex);
  2502. mutex_init(&fs_info->cleaner_mutex);
  2503. mutex_init(&fs_info->remap_mutex);
  2504. mutex_init(&fs_info->ro_block_group_mutex);
  2505. init_rwsem(&fs_info->commit_root_sem);
  2506. init_rwsem(&fs_info->cleanup_work_sem);
  2507. init_rwsem(&fs_info->subvol_sem);
  2508. sema_init(&fs_info->uuid_tree_rescan_sem, 1);
  2509. btrfs_init_dev_replace_locks(fs_info);
  2510. btrfs_init_qgroup(fs_info);
  2511. btrfs_discard_init(fs_info);
  2512. btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
  2513. btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
  2514. init_waitqueue_head(&fs_info->transaction_throttle);
  2515. init_waitqueue_head(&fs_info->transaction_wait);
  2516. init_waitqueue_head(&fs_info->transaction_blocked_wait);
  2517. init_waitqueue_head(&fs_info->async_submit_wait);
  2518. init_waitqueue_head(&fs_info->delayed_iputs_wait);
  2519. /* Usable values until the real ones are cached from the superblock */
  2520. fs_info->nodesize = 4096;
  2521. fs_info->sectorsize = 4096;
  2522. fs_info->sectorsize_bits = ilog2(4096);
  2523. fs_info->stripesize = 4096;
  2524. /* Default compress algorithm when user does -o compress */
  2525. fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
  2526. fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
  2527. spin_lock_init(&fs_info->swapfile_pins_lock);
  2528. fs_info->swapfile_pins = RB_ROOT;
  2529. fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
  2530. INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
  2531. }
  2532. static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
  2533. {
  2534. int ret;
  2535. fs_info->sb = sb;
  2536. /* Temporary fixed values for block size until we read the superblock. */
  2537. sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
  2538. sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
  2539. ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
  2540. if (ret)
  2541. return ret;
  2542. ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL);
  2543. if (ret)
  2544. return ret;
  2545. ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
  2546. if (ret)
  2547. return ret;
  2548. ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL);
  2549. if (ret)
  2550. return ret;
  2551. fs_info->dirty_metadata_batch = PAGE_SIZE *
  2552. (1 + ilog2(nr_cpu_ids));
  2553. ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
  2554. if (ret)
  2555. return ret;
  2556. ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
  2557. GFP_KERNEL);
  2558. if (ret)
  2559. return ret;
  2560. btrfs_init_delayed_root(&fs_info->delayed_root);
  2561. if (sb_rdonly(sb))
  2562. set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
  2563. if (btrfs_test_opt(fs_info, IGNOREMETACSUMS))
  2564. set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state);
  2565. return btrfs_alloc_stripe_hash_table(fs_info);
  2566. }
  2567. static int btrfs_uuid_rescan_kthread(void *data)
  2568. {
  2569. struct btrfs_fs_info *fs_info = data;
  2570. int ret;
  2571. /*
  2572. * 1st step is to iterate through the existing UUID tree and
  2573. * to delete all entries that contain outdated data.
  2574. * 2nd step is to add all missing entries to the UUID tree.
  2575. */
  2576. ret = btrfs_uuid_tree_iterate(fs_info);
  2577. if (ret < 0) {
  2578. if (ret != -EINTR)
  2579. btrfs_warn(fs_info, "iterating uuid_tree failed %d",
  2580. ret);
  2581. up(&fs_info->uuid_tree_rescan_sem);
  2582. return ret;
  2583. }
  2584. return btrfs_uuid_scan_kthread(data);
  2585. }
  2586. static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
  2587. {
  2588. struct task_struct *task;
  2589. down(&fs_info->uuid_tree_rescan_sem);
  2590. task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
  2591. if (IS_ERR(task)) {
  2592. /* fs_info->update_uuid_tree_gen remains 0 in all error case */
  2593. up(&fs_info->uuid_tree_rescan_sem);
  2594. return PTR_ERR(task);
  2595. }
  2596. return 0;
  2597. }
  2598. static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
  2599. {
  2600. u64 root_objectid = 0;
  2601. struct btrfs_root *gang[8];
  2602. int ret = 0;
  2603. while (1) {
  2604. unsigned int found;
  2605. spin_lock(&fs_info->fs_roots_radix_lock);
  2606. found = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
  2607. (void **)gang, root_objectid,
  2608. ARRAY_SIZE(gang));
  2609. if (!found) {
  2610. spin_unlock(&fs_info->fs_roots_radix_lock);
  2611. break;
  2612. }
  2613. root_objectid = btrfs_root_id(gang[found - 1]) + 1;
  2614. for (int i = 0; i < found; i++) {
  2615. /* Avoid to grab roots in dead_roots. */
  2616. if (btrfs_root_refs(&gang[i]->root_item) == 0) {
  2617. gang[i] = NULL;
  2618. continue;
  2619. }
  2620. /* Grab all the search result for later use. */
  2621. gang[i] = btrfs_grab_root(gang[i]);
  2622. }
  2623. spin_unlock(&fs_info->fs_roots_radix_lock);
  2624. for (int i = 0; i < found; i++) {
  2625. if (!gang[i])
  2626. continue;
  2627. root_objectid = btrfs_root_id(gang[i]);
  2628. /*
  2629. * Continue to release the remaining roots after the first
  2630. * error without cleanup and preserve the first error
  2631. * for the return.
  2632. */
  2633. if (!ret)
  2634. ret = btrfs_orphan_cleanup(gang[i]);
  2635. btrfs_put_root(gang[i]);
  2636. }
  2637. if (ret)
  2638. break;
  2639. root_objectid++;
  2640. }
  2641. return ret;
  2642. }
  2643. /*
  2644. * Mounting logic specific to read-write file systems. Shared by open_ctree
  2645. * and btrfs_remount when remounting from read-only to read-write.
  2646. */
  2647. int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
  2648. {
  2649. int ret;
  2650. const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
  2651. bool rebuild_free_space_tree = false;
  2652. if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
  2653. btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
  2654. if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
  2655. btrfs_warn(fs_info,
  2656. "'clear_cache' option is ignored with extent tree v2");
  2657. else if (btrfs_fs_incompat(fs_info, REMAP_TREE))
  2658. btrfs_warn(fs_info, "'clear_cache' option is ignored with remap tree");
  2659. else
  2660. rebuild_free_space_tree = true;
  2661. } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
  2662. !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
  2663. btrfs_warn(fs_info, "free space tree is invalid");
  2664. rebuild_free_space_tree = true;
  2665. }
  2666. if (rebuild_free_space_tree) {
  2667. btrfs_info(fs_info, "rebuilding free space tree");
  2668. ret = btrfs_rebuild_free_space_tree(fs_info);
  2669. if (ret) {
  2670. btrfs_warn(fs_info,
  2671. "failed to rebuild free space tree: %d", ret);
  2672. return ret;
  2673. }
  2674. }
  2675. if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
  2676. !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
  2677. btrfs_info(fs_info, "disabling free space tree");
  2678. ret = btrfs_delete_free_space_tree(fs_info);
  2679. if (ret) {
  2680. btrfs_warn(fs_info,
  2681. "failed to disable free space tree: %d", ret);
  2682. return ret;
  2683. }
  2684. }
  2685. /*
  2686. * Before btrfs-progs v6.16.1 mkfs.btrfs can leave free space entries
  2687. * for deleted temporary chunks. Delete them if they exist.
  2688. */
  2689. ret = btrfs_delete_orphan_free_space_entries(fs_info);
  2690. if (ret < 0) {
  2691. btrfs_err(fs_info, "failed to delete orphan free space tree entries: %d", ret);
  2692. return ret;
  2693. }
  2694. /*
  2695. * btrfs_find_orphan_roots() is responsible for finding all the dead
  2696. * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
  2697. * them into the fs_info->fs_roots_radix tree. This must be done before
  2698. * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
  2699. * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
  2700. * item before the root's tree is deleted - this means that if we unmount
  2701. * or crash before the deletion completes, on the next mount we will not
  2702. * delete what remains of the tree because the orphan item does not
  2703. * exists anymore, which is what tells us we have a pending deletion.
  2704. */
  2705. ret = btrfs_find_orphan_roots(fs_info);
  2706. if (ret)
  2707. return ret;
  2708. ret = btrfs_cleanup_fs_roots(fs_info);
  2709. if (ret)
  2710. return ret;
  2711. down_read(&fs_info->cleanup_work_sem);
  2712. if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
  2713. (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
  2714. up_read(&fs_info->cleanup_work_sem);
  2715. return ret;
  2716. }
  2717. up_read(&fs_info->cleanup_work_sem);
  2718. mutex_lock(&fs_info->cleaner_mutex);
  2719. ret = btrfs_recover_relocation(fs_info);
  2720. mutex_unlock(&fs_info->cleaner_mutex);
  2721. if (ret < 0) {
  2722. btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
  2723. return ret;
  2724. }
  2725. if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
  2726. !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
  2727. btrfs_info(fs_info, "creating free space tree");
  2728. ret = btrfs_create_free_space_tree(fs_info);
  2729. if (ret) {
  2730. btrfs_warn(fs_info,
  2731. "failed to create free space tree: %d", ret);
  2732. return ret;
  2733. }
  2734. }
  2735. if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
  2736. ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
  2737. if (ret)
  2738. return ret;
  2739. }
  2740. ret = btrfs_resume_balance_async(fs_info);
  2741. if (ret)
  2742. return ret;
  2743. ret = btrfs_resume_dev_replace_async(fs_info);
  2744. if (ret) {
  2745. btrfs_warn(fs_info, "failed to resume dev_replace");
  2746. return ret;
  2747. }
  2748. btrfs_qgroup_rescan_resume(fs_info);
  2749. if (!fs_info->uuid_root) {
  2750. btrfs_info(fs_info, "creating UUID tree");
  2751. ret = btrfs_create_uuid_tree(fs_info);
  2752. if (ret) {
  2753. btrfs_warn(fs_info,
  2754. "failed to create the UUID tree %d", ret);
  2755. return ret;
  2756. }
  2757. }
  2758. return 0;
  2759. }
  2760. /*
  2761. * Do various sanity and dependency checks of different features.
  2762. *
  2763. * @is_rw_mount: If the mount is read-write.
  2764. *
  2765. * This is the place for less strict checks (like for subpage or artificial
  2766. * feature dependencies).
  2767. *
  2768. * For strict checks or possible corruption detection, see
  2769. * btrfs_validate_super().
  2770. *
  2771. * This should be called after btrfs_parse_options(), as some mount options
  2772. * (space cache related) can modify on-disk format like free space tree and
  2773. * screw up certain feature dependencies.
  2774. */
  2775. int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
  2776. {
  2777. struct btrfs_super_block *disk_super = fs_info->super_copy;
  2778. u64 incompat = btrfs_super_incompat_flags(disk_super);
  2779. const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
  2780. const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
  2781. if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
  2782. btrfs_err(fs_info,
  2783. "cannot mount because of unknown incompat features (0x%llx)",
  2784. incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP);
  2785. return -EINVAL;
  2786. }
  2787. /* Runtime limitation for mixed block groups. */
  2788. if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
  2789. (fs_info->sectorsize != fs_info->nodesize)) {
  2790. btrfs_err(fs_info,
  2791. "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
  2792. fs_info->nodesize, fs_info->sectorsize);
  2793. return -EINVAL;
  2794. }
  2795. /* Mixed backref is an always-enabled feature. */
  2796. incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
  2797. /* Set compression related flags just in case. */
  2798. if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
  2799. incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
  2800. else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
  2801. incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
  2802. /*
  2803. * An ancient flag, which should really be marked deprecated.
  2804. * Such runtime limitation doesn't really need a incompat flag.
  2805. */
  2806. if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
  2807. incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
  2808. if (compat_ro_unsupp && is_rw_mount) {
  2809. btrfs_err(fs_info,
  2810. "cannot mount read-write because of unknown compat_ro features (0x%llx)",
  2811. compat_ro_unsupp);
  2812. return -EINVAL;
  2813. }
  2814. /*
  2815. * We have unsupported RO compat features, although RO mounted, we
  2816. * should not cause any metadata writes, including log replay.
  2817. * Or we could screw up whatever the new feature requires.
  2818. */
  2819. if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
  2820. !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
  2821. btrfs_err(fs_info,
  2822. "cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
  2823. compat_ro_unsupp);
  2824. return -EINVAL;
  2825. }
  2826. /*
  2827. * Artificial limitations for block group tree, to force
  2828. * block-group-tree to rely on no-holes and free-space-tree.
  2829. */
  2830. if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
  2831. (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
  2832. !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
  2833. btrfs_err(fs_info,
  2834. "block-group-tree feature requires no-holes and free-space-tree features");
  2835. return -EINVAL;
  2836. }
  2837. /*
  2838. * Subpage/bs > ps runtime limitation on v1 cache.
  2839. *
  2840. * V1 space cache still has some hard coded PAGE_SIZE usage, while
  2841. * we're already defaulting to v2 cache, no need to bother v1 as it's
  2842. * going to be deprecated anyway.
  2843. */
  2844. if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
  2845. btrfs_warn(fs_info,
  2846. "v1 space cache is not supported for page size %lu with sectorsize %u",
  2847. PAGE_SIZE, fs_info->sectorsize);
  2848. return -EINVAL;
  2849. }
  2850. /* This can be called by remount, we need to protect the super block. */
  2851. spin_lock(&fs_info->super_lock);
  2852. btrfs_set_super_incompat_flags(disk_super, incompat);
  2853. spin_unlock(&fs_info->super_lock);
  2854. return 0;
  2855. }
  2856. static bool fs_is_full_ro(const struct btrfs_fs_info *fs_info)
  2857. {
  2858. if (!sb_rdonly(fs_info->sb))
  2859. return false;
  2860. if (unlikely(fs_info->mount_opt & BTRFS_MOUNT_FULL_RO_MASK))
  2861. return true;
  2862. return false;
  2863. }
  2864. int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
  2865. {
  2866. u32 sectorsize;
  2867. u32 nodesize;
  2868. u32 stripesize;
  2869. u64 generation;
  2870. u16 csum_type;
  2871. struct btrfs_super_block *disk_super;
  2872. struct btrfs_fs_info *fs_info = btrfs_sb(sb);
  2873. struct btrfs_root *tree_root;
  2874. struct btrfs_root *chunk_root;
  2875. struct btrfs_root *remap_root;
  2876. int ret;
  2877. int level;
  2878. ret = init_mount_fs_info(fs_info, sb);
  2879. if (ret)
  2880. goto fail;
  2881. /* These need to be init'ed before we start creating inodes and such. */
  2882. tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
  2883. GFP_KERNEL);
  2884. fs_info->tree_root = tree_root;
  2885. chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
  2886. GFP_KERNEL);
  2887. fs_info->chunk_root = chunk_root;
  2888. if (!tree_root || !chunk_root) {
  2889. ret = -ENOMEM;
  2890. goto fail;
  2891. }
  2892. ret = btrfs_init_btree_inode(sb);
  2893. if (ret)
  2894. goto fail;
  2895. invalidate_bdev(fs_devices->latest_dev->bdev);
  2896. /*
  2897. * Read super block and check the signature bytes only
  2898. */
  2899. disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false);
  2900. if (IS_ERR(disk_super)) {
  2901. ret = PTR_ERR(disk_super);
  2902. goto fail_alloc;
  2903. }
  2904. btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
  2905. /*
  2906. * Verify the type first, if that or the checksum value are
  2907. * corrupted, we'll find out
  2908. */
  2909. csum_type = btrfs_super_csum_type(disk_super);
  2910. if (!btrfs_supported_super_csum(csum_type)) {
  2911. btrfs_err(fs_info, "unsupported checksum algorithm: %u",
  2912. csum_type);
  2913. ret = -EINVAL;
  2914. btrfs_release_disk_super(disk_super);
  2915. goto fail_alloc;
  2916. }
  2917. fs_info->csum_size = btrfs_super_csum_size(disk_super);
  2918. fs_info->csum_type = csum_type;
  2919. btrfs_init_csum_hash(fs_info, csum_type);
  2920. /*
  2921. * We want to check superblock checksum, the type is stored inside.
  2922. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
  2923. */
  2924. if (btrfs_check_super_csum(fs_info, disk_super)) {
  2925. btrfs_err(fs_info, "superblock checksum mismatch");
  2926. ret = -EINVAL;
  2927. btrfs_release_disk_super(disk_super);
  2928. goto fail_alloc;
  2929. }
  2930. /*
  2931. * super_copy is zeroed at allocation time and we never touch the
  2932. * following bytes up to INFO_SIZE, the checksum is calculated from
  2933. * the whole block of INFO_SIZE
  2934. */
  2935. memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
  2936. btrfs_release_disk_super(disk_super);
  2937. disk_super = fs_info->super_copy;
  2938. memcpy(fs_info->super_for_commit, fs_info->super_copy,
  2939. sizeof(*fs_info->super_for_commit));
  2940. ret = btrfs_validate_mount_super(fs_info);
  2941. if (ret) {
  2942. btrfs_err(fs_info, "superblock contains fatal errors");
  2943. ret = -EINVAL;
  2944. goto fail_alloc;
  2945. }
  2946. if (!btrfs_super_root(disk_super)) {
  2947. btrfs_err(fs_info, "invalid superblock tree root bytenr");
  2948. ret = -EINVAL;
  2949. goto fail_alloc;
  2950. }
  2951. /* check FS state, whether FS is broken. */
  2952. if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
  2953. WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
  2954. /* If the fs has any rescue options, no transaction is allowed. */
  2955. if (fs_is_full_ro(fs_info))
  2956. WRITE_ONCE(fs_info->fs_error, -EROFS);
  2957. /* Set up fs_info before parsing mount options */
  2958. nodesize = btrfs_super_nodesize(disk_super);
  2959. sectorsize = btrfs_super_sectorsize(disk_super);
  2960. stripesize = sectorsize;
  2961. fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
  2962. fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
  2963. fs_info->nodesize = nodesize;
  2964. fs_info->nodesize_bits = ilog2(nodesize);
  2965. fs_info->sectorsize = sectorsize;
  2966. fs_info->sectorsize_bits = ilog2(sectorsize);
  2967. fs_info->block_min_order = ilog2(round_up(sectorsize, PAGE_SIZE) >> PAGE_SHIFT);
  2968. fs_info->block_max_order = ilog2((BITS_PER_LONG << fs_info->sectorsize_bits) >> PAGE_SHIFT);
  2969. fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
  2970. fs_info->stripesize = stripesize;
  2971. fs_info->fs_devices->fs_info = fs_info;
  2972. if (fs_info->sectorsize > PAGE_SIZE)
  2973. btrfs_warn(fs_info,
  2974. "support for block size %u with page size %lu is experimental, some features may be missing",
  2975. fs_info->sectorsize, PAGE_SIZE);
  2976. /*
  2977. * Handle the space caching options appropriately now that we have the
  2978. * super block loaded and validated.
  2979. */
  2980. btrfs_set_free_space_cache_settings(fs_info);
  2981. if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
  2982. ret = -EINVAL;
  2983. goto fail_alloc;
  2984. }
  2985. ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
  2986. if (ret < 0)
  2987. goto fail_alloc;
  2988. if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) {
  2989. remap_root = btrfs_alloc_root(fs_info, BTRFS_REMAP_TREE_OBJECTID,
  2990. GFP_KERNEL);
  2991. fs_info->remap_root = remap_root;
  2992. if (!remap_root) {
  2993. ret = -ENOMEM;
  2994. goto fail_alloc;
  2995. }
  2996. }
  2997. /*
  2998. * At this point our mount options are validated, if we set ->max_inline
  2999. * to something non-standard make sure we truncate it to sectorsize.
  3000. */
  3001. fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
  3002. ret = btrfs_alloc_compress_wsm(fs_info);
  3003. if (ret)
  3004. goto fail_sb_buffer;
  3005. ret = btrfs_init_workqueues(fs_info);
  3006. if (ret)
  3007. goto fail_sb_buffer;
  3008. sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
  3009. sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
  3010. /* Update the values for the current filesystem. */
  3011. sb->s_blocksize = sectorsize;
  3012. sb->s_blocksize_bits = blksize_bits(sectorsize);
  3013. memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
  3014. mutex_lock(&fs_info->chunk_mutex);
  3015. ret = btrfs_read_sys_array(fs_info);
  3016. mutex_unlock(&fs_info->chunk_mutex);
  3017. if (ret) {
  3018. btrfs_err(fs_info, "failed to read the system array: %d", ret);
  3019. goto fail_sb_buffer;
  3020. }
  3021. generation = btrfs_super_chunk_root_generation(disk_super);
  3022. level = btrfs_super_chunk_root_level(disk_super);
  3023. ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
  3024. generation, level);
  3025. if (ret) {
  3026. btrfs_err(fs_info, "failed to read chunk root");
  3027. goto fail_tree_roots;
  3028. }
  3029. read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
  3030. offsetof(struct btrfs_header, chunk_tree_uuid),
  3031. BTRFS_UUID_SIZE);
  3032. ret = btrfs_read_chunk_tree(fs_info);
  3033. if (ret) {
  3034. btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
  3035. goto fail_tree_roots;
  3036. }
  3037. /*
  3038. * At this point we know all the devices that make this filesystem,
  3039. * including the seed devices but we don't know yet if the replace
  3040. * target is required. So free devices that are not part of this
  3041. * filesystem but skip the replace target device which is checked
  3042. * below in btrfs_init_dev_replace().
  3043. */
  3044. btrfs_free_extra_devids(fs_devices);
  3045. if (unlikely(!fs_devices->latest_dev->bdev)) {
  3046. btrfs_err(fs_info, "failed to read devices");
  3047. ret = -EIO;
  3048. goto fail_tree_roots;
  3049. }
  3050. ret = init_tree_roots(fs_info);
  3051. if (ret)
  3052. goto fail_tree_roots;
  3053. /*
  3054. * Get zone type information of zoned block devices. This will also
  3055. * handle emulation of a zoned filesystem if a regular device has the
  3056. * zoned incompat feature flag set.
  3057. */
  3058. ret = btrfs_get_dev_zone_info_all_devices(fs_info);
  3059. if (ret) {
  3060. btrfs_err(fs_info,
  3061. "zoned: failed to read device zone info: %d", ret);
  3062. goto fail_block_groups;
  3063. }
  3064. /*
  3065. * If we have a uuid root and we're not being told to rescan we need to
  3066. * check the generation here so we can set the
  3067. * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
  3068. * transaction during a balance or the log replay without updating the
  3069. * uuid generation, and then if we crash we would rescan the uuid tree,
  3070. * even though it was perfectly fine.
  3071. */
  3072. if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
  3073. fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
  3074. set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
  3075. if (unlikely(btrfs_verify_dev_items(fs_info))) {
  3076. ret = -EUCLEAN;
  3077. goto fail_block_groups;
  3078. }
  3079. ret = btrfs_verify_dev_extents(fs_info);
  3080. if (ret) {
  3081. btrfs_err(fs_info,
  3082. "failed to verify dev extents against chunks: %d",
  3083. ret);
  3084. goto fail_block_groups;
  3085. }
  3086. ret = btrfs_recover_balance(fs_info);
  3087. if (ret) {
  3088. btrfs_err(fs_info, "failed to recover balance: %d", ret);
  3089. goto fail_block_groups;
  3090. }
  3091. ret = btrfs_init_dev_stats(fs_info);
  3092. if (ret) {
  3093. btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
  3094. goto fail_block_groups;
  3095. }
  3096. ret = btrfs_init_dev_replace(fs_info);
  3097. if (ret) {
  3098. btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
  3099. goto fail_block_groups;
  3100. }
  3101. ret = btrfs_check_zoned_mode(fs_info);
  3102. if (ret) {
  3103. btrfs_err(fs_info, "failed to initialize zoned mode: %d",
  3104. ret);
  3105. goto fail_block_groups;
  3106. }
  3107. ret = btrfs_sysfs_add_fsid(fs_devices);
  3108. if (ret) {
  3109. btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
  3110. ret);
  3111. goto fail_block_groups;
  3112. }
  3113. ret = btrfs_sysfs_add_mounted(fs_info);
  3114. if (ret) {
  3115. btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
  3116. goto fail_fsdev_sysfs;
  3117. }
  3118. ret = btrfs_init_space_info(fs_info);
  3119. if (ret) {
  3120. btrfs_err(fs_info, "failed to initialize space info: %d", ret);
  3121. goto fail_sysfs;
  3122. }
  3123. ret = btrfs_read_block_groups(fs_info);
  3124. if (ret) {
  3125. btrfs_err(fs_info, "failed to read block groups: %d", ret);
  3126. goto fail_sysfs;
  3127. }
  3128. if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
  3129. ret = btrfs_populate_fully_remapped_bgs_list(fs_info);
  3130. if (ret) {
  3131. btrfs_err(fs_info, "failed to populate fully_remapped_bgs list: %d", ret);
  3132. goto fail_sysfs;
  3133. }
  3134. }
  3135. btrfs_free_zone_cache(fs_info);
  3136. btrfs_check_active_zone_reservation(fs_info);
  3137. if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
  3138. !btrfs_check_rw_degradable(fs_info, NULL)) {
  3139. btrfs_warn(fs_info,
  3140. "writable mount is not allowed due to too many missing devices");
  3141. ret = -EINVAL;
  3142. goto fail_sysfs;
  3143. }
  3144. fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
  3145. "btrfs-cleaner");
  3146. if (IS_ERR(fs_info->cleaner_kthread)) {
  3147. ret = PTR_ERR(fs_info->cleaner_kthread);
  3148. goto fail_sysfs;
  3149. }
  3150. fs_info->transaction_kthread = kthread_run(transaction_kthread,
  3151. tree_root,
  3152. "btrfs-transaction");
  3153. if (IS_ERR(fs_info->transaction_kthread)) {
  3154. ret = PTR_ERR(fs_info->transaction_kthread);
  3155. goto fail_cleaner;
  3156. }
  3157. /*
  3158. * Starts a transaction, must be called after the transaction kthread
  3159. * is initialized.
  3160. */
  3161. btrfs_zoned_reserve_data_reloc_bg(fs_info);
  3162. ret = btrfs_read_qgroup_config(fs_info);
  3163. if (ret)
  3164. goto fail_trans_kthread;
  3165. if (btrfs_build_ref_tree(fs_info))
  3166. btrfs_err(fs_info, "couldn't build ref tree");
  3167. /* do not make disk changes in broken FS or nologreplay is given */
  3168. if (btrfs_super_log_root(disk_super) != 0 &&
  3169. !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
  3170. btrfs_info(fs_info, "start tree-log replay");
  3171. ret = btrfs_replay_log(fs_info, fs_devices);
  3172. if (ret)
  3173. goto fail_qgroup;
  3174. }
  3175. fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
  3176. if (IS_ERR(fs_info->fs_root)) {
  3177. ret = PTR_ERR(fs_info->fs_root);
  3178. btrfs_err(fs_info, "failed to read fs tree: %d", ret);
  3179. fs_info->fs_root = NULL;
  3180. goto fail_qgroup;
  3181. }
  3182. if (sb_rdonly(sb))
  3183. return 0;
  3184. ret = btrfs_start_pre_rw_mount(fs_info);
  3185. if (ret) {
  3186. close_ctree(fs_info);
  3187. return ret;
  3188. }
  3189. btrfs_discard_resume(fs_info);
  3190. if (fs_info->uuid_root &&
  3191. (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
  3192. fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
  3193. btrfs_info(fs_info, "checking UUID tree");
  3194. ret = btrfs_check_uuid_tree(fs_info);
  3195. if (ret) {
  3196. btrfs_err(fs_info, "failed to check the UUID tree: %d", ret);
  3197. close_ctree(fs_info);
  3198. return ret;
  3199. }
  3200. }
  3201. set_bit(BTRFS_FS_OPEN, &fs_info->flags);
  3202. /* Kick the cleaner thread so it'll start deleting snapshots. */
  3203. if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
  3204. wake_up_process(fs_info->cleaner_kthread);
  3205. return 0;
  3206. fail_qgroup:
  3207. btrfs_free_qgroup_config(fs_info);
  3208. fail_trans_kthread:
  3209. kthread_stop(fs_info->transaction_kthread);
  3210. btrfs_cleanup_transaction(fs_info);
  3211. btrfs_free_fs_roots(fs_info);
  3212. fail_cleaner:
  3213. kthread_stop(fs_info->cleaner_kthread);
  3214. /*
  3215. * make sure we're done with the btree inode before we stop our
  3216. * kthreads
  3217. */
  3218. filemap_write_and_wait(fs_info->btree_inode->i_mapping);
  3219. fail_sysfs:
  3220. btrfs_sysfs_remove_mounted(fs_info);
  3221. fail_fsdev_sysfs:
  3222. btrfs_sysfs_remove_fsid(fs_info->fs_devices);
  3223. fail_block_groups:
  3224. btrfs_put_block_group_cache(fs_info);
  3225. fail_tree_roots:
  3226. if (fs_info->data_reloc_root)
  3227. btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
  3228. free_root_pointers(fs_info, true);
  3229. invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
  3230. fail_sb_buffer:
  3231. btrfs_stop_all_workers(fs_info);
  3232. btrfs_free_block_groups(fs_info);
  3233. fail_alloc:
  3234. btrfs_mapping_tree_free(fs_info);
  3235. iput(fs_info->btree_inode);
  3236. fail:
  3237. ASSERT(ret < 0);
  3238. return ret;
  3239. }
  3240. ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
  3241. static void btrfs_end_super_write(struct bio *bio)
  3242. {
  3243. struct btrfs_device *device = bio->bi_private;
  3244. struct folio_iter fi;
  3245. bio_for_each_folio_all(fi, bio) {
  3246. if (bio->bi_status) {
  3247. btrfs_warn_rl(device->fs_info,
  3248. "lost super block write due to IO error on %s (%d)",
  3249. btrfs_dev_name(device),
  3250. blk_status_to_errno(bio->bi_status));
  3251. btrfs_dev_stat_inc_and_print(device,
  3252. BTRFS_DEV_STAT_WRITE_ERRS);
  3253. /* Ensure failure if the primary sb fails. */
  3254. if (bio->bi_opf & REQ_FUA)
  3255. atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR,
  3256. &device->sb_write_errors);
  3257. else
  3258. atomic_inc(&device->sb_write_errors);
  3259. }
  3260. folio_unlock(fi.folio);
  3261. folio_put(fi.folio);
  3262. }
  3263. bio_put(bio);
  3264. }
  3265. /*
  3266. * Write superblock @sb to the @device. Do not wait for completion, all the
  3267. * folios we use for writing are locked.
  3268. *
  3269. * Write @max_mirrors copies of the superblock, where 0 means default that fit
  3270. * the expected device size at commit time. Note that max_mirrors must be
  3271. * same for write and wait phases.
  3272. *
  3273. * Return number of errors when folio is not found or submission fails.
  3274. */
  3275. static int write_dev_supers(struct btrfs_device *device,
  3276. struct btrfs_super_block *sb, int max_mirrors)
  3277. {
  3278. struct btrfs_fs_info *fs_info = device->fs_info;
  3279. struct address_space *mapping = device->bdev->bd_mapping;
  3280. int i;
  3281. int ret;
  3282. u64 bytenr, bytenr_orig;
  3283. atomic_set(&device->sb_write_errors, 0);
  3284. if (max_mirrors == 0)
  3285. max_mirrors = BTRFS_SUPER_MIRROR_MAX;
  3286. for (i = 0; i < max_mirrors; i++) {
  3287. struct folio *folio;
  3288. struct bio *bio;
  3289. struct btrfs_super_block *disk_super;
  3290. size_t offset;
  3291. bytenr_orig = btrfs_sb_offset(i);
  3292. ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
  3293. if (ret == -ENOENT) {
  3294. continue;
  3295. } else if (ret < 0) {
  3296. btrfs_err(device->fs_info,
  3297. "couldn't get super block location for mirror %d error %d",
  3298. i, ret);
  3299. atomic_inc(&device->sb_write_errors);
  3300. continue;
  3301. }
  3302. if (bytenr + BTRFS_SUPER_INFO_SIZE >=
  3303. device->commit_total_bytes)
  3304. break;
  3305. btrfs_set_super_bytenr(sb, bytenr_orig);
  3306. btrfs_csum(fs_info->csum_type, (const u8 *)sb + BTRFS_CSUM_SIZE,
  3307. BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum);
  3308. folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
  3309. FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
  3310. GFP_NOFS);
  3311. if (IS_ERR(folio)) {
  3312. btrfs_err(device->fs_info,
  3313. "couldn't get super block page for bytenr %llu error %ld",
  3314. bytenr, PTR_ERR(folio));
  3315. atomic_inc(&device->sb_write_errors);
  3316. continue;
  3317. }
  3318. offset = offset_in_folio(folio, bytenr);
  3319. disk_super = folio_address(folio) + offset;
  3320. memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
  3321. /*
  3322. * Directly use bios here instead of relying on the page cache
  3323. * to do I/O, so we don't lose the ability to do integrity
  3324. * checking.
  3325. */
  3326. bio = bio_alloc(device->bdev, 1,
  3327. REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
  3328. GFP_NOFS);
  3329. bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
  3330. bio->bi_private = device;
  3331. bio->bi_end_io = btrfs_end_super_write;
  3332. bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset);
  3333. /*
  3334. * We FUA only the first super block. The others we allow to
  3335. * go down lazy and there's a short window where the on-disk
  3336. * copies might still contain the older version.
  3337. */
  3338. if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
  3339. bio->bi_opf |= REQ_FUA;
  3340. submit_bio(bio);
  3341. if (btrfs_advance_sb_log(device, i))
  3342. atomic_inc(&device->sb_write_errors);
  3343. }
  3344. return atomic_read(&device->sb_write_errors) < i ? 0 : -1;
  3345. }
  3346. /*
  3347. * Wait for write completion of superblocks done by write_dev_supers,
  3348. * @max_mirrors same for write and wait phases.
  3349. *
  3350. * Return -1 if primary super block write failed or when there were no super block
  3351. * copies written. Otherwise 0.
  3352. */
  3353. static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
  3354. {
  3355. int i;
  3356. int errors = 0;
  3357. bool primary_failed = false;
  3358. int ret;
  3359. u64 bytenr;
  3360. if (max_mirrors == 0)
  3361. max_mirrors = BTRFS_SUPER_MIRROR_MAX;
  3362. for (i = 0; i < max_mirrors; i++) {
  3363. struct folio *folio;
  3364. ret = btrfs_sb_log_location(device, i, READ, &bytenr);
  3365. if (ret == -ENOENT) {
  3366. break;
  3367. } else if (ret < 0) {
  3368. errors++;
  3369. if (i == 0)
  3370. primary_failed = true;
  3371. continue;
  3372. }
  3373. if (bytenr + BTRFS_SUPER_INFO_SIZE >=
  3374. device->commit_total_bytes)
  3375. break;
  3376. folio = filemap_get_folio(device->bdev->bd_mapping,
  3377. bytenr >> PAGE_SHIFT);
  3378. /* If the folio has been removed, then we know it completed. */
  3379. if (IS_ERR(folio))
  3380. continue;
  3381. /* Folio will be unlocked once the write completes. */
  3382. folio_wait_locked(folio);
  3383. folio_put(folio);
  3384. }
  3385. errors += atomic_read(&device->sb_write_errors);
  3386. if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR)
  3387. primary_failed = true;
  3388. if (primary_failed) {
  3389. btrfs_err(device->fs_info, "error writing primary super block to device %llu",
  3390. device->devid);
  3391. return -1;
  3392. }
  3393. return errors < i ? 0 : -1;
  3394. }
  3395. /*
  3396. * endio for the write_dev_flush, this will wake anyone waiting
  3397. * for the barrier when it is done
  3398. */
  3399. static void btrfs_end_empty_barrier(struct bio *bio)
  3400. {
  3401. bio_uninit(bio);
  3402. complete(bio->bi_private);
  3403. }
  3404. /*
  3405. * Submit a flush request to the device if it supports it. Error handling is
  3406. * done in the waiting counterpart.
  3407. */
  3408. static void write_dev_flush(struct btrfs_device *device)
  3409. {
  3410. struct bio *bio = &device->flush_bio;
  3411. clear_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state);
  3412. bio_init(bio, device->bdev, NULL, 0,
  3413. REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
  3414. bio->bi_end_io = btrfs_end_empty_barrier;
  3415. init_completion(&device->flush_wait);
  3416. bio->bi_private = &device->flush_wait;
  3417. submit_bio(bio);
  3418. set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
  3419. }
  3420. /*
  3421. * If the flush bio has been submitted by write_dev_flush, wait for it.
  3422. * Return true for any error, and false otherwise.
  3423. */
  3424. static bool wait_dev_flush(struct btrfs_device *device)
  3425. {
  3426. struct bio *bio = &device->flush_bio;
  3427. if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
  3428. return false;
  3429. wait_for_completion_io(&device->flush_wait);
  3430. if (bio->bi_status) {
  3431. set_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state);
  3432. btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
  3433. return true;
  3434. }
  3435. return false;
  3436. }
  3437. /*
  3438. * send an empty flush down to each device in parallel,
  3439. * then wait for them
  3440. */
  3441. static int barrier_all_devices(struct btrfs_fs_info *info)
  3442. {
  3443. struct list_head *head;
  3444. struct btrfs_device *dev;
  3445. int errors_wait = 0;
  3446. lockdep_assert_held(&info->fs_devices->device_list_mutex);
  3447. /* send down all the barriers */
  3448. head = &info->fs_devices->devices;
  3449. list_for_each_entry(dev, head, dev_list) {
  3450. if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
  3451. continue;
  3452. if (!dev->bdev)
  3453. continue;
  3454. if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
  3455. !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
  3456. continue;
  3457. write_dev_flush(dev);
  3458. }
  3459. /* wait for all the barriers */
  3460. list_for_each_entry(dev, head, dev_list) {
  3461. if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
  3462. continue;
  3463. if (!dev->bdev) {
  3464. errors_wait++;
  3465. continue;
  3466. }
  3467. if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
  3468. !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
  3469. continue;
  3470. if (wait_dev_flush(dev))
  3471. errors_wait++;
  3472. }
  3473. /*
  3474. * Checks flush failure of disks in order to determine the device
  3475. * state.
  3476. */
  3477. if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL)))
  3478. return -EIO;
  3479. return 0;
  3480. }
  3481. int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
  3482. {
  3483. int raid_type;
  3484. int min_tolerated = INT_MAX;
  3485. if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
  3486. (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
  3487. min_tolerated = min_t(int, min_tolerated,
  3488. btrfs_raid_array[BTRFS_RAID_SINGLE].
  3489. tolerated_failures);
  3490. for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
  3491. if (raid_type == BTRFS_RAID_SINGLE)
  3492. continue;
  3493. if (!(flags & btrfs_raid_array[raid_type].bg_flag))
  3494. continue;
  3495. min_tolerated = min_t(int, min_tolerated,
  3496. btrfs_raid_array[raid_type].
  3497. tolerated_failures);
  3498. }
  3499. if (min_tolerated == INT_MAX) {
  3500. btrfs_warn(NULL, "unknown raid flag: %llu", flags);
  3501. min_tolerated = 0;
  3502. }
  3503. return min_tolerated;
  3504. }
  3505. int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
  3506. {
  3507. struct list_head *head;
  3508. struct btrfs_device *dev;
  3509. struct btrfs_super_block *sb;
  3510. struct btrfs_dev_item *dev_item;
  3511. int ret;
  3512. int do_barriers;
  3513. int max_errors;
  3514. int total_errors = 0;
  3515. u64 flags;
  3516. do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
  3517. /*
  3518. * max_mirrors == 0 indicates we're from commit_transaction,
  3519. * not from fsync where the tree roots in fs_info have not
  3520. * been consistent on disk.
  3521. */
  3522. if (max_mirrors == 0) {
  3523. ret = backup_super_roots(fs_info);
  3524. if (ret < 0)
  3525. return ret;
  3526. }
  3527. sb = fs_info->super_for_commit;
  3528. dev_item = &sb->dev_item;
  3529. mutex_lock(&fs_info->fs_devices->device_list_mutex);
  3530. head = &fs_info->fs_devices->devices;
  3531. max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
  3532. if (do_barriers) {
  3533. ret = barrier_all_devices(fs_info);
  3534. if (ret) {
  3535. mutex_unlock(
  3536. &fs_info->fs_devices->device_list_mutex);
  3537. btrfs_handle_fs_error(fs_info, ret,
  3538. "errors while submitting device barriers.");
  3539. return ret;
  3540. }
  3541. }
  3542. list_for_each_entry(dev, head, dev_list) {
  3543. if (!dev->bdev) {
  3544. total_errors++;
  3545. continue;
  3546. }
  3547. if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
  3548. !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
  3549. continue;
  3550. btrfs_set_stack_device_generation(dev_item, 0);
  3551. btrfs_set_stack_device_type(dev_item, dev->type);
  3552. btrfs_set_stack_device_id(dev_item, dev->devid);
  3553. btrfs_set_stack_device_total_bytes(dev_item,
  3554. dev->commit_total_bytes);
  3555. btrfs_set_stack_device_bytes_used(dev_item,
  3556. dev->commit_bytes_used);
  3557. btrfs_set_stack_device_io_align(dev_item, dev->io_align);
  3558. btrfs_set_stack_device_io_width(dev_item, dev->io_width);
  3559. btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
  3560. memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
  3561. memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
  3562. BTRFS_FSID_SIZE);
  3563. flags = btrfs_super_flags(sb);
  3564. btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
  3565. ret = btrfs_validate_write_super(fs_info, sb);
  3566. if (unlikely(ret < 0)) {
  3567. mutex_unlock(&fs_info->fs_devices->device_list_mutex);
  3568. btrfs_handle_fs_error(fs_info, -EUCLEAN,
  3569. "unexpected superblock corruption detected");
  3570. return -EUCLEAN;
  3571. }
  3572. ret = write_dev_supers(dev, sb, max_mirrors);
  3573. if (ret)
  3574. total_errors++;
  3575. }
  3576. if (unlikely(total_errors > max_errors)) {
  3577. btrfs_err(fs_info, "%d errors while writing supers",
  3578. total_errors);
  3579. mutex_unlock(&fs_info->fs_devices->device_list_mutex);
  3580. /* FUA is masked off if unsupported and can't be the reason */
  3581. btrfs_handle_fs_error(fs_info, -EIO,
  3582. "%d errors while writing supers",
  3583. total_errors);
  3584. return -EIO;
  3585. }
  3586. total_errors = 0;
  3587. list_for_each_entry(dev, head, dev_list) {
  3588. if (!dev->bdev)
  3589. continue;
  3590. if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
  3591. !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
  3592. continue;
  3593. ret = wait_dev_supers(dev, max_mirrors);
  3594. if (ret)
  3595. total_errors++;
  3596. }
  3597. mutex_unlock(&fs_info->fs_devices->device_list_mutex);
  3598. if (unlikely(total_errors > max_errors)) {
  3599. btrfs_handle_fs_error(fs_info, -EIO,
  3600. "%d errors while writing supers",
  3601. total_errors);
  3602. return -EIO;
  3603. }
  3604. return 0;
  3605. }
  3606. /* Drop a fs root from the radix tree and free it. */
  3607. void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
  3608. struct btrfs_root *root)
  3609. {
  3610. bool drop_ref = false;
  3611. spin_lock(&fs_info->fs_roots_radix_lock);
  3612. radix_tree_delete(&fs_info->fs_roots_radix,
  3613. (unsigned long)btrfs_root_id(root));
  3614. if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
  3615. drop_ref = true;
  3616. spin_unlock(&fs_info->fs_roots_radix_lock);
  3617. if (BTRFS_FS_ERROR(fs_info)) {
  3618. ASSERT(root->log_root == NULL);
  3619. if (root->reloc_root) {
  3620. btrfs_put_root(root->reloc_root);
  3621. root->reloc_root = NULL;
  3622. }
  3623. }
  3624. if (drop_ref)
  3625. btrfs_put_root(root);
  3626. }
  3627. int btrfs_commit_super(struct btrfs_fs_info *fs_info)
  3628. {
  3629. mutex_lock(&fs_info->cleaner_mutex);
  3630. btrfs_run_delayed_iputs(fs_info);
  3631. mutex_unlock(&fs_info->cleaner_mutex);
  3632. wake_up_process(fs_info->cleaner_kthread);
  3633. /* wait until ongoing cleanup work done */
  3634. down_write(&fs_info->cleanup_work_sem);
  3635. up_write(&fs_info->cleanup_work_sem);
  3636. return btrfs_commit_current_transaction(fs_info->tree_root);
  3637. }
  3638. static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
  3639. {
  3640. struct btrfs_transaction *trans;
  3641. struct btrfs_transaction *tmp;
  3642. bool found = false;
  3643. /*
  3644. * This function is only called at the very end of close_ctree(),
  3645. * thus no other running transaction, no need to take trans_lock.
  3646. */
  3647. ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
  3648. list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
  3649. struct extent_state *cached = NULL;
  3650. u64 dirty_bytes = 0;
  3651. u64 cur = 0;
  3652. u64 found_start;
  3653. u64 found_end;
  3654. found = true;
  3655. while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur,
  3656. &found_start, &found_end,
  3657. EXTENT_DIRTY, &cached)) {
  3658. dirty_bytes += found_end + 1 - found_start;
  3659. cur = found_end + 1;
  3660. }
  3661. btrfs_warn(fs_info,
  3662. "transaction %llu (with %llu dirty metadata bytes) is not committed",
  3663. trans->transid, dirty_bytes);
  3664. btrfs_cleanup_one_transaction(trans);
  3665. if (trans == fs_info->running_transaction)
  3666. fs_info->running_transaction = NULL;
  3667. list_del_init(&trans->list);
  3668. btrfs_put_transaction(trans);
  3669. trace_btrfs_transaction_commit(fs_info);
  3670. }
  3671. ASSERT(!found);
  3672. }
/*
 * Shut down a mounted filesystem.
 *
 * The steps are, in order: quiesce background work (block group reclaim,
 * cleaner, qgroup rescan, uuid scan, balance, device replace, scrub, defrag),
 * flush the work queues that can still generate delayed iputs, commit the
 * super block when the filesystem is writable and not shut down, stop the
 * kthreads and workers, and finally release the roots, block groups, the
 * btree inode and the mapping tree.
 *
 * The ordering of the steps matters; the reasons are given in the comments on
 * the individual steps below.
 */
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
	int ret;

	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);

	/*
	 * If we had UNFINISHED_DROPS we could still be processing them, so
	 * clear that bit and wake up relocation so it can stop.
	 * We must do this before stopping the block group reclaim task, because
	 * at btrfs_relocate_block_group() we wait for this bit, and after the
	 * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
	 * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
	 * return 1.
	 */
	btrfs_wake_unfinished_drop(fs_info);

	/*
	 * We may have the reclaim task running and relocating a data block group,
	 * in which case it may create delayed iputs. So stop it before we park
	 * the cleaner kthread, otherwise we can get new delayed iputs after
	 * parking the cleaner, and that can make the async reclaim task hang
	 * if it's waiting for delayed iputs to complete, since the cleaner is
	 * parked and can not run delayed iputs - this will make us hang when
	 * trying to stop the async reclaim task.
	 */
	cancel_work_sync(&fs_info->reclaim_bgs_work);

	/*
	 * We don't want the cleaner to start new transactions, add more delayed
	 * iputs, etc. while we're closing. We can't use kthread_stop() yet
	 * because that frees the task_struct, and the transaction kthread might
	 * still try to wake up the cleaner.
	 */
	kthread_park(fs_info->cleaner_kthread);

	/* Wait for the qgroup rescan worker to stop. */
	btrfs_qgroup_wait_for_completion(fs_info, false);

	/* Wait for the uuid_scan task to finish. */
	down(&fs_info->uuid_tree_rescan_sem);
	/* Avoid complaints from lockdep et al., set sem back to initial state. */
	up(&fs_info->uuid_tree_rescan_sem);

	/* Pause restriper - we want to resume on mount. */
	btrfs_pause_balance(fs_info);

	btrfs_dev_replace_suspend_for_unmount(fs_info);

	btrfs_scrub_cancel(fs_info);

	/* Wait for any defraggers to finish. */
	wait_event(fs_info->transaction_wait,
		   (atomic_read(&fs_info->defrag_running) == 0));

	/* Clear out the rbtree of defraggable inodes. */
	btrfs_cleanup_defrag_inodes(fs_info);

	/*
	 * Handle the error fs first, as it will flush and wait for all ordered
	 * extents. This will generate delayed iputs, thus we want to handle
	 * it first.
	 */
	if (unlikely(BTRFS_FS_ERROR(fs_info)))
		btrfs_error_commit_super(fs_info);

	/*
	 * Wait for any fixup workers to complete.
	 * If we don't wait for them here and they are still running by the time
	 * we call kthread_stop() against the cleaner kthread further below, we
	 * get a use-after-free on the cleaner because the fixup worker adds an
	 * inode to the list of delayed iputs and then attempts to wake up the
	 * cleaner kthread, which was already stopped and destroyed. We already
	 * parked the cleaner, but below we run all pending delayed iputs.
	 */
	btrfs_flush_workqueue(fs_info->fixup_workers);
	/*
	 * Similar case here, we have to wait for delalloc workers before we
	 * proceed below and stop the cleaner kthread, otherwise we trigger a
	 * use-after-free on the cleaner kthread task_struct when a delalloc
	 * worker running submit_compressed_extents() adds a delayed iput, which
	 * does a wake up on the cleaner kthread, which was already freed below
	 * when we call kthread_stop().
	 */
	btrfs_flush_workqueue(fs_info->delalloc_workers);

	/*
	 * We can have ordered extents getting their last reference dropped from
	 * the fs_info->workers queue because for async writes for data bios we
	 * queue a work for that queue, at btrfs_wq_submit_bio(), that runs
	 * run_one_async_done() which calls btrfs_bio_end_io() in case the bio
	 * has an error, and that later function can do the final
	 * btrfs_put_ordered_extent() on the ordered extent attached to the bio,
	 * which adds a delayed iput for the inode. So we must flush the queue
	 * so that we don't have delayed iputs after committing the current
	 * transaction below and stopping the cleaner and transaction kthreads.
	 */
	btrfs_flush_workqueue(fs_info->workers);

	/*
	 * When finishing a compressed write bio we schedule a work queue item
	 * to finish an ordered extent - end_bbio_compressed_write()
	 * calls btrfs_finish_ordered_extent() which in turn does a call to
	 * btrfs_queue_ordered_fn(), and that queues the ordered extent
	 * completion either in the endio_write_workers work queue or in the
	 * fs_info->endio_freespace_worker work queue. We flush those queues
	 * below, so before we flush them we must flush this queue for the
	 * workers of compressed writes.
	 */
	flush_workqueue(fs_info->endio_workers);

	/*
	 * After we parked the cleaner kthread, ordered extents may have
	 * completed and created new delayed iputs. If one of the async reclaim
	 * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
	 * can hang forever trying to stop it, because if a delayed iput is
	 * added after it ran btrfs_run_delayed_iputs() and before it called
	 * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
	 * no one else to run iputs.
	 *
	 * So wait for all ongoing ordered extents to complete and then run
	 * delayed iputs. This works because once we reach this point no one
	 * can create new ordered extents, but delayed iputs can still be added
	 * by a reclaim worker (see comments further below).
	 *
	 * Also note that btrfs_wait_ordered_roots() is not safe here, because
	 * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
	 * but the delayed iput for the respective inode is made only when doing
	 * the final btrfs_put_ordered_extent() (which must happen at
	 * btrfs_finish_ordered_io() when we are unmounting).
	 */
	btrfs_flush_workqueue(fs_info->endio_write_workers);
	/* Ordered extents for free space inodes. */
	btrfs_flush_workqueue(fs_info->endio_freespace_worker);
	/*
	 * Run delayed iputs in case an async reclaim worker is waiting for them
	 * to be run as mentioned above.
	 */
	btrfs_run_delayed_iputs(fs_info);

	cancel_work_sync(&fs_info->async_reclaim_work);
	cancel_work_sync(&fs_info->async_data_reclaim_work);
	cancel_work_sync(&fs_info->preempt_reclaim_work);
	cancel_work_sync(&fs_info->em_shrinker_work);

	/*
	 * Run delayed iputs again because an async reclaim worker may have
	 * added new ones if it was flushing delalloc:
	 *
	 * shrink_delalloc() -> btrfs_start_delalloc_roots() ->
	 *    start_delalloc_inodes() -> btrfs_add_delayed_iput()
	 */
	btrfs_run_delayed_iputs(fs_info);

	/* There should be no more workload to generate new delayed iputs. */
	set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);

	/* Cancel or finish ongoing discard work. */
	btrfs_discard_cleanup(fs_info);

	if (!sb_rdonly(fs_info->sb)) {
		/*
		 * The cleaner kthread no longer runs (it was parked above), so
		 * do one final pass over unused block groups.
		 */
		btrfs_delete_unused_bgs(fs_info);

		/*
		 * There might be existing delayed inode workers still running
		 * and holding an empty delayed inode item. We must wait for
		 * them to complete first because they can create a transaction.
		 * This happens when someone calls btrfs_balance_delayed_items()
		 * and then a transaction commit runs the same delayed nodes
		 * before any delayed worker has done something with the nodes.
		 * We must wait for any worker here and not at transaction
		 * commit time since that could cause a deadlock.
		 * This is a very rare case.
		 */
		btrfs_flush_workqueue(fs_info->delayed_workers);

		/*
		 * If the filesystem is shutdown, then an attempt to commit the
		 * super block (or any write) will just fail. Since we freeze
		 * the filesystem before shutting it down, the filesystem is in
		 * a consistent state and we don't need to commit super blocks.
		 */
		if (!btrfs_is_shutdown(fs_info)) {
			ret = btrfs_commit_super(fs_info);
			/* A failed commit is only logged, the teardown continues. */
			if (ret)
				btrfs_err(fs_info, "commit super block returned %d", ret);
		}
	}

	kthread_stop(fs_info->transaction_kthread);
	kthread_stop(fs_info->cleaner_kthread);

	ASSERT(list_empty(&fs_info->delayed_iputs));
	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);

	if (btrfs_check_quota_leak(fs_info)) {
		DEBUG_WARN("qgroup reserved space leaked");
		btrfs_err(fs_info, "qgroup reserved space leaked");
	}

	btrfs_free_qgroup_config(fs_info);
	ASSERT(list_empty(&fs_info->delalloc_roots));

	/* Report leftover delalloc and ordered byte counters, if non-zero. */
	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
		btrfs_info(fs_info, "at unmount delalloc count %lld",
		       percpu_counter_sum(&fs_info->delalloc_bytes));
	}

	if (percpu_counter_sum(&fs_info->ordered_bytes))
		btrfs_info(fs_info, "at unmount dio bytes count %lld",
			   percpu_counter_sum(&fs_info->ordered_bytes));

	btrfs_sysfs_remove_mounted(fs_info);
	btrfs_sysfs_remove_fsid(fs_info->fs_devices);

	btrfs_put_block_group_cache(fs_info);

	/*
	 * We must make sure there are no read requests left to submit after
	 * we stop all the workers.
	 */
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
	btrfs_stop_all_workers(fs_info);

	/* We shouldn't have any transaction open at this point. */
	warn_about_uncommitted_trans(fs_info);

	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
	free_root_pointers(fs_info, true);
	btrfs_free_fs_roots(fs_info);

	/*
	 * We must free the block groups after dropping the fs_roots as we could
	 * have had an IO error and have left over tree log blocks that aren't
	 * cleaned up until the fs roots are freed.  This makes the block group
	 * accounting appear to be wrong because there's pending reserved bytes,
	 * so make sure we do the block group cleanup afterwards.
	 */
	btrfs_free_block_groups(fs_info);

	iput(fs_info->btree_inode);

	btrfs_mapping_tree_free(fs_info);
}
  3884. void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
  3885. struct extent_buffer *buf)
  3886. {
  3887. struct btrfs_fs_info *fs_info = buf->fs_info;
  3888. u64 transid = btrfs_header_generation(buf);
  3889. #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
  3890. /*
  3891. * This is a fast path so only do this check if we have sanity tests
  3892. * enabled. Normal people shouldn't be using unmapped buffers as dirty
  3893. * outside of the sanity tests.
  3894. */
  3895. if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
  3896. return;
  3897. #endif
  3898. /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
  3899. ASSERT(trans->transid == fs_info->generation);
  3900. btrfs_assert_tree_write_locked(buf);
  3901. if (unlikely(transid != fs_info->generation)) {
  3902. btrfs_abort_transaction(trans, -EUCLEAN);
  3903. btrfs_crit(fs_info,
  3904. "dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
  3905. buf->start, transid, fs_info->generation);
  3906. }
  3907. set_extent_buffer_dirty(buf);
  3908. }
  3909. static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
  3910. int flush_delayed)
  3911. {
  3912. /*
  3913. * looks as though older kernels can get into trouble with
  3914. * this code, they end up stuck in balance_dirty_pages forever
  3915. */
  3916. int ret;
  3917. if (current->flags & PF_MEMALLOC)
  3918. return;
  3919. if (flush_delayed)
  3920. btrfs_balance_delayed_items(fs_info);
  3921. ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
  3922. BTRFS_DIRTY_METADATA_THRESH,
  3923. fs_info->dirty_metadata_batch);
  3924. if (ret > 0) {
  3925. balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
  3926. }
  3927. }
/*
 * Balance dirty btree pages, asking __btrfs_btree_balance_dirty() to balance
 * the delayed items first (flush_delayed == 1).
 */
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
	__btrfs_btree_balance_dirty(fs_info, 1);
}
/*
 * Balance dirty btree pages without balancing the delayed items
 * (flush_delayed == 0).
 */
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
	__btrfs_btree_balance_dirty(fs_info, 0);
}
/*
 * Error-path counterpart of committing the super block: instead of
 * committing, clean up all transactions and then wait for cleanup work by
 * acquiring and immediately releasing cleanup_work_sem for writing.
 */
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
	/* cleanup FS via transaction */
	btrfs_cleanup_transaction(fs_info);

	/* The semaphore is taken and dropped right away; no work is done under it. */
	down_write(&fs_info->cleanup_work_sem);
	up_write(&fs_info->cleanup_work_sem);
}
  3943. static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
  3944. {
  3945. struct btrfs_root *gang[8];
  3946. u64 root_objectid = 0;
  3947. int ret;
  3948. spin_lock(&fs_info->fs_roots_radix_lock);
  3949. while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
  3950. (void **)gang, root_objectid,
  3951. ARRAY_SIZE(gang))) != 0) {
  3952. int i;
  3953. for (i = 0; i < ret; i++)
  3954. gang[i] = btrfs_grab_root(gang[i]);
  3955. spin_unlock(&fs_info->fs_roots_radix_lock);
  3956. for (i = 0; i < ret; i++) {
  3957. if (!gang[i])
  3958. continue;
  3959. root_objectid = btrfs_root_id(gang[i]);
  3960. btrfs_free_log(NULL, gang[i]);
  3961. btrfs_put_root(gang[i]);
  3962. }
  3963. root_objectid++;
  3964. spin_lock(&fs_info->fs_roots_radix_lock);
  3965. }
  3966. spin_unlock(&fs_info->fs_roots_radix_lock);
  3967. btrfs_free_log_root_tree(NULL, fs_info);
  3968. }
  3969. static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
  3970. {
  3971. struct btrfs_ordered_extent *ordered;
  3972. spin_lock(&root->ordered_extent_lock);
  3973. /*
  3974. * This will just short circuit the ordered completion stuff which will
  3975. * make sure the ordered extent gets properly cleaned up.
  3976. */
  3977. list_for_each_entry(ordered, &root->ordered_extents,
  3978. root_extent_list)
  3979. set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
  3980. spin_unlock(&root->ordered_extent_lock);
  3981. }
  3982. static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
  3983. {
  3984. struct btrfs_root *root;
  3985. LIST_HEAD(splice);
  3986. spin_lock(&fs_info->ordered_root_lock);
  3987. list_splice_init(&fs_info->ordered_roots, &splice);
  3988. while (!list_empty(&splice)) {
  3989. root = list_first_entry(&splice, struct btrfs_root,
  3990. ordered_root);
  3991. list_move_tail(&root->ordered_root,
  3992. &fs_info->ordered_roots);
  3993. spin_unlock(&fs_info->ordered_root_lock);
  3994. btrfs_destroy_ordered_extents(root);
  3995. cond_resched();
  3996. spin_lock(&fs_info->ordered_root_lock);
  3997. }
  3998. spin_unlock(&fs_info->ordered_root_lock);
  3999. /*
  4000. * We need this here because if we've been flipped read-only we won't
  4001. * get sync() from the umount, so we need to make sure any ordered
  4002. * extents that haven't had their dirty pages IO start writeout yet
  4003. * actually get run and error out properly.
  4004. */
  4005. btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
  4006. }
  4007. static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
  4008. {
  4009. struct btrfs_inode *btrfs_inode;
  4010. LIST_HEAD(splice);
  4011. spin_lock(&root->delalloc_lock);
  4012. list_splice_init(&root->delalloc_inodes, &splice);
  4013. while (!list_empty(&splice)) {
  4014. struct inode *inode = NULL;
  4015. btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
  4016. delalloc_inodes);
  4017. btrfs_del_delalloc_inode(btrfs_inode);
  4018. spin_unlock(&root->delalloc_lock);
  4019. /*
  4020. * Make sure we get a live inode and that it'll not disappear
  4021. * meanwhile.
  4022. */
  4023. inode = igrab(&btrfs_inode->vfs_inode);
  4024. if (inode) {
  4025. unsigned int nofs_flag;
  4026. nofs_flag = memalloc_nofs_save();
  4027. invalidate_inode_pages2(inode->i_mapping);
  4028. memalloc_nofs_restore(nofs_flag);
  4029. iput(inode);
  4030. }
  4031. spin_lock(&root->delalloc_lock);
  4032. }
  4033. spin_unlock(&root->delalloc_lock);
  4034. }
  4035. static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
  4036. {
  4037. struct btrfs_root *root;
  4038. LIST_HEAD(splice);
  4039. spin_lock(&fs_info->delalloc_root_lock);
  4040. list_splice_init(&fs_info->delalloc_roots, &splice);
  4041. while (!list_empty(&splice)) {
  4042. root = list_first_entry(&splice, struct btrfs_root,
  4043. delalloc_root);
  4044. root = btrfs_grab_root(root);
  4045. BUG_ON(!root);
  4046. spin_unlock(&fs_info->delalloc_root_lock);
  4047. btrfs_destroy_delalloc_inodes(root);
  4048. btrfs_put_root(root);
  4049. spin_lock(&fs_info->delalloc_root_lock);
  4050. }
  4051. spin_unlock(&fs_info->delalloc_root_lock);
  4052. }
  4053. static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
  4054. struct extent_io_tree *dirty_pages,
  4055. int mark)
  4056. {
  4057. struct extent_buffer *eb;
  4058. u64 start = 0;
  4059. u64 end;
  4060. while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
  4061. mark, NULL)) {
  4062. btrfs_clear_extent_bit(dirty_pages, start, end, mark, NULL);
  4063. while (start <= end) {
  4064. eb = find_extent_buffer(fs_info, start);
  4065. start += fs_info->nodesize;
  4066. if (!eb)
  4067. continue;
  4068. btrfs_tree_lock(eb);
  4069. wait_on_extent_buffer_writeback(eb);
  4070. btrfs_clear_buffer_dirty(NULL, eb);
  4071. btrfs_tree_unlock(eb);
  4072. free_extent_buffer_stale(eb);
  4073. }
  4074. }
  4075. }
  4076. static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
  4077. struct extent_io_tree *unpin)
  4078. {
  4079. u64 start;
  4080. u64 end;
  4081. while (1) {
  4082. struct extent_state *cached_state = NULL;
  4083. /*
  4084. * The btrfs_finish_extent_commit() may get the same range as
  4085. * ours between find_first_extent_bit and clear_extent_dirty.
  4086. * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
  4087. * the same extent range.
  4088. */
  4089. mutex_lock(&fs_info->unused_bg_unpin_mutex);
  4090. if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end,
  4091. EXTENT_DIRTY, &cached_state)) {
  4092. mutex_unlock(&fs_info->unused_bg_unpin_mutex);
  4093. break;
  4094. }
  4095. btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
  4096. btrfs_free_extent_state(cached_state);
  4097. btrfs_error_unpin_extent_range(fs_info, start, end);
  4098. mutex_unlock(&fs_info->unused_bg_unpin_mutex);
  4099. cond_resched();
  4100. }
  4101. }
  4102. static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
  4103. {
  4104. struct inode *inode;
  4105. inode = cache->io_ctl.inode;
  4106. if (inode) {
  4107. unsigned int nofs_flag;
  4108. nofs_flag = memalloc_nofs_save();
  4109. invalidate_inode_pages2(inode->i_mapping);
  4110. memalloc_nofs_restore(nofs_flag);
  4111. BTRFS_I(inode)->generation = 0;
  4112. cache->io_ctl.inode = NULL;
  4113. iput(inode);
  4114. }
  4115. ASSERT(cache->io_ctl.pages == NULL);
  4116. btrfs_put_block_group(cache);
  4117. }
  4118. void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
  4119. struct btrfs_fs_info *fs_info)
  4120. {
  4121. struct btrfs_block_group *cache;
  4122. spin_lock(&cur_trans->dirty_bgs_lock);
  4123. while (!list_empty(&cur_trans->dirty_bgs)) {
  4124. cache = list_first_entry(&cur_trans->dirty_bgs,
  4125. struct btrfs_block_group,
  4126. dirty_list);
  4127. if (!list_empty(&cache->io_list)) {
  4128. spin_unlock(&cur_trans->dirty_bgs_lock);
  4129. list_del_init(&cache->io_list);
  4130. btrfs_cleanup_bg_io(cache);
  4131. spin_lock(&cur_trans->dirty_bgs_lock);
  4132. }
  4133. list_del_init(&cache->dirty_list);
  4134. spin_lock(&cache->lock);
  4135. cache->disk_cache_state = BTRFS_DC_ERROR;
  4136. spin_unlock(&cache->lock);
  4137. spin_unlock(&cur_trans->dirty_bgs_lock);
  4138. btrfs_put_block_group(cache);
  4139. btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
  4140. spin_lock(&cur_trans->dirty_bgs_lock);
  4141. }
  4142. spin_unlock(&cur_trans->dirty_bgs_lock);
  4143. /*
  4144. * Refer to the definition of io_bgs member for details why it's safe
  4145. * to use it without any locking
  4146. */
  4147. while (!list_empty(&cur_trans->io_bgs)) {
  4148. cache = list_first_entry(&cur_trans->io_bgs,
  4149. struct btrfs_block_group,
  4150. io_list);
  4151. list_del_init(&cache->io_list);
  4152. spin_lock(&cache->lock);
  4153. cache->disk_cache_state = BTRFS_DC_ERROR;
  4154. spin_unlock(&cache->lock);
  4155. btrfs_cleanup_bg_io(cache);
  4156. }
  4157. }
  4158. static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
  4159. {
  4160. struct btrfs_root *gang[8];
  4161. int i;
  4162. int ret;
  4163. spin_lock(&fs_info->fs_roots_radix_lock);
  4164. while (1) {
  4165. ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
  4166. (void **)gang, 0,
  4167. ARRAY_SIZE(gang),
  4168. BTRFS_ROOT_TRANS_TAG);
  4169. if (ret == 0)
  4170. break;
  4171. for (i = 0; i < ret; i++) {
  4172. struct btrfs_root *root = gang[i];
  4173. btrfs_qgroup_free_meta_all_pertrans(root);
  4174. radix_tree_tag_clear(&fs_info->fs_roots_radix,
  4175. (unsigned long)btrfs_root_id(root),
  4176. BTRFS_ROOT_TRANS_TAG);
  4177. }
  4178. }
  4179. spin_unlock(&fs_info->fs_roots_radix_lock);
  4180. }
  4181. void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans)
  4182. {
  4183. struct btrfs_fs_info *fs_info = cur_trans->fs_info;
  4184. struct btrfs_device *dev, *tmp;
  4185. btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
  4186. ASSERT(list_empty(&cur_trans->dirty_bgs));
  4187. ASSERT(list_empty(&cur_trans->io_bgs));
  4188. list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
  4189. post_commit_list) {
  4190. list_del_init(&dev->post_commit_list);
  4191. }
  4192. btrfs_destroy_delayed_refs(cur_trans);
  4193. cur_trans->state = TRANS_STATE_COMMIT_START;
  4194. wake_up(&fs_info->transaction_blocked_wait);
  4195. cur_trans->state = TRANS_STATE_UNBLOCKED;
  4196. wake_up(&fs_info->transaction_wait);
  4197. btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
  4198. EXTENT_DIRTY);
  4199. btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
  4200. cur_trans->state =TRANS_STATE_COMPLETED;
  4201. wake_up(&cur_trans->commit_wait);
  4202. }
  4203. static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
  4204. {
  4205. struct btrfs_transaction *t;
  4206. mutex_lock(&fs_info->transaction_kthread_mutex);
  4207. spin_lock(&fs_info->trans_lock);
  4208. while (!list_empty(&fs_info->trans_list)) {
  4209. t = list_first_entry(&fs_info->trans_list,
  4210. struct btrfs_transaction, list);
  4211. if (t->state >= TRANS_STATE_COMMIT_PREP) {
  4212. refcount_inc(&t->use_count);
  4213. spin_unlock(&fs_info->trans_lock);
  4214. btrfs_wait_for_commit(fs_info, t->transid);
  4215. btrfs_put_transaction(t);
  4216. spin_lock(&fs_info->trans_lock);
  4217. continue;
  4218. }
  4219. if (t == fs_info->running_transaction) {
  4220. t->state = TRANS_STATE_COMMIT_DOING;
  4221. spin_unlock(&fs_info->trans_lock);
  4222. /*
  4223. * We wait for 0 num_writers since we don't hold a trans
  4224. * handle open currently for this transaction.
  4225. */
  4226. wait_event(t->writer_wait,
  4227. atomic_read(&t->num_writers) == 0);
  4228. } else {
  4229. spin_unlock(&fs_info->trans_lock);
  4230. }
  4231. btrfs_cleanup_one_transaction(t);
  4232. spin_lock(&fs_info->trans_lock);
  4233. if (t == fs_info->running_transaction)
  4234. fs_info->running_transaction = NULL;
  4235. list_del_init(&t->list);
  4236. spin_unlock(&fs_info->trans_lock);
  4237. btrfs_put_transaction(t);
  4238. trace_btrfs_transaction_commit(fs_info);
  4239. spin_lock(&fs_info->trans_lock);
  4240. }
  4241. spin_unlock(&fs_info->trans_lock);
  4242. btrfs_destroy_all_ordered_extents(fs_info);
  4243. btrfs_destroy_delayed_inodes(fs_info);
  4244. btrfs_assert_delayed_root_empty(fs_info);
  4245. btrfs_destroy_all_delalloc_inodes(fs_info);
  4246. btrfs_drop_all_logs(fs_info);
  4247. btrfs_free_all_qgroup_pertrans(fs_info);
  4248. mutex_unlock(&fs_info->transaction_kthread_mutex);
  4249. return 0;
  4250. }
  4251. int btrfs_init_root_free_objectid(struct btrfs_root *root)
  4252. {
  4253. BTRFS_PATH_AUTO_FREE(path);
  4254. int ret;
  4255. struct extent_buffer *l;
  4256. struct btrfs_key search_key;
  4257. struct btrfs_key found_key;
  4258. int slot;
  4259. path = btrfs_alloc_path();
  4260. if (!path)
  4261. return -ENOMEM;
  4262. search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
  4263. search_key.type = -1;
  4264. search_key.offset = (u64)-1;
  4265. ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
  4266. if (ret < 0)
  4267. return ret;
  4268. if (unlikely(ret == 0)) {
  4269. /*
  4270. * Key with offset -1 found, there would have to exist a root
  4271. * with such id, but this is out of valid range.
  4272. */
  4273. return -EUCLEAN;
  4274. }
  4275. if (path->slots[0] > 0) {
  4276. slot = path->slots[0] - 1;
  4277. l = path->nodes[0];
  4278. btrfs_item_key_to_cpu(l, &found_key, slot);
  4279. root->free_objectid = max_t(u64, found_key.objectid + 1,
  4280. BTRFS_FIRST_FREE_OBJECTID);
  4281. } else {
  4282. root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
  4283. }
  4284. return 0;
  4285. }
  4286. int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
  4287. {
  4288. int ret;
  4289. mutex_lock(&root->objectid_mutex);
  4290. if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
  4291. btrfs_warn(root->fs_info,
  4292. "the objectid of root %llu reaches its highest value",
  4293. btrfs_root_id(root));
  4294. ret = -ENOSPC;
  4295. goto out;
  4296. }
  4297. *objectid = root->free_objectid++;
  4298. ret = 0;
  4299. out:
  4300. mutex_unlock(&root->objectid_mutex);
  4301. return ret;
  4302. }