namespace.c 164 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * linux/fs/namespace.c
  4. *
  5. * (C) Copyright Al Viro 2000, 2001
  6. *
  7. * Based on code from fs/super.c, copyright Linus Torvalds and others.
  8. * Heavily rewritten.
  9. */
  10. #include <linux/syscalls.h>
  11. #include <linux/export.h>
  12. #include <linux/capability.h>
  13. #include <linux/mnt_namespace.h>
  14. #include <linux/user_namespace.h>
  15. #include <linux/namei.h>
  16. #include <linux/security.h>
  17. #include <linux/cred.h>
  18. #include <linux/idr.h>
  19. #include <linux/init.h> /* init_rootfs */
  20. #include <linux/fs_struct.h> /* get_fs_root et.al. */
  21. #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
  22. #include <linux/file.h>
  23. #include <linux/uaccess.h>
  24. #include <linux/proc_ns.h>
  25. #include <linux/magic.h>
  26. #include <linux/memblock.h>
  27. #include <linux/proc_fs.h>
  28. #include <linux/task_work.h>
  29. #include <linux/sched/task.h>
  30. #include <uapi/linux/mount.h>
  31. #include <linux/fs_context.h>
  32. #include <linux/shmem_fs.h>
  33. #include <linux/mnt_idmapping.h>
  34. #include <linux/pidfs.h>
  35. #include <linux/nstree.h>
  36. #include "pnode.h"
  37. #include "internal.h"
/* Maximum number of mounts in a mount namespace */
static unsigned int sysctl_mount_max __read_mostly = 100000;

/*
 * Mask and shift used by m_hash() to fold a (vfsmount, dentry) pair
 * into an index of mount_hashtable.
 */
static unsigned int m_hash_mask __ro_after_init;
static unsigned int m_hash_shift __ro_after_init;

/* Mask and shift used by mp_hash() to index mountpoint_hashtable. */
static unsigned int mp_hash_mask __ro_after_init;
static unsigned int mp_hash_shift __ro_after_init;
  44. static __initdata unsigned long mhash_entries;
  45. static int __init set_mhash_entries(char *str)
  46. {
  47. return kstrtoul(str, 0, &mhash_entries) == 0;
  48. }
  49. __setup("mhash_entries=", set_mhash_entries);
  50. static __initdata unsigned long mphash_entries;
  51. static int __init set_mphash_entries(char *str)
  52. {
  53. return kstrtoul(str, 0, &mphash_entries) == 0;
  54. }
  55. __setup("mphash_entries=", set_mphash_entries);
  56. static char * __initdata initramfs_options;
  57. static int __init initramfs_options_setup(char *str)
  58. {
  59. initramfs_options = str;
  60. return 1;
  61. }
  62. __setup("initramfs_options=", initramfs_options_setup);
static u64 event;

/* Allocating XArray behind mount->mnt_id; see mnt_alloc_id()/mnt_free_id(). */
static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
/* IDA behind mount->mnt_group_id (peer group IDs). */
static DEFINE_IDA(mnt_group_ida);

/* Don't allow confusion with old 32bit mount ID */
#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
/* Pre-incremented by mnt_alloc_id() to produce mount->mnt_id_unique. */
static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;

/* Hash tables indexed by m_hash() and mp_hash() respectively. */
static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
/* Slab cache that struct mount objects are allocated from. */
static struct kmem_cache *mnt_cache __ro_after_init;

static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */

static inline void namespace_lock(void);
static void namespace_unlock(void);

/*
 * Scope-based guards: namespace_excl pairs namespace_lock() with
 * namespace_unlock(); namespace_shared pairs down_read() with up_read()
 * on namespace_sem.
 */
DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock())
DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem),
		    up_read(&namespace_sem))

/* Cleanup helper: mntput() the vfsmount unless it is an IS_ERR() value. */
DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T))

#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
  85. enum mount_kattr_flags_t {
  86. MOUNT_KATTR_RECURSE = (1 << 0),
  87. MOUNT_KATTR_IDMAP_REPLACE = (1 << 1),
  88. };
/*
 * Kernel-internal bundle of parameters for a mount attribute change.
 *
 * NOTE(review): the users of this struct are outside the visible part of
 * the file; the field notes below are inferred from the names only and
 * should be confirmed against the code that fills them in.
 */
struct mount_kattr {
	unsigned int attr_set;		/* presumably attribute bits to set */
	unsigned int attr_clr;		/* presumably attribute bits to clear */
	unsigned int propagation;
	unsigned int lookup_flags;
	enum mount_kattr_flags_t kflags;	/* MOUNT_KATTR_* bits */
	struct user_namespace *mnt_userns;
	struct mnt_idmap *mnt_idmap;
};
/* /sys/fs */
struct kobject *fs_kobj __ro_after_init;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, i.e. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
  110. static void mnt_ns_release(struct mnt_namespace *ns)
  111. {
  112. /* keep alive for {list,stat}mount() */
  113. if (ns && refcount_dec_and_test(&ns->passive)) {
  114. fsnotify_mntns_delete(ns);
  115. put_user_ns(ns->user_ns);
  116. kfree(ns);
  117. }
  118. }
  119. DEFINE_FREE(mnt_ns_release, struct mnt_namespace *,
  120. if (!IS_ERR(_T)) mnt_ns_release(_T))
  121. static void mnt_ns_release_rcu(struct rcu_head *rcu)
  122. {
  123. mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu));
  124. }
  125. static void mnt_ns_tree_remove(struct mnt_namespace *ns)
  126. {
  127. /* remove from global mount namespace list */
  128. if (ns_tree_active(ns))
  129. ns_tree_remove(ns);
  130. call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu);
  131. }
  132. /*
  133. * Lookup a mount namespace by id and take a passive reference count. Taking a
  134. * passive reference means the mount namespace can be emptied if e.g., the last
  135. * task holding an active reference exits. To access the mounts of the
  136. * namespace the @namespace_sem must first be acquired. If the namespace has
  137. * already shut down before acquiring @namespace_sem, {list,stat}mount() will
  138. * see that the mount rbtree of the namespace is empty.
  139. *
  140. * Note the lookup is lockless protected by a sequence counter. We only
  141. * need to guard against false negatives as false positives aren't
  142. * possible. So if we didn't find a mount namespace and the sequence
  143. * counter has changed we need to retry. If the sequence counter is
  144. * still the same we know the search actually failed.
  145. */
  146. static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
  147. {
  148. struct mnt_namespace *mnt_ns;
  149. struct ns_common *ns;
  150. guard(rcu)();
  151. ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS);
  152. if (!ns)
  153. return NULL;
  154. /*
  155. * The last reference count is put with RCU delay so we can
  156. * unconditonally acquire a reference here.
  157. */
  158. mnt_ns = container_of(ns, struct mnt_namespace, ns);
  159. refcount_inc(&mnt_ns->passive);
  160. return mnt_ns;
  161. }
  162. static inline void lock_mount_hash(void)
  163. {
  164. write_seqlock(&mount_lock);
  165. }
  166. static inline void unlock_mount_hash(void)
  167. {
  168. write_sequnlock(&mount_lock);
  169. }
  170. static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
  171. {
  172. unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
  173. tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
  174. tmp = tmp + (tmp >> m_hash_shift);
  175. return &mount_hashtable[tmp & m_hash_mask];
  176. }
  177. static inline struct hlist_head *mp_hash(struct dentry *dentry)
  178. {
  179. unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
  180. tmp = tmp + (tmp >> mp_hash_shift);
  181. return &mountpoint_hashtable[tmp & mp_hash_mask];
  182. }
  183. static int mnt_alloc_id(struct mount *mnt)
  184. {
  185. int res;
  186. xa_lock(&mnt_id_xa);
  187. res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL);
  188. if (!res)
  189. mnt->mnt_id_unique = ++mnt_id_ctr;
  190. xa_unlock(&mnt_id_xa);
  191. return res;
  192. }
  193. static void mnt_free_id(struct mount *mnt)
  194. {
  195. xa_erase(&mnt_id_xa, mnt->mnt_id);
  196. }
  197. /*
  198. * Allocate a new peer group ID
  199. */
  200. static int mnt_alloc_group_id(struct mount *mnt)
  201. {
  202. int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
  203. if (res < 0)
  204. return res;
  205. mnt->mnt_group_id = res;
  206. return 0;
  207. }
  208. /*
  209. * Release a peer group ID
  210. */
  211. void mnt_release_group_id(struct mount *mnt)
  212. {
  213. ida_free(&mnt_group_ida, mnt->mnt_group_id);
  214. mnt->mnt_group_id = 0;
  215. }
  216. /*
  217. * vfsmount lock must be held for read
  218. */
  219. static inline void mnt_add_count(struct mount *mnt, int n)
  220. {
  221. #ifdef CONFIG_SMP
  222. this_cpu_add(mnt->mnt_pcp->mnt_count, n);
  223. #else
  224. preempt_disable();
  225. mnt->mnt_count += n;
  226. preempt_enable();
  227. #endif
  228. }
  229. /*
  230. * vfsmount lock must be held for write
  231. */
  232. int mnt_get_count(struct mount *mnt)
  233. {
  234. #ifdef CONFIG_SMP
  235. int count = 0;
  236. int cpu;
  237. for_each_possible_cpu(cpu) {
  238. count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
  239. }
  240. return count;
  241. #else
  242. return mnt->mnt_count;
  243. #endif
  244. }
  245. static struct mount *alloc_vfsmnt(const char *name)
  246. {
  247. struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
  248. if (mnt) {
  249. int err;
  250. err = mnt_alloc_id(mnt);
  251. if (err)
  252. goto out_free_cache;
  253. if (name)
  254. mnt->mnt_devname = kstrdup_const(name,
  255. GFP_KERNEL_ACCOUNT);
  256. else
  257. mnt->mnt_devname = "none";
  258. if (!mnt->mnt_devname)
  259. goto out_free_id;
  260. #ifdef CONFIG_SMP
  261. mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
  262. if (!mnt->mnt_pcp)
  263. goto out_free_devname;
  264. this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
  265. #else
  266. mnt->mnt_count = 1;
  267. mnt->mnt_writers = 0;
  268. #endif
  269. INIT_HLIST_NODE(&mnt->mnt_hash);
  270. INIT_LIST_HEAD(&mnt->mnt_child);
  271. INIT_LIST_HEAD(&mnt->mnt_mounts);
  272. INIT_LIST_HEAD(&mnt->mnt_list);
  273. INIT_LIST_HEAD(&mnt->mnt_expire);
  274. INIT_LIST_HEAD(&mnt->mnt_share);
  275. INIT_HLIST_HEAD(&mnt->mnt_slave_list);
  276. INIT_HLIST_NODE(&mnt->mnt_slave);
  277. INIT_HLIST_NODE(&mnt->mnt_mp_list);
  278. INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
  279. RB_CLEAR_NODE(&mnt->mnt_node);
  280. mnt->mnt.mnt_idmap = &nop_mnt_idmap;
  281. }
  282. return mnt;
  283. #ifdef CONFIG_SMP
  284. out_free_devname:
  285. kfree_const(mnt->mnt_devname);
  286. #endif
  287. out_free_id:
  288. mnt_free_id(mnt);
  289. out_free_cache:
  290. kmem_cache_free(mnt_cache, mnt);
  291. return NULL;
  292. }
  293. /*
  294. * Most r/o checks on a fs are for operations that take
  295. * discrete amounts of time, like a write() or unlink().
  296. * We must keep track of when those operations start
  297. * (for permission checks) and when they end, so that
  298. * we can determine when writes are able to occur to
  299. * a filesystem.
  300. */
  301. /*
  302. * __mnt_is_readonly: check whether a mount is read-only
  303. * @mnt: the mount to check for its write status
  304. *
  305. * This shouldn't be used directly ouside of the VFS.
  306. * It does not guarantee that the filesystem will stay
  307. * r/w, just that it is right *now*. This can not and
  308. * should not be used in place of IS_RDONLY(inode).
  309. * mnt_want/drop_write() will _keep_ the filesystem
  310. * r/w.
  311. */
  312. bool __mnt_is_readonly(const struct vfsmount *mnt)
  313. {
  314. return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
  315. }
  316. EXPORT_SYMBOL_GPL(__mnt_is_readonly);
  317. static inline void mnt_inc_writers(struct mount *mnt)
  318. {
  319. #ifdef CONFIG_SMP
  320. this_cpu_inc(mnt->mnt_pcp->mnt_writers);
  321. #else
  322. mnt->mnt_writers++;
  323. #endif
  324. }
  325. static inline void mnt_dec_writers(struct mount *mnt)
  326. {
  327. #ifdef CONFIG_SMP
  328. this_cpu_dec(mnt->mnt_pcp->mnt_writers);
  329. #else
  330. mnt->mnt_writers--;
  331. #endif
  332. }
  333. static unsigned int mnt_get_writers(struct mount *mnt)
  334. {
  335. #ifdef CONFIG_SMP
  336. unsigned int count = 0;
  337. int cpu;
  338. for_each_possible_cpu(cpu) {
  339. count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
  340. }
  341. return count;
  342. #else
  343. return mnt->mnt_writers;
  344. #endif
  345. }
  346. static int mnt_is_readonly(const struct vfsmount *mnt)
  347. {
  348. if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
  349. return 1;
  350. /*
  351. * The barrier pairs with the barrier in sb_start_ro_state_change()
  352. * making sure if we don't see s_readonly_remount set yet, we also will
  353. * not see any superblock / mount flag changes done by remount.
  354. * It also pairs with the barrier in sb_end_ro_state_change()
  355. * assuring that if we see s_readonly_remount already cleared, we will
  356. * see the values of superblock / mount flags updated by remount.
  357. */
  358. smp_rmb();
  359. return __mnt_is_readonly(mnt);
  360. }
  361. /*
  362. * Most r/o & frozen checks on a fs are for operations that take discrete
  363. * amounts of time, like a write() or unlink(). We must keep track of when
  364. * those operations start (for permission checks) and when they end, so that we
  365. * can determine when writes are able to occur to a filesystem.
  366. */
  367. /**
  368. * mnt_get_write_access - get write access to a mount without freeze protection
  369. * @m: the mount on which to take a write
  370. *
  371. * This tells the low-level filesystem that a write is about to be performed to
  372. * it, and makes sure that writes are allowed (mnt it read-write) before
  373. * returning success. This operation does not protect against filesystem being
  374. * frozen. When the write operation is finished, mnt_put_write_access() must be
  375. * called. This is effectively a refcount.
  376. */
  377. int mnt_get_write_access(struct vfsmount *m)
  378. {
  379. struct mount *mnt = real_mount(m);
  380. int ret = 0;
  381. preempt_disable();
  382. mnt_inc_writers(mnt);
  383. /*
  384. * The store to mnt_inc_writers must be visible before we pass
  385. * WRITE_HOLD loop below, so that the slowpath can see our
  386. * incremented count after it has set WRITE_HOLD.
  387. */
  388. smp_mb();
  389. might_lock(&mount_lock.lock);
  390. while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) {
  391. if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
  392. cpu_relax();
  393. } else {
  394. /*
  395. * This prevents priority inversion, if the task
  396. * setting WRITE_HOLD got preempted on a remote
  397. * CPU, and it prevents life lock if the task setting
  398. * WRITE_HOLD has a lower priority and is bound to
  399. * the same CPU as the task that is spinning here.
  400. */
  401. preempt_enable();
  402. read_seqlock_excl(&mount_lock);
  403. read_sequnlock_excl(&mount_lock);
  404. preempt_disable();
  405. }
  406. }
  407. /*
  408. * The barrier pairs with the barrier sb_start_ro_state_change() making
  409. * sure that if we see WRITE_HOLD cleared, we will also see
  410. * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
  411. * mnt_is_readonly() and bail in case we are racing with remount
  412. * read-only.
  413. */
  414. smp_rmb();
  415. if (mnt_is_readonly(m)) {
  416. mnt_dec_writers(mnt);
  417. ret = -EROFS;
  418. }
  419. preempt_enable();
  420. return ret;
  421. }
  422. EXPORT_SYMBOL_GPL(mnt_get_write_access);
  423. /**
  424. * mnt_want_write - get write access to a mount
  425. * @m: the mount on which to take a write
  426. *
  427. * This tells the low-level filesystem that a write is about to be performed to
  428. * it, and makes sure that writes are allowed (mount is read-write, filesystem
  429. * is not frozen) before returning success. When the write operation is
  430. * finished, mnt_drop_write() must be called. This is effectively a refcount.
  431. */
  432. int mnt_want_write(struct vfsmount *m)
  433. {
  434. int ret;
  435. sb_start_write(m->mnt_sb);
  436. ret = mnt_get_write_access(m);
  437. if (ret)
  438. sb_end_write(m->mnt_sb);
  439. return ret;
  440. }
  441. EXPORT_SYMBOL_GPL(mnt_want_write);
  442. /**
  443. * mnt_get_write_access_file - get write access to a file's mount
  444. * @file: the file who's mount on which to take a write
  445. *
  446. * This is like mnt_get_write_access, but if @file is already open for write it
  447. * skips incrementing mnt_writers (since the open file already has a reference)
  448. * and instead only does the check for emergency r/o remounts. This must be
  449. * paired with mnt_put_write_access_file.
  450. */
  451. int mnt_get_write_access_file(struct file *file)
  452. {
  453. if (file->f_mode & FMODE_WRITER) {
  454. /*
  455. * Superblock may have become readonly while there are still
  456. * writable fd's, e.g. due to a fs error with errors=remount-ro
  457. */
  458. if (__mnt_is_readonly(file->f_path.mnt))
  459. return -EROFS;
  460. return 0;
  461. }
  462. return mnt_get_write_access(file->f_path.mnt);
  463. }
  464. /**
  465. * mnt_want_write_file - get write access to a file's mount
  466. * @file: the file who's mount on which to take a write
  467. *
  468. * This is like mnt_want_write, but if the file is already open for writing it
  469. * skips incrementing mnt_writers (since the open file already has a reference)
  470. * and instead only does the freeze protection and the check for emergency r/o
  471. * remounts. This must be paired with mnt_drop_write_file.
  472. */
  473. int mnt_want_write_file(struct file *file)
  474. {
  475. int ret;
  476. sb_start_write(file_inode(file)->i_sb);
  477. ret = mnt_get_write_access_file(file);
  478. if (ret)
  479. sb_end_write(file_inode(file)->i_sb);
  480. return ret;
  481. }
  482. EXPORT_SYMBOL_GPL(mnt_want_write_file);
  /**
   * mnt_put_write_access - give up write access to a mount
   * @mnt: the mount on which to give up write access
   *
   * Tells the low-level filesystem that we are done
   * performing writes to it. Must be matched with
   * mnt_get_write_access() call above.
   *
   * The writer count is decremented with preemption disabled.
   */
  void mnt_put_write_access(struct vfsmount *mnt)
  {
  	struct mount *rm = real_mount(mnt);

  	preempt_disable();
  	mnt_dec_writers(rm);
  	preempt_enable();
  }
  EXPORT_SYMBOL_GPL(mnt_put_write_access);
  498. /**
  499. * mnt_drop_write - give up write access to a mount
  500. * @mnt: the mount on which to give up write access
  501. *
  502. * Tells the low-level filesystem that we are done performing writes to it and
  503. * also allows filesystem to be frozen again. Must be matched with
  504. * mnt_want_write() call above.
  505. */
  506. void mnt_drop_write(struct vfsmount *mnt)
  507. {
  508. mnt_put_write_access(mnt);
  509. sb_end_write(mnt->mnt_sb);
  510. }
  511. EXPORT_SYMBOL_GPL(mnt_drop_write);
  512. void mnt_put_write_access_file(struct file *file)
  513. {
  514. if (!(file->f_mode & FMODE_WRITER))
  515. mnt_put_write_access(file->f_path.mnt);
  516. }
  517. void mnt_drop_write_file(struct file *file)
  518. {
  519. mnt_put_write_access_file(file);
  520. sb_end_write(file_inode(file)->i_sb);
  521. }
  522. EXPORT_SYMBOL(mnt_drop_write_file);
  523. /**
  524. * mnt_hold_writers - prevent write access to the given mount
  525. * @mnt: mnt to prevent write access to
  526. *
  527. * Prevents write access to @mnt if there are no active writers for @mnt.
  528. * This function needs to be called and return successfully before changing
  529. * properties of @mnt that need to remain stable for callers with write access
  530. * to @mnt.
  531. *
  532. * After this functions has been called successfully callers must pair it with
  533. * a call to mnt_unhold_writers() in order to stop preventing write access to
  534. * @mnt.
  535. *
  536. * Context: This function expects to be in mount_locked_reader scope serializing
  537. * setting WRITE_HOLD.
  538. * Return: On success 0 is returned.
  539. * On error, -EBUSY is returned.
  540. */
  541. static inline int mnt_hold_writers(struct mount *mnt)
  542. {
  543. set_write_hold(mnt);
  544. /*
  545. * After storing WRITE_HOLD, we'll read the counters. This store
  546. * should be visible before we do.
  547. */
  548. smp_mb();
  549. /*
  550. * With writers on hold, if this value is zero, then there are
  551. * definitely no active writers (although held writers may subsequently
  552. * increment the count, they'll have to wait, and decrement it after
  553. * seeing MNT_READONLY).
  554. *
  555. * It is OK to have counter incremented on one CPU and decremented on
  556. * another: the sum will add up correctly. The danger would be when we
  557. * sum up each counter, if we read a counter before it is incremented,
  558. * but then read another CPU's count which it has been subsequently
  559. * decremented from -- we would see more decrements than we should.
  560. * WRITE_HOLD protects against this scenario, because
  561. * mnt_want_write first increments count, then smp_mb, then spins on
  562. * WRITE_HOLD, so it can't be decremented by another CPU while
  563. * we're counting up here.
  564. */
  565. if (mnt_get_writers(mnt) > 0)
  566. return -EBUSY;
  567. return 0;
  568. }
  /**
   * mnt_unhold_writers - stop preventing write access to the given mount
   * @mnt: mnt to stop preventing write access to
   *
   * Stop preventing write access to @mnt allowing callers to gain write access
   * to @mnt again. Does nothing if WRITE_HOLD is not set on @mnt.
   *
   * This function can only be called after a call to mnt_hold_writers().
   *
   * Context: This function expects to be in the same mount_locked_reader scope
   * as the matching mnt_hold_writers().
   */
  static inline void mnt_unhold_writers(struct mount *mnt)
  {
  	if (test_write_hold(mnt)) {
  		/*
  		 * MNT_READONLY must become visible before ~WRITE_HOLD, so
  		 * writers that become unheld will see MNT_READONLY.
  		 */
  		smp_wmb();
  		clear_write_hold(mnt);
  	}
  }
  592. static inline void mnt_del_instance(struct mount *m)
  593. {
  594. struct mount **p = m->mnt_pprev_for_sb;
  595. struct mount *next = m->mnt_next_for_sb;
  596. if (next)
  597. next->mnt_pprev_for_sb = p;
  598. *p = next;
  599. }
  600. static inline void mnt_add_instance(struct mount *m, struct super_block *s)
  601. {
  602. struct mount *first = s->s_mounts;
  603. if (first)
  604. first->mnt_pprev_for_sb = &m->mnt_next_for_sb;
  605. m->mnt_next_for_sb = first;
  606. m->mnt_pprev_for_sb = &s->s_mounts;
  607. s->s_mounts = m;
  608. }
  609. static int mnt_make_readonly(struct mount *mnt)
  610. {
  611. int ret;
  612. ret = mnt_hold_writers(mnt);
  613. if (!ret)
  614. mnt->mnt.mnt_flags |= MNT_READONLY;
  615. mnt_unhold_writers(mnt);
  616. return ret;
  617. }
  618. int sb_prepare_remount_readonly(struct super_block *sb)
  619. {
  620. int err = 0;
  621. /* Racy optimization. Recheck the counter under WRITE_HOLD */
  622. if (atomic_long_read(&sb->s_remove_count))
  623. return -EBUSY;
  624. guard(mount_locked_reader)();
  625. for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
  626. if (!(m->mnt.mnt_flags & MNT_READONLY)) {
  627. err = mnt_hold_writers(m);
  628. if (err)
  629. break;
  630. }
  631. }
  632. if (!err && atomic_long_read(&sb->s_remove_count))
  633. err = -EBUSY;
  634. if (!err)
  635. sb_start_ro_state_change(sb);
  636. for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
  637. if (test_write_hold(m))
  638. clear_write_hold(m);
  639. }
  640. return err;
  641. }
  642. static void free_vfsmnt(struct mount *mnt)
  643. {
  644. mnt_idmap_put(mnt_idmap(&mnt->mnt));
  645. kfree_const(mnt->mnt_devname);
  646. #ifdef CONFIG_SMP
  647. free_percpu(mnt->mnt_pcp);
  648. #endif
  649. kmem_cache_free(mnt_cache, mnt);
  650. }
  651. static void delayed_free_vfsmnt(struct rcu_head *head)
  652. {
  653. free_vfsmnt(container_of(head, struct mount, mnt_rcu));
  654. }
  655. /* call under rcu_read_lock */
  656. int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
  657. {
  658. struct mount *mnt;
  659. if (read_seqretry(&mount_lock, seq))
  660. return 1;
  661. if (bastard == NULL)
  662. return 0;
  663. mnt = real_mount(bastard);
  664. mnt_add_count(mnt, 1);
  665. smp_mb(); // see mntput_no_expire() and do_umount()
  666. if (likely(!read_seqretry(&mount_lock, seq)))
  667. return 0;
  668. lock_mount_hash();
  669. if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {
  670. mnt_add_count(mnt, -1);
  671. unlock_mount_hash();
  672. return 1;
  673. }
  674. unlock_mount_hash();
  675. /* caller will mntput() */
  676. return -1;
  677. }
  678. /* call under rcu_read_lock */
  679. static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
  680. {
  681. int res = __legitimize_mnt(bastard, seq);
  682. if (likely(!res))
  683. return true;
  684. if (unlikely(res < 0)) {
  685. rcu_read_unlock();
  686. mntput(bastard);
  687. rcu_read_lock();
  688. }
  689. return false;
  690. }
  691. /**
  692. * __lookup_mnt - mount hash lookup
  693. * @mnt: parent mount
  694. * @dentry: dentry of mountpoint
  695. *
  696. * If @mnt has a child mount @c mounted on @dentry find and return it.
  697. * Caller must either hold the spinlock component of @mount_lock or
  698. * hold rcu_read_lock(), sample the seqcount component before the call
  699. * and recheck it afterwards.
  700. *
  701. * Return: The child of @mnt mounted on @dentry or %NULL.
  702. */
  703. struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
  704. {
  705. struct hlist_head *head = m_hash(mnt, dentry);
  706. struct mount *p;
  707. hlist_for_each_entry_rcu(p, head, mnt_hash)
  708. if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
  709. return p;
  710. return NULL;
  711. }
  712. /**
  713. * lookup_mnt - Return the child mount mounted at given location
  714. * @path: location in the namespace
  715. *
  716. * Acquires and returns a new reference to mount at given location
  717. * or %NULL if nothing is mounted there.
  718. */
  719. struct vfsmount *lookup_mnt(const struct path *path)
  720. {
  721. struct mount *child_mnt;
  722. struct vfsmount *m;
  723. unsigned seq;
  724. rcu_read_lock();
  725. do {
  726. seq = read_seqbegin(&mount_lock);
  727. child_mnt = __lookup_mnt(path->mnt, path->dentry);
  728. m = child_mnt ? &child_mnt->mnt : NULL;
  729. } while (!legitimize_mnt(m, seq));
  730. rcu_read_unlock();
  731. return m;
  732. }
  733. /*
  734. * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
  735. * current mount namespace.
  736. *
  737. * The common case is dentries are not mountpoints at all and that
  738. * test is handled inline. For the slow case when we are actually
  739. * dealing with a mountpoint of some kind, walk through all of the
  740. * mounts in the current mount namespace and test to see if the dentry
  741. * is a mountpoint.
  742. *
  743. * The mount_hashtable is not usable in the context because we
  744. * need to identify all mounts that may be in the current mount
  745. * namespace not just a mount that happens to have some specified
  746. * parent mount.
  747. */
  748. bool __is_local_mountpoint(const struct dentry *dentry)
  749. {
  750. struct mnt_namespace *ns = current->nsproxy->mnt_ns;
  751. struct mount *mnt, *n;
  752. guard(namespace_shared)();
  753. rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node)
  754. if (mnt->mnt_mountpoint == dentry)
  755. return true;
  756. return false;
  757. }
  758. struct pinned_mountpoint {
  759. struct hlist_node node;
  760. struct mountpoint *mp;
  761. struct mount *parent;
  762. };
  763. static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
  764. {
  765. struct hlist_head *chain = mp_hash(dentry);
  766. struct mountpoint *mp;
  767. hlist_for_each_entry(mp, chain, m_hash) {
  768. if (mp->m_dentry == dentry) {
  769. hlist_add_head(&m->node, &mp->m_list);
  770. m->mp = mp;
  771. return true;
  772. }
  773. }
  774. return false;
  775. }
  776. static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
  777. {
  778. struct mountpoint *mp __free(kfree) = NULL;
  779. bool found;
  780. int ret;
  781. if (d_mountpoint(dentry)) {
  782. /* might be worth a WARN_ON() */
  783. if (d_unlinked(dentry))
  784. return -ENOENT;
  785. mountpoint:
  786. read_seqlock_excl(&mount_lock);
  787. found = lookup_mountpoint(dentry, m);
  788. read_sequnlock_excl(&mount_lock);
  789. if (found)
  790. return 0;
  791. }
  792. if (!mp)
  793. mp = kmalloc_obj(struct mountpoint);
  794. if (!mp)
  795. return -ENOMEM;
  796. /* Exactly one processes may set d_mounted */
  797. ret = d_set_mounted(dentry);
  798. /* Someone else set d_mounted? */
  799. if (ret == -EBUSY)
  800. goto mountpoint;
  801. /* The dentry is not available as a mountpoint? */
  802. if (ret)
  803. return ret;
  804. /* Add the new mountpoint to the hash table */
  805. read_seqlock_excl(&mount_lock);
  806. mp->m_dentry = dget(dentry);
  807. hlist_add_head(&mp->m_hash, mp_hash(dentry));
  808. INIT_HLIST_HEAD(&mp->m_list);
  809. hlist_add_head(&m->node, &mp->m_list);
  810. m->mp = no_free_ptr(mp);
  811. read_sequnlock_excl(&mount_lock);
  812. return 0;
  813. }
  814. /*
  815. * vfsmount lock must be held. Additionally, the caller is responsible
  816. * for serializing calls for given disposal list.
  817. */
  818. static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list)
  819. {
  820. if (hlist_empty(&mp->m_list)) {
  821. struct dentry *dentry = mp->m_dentry;
  822. spin_lock(&dentry->d_lock);
  823. dentry->d_flags &= ~DCACHE_MOUNTED;
  824. spin_unlock(&dentry->d_lock);
  825. dput_to_list(dentry, list);
  826. hlist_del(&mp->m_hash);
  827. kfree(mp);
  828. }
  829. }
  830. /*
  831. * locks: mount_lock [read_seqlock_excl], namespace_sem [excl]
  832. */
  833. static void unpin_mountpoint(struct pinned_mountpoint *m)
  834. {
  835. if (m->mp) {
  836. hlist_del(&m->node);
  837. maybe_free_mountpoint(m->mp, &ex_mountpoints);
  838. }
  839. }
  840. static inline int check_mnt(const struct mount *mnt)
  841. {
  842. return mnt->mnt_ns == current->nsproxy->mnt_ns;
  843. }
  844. static inline bool check_anonymous_mnt(struct mount *mnt)
  845. {
  846. u64 seq;
  847. if (!is_anon_ns(mnt->mnt_ns))
  848. return false;
  849. seq = mnt->mnt_ns->seq_origin;
  850. return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id);
  851. }
  852. /*
  853. * vfsmount lock must be held for write
  854. */
  855. static void touch_mnt_namespace(struct mnt_namespace *ns)
  856. {
  857. if (ns) {
  858. ns->event = ++event;
  859. wake_up_interruptible(&ns->poll);
  860. }
  861. }
  862. /*
  863. * vfsmount lock must be held for write
  864. */
  865. static void __touch_mnt_namespace(struct mnt_namespace *ns)
  866. {
  867. if (ns && ns->event != event) {
  868. ns->event = event;
  869. wake_up_interruptible(&ns->poll);
  870. }
  871. }
  872. /*
  873. * locks: mount_lock[write_seqlock]
  874. */
  875. static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list)
  876. {
  877. struct mountpoint *mp;
  878. struct mount *parent = mnt->mnt_parent;
  879. if (unlikely(parent->overmount == mnt))
  880. parent->overmount = NULL;
  881. mnt->mnt_parent = mnt;
  882. mnt->mnt_mountpoint = mnt->mnt.mnt_root;
  883. list_del_init(&mnt->mnt_child);
  884. hlist_del_init_rcu(&mnt->mnt_hash);
  885. hlist_del_init(&mnt->mnt_mp_list);
  886. mp = mnt->mnt_mp;
  887. mnt->mnt_mp = NULL;
  888. maybe_free_mountpoint(mp, shrink_list);
  889. }
  890. /*
  891. * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints)
  892. */
  893. static void umount_mnt(struct mount *mnt)
  894. {
  895. __umount_mnt(mnt, &ex_mountpoints);
  896. }
  897. /*
  898. * vfsmount lock must be held for write
  899. */
  900. void mnt_set_mountpoint(struct mount *mnt,
  901. struct mountpoint *mp,
  902. struct mount *child_mnt)
  903. {
  904. child_mnt->mnt_mountpoint = mp->m_dentry;
  905. child_mnt->mnt_parent = mnt;
  906. child_mnt->mnt_mp = mp;
  907. hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
  908. }
  909. static void make_visible(struct mount *mnt)
  910. {
  911. struct mount *parent = mnt->mnt_parent;
  912. if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root))
  913. parent->overmount = mnt;
  914. hlist_add_head_rcu(&mnt->mnt_hash,
  915. m_hash(&parent->mnt, mnt->mnt_mountpoint));
  916. list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
  917. }
  /**
   * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
   *              list of child mounts
   * @mnt:    the new mount
   * @parent: the parent
   * @mp:     the new mountpoint
   *
   * Mount @mnt at @mp on @parent. Then attach @mnt
   * to @parent's child mount list and to @mount_hashtable.
   *
   * Note, when make_visible() is called @mnt->mnt_parent already points
   * to the correct parent.
   *
   * Context: This function expects namespace_lock() and lock_mount_hash()
   *          to have been acquired in that order.
   */
  static void attach_mnt(struct mount *mnt, struct mount *parent,
  		       struct mountpoint *mp)
  {
  	/* first wire up parent/mountpoint, then publish */
  	mnt_set_mountpoint(parent, mp, mnt);
  	make_visible(mnt);
  }
  940. void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
  941. {
  942. struct mountpoint *old_mp = mnt->mnt_mp;
  943. list_del_init(&mnt->mnt_child);
  944. hlist_del_init(&mnt->mnt_mp_list);
  945. hlist_del_init_rcu(&mnt->mnt_hash);
  946. attach_mnt(mnt, parent, mp);
  947. maybe_free_mountpoint(old_mp, &ex_mountpoints);
  948. }
  949. static inline struct mount *node_to_mount(struct rb_node *node)
  950. {
  951. return node ? rb_entry(node, struct mount, mnt_node) : NULL;
  952. }
  953. static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
  954. {
  955. struct rb_node **link = &ns->mounts.rb_node;
  956. struct rb_node *parent = NULL;
  957. bool mnt_first_node = true, mnt_last_node = true;
  958. WARN_ON(mnt_ns_attached(mnt));
  959. mnt->mnt_ns = ns;
  960. while (*link) {
  961. parent = *link;
  962. if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
  963. link = &parent->rb_left;
  964. mnt_last_node = false;
  965. } else {
  966. link = &parent->rb_right;
  967. mnt_first_node = false;
  968. }
  969. }
  970. if (mnt_last_node)
  971. ns->mnt_last_node = &mnt->mnt_node;
  972. if (mnt_first_node)
  973. ns->mnt_first_node = &mnt->mnt_node;
  974. rb_link_node(&mnt->mnt_node, parent, link);
  975. rb_insert_color(&mnt->mnt_node, &ns->mounts);
  976. mnt_notify_add(mnt);
  977. }
  978. static struct mount *next_mnt(struct mount *p, struct mount *root)
  979. {
  980. struct list_head *next = p->mnt_mounts.next;
  981. if (next == &p->mnt_mounts) {
  982. while (1) {
  983. if (p == root)
  984. return NULL;
  985. next = p->mnt_child.next;
  986. if (next != &p->mnt_parent->mnt_mounts)
  987. break;
  988. p = p->mnt_parent;
  989. }
  990. }
  991. return list_entry(next, struct mount, mnt_child);
  992. }
  993. static struct mount *skip_mnt_tree(struct mount *p)
  994. {
  995. struct list_head *prev = p->mnt_mounts.prev;
  996. while (prev != &p->mnt_mounts) {
  997. p = list_entry(prev, struct mount, mnt_child);
  998. prev = p->mnt_mounts.prev;
  999. }
  1000. return p;
  1001. }
  1002. /*
  1003. * vfsmount lock must be held for write
  1004. */
  1005. static void commit_tree(struct mount *mnt)
  1006. {
  1007. struct mnt_namespace *n = mnt->mnt_parent->mnt_ns;
  1008. if (!mnt_ns_attached(mnt)) {
  1009. for (struct mount *m = mnt; m; m = next_mnt(m, mnt))
  1010. mnt_add_to_ns(n, m);
  1011. n->nr_mounts += n->pending_mounts;
  1012. n->pending_mounts = 0;
  1013. }
  1014. make_visible(mnt);
  1015. touch_mnt_namespace(n);
  1016. }
  1017. static void setup_mnt(struct mount *m, struct dentry *root)
  1018. {
  1019. struct super_block *s = root->d_sb;
  1020. atomic_inc(&s->s_active);
  1021. m->mnt.mnt_sb = s;
  1022. m->mnt.mnt_root = dget(root);
  1023. m->mnt_mountpoint = m->mnt.mnt_root;
  1024. m->mnt_parent = m;
  1025. guard(mount_locked_reader)();
  1026. mnt_add_instance(m, s);
  1027. }
  1028. /**
  1029. * vfs_create_mount - Create a mount for a configured superblock
  1030. * @fc: The configuration context with the superblock attached
  1031. *
  1032. * Create a mount to an already configured superblock. If necessary, the
  1033. * caller should invoke vfs_get_tree() before calling this.
  1034. *
  1035. * Note that this does not attach the mount to anything.
  1036. */
  1037. struct vfsmount *vfs_create_mount(struct fs_context *fc)
  1038. {
  1039. struct mount *mnt;
  1040. if (!fc->root)
  1041. return ERR_PTR(-EINVAL);
  1042. mnt = alloc_vfsmnt(fc->source);
  1043. if (!mnt)
  1044. return ERR_PTR(-ENOMEM);
  1045. if (fc->sb_flags & SB_KERNMOUNT)
  1046. mnt->mnt.mnt_flags = MNT_INTERNAL;
  1047. setup_mnt(mnt, fc->root);
  1048. return &mnt->mnt;
  1049. }
  1050. EXPORT_SYMBOL(vfs_create_mount);
  1051. struct vfsmount *fc_mount(struct fs_context *fc)
  1052. {
  1053. int err = vfs_get_tree(fc);
  1054. if (!err) {
  1055. up_write(&fc->root->d_sb->s_umount);
  1056. return vfs_create_mount(fc);
  1057. }
  1058. return ERR_PTR(err);
  1059. }
  1060. EXPORT_SYMBOL(fc_mount);
  1061. struct vfsmount *fc_mount_longterm(struct fs_context *fc)
  1062. {
  1063. struct vfsmount *mnt = fc_mount(fc);
  1064. if (!IS_ERR(mnt))
  1065. real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
  1066. return mnt;
  1067. }
  1068. EXPORT_SYMBOL(fc_mount_longterm);
  1069. struct vfsmount *vfs_kern_mount(struct file_system_type *type,
  1070. int flags, const char *name,
  1071. void *data)
  1072. {
  1073. struct fs_context *fc;
  1074. struct vfsmount *mnt;
  1075. int ret = 0;
  1076. if (!type)
  1077. return ERR_PTR(-EINVAL);
  1078. fc = fs_context_for_mount(type, flags);
  1079. if (IS_ERR(fc))
  1080. return ERR_CAST(fc);
  1081. if (name)
  1082. ret = vfs_parse_fs_string(fc, "source", name);
  1083. if (!ret)
  1084. ret = parse_monolithic_mount_data(fc, data);
  1085. if (!ret)
  1086. mnt = fc_mount(fc);
  1087. else
  1088. mnt = ERR_PTR(ret);
  1089. put_fs_context(fc);
  1090. return mnt;
  1091. }
  1092. EXPORT_SYMBOL_GPL(vfs_kern_mount);
  1093. static struct mount *clone_mnt(struct mount *old, struct dentry *root,
  1094. int flag)
  1095. {
  1096. struct mount *mnt;
  1097. int err;
  1098. mnt = alloc_vfsmnt(old->mnt_devname);
  1099. if (!mnt)
  1100. return ERR_PTR(-ENOMEM);
  1101. mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) &
  1102. ~MNT_INTERNAL_FLAGS;
  1103. if (flag & (CL_SLAVE | CL_PRIVATE))
  1104. mnt->mnt_group_id = 0; /* not a peer of original */
  1105. else
  1106. mnt->mnt_group_id = old->mnt_group_id;
  1107. if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
  1108. err = mnt_alloc_group_id(mnt);
  1109. if (err)
  1110. goto out_free;
  1111. }
  1112. if (mnt->mnt_group_id)
  1113. set_mnt_shared(mnt);
  1114. mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
  1115. setup_mnt(mnt, root);
  1116. if (flag & CL_PRIVATE) // we are done with it
  1117. return mnt;
  1118. if (peers(mnt, old))
  1119. list_add(&mnt->mnt_share, &old->mnt_share);
  1120. if ((flag & CL_SLAVE) && old->mnt_group_id) {
  1121. hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list);
  1122. mnt->mnt_master = old;
  1123. } else if (IS_MNT_SLAVE(old)) {
  1124. hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave);
  1125. mnt->mnt_master = old->mnt_master;
  1126. }
  1127. return mnt;
  1128. out_free:
  1129. mnt_free_id(mnt);
  1130. free_vfsmnt(mnt);
  1131. return ERR_PTR(err);
  1132. }
  1133. static void cleanup_mnt(struct mount *mnt)
  1134. {
  1135. struct hlist_node *p;
  1136. struct mount *m;
  1137. /*
  1138. * The warning here probably indicates that somebody messed
  1139. * up a mnt_want/drop_write() pair. If this happens, the
  1140. * filesystem was probably unable to make r/w->r/o transitions.
  1141. * The locking used to deal with mnt_count decrement provides barriers,
  1142. * so mnt_get_writers() below is safe.
  1143. */
  1144. WARN_ON(mnt_get_writers(mnt));
  1145. if (unlikely(mnt->mnt_pins.first))
  1146. mnt_pin_kill(mnt);
  1147. hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
  1148. hlist_del(&m->mnt_umount);
  1149. mntput(&m->mnt);
  1150. }
  1151. fsnotify_vfsmount_delete(&mnt->mnt);
  1152. dput(mnt->mnt.mnt_root);
  1153. deactivate_super(mnt->mnt.mnt_sb);
  1154. mnt_free_id(mnt);
  1155. call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
  1156. }
  1157. static void __cleanup_mnt(struct rcu_head *head)
  1158. {
  1159. cleanup_mnt(container_of(head, struct mount, mnt_rcu));
  1160. }
  1161. static LLIST_HEAD(delayed_mntput_list);
  1162. static void delayed_mntput(struct work_struct *unused)
  1163. {
  1164. struct llist_node *node = llist_del_all(&delayed_mntput_list);
  1165. struct mount *m, *t;
  1166. llist_for_each_entry_safe(m, t, node, mnt_llist)
  1167. cleanup_mnt(m);
  1168. }
  1169. static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
  /*
   * Slow path of mntput_no_expire(), used when ->mnt_ns was observed NULL.
   *
   * Called with rcu_read_lock() held by mntput_no_expire(); every path
   * through this function drops it with rcu_read_unlock().
   *
   * One reference is dropped under the mount hash lock.  If references
   * remain, or the mount is already flagged MNT_DOOMED, nothing else is
   * done.  Otherwise the mount is flagged MNT_DOOMED, taken off its expiry
   * list, its remaining children are parked on ->mnt_stuck_children, and
   * cleanup_mnt() is either called directly (MNT_INTERNAL mounts) or
   * deferred to task work / the delayed work item.
   */
  1170. static void noinline mntput_no_expire_slowpath(struct mount *mnt)
  1171. {
  1172. LIST_HEAD(list);
  1173. int count;
  1174. VFS_BUG_ON(mnt->mnt_ns);
  1175. lock_mount_hash();
  1176. /*
  1177. * make sure that if __legitimize_mnt() has not seen us grab
  1178. * mount_lock, we'll see their refcount increment here.
  1179. */
  1180. smp_mb();
  1181. mnt_add_count(mnt, -1);
  1182. count = mnt_get_count(mnt);
  /* Not the final reference: nothing to tear down. */
  1183. if (count != 0) {
  1184. WARN_ON(count < 0);
  1185. rcu_read_unlock();
  1186. unlock_mount_hash();
  1187. return;
  1188. }
  /* Already flagged MNT_DOOMED: leave the mount alone. */
  1189. if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
  1190. rcu_read_unlock();
  1191. unlock_mount_hash();
  1192. return;
  1193. }
  1194. mnt->mnt.mnt_flags |= MNT_DOOMED;
  1195. rcu_read_unlock();
  1196. mnt_del_instance(mnt);
  1197. if (unlikely(!list_empty(&mnt->mnt_expire)))
  1198. list_del(&mnt->mnt_expire);
  /*
   * Detach any children still attached and park them on
   * ->mnt_stuck_children; cleanup_mnt() drops their references.
   */
  1199. if (unlikely(!list_empty(&mnt->mnt_mounts))) {
  1200. struct mount *p, *tmp;
  1201. list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
  1202. __umount_mnt(p, &list);
  1203. hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
  1204. }
  1205. }
  1206. unlock_mount_hash();
  1207. shrink_dentry_list(&list);
  /*
   * Mounts without MNT_INTERNAL get their cleanup deferred: via task
   * work for non-kthread tasks, and via delayed_mntput_work for kthreads
   * or when task_work_add() fails.  MNT_INTERNAL mounts are cleaned up
   * right here.
   */
  1208. if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
  1209. struct task_struct *task = current;
  1210. if (likely(!(task->flags & PF_KTHREAD))) {
  1211. init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
  1212. if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
  1213. return;
  1214. }
  1215. if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
  1216. schedule_delayed_work(&delayed_mntput_work, 1);
  1217. return;
  1218. }
  1219. cleanup_mnt(mnt);
  1220. }
  1221. static void mntput_no_expire(struct mount *mnt)
  1222. {
  1223. rcu_read_lock();
  1224. if (likely(READ_ONCE(mnt->mnt_ns))) {
  1225. /*
  1226. * Since we don't do lock_mount_hash() here,
  1227. * ->mnt_ns can change under us. However, if it's
  1228. * non-NULL, then there's a reference that won't
  1229. * be dropped until after an RCU delay done after
  1230. * turning ->mnt_ns NULL. So if we observe it
  1231. * non-NULL under rcu_read_lock(), the reference
  1232. * we are dropping is not the final one.
  1233. */
  1234. mnt_add_count(mnt, -1);
  1235. rcu_read_unlock();
  1236. return;
  1237. }
  1238. mntput_no_expire_slowpath(mnt);
  1239. }
  1240. void mntput(struct vfsmount *mnt)
  1241. {
  1242. if (mnt) {
  1243. struct mount *m = real_mount(mnt);
  1244. /* avoid cacheline pingpong */
  1245. if (unlikely(m->mnt_expiry_mark))
  1246. WRITE_ONCE(m->mnt_expiry_mark, 0);
  1247. mntput_no_expire(m);
  1248. }
  1249. }
  1250. EXPORT_SYMBOL(mntput);
  /*
   * Take a reference to @mnt; a NULL @mnt is passed through untouched.
   * Returns @mnt.
   */
  struct vfsmount *mntget(struct vfsmount *mnt)
  {
  	if (!mnt)
  		return mnt;

  	mnt_add_count(real_mount(mnt), 1);
  	return mnt;
  }
  EXPORT_SYMBOL(mntget);
  1258. /*
  1259. * Make a mount point inaccessible to new lookups.
  1260. * Because there may still be current users, the caller MUST WAIT
  1261. * for an RCU grace period before destroying the mount point.
  1262. */
  1263. void mnt_make_shortterm(struct vfsmount *mnt)
  1264. {
  1265. if (mnt)
  1266. real_mount(mnt)->mnt_ns = NULL;
  1267. }
  1268. /**
  1269. * path_is_mountpoint() - Check if path is a mount in the current namespace.
  1270. * @path: path to check
  1271. *
  1272. * d_mountpoint() can only be used reliably to establish if a dentry is
  1273. * not mounted in any namespace and that common case is handled inline.
  1274. * d_mountpoint() isn't aware of the possibility there may be multiple
  1275. * mounts using a given dentry in a different namespace. This function
  1276. * checks if the passed in path is a mountpoint rather than the dentry
  1277. * alone.
  1278. */
  1279. bool path_is_mountpoint(const struct path *path)
  1280. {
  1281. unsigned seq;
  1282. bool res;
  1283. if (!d_mountpoint(path->dentry))
  1284. return false;
  1285. rcu_read_lock();
  1286. do {
  1287. seq = read_seqbegin(&mount_lock);
  1288. res = __path_is_mountpoint(path);
  1289. } while (read_seqretry(&mount_lock, seq));
  1290. rcu_read_unlock();
  1291. return res;
  1292. }
  1293. EXPORT_SYMBOL(path_is_mountpoint);
  1294. struct vfsmount *mnt_clone_internal(const struct path *path)
  1295. {
  1296. struct mount *p;
  1297. p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
  1298. if (IS_ERR(p))
  1299. return ERR_CAST(p);
  1300. p->mnt.mnt_flags |= MNT_INTERNAL;
  1301. return &p->mnt;
  1302. }
  1303. /*
  1304. * Returns the mount which either has the specified mnt_id, or has the next
  1305. * smallest id afer the specified one.
  1306. */
  1307. static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
  1308. {
  1309. struct rb_node *node = ns->mounts.rb_node;
  1310. struct mount *ret = NULL;
  1311. while (node) {
  1312. struct mount *m = node_to_mount(node);
  1313. if (mnt_id <= m->mnt_id_unique) {
  1314. ret = node_to_mount(node);
  1315. if (mnt_id == m->mnt_id_unique)
  1316. break;
  1317. node = node->rb_left;
  1318. } else {
  1319. node = node->rb_right;
  1320. }
  1321. }
  1322. return ret;
  1323. }
  1324. /*
  1325. * Returns the mount which either has the specified mnt_id, or has the next
  1326. * greater id before the specified one.
  1327. */
  1328. static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id)
  1329. {
  1330. struct rb_node *node = ns->mounts.rb_node;
  1331. struct mount *ret = NULL;
  1332. while (node) {
  1333. struct mount *m = node_to_mount(node);
  1334. if (mnt_id >= m->mnt_id_unique) {
  1335. ret = node_to_mount(node);
  1336. if (mnt_id == m->mnt_id_unique)
  1337. break;
  1338. node = node->rb_right;
  1339. } else {
  1340. node = node->rb_left;
  1341. }
  1342. }
  1343. return ret;
  1344. }
  1345. #ifdef CONFIG_PROC_FS
  1346. /* iterator; we want it to have access to namespace_sem, thus here... */
  1347. static void *m_start(struct seq_file *m, loff_t *pos)
  1348. {
  1349. struct proc_mounts *p = m->private;
  1350. struct mount *mnt;
  1351. down_read(&namespace_sem);
  1352. mnt = mnt_find_id_at(p->ns, *pos);
  1353. if (mnt)
  1354. *pos = mnt->mnt_id_unique;
  1355. return mnt;
  1356. }
  1357. static void *m_next(struct seq_file *m, void *v, loff_t *pos)
  1358. {
  1359. struct mount *mnt = v;
  1360. struct rb_node *node = rb_next(&mnt->mnt_node);
  1361. if (node) {
  1362. struct mount *next = node_to_mount(node);
  1363. *pos = next->mnt_id_unique;
  1364. return next;
  1365. }
  1366. /*
  1367. * No more mounts. Set pos past current mount's ID so that if
  1368. * iteration restarts, mnt_find_id_at() returns NULL.
  1369. */
  1370. *pos = mnt->mnt_id_unique + 1;
  1371. return NULL;
  1372. }
  1373. static void m_stop(struct seq_file *m, void *v)
  1374. {
  1375. up_read(&namespace_sem);
  1376. }
  1377. static int m_show(struct seq_file *m, void *v)
  1378. {
  1379. struct proc_mounts *p = m->private;
  1380. struct mount *r = v;
  1381. return p->show(m, &r->mnt);
  1382. }
  1383. const struct seq_operations mounts_op = {
  1384. .start = m_start,
  1385. .next = m_next,
  1386. .stop = m_stop,
  1387. .show = m_show,
  1388. };
  1389. #endif /* CONFIG_PROC_FS */
  /**
   * may_umount_tree - check if a mount tree is busy
   * @m: root of mount tree
   *
   * Checks whether a tree of mounts has any open files, pwds, chroots or
   * sub mounts that are busy.  The root is allowed a reference count of up
   * to 2 and every other mount in the tree a count of up to 1.
   *
   * Returns 1 if nothing in the tree is busy, 0 otherwise.
   */
  int may_umount_tree(struct vfsmount *m)
  {
  	struct mount *root = real_mount(m);
  	struct mount *p;
  	int idle = 1;

  	/* write lock needed for mnt_get_count */
  	lock_mount_hash();
  	for (p = root; p; p = next_mnt(p, root)) {
  		int allowed = (p == root) ? 2 : 1;

  		if (mnt_get_count(p) > allowed) {
  			idle = 0;
  			break;
  		}
  	}
  	unlock_mount_hash();

  	return idle;
  }
  EXPORT_SYMBOL(may_umount_tree);
  1414. /**
  1415. * may_umount - check if a mount point is busy
  1416. * @mnt: root of mount
  1417. *
  1418. * This is called to check if a mount point has any
  1419. * open files, pwds, chroots or sub mounts. If the
  1420. * mount has sub mounts this will return busy
  1421. * regardless of whether the sub mounts are busy.
  1422. *
  1423. * Doesn't take quota and stuff into account. IOW, in some cases it will
  1424. * give false negatives. The main reason why it's here is that we need
  1425. * a non-destructive way to look for easily umountable filesystems.
  1426. */
  1427. int may_umount(struct vfsmount *mnt)
  1428. {
  1429. int ret = 1;
  1430. down_read(&namespace_sem);
  1431. lock_mount_hash();
  1432. if (propagate_mount_busy(real_mount(mnt), 2))
  1433. ret = 0;
  1434. unlock_mount_hash();
  1435. up_read(&namespace_sem);
  1436. return ret;
  1437. }
  1438. EXPORT_SYMBOL(may_umount);
  1439. #ifdef CONFIG_FSNOTIFY
  1440. static void mnt_notify(struct mount *p)
  1441. {
  1442. if (!p->prev_ns && p->mnt_ns) {
  1443. fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
  1444. } else if (p->prev_ns && !p->mnt_ns) {
  1445. fsnotify_mnt_detach(p->prev_ns, &p->mnt);
  1446. } else if (p->prev_ns == p->mnt_ns) {
  1447. fsnotify_mnt_move(p->mnt_ns, &p->mnt);
  1448. } else {
  1449. fsnotify_mnt_detach(p->prev_ns, &p->mnt);
  1450. fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
  1451. }
  1452. p->prev_ns = p->mnt_ns;
  1453. }
  1454. static void notify_mnt_list(void)
  1455. {
  1456. struct mount *m, *tmp;
  1457. /*
  1458. * Notify about mounts that were added/reparented/detached/remain
  1459. * connected after unmount.
  1460. */
  1461. list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
  1462. mnt_notify(m);
  1463. list_del_init(&m->to_notify);
  1464. }
  1465. }
  1466. static bool need_notify_mnt_list(void)
  1467. {
  1468. return !list_empty(&notify_list);
  1469. }
  1470. #else
  1471. static void notify_mnt_list(void)
  1472. {
  1473. }
  1474. static bool need_notify_mnt_list(void)
  1475. {
  1476. return false;
  1477. }
  1478. #endif
  1479. static void free_mnt_ns(struct mnt_namespace *);
  /*
   * Release namespace_sem, which is released here as a write lock, and
   * finish the work queued while it was held:
   *
   *  - pending mount notifications are sent with the semaphore downgraded
   *    to a read lock,
   *  - the namespace recorded in emptied_ns (if any) is freed,
   *  - the entries spliced off ex_mountpoints go to shrink_dentry_list(),
   *  - after an expedited RCU grace period, mntput() is called on each
   *    mount that was queued on 'unmounted'.
   *
   * The global lists are moved to locals and emptied_ns is cleared before
   * the semaphore is released.
   */
  1480. static void namespace_unlock(void)
  1481. {
  1482. struct hlist_head head;
  1483. struct hlist_node *p;
  1484. struct mount *m;
  1485. struct mnt_namespace *ns = emptied_ns;
  1486. LIST_HEAD(list);
  1487. hlist_move_list(&unmounted, &head);
  1488. list_splice_init(&ex_mountpoints, &list);
  1489. emptied_ns = NULL;
  1490. if (need_notify_mnt_list()) {
  1491. /*
  1492. * No point blocking out concurrent readers while notifications
  1493. * are sent. This will also allow statmount()/listmount() to run
  1494. * concurrently.
  1495. */
  1496. downgrade_write(&namespace_sem);
  1497. notify_mnt_list();
  1498. up_read(&namespace_sem);
  1499. } else {
  1500. up_write(&namespace_sem);
  1501. }
  1502. if (unlikely(ns)) {
  1503. /* Make sure we notice when we leak mounts. */
  1504. VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
  1505. free_mnt_ns(ns);
  1506. }
  1507. shrink_dentry_list(&list);
  /* Nothing was unmounted: skip the RCU grace period entirely. */
  1508. if (likely(hlist_empty(&head)))
  1509. return;
  1510. synchronize_rcu_expedited();
  1511. hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
  1512. hlist_del(&m->mnt_umount);
  1513. mntput(&m->mnt);
  1514. }
  1515. }
  1516. static inline void namespace_lock(void)
  1517. {
  1518. down_write(&namespace_sem);
  1519. }
  /* Flags controlling umount_tree(). */
  enum umount_tree_flags {
  	/* mark mounts MNT_SYNC_UMOUNT and always disconnect them */
  	UMOUNT_SYNC		= 1 << 0,
  	/* also run the propagate_mount_unlock()/propagate_umount() steps */
  	UMOUNT_PROPAGATE	= 1 << 1,
  	/* leave mounts connected to an unmounted parent */
  	UMOUNT_CONNECTED	= 1 << 2,
  };
  1525. static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
  1526. {
  1527. /* Leaving mounts connected is only valid for lazy umounts */
  1528. if (how & UMOUNT_SYNC)
  1529. return true;
  1530. /* A mount without a parent has nothing to be connected to */
  1531. if (!mnt_has_parent(mnt))
  1532. return true;
  1533. /* Because the reference counting rules change when mounts are
  1534. * unmounted and connected, umounted mounts may not be
  1535. * connected to mounted mounts.
  1536. */
  1537. if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
  1538. return true;
  1539. /* Has it been requested that the mount remain connected? */
  1540. if (how & UMOUNT_CONNECTED)
  1541. return false;
  1542. /* Is the mount locked such that it needs to remain connected? */
  1543. if (IS_MNT_LOCKED(mnt))
  1544. return false;
  1545. /* By default disconnect the mount */
  1546. return true;
  1547. }
  1548. /*
  1549. * mount_lock must be held
  1550. * namespace_sem must be held for write
  1551. */
  /*
   * Unmount the tree rooted at @mnt.
   *
   * Every mount in the tree is flagged MNT_UMOUNT, taken out of its
   * namespace and hidden from its parent's ->mnt_mounts.  With
   * UMOUNT_PROPAGATE the set is extended by propagate_umount().  Each
   * mount then loses its ->mnt_ns and is either left attached to its
   * parent or detached and queued on 'unmounted', as decided by
   * disconnect_mount().  With UMOUNT_SYNC each mount is also flagged
   * MNT_SYNC_UMOUNT.
   */
  1552. static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
  1553. {
  1554. LIST_HEAD(tmp_list);
  1555. struct mount *p;
  1556. if (how & UMOUNT_PROPAGATE)
  1557. propagate_mount_unlock(mnt);
  1558. /* Gather the mounts to umount */
  1559. for (p = mnt; p; p = next_mnt(p, mnt)) {
  1560. p->mnt.mnt_flags |= MNT_UMOUNT;
  1561. if (mnt_ns_attached(p))
  1562. move_from_ns(p);
  1563. list_add_tail(&p->mnt_list, &tmp_list);
  1564. }
  1565. /* Hide the mounts from mnt_mounts */
  1566. list_for_each_entry(p, &tmp_list, mnt_list) {
  1567. list_del_init(&p->mnt_child);
  1568. }
  1569. /* Add propagated mounts to the tmp_list */
  1570. if (how & UMOUNT_PROPAGATE)
  1571. propagate_umount(&tmp_list);
  1572. bulk_make_private(&tmp_list);
  /* Process the gathered mounts one by one until tmp_list is empty. */
  1573. while (!list_empty(&tmp_list)) {
  1574. struct mnt_namespace *ns;
  1575. bool disconnect;
  1576. p = list_first_entry(&tmp_list, struct mount, mnt_list);
  1577. list_del_init(&p->mnt_expire);
  1578. list_del_init(&p->mnt_list);
  1579. ns = p->mnt_ns;
  1580. if (ns) {
  1581. ns->nr_mounts--;
  1582. __touch_mnt_namespace(ns);
  1583. }
  1584. p->mnt_ns = NULL;
  1585. if (how & UMOUNT_SYNC)
  1586. p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
  1587. disconnect = disconnect_mount(p, how);
  1588. if (mnt_has_parent(p)) {
  1589. if (!disconnect) {
  1590. /* Don't forget about p */
  1591. list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
  1592. } else {
  1593. umount_mnt(p);
  1594. }
  1595. }
  /* Only disconnected mounts are queued on 'unmounted'. */
  1596. if (disconnect)
  1597. hlist_add_head(&p->mnt_umount, &unmounted);
  1598. /*
  1599. * At this point p->mnt_ns is NULL, notification will be queued
  1600. * only if
  1601. *
  1602. * - p->prev_ns is non-NULL *and*
  1603. * - p->prev_ns->n_fsnotify_marks is non-NULL
  1604. *
  1605. * This will preclude queuing the mount if this is a cleanup
  1606. * after a failed copy_tree() or destruction of an anonymous
  1607. * namespace, etc.
  1608. */
  1609. mnt_notify_add(p);
  1610. }
  1611. }
  1612. static void shrink_submounts(struct mount *mnt);
  1613. static int do_umount_root(struct super_block *sb)
  1614. {
  1615. int ret = 0;
  1616. down_write(&sb->s_umount);
  1617. if (!sb_rdonly(sb)) {
  1618. struct fs_context *fc;
  1619. fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
  1620. SB_RDONLY);
  1621. if (IS_ERR(fc)) {
  1622. ret = PTR_ERR(fc);
  1623. } else {
  1624. ret = parse_monolithic_mount_data(fc, NULL);
  1625. if (!ret)
  1626. ret = reconfigure_super(fc);
  1627. put_fs_context(fc);
  1628. }
  1629. }
  1630. up_write(&sb->s_umount);
  1631. return ret;
  1632. }
  /*
   * Unmount @mnt according to @flags (MNT_EXPIRE, MNT_FORCE, MNT_DETACH).
   *
   * Returns 0 on success, otherwise:
   *  - the error from security_sb_umount(),
   *  - -EINVAL for MNT_EXPIRE on the caller's root mount or combined with
   *    MNT_FORCE/MNT_DETACH, and when the locked re-checks fail (mount
   *    fails check_mnt(), is MNT_LOCKED, or has no parent),
   *  - -EBUSY when the mount is in use,
   *  - -EAGAIN when MNT_EXPIRE has only just set the expiry mark,
   *  - -EPERM or the result of do_umount_root() when the caller's root
   *    mount is the target and MNT_DETACH is not set.
   */
  1633. static int do_umount(struct mount *mnt, int flags)
  1634. {
  1635. struct super_block *sb = mnt->mnt.mnt_sb;
  1636. int retval;
  1637. retval = security_sb_umount(&mnt->mnt, flags);
  1638. if (retval)
  1639. return retval;
  1640. /*
  1641. * Allow userspace to request a mountpoint be expired rather than
  1642. * unmounting unconditionally. Unmount only happens if:
  1643. * (1) the mark is already set (the mark is cleared by mntput())
  1644. * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
  1645. */
  1646. if (flags & MNT_EXPIRE) {
  1647. if (&mnt->mnt == current->fs->root.mnt ||
  1648. flags & (MNT_FORCE | MNT_DETACH))
  1649. return -EINVAL;
  1650. /*
  1651. * probably don't strictly need the lock here if we examined
  1652. * all race cases, but it's a slowpath.
  1653. */
  1654. lock_mount_hash();
  1655. if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) {
  1656. unlock_mount_hash();
  1657. return -EBUSY;
  1658. }
  1659. unlock_mount_hash();
  /* First request only sets the mark; a later one may unmount. */
  1660. if (!xchg(&mnt->mnt_expiry_mark, 1))
  1661. return -EAGAIN;
  1662. }
  1663. /*
  1664. * If we may have to abort operations to get out of this
  1665. * mount, and they will themselves hold resources we must
  1666. * allow the fs to do things. In the Unix tradition of
  1667. * 'Gee thats tricky lets do it in userspace' the umount_begin
  1668. * might fail to complete on the first run through as other tasks
  1669. * must return, and the like. Thats for the mount program to worry
  1670. * about for the moment.
  1671. */
  1672. if (flags & MNT_FORCE && sb->s_op->umount_begin) {
  1673. sb->s_op->umount_begin(sb);
  1674. }
  1675. /*
  1676. * No sense to grab the lock for this test, but test itself looks
  1677. * somewhat bogus. Suggestions for better replacement?
  1678. * Ho-hum... In principle, we might treat that as umount + switch
  1679. * to rootfs. GC would eventually take care of the old vfsmount.
  1680. * Actually it makes sense, especially if rootfs would contain a
  1681. * /reboot - static binary that would close all descriptors and
  1682. * call reboot(9). Then init(8) could umount root and exec /reboot.
  1683. */
  1684. if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
  1685. /*
  1686. * Special case for "unmounting" root ...
  1687. * we just try to remount it readonly.
  1688. */
  1689. if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
  1690. return -EPERM;
  1691. return do_umount_root(sb);
  1692. }
  1693. namespace_lock();
  1694. lock_mount_hash();
  1695. /* Repeat the earlier racy checks, now that we are holding the locks */
  1696. retval = -EINVAL;
  1697. if (!check_mnt(mnt))
  1698. goto out;
  1699. if (mnt->mnt.mnt_flags & MNT_LOCKED)
  1700. goto out;
  1701. if (!mnt_has_parent(mnt)) /* not the absolute root */
  1702. goto out;
  1703. event++;
  /*
   * MNT_DETACH: unmount without a busy check and without UMOUNT_SYNC.
   * Otherwise: shrink submounts, then unmount with UMOUNT_SYNC only if
   * propagate_mount_busy() reports the mount as not busy.
   */
  1704. if (flags & MNT_DETACH) {
  1705. umount_tree(mnt, UMOUNT_PROPAGATE);
  1706. retval = 0;
  1707. } else {
  1708. smp_mb(); // paired with __legitimize_mnt()
  1709. shrink_submounts(mnt);
  1710. retval = -EBUSY;
  1711. if (!propagate_mount_busy(mnt, 2)) {
  1712. umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
  1713. retval = 0;
  1714. }
  1715. }
  1716. out:
  1717. unlock_mount_hash();
  1718. namespace_unlock();
  1719. return retval;
  1720. }
  1721. /*
  1722. * __detach_mounts - lazily unmount all mounts on the specified dentry
  1723. *
  1724. * During unlink, rmdir, and d_drop it is possible to loose the path
  1725. * to an existing mountpoint, and wind up leaking the mount.
  1726. * detach_mounts allows lazily unmounting those mounts instead of
  1727. * leaking them.
  1728. *
  1729. * The caller may hold dentry->d_inode->i_rwsem.
  1730. */
  1731. void __detach_mounts(struct dentry *dentry)
  1732. {
  1733. struct pinned_mountpoint mp = {};
  1734. struct mount *mnt;
  1735. guard(namespace_excl)();
  1736. guard(mount_writer)();
  1737. if (!lookup_mountpoint(dentry, &mp))
  1738. return;
  1739. event++;
  1740. while (mp.node.next) {
  1741. mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list);
  1742. if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
  1743. umount_mnt(mnt);
  1744. hlist_add_head(&mnt->mnt_umount, &unmounted);
  1745. }
  1746. else umount_tree(mnt, UMOUNT_CONNECTED);
  1747. }
  1748. unpin_mountpoint(&mp);
  1749. }
  1750. /*
  1751. * Is the caller allowed to modify his namespace?
  1752. */
  1753. bool may_mount(void)
  1754. {
  1755. return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
  1756. }
  /*
   * Print, at most once (pr_warn_once), a banner warning that the "mand"
   * mount option is deprecated and ignored by this kernel.
   */
  1757. static void warn_mandlock(void)
  1758. {
  1759. pr_warn_once("=======================================================\n"
  1760. "WARNING: The mand mount option has been deprecated and\n"
  1761. " and is ignored by this kernel. Remove the mand\n"
  1762. " option from the mount to silence this warning.\n"
  1763. "=======================================================\n");
  1764. }
  1765. static int can_umount(const struct path *path, int flags)
  1766. {
  1767. struct mount *mnt = real_mount(path->mnt);
  1768. struct super_block *sb = path->dentry->d_sb;
  1769. if (!may_mount())
  1770. return -EPERM;
  1771. if (!path_mounted(path))
  1772. return -EINVAL;
  1773. if (!check_mnt(mnt))
  1774. return -EINVAL;
  1775. if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
  1776. return -EINVAL;
  1777. if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
  1778. return -EPERM;
  1779. return 0;
  1780. }
  1781. // caller is responsible for flags being sane
  1782. int path_umount(const struct path *path, int flags)
  1783. {
  1784. struct mount *mnt = real_mount(path->mnt);
  1785. int ret;
  1786. ret = can_umount(path, flags);
  1787. if (!ret)
  1788. ret = do_umount(mnt, flags);
  1789. /* we mustn't call path_put() as that would clear mnt_expiry_mark */
  1790. dput(path->dentry);
  1791. mntput_no_expire(mnt);
  1792. return ret;
  1793. }
  1794. static int ksys_umount(char __user *name, int flags)
  1795. {
  1796. int lookup_flags = LOOKUP_MOUNTPOINT;
  1797. struct path path;
  1798. int ret;
  1799. // basic validity checks done first
  1800. if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
  1801. return -EINVAL;
  1802. if (!(flags & UMOUNT_NOFOLLOW))
  1803. lookup_flags |= LOOKUP_FOLLOW;
  1804. ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
  1805. if (ret)
  1806. return ret;
  1807. return path_umount(&path, flags);
  1808. }
  1809. SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
  1810. {
  1811. return ksys_umount(name, flags);
  1812. }
  #ifdef __ARCH_WANT_SYS_OLDUMOUNT

  /*
   * The 2.0 compatible umount: takes no flags, so 0 is passed on.
   */
  SYSCALL_DEFINE1(oldumount, char __user *, name)
  {
  	return ksys_umount(name, 0);
  }

  #endif
  1822. static bool is_mnt_ns_file(struct dentry *dentry)
  1823. {
  1824. struct ns_common *ns;
  1825. /* Is this a proxy for a mount namespace? */
  1826. if (dentry->d_op != &ns_dentry_operations)
  1827. return false;
  1828. ns = d_inode(dentry)->i_private;
  1829. return ns->ops == &mntns_operations;
  1830. }
  1831. struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
  1832. {
  1833. return &mnt->ns;
  1834. }
  1835. struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
  1836. {
  1837. struct ns_common *ns;
  1838. guard(rcu)();
  1839. for (;;) {
  1840. ns = ns_tree_adjoined_rcu(mntns, previous);
  1841. if (IS_ERR(ns))
  1842. return ERR_CAST(ns);
  1843. mntns = to_mnt_ns(ns);
  1844. /*
  1845. * The last passive reference count is put with RCU
  1846. * delay so accessing the mount namespace is not just
  1847. * safe but all relevant members are still valid.
  1848. */
  1849. if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
  1850. continue;
  1851. /*
  1852. * We need an active reference count as we're persisting
  1853. * the mount namespace and it might already be on its
  1854. * deathbed.
  1855. */
  1856. if (!ns_ref_get(mntns))
  1857. continue;
  1858. return mntns;
  1859. }
  1860. }
  1861. struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry)
  1862. {
  1863. if (!is_mnt_ns_file(dentry))
  1864. return NULL;
  1865. return to_mnt_ns(get_proc_ns(dentry->d_inode));
  1866. }
  1867. static bool mnt_ns_loop(struct dentry *dentry)
  1868. {
  1869. /* Could bind mounting the mount namespace inode cause a
  1870. * mount namespace loop?
  1871. */
  1872. struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
  1873. if (!mnt_ns)
  1874. return false;
  1875. return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id;
  1876. }
  /*
   * Clone the mount tree below @src_root as seen from @dentry.
   *
   * @src_root itself is cloned with @dentry as root; of its children only
   * those whose mountpoint is_subdir() of @dentry are walked and cloned.
   * @flag is passed on to clone_mnt() and additionally controls:
   *  - CL_COPY_UNBINDABLE: without it, an unbindable @src_root gives
   *    -EINVAL, unbindable subtrees are skipped, and an unbindable mount
   *    that is also MNT_LOCKED gives -EPERM.
   *  - CL_COPY_MNT_NS_FILE: without it, a mount namespace file as @dentry
   *    gives -EINVAL and subtrees rooted at such a file are skipped.
   *  - CL_EXPIRE: clones are put on the expiry list of their original, if
   *    the original is on one.
   *
   * Returns the root of the copy, or an ERR_PTR().  If cloning fails part
   * way through, what has been copied so far is torn down with
   * umount_tree(UMOUNT_SYNC).
   */
  1877. struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
  1878. int flag)
  1879. {
  1880. struct mount *res, *src_parent, *src_root_child, *src_mnt,
  1881. *dst_parent, *dst_mnt;
  1882. if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root))
  1883. return ERR_PTR(-EINVAL);
  1884. if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
  1885. return ERR_PTR(-EINVAL);
  1886. res = dst_mnt = clone_mnt(src_root, dentry, flag);
  1887. if (IS_ERR(dst_mnt))
  1888. return dst_mnt;
  1889. src_parent = src_root;
  1890. list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
  1891. if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
  1892. continue;
  1893. for (src_mnt = src_root_child; src_mnt;
  1894. src_mnt = next_mnt(src_mnt, src_root_child)) {
  1895. if (!(flag & CL_COPY_UNBINDABLE) &&
  1896. IS_MNT_UNBINDABLE(src_mnt)) {
  1897. if (src_mnt->mnt.mnt_flags & MNT_LOCKED) {
  1898. /* Both unbindable and locked. */
  1899. dst_mnt = ERR_PTR(-EPERM);
  1900. goto out;
  1901. } else {
  1902. src_mnt = skip_mnt_tree(src_mnt);
  1903. continue;
  1904. }
  1905. }
  1906. if (!(flag & CL_COPY_MNT_NS_FILE) &&
  1907. is_mnt_ns_file(src_mnt->mnt.mnt_root)) {
  1908. src_mnt = skip_mnt_tree(src_mnt);
  1909. continue;
  1910. }
  /*
   * Walk both the source and destination cursors up in lockstep
   * until the source cursor is the parent of src_mnt.
   */
  1911. while (src_parent != src_mnt->mnt_parent) {
  1912. src_parent = src_parent->mnt_parent;
  1913. dst_mnt = dst_mnt->mnt_parent;
  1914. }
  1915. src_parent = src_mnt;
  1916. dst_parent = dst_mnt;
  1917. dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag);
  1918. if (IS_ERR(dst_mnt))
  1919. goto out;
  1920. lock_mount_hash();
  /* The clone inherits MNT_LOCKED from its original. */
  1921. if (src_mnt->mnt.mnt_flags & MNT_LOCKED)
  1922. dst_mnt->mnt.mnt_flags |= MNT_LOCKED;
  1923. if (unlikely(flag & CL_EXPIRE)) {
  1924. /* stick the duplicate mount on the same expiry
  1925. * list as the original if that was on one */
  1926. if (!list_empty(&src_mnt->mnt_expire))
  1927. list_add(&dst_mnt->mnt_expire,
  1928. &src_mnt->mnt_expire);
  1929. }
  1930. attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);
  1931. unlock_mount_hash();
  1932. }
  1933. }
  1934. return res;
  /* Failure: tear down the partial copy; dst_mnt holds the error. */
  1935. out:
  1936. if (res) {
  1937. lock_mount_hash();
  1938. umount_tree(res, UMOUNT_SYNC);
  1939. unlock_mount_hash();
  1940. }
  1941. return dst_mnt;
  1942. }
  1943. static inline bool extend_array(struct path **res, struct path **to_free,
  1944. unsigned n, unsigned *count, unsigned new_count)
  1945. {
  1946. struct path *p;
  1947. if (likely(n < *count))
  1948. return true;
  1949. p = kmalloc_objs(struct path, new_count);
  1950. if (p && *count)
  1951. memcpy(p, *res, *count * sizeof(struct path));
  1952. *count = new_count;
  1953. kfree(*to_free);
  1954. *to_free = *res = p;
  1955. return p;
  1956. }
  1957. const struct path *collect_paths(const struct path *path,
  1958. struct path *prealloc, unsigned count)
  1959. {
  1960. struct mount *root = real_mount(path->mnt);
  1961. struct mount *child;
  1962. struct path *res = prealloc, *to_free = NULL;
  1963. unsigned n = 0;
  1964. guard(namespace_shared)();
  1965. if (!check_mnt(root))
  1966. return ERR_PTR(-EINVAL);
  1967. if (!extend_array(&res, &to_free, 0, &count, 32))
  1968. return ERR_PTR(-ENOMEM);
  1969. res[n++] = *path;
  1970. list_for_each_entry(child, &root->mnt_mounts, mnt_child) {
  1971. if (!is_subdir(child->mnt_mountpoint, path->dentry))
  1972. continue;
  1973. for (struct mount *m = child; m; m = next_mnt(m, child)) {
  1974. if (!extend_array(&res, &to_free, n, &count, 2 * count))
  1975. return ERR_PTR(-ENOMEM);
  1976. res[n].mnt = &m->mnt;
  1977. res[n].dentry = m->mnt.mnt_root;
  1978. n++;
  1979. }
  1980. }
  1981. if (!extend_array(&res, &to_free, n, &count, count + 1))
  1982. return ERR_PTR(-ENOMEM);
  1983. memset(res + n, 0, (count - n) * sizeof(struct path));
  1984. for (struct path *p = res; p->mnt; p++)
  1985. path_get(p);
  1986. return res;
  1987. }
  1988. void drop_collected_paths(const struct path *paths, const struct path *prealloc)
  1989. {
  1990. for (const struct path *p = paths; p->mnt; p++)
  1991. path_put(p);
  1992. if (paths != prealloc)
  1993. kfree(paths);
  1994. }
  1995. static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
  1996. void dissolve_on_fput(struct vfsmount *mnt)
  1997. {
  1998. struct mount *m = real_mount(mnt);
  1999. /*
  2000. * m used to be the root of anon namespace; if it still is one,
  2001. * we need to dissolve the mount tree and free that namespace.
  2002. * Let's try to avoid taking namespace_sem if we can determine
  2003. * that there's nothing to do without it - rcu_read_lock() is
  2004. * enough to make anon_ns_root() memory-safe and once m has
  2005. * left its namespace, it's no longer our concern, since it will
  2006. * never become a root of anon ns again.
  2007. */
  2008. scoped_guard(rcu) {
  2009. if (!anon_ns_root(m))
  2010. return;
  2011. }
  2012. scoped_guard(namespace_excl) {
  2013. if (!anon_ns_root(m))
  2014. return;
  2015. emptied_ns = m->mnt_ns;
  2016. lock_mount_hash();
  2017. umount_tree(m, UMOUNT_CONNECTED);
  2018. unlock_mount_hash();
  2019. }
  2020. }
  2021. /* locks: namespace_shared && pinned(mnt) || mount_locked_reader */
  2022. static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
  2023. {
  2024. struct mount *child;
  2025. list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
  2026. if (!is_subdir(child->mnt_mountpoint, dentry))
  2027. continue;
  2028. if (child->mnt.mnt_flags & MNT_LOCKED)
  2029. return true;
  2030. }
  2031. return false;
  2032. }
  2033. bool has_locked_children(struct mount *mnt, struct dentry *dentry)
  2034. {
  2035. guard(mount_locked_reader)();
  2036. return __has_locked_children(mnt, dentry);
  2037. }
  2038. /*
  2039. * Check that there aren't references to earlier/same mount namespaces in the
  2040. * specified subtree. Such references can act as pins for mount namespaces
  2041. * that aren't checked by the mount-cycle checking code, thereby allowing
  2042. * cycles to be made.
  2043. *
  2044. * locks: mount_locked_reader || namespace_shared && pinned(subtree)
  2045. */
  2046. static bool check_for_nsfs_mounts(struct mount *subtree)
  2047. {
  2048. for (struct mount *p = subtree; p; p = next_mnt(p, subtree))
  2049. if (mnt_ns_loop(p->mnt.mnt_root))
  2050. return false;
  2051. return true;
  2052. }
  2053. /**
  2054. * clone_private_mount - create a private clone of a path
  2055. * @path: path to clone
  2056. *
  2057. * This creates a new vfsmount, which will be the clone of @path. The new mount
  2058. * will not be attached anywhere in the namespace and will be private (i.e.
  2059. * changes to the originating mount won't be propagated into this).
  2060. *
  2061. * This assumes caller has called or done the equivalent of may_mount().
  2062. *
  2063. * Release with mntput().
  2064. */
  2065. struct vfsmount *clone_private_mount(const struct path *path)
  2066. {
  2067. struct mount *old_mnt = real_mount(path->mnt);
  2068. struct mount *new_mnt;
  2069. guard(namespace_shared)();
  2070. if (IS_MNT_UNBINDABLE(old_mnt))
  2071. return ERR_PTR(-EINVAL);
  2072. /*
  2073. * Make sure the source mount is acceptable.
  2074. * Anything mounted in our mount namespace is allowed.
  2075. * Otherwise, it must be the root of an anonymous mount
  2076. * namespace, and we need to make sure no namespace
  2077. * loops get created.
  2078. */
  2079. if (!check_mnt(old_mnt)) {
  2080. if (!anon_ns_root(old_mnt))
  2081. return ERR_PTR(-EINVAL);
  2082. if (!check_for_nsfs_mounts(old_mnt))
  2083. return ERR_PTR(-EINVAL);
  2084. }
  2085. if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
  2086. return ERR_PTR(-EPERM);
  2087. if (__has_locked_children(old_mnt, path->dentry))
  2088. return ERR_PTR(-EINVAL);
  2089. new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
  2090. if (IS_ERR(new_mnt))
  2091. return ERR_PTR(-EINVAL);
  2092. /* Longterm mount to be removed by kern_unmount*() */
  2093. new_mnt->mnt_ns = MNT_NS_INTERNAL;
  2094. return &new_mnt->mnt;
  2095. }
  2096. EXPORT_SYMBOL_GPL(clone_private_mount);
  2097. static void lock_mnt_tree(struct mount *mnt)
  2098. {
  2099. struct mount *p;
  2100. for (p = mnt; p; p = next_mnt(p, mnt)) {
  2101. int flags = p->mnt.mnt_flags;
  2102. /* Don't allow unprivileged users to change mount flags */
  2103. flags |= MNT_LOCK_ATIME;
  2104. if (flags & MNT_READONLY)
  2105. flags |= MNT_LOCK_READONLY;
  2106. if (flags & MNT_NODEV)
  2107. flags |= MNT_LOCK_NODEV;
  2108. if (flags & MNT_NOSUID)
  2109. flags |= MNT_LOCK_NOSUID;
  2110. if (flags & MNT_NOEXEC)
  2111. flags |= MNT_LOCK_NOEXEC;
  2112. /* Don't allow unprivileged users to reveal what is under a mount */
  2113. if (list_empty(&p->mnt_expire) && p != mnt)
  2114. flags |= MNT_LOCKED;
  2115. p->mnt.mnt_flags = flags;
  2116. }
  2117. }
  2118. static void cleanup_group_ids(struct mount *mnt, struct mount *end)
  2119. {
  2120. struct mount *p;
  2121. for (p = mnt; p != end; p = next_mnt(p, mnt)) {
  2122. if (p->mnt_group_id && !IS_MNT_SHARED(p))
  2123. mnt_release_group_id(p);
  2124. }
  2125. }
  2126. static int invent_group_ids(struct mount *mnt, bool recurse)
  2127. {
  2128. struct mount *p;
  2129. for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
  2130. if (!p->mnt_group_id) {
  2131. int err = mnt_alloc_group_id(p);
  2132. if (err) {
  2133. cleanup_group_ids(mnt, p);
  2134. return err;
  2135. }
  2136. }
  2137. }
  2138. return 0;
  2139. }
  2140. int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
  2141. {
  2142. unsigned int max = READ_ONCE(sysctl_mount_max);
  2143. unsigned int mounts = 0;
  2144. struct mount *p;
  2145. if (ns->nr_mounts >= max)
  2146. return -ENOSPC;
  2147. max -= ns->nr_mounts;
  2148. if (ns->pending_mounts >= max)
  2149. return -ENOSPC;
  2150. max -= ns->pending_mounts;
  2151. for (p = mnt; p; p = next_mnt(p, mnt))
  2152. mounts++;
  2153. if (mounts > max)
  2154. return -ENOSPC;
  2155. ns->pending_mounts += mounts;
  2156. return 0;
  2157. }
/*
 * Bit flags describing how a mount tree is to be handled.
 * NOTE(review): no user of these flags is visible in this part of the
 * file; the per-flag notes below are inferred from the names - confirm
 * against the callers.
 */
enum mnt_tree_flags_t {
	MNT_TREE_BENEATH = BIT(0),	/* presumably: mount beneath the target */
	MNT_TREE_PROPAGATION = BIT(1),	/* presumably: tree comes from propagation */
};
/**
 * attach_recursive_mnt - attach a source mount tree
 * @source_mnt: mount tree to be attached
 * @dest:	the context for mounting at the place where the tree should go
 *
 * NOTE: the table below explains the semantics when a source mount
 * of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 * 	 tree of the destination mount and the cloned mount is added to
 * 	 the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 * 	 source mount.
 *
 * ---------------------------------------------------------------------------
 * |         		MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)  the mount is moved to the destination. And is then propagated to
 * 	all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++)  the mount is moved to the destination and is then propagated to
 * 	all the mounts belonging to the destination mount's propagation tree.
 * 	the mount is marked as 'shared and slave'.
 * (*)	the mount continues to be a slave at the new location.
 *
 * If the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 *
 * Context: The function expects namespace_lock() to be held.
 * Return: If @source_mnt was successfully attached 0 is returned.
 *         Otherwise a negative error code is returned.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
				const struct pinned_mountpoint *dest)
{
	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	struct mount *dest_mnt = dest->parent;
	struct mountpoint *dest_mp = dest->mp;
	HLIST_HEAD(tree_list);
	struct mnt_namespace *ns = dest_mnt->mnt_ns;
	struct pinned_mountpoint root = {};
	struct mountpoint *shorter = NULL;
	struct mount *child, *p;
	struct mount *top;
	struct hlist_node *n;
	int err = 0;
	/* A source that already has a parent is being moved, not newly attached. */
	bool moving = mnt_has_parent(source_mnt);

	/*
	 * Preallocate a mountpoint in case the new mounts need to be
	 * mounted beneath mounts on the same mountpoint.
	 */
	/*
	 * Walk up the overmount chain of the source to its topmost mount,
	 * remembering in 'shorter' the mountpoint of the first mount on the
	 * chain whose root is a mount namespace file.
	 */
	for (top = source_mnt; unlikely(top->overmount); top = top->overmount) {
		if (!shorter && is_mnt_ns_file(top->mnt.mnt_root))
			shorter = top->mnt_mp;
	}
	err = get_mountpoint(top->mnt.mnt_root, &root);
	if (err)
		return err;

	/* Is there space to add these mounts to the mount namespace? */
	if (!moving) {
		err = count_mounts(ns, source_mnt);
		if (err)
			goto out;
	}

	if (IS_MNT_SHARED(dest_mnt)) {
		err = invent_group_ids(source_mnt, true);
		if (err)
			goto out;
		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
	}
	/*
	 * The hash lock is taken before looking at the propagate_mnt()
	 * result: the out_cleanup_ids path calls umount_tree() and does the
	 * matching unlock_mount_hash() itself.
	 */
	lock_mount_hash();
	if (err)
		goto out_cleanup_ids;

	if (IS_MNT_SHARED(dest_mnt)) {
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	}

	if (moving) {
		umount_mnt(source_mnt);
		mnt_notify_add(source_mnt);
		/* if the mount is moved, it should no longer be expired
		 * automatically */
		list_del_init(&source_mnt->mnt_expire);
	} else {
		if (source_mnt->mnt_ns) {
			/* move from anon - the caller will destroy */
			emptied_ns = source_mnt->mnt_ns;
			for (p = source_mnt; p; p = next_mnt(p, source_mnt))
				move_from_ns(p);
		}
	}

	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
	/*
	 * Now the original copy is in the same state as the secondaries -
	 * its root attached to mountpoint, but not hashed and all mounts
	 * in it are either in our namespace or in no namespace at all.
	 * Add the original to the list of copies and deal with the
	 * rest of work for all of them uniformly.
	 */
	hlist_add_head(&source_mnt->mnt_hash, &tree_list);

	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
		struct mount *q;
		hlist_del_init(&child->mnt_hash);
		/* Notice when we are propagating across user namespaces */
		if (child->mnt_parent->mnt_ns->user_ns != user_ns)
			lock_mnt_tree(child);
		/*
		 * Look up what is currently mounted at the target location
		 * before commit_tree() runs for the new tree.
		 */
		q = __lookup_mnt(&child->mnt_parent->mnt,
				 child->mnt_mountpoint);
		commit_tree(child);
		if (q) {
			/*
			 * Something was already mounted there: re-point it at
			 * the top of the new tree, using the preallocated
			 * mountpoint - or 'shorter' for propagated copies
			 * when that was recorded above.
			 */
			struct mount *r = topmost_overmount(child);
			struct mountpoint *mp = root.mp;
			if (unlikely(shorter) && child != source_mnt)
				mp = shorter;
			mnt_change_mountpoint(r, mp, q);
		}
	}
	unpin_mountpoint(&root);
	unlock_mount_hash();

	return 0;

 out_cleanup_ids:
	/* Reached with the hash lock held: dispose of the propagated copies. */
	while (!hlist_empty(&tree_list)) {
		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
		child->mnt_parent->mnt_ns->pending_mounts = 0;
		umount_tree(child, UMOUNT_SYNC);
	}
	unlock_mount_hash();
	cleanup_group_ids(source_mnt, NULL);
 out:
	ns->pending_mounts = 0;

	/*
	 * This path runs without the hash lock held, so mount_lock is taken
	 * here around dropping the preallocated mountpoint.
	 */
	read_seqlock_excl(&mount_lock);
	unpin_mountpoint(&root);
	read_sequnlock_excl(&mount_lock);

	return err;
}
  2330. static inline struct mount *where_to_mount(const struct path *path,
  2331. struct dentry **dentry,
  2332. bool beneath)
  2333. {
  2334. struct mount *m;
  2335. if (unlikely(beneath)) {
  2336. m = topmost_overmount(real_mount(path->mnt));
  2337. *dentry = m->mnt_mountpoint;
  2338. return m->mnt_parent;
  2339. }
  2340. m = __lookup_mnt(path->mnt, path->dentry);
  2341. if (unlikely(m)) {
  2342. m = topmost_overmount(m);
  2343. *dentry = m->mnt.mnt_root;
  2344. return m;
  2345. }
  2346. *dentry = path->dentry;
  2347. return real_mount(path->mnt);
  2348. }
/**
 * do_lock_mount - acquire environment for mounting
 * @path:	target path
 * @res:	context to set up
 * @beneath:	whether the intention is to mount beneath @path
 *
 * To mount something at given location, we need
 *	namespace_sem locked exclusive
 *	inode of dentry we are mounting on locked exclusive
 *	struct mountpoint for that dentry
 *	struct mount we are mounting on
 *
 * Results are stored in caller-supplied context (pinned_mountpoint);
 * on success we have res->parent and res->mp pointing to parent and
 * mountpoint respectively and res->node inserted into the ->m_list
 * of the mountpoint, making sure the mountpoint won't disappear.
 * On failure we have res->parent set to ERR_PTR(-E...), res->mp
 * left NULL, res->node - empty.
 * In case of success do_lock_mount returns with locks acquired (in
 * proper order - inode lock nests outside of namespace_sem).
 *
 * Request to mount on overmounted location is treated as "mount on
 * top of whatever's overmounting it"; request to mount beneath
 * a location - "mount immediately beneath the topmost mount at that
 * place".
 *
 * In all cases the location must not have been unmounted and the
 * chosen mountpoint must be allowed to be mounted on.  For "beneath"
 * case we also require the location to be at the root of a mount
 * that has a parent (i.e. is not a root of some namespace).
 */
static void do_lock_mount(const struct path *path,
			  struct pinned_mountpoint *res,
			  bool beneath)
{
	int err;

	/* "beneath" only makes sense at the root of a mount */
	if (unlikely(beneath) && !path_mounted(path)) {
		res->parent = ERR_PTR(-EINVAL);
		return;
	}

	do {
		struct dentry *dentry, *d;
		struct mount *m, *n;

		/*
		 * Pick the target while holding mount_lock; if it is not
		 * path->mnt itself, take temporary references so that the
		 * mount and dentry can be used after the guard is dropped.
		 */
		scoped_guard(mount_locked_reader) {
			m = where_to_mount(path, &dentry, beneath);
			if (&m->mnt != path->mnt) {
				mntget(&m->mnt);
				dget(dentry);
			}
		}

		inode_lock(dentry->d_inode);
		namespace_lock();

		// check if the chain of mounts (if any) has changed.
		scoped_guard(mount_locked_reader)
			n = where_to_mount(path, &d, beneath);

		if (unlikely(n != m || dentry != d))
			err = -EAGAIN;		// something moved, retry
		else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt)))
			err = -ENOENT;		// not to be mounted on
		else if (beneath && &m->mnt == path->mnt && !m->overmount)
			err = -EINVAL;
		else
			err = get_mountpoint(dentry, res);

		if (unlikely(err)) {
			/* failure (including retry): undo both locks */
			res->parent = ERR_PTR(err);
			namespace_unlock();
			inode_unlock(dentry->d_inode);
		} else {
			res->parent = m;
		}
		/*
		 * Drop the temporary references.  This is subtle - on success
		 * we are doing that under namespace_sem, which would normally
		 * be forbidden.  However, in that case we are guaranteed that
		 * refcounts won't reach zero, since we know that path->mnt
		 * is mounted and thus all mounts reachable from it are pinned
		 * and stable, along with their mountpoints and roots.
		 */
		if (&m->mnt != path->mnt) {
			dput(dentry);
			mntput(&m->mnt);
		}
	} while (err == -EAGAIN);
}
  2433. static void __unlock_mount(struct pinned_mountpoint *m)
  2434. {
  2435. inode_unlock(m->mp->m_dentry->d_inode);
  2436. read_seqlock_excl(&mount_lock);
  2437. unpin_mountpoint(m);
  2438. read_sequnlock_excl(&mount_lock);
  2439. namespace_unlock();
  2440. }
  2441. static inline void unlock_mount(struct pinned_mountpoint *m)
  2442. {
  2443. if (!IS_ERR(m->parent))
  2444. __unlock_mount(m);
  2445. }
  2446. static void lock_mount_exact(const struct path *path,
  2447. struct pinned_mountpoint *mp, bool copy_mount,
  2448. unsigned int copy_flags);
/*
 * Scope-bound mount locking helpers.
 *
 * Each macro declares a zero-initialized struct pinned_mountpoint named
 * @mp whose cleanup handler is unlock_mount(), then calls the locking
 * helper to fill it in.  Callers test IS_ERR(mp.parent) afterwards;
 * unlock_mount() is a no-op in that case.
 *
 * Each macro expands to a declaration followed by a statement, so it
 * must be used where a declaration is allowed and never as the
 * unbraced body of an if/else/loop.
 *
 * LOCK_MOUNT_MAYBE_BENEATH, LOCK_MOUNT: go through do_lock_mount().
 * LOCK_MOUNT_EXACT, LOCK_MOUNT_EXACT_COPY: go through lock_mount_exact(),
 * without and with copy_mount set respectively.
 */
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	do_lock_mount((path), &mp, (beneath))
#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
#define LOCK_MOUNT_EXACT(mp, path) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	lock_mount_exact((path), &mp, false, 0)
#define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	lock_mount_exact((path), &mp, true, (copy_flags))
  2459. static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
  2460. {
  2461. if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
  2462. return -EINVAL;
  2463. if (d_is_dir(mp->mp->m_dentry) !=
  2464. d_is_dir(mnt->mnt.mnt_root))
  2465. return -ENOTDIR;
  2466. return attach_recursive_mnt(mnt, mp);
  2467. }
  2468. static int may_change_propagation(const struct mount *m)
  2469. {
  2470. struct mnt_namespace *ns = m->mnt_ns;
  2471. // it must be mounted in some namespace
  2472. if (IS_ERR_OR_NULL(ns)) // is_mounted()
  2473. return -EINVAL;
  2474. // and the caller must be admin in userns of that namespace
  2475. if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
  2476. return -EPERM;
  2477. return 0;
  2478. }
  2479. /*
  2480. * Sanity check the flags to change_mnt_propagation.
  2481. */
  2482. static int flags_to_propagation_type(int ms_flags)
  2483. {
  2484. int type = ms_flags & ~(MS_REC | MS_SILENT);
  2485. /* Fail if any non-propagation flags are set */
  2486. if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
  2487. return 0;
  2488. /* Only one propagation flag should be set */
  2489. if (!is_power_of_2(type))
  2490. return 0;
  2491. return type;
  2492. }
  2493. /*
  2494. * recursively change the type of the mountpoint.
  2495. */
  2496. static int do_change_type(const struct path *path, int ms_flags)
  2497. {
  2498. struct mount *m;
  2499. struct mount *mnt = real_mount(path->mnt);
  2500. int recurse = ms_flags & MS_REC;
  2501. int type;
  2502. int err;
  2503. if (!path_mounted(path))
  2504. return -EINVAL;
  2505. type = flags_to_propagation_type(ms_flags);
  2506. if (!type)
  2507. return -EINVAL;
  2508. guard(namespace_excl)();
  2509. err = may_change_propagation(mnt);
  2510. if (err)
  2511. return err;
  2512. if (type == MS_SHARED) {
  2513. err = invent_group_ids(mnt, recurse);
  2514. if (err)
  2515. return err;
  2516. }
  2517. for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
  2518. change_mnt_propagation(m, type);
  2519. return 0;
  2520. }
  2521. /* may_copy_tree() - check if a mount tree can be copied
  2522. * @path: path to the mount tree to be copied
  2523. *
  2524. * This helper checks if the caller may copy the mount tree starting
  2525. * from @path->mnt. The caller may copy the mount tree under the
  2526. * following circumstances:
  2527. *
  2528. * (1) The caller is located in the mount namespace of the mount tree.
  2529. * This also implies that the mount does not belong to an anonymous
  2530. * mount namespace.
  2531. * (2) The caller tries to copy an nfs mount referring to a mount
  2532. * namespace, i.e., the caller is trying to copy a mount namespace
  2533. * entry from nsfs.
  2534. * (3) The caller tries to copy a pidfs mount referring to a pidfd.
  2535. * (4) The caller is trying to copy a mount tree that belongs to an
  2536. * anonymous mount namespace.
  2537. *
  2538. * For that to be safe, this helper enforces that the origin mount
  2539. * namespace the anonymous mount namespace was created from is the
  2540. * same as the caller's mount namespace by comparing the sequence
  2541. * numbers.
  2542. *
  2543. * This is not strictly necessary. The current semantics of the new
  2544. * mount api enforce that the caller must be located in the same
  2545. * mount namespace as the mount tree it interacts with. Using the
  2546. * origin sequence number preserves these semantics even for
  2547. * anonymous mount namespaces. However, one could envision extending
  2548. * the api to directly operate across mount namespace if needed.
  2549. *
  2550. * The ownership of a non-anonymous mount namespace such as the
  2551. * caller's cannot change.
  2552. * => We know that the caller's mount namespace is stable.
  2553. *
  2554. * If the origin sequence number of the anonymous mount namespace is
  2555. * the same as the sequence number of the caller's mount namespace.
  2556. * => The owning namespaces are the same.
  2557. *
  2558. * ==> The earlier capability check on the owning namespace of the
  2559. * caller's mount namespace ensures that the caller has the
  2560. * ability to copy the mount tree.
  2561. *
  2562. * Returns true if the mount tree can be copied, false otherwise.
  2563. */
  2564. static inline bool may_copy_tree(const struct path *path)
  2565. {
  2566. struct mount *mnt = real_mount(path->mnt);
  2567. const struct dentry_operations *d_op;
  2568. if (check_mnt(mnt))
  2569. return true;
  2570. d_op = path->dentry->d_op;
  2571. if (d_op == &ns_dentry_operations)
  2572. return true;
  2573. if (d_op == &pidfs_dentry_operations)
  2574. return true;
  2575. if (!is_mounted(path->mnt))
  2576. return false;
  2577. return check_anonymous_mnt(mnt);
  2578. }
  2579. static struct mount *__do_loopback(const struct path *old_path,
  2580. unsigned int flags, unsigned int copy_flags)
  2581. {
  2582. struct mount *old = real_mount(old_path->mnt);
  2583. bool recurse = flags & AT_RECURSIVE;
  2584. if (IS_MNT_UNBINDABLE(old))
  2585. return ERR_PTR(-EINVAL);
  2586. if (!may_copy_tree(old_path))
  2587. return ERR_PTR(-EINVAL);
  2588. if (!recurse && __has_locked_children(old, old_path->dentry))
  2589. return ERR_PTR(-EINVAL);
  2590. /*
  2591. * When creating a new mount namespace we don't want to copy over
  2592. * mounts of mount namespaces to avoid the risk of cycles and also to
  2593. * minimize the default complex interdependencies between mount
  2594. * namespaces.
  2595. *
  2596. * We could ofc just check whether all mount namespace files aren't
  2597. * creating cycles but really let's keep this simple.
  2598. */
  2599. if (!(flags & OPEN_TREE_NAMESPACE))
  2600. copy_flags |= CL_COPY_MNT_NS_FILE;
  2601. if (recurse)
  2602. return copy_tree(old, old_path->dentry, copy_flags);
  2603. return clone_mnt(old, old_path->dentry, copy_flags);
  2604. }
  2605. /*
  2606. * do loopback mount.
  2607. */
  2608. static int do_loopback(const struct path *path, const char *old_name,
  2609. int recurse)
  2610. {
  2611. struct path old_path __free(path_put) = {};
  2612. struct mount *mnt = NULL;
  2613. unsigned int flags = recurse ? AT_RECURSIVE : 0;
  2614. int err;
  2615. if (!old_name || !*old_name)
  2616. return -EINVAL;
  2617. err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
  2618. if (err)
  2619. return err;
  2620. if (mnt_ns_loop(old_path.dentry))
  2621. return -EINVAL;
  2622. LOCK_MOUNT(mp, path);
  2623. if (IS_ERR(mp.parent))
  2624. return PTR_ERR(mp.parent);
  2625. if (!check_mnt(mp.parent))
  2626. return -EINVAL;
  2627. mnt = __do_loopback(&old_path, flags, 0);
  2628. if (IS_ERR(mnt))
  2629. return PTR_ERR(mnt);
  2630. err = graft_tree(mnt, &mp);
  2631. if (err) {
  2632. lock_mount_hash();
  2633. umount_tree(mnt, UMOUNT_SYNC);
  2634. unlock_mount_hash();
  2635. }
  2636. return err;
  2637. }
  2638. static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags)
  2639. {
  2640. struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
  2641. struct user_namespace *user_ns = mnt_ns->user_ns;
  2642. struct mount *mnt, *p;
  2643. ns = alloc_mnt_ns(user_ns, true);
  2644. if (IS_ERR(ns))
  2645. return ns;
  2646. guard(namespace_excl)();
  2647. /*
  2648. * Record the sequence number of the source mount namespace.
  2649. * This needs to hold namespace_sem to ensure that the mount
  2650. * doesn't get attached.
  2651. */
  2652. if (is_mounted(path->mnt)) {
  2653. src_mnt_ns = real_mount(path->mnt)->mnt_ns;
  2654. if (is_anon_ns(src_mnt_ns))
  2655. ns->seq_origin = src_mnt_ns->seq_origin;
  2656. else
  2657. ns->seq_origin = src_mnt_ns->ns.ns_id;
  2658. }
  2659. mnt = __do_loopback(path, flags, 0);
  2660. if (IS_ERR(mnt)) {
  2661. emptied_ns = ns;
  2662. return ERR_CAST(mnt);
  2663. }
  2664. for (p = mnt; p; p = next_mnt(p, mnt)) {
  2665. mnt_add_to_ns(ns, p);
  2666. ns->nr_mounts++;
  2667. }
  2668. ns->root = mnt;
  2669. return ns;
  2670. }
  2671. static struct file *open_detached_copy(struct path *path, unsigned int flags)
  2672. {
  2673. struct mnt_namespace *ns = get_detached_copy(path, flags);
  2674. struct file *file;
  2675. if (IS_ERR(ns))
  2676. return ERR_CAST(ns);
  2677. mntput(path->mnt);
  2678. path->mnt = mntget(&ns->root->mnt);
  2679. file = dentry_open(path, O_PATH, current_cred());
  2680. if (IS_ERR(file))
  2681. dissolve_on_fput(path->mnt);
  2682. else
  2683. file->f_mode |= FMODE_NEED_UNMOUNT;
  2684. return file;
  2685. }
/*
 * Create a new (non-anonymous) mount namespace owned by the caller's
 * current user namespace, consisting of a copy of the root mount of the
 * caller's mount namespace with a copy of the mount (tree) at @path
 * attached on top of it.
 *
 * If the caller's user namespace differs from the one owning its mount
 * namespace, the copies are made with CL_SLAVE and the new tree is run
 * through lock_mnt_tree().
 *
 * Returns the new namespace (already added via ns_tree_add_raw()) or an
 * error pointer.
 *
 * NOTE(review): lock_mount_exact() is defined outside this section; the
 * code below relies on LOCK_MOUNT_EXACT_COPY leaving the copy of the old
 * root in mp.parent - confirm against its definition.
 */
static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct user_namespace *user_ns = current_user_ns();
	struct mnt_namespace *new_ns;
	struct mount *new_ns_root, *old_ns_root;
	struct path to_path;
	struct mount *mnt;
	unsigned int copy_flags = 0;
	bool locked = false;

	if (user_ns != ns->user_ns)
		copy_flags |= CL_SLAVE;

	new_ns = alloc_mnt_ns(user_ns, false);
	if (IS_ERR(new_ns))
		return ERR_CAST(new_ns);

	/* Build a path naming the root of the caller's mount namespace. */
	old_ns_root = ns->root;
	to_path.mnt = &old_ns_root->mnt;
	to_path.dentry = old_ns_root->mnt.mnt_root;

	VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type);

	LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags);
	if (IS_ERR(mp.parent)) {
		/* Nothing has been put into new_ns yet - just free it. */
		free_mnt_ns(new_ns);
		return ERR_CAST(mp.parent);
	}
	new_ns_root = mp.parent;

	/*
	 * If the real rootfs had a locked mount on top of it somewhere
	 * in the stack, lock the new mount tree as well so it can't be
	 * exposed.
	 */
	mnt = old_ns_root;
	while (mnt->overmount) {
		mnt = mnt->overmount;
		if (mnt->mnt.mnt_flags & MNT_LOCKED)
			locked = true;
	}

	/*
	 * We don't emulate unshare()ing a mount namespace. We stick
	 * to the restrictions of creating detached bind-mounts. It
	 * has a lot saner and simpler semantics.
	 */
	mnt = __do_loopback(path, flags, copy_flags);
	scoped_guard(mount_writer) {
		if (IS_ERR(mnt)) {
			/*
			 * The copy failed: hand new_ns over via emptied_ns
			 * and take the copied root down again.
			 */
			emptied_ns = new_ns;
			umount_tree(new_ns_root, 0);
			return ERR_CAST(mnt);
		}

		if (locked)
			mnt->mnt.mnt_flags |= MNT_LOCKED;
		/*
		 * now mount the detached tree on top of the copy
		 * of the real rootfs we created.
		 */
		attach_mnt(mnt, new_ns_root, mp.mp);
		if (user_ns != ns->user_ns)
			lock_mnt_tree(new_ns_root);
	}

	/* Account every mount of the assembled tree to the new namespace. */
	for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) {
		mnt_add_to_ns(new_ns, mnt);
		new_ns->nr_mounts++;
	}
	new_ns->root = new_ns_root;
	ns_tree_add_raw(new_ns);
	return new_ns;
}
  2752. static struct file *open_new_namespace(struct path *path, unsigned int flags)
  2753. {
  2754. struct mnt_namespace *new_ns;
  2755. new_ns = create_new_namespace(path, flags);
  2756. if (IS_ERR(new_ns))
  2757. return ERR_CAST(new_ns);
  2758. return open_namespace_file(to_ns_common(new_ns));
  2759. }
  2760. static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
  2761. {
  2762. int ret;
  2763. struct path path __free(path_put) = {};
  2764. int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
  2765. BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
  2766. if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
  2767. AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
  2768. OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE))
  2769. return ERR_PTR(-EINVAL);
  2770. if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) ==
  2771. AT_RECURSIVE)
  2772. return ERR_PTR(-EINVAL);
  2773. if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1)
  2774. return ERR_PTR(-EINVAL);
  2775. if (flags & AT_NO_AUTOMOUNT)
  2776. lookup_flags &= ~LOOKUP_AUTOMOUNT;
  2777. if (flags & AT_SYMLINK_NOFOLLOW)
  2778. lookup_flags &= ~LOOKUP_FOLLOW;
  2779. /*
  2780. * If we create a new mount namespace with the cloned mount tree we
  2781. * just care about being privileged over our current user namespace.
  2782. * The new mount namespace will be owned by it.
  2783. */
  2784. if ((flags & OPEN_TREE_NAMESPACE) &&
  2785. !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
  2786. return ERR_PTR(-EPERM);
  2787. if ((flags & OPEN_TREE_CLONE) && !may_mount())
  2788. return ERR_PTR(-EPERM);
  2789. CLASS(filename_uflags, name)(filename, flags);
  2790. ret = filename_lookup(dfd, name, lookup_flags, &path, NULL);
  2791. if (unlikely(ret))
  2792. return ERR_PTR(ret);
  2793. if (flags & OPEN_TREE_NAMESPACE)
  2794. return open_new_namespace(&path, flags);
  2795. if (flags & OPEN_TREE_CLONE)
  2796. return open_detached_copy(&path, flags);
  2797. return dentry_open(&path, O_PATH, current_cred());
  2798. }
/*
 * open_tree(2) entry point: all the work is done by vfs_open_tree(); the
 * file (or error pointer) it returns is handed to FD_ADD() together
 * with @flags.
 * NOTE(review): FD_ADD() is defined elsewhere - presumably it turns the
 * file into a new descriptor and passes errors through; confirm.
 */
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
	return FD_ADD(flags, vfs_open_tree(dfd, filename, flags));
}
  2803. /*
  2804. * Don't allow locked mount flags to be cleared.
  2805. *
  2806. * No locks need to be held here while testing the various MNT_LOCK
  2807. * flags because those flags can never be cleared once they are set.
  2808. */
  2809. static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
  2810. {
  2811. unsigned int fl = mnt->mnt.mnt_flags;
  2812. if ((fl & MNT_LOCK_READONLY) &&
  2813. !(mnt_flags & MNT_READONLY))
  2814. return false;
  2815. if ((fl & MNT_LOCK_NODEV) &&
  2816. !(mnt_flags & MNT_NODEV))
  2817. return false;
  2818. if ((fl & MNT_LOCK_NOSUID) &&
  2819. !(mnt_flags & MNT_NOSUID))
  2820. return false;
  2821. if ((fl & MNT_LOCK_NOEXEC) &&
  2822. !(mnt_flags & MNT_NOEXEC))
  2823. return false;
  2824. if ((fl & MNT_LOCK_ATIME) &&
  2825. ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
  2826. return false;
  2827. return true;
  2828. }
  2829. static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
  2830. {
  2831. bool readonly_request = (mnt_flags & MNT_READONLY);
  2832. if (readonly_request == __mnt_is_readonly(&mnt->mnt))
  2833. return 0;
  2834. if (readonly_request)
  2835. return mnt_make_readonly(mnt);
  2836. mnt->mnt.mnt_flags &= ~MNT_READONLY;
  2837. return 0;
  2838. }
  2839. static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
  2840. {
  2841. mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
  2842. mnt->mnt.mnt_flags = mnt_flags;
  2843. touch_mnt_namespace(mnt->mnt_ns);
  2844. }
  2845. static void mnt_warn_timestamp_expiry(const struct path *mountpoint,
  2846. struct vfsmount *mnt)
  2847. {
  2848. struct super_block *sb = mnt->mnt_sb;
  2849. if (!__mnt_is_readonly(mnt) &&
  2850. (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
  2851. (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
  2852. char *buf, *mntpath;
  2853. buf = (char *)__get_free_page(GFP_KERNEL);
  2854. if (buf)
  2855. mntpath = d_path(mountpoint, buf, PAGE_SIZE);
  2856. else
  2857. mntpath = ERR_PTR(-ENOMEM);
  2858. if (IS_ERR(mntpath))
  2859. mntpath = "(unknown)";
  2860. pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
  2861. sb->s_type->name,
  2862. is_mounted(mnt) ? "remounted" : "mounted",
  2863. mntpath, &sb->s_time_max,
  2864. (unsigned long long)sb->s_time_max);
  2865. sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
  2866. if (buf)
  2867. free_page((unsigned long)buf);
  2868. }
  2869. }
  2870. /*
  2871. * Handle reconfiguration of the mountpoint only without alteration of the
  2872. * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
  2873. * to mount(2).
  2874. */
  2875. static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags)
  2876. {
  2877. struct super_block *sb = path->mnt->mnt_sb;
  2878. struct mount *mnt = real_mount(path->mnt);
  2879. int ret;
  2880. if (!check_mnt(mnt))
  2881. return -EINVAL;
  2882. if (!path_mounted(path))
  2883. return -EINVAL;
  2884. if (!can_change_locked_flags(mnt, mnt_flags))
  2885. return -EPERM;
  2886. /*
  2887. * We're only checking whether the superblock is read-only not
  2888. * changing it, so only take down_read(&sb->s_umount).
  2889. */
  2890. down_read(&sb->s_umount);
  2891. lock_mount_hash();
  2892. ret = change_mount_ro_state(mnt, mnt_flags);
  2893. if (ret == 0)
  2894. set_mount_attributes(mnt, mnt_flags);
  2895. unlock_mount_hash();
  2896. up_read(&sb->s_umount);
  2897. mnt_warn_timestamp_expiry(path, &mnt->mnt);
  2898. return ret;
  2899. }
  2900. /*
  2901. * change filesystem flags. dir should be a physical root of filesystem.
  2902. * If you've mounted a non-root directory somewhere and want to do remount
  2903. * on it - tough luck.
  2904. */
  2905. static int do_remount(const struct path *path, int sb_flags,
  2906. int mnt_flags, void *data)
  2907. {
  2908. int err;
  2909. struct super_block *sb = path->mnt->mnt_sb;
  2910. struct mount *mnt = real_mount(path->mnt);
  2911. struct fs_context *fc;
  2912. if (!check_mnt(mnt))
  2913. return -EINVAL;
  2914. if (!path_mounted(path))
  2915. return -EINVAL;
  2916. if (!can_change_locked_flags(mnt, mnt_flags))
  2917. return -EPERM;
  2918. fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
  2919. if (IS_ERR(fc))
  2920. return PTR_ERR(fc);
  2921. /*
  2922. * Indicate to the filesystem that the remount request is coming
  2923. * from the legacy mount system call.
  2924. */
  2925. fc->oldapi = true;
  2926. err = parse_monolithic_mount_data(fc, data);
  2927. if (!err) {
  2928. down_write(&sb->s_umount);
  2929. err = -EPERM;
  2930. if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
  2931. err = reconfigure_super(fc);
  2932. if (!err) {
  2933. lock_mount_hash();
  2934. set_mount_attributes(mnt, mnt_flags);
  2935. unlock_mount_hash();
  2936. }
  2937. }
  2938. up_write(&sb->s_umount);
  2939. }
  2940. mnt_warn_timestamp_expiry(path, &mnt->mnt);
  2941. put_fs_context(fc);
  2942. return err;
  2943. }
  2944. static inline int tree_contains_unbindable(struct mount *mnt)
  2945. {
  2946. struct mount *p;
  2947. for (p = mnt; p; p = next_mnt(p, mnt)) {
  2948. if (IS_MNT_UNBINDABLE(p))
  2949. return 1;
  2950. }
  2951. return 0;
  2952. }
  2953. static int do_set_group(const struct path *from_path, const struct path *to_path)
  2954. {
  2955. struct mount *from = real_mount(from_path->mnt);
  2956. struct mount *to = real_mount(to_path->mnt);
  2957. int err;
  2958. guard(namespace_excl)();
  2959. err = may_change_propagation(from);
  2960. if (err)
  2961. return err;
  2962. err = may_change_propagation(to);
  2963. if (err)
  2964. return err;
  2965. /* To and From paths should be mount roots */
  2966. if (!path_mounted(from_path))
  2967. return -EINVAL;
  2968. if (!path_mounted(to_path))
  2969. return -EINVAL;
  2970. /* Setting sharing groups is only allowed across same superblock */
  2971. if (from->mnt.mnt_sb != to->mnt.mnt_sb)
  2972. return -EINVAL;
  2973. /* From mount root should be wider than To mount root */
  2974. if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
  2975. return -EINVAL;
  2976. /* From mount should not have locked children in place of To's root */
  2977. if (__has_locked_children(from, to->mnt.mnt_root))
  2978. return -EINVAL;
  2979. /* Setting sharing groups is only allowed on private mounts */
  2980. if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
  2981. return -EINVAL;
  2982. /* From should not be private */
  2983. if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
  2984. return -EINVAL;
  2985. if (IS_MNT_SLAVE(from)) {
  2986. hlist_add_behind(&to->mnt_slave, &from->mnt_slave);
  2987. to->mnt_master = from->mnt_master;
  2988. }
  2989. if (IS_MNT_SHARED(from)) {
  2990. to->mnt_group_id = from->mnt_group_id;
  2991. list_add(&to->mnt_share, &from->mnt_share);
  2992. set_mnt_shared(to);
  2993. }
  2994. return 0;
  2995. }
  2996. /**
  2997. * path_overmounted - check if path is overmounted
  2998. * @path: path to check
  2999. *
  3000. * Check if path is overmounted, i.e., if there's a mount on top of
  3001. * @path->mnt with @path->dentry as mountpoint.
  3002. *
  3003. * Context: namespace_sem must be held at least shared.
  3004. * MUST NOT be called under lock_mount_hash() (there one should just
  3005. * call __lookup_mnt() and check if it returns NULL).
  3006. * Return: If path is overmounted true is returned, false if not.
  3007. */
  3008. static inline bool path_overmounted(const struct path *path)
  3009. {
  3010. unsigned seq = read_seqbegin(&mount_lock);
  3011. bool no_child;
  3012. rcu_read_lock();
  3013. no_child = !__lookup_mnt(path->mnt, path->dentry);
  3014. rcu_read_unlock();
  3015. if (need_seqretry(&mount_lock, seq)) {
  3016. read_seqlock_excl(&mount_lock);
  3017. no_child = !__lookup_mnt(path->mnt, path->dentry);
  3018. read_sequnlock_excl(&mount_lock);
  3019. }
  3020. return unlikely(!no_child);
  3021. }
  3022. /*
  3023. * Check if there is a possibly empty chain of descent from p1 to p2.
  3024. * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl).
  3025. */
  3026. static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
  3027. {
  3028. while (p2 != p1 && mnt_has_parent(p2))
  3029. p2 = p2->mnt_parent;
  3030. return p2 == p1;
  3031. }
  3032. /**
  3033. * can_move_mount_beneath - check that we can mount beneath the top mount
  3034. * @mnt_from: mount we are trying to move
  3035. * @mnt_to: mount under which to mount
  3036. * @mp: mountpoint of @mnt_to
  3037. *
  3038. * - Make sure that nothing can be mounted beneath the caller's current
  3039. * root or the rootfs of the namespace.
  3040. * - Make sure that the caller can unmount the topmost mount ensuring
  3041. * that the caller could reveal the underlying mountpoint.
  3042. * - Ensure that nothing has been mounted on top of @mnt_from before we
  3043. * grabbed @namespace_sem to avoid creating pointless shadow mounts.
  3044. * - Prevent mounting beneath a mount if the propagation relationship
  3045. * between the source mount, parent mount, and top mount would lead to
  3046. * nonsensical mount trees.
  3047. *
  3048. * Context: This function expects namespace_lock() to be held.
  3049. * Return: On success 0, and on error a negative error code is returned.
  3050. */
  3051. static int can_move_mount_beneath(const struct mount *mnt_from,
  3052. const struct mount *mnt_to,
  3053. const struct mountpoint *mp)
  3054. {
  3055. struct mount *parent_mnt_to = mnt_to->mnt_parent;
  3056. if (IS_MNT_LOCKED(mnt_to))
  3057. return -EINVAL;
  3058. /* Avoid creating shadow mounts during mount propagation. */
  3059. if (mnt_from->overmount)
  3060. return -EINVAL;
  3061. /*
  3062. * Mounting beneath the rootfs only makes sense when the
  3063. * semantics of pivot_root(".", ".") are used.
  3064. */
  3065. if (&mnt_to->mnt == current->fs->root.mnt)
  3066. return -EINVAL;
  3067. if (parent_mnt_to == current->nsproxy->mnt_ns->root)
  3068. return -EINVAL;
  3069. if (mount_is_ancestor(mnt_to, mnt_from))
  3070. return -EINVAL;
  3071. /*
  3072. * If the parent mount propagates to the child mount this would
  3073. * mean mounting @mnt_from on @mnt_to->mnt_parent and then
  3074. * propagating a copy @c of @mnt_from on top of @mnt_to. This
  3075. * defeats the whole purpose of mounting beneath another mount.
  3076. */
  3077. if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
  3078. return -EINVAL;
  3079. /*
  3080. * If @mnt_to->mnt_parent propagates to @mnt_from this would
  3081. * mean propagating a copy @c of @mnt_from on top of @mnt_from.
  3082. * Afterwards @mnt_from would be mounted on top of
  3083. * @mnt_to->mnt_parent and @mnt_to would be unmounted from
  3084. * @mnt->mnt_parent and remounted on @mnt_from. But since @c is
  3085. * already mounted on @mnt_from, @mnt_to would ultimately be
  3086. * remounted on top of @c. Afterwards, @mnt_from would be
  3087. * covered by a copy @c of @mnt_from and @c would be covered by
  3088. * @mnt_from itself. This defeats the whole purpose of mounting
  3089. * @mnt_from beneath @mnt_to.
  3090. */
  3091. if (check_mnt(mnt_from) &&
  3092. propagation_would_overmount(parent_mnt_to, mnt_from, mp))
  3093. return -EINVAL;
  3094. return 0;
  3095. }
  3096. /* may_use_mount() - check if a mount tree can be used
  3097. * @mnt: vfsmount to be used
  3098. *
  3099. * This helper checks if the caller may use the mount tree starting
  3100. * from @path->mnt. The caller may use the mount tree under the
  3101. * following circumstances:
  3102. *
  3103. * (1) The caller is located in the mount namespace of the mount tree.
  3104. * This also implies that the mount does not belong to an anonymous
  3105. * mount namespace.
  3106. * (2) The caller is trying to use a mount tree that belongs to an
  3107. * anonymous mount namespace.
  3108. *
  3109. * For that to be safe, this helper enforces that the origin mount
  3110. * namespace the anonymous mount namespace was created from is the
  3111. * same as the caller's mount namespace by comparing the sequence
  3112. * numbers.
  3113. *
  3114. * The ownership of a non-anonymous mount namespace such as the
  3115. * caller's cannot change.
  3116. * => We know that the caller's mount namespace is stable.
  3117. *
  3118. * If the origin sequence number of the anonymous mount namespace is
  3119. * the same as the sequence number of the caller's mount namespace.
  3120. * => The owning namespaces are the same.
  3121. *
  3122. * ==> The earlier capability check on the owning namespace of the
  3123. * caller's mount namespace ensures that the caller has the
  3124. * ability to use the mount tree.
  3125. *
  3126. * Returns true if the mount tree can be used, false otherwise.
  3127. */
  3128. static inline bool may_use_mount(struct mount *mnt)
  3129. {
  3130. if (check_mnt(mnt))
  3131. return true;
  3132. /*
  3133. * Make sure that noone unmounted the target path or somehow
  3134. * managed to get their hands on something purely kernel
  3135. * internal.
  3136. */
  3137. if (!is_mounted(&mnt->mnt))
  3138. return false;
  3139. return check_anonymous_mnt(mnt);
  3140. }
  3141. static int do_move_mount(const struct path *old_path,
  3142. const struct path *new_path,
  3143. enum mnt_tree_flags_t flags)
  3144. {
  3145. struct mount *old = real_mount(old_path->mnt);
  3146. int err;
  3147. bool beneath = flags & MNT_TREE_BENEATH;
  3148. if (!path_mounted(old_path))
  3149. return -EINVAL;
  3150. if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry))
  3151. return -EINVAL;
  3152. LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath);
  3153. if (IS_ERR(mp.parent))
  3154. return PTR_ERR(mp.parent);
  3155. if (check_mnt(old)) {
  3156. /* if the source is in our namespace... */
  3157. /* ... it should be detachable from parent */
  3158. if (!mnt_has_parent(old) || IS_MNT_LOCKED(old))
  3159. return -EINVAL;
  3160. /* ... which should not be shared */
  3161. if (IS_MNT_SHARED(old->mnt_parent))
  3162. return -EINVAL;
  3163. /* ... and the target should be in our namespace */
  3164. if (!check_mnt(mp.parent))
  3165. return -EINVAL;
  3166. } else {
  3167. /*
  3168. * otherwise the source must be the root of some anon namespace.
  3169. */
  3170. if (!anon_ns_root(old))
  3171. return -EINVAL;
  3172. /*
  3173. * Bail out early if the target is within the same namespace -
  3174. * subsequent checks would've rejected that, but they lose
  3175. * some corner cases if we check it early.
  3176. */
  3177. if (old->mnt_ns == mp.parent->mnt_ns)
  3178. return -EINVAL;
  3179. /*
  3180. * Target should be either in our namespace or in an acceptable
  3181. * anon namespace, sensu check_anonymous_mnt().
  3182. */
  3183. if (!may_use_mount(mp.parent))
  3184. return -EINVAL;
  3185. }
  3186. if (beneath) {
  3187. struct mount *over = real_mount(new_path->mnt);
  3188. if (mp.parent != over->mnt_parent)
  3189. over = mp.parent->overmount;
  3190. err = can_move_mount_beneath(old, over, mp.mp);
  3191. if (err)
  3192. return err;
  3193. }
  3194. /*
  3195. * Don't move a mount tree containing unbindable mounts to a destination
  3196. * mount which is shared.
  3197. */
  3198. if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old))
  3199. return -EINVAL;
  3200. if (!check_for_nsfs_mounts(old))
  3201. return -ELOOP;
  3202. if (mount_is_ancestor(old, mp.parent))
  3203. return -ELOOP;
  3204. return attach_recursive_mnt(old, &mp);
  3205. }
  3206. static int do_move_mount_old(const struct path *path, const char *old_name)
  3207. {
  3208. struct path old_path __free(path_put) = {};
  3209. int err;
  3210. if (!old_name || !*old_name)
  3211. return -EINVAL;
  3212. err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
  3213. if (err)
  3214. return err;
  3215. return do_move_mount(&old_path, path, 0);
  3216. }
  3217. /*
  3218. * add a mount into a namespace's mount tree
  3219. */
  3220. static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp,
  3221. int mnt_flags)
  3222. {
  3223. struct mount *parent = mp->parent;
  3224. if (IS_ERR(parent))
  3225. return PTR_ERR(parent);
  3226. mnt_flags &= ~MNT_INTERNAL_FLAGS;
  3227. if (unlikely(!check_mnt(parent))) {
  3228. /* that's acceptable only for automounts done in private ns */
  3229. if (!(mnt_flags & MNT_SHRINKABLE))
  3230. return -EINVAL;
  3231. /* ... and for those we'd better have mountpoint still alive */
  3232. if (!parent->mnt_ns)
  3233. return -EINVAL;
  3234. }
  3235. /* Refuse the same filesystem on the same mount point */
  3236. if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb &&
  3237. parent->mnt.mnt_root == mp->mp->m_dentry)
  3238. return -EBUSY;
  3239. if (d_is_symlink(newmnt->mnt.mnt_root))
  3240. return -EINVAL;
  3241. newmnt->mnt.mnt_flags = mnt_flags;
  3242. return graft_tree(newmnt, mp);
  3243. }
  3244. static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
  3245. /*
  3246. * Create a new mount using a superblock configuration and request it
  3247. * be added to the namespace tree.
  3248. */
  3249. static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint,
  3250. unsigned int mnt_flags)
  3251. {
  3252. struct super_block *sb;
  3253. struct vfsmount *mnt __free(mntput) = fc_mount(fc);
  3254. int error;
  3255. if (IS_ERR(mnt))
  3256. return PTR_ERR(mnt);
  3257. sb = fc->root->d_sb;
  3258. error = security_sb_kern_mount(sb);
  3259. if (unlikely(error))
  3260. return error;
  3261. if (unlikely(mount_too_revealing(sb, &mnt_flags))) {
  3262. errorfcp(fc, "VFS", "Mount too revealing");
  3263. return -EPERM;
  3264. }
  3265. mnt_warn_timestamp_expiry(mountpoint, mnt);
  3266. LOCK_MOUNT(mp, mountpoint);
  3267. error = do_add_mount(real_mount(mnt), &mp, mnt_flags);
  3268. if (!error)
  3269. retain_and_null_ptr(mnt); // consumed on success
  3270. return error;
  3271. }
  3272. /*
  3273. * create a new mount for userspace and request it to be added into the
  3274. * namespace's tree
  3275. */
  3276. static int do_new_mount(const struct path *path, const char *fstype,
  3277. int sb_flags, int mnt_flags,
  3278. const char *name, void *data)
  3279. {
  3280. struct file_system_type *type;
  3281. struct fs_context *fc;
  3282. const char *subtype = NULL;
  3283. int err = 0;
  3284. if (!fstype)
  3285. return -EINVAL;
  3286. type = get_fs_type(fstype);
  3287. if (!type)
  3288. return -ENODEV;
  3289. if (type->fs_flags & FS_HAS_SUBTYPE) {
  3290. subtype = strchr(fstype, '.');
  3291. if (subtype) {
  3292. subtype++;
  3293. if (!*subtype) {
  3294. put_filesystem(type);
  3295. return -EINVAL;
  3296. }
  3297. }
  3298. }
  3299. fc = fs_context_for_mount(type, sb_flags);
  3300. put_filesystem(type);
  3301. if (IS_ERR(fc))
  3302. return PTR_ERR(fc);
  3303. /*
  3304. * Indicate to the filesystem that the mount request is coming
  3305. * from the legacy mount system call.
  3306. */
  3307. fc->oldapi = true;
  3308. if (subtype)
  3309. err = vfs_parse_fs_string(fc, "subtype", subtype);
  3310. if (!err && name)
  3311. err = vfs_parse_fs_string(fc, "source", name);
  3312. if (!err)
  3313. err = parse_monolithic_mount_data(fc, data);
  3314. if (!err && !mount_capable(fc))
  3315. err = -EPERM;
  3316. if (!err)
  3317. err = do_new_mount_fc(fc, path, mnt_flags);
  3318. put_fs_context(fc);
  3319. return err;
  3320. }
  3321. static void lock_mount_exact(const struct path *path,
  3322. struct pinned_mountpoint *mp, bool copy_mount,
  3323. unsigned int copy_flags)
  3324. {
  3325. struct dentry *dentry = path->dentry;
  3326. int err;
  3327. /* Assert that inode_lock() locked the correct inode. */
  3328. VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path));
  3329. inode_lock(dentry->d_inode);
  3330. namespace_lock();
  3331. if (unlikely(cant_mount(dentry)))
  3332. err = -ENOENT;
  3333. else if (!copy_mount && path_overmounted(path))
  3334. err = -EBUSY;
  3335. else
  3336. err = get_mountpoint(dentry, mp);
  3337. if (unlikely(err)) {
  3338. namespace_unlock();
  3339. inode_unlock(dentry->d_inode);
  3340. mp->parent = ERR_PTR(err);
  3341. return;
  3342. }
  3343. if (copy_mount)
  3344. mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags);
  3345. else
  3346. mp->parent = real_mount(path->mnt);
  3347. if (unlikely(IS_ERR(mp->parent)))
  3348. __unlock_mount(mp);
  3349. }
  3350. int finish_automount(struct vfsmount *__m, const struct path *path)
  3351. {
  3352. struct vfsmount *m __free(mntput) = __m;
  3353. struct mount *mnt;
  3354. int err;
  3355. if (!m)
  3356. return 0;
  3357. if (IS_ERR(m))
  3358. return PTR_ERR(m);
  3359. mnt = real_mount(m);
  3360. if (m->mnt_root == path->dentry)
  3361. return -ELOOP;
  3362. /*
  3363. * we don't want to use LOCK_MOUNT() - in this case finding something
  3364. * that overmounts our mountpoint to be means "quitely drop what we've
  3365. * got", not "try to mount it on top".
  3366. */
  3367. LOCK_MOUNT_EXACT(mp, path);
  3368. if (mp.parent == ERR_PTR(-EBUSY))
  3369. return 0;
  3370. err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE);
  3371. if (likely(!err))
  3372. retain_and_null_ptr(m);
  3373. return err;
  3374. }
  3375. /**
  3376. * mnt_set_expiry - Put a mount on an expiration list
  3377. * @mnt: The mount to list.
  3378. * @expiry_list: The list to add the mount to.
  3379. */
  3380. void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
  3381. {
  3382. guard(mount_locked_reader)();
  3383. list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
  3384. }
  3385. EXPORT_SYMBOL(mnt_set_expiry);
  3386. /*
  3387. * process a list of expirable mountpoints with the intent of discarding any
  3388. * mountpoints that aren't in use and haven't been touched since last we came
  3389. * here
  3390. */
  3391. void mark_mounts_for_expiry(struct list_head *mounts)
  3392. {
  3393. struct mount *mnt, *next;
  3394. LIST_HEAD(graveyard);
  3395. if (list_empty(mounts))
  3396. return;
  3397. guard(namespace_excl)();
  3398. guard(mount_writer)();
  3399. /* extract from the expiration list every vfsmount that matches the
  3400. * following criteria:
  3401. * - already mounted
  3402. * - only referenced by its parent vfsmount
  3403. * - still marked for expiry (marked on the last call here; marks are
  3404. * cleared by mntput())
  3405. */
  3406. list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
  3407. if (!is_mounted(&mnt->mnt))
  3408. continue;
  3409. if (!xchg(&mnt->mnt_expiry_mark, 1) ||
  3410. propagate_mount_busy(mnt, 1))
  3411. continue;
  3412. list_move(&mnt->mnt_expire, &graveyard);
  3413. }
  3414. while (!list_empty(&graveyard)) {
  3415. mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
  3416. touch_mnt_namespace(mnt->mnt_ns);
  3417. umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
  3418. }
  3419. }
  3420. EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
  3421. /*
  3422. * Ripoff of 'select_parent()'
  3423. *
  3424. * search the list of submounts for a given mountpoint, and move any
  3425. * shrinkable submounts to the 'graveyard' list.
  3426. */
  3427. static int select_submounts(struct mount *parent, struct list_head *graveyard)
  3428. {
  3429. struct mount *this_parent = parent;
  3430. struct list_head *next;
  3431. int found = 0;
  3432. repeat:
  3433. next = this_parent->mnt_mounts.next;
  3434. resume:
  3435. while (next != &this_parent->mnt_mounts) {
  3436. struct list_head *tmp = next;
  3437. struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
  3438. next = tmp->next;
  3439. if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
  3440. continue;
  3441. /*
  3442. * Descend a level if the d_mounts list is non-empty.
  3443. */
  3444. if (!list_empty(&mnt->mnt_mounts)) {
  3445. this_parent = mnt;
  3446. goto repeat;
  3447. }
  3448. if (!propagate_mount_busy(mnt, 1)) {
  3449. list_move_tail(&mnt->mnt_expire, graveyard);
  3450. found++;
  3451. }
  3452. }
  3453. /*
  3454. * All done at this level ... ascend and resume the search
  3455. */
  3456. if (this_parent != parent) {
  3457. next = this_parent->mnt_child.next;
  3458. this_parent = this_parent->mnt_parent;
  3459. goto resume;
  3460. }
  3461. return found;
  3462. }
  3463. /*
  3464. * process a list of expirable mountpoints with the intent of discarding any
  3465. * submounts of a specific parent mountpoint
  3466. *
  3467. * mount_lock must be held for write
  3468. */
  3469. static void shrink_submounts(struct mount *mnt)
  3470. {
  3471. LIST_HEAD(graveyard);
  3472. struct mount *m;
  3473. /* extract submounts of 'mountpoint' from the expiration list */
  3474. while (select_submounts(mnt, &graveyard)) {
  3475. while (!list_empty(&graveyard)) {
  3476. m = list_first_entry(&graveyard, struct mount,
  3477. mnt_expire);
  3478. touch_mnt_namespace(m->mnt_ns);
  3479. umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
  3480. }
  3481. }
  3482. }
  3483. static void *copy_mount_options(const void __user * data)
  3484. {
  3485. char *copy;
  3486. unsigned left, offset;
  3487. if (!data)
  3488. return NULL;
  3489. copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
  3490. if (!copy)
  3491. return ERR_PTR(-ENOMEM);
  3492. left = copy_from_user(copy, data, PAGE_SIZE);
  3493. /*
  3494. * Not all architectures have an exact copy_from_user(). Resort to
  3495. * byte at a time.
  3496. */
  3497. offset = PAGE_SIZE - left;
  3498. while (left) {
  3499. char c;
  3500. if (get_user(c, (const char __user *)data + offset))
  3501. break;
  3502. copy[offset] = c;
  3503. left--;
  3504. offset++;
  3505. }
  3506. if (left == PAGE_SIZE) {
  3507. kfree(copy);
  3508. return ERR_PTR(-EFAULT);
  3509. }
  3510. return copy;
  3511. }
  3512. static char *copy_mount_string(const void __user *data)
  3513. {
  3514. return data ? strndup_user(data, PATH_MAX) : NULL;
  3515. }
  3516. /*
  3517. * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
  3518. * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
  3519. *
  3520. * data is a (void *) that can point to any structure up to
  3521. * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
  3522. * information (or be NULL).
  3523. *
  3524. * Pre-0.97 versions of mount() didn't have a flags word.
  3525. * When the flags word was introduced its top half was required
  3526. * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
  3527. * Therefore, if this magic number is present, it carries no information
  3528. * and must be discarded.
  3529. */
  3530. int path_mount(const char *dev_name, const struct path *path,
  3531. const char *type_page, unsigned long flags, void *data_page)
  3532. {
  3533. unsigned int mnt_flags = 0, sb_flags;
  3534. int ret;
  3535. /* Discard magic */
  3536. if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
  3537. flags &= ~MS_MGC_MSK;
  3538. /* Basic sanity checks */
  3539. if (data_page)
  3540. ((char *)data_page)[PAGE_SIZE - 1] = 0;
  3541. if (flags & MS_NOUSER)
  3542. return -EINVAL;
  3543. ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
  3544. if (ret)
  3545. return ret;
  3546. if (!may_mount())
  3547. return -EPERM;
  3548. if (flags & SB_MANDLOCK)
  3549. warn_mandlock();
  3550. /* Default to relatime unless overriden */
  3551. if (!(flags & MS_NOATIME))
  3552. mnt_flags |= MNT_RELATIME;
  3553. /* Separate the per-mountpoint flags */
  3554. if (flags & MS_NOSUID)
  3555. mnt_flags |= MNT_NOSUID;
  3556. if (flags & MS_NODEV)
  3557. mnt_flags |= MNT_NODEV;
  3558. if (flags & MS_NOEXEC)
  3559. mnt_flags |= MNT_NOEXEC;
  3560. if (flags & MS_NOATIME)
  3561. mnt_flags |= MNT_NOATIME;
  3562. if (flags & MS_NODIRATIME)
  3563. mnt_flags |= MNT_NODIRATIME;
  3564. if (flags & MS_STRICTATIME)
  3565. mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
  3566. if (flags & MS_RDONLY)
  3567. mnt_flags |= MNT_READONLY;
  3568. if (flags & MS_NOSYMFOLLOW)
  3569. mnt_flags |= MNT_NOSYMFOLLOW;
  3570. /* The default atime for remount is preservation */
  3571. if ((flags & MS_REMOUNT) &&
  3572. ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
  3573. MS_STRICTATIME)) == 0)) {
  3574. mnt_flags &= ~MNT_ATIME_MASK;
  3575. mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
  3576. }
  3577. sb_flags = flags & (SB_RDONLY |
  3578. SB_SYNCHRONOUS |
  3579. SB_MANDLOCK |
  3580. SB_DIRSYNC |
  3581. SB_SILENT |
  3582. SB_POSIXACL |
  3583. SB_LAZYTIME |
  3584. SB_I_VERSION);
  3585. if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
  3586. return do_reconfigure_mnt(path, mnt_flags);
  3587. if (flags & MS_REMOUNT)
  3588. return do_remount(path, sb_flags, mnt_flags, data_page);
  3589. if (flags & MS_BIND)
  3590. return do_loopback(path, dev_name, flags & MS_REC);
  3591. if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
  3592. return do_change_type(path, flags);
  3593. if (flags & MS_MOVE)
  3594. return do_move_mount_old(path, dev_name);
  3595. return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
  3596. data_page);
  3597. }
  3598. int do_mount(const char *dev_name, const char __user *dir_name,
  3599. const char *type_page, unsigned long flags, void *data_page)
  3600. {
  3601. struct path path __free(path_put) = {};
  3602. int ret;
  3603. ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
  3604. if (ret)
  3605. return ret;
  3606. return path_mount(dev_name, &path, type_page, flags, data_page);
  3607. }
  3608. static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
  3609. {
  3610. return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
  3611. }
  3612. static void dec_mnt_namespaces(struct ucounts *ucounts)
  3613. {
  3614. dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
  3615. }
  3616. static void free_mnt_ns(struct mnt_namespace *ns)
  3617. {
  3618. if (!is_anon_ns(ns))
  3619. ns_common_free(ns);
  3620. dec_mnt_namespaces(ns->ucounts);
  3621. mnt_ns_tree_remove(ns);
  3622. }
  3623. static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
  3624. {
  3625. struct mnt_namespace *new_ns;
  3626. struct ucounts *ucounts;
  3627. int ret;
  3628. ucounts = inc_mnt_namespaces(user_ns);
  3629. if (!ucounts)
  3630. return ERR_PTR(-ENOSPC);
  3631. new_ns = kzalloc_obj(struct mnt_namespace, GFP_KERNEL_ACCOUNT);
  3632. if (!new_ns) {
  3633. dec_mnt_namespaces(ucounts);
  3634. return ERR_PTR(-ENOMEM);
  3635. }
  3636. if (anon)
  3637. ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO);
  3638. else
  3639. ret = ns_common_init(new_ns);
  3640. if (ret) {
  3641. kfree(new_ns);
  3642. dec_mnt_namespaces(ucounts);
  3643. return ERR_PTR(ret);
  3644. }
  3645. ns_tree_gen_id(new_ns);
  3646. new_ns->is_anon = anon;
  3647. refcount_set(&new_ns->passive, 1);
  3648. new_ns->mounts = RB_ROOT;
  3649. init_waitqueue_head(&new_ns->poll);
  3650. new_ns->user_ns = get_user_ns(user_ns);
  3651. new_ns->ucounts = ucounts;
  3652. return new_ns;
  3653. }
  3654. __latent_entropy
  3655. struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
  3656. struct user_namespace *user_ns, struct fs_struct *new_fs)
  3657. {
  3658. struct mnt_namespace *new_ns;
  3659. struct vfsmount *rootmnt __free(mntput) = NULL;
  3660. struct vfsmount *pwdmnt __free(mntput) = NULL;
  3661. struct mount *p, *q;
  3662. struct mount *old;
  3663. struct mount *new;
  3664. int copy_flags;
  3665. BUG_ON(!ns);
  3666. if (likely(!(flags & CLONE_NEWNS))) {
  3667. get_mnt_ns(ns);
  3668. return ns;
  3669. }
  3670. old = ns->root;
  3671. new_ns = alloc_mnt_ns(user_ns, false);
  3672. if (IS_ERR(new_ns))
  3673. return new_ns;
  3674. guard(namespace_excl)();
  3675. /* First pass: copy the tree topology */
  3676. copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
  3677. if (user_ns != ns->user_ns)
  3678. copy_flags |= CL_SLAVE;
  3679. new = copy_tree(old, old->mnt.mnt_root, copy_flags);
  3680. if (IS_ERR(new)) {
  3681. emptied_ns = new_ns;
  3682. return ERR_CAST(new);
  3683. }
  3684. if (user_ns != ns->user_ns) {
  3685. guard(mount_writer)();
  3686. lock_mnt_tree(new);
  3687. }
  3688. new_ns->root = new;
  3689. /*
  3690. * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
  3691. * as belonging to new namespace. We have already acquired a private
  3692. * fs_struct, so tsk->fs->lock is not needed.
  3693. */
  3694. p = old;
  3695. q = new;
  3696. while (p) {
  3697. mnt_add_to_ns(new_ns, q);
  3698. new_ns->nr_mounts++;
  3699. if (new_fs) {
  3700. if (&p->mnt == new_fs->root.mnt) {
  3701. new_fs->root.mnt = mntget(&q->mnt);
  3702. rootmnt = &p->mnt;
  3703. }
  3704. if (&p->mnt == new_fs->pwd.mnt) {
  3705. new_fs->pwd.mnt = mntget(&q->mnt);
  3706. pwdmnt = &p->mnt;
  3707. }
  3708. }
  3709. p = next_mnt(p, old);
  3710. q = next_mnt(q, new);
  3711. if (!q)
  3712. break;
  3713. // an mntns binding we'd skipped?
  3714. while (p->mnt.mnt_root != q->mnt.mnt_root)
  3715. p = next_mnt(skip_mnt_tree(p), old);
  3716. }
  3717. ns_tree_add_raw(new_ns);
  3718. return new_ns;
  3719. }
  3720. struct dentry *mount_subtree(struct vfsmount *m, const char *name)
  3721. {
  3722. struct mount *mnt = real_mount(m);
  3723. struct mnt_namespace *ns;
  3724. struct super_block *s;
  3725. struct path path;
  3726. int err;
  3727. ns = alloc_mnt_ns(&init_user_ns, true);
  3728. if (IS_ERR(ns)) {
  3729. mntput(m);
  3730. return ERR_CAST(ns);
  3731. }
  3732. ns->root = mnt;
  3733. ns->nr_mounts++;
  3734. mnt_add_to_ns(ns, mnt);
  3735. err = vfs_path_lookup(m->mnt_root, m,
  3736. name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
  3737. put_mnt_ns(ns);
  3738. if (err)
  3739. return ERR_PTR(err);
  3740. /* trade a vfsmount reference for active sb one */
  3741. s = path.mnt->mnt_sb;
  3742. atomic_inc(&s->s_active);
  3743. mntput(path.mnt);
  3744. /* lock the sucker */
  3745. down_write(&s->s_umount);
  3746. /* ... and return the root of (sub)tree on it */
  3747. return path.dentry;
  3748. }
  3749. EXPORT_SYMBOL(mount_subtree);
  3750. SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
  3751. char __user *, type, unsigned long, flags, void __user *, data)
  3752. {
  3753. int ret;
  3754. char *kernel_type;
  3755. char *kernel_dev;
  3756. void *options;
  3757. kernel_type = copy_mount_string(type);
  3758. ret = PTR_ERR(kernel_type);
  3759. if (IS_ERR(kernel_type))
  3760. goto out_type;
  3761. kernel_dev = copy_mount_string(dev_name);
  3762. ret = PTR_ERR(kernel_dev);
  3763. if (IS_ERR(kernel_dev))
  3764. goto out_dev;
  3765. options = copy_mount_options(data);
  3766. ret = PTR_ERR(options);
  3767. if (IS_ERR(options))
  3768. goto out_data;
  3769. ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
  3770. kfree(options);
  3771. out_data:
  3772. kfree(kernel_dev);
  3773. out_dev:
  3774. kfree(kernel_type);
  3775. out_type:
  3776. return ret;
  3777. }
/* MOUNT_ATTR_* bits that fsmount() accepts in its attr_flags argument. */
#define FSMOUNT_VALID_FLAGS                                                    \
	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |            \
	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME |       \
	 MOUNT_ATTR_NOSYMFOLLOW)

/*
 * Bits that build_mount_kattr() accepts in mount_attr::attr_set and
 * mount_attr::attr_clr: everything fsmount() takes plus MOUNT_ATTR_IDMAP.
 */
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)

/*
 * Propagation types that build_mount_kattr() accepts in
 * mount_attr::propagation; at most one of them may be set per request.
 */
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
  3785. static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
  3786. {
  3787. unsigned int mnt_flags = 0;
  3788. if (attr_flags & MOUNT_ATTR_RDONLY)
  3789. mnt_flags |= MNT_READONLY;
  3790. if (attr_flags & MOUNT_ATTR_NOSUID)
  3791. mnt_flags |= MNT_NOSUID;
  3792. if (attr_flags & MOUNT_ATTR_NODEV)
  3793. mnt_flags |= MNT_NODEV;
  3794. if (attr_flags & MOUNT_ATTR_NOEXEC)
  3795. mnt_flags |= MNT_NOEXEC;
  3796. if (attr_flags & MOUNT_ATTR_NODIRATIME)
  3797. mnt_flags |= MNT_NODIRATIME;
  3798. if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
  3799. mnt_flags |= MNT_NOSYMFOLLOW;
  3800. return mnt_flags;
  3801. }
  3802. /*
  3803. * Create a kernel mount representation for a new, prepared superblock
  3804. * (specified by fs_fd) and attach to an open_tree-like file descriptor.
  3805. */
  3806. SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
  3807. unsigned int, attr_flags)
  3808. {
  3809. struct path new_path __free(path_put) = {};
  3810. struct mnt_namespace *ns;
  3811. struct fs_context *fc;
  3812. struct vfsmount *new_mnt;
  3813. struct mount *mnt;
  3814. unsigned int mnt_flags = 0;
  3815. long ret;
  3816. if (!may_mount())
  3817. return -EPERM;
  3818. if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
  3819. return -EINVAL;
  3820. if (attr_flags & ~FSMOUNT_VALID_FLAGS)
  3821. return -EINVAL;
  3822. mnt_flags = attr_flags_to_mnt_flags(attr_flags);
  3823. switch (attr_flags & MOUNT_ATTR__ATIME) {
  3824. case MOUNT_ATTR_STRICTATIME:
  3825. break;
  3826. case MOUNT_ATTR_NOATIME:
  3827. mnt_flags |= MNT_NOATIME;
  3828. break;
  3829. case MOUNT_ATTR_RELATIME:
  3830. mnt_flags |= MNT_RELATIME;
  3831. break;
  3832. default:
  3833. return -EINVAL;
  3834. }
  3835. CLASS(fd, f)(fs_fd);
  3836. if (fd_empty(f))
  3837. return -EBADF;
  3838. if (fd_file(f)->f_op != &fscontext_fops)
  3839. return -EINVAL;
  3840. fc = fd_file(f)->private_data;
  3841. ACQUIRE(mutex_intr, uapi_mutex)(&fc->uapi_mutex);
  3842. ret = ACQUIRE_ERR(mutex_intr, &uapi_mutex);
  3843. if (ret)
  3844. return ret;
  3845. /* There must be a valid superblock or we can't mount it */
  3846. ret = -EINVAL;
  3847. if (!fc->root)
  3848. return ret;
  3849. ret = -EPERM;
  3850. if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
  3851. errorfcp(fc, "VFS", "Mount too revealing");
  3852. return ret;
  3853. }
  3854. ret = -EBUSY;
  3855. if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
  3856. return ret;
  3857. if (fc->sb_flags & SB_MANDLOCK)
  3858. warn_mandlock();
  3859. new_mnt = vfs_create_mount(fc);
  3860. if (IS_ERR(new_mnt))
  3861. return PTR_ERR(new_mnt);
  3862. new_mnt->mnt_flags = mnt_flags;
  3863. new_path.dentry = dget(fc->root);
  3864. new_path.mnt = new_mnt;
  3865. /* We've done the mount bit - now move the file context into more or
  3866. * less the same state as if we'd done an fspick(). We don't want to
  3867. * do any memory allocation or anything like that at this point as we
  3868. * don't want to have to handle any errors incurred.
  3869. */
  3870. vfs_clean_context(fc);
  3871. ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
  3872. if (IS_ERR(ns))
  3873. return PTR_ERR(ns);
  3874. mnt = real_mount(new_path.mnt);
  3875. ns->root = mnt;
  3876. ns->nr_mounts = 1;
  3877. mnt_add_to_ns(ns, mnt);
  3878. mntget(new_path.mnt);
  3879. FD_PREPARE(fdf, (flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
  3880. dentry_open(&new_path, O_PATH, fc->cred));
  3881. if (fdf.err) {
  3882. dissolve_on_fput(new_path.mnt);
  3883. return fdf.err;
  3884. }
  3885. /*
  3886. * Attach to an apparent O_PATH fd with a note that we
  3887. * need to unmount it, not just simply put it.
  3888. */
  3889. fd_prepare_file(fdf)->f_mode |= FMODE_NEED_UNMOUNT;
  3890. return fd_publish(fdf);
  3891. }
  3892. static inline int vfs_move_mount(const struct path *from_path,
  3893. const struct path *to_path,
  3894. enum mnt_tree_flags_t mflags)
  3895. {
  3896. int ret;
  3897. ret = security_move_mount(from_path, to_path);
  3898. if (ret)
  3899. return ret;
  3900. if (mflags & MNT_TREE_PROPAGATION)
  3901. return do_set_group(from_path, to_path);
  3902. return do_move_mount(from_path, to_path, mflags);
  3903. }
  3904. /*
  3905. * Move a mount from one place to another. In combination with
  3906. * fsopen()/fsmount() this is used to install a new mount and in combination
  3907. * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
  3908. * a mount subtree.
  3909. *
  3910. * Note the flags value is a combination of MOVE_MOUNT_* flags.
  3911. */
  3912. SYSCALL_DEFINE5(move_mount,
  3913. int, from_dfd, const char __user *, from_pathname,
  3914. int, to_dfd, const char __user *, to_pathname,
  3915. unsigned int, flags)
  3916. {
  3917. struct path to_path __free(path_put) = {};
  3918. struct path from_path __free(path_put) = {};
  3919. unsigned int lflags, uflags;
  3920. enum mnt_tree_flags_t mflags = 0;
  3921. int ret = 0;
  3922. if (!may_mount())
  3923. return -EPERM;
  3924. if (flags & ~MOVE_MOUNT__MASK)
  3925. return -EINVAL;
  3926. if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) ==
  3927. (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
  3928. return -EINVAL;
  3929. if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION;
  3930. if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH;
  3931. uflags = 0;
  3932. if (flags & MOVE_MOUNT_T_EMPTY_PATH)
  3933. uflags = AT_EMPTY_PATH;
  3934. CLASS(filename_maybe_null,to_name)(to_pathname, uflags);
  3935. if (!to_name && to_dfd >= 0) {
  3936. CLASS(fd_raw, f_to)(to_dfd);
  3937. if (fd_empty(f_to))
  3938. return -EBADF;
  3939. to_path = fd_file(f_to)->f_path;
  3940. path_get(&to_path);
  3941. } else {
  3942. lflags = 0;
  3943. if (flags & MOVE_MOUNT_T_SYMLINKS)
  3944. lflags |= LOOKUP_FOLLOW;
  3945. if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
  3946. lflags |= LOOKUP_AUTOMOUNT;
  3947. ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL);
  3948. if (ret)
  3949. return ret;
  3950. }
  3951. uflags = 0;
  3952. if (flags & MOVE_MOUNT_F_EMPTY_PATH)
  3953. uflags = AT_EMPTY_PATH;
  3954. CLASS(filename_maybe_null,from_name)(from_pathname, uflags);
  3955. if (!from_name && from_dfd >= 0) {
  3956. CLASS(fd_raw, f_from)(from_dfd);
  3957. if (fd_empty(f_from))
  3958. return -EBADF;
  3959. return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
  3960. }
  3961. lflags = 0;
  3962. if (flags & MOVE_MOUNT_F_SYMLINKS)
  3963. lflags |= LOOKUP_FOLLOW;
  3964. if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
  3965. lflags |= LOOKUP_AUTOMOUNT;
  3966. ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL);
  3967. if (ret)
  3968. return ret;
  3969. return vfs_move_mount(&from_path, &to_path, mflags);
  3970. }
  3971. /*
  3972. * Return true if path is reachable from root
  3973. *
  3974. * locks: mount_locked_reader || namespace_shared && is_mounted(mnt)
  3975. */
  3976. bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
  3977. const struct path *root)
  3978. {
  3979. while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
  3980. dentry = mnt->mnt_mountpoint;
  3981. mnt = mnt->mnt_parent;
  3982. }
  3983. return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
  3984. }
  3985. bool path_is_under(const struct path *path1, const struct path *path2)
  3986. {
  3987. guard(mount_locked_reader)();
  3988. return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
  3989. }
  3990. EXPORT_SYMBOL(path_is_under);
  3991. int path_pivot_root(struct path *new, struct path *old)
  3992. {
  3993. struct path root __free(path_put) = {};
  3994. struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
  3995. int error;
  3996. if (!may_mount())
  3997. return -EPERM;
  3998. error = security_sb_pivotroot(old, new);
  3999. if (error)
  4000. return error;
  4001. get_fs_root(current->fs, &root);
  4002. LOCK_MOUNT(old_mp, old);
  4003. old_mnt = old_mp.parent;
  4004. if (IS_ERR(old_mnt))
  4005. return PTR_ERR(old_mnt);
  4006. new_mnt = real_mount(new->mnt);
  4007. root_mnt = real_mount(root.mnt);
  4008. ex_parent = new_mnt->mnt_parent;
  4009. root_parent = root_mnt->mnt_parent;
  4010. if (IS_MNT_SHARED(old_mnt) ||
  4011. IS_MNT_SHARED(ex_parent) ||
  4012. IS_MNT_SHARED(root_parent))
  4013. return -EINVAL;
  4014. if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
  4015. return -EINVAL;
  4016. if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
  4017. return -EINVAL;
  4018. if (d_unlinked(new->dentry))
  4019. return -ENOENT;
  4020. if (new_mnt == root_mnt || old_mnt == root_mnt)
  4021. return -EBUSY; /* loop, on the same file system */
  4022. if (!path_mounted(&root))
  4023. return -EINVAL; /* not a mountpoint */
  4024. if (!mnt_has_parent(root_mnt))
  4025. return -EINVAL; /* absolute root */
  4026. if (!path_mounted(new))
  4027. return -EINVAL; /* not a mountpoint */
  4028. if (!mnt_has_parent(new_mnt))
  4029. return -EINVAL; /* absolute root */
  4030. /* make sure we can reach put_old from new_root */
  4031. if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new))
  4032. return -EINVAL;
  4033. /* make certain new is below the root */
  4034. if (!is_path_reachable(new_mnt, new->dentry, &root))
  4035. return -EINVAL;
  4036. lock_mount_hash();
  4037. umount_mnt(new_mnt);
  4038. if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
  4039. new_mnt->mnt.mnt_flags |= MNT_LOCKED;
  4040. root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
  4041. }
  4042. /* mount new_root on / */
  4043. attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp);
  4044. umount_mnt(root_mnt);
  4045. /* mount old root on put_old */
  4046. attach_mnt(root_mnt, old_mnt, old_mp.mp);
  4047. touch_mnt_namespace(current->nsproxy->mnt_ns);
  4048. /* A moved mount should not expire automatically */
  4049. list_del_init(&new_mnt->mnt_expire);
  4050. unlock_mount_hash();
  4051. mnt_notify_add(root_mnt);
  4052. mnt_notify_add(new_mnt);
  4053. chroot_fs_refs(&root, new);
  4054. return 0;
  4055. }
  4056. /*
  4057. * pivot_root Semantics:
  4058. * Moves the root file system of the current process to the directory put_old,
  4059. * makes new_root as the new root file system of the current process, and sets
  4060. * root/cwd of all processes which had them on the current root to new_root.
  4061. *
  4062. * Restrictions:
  4063. * The new_root and put_old must be directories, and must not be on the
  4064. * same file system as the current process root. The put_old must be
  4065. * underneath new_root, i.e. adding a non-zero number of /.. to the string
  4066. * pointed to by put_old must yield the same directory as new_root. No other
  4067. * file system may be mounted on put_old. After all, new_root is a mountpoint.
  4068. *
  4069. * The immutable nullfs filesystem is mounted as the true root of the VFS
  4070. * hierarchy. The mutable rootfs (tmpfs/ramfs) is layered on top of this,
  4071. * allowing pivot_root() to work normally from initramfs.
  4072. *
  4073. * Notes:
  4074. * - we don't move root/cwd if they are not at the root (reason: if something
  4075. * cared enough to change them, it's probably wrong to force them elsewhere)
  4076. * - it's okay to pick a root that isn't the root of a file system, e.g.
  4077. * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
  4078. * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
  4079. * first.
  4080. */
  4081. SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
  4082. const char __user *, put_old)
  4083. {
  4084. struct path new __free(path_put) = {};
  4085. struct path old __free(path_put) = {};
  4086. int error;
  4087. error = user_path_at(AT_FDCWD, new_root,
  4088. LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
  4089. if (error)
  4090. return error;
  4091. error = user_path_at(AT_FDCWD, put_old,
  4092. LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
  4093. if (error)
  4094. return error;
  4095. return path_pivot_root(&new, &old);
  4096. }
  4097. static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
  4098. {
  4099. unsigned int flags = mnt->mnt.mnt_flags;
  4100. /* flags to clear */
  4101. flags &= ~kattr->attr_clr;
  4102. /* flags to raise */
  4103. flags |= kattr->attr_set;
  4104. return flags;
  4105. }
  4106. static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
  4107. {
  4108. struct vfsmount *m = &mnt->mnt;
  4109. struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
  4110. if (!kattr->mnt_idmap)
  4111. return 0;
  4112. /*
  4113. * Creating an idmapped mount with the filesystem wide idmapping
  4114. * doesn't make sense so block that. We don't allow mushy semantics.
  4115. */
  4116. if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
  4117. return -EINVAL;
  4118. /*
  4119. * We only allow an mount to change it's idmapping if it has
  4120. * never been accessible to userspace.
  4121. */
  4122. if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m))
  4123. return -EPERM;
  4124. /* The underlying filesystem doesn't support idmapped mounts yet. */
  4125. if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
  4126. return -EINVAL;
  4127. /* The filesystem has turned off idmapped mounts. */
  4128. if (m->mnt_sb->s_iflags & SB_I_NOIDMAP)
  4129. return -EINVAL;
  4130. /* We're not controlling the superblock. */
  4131. if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
  4132. return -EPERM;
  4133. /* Mount has already been visible in the filesystem hierarchy. */
  4134. if (!is_anon_ns(mnt->mnt_ns))
  4135. return -EINVAL;
  4136. return 0;
  4137. }
  4138. /**
  4139. * mnt_allow_writers() - check whether the attribute change allows writers
  4140. * @kattr: the new mount attributes
  4141. * @mnt: the mount to which @kattr will be applied
  4142. *
  4143. * Check whether thew new mount attributes in @kattr allow concurrent writers.
  4144. *
  4145. * Return: true if writers need to be held, false if not
  4146. */
  4147. static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
  4148. const struct mount *mnt)
  4149. {
  4150. return (!(kattr->attr_set & MNT_READONLY) ||
  4151. (mnt->mnt.mnt_flags & MNT_READONLY)) &&
  4152. !kattr->mnt_idmap;
  4153. }
  4154. static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
  4155. {
  4156. struct mount *m;
  4157. int err;
  4158. for (m = mnt; m; m = next_mnt(m, mnt)) {
  4159. if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
  4160. err = -EPERM;
  4161. break;
  4162. }
  4163. err = can_idmap_mount(kattr, m);
  4164. if (err)
  4165. break;
  4166. if (!mnt_allow_writers(kattr, m)) {
  4167. err = mnt_hold_writers(m);
  4168. if (err) {
  4169. m = next_mnt(m, mnt);
  4170. break;
  4171. }
  4172. }
  4173. if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
  4174. return 0;
  4175. }
  4176. if (err) {
  4177. /* undo all mnt_hold_writers() we'd done */
  4178. for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt))
  4179. mnt_unhold_writers(p);
  4180. }
  4181. return err;
  4182. }
  4183. static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
  4184. {
  4185. struct mnt_idmap *old_idmap;
  4186. if (!kattr->mnt_idmap)
  4187. return;
  4188. old_idmap = mnt_idmap(&mnt->mnt);
  4189. /* Pairs with smp_load_acquire() in mnt_idmap(). */
  4190. smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
  4191. mnt_idmap_put(old_idmap);
  4192. }
  4193. static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
  4194. {
  4195. struct mount *m;
  4196. for (m = mnt; m; m = next_mnt(m, mnt)) {
  4197. unsigned int flags;
  4198. do_idmap_mount(kattr, m);
  4199. flags = recalc_flags(kattr, m);
  4200. WRITE_ONCE(m->mnt.mnt_flags, flags);
  4201. /* If we had to hold writers unblock them. */
  4202. mnt_unhold_writers(m);
  4203. if (kattr->propagation)
  4204. change_mnt_propagation(m, kattr->propagation);
  4205. if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
  4206. break;
  4207. }
  4208. touch_mnt_namespace(mnt->mnt_ns);
  4209. }
  4210. static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr)
  4211. {
  4212. struct mount *mnt = real_mount(path->mnt);
  4213. int err = 0;
  4214. if (!path_mounted(path))
  4215. return -EINVAL;
  4216. if (kattr->mnt_userns) {
  4217. struct mnt_idmap *mnt_idmap;
  4218. mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
  4219. if (IS_ERR(mnt_idmap))
  4220. return PTR_ERR(mnt_idmap);
  4221. kattr->mnt_idmap = mnt_idmap;
  4222. }
  4223. if (kattr->propagation) {
  4224. /*
  4225. * Only take namespace_lock() if we're actually changing
  4226. * propagation.
  4227. */
  4228. namespace_lock();
  4229. if (kattr->propagation == MS_SHARED) {
  4230. err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE);
  4231. if (err) {
  4232. namespace_unlock();
  4233. return err;
  4234. }
  4235. }
  4236. }
  4237. err = -EINVAL;
  4238. lock_mount_hash();
  4239. if (!anon_ns_root(mnt) && !check_mnt(mnt))
  4240. goto out;
  4241. /*
  4242. * First, we get the mount tree in a shape where we can change mount
  4243. * properties without failure. If we succeeded to do so we commit all
  4244. * changes and if we failed we clean up.
  4245. */
  4246. err = mount_setattr_prepare(kattr, mnt);
  4247. if (!err)
  4248. mount_setattr_commit(kattr, mnt);
  4249. out:
  4250. unlock_mount_hash();
  4251. if (kattr->propagation) {
  4252. if (err)
  4253. cleanup_group_ids(mnt, NULL);
  4254. namespace_unlock();
  4255. }
  4256. return err;
  4257. }
  4258. static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
  4259. struct mount_kattr *kattr)
  4260. {
  4261. struct ns_common *ns;
  4262. struct user_namespace *mnt_userns;
  4263. if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
  4264. return 0;
  4265. if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
  4266. /*
  4267. * We can only remove an idmapping if it's never been
  4268. * exposed to userspace.
  4269. */
  4270. if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
  4271. return -EINVAL;
  4272. /*
  4273. * Removal of idmappings is equivalent to setting
  4274. * nop_mnt_idmap.
  4275. */
  4276. if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
  4277. kattr->mnt_idmap = &nop_mnt_idmap;
  4278. return 0;
  4279. }
  4280. }
  4281. if (attr->userns_fd > INT_MAX)
  4282. return -EINVAL;
  4283. CLASS(fd, f)(attr->userns_fd);
  4284. if (fd_empty(f))
  4285. return -EBADF;
  4286. if (!proc_ns_file(fd_file(f)))
  4287. return -EINVAL;
  4288. ns = get_proc_ns(file_inode(fd_file(f)));
  4289. if (ns->ns_type != CLONE_NEWUSER)
  4290. return -EINVAL;
  4291. /*
  4292. * The initial idmapping cannot be used to create an idmapped
  4293. * mount. We use the initial idmapping as an indicator of a mount
  4294. * that is not idmapped. It can simply be passed into helpers that
  4295. * are aware of idmapped mounts as a convenient shortcut. A user
  4296. * can just create a dedicated identity mapping to achieve the same
  4297. * result.
  4298. */
  4299. mnt_userns = container_of(ns, struct user_namespace, ns);
  4300. if (mnt_userns == &init_user_ns)
  4301. return -EPERM;
  4302. /* We're not controlling the target namespace. */
  4303. if (!ns_capable(mnt_userns, CAP_SYS_ADMIN))
  4304. return -EPERM;
  4305. kattr->mnt_userns = get_user_ns(mnt_userns);
  4306. return 0;
  4307. }
  4308. static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
  4309. struct mount_kattr *kattr)
  4310. {
  4311. if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
  4312. return -EINVAL;
  4313. if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
  4314. return -EINVAL;
  4315. kattr->propagation = attr->propagation;
  4316. if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
  4317. return -EINVAL;
  4318. kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
  4319. kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
  4320. /*
  4321. * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
  4322. * users wanting to transition to a different atime setting cannot
  4323. * simply specify the atime setting in @attr_set, but must also
  4324. * specify MOUNT_ATTR__ATIME in the @attr_clr field.
  4325. * So ensure that MOUNT_ATTR__ATIME can't be partially set in
  4326. * @attr_clr and that @attr_set can't have any atime bits set if
  4327. * MOUNT_ATTR__ATIME isn't set in @attr_clr.
  4328. */
  4329. if (attr->attr_clr & MOUNT_ATTR__ATIME) {
  4330. if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
  4331. return -EINVAL;
  4332. /*
  4333. * Clear all previous time settings as they are mutually
  4334. * exclusive.
  4335. */
  4336. kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
  4337. switch (attr->attr_set & MOUNT_ATTR__ATIME) {
  4338. case MOUNT_ATTR_RELATIME:
  4339. kattr->attr_set |= MNT_RELATIME;
  4340. break;
  4341. case MOUNT_ATTR_NOATIME:
  4342. kattr->attr_set |= MNT_NOATIME;
  4343. break;
  4344. case MOUNT_ATTR_STRICTATIME:
  4345. break;
  4346. default:
  4347. return -EINVAL;
  4348. }
  4349. } else {
  4350. if (attr->attr_set & MOUNT_ATTR__ATIME)
  4351. return -EINVAL;
  4352. }
  4353. return build_mount_idmapped(attr, usize, kattr);
  4354. }
  4355. static void finish_mount_kattr(struct mount_kattr *kattr)
  4356. {
  4357. if (kattr->mnt_userns) {
  4358. put_user_ns(kattr->mnt_userns);
  4359. kattr->mnt_userns = NULL;
  4360. }
  4361. if (kattr->mnt_idmap)
  4362. mnt_idmap_put(kattr->mnt_idmap);
  4363. }
  4364. static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize,
  4365. struct mount_kattr *kattr)
  4366. {
  4367. int ret;
  4368. struct mount_attr attr;
  4369. BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
  4370. if (unlikely(usize > PAGE_SIZE))
  4371. return -E2BIG;
  4372. if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
  4373. return -EINVAL;
  4374. if (!may_mount())
  4375. return -EPERM;
  4376. ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
  4377. if (ret)
  4378. return ret;
  4379. /* Don't bother walking through the mounts if this is a nop. */
  4380. if (attr.attr_set == 0 &&
  4381. attr.attr_clr == 0 &&
  4382. attr.propagation == 0)
  4383. return 0; /* Tell caller to not bother. */
  4384. ret = build_mount_kattr(&attr, usize, kattr);
  4385. if (ret < 0)
  4386. return ret;
  4387. return 1;
  4388. }
  4389. SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
  4390. unsigned int, flags, struct mount_attr __user *, uattr,
  4391. size_t, usize)
  4392. {
  4393. int err;
  4394. struct path target;
  4395. struct mount_kattr kattr;
  4396. unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
  4397. if (flags & ~(AT_EMPTY_PATH |
  4398. AT_RECURSIVE |
  4399. AT_SYMLINK_NOFOLLOW |
  4400. AT_NO_AUTOMOUNT))
  4401. return -EINVAL;
  4402. if (flags & AT_NO_AUTOMOUNT)
  4403. lookup_flags &= ~LOOKUP_AUTOMOUNT;
  4404. if (flags & AT_SYMLINK_NOFOLLOW)
  4405. lookup_flags &= ~LOOKUP_FOLLOW;
  4406. kattr = (struct mount_kattr) {
  4407. .lookup_flags = lookup_flags,
  4408. };
  4409. if (flags & AT_RECURSIVE)
  4410. kattr.kflags |= MOUNT_KATTR_RECURSE;
  4411. err = wants_mount_setattr(uattr, usize, &kattr);
  4412. if (err <= 0)
  4413. return err;
  4414. CLASS(filename_uflags, name)(path, flags);
  4415. err = filename_lookup(dfd, name, kattr.lookup_flags, &target, NULL);
  4416. if (!err) {
  4417. err = do_mount_setattr(&target, &kattr);
  4418. path_put(&target);
  4419. }
  4420. finish_mount_kattr(&kattr);
  4421. return err;
  4422. }
  4423. SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
  4424. unsigned, flags, struct mount_attr __user *, uattr,
  4425. size_t, usize)
  4426. {
  4427. if (!uattr && usize)
  4428. return -EINVAL;
  4429. FD_PREPARE(fdf, flags, vfs_open_tree(dfd, filename, flags));
  4430. if (fdf.err)
  4431. return fdf.err;
  4432. if (uattr) {
  4433. struct mount_kattr kattr = {};
  4434. struct file *file = fd_prepare_file(fdf);
  4435. int ret;
  4436. if (flags & OPEN_TREE_CLONE)
  4437. kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
  4438. if (flags & AT_RECURSIVE)
  4439. kattr.kflags |= MOUNT_KATTR_RECURSE;
  4440. ret = wants_mount_setattr(uattr, usize, &kattr);
  4441. if (ret > 0) {
  4442. ret = do_mount_setattr(&file->f_path, &kattr);
  4443. finish_mount_kattr(&kattr);
  4444. }
  4445. if (ret)
  4446. return ret;
  4447. }
  4448. return fd_publish(fdf);
  4449. }
  4450. int show_path(struct seq_file *m, struct dentry *root)
  4451. {
  4452. if (root->d_sb->s_op->show_path)
  4453. return root->d_sb->s_op->show_path(m, root);
  4454. seq_dentry(m, root, " \t\n\\");
  4455. return 0;
  4456. }
  4457. static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
  4458. {
  4459. struct mount *mnt = mnt_find_id_at(ns, id);
  4460. if (!mnt || mnt->mnt_id_unique != id)
  4461. return NULL;
  4462. return &mnt->mnt;
  4463. }
/*
 * Working state shared by the statmount_*() helpers while one reply is
 * being assembled.
 */
struct kstatmount {
	struct statmount __user *buf;	/* presumably the user buffer for the reply - confirm against caller */
	size_t bufsize;			/* presumably the size of @buf - confirm against caller */
	struct vfsmount *mnt;		/* the mount being described */
	struct mnt_idmap *idmap;
	u64 mask;			/* NOTE(review): looks like the requested STATMOUNT_* bits - confirm */
	struct path root;		/* root passed to seq_path_root() for the mount point */
	struct seq_file seq;		/* string fields are generated into this seq_file */
	/* Must be last --ends in a flexible-array member. */
	struct statmount sm;		/* reply under construction; sm.mask records filled-in fields */
};
  4475. static u64 mnt_to_attr_flags(struct vfsmount *mnt)
  4476. {
  4477. unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
  4478. u64 attr_flags = 0;
  4479. if (mnt_flags & MNT_READONLY)
  4480. attr_flags |= MOUNT_ATTR_RDONLY;
  4481. if (mnt_flags & MNT_NOSUID)
  4482. attr_flags |= MOUNT_ATTR_NOSUID;
  4483. if (mnt_flags & MNT_NODEV)
  4484. attr_flags |= MOUNT_ATTR_NODEV;
  4485. if (mnt_flags & MNT_NOEXEC)
  4486. attr_flags |= MOUNT_ATTR_NOEXEC;
  4487. if (mnt_flags & MNT_NODIRATIME)
  4488. attr_flags |= MOUNT_ATTR_NODIRATIME;
  4489. if (mnt_flags & MNT_NOSYMFOLLOW)
  4490. attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
  4491. if (mnt_flags & MNT_NOATIME)
  4492. attr_flags |= MOUNT_ATTR_NOATIME;
  4493. else if (mnt_flags & MNT_RELATIME)
  4494. attr_flags |= MOUNT_ATTR_RELATIME;
  4495. else
  4496. attr_flags |= MOUNT_ATTR_STRICTATIME;
  4497. if (is_idmapped_mnt(mnt))
  4498. attr_flags |= MOUNT_ATTR_IDMAP;
  4499. return attr_flags;
  4500. }
  4501. static u64 mnt_to_propagation_flags(struct mount *m)
  4502. {
  4503. u64 propagation = 0;
  4504. if (IS_MNT_SHARED(m))
  4505. propagation |= MS_SHARED;
  4506. if (IS_MNT_SLAVE(m))
  4507. propagation |= MS_SLAVE;
  4508. if (IS_MNT_UNBINDABLE(m))
  4509. propagation |= MS_UNBINDABLE;
  4510. if (!propagation)
  4511. propagation |= MS_PRIVATE;
  4512. return propagation;
  4513. }
  4514. u64 vfsmount_to_propagation_flags(struct vfsmount *mnt)
  4515. {
  4516. return mnt_to_propagation_flags(real_mount(mnt));
  4517. }
  4518. EXPORT_SYMBOL_GPL(vfsmount_to_propagation_flags);
  4519. static void statmount_sb_basic(struct kstatmount *s)
  4520. {
  4521. struct super_block *sb = s->mnt->mnt_sb;
  4522. s->sm.mask |= STATMOUNT_SB_BASIC;
  4523. s->sm.sb_dev_major = MAJOR(sb->s_dev);
  4524. s->sm.sb_dev_minor = MINOR(sb->s_dev);
  4525. s->sm.sb_magic = sb->s_magic;
  4526. s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
  4527. }
  4528. static void statmount_mnt_basic(struct kstatmount *s)
  4529. {
  4530. struct mount *m = real_mount(s->mnt);
  4531. s->sm.mask |= STATMOUNT_MNT_BASIC;
  4532. s->sm.mnt_id = m->mnt_id_unique;
  4533. s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
  4534. s->sm.mnt_id_old = m->mnt_id;
  4535. s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
  4536. s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
  4537. s->sm.mnt_propagation = mnt_to_propagation_flags(m);
  4538. s->sm.mnt_peer_group = m->mnt_group_id;
  4539. s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
  4540. }
  4541. static void statmount_propagate_from(struct kstatmount *s)
  4542. {
  4543. struct mount *m = real_mount(s->mnt);
  4544. s->sm.mask |= STATMOUNT_PROPAGATE_FROM;
  4545. if (IS_MNT_SLAVE(m))
  4546. s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
  4547. }
  4548. static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq)
  4549. {
  4550. int ret;
  4551. size_t start = seq->count;
  4552. ret = show_path(seq, s->mnt->mnt_root);
  4553. if (ret)
  4554. return ret;
  4555. if (unlikely(seq_has_overflowed(seq)))
  4556. return -EAGAIN;
  4557. /*
  4558. * Unescape the result. It would be better if supplied string was not
  4559. * escaped in the first place, but that's a pretty invasive change.
  4560. */
  4561. seq->buf[seq->count] = '\0';
  4562. seq->count = start;
  4563. seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
  4564. return 0;
  4565. }
  4566. static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq)
  4567. {
  4568. struct vfsmount *mnt = s->mnt;
  4569. struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
  4570. int err;
  4571. err = seq_path_root(seq, &mnt_path, &s->root, "");
  4572. return err == SEQ_SKIP ? 0 : err;
  4573. }
  4574. static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
  4575. {
  4576. struct super_block *sb = s->mnt->mnt_sb;
  4577. seq_puts(seq, sb->s_type->name);
  4578. return 0;
  4579. }
  4580. static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq)
  4581. {
  4582. struct super_block *sb = s->mnt->mnt_sb;
  4583. if (sb->s_subtype)
  4584. seq_puts(seq, sb->s_subtype);
  4585. }
  4586. static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
  4587. {
  4588. struct super_block *sb = s->mnt->mnt_sb;
  4589. struct mount *r = real_mount(s->mnt);
  4590. if (sb->s_op->show_devname) {
  4591. size_t start = seq->count;
  4592. int ret;
  4593. ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
  4594. if (ret)
  4595. return ret;
  4596. if (unlikely(seq_has_overflowed(seq)))
  4597. return -EAGAIN;
  4598. /* Unescape the result */
  4599. seq->buf[seq->count] = '\0';
  4600. seq->count = start;
  4601. seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
  4602. } else {
  4603. seq_puts(seq, r->mnt_devname);
  4604. }
  4605. return 0;
  4606. }
  4607. static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
  4608. {
  4609. s->sm.mask |= STATMOUNT_MNT_NS_ID;
  4610. s->sm.mnt_ns_id = ns->ns.ns_id;
  4611. }
  4612. static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
  4613. {
  4614. struct vfsmount *mnt = s->mnt;
  4615. struct super_block *sb = mnt->mnt_sb;
  4616. size_t start = seq->count;
  4617. int err;
  4618. err = security_sb_show_options(seq, sb);
  4619. if (err)
  4620. return err;
  4621. if (sb->s_op->show_options) {
  4622. err = sb->s_op->show_options(seq, mnt->mnt_root);
  4623. if (err)
  4624. return err;
  4625. }
  4626. if (unlikely(seq_has_overflowed(seq)))
  4627. return -EAGAIN;
  4628. if (seq->count == start)
  4629. return 0;
  4630. /* skip leading comma */
  4631. memmove(seq->buf + start, seq->buf + start + 1,
  4632. seq->count - start - 1);
  4633. seq->count--;
  4634. return 0;
  4635. }
  4636. static inline int statmount_opt_process(struct seq_file *seq, size_t start)
  4637. {
  4638. char *buf_end, *opt_end, *src, *dst;
  4639. int count = 0;
  4640. if (unlikely(seq_has_overflowed(seq)))
  4641. return -EAGAIN;
  4642. buf_end = seq->buf + seq->count;
  4643. dst = seq->buf + start;
  4644. src = dst + 1; /* skip initial comma */
  4645. if (src >= buf_end) {
  4646. seq->count = start;
  4647. return 0;
  4648. }
  4649. *buf_end = '\0';
  4650. for (; src < buf_end; src = opt_end + 1) {
  4651. opt_end = strchrnul(src, ',');
  4652. *opt_end = '\0';
  4653. dst += string_unescape(src, dst, 0, UNESCAPE_OCTAL) + 1;
  4654. if (WARN_ON_ONCE(++count == INT_MAX))
  4655. return -EOVERFLOW;
  4656. }
  4657. seq->count = dst - 1 - seq->buf;
  4658. return count;
  4659. }
  4660. static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
  4661. {
  4662. struct vfsmount *mnt = s->mnt;
  4663. struct super_block *sb = mnt->mnt_sb;
  4664. size_t start = seq->count;
  4665. int err;
  4666. if (!sb->s_op->show_options)
  4667. return 0;
  4668. err = sb->s_op->show_options(seq, mnt->mnt_root);
  4669. if (err)
  4670. return err;
  4671. err = statmount_opt_process(seq, start);
  4672. if (err < 0)
  4673. return err;
  4674. s->sm.opt_num = err;
  4675. return 0;
  4676. }
  4677. static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
  4678. {
  4679. struct vfsmount *mnt = s->mnt;
  4680. struct super_block *sb = mnt->mnt_sb;
  4681. size_t start = seq->count;
  4682. int err;
  4683. err = security_sb_show_options(seq, sb);
  4684. if (err)
  4685. return err;
  4686. err = statmount_opt_process(seq, start);
  4687. if (err < 0)
  4688. return err;
  4689. s->sm.opt_sec_num = err;
  4690. return 0;
  4691. }
  4692. static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq)
  4693. {
  4694. int ret;
  4695. ret = statmount_mnt_idmap(s->idmap, seq, true);
  4696. if (ret < 0)
  4697. return ret;
  4698. s->sm.mnt_uidmap_num = ret;
  4699. /*
  4700. * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid
  4701. * mappings. This allows userspace to distinguish between a
  4702. * non-idmapped mount and an idmapped mount where none of the
  4703. * individual mappings are valid in the caller's idmapping.
  4704. */
  4705. if (is_valid_mnt_idmap(s->idmap))
  4706. s->sm.mask |= STATMOUNT_MNT_UIDMAP;
  4707. return 0;
  4708. }
  4709. static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq)
  4710. {
  4711. int ret;
  4712. ret = statmount_mnt_idmap(s->idmap, seq, false);
  4713. if (ret < 0)
  4714. return ret;
  4715. s->sm.mnt_gidmap_num = ret;
  4716. /*
  4717. * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid
  4718. * mappings. This allows userspace to distinguish between a
  4719. * non-idmapped mount and an idmapped mount where none of the
  4720. * individual mappings are valid in the caller's idmapping.
  4721. */
  4722. if (is_valid_mnt_idmap(s->idmap))
  4723. s->sm.mask |= STATMOUNT_MNT_GIDMAP;
  4724. return 0;
  4725. }
  4726. static int statmount_string(struct kstatmount *s, u64 flag)
  4727. {
  4728. int ret = 0;
  4729. size_t kbufsize;
  4730. struct seq_file *seq = &s->seq;
  4731. struct statmount *sm = &s->sm;
  4732. u32 start, *offp;
  4733. /* Reserve an empty string at the beginning for any unset offsets */
  4734. if (!seq->count)
  4735. seq_putc(seq, 0);
  4736. start = seq->count;
  4737. switch (flag) {
  4738. case STATMOUNT_FS_TYPE:
  4739. offp = &sm->fs_type;
  4740. ret = statmount_fs_type(s, seq);
  4741. break;
  4742. case STATMOUNT_MNT_ROOT:
  4743. offp = &sm->mnt_root;
  4744. ret = statmount_mnt_root(s, seq);
  4745. break;
  4746. case STATMOUNT_MNT_POINT:
  4747. offp = &sm->mnt_point;
  4748. ret = statmount_mnt_point(s, seq);
  4749. break;
  4750. case STATMOUNT_MNT_OPTS:
  4751. offp = &sm->mnt_opts;
  4752. ret = statmount_mnt_opts(s, seq);
  4753. break;
  4754. case STATMOUNT_OPT_ARRAY:
  4755. offp = &sm->opt_array;
  4756. ret = statmount_opt_array(s, seq);
  4757. break;
  4758. case STATMOUNT_OPT_SEC_ARRAY:
  4759. offp = &sm->opt_sec_array;
  4760. ret = statmount_opt_sec_array(s, seq);
  4761. break;
  4762. case STATMOUNT_FS_SUBTYPE:
  4763. offp = &sm->fs_subtype;
  4764. statmount_fs_subtype(s, seq);
  4765. break;
  4766. case STATMOUNT_SB_SOURCE:
  4767. offp = &sm->sb_source;
  4768. ret = statmount_sb_source(s, seq);
  4769. break;
  4770. case STATMOUNT_MNT_UIDMAP:
  4771. offp = &sm->mnt_uidmap;
  4772. ret = statmount_mnt_uidmap(s, seq);
  4773. break;
  4774. case STATMOUNT_MNT_GIDMAP:
  4775. offp = &sm->mnt_gidmap;
  4776. ret = statmount_mnt_gidmap(s, seq);
  4777. break;
  4778. default:
  4779. WARN_ON_ONCE(true);
  4780. return -EINVAL;
  4781. }
  4782. /*
  4783. * If nothing was emitted, return to avoid setting the flag
  4784. * and terminating the buffer.
  4785. */
  4786. if (seq->count == start)
  4787. return ret;
  4788. if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
  4789. return -EOVERFLOW;
  4790. if (kbufsize >= s->bufsize)
  4791. return -EOVERFLOW;
  4792. /* signal a retry */
  4793. if (unlikely(seq_has_overflowed(seq)))
  4794. return -EAGAIN;
  4795. if (ret)
  4796. return ret;
  4797. seq->buf[seq->count++] = '\0';
  4798. sm->mask |= flag;
  4799. *offp = start;
  4800. return 0;
  4801. }
  4802. static int copy_statmount_to_user(struct kstatmount *s)
  4803. {
  4804. struct statmount *sm = &s->sm;
  4805. struct seq_file *seq = &s->seq;
  4806. char __user *str = ((char __user *)s->buf) + sizeof(*sm);
  4807. size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
  4808. if (seq->count && copy_to_user(str, seq->buf, seq->count))
  4809. return -EFAULT;
  4810. /* Return the number of bytes copied to the buffer */
  4811. sm->size = copysize + seq->count;
  4812. if (copy_to_user(s->buf, sm, copysize))
  4813. return -EFAULT;
  4814. return 0;
  4815. }
  4816. static struct mount *listmnt_next(struct mount *curr, bool reverse)
  4817. {
  4818. struct rb_node *node;
  4819. if (reverse)
  4820. node = rb_prev(&curr->mnt_node);
  4821. else
  4822. node = rb_next(&curr->mnt_node);
  4823. return node_to_mount(node);
  4824. }
  4825. static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
  4826. {
  4827. struct mount *first, *child;
  4828. rwsem_assert_held(&namespace_sem);
  4829. /* We're looking at our own ns, just use get_fs_root. */
  4830. if (ns == current->nsproxy->mnt_ns) {
  4831. get_fs_root(current->fs, root);
  4832. return 0;
  4833. }
  4834. /*
  4835. * We have to find the first mount in our ns and use that, however it
  4836. * may not exist, so handle that properly.
  4837. */
  4838. if (mnt_ns_empty(ns))
  4839. return -ENOENT;
  4840. first = child = ns->root;
  4841. for (;;) {
  4842. child = listmnt_next(child, false);
  4843. if (!child)
  4844. return -ENOENT;
  4845. if (child->mnt_parent == first)
  4846. break;
  4847. }
  4848. root->mnt = mntget(&child->mnt);
  4849. root->dentry = dget(root->mnt->mnt_root);
  4850. return 0;
  4851. }
  4852. /* This must be updated whenever a new flag is added */
  4853. #define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \
  4854. STATMOUNT_MNT_BASIC | \
  4855. STATMOUNT_PROPAGATE_FROM | \
  4856. STATMOUNT_MNT_ROOT | \
  4857. STATMOUNT_MNT_POINT | \
  4858. STATMOUNT_FS_TYPE | \
  4859. STATMOUNT_MNT_NS_ID | \
  4860. STATMOUNT_MNT_OPTS | \
  4861. STATMOUNT_FS_SUBTYPE | \
  4862. STATMOUNT_SB_SOURCE | \
  4863. STATMOUNT_OPT_ARRAY | \
  4864. STATMOUNT_OPT_SEC_ARRAY | \
  4865. STATMOUNT_SUPPORTED_MASK | \
  4866. STATMOUNT_MNT_UIDMAP | \
  4867. STATMOUNT_MNT_GIDMAP)
  4868. /* locks: namespace_shared */
  4869. static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
  4870. struct file *mnt_file, struct mnt_namespace *ns)
  4871. {
  4872. int err;
  4873. if (mnt_file) {
  4874. WARN_ON_ONCE(ns != NULL);
  4875. s->mnt = mnt_file->f_path.mnt;
  4876. ns = real_mount(s->mnt)->mnt_ns;
  4877. if (IS_ERR(ns))
  4878. return PTR_ERR(ns);
  4879. if (!ns)
  4880. /*
  4881. * We can't set mount point and mnt_ns_id since we don't have a
  4882. * ns for the mount. This can happen if the mount is unmounted
  4883. * with MNT_DETACH.
  4884. */
  4885. s->mask &= ~(STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID);
  4886. } else {
  4887. /* Has the namespace already been emptied? */
  4888. if (mnt_ns_id && mnt_ns_empty(ns))
  4889. return -ENOENT;
  4890. s->mnt = lookup_mnt_in_ns(mnt_id, ns);
  4891. if (!s->mnt)
  4892. return -ENOENT;
  4893. }
  4894. if (ns) {
  4895. err = grab_requested_root(ns, &s->root);
  4896. if (err)
  4897. return err;
  4898. if (!mnt_file) {
  4899. struct mount *m;
  4900. /*
  4901. * Don't trigger audit denials. We just want to determine what
  4902. * mounts to show users.
  4903. */
  4904. m = real_mount(s->mnt);
  4905. if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
  4906. !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
  4907. return -EPERM;
  4908. }
  4909. }
  4910. err = security_sb_statfs(s->mnt->mnt_root);
  4911. if (err)
  4912. return err;
  4913. /*
  4914. * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
  4915. * can change concurrently as we only hold the read-side of the
  4916. * namespace semaphore and mount properties may change with only
  4917. * the mount lock held.
  4918. *
  4919. * We could sample the mount lock sequence counter to detect
  4920. * those changes and retry. But it's not worth it. Worst that
  4921. * happens is that the mnt->mnt_idmap pointer is already changed
  4922. * while mnt->mnt_flags isn't or vica versa. So what.
  4923. *
  4924. * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved
  4925. * via READ_ONCE()/WRITE_ONCE() and guard against theoretical
  4926. * torn read/write. That's all we care about right now.
  4927. */
  4928. s->idmap = mnt_idmap(s->mnt);
  4929. if (s->mask & STATMOUNT_MNT_BASIC)
  4930. statmount_mnt_basic(s);
  4931. if (s->mask & STATMOUNT_SB_BASIC)
  4932. statmount_sb_basic(s);
  4933. if (s->mask & STATMOUNT_PROPAGATE_FROM)
  4934. statmount_propagate_from(s);
  4935. if (s->mask & STATMOUNT_FS_TYPE)
  4936. err = statmount_string(s, STATMOUNT_FS_TYPE);
  4937. if (!err && s->mask & STATMOUNT_MNT_ROOT)
  4938. err = statmount_string(s, STATMOUNT_MNT_ROOT);
  4939. if (!err && s->mask & STATMOUNT_MNT_POINT)
  4940. err = statmount_string(s, STATMOUNT_MNT_POINT);
  4941. if (!err && s->mask & STATMOUNT_MNT_OPTS)
  4942. err = statmount_string(s, STATMOUNT_MNT_OPTS);
  4943. if (!err && s->mask & STATMOUNT_OPT_ARRAY)
  4944. err = statmount_string(s, STATMOUNT_OPT_ARRAY);
  4945. if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
  4946. err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
  4947. if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
  4948. err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
  4949. if (!err && s->mask & STATMOUNT_SB_SOURCE)
  4950. err = statmount_string(s, STATMOUNT_SB_SOURCE);
  4951. if (!err && s->mask & STATMOUNT_MNT_UIDMAP)
  4952. err = statmount_string(s, STATMOUNT_MNT_UIDMAP);
  4953. if (!err && s->mask & STATMOUNT_MNT_GIDMAP)
  4954. err = statmount_string(s, STATMOUNT_MNT_GIDMAP);
  4955. if (!err && s->mask & STATMOUNT_MNT_NS_ID)
  4956. statmount_mnt_ns_id(s, ns);
  4957. if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
  4958. s->sm.mask |= STATMOUNT_SUPPORTED_MASK;
  4959. s->sm.supported_mask = STATMOUNT_SUPPORTED;
  4960. }
  4961. if (err)
  4962. return err;
  4963. /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */
  4964. WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
  4965. return 0;
  4966. }
  4967. static inline bool retry_statmount(const long ret, size_t *seq_size)
  4968. {
  4969. if (likely(ret != -EAGAIN))
  4970. return false;
  4971. if (unlikely(check_mul_overflow(*seq_size, 2, seq_size)))
  4972. return false;
  4973. if (unlikely(*seq_size > MAX_RW_COUNT))
  4974. return false;
  4975. return true;
  4976. }
  4977. #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
  4978. STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
  4979. STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
  4980. STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \
  4981. STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP)
  4982. static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
  4983. struct statmount __user *buf, size_t bufsize,
  4984. size_t seq_size)
  4985. {
  4986. if (!access_ok(buf, bufsize))
  4987. return -EFAULT;
  4988. memset(ks, 0, sizeof(*ks));
  4989. ks->mask = kreq->param;
  4990. ks->buf = buf;
  4991. ks->bufsize = bufsize;
  4992. if (ks->mask & STATMOUNT_STRING_REQ) {
  4993. if (bufsize == sizeof(ks->sm))
  4994. return -EOVERFLOW;
  4995. ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
  4996. if (!ks->seq.buf)
  4997. return -ENOMEM;
  4998. ks->seq.size = seq_size;
  4999. }
  5000. return 0;
  5001. }
  5002. static int copy_mnt_id_req(const struct mnt_id_req __user *req,
  5003. struct mnt_id_req *kreq, unsigned int flags)
  5004. {
  5005. int ret;
  5006. size_t usize;
  5007. BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
  5008. ret = get_user(usize, &req->size);
  5009. if (ret)
  5010. return -EFAULT;
  5011. if (unlikely(usize > PAGE_SIZE))
  5012. return -E2BIG;
  5013. if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
  5014. return -EINVAL;
  5015. memset(kreq, 0, sizeof(*kreq));
  5016. ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
  5017. if (ret)
  5018. return ret;
  5019. if (flags & STATMOUNT_BY_FD) {
  5020. if (kreq->mnt_id || kreq->mnt_ns_id)
  5021. return -EINVAL;
  5022. } else {
  5023. if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id)
  5024. return -EINVAL;
  5025. /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
  5026. if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
  5027. return -EINVAL;
  5028. }
  5029. return 0;
  5030. }
  5031. /*
  5032. * If the user requested a specific mount namespace id, look that up and return
  5033. * that, or if not simply grab a passive reference on our mount namespace and
  5034. * return that.
  5035. */
  5036. static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
  5037. {
  5038. struct mnt_namespace *mnt_ns;
  5039. if (kreq->mnt_ns_id) {
  5040. mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id);
  5041. if (!mnt_ns)
  5042. return ERR_PTR(-ENOENT);
  5043. } else if (kreq->mnt_ns_fd) {
  5044. struct ns_common *ns;
  5045. CLASS(fd, f)(kreq->mnt_ns_fd);
  5046. if (fd_empty(f))
  5047. return ERR_PTR(-EBADF);
  5048. if (!proc_ns_file(fd_file(f)))
  5049. return ERR_PTR(-EINVAL);
  5050. ns = get_proc_ns(file_inode(fd_file(f)));
  5051. if (ns->ns_type != CLONE_NEWNS)
  5052. return ERR_PTR(-EINVAL);
  5053. mnt_ns = to_mnt_ns(ns);
  5054. refcount_inc(&mnt_ns->passive);
  5055. } else {
  5056. mnt_ns = current->nsproxy->mnt_ns;
  5057. refcount_inc(&mnt_ns->passive);
  5058. }
  5059. return mnt_ns;
  5060. }
  5061. SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
  5062. struct statmount __user *, buf, size_t, bufsize,
  5063. unsigned int, flags)
  5064. {
  5065. struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
  5066. struct kstatmount *ks __free(kfree) = NULL;
  5067. struct file *mnt_file __free(fput) = NULL;
  5068. struct mnt_id_req kreq;
  5069. /* We currently support retrieval of 3 strings. */
  5070. size_t seq_size = 3 * PATH_MAX;
  5071. int ret;
  5072. if (flags & ~STATMOUNT_BY_FD)
  5073. return -EINVAL;
  5074. ret = copy_mnt_id_req(req, &kreq, flags);
  5075. if (ret)
  5076. return ret;
  5077. if (flags & STATMOUNT_BY_FD) {
  5078. mnt_file = fget_raw(kreq.mnt_fd);
  5079. if (!mnt_file)
  5080. return -EBADF;
  5081. /* do_statmount sets ns in case of STATMOUNT_BY_FD */
  5082. } else {
  5083. ns = grab_requested_mnt_ns(&kreq);
  5084. if (IS_ERR(ns))
  5085. return PTR_ERR(ns);
  5086. if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
  5087. !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
  5088. return -EPERM;
  5089. }
  5090. ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
  5091. if (!ks)
  5092. return -ENOMEM;
  5093. retry:
  5094. ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size);
  5095. if (ret)
  5096. return ret;
  5097. scoped_guard(namespace_shared)
  5098. ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, mnt_file, ns);
  5099. if (!ret)
  5100. ret = copy_statmount_to_user(ks);
  5101. kvfree(ks->seq.buf);
  5102. path_put(&ks->root);
  5103. if (retry_statmount(ret, &seq_size))
  5104. goto retry;
  5105. return ret;
  5106. }
/* Kernel-side state of one listmount() call. */
struct klistmount {
	u64 last_mnt_id;		/* listing cursor, taken from the request's param (0: start at the end of the list) */
	u64 mnt_parent_id;		/* mount whose subtree is listed, taken from the request's mnt_id */
	u64 *kmnt_ids;			/* kernel buffer collecting the resulting mount ids */
	u32 nr_mnt_ids;			/* capacity of kmnt_ids, in entries */
	struct mnt_namespace *ns;	/* namespace being listed; released on cleanup */
	struct path root;		/* root filled in by grab_requested_root(); put on cleanup */
};
  5115. /* locks: namespace_shared */
  5116. static ssize_t do_listmount(struct klistmount *kls, bool reverse)
  5117. {
  5118. struct mnt_namespace *ns = kls->ns;
  5119. u64 mnt_parent_id = kls->mnt_parent_id;
  5120. u64 last_mnt_id = kls->last_mnt_id;
  5121. u64 *mnt_ids = kls->kmnt_ids;
  5122. size_t nr_mnt_ids = kls->nr_mnt_ids;
  5123. struct path orig;
  5124. struct mount *r, *first;
  5125. ssize_t ret;
  5126. rwsem_assert_held(&namespace_sem);
  5127. ret = grab_requested_root(ns, &kls->root);
  5128. if (ret)
  5129. return ret;
  5130. if (mnt_parent_id == LSMT_ROOT) {
  5131. orig = kls->root;
  5132. } else {
  5133. orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
  5134. if (!orig.mnt)
  5135. return -ENOENT;
  5136. orig.dentry = orig.mnt->mnt_root;
  5137. }
  5138. /*
  5139. * Don't trigger audit denials. We just want to determine what
  5140. * mounts to show users.
  5141. */
  5142. if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) &&
  5143. !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
  5144. return -EPERM;
  5145. ret = security_sb_statfs(orig.dentry);
  5146. if (ret)
  5147. return ret;
  5148. if (!last_mnt_id) {
  5149. if (reverse)
  5150. first = node_to_mount(ns->mnt_last_node);
  5151. else
  5152. first = node_to_mount(ns->mnt_first_node);
  5153. } else {
  5154. if (reverse)
  5155. first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
  5156. else
  5157. first = mnt_find_id_at(ns, last_mnt_id + 1);
  5158. }
  5159. for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) {
  5160. if (r->mnt_id_unique == mnt_parent_id)
  5161. continue;
  5162. if (!is_path_reachable(r, r->mnt.mnt_root, &orig))
  5163. continue;
  5164. *mnt_ids = r->mnt_id_unique;
  5165. mnt_ids++;
  5166. nr_mnt_ids--;
  5167. ret++;
  5168. }
  5169. return ret;
  5170. }
  5171. static void __free_klistmount_free(const struct klistmount *kls)
  5172. {
  5173. path_put(&kls->root);
  5174. kvfree(kls->kmnt_ids);
  5175. mnt_ns_release(kls->ns);
  5176. }
  5177. static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq,
  5178. size_t nr_mnt_ids)
  5179. {
  5180. u64 last_mnt_id = kreq->param;
  5181. struct mnt_namespace *ns;
  5182. /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
  5183. if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
  5184. return -EINVAL;
  5185. kls->last_mnt_id = last_mnt_id;
  5186. kls->nr_mnt_ids = nr_mnt_ids;
  5187. kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids),
  5188. GFP_KERNEL_ACCOUNT);
  5189. if (!kls->kmnt_ids)
  5190. return -ENOMEM;
  5191. ns = grab_requested_mnt_ns(kreq);
  5192. if (IS_ERR(ns))
  5193. return PTR_ERR(ns);
  5194. kls->ns = ns;
  5195. kls->mnt_parent_id = kreq->mnt_id;
  5196. return 0;
  5197. }
  5198. SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
  5199. u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
  5200. {
  5201. struct klistmount kls __free(klistmount_free) = {};
  5202. const size_t maxcount = 1000000;
  5203. struct mnt_id_req kreq;
  5204. ssize_t ret;
  5205. if (flags & ~LISTMOUNT_REVERSE)
  5206. return -EINVAL;
  5207. /*
  5208. * If the mount namespace really has more than 1 million mounts the
  5209. * caller must iterate over the mount namespace (and reconsider their
  5210. * system design...).
  5211. */
  5212. if (unlikely(nr_mnt_ids > maxcount))
  5213. return -EOVERFLOW;
  5214. if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
  5215. return -EFAULT;
  5216. ret = copy_mnt_id_req(req, &kreq, 0);
  5217. if (ret)
  5218. return ret;
  5219. ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids);
  5220. if (ret)
  5221. return ret;
  5222. if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) &&
  5223. !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN))
  5224. return -ENOENT;
  5225. /*
  5226. * We only need to guard against mount topology changes as
  5227. * listmount() doesn't care about any mount properties.
  5228. */
  5229. scoped_guard(namespace_shared)
  5230. ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE));
  5231. if (ret <= 0)
  5232. return ret;
  5233. if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids)))
  5234. return -EFAULT;
  5235. return ret;
  5236. }
/*
 * The initial mount namespace. It is statically initialised here, owned
 * by init_user_ns, and populated with mounts by init_mount_tree().
 */
struct mnt_namespace init_mnt_ns = {
	.ns		= NS_COMMON_INIT(init_mnt_ns),
	.user_ns	= &init_user_ns,
	.passive	= REFCOUNT_INIT(1),
	.mounts		= RB_ROOT,	/* no mounts yet */
	.poll		= __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
};
/*
 * Build the initial mount tree at boot: create the nullfs and rootfs
 * mounts, stack rootfs on top of nullfs, add both to init_mnt_ns, make
 * that the init task's mount namespace and point the current task's
 * root and pwd at rootfs. Failure to create or attach either mount
 * panics.
 */
static void __init init_mount_tree(void)
{
	struct vfsmount *mnt, *nullfs_mnt;
	struct mount *mnt_root;
	struct path root;

	/*
	 * We create two mounts:
	 *
	 * (1) nullfs with mount id 1
	 * (2) mutable rootfs with mount id 2
	 *
	 * with (2) mounted on top of (1).
	 */
	nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL);
	if (IS_ERR(nullfs_mnt))
		panic("VFS: Failed to create nullfs");

	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
	if (IS_ERR(mnt))
		panic("Can't create rootfs");

	/* The mount ids are expected to follow creation order. */
	VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1);
	VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2);

	/* The namespace root is the nullfs mnt. */
	mnt_root = real_mount(nullfs_mnt);
	init_mnt_ns.root = mnt_root;

	/* Mount mutable rootfs on top of nullfs. */
	root.mnt = nullfs_mnt;
	root.dentry = nullfs_mnt->mnt_root;

	LOCK_MOUNT_EXACT(mp, &root);
	if (unlikely(IS_ERR(mp.parent)))
		panic("VFS: Failed to mount rootfs on nullfs");
	scoped_guard(mount_writer)
		attach_mnt(real_mount(mnt), mp.parent, mp.mp);

	pr_info("VFS: Finished mounting rootfs on nullfs\n");

	/*
	 * We've dropped all locks here but that's fine. Not just are we
	 * the only task that's running, there's no other mount
	 * namespace in existence and the initial mount namespace is
	 * completely empty until we add the mounts we just created.
	 *
	 * NOTE(review): whether the LOCK_MOUNT_EXACT() scope has really
	 * ended at this point depends on how that macro is defined -
	 * confirm against its definition.
	 */
	for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) {
		mnt_add_to_ns(&init_mnt_ns, p);
		init_mnt_ns.nr_mounts++;
	}

	init_task.nsproxy->mnt_ns = &init_mnt_ns;
	get_mnt_ns(&init_mnt_ns);

	/* The root and pwd always point to the mutable rootfs. */
	root.mnt = mnt;
	root.dentry = mnt->mnt_root;
	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);

	ns_tree_add(&init_mnt_ns);
}
/*
 * Boot-time initialisation of the mount layer: create the slab cache
 * for struct mount, allocate the mount and mountpoint hash tables,
 * initialise kernfs, sysfs and the "fs" kobject, then set up shmem,
 * rootfs and the initial mount tree.
 *
 * Failing to allocate either hash table panics; sysfs and kobject
 * failures are only logged.
 */
void __init mnt_init(void)
{
	int err;

	/* SLAB_PANIC: no need to check the result. */
	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	mount_hashtable = alloc_large_system_hash("Mount-cache",
				sizeof(struct hlist_head),
				mhash_entries, 19,
				HASH_ZERO,
				&m_hash_shift, &m_hash_mask, 0, 0);
	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
				sizeof(struct hlist_head),
				mphash_entries, 19,
				HASH_ZERO,
				&mp_hash_shift, &mp_hash_mask, 0, 0);

	if (!mount_hashtable || !mountpoint_hashtable)
		panic("Failed to allocate mount hash table\n");

	kernfs_init();

	err = sysfs_init();
	if (err)
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__func__, err);
	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __func__);
	shmem_init();
	init_rootfs();
	init_mount_tree();
}
  5325. void put_mnt_ns(struct mnt_namespace *ns)
  5326. {
  5327. if (!ns_ref_put(ns))
  5328. return;
  5329. guard(namespace_excl)();
  5330. emptied_ns = ns;
  5331. guard(mount_writer)();
  5332. umount_tree(ns->root, 0);
  5333. }
  5334. struct vfsmount *kern_mount(struct file_system_type *type)
  5335. {
  5336. struct vfsmount *mnt;
  5337. mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
  5338. if (!IS_ERR(mnt)) {
  5339. /*
  5340. * it is a longterm mount, don't release mnt until
  5341. * we unmount before file sys is unregistered
  5342. */
  5343. real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
  5344. }
  5345. return mnt;
  5346. }
  5347. EXPORT_SYMBOL_GPL(kern_mount);
  5348. void kern_unmount(struct vfsmount *mnt)
  5349. {
  5350. /* release long term mount so mount point can be released */
  5351. if (!IS_ERR(mnt)) {
  5352. mnt_make_shortterm(mnt);
  5353. synchronize_rcu(); /* yecchhh... */
  5354. mntput(mnt);
  5355. }
  5356. }
  5357. EXPORT_SYMBOL(kern_unmount);
  5358. void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
  5359. {
  5360. unsigned int i;
  5361. for (i = 0; i < num; i++)
  5362. mnt_make_shortterm(mnt[i]);
  5363. synchronize_rcu_expedited();
  5364. for (i = 0; i < num; i++)
  5365. mntput(mnt[i]);
  5366. }
  5367. EXPORT_SYMBOL(kern_unmount_array);
  5368. bool our_mnt(struct vfsmount *mnt)
  5369. {
  5370. return check_mnt(real_mount(mnt));
  5371. }
  5372. bool current_chrooted(void)
  5373. {
  5374. /* Does the current process have a non-standard root */
  5375. struct path fs_root __free(path_put) = {};
  5376. struct mount *root;
  5377. get_fs_root(current->fs, &fs_root);
  5378. /* Find the namespace root */
  5379. guard(mount_locked_reader)();
  5380. root = topmost_overmount(current->nsproxy->mnt_ns->root);
  5381. return fs_root.mnt != &root->mnt || !path_mounted(&fs_root);
  5382. }
  5383. static bool mnt_already_visible(struct mnt_namespace *ns,
  5384. const struct super_block *sb,
  5385. int *new_mnt_flags)
  5386. {
  5387. int new_flags = *new_mnt_flags;
  5388. struct mount *mnt, *n;
  5389. guard(namespace_shared)();
  5390. rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
  5391. struct mount *child;
  5392. int mnt_flags;
  5393. if (mnt->mnt.mnt_sb->s_type != sb->s_type)
  5394. continue;
  5395. /* This mount is not fully visible if it's root directory
  5396. * is not the root directory of the filesystem.
  5397. */
  5398. if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
  5399. continue;
  5400. /* A local view of the mount flags */
  5401. mnt_flags = mnt->mnt.mnt_flags;
  5402. /* Don't miss readonly hidden in the superblock flags */
  5403. if (sb_rdonly(mnt->mnt.mnt_sb))
  5404. mnt_flags |= MNT_LOCK_READONLY;
  5405. /* Verify the mount flags are equal to or more permissive
  5406. * than the proposed new mount.
  5407. */
  5408. if ((mnt_flags & MNT_LOCK_READONLY) &&
  5409. !(new_flags & MNT_READONLY))
  5410. continue;
  5411. if ((mnt_flags & MNT_LOCK_ATIME) &&
  5412. ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
  5413. continue;
  5414. /* This mount is not fully visible if there are any
  5415. * locked child mounts that cover anything except for
  5416. * empty directories.
  5417. */
  5418. list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
  5419. struct inode *inode = child->mnt_mountpoint->d_inode;
  5420. /* Only worry about locked mounts */
  5421. if (!(child->mnt.mnt_flags & MNT_LOCKED))
  5422. continue;
  5423. /* Is the directory permanently empty? */
  5424. if (!is_empty_dir_inode(inode))
  5425. goto next;
  5426. }
  5427. /* Preserve the locked attributes */
  5428. *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
  5429. MNT_LOCK_ATIME);
  5430. return true;
  5431. next: ;
  5432. }
  5433. return false;
  5434. }
  5435. static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
  5436. {
  5437. const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
  5438. struct mnt_namespace *ns = current->nsproxy->mnt_ns;
  5439. unsigned long s_iflags;
  5440. if (ns->user_ns == &init_user_ns)
  5441. return false;
  5442. /* Can this filesystem be too revealing? */
  5443. s_iflags = sb->s_iflags;
  5444. if (!(s_iflags & SB_I_USERNS_VISIBLE))
  5445. return false;
  5446. if ((s_iflags & required_iflags) != required_iflags) {
  5447. WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
  5448. required_iflags);
  5449. return true;
  5450. }
  5451. return !mnt_already_visible(ns, sb, new_mnt_flags);
  5452. }
  5453. bool mnt_may_suid(struct vfsmount *mnt)
  5454. {
  5455. /*
  5456. * Foreign mounts (accessed via fchdir or through /proc
  5457. * symlinks) are always treated as if they are nosuid. This
  5458. * prevents namespaces from trusting potentially unsafe
  5459. * suid/sgid bits, file caps, or security labels that originate
  5460. * in other namespaces.
  5461. */
  5462. return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
  5463. current_in_userns(mnt->mnt_sb->s_user_ns);
  5464. }
  5465. static struct ns_common *mntns_get(struct task_struct *task)
  5466. {
  5467. struct ns_common *ns = NULL;
  5468. struct nsproxy *nsproxy;
  5469. task_lock(task);
  5470. nsproxy = task->nsproxy;
  5471. if (nsproxy) {
  5472. ns = &nsproxy->mnt_ns->ns;
  5473. get_mnt_ns(to_mnt_ns(ns));
  5474. }
  5475. task_unlock(task);
  5476. return ns;
  5477. }
  5478. static void mntns_put(struct ns_common *ns)
  5479. {
  5480. put_mnt_ns(to_mnt_ns(ns));
  5481. }
  5482. static int mntns_install(struct nsset *nsset, struct ns_common *ns)
  5483. {
  5484. struct nsproxy *nsproxy = nsset->nsproxy;
  5485. struct fs_struct *fs = nsset->fs;
  5486. struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
  5487. struct user_namespace *user_ns = nsset->cred->user_ns;
  5488. struct path root;
  5489. int err;
  5490. if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
  5491. !ns_capable(user_ns, CAP_SYS_CHROOT) ||
  5492. !ns_capable(user_ns, CAP_SYS_ADMIN))
  5493. return -EPERM;
  5494. if (is_anon_ns(mnt_ns))
  5495. return -EINVAL;
  5496. if (fs->users != 1)
  5497. return -EINVAL;
  5498. get_mnt_ns(mnt_ns);
  5499. old_mnt_ns = nsproxy->mnt_ns;
  5500. nsproxy->mnt_ns = mnt_ns;
  5501. /* Find the root */
  5502. err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
  5503. "/", LOOKUP_DOWN, &root);
  5504. if (err) {
  5505. /* revert to old namespace */
  5506. nsproxy->mnt_ns = old_mnt_ns;
  5507. put_mnt_ns(mnt_ns);
  5508. return err;
  5509. }
  5510. put_mnt_ns(old_mnt_ns);
  5511. /* Update the pwd and root */
  5512. set_fs_pwd(fs, &root);
  5513. set_fs_root(fs, &root);
  5514. path_put(&root);
  5515. return 0;
  5516. }
  5517. static struct user_namespace *mntns_owner(struct ns_common *ns)
  5518. {
  5519. return to_mnt_ns(ns)->user_ns;
  5520. }
/* Namespace operations for mount namespaces, exposed under the name "mnt". */
const struct proc_ns_operations mntns_operations = {
	.name		= "mnt",
	.get		= mntns_get,		/* take a reference on a task's mount namespace */
	.put		= mntns_put,		/* drop that reference again */
	.install	= mntns_install,	/* switch an nsset to the namespace */
	.owner		= mntns_owner,		/* owning user namespace */
};
#ifdef CONFIG_SYSCTL
/*
 * The "mount-max" sysctl, registered in the "fs" table: exposes
 * sysctl_mount_max with mode 0644 and a lower bound of SYSCTL_ONE.
 */
static const struct ctl_table fs_namespace_sysctls[] = {
	{
		.procname	= "mount-max",
		.data		= &sysctl_mount_max,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
	},
};

/* Register the table above at fs_initcall time. Always returns 0. */
static int __init init_fs_namespace_sysctls(void)
{
	register_sysctl_init("fs", fs_namespace_sysctls);
	return 0;
}
fs_initcall(init_fs_namespace_sysctls);

#endif /* CONFIG_SYSCTL */