vma.c 90 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * VMA-specific functions.
  4. */
  5. #include "vma_internal.h"
  6. #include "vma.h"
  7. struct mmap_state {
  8. struct mm_struct *mm;
  9. struct vma_iterator *vmi;
  10. unsigned long addr;
  11. unsigned long end;
  12. pgoff_t pgoff;
  13. unsigned long pglen;
  14. union {
  15. vm_flags_t vm_flags;
  16. vma_flags_t vma_flags;
  17. };
  18. struct file *file;
  19. pgprot_t page_prot;
  20. /* User-defined fields, perhaps updated by .mmap_prepare(). */
  21. const struct vm_operations_struct *vm_ops;
  22. void *vm_private_data;
  23. unsigned long charged;
  24. struct vm_area_struct *prev;
  25. struct vm_area_struct *next;
  26. /* Unmapping state. */
  27. struct vma_munmap_struct vms;
  28. struct ma_state mas_detach;
  29. struct maple_tree mt_detach;
  30. /* Determine if we can check KSM flags early in mmap() logic. */
  31. bool check_ksm_early :1;
  32. /* If we map new, hold the file rmap lock on mapping. */
  33. bool hold_file_rmap_lock :1;
  34. /* If .mmap_prepare changed the file, we don't need to pin. */
  35. bool file_doesnt_need_get :1;
  36. };
  37. #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
  38. struct mmap_state name = { \
  39. .mm = mm_, \
  40. .vmi = vmi_, \
  41. .addr = addr_, \
  42. .end = (addr_) + (len_), \
  43. .pgoff = pgoff_, \
  44. .pglen = PHYS_PFN(len_), \
  45. .vm_flags = vm_flags_, \
  46. .file = file_, \
  47. .page_prot = vm_get_page_prot(vm_flags_), \
  48. }
  49. #define VMG_MMAP_STATE(name, map_, vma_) \
  50. struct vma_merge_struct name = { \
  51. .mm = (map_)->mm, \
  52. .vmi = (map_)->vmi, \
  53. .start = (map_)->addr, \
  54. .end = (map_)->end, \
  55. .vm_flags = (map_)->vm_flags, \
  56. .pgoff = (map_)->pgoff, \
  57. .file = (map_)->file, \
  58. .prev = (map_)->prev, \
  59. .middle = vma_, \
  60. .next = (vma_) ? NULL : (map_)->next, \
  61. .state = VMA_MERGE_START, \
  62. }
  63. /* Was this VMA ever forked from a parent, i.e. maybe contains CoW mappings? */
  64. static bool vma_is_fork_child(struct vm_area_struct *vma)
  65. {
  66. /*
  67. * The list_is_singular() test is to avoid merging VMA cloned from
  68. * parents. This can improve scalability caused by the anon_vma root
  69. * lock.
  70. */
  71. return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
  72. }
  73. static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
  74. {
  75. struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
  76. if (!mpol_equal(vmg->policy, vma_policy(vma)))
  77. return false;
  78. if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE)
  79. return false;
  80. if (vma->vm_file != vmg->file)
  81. return false;
  82. if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
  83. return false;
  84. if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
  85. return false;
  86. return true;
  87. }
  88. static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
  89. {
  90. struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
  91. struct vm_area_struct *src = vmg->middle; /* existing merge case. */
  92. struct anon_vma *tgt_anon = tgt->anon_vma;
  93. struct anon_vma *src_anon = vmg->anon_vma;
  94. /*
  95. * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we
  96. * will remove the existing VMA's anon_vma's so there's no scalability
  97. * concerns.
  98. */
  99. VM_WARN_ON(src && src_anon != src->anon_vma);
  100. /* Case 1 - we will dup_anon_vma() from src into tgt. */
  101. if (!tgt_anon && src_anon) {
  102. struct vm_area_struct *copied_from = vmg->copied_from;
  103. if (vma_is_fork_child(src))
  104. return false;
  105. if (vma_is_fork_child(copied_from))
  106. return false;
  107. return true;
  108. }
  109. /* Case 2 - we will simply use tgt's anon_vma. */
  110. if (tgt_anon && !src_anon)
  111. return !vma_is_fork_child(tgt);
  112. /* Case 3 - the anon_vma's are already shared. */
  113. return src_anon == tgt_anon;
  114. }
  115. /*
  116. * init_multi_vma_prep() - Initializer for struct vma_prepare
  117. * @vp: The vma_prepare struct
  118. * @vma: The vma that will be altered once locked
  119. * @vmg: The merge state that will be used to determine adjustment and VMA
  120. * removal.
  121. */
  122. static void init_multi_vma_prep(struct vma_prepare *vp,
  123. struct vm_area_struct *vma,
  124. struct vma_merge_struct *vmg)
  125. {
  126. struct vm_area_struct *adjust;
  127. struct vm_area_struct **remove = &vp->remove;
  128. memset(vp, 0, sizeof(struct vma_prepare));
  129. vp->vma = vma;
  130. vp->anon_vma = vma->anon_vma;
  131. if (vmg && vmg->__remove_middle) {
  132. *remove = vmg->middle;
  133. remove = &vp->remove2;
  134. }
  135. if (vmg && vmg->__remove_next)
  136. *remove = vmg->next;
  137. if (vmg && vmg->__adjust_middle_start)
  138. adjust = vmg->middle;
  139. else if (vmg && vmg->__adjust_next_start)
  140. adjust = vmg->next;
  141. else
  142. adjust = NULL;
  143. vp->adj_next = adjust;
  144. if (!vp->anon_vma && adjust)
  145. vp->anon_vma = adjust->anon_vma;
  146. VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma &&
  147. vp->anon_vma != adjust->anon_vma);
  148. vp->file = vma->vm_file;
  149. if (vp->file)
  150. vp->mapping = vma->vm_file->f_mapping;
  151. if (vmg && vmg->skip_vma_uprobe)
  152. vp->skip_vma_uprobe = true;
  153. }
  154. /*
  155. * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  156. * in front of (at a lower virtual address and file offset than) the vma.
  157. *
  158. * We cannot merge two vmas if they have differently assigned (non-NULL)
  159. * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  160. *
  161. * We don't check here for the merged mmap wrapping around the end of pagecache
  162. * indices (16TB on ia32) because do_mmap() does not permit mmap's which
  163. * wrap, nor mmaps which cover the final page at index -1UL.
  164. *
  165. * We assume the vma may be removed as part of the merge.
  166. */
  167. static bool can_vma_merge_before(struct vma_merge_struct *vmg)
  168. {
  169. pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
  170. if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
  171. is_mergeable_anon_vma(vmg, /* merge_next = */ true)) {
  172. if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
  173. return true;
  174. }
  175. return false;
  176. }
  177. /*
  178. * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  179. * beyond (at a higher virtual address and file offset than) the vma.
  180. *
  181. * We cannot merge two vmas if they have differently assigned (non-NULL)
  182. * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  183. *
  184. * We assume that vma is not removed as part of the merge.
  185. */
  186. static bool can_vma_merge_after(struct vma_merge_struct *vmg)
  187. {
  188. if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
  189. is_mergeable_anon_vma(vmg, /* merge_next = */ false)) {
  190. if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
  191. return true;
  192. }
  193. return false;
  194. }
  195. static void __vma_link_file(struct vm_area_struct *vma,
  196. struct address_space *mapping)
  197. {
  198. if (vma_is_shared_maywrite(vma))
  199. mapping_allow_writable(mapping);
  200. flush_dcache_mmap_lock(mapping);
  201. vma_interval_tree_insert(vma, &mapping->i_mmap);
  202. flush_dcache_mmap_unlock(mapping);
  203. }
  204. /*
  205. * Requires inode->i_mapping->i_mmap_rwsem
  206. */
  207. static void __remove_shared_vm_struct(struct vm_area_struct *vma,
  208. struct address_space *mapping)
  209. {
  210. if (vma_is_shared_maywrite(vma))
  211. mapping_unmap_writable(mapping);
  212. flush_dcache_mmap_lock(mapping);
  213. vma_interval_tree_remove(vma, &mapping->i_mmap);
  214. flush_dcache_mmap_unlock(mapping);
  215. }
  216. /*
  217. * vma has some anon_vma assigned, and is already inserted on that
  218. * anon_vma's interval trees.
  219. *
  220. * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
  221. * vma must be removed from the anon_vma's interval trees using
  222. * anon_vma_interval_tree_pre_update_vma().
  223. *
  224. * After the update, the vma will be reinserted using
  225. * anon_vma_interval_tree_post_update_vma().
  226. *
  227. * The entire update must be protected by exclusive mmap_lock and by
  228. * the root anon_vma's mutex.
  229. */
  230. static void
  231. anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
  232. {
  233. struct anon_vma_chain *avc;
  234. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  235. anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
  236. }
  237. static void
  238. anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
  239. {
  240. struct anon_vma_chain *avc;
  241. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  242. anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
  243. }
  244. /*
  245. * vma_prepare() - Helper function for handling locking VMAs prior to altering
  246. * @vp: The initialized vma_prepare struct
  247. */
  248. static void vma_prepare(struct vma_prepare *vp)
  249. {
  250. if (vp->file) {
  251. uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
  252. if (vp->adj_next)
  253. uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
  254. vp->adj_next->vm_end);
  255. i_mmap_lock_write(vp->mapping);
  256. if (vp->insert && vp->insert->vm_file) {
  257. /*
  258. * Put into interval tree now, so instantiated pages
  259. * are visible to arm/parisc __flush_dcache_page
  260. * throughout; but we cannot insert into address
  261. * space until vma start or end is updated.
  262. */
  263. __vma_link_file(vp->insert,
  264. vp->insert->vm_file->f_mapping);
  265. }
  266. }
  267. if (vp->anon_vma) {
  268. anon_vma_lock_write(vp->anon_vma);
  269. anon_vma_interval_tree_pre_update_vma(vp->vma);
  270. if (vp->adj_next)
  271. anon_vma_interval_tree_pre_update_vma(vp->adj_next);
  272. }
  273. if (vp->file) {
  274. flush_dcache_mmap_lock(vp->mapping);
  275. vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
  276. if (vp->adj_next)
  277. vma_interval_tree_remove(vp->adj_next,
  278. &vp->mapping->i_mmap);
  279. }
  280. }
  281. /*
  282. * vma_complete- Helper function for handling the unlocking after altering VMAs,
  283. * or for inserting a VMA.
  284. *
  285. * @vp: The vma_prepare struct
  286. * @vmi: The vma iterator
  287. * @mm: The mm_struct
  288. */
  289. static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
  290. struct mm_struct *mm)
  291. {
  292. if (vp->file) {
  293. if (vp->adj_next)
  294. vma_interval_tree_insert(vp->adj_next,
  295. &vp->mapping->i_mmap);
  296. vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
  297. flush_dcache_mmap_unlock(vp->mapping);
  298. }
  299. if (vp->remove && vp->file) {
  300. __remove_shared_vm_struct(vp->remove, vp->mapping);
  301. if (vp->remove2)
  302. __remove_shared_vm_struct(vp->remove2, vp->mapping);
  303. } else if (vp->insert) {
  304. /*
  305. * split_vma has split insert from vma, and needs
  306. * us to insert it before dropping the locks
  307. * (it may either follow vma or precede it).
  308. */
  309. vma_iter_store_new(vmi, vp->insert);
  310. mm->map_count++;
  311. }
  312. if (vp->anon_vma) {
  313. anon_vma_interval_tree_post_update_vma(vp->vma);
  314. if (vp->adj_next)
  315. anon_vma_interval_tree_post_update_vma(vp->adj_next);
  316. anon_vma_unlock_write(vp->anon_vma);
  317. }
  318. if (vp->file) {
  319. i_mmap_unlock_write(vp->mapping);
  320. if (!vp->skip_vma_uprobe) {
  321. uprobe_mmap(vp->vma);
  322. if (vp->adj_next)
  323. uprobe_mmap(vp->adj_next);
  324. }
  325. }
  326. if (vp->remove) {
  327. again:
  328. vma_mark_detached(vp->remove);
  329. if (vp->file) {
  330. uprobe_munmap(vp->remove, vp->remove->vm_start,
  331. vp->remove->vm_end);
  332. fput(vp->file);
  333. }
  334. if (vp->remove->anon_vma)
  335. unlink_anon_vmas(vp->remove);
  336. mm->map_count--;
  337. mpol_put(vma_policy(vp->remove));
  338. if (!vp->remove2)
  339. WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
  340. vm_area_free(vp->remove);
  341. /*
  342. * In mprotect's case 6 (see comments on vma_merge),
  343. * we are removing both mid and next vmas
  344. */
  345. if (vp->remove2) {
  346. vp->remove = vp->remove2;
  347. vp->remove2 = NULL;
  348. goto again;
  349. }
  350. }
  351. if (vp->insert && vp->file)
  352. uprobe_mmap(vp->insert);
  353. }
  354. /*
  355. * init_vma_prep() - Initializer wrapper for vma_prepare struct
  356. * @vp: The vma_prepare struct
  357. * @vma: The vma that will be altered once locked
  358. */
  359. static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
  360. {
  361. init_multi_vma_prep(vp, vma, NULL);
  362. }
  363. /*
  364. * Can the proposed VMA be merged with the left (previous) VMA taking into
  365. * account the start position of the proposed range.
  366. */
  367. static bool can_vma_merge_left(struct vma_merge_struct *vmg)
  368. {
  369. return vmg->prev && vmg->prev->vm_end == vmg->start &&
  370. can_vma_merge_after(vmg);
  371. }
  372. /*
  373. * Can the proposed VMA be merged with the right (next) VMA taking into
  374. * account the end position of the proposed range.
  375. *
  376. * In addition, if we can merge with the left VMA, ensure that left and right
  377. * anon_vma's are also compatible.
  378. */
  379. static bool can_vma_merge_right(struct vma_merge_struct *vmg,
  380. bool can_merge_left)
  381. {
  382. struct vm_area_struct *next = vmg->next;
  383. struct vm_area_struct *prev;
  384. if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg))
  385. return false;
  386. if (!can_merge_left)
  387. return true;
  388. /*
  389. * If we can merge with prev (left) and next (right), indicating that
  390. * each VMA's anon_vma is compatible with the proposed anon_vma, this
  391. * does not mean prev and next are compatible with EACH OTHER.
  392. *
  393. * We therefore check this in addition to mergeability to either side.
  394. */
  395. prev = vmg->prev;
  396. return !prev->anon_vma || !next->anon_vma ||
  397. prev->anon_vma == next->anon_vma;
  398. }
  399. /*
  400. * Close a vm structure and free it.
  401. */
  402. void remove_vma(struct vm_area_struct *vma)
  403. {
  404. might_sleep();
  405. vma_close(vma);
  406. if (vma->vm_file)
  407. fput(vma->vm_file);
  408. mpol_put(vma_policy(vma));
  409. vm_area_free(vma);
  410. }
  411. /*
  412. * Get rid of page table information in the indicated region.
  413. *
  414. * Called with the mm semaphore held.
  415. */
  416. void unmap_region(struct unmap_desc *unmap)
  417. {
  418. struct mm_struct *mm = unmap->first->vm_mm;
  419. struct mmu_gather tlb;
  420. tlb_gather_mmu(&tlb, mm);
  421. update_hiwater_rss(mm);
  422. unmap_vmas(&tlb, unmap);
  423. mas_set(unmap->mas, unmap->tree_reset);
  424. free_pgtables(&tlb, unmap);
  425. tlb_finish_mmu(&tlb);
  426. }
  427. /*
  428. * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
  429. * has already been checked or doesn't make sense to fail.
  430. * VMA Iterator will point to the original VMA.
  431. */
  432. static __must_check int
  433. __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
  434. unsigned long addr, int new_below)
  435. {
  436. struct vma_prepare vp;
  437. struct vm_area_struct *new;
  438. int err;
  439. WARN_ON(vma->vm_start >= addr);
  440. WARN_ON(vma->vm_end <= addr);
  441. if (vma->vm_ops && vma->vm_ops->may_split) {
  442. err = vma->vm_ops->may_split(vma, addr);
  443. if (err)
  444. return err;
  445. }
  446. new = vm_area_dup(vma);
  447. if (!new)
  448. return -ENOMEM;
  449. if (new_below) {
  450. new->vm_end = addr;
  451. } else {
  452. new->vm_start = addr;
  453. new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
  454. }
  455. err = -ENOMEM;
  456. vma_iter_config(vmi, new->vm_start, new->vm_end);
  457. if (vma_iter_prealloc(vmi, new))
  458. goto out_free_vma;
  459. err = vma_dup_policy(vma, new);
  460. if (err)
  461. goto out_free_vmi;
  462. err = anon_vma_clone(new, vma, VMA_OP_SPLIT);
  463. if (err)
  464. goto out_free_mpol;
  465. if (new->vm_file)
  466. get_file(new->vm_file);
  467. if (new->vm_ops && new->vm_ops->open)
  468. new->vm_ops->open(new);
  469. vma_start_write(vma);
  470. vma_start_write(new);
  471. init_vma_prep(&vp, vma);
  472. vp.insert = new;
  473. vma_prepare(&vp);
  474. /*
  475. * Get rid of huge pages and shared page tables straddling the split
  476. * boundary.
  477. */
  478. vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
  479. if (is_vm_hugetlb_page(vma))
  480. hugetlb_split(vma, addr);
  481. if (new_below) {
  482. vma->vm_start = addr;
  483. vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
  484. } else {
  485. vma->vm_end = addr;
  486. }
  487. /* vma_complete stores the new vma */
  488. vma_complete(&vp, vmi, vma->vm_mm);
  489. validate_mm(vma->vm_mm);
  490. /* Success. */
  491. if (new_below)
  492. vma_next(vmi);
  493. else
  494. vma_prev(vmi);
  495. return 0;
  496. out_free_mpol:
  497. mpol_put(vma_policy(new));
  498. out_free_vmi:
  499. vma_iter_free(vmi);
  500. out_free_vma:
  501. vm_area_free(new);
  502. return err;
  503. }
  504. /*
  505. * Split a vma into two pieces at address 'addr', a new vma is allocated
  506. * either for the first part or the tail.
  507. */
  508. static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
  509. unsigned long addr, int new_below)
  510. {
  511. if (vma->vm_mm->map_count >= sysctl_max_map_count)
  512. return -ENOMEM;
  513. return __split_vma(vmi, vma, addr, new_below);
  514. }
  515. /*
  516. * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the
  517. * instance that the destination VMA has no anon_vma but the source does.
  518. *
  519. * @dst: The destination VMA
  520. * @src: The source VMA
  521. * @dup: Pointer to the destination VMA when successful.
  522. *
  523. * Returns: 0 on success.
  524. */
  525. static int dup_anon_vma(struct vm_area_struct *dst,
  526. struct vm_area_struct *src, struct vm_area_struct **dup)
  527. {
  528. /*
  529. * There are three cases to consider for correctly propagating
  530. * anon_vma's on merge.
  531. *
  532. * The first is trivial - neither VMA has anon_vma, we need not do
  533. * anything.
  534. *
  535. * The second where both have anon_vma is also a no-op, as they must
  536. * then be the same, so there is simply nothing to copy.
  537. *
  538. * Here we cover the third - if the destination VMA has no anon_vma,
  539. * that is it is unfaulted, we need to ensure that the newly merged
  540. * range is referenced by the anon_vma's of the source.
  541. */
  542. if (src->anon_vma && !dst->anon_vma) {
  543. int ret;
  544. vma_assert_write_locked(dst);
  545. dst->anon_vma = src->anon_vma;
  546. ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED);
  547. if (ret)
  548. return ret;
  549. *dup = dst;
  550. }
  551. return 0;
  552. }
  553. #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
  554. void validate_mm(struct mm_struct *mm)
  555. {
  556. int bug = 0;
  557. int i = 0;
  558. struct vm_area_struct *vma;
  559. VMA_ITERATOR(vmi, mm, 0);
  560. mt_validate(&mm->mm_mt);
  561. for_each_vma(vmi, vma) {
  562. #ifdef CONFIG_DEBUG_VM_RB
  563. struct anon_vma *anon_vma = vma->anon_vma;
  564. struct anon_vma_chain *avc;
  565. #endif
  566. unsigned long vmi_start, vmi_end;
  567. bool warn = 0;
  568. vmi_start = vma_iter_addr(&vmi);
  569. vmi_end = vma_iter_end(&vmi);
  570. if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
  571. warn = 1;
  572. if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
  573. warn = 1;
  574. if (warn) {
  575. pr_emerg("issue in %s\n", current->comm);
  576. dump_stack();
  577. dump_vma(vma);
  578. pr_emerg("tree range: %px start %lx end %lx\n", vma,
  579. vmi_start, vmi_end - 1);
  580. vma_iter_dump_tree(&vmi);
  581. }
  582. #ifdef CONFIG_DEBUG_VM_RB
  583. if (anon_vma) {
  584. anon_vma_lock_read(anon_vma);
  585. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  586. anon_vma_interval_tree_verify(avc);
  587. anon_vma_unlock_read(anon_vma);
  588. }
  589. #endif
  590. /* Check for a infinite loop */
  591. if (++i > mm->map_count + 10) {
  592. i = -1;
  593. break;
  594. }
  595. }
  596. if (i != mm->map_count) {
  597. pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
  598. bug = 1;
  599. }
  600. VM_BUG_ON_MM(bug, mm);
  601. }
  602. #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
  603. /*
  604. * Based on the vmg flag indicating whether we need to adjust the vm_start field
  605. * for the middle or next VMA, we calculate what the range of the newly adjusted
  606. * VMA ought to be, and set the VMA's range accordingly.
  607. */
  608. static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
  609. {
  610. struct vm_area_struct *adjust;
  611. pgoff_t pgoff;
  612. if (vmg->__adjust_middle_start) {
  613. adjust = vmg->middle;
  614. pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start);
  615. } else if (vmg->__adjust_next_start) {
  616. adjust = vmg->next;
  617. pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end);
  618. } else {
  619. return;
  620. }
  621. vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff);
  622. }
  623. /*
  624. * Actually perform the VMA merge operation.
  625. *
  626. * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not
  627. * modify any VMAs or cause inconsistent state should an OOM condition arise.
  628. *
  629. * Returns 0 on success, or an error value on failure.
  630. */
  631. static int commit_merge(struct vma_merge_struct *vmg)
  632. {
  633. struct vm_area_struct *vma;
  634. struct vma_prepare vp;
  635. if (vmg->__adjust_next_start) {
  636. /* We manipulate middle and adjust next, which is the target. */
  637. vma = vmg->middle;
  638. vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end);
  639. } else {
  640. vma = vmg->target;
  641. /* Note: vma iterator must be pointing to 'start'. */
  642. vma_iter_config(vmg->vmi, vmg->start, vmg->end);
  643. }
  644. init_multi_vma_prep(&vp, vma, vmg);
  645. /*
  646. * If vmg->give_up_on_oom is set, we're safe, because we don't actually
  647. * manipulate any VMAs until we succeed at preallocation.
  648. *
  649. * Past this point, we will not return an error.
  650. */
  651. if (vma_iter_prealloc(vmg->vmi, vma))
  652. return -ENOMEM;
  653. vma_prepare(&vp);
  654. /*
  655. * THP pages may need to do additional splits if we increase
  656. * middle->vm_start.
  657. */
  658. vma_adjust_trans_huge(vma, vmg->start, vmg->end,
  659. vmg->__adjust_middle_start ? vmg->middle : NULL);
  660. vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
  661. vmg_adjust_set_range(vmg);
  662. vma_iter_store_overwrite(vmg->vmi, vmg->target);
  663. vma_complete(&vp, vmg->vmi, vma->vm_mm);
  664. return 0;
  665. }
  666. /* We can only remove VMAs when merging if they do not have a close hook. */
  667. static bool can_merge_remove_vma(struct vm_area_struct *vma)
  668. {
  669. return !vma->vm_ops || !vma->vm_ops->close;
  670. }
  671. /*
  672. * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
  673. * attributes modified.
  674. *
  675. * @vmg: Describes the modifications being made to a VMA and associated
  676. * metadata.
  677. *
  678. * When the attributes of a range within a VMA change, then it might be possible
  679. * for immediately adjacent VMAs to be merged into that VMA due to having
  680. * identical properties.
  681. *
  682. * This function checks for the existence of any such mergeable VMAs and updates
  683. * the maple tree describing the @vmg->middle->vm_mm address space to account
  684. * for this, as well as any VMAs shrunk/expanded/deleted as a result of this
  685. * merge.
  686. *
  687. * As part of this operation, if a merge occurs, the @vmg object will have its
  688. * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
  689. * calls to this function should reset these fields.
  690. *
  691. * Returns: The merged VMA if merge succeeds, or NULL otherwise.
  692. *
  693. * ASSUMPTIONS:
  694. * - The caller must assign the VMA to be modified to @vmg->middle.
  695. * - The caller must have set @vmg->prev to the previous VMA, if there is one.
  696. * - The caller must not set @vmg->next, as we determine this.
  697. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
  698. * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end).
  699. */
  700. static __must_check struct vm_area_struct *vma_merge_existing_range(
  701. struct vma_merge_struct *vmg)
  702. {
  703. vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY;
  704. struct vm_area_struct *middle = vmg->middle;
  705. struct vm_area_struct *prev = vmg->prev;
  706. struct vm_area_struct *next;
  707. struct vm_area_struct *anon_dup = NULL;
  708. unsigned long start = vmg->start;
  709. unsigned long end = vmg->end;
  710. bool left_side = middle && start == middle->vm_start;
  711. bool right_side = middle && end == middle->vm_end;
  712. int err = 0;
  713. bool merge_left, merge_right, merge_both;
  714. mmap_assert_write_locked(vmg->mm);
  715. VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */
  716. VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */
  717. VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg);
  718. VM_WARN_ON_VMG(start >= end, vmg);
  719. /*
  720. * If middle == prev, then we are offset into a VMA. Otherwise, if we are
  721. * not, we must span a portion of the VMA.
  722. */
  723. VM_WARN_ON_VMG(middle &&
  724. ((middle != prev && vmg->start != middle->vm_start) ||
  725. vmg->end > middle->vm_end), vmg);
  726. /* The vmi must be positioned within vmg->middle. */
  727. VM_WARN_ON_VMG(middle &&
  728. !(vma_iter_addr(vmg->vmi) >= middle->vm_start &&
  729. vma_iter_addr(vmg->vmi) < middle->vm_end), vmg);
  730. /* An existing merge can never be used by the mremap() logic. */
  731. VM_WARN_ON_VMG(vmg->copied_from, vmg);
  732. vmg->state = VMA_MERGE_NOMERGE;
  733. /*
  734. * If a special mapping or if the range being modified is neither at the
  735. * furthermost left or right side of the VMA, then we have no chance of
  736. * merging and should abort.
  737. */
  738. if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side))
  739. return NULL;
  740. if (left_side)
  741. merge_left = can_vma_merge_left(vmg);
  742. else
  743. merge_left = false;
  744. if (right_side) {
  745. next = vmg->next = vma_iter_next_range(vmg->vmi);
  746. vma_iter_prev_range(vmg->vmi);
  747. merge_right = can_vma_merge_right(vmg, merge_left);
  748. } else {
  749. merge_right = false;
  750. next = NULL;
  751. }
  752. if (merge_left) /* If merging prev, position iterator there. */
  753. vma_prev(vmg->vmi);
  754. else if (!merge_right) /* If we have nothing to merge, abort. */
  755. return NULL;
  756. merge_both = merge_left && merge_right;
  757. /* If we span the entire VMA, a merge implies it will be deleted. */
  758. vmg->__remove_middle = left_side && right_side;
  759. /*
  760. * If we need to remove middle in its entirety but are unable to do so,
  761. * we have no sensible recourse but to abort the merge.
  762. */
  763. if (vmg->__remove_middle && !can_merge_remove_vma(middle))
  764. return NULL;
  765. /*
  766. * If we merge both VMAs, then next is also deleted. This implies
  767. * merge_will_delete_vma also.
  768. */
  769. vmg->__remove_next = merge_both;
  770. /*
  771. * If we cannot delete next, then we can reduce the operation to merging
  772. * prev and middle (thereby deleting middle).
  773. */
  774. if (vmg->__remove_next && !can_merge_remove_vma(next)) {
  775. vmg->__remove_next = false;
  776. merge_right = false;
  777. merge_both = false;
  778. }
  779. /* No matter what happens, we will be adjusting middle. */
  780. vma_start_write(middle);
  781. if (merge_right) {
  782. vma_start_write(next);
  783. vmg->target = next;
  784. sticky_flags |= (next->vm_flags & VM_STICKY);
  785. }
  786. if (merge_left) {
  787. vma_start_write(prev);
  788. vmg->target = prev;
  789. sticky_flags |= (prev->vm_flags & VM_STICKY);
  790. }
  791. if (merge_both) {
  792. /*
  793. * |<-------------------->|
  794. * |-------********-------|
  795. * prev middle next
  796. * extend delete delete
  797. */
  798. vmg->start = prev->vm_start;
  799. vmg->end = next->vm_end;
  800. vmg->pgoff = prev->vm_pgoff;
  801. /*
  802. * We already ensured anon_vma compatibility above, so now it's
  803. * simply a case of, if prev has no anon_vma object, which of
  804. * next or middle contains the anon_vma we must duplicate.
  805. */
  806. err = dup_anon_vma(prev, next->anon_vma ? next : middle,
  807. &anon_dup);
  808. } else if (merge_left) {
  809. /*
  810. * |<------------>| OR
  811. * |<----------------->|
  812. * |-------*************
  813. * prev middle
  814. * extend shrink/delete
  815. */
  816. vmg->start = prev->vm_start;
  817. vmg->pgoff = prev->vm_pgoff;
  818. if (!vmg->__remove_middle)
  819. vmg->__adjust_middle_start = true;
  820. err = dup_anon_vma(prev, middle, &anon_dup);
  821. } else { /* merge_right */
  822. /*
  823. * |<------------->| OR
  824. * |<----------------->|
  825. * *************-------|
  826. * middle next
  827. * shrink/delete extend
  828. */
  829. pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
  830. VM_WARN_ON_VMG(!merge_right, vmg);
  831. /* If we are offset into a VMA, then prev must be middle. */
  832. VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg);
  833. if (vmg->__remove_middle) {
  834. vmg->end = next->vm_end;
  835. vmg->pgoff = next->vm_pgoff - pglen;
  836. } else {
  837. /* We shrink middle and expand next. */
  838. vmg->__adjust_next_start = true;
  839. vmg->start = middle->vm_start;
  840. vmg->end = start;
  841. vmg->pgoff = middle->vm_pgoff;
  842. }
  843. err = dup_anon_vma(next, middle, &anon_dup);
  844. }
  845. if (err || commit_merge(vmg))
  846. goto abort;
  847. vm_flags_set(vmg->target, sticky_flags);
  848. khugepaged_enter_vma(vmg->target, vmg->vm_flags);
  849. vmg->state = VMA_MERGE_SUCCESS;
  850. return vmg->target;
  851. abort:
  852. vma_iter_set(vmg->vmi, start);
  853. vma_iter_load(vmg->vmi);
  854. if (anon_dup)
  855. unlink_anon_vmas(anon_dup);
  856. /*
  857. * This means we have failed to clone anon_vma's correctly, but no
  858. * actual changes to VMAs have occurred, so no harm no foul - if the
  859. * user doesn't want this reported and instead just wants to give up on
  860. * the merge, allow it.
  861. */
  862. if (!vmg->give_up_on_oom)
  863. vmg->state = VMA_MERGE_ERROR_NOMEM;
  864. return NULL;
  865. }
  866. /*
  867. * vma_merge_new_range - Attempt to merge a new VMA into address space
  868. *
  869. * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
  870. * (exclusive), which we try to merge with any adjacent VMAs if possible.
  871. *
  872. * We are about to add a VMA to the address space starting at @vmg->start and
  873. * ending at @vmg->end. There are three different possible scenarios:
  874. *
  875. * 1. There is a VMA with identical properties immediately adjacent to the
  876. * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
  877. * EXPAND that VMA:
  878. *
  879. * Proposed: |-----| or |-----|
  880. * Existing: |----| |----|
  881. *
  882. * 2. There are VMAs with identical properties immediately adjacent to the
  883. * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
  884. * EXPAND the former and REMOVE the latter:
  885. *
  886. * Proposed: |-----|
  887. * Existing: |----| |----|
  888. *
  889. * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
  890. * VMAs do not have identical attributes - NO MERGE POSSIBLE.
  891. *
  892. * In instances where we can merge, this function returns the expanded VMA which
  893. * will have its range adjusted accordingly and the underlying maple tree also
  894. * adjusted.
  895. *
  896. * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
  897. * to the VMA we expanded.
  898. *
  899. * This function adjusts @vmg to provide @vmg->next if not already specified,
  900. * and adjusts [@vmg->start, @vmg->end) to span the expanded range.
  901. *
  902. * ASSUMPTIONS:
  903. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
  904. * - The caller must have determined that [@vmg->start, @vmg->end) is empty,
  905. other than VMAs that will be unmapped should the operation succeed.
  906. * - The caller must have specified the previous vma in @vmg->prev.
  907. * - The caller must have specified the next vma in @vmg->next.
  908. * - The caller must have positioned the vmi at or before the gap.
  909. */
  910. struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
  911. {
  912. struct vm_area_struct *prev = vmg->prev;
  913. struct vm_area_struct *next = vmg->next;
  914. unsigned long end = vmg->end;
  915. bool can_merge_left, can_merge_right;
  916. mmap_assert_write_locked(vmg->mm);
  917. VM_WARN_ON_VMG(vmg->middle, vmg);
  918. VM_WARN_ON_VMG(vmg->target, vmg);
  919. /* vmi must point at or before the gap. */
  920. VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);
  921. vmg->state = VMA_MERGE_NOMERGE;
  922. /* Special VMAs are unmergeable, also if no prev/next. */
  923. if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next))
  924. return NULL;
  925. can_merge_left = can_vma_merge_left(vmg);
  926. can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left);
  927. /* If we can merge with the next VMA, adjust vmg accordingly. */
  928. if (can_merge_right) {
  929. vmg->end = next->vm_end;
  930. vmg->target = next;
  931. }
  932. /* If we can merge with the previous VMA, adjust vmg accordingly. */
  933. if (can_merge_left) {
  934. vmg->start = prev->vm_start;
  935. vmg->target = prev;
  936. vmg->pgoff = prev->vm_pgoff;
  937. /*
  938. * If this merge would result in removal of the next VMA but we
  939. * are not permitted to do so, reduce the operation to merging
  940. * prev and vma.
  941. */
  942. if (can_merge_right && !can_merge_remove_vma(next))
  943. vmg->end = end;
  944. /* In expand-only case we are already positioned at prev. */
  945. if (!vmg->just_expand) {
  946. /* Equivalent to going to the previous range. */
  947. vma_prev(vmg->vmi);
  948. }
  949. }
  950. /*
  951. * Now try to expand adjacent VMA(s). This takes care of removing the
  952. * following VMA if we have VMAs on both sides.
  953. */
  954. if (vmg->target && !vma_expand(vmg)) {
  955. khugepaged_enter_vma(vmg->target, vmg->vm_flags);
  956. vmg->state = VMA_MERGE_SUCCESS;
  957. return vmg->target;
  958. }
  959. return NULL;
  960. }
  961. /*
  962. * vma_merge_copied_range - Attempt to merge a VMA that is being copied by
  963. * mremap()
  964. *
  965. * @vmg: Describes the VMA we are adding, in the copied-to range @vmg->start to
  966. * @vmg->end (exclusive), which we try to merge with any adjacent VMAs if
  967. * possible.
  968. *
  969. * vmg->prev, next, start, end, pgoff should all be relative to the COPIED TO
  970. * range, i.e. the target range for the VMA.
  971. *
  972. * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
  973. * to the VMA we expanded.
  974. *
  975. * ASSUMPTIONS: Same as vma_merge_new_range(), except vmg->middle must contain
  976. * the copied-from VMA.
  977. */
  978. static struct vm_area_struct *vma_merge_copied_range(struct vma_merge_struct *vmg)
  979. {
  980. /* We must have a copied-from VMA. */
  981. VM_WARN_ON_VMG(!vmg->middle, vmg);
  982. vmg->copied_from = vmg->middle;
  983. vmg->middle = NULL;
  984. return vma_merge_new_range(vmg);
  985. }
  986. /*
  987. * vma_expand - Expand an existing VMA
  988. *
  989. * @vmg: Describes a VMA expansion operation.
  990. *
  991. * Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
  992. * Will expand over vmg->next if it's different from vmg->target and vmg->end ==
  993. * vmg->next->vm_end. Checking if the vmg->target can expand and merge with
  994. * vmg->next needs to be handled by the caller.
  995. *
  996. * Returns: 0 on success.
  997. *
  998. * ASSUMPTIONS:
  999. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
  1000. * - The caller must have set @vmg->target and @vmg->next.
  1001. */
  1002. int vma_expand(struct vma_merge_struct *vmg)
  1003. {
  1004. struct vm_area_struct *anon_dup = NULL;
  1005. struct vm_area_struct *target = vmg->target;
  1006. struct vm_area_struct *next = vmg->next;
  1007. bool remove_next = false;
  1008. vm_flags_t sticky_flags;
  1009. int ret = 0;
  1010. mmap_assert_write_locked(vmg->mm);
  1011. vma_start_write(target);
  1012. if (next && target != next && vmg->end == next->vm_end)
  1013. remove_next = true;
  1014. /* We must have a target. */
  1015. VM_WARN_ON_VMG(!target, vmg);
  1016. /* This should have already been checked by this point. */
  1017. VM_WARN_ON_VMG(remove_next && !can_merge_remove_vma(next), vmg);
  1018. /* Not merging but overwriting any part of next is not handled. */
  1019. VM_WARN_ON_VMG(next && !remove_next &&
  1020. next != target && vmg->end > next->vm_start, vmg);
  1021. /* Only handles expanding. */
  1022. VM_WARN_ON_VMG(target->vm_start < vmg->start ||
  1023. target->vm_end > vmg->end, vmg);
  1024. sticky_flags = vmg->vm_flags & VM_STICKY;
  1025. sticky_flags |= target->vm_flags & VM_STICKY;
  1026. if (remove_next)
  1027. sticky_flags |= next->vm_flags & VM_STICKY;
  1028. /*
  1029. * If we are removing the next VMA or copying from a VMA
  1030. * (e.g. mremap()'ing), we must propagate anon_vma state.
  1031. *
  1032. * Note that, by convention, callers ignore OOM for this case, so
  1033. * we don't need to account for vmg->give_up_on_mm here.
  1034. */
  1035. if (remove_next)
  1036. ret = dup_anon_vma(target, next, &anon_dup);
  1037. if (!ret && vmg->copied_from)
  1038. ret = dup_anon_vma(target, vmg->copied_from, &anon_dup);
  1039. if (ret)
  1040. return ret;
  1041. if (remove_next) {
  1042. vma_start_write(next);
  1043. vmg->__remove_next = true;
  1044. }
  1045. if (commit_merge(vmg))
  1046. goto nomem;
  1047. vm_flags_set(target, sticky_flags);
  1048. return 0;
  1049. nomem:
  1050. if (anon_dup)
  1051. unlink_anon_vmas(anon_dup);
  1052. /*
  1053. * If the user requests that we just give upon OOM, we are safe to do so
  1054. * here, as commit merge provides this contract to us. Nothing has been
  1055. * changed - no harm no foul, just don't report it.
  1056. */
  1057. if (!vmg->give_up_on_oom)
  1058. vmg->state = VMA_MERGE_ERROR_NOMEM;
  1059. return -ENOMEM;
  1060. }
  1061. /*
  1062. * vma_shrink() - Reduce an existing VMAs memory area
  1063. * @vmi: The vma iterator
  1064. * @vma: The VMA to modify
  1065. * @start: The new start
  1066. * @end: The new end
  1067. *
  1068. * Returns: 0 on success, -ENOMEM otherwise
  1069. */
  1070. int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
  1071. unsigned long start, unsigned long end, pgoff_t pgoff)
  1072. {
  1073. struct vma_prepare vp;
  1074. WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
  1075. if (vma->vm_start < start)
  1076. vma_iter_config(vmi, vma->vm_start, start);
  1077. else
  1078. vma_iter_config(vmi, end, vma->vm_end);
  1079. if (vma_iter_prealloc(vmi, NULL))
  1080. return -ENOMEM;
  1081. vma_start_write(vma);
  1082. init_vma_prep(&vp, vma);
  1083. vma_prepare(&vp);
  1084. vma_adjust_trans_huge(vma, start, end, NULL);
  1085. vma_iter_clear(vmi);
  1086. vma_set_range(vma, start, end, pgoff);
  1087. vma_complete(&vp, vmi, vma->vm_mm);
  1088. validate_mm(vma->vm_mm);
  1089. return 0;
  1090. }
  1091. static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
  1092. struct ma_state *mas_detach, bool mm_wr_locked)
  1093. {
  1094. struct unmap_desc unmap = {
  1095. .mas = mas_detach,
  1096. .first = vms->vma,
  1097. /* start and end may be different if there is no prev or next vma. */
  1098. .pg_start = vms->unmap_start,
  1099. .pg_end = vms->unmap_end,
  1100. .vma_start = vms->start,
  1101. .vma_end = vms->end,
  1102. /*
  1103. * The tree limits and reset differ from the normal case since it's a
  1104. * side-tree
  1105. */
  1106. .tree_reset = 1,
  1107. .tree_end = vms->vma_count,
  1108. /*
  1109. * We can free page tables without write-locking mmap_lock because VMAs
  1110. * were isolated before we downgraded mmap_lock.
  1111. */
  1112. .mm_wr_locked = mm_wr_locked,
  1113. };
  1114. if (!vms->clear_ptes) /* Nothing to do */
  1115. return;
  1116. mas_set(mas_detach, 1);
  1117. unmap_region(&unmap);
  1118. vms->clear_ptes = false;
  1119. }
  1120. static void vms_clean_up_area(struct vma_munmap_struct *vms,
  1121. struct ma_state *mas_detach)
  1122. {
  1123. struct vm_area_struct *vma;
  1124. if (!vms->nr_pages)
  1125. return;
  1126. vms_clear_ptes(vms, mas_detach, true);
  1127. mas_set(mas_detach, 0);
  1128. mas_for_each(mas_detach, vma, ULONG_MAX)
  1129. vma_close(vma);
  1130. }
  1131. /*
  1132. * vms_complete_munmap_vmas() - Finish the munmap() operation
  1133. * @vms: The vma munmap struct
  1134. * @mas_detach: The maple state of the detached vmas
  1135. *
  1136. * This updates the mm_struct, unmaps the region, frees the resources
  1137. * used for the munmap() and may downgrade the lock - if requested. Everything
  1138. * needed to be done once the vma maple tree is updated.
  1139. */
  1140. static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
  1141. struct ma_state *mas_detach)
  1142. {
  1143. struct vm_area_struct *vma;
  1144. struct mm_struct *mm;
  1145. mm = current->mm;
  1146. mm->map_count -= vms->vma_count;
  1147. mm->locked_vm -= vms->locked_vm;
  1148. if (vms->unlock)
  1149. mmap_write_downgrade(mm);
  1150. if (!vms->nr_pages)
  1151. return;
  1152. vms_clear_ptes(vms, mas_detach, !vms->unlock);
  1153. /* Update high watermark before we lower total_vm */
  1154. update_hiwater_vm(mm);
  1155. /* Stat accounting */
  1156. WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages);
  1157. /* Paranoid bookkeeping */
  1158. VM_WARN_ON(vms->exec_vm > mm->exec_vm);
  1159. VM_WARN_ON(vms->stack_vm > mm->stack_vm);
  1160. VM_WARN_ON(vms->data_vm > mm->data_vm);
  1161. mm->exec_vm -= vms->exec_vm;
  1162. mm->stack_vm -= vms->stack_vm;
  1163. mm->data_vm -= vms->data_vm;
  1164. /* Remove and clean up vmas */
  1165. mas_set(mas_detach, 0);
  1166. mas_for_each(mas_detach, vma, ULONG_MAX)
  1167. remove_vma(vma);
  1168. vm_unacct_memory(vms->nr_accounted);
  1169. validate_mm(mm);
  1170. if (vms->unlock)
  1171. mmap_read_unlock(mm);
  1172. __mt_destroy(mas_detach->tree);
  1173. }
  1174. /*
  1175. * reattach_vmas() - Undo any munmap work and free resources
  1176. * @mas_detach: The maple state with the detached maple tree
  1177. *
  1178. * Reattach any detached vmas and free up the maple tree used to track the vmas.
  1179. */
  1180. static void reattach_vmas(struct ma_state *mas_detach)
  1181. {
  1182. struct vm_area_struct *vma;
  1183. mas_set(mas_detach, 0);
  1184. mas_for_each(mas_detach, vma, ULONG_MAX)
  1185. vma_mark_attached(vma);
  1186. __mt_destroy(mas_detach->tree);
  1187. }
  1188. /*
  1189. * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
  1190. * for removal at a later date. Handles splitting first and last if necessary
  1191. * and marking the vmas as isolated.
  1192. *
  1193. * @vms: The vma munmap struct
  1194. * @mas_detach: The maple state tracking the detached tree
  1195. *
  1196. * Return: 0 on success, error otherwise
  1197. */
  1198. static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
  1199. struct ma_state *mas_detach)
  1200. {
  1201. struct vm_area_struct *next = NULL;
  1202. int error;
  1203. /*
  1204. * If we need to split any vma, do it now to save pain later.
  1205. * Does it split the first one?
  1206. */
  1207. if (vms->start > vms->vma->vm_start) {
  1208. /*
  1209. * Make sure that map_count on return from munmap() will
  1210. * not exceed its limit; but let map_count go just above
  1211. * its limit temporarily, to help free resources as expected.
  1212. */
  1213. if (vms->end < vms->vma->vm_end &&
  1214. vms->vma->vm_mm->map_count >= sysctl_max_map_count) {
  1215. error = -ENOMEM;
  1216. goto map_count_exceeded;
  1217. }
  1218. /* Don't bother splitting the VMA if we can't unmap it anyway */
  1219. if (vma_is_sealed(vms->vma)) {
  1220. error = -EPERM;
  1221. goto start_split_failed;
  1222. }
  1223. error = __split_vma(vms->vmi, vms->vma, vms->start, 1);
  1224. if (error)
  1225. goto start_split_failed;
  1226. }
  1227. vms->prev = vma_prev(vms->vmi);
  1228. if (vms->prev)
  1229. vms->unmap_start = vms->prev->vm_end;
  1230. /*
  1231. * Detach a range of VMAs from the mm. Using next as a temp variable as
  1232. * it is always overwritten.
  1233. */
  1234. for_each_vma_range(*(vms->vmi), next, vms->end) {
  1235. long nrpages;
  1236. if (vma_is_sealed(next)) {
  1237. error = -EPERM;
  1238. goto modify_vma_failed;
  1239. }
  1240. /* Does it split the end? */
  1241. if (next->vm_end > vms->end) {
  1242. error = __split_vma(vms->vmi, next, vms->end, 0);
  1243. if (error)
  1244. goto end_split_failed;
  1245. }
  1246. vma_start_write(next);
  1247. mas_set(mas_detach, vms->vma_count++);
  1248. error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
  1249. if (error)
  1250. goto munmap_gather_failed;
  1251. vma_mark_detached(next);
  1252. nrpages = vma_pages(next);
  1253. vms->nr_pages += nrpages;
  1254. if (next->vm_flags & VM_LOCKED)
  1255. vms->locked_vm += nrpages;
  1256. if (next->vm_flags & VM_ACCOUNT)
  1257. vms->nr_accounted += nrpages;
  1258. if (is_exec_mapping(next->vm_flags))
  1259. vms->exec_vm += nrpages;
  1260. else if (is_stack_mapping(next->vm_flags))
  1261. vms->stack_vm += nrpages;
  1262. else if (is_data_mapping(next->vm_flags))
  1263. vms->data_vm += nrpages;
  1264. if (vms->uf) {
  1265. /*
  1266. * If userfaultfd_unmap_prep returns an error the vmas
  1267. * will remain split, but userland will get a
  1268. * highly unexpected error anyway. This is no
  1269. * different than the case where the first of the two
  1270. * __split_vma fails, but we don't undo the first
  1271. * split, despite we could. This is unlikely enough
  1272. * failure that it's not worth optimizing it for.
  1273. */
  1274. error = userfaultfd_unmap_prep(next, vms->start,
  1275. vms->end, vms->uf);
  1276. if (error)
  1277. goto userfaultfd_error;
  1278. }
  1279. #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
  1280. BUG_ON(next->vm_start < vms->start);
  1281. BUG_ON(next->vm_start > vms->end);
  1282. #endif
  1283. }
  1284. vms->next = vma_next(vms->vmi);
  1285. if (vms->next)
  1286. vms->unmap_end = vms->next->vm_start;
  1287. #if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
  1288. /* Make sure no VMAs are about to be lost. */
  1289. {
  1290. MA_STATE(test, mas_detach->tree, 0, 0);
  1291. struct vm_area_struct *vma_mas, *vma_test;
  1292. int test_count = 0;
  1293. vma_iter_set(vms->vmi, vms->start);
  1294. rcu_read_lock();
  1295. vma_test = mas_find(&test, vms->vma_count - 1);
  1296. for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
  1297. BUG_ON(vma_mas != vma_test);
  1298. test_count++;
  1299. vma_test = mas_next(&test, vms->vma_count - 1);
  1300. }
  1301. rcu_read_unlock();
  1302. BUG_ON(vms->vma_count != test_count);
  1303. }
  1304. #endif
  1305. while (vma_iter_addr(vms->vmi) > vms->start)
  1306. vma_iter_prev_range(vms->vmi);
  1307. vms->clear_ptes = true;
  1308. return 0;
  1309. userfaultfd_error:
  1310. munmap_gather_failed:
  1311. end_split_failed:
  1312. modify_vma_failed:
  1313. reattach_vmas(mas_detach);
  1314. start_split_failed:
  1315. map_count_exceeded:
  1316. return error;
  1317. }
  1318. /*
  1319. * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
  1320. * @vms: The vma munmap struct
  1321. * @vmi: The vma iterator
  1322. * @vma: The first vm_area_struct to munmap
  1323. * @start: The aligned start address to munmap
  1324. * @end: The aligned end address to munmap
  1325. * @uf: The userfaultfd list_head
  1326. * @unlock: Unlock after the operation. Only unlocked on success
  1327. */
  1328. static void init_vma_munmap(struct vma_munmap_struct *vms,
  1329. struct vma_iterator *vmi, struct vm_area_struct *vma,
  1330. unsigned long start, unsigned long end, struct list_head *uf,
  1331. bool unlock)
  1332. {
  1333. vms->vmi = vmi;
  1334. vms->vma = vma;
  1335. if (vma) {
  1336. vms->start = start;
  1337. vms->end = end;
  1338. } else {
  1339. vms->start = vms->end = 0;
  1340. }
  1341. vms->unlock = unlock;
  1342. vms->uf = uf;
  1343. vms->vma_count = 0;
  1344. vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
  1345. vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
  1346. vms->unmap_start = FIRST_USER_ADDRESS;
  1347. vms->unmap_end = USER_PGTABLES_CEILING;
  1348. vms->clear_ptes = false;
  1349. }
  1350. /*
  1351. * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
  1352. * @vmi: The vma iterator
  1353. * @vma: The starting vm_area_struct
  1354. * @mm: The mm_struct
  1355. * @start: The aligned start address to munmap.
  1356. * @end: The aligned end address to munmap.
  1357. * @uf: The userfaultfd list_head
  1358. * @unlock: Set to true to drop the mmap_lock. unlocking only happens on
  1359. * success.
  1360. *
  1361. * Return: 0 on success and drops the lock if so directed, error and leaves the
  1362. * lock held otherwise.
  1363. */
  1364. int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
  1365. struct mm_struct *mm, unsigned long start, unsigned long end,
  1366. struct list_head *uf, bool unlock)
  1367. {
  1368. struct maple_tree mt_detach;
  1369. MA_STATE(mas_detach, &mt_detach, 0, 0);
  1370. mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
  1371. mt_on_stack(mt_detach);
  1372. struct vma_munmap_struct vms;
  1373. int error;
  1374. init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
  1375. error = vms_gather_munmap_vmas(&vms, &mas_detach);
  1376. if (error)
  1377. goto gather_failed;
  1378. error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
  1379. if (error)
  1380. goto clear_tree_failed;
  1381. /* Point of no return */
  1382. vms_complete_munmap_vmas(&vms, &mas_detach);
  1383. return 0;
  1384. clear_tree_failed:
  1385. reattach_vmas(&mas_detach);
  1386. gather_failed:
  1387. validate_mm(mm);
  1388. return error;
  1389. }
  1390. /*
  1391. * do_vmi_munmap() - munmap a given range.
  1392. * @vmi: The vma iterator
  1393. * @mm: The mm_struct
  1394. * @start: The start address to munmap
  1395. * @len: The length of the range to munmap
  1396. * @uf: The userfaultfd list_head
  1397. * @unlock: set to true if the user wants to drop the mmap_lock on success
  1398. *
  1399. * This function takes a @mas that is either pointing to the previous VMA or set
  1400. * to MA_START and sets it up to remove the mapping(s). The @len will be
  1401. * aligned.
  1402. *
  1403. * Return: 0 on success and drops the lock if so directed, error and leaves the
  1404. * lock held otherwise.
  1405. */
  1406. int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
  1407. unsigned long start, size_t len, struct list_head *uf,
  1408. bool unlock)
  1409. {
  1410. unsigned long end;
  1411. struct vm_area_struct *vma;
  1412. if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
  1413. return -EINVAL;
  1414. end = start + PAGE_ALIGN(len);
  1415. if (end == start)
  1416. return -EINVAL;
  1417. /* Find the first overlapping VMA */
  1418. vma = vma_find(vmi, end);
  1419. if (!vma) {
  1420. if (unlock)
  1421. mmap_write_unlock(mm);
  1422. return 0;
  1423. }
  1424. return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
  1425. }
  1426. /*
  1427. * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
  1428. * context and anonymous VMA name within the range [start, end).
  1429. *
  1430. * As a result, we might be able to merge the newly modified VMA range with an
  1431. * adjacent VMA with identical properties.
  1432. *
  1433. * If no merge is possible and the range does not span the entirety of the VMA,
  1434. * we then need to split the VMA to accommodate the change.
  1435. *
  1436. * The function returns either the merged VMA, the original VMA if a split was
  1437. * required instead, or an error if the split failed.
  1438. */
  1439. static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
  1440. {
  1441. struct vm_area_struct *vma = vmg->middle;
  1442. unsigned long start = vmg->start;
  1443. unsigned long end = vmg->end;
  1444. struct vm_area_struct *merged;
  1445. /* First, try to merge. */
  1446. merged = vma_merge_existing_range(vmg);
  1447. if (merged)
  1448. return merged;
  1449. if (vmg_nomem(vmg))
  1450. return ERR_PTR(-ENOMEM);
  1451. /*
  1452. * Split can fail for reasons other than OOM, so if the user requests
  1453. * this it's probably a mistake.
  1454. */
  1455. VM_WARN_ON(vmg->give_up_on_oom &&
  1456. (vma->vm_start != start || vma->vm_end != end));
  1457. /* Split any preceding portion of the VMA. */
  1458. if (vma->vm_start < start) {
  1459. int err = split_vma(vmg->vmi, vma, start, 1);
  1460. if (err)
  1461. return ERR_PTR(err);
  1462. }
  1463. /* Split any trailing portion of the VMA. */
  1464. if (vma->vm_end > end) {
  1465. int err = split_vma(vmg->vmi, vma, end, 0);
  1466. if (err)
  1467. return ERR_PTR(err);
  1468. }
  1469. return vma;
  1470. }
  1471. struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
  1472. struct vm_area_struct *prev, struct vm_area_struct *vma,
  1473. unsigned long start, unsigned long end,
  1474. vm_flags_t *vm_flags_ptr)
  1475. {
  1476. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1477. const vm_flags_t vm_flags = *vm_flags_ptr;
  1478. struct vm_area_struct *ret;
  1479. vmg.vm_flags = vm_flags;
  1480. ret = vma_modify(&vmg);
  1481. if (IS_ERR(ret))
  1482. return ret;
  1483. /*
  1484. * For a merge to succeed, the flags must match those
  1485. * requested. However, sticky flags may have been retained, so propagate
  1486. * them to the caller.
  1487. */
  1488. if (vmg.state == VMA_MERGE_SUCCESS)
  1489. *vm_flags_ptr = ret->vm_flags;
  1490. return ret;
  1491. }
  1492. struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi,
  1493. struct vm_area_struct *prev, struct vm_area_struct *vma,
  1494. unsigned long start, unsigned long end,
  1495. struct anon_vma_name *new_name)
  1496. {
  1497. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1498. vmg.anon_name = new_name;
  1499. return vma_modify(&vmg);
  1500. }
  1501. struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi,
  1502. struct vm_area_struct *prev, struct vm_area_struct *vma,
  1503. unsigned long start, unsigned long end,
  1504. struct mempolicy *new_pol)
  1505. {
  1506. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1507. vmg.policy = new_pol;
  1508. return vma_modify(&vmg);
  1509. }
  1510. struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi,
  1511. struct vm_area_struct *prev, struct vm_area_struct *vma,
  1512. unsigned long start, unsigned long end, vm_flags_t vm_flags,
  1513. struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom)
  1514. {
  1515. VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
  1516. vmg.vm_flags = vm_flags;
  1517. vmg.uffd_ctx = new_ctx;
  1518. if (give_up_on_oom)
  1519. vmg.give_up_on_oom = true;
  1520. return vma_modify(&vmg);
  1521. }
  1522. /*
  1523. * Expand vma by delta bytes, potentially merging with an immediately adjacent
  1524. * VMA with identical properties.
  1525. */
  1526. struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
  1527. struct vm_area_struct *vma,
  1528. unsigned long delta)
  1529. {
  1530. VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);
  1531. vmg.next = vma_iter_next_rewind(vmi, NULL);
  1532. vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */
  1533. return vma_merge_new_range(&vmg);
  1534. }
  1535. void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
  1536. {
  1537. vb->count = 0;
  1538. }
  1539. static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
  1540. {
  1541. struct address_space *mapping;
  1542. int i;
  1543. mapping = vb->vmas[0]->vm_file->f_mapping;
  1544. i_mmap_lock_write(mapping);
  1545. for (i = 0; i < vb->count; i++) {
  1546. VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
  1547. __remove_shared_vm_struct(vb->vmas[i], mapping);
  1548. }
  1549. i_mmap_unlock_write(mapping);
  1550. unlink_file_vma_batch_init(vb);
  1551. }
  1552. void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
  1553. struct vm_area_struct *vma)
  1554. {
  1555. if (vma->vm_file == NULL)
  1556. return;
  1557. if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
  1558. vb->count == ARRAY_SIZE(vb->vmas))
  1559. unlink_file_vma_batch_process(vb);
  1560. vb->vmas[vb->count] = vma;
  1561. vb->count++;
  1562. }
  1563. void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
  1564. {
  1565. if (vb->count > 0)
  1566. unlink_file_vma_batch_process(vb);
  1567. }
  1568. static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock)
  1569. {
  1570. struct file *file = vma->vm_file;
  1571. struct address_space *mapping;
  1572. if (file) {
  1573. mapping = file->f_mapping;
  1574. i_mmap_lock_write(mapping);
  1575. __vma_link_file(vma, mapping);
  1576. if (!hold_rmap_lock)
  1577. i_mmap_unlock_write(mapping);
  1578. }
  1579. }
  1580. static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
  1581. {
  1582. VMA_ITERATOR(vmi, mm, 0);
  1583. vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
  1584. if (vma_iter_prealloc(&vmi, vma))
  1585. return -ENOMEM;
  1586. vma_start_write(vma);
  1587. vma_iter_store_new(&vmi, vma);
  1588. vma_link_file(vma, /* hold_rmap_lock= */false);
  1589. mm->map_count++;
  1590. validate_mm(mm);
  1591. return 0;
  1592. }
  1593. /*
  1594. * Copy the vma structure to a new location in the same mm,
  1595. * prior to moving page table entries, to effect an mremap move.
  1596. */
  1597. struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
  1598. unsigned long addr, unsigned long len, pgoff_t pgoff,
  1599. bool *need_rmap_locks)
  1600. {
  1601. struct vm_area_struct *vma = *vmap;
  1602. unsigned long vma_start = vma->vm_start;
  1603. struct mm_struct *mm = vma->vm_mm;
  1604. struct vm_area_struct *new_vma;
  1605. bool faulted_in_anon_vma = true;
  1606. VMA_ITERATOR(vmi, mm, addr);
  1607. VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len);
  1608. /*
  1609. * If anonymous vma has not yet been faulted, update new pgoff
  1610. * to match new location, to increase its chance of merging.
  1611. */
  1612. if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
  1613. pgoff = addr >> PAGE_SHIFT;
  1614. faulted_in_anon_vma = false;
  1615. }
  1616. /*
  1617. * If the VMA we are copying might contain a uprobe PTE, ensure
  1618. * that we do not establish one upon merge. Otherwise, when mremap()
  1619. * moves page tables, it will orphan the newly created PTE.
  1620. */
  1621. if (vma->vm_file)
  1622. vmg.skip_vma_uprobe = true;
  1623. new_vma = find_vma_prev(mm, addr, &vmg.prev);
  1624. if (new_vma && new_vma->vm_start < addr + len)
  1625. return NULL; /* should never get here */
  1626. vmg.pgoff = pgoff;
  1627. vmg.next = vma_iter_next_rewind(&vmi, NULL);
  1628. new_vma = vma_merge_copied_range(&vmg);
  1629. if (new_vma) {
  1630. /*
  1631. * Source vma may have been merged into new_vma
  1632. */
  1633. if (unlikely(vma_start >= new_vma->vm_start &&
  1634. vma_start < new_vma->vm_end)) {
  1635. /*
  1636. * The only way we can get a vma_merge with
  1637. * self during an mremap is if the vma hasn't
  1638. * been faulted in yet and we were allowed to
  1639. * reset the dst vma->vm_pgoff to the
  1640. * destination address of the mremap to allow
  1641. * the merge to happen. mremap must change the
  1642. * vm_pgoff linearity between src and dst vmas
  1643. * (in turn preventing a vma_merge) to be
  1644. * safe. It is only safe to keep the vm_pgoff
  1645. * linear if there are no pages mapped yet.
  1646. */
  1647. VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
  1648. *vmap = vma = new_vma;
  1649. }
  1650. *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
  1651. } else {
  1652. new_vma = vm_area_dup(vma);
  1653. if (!new_vma)
  1654. goto out;
  1655. vma_set_range(new_vma, addr, addr + len, pgoff);
  1656. if (vma_dup_policy(vma, new_vma))
  1657. goto out_free_vma;
  1658. if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP))
  1659. goto out_free_mempol;
  1660. if (new_vma->vm_file)
  1661. get_file(new_vma->vm_file);
  1662. if (new_vma->vm_ops && new_vma->vm_ops->open)
  1663. new_vma->vm_ops->open(new_vma);
  1664. if (vma_link(mm, new_vma))
  1665. goto out_vma_link;
  1666. *need_rmap_locks = false;
  1667. }
  1668. return new_vma;
  1669. out_vma_link:
  1670. fixup_hugetlb_reservations(new_vma);
  1671. vma_close(new_vma);
  1672. if (new_vma->vm_file)
  1673. fput(new_vma->vm_file);
  1674. unlink_anon_vmas(new_vma);
  1675. out_free_mempol:
  1676. mpol_put(vma_policy(new_vma));
  1677. out_free_vma:
  1678. vm_area_free(new_vma);
  1679. out:
  1680. return NULL;
  1681. }
  1682. /*
  1683. * Rough compatibility check to quickly see if it's even worth looking
  1684. * at sharing an anon_vma.
  1685. *
  1686. * They need to have the same vm_file, and the flags can only differ
  1687. * in things that mprotect may change.
  1688. *
  1689. * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
  1690. * we can merge the two vma's. For example, we refuse to merge a vma if
  1691. * there is a vm_ops->close() function, because that indicates that the
  1692. * driver is doing some kind of reference counting. But that doesn't
  1693. * really matter for the anon_vma sharing case.
  1694. */
  1695. static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
  1696. {
  1697. return a->vm_end == b->vm_start &&
  1698. mpol_equal(vma_policy(a), vma_policy(b)) &&
  1699. a->vm_file == b->vm_file &&
  1700. !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) &&
  1701. b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
  1702. }
  1703. /*
  1704. * Do some basic sanity checking to see if we can re-use the anon_vma
  1705. * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
  1706. * the same as 'old', the other will be the new one that is trying
  1707. * to share the anon_vma.
  1708. *
  1709. * NOTE! This runs with mmap_lock held for reading, so it is possible that
  1710. * the anon_vma of 'old' is concurrently in the process of being set up
  1711. * by another page fault trying to merge _that_. But that's ok: if it
  1712. * is being set up, that automatically means that it will be a singleton
  1713. * acceptable for merging, so we can do all of this optimistically. But
  1714. * we do that READ_ONCE() to make sure that we never re-load the pointer.
  1715. *
  1716. * IOW: that the "list_is_singular()" test on the anon_vma_chain only
  1717. * matters for the 'stable anon_vma' case (ie the thing we want to avoid
  1718. * is to return an anon_vma that is "complex" due to having gone through
  1719. * a fork).
  1720. *
  1721. * We also make sure that the two vma's are compatible (adjacent,
  1722. * and with the same memory policies). That's all stable, even with just
  1723. * a read lock on the mmap_lock.
  1724. */
  1725. static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
  1726. struct vm_area_struct *a,
  1727. struct vm_area_struct *b)
  1728. {
  1729. if (anon_vma_compatible(a, b)) {
  1730. struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
  1731. if (anon_vma && list_is_singular(&old->anon_vma_chain))
  1732. return anon_vma;
  1733. }
  1734. return NULL;
  1735. }
  1736. /*
  1737. * find_mergeable_anon_vma is used by anon_vma_prepare, to check
  1738. * neighbouring vmas for a suitable anon_vma, before it goes off
  1739. * to allocate a new anon_vma. It checks because a repetitive
  1740. * sequence of mprotects and faults may otherwise lead to distinct
  1741. * anon_vmas being allocated, preventing vma merge in subsequent
  1742. * mprotect.
  1743. */
  1744. struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
  1745. {
  1746. struct anon_vma *anon_vma = NULL;
  1747. struct vm_area_struct *prev, *next;
  1748. VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
  1749. /* Try next first. */
  1750. next = vma_iter_load(&vmi);
  1751. if (next) {
  1752. anon_vma = reusable_anon_vma(next, vma, next);
  1753. if (anon_vma)
  1754. return anon_vma;
  1755. }
  1756. prev = vma_prev(&vmi);
  1757. VM_BUG_ON_VMA(prev != vma, vma);
  1758. prev = vma_prev(&vmi);
  1759. /* Try prev next. */
  1760. if (prev)
  1761. anon_vma = reusable_anon_vma(prev, prev, vma);
  1762. /*
  1763. * We might reach here with anon_vma == NULL if we can't find
  1764. * any reusable anon_vma.
  1765. * There's no absolute need to look only at touching neighbours:
  1766. * we could search further afield for "compatible" anon_vmas.
  1767. * But it would probably just be a waste of time searching,
  1768. * or lead to too many vmas hanging off the same anon_vma.
  1769. * We're trying to allow mprotect remerging later on,
  1770. * not trying to minimize memory used for anon_vmas.
  1771. */
  1772. return anon_vma;
  1773. }
  1774. static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
  1775. {
  1776. return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
  1777. }
  1778. static bool vma_is_shared_writable(struct vm_area_struct *vma)
  1779. {
  1780. return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
  1781. (VM_WRITE | VM_SHARED);
  1782. }
  1783. static bool vma_fs_can_writeback(struct vm_area_struct *vma)
  1784. {
  1785. /* No managed pages to writeback. */
  1786. if (vma->vm_flags & VM_PFNMAP)
  1787. return false;
  1788. return vma->vm_file && vma->vm_file->f_mapping &&
  1789. mapping_can_writeback(vma->vm_file->f_mapping);
  1790. }
  1791. /*
  1792. * Does this VMA require the underlying folios to have their dirty state
  1793. * tracked?
  1794. */
  1795. bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
  1796. {
  1797. /* Only shared, writable VMAs require dirty tracking. */
  1798. if (!vma_is_shared_writable(vma))
  1799. return false;
  1800. /* Does the filesystem need to be notified? */
  1801. if (vm_ops_needs_writenotify(vma->vm_ops))
  1802. return true;
  1803. /*
  1804. * Even if the filesystem doesn't indicate a need for writenotify, if it
  1805. * can writeback, dirty tracking is still required.
  1806. */
  1807. return vma_fs_can_writeback(vma);
  1808. }
  1809. /*
  1810. * Some shared mappings will want the pages marked read-only
  1811. * to track write events. If so, we'll downgrade vm_page_prot
  1812. * to the private version (using protection_map[] without the
  1813. * VM_SHARED bit).
  1814. */
  1815. bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
  1816. {
  1817. /* If it was private or non-writable, the write bit is already clear */
  1818. if (!vma_is_shared_writable(vma))
  1819. return false;
  1820. /* The backer wishes to know when pages are first written to? */
  1821. if (vm_ops_needs_writenotify(vma->vm_ops))
  1822. return true;
  1823. /* The open routine did something to the protections that pgprot_modify
  1824. * won't preserve? */
  1825. if (pgprot_val(vm_page_prot) !=
  1826. pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
  1827. return false;
  1828. /*
  1829. * Do we need to track softdirty? hugetlb does not support softdirty
  1830. * tracking yet.
  1831. */
  1832. if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
  1833. return true;
  1834. /* Do we need write faults for uffd-wp tracking? */
  1835. if (userfaultfd_wp(vma))
  1836. return true;
  1837. /* Can the mapping track the dirty pages? */
  1838. return vma_fs_can_writeback(vma);
  1839. }
  1840. static DEFINE_MUTEX(mm_all_locks_mutex);
  1841. static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
  1842. {
  1843. if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
  1844. /*
  1845. * The LSB of head.next can't change from under us
  1846. * because we hold the mm_all_locks_mutex.
  1847. */
  1848. down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
  1849. /*
  1850. * We can safely modify head.next after taking the
  1851. * anon_vma->root->rwsem. If some other vma in this mm shares
  1852. * the same anon_vma we won't take it again.
  1853. *
  1854. * No need of atomic instructions here, head.next
  1855. * can't change from under us thanks to the
  1856. * anon_vma->root->rwsem.
  1857. */
  1858. if (__test_and_set_bit(0, (unsigned long *)
  1859. &anon_vma->root->rb_root.rb_root.rb_node))
  1860. BUG();
  1861. }
  1862. }
  1863. static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  1864. {
  1865. if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
  1866. /*
  1867. * AS_MM_ALL_LOCKS can't change from under us because
  1868. * we hold the mm_all_locks_mutex.
  1869. *
  1870. * Operations on ->flags have to be atomic because
  1871. * even if AS_MM_ALL_LOCKS is stable thanks to the
  1872. * mm_all_locks_mutex, there may be other cpus
  1873. * changing other bitflags in parallel to us.
  1874. */
  1875. if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
  1876. BUG();
  1877. down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
  1878. }
  1879. }
  1880. /*
  1881. * This operation locks against the VM for all pte/vma/mm related
  1882. * operations that could ever happen on a certain mm. This includes
  1883. * vmtruncate, try_to_unmap, and all page faults.
  1884. *
  1885. * The caller must take the mmap_lock in write mode before calling
  1886. * mm_take_all_locks(). The caller isn't allowed to release the
  1887. * mmap_lock until mm_drop_all_locks() returns.
  1888. *
  1889. * mmap_lock in write mode is required in order to block all operations
  1890. * that could modify pagetables and free pages without need of
  1891. * altering the vma layout. It's also needed in write mode to avoid new
  1892. * anon_vmas to be associated with existing vmas.
  1893. *
  1894. * A single task can't take more than one mm_take_all_locks() in a row
  1895. * or it would deadlock.
  1896. *
  1897. * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
  1898. * mapping->flags avoid to take the same lock twice, if more than one
  1899. * vma in this mm is backed by the same anon_vma or address_space.
  1900. *
  1901. * We take locks in following order, accordingly to comment at beginning
  1902. * of mm/rmap.c:
  1903. * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
  1904. * hugetlb mapping);
  1905. * - all vmas marked locked
  1906. * - all i_mmap_rwsem locks;
  1907. * - all anon_vma->rwseml
  1908. *
  1909. * We can take all locks within these types randomly because the VM code
  1910. * doesn't nest them and we protected from parallel mm_take_all_locks() by
  1911. * mm_all_locks_mutex.
  1912. *
  1913. * mm_take_all_locks() and mm_drop_all_locks are expensive operations
  1914. * that may have to take thousand of locks.
  1915. *
  1916. * mm_take_all_locks() can fail if it's interrupted by signals.
  1917. */
  1918. int mm_take_all_locks(struct mm_struct *mm)
  1919. {
  1920. struct vm_area_struct *vma;
  1921. struct anon_vma_chain *avc;
  1922. VMA_ITERATOR(vmi, mm, 0);
  1923. mmap_assert_write_locked(mm);
  1924. mutex_lock(&mm_all_locks_mutex);
  1925. /*
  1926. * vma_start_write() does not have a complement in mm_drop_all_locks()
  1927. * because vma_start_write() is always asymmetrical; it marks a VMA as
  1928. * being written to until mmap_write_unlock() or mmap_write_downgrade()
  1929. * is reached.
  1930. */
  1931. for_each_vma(vmi, vma) {
  1932. if (signal_pending(current))
  1933. goto out_unlock;
  1934. vma_start_write(vma);
  1935. }
  1936. vma_iter_init(&vmi, mm, 0);
  1937. for_each_vma(vmi, vma) {
  1938. if (signal_pending(current))
  1939. goto out_unlock;
  1940. if (vma->vm_file && vma->vm_file->f_mapping &&
  1941. is_vm_hugetlb_page(vma))
  1942. vm_lock_mapping(mm, vma->vm_file->f_mapping);
  1943. }
  1944. vma_iter_init(&vmi, mm, 0);
  1945. for_each_vma(vmi, vma) {
  1946. if (signal_pending(current))
  1947. goto out_unlock;
  1948. if (vma->vm_file && vma->vm_file->f_mapping &&
  1949. !is_vm_hugetlb_page(vma))
  1950. vm_lock_mapping(mm, vma->vm_file->f_mapping);
  1951. }
  1952. vma_iter_init(&vmi, mm, 0);
  1953. for_each_vma(vmi, vma) {
  1954. if (signal_pending(current))
  1955. goto out_unlock;
  1956. if (vma->anon_vma)
  1957. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  1958. vm_lock_anon_vma(mm, avc->anon_vma);
  1959. }
  1960. return 0;
  1961. out_unlock:
  1962. mm_drop_all_locks(mm);
  1963. return -EINTR;
  1964. }
  1965. static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
  1966. {
  1967. if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
  1968. /*
  1969. * The LSB of head.next can't change to 0 from under
  1970. * us because we hold the mm_all_locks_mutex.
  1971. *
  1972. * We must however clear the bitflag before unlocking
  1973. * the vma so the users using the anon_vma->rb_root will
  1974. * never see our bitflag.
  1975. *
  1976. * No need of atomic instructions here, head.next
  1977. * can't change from under us until we release the
  1978. * anon_vma->root->rwsem.
  1979. */
  1980. if (!__test_and_clear_bit(0, (unsigned long *)
  1981. &anon_vma->root->rb_root.rb_root.rb_node))
  1982. BUG();
  1983. anon_vma_unlock_write(anon_vma);
  1984. }
  1985. }
  1986. static void vm_unlock_mapping(struct address_space *mapping)
  1987. {
  1988. if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
  1989. /*
  1990. * AS_MM_ALL_LOCKS can't change to 0 from under us
  1991. * because we hold the mm_all_locks_mutex.
  1992. */
  1993. i_mmap_unlock_write(mapping);
  1994. if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
  1995. &mapping->flags))
  1996. BUG();
  1997. }
  1998. }
  1999. /*
  2000. * The mmap_lock cannot be released by the caller until
  2001. * mm_drop_all_locks() returns.
  2002. */
  2003. void mm_drop_all_locks(struct mm_struct *mm)
  2004. {
  2005. struct vm_area_struct *vma;
  2006. struct anon_vma_chain *avc;
  2007. VMA_ITERATOR(vmi, mm, 0);
  2008. mmap_assert_write_locked(mm);
  2009. BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
  2010. for_each_vma(vmi, vma) {
  2011. if (vma->anon_vma)
  2012. list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
  2013. vm_unlock_anon_vma(avc->anon_vma);
  2014. if (vma->vm_file && vma->vm_file->f_mapping)
  2015. vm_unlock_mapping(vma->vm_file->f_mapping);
  2016. }
  2017. mutex_unlock(&mm_all_locks_mutex);
  2018. }
  2019. /*
  2020. * We account for memory if it's a private writeable mapping,
  2021. * not hugepages and VM_NORESERVE wasn't set.
  2022. */
  2023. static bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
  2024. {
  2025. /*
  2026. * hugetlb has its own accounting separate from the core VM
  2027. * VM_HUGETLB may not be set yet so we cannot check for that flag.
  2028. */
  2029. if (file && is_file_hugepages(file))
  2030. return false;
  2031. return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
  2032. }
  2033. /*
  2034. * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
  2035. * operation.
  2036. * @vms: The vma unmap structure
  2037. * @mas_detach: The maple state with the detached maple tree
  2038. *
  2039. * Reattach any detached vmas, free up the maple tree used to track the vmas.
  2040. * If that's not possible because the ptes are cleared (and vm_ops->closed() may
  2041. * have been called), then a NULL is written over the vmas and the vmas are
  2042. * removed (munmap() completed).
  2043. */
  2044. static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
  2045. struct ma_state *mas_detach)
  2046. {
  2047. struct ma_state *mas = &vms->vmi->mas;
  2048. if (!vms->nr_pages)
  2049. return;
  2050. if (vms->clear_ptes)
  2051. return reattach_vmas(mas_detach);
  2052. /*
  2053. * Aborting cannot just call the vm_ops open() because they are often
  2054. * not symmetrical and state data has been lost. Resort to the old
  2055. * failure method of leaving a gap where the MAP_FIXED mapping failed.
  2056. */
  2057. mas_set_range(mas, vms->start, vms->end - 1);
  2058. mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
  2059. /* Clean up the insertion of the unfortunate gap */
  2060. vms_complete_munmap_vmas(vms, mas_detach);
  2061. }
  2062. static void update_ksm_flags(struct mmap_state *map)
  2063. {
  2064. map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags);
  2065. }
  2066. static void set_desc_from_map(struct vm_area_desc *desc,
  2067. const struct mmap_state *map)
  2068. {
  2069. desc->start = map->addr;
  2070. desc->end = map->end;
  2071. desc->pgoff = map->pgoff;
  2072. desc->vm_file = map->file;
  2073. desc->vma_flags = map->vma_flags;
  2074. desc->page_prot = map->page_prot;
  2075. }
  2076. /*
  2077. * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be
  2078. * unmapped once the map operation is completed, check limits, account mapping
  2079. * and clean up any pre-existing VMAs.
  2080. *
  2081. * As a result it sets up the @map and @desc objects.
  2082. *
  2083. * @map: Mapping state.
  2084. * @desc: VMA descriptor
  2085. * @uf: Userfaultfd context list.
  2086. *
  2087. * Returns: 0 on success, error code otherwise.
  2088. */
  2089. static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc,
  2090. struct list_head *uf)
  2091. {
  2092. int error;
  2093. struct vma_iterator *vmi = map->vmi;
  2094. struct vma_munmap_struct *vms = &map->vms;
  2095. /* Find the first overlapping VMA and initialise unmap state. */
  2096. vms->vma = vma_find(vmi, map->end);
  2097. init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf,
  2098. /* unlock = */ false);
  2099. /* OK, we have overlapping VMAs - prepare to unmap them. */
  2100. if (vms->vma) {
  2101. mt_init_flags(&map->mt_detach,
  2102. vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
  2103. mt_on_stack(map->mt_detach);
  2104. mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0);
  2105. /* Prepare to unmap any existing mapping in the area */
  2106. error = vms_gather_munmap_vmas(vms, &map->mas_detach);
  2107. if (error) {
  2108. /* On error VMAs will already have been reattached. */
  2109. vms->nr_pages = 0;
  2110. return error;
  2111. }
  2112. map->next = vms->next;
  2113. map->prev = vms->prev;
  2114. } else {
  2115. map->next = vma_iter_next_rewind(vmi, &map->prev);
  2116. }
  2117. /* Check against address space limit. */
  2118. if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages))
  2119. return -ENOMEM;
  2120. /* Private writable mapping: check memory availability. */
  2121. if (accountable_mapping(map->file, map->vm_flags)) {
  2122. map->charged = map->pglen;
  2123. map->charged -= vms->nr_accounted;
  2124. if (map->charged) {
  2125. error = security_vm_enough_memory_mm(map->mm, map->charged);
  2126. if (error)
  2127. return error;
  2128. }
  2129. vms->nr_accounted = 0;
  2130. map->vm_flags |= VM_ACCOUNT;
  2131. }
  2132. /*
  2133. * Clear PTEs while the vma is still in the tree so that rmap
  2134. * cannot race with the freeing later in the truncate scenario.
  2135. * This is also needed for mmap_file(), which is why vm_ops
  2136. * close function is called.
  2137. */
  2138. vms_clean_up_area(vms, &map->mas_detach);
  2139. set_desc_from_map(desc, map);
  2140. return 0;
  2141. }
  2142. static int __mmap_new_file_vma(struct mmap_state *map,
  2143. struct vm_area_struct *vma)
  2144. {
  2145. struct vma_iterator *vmi = map->vmi;
  2146. int error;
  2147. vma->vm_file = map->file;
  2148. if (!map->file_doesnt_need_get)
  2149. get_file(map->file);
  2150. if (!map->file->f_op->mmap)
  2151. return 0;
  2152. error = mmap_file(vma->vm_file, vma);
  2153. if (error) {
  2154. UNMAP_STATE(unmap, vmi, vma, vma->vm_start, vma->vm_end,
  2155. map->prev, map->next);
  2156. fput(vma->vm_file);
  2157. vma->vm_file = NULL;
  2158. vma_iter_set(vmi, vma->vm_end);
  2159. /* Undo any partial mapping done by a device driver. */
  2160. unmap_region(&unmap);
  2161. return error;
  2162. }
  2163. /* Drivers cannot alter the address of the VMA. */
  2164. WARN_ON_ONCE(map->addr != vma->vm_start);
  2165. /*
  2166. * Drivers should not permit writability when previously it was
  2167. * disallowed.
  2168. */
  2169. VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags &&
  2170. !(map->vm_flags & VM_MAYWRITE) &&
  2171. (vma->vm_flags & VM_MAYWRITE));
  2172. map->file = vma->vm_file;
  2173. map->vm_flags = vma->vm_flags;
  2174. return 0;
  2175. }
  2176. /*
  2177. * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not
  2178. * possible.
  2179. *
  2180. * @map: Mapping state.
  2181. * @vmap: Output pointer for the new VMA.
  2182. *
  2183. * Returns: Zero on success, or an error.
  2184. */
  2185. static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
  2186. {
  2187. struct vma_iterator *vmi = map->vmi;
  2188. int error = 0;
  2189. struct vm_area_struct *vma;
  2190. /*
  2191. * Determine the object being mapped and call the appropriate
  2192. * specific mapper. the address has already been validated, but
  2193. * not unmapped, but the maps are removed from the list.
  2194. */
  2195. vma = vm_area_alloc(map->mm);
  2196. if (!vma)
  2197. return -ENOMEM;
  2198. vma_iter_config(vmi, map->addr, map->end);
  2199. vma_set_range(vma, map->addr, map->end, map->pgoff);
  2200. vm_flags_init(vma, map->vm_flags);
  2201. vma->vm_page_prot = map->page_prot;
  2202. if (vma_iter_prealloc(vmi, vma)) {
  2203. error = -ENOMEM;
  2204. goto free_vma;
  2205. }
  2206. if (map->file)
  2207. error = __mmap_new_file_vma(map, vma);
  2208. else if (map->vm_flags & VM_SHARED)
  2209. error = shmem_zero_setup(vma);
  2210. else
  2211. vma_set_anonymous(vma);
  2212. if (error)
  2213. goto free_iter_vma;
  2214. if (!map->check_ksm_early) {
  2215. update_ksm_flags(map);
  2216. vm_flags_init(vma, map->vm_flags);
  2217. }
  2218. #ifdef CONFIG_SPARC64
  2219. /* TODO: Fix SPARC ADI! */
  2220. WARN_ON_ONCE(!arch_validate_flags(map->vm_flags));
  2221. #endif
  2222. /* Lock the VMA since it is modified after insertion into VMA tree */
  2223. vma_start_write(vma);
  2224. vma_iter_store_new(vmi, vma);
  2225. map->mm->map_count++;
  2226. vma_link_file(vma, map->hold_file_rmap_lock);
  2227. /*
  2228. * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
  2229. * call covers the non-merge case.
  2230. */
  2231. if (!vma_is_anonymous(vma))
  2232. khugepaged_enter_vma(vma, map->vm_flags);
  2233. *vmap = vma;
  2234. return 0;
  2235. free_iter_vma:
  2236. vma_iter_free(vmi);
  2237. free_vma:
  2238. vm_area_free(vma);
  2239. return error;
  2240. }
  2241. /*
  2242. * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping
  2243. * statistics, handle locking and finalise the VMA.
  2244. *
  2245. * @map: Mapping state.
  2246. * @vma: Merged or newly allocated VMA for the mmap()'d region.
  2247. */
  2248. static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
  2249. {
  2250. struct mm_struct *mm = map->mm;
  2251. vm_flags_t vm_flags = vma->vm_flags;
  2252. perf_event_mmap(vma);
  2253. /* Unmap any existing mapping in the area. */
  2254. vms_complete_munmap_vmas(&map->vms, &map->mas_detach);
  2255. vm_stat_account(mm, vma->vm_flags, map->pglen);
  2256. if (vm_flags & VM_LOCKED) {
  2257. if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
  2258. is_vm_hugetlb_page(vma) ||
  2259. vma == get_gate_vma(mm))
  2260. vm_flags_clear(vma, VM_LOCKED_MASK);
  2261. else
  2262. mm->locked_vm += map->pglen;
  2263. }
  2264. if (vma->vm_file)
  2265. uprobe_mmap(vma);
  2266. /*
  2267. * New (or expanded) vma always get soft dirty status.
  2268. * Otherwise user-space soft-dirty page tracker won't
  2269. * be able to distinguish situation when vma area unmapped,
  2270. * then new mapped in-place (which must be aimed as
  2271. * a completely new data area).
  2272. */
  2273. if (pgtable_supports_soft_dirty())
  2274. vm_flags_set(vma, VM_SOFTDIRTY);
  2275. vma_set_page_prot(vma);
  2276. }
  2277. static void call_action_prepare(struct mmap_state *map,
  2278. struct vm_area_desc *desc)
  2279. {
  2280. struct mmap_action *action = &desc->action;
  2281. mmap_action_prepare(action, desc);
  2282. if (action->hide_from_rmap_until_complete)
  2283. map->hold_file_rmap_lock = true;
  2284. }
  2285. /*
  2286. * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
  2287. * specifies it.
  2288. *
  2289. * This is called prior to any merge attempt, and updates whitelisted fields
  2290. * that are permitted to be updated by the caller.
  2291. *
  2292. * All but user-defined fields will be pre-populated with original values.
  2293. *
  2294. * Returns 0 on success, or an error code otherwise.
  2295. */
  2296. static int call_mmap_prepare(struct mmap_state *map,
  2297. struct vm_area_desc *desc)
  2298. {
  2299. int err;
  2300. /* Invoke the hook. */
  2301. err = vfs_mmap_prepare(map->file, desc);
  2302. if (err)
  2303. return err;
  2304. call_action_prepare(map, desc);
  2305. /* Update fields permitted to be changed. */
  2306. map->pgoff = desc->pgoff;
  2307. if (desc->vm_file != map->file) {
  2308. map->file_doesnt_need_get = true;
  2309. map->file = desc->vm_file;
  2310. }
  2311. map->vma_flags = desc->vma_flags;
  2312. map->page_prot = desc->page_prot;
  2313. /* User-defined fields. */
  2314. map->vm_ops = desc->vm_ops;
  2315. map->vm_private_data = desc->private_data;
  2316. return 0;
  2317. }
  2318. static void set_vma_user_defined_fields(struct vm_area_struct *vma,
  2319. struct mmap_state *map)
  2320. {
  2321. if (map->vm_ops)
  2322. vma->vm_ops = map->vm_ops;
  2323. vma->vm_private_data = map->vm_private_data;
  2324. }
  2325. /*
  2326. * Are we guaranteed no driver can change state such as to preclude KSM merging?
  2327. * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
  2328. */
  2329. static bool can_set_ksm_flags_early(struct mmap_state *map)
  2330. {
  2331. struct file *file = map->file;
  2332. /* Anonymous mappings have no driver which can change them. */
  2333. if (!file)
  2334. return true;
  2335. /*
  2336. * If .mmap_prepare() is specified, then the driver will have already
  2337. * manipulated state prior to updating KSM flags. So no need to worry
  2338. * about mmap callbacks modifying VMA flags after the KSM flag has been
  2339. * updated here, which could otherwise affect KSM eligibility.
  2340. */
  2341. if (file->f_op->mmap_prepare)
  2342. return true;
  2343. /* shmem is safe. */
  2344. if (shmem_file(file))
  2345. return true;
  2346. /* Any other .mmap callback is not safe. */
  2347. return false;
  2348. }
  2349. static int call_action_complete(struct mmap_state *map,
  2350. struct vm_area_desc *desc,
  2351. struct vm_area_struct *vma)
  2352. {
  2353. struct mmap_action *action = &desc->action;
  2354. int ret;
  2355. ret = mmap_action_complete(action, vma);
  2356. /* If we held the file rmap we need to release it. */
  2357. if (map->hold_file_rmap_lock) {
  2358. struct file *file = vma->vm_file;
  2359. i_mmap_unlock_write(file->f_mapping);
  2360. }
  2361. return ret;
  2362. }
  2363. static unsigned long __mmap_region(struct file *file, unsigned long addr,
  2364. unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
  2365. struct list_head *uf)
  2366. {
  2367. struct mm_struct *mm = current->mm;
  2368. struct vm_area_struct *vma = NULL;
  2369. bool have_mmap_prepare = file && file->f_op->mmap_prepare;
  2370. VMA_ITERATOR(vmi, mm, addr);
  2371. MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
  2372. struct vm_area_desc desc = {
  2373. .mm = mm,
  2374. .file = file,
  2375. .action = {
  2376. .type = MMAP_NOTHING, /* Default to no further action. */
  2377. },
  2378. };
  2379. bool allocated_new = false;
  2380. int error;
  2381. map.check_ksm_early = can_set_ksm_flags_early(&map);
  2382. error = __mmap_setup(&map, &desc, uf);
  2383. if (!error && have_mmap_prepare)
  2384. error = call_mmap_prepare(&map, &desc);
  2385. if (error)
  2386. goto abort_munmap;
  2387. if (map.check_ksm_early)
  2388. update_ksm_flags(&map);
  2389. /* Attempt to merge with adjacent VMAs... */
  2390. if (map.prev || map.next) {
  2391. VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
  2392. vma = vma_merge_new_range(&vmg);
  2393. }
  2394. /* ...but if we can't, allocate a new VMA. */
  2395. if (!vma) {
  2396. error = __mmap_new_vma(&map, &vma);
  2397. if (error)
  2398. goto unacct_error;
  2399. allocated_new = true;
  2400. }
  2401. if (have_mmap_prepare)
  2402. set_vma_user_defined_fields(vma, &map);
  2403. __mmap_complete(&map, vma);
  2404. if (have_mmap_prepare && allocated_new) {
  2405. error = call_action_complete(&map, &desc, vma);
  2406. if (error)
  2407. return error;
  2408. }
  2409. return addr;
  2410. /* Accounting was done by __mmap_setup(). */
  2411. unacct_error:
  2412. if (map.charged)
  2413. vm_unacct_memory(map.charged);
  2414. abort_munmap:
  2415. /*
  2416. * This indicates that .mmap_prepare has set a new file, differing from
  2417. * desc->vm_file. But since we're aborting the operation, only the
  2418. * original file will be cleaned up. Ensure we clean up both.
  2419. */
  2420. if (map.file_doesnt_need_get)
  2421. fput(map.file);
  2422. vms_abort_munmap_vmas(&map.vms, &map.mas_detach);
  2423. return error;
  2424. }
  2425. /**
  2426. * mmap_region() - Actually perform the userland mapping of a VMA into
  2427. * current->mm with known, aligned and overflow-checked @addr and @len, and
  2428. * correctly determined VMA flags @vm_flags and page offset @pgoff.
  2429. *
  2430. * This is an internal memory management function, and should not be used
  2431. * directly.
  2432. *
  2433. * The caller must write-lock current->mm->mmap_lock.
  2434. *
  2435. * @file: If a file-backed mapping, a pointer to the struct file describing the
  2436. * file to be mapped, otherwise NULL.
  2437. * @addr: The page-aligned address at which to perform the mapping.
  2438. * @len: The page-aligned, non-zero, length of the mapping.
  2439. * @vm_flags: The VMA flags which should be applied to the mapping.
  2440. * @pgoff: If @file is specified, the page offset into the file, if not then
  2441. * the virtual page offset in memory of the anonymous mapping.
  2442. * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap
  2443. * events.
  2444. *
  2445. * Returns: Either an error, or the address at which the requested mapping has
  2446. * been performed.
  2447. */
  2448. unsigned long mmap_region(struct file *file, unsigned long addr,
  2449. unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
  2450. struct list_head *uf)
  2451. {
  2452. unsigned long ret;
  2453. bool writable_file_mapping = false;
  2454. mmap_assert_write_locked(current->mm);
  2455. /* Check to see if MDWE is applicable. */
  2456. if (map_deny_write_exec(vm_flags, vm_flags))
  2457. return -EACCES;
  2458. /* Allow architectures to sanity-check the vm_flags. */
  2459. if (!arch_validate_flags(vm_flags))
  2460. return -EINVAL;
  2461. /* Map writable and ensure this isn't a sealed memfd. */
  2462. if (file && is_shared_maywrite_vm_flags(vm_flags)) {
  2463. int error = mapping_map_writable(file->f_mapping);
  2464. if (error)
  2465. return error;
  2466. writable_file_mapping = true;
  2467. }
  2468. ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
  2469. /* Clear our write mapping regardless of error. */
  2470. if (writable_file_mapping)
  2471. mapping_unmap_writable(file->f_mapping);
  2472. validate_mm(current->mm);
  2473. return ret;
  2474. }
  2475. /*
  2476. * do_brk_flags() - Increase the brk vma if the flags match.
  2477. * @vmi: The vma iterator
  2478. * @addr: The start address
  2479. * @len: The length of the increase
  2480. * @vma: The vma,
  2481. * @vm_flags: The VMA Flags
  2482. *
  2483. * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
  2484. * do not match then create a new anonymous VMA. Eventually we may be able to
  2485. * do some brk-specific accounting here.
  2486. */
  2487. int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
  2488. unsigned long addr, unsigned long len, vm_flags_t vm_flags)
  2489. {
  2490. struct mm_struct *mm = current->mm;
  2491. /*
  2492. * Check against address space limits by the changed size
  2493. * Note: This happens *after* clearing old mappings in some code paths.
  2494. */
  2495. vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
  2496. vm_flags = ksm_vma_flags(mm, NULL, vm_flags);
  2497. if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
  2498. return -ENOMEM;
  2499. if (mm->map_count > sysctl_max_map_count)
  2500. return -ENOMEM;
  2501. if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
  2502. return -ENOMEM;
  2503. /*
  2504. * Expand the existing vma if possible; Note that singular lists do not
  2505. * occur after forking, so the expand will only happen on new VMAs.
  2506. */
  2507. if (vma && vma->vm_end == addr) {
  2508. VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr));
  2509. vmg.prev = vma;
  2510. /* vmi is positioned at prev, which this mode expects. */
  2511. vmg.just_expand = true;
  2512. if (vma_merge_new_range(&vmg))
  2513. goto out;
  2514. else if (vmg_nomem(&vmg))
  2515. goto unacct_fail;
  2516. }
  2517. if (vma)
  2518. vma_iter_next_range(vmi);
  2519. /* create a vma struct for an anonymous mapping */
  2520. vma = vm_area_alloc(mm);
  2521. if (!vma)
  2522. goto unacct_fail;
  2523. vma_set_anonymous(vma);
  2524. vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
  2525. vm_flags_init(vma, vm_flags);
  2526. vma->vm_page_prot = vm_get_page_prot(vm_flags);
  2527. vma_start_write(vma);
  2528. if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
  2529. goto mas_store_fail;
  2530. mm->map_count++;
  2531. validate_mm(mm);
  2532. out:
  2533. perf_event_mmap(vma);
  2534. mm->total_vm += len >> PAGE_SHIFT;
  2535. mm->data_vm += len >> PAGE_SHIFT;
  2536. if (vm_flags & VM_LOCKED)
  2537. mm->locked_vm += (len >> PAGE_SHIFT);
  2538. if (pgtable_supports_soft_dirty())
  2539. vm_flags_set(vma, VM_SOFTDIRTY);
  2540. return 0;
  2541. mas_store_fail:
  2542. vm_area_free(vma);
  2543. unacct_fail:
  2544. vm_unacct_memory(len >> PAGE_SHIFT);
  2545. return -ENOMEM;
  2546. }
  2547. /**
  2548. * unmapped_area() - Find an area between the low_limit and the high_limit with
  2549. * the correct alignment and offset, all from @info. Note: current->mm is used
  2550. * for the search.
  2551. *
  2552. * @info: The unmapped area information including the range [low_limit -
  2553. * high_limit), the alignment offset and mask.
  2554. *
  2555. * Return: A memory address or -ENOMEM.
  2556. */
  2557. unsigned long unmapped_area(struct vm_unmapped_area_info *info)
  2558. {
  2559. unsigned long length, gap;
  2560. unsigned long low_limit, high_limit;
  2561. struct vm_area_struct *tmp;
  2562. VMA_ITERATOR(vmi, current->mm, 0);
  2563. /* Adjust search length to account for worst case alignment overhead */
  2564. length = info->length + info->align_mask + info->start_gap;
  2565. if (length < info->length)
  2566. return -ENOMEM;
  2567. low_limit = info->low_limit;
  2568. if (low_limit < mmap_min_addr)
  2569. low_limit = mmap_min_addr;
  2570. high_limit = info->high_limit;
  2571. retry:
  2572. if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
  2573. return -ENOMEM;
  2574. /*
  2575. * Adjust for the gap first so it doesn't interfere with the later
  2576. * alignment. The first step is the minimum needed to fulfill the start
  2577. * gap, the next step is the minimum to align that. It is the minimum
  2578. * needed to fulfill both.
  2579. */
  2580. gap = vma_iter_addr(&vmi) + info->start_gap;
  2581. gap += (info->align_offset - gap) & info->align_mask;
  2582. tmp = vma_next(&vmi);
  2583. if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
  2584. if (vm_start_gap(tmp) < gap + length - 1) {
  2585. low_limit = tmp->vm_end;
  2586. vma_iter_reset(&vmi);
  2587. goto retry;
  2588. }
  2589. } else {
  2590. tmp = vma_prev(&vmi);
  2591. if (tmp && vm_end_gap(tmp) > gap) {
  2592. low_limit = vm_end_gap(tmp);
  2593. vma_iter_reset(&vmi);
  2594. goto retry;
  2595. }
  2596. }
  2597. return gap;
  2598. }
  2599. /**
  2600. * unmapped_area_topdown() - Find an area between the low_limit and the
  2601. * high_limit with the correct alignment and offset at the highest available
  2602. * address, all from @info. Note: current->mm is used for the search.
  2603. *
  2604. * @info: The unmapped area information including the range [low_limit -
  2605. * high_limit), the alignment offset and mask.
  2606. *
  2607. * Return: A memory address or -ENOMEM.
  2608. */
  2609. unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
  2610. {
  2611. unsigned long length, gap, gap_end;
  2612. unsigned long low_limit, high_limit;
  2613. struct vm_area_struct *tmp;
  2614. VMA_ITERATOR(vmi, current->mm, 0);
  2615. /* Adjust search length to account for worst case alignment overhead */
  2616. length = info->length + info->align_mask + info->start_gap;
  2617. if (length < info->length)
  2618. return -ENOMEM;
  2619. low_limit = info->low_limit;
  2620. if (low_limit < mmap_min_addr)
  2621. low_limit = mmap_min_addr;
  2622. high_limit = info->high_limit;
  2623. retry:
  2624. if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
  2625. return -ENOMEM;
  2626. gap = vma_iter_end(&vmi) - info->length;
  2627. gap -= (gap - info->align_offset) & info->align_mask;
  2628. gap_end = vma_iter_end(&vmi);
  2629. tmp = vma_next(&vmi);
  2630. if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
  2631. if (vm_start_gap(tmp) < gap_end) {
  2632. high_limit = vm_start_gap(tmp);
  2633. vma_iter_reset(&vmi);
  2634. goto retry;
  2635. }
  2636. } else {
  2637. tmp = vma_prev(&vmi);
  2638. if (tmp && vm_end_gap(tmp) > gap) {
  2639. high_limit = tmp->vm_start;
  2640. vma_iter_reset(&vmi);
  2641. goto retry;
  2642. }
  2643. }
  2644. return gap;
  2645. }
  2646. /*
  2647. * Verify that the stack growth is acceptable and
  2648. * update accounting. This is shared with both the
  2649. * grow-up and grow-down cases.
  2650. */
  2651. static int acct_stack_growth(struct vm_area_struct *vma,
  2652. unsigned long size, unsigned long grow)
  2653. {
  2654. struct mm_struct *mm = vma->vm_mm;
  2655. unsigned long new_start;
  2656. /* address space limit tests */
  2657. if (!may_expand_vm(mm, vma->vm_flags, grow))
  2658. return -ENOMEM;
  2659. /* Stack limit test */
  2660. if (size > rlimit(RLIMIT_STACK))
  2661. return -ENOMEM;
  2662. /* mlock limit tests */
  2663. if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, grow << PAGE_SHIFT))
  2664. return -ENOMEM;
  2665. /* Check to ensure the stack will not grow into a hugetlb-only region */
  2666. new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
  2667. vma->vm_end - size;
  2668. if (is_hugepage_only_range(vma->vm_mm, new_start, size))
  2669. return -EFAULT;
  2670. /*
  2671. * Overcommit.. This must be the final test, as it will
  2672. * update security statistics.
  2673. */
  2674. if (security_vm_enough_memory_mm(mm, grow))
  2675. return -ENOMEM;
  2676. return 0;
  2677. }
  2678. #if defined(CONFIG_STACK_GROWSUP)
  2679. /*
  2680. * PA-RISC uses this for its stack.
  2681. * vma is the last one with address > vma->vm_end. Have to extend vma.
  2682. */
  2683. int expand_upwards(struct vm_area_struct *vma, unsigned long address)
  2684. {
  2685. struct mm_struct *mm = vma->vm_mm;
  2686. struct vm_area_struct *next;
  2687. unsigned long gap_addr;
  2688. int error = 0;
  2689. VMA_ITERATOR(vmi, mm, vma->vm_start);
  2690. if (!(vma->vm_flags & VM_GROWSUP))
  2691. return -EFAULT;
  2692. mmap_assert_write_locked(mm);
  2693. /* Guard against exceeding limits of the address space. */
  2694. address &= PAGE_MASK;
  2695. if (address >= (TASK_SIZE & PAGE_MASK))
  2696. return -ENOMEM;
  2697. address += PAGE_SIZE;
  2698. /* Enforce stack_guard_gap */
  2699. gap_addr = address + stack_guard_gap;
  2700. /* Guard against overflow */
  2701. if (gap_addr < address || gap_addr > TASK_SIZE)
  2702. gap_addr = TASK_SIZE;
  2703. next = find_vma_intersection(mm, vma->vm_end, gap_addr);
  2704. if (next && vma_is_accessible(next)) {
  2705. if (!(next->vm_flags & VM_GROWSUP))
  2706. return -ENOMEM;
  2707. /* Check that both stack segments have the same anon_vma? */
  2708. }
  2709. if (next)
  2710. vma_iter_prev_range_limit(&vmi, address);
  2711. vma_iter_config(&vmi, vma->vm_start, address);
  2712. if (vma_iter_prealloc(&vmi, vma))
  2713. return -ENOMEM;
  2714. /* We must make sure the anon_vma is allocated. */
  2715. if (unlikely(anon_vma_prepare(vma))) {
  2716. vma_iter_free(&vmi);
  2717. return -ENOMEM;
  2718. }
  2719. /* Lock the VMA before expanding to prevent concurrent page faults */
  2720. vma_start_write(vma);
  2721. /* We update the anon VMA tree. */
  2722. anon_vma_lock_write(vma->anon_vma);
  2723. /* Somebody else might have raced and expanded it already */
  2724. if (address > vma->vm_end) {
  2725. unsigned long size, grow;
  2726. size = address - vma->vm_start;
  2727. grow = (address - vma->vm_end) >> PAGE_SHIFT;
  2728. error = -ENOMEM;
  2729. if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
  2730. error = acct_stack_growth(vma, size, grow);
  2731. if (!error) {
  2732. if (vma->vm_flags & VM_LOCKED)
  2733. mm->locked_vm += grow;
  2734. vm_stat_account(mm, vma->vm_flags, grow);
  2735. anon_vma_interval_tree_pre_update_vma(vma);
  2736. vma->vm_end = address;
  2737. /* Overwrite old entry in mtree. */
  2738. vma_iter_store_overwrite(&vmi, vma);
  2739. anon_vma_interval_tree_post_update_vma(vma);
  2740. perf_event_mmap(vma);
  2741. }
  2742. }
  2743. }
  2744. anon_vma_unlock_write(vma->anon_vma);
  2745. vma_iter_free(&vmi);
  2746. validate_mm(mm);
  2747. return error;
  2748. }
  2749. #endif /* CONFIG_STACK_GROWSUP */
  2750. /*
  2751. * vma is the first one with address < vma->vm_start. Have to extend vma.
  2752. * mmap_lock held for writing.
  2753. */
  2754. int expand_downwards(struct vm_area_struct *vma, unsigned long address)
  2755. {
  2756. struct mm_struct *mm = vma->vm_mm;
  2757. struct vm_area_struct *prev;
  2758. int error = 0;
  2759. VMA_ITERATOR(vmi, mm, vma->vm_start);
  2760. if (!(vma->vm_flags & VM_GROWSDOWN))
  2761. return -EFAULT;
  2762. mmap_assert_write_locked(mm);
  2763. address &= PAGE_MASK;
  2764. if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
  2765. return -EPERM;
  2766. /* Enforce stack_guard_gap */
  2767. prev = vma_prev(&vmi);
  2768. /* Check that both stack segments have the same anon_vma? */
  2769. if (prev) {
  2770. if (!(prev->vm_flags & VM_GROWSDOWN) &&
  2771. vma_is_accessible(prev) &&
  2772. (address - prev->vm_end < stack_guard_gap))
  2773. return -ENOMEM;
  2774. }
  2775. if (prev)
  2776. vma_iter_next_range_limit(&vmi, vma->vm_start);
  2777. vma_iter_config(&vmi, address, vma->vm_end);
  2778. if (vma_iter_prealloc(&vmi, vma))
  2779. return -ENOMEM;
  2780. /* We must make sure the anon_vma is allocated. */
  2781. if (unlikely(anon_vma_prepare(vma))) {
  2782. vma_iter_free(&vmi);
  2783. return -ENOMEM;
  2784. }
  2785. /* Lock the VMA before expanding to prevent concurrent page faults */
  2786. vma_start_write(vma);
  2787. /* We update the anon VMA tree. */
  2788. anon_vma_lock_write(vma->anon_vma);
  2789. /* Somebody else might have raced and expanded it already */
  2790. if (address < vma->vm_start) {
  2791. unsigned long size, grow;
  2792. size = vma->vm_end - address;
  2793. grow = (vma->vm_start - address) >> PAGE_SHIFT;
  2794. error = -ENOMEM;
  2795. if (grow <= vma->vm_pgoff) {
  2796. error = acct_stack_growth(vma, size, grow);
  2797. if (!error) {
  2798. if (vma->vm_flags & VM_LOCKED)
  2799. mm->locked_vm += grow;
  2800. vm_stat_account(mm, vma->vm_flags, grow);
  2801. anon_vma_interval_tree_pre_update_vma(vma);
  2802. vma->vm_start = address;
  2803. vma->vm_pgoff -= grow;
  2804. /* Overwrite old entry in mtree. */
  2805. vma_iter_store_overwrite(&vmi, vma);
  2806. anon_vma_interval_tree_post_update_vma(vma);
  2807. perf_event_mmap(vma);
  2808. }
  2809. }
  2810. }
  2811. anon_vma_unlock_write(vma->anon_vma);
  2812. vma_iter_free(&vmi);
  2813. validate_mm(mm);
  2814. return error;
  2815. }
  2816. int __vm_munmap(unsigned long start, size_t len, bool unlock)
  2817. {
  2818. int ret;
  2819. struct mm_struct *mm = current->mm;
  2820. LIST_HEAD(uf);
  2821. VMA_ITERATOR(vmi, mm, start);
  2822. if (mmap_write_lock_killable(mm))
  2823. return -EINTR;
  2824. ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
  2825. if (ret || !unlock)
  2826. mmap_write_unlock(mm);
  2827. userfaultfd_unmap_complete(mm, &uf);
  2828. return ret;
  2829. }
  2830. /* Insert vm structure into process list sorted by address
  2831. * and into the inode's i_mmap tree. If vm_file is non-NULL
  2832. * then i_mmap_rwsem is taken here.
  2833. */
  2834. int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
  2835. {
  2836. unsigned long charged = vma_pages(vma);
  2837. if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
  2838. return -ENOMEM;
  2839. if ((vma->vm_flags & VM_ACCOUNT) &&
  2840. security_vm_enough_memory_mm(mm, charged))
  2841. return -ENOMEM;
  2842. /*
  2843. * The vm_pgoff of a purely anonymous vma should be irrelevant
  2844. * until its first write fault, when page's anon_vma and index
  2845. * are set. But now set the vm_pgoff it will almost certainly
  2846. * end up with (unless mremap moves it elsewhere before that
  2847. * first wfault), so /proc/pid/maps tells a consistent story.
  2848. *
  2849. * By setting it to reflect the virtual start address of the
  2850. * vma, merges and splits can happen in a seamless way, just
  2851. * using the existing file pgoff checks and manipulations.
  2852. * Similarly in do_mmap and in do_brk_flags.
  2853. */
  2854. if (vma_is_anonymous(vma)) {
  2855. BUG_ON(vma->anon_vma);
  2856. vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
  2857. }
  2858. if (vma_link(mm, vma)) {
  2859. if (vma->vm_flags & VM_ACCOUNT)
  2860. vm_unacct_memory(charged);
  2861. return -ENOMEM;
  2862. }
  2863. return 0;
  2864. }