migrate.c 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Memory Migration functionality - linux/mm/migrate.c
  4. *
  5. * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
  6. *
  7. * Page migration was first developed in the context of the memory hotplug
  8. * project. The main authors of the migration code are:
  9. *
  10. * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  11. * Hirokazu Takahashi <taka@valinux.co.jp>
  12. * Dave Hansen <haveblue@us.ibm.com>
  13. * Christoph Lameter
  14. */
  15. #include <linux/migrate.h>
  16. #include <linux/export.h>
  17. #include <linux/swap.h>
  18. #include <linux/leafops.h>
  19. #include <linux/pagemap.h>
  20. #include <linux/buffer_head.h>
  21. #include <linux/mm_inline.h>
  22. #include <linux/ksm.h>
  23. #include <linux/rmap.h>
  24. #include <linux/topology.h>
  25. #include <linux/cpu.h>
  26. #include <linux/cpuset.h>
  27. #include <linux/writeback.h>
  28. #include <linux/mempolicy.h>
  29. #include <linux/vmalloc.h>
  30. #include <linux/security.h>
  31. #include <linux/backing-dev.h>
  32. #include <linux/compaction.h>
  33. #include <linux/syscalls.h>
  34. #include <linux/compat.h>
  35. #include <linux/hugetlb.h>
  36. #include <linux/gfp.h>
  37. #include <linux/page_idle.h>
  38. #include <linux/page_owner.h>
  39. #include <linux/sched/mm.h>
  40. #include <linux/ptrace.h>
  41. #include <linux/memory.h>
  42. #include <linux/sched/sysctl.h>
  43. #include <linux/memory-tiers.h>
  44. #include <linux/pagewalk.h>
  45. #include <asm/tlbflush.h>
  46. #include <trace/events/migrate.h>
  47. #include "internal.h"
  48. #include "swap.h"
  49. static const struct movable_operations *offline_movable_ops;
  50. static const struct movable_operations *zsmalloc_movable_ops;
  51. int set_movable_ops(const struct movable_operations *ops, enum pagetype type)
  52. {
  53. /*
  54. * We only allow for selected types and don't handle concurrent
  55. * registration attempts yet.
  56. */
  57. switch (type) {
  58. case PGTY_offline:
  59. if (offline_movable_ops && ops)
  60. return -EBUSY;
  61. offline_movable_ops = ops;
  62. break;
  63. case PGTY_zsmalloc:
  64. if (zsmalloc_movable_ops && ops)
  65. return -EBUSY;
  66. zsmalloc_movable_ops = ops;
  67. break;
  68. default:
  69. return -EINVAL;
  70. }
  71. return 0;
  72. }
  73. EXPORT_SYMBOL_GPL(set_movable_ops);
  74. static const struct movable_operations *page_movable_ops(struct page *page)
  75. {
  76. VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
  77. /*
  78. * If we enable page migration for a page of a certain type by marking
  79. * it as movable, the page type must be sticky until the page gets freed
  80. * back to the buddy.
  81. */
  82. if (PageOffline(page))
  83. /* Only balloon page migration sets PageOffline pages movable. */
  84. return offline_movable_ops;
  85. if (PageZsmalloc(page))
  86. return zsmalloc_movable_ops;
  87. return NULL;
  88. }
  89. /**
  90. * isolate_movable_ops_page - isolate a movable_ops page for migration
  91. * @page: The page.
  92. * @mode: The isolation mode.
  93. *
  94. * Try to isolate a movable_ops page for migration. Will fail if the page is
  95. * not a movable_ops page, if the page is already isolated for migration
  96. * or if the page was just was released by its owner.
  97. *
  98. * Once isolated, the page cannot get freed until it is either putback
  99. * or migrated.
  100. *
  101. * Returns true if isolation succeeded, otherwise false.
  102. */
  103. bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
  104. {
  105. /*
  106. * TODO: these pages will not be folios in the future. All
  107. * folio dependencies will have to be removed.
  108. */
  109. struct folio *folio = folio_get_nontail_page(page);
  110. const struct movable_operations *mops;
  111. /*
  112. * Avoid burning cycles with pages that are yet under __free_pages(),
  113. * or just got freed under us.
  114. *
  115. * In case we 'win' a race for a movable page being freed under us and
  116. * raise its refcount preventing __free_pages() from doing its job
  117. * the put_page() at the end of this block will take care of
  118. * release this page, thus avoiding a nasty leakage.
  119. */
  120. if (!folio)
  121. goto out;
  122. /*
  123. * Check for movable_ops pages before taking the page lock because
  124. * we use non-atomic bitops on newly allocated page flags so
  125. * unconditionally grabbing the lock ruins page's owner side.
  126. *
  127. * Note that once a page has movable_ops, it will stay that way
  128. * until the page was freed.
  129. */
  130. if (unlikely(!page_has_movable_ops(page)))
  131. goto out_putfolio;
  132. /*
  133. * As movable pages are not isolated from LRU lists, concurrent
  134. * compaction threads can race against page migration functions
  135. * as well as race against the releasing a page.
  136. *
  137. * In order to avoid having an already isolated movable page
  138. * being (wrongly) re-isolated while it is under migration,
  139. * or to avoid attempting to isolate pages being released,
  140. * lets be sure we have the page lock
  141. * before proceeding with the movable page isolation steps.
  142. */
  143. if (unlikely(!folio_trylock(folio)))
  144. goto out_putfolio;
  145. VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
  146. if (PageMovableOpsIsolated(page))
  147. goto out_no_isolated;
  148. mops = page_movable_ops(page);
  149. if (WARN_ON_ONCE(!mops))
  150. goto out_no_isolated;
  151. if (!mops->isolate_page(page, mode))
  152. goto out_no_isolated;
  153. /* Driver shouldn't use the isolated flag */
  154. VM_WARN_ON_ONCE_PAGE(PageMovableOpsIsolated(page), page);
  155. SetPageMovableOpsIsolated(page);
  156. folio_unlock(folio);
  157. return true;
  158. out_no_isolated:
  159. folio_unlock(folio);
  160. out_putfolio:
  161. folio_put(folio);
  162. out:
  163. return false;
  164. }
  165. /**
  166. * putback_movable_ops_page - putback an isolated movable_ops page
  167. * @page: The isolated page.
  168. *
  169. * Putback an isolated movable_ops page.
  170. *
  171. * After the page was putback, it might get freed instantly.
  172. */
  173. static void putback_movable_ops_page(struct page *page)
  174. {
  175. /*
  176. * TODO: these pages will not be folios in the future. All
  177. * folio dependencies will have to be removed.
  178. */
  179. struct folio *folio = page_folio(page);
  180. VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
  181. VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(page), page);
  182. folio_lock(folio);
  183. page_movable_ops(page)->putback_page(page);
  184. ClearPageMovableOpsIsolated(page);
  185. folio_unlock(folio);
  186. folio_put(folio);
  187. }
  188. /**
  189. * migrate_movable_ops_page - migrate an isolated movable_ops page
  190. * @dst: The destination page.
  191. * @src: The source page.
  192. * @mode: The migration mode.
  193. *
  194. * Migrate an isolated movable_ops page.
  195. *
  196. * If the src page was already released by its owner, the src page is
  197. * un-isolated (putback) and migration succeeds; the migration core will be the
  198. * owner of both pages.
  199. *
  200. * If the src page was not released by its owner and the migration was
  201. * successful, the owner of the src page and the dst page are swapped and
  202. * the src page is un-isolated.
  203. *
  204. * If migration fails, the ownership stays unmodified and the src page
  205. * remains isolated: migration may be retried later or the page can be putback.
  206. *
  207. * TODO: migration core will treat both pages as folios and lock them before
  208. * this call to unlock them after this call. Further, the folio refcounts on
  209. * src and dst are also released by migration core. These pages will not be
  210. * folios in the future, so that must be reworked.
  211. *
  212. * Returns 0 on success, otherwise a negative error code.
  213. */
  214. static int migrate_movable_ops_page(struct page *dst, struct page *src,
  215. enum migrate_mode mode)
  216. {
  217. int rc;
  218. VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src);
  219. VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src);
  220. rc = page_movable_ops(src)->migrate_page(dst, src, mode);
  221. if (!rc)
  222. ClearPageMovableOpsIsolated(src);
  223. return rc;
  224. }
  225. /*
  226. * Put previously isolated pages back onto the appropriate lists
  227. * from where they were once taken off for compaction/migration.
  228. *
  229. * This function shall be used whenever the isolated pageset has been
  230. * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
  231. * and folio_isolate_hugetlb().
  232. */
  233. void putback_movable_pages(struct list_head *l)
  234. {
  235. struct folio *folio;
  236. struct folio *folio2;
  237. list_for_each_entry_safe(folio, folio2, l, lru) {
  238. if (unlikely(folio_test_hugetlb(folio))) {
  239. folio_putback_hugetlb(folio);
  240. continue;
  241. }
  242. list_del(&folio->lru);
  243. if (unlikely(page_has_movable_ops(&folio->page))) {
  244. putback_movable_ops_page(&folio->page);
  245. } else {
  246. node_stat_mod_folio(folio, NR_ISOLATED_ANON +
  247. folio_is_file_lru(folio), -folio_nr_pages(folio));
  248. folio_putback_lru(folio);
  249. }
  250. }
  251. }
  252. /* Must be called with an elevated refcount on the non-hugetlb folio */
  253. bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
  254. {
  255. if (folio_test_hugetlb(folio))
  256. return folio_isolate_hugetlb(folio, list);
  257. if (page_has_movable_ops(&folio->page)) {
  258. if (!isolate_movable_ops_page(&folio->page,
  259. ISOLATE_UNEVICTABLE))
  260. return false;
  261. } else {
  262. if (!folio_isolate_lru(folio))
  263. return false;
  264. node_stat_add_folio(folio, NR_ISOLATED_ANON +
  265. folio_is_file_lru(folio));
  266. }
  267. list_add(&folio->lru, list);
  268. return true;
  269. }
  270. static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
  271. struct folio *folio, pte_t old_pte, unsigned long idx)
  272. {
  273. struct page *page = folio_page(folio, idx);
  274. pte_t newpte;
  275. if (PageCompound(page) || PageHWPoison(page))
  276. return false;
  277. VM_BUG_ON_PAGE(!PageAnon(page), page);
  278. VM_BUG_ON_PAGE(!PageLocked(page), page);
  279. VM_BUG_ON_PAGE(pte_present(old_pte), page);
  280. VM_WARN_ON_ONCE_FOLIO(folio_is_device_private(folio), folio);
  281. if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
  282. mm_forbids_zeropage(pvmw->vma->vm_mm))
  283. return false;
  284. /*
  285. * The pmd entry mapping the old thp was flushed and the pte mapping
  286. * this subpage has been non present. If the subpage is only zero-filled
  287. * then map it to the shared zeropage.
  288. */
  289. if (!pages_identical(page, ZERO_PAGE(0)))
  290. return false;
  291. newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
  292. pvmw->vma->vm_page_prot));
  293. if (pte_swp_soft_dirty(old_pte))
  294. newpte = pte_mksoft_dirty(newpte);
  295. if (pte_swp_uffd_wp(old_pte))
  296. newpte = pte_mkuffd_wp(newpte);
  297. set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
  298. dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
  299. return true;
  300. }
  301. struct rmap_walk_arg {
  302. struct folio *folio;
  303. bool map_unused_to_zeropage;
  304. };
  305. /*
  306. * Restore a potential migration pte to a working pte entry
  307. */
  308. static bool remove_migration_pte(struct folio *folio,
  309. struct vm_area_struct *vma, unsigned long addr, void *arg)
  310. {
  311. struct rmap_walk_arg *rmap_walk_arg = arg;
  312. DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
  313. while (page_vma_mapped_walk(&pvmw)) {
  314. rmap_t rmap_flags = RMAP_NONE;
  315. pte_t old_pte;
  316. pte_t pte;
  317. softleaf_t entry;
  318. struct page *new;
  319. unsigned long idx = 0;
  320. /* pgoff is invalid for ksm pages, but they are never large */
  321. if (folio_test_large(folio) && !folio_test_hugetlb(folio))
  322. idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
  323. new = folio_page(folio, idx);
  324. #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  325. /* PMD-mapped THP migration entry */
  326. if (!pvmw.pte) {
  327. VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
  328. !folio_test_pmd_mappable(folio), folio);
  329. remove_migration_pmd(&pvmw, new);
  330. continue;
  331. }
  332. #endif
  333. old_pte = ptep_get(pvmw.pte);
  334. if (rmap_walk_arg->map_unused_to_zeropage &&
  335. try_to_map_unused_to_zeropage(&pvmw, folio, old_pte, idx))
  336. continue;
  337. folio_get(folio);
  338. pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
  339. entry = softleaf_from_pte(old_pte);
  340. if (!softleaf_is_migration_young(entry))
  341. pte = pte_mkold(pte);
  342. if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry))
  343. pte = pte_mkdirty(pte);
  344. if (pte_swp_soft_dirty(old_pte))
  345. pte = pte_mksoft_dirty(pte);
  346. else
  347. pte = pte_clear_soft_dirty(pte);
  348. if (softleaf_is_migration_write(entry))
  349. pte = pte_mkwrite(pte, vma);
  350. else if (pte_swp_uffd_wp(old_pte))
  351. pte = pte_mkuffd_wp(pte);
  352. if (folio_test_anon(folio) && !softleaf_is_migration_read(entry))
  353. rmap_flags |= RMAP_EXCLUSIVE;
  354. if (unlikely(is_device_private_page(new))) {
  355. if (pte_write(pte))
  356. entry = make_writable_device_private_entry(
  357. page_to_pfn(new));
  358. else
  359. entry = make_readable_device_private_entry(
  360. page_to_pfn(new));
  361. pte = softleaf_to_pte(entry);
  362. if (pte_swp_soft_dirty(old_pte))
  363. pte = pte_swp_mksoft_dirty(pte);
  364. if (pte_swp_uffd_wp(old_pte))
  365. pte = pte_swp_mkuffd_wp(pte);
  366. }
  367. #ifdef CONFIG_HUGETLB_PAGE
  368. if (folio_test_hugetlb(folio)) {
  369. struct hstate *h = hstate_vma(vma);
  370. unsigned int shift = huge_page_shift(h);
  371. unsigned long psize = huge_page_size(h);
  372. pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
  373. if (folio_test_anon(folio))
  374. hugetlb_add_anon_rmap(folio, vma, pvmw.address,
  375. rmap_flags);
  376. else
  377. hugetlb_add_file_rmap(folio);
  378. set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte,
  379. psize);
  380. } else
  381. #endif
  382. {
  383. if (folio_test_anon(folio))
  384. folio_add_anon_rmap_pte(folio, new, vma,
  385. pvmw.address, rmap_flags);
  386. else
  387. folio_add_file_rmap_pte(folio, new, vma);
  388. set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
  389. }
  390. if (READ_ONCE(vma->vm_flags) & VM_LOCKED)
  391. mlock_drain_local();
  392. trace_remove_migration_pte(pvmw.address, pte_val(pte),
  393. compound_order(new));
  394. /* No need to invalidate - it was non-present before */
  395. update_mmu_cache(vma, pvmw.address, pvmw.pte);
  396. }
  397. return true;
  398. }
  399. /*
  400. * Get rid of all migration entries and replace them by
  401. * references to the indicated page.
  402. */
  403. void remove_migration_ptes(struct folio *src, struct folio *dst,
  404. enum ttu_flags flags)
  405. {
  406. struct rmap_walk_arg rmap_walk_arg = {
  407. .folio = src,
  408. .map_unused_to_zeropage = flags & TTU_USE_SHARED_ZEROPAGE,
  409. };
  410. struct rmap_walk_control rwc = {
  411. .rmap_one = remove_migration_pte,
  412. .arg = &rmap_walk_arg,
  413. };
  414. VM_BUG_ON_FOLIO((flags & TTU_USE_SHARED_ZEROPAGE) && (src != dst), src);
  415. if (flags & TTU_RMAP_LOCKED)
  416. rmap_walk_locked(dst, &rwc);
  417. else
  418. rmap_walk(dst, &rwc);
  419. }
  420. /*
  421. * Something used the pte of a page under migration. We need to
  422. * get to the page and wait until migration is finished.
  423. * When we return from this function the fault will be retried.
  424. */
  425. void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
  426. unsigned long address)
  427. {
  428. spinlock_t *ptl;
  429. pte_t *ptep;
  430. pte_t pte;
  431. softleaf_t entry;
  432. ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
  433. if (!ptep)
  434. return;
  435. pte = ptep_get(ptep);
  436. pte_unmap(ptep);
  437. if (pte_none(pte) || pte_present(pte))
  438. goto out;
  439. entry = softleaf_from_pte(pte);
  440. if (!softleaf_is_migration(entry))
  441. goto out;
  442. softleaf_entry_wait_on_locked(entry, ptl);
  443. return;
  444. out:
  445. spin_unlock(ptl);
  446. }
  447. #ifdef CONFIG_HUGETLB_PAGE
  448. /*
  449. * The vma read lock must be held upon entry. Holding that lock prevents either
  450. * the pte or the ptl from being freed.
  451. *
  452. * This function will release the vma lock before returning.
  453. */
  454. void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
  455. {
  456. spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
  457. softleaf_t entry;
  458. pte_t pte;
  459. hugetlb_vma_assert_locked(vma);
  460. spin_lock(ptl);
  461. pte = huge_ptep_get(vma->vm_mm, addr, ptep);
  462. if (huge_pte_none(pte))
  463. goto fail;
  464. entry = softleaf_from_pte(pte);
  465. if (softleaf_is_migration(entry)) {
  466. /*
  467. * If migration entry existed, safe to release vma lock
  468. * here because the pgtable page won't be freed without the
  469. * pgtable lock released. See comment right above pgtable
  470. * lock release in softleaf_entry_wait_on_locked().
  471. */
  472. hugetlb_vma_unlock_read(vma);
  473. softleaf_entry_wait_on_locked(entry, ptl);
  474. return;
  475. }
  476. fail:
  477. spin_unlock(ptl);
  478. hugetlb_vma_unlock_read(vma);
  479. }
  480. #endif
  481. #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  482. void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
  483. {
  484. spinlock_t *ptl;
  485. ptl = pmd_lock(mm, pmd);
  486. if (!pmd_is_migration_entry(*pmd))
  487. goto unlock;
  488. softleaf_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
  489. return;
  490. unlock:
  491. spin_unlock(ptl);
  492. }
  493. #endif
  494. /*
  495. * Replace the folio in the mapping.
  496. *
  497. * The number of remaining references must be:
  498. * 1 for anonymous folios without a mapping
  499. * 2 for folios with a mapping
  500. * 3 for folios with a mapping and the private flag set.
  501. */
  502. static int __folio_migrate_mapping(struct address_space *mapping,
  503. struct folio *newfolio, struct folio *folio, int expected_count)
  504. {
  505. XA_STATE(xas, &mapping->i_pages, folio->index);
  506. struct swap_cluster_info *ci = NULL;
  507. struct zone *oldzone, *newzone;
  508. int dirty;
  509. long nr = folio_nr_pages(folio);
  510. if (!mapping) {
  511. /* Take off deferred split queue while frozen and memcg set */
  512. if (folio_test_large(folio) &&
  513. folio_test_large_rmappable(folio)) {
  514. if (!folio_ref_freeze(folio, expected_count))
  515. return -EAGAIN;
  516. folio_unqueue_deferred_split(folio);
  517. folio_ref_unfreeze(folio, expected_count);
  518. }
  519. /* No turning back from here */
  520. newfolio->index = folio->index;
  521. newfolio->mapping = folio->mapping;
  522. if (folio_test_anon(folio) && folio_test_large(folio))
  523. mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
  524. if (folio_test_swapbacked(folio))
  525. __folio_set_swapbacked(newfolio);
  526. return 0;
  527. }
  528. oldzone = folio_zone(folio);
  529. newzone = folio_zone(newfolio);
  530. if (folio_test_swapcache(folio))
  531. ci = swap_cluster_get_and_lock_irq(folio);
  532. else
  533. xas_lock_irq(&xas);
  534. if (!folio_ref_freeze(folio, expected_count)) {
  535. if (ci)
  536. swap_cluster_unlock_irq(ci);
  537. else
  538. xas_unlock_irq(&xas);
  539. return -EAGAIN;
  540. }
  541. /* Take off deferred split queue while frozen and memcg set */
  542. folio_unqueue_deferred_split(folio);
  543. /*
  544. * Now we know that no one else is looking at the folio:
  545. * no turning back from here.
  546. */
  547. newfolio->index = folio->index;
  548. newfolio->mapping = folio->mapping;
  549. if (folio_test_anon(folio) && folio_test_large(folio))
  550. mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
  551. folio_ref_add(newfolio, nr); /* add cache reference */
  552. if (folio_test_swapbacked(folio))
  553. __folio_set_swapbacked(newfolio);
  554. if (folio_test_swapcache(folio)) {
  555. folio_set_swapcache(newfolio);
  556. newfolio->private = folio_get_private(folio);
  557. }
  558. /* Move dirty while folio refs frozen and newfolio not yet exposed */
  559. dirty = folio_test_dirty(folio);
  560. if (dirty) {
  561. folio_clear_dirty(folio);
  562. folio_set_dirty(newfolio);
  563. }
  564. if (folio_test_swapcache(folio))
  565. __swap_cache_replace_folio(ci, folio, newfolio);
  566. else
  567. xas_store(&xas, newfolio);
  568. /*
  569. * Drop cache reference from old folio by unfreezing
  570. * to one less reference.
  571. * We know this isn't the last reference.
  572. */
  573. folio_ref_unfreeze(folio, expected_count - nr);
  574. /* Leave irq disabled to prevent preemption while updating stats */
  575. if (ci)
  576. swap_cluster_unlock(ci);
  577. else
  578. xas_unlock(&xas);
  579. /*
  580. * If moved to a different zone then also account
  581. * the folio for that zone. Other VM counters will be
  582. * taken care of when we establish references to the
  583. * new folio and drop references to the old folio.
  584. *
  585. * Note that anonymous folios are accounted for
  586. * via NR_FILE_PAGES and NR_ANON_MAPPED if they
  587. * are mapped to swap space.
  588. */
  589. if (newzone != oldzone) {
  590. struct lruvec *old_lruvec, *new_lruvec;
  591. struct mem_cgroup *memcg;
  592. memcg = folio_memcg(folio);
  593. old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
  594. new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
  595. mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
  596. mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
  597. if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
  598. mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
  599. mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
  600. if (folio_test_pmd_mappable(folio)) {
  601. mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
  602. mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
  603. }
  604. }
  605. #ifdef CONFIG_SWAP
  606. if (folio_test_swapcache(folio)) {
  607. mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
  608. mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
  609. }
  610. #endif
  611. if (dirty && mapping_can_writeback(mapping)) {
  612. mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
  613. __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
  614. mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
  615. __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
  616. }
  617. }
  618. local_irq_enable();
  619. return 0;
  620. }
  621. int folio_migrate_mapping(struct address_space *mapping,
  622. struct folio *newfolio, struct folio *folio, int extra_count)
  623. {
  624. int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
  625. if (folio_ref_count(folio) != expected_count)
  626. return -EAGAIN;
  627. return __folio_migrate_mapping(mapping, newfolio, folio, expected_count);
  628. }
  629. EXPORT_SYMBOL(folio_migrate_mapping);
  630. /*
  631. * The expected number of remaining references is the same as that
  632. * of folio_migrate_mapping().
  633. */
  634. int migrate_huge_page_move_mapping(struct address_space *mapping,
  635. struct folio *dst, struct folio *src)
  636. {
  637. XA_STATE(xas, &mapping->i_pages, src->index);
  638. int rc, expected_count = folio_expected_ref_count(src) + 1;
  639. if (folio_ref_count(src) != expected_count)
  640. return -EAGAIN;
  641. rc = folio_mc_copy(dst, src);
  642. if (unlikely(rc))
  643. return rc;
  644. xas_lock_irq(&xas);
  645. if (!folio_ref_freeze(src, expected_count)) {
  646. xas_unlock_irq(&xas);
  647. return -EAGAIN;
  648. }
  649. dst->index = src->index;
  650. dst->mapping = src->mapping;
  651. folio_ref_add(dst, folio_nr_pages(dst));
  652. xas_store(&xas, dst);
  653. folio_ref_unfreeze(src, expected_count - folio_nr_pages(src));
  654. xas_unlock_irq(&xas);
  655. return 0;
  656. }
  657. /*
  658. * Copy the flags and some other ancillary information
  659. */
  660. void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
  661. {
  662. int cpupid;
  663. if (folio_test_referenced(folio))
  664. folio_set_referenced(newfolio);
  665. if (folio_test_uptodate(folio))
  666. folio_mark_uptodate(newfolio);
  667. if (folio_test_clear_active(folio)) {
  668. VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
  669. folio_set_active(newfolio);
  670. } else if (folio_test_clear_unevictable(folio))
  671. folio_set_unevictable(newfolio);
  672. if (folio_test_workingset(folio))
  673. folio_set_workingset(newfolio);
  674. if (folio_test_checked(folio))
  675. folio_set_checked(newfolio);
  676. /*
  677. * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
  678. * migration entries. We can still have PG_anon_exclusive set on an
  679. * effectively unmapped and unreferenced first sub-pages of an
  680. * anonymous THP: we can simply copy it here via PG_mappedtodisk.
  681. */
  682. if (folio_test_mappedtodisk(folio))
  683. folio_set_mappedtodisk(newfolio);
  684. /* Move dirty on pages not done by folio_migrate_mapping() */
  685. if (folio_test_dirty(folio))
  686. folio_set_dirty(newfolio);
  687. if (folio_test_young(folio))
  688. folio_set_young(newfolio);
  689. if (folio_test_idle(folio))
  690. folio_set_idle(newfolio);
  691. folio_migrate_refs(newfolio, folio);
  692. /*
  693. * Copy NUMA information to the new page, to prevent over-eager
  694. * future migrations of this same page.
  695. */
  696. cpupid = folio_xchg_last_cpupid(folio, -1);
  697. /*
  698. * For memory tiering mode, when migrate between slow and fast
  699. * memory node, reset cpupid, because that is used to record
  700. * page access time in slow memory node.
  701. */
  702. if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
  703. bool f_toptier = node_is_toptier(folio_nid(folio));
  704. bool t_toptier = node_is_toptier(folio_nid(newfolio));
  705. if (f_toptier != t_toptier)
  706. cpupid = -1;
  707. }
  708. folio_xchg_last_cpupid(newfolio, cpupid);
  709. folio_migrate_ksm(newfolio, folio);
  710. /*
  711. * Please do not reorder this without considering how mm/ksm.c's
  712. * ksm_get_folio() depends upon ksm_migrate_page() and the
  713. * swapcache flag.
  714. */
  715. if (folio_test_swapcache(folio))
  716. folio_clear_swapcache(folio);
  717. folio_clear_private(folio);
  718. /* page->private contains hugetlb specific flags */
  719. if (!folio_test_hugetlb(folio))
  720. folio->private = NULL;
  721. /*
  722. * If any waiters have accumulated on the new page then
  723. * wake them up.
  724. */
  725. if (folio_test_writeback(newfolio))
  726. folio_end_writeback(newfolio);
  727. /*
  728. * PG_readahead shares the same bit with PG_reclaim. The above
  729. * end_page_writeback() may clear PG_readahead mistakenly, so set the
  730. * bit after that.
  731. */
  732. if (folio_test_readahead(folio))
  733. folio_set_readahead(newfolio);
  734. folio_copy_owner(newfolio, folio);
  735. pgalloc_tag_swap(newfolio, folio);
  736. mem_cgroup_migrate(folio, newfolio);
  737. }
  738. EXPORT_SYMBOL(folio_migrate_flags);
  739. /************************************************************
  740. * Migration functions
  741. ***********************************************************/
  742. static int __migrate_folio(struct address_space *mapping, struct folio *dst,
  743. struct folio *src, void *src_private,
  744. enum migrate_mode mode)
  745. {
  746. int rc, expected_count = folio_expected_ref_count(src) + 1;
  747. /* Check whether src does not have extra refs before we do more work */
  748. if (folio_ref_count(src) != expected_count)
  749. return -EAGAIN;
  750. rc = folio_mc_copy(dst, src);
  751. if (unlikely(rc))
  752. return rc;
  753. rc = __folio_migrate_mapping(mapping, dst, src, expected_count);
  754. if (rc)
  755. return rc;
  756. if (src_private)
  757. folio_attach_private(dst, folio_detach_private(src));
  758. folio_migrate_flags(dst, src);
  759. return 0;
  760. }
  761. /**
  762. * migrate_folio() - Simple folio migration.
  763. * @mapping: The address_space containing the folio.
  764. * @dst: The folio to migrate the data to.
  765. * @src: The folio containing the current data.
  766. * @mode: How to migrate the page.
  767. *
  768. * Common logic to directly migrate a single LRU folio suitable for
  769. * folios that do not have private data.
  770. *
  771. * Folios are locked upon entry and exit.
  772. */
  773. int migrate_folio(struct address_space *mapping, struct folio *dst,
  774. struct folio *src, enum migrate_mode mode)
  775. {
  776. BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
  777. return __migrate_folio(mapping, dst, src, NULL, mode);
  778. }
  779. EXPORT_SYMBOL(migrate_folio);
  780. #ifdef CONFIG_BUFFER_HEAD
  781. /* Returns true if all buffers are successfully locked */
  782. static bool buffer_migrate_lock_buffers(struct buffer_head *head,
  783. enum migrate_mode mode)
  784. {
  785. struct buffer_head *bh = head;
  786. struct buffer_head *failed_bh;
  787. do {
  788. if (!trylock_buffer(bh)) {
  789. if (mode == MIGRATE_ASYNC)
  790. goto unlock;
  791. if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh))
  792. goto unlock;
  793. lock_buffer(bh);
  794. }
  795. bh = bh->b_this_page;
  796. } while (bh != head);
  797. return true;
  798. unlock:
  799. /* We failed to lock the buffer and cannot stall. */
  800. failed_bh = bh;
  801. bh = head;
  802. while (bh != failed_bh) {
  803. unlock_buffer(bh);
  804. bh = bh->b_this_page;
  805. }
  806. return false;
  807. }
  808. static int __buffer_migrate_folio(struct address_space *mapping,
  809. struct folio *dst, struct folio *src, enum migrate_mode mode,
  810. bool check_refs)
  811. {
  812. struct buffer_head *bh, *head;
  813. int rc;
  814. int expected_count;
  815. head = folio_buffers(src);
  816. if (!head)
  817. return migrate_folio(mapping, dst, src, mode);
  818. /* Check whether page does not have extra refs before we do more work */
  819. expected_count = folio_expected_ref_count(src) + 1;
  820. if (folio_ref_count(src) != expected_count)
  821. return -EAGAIN;
  822. if (!buffer_migrate_lock_buffers(head, mode))
  823. return -EAGAIN;
  824. if (check_refs) {
  825. bool busy, migrating;
  826. bool invalidated = false;
  827. migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
  828. VM_WARN_ON_ONCE(migrating);
  829. recheck_buffers:
  830. busy = false;
  831. spin_lock(&mapping->i_private_lock);
  832. bh = head;
  833. do {
  834. if (atomic_read(&bh->b_count)) {
  835. busy = true;
  836. break;
  837. }
  838. bh = bh->b_this_page;
  839. } while (bh != head);
  840. spin_unlock(&mapping->i_private_lock);
  841. if (busy) {
  842. if (invalidated) {
  843. rc = -EAGAIN;
  844. goto unlock_buffers;
  845. }
  846. invalidate_bh_lrus();
  847. invalidated = true;
  848. goto recheck_buffers;
  849. }
  850. }
  851. rc = filemap_migrate_folio(mapping, dst, src, mode);
  852. if (rc)
  853. goto unlock_buffers;
  854. bh = head;
  855. do {
  856. folio_set_bh(bh, dst, bh_offset(bh));
  857. bh = bh->b_this_page;
  858. } while (bh != head);
  859. unlock_buffers:
  860. if (check_refs)
  861. clear_bit_unlock(BH_Migrate, &head->b_state);
  862. bh = head;
  863. do {
  864. unlock_buffer(bh);
  865. bh = bh->b_this_page;
  866. } while (bh != head);
  867. return rc;
  868. }
  869. /**
  870. * buffer_migrate_folio() - Migration function for folios with buffers.
  871. * @mapping: The address space containing @src.
  872. * @dst: The folio to migrate to.
  873. * @src: The folio to migrate from.
  874. * @mode: How to migrate the folio.
  875. *
  876. * This function can only be used if the underlying filesystem guarantees
  877. * that no other references to @src exist. For example attached buffer
  878. * heads are accessed only under the folio lock. If your filesystem cannot
  879. * provide this guarantee, buffer_migrate_folio_norefs() may be more
  880. * appropriate.
  881. *
  882. * Return: 0 on success or a negative errno on failure.
  883. */
  884. int buffer_migrate_folio(struct address_space *mapping,
  885. struct folio *dst, struct folio *src, enum migrate_mode mode)
  886. {
  887. return __buffer_migrate_folio(mapping, dst, src, mode, false);
  888. }
  889. EXPORT_SYMBOL(buffer_migrate_folio);
  890. /**
  891. * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
  892. * @mapping: The address space containing @src.
  893. * @dst: The folio to migrate to.
  894. * @src: The folio to migrate from.
  895. * @mode: How to migrate the folio.
  896. *
  897. * Like buffer_migrate_folio() except that this variant is more careful
  898. * and checks that there are also no buffer head references. This function
  899. * is the right one for mappings where buffer heads are directly looked
  900. * up and referenced (such as block device mappings).
  901. *
  902. * Return: 0 on success or a negative errno on failure.
  903. */
  904. int buffer_migrate_folio_norefs(struct address_space *mapping,
  905. struct folio *dst, struct folio *src, enum migrate_mode mode)
  906. {
  907. return __buffer_migrate_folio(mapping, dst, src, mode, true);
  908. }
  909. EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
  910. #endif /* CONFIG_BUFFER_HEAD */
  911. int filemap_migrate_folio(struct address_space *mapping,
  912. struct folio *dst, struct folio *src, enum migrate_mode mode)
  913. {
  914. return __migrate_folio(mapping, dst, src, folio_get_private(src), mode);
  915. }
  916. EXPORT_SYMBOL_GPL(filemap_migrate_folio);
  917. /*
  918. * Default handling if a filesystem does not provide a migration function.
  919. */
  920. static int fallback_migrate_folio(struct address_space *mapping,
  921. struct folio *dst, struct folio *src, enum migrate_mode mode)
  922. {
  923. WARN_ONCE(mapping->a_ops->writepages,
  924. "%ps does not implement migrate_folio\n",
  925. mapping->a_ops);
  926. if (folio_test_dirty(src))
  927. return -EBUSY;
  928. /*
  929. * Filesystem may have private data at folio->private that we
  930. * can't migrate automatically.
  931. */
  932. if (!filemap_release_folio(src, GFP_KERNEL))
  933. return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
  934. return migrate_folio(mapping, dst, src, mode);
  935. }
  936. /*
  937. * Move a src folio to a newly allocated dst folio.
  938. *
  939. * The src and dst folios are locked and the src folios was unmapped from
  940. * the page tables.
  941. *
  942. * On success, the src folio was replaced by the dst folio.
  943. *
  944. * Return value:
  945. * < 0 - error code
  946. * 0 - success
  947. */
  948. static int move_to_new_folio(struct folio *dst, struct folio *src,
  949. enum migrate_mode mode)
  950. {
  951. struct address_space *mapping = folio_mapping(src);
  952. int rc = -EAGAIN;
  953. VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
  954. VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
  955. if (!mapping)
  956. rc = migrate_folio(mapping, dst, src, mode);
  957. else if (mapping_inaccessible(mapping))
  958. rc = -EOPNOTSUPP;
  959. else if (mapping->a_ops->migrate_folio)
  960. /*
  961. * Most folios have a mapping and most filesystems
  962. * provide a migrate_folio callback. Anonymous folios
  963. * are part of swap space which also has its own
  964. * migrate_folio callback. This is the most common path
  965. * for page migration.
  966. */
  967. rc = mapping->a_ops->migrate_folio(mapping, dst, src,
  968. mode);
  969. else
  970. rc = fallback_migrate_folio(mapping, dst, src, mode);
  971. if (!rc) {
  972. /*
  973. * For pagecache folios, src->mapping must be cleared before src
  974. * is freed. Anonymous folios must stay anonymous until freed.
  975. */
  976. if (!folio_test_anon(src))
  977. src->mapping = NULL;
  978. if (likely(!folio_is_zone_device(dst)))
  979. flush_dcache_folio(dst);
  980. }
  981. return rc;
  982. }
  983. /*
  984. * To record some information during migration, we use unused private
  985. * field of struct folio of the newly allocated destination folio.
  986. * This is safe because nobody is using it except us.
  987. */
  988. enum {
  989. PAGE_WAS_MAPPED = BIT(0),
  990. PAGE_WAS_MLOCKED = BIT(1),
  991. PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
  992. };
  993. static void __migrate_folio_record(struct folio *dst,
  994. int old_page_state,
  995. struct anon_vma *anon_vma)
  996. {
  997. dst->private = (void *)anon_vma + old_page_state;
  998. }
  999. static void __migrate_folio_extract(struct folio *dst,
  1000. int *old_page_state,
  1001. struct anon_vma **anon_vmap)
  1002. {
  1003. unsigned long private = (unsigned long)dst->private;
  1004. *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES);
  1005. *old_page_state = private & PAGE_OLD_STATES;
  1006. dst->private = NULL;
  1007. }
  1008. /* Restore the source folio to the original state upon failure */
  1009. static void migrate_folio_undo_src(struct folio *src,
  1010. int page_was_mapped,
  1011. struct anon_vma *anon_vma,
  1012. bool locked,
  1013. struct list_head *ret)
  1014. {
  1015. if (page_was_mapped)
  1016. remove_migration_ptes(src, src, 0);
  1017. /* Drop an anon_vma reference if we took one */
  1018. if (anon_vma)
  1019. put_anon_vma(anon_vma);
  1020. if (locked)
  1021. folio_unlock(src);
  1022. if (ret)
  1023. list_move_tail(&src->lru, ret);
  1024. }
  1025. /* Restore the destination folio to the original state upon failure */
  1026. static void migrate_folio_undo_dst(struct folio *dst, bool locked,
  1027. free_folio_t put_new_folio, unsigned long private)
  1028. {
  1029. if (locked)
  1030. folio_unlock(dst);
  1031. if (put_new_folio)
  1032. put_new_folio(dst, private);
  1033. else
  1034. folio_put(dst);
  1035. }
  1036. /* Cleanup src folio upon migration success */
  1037. static void migrate_folio_done(struct folio *src,
  1038. enum migrate_reason reason)
  1039. {
  1040. if (likely(!page_has_movable_ops(&src->page)) && reason != MR_DEMOTION)
  1041. mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
  1042. folio_is_file_lru(src), -folio_nr_pages(src));
  1043. if (reason != MR_MEMORY_FAILURE)
  1044. /* We release the page in page_handle_poison. */
  1045. folio_put(src);
  1046. }
  1047. /* Obtain the lock on page, remove all ptes. */
  1048. static int migrate_folio_unmap(new_folio_t get_new_folio,
  1049. free_folio_t put_new_folio, unsigned long private,
  1050. struct folio *src, struct folio **dstp, enum migrate_mode mode,
  1051. struct list_head *ret)
  1052. {
  1053. struct folio *dst;
  1054. int rc = -EAGAIN;
  1055. int old_page_state = 0;
  1056. struct anon_vma *anon_vma = NULL;
  1057. bool locked = false;
  1058. bool dst_locked = false;
  1059. dst = get_new_folio(src, private);
  1060. if (!dst)
  1061. return -ENOMEM;
  1062. *dstp = dst;
  1063. dst->private = NULL;
  1064. if (!folio_trylock(src)) {
  1065. if (mode == MIGRATE_ASYNC)
  1066. goto out;
  1067. /*
  1068. * It's not safe for direct compaction to call lock_page.
  1069. * For example, during page readahead pages are added locked
  1070. * to the LRU. Later, when the IO completes the pages are
  1071. * marked uptodate and unlocked. However, the queueing
  1072. * could be merging multiple pages for one bio (e.g.
  1073. * mpage_readahead). If an allocation happens for the
  1074. * second or third page, the process can end up locking
  1075. * the same page twice and deadlocking. Rather than
  1076. * trying to be clever about what pages can be locked,
  1077. * avoid the use of lock_page for direct compaction
  1078. * altogether.
  1079. */
  1080. if (current->flags & PF_MEMALLOC)
  1081. goto out;
  1082. /*
  1083. * In "light" mode, we can wait for transient locks (eg
  1084. * inserting a page into the page table), but it's not
  1085. * worth waiting for I/O.
  1086. */
  1087. if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src))
  1088. goto out;
  1089. folio_lock(src);
  1090. }
  1091. locked = true;
  1092. if (folio_test_mlocked(src))
  1093. old_page_state |= PAGE_WAS_MLOCKED;
  1094. if (folio_test_writeback(src)) {
  1095. /*
  1096. * Only in the case of a full synchronous migration is it
  1097. * necessary to wait for PageWriteback. In the async case,
  1098. * the retry loop is too short and in the sync-light case,
  1099. * the overhead of stalling is too much
  1100. */
  1101. switch (mode) {
  1102. case MIGRATE_SYNC:
  1103. break;
  1104. default:
  1105. rc = -EBUSY;
  1106. goto out;
  1107. }
  1108. folio_wait_writeback(src);
  1109. }
  1110. /*
  1111. * By try_to_migrate(), src->mapcount goes down to 0 here. In this case,
  1112. * we cannot notice that anon_vma is freed while we migrate a page.
  1113. * This get_anon_vma() delays freeing anon_vma pointer until the end
  1114. * of migration. File cache pages are no problem because of page_lock()
  1115. * File Caches may use write_page() or lock_page() in migration, then,
  1116. * just care Anon page here.
  1117. *
  1118. * Only folio_get_anon_vma() understands the subtleties of
  1119. * getting a hold on an anon_vma from outside one of its mms.
  1120. * But if we cannot get anon_vma, then we won't need it anyway,
  1121. * because that implies that the anon page is no longer mapped
  1122. * (and cannot be remapped so long as we hold the page lock).
  1123. */
  1124. if (folio_test_anon(src) && !folio_test_ksm(src))
  1125. anon_vma = folio_get_anon_vma(src);
  1126. /*
  1127. * Block others from accessing the new page when we get around to
  1128. * establishing additional references. We are usually the only one
  1129. * holding a reference to dst at this point. We used to have a BUG
  1130. * here if folio_trylock(dst) fails, but would like to allow for
  1131. * cases where there might be a race with the previous use of dst.
  1132. * This is much like races on refcount of oldpage: just don't BUG().
  1133. */
  1134. if (unlikely(!folio_trylock(dst)))
  1135. goto out;
  1136. dst_locked = true;
  1137. if (unlikely(page_has_movable_ops(&src->page))) {
  1138. __migrate_folio_record(dst, old_page_state, anon_vma);
  1139. return 0;
  1140. }
  1141. /*
  1142. * Corner case handling:
  1143. * 1. When a new swap-cache page is read into, it is added to the LRU
  1144. * and treated as swapcache but it has no rmap yet.
  1145. * Calling try_to_unmap() against a src->mapping==NULL page will
  1146. * trigger a BUG. So handle it here.
  1147. * 2. An orphaned page (see truncate_cleanup_page) might have
  1148. * fs-private metadata. The page can be picked up due to memory
  1149. * offlining. Everywhere else except page reclaim, the page is
  1150. * invisible to the vm, so the page can not be migrated. So try to
  1151. * free the metadata, so the page can be freed.
  1152. */
  1153. if (!src->mapping) {
  1154. if (folio_test_private(src)) {
  1155. try_to_free_buffers(src);
  1156. goto out;
  1157. }
  1158. } else if (folio_mapped(src)) {
  1159. /* Establish migration ptes */
  1160. VM_BUG_ON_FOLIO(folio_test_anon(src) &&
  1161. !folio_test_ksm(src) && !anon_vma, src);
  1162. try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
  1163. old_page_state |= PAGE_WAS_MAPPED;
  1164. }
  1165. if (!folio_mapped(src)) {
  1166. __migrate_folio_record(dst, old_page_state, anon_vma);
  1167. return 0;
  1168. }
  1169. out:
  1170. /*
  1171. * A folio that has not been unmapped will be restored to
  1172. * right list unless we want to retry.
  1173. */
  1174. if (rc == -EAGAIN)
  1175. ret = NULL;
  1176. migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
  1177. anon_vma, locked, ret);
  1178. migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
  1179. return rc;
  1180. }
  1181. /* Migrate the folio to the newly allocated folio in dst. */
  1182. static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
  1183. struct folio *src, struct folio *dst,
  1184. enum migrate_mode mode, enum migrate_reason reason,
  1185. struct list_head *ret)
  1186. {
  1187. int rc;
  1188. int old_page_state = 0;
  1189. struct anon_vma *anon_vma = NULL;
  1190. struct list_head *prev;
  1191. __migrate_folio_extract(dst, &old_page_state, &anon_vma);
  1192. prev = dst->lru.prev;
  1193. list_del(&dst->lru);
  1194. if (unlikely(page_has_movable_ops(&src->page))) {
  1195. rc = migrate_movable_ops_page(&dst->page, &src->page, mode);
  1196. if (rc)
  1197. goto out;
  1198. goto out_unlock_both;
  1199. }
  1200. rc = move_to_new_folio(dst, src, mode);
  1201. if (rc)
  1202. goto out;
  1203. /*
  1204. * When successful, push dst to LRU immediately: so that if it
  1205. * turns out to be an mlocked page, remove_migration_ptes() will
  1206. * automatically build up the correct dst->mlock_count for it.
  1207. *
  1208. * We would like to do something similar for the old page, when
  1209. * unsuccessful, and other cases when a page has been temporarily
  1210. * isolated from the unevictable LRU: but this case is the easiest.
  1211. */
  1212. folio_add_lru(dst);
  1213. if (old_page_state & PAGE_WAS_MLOCKED)
  1214. lru_add_drain();
  1215. if (old_page_state & PAGE_WAS_MAPPED)
  1216. remove_migration_ptes(src, dst, 0);
  1217. out_unlock_both:
  1218. folio_unlock(dst);
  1219. folio_set_owner_migrate_reason(dst, reason);
  1220. /*
  1221. * If migration is successful, decrease refcount of dst,
  1222. * which will not free the page because new page owner increased
  1223. * refcounter.
  1224. */
  1225. folio_put(dst);
  1226. /*
  1227. * A folio that has been migrated has all references removed
  1228. * and will be freed.
  1229. */
  1230. list_del(&src->lru);
  1231. /* Drop an anon_vma reference if we took one */
  1232. if (anon_vma)
  1233. put_anon_vma(anon_vma);
  1234. folio_unlock(src);
  1235. migrate_folio_done(src, reason);
  1236. return rc;
  1237. out:
  1238. /*
  1239. * A folio that has not been migrated will be restored to
  1240. * right list unless we want to retry.
  1241. */
  1242. if (rc == -EAGAIN) {
  1243. list_add(&dst->lru, prev);
  1244. __migrate_folio_record(dst, old_page_state, anon_vma);
  1245. return rc;
  1246. }
  1247. migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
  1248. anon_vma, true, ret);
  1249. migrate_folio_undo_dst(dst, true, put_new_folio, private);
  1250. return rc;
  1251. }
  1252. /*
  1253. * Counterpart of unmap_and_move_page() for hugepage migration.
  1254. *
  1255. * This function doesn't wait the completion of hugepage I/O
  1256. * because there is no race between I/O and migration for hugepage.
  1257. * Note that currently hugepage I/O occurs only in direct I/O
  1258. * where no lock is held and PG_writeback is irrelevant,
  1259. * and writeback status of all subpages are counted in the reference
  1260. * count of the head page (i.e. if all subpages of a 2MB hugepage are
  1261. * under direct I/O, the reference of the head page is 512 and a bit more.)
  1262. * This means that when we try to migrate hugepage whose subpages are
  1263. * doing direct I/O, some references remain after try_to_unmap() and
  1264. * hugepage migration fails without data corruption.
  1265. *
  1266. * There is also no race when direct I/O is issued on the page under migration,
  1267. * because then pte is replaced with migration swap entry and direct I/O code
  1268. * will wait in the page fault for migration to complete.
  1269. */
  1270. static int unmap_and_move_huge_page(new_folio_t get_new_folio,
  1271. free_folio_t put_new_folio, unsigned long private,
  1272. struct folio *src, int force, enum migrate_mode mode,
  1273. int reason, struct list_head *ret)
  1274. {
  1275. struct folio *dst;
  1276. int rc = -EAGAIN;
  1277. int page_was_mapped = 0;
  1278. struct anon_vma *anon_vma = NULL;
  1279. struct address_space *mapping = NULL;
  1280. enum ttu_flags ttu = 0;
  1281. if (folio_ref_count(src) == 1) {
  1282. /* page was freed from under us. So we are done. */
  1283. folio_putback_hugetlb(src);
  1284. return 0;
  1285. }
  1286. dst = get_new_folio(src, private);
  1287. if (!dst)
  1288. return -ENOMEM;
  1289. if (!folio_trylock(src)) {
  1290. if (!force)
  1291. goto out;
  1292. switch (mode) {
  1293. case MIGRATE_SYNC:
  1294. break;
  1295. default:
  1296. goto out;
  1297. }
  1298. folio_lock(src);
  1299. }
  1300. /*
  1301. * Check for pages which are in the process of being freed. Without
  1302. * folio_mapping() set, hugetlbfs specific move page routine will not
  1303. * be called and we could leak usage counts for subpools.
  1304. */
  1305. if (hugetlb_folio_subpool(src) && !folio_mapping(src)) {
  1306. rc = -EBUSY;
  1307. goto out_unlock;
  1308. }
  1309. if (folio_test_anon(src))
  1310. anon_vma = folio_get_anon_vma(src);
  1311. if (unlikely(!folio_trylock(dst)))
  1312. goto put_anon;
  1313. if (folio_mapped(src)) {
  1314. if (!folio_test_anon(src)) {
  1315. /*
  1316. * In shared mappings, try_to_unmap could potentially
  1317. * call huge_pmd_unshare. Because of this, take
  1318. * semaphore in write mode here and set TTU_RMAP_LOCKED
  1319. * to let lower levels know we have taken the lock.
  1320. */
  1321. mapping = hugetlb_folio_mapping_lock_write(src);
  1322. if (unlikely(!mapping))
  1323. goto unlock_put_anon;
  1324. ttu = TTU_RMAP_LOCKED;
  1325. }
  1326. try_to_migrate(src, ttu);
  1327. page_was_mapped = 1;
  1328. }
  1329. if (!folio_mapped(src))
  1330. rc = move_to_new_folio(dst, src, mode);
  1331. if (page_was_mapped)
  1332. remove_migration_ptes(src, !rc ? dst : src, ttu);
  1333. if (ttu & TTU_RMAP_LOCKED)
  1334. i_mmap_unlock_write(mapping);
  1335. unlock_put_anon:
  1336. folio_unlock(dst);
  1337. put_anon:
  1338. if (anon_vma)
  1339. put_anon_vma(anon_vma);
  1340. if (!rc) {
  1341. move_hugetlb_state(src, dst, reason);
  1342. put_new_folio = NULL;
  1343. }
  1344. out_unlock:
  1345. folio_unlock(src);
  1346. out:
  1347. if (!rc)
  1348. folio_putback_hugetlb(src);
  1349. else if (rc != -EAGAIN)
  1350. list_move_tail(&src->lru, ret);
  1351. /*
  1352. * If migration was not successful and there's a freeing callback,
  1353. * return the folio to that special allocator. Otherwise, simply drop
  1354. * our additional reference.
  1355. */
  1356. if (put_new_folio)
  1357. put_new_folio(dst, private);
  1358. else
  1359. folio_put(dst);
  1360. return rc;
  1361. }
  1362. static inline int try_split_folio(struct folio *folio, struct list_head *split_folios,
  1363. enum migrate_mode mode)
  1364. {
  1365. int rc;
  1366. if (mode == MIGRATE_ASYNC) {
  1367. if (!folio_trylock(folio))
  1368. return -EAGAIN;
  1369. } else {
  1370. folio_lock(folio);
  1371. }
  1372. rc = split_folio_to_list(folio, split_folios);
  1373. folio_unlock(folio);
  1374. if (!rc)
  1375. list_move_tail(&folio->lru, split_folios);
  1376. return rc;
  1377. }
  /* Batch size limit: HPAGE_PMD_NR with THP support, 512 without. */
  #ifndef CONFIG_TRANSPARENT_HUGEPAGE
  #define NR_MAX_BATCHED_MIGRATION	512
  #else
  #define NR_MAX_BATCHED_MIGRATION	HPAGE_PMD_NR
  #endif

  /* Total retry budget and its async share; the sync share is the rest. */
  #define NR_MAX_MIGRATE_PAGES_RETRY	10
  #define NR_MAX_MIGRATE_ASYNC_RETRY	3
  #define NR_MAX_MIGRATE_SYNC_RETRY	(NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
  /* Counters accumulated over one migration run. */
  struct migrate_pages_stats {
  	/*
  	 * Normal and large folios migrated successfully, in units of base
  	 * pages.
  	 */
  	int nr_succeeded;
  	/*
  	 * Normal and large folios that failed to be migrated, in units of
  	 * base pages.  Untried folios aren't counted.
  	 */
  	int nr_failed_pages;
  	/* THP migrated successfully */
  	int nr_thp_succeeded;
  	/* THP failed to be migrated */
  	int nr_thp_failed;
  	/* THP split before migrating */
  	int nr_thp_split;
  	/* Large folio (include THP) split before migrating */
  	int nr_split;
  };
  1397. /*
  1398. * Returns the number of hugetlb folios that were not migrated, or an error code
  1399. * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable
  1400. * any more because the list has become empty or no retryable hugetlb folios
  1401. * exist any more. It is caller's responsibility to call putback_movable_pages()
  1402. * only if ret != 0.
  1403. */
  1404. static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
  1405. free_folio_t put_new_folio, unsigned long private,
  1406. enum migrate_mode mode, int reason,
  1407. struct migrate_pages_stats *stats,
  1408. struct list_head *ret_folios)
  1409. {
  1410. int retry = 1;
  1411. int nr_failed = 0;
  1412. int nr_retry_pages = 0;
  1413. int pass = 0;
  1414. struct folio *folio, *folio2;
  1415. int rc, nr_pages;
  1416. for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) {
  1417. retry = 0;
  1418. nr_retry_pages = 0;
  1419. list_for_each_entry_safe(folio, folio2, from, lru) {
  1420. if (!folio_test_hugetlb(folio))
  1421. continue;
  1422. nr_pages = folio_nr_pages(folio);
  1423. cond_resched();
  1424. /*
  1425. * Migratability of hugepages depends on architectures and
  1426. * their size. This check is necessary because some callers
  1427. * of hugepage migration like soft offline and memory
  1428. * hotremove don't walk through page tables or check whether
  1429. * the hugepage is pmd-based or not before kicking migration.
  1430. */
  1431. if (!hugepage_migration_supported(folio_hstate(folio))) {
  1432. nr_failed++;
  1433. stats->nr_failed_pages += nr_pages;
  1434. list_move_tail(&folio->lru, ret_folios);
  1435. continue;
  1436. }
  1437. rc = unmap_and_move_huge_page(get_new_folio,
  1438. put_new_folio, private,
  1439. folio, pass > 2, mode,
  1440. reason, ret_folios);
  1441. /*
  1442. * The rules are:
  1443. * 0: hugetlb folio will be put back
  1444. * -EAGAIN: stay on the from list
  1445. * -ENOMEM: stay on the from list
  1446. * Other errno: put on ret_folios list
  1447. */
  1448. switch(rc) {
  1449. case -ENOMEM:
  1450. /*
  1451. * When memory is low, don't bother to try to migrate
  1452. * other folios, just exit.
  1453. */
  1454. stats->nr_failed_pages += nr_pages + nr_retry_pages;
  1455. return -ENOMEM;
  1456. case -EAGAIN:
  1457. retry++;
  1458. nr_retry_pages += nr_pages;
  1459. break;
  1460. case 0:
  1461. stats->nr_succeeded += nr_pages;
  1462. break;
  1463. default:
  1464. /*
  1465. * Permanent failure (-EBUSY, etc.):
  1466. * unlike -EAGAIN case, the failed folio is
  1467. * removed from migration folio list and not
  1468. * retried in the next outer loop.
  1469. */
  1470. nr_failed++;
  1471. stats->nr_failed_pages += nr_pages;
  1472. break;
  1473. }
  1474. }
  1475. }
  1476. /*
  1477. * nr_failed is number of hugetlb folios failed to be migrated. After
  1478. * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb
  1479. * folios as failed.
  1480. */
  1481. nr_failed += retry;
  1482. stats->nr_failed_pages += nr_retry_pages;
  1483. return nr_failed;
  1484. }
  1485. static void migrate_folios_move(struct list_head *src_folios,
  1486. struct list_head *dst_folios,
  1487. free_folio_t put_new_folio, unsigned long private,
  1488. enum migrate_mode mode, int reason,
  1489. struct list_head *ret_folios,
  1490. struct migrate_pages_stats *stats,
  1491. int *retry, int *thp_retry, int *nr_failed,
  1492. int *nr_retry_pages)
  1493. {
  1494. struct folio *folio, *folio2, *dst, *dst2;
  1495. bool is_thp;
  1496. int nr_pages;
  1497. int rc;
  1498. dst = list_first_entry(dst_folios, struct folio, lru);
  1499. dst2 = list_next_entry(dst, lru);
  1500. list_for_each_entry_safe(folio, folio2, src_folios, lru) {
  1501. is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
  1502. nr_pages = folio_nr_pages(folio);
  1503. cond_resched();
  1504. rc = migrate_folio_move(put_new_folio, private,
  1505. folio, dst, mode,
  1506. reason, ret_folios);
  1507. /*
  1508. * The rules are:
  1509. * 0: folio will be freed
  1510. * -EAGAIN: stay on the unmap_folios list
  1511. * Other errno: put on ret_folios list
  1512. */
  1513. switch (rc) {
  1514. case -EAGAIN:
  1515. *retry += 1;
  1516. *thp_retry += is_thp;
  1517. *nr_retry_pages += nr_pages;
  1518. break;
  1519. case 0:
  1520. stats->nr_succeeded += nr_pages;
  1521. stats->nr_thp_succeeded += is_thp;
  1522. break;
  1523. default:
  1524. *nr_failed += 1;
  1525. stats->nr_thp_failed += is_thp;
  1526. stats->nr_failed_pages += nr_pages;
  1527. break;
  1528. }
  1529. dst = dst2;
  1530. dst2 = list_next_entry(dst, lru);
  1531. }
  1532. }
  1533. static void migrate_folios_undo(struct list_head *src_folios,
  1534. struct list_head *dst_folios,
  1535. free_folio_t put_new_folio, unsigned long private,
  1536. struct list_head *ret_folios)
  1537. {
  1538. struct folio *folio, *folio2, *dst, *dst2;
  1539. dst = list_first_entry(dst_folios, struct folio, lru);
  1540. dst2 = list_next_entry(dst, lru);
  1541. list_for_each_entry_safe(folio, folio2, src_folios, lru) {
  1542. int old_page_state = 0;
  1543. struct anon_vma *anon_vma = NULL;
  1544. __migrate_folio_extract(dst, &old_page_state, &anon_vma);
  1545. migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
  1546. anon_vma, true, ret_folios);
  1547. list_del(&dst->lru);
  1548. migrate_folio_undo_dst(dst, true, put_new_folio, private);
  1549. dst = dst2;
  1550. dst2 = list_next_entry(dst, lru);
  1551. }
  1552. }
  1553. /*
  1554. * migrate_pages_batch() first unmaps folios in the from list as many as
  1555. * possible, then move the unmapped folios.
  1556. *
  1557. * We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a
  1558. * lock or bit when we have locked more than one folio. Which may cause
  1559. * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the
  1560. * length of the from list must be <= 1.
  1561. */
  1562. static int migrate_pages_batch(struct list_head *from,
  1563. new_folio_t get_new_folio, free_folio_t put_new_folio,
  1564. unsigned long private, enum migrate_mode mode, int reason,
  1565. struct list_head *ret_folios, struct list_head *split_folios,
  1566. struct migrate_pages_stats *stats, int nr_pass)
  1567. {
  1568. int retry = 1;
  1569. int thp_retry = 1;
  1570. int nr_failed = 0;
  1571. int nr_retry_pages = 0;
  1572. int pass = 0;
  1573. bool is_thp = false;
  1574. bool is_large = false;
  1575. struct folio *folio, *folio2, *dst = NULL;
  1576. int rc, rc_saved = 0, nr_pages;
  1577. LIST_HEAD(unmap_folios);
  1578. LIST_HEAD(dst_folios);
  1579. bool nosplit = (reason == MR_NUMA_MISPLACED);
  1580. VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
  1581. !list_empty(from) && !list_is_singular(from));
  1582. for (pass = 0; pass < nr_pass && retry; pass++) {
  1583. retry = 0;
  1584. thp_retry = 0;
  1585. nr_retry_pages = 0;
  1586. list_for_each_entry_safe(folio, folio2, from, lru) {
  1587. is_large = folio_test_large(folio);
  1588. is_thp = folio_test_pmd_mappable(folio);
  1589. nr_pages = folio_nr_pages(folio);
  1590. cond_resched();
  1591. /*
  1592. * The rare folio on the deferred split list should
  1593. * be split now. It should not count as a failure:
  1594. * but increment nr_failed because, without doing so,
  1595. * migrate_pages() may report success with (split but
  1596. * unmigrated) pages still on its fromlist; whereas it
  1597. * always reports success when its fromlist is empty.
  1598. * stats->nr_thp_failed should be increased too,
  1599. * otherwise stats inconsistency will happen when
  1600. * migrate_pages_batch is called via migrate_pages()
  1601. * with MIGRATE_SYNC and MIGRATE_ASYNC.
  1602. *
  1603. * Only check it without removing it from the list.
  1604. * Since the folio can be on deferred_split_scan()
  1605. * local list and removing it can cause the local list
  1606. * corruption. Folio split process below can handle it
  1607. * with the help of folio_ref_freeze().
  1608. *
  1609. * nr_pages > 2 is needed to avoid checking order-1
  1610. * page cache folios. They exist, in contrast to
  1611. * non-existent order-1 anonymous folios, and do not
  1612. * use _deferred_list.
  1613. */
  1614. if (nr_pages > 2 &&
  1615. !list_empty(&folio->_deferred_list) &&
  1616. folio_test_partially_mapped(folio)) {
  1617. if (!try_split_folio(folio, split_folios, mode)) {
  1618. nr_failed++;
  1619. stats->nr_thp_failed += is_thp;
  1620. stats->nr_thp_split += is_thp;
  1621. stats->nr_split++;
  1622. continue;
  1623. }
  1624. }
  1625. /*
  1626. * Large folio migration might be unsupported or
  1627. * the allocation might be failed so we should retry
  1628. * on the same folio with the large folio split
  1629. * to normal folios.
  1630. *
  1631. * Split folios are put in split_folios, and
  1632. * we will migrate them after the rest of the
  1633. * list is processed.
  1634. */
  1635. if (!thp_migration_supported() && is_thp) {
  1636. nr_failed++;
  1637. stats->nr_thp_failed++;
  1638. if (!try_split_folio(folio, split_folios, mode)) {
  1639. stats->nr_thp_split++;
  1640. stats->nr_split++;
  1641. continue;
  1642. }
  1643. stats->nr_failed_pages += nr_pages;
  1644. list_move_tail(&folio->lru, ret_folios);
  1645. continue;
  1646. }
  1647. /*
  1648. * If we are holding the last folio reference, the folio
  1649. * was freed from under us, so just drop our reference.
  1650. */
  1651. if (likely(!page_has_movable_ops(&folio->page)) &&
  1652. folio_ref_count(folio) == 1) {
  1653. folio_clear_active(folio);
  1654. folio_clear_unevictable(folio);
  1655. list_del(&folio->lru);
  1656. migrate_folio_done(folio, reason);
  1657. stats->nr_succeeded += nr_pages;
  1658. stats->nr_thp_succeeded += is_thp;
  1659. continue;
  1660. }
  1661. rc = migrate_folio_unmap(get_new_folio, put_new_folio,
  1662. private, folio, &dst, mode, ret_folios);
  1663. /*
  1664. * The rules are:
  1665. * 0: folio will be put on unmap_folios list,
  1666. * dst folio put on dst_folios list
  1667. * -EAGAIN: stay on the from list
  1668. * -ENOMEM: stay on the from list
  1669. * Other errno: put on ret_folios list
  1670. */
  1671. switch(rc) {
  1672. case -ENOMEM:
  1673. /*
  1674. * When memory is low, don't bother to try to migrate
  1675. * other folios, move unmapped folios, then exit.
  1676. */
  1677. nr_failed++;
  1678. stats->nr_thp_failed += is_thp;
  1679. /* Large folio NUMA faulting doesn't split to retry. */
  1680. if (is_large && !nosplit) {
  1681. int ret = try_split_folio(folio, split_folios, mode);
  1682. if (!ret) {
  1683. stats->nr_thp_split += is_thp;
  1684. stats->nr_split++;
  1685. break;
  1686. } else if (reason == MR_LONGTERM_PIN &&
  1687. ret == -EAGAIN) {
  1688. /*
  1689. * Try again to split large folio to
  1690. * mitigate the failure of longterm pinning.
  1691. */
  1692. retry++;
  1693. thp_retry += is_thp;
  1694. nr_retry_pages += nr_pages;
  1695. /* Undo duplicated failure counting. */
  1696. nr_failed--;
  1697. stats->nr_thp_failed -= is_thp;
  1698. break;
  1699. }
  1700. }
  1701. stats->nr_failed_pages += nr_pages + nr_retry_pages;
  1702. /* nr_failed isn't updated for not used */
  1703. stats->nr_thp_failed += thp_retry;
  1704. rc_saved = rc;
  1705. if (list_empty(&unmap_folios))
  1706. goto out;
  1707. else
  1708. goto move;
  1709. case -EAGAIN:
  1710. retry++;
  1711. thp_retry += is_thp;
  1712. nr_retry_pages += nr_pages;
  1713. break;
  1714. case 0:
  1715. list_move_tail(&folio->lru, &unmap_folios);
  1716. list_add_tail(&dst->lru, &dst_folios);
  1717. break;
  1718. default:
  1719. /*
  1720. * Permanent failure (-EBUSY, etc.):
  1721. * unlike -EAGAIN case, the failed folio is
  1722. * removed from migration folio list and not
  1723. * retried in the next outer loop.
  1724. */
  1725. nr_failed++;
  1726. stats->nr_thp_failed += is_thp;
  1727. stats->nr_failed_pages += nr_pages;
  1728. break;
  1729. }
  1730. }
  1731. }
  1732. nr_failed += retry;
  1733. stats->nr_thp_failed += thp_retry;
  1734. stats->nr_failed_pages += nr_retry_pages;
  1735. move:
  1736. /* Flush TLBs for all unmapped folios */
  1737. try_to_unmap_flush();
  1738. retry = 1;
  1739. for (pass = 0; pass < nr_pass && retry; pass++) {
  1740. retry = 0;
  1741. thp_retry = 0;
  1742. nr_retry_pages = 0;
  1743. /* Move the unmapped folios */
  1744. migrate_folios_move(&unmap_folios, &dst_folios,
  1745. put_new_folio, private, mode, reason,
  1746. ret_folios, stats, &retry, &thp_retry,
  1747. &nr_failed, &nr_retry_pages);
  1748. }
  1749. nr_failed += retry;
  1750. stats->nr_thp_failed += thp_retry;
  1751. stats->nr_failed_pages += nr_retry_pages;
  1752. rc = rc_saved ? : nr_failed;
  1753. out:
  1754. /* Cleanup remaining folios */
  1755. migrate_folios_undo(&unmap_folios, &dst_folios,
  1756. put_new_folio, private, ret_folios);
  1757. return rc;
  1758. }
  1759. static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
  1760. free_folio_t put_new_folio, unsigned long private,
  1761. enum migrate_mode mode, int reason,
  1762. struct list_head *ret_folios, struct list_head *split_folios,
  1763. struct migrate_pages_stats *stats)
  1764. {
  1765. int rc, nr_failed = 0;
  1766. LIST_HEAD(folios);
  1767. struct migrate_pages_stats astats;
  1768. memset(&astats, 0, sizeof(astats));
  1769. /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */
  1770. rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC,
  1771. reason, &folios, split_folios, &astats,
  1772. NR_MAX_MIGRATE_ASYNC_RETRY);
  1773. stats->nr_succeeded += astats.nr_succeeded;
  1774. stats->nr_thp_succeeded += astats.nr_thp_succeeded;
  1775. stats->nr_thp_split += astats.nr_thp_split;
  1776. stats->nr_split += astats.nr_split;
  1777. if (rc < 0) {
  1778. stats->nr_failed_pages += astats.nr_failed_pages;
  1779. stats->nr_thp_failed += astats.nr_thp_failed;
  1780. list_splice_tail(&folios, ret_folios);
  1781. return rc;
  1782. }
  1783. stats->nr_thp_failed += astats.nr_thp_split;
  1784. /*
  1785. * Do not count rc, as pages will be retried below.
  1786. * Count nr_split only, since it includes nr_thp_split.
  1787. */
  1788. nr_failed += astats.nr_split;
  1789. /*
  1790. * Fall back to migrate all failed folios one by one synchronously. All
  1791. * failed folios except split THPs will be retried, so their failure
  1792. * isn't counted
  1793. */
  1794. list_splice_tail_init(&folios, from);
  1795. while (!list_empty(from)) {
  1796. list_move(from->next, &folios);
  1797. rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
  1798. private, mode, reason, ret_folios,
  1799. split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
  1800. list_splice_tail_init(&folios, ret_folios);
  1801. if (rc < 0)
  1802. return rc;
  1803. nr_failed += rc;
  1804. }
  1805. return nr_failed;
  1806. }
  1807. /*
  1808. * migrate_pages - migrate the folios specified in a list, to the free folios
  1809. * supplied as the target for the page migration
  1810. *
  1811. * @from: The list of folios to be migrated.
  1812. * @get_new_folio: The function used to allocate free folios to be used
  1813. * as the target of the folio migration.
  1814. * @put_new_folio: The function used to free target folios if migration
  1815. * fails, or NULL if no special handling is necessary.
  1816. * @private: Private data to be passed on to get_new_folio()
  1817. * @mode: The migration mode that specifies the constraints for
  1818. * folio migration, if any.
  1819. * @reason: The reason for folio migration.
  1820. * @ret_succeeded: Set to the number of folios migrated successfully if
  1821. * the caller passes a non-NULL pointer.
  1822. *
  1823. * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios
  1824. * are movable any more because the list has become empty or no retryable folios
  1825. * exist any more. It is caller's responsibility to call putback_movable_pages()
  1826. * only if ret != 0.
  1827. *
  1828. * Returns the number of {normal folio, large folio, hugetlb} that were not
  1829. * migrated, or an error code. The number of large folio splits will be
  1830. * considered as the number of non-migrated large folio, no matter how many
  1831. * split folios of the large folio are migrated successfully.
  1832. */
  1833. int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
  1834. free_folio_t put_new_folio, unsigned long private,
  1835. enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
  1836. {
  1837. int rc, rc_gather;
  1838. int nr_pages;
  1839. struct folio *folio, *folio2;
  1840. LIST_HEAD(folios);
  1841. LIST_HEAD(ret_folios);
  1842. LIST_HEAD(split_folios);
  1843. struct migrate_pages_stats stats;
  1844. trace_mm_migrate_pages_start(mode, reason);
  1845. memset(&stats, 0, sizeof(stats));
  1846. rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private,
  1847. mode, reason, &stats, &ret_folios);
  1848. if (rc_gather < 0)
  1849. goto out;
  1850. again:
  1851. nr_pages = 0;
  1852. list_for_each_entry_safe(folio, folio2, from, lru) {
  1853. /* Retried hugetlb folios will be kept in list */
  1854. if (folio_test_hugetlb(folio)) {
  1855. list_move_tail(&folio->lru, &ret_folios);
  1856. continue;
  1857. }
  1858. nr_pages += folio_nr_pages(folio);
  1859. if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
  1860. break;
  1861. }
  1862. if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
  1863. list_cut_before(&folios, from, &folio2->lru);
  1864. else
  1865. list_splice_init(from, &folios);
  1866. if (mode == MIGRATE_ASYNC)
  1867. rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
  1868. private, mode, reason, &ret_folios,
  1869. &split_folios, &stats,
  1870. NR_MAX_MIGRATE_PAGES_RETRY);
  1871. else
  1872. rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio,
  1873. private, mode, reason, &ret_folios,
  1874. &split_folios, &stats);
  1875. list_splice_tail_init(&folios, &ret_folios);
  1876. if (rc < 0) {
  1877. rc_gather = rc;
  1878. list_splice_tail(&split_folios, &ret_folios);
  1879. goto out;
  1880. }
  1881. if (!list_empty(&split_folios)) {
  1882. /*
  1883. * Failure isn't counted since all split folios of a large folio
  1884. * is counted as 1 failure already. And, we only try to migrate
  1885. * with minimal effort, force MIGRATE_ASYNC mode and retry once.
  1886. */
  1887. migrate_pages_batch(&split_folios, get_new_folio,
  1888. put_new_folio, private, MIGRATE_ASYNC, reason,
  1889. &ret_folios, NULL, &stats, 1);
  1890. list_splice_tail_init(&split_folios, &ret_folios);
  1891. }
  1892. rc_gather += rc;
  1893. if (!list_empty(from))
  1894. goto again;
  1895. out:
  1896. /*
  1897. * Put the permanent failure folio back to migration list, they
  1898. * will be put back to the right list by the caller.
  1899. */
  1900. list_splice(&ret_folios, from);
  1901. /*
  1902. * Return 0 in case all split folios of fail-to-migrate large folios
  1903. * are migrated successfully.
  1904. */
  1905. if (list_empty(from))
  1906. rc_gather = 0;
  1907. count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded);
  1908. count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages);
  1909. count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded);
  1910. count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed);
  1911. count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
  1912. trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
  1913. stats.nr_thp_succeeded, stats.nr_thp_failed,
  1914. stats.nr_thp_split, stats.nr_split, mode,
  1915. reason);
  1916. if (ret_succeeded)
  1917. *ret_succeeded = stats.nr_succeeded;
  1918. return rc_gather;
  1919. }
  1920. struct folio *alloc_migration_target(struct folio *src, unsigned long private)
  1921. {
  1922. struct migration_target_control *mtc;
  1923. gfp_t gfp_mask;
  1924. unsigned int order = 0;
  1925. int nid;
  1926. enum zone_type zidx;
  1927. mtc = (struct migration_target_control *)private;
  1928. gfp_mask = mtc->gfp_mask;
  1929. nid = mtc->nid;
  1930. if (nid == NUMA_NO_NODE)
  1931. nid = folio_nid(src);
  1932. if (folio_test_hugetlb(src)) {
  1933. struct hstate *h = folio_hstate(src);
  1934. gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
  1935. return alloc_hugetlb_folio_nodemask(h, nid,
  1936. mtc->nmask, gfp_mask,
  1937. htlb_allow_alloc_fallback(mtc->reason));
  1938. }
  1939. if (folio_test_large(src)) {
  1940. /*
  1941. * clear __GFP_RECLAIM to make the migration callback
  1942. * consistent with regular THP allocations.
  1943. */
  1944. gfp_mask &= ~__GFP_RECLAIM;
  1945. gfp_mask |= GFP_TRANSHUGE;
  1946. order = folio_order(src);
  1947. }
  1948. zidx = folio_zonenum(src);
  1949. if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
  1950. gfp_mask |= __GFP_HIGHMEM;
  1951. return __folio_alloc(gfp_mask, order, nid, mtc->nmask);
  1952. }
  1953. #ifdef CONFIG_NUMA
  1954. static int store_status(int __user *status, int start, int value, int nr)
  1955. {
  1956. while (nr-- > 0) {
  1957. if (put_user(value, status + start))
  1958. return -EFAULT;
  1959. start++;
  1960. }
  1961. return 0;
  1962. }
  1963. static int do_move_pages_to_node(struct list_head *pagelist, int node)
  1964. {
  1965. int err;
  1966. struct migration_target_control mtc = {
  1967. .nid = node,
  1968. .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
  1969. .reason = MR_SYSCALL,
  1970. };
  1971. err = migrate_pages(pagelist, alloc_migration_target, NULL,
  1972. (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
  1973. if (err)
  1974. putback_movable_pages(pagelist);
  1975. return err;
  1976. }
  1977. static int __add_folio_for_migration(struct folio *folio, int node,
  1978. struct list_head *pagelist, bool migrate_all)
  1979. {
  1980. if (is_zero_folio(folio) || is_huge_zero_folio(folio))
  1981. return -EFAULT;
  1982. if (folio_is_zone_device(folio))
  1983. return -ENOENT;
  1984. if (folio_nid(folio) == node)
  1985. return 0;
  1986. if (folio_maybe_mapped_shared(folio) && !migrate_all)
  1987. return -EACCES;
  1988. if (folio_test_hugetlb(folio)) {
  1989. if (folio_isolate_hugetlb(folio, pagelist))
  1990. return 1;
  1991. } else if (folio_isolate_lru(folio)) {
  1992. list_add_tail(&folio->lru, pagelist);
  1993. node_stat_mod_folio(folio,
  1994. NR_ISOLATED_ANON + folio_is_file_lru(folio),
  1995. folio_nr_pages(folio));
  1996. return 1;
  1997. }
  1998. return -EBUSY;
  1999. }
  2000. /*
  2001. * Resolves the given address to a struct folio, isolates it from the LRU and
  2002. * puts it to the given pagelist.
  2003. * Returns:
  2004. * errno - if the folio cannot be found/isolated
  2005. * 0 - when it doesn't have to be migrated because it is already on the
  2006. * target node
  2007. * 1 - when it has been queued
  2008. */
  2009. static int add_folio_for_migration(struct mm_struct *mm, const void __user *p,
  2010. int node, struct list_head *pagelist, bool migrate_all)
  2011. {
  2012. struct vm_area_struct *vma;
  2013. struct folio_walk fw;
  2014. struct folio *folio;
  2015. unsigned long addr;
  2016. int err = -EFAULT;
  2017. mmap_read_lock(mm);
  2018. addr = (unsigned long)untagged_addr_remote(mm, p);
  2019. vma = vma_lookup(mm, addr);
  2020. if (vma && vma_migratable(vma)) {
  2021. folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
  2022. if (folio) {
  2023. err = __add_folio_for_migration(folio, node, pagelist,
  2024. migrate_all);
  2025. folio_walk_end(&fw, vma);
  2026. } else {
  2027. err = -ENOENT;
  2028. }
  2029. }
  2030. mmap_read_unlock(mm);
  2031. return err;
  2032. }
  2033. static int move_pages_and_store_status(int node,
  2034. struct list_head *pagelist, int __user *status,
  2035. int start, int i, unsigned long nr_pages)
  2036. {
  2037. int err;
  2038. if (list_empty(pagelist))
  2039. return 0;
  2040. err = do_move_pages_to_node(pagelist, node);
  2041. if (err) {
  2042. /*
  2043. * Positive err means the number of failed
  2044. * pages to migrate. Since we are going to
  2045. * abort and return the number of non-migrated
  2046. * pages, so need to include the rest of the
  2047. * nr_pages that have not been attempted as
  2048. * well.
  2049. */
  2050. if (err > 0)
  2051. err += nr_pages - i;
  2052. return err;
  2053. }
  2054. return store_status(status, start, node, i - start);
  2055. }
  2056. /*
  2057. * Migrate an array of page address onto an array of nodes and fill
  2058. * the corresponding array of status.
  2059. */
  2060. static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
  2061. unsigned long nr_pages,
  2062. const void __user * __user *pages,
  2063. const int __user *nodes,
  2064. int __user *status, int flags)
  2065. {
  2066. compat_uptr_t __user *compat_pages = (void __user *)pages;
  2067. int current_node = NUMA_NO_NODE;
  2068. LIST_HEAD(pagelist);
  2069. int start, i;
  2070. int err = 0, err1;
  2071. lru_cache_disable();
  2072. for (i = start = 0; i < nr_pages; i++) {
  2073. const void __user *p;
  2074. int node;
  2075. err = -EFAULT;
  2076. if (in_compat_syscall()) {
  2077. compat_uptr_t cp;
  2078. if (get_user(cp, compat_pages + i))
  2079. goto out_flush;
  2080. p = compat_ptr(cp);
  2081. } else {
  2082. if (get_user(p, pages + i))
  2083. goto out_flush;
  2084. }
  2085. if (get_user(node, nodes + i))
  2086. goto out_flush;
  2087. err = -ENODEV;
  2088. if (node < 0 || node >= MAX_NUMNODES)
  2089. goto out_flush;
  2090. if (!node_state(node, N_MEMORY))
  2091. goto out_flush;
  2092. err = -EACCES;
  2093. if (!node_isset(node, task_nodes))
  2094. goto out_flush;
  2095. if (current_node == NUMA_NO_NODE) {
  2096. current_node = node;
  2097. start = i;
  2098. } else if (node != current_node) {
  2099. err = move_pages_and_store_status(current_node,
  2100. &pagelist, status, start, i, nr_pages);
  2101. if (err)
  2102. goto out;
  2103. start = i;
  2104. current_node = node;
  2105. }
  2106. /*
  2107. * Errors in the page lookup or isolation are not fatal and we simply
  2108. * report them via status
  2109. */
  2110. err = add_folio_for_migration(mm, p, current_node, &pagelist,
  2111. flags & MPOL_MF_MOVE_ALL);
  2112. if (err > 0) {
  2113. /* The page is successfully queued for migration */
  2114. continue;
  2115. }
  2116. /*
  2117. * If the page is already on the target node (!err), store the
  2118. * node, otherwise, store the err.
  2119. */
  2120. err = store_status(status, i, err ? : current_node, 1);
  2121. if (err)
  2122. goto out_flush;
  2123. err = move_pages_and_store_status(current_node, &pagelist,
  2124. status, start, i, nr_pages);
  2125. if (err) {
  2126. /* We have accounted for page i */
  2127. if (err > 0)
  2128. err--;
  2129. goto out;
  2130. }
  2131. current_node = NUMA_NO_NODE;
  2132. }
  2133. out_flush:
  2134. /* Make sure we do not overwrite the existing error */
  2135. err1 = move_pages_and_store_status(current_node, &pagelist,
  2136. status, start, i, nr_pages);
  2137. if (err >= 0)
  2138. err = err1;
  2139. out:
  2140. lru_cache_enable();
  2141. return err;
  2142. }
  2143. /*
  2144. * Determine the nodes of an array of pages and store it in an array of status.
  2145. */
  2146. static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
  2147. const void __user **pages, int *status)
  2148. {
  2149. unsigned long i;
  2150. mmap_read_lock(mm);
  2151. for (i = 0; i < nr_pages; i++) {
  2152. unsigned long addr = (unsigned long)(*pages);
  2153. struct vm_area_struct *vma;
  2154. struct folio_walk fw;
  2155. struct folio *folio;
  2156. int err = -EFAULT;
  2157. vma = vma_lookup(mm, addr);
  2158. if (!vma)
  2159. goto set_status;
  2160. folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
  2161. if (folio) {
  2162. if (is_zero_folio(folio) || is_huge_zero_folio(folio))
  2163. err = -EFAULT;
  2164. else if (folio_is_zone_device(folio))
  2165. err = -ENOENT;
  2166. else
  2167. err = folio_nid(folio);
  2168. folio_walk_end(&fw, vma);
  2169. } else {
  2170. err = -ENOENT;
  2171. }
  2172. set_status:
  2173. *status = err;
  2174. pages++;
  2175. status++;
  2176. }
  2177. mmap_read_unlock(mm);
  2178. }
  2179. static int get_compat_pages_array(const void __user *chunk_pages[],
  2180. const void __user * __user *pages,
  2181. unsigned long chunk_offset,
  2182. unsigned long chunk_nr)
  2183. {
  2184. compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
  2185. compat_uptr_t p;
  2186. int i;
  2187. for (i = 0; i < chunk_nr; i++) {
  2188. if (get_user(p, pages32 + chunk_offset + i))
  2189. return -EFAULT;
  2190. chunk_pages[i] = compat_ptr(p);
  2191. }
  2192. return 0;
  2193. }
  2194. /*
  2195. * Determine the nodes of a user array of pages and store it in
  2196. * a user array of status.
  2197. */
  2198. static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
  2199. const void __user * __user *pages,
  2200. int __user *status)
  2201. {
  2202. #define DO_PAGES_STAT_CHUNK_NR 16UL
  2203. const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
  2204. int chunk_status[DO_PAGES_STAT_CHUNK_NR];
  2205. unsigned long chunk_offset = 0;
  2206. while (nr_pages) {
  2207. unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
  2208. if (in_compat_syscall()) {
  2209. if (get_compat_pages_array(chunk_pages, pages,
  2210. chunk_offset, chunk_nr))
  2211. break;
  2212. } else {
  2213. if (copy_from_user(chunk_pages, pages + chunk_offset,
  2214. chunk_nr * sizeof(*chunk_pages)))
  2215. break;
  2216. }
  2217. do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
  2218. if (copy_to_user(status + chunk_offset, chunk_status,
  2219. chunk_nr * sizeof(*status)))
  2220. break;
  2221. chunk_offset += chunk_nr;
  2222. nr_pages -= chunk_nr;
  2223. }
  2224. return nr_pages ? -EFAULT : 0;
  2225. }
  2226. static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
  2227. {
  2228. struct task_struct *task;
  2229. struct mm_struct *mm;
  2230. /*
  2231. * There is no need to check if current process has the right to modify
  2232. * the specified process when they are same.
  2233. */
  2234. if (!pid) {
  2235. mmget(current->mm);
  2236. *mem_nodes = cpuset_mems_allowed(current);
  2237. return current->mm;
  2238. }
  2239. task = find_get_task_by_vpid(pid);
  2240. if (!task) {
  2241. return ERR_PTR(-ESRCH);
  2242. }
  2243. /*
  2244. * Check if this process has the right to modify the specified
  2245. * process. Use the regular "ptrace_may_access()" checks.
  2246. */
  2247. if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
  2248. mm = ERR_PTR(-EPERM);
  2249. goto out;
  2250. }
  2251. mm = ERR_PTR(security_task_movememory(task));
  2252. if (IS_ERR(mm))
  2253. goto out;
  2254. *mem_nodes = cpuset_mems_allowed(task);
  2255. mm = get_task_mm(task);
  2256. out:
  2257. put_task_struct(task);
  2258. if (!mm)
  2259. mm = ERR_PTR(-EINVAL);
  2260. return mm;
  2261. }
  2262. /*
  2263. * Move a list of pages in the address space of the currently executing
  2264. * process.
  2265. */
  2266. static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
  2267. const void __user * __user *pages,
  2268. const int __user *nodes,
  2269. int __user *status, int flags)
  2270. {
  2271. struct mm_struct *mm;
  2272. int err;
  2273. nodemask_t task_nodes;
  2274. /* Check flags */
  2275. if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
  2276. return -EINVAL;
  2277. if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
  2278. return -EPERM;
  2279. mm = find_mm_struct(pid, &task_nodes);
  2280. if (IS_ERR(mm))
  2281. return PTR_ERR(mm);
  2282. if (nodes)
  2283. err = do_pages_move(mm, task_nodes, nr_pages, pages,
  2284. nodes, status, flags);
  2285. else
  2286. err = do_pages_stat(mm, nr_pages, pages, status);
  2287. mmput(mm);
  2288. return err;
  2289. }
  2290. SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
  2291. const void __user * __user *, pages,
  2292. const int __user *, nodes,
  2293. int __user *, status, int, flags)
  2294. {
  2295. return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
  2296. }
  2297. #ifdef CONFIG_NUMA_BALANCING
  2298. /*
  2299. * Returns true if this is a safe migration target node for misplaced NUMA
  2300. * pages. Currently it only checks the watermarks which is crude.
  2301. */
  2302. static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
  2303. unsigned long nr_migrate_pages)
  2304. {
  2305. int z;
  2306. for (z = pgdat->nr_zones - 1; z >= 0; z--) {
  2307. struct zone *zone = pgdat->node_zones + z;
  2308. if (!managed_zone(zone))
  2309. continue;
  2310. /* Avoid waking kswapd by allocating pages_to_migrate pages. */
  2311. if (!zone_watermark_ok(zone, 0,
  2312. high_wmark_pages(zone) +
  2313. nr_migrate_pages,
  2314. ZONE_MOVABLE, ALLOC_CMA))
  2315. continue;
  2316. return true;
  2317. }
  2318. return false;
  2319. }
  2320. static struct folio *alloc_misplaced_dst_folio(struct folio *src,
  2321. unsigned long data)
  2322. {
  2323. int nid = (int) data;
  2324. int order = folio_order(src);
  2325. gfp_t gfp = __GFP_THISNODE;
  2326. if (order > 0)
  2327. gfp |= GFP_TRANSHUGE_LIGHT;
  2328. else {
  2329. gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
  2330. __GFP_NOWARN;
  2331. gfp &= ~__GFP_RECLAIM;
  2332. }
  2333. return __folio_alloc_node(gfp, order, nid);
  2334. }
  2335. /*
  2336. * Prepare for calling migrate_misplaced_folio() by isolating the folio if
  2337. * permitted. Must be called with the PTL still held.
  2338. */
  2339. int migrate_misplaced_folio_prepare(struct folio *folio,
  2340. struct vm_area_struct *vma, int node)
  2341. {
  2342. int nr_pages = folio_nr_pages(folio);
  2343. pg_data_t *pgdat = NODE_DATA(node);
  2344. if (folio_is_file_lru(folio)) {
  2345. /*
  2346. * Do not migrate file folios that are mapped in multiple
  2347. * processes with execute permissions as they are probably
  2348. * shared libraries.
  2349. *
  2350. * See folio_maybe_mapped_shared() on possible imprecision
  2351. * when we cannot easily detect if a folio is shared.
  2352. */
  2353. if ((vma->vm_flags & VM_EXEC) && folio_maybe_mapped_shared(folio))
  2354. return -EACCES;
  2355. /*
  2356. * Do not migrate dirty folios as not all filesystems can move
  2357. * dirty folios in MIGRATE_ASYNC mode which is a waste of
  2358. * cycles.
  2359. */
  2360. if (folio_test_dirty(folio))
  2361. return -EAGAIN;
  2362. }
  2363. /* Avoid migrating to a node that is nearly full */
  2364. if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
  2365. int z;
  2366. if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
  2367. return -EAGAIN;
  2368. for (z = pgdat->nr_zones - 1; z >= 0; z--) {
  2369. if (managed_zone(pgdat->node_zones + z))
  2370. break;
  2371. }
  2372. /*
  2373. * If there are no managed zones, it should not proceed
  2374. * further.
  2375. */
  2376. if (z < 0)
  2377. return -EAGAIN;
  2378. wakeup_kswapd(pgdat->node_zones + z, 0,
  2379. folio_order(folio), ZONE_MOVABLE);
  2380. return -EAGAIN;
  2381. }
  2382. if (!folio_isolate_lru(folio))
  2383. return -EAGAIN;
  2384. node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio),
  2385. nr_pages);
  2386. return 0;
  2387. }
  2388. /*
  2389. * Attempt to migrate a misplaced folio to the specified destination
  2390. * node. Caller is expected to have isolated the folio by calling
  2391. * migrate_misplaced_folio_prepare(), which will result in an
  2392. * elevated reference count on the folio. This function will un-isolate the
  2393. * folio, dereferencing the folio before returning.
  2394. */
  2395. int migrate_misplaced_folio(struct folio *folio, int node)
  2396. {
  2397. pg_data_t *pgdat = NODE_DATA(node);
  2398. int nr_remaining;
  2399. unsigned int nr_succeeded;
  2400. LIST_HEAD(migratepages);
  2401. struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
  2402. struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
  2403. list_add(&folio->lru, &migratepages);
  2404. nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
  2405. NULL, node, MIGRATE_ASYNC,
  2406. MR_NUMA_MISPLACED, &nr_succeeded);
  2407. if (nr_remaining && !list_empty(&migratepages))
  2408. putback_movable_pages(&migratepages);
  2409. if (nr_succeeded) {
  2410. count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
  2411. count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded);
  2412. if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
  2413. && !node_is_toptier(folio_nid(folio))
  2414. && node_is_toptier(node))
  2415. mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS, nr_succeeded);
  2416. }
  2417. mem_cgroup_put(memcg);
  2418. BUG_ON(!list_empty(&migratepages));
  2419. return nr_remaining ? -EAGAIN : 0;
  2420. }
  2421. #endif /* CONFIG_NUMA_BALANCING */
  2422. #endif /* CONFIG_NUMA */