huge_memory.c 137 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
7477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 2009 Red Hat, Inc.
  4. */
  5. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  6. #include <linux/mm.h>
  7. #include <linux/sched.h>
  8. #include <linux/sched/mm.h>
  9. #include <linux/sched/numa_balancing.h>
  10. #include <linux/highmem.h>
  11. #include <linux/hugetlb.h>
  12. #include <linux/mmu_notifier.h>
  13. #include <linux/rmap.h>
  14. #include <linux/swap.h>
  15. #include <linux/shrinker.h>
  16. #include <linux/mm_inline.h>
  17. #include <linux/swapops.h>
  18. #include <linux/backing-dev.h>
  19. #include <linux/dax.h>
  20. #include <linux/mm_types.h>
  21. #include <linux/khugepaged.h>
  22. #include <linux/freezer.h>
  23. #include <linux/mman.h>
  24. #include <linux/memremap.h>
  25. #include <linux/pagemap.h>
  26. #include <linux/debugfs.h>
  27. #include <linux/migrate.h>
  28. #include <linux/hashtable.h>
  29. #include <linux/userfaultfd_k.h>
  30. #include <linux/page_idle.h>
  31. #include <linux/shmem_fs.h>
  32. #include <linux/oom.h>
  33. #include <linux/numa.h>
  34. #include <linux/page_owner.h>
  35. #include <linux/sched/sysctl.h>
  36. #include <linux/memory-tiers.h>
  37. #include <linux/compat.h>
  38. #include <linux/pgalloc.h>
  39. #include <linux/pgalloc_tag.h>
  40. #include <linux/pagewalk.h>
  41. #include <asm/tlb.h>
  42. #include "internal.h"
  43. #include "swap.h"
  44. #define CREATE_TRACE_POINTS
  45. #include <trace/events/thp.h>
  46. /*
  47. * By default, transparent hugepage support is disabled in order to avoid
  48. * risking an increased memory footprint for applications that are not
  49. * guaranteed to benefit from it. When transparent hugepage support is
  50. * enabled, it is for all mappings, and khugepaged scans all mappings.
  51. * Defrag is invoked by khugepaged hugepage allocations and by page faults
  52. * for all hugepage allocations.
  53. */
  54. unsigned long transparent_hugepage_flags __read_mostly =
  55. #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
  56. (1<<TRANSPARENT_HUGEPAGE_FLAG)|
  57. #endif
  58. #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
  59. (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
  60. #endif
  61. (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
  62. (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
  63. (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  64. static struct shrinker *deferred_split_shrinker;
  65. static unsigned long deferred_split_count(struct shrinker *shrink,
  66. struct shrink_control *sc);
  67. static unsigned long deferred_split_scan(struct shrinker *shrink,
  68. struct shrink_control *sc);
  69. static bool split_underused_thp = true;
  70. static atomic_t huge_zero_refcount;
  71. struct folio *huge_zero_folio __read_mostly;
  72. unsigned long huge_zero_pfn __read_mostly = ~0UL;
  73. unsigned long huge_anon_orders_always __read_mostly;
  74. unsigned long huge_anon_orders_madvise __read_mostly;
  75. unsigned long huge_anon_orders_inherit __read_mostly;
  76. static bool anon_orders_configured __initdata;
  77. static inline bool file_thp_enabled(struct vm_area_struct *vma)
  78. {
  79. struct inode *inode;
  80. if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
  81. return false;
  82. if (!vma->vm_file)
  83. return false;
  84. inode = file_inode(vma->vm_file);
  85. if (IS_ANON_FILE(inode))
  86. return false;
  87. return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
  88. }
  89. unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
  90. vm_flags_t vm_flags,
  91. enum tva_type type,
  92. unsigned long orders)
  93. {
  94. const bool smaps = type == TVA_SMAPS;
  95. const bool in_pf = type == TVA_PAGEFAULT;
  96. const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
  97. unsigned long supported_orders;
  98. /* Check the intersection of requested and supported orders. */
  99. if (vma_is_anonymous(vma))
  100. supported_orders = THP_ORDERS_ALL_ANON;
  101. else if (vma_is_special_huge(vma))
  102. supported_orders = THP_ORDERS_ALL_SPECIAL;
  103. else
  104. supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
  105. orders &= supported_orders;
  106. if (!orders)
  107. return 0;
  108. if (!vma->vm_mm) /* vdso */
  109. return 0;
  110. if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse))
  111. return 0;
  112. /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
  113. if (vma_is_dax(vma))
  114. return in_pf ? orders : 0;
  115. /*
  116. * khugepaged special VMA and hugetlb VMA.
  117. * Must be checked after dax since some dax mappings may have
  118. * VM_MIXEDMAP set.
  119. */
  120. if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
  121. return 0;
  122. /*
  123. * Check alignment for file vma and size for both file and anon vma by
  124. * filtering out the unsuitable orders.
  125. *
  126. * Skip the check for page fault. Huge fault does the check in fault
  127. * handlers.
  128. */
  129. if (!in_pf) {
  130. int order = highest_order(orders);
  131. unsigned long addr;
  132. while (orders) {
  133. addr = vma->vm_end - (PAGE_SIZE << order);
  134. if (thp_vma_suitable_order(vma, addr, order))
  135. break;
  136. order = next_order(&orders, order);
  137. }
  138. if (!orders)
  139. return 0;
  140. }
  141. /*
  142. * Enabled via shmem mount options or sysfs settings.
  143. * Must be done before hugepage flags check since shmem has its
  144. * own flags.
  145. */
  146. if (!in_pf && shmem_file(vma->vm_file))
  147. return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
  148. vma, vma->vm_pgoff, 0,
  149. forced_collapse);
  150. if (!vma_is_anonymous(vma)) {
  151. /*
  152. * Enforce THP collapse requirements as necessary. Anonymous vmas
  153. * were already handled in thp_vma_allowable_orders().
  154. */
  155. if (!forced_collapse &&
  156. (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
  157. !hugepage_global_always())))
  158. return 0;
  159. /*
  160. * Trust that ->huge_fault() handlers know what they are doing
  161. * in fault path.
  162. */
  163. if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
  164. return orders;
  165. /* Only regular file is valid in collapse path */
  166. if (((!in_pf || smaps)) && file_thp_enabled(vma))
  167. return orders;
  168. return 0;
  169. }
  170. if (vma_is_temporary_stack(vma))
  171. return 0;
  172. /*
  173. * THPeligible bit of smaps should show 1 for proper VMAs even
  174. * though anon_vma is not initialized yet.
  175. *
  176. * Allow page fault since anon_vma may be not initialized until
  177. * the first page fault.
  178. */
  179. if (!vma->anon_vma)
  180. return (smaps || in_pf) ? orders : 0;
  181. return orders;
  182. }
  183. static bool get_huge_zero_folio(void)
  184. {
  185. struct folio *zero_folio;
  186. retry:
  187. if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
  188. return true;
  189. zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) &
  190. ~__GFP_MOVABLE,
  191. HPAGE_PMD_ORDER);
  192. if (!zero_folio) {
  193. count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
  194. return false;
  195. }
  196. /* Ensure zero folio won't have large_rmappable flag set. */
  197. folio_clear_large_rmappable(zero_folio);
  198. preempt_disable();
  199. if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
  200. preempt_enable();
  201. folio_put(zero_folio);
  202. goto retry;
  203. }
  204. WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
  205. /* We take additional reference here. It will be put back by shrinker */
  206. atomic_set(&huge_zero_refcount, 2);
  207. preempt_enable();
  208. count_vm_event(THP_ZERO_PAGE_ALLOC);
  209. return true;
  210. }
  211. static void put_huge_zero_folio(void)
  212. {
  213. /*
  214. * Counter should never go to zero here. Only shrinker can put
  215. * last reference.
  216. */
  217. BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
  218. }
  219. struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
  220. {
  221. if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
  222. return huge_zero_folio;
  223. if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
  224. return READ_ONCE(huge_zero_folio);
  225. if (!get_huge_zero_folio())
  226. return NULL;
  227. if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm))
  228. put_huge_zero_folio();
  229. return READ_ONCE(huge_zero_folio);
  230. }
  231. void mm_put_huge_zero_folio(struct mm_struct *mm)
  232. {
  233. if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
  234. return;
  235. if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
  236. put_huge_zero_folio();
  237. }
  238. static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
  239. struct shrink_control *sc)
  240. {
  241. /* we can free zero page only if last reference remains */
  242. return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
  243. }
  244. static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
  245. struct shrink_control *sc)
  246. {
  247. if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
  248. struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
  249. BUG_ON(zero_folio == NULL);
  250. WRITE_ONCE(huge_zero_pfn, ~0UL);
  251. folio_put(zero_folio);
  252. return HPAGE_PMD_NR;
  253. }
  254. return 0;
  255. }
  256. static struct shrinker *huge_zero_folio_shrinker;
  257. #ifdef CONFIG_SYSFS
  258. static ssize_t enabled_show(struct kobject *kobj,
  259. struct kobj_attribute *attr, char *buf)
  260. {
  261. const char *output;
  262. if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
  263. output = "[always] madvise never";
  264. else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  265. &transparent_hugepage_flags))
  266. output = "always [madvise] never";
  267. else
  268. output = "always madvise [never]";
  269. return sysfs_emit(buf, "%s\n", output);
  270. }
  271. static ssize_t enabled_store(struct kobject *kobj,
  272. struct kobj_attribute *attr,
  273. const char *buf, size_t count)
  274. {
  275. ssize_t ret = count;
  276. if (sysfs_streq(buf, "always")) {
  277. clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
  278. set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
  279. } else if (sysfs_streq(buf, "madvise")) {
  280. clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
  281. set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
  282. } else if (sysfs_streq(buf, "never")) {
  283. clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
  284. clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
  285. } else
  286. ret = -EINVAL;
  287. if (ret > 0) {
  288. int err = start_stop_khugepaged();
  289. if (err)
  290. ret = err;
  291. }
  292. return ret;
  293. }
  294. static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
  295. ssize_t single_hugepage_flag_show(struct kobject *kobj,
  296. struct kobj_attribute *attr, char *buf,
  297. enum transparent_hugepage_flag flag)
  298. {
  299. return sysfs_emit(buf, "%d\n",
  300. !!test_bit(flag, &transparent_hugepage_flags));
  301. }
  302. ssize_t single_hugepage_flag_store(struct kobject *kobj,
  303. struct kobj_attribute *attr,
  304. const char *buf, size_t count,
  305. enum transparent_hugepage_flag flag)
  306. {
  307. unsigned long value;
  308. int ret;
  309. ret = kstrtoul(buf, 10, &value);
  310. if (ret < 0)
  311. return ret;
  312. if (value > 1)
  313. return -EINVAL;
  314. if (value)
  315. set_bit(flag, &transparent_hugepage_flags);
  316. else
  317. clear_bit(flag, &transparent_hugepage_flags);
  318. return count;
  319. }
  320. static ssize_t defrag_show(struct kobject *kobj,
  321. struct kobj_attribute *attr, char *buf)
  322. {
  323. const char *output;
  324. if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
  325. &transparent_hugepage_flags))
  326. output = "[always] defer defer+madvise madvise never";
  327. else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
  328. &transparent_hugepage_flags))
  329. output = "always [defer] defer+madvise madvise never";
  330. else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
  331. &transparent_hugepage_flags))
  332. output = "always defer [defer+madvise] madvise never";
  333. else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
  334. &transparent_hugepage_flags))
  335. output = "always defer defer+madvise [madvise] never";
  336. else
  337. output = "always defer defer+madvise madvise [never]";
  338. return sysfs_emit(buf, "%s\n", output);
  339. }
  340. static ssize_t defrag_store(struct kobject *kobj,
  341. struct kobj_attribute *attr,
  342. const char *buf, size_t count)
  343. {
  344. if (sysfs_streq(buf, "always")) {
  345. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  346. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  347. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  348. set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  349. } else if (sysfs_streq(buf, "defer+madvise")) {
  350. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  351. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  352. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  353. set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  354. } else if (sysfs_streq(buf, "defer")) {
  355. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  356. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  357. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  358. set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  359. } else if (sysfs_streq(buf, "madvise")) {
  360. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  361. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  362. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  363. set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  364. } else if (sysfs_streq(buf, "never")) {
  365. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
  366. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
  367. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
  368. clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
  369. } else
  370. return -EINVAL;
  371. return count;
  372. }
  373. static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
  374. static ssize_t use_zero_page_show(struct kobject *kobj,
  375. struct kobj_attribute *attr, char *buf)
  376. {
  377. return single_hugepage_flag_show(kobj, attr, buf,
  378. TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  379. }
  380. static ssize_t use_zero_page_store(struct kobject *kobj,
  381. struct kobj_attribute *attr, const char *buf, size_t count)
  382. {
  383. return single_hugepage_flag_store(kobj, attr, buf, count,
  384. TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  385. }
  386. static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
  387. static ssize_t hpage_pmd_size_show(struct kobject *kobj,
  388. struct kobj_attribute *attr, char *buf)
  389. {
  390. return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
  391. }
  392. static struct kobj_attribute hpage_pmd_size_attr =
  393. __ATTR_RO(hpage_pmd_size);
  394. static ssize_t split_underused_thp_show(struct kobject *kobj,
  395. struct kobj_attribute *attr, char *buf)
  396. {
  397. return sysfs_emit(buf, "%d\n", split_underused_thp);
  398. }
  399. static ssize_t split_underused_thp_store(struct kobject *kobj,
  400. struct kobj_attribute *attr,
  401. const char *buf, size_t count)
  402. {
  403. int err = kstrtobool(buf, &split_underused_thp);
  404. if (err < 0)
  405. return err;
  406. return count;
  407. }
  408. static struct kobj_attribute split_underused_thp_attr = __ATTR(
  409. shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
  410. static struct attribute *hugepage_attr[] = {
  411. &enabled_attr.attr,
  412. &defrag_attr.attr,
  413. &use_zero_page_attr.attr,
  414. &hpage_pmd_size_attr.attr,
  415. #ifdef CONFIG_SHMEM
  416. &shmem_enabled_attr.attr,
  417. #endif
  418. &split_underused_thp_attr.attr,
  419. NULL,
  420. };
  421. static const struct attribute_group hugepage_attr_group = {
  422. .attrs = hugepage_attr,
  423. };
  424. static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
  425. static void thpsize_release(struct kobject *kobj);
  426. static DEFINE_SPINLOCK(huge_anon_orders_lock);
  427. static LIST_HEAD(thpsize_list);
  428. static ssize_t anon_enabled_show(struct kobject *kobj,
  429. struct kobj_attribute *attr, char *buf)
  430. {
  431. int order = to_thpsize(kobj)->order;
  432. const char *output;
  433. if (test_bit(order, &huge_anon_orders_always))
  434. output = "[always] inherit madvise never";
  435. else if (test_bit(order, &huge_anon_orders_inherit))
  436. output = "always [inherit] madvise never";
  437. else if (test_bit(order, &huge_anon_orders_madvise))
  438. output = "always inherit [madvise] never";
  439. else
  440. output = "always inherit madvise [never]";
  441. return sysfs_emit(buf, "%s\n", output);
  442. }
  443. static ssize_t anon_enabled_store(struct kobject *kobj,
  444. struct kobj_attribute *attr,
  445. const char *buf, size_t count)
  446. {
  447. int order = to_thpsize(kobj)->order;
  448. ssize_t ret = count;
  449. if (sysfs_streq(buf, "always")) {
  450. spin_lock(&huge_anon_orders_lock);
  451. clear_bit(order, &huge_anon_orders_inherit);
  452. clear_bit(order, &huge_anon_orders_madvise);
  453. set_bit(order, &huge_anon_orders_always);
  454. spin_unlock(&huge_anon_orders_lock);
  455. } else if (sysfs_streq(buf, "inherit")) {
  456. spin_lock(&huge_anon_orders_lock);
  457. clear_bit(order, &huge_anon_orders_always);
  458. clear_bit(order, &huge_anon_orders_madvise);
  459. set_bit(order, &huge_anon_orders_inherit);
  460. spin_unlock(&huge_anon_orders_lock);
  461. } else if (sysfs_streq(buf, "madvise")) {
  462. spin_lock(&huge_anon_orders_lock);
  463. clear_bit(order, &huge_anon_orders_always);
  464. clear_bit(order, &huge_anon_orders_inherit);
  465. set_bit(order, &huge_anon_orders_madvise);
  466. spin_unlock(&huge_anon_orders_lock);
  467. } else if (sysfs_streq(buf, "never")) {
  468. spin_lock(&huge_anon_orders_lock);
  469. clear_bit(order, &huge_anon_orders_always);
  470. clear_bit(order, &huge_anon_orders_inherit);
  471. clear_bit(order, &huge_anon_orders_madvise);
  472. spin_unlock(&huge_anon_orders_lock);
  473. } else
  474. ret = -EINVAL;
  475. if (ret > 0) {
  476. int err;
  477. err = start_stop_khugepaged();
  478. if (err)
  479. ret = err;
  480. }
  481. return ret;
  482. }
  483. static struct kobj_attribute anon_enabled_attr =
  484. __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
  485. static struct attribute *anon_ctrl_attrs[] = {
  486. &anon_enabled_attr.attr,
  487. NULL,
  488. };
  489. static const struct attribute_group anon_ctrl_attr_grp = {
  490. .attrs = anon_ctrl_attrs,
  491. };
  492. static struct attribute *file_ctrl_attrs[] = {
  493. #ifdef CONFIG_SHMEM
  494. &thpsize_shmem_enabled_attr.attr,
  495. #endif
  496. NULL,
  497. };
  498. static const struct attribute_group file_ctrl_attr_grp = {
  499. .attrs = file_ctrl_attrs,
  500. };
  501. static struct attribute *any_ctrl_attrs[] = {
  502. NULL,
  503. };
  504. static const struct attribute_group any_ctrl_attr_grp = {
  505. .attrs = any_ctrl_attrs,
  506. };
  507. static const struct kobj_type thpsize_ktype = {
  508. .release = &thpsize_release,
  509. .sysfs_ops = &kobj_sysfs_ops,
  510. };
  511. DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
  512. static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
  513. {
  514. unsigned long sum = 0;
  515. int cpu;
  516. for_each_possible_cpu(cpu) {
  517. struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
  518. sum += this->stats[order][item];
  519. }
  520. return sum;
  521. }
  522. #define DEFINE_MTHP_STAT_ATTR(_name, _index) \
  523. static ssize_t _name##_show(struct kobject *kobj, \
  524. struct kobj_attribute *attr, char *buf) \
  525. { \
  526. int order = to_thpsize(kobj)->order; \
  527. \
  528. return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
  529. } \
  530. static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
  531. DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
  532. DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
  533. DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
  534. DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
  535. DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
  536. DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
  537. DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
  538. DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
  539. DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
  540. #ifdef CONFIG_SHMEM
  541. DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
  542. DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
  543. DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
  544. #endif
  545. DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
  546. DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
  547. DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
  548. DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
  549. DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
  550. static struct attribute *anon_stats_attrs[] = {
  551. &anon_fault_alloc_attr.attr,
  552. &anon_fault_fallback_attr.attr,
  553. &anon_fault_fallback_charge_attr.attr,
  554. #ifndef CONFIG_SHMEM
  555. &zswpout_attr.attr,
  556. &swpin_attr.attr,
  557. &swpin_fallback_attr.attr,
  558. &swpin_fallback_charge_attr.attr,
  559. &swpout_attr.attr,
  560. &swpout_fallback_attr.attr,
  561. #endif
  562. &split_deferred_attr.attr,
  563. &nr_anon_attr.attr,
  564. &nr_anon_partially_mapped_attr.attr,
  565. NULL,
  566. };
  567. static struct attribute_group anon_stats_attr_grp = {
  568. .name = "stats",
  569. .attrs = anon_stats_attrs,
  570. };
  571. static struct attribute *file_stats_attrs[] = {
  572. #ifdef CONFIG_SHMEM
  573. &shmem_alloc_attr.attr,
  574. &shmem_fallback_attr.attr,
  575. &shmem_fallback_charge_attr.attr,
  576. #endif
  577. NULL,
  578. };
  579. static struct attribute_group file_stats_attr_grp = {
  580. .name = "stats",
  581. .attrs = file_stats_attrs,
  582. };
  583. static struct attribute *any_stats_attrs[] = {
  584. #ifdef CONFIG_SHMEM
  585. &zswpout_attr.attr,
  586. &swpin_attr.attr,
  587. &swpin_fallback_attr.attr,
  588. &swpin_fallback_charge_attr.attr,
  589. &swpout_attr.attr,
  590. &swpout_fallback_attr.attr,
  591. #endif
  592. &split_attr.attr,
  593. &split_failed_attr.attr,
  594. NULL,
  595. };
  596. static struct attribute_group any_stats_attr_grp = {
  597. .name = "stats",
  598. .attrs = any_stats_attrs,
  599. };
  600. static int sysfs_add_group(struct kobject *kobj,
  601. const struct attribute_group *grp)
  602. {
  603. int ret = -ENOENT;
  604. /*
  605. * If the group is named, try to merge first, assuming the subdirectory
  606. * was already created. This avoids the warning emitted by
  607. * sysfs_create_group() if the directory already exists.
  608. */
  609. if (grp->name)
  610. ret = sysfs_merge_group(kobj, grp);
  611. if (ret)
  612. ret = sysfs_create_group(kobj, grp);
  613. return ret;
  614. }
  615. static struct thpsize *thpsize_create(int order, struct kobject *parent)
  616. {
  617. unsigned long size = (PAGE_SIZE << order) / SZ_1K;
  618. struct thpsize *thpsize;
  619. int ret = -ENOMEM;
  620. thpsize = kzalloc_obj(*thpsize);
  621. if (!thpsize)
  622. goto err;
  623. thpsize->order = order;
  624. ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
  625. "hugepages-%lukB", size);
  626. if (ret) {
  627. kfree(thpsize);
  628. goto err;
  629. }
  630. ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp);
  631. if (ret)
  632. goto err_put;
  633. ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp);
  634. if (ret)
  635. goto err_put;
  636. if (BIT(order) & THP_ORDERS_ALL_ANON) {
  637. ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
  638. if (ret)
  639. goto err_put;
  640. ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
  641. if (ret)
  642. goto err_put;
  643. }
  644. if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
  645. ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
  646. if (ret)
  647. goto err_put;
  648. ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
  649. if (ret)
  650. goto err_put;
  651. }
  652. return thpsize;
  653. err_put:
  654. kobject_put(&thpsize->kobj);
  655. err:
  656. return ERR_PTR(ret);
  657. }
  658. static void thpsize_release(struct kobject *kobj)
  659. {
  660. kfree(to_thpsize(kobj));
  661. }
  662. static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
  663. {
  664. int err;
  665. struct thpsize *thpsize;
  666. unsigned long orders;
  667. int order;
  668. /*
  669. * Default to setting PMD-sized THP to inherit the global setting and
  670. * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
  671. * constant so we have to do this here.
  672. */
  673. if (!anon_orders_configured)
  674. huge_anon_orders_inherit = BIT(PMD_ORDER);
  675. *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
  676. if (unlikely(!*hugepage_kobj)) {
  677. pr_err("failed to create transparent hugepage kobject\n");
  678. return -ENOMEM;
  679. }
  680. err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
  681. if (err) {
  682. pr_err("failed to register transparent hugepage group\n");
  683. goto delete_obj;
  684. }
  685. err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
  686. if (err) {
  687. pr_err("failed to register transparent hugepage group\n");
  688. goto remove_hp_group;
  689. }
  690. orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
  691. order = highest_order(orders);
  692. while (orders) {
  693. thpsize = thpsize_create(order, *hugepage_kobj);
  694. if (IS_ERR(thpsize)) {
  695. pr_err("failed to create thpsize for order %d\n", order);
  696. err = PTR_ERR(thpsize);
  697. goto remove_all;
  698. }
  699. list_add(&thpsize->node, &thpsize_list);
  700. order = next_order(&orders, order);
  701. }
  702. return 0;
  703. remove_all:
  704. hugepage_exit_sysfs(*hugepage_kobj);
  705. return err;
  706. remove_hp_group:
  707. sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
  708. delete_obj:
  709. kobject_put(*hugepage_kobj);
  710. return err;
  711. }
  712. static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
  713. {
  714. struct thpsize *thpsize, *tmp;
  715. list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
  716. list_del(&thpsize->node);
  717. kobject_put(&thpsize->kobj);
  718. }
  719. sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
  720. sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
  721. kobject_put(hugepage_kobj);
  722. }
  723. #else
  724. static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
  725. {
  726. return 0;
  727. }
  728. static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
  729. {
  730. }
  731. #endif /* CONFIG_SYSFS */
  732. static int __init thp_shrinker_init(void)
  733. {
  734. deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
  735. SHRINKER_MEMCG_AWARE |
  736. SHRINKER_NONSLAB,
  737. "thp-deferred_split");
  738. if (!deferred_split_shrinker)
  739. return -ENOMEM;
  740. deferred_split_shrinker->count_objects = deferred_split_count;
  741. deferred_split_shrinker->scan_objects = deferred_split_scan;
  742. shrinker_register(deferred_split_shrinker);
  743. if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) {
  744. /*
  745. * Bump the reference of the huge_zero_folio and do not
  746. * initialize the shrinker.
  747. *
  748. * huge_zero_folio will always be NULL on failure. We assume
  749. * that get_huge_zero_folio() will most likely not fail as
  750. * thp_shrinker_init() is invoked early on during boot.
  751. */
  752. if (!get_huge_zero_folio())
  753. pr_warn("Allocating persistent huge zero folio failed\n");
  754. return 0;
  755. }
  756. huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
  757. if (!huge_zero_folio_shrinker) {
  758. shrinker_free(deferred_split_shrinker);
  759. return -ENOMEM;
  760. }
  761. huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
  762. huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
  763. shrinker_register(huge_zero_folio_shrinker);
  764. return 0;
  765. }
  766. static void __init thp_shrinker_exit(void)
  767. {
  768. shrinker_free(huge_zero_folio_shrinker);
  769. shrinker_free(deferred_split_shrinker);
  770. }
  771. static int __init hugepage_init(void)
  772. {
  773. int err;
  774. struct kobject *hugepage_kobj;
  775. if (!has_transparent_hugepage()) {
  776. transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
  777. return -EINVAL;
  778. }
  779. /*
  780. * hugepages can't be allocated by the buddy allocator
  781. */
  782. MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
  783. err = hugepage_init_sysfs(&hugepage_kobj);
  784. if (err)
  785. goto err_sysfs;
  786. err = khugepaged_init();
  787. if (err)
  788. goto err_slab;
  789. err = thp_shrinker_init();
  790. if (err)
  791. goto err_shrinker;
  792. /*
  793. * By default disable transparent hugepages on smaller systems,
  794. * where the extra memory used could hurt more than TLB overhead
  795. * is likely to save. The admin can still enable it through /sys.
  796. */
  797. if (totalram_pages() < MB_TO_PAGES(512)) {
  798. transparent_hugepage_flags = 0;
  799. return 0;
  800. }
  801. err = start_stop_khugepaged();
  802. if (err)
  803. goto err_khugepaged;
  804. return 0;
  805. err_khugepaged:
  806. thp_shrinker_exit();
  807. err_shrinker:
  808. khugepaged_destroy();
  809. err_slab:
  810. hugepage_exit_sysfs(hugepage_kobj);
  811. err_sysfs:
  812. return err;
  813. }
  814. subsys_initcall(hugepage_init);
  815. static int __init setup_transparent_hugepage(char *str)
  816. {
  817. int ret = 0;
  818. if (!str)
  819. goto out;
  820. if (!strcmp(str, "always")) {
  821. set_bit(TRANSPARENT_HUGEPAGE_FLAG,
  822. &transparent_hugepage_flags);
  823. clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  824. &transparent_hugepage_flags);
  825. ret = 1;
  826. } else if (!strcmp(str, "madvise")) {
  827. clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
  828. &transparent_hugepage_flags);
  829. set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  830. &transparent_hugepage_flags);
  831. ret = 1;
  832. } else if (!strcmp(str, "never")) {
  833. clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
  834. &transparent_hugepage_flags);
  835. clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
  836. &transparent_hugepage_flags);
  837. ret = 1;
  838. }
  839. out:
  840. if (!ret)
  841. pr_warn("transparent_hugepage= cannot parse, ignored\n");
  842. return ret;
  843. }
  844. __setup("transparent_hugepage=", setup_transparent_hugepage);
  845. static char str_dup[PAGE_SIZE] __initdata;
  846. static int __init setup_thp_anon(char *str)
  847. {
  848. char *token, *range, *policy, *subtoken;
  849. unsigned long always, inherit, madvise;
  850. char *start_size, *end_size;
  851. int start, end, nr;
  852. char *p;
  853. if (!str || strlen(str) + 1 > PAGE_SIZE)
  854. goto err;
  855. strscpy(str_dup, str);
  856. always = huge_anon_orders_always;
  857. madvise = huge_anon_orders_madvise;
  858. inherit = huge_anon_orders_inherit;
  859. p = str_dup;
  860. while ((token = strsep(&p, ";")) != NULL) {
  861. range = strsep(&token, ":");
  862. policy = token;
  863. if (!policy)
  864. goto err;
  865. while ((subtoken = strsep(&range, ",")) != NULL) {
  866. if (strchr(subtoken, '-')) {
  867. start_size = strsep(&subtoken, "-");
  868. end_size = subtoken;
  869. start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
  870. end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
  871. } else {
  872. start_size = end_size = subtoken;
  873. start = end = get_order_from_str(subtoken,
  874. THP_ORDERS_ALL_ANON);
  875. }
  876. if (start == -EINVAL) {
  877. pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
  878. goto err;
  879. }
  880. if (end == -EINVAL) {
  881. pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
  882. goto err;
  883. }
  884. if (start < 0 || end < 0 || start > end)
  885. goto err;
  886. nr = end - start + 1;
  887. if (!strcmp(policy, "always")) {
  888. bitmap_set(&always, start, nr);
  889. bitmap_clear(&inherit, start, nr);
  890. bitmap_clear(&madvise, start, nr);
  891. } else if (!strcmp(policy, "madvise")) {
  892. bitmap_set(&madvise, start, nr);
  893. bitmap_clear(&inherit, start, nr);
  894. bitmap_clear(&always, start, nr);
  895. } else if (!strcmp(policy, "inherit")) {
  896. bitmap_set(&inherit, start, nr);
  897. bitmap_clear(&madvise, start, nr);
  898. bitmap_clear(&always, start, nr);
  899. } else if (!strcmp(policy, "never")) {
  900. bitmap_clear(&inherit, start, nr);
  901. bitmap_clear(&madvise, start, nr);
  902. bitmap_clear(&always, start, nr);
  903. } else {
  904. pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
  905. goto err;
  906. }
  907. }
  908. }
  909. huge_anon_orders_always = always;
  910. huge_anon_orders_madvise = madvise;
  911. huge_anon_orders_inherit = inherit;
  912. anon_orders_configured = true;
  913. return 1;
  914. err:
  915. pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
  916. return 0;
  917. }
  918. __setup("thp_anon=", setup_thp_anon);
  919. pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
  920. {
  921. if (likely(vma->vm_flags & VM_WRITE))
  922. pmd = pmd_mkwrite(pmd, vma);
  923. return pmd;
  924. }
  925. static struct deferred_split *split_queue_node(int nid)
  926. {
  927. struct pglist_data *pgdata = NODE_DATA(nid);
  928. return &pgdata->deferred_split_queue;
  929. }
  930. #ifdef CONFIG_MEMCG
  931. static inline
  932. struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
  933. struct deferred_split *queue)
  934. {
  935. if (mem_cgroup_disabled())
  936. return NULL;
  937. if (split_queue_node(folio_nid(folio)) == queue)
  938. return NULL;
  939. return container_of(queue, struct mem_cgroup, deferred_split_queue);
  940. }
  941. static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
  942. {
  943. return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
  944. }
  945. #else
  946. static inline
  947. struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
  948. struct deferred_split *queue)
  949. {
  950. return NULL;
  951. }
  952. static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
  953. {
  954. return split_queue_node(nid);
  955. }
  956. #endif
  957. static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
  958. {
  959. struct deferred_split *queue;
  960. retry:
  961. queue = memcg_split_queue(nid, memcg);
  962. spin_lock(&queue->split_queue_lock);
  963. /*
  964. * There is a period between setting memcg to dying and reparenting
  965. * deferred split queue, and during this period the THPs in the deferred
  966. * split queue will be hidden from the shrinker side.
  967. */
  968. if (unlikely(memcg_is_dying(memcg))) {
  969. spin_unlock(&queue->split_queue_lock);
  970. memcg = parent_mem_cgroup(memcg);
  971. goto retry;
  972. }
  973. return queue;
  974. }
  975. static struct deferred_split *
  976. split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
  977. {
  978. struct deferred_split *queue;
  979. retry:
  980. queue = memcg_split_queue(nid, memcg);
  981. spin_lock_irqsave(&queue->split_queue_lock, *flags);
  982. if (unlikely(memcg_is_dying(memcg))) {
  983. spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
  984. memcg = parent_mem_cgroup(memcg);
  985. goto retry;
  986. }
  987. return queue;
  988. }
  989. static struct deferred_split *folio_split_queue_lock(struct folio *folio)
  990. {
  991. return split_queue_lock(folio_nid(folio), folio_memcg(folio));
  992. }
  993. static struct deferred_split *
  994. folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
  995. {
  996. return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
  997. }
  998. static inline void split_queue_unlock(struct deferred_split *queue)
  999. {
  1000. spin_unlock(&queue->split_queue_lock);
  1001. }
  1002. static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
  1003. unsigned long flags)
  1004. {
  1005. spin_unlock_irqrestore(&queue->split_queue_lock, flags);
  1006. }
  1007. static inline bool is_transparent_hugepage(const struct folio *folio)
  1008. {
  1009. if (!folio_test_large(folio))
  1010. return false;
  1011. return is_huge_zero_folio(folio) ||
  1012. folio_test_large_rmappable(folio);
  1013. }
  1014. static unsigned long __thp_get_unmapped_area(struct file *filp,
  1015. unsigned long addr, unsigned long len,
  1016. loff_t off, unsigned long flags, unsigned long size,
  1017. vm_flags_t vm_flags)
  1018. {
  1019. loff_t off_end = off + len;
  1020. loff_t off_align = round_up(off, size);
  1021. unsigned long len_pad, ret, off_sub;
  1022. if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
  1023. return 0;
  1024. if (off_end <= off_align || (off_end - off_align) < size)
  1025. return 0;
  1026. len_pad = len + size;
  1027. if (len_pad < len || (off + len_pad) < off)
  1028. return 0;
  1029. ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad,
  1030. off >> PAGE_SHIFT, flags, vm_flags);
  1031. /*
  1032. * The failure might be due to length padding. The caller will retry
  1033. * without the padding.
  1034. */
  1035. if (IS_ERR_VALUE(ret))
  1036. return 0;
  1037. /*
  1038. * Do not try to align to THP boundary if allocation at the address
  1039. * hint succeeds.
  1040. */
  1041. if (ret == addr)
  1042. return addr;
  1043. off_sub = (off - ret) & (size - 1);
  1044. if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub)
  1045. return ret + size;
  1046. ret += off_sub;
  1047. return ret;
  1048. }
  1049. unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
  1050. unsigned long len, unsigned long pgoff, unsigned long flags,
  1051. vm_flags_t vm_flags)
  1052. {
  1053. unsigned long ret;
  1054. loff_t off = (loff_t)pgoff << PAGE_SHIFT;
  1055. ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
  1056. if (ret)
  1057. return ret;
  1058. return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags,
  1059. vm_flags);
  1060. }
  1061. unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
  1062. unsigned long len, unsigned long pgoff, unsigned long flags)
  1063. {
  1064. return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
  1065. }
  1066. EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
  1067. static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
  1068. unsigned long addr)
  1069. {
  1070. gfp_t gfp = vma_thp_gfp_mask(vma);
  1071. const int order = HPAGE_PMD_ORDER;
  1072. struct folio *folio;
  1073. folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);
  1074. if (unlikely(!folio)) {
  1075. count_vm_event(THP_FAULT_FALLBACK);
  1076. count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
  1077. return NULL;
  1078. }
  1079. VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
  1080. if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
  1081. folio_put(folio);
  1082. count_vm_event(THP_FAULT_FALLBACK);
  1083. count_vm_event(THP_FAULT_FALLBACK_CHARGE);
  1084. count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
  1085. count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
  1086. return NULL;
  1087. }
  1088. folio_throttle_swaprate(folio, gfp);
  1089. /*
  1090. * When a folio is not zeroed during allocation (__GFP_ZERO not used)
  1091. * or user folios require special handling, folio_zero_user() is used to
  1092. * make sure that the page corresponding to the faulting address will be
  1093. * hot in the cache after zeroing.
  1094. */
  1095. if (user_alloc_needs_zeroing())
  1096. folio_zero_user(folio, addr);
  1097. /*
  1098. * The memory barrier inside __folio_mark_uptodate makes sure that
  1099. * folio_zero_user writes become visible before the set_pmd_at()
  1100. * write.
  1101. */
  1102. __folio_mark_uptodate(folio);
  1103. return folio;
  1104. }
  1105. void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd,
  1106. struct vm_area_struct *vma, unsigned long haddr)
  1107. {
  1108. pmd_t entry;
  1109. entry = folio_mk_pmd(folio, vma->vm_page_prot);
  1110. entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  1111. folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
  1112. folio_add_lru_vma(folio, vma);
  1113. set_pmd_at(vma->vm_mm, haddr, pmd, entry);
  1114. update_mmu_cache_pmd(vma, haddr, pmd);
  1115. deferred_split_folio(folio, false);
  1116. }
  1117. static void map_anon_folio_pmd_pf(struct folio *folio, pmd_t *pmd,
  1118. struct vm_area_struct *vma, unsigned long haddr)
  1119. {
  1120. map_anon_folio_pmd_nopf(folio, pmd, vma, haddr);
  1121. add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  1122. count_vm_event(THP_FAULT_ALLOC);
  1123. count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
  1124. count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
  1125. }
  1126. static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
  1127. {
  1128. unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
  1129. struct vm_area_struct *vma = vmf->vma;
  1130. struct folio *folio;
  1131. pgtable_t pgtable;
  1132. vm_fault_t ret = 0;
  1133. folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
  1134. if (unlikely(!folio))
  1135. return VM_FAULT_FALLBACK;
  1136. pgtable = pte_alloc_one(vma->vm_mm);
  1137. if (unlikely(!pgtable)) {
  1138. ret = VM_FAULT_OOM;
  1139. goto release;
  1140. }
  1141. vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  1142. if (unlikely(!pmd_none(*vmf->pmd))) {
  1143. goto unlock_release;
  1144. } else {
  1145. ret = check_stable_address_space(vma->vm_mm);
  1146. if (ret)
  1147. goto unlock_release;
  1148. /* Deliver the page fault to userland */
  1149. if (userfaultfd_missing(vma)) {
  1150. spin_unlock(vmf->ptl);
  1151. folio_put(folio);
  1152. pte_free(vma->vm_mm, pgtable);
  1153. ret = handle_userfault(vmf, VM_UFFD_MISSING);
  1154. VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  1155. return ret;
  1156. }
  1157. pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
  1158. map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr);
  1159. mm_inc_nr_ptes(vma->vm_mm);
  1160. spin_unlock(vmf->ptl);
  1161. }
  1162. return 0;
  1163. unlock_release:
  1164. spin_unlock(vmf->ptl);
  1165. release:
  1166. if (pgtable)
  1167. pte_free(vma->vm_mm, pgtable);
  1168. folio_put(folio);
  1169. return ret;
  1170. }
  1171. vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
  1172. {
  1173. struct vm_area_struct *vma = vmf->vma;
  1174. vm_fault_t ret = 0;
  1175. spinlock_t *ptl;
  1176. softleaf_t entry;
  1177. struct page *page;
  1178. struct folio *folio;
  1179. if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
  1180. vma_end_read(vma);
  1181. return VM_FAULT_RETRY;
  1182. }
  1183. ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  1184. if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) {
  1185. spin_unlock(ptl);
  1186. return 0;
  1187. }
  1188. entry = softleaf_from_pmd(vmf->orig_pmd);
  1189. page = softleaf_to_page(entry);
  1190. folio = page_folio(page);
  1191. vmf->page = page;
  1192. vmf->pte = NULL;
  1193. if (folio_trylock(folio)) {
  1194. folio_get(folio);
  1195. spin_unlock(ptl);
  1196. ret = page_pgmap(page)->ops->migrate_to_ram(vmf);
  1197. folio_unlock(folio);
  1198. folio_put(folio);
  1199. } else {
  1200. spin_unlock(ptl);
  1201. }
  1202. return ret;
  1203. }
  1204. /*
  1205. * always: directly stall for all thp allocations
  1206. * defer: wake kswapd and fail if not immediately available
  1207. * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
  1208. * fail if not immediately available
  1209. * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
  1210. * available
  1211. * never: never stall for any thp allocation
  1212. */
  1213. gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
  1214. {
  1215. const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
  1216. /* Always do synchronous compaction */
  1217. if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
  1218. return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
  1219. /* Kick kcompactd and fail quickly */
  1220. if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
  1221. return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
  1222. /* Synchronous compaction if madvised, otherwise kick kcompactd */
  1223. if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
  1224. return GFP_TRANSHUGE_LIGHT |
  1225. (vma_madvised ? __GFP_DIRECT_RECLAIM :
  1226. __GFP_KSWAPD_RECLAIM);
  1227. /* Only do synchronous compaction if madvised */
  1228. if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
  1229. return GFP_TRANSHUGE_LIGHT |
  1230. (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
  1231. return GFP_TRANSHUGE_LIGHT;
  1232. }
  1233. /* Caller must hold page table lock. */
  1234. static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
  1235. struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
  1236. struct folio *zero_folio)
  1237. {
  1238. pmd_t entry;
  1239. entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
  1240. entry = pmd_mkspecial(entry);
  1241. pgtable_trans_huge_deposit(mm, pmd, pgtable);
  1242. set_pmd_at(mm, haddr, pmd, entry);
  1243. mm_inc_nr_ptes(mm);
  1244. }
  1245. vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
  1246. {
  1247. struct vm_area_struct *vma = vmf->vma;
  1248. unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
  1249. vm_fault_t ret;
  1250. if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
  1251. return VM_FAULT_FALLBACK;
  1252. ret = vmf_anon_prepare(vmf);
  1253. if (ret)
  1254. return ret;
  1255. khugepaged_enter_vma(vma, vma->vm_flags);
  1256. if (!(vmf->flags & FAULT_FLAG_WRITE) &&
  1257. !mm_forbids_zeropage(vma->vm_mm) &&
  1258. transparent_hugepage_use_zero_page()) {
  1259. pgtable_t pgtable;
  1260. struct folio *zero_folio;
  1261. vm_fault_t ret;
  1262. pgtable = pte_alloc_one(vma->vm_mm);
  1263. if (unlikely(!pgtable))
  1264. return VM_FAULT_OOM;
  1265. zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
  1266. if (unlikely(!zero_folio)) {
  1267. pte_free(vma->vm_mm, pgtable);
  1268. count_vm_event(THP_FAULT_FALLBACK);
  1269. return VM_FAULT_FALLBACK;
  1270. }
  1271. vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  1272. ret = 0;
  1273. if (pmd_none(*vmf->pmd)) {
  1274. ret = check_stable_address_space(vma->vm_mm);
  1275. if (ret) {
  1276. spin_unlock(vmf->ptl);
  1277. pte_free(vma->vm_mm, pgtable);
  1278. } else if (userfaultfd_missing(vma)) {
  1279. spin_unlock(vmf->ptl);
  1280. pte_free(vma->vm_mm, pgtable);
  1281. ret = handle_userfault(vmf, VM_UFFD_MISSING);
  1282. VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  1283. } else {
  1284. set_huge_zero_folio(pgtable, vma->vm_mm, vma,
  1285. haddr, vmf->pmd, zero_folio);
  1286. update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
  1287. spin_unlock(vmf->ptl);
  1288. }
  1289. } else {
  1290. spin_unlock(vmf->ptl);
  1291. pte_free(vma->vm_mm, pgtable);
  1292. }
  1293. return ret;
  1294. }
  1295. return __do_huge_pmd_anonymous_page(vmf);
  1296. }
/*
 * What insert_pmd()/insert_pud() should map: either a folio or a raw pfn.
 * @is_folio says which member of the union is valid.
 */
struct folio_or_pfn {
	union {
		struct folio *folio;	/* valid when is_folio is true */
		unsigned long pfn;	/* valid when is_folio is false */
	};
	bool is_folio;
};
  1304. static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr,
  1305. pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
  1306. bool write)
  1307. {
  1308. struct mm_struct *mm = vma->vm_mm;
  1309. pgtable_t pgtable = NULL;
  1310. spinlock_t *ptl;
  1311. pmd_t entry;
  1312. if (addr < vma->vm_start || addr >= vma->vm_end)
  1313. return VM_FAULT_SIGBUS;
  1314. if (arch_needs_pgtable_deposit()) {
  1315. pgtable = pte_alloc_one(vma->vm_mm);
  1316. if (!pgtable)
  1317. return VM_FAULT_OOM;
  1318. }
  1319. ptl = pmd_lock(mm, pmd);
  1320. if (!pmd_none(*pmd)) {
  1321. const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
  1322. fop.pfn;
  1323. if (write) {
  1324. if (pmd_pfn(*pmd) != pfn) {
  1325. WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
  1326. goto out_unlock;
  1327. }
  1328. entry = pmd_mkyoung(*pmd);
  1329. entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  1330. if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
  1331. update_mmu_cache_pmd(vma, addr, pmd);
  1332. }
  1333. goto out_unlock;
  1334. }
  1335. if (fop.is_folio) {
  1336. entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
  1337. if (is_huge_zero_folio(fop.folio)) {
  1338. entry = pmd_mkspecial(entry);
  1339. } else {
  1340. folio_get(fop.folio);
  1341. folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
  1342. add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
  1343. }
  1344. } else {
  1345. entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
  1346. entry = pmd_mkspecial(entry);
  1347. }
  1348. if (write) {
  1349. entry = pmd_mkyoung(pmd_mkdirty(entry));
  1350. entry = maybe_pmd_mkwrite(entry, vma);
  1351. }
  1352. if (pgtable) {
  1353. pgtable_trans_huge_deposit(mm, pmd, pgtable);
  1354. mm_inc_nr_ptes(mm);
  1355. pgtable = NULL;
  1356. }
  1357. set_pmd_at(mm, addr, pmd, entry);
  1358. update_mmu_cache_pmd(vma, addr, pmd);
  1359. out_unlock:
  1360. spin_unlock(ptl);
  1361. if (pgtable)
  1362. pte_free(mm, pgtable);
  1363. return VM_FAULT_NOPAGE;
  1364. }
  1365. /**
  1366. * vmf_insert_pfn_pmd - insert a pmd size pfn
  1367. * @vmf: Structure describing the fault
  1368. * @pfn: pfn to insert
  1369. * @write: whether it's a write fault
  1370. *
  1371. * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
  1372. *
  1373. * Return: vm_fault_t value.
  1374. */
  1375. vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
  1376. bool write)
  1377. {
  1378. unsigned long addr = vmf->address & PMD_MASK;
  1379. struct vm_area_struct *vma = vmf->vma;
  1380. pgprot_t pgprot = vma->vm_page_prot;
  1381. struct folio_or_pfn fop = {
  1382. .pfn = pfn,
  1383. };
  1384. /*
  1385. * If we had pmd_special, we could avoid all these restrictions,
  1386. * but we need to be consistent with PTEs and architectures that
  1387. * can't support a 'special' bit.
  1388. */
  1389. BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
  1390. BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
  1391. (VM_PFNMAP|VM_MIXEDMAP));
  1392. BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
  1393. pfnmap_setup_cachemode_pfn(pfn, &pgprot);
  1394. return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write);
  1395. }
  1396. EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
  1397. vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
  1398. bool write)
  1399. {
  1400. struct vm_area_struct *vma = vmf->vma;
  1401. unsigned long addr = vmf->address & PMD_MASK;
  1402. struct folio_or_pfn fop = {
  1403. .folio = folio,
  1404. .is_folio = true,
  1405. };
  1406. if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
  1407. return VM_FAULT_SIGBUS;
  1408. return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write);
  1409. }
  1410. EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
  1411. #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  1412. static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
  1413. {
  1414. if (likely(vma->vm_flags & VM_WRITE))
  1415. pud = pud_mkwrite(pud);
  1416. return pud;
  1417. }
  1418. static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr,
  1419. pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
  1420. {
  1421. struct mm_struct *mm = vma->vm_mm;
  1422. spinlock_t *ptl;
  1423. pud_t entry;
  1424. if (addr < vma->vm_start || addr >= vma->vm_end)
  1425. return VM_FAULT_SIGBUS;
  1426. ptl = pud_lock(mm, pud);
  1427. if (!pud_none(*pud)) {
  1428. const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
  1429. fop.pfn;
  1430. if (write) {
  1431. if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
  1432. goto out_unlock;
  1433. entry = pud_mkyoung(*pud);
  1434. entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
  1435. if (pudp_set_access_flags(vma, addr, pud, entry, 1))
  1436. update_mmu_cache_pud(vma, addr, pud);
  1437. }
  1438. goto out_unlock;
  1439. }
  1440. if (fop.is_folio) {
  1441. entry = folio_mk_pud(fop.folio, vma->vm_page_prot);
  1442. folio_get(fop.folio);
  1443. folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
  1444. add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR);
  1445. } else {
  1446. entry = pud_mkhuge(pfn_pud(fop.pfn, prot));
  1447. entry = pud_mkspecial(entry);
  1448. }
  1449. if (write) {
  1450. entry = pud_mkyoung(pud_mkdirty(entry));
  1451. entry = maybe_pud_mkwrite(entry, vma);
  1452. }
  1453. set_pud_at(mm, addr, pud, entry);
  1454. update_mmu_cache_pud(vma, addr, pud);
  1455. out_unlock:
  1456. spin_unlock(ptl);
  1457. return VM_FAULT_NOPAGE;
  1458. }
  1459. /**
  1460. * vmf_insert_pfn_pud - insert a pud size pfn
  1461. * @vmf: Structure describing the fault
  1462. * @pfn: pfn to insert
  1463. * @write: whether it's a write fault
  1464. *
  1465. * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
  1466. *
  1467. * Return: vm_fault_t value.
  1468. */
  1469. vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
  1470. bool write)
  1471. {
  1472. unsigned long addr = vmf->address & PUD_MASK;
  1473. struct vm_area_struct *vma = vmf->vma;
  1474. pgprot_t pgprot = vma->vm_page_prot;
  1475. struct folio_or_pfn fop = {
  1476. .pfn = pfn,
  1477. };
  1478. /*
  1479. * If we had pud_special, we could avoid all these restrictions,
  1480. * but we need to be consistent with PTEs and architectures that
  1481. * can't support a 'special' bit.
  1482. */
  1483. BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
  1484. BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
  1485. (VM_PFNMAP|VM_MIXEDMAP));
  1486. BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
  1487. pfnmap_setup_cachemode_pfn(pfn, &pgprot);
  1488. return insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
  1489. }
  1490. EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
  1491. /**
  1492. * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
  1493. * @vmf: Structure describing the fault
  1494. * @folio: folio to insert
  1495. * @write: whether it's a write fault
  1496. *
  1497. * Return: vm_fault_t value.
  1498. */
  1499. vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
  1500. bool write)
  1501. {
  1502. struct vm_area_struct *vma = vmf->vma;
  1503. unsigned long addr = vmf->address & PUD_MASK;
  1504. struct folio_or_pfn fop = {
  1505. .folio = folio,
  1506. .is_folio = true,
  1507. };
  1508. if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
  1509. return VM_FAULT_SIGBUS;
  1510. return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
  1511. }
  1512. EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
  1513. #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  1514. /**
  1515. * touch_pmd - Mark page table pmd entry as accessed and dirty (for write)
  1516. * @vma: The VMA covering @addr
  1517. * @addr: The virtual address
  1518. * @pmd: pmd pointer into the page table mapping @addr
  1519. * @write: Whether it's a write access
  1520. *
  1521. * Return: whether the pmd entry is changed
  1522. */
  1523. bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
  1524. pmd_t *pmd, bool write)
  1525. {
  1526. pmd_t entry;
  1527. entry = pmd_mkyoung(*pmd);
  1528. if (write)
  1529. entry = pmd_mkdirty(entry);
  1530. if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
  1531. pmd, entry, write)) {
  1532. update_mmu_cache_pmd(vma, addr, pmd);
  1533. return true;
  1534. }
  1535. return false;
  1536. }
  1537. static void copy_huge_non_present_pmd(
  1538. struct mm_struct *dst_mm, struct mm_struct *src_mm,
  1539. pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
  1540. struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
  1541. pmd_t pmd, pgtable_t pgtable)
  1542. {
  1543. softleaf_t entry = softleaf_from_pmd(pmd);
  1544. struct folio *src_folio;
  1545. VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd));
  1546. if (softleaf_is_migration_write(entry) ||
  1547. softleaf_is_migration_read_exclusive(entry)) {
  1548. entry = make_readable_migration_entry(swp_offset(entry));
  1549. pmd = swp_entry_to_pmd(entry);
  1550. if (pmd_swp_soft_dirty(*src_pmd))
  1551. pmd = pmd_swp_mksoft_dirty(pmd);
  1552. if (pmd_swp_uffd_wp(*src_pmd))
  1553. pmd = pmd_swp_mkuffd_wp(pmd);
  1554. set_pmd_at(src_mm, addr, src_pmd, pmd);
  1555. } else if (softleaf_is_device_private(entry)) {
  1556. /*
  1557. * For device private entries, since there are no
  1558. * read exclusive entries, writable = !readable
  1559. */
  1560. if (softleaf_is_device_private_write(entry)) {
  1561. entry = make_readable_device_private_entry(swp_offset(entry));
  1562. pmd = swp_entry_to_pmd(entry);
  1563. if (pmd_swp_soft_dirty(*src_pmd))
  1564. pmd = pmd_swp_mksoft_dirty(pmd);
  1565. if (pmd_swp_uffd_wp(*src_pmd))
  1566. pmd = pmd_swp_mkuffd_wp(pmd);
  1567. set_pmd_at(src_mm, addr, src_pmd, pmd);
  1568. }
  1569. src_folio = softleaf_to_folio(entry);
  1570. VM_WARN_ON(!folio_test_large(src_folio));
  1571. folio_get(src_folio);
  1572. /*
  1573. * folio_try_dup_anon_rmap_pmd does not fail for
  1574. * device private entries.
  1575. */
  1576. folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
  1577. dst_vma, src_vma);
  1578. }
  1579. add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  1580. mm_inc_nr_ptes(dst_mm);
  1581. pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
  1582. if (!userfaultfd_wp(dst_vma))
  1583. pmd = pmd_swp_clear_uffd_wp(pmd);
  1584. set_pmd_at(dst_mm, addr, dst_pmd, pmd);
  1585. }
/*
 * Copy the huge PMD entry at @src_pmd (in @src_mm) to @dst_pmd (in
 * @dst_mm) for the address @addr.
 *
 * Returns 0 when the entry was copied or deliberately skipped (destination
 * VMA is not anonymous), -ENOMEM when the page table to deposit cannot be
 * allocated, and -EAGAIN when the source is no longer a huge PMD or the
 * anon rmap could not be duplicated (in which case the source PMD has been
 * split).
 */
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	struct folio *src_folio;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	pmd = pmdp_get_lockless(src_pmd);
	/* Special (non huge-zero) PMDs are copied as-is, without a deposit. */
	if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
		     !is_huge_zero_pmd(pmd))) {
		dst_ptl = pmd_lock(dst_mm, dst_pmd);
		src_ptl = pmd_lockptr(src_mm, src_pmd);
		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
		/*
		 * No need to recheck the pmd, it can't change with write
		 * mmap lock held here.
		 *
		 * Meanwhile, making sure it's not a CoW VMA with writable
		 * mapping, otherwise it means either the anon page wrongly
		 * applied special bit, or we made the PRIVATE mapping be
		 * able to wrongly write to the backend MMIO.
		 */
		VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
		goto set_pmd;
	}

	/* Skip if it can be refilled on fault */
	if (!vma_is_anonymous(dst_vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	/* Destination lock first, then the source lock nested inside it. */
	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	/* Re-read the source entry now that both locks are held. */
	pmd = *src_pmd;

	if (unlikely(thp_migration_supported() &&
		     pmd_is_valid_softleaf(pmd))) {
		copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
					  dst_vma, src_vma, pmd, pgtable);
		ret = 0;
		goto out_unlock;
	}

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		/*
		 * mm_get_huge_zero_folio() will never allocate a new
		 * folio here, since we already have a zero page to
		 * copy. It just takes a reference.
		 */
		mm_get_huge_zero_folio(dst_mm);
		goto out_zero_page;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
	src_folio = page_folio(src_page);

	folio_get(src_folio);
	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
		/* Page may be pinned: split and retry the fault on PTEs. */
		folio_put(src_folio);
		pte_free(dst_mm, pgtable);
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
		__split_huge_pmd(src_vma, src_pmd, addr, false);
		return -EAGAIN;
	}
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
	mm_inc_nr_ptes(dst_mm);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	/* Write-protect both the source entry and the copy. */
	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	if (!userfaultfd_wp(dst_vma))
		pmd = pmd_clear_uffd_wp(pmd);
	pmd = pmd_wrprotect(pmd);
set_pmd:
	pmd = pmd_mkold(pmd);
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}
  1681. #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  1682. void touch_pud(struct vm_area_struct *vma, unsigned long addr,
  1683. pud_t *pud, bool write)
  1684. {
  1685. pud_t _pud;
  1686. _pud = pud_mkyoung(*pud);
  1687. if (write)
  1688. _pud = pud_mkdirty(_pud);
  1689. if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
  1690. pud, _pud, write))
  1691. update_mmu_cache_pud(vma, addr, pud);
  1692. }
  1693. int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  1694. pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
  1695. struct vm_area_struct *vma)
  1696. {
  1697. spinlock_t *dst_ptl, *src_ptl;
  1698. pud_t pud;
  1699. int ret;
  1700. dst_ptl = pud_lock(dst_mm, dst_pud);
  1701. src_ptl = pud_lockptr(src_mm, src_pud);
  1702. spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
  1703. ret = -EAGAIN;
  1704. pud = *src_pud;
  1705. if (unlikely(!pud_trans_huge(pud)))
  1706. goto out_unlock;
  1707. /*
  1708. * TODO: once we support anonymous pages, use
  1709. * folio_try_dup_anon_rmap_*() and split if duplicating fails.
  1710. */
  1711. if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) {
  1712. pudp_set_wrprotect(src_mm, addr, src_pud);
  1713. pud = pud_wrprotect(pud);
  1714. }
  1715. pud = pud_mkold(pud);
  1716. set_pud_at(dst_mm, addr, dst_pud, pud);
  1717. ret = 0;
  1718. out_unlock:
  1719. spin_unlock(src_ptl);
  1720. spin_unlock(dst_ptl);
  1721. return ret;
  1722. }
  1723. void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
  1724. {
  1725. bool write = vmf->flags & FAULT_FLAG_WRITE;
  1726. vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
  1727. if (unlikely(!pud_same(*vmf->pud, orig_pud)))
  1728. goto unlock;
  1729. touch_pud(vmf->vma, vmf->address, vmf->pud, write);
  1730. unlock:
  1731. spin_unlock(vmf->ptl);
  1732. }
  1733. #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  1734. bool huge_pmd_set_accessed(struct vm_fault *vmf)
  1735. {
  1736. bool write = vmf->flags & FAULT_FLAG_WRITE;
  1737. if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
  1738. return false;
  1739. return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
  1740. }
  1741. static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
  1742. {
  1743. unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
  1744. struct vm_area_struct *vma = vmf->vma;
  1745. struct mmu_notifier_range range;
  1746. struct folio *folio;
  1747. vm_fault_t ret = 0;
  1748. folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
  1749. if (unlikely(!folio))
  1750. return VM_FAULT_FALLBACK;
  1751. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
  1752. haddr + HPAGE_PMD_SIZE);
  1753. mmu_notifier_invalidate_range_start(&range);
  1754. vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  1755. if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
  1756. goto release;
  1757. ret = check_stable_address_space(vma->vm_mm);
  1758. if (ret)
  1759. goto release;
  1760. (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
  1761. map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr);
  1762. goto unlock;
  1763. release:
  1764. folio_put(folio);
  1765. unlock:
  1766. spin_unlock(vmf->ptl);
  1767. mmu_notifier_invalidate_range_end(&range);
  1768. return ret;
  1769. }
/*
 * Handle a write-protect (or unshare) fault on a huge PMD.
 *
 * The folio is reused in place when it is, or can be made, exclusive to
 * this mapping; otherwise the PMD is split and VM_FAULT_FALLBACK is
 * returned. A huge zero PMD is handed to do_huge_zero_wp_pmd() first.
 *
 * Returns 0 when the fault was handled or the PMD changed under us,
 * VM_FAULT_FALLBACK after splitting the PMD, or a non-fallback result of
 * do_huge_zero_wp_pmd().
 */
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t orig_pmd = vmf->orig_pmd;

	/* Only look up the lock here; it is taken further down. */
	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);

	if (is_huge_zero_pmd(orig_pmd)) {
		vm_fault_t ret = do_huge_zero_wp_pmd(vmf);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;

		/* Fallback to splitting PMD if THP cannot be allocated */
		goto fallback;
	}

	spin_lock(vmf->ptl);

	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	page = pmd_page(orig_pmd);
	folio = page_folio(page);
	VM_BUG_ON_PAGE(!PageHead(page), page);

	/* Early check when only holding the PT lock. */
	if (PageAnonExclusive(page))
		goto reuse;

	if (!folio_trylock(folio)) {
		/*
		 * Pin the folio, drop the PT lock to take the folio lock,
		 * then retake the PT lock and revalidate the PMD.
		 */
		folio_get(folio);
		spin_unlock(vmf->ptl);
		folio_lock(folio);
		spin_lock(vmf->ptl);
		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
			spin_unlock(vmf->ptl);
			folio_unlock(folio);
			folio_put(folio);
			return 0;
		}
		folio_put(folio);
	}

	/* Recheck after temporarily dropping the PT lock. */
	if (PageAnonExclusive(page)) {
		folio_unlock(folio);
		goto reuse;
	}

	/*
	 * See do_wp_page(): we can only reuse the folio exclusively if
	 * there are no additional references. Note that we always drain
	 * the LRU cache immediately after adding a THP.
	 */
	if (folio_ref_count(folio) >
			1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
		goto unlock_fallback;
	if (folio_test_swapcache(folio))
		folio_free_swap(folio);
	if (folio_ref_count(folio) == 1) {
		pmd_t entry;

		folio_move_anon_rmap(folio, vma);
		SetPageAnonExclusive(page);
		folio_unlock(folio);
reuse:
		/* Reached with the PT lock held and the folio unlocked. */
		if (unlikely(unshare)) {
			spin_unlock(vmf->ptl);
			return 0;
		}
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		spin_unlock(vmf->ptl);
		return 0;
	}

unlock_fallback:
	folio_unlock(folio);
	spin_unlock(vmf->ptl);
fallback:
	__split_huge_pmd(vma, vmf->pmd, vmf->address, false);
	return VM_FAULT_FALLBACK;
}
  1850. static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
  1851. unsigned long addr, pmd_t pmd)
  1852. {
  1853. struct page *page;
  1854. if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
  1855. return false;
  1856. /* Don't touch entries that are not even readable (NUMA hinting). */
  1857. if (pmd_protnone(pmd))
  1858. return false;
  1859. /* Do we need write faults for softdirty tracking? */
  1860. if (pmd_needs_soft_dirty_wp(vma, pmd))
  1861. return false;
  1862. /* Do we need write faults for uffd-wp tracking? */
  1863. if (userfaultfd_huge_pmd_wp(vma, pmd))
  1864. return false;
  1865. if (!(vma->vm_flags & VM_SHARED)) {
  1866. /* See can_change_pte_writable(). */
  1867. page = vm_normal_page_pmd(vma, addr, pmd);
  1868. return page && PageAnon(page) && PageAnonExclusive(page);
  1869. }
  1870. /* See can_change_pte_writable(). */
  1871. return pmd_dirty(pmd);
  1872. }
/*
 * NUMA hinting page fault entry point for trans huge pmds.
 *
 * Either migrates the folio to the node chosen by numa_migrate_check(), or
 * restores the PMD with the VMA's page protection. Always returns 0.
 */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int nid = NUMA_NO_NODE;
	int target_nid, last_cpupid;
	pmd_t pmd, old_pmd;
	bool writable = false;
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	old_pmd = pmdp_get(vmf->pmd);

	if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	pmd = pmd_modify(old_pmd, vma->vm_page_prot);

	/*
	 * Detect now whether the PMD could be writable; this information
	 * is only valid while holding the PT lock.
	 */
	writable = pmd_write(pmd);
	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
	    can_change_pmd_writable(vma, vmf->address, pmd))
		writable = true;

	folio = vm_normal_folio_pmd(vma, haddr, pmd);
	if (!folio)
		goto out_map;

	nid = folio_nid(folio);

	target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
					&last_cpupid);
	if (target_nid == NUMA_NO_NODE)
		goto out_map;
	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
		flags |= TNF_MIGRATE_FAIL;
		goto out_map;
	}
	/* The folio is isolated and isolation code holds a folio reference. */
	spin_unlock(vmf->ptl);
	/* The PT lock is gone, so the earlier writability result is stale. */
	writable = false;

	if (!migrate_misplaced_folio(folio, target_nid)) {
		flags |= TNF_MIGRATED;
		nid = target_nid;
		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
		return 0;
	}

	/* Migration failed: retake the lock and revalidate before restoring. */
	flags |= TNF_MIGRATE_FAIL;
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}
out_map:
	/* Restore the PMD */
	pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (writable)
		pmd = pmd_mkwrite(pmd, vma);
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
	spin_unlock(vmf->ptl);

	/* last_cpupid is only set once nid is, i.e. when a folio was found. */
	if (nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
	return 0;
}
/*
 * Return true if we do MADV_FREE successfully on entire pmd page.
 * Otherwise, return false.
 *
 * False is returned when the PMD lock cannot be taken, the PMD is the huge
 * zero PMD or not present, the folio may be mapped shared, the folio lock
 * cannot be taken, or only part of the PMD range was requested (in which
 * case the folio is split instead).
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long addr, unsigned long next)
{
	spinlock_t *ptl;
	pmd_t orig_pmd;
	struct folio *folio;
	struct mm_struct *mm = tlb->mm;
	bool ret = false;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		goto out_unlocked;

	orig_pmd = *pmd;
	if (is_huge_zero_pmd(orig_pmd))
		goto out;

	if (unlikely(!pmd_present(orig_pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
				  !pmd_is_migration_entry(orig_pmd));
		goto out;
	}

	folio = pmd_folio(orig_pmd);
	/*
	 * If other processes are mapping this folio, we couldn't discard
	 * the folio unless they all do MADV_FREE so let's skip the folio.
	 */
	if (folio_maybe_mapped_shared(folio))
		goto out;

	if (!folio_trylock(folio))
		goto out;

	/*
	 * If the user wants to discard only part of the THP, split it so
	 * that MADV_FREE will deactivate only those pages.
	 */
	if (next - addr != HPAGE_PMD_SIZE) {
		folio_get(folio);
		spin_unlock(ptl);
		split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);
		goto out_unlocked;
	}

	if (folio_test_dirty(folio))
		folio_clear_dirty(folio);
	folio_unlock(folio);

	/* Clear the young and dirty bits in the PMD if either is set. */
	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
		pmdp_invalidate(vma, addr, pmd);
		orig_pmd = pmd_mkold(orig_pmd);
		orig_pmd = pmd_mkclean(orig_pmd);

		set_pmd_at(mm, addr, pmd, orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	}

	folio_mark_lazyfree(folio);
	ret = true;
out:
	spin_unlock(ptl);
out_unlocked:
	return ret;
}
  2001. static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
  2002. {
  2003. pgtable_t pgtable;
  2004. pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  2005. pte_free(mm, pgtable);
  2006. mm_dec_nr_ptes(mm);
  2007. }
/*
 * Tear down the huge PMD mapping at @addr: clear the entry, release any
 * deposited page table, and undo rmap and mm counter accounting for the
 * mapped folio.
 *
 * Returns 0 if the huge PMD lock could not be taken (nothing was done),
 * 1 otherwise.
 */
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at deposited pgtable
	 * when calling pmdp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing pmdp related
	 * operations.
	 */
	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
						tlb->fullmm);
	arch_check_zapped_pmd(vma, orig_pmd);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else if (is_huge_zero_pmd(orig_pmd)) {
		if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
			zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else {
		struct folio *folio = NULL;
		int flush_needed = 1;

		if (pmd_present(orig_pmd)) {
			struct page *page = pmd_page(orig_pmd);

			folio = page_folio(page);
			folio_remove_rmap_pmd(folio, page, vma);
			WARN_ON_ONCE(folio_mapcount(folio) < 0);
			VM_BUG_ON_PAGE(!PageHead(page), page);
		} else if (pmd_is_valid_softleaf(orig_pmd)) {
			const softleaf_t entry = softleaf_from_pmd(orig_pmd);

			folio = softleaf_to_folio(entry);
			flush_needed = 0;

			if (!thp_migration_supported())
				WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
		}

		/*
		 * NOTE(review): folio is still NULL here if neither branch
		 * above was taken, and it is passed to folio_test_anon()
		 * below. Presumably unreachable for a PMD that passed
		 * __pmd_trans_huge_lock() - confirm.
		 */
		if (folio_test_anon(folio)) {
			zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		} else {
			if (arch_needs_pgtable_deposit())
				zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, mm_counter_file(folio),
				       -HPAGE_PMD_NR);

			/*
			 * Use flush_needed to indicate whether the PMD entry
			 * is present, instead of checking pmd_present() again.
			 */
			if (flush_needed && pmd_young(orig_pmd) &&
			    likely(vma_has_recency(vma)))
				folio_mark_accessed(folio);
		}

		/* Device-private folios: drop the rmap and a folio reference. */
		if (folio_is_device_private(folio)) {
			folio_remove_rmap_pmd(folio, &folio->page, vma);
			WARN_ON_ONCE(folio_mapcount(folio) < 0);
			folio_put(folio);
		}

		spin_unlock(ptl);
		if (flush_needed)
			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
	}
	return 1;
}
  2078. #ifndef pmd_move_must_withdraw
  2079. static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
  2080. spinlock_t *old_pmd_ptl,
  2081. struct vm_area_struct *vma)
  2082. {
  2083. /*
  2084. * With split pmd lock we also need to move preallocated
  2085. * PTE page table if new_pmd is on different PMD page table.
  2086. *
  2087. * We also don't deposit and withdraw tables for file pages.
  2088. */
  2089. return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
  2090. }
  2091. #endif
  2092. static pmd_t move_soft_dirty_pmd(pmd_t pmd)
  2093. {
  2094. if (pgtable_supports_soft_dirty()) {
  2095. if (unlikely(pmd_is_migration_entry(pmd)))
  2096. pmd = pmd_swp_mksoft_dirty(pmd);
  2097. else if (pmd_present(pmd))
  2098. pmd = pmd_mksoft_dirty(pmd);
  2099. }
  2100. return pmd;
  2101. }
  2102. static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
  2103. {
  2104. if (pmd_none(pmd))
  2105. return pmd;
  2106. if (pmd_present(pmd))
  2107. pmd = pmd_clear_uffd_wp(pmd);
  2108. else
  2109. pmd = pmd_swp_clear_uffd_wp(pmd);
  2110. return pmd;
  2111. }
  2112. bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
  2113. unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
  2114. {
  2115. spinlock_t *old_ptl, *new_ptl;
  2116. pmd_t pmd;
  2117. struct mm_struct *mm = vma->vm_mm;
  2118. bool force_flush = false;
  2119. /*
  2120. * The destination pmd shouldn't be established, free_pgtables()
  2121. * should have released it; but move_page_tables() might have already
  2122. * inserted a page table, if racing against shmem/file collapse.
  2123. */
  2124. if (!pmd_none(*new_pmd)) {
  2125. VM_BUG_ON(pmd_trans_huge(*new_pmd));
  2126. return false;
  2127. }
  2128. /*
  2129. * We don't have to worry about the ordering of src and dst
  2130. * ptlocks because exclusive mmap_lock prevents deadlock.
  2131. */
  2132. old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
  2133. if (old_ptl) {
  2134. new_ptl = pmd_lockptr(mm, new_pmd);
  2135. if (new_ptl != old_ptl)
  2136. spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
  2137. pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
  2138. if (pmd_present(pmd))
  2139. force_flush = true;
  2140. VM_BUG_ON(!pmd_none(*new_pmd));
  2141. if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
  2142. pgtable_t pgtable;
  2143. pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
  2144. pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
  2145. }
  2146. pmd = move_soft_dirty_pmd(pmd);
  2147. if (vma_has_uffd_without_event_remap(vma))
  2148. pmd = clear_uffd_wp_pmd(pmd);
  2149. set_pmd_at(mm, new_addr, new_pmd, pmd);
  2150. if (force_flush)
  2151. flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
  2152. if (new_ptl != old_ptl)
  2153. spin_unlock(new_ptl);
  2154. spin_unlock(old_ptl);
  2155. return true;
  2156. }
  2157. return false;
  2158. }
  2159. static void change_non_present_huge_pmd(struct mm_struct *mm,
  2160. unsigned long addr, pmd_t *pmd, bool uffd_wp,
  2161. bool uffd_wp_resolve)
  2162. {
  2163. softleaf_t entry = softleaf_from_pmd(*pmd);
  2164. const struct folio *folio = softleaf_to_folio(entry);
  2165. pmd_t newpmd;
  2166. VM_WARN_ON(!pmd_is_valid_softleaf(*pmd));
  2167. if (softleaf_is_migration_write(entry)) {
  2168. /*
  2169. * A protection check is difficult so
  2170. * just be safe and disable write
  2171. */
  2172. if (folio_test_anon(folio))
  2173. entry = make_readable_exclusive_migration_entry(swp_offset(entry));
  2174. else
  2175. entry = make_readable_migration_entry(swp_offset(entry));
  2176. newpmd = swp_entry_to_pmd(entry);
  2177. if (pmd_swp_soft_dirty(*pmd))
  2178. newpmd = pmd_swp_mksoft_dirty(newpmd);
  2179. } else if (softleaf_is_device_private_write(entry)) {
  2180. entry = make_readable_device_private_entry(swp_offset(entry));
  2181. newpmd = swp_entry_to_pmd(entry);
  2182. } else {
  2183. newpmd = *pmd;
  2184. }
  2185. if (uffd_wp)
  2186. newpmd = pmd_swp_mkuffd_wp(newpmd);
  2187. else if (uffd_wp_resolve)
  2188. newpmd = pmd_swp_clear_uffd_wp(newpmd);
  2189. if (!pmd_same(*pmd, newpmd))
  2190. set_pmd_at(mm, addr, pmd, newpmd);
  2191. }
  2192. /*
  2193. * Returns
  2194. * - 0 if PMD could not be locked
  2195. * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
  2196. * or if prot_numa but THP migration is not supported
  2197. * - HPAGE_PMD_NR if protections changed and TLB flush necessary
  2198. */
  2199. int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
  2200. pmd_t *pmd, unsigned long addr, pgprot_t newprot,
  2201. unsigned long cp_flags)
  2202. {
  2203. struct mm_struct *mm = vma->vm_mm;
  2204. spinlock_t *ptl;
  2205. pmd_t oldpmd, entry;
  2206. bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
  2207. bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
  2208. bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
  2209. int ret = 1;
  2210. tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
  2211. if (prot_numa && !thp_migration_supported())
  2212. return 1;
  2213. ptl = __pmd_trans_huge_lock(pmd, vma);
  2214. if (!ptl)
  2215. return 0;
  2216. if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) {
  2217. change_non_present_huge_pmd(mm, addr, pmd, uffd_wp,
  2218. uffd_wp_resolve);
  2219. goto unlock;
  2220. }
  2221. if (prot_numa) {
  2222. /*
  2223. * Avoid trapping faults against the zero page. The read-only
  2224. * data is likely to be read-cached on the local CPU and
  2225. * local/remote hits to the zero page are not interesting.
  2226. */
  2227. if (is_huge_zero_pmd(*pmd))
  2228. goto unlock;
  2229. if (pmd_protnone(*pmd))
  2230. goto unlock;
  2231. if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
  2232. vma_is_single_threaded_private(vma)))
  2233. goto unlock;
  2234. }
  2235. /*
  2236. * In case prot_numa, we are under mmap_read_lock(mm). It's critical
  2237. * to not clear pmd intermittently to avoid race with MADV_DONTNEED
  2238. * which is also under mmap_read_lock(mm):
  2239. *
  2240. * CPU0: CPU1:
  2241. * change_huge_pmd(prot_numa=1)
  2242. * pmdp_huge_get_and_clear_notify()
  2243. * madvise_dontneed()
  2244. * zap_pmd_range()
  2245. * pmd_trans_huge(*pmd) == 0 (without ptl)
  2246. * // skip the pmd
  2247. * set_pmd_at();
  2248. * // pmd is re-established
  2249. *
  2250. * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
  2251. * which may break userspace.
  2252. *
  2253. * pmdp_invalidate_ad() is required to make sure we don't miss
  2254. * dirty/young flags set by hardware.
  2255. */
  2256. oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
  2257. entry = pmd_modify(oldpmd, newprot);
  2258. if (uffd_wp)
  2259. entry = pmd_mkuffd_wp(entry);
  2260. else if (uffd_wp_resolve)
  2261. /*
  2262. * Leave the write bit to be handled by PF interrupt
  2263. * handler, then things like COW could be properly
  2264. * handled.
  2265. */
  2266. entry = pmd_clear_uffd_wp(entry);
  2267. /* See change_pte_range(). */
  2268. if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
  2269. can_change_pmd_writable(vma, addr, entry))
  2270. entry = pmd_mkwrite(entry, vma);
  2271. ret = HPAGE_PMD_NR;
  2272. set_pmd_at(mm, addr, pmd, entry);
  2273. if (huge_pmd_needs_flush(oldpmd, entry))
  2274. tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
  2275. unlock:
  2276. spin_unlock(ptl);
  2277. return ret;
  2278. }
  /*
   * Apply @newprot to the huge PUD at @addr.
   *
   * Returns:
   *
   * - 0: if pud leaf changed from under us
   * - 1: if pud can be skipped
   * - HPAGE_PUD_NR: if pud was successfully processed
   */
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
  		    pud_t *pudp, unsigned long addr, pgprot_t newprot,
  		    unsigned long cp_flags)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pud_t orig, updated;
  	spinlock_t *ptl;

  	tlb_change_page_size(tlb, HPAGE_PUD_SIZE);

  	/* NUMA balancing doesn't apply to dax */
  	if (cp_flags & MM_CP_PROT_NUMA)
  		return 1;

  	/*
  	 * Huge entries on userfault-wp only works with anonymous, while we
  	 * don't have anonymous PUDs yet.
  	 */
  	if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
  		return 1;

  	ptl = __pud_trans_huge_lock(pudp, vma);
  	if (!ptl)
  		return 0;

  	/*
  	 * Can't clear PUD or it can race with concurrent zapping.  See
  	 * change_huge_pmd().
  	 */
  	orig = pudp_invalidate(vma, addr, pudp);
  	updated = pud_modify(orig, newprot);
  	set_pud_at(mm, addr, pudp, updated);
  	tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE);

  	spin_unlock(ptl);
  	return HPAGE_PUD_NR;
  }
  #endif
  #ifdef CONFIG_USERFAULTFD
  /*
   * Move the huge mapping at @src_pmd/@src_addr over to @dst_pmd/@dst_addr.
   *
   * Called with the page table lock of @src_pmd held and with both @src_vma
   * and @dst_vma locked (all asserted below).  Every return path has dropped
   * the page table lock.
   *
   * Returns 0 on success, -EAGAIN if the caller has to repeat the operation,
   * -EINVAL for a populated destination or misaligned addresses, -ENOENT if
   * the source is not a huge PMD, and -EBUSY if the folio cannot be moved
   * (not exclusive, or possibly DMA-pinned).
   */
  int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
  			struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
  			unsigned long dst_addr, unsigned long src_addr)
  {
  	struct mmu_notifier_range range;
  	spinlock_t *src_ptl, *dst_ptl;
  	pmd_t moved_pmd, src_pmdval;
  	struct folio *src_folio;
  	struct page *src_page;
  	pgtable_t pgtable;
  	int ret = 0;

  	src_pmdval = *src_pmd;
  	src_ptl = pmd_lockptr(mm, src_pmd);

  	lockdep_assert_held(src_ptl);
  	vma_assert_locked(src_vma);
  	vma_assert_locked(dst_vma);

  	/* Sanity checks before the operation */
  	if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
  	    WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
  		spin_unlock(src_ptl);
  		return -EINVAL;
  	}

  	if (!pmd_trans_huge(src_pmdval)) {
  		spin_unlock(src_ptl);
  		if (!pmd_is_migration_entry(src_pmdval))
  			return -ENOENT;
  		/* Wait out the migration, then have the caller retry. */
  		pmd_migration_entry_wait(mm, &src_pmdval);
  		return -EAGAIN;
  	}

  	src_page = pmd_page(src_pmdval);

  	if (is_huge_zero_pmd(src_pmdval)) {
  		/* The huge zero page has no folio to pin or lock. */
  		src_folio = NULL;
  	} else {
  		if (unlikely(!PageAnonExclusive(src_page))) {
  			spin_unlock(src_ptl);
  			return -EBUSY;
  		}

  		src_folio = page_folio(src_page);
  		folio_get(src_folio);
  	}

  	spin_unlock(src_ptl);

  	flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
  				src_addr + HPAGE_PMD_SIZE);
  	mmu_notifier_invalidate_range_start(&range);

  	if (src_folio)
  		folio_lock(src_folio);

  	dst_ptl = pmd_lockptr(mm, dst_pmd);
  	double_pt_lock(src_ptl, dst_ptl);

  	/* Both entries may have changed while no page table lock was held. */
  	if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
  		     !pmd_same(*dst_pmd, dst_pmdval))) {
  		ret = -EAGAIN;
  		goto unlock_ptls;
  	}

  	if (!src_folio) {
  		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
  		moved_pmd = move_soft_dirty_pmd(src_pmdval);
  		moved_pmd = clear_uffd_wp_pmd(moved_pmd);
  	} else {
  		if (folio_maybe_dma_pinned(src_folio) ||
  		    !PageAnonExclusive(&src_folio->page)) {
  			ret = -EBUSY;
  			goto unlock_ptls;
  		}

  		if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
  		    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
  			ret = -EBUSY;
  			goto unlock_ptls;
  		}

  		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
  		/* Folio got pinned from under us. Put it back and fail the move. */
  		if (folio_maybe_dma_pinned(src_folio)) {
  			set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
  			ret = -EBUSY;
  			goto unlock_ptls;
  		}

  		folio_move_anon_rmap(src_folio, dst_vma);
  		src_folio->index = linear_page_index(dst_vma, dst_addr);

  		moved_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
  		/* Follow mremap() behavior and treat the entry dirty after the move */
  		moved_pmd = pmd_mkwrite(pmd_mkdirty(moved_pmd), dst_vma);
  	}
  	set_pmd_at(mm, dst_addr, dst_pmd, moved_pmd);

  	/* The deposited page table follows the entry to its new PMD. */
  	pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
  	pgtable_trans_huge_deposit(mm, dst_pmd, pgtable);
  unlock_ptls:
  	double_pt_unlock(src_ptl, dst_ptl);
  	/* unblock rmap walks */
  	if (src_folio)
  		folio_unlock(src_folio);
  	mmu_notifier_invalidate_range_end(&range);
  	if (src_folio)
  		folio_put(src_folio);

  	return ret;
  }
  #endif /* CONFIG_USERFAULTFD */
  2423. /*
  2424. * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
  2425. *
  2426. * Note that if it returns page table lock pointer, this routine returns without
  2427. * unlocking page table lock. So callers must unlock it.
  2428. */
  2429. spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
  2430. {
  2431. spinlock_t *ptl;
  2432. ptl = pmd_lock(vma->vm_mm, pmd);
  2433. if (likely(pmd_is_huge(*pmd)))
  2434. return ptl;
  2435. spin_unlock(ptl);
  2436. return NULL;
  2437. }
  2438. /*
  2439. * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
  2440. *
  2441. * Note that if it returns page table lock pointer, this routine returns without
  2442. * unlocking page table lock. So callers must unlock it.
  2443. */
  2444. spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
  2445. {
  2446. spinlock_t *ptl;
  2447. ptl = pud_lock(vma->vm_mm, pud);
  2448. if (likely(pud_trans_huge(*pud)))
  2449. return ptl;
  2450. spin_unlock(ptl);
  2451. return NULL;
  2452. }
  2453. #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  2454. int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
  2455. pud_t *pud, unsigned long addr)
  2456. {
  2457. spinlock_t *ptl;
  2458. pud_t orig_pud;
  2459. ptl = __pud_trans_huge_lock(pud, vma);
  2460. if (!ptl)
  2461. return 0;
  2462. orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
  2463. arch_check_zapped_pud(vma, orig_pud);
  2464. tlb_remove_pud_tlb_entry(tlb, pud, addr);
  2465. if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
  2466. spin_unlock(ptl);
  2467. /* No zero page support yet */
  2468. } else {
  2469. struct page *page = NULL;
  2470. struct folio *folio;
  2471. /* No support for anonymous PUD pages or migration yet */
  2472. VM_WARN_ON_ONCE(vma_is_anonymous(vma) ||
  2473. !pud_present(orig_pud));
  2474. page = pud_page(orig_pud);
  2475. folio = page_folio(page);
  2476. folio_remove_rmap_pud(folio, page, vma);
  2477. add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
  2478. spin_unlock(ptl);
  2479. tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
  2480. }
  2481. return 1;
  2482. }
  2483. static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
  2484. unsigned long haddr)
  2485. {
  2486. struct folio *folio;
  2487. struct page *page;
  2488. pud_t old_pud;
  2489. VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
  2490. VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  2491. VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
  2492. VM_BUG_ON(!pud_trans_huge(*pud));
  2493. count_vm_event(THP_SPLIT_PUD);
  2494. old_pud = pudp_huge_clear_flush(vma, haddr, pud);
  2495. if (!vma_is_dax(vma))
  2496. return;
  2497. page = pud_page(old_pud);
  2498. folio = page_folio(page);
  2499. if (!folio_test_dirty(folio) && pud_dirty(old_pud))
  2500. folio_mark_dirty(folio);
  2501. if (!folio_test_referenced(folio) && pud_young(old_pud))
  2502. folio_set_referenced(folio);
  2503. folio_remove_rmap_pud(folio, page, vma);
  2504. folio_put(folio);
  2505. add_mm_counter(vma->vm_mm, mm_counter_file(folio),
  2506. -HPAGE_PUD_NR);
  2507. }
  2508. void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
  2509. unsigned long address)
  2510. {
  2511. spinlock_t *ptl;
  2512. struct mmu_notifier_range range;
  2513. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
  2514. address & HPAGE_PUD_MASK,
  2515. (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
  2516. mmu_notifier_invalidate_range_start(&range);
  2517. ptl = pud_lock(vma->vm_mm, pud);
  2518. if (unlikely(!pud_trans_huge(*pud)))
  2519. goto out;
  2520. __split_huge_pud_locked(vma, pud, range.start);
  2521. out:
  2522. spin_unlock(ptl);
  2523. mmu_notifier_invalidate_range_end(&range);
  2524. }
  2525. #else
  2526. void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
  2527. unsigned long address)
  2528. {
  2529. }
  2530. #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  2531. static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
  2532. unsigned long haddr, pmd_t *pmd)
  2533. {
  2534. struct mm_struct *mm = vma->vm_mm;
  2535. pgtable_t pgtable;
  2536. pmd_t _pmd, old_pmd;
  2537. unsigned long addr;
  2538. pte_t *pte;
  2539. int i;
  2540. /*
  2541. * Leave pmd empty until pte is filled note that it is fine to delay
  2542. * notification until mmu_notifier_invalidate_range_end() as we are
  2543. * replacing a zero pmd write protected page with a zero pte write
  2544. * protected page.
  2545. *
  2546. * See Documentation/mm/mmu_notifier.rst
  2547. */
  2548. old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
  2549. pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  2550. pmd_populate(mm, &_pmd, pgtable);
  2551. pte = pte_offset_map(&_pmd, haddr);
  2552. VM_BUG_ON(!pte);
  2553. for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
  2554. pte_t entry;
  2555. entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
  2556. entry = pte_mkspecial(entry);
  2557. if (pmd_uffd_wp(old_pmd))
  2558. entry = pte_mkuffd_wp(entry);
  2559. VM_BUG_ON(!pte_none(ptep_get(pte)));
  2560. set_pte_at(mm, addr, pte, entry);
  2561. pte++;
  2562. }
  2563. pte_unmap(pte - 1);
  2564. smp_wmb(); /* make pte visible before pmd */
  2565. pmd_populate(mm, pmd, pgtable);
  2566. }
  2567. static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
  2568. unsigned long haddr, bool freeze)
  2569. {
  2570. struct mm_struct *mm = vma->vm_mm;
  2571. struct folio *folio;
  2572. struct page *page;
  2573. pgtable_t pgtable;
  2574. pmd_t old_pmd, _pmd;
  2575. bool soft_dirty, uffd_wp = false, young = false, write = false;
  2576. bool anon_exclusive = false, dirty = false;
  2577. unsigned long addr;
  2578. pte_t *pte;
  2579. int i;
  2580. VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
  2581. VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
  2582. VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
  2583. VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(*pmd) && !pmd_trans_huge(*pmd));
  2584. count_vm_event(THP_SPLIT_PMD);
  2585. if (!vma_is_anonymous(vma)) {
  2586. old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
  2587. /*
  2588. * We are going to unmap this huge page. So
  2589. * just go ahead and zap it
  2590. */
  2591. if (arch_needs_pgtable_deposit())
  2592. zap_deposited_table(mm, pmd);
  2593. if (!vma_is_dax(vma) && vma_is_special_huge(vma))
  2594. return;
  2595. if (unlikely(pmd_is_migration_entry(old_pmd))) {
  2596. const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
  2597. folio = softleaf_to_folio(old_entry);
  2598. } else if (is_huge_zero_pmd(old_pmd)) {
  2599. return;
  2600. } else {
  2601. page = pmd_page(old_pmd);
  2602. folio = page_folio(page);
  2603. if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
  2604. folio_mark_dirty(folio);
  2605. if (!folio_test_referenced(folio) && pmd_young(old_pmd))
  2606. folio_set_referenced(folio);
  2607. folio_remove_rmap_pmd(folio, page, vma);
  2608. folio_put(folio);
  2609. }
  2610. add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
  2611. return;
  2612. }
  2613. if (is_huge_zero_pmd(*pmd)) {
  2614. /*
  2615. * FIXME: Do we want to invalidate secondary mmu by calling
  2616. * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
  2617. * inside __split_huge_pmd() ?
  2618. *
  2619. * We are going from a zero huge page write protected to zero
  2620. * small page also write protected so it does not seems useful
  2621. * to invalidate secondary mmu at this time.
  2622. */
  2623. return __split_huge_zero_page_pmd(vma, haddr, pmd);
  2624. }
  2625. if (pmd_is_migration_entry(*pmd)) {
  2626. softleaf_t entry;
  2627. old_pmd = *pmd;
  2628. entry = softleaf_from_pmd(old_pmd);
  2629. page = softleaf_to_page(entry);
  2630. folio = page_folio(page);
  2631. soft_dirty = pmd_swp_soft_dirty(old_pmd);
  2632. uffd_wp = pmd_swp_uffd_wp(old_pmd);
  2633. write = softleaf_is_migration_write(entry);
  2634. if (PageAnon(page))
  2635. anon_exclusive = softleaf_is_migration_read_exclusive(entry);
  2636. young = softleaf_is_migration_young(entry);
  2637. dirty = softleaf_is_migration_dirty(entry);
  2638. } else if (pmd_is_device_private_entry(*pmd)) {
  2639. softleaf_t entry;
  2640. old_pmd = *pmd;
  2641. entry = softleaf_from_pmd(old_pmd);
  2642. page = softleaf_to_page(entry);
  2643. folio = page_folio(page);
  2644. soft_dirty = pmd_swp_soft_dirty(old_pmd);
  2645. uffd_wp = pmd_swp_uffd_wp(old_pmd);
  2646. write = softleaf_is_device_private_write(entry);
  2647. anon_exclusive = PageAnonExclusive(page);
  2648. /*
  2649. * Device private THP should be treated the same as regular
  2650. * folios w.r.t anon exclusive handling. See the comments for
  2651. * folio handling and anon_exclusive below.
  2652. */
  2653. if (freeze && anon_exclusive &&
  2654. folio_try_share_anon_rmap_pmd(folio, page))
  2655. freeze = false;
  2656. if (!freeze) {
  2657. rmap_t rmap_flags = RMAP_NONE;
  2658. folio_ref_add(folio, HPAGE_PMD_NR - 1);
  2659. if (anon_exclusive)
  2660. rmap_flags |= RMAP_EXCLUSIVE;
  2661. folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
  2662. vma, haddr, rmap_flags);
  2663. }
  2664. } else {
  2665. /*
  2666. * Up to this point the pmd is present and huge and userland has
  2667. * the whole access to the hugepage during the split (which
  2668. * happens in place). If we overwrite the pmd with the not-huge
  2669. * version pointing to the pte here (which of course we could if
  2670. * all CPUs were bug free), userland could trigger a small page
  2671. * size TLB miss on the small sized TLB while the hugepage TLB
  2672. * entry is still established in the huge TLB. Some CPU doesn't
  2673. * like that. See
  2674. * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
  2675. * 383 on page 105. Intel should be safe but is also warns that
  2676. * it's only safe if the permission and cache attributes of the
  2677. * two entries loaded in the two TLB is identical (which should
  2678. * be the case here). But it is generally safer to never allow
  2679. * small and huge TLB entries for the same virtual address to be
  2680. * loaded simultaneously. So instead of doing "pmd_populate();
  2681. * flush_pmd_tlb_range();" we first mark the current pmd
  2682. * notpresent (atomically because here the pmd_trans_huge must
  2683. * remain set at all times on the pmd until the split is
  2684. * complete for this pmd), then we flush the SMP TLB and finally
  2685. * we write the non-huge version of the pmd entry with
  2686. * pmd_populate.
  2687. */
  2688. old_pmd = pmdp_invalidate(vma, haddr, pmd);
  2689. page = pmd_page(old_pmd);
  2690. folio = page_folio(page);
  2691. if (pmd_dirty(old_pmd)) {
  2692. dirty = true;
  2693. folio_set_dirty(folio);
  2694. }
  2695. write = pmd_write(old_pmd);
  2696. young = pmd_young(old_pmd);
  2697. soft_dirty = pmd_soft_dirty(old_pmd);
  2698. uffd_wp = pmd_uffd_wp(old_pmd);
  2699. VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
  2700. VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
  2701. /*
  2702. * Without "freeze", we'll simply split the PMD, propagating the
  2703. * PageAnonExclusive() flag for each PTE by setting it for
  2704. * each subpage -- no need to (temporarily) clear.
  2705. *
  2706. * With "freeze" we want to replace mapped pages by
  2707. * migration entries right away. This is only possible if we
  2708. * managed to clear PageAnonExclusive() -- see
  2709. * set_pmd_migration_entry().
  2710. *
  2711. * In case we cannot clear PageAnonExclusive(), split the PMD
  2712. * only and let try_to_migrate_one() fail later.
  2713. *
  2714. * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
  2715. */
  2716. anon_exclusive = PageAnonExclusive(page);
  2717. if (freeze && anon_exclusive &&
  2718. folio_try_share_anon_rmap_pmd(folio, page))
  2719. freeze = false;
  2720. if (!freeze) {
  2721. rmap_t rmap_flags = RMAP_NONE;
  2722. folio_ref_add(folio, HPAGE_PMD_NR - 1);
  2723. if (anon_exclusive)
  2724. rmap_flags |= RMAP_EXCLUSIVE;
  2725. folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
  2726. vma, haddr, rmap_flags);
  2727. }
  2728. }
  2729. /*
  2730. * Withdraw the table only after we mark the pmd entry invalid.
  2731. * This's critical for some architectures (Power).
  2732. */
  2733. pgtable = pgtable_trans_huge_withdraw(mm, pmd);
  2734. pmd_populate(mm, &_pmd, pgtable);
  2735. pte = pte_offset_map(&_pmd, haddr);
  2736. VM_BUG_ON(!pte);
  2737. /*
  2738. * Note that NUMA hinting access restrictions are not transferred to
  2739. * avoid any possibility of altering permissions across VMAs.
  2740. */
  2741. if (freeze || pmd_is_migration_entry(old_pmd)) {
  2742. pte_t entry;
  2743. swp_entry_t swp_entry;
  2744. for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
  2745. if (write)
  2746. swp_entry = make_writable_migration_entry(
  2747. page_to_pfn(page + i));
  2748. else if (anon_exclusive)
  2749. swp_entry = make_readable_exclusive_migration_entry(
  2750. page_to_pfn(page + i));
  2751. else
  2752. swp_entry = make_readable_migration_entry(
  2753. page_to_pfn(page + i));
  2754. if (young)
  2755. swp_entry = make_migration_entry_young(swp_entry);
  2756. if (dirty)
  2757. swp_entry = make_migration_entry_dirty(swp_entry);
  2758. entry = swp_entry_to_pte(swp_entry);
  2759. if (soft_dirty)
  2760. entry = pte_swp_mksoft_dirty(entry);
  2761. if (uffd_wp)
  2762. entry = pte_swp_mkuffd_wp(entry);
  2763. VM_WARN_ON(!pte_none(ptep_get(pte + i)));
  2764. set_pte_at(mm, addr, pte + i, entry);
  2765. }
  2766. } else if (pmd_is_device_private_entry(old_pmd)) {
  2767. pte_t entry;
  2768. swp_entry_t swp_entry;
  2769. for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
  2770. /*
  2771. * anon_exclusive was already propagated to the relevant
  2772. * pages corresponding to the pte entries when freeze
  2773. * is false.
  2774. */
  2775. if (write)
  2776. swp_entry = make_writable_device_private_entry(
  2777. page_to_pfn(page + i));
  2778. else
  2779. swp_entry = make_readable_device_private_entry(
  2780. page_to_pfn(page + i));
  2781. /*
  2782. * Young and dirty bits are not progated via swp_entry
  2783. */
  2784. entry = swp_entry_to_pte(swp_entry);
  2785. if (soft_dirty)
  2786. entry = pte_swp_mksoft_dirty(entry);
  2787. if (uffd_wp)
  2788. entry = pte_swp_mkuffd_wp(entry);
  2789. VM_WARN_ON(!pte_none(ptep_get(pte + i)));
  2790. set_pte_at(mm, addr, pte + i, entry);
  2791. }
  2792. } else {
  2793. pte_t entry;
  2794. entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
  2795. if (write)
  2796. entry = pte_mkwrite(entry, vma);
  2797. if (!young)
  2798. entry = pte_mkold(entry);
  2799. /* NOTE: this may set soft-dirty too on some archs */
  2800. if (dirty)
  2801. entry = pte_mkdirty(entry);
  2802. if (soft_dirty)
  2803. entry = pte_mksoft_dirty(entry);
  2804. if (uffd_wp)
  2805. entry = pte_mkuffd_wp(entry);
  2806. for (i = 0; i < HPAGE_PMD_NR; i++)
  2807. VM_WARN_ON(!pte_none(ptep_get(pte + i)));
  2808. set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
  2809. }
  2810. pte_unmap(pte);
  2811. if (!pmd_is_migration_entry(*pmd))
  2812. folio_remove_rmap_pmd(folio, page, vma);
  2813. if (freeze)
  2814. put_page(page);
  2815. smp_wmb(); /* make pte visible before pmd */
  2816. pmd_populate(mm, pmd, pgtable);
  2817. }
  2818. void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
  2819. pmd_t *pmd, bool freeze)
  2820. {
  2821. VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
  2822. if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd))
  2823. __split_huge_pmd_locked(vma, pmd, address, freeze);
  2824. }
  2825. void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  2826. unsigned long address, bool freeze)
  2827. {
  2828. spinlock_t *ptl;
  2829. struct mmu_notifier_range range;
  2830. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
  2831. address & HPAGE_PMD_MASK,
  2832. (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
  2833. mmu_notifier_invalidate_range_start(&range);
  2834. ptl = pmd_lock(vma->vm_mm, pmd);
  2835. split_huge_pmd_locked(vma, range.start, pmd, freeze);
  2836. spin_unlock(ptl);
  2837. mmu_notifier_invalidate_range_end(&range);
  2838. }
  2839. void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
  2840. bool freeze)
  2841. {
  2842. pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
  2843. if (!pmd)
  2844. return;
  2845. __split_huge_pmd(vma, pmd, address, freeze);
  2846. }
  2847. static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
  2848. {
  2849. /*
  2850. * If the new address isn't hpage aligned and it could previously
  2851. * contain an hugepage: check if we need to split an huge pmd.
  2852. */
  2853. if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
  2854. range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
  2855. ALIGN(address, HPAGE_PMD_SIZE)))
  2856. split_huge_pmd_address(vma, address, false);
  2857. }
  /*
   * Split any huge PMD that the boundaries @start and @end would cut
   * through in @vma, and in @next when one is given.
   */
  void vma_adjust_trans_huge(struct vm_area_struct *vma,
  			   unsigned long start,
  			   unsigned long end,
  			   struct vm_area_struct *next)
  {
  	/* The start boundary is checked first, then the end boundary. */
  	split_huge_pmd_if_needed(vma, start);
  	split_huge_pmd_if_needed(vma, end);

  	if (!next)
  		return;

  	/* If we're incrementing next->vm_start, we might need to split it. */
  	split_huge_pmd_if_needed(next, end);
  }
  2871. static void unmap_folio(struct folio *folio)
  2872. {
  2873. enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
  2874. TTU_BATCH_FLUSH;
  2875. VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
  2876. if (folio_test_pmd_mappable(folio))
  2877. ttu_flags |= TTU_SPLIT_HUGE_PMD;
  2878. /*
  2879. * Anon pages need migration entries to preserve them, but file
  2880. * pages can simply be left unmapped, then faulted back on demand.
  2881. * If that is ever changed (perhaps for mlock), update remap_page().
  2882. */
  2883. if (folio_test_anon(folio))
  2884. try_to_migrate(folio, ttu_flags);
  2885. else
  2886. try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
  2887. try_to_unmap_flush();
  2888. }
/*
 * Try to discard the PMD mapping of an anonymous @folio at @addr.
 *
 * Returns true when the PMD was cleared and the mapping torn down (rmap
 * removed, deposited page table zapped, MM_ANONPAGES decremented and one
 * folio reference dropped).
 *
 * Returns false, leaving the PMD in place (or restoring it), when the folio
 * is dirty and the VMA is not VM_DROPPABLE, or when the folio holds
 * references beyond its mappings plus one. In the dirty case the folio is
 * also marked swapbacked.
 */
static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
					    unsigned long addr, pmd_t *pmdp,
					    struct folio *folio)
{
	struct mm_struct *mm = vma->vm_mm;
	int ref_count, map_count;
	pmd_t orig_pmd = *pmdp;

	/* Transfer the PMD dirty bit to the folio before testing it. */
	if (pmd_dirty(orig_pmd))
		folio_set_dirty(folio);
	/* Early bail-out: the PMD has not been touched yet, nothing to undo. */
	if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
		folio_set_swapbacked(folio);
		return false;
	}

	/* From here on every failure path must put orig_pmd back. */
	orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);

	/*
	 * Syncing against concurrent GUP-fast:
	 * - clear PMD; barrier; read refcount
	 * - inc refcount; barrier; read PMD
	 */
	smp_mb();

	ref_count = folio_ref_count(folio);
	map_count = folio_mapcount(folio);

	/*
	 * Order reads for folio refcount and dirty flag
	 * (see comments in __remove_mapping()).
	 */
	smp_rmb();

	/*
	 * If the folio or its PMD is redirtied at this point, or if there
	 * are unexpected references, we will give up to discard this folio
	 * and remap it.
	 *
	 * The only folio refs must be one from isolation plus the rmap(s).
	 */
	if (pmd_dirty(orig_pmd))
		folio_set_dirty(folio);
	if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
		folio_set_swapbacked(folio);
		set_pmd_at(mm, addr, pmdp, orig_pmd);
		return false;
	}

	if (ref_count != map_count + 1) {
		set_pmd_at(mm, addr, pmdp, orig_pmd);
		return false;
	}

	/* Safe to discard: tear down the mapping and drop its reference. */
	folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
	zap_deposited_table(mm, pmdp);
	add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
	if (vma->vm_flags & VM_LOCKED)
		mlock_drain_local();
	folio_put(folio);

	return true;
}
  2942. bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
  2943. pmd_t *pmdp, struct folio *folio)
  2944. {
  2945. VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
  2946. VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
  2947. VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
  2948. VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
  2949. VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
  2950. return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
  2951. }
  2952. static void remap_page(struct folio *folio, unsigned long nr, int flags)
  2953. {
  2954. int i = 0;
  2955. /* If unmap_folio() uses try_to_migrate() on file, remove this check */
  2956. if (!folio_test_anon(folio))
  2957. return;
  2958. for (;;) {
  2959. remove_migration_ptes(folio, folio, TTU_RMAP_LOCKED | flags);
  2960. i += folio_nr_pages(folio);
  2961. if (i >= nr)
  2962. break;
  2963. folio = folio_next(folio);
  2964. }
  2965. }
  2966. static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
  2967. struct lruvec *lruvec, struct list_head *list)
  2968. {
  2969. VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
  2970. lockdep_assert_held(&lruvec->lru_lock);
  2971. if (folio_is_device_private(folio))
  2972. return;
  2973. if (list) {
  2974. /* page reclaim is reclaiming a huge page */
  2975. VM_WARN_ON(folio_test_lru(folio));
  2976. folio_get(new_folio);
  2977. list_add_tail(&new_folio->lru, list);
  2978. } else {
  2979. /* head is still on lru (and we have it frozen) */
  2980. VM_WARN_ON(!folio_test_lru(folio));
  2981. if (folio_test_unevictable(folio))
  2982. new_folio->mlock_count = 0;
  2983. else
  2984. list_add_tail(&new_folio->lru, &folio->lru);
  2985. folio_set_lru(new_folio);
  2986. }
  2987. }
  2988. static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
  2989. {
  2990. for (; nr_pages; page++, nr_pages--)
  2991. if (PageHWPoison(page))
  2992. return true;
  2993. return false;
  2994. }
/*
 * It splits @folio into @new_order folios and copies the @folio metadata to
 * all the resulting folios.
 *
 * @folio:     the folio being split; it keeps the first 1 << @new_order pages
 * @old_order: order of @folio before the split
 * @new_order: order of every resulting folio
 *
 * Each new head page is fully set up (flags, mapping, index, swap entry)
 * before it stops being a tail page; @folio's own order is only lowered
 * after all new folios have been prepared.
 */
static void __split_folio_to_order(struct folio *folio, int old_order,
		int new_order)
{
	/* Scan poisoned pages when split a poisoned folio to large folios */
	const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order;
	long new_nr_pages = 1 << new_order;
	long nr_pages = 1 << old_order;
	long i;

	/* Cleared unconditionally; set again below only where still true. */
	folio_clear_has_hwpoisoned(folio);

	/* Check first new_nr_pages since the loop below skips them */
	if (handle_hwpoison &&
	    page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages))
		folio_set_has_hwpoisoned(folio);
	/*
	 * Skip the first new_nr_pages, since the new folio from them have all
	 * the flags from the original folio.
	 */
	for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
		struct page *new_head = &folio->page + i;
		/*
		 * Careful: new_folio is not a "real" folio before we cleared PageTail.
		 * Don't pass it around before clear_compound_head().
		 */
		struct folio *new_folio = (struct folio *)new_head;

		VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head);

		/*
		 * Clone page flags before unfreezing refcount.
		 *
		 * After successful get_page_unless_zero() might follow flags change,
		 * for example lock_page() which set PG_waiters.
		 *
		 * Note that for mapped sub-pages of an anonymous THP,
		 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
		 * the migration entry instead from where remap_page() will restore it.
		 * We can still have PG_anon_exclusive set on effectively unmapped and
		 * unreferenced sub-pages of an anonymous THP: we can simply drop
		 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
		 */
		new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
		new_folio->flags.f |= (folio->flags.f &
				((1L << PG_referenced) |
				 (1L << PG_swapbacked) |
				 (1L << PG_swapcache) |
				 (1L << PG_mlocked) |
				 (1L << PG_uptodate) |
				 (1L << PG_active) |
				 (1L << PG_workingset) |
				 (1L << PG_locked) |
				 (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_2
				 (1L << PG_arch_2) |
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
				 (1L << PG_arch_3) |
#endif
				 (1L << PG_dirty) |
				 LRU_GEN_MASK | LRU_REFS_MASK));

		/* Re-derive has_hwpoisoned for this new large folio's own pages. */
		if (handle_hwpoison &&
		    page_range_has_hwpoisoned(new_head, new_nr_pages))
			folio_set_has_hwpoisoned(new_folio);

		/* The new folio starts i pages into the original one. */
		new_folio->mapping = folio->mapping;
		new_folio->index = folio->index + i;

		if (folio_test_swapcache(folio))
			new_folio->swap.val = folio->swap.val + i;

		/* Page flags must be visible before we make the page non-compound. */
		smp_wmb();

		/*
		 * Clear PageTail before unfreezing page refcount.
		 *
		 * After successful get_page_unless_zero() might follow put_page()
		 * which needs correct compound_head().
		 */
		clear_compound_head(new_head);
		if (new_order) {
			prep_compound_page(new_head, new_order);
			folio_set_large_rmappable(new_folio);
		}

		/* Young/idle state is copied via the folio helpers, not the flag mask. */
		if (folio_test_young(folio))
			folio_set_young(new_folio);
		if (folio_test_idle(folio))
			folio_set_idle(new_folio);
#ifdef CONFIG_MEMCG
		new_folio->memcg_data = folio->memcg_data;
#endif

		folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
	}

	/* Finally shrink @folio itself to the new order (or to a single page). */
	if (new_order)
		folio_set_order(folio, new_order);
	else
		ClearPageCompound(&folio->page);
}
  3090. /**
  3091. * __split_unmapped_folio() - splits an unmapped @folio to lower order folios in
  3092. * two ways: uniform split or non-uniform split.
  3093. * @folio: the to-be-split folio
  3094. * @new_order: the smallest order of the after split folios (since buddy
  3095. * allocator like split generates folios with orders from @folio's
  3096. * order - 1 to new_order).
  3097. * @split_at: in buddy allocator like split, the folio containing @split_at
  3098. * will be split until its order becomes @new_order.
  3099. * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
  3100. * @mapping: @folio->mapping
  3101. * @split_type: if the split is uniform or not (buddy allocator like split)
  3102. *
  3103. *
  3104. * 1. uniform split: the given @folio into multiple @new_order small folios,
  3105. * where all small folios have the same order. This is done when
  3106. * split_type is SPLIT_TYPE_UNIFORM.
  3107. * 2. buddy allocator like (non-uniform) split: the given @folio is split into
  3108. * half and one of the half (containing the given page) is split into half
  3109. * until the given @folio's order becomes @new_order. This is done when
  3110. * split_type is SPLIT_TYPE_NON_UNIFORM.
  3111. *
  3112. * The high level flow for these two methods are:
  3113. *
  3114. * 1. uniform split: @xas is split with no expectation of failure and a single
  3115. * __split_folio_to_order() is called to split the @folio into @new_order
  3116. * along with stats update.
  3117. * 2. non-uniform split: folio_order - @new_order calls to
  3118. * __split_folio_to_order() are expected to be made in a for loop to split
  3119. * the @folio to one lower order at a time. The folio containing @split_at
  3120. * is split in each iteration. @xas is split into half in each iteration and
  3121. * can fail. A failed @xas split leaves split folios as is without merging
  3122. * them back.
  3123. *
  3124. * After splitting, the caller's folio reference will be transferred to the
  3125. * folio containing @split_at. The caller needs to unlock and/or free
  3126. * after-split folios if necessary.
  3127. *
  3128. * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
  3129. * split but not to @new_order, the caller needs to check)
  3130. */
  3131. static int __split_unmapped_folio(struct folio *folio, int new_order,
  3132. struct page *split_at, struct xa_state *xas,
  3133. struct address_space *mapping, enum split_type split_type)
  3134. {
  3135. const bool is_anon = folio_test_anon(folio);
  3136. int old_order = folio_order(folio);
  3137. int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1;
  3138. struct folio *old_folio = folio;
  3139. int split_order;
  3140. /*
  3141. * split to new_order one order at a time. For uniform split,
  3142. * folio is split to new_order directly.
  3143. */
  3144. for (split_order = start_order;
  3145. split_order >= new_order;
  3146. split_order--) {
  3147. int nr_new_folios = 1UL << (old_order - split_order);
  3148. /* order-1 anonymous folio is not supported */
  3149. if (is_anon && split_order == 1)
  3150. continue;
  3151. if (mapping) {
  3152. /*
  3153. * uniform split has xas_split_alloc() called before
  3154. * irq is disabled to allocate enough memory, whereas
  3155. * non-uniform split can handle ENOMEM.
  3156. * Use the to-be-split folio, so that a parallel
  3157. * folio_try_get() waits on it until xarray is updated
  3158. * with after-split folios and the original one is
  3159. * unfrozen.
  3160. */
  3161. if (split_type == SPLIT_TYPE_UNIFORM) {
  3162. xas_split(xas, old_folio, old_order);
  3163. } else {
  3164. xas_set_order(xas, folio->index, split_order);
  3165. xas_try_split(xas, old_folio, old_order);
  3166. if (xas_error(xas))
  3167. return xas_error(xas);
  3168. }
  3169. }
  3170. folio_split_memcg_refs(folio, old_order, split_order);
  3171. split_page_owner(&folio->page, old_order, split_order);
  3172. pgalloc_tag_split(folio, old_order, split_order);
  3173. __split_folio_to_order(folio, old_order, split_order);
  3174. if (is_anon) {
  3175. mod_mthp_stat(old_order, MTHP_STAT_NR_ANON, -1);
  3176. mod_mthp_stat(split_order, MTHP_STAT_NR_ANON, nr_new_folios);
  3177. }
  3178. /*
  3179. * If uniform split, the process is complete.
  3180. * If non-uniform, continue splitting the folio at @split_at
  3181. * as long as the next @split_order is >= @new_order.
  3182. */
  3183. folio = page_folio(split_at);
  3184. old_order = split_order;
  3185. }
  3186. return 0;
  3187. }
  3188. /**
  3189. * folio_check_splittable() - check if a folio can be split to a given order
  3190. * @folio: folio to be split
  3191. * @new_order: the smallest order of the after split folios (since buddy
  3192. * allocator like split generates folios with orders from @folio's
  3193. * order - 1 to new_order).
  3194. * @split_type: uniform or non-uniform split
  3195. *
  3196. * folio_check_splittable() checks if @folio can be split to @new_order using
  3197. * @split_type method. The truncated folio check must come first.
  3198. *
  3199. * Context: folio must be locked.
  3200. *
  3201. * Return: 0 - @folio can be split to @new_order, otherwise an error number is
  3202. * returned.
  3203. */
  3204. int folio_check_splittable(struct folio *folio, unsigned int new_order,
  3205. enum split_type split_type)
  3206. {
  3207. VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
  3208. /*
  3209. * Folios that just got truncated cannot get split. Signal to the
  3210. * caller that there was a race.
  3211. *
  3212. * TODO: this will also currently refuse folios without a mapping in the
  3213. * swapcache (shmem or to-be-anon folios).
  3214. */
  3215. if (!folio->mapping && !folio_test_anon(folio))
  3216. return -EBUSY;
  3217. if (folio_test_anon(folio)) {
  3218. /* order-1 is not supported for anonymous THP. */
  3219. if (new_order == 1)
  3220. return -EINVAL;
  3221. } else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) {
  3222. if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
  3223. !mapping_large_folio_support(folio->mapping)) {
  3224. /*
  3225. * We can always split a folio down to a single page
  3226. * (new_order == 0) uniformly.
  3227. *
  3228. * For any other scenario
  3229. * a) uniform split targeting a large folio
  3230. * (new_order > 0)
  3231. * b) any non-uniform split
  3232. * we must confirm that the file system supports large
  3233. * folios.
  3234. *
  3235. * Note that we might still have THPs in such
  3236. * mappings, which is created from khugepaged when
  3237. * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that
  3238. * case, the mapping does not actually support large
  3239. * folios properly.
  3240. */
  3241. return -EINVAL;
  3242. }
  3243. }
  3244. /*
  3245. * swapcache folio could only be split to order 0
  3246. *
  3247. * non-uniform split creates after-split folios with orders from
  3248. * folio_order(folio) - 1 to new_order, making it not suitable for any
  3249. * swapcache folio split. Only uniform split to order-0 can be used
  3250. * here.
  3251. */
  3252. if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) {
  3253. return -EINVAL;
  3254. }
  3255. if (is_huge_zero_folio(folio))
  3256. return -EINVAL;
  3257. if (folio_test_writeback(folio))
  3258. return -EBUSY;
  3259. return 0;
  3260. }
  3261. /* Number of folio references from the pagecache or the swapcache. */
  3262. static unsigned int folio_cache_ref_count(const struct folio *folio)
  3263. {
  3264. if (folio_test_anon(folio) && !folio_test_swapcache(folio))
  3265. return 0;
  3266. return folio_nr_pages(folio);
  3267. }
  3268. static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
  3269. struct page *split_at, struct xa_state *xas,
  3270. struct address_space *mapping, bool do_lru,
  3271. struct list_head *list, enum split_type split_type,
  3272. pgoff_t end, int *nr_shmem_dropped)
  3273. {
  3274. struct folio *end_folio = folio_next(folio);
  3275. struct folio *new_folio, *next;
  3276. int old_order = folio_order(folio);
  3277. int ret = 0;
  3278. struct deferred_split *ds_queue;
  3279. VM_WARN_ON_ONCE(!mapping && end);
  3280. /* Prevent deferred_split_scan() touching ->_refcount */
  3281. ds_queue = folio_split_queue_lock(folio);
  3282. if (folio_ref_freeze(folio, folio_cache_ref_count(folio) + 1)) {
  3283. struct swap_cluster_info *ci = NULL;
  3284. struct lruvec *lruvec;
  3285. if (old_order > 1) {
  3286. if (!list_empty(&folio->_deferred_list)) {
  3287. ds_queue->split_queue_len--;
  3288. /*
  3289. * Reinitialize page_deferred_list after removing the
  3290. * page from the split_queue, otherwise a subsequent
  3291. * split will see list corruption when checking the
  3292. * page_deferred_list.
  3293. */
  3294. list_del_init(&folio->_deferred_list);
  3295. }
  3296. if (folio_test_partially_mapped(folio)) {
  3297. folio_clear_partially_mapped(folio);
  3298. mod_mthp_stat(old_order,
  3299. MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
  3300. }
  3301. }
  3302. split_queue_unlock(ds_queue);
  3303. if (mapping) {
  3304. int nr = folio_nr_pages(folio);
  3305. if (folio_test_pmd_mappable(folio) &&
  3306. new_order < HPAGE_PMD_ORDER) {
  3307. if (folio_test_swapbacked(folio)) {
  3308. lruvec_stat_mod_folio(folio,
  3309. NR_SHMEM_THPS, -nr);
  3310. } else {
  3311. lruvec_stat_mod_folio(folio,
  3312. NR_FILE_THPS, -nr);
  3313. filemap_nr_thps_dec(mapping);
  3314. }
  3315. }
  3316. }
  3317. if (folio_test_swapcache(folio)) {
  3318. if (mapping) {
  3319. VM_WARN_ON_ONCE_FOLIO(mapping, folio);
  3320. return -EINVAL;
  3321. }
  3322. ci = swap_cluster_get_and_lock(folio);
  3323. }
  3324. /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
  3325. if (do_lru)
  3326. lruvec = folio_lruvec_lock(folio);
  3327. ret = __split_unmapped_folio(folio, new_order, split_at, xas,
  3328. mapping, split_type);
  3329. /*
  3330. * Unfreeze after-split folios and put them back to the right
  3331. * list. @folio should be kept frozon until page cache
  3332. * entries are updated with all the other after-split folios
  3333. * to prevent others seeing stale page cache entries.
  3334. * As a result, new_folio starts from the next folio of
  3335. * @folio.
  3336. */
  3337. for (new_folio = folio_next(folio); new_folio != end_folio;
  3338. new_folio = next) {
  3339. unsigned long nr_pages = folio_nr_pages(new_folio);
  3340. next = folio_next(new_folio);
  3341. zone_device_private_split_cb(folio, new_folio);
  3342. folio_ref_unfreeze(new_folio,
  3343. folio_cache_ref_count(new_folio) + 1);
  3344. if (do_lru)
  3345. lru_add_split_folio(folio, new_folio, lruvec, list);
  3346. /*
  3347. * Anonymous folio with swap cache.
  3348. * NOTE: shmem in swap cache is not supported yet.
  3349. */
  3350. if (ci) {
  3351. __swap_cache_replace_folio(ci, folio, new_folio);
  3352. continue;
  3353. }
  3354. /* Anonymous folio without swap cache */
  3355. if (!mapping)
  3356. continue;
  3357. /* Add the new folio to the page cache. */
  3358. if (new_folio->index < end) {
  3359. __xa_store(&mapping->i_pages, new_folio->index,
  3360. new_folio, 0);
  3361. continue;
  3362. }
  3363. VM_WARN_ON_ONCE(!nr_shmem_dropped);
  3364. /* Drop folio beyond EOF: ->index >= end */
  3365. if (shmem_mapping(mapping) && nr_shmem_dropped)
  3366. *nr_shmem_dropped += nr_pages;
  3367. else if (folio_test_clear_dirty(new_folio))
  3368. folio_account_cleaned(
  3369. new_folio, inode_to_wb(mapping->host));
  3370. __filemap_remove_folio(new_folio, NULL);
  3371. folio_put_refs(new_folio, nr_pages);
  3372. }
  3373. zone_device_private_split_cb(folio, NULL);
  3374. /*
  3375. * Unfreeze @folio only after all page cache entries, which
  3376. * used to point to it, have been updated with new folios.
  3377. * Otherwise, a parallel folio_try_get() can grab @folio
  3378. * and its caller can see stale page cache entries.
  3379. */
  3380. folio_ref_unfreeze(folio, folio_cache_ref_count(folio) + 1);
  3381. if (do_lru)
  3382. unlock_page_lruvec(lruvec);
  3383. if (ci)
  3384. swap_cluster_unlock(ci);
  3385. } else {
  3386. split_queue_unlock(ds_queue);
  3387. return -EAGAIN;
  3388. }
  3389. return ret;
  3390. }
/**
 * __folio_split() - split a folio at @split_at to a @new_order folio
 * @folio: folio to split
 * @new_order: the order of the new folio
 * @split_at: a page within the new folio
 * @lock_at: a page within @folio to be left locked to caller
 * @list: after-split folios will be put on it if non NULL
 * @split_type: perform uniform split or not (non-uniform split)
 *
 * It calls __folio_freeze_and_split_unmapped() to perform uniform and
 * non-uniform split. It is in charge of checking whether the split is
 * supported or not, taking the rmap locks, unmapping @folio beforehand and
 * remapping and unlocking the after-split folios afterwards.
 *
 * After splitting, the after-split folio containing @lock_at remains locked
 * and others are unlocked:
 * 1. for uniform split, @lock_at points to one of @folio's subpages;
 * 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio.
 *
 * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
 * split but not to @new_order, the caller needs to check)
 */
static int __folio_split(struct folio *folio, unsigned int new_order,
		struct page *split_at, struct page *lock_at,
		struct list_head *list, enum split_type split_type)
{
	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
	struct folio *end_folio = folio_next(folio);
	bool is_anon = folio_test_anon(folio);
	struct address_space *mapping = NULL;
	struct anon_vma *anon_vma = NULL;
	int old_order = folio_order(folio);
	struct folio *new_folio, *next;
	int nr_shmem_dropped = 0;
	enum ttu_flags ttu_flags = 0;
	int ret;
	pgoff_t end = 0;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);

	/* Both @split_at and @lock_at must belong to @folio. */
	if (folio != page_folio(split_at) || folio != page_folio(lock_at)) {
		ret = -EINVAL;
		goto out;
	}

	/* A split must lower the order. */
	if (new_order >= old_order) {
		ret = -EINVAL;
		goto out;
	}

	ret = folio_check_splittable(folio, new_order, split_type);
	if (ret) {
		VM_WARN_ONCE(ret == -EINVAL, "Tried to split an unsplittable folio");
		goto out;
	}

	if (is_anon) {
		/*
		 * The caller does not necessarily hold an mmap_lock that would
		 * prevent the anon_vma disappearing so we first take a
		 * reference to it and then lock the anon_vma for write. This
		 * is similar to folio_lock_anon_vma_read except the write lock
		 * is taken to serialise against parallel split or collapse
		 * operations.
		 */
		anon_vma = folio_get_anon_vma(folio);
		if (!anon_vma) {
			ret = -EBUSY;
			goto out;
		}
		anon_vma_lock_write(anon_vma);
		mapping = NULL;
	} else {
		unsigned int min_order;
		gfp_t gfp;

		mapping = folio->mapping;
		/* The mapping may impose a minimum folio order. */
		min_order = mapping_min_folio_order(folio->mapping);
		if (new_order < min_order) {
			ret = -EINVAL;
			goto out;
		}

		gfp = current_gfp_context(mapping_gfp_mask(mapping) &
							GFP_RECLAIM_MASK);

		if (!filemap_release_folio(folio, gfp)) {
			ret = -EBUSY;
			goto out;
		}

		/*
		 * Uniform split: allocate for the xarray split here, before
		 * interrupts are disabled further down.
		 */
		if (split_type == SPLIT_TYPE_UNIFORM) {
			xas_set_order(&xas, folio->index, new_order);
			xas_split_alloc(&xas, folio, old_order, gfp);
			if (xas_error(&xas)) {
				ret = xas_error(&xas);
				goto out;
			}
		}

		anon_vma = NULL;
		i_mmap_lock_read(mapping);

		/*
		 * __folio_freeze_and_split_unmapped() may need to trim off
		 * pages beyond EOF: but on 32-bit, i_size_read() takes an
		 * irq-unsafe seqlock, which cannot be nested inside the page
		 * tree lock. So note end now: i_size itself may be changed at
		 * any moment, but folio lock is good enough to serialize the
		 * trimming.
		 */
		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (shmem_mapping(mapping))
			end = shmem_fallocend(mapping->host, end);
	}

	/*
	 * Racy check if we can split the page, before unmap_folio() will
	 * split PMDs
	 */
	if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	unmap_folio(folio);

	/* block interrupt reentry in xa_lock and spinlock */
	local_irq_disable();
	if (mapping) {
		/*
		 * Check if the folio is present in page cache.
		 * We assume all tail are present too, if folio is there.
		 */
		xas_lock(&xas);
		xas_reset(&xas);
		if (xas_load(&xas) != folio) {
			ret = -EAGAIN;
			goto fail;
		}
	}

	ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping,
						true, list, split_type, end, &nr_shmem_dropped);
fail:
	/* Reached on success and on failure alike; @folio is unmapped here. */
	if (mapping)
		xas_unlock(&xas);

	local_irq_enable();

	if (nr_shmem_dropped)
		shmem_uncharge(mapping->host, nr_shmem_dropped);

	/* The shared zeropage is only requested after a successful anon split. */
	if (!ret && is_anon && !folio_is_device_private(folio))
		ttu_flags = TTU_USE_SHARED_ZEROPAGE;

	/* Remap all pages of the original folio, whether it was split or not. */
	remap_page(folio, 1 << old_order, ttu_flags);

	/*
	 * Unlock all after-split folios except the one containing
	 * @lock_at page. If @folio is not split, it will be kept locked.
	 */
	for (new_folio = folio; new_folio != end_folio; new_folio = next) {
		next = folio_next(new_folio);
		if (new_folio == page_folio(lock_at))
			continue;

		folio_unlock(new_folio);
		/*
		 * Subpages may be freed if there wasn't any mapping
		 * like if add_to_swap() is running on a lru page that
		 * had its mapping zapped. And freeing these pages
		 * requires taking the lru_lock so we do the put_page
		 * of the tail pages after the split is complete.
		 */
		free_folio_and_swap_cache(new_folio);
	}

out_unlock:
	/* Exactly one of anon_vma / mapping is set once this label is reachable. */
	if (anon_vma) {
		anon_vma_unlock_write(anon_vma);
		put_anon_vma(anon_vma);
	}
	if (mapping)
		i_mmap_unlock_read(mapping);
out:
	xas_destroy(&xas);
	/* Every attempt is counted, including the ones rejected up front. */
	if (old_order == HPAGE_PMD_ORDER)
		count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
	count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
	return ret;
}
  3560. /**
  3561. * folio_split_unmapped() - split a large anon folio that is already unmapped
  3562. * @folio: folio to split
  3563. * @new_order: the order of folios after split
  3564. *
  3565. * This function is a helper for splitting folios that have already been
  3566. * unmapped. The use case is that the device or the CPU can refuse to migrate
  3567. * THP pages in the middle of migration, due to allocation issues on either
  3568. * side.
  3569. *
  3570. * anon_vma_lock is not required to be held, mmap_read_lock() or
  3571. * mmap_write_lock() should be held. @folio is expected to be locked by the
  3572. * caller. device-private and non device-private folios are supported along
  3573. * with folios that are in the swapcache. @folio should also be unmapped and
  3574. * isolated from LRU (if applicable)
  3575. *
  3576. * Upon return, the folio is not remapped, split folios are not added to LRU,
  3577. * free_folio_and_swap_cache() is not called, and new folios remain locked.
  3578. *
  3579. * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to
  3580. * insufficient reference count or extra pins).
  3581. */
  3582. int folio_split_unmapped(struct folio *folio, unsigned int new_order)
  3583. {
  3584. int ret = 0;
  3585. VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
  3586. VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
  3587. VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
  3588. VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio);
  3589. if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1)
  3590. return -EAGAIN;
  3591. local_irq_disable();
  3592. ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL,
  3593. NULL, false, NULL, SPLIT_TYPE_UNIFORM,
  3594. 0, NULL);
  3595. local_irq_enable();
  3596. return ret;
  3597. }
  3598. /*
  3599. * This function splits a large folio into smaller folios of order @new_order.
  3600. * @page can point to any page of the large folio to split. The split operation
  3601. * does not change the position of @page.
  3602. *
  3603. * Prerequisites:
  3604. *
  3605. * 1) The caller must hold a reference on the @page's owning folio, also known
  3606. * as the large folio.
  3607. *
  3608. * 2) The large folio must be locked.
  3609. *
  3610. * 3) The folio must not be pinned. Any unexpected folio references, including
  3611. * GUP pins, will result in the folio not getting split; instead, the caller
  3612. * will receive an -EAGAIN.
  3613. *
  3614. * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
  3615. * supported for non-file-backed folios, because folio->_deferred_list, which
  3616. * is used by partially mapped folios, is stored in subpage 2, but an order-1
  3617. * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
  3618. * since they do not use _deferred_list.
  3619. *
  3620. * After splitting, the caller's folio reference will be transferred to @page,
  3621. * resulting in a raised refcount of @page after this call. The other pages may
  3622. * be freed if they are not mapped.
  3623. *
  3624. * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
  3625. *
  3626. * Pages in @new_order will inherit the mapping, flags, and so on from the
  3627. * huge page.
  3628. *
  3629. * Returns 0 if the huge page was split successfully.
  3630. *
  3631. * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
  3632. * the folio was concurrently removed from the page cache.
  3633. *
  3634. * Returns -EBUSY when trying to split the huge zeropage, if the folio is
  3635. * under writeback, if fs-specific folio metadata cannot currently be
  3636. * released, or if some unexpected race happened (e.g., anon VMA disappeared,
  3637. * truncation).
  3638. *
  3639. * Callers should ensure that the order respects the address space mapping
  3640. * min-order if one is set for non-anonymous folios.
  3641. *
  3642. * Returns -EINVAL when trying to split to an order that is incompatible
  3643. * with the folio. Splitting to order 0 is compatible with all folios.
  3644. */
  3645. int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
  3646. unsigned int new_order)
  3647. {
  3648. struct folio *folio = page_folio(page);
  3649. return __folio_split(folio, new_order, &folio->page, page, list,
  3650. SPLIT_TYPE_UNIFORM);
  3651. }
  3652. /**
  3653. * folio_split() - split a folio at @split_at to a @new_order folio
  3654. * @folio: folio to split
  3655. * @new_order: the order of the new folio
  3656. * @split_at: a page within the new folio
  3657. * @list: after-split folios are added to @list if not null, otherwise to LRU
  3658. * list
  3659. *
  3660. * It has the same prerequisites and returns as
  3661. * split_huge_page_to_list_to_order().
  3662. *
  3663. * Split a folio at @split_at to a new_order folio, leave the
  3664. * remaining subpages of the original folio as large as possible. For example,
  3665. * in the case of splitting an order-9 folio at its third order-3 subpages to
  3666. * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio.
  3667. * After the split, there will be a group of folios with different orders and
  3668. * the new folio containing @split_at is marked in bracket:
  3669. * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
  3670. *
  3671. * After split, folio is left locked for caller.
  3672. *
  3673. * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
  3674. * split but not to @new_order, the caller needs to check)
  3675. */
  3676. int folio_split(struct folio *folio, unsigned int new_order,
  3677. struct page *split_at, struct list_head *list)
  3678. {
  3679. return __folio_split(folio, new_order, split_at, &folio->page, list,
  3680. SPLIT_TYPE_NON_UNIFORM);
  3681. }
  3682. /**
  3683. * min_order_for_split() - get the minimum order @folio can be split to
  3684. * @folio: folio to split
  3685. *
  3686. * min_order_for_split() tells the minimum order @folio can be split to.
  3687. * If a file-backed folio is truncated, 0 will be returned. Any subsequent
  3688. * split attempt should get -EBUSY from split checking code.
  3689. *
  3690. * Return: @folio's minimum order for split
  3691. */
  3692. unsigned int min_order_for_split(struct folio *folio)
  3693. {
  3694. if (folio_test_anon(folio))
  3695. return 0;
  3696. /*
  3697. * If the folio got truncated, we don't know the previous mapping and
  3698. * consequently the old min order. But it doesn't matter, as any split
  3699. * attempt will immediately fail with -EBUSY as the folio cannot get
  3700. * split until freed.
  3701. */
  3702. if (!folio->mapping)
  3703. return 0;
  3704. return mapping_min_folio_order(folio->mapping);
  3705. }
  3706. int split_folio_to_list(struct folio *folio, struct list_head *list)
  3707. {
  3708. return split_huge_page_to_list_to_order(&folio->page, list, 0);
  3709. }
  3710. /*
  3711. * __folio_unqueue_deferred_split() is not to be called directly:
  3712. * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
  3713. * limits its calls to those folios which may have a _deferred_list for
  3714. * queueing THP splits, and that list is (racily observed to be) non-empty.
  3715. *
  3716. * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
  3717. * zero: because even when split_queue_lock is held, a non-empty _deferred_list
  3718. * might be in use on deferred_split_scan()'s unlocked on-stack list.
  3719. *
  3720. * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
  3721. * therefore important to unqueue deferred split before changing folio memcg.
  3722. */
  3723. bool __folio_unqueue_deferred_split(struct folio *folio)
  3724. {
  3725. struct deferred_split *ds_queue;
  3726. unsigned long flags;
  3727. bool unqueued = false;
  3728. WARN_ON_ONCE(folio_ref_count(folio));
  3729. WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
  3730. ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
  3731. if (!list_empty(&folio->_deferred_list)) {
  3732. ds_queue->split_queue_len--;
  3733. if (folio_test_partially_mapped(folio)) {
  3734. folio_clear_partially_mapped(folio);
  3735. mod_mthp_stat(folio_order(folio),
  3736. MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
  3737. }
  3738. list_del_init(&folio->_deferred_list);
  3739. unqueued = true;
  3740. }
  3741. split_queue_unlock_irqrestore(ds_queue, flags);
  3742. return unqueued; /* useful for debug warnings */
  3743. }
  3744. /* partially_mapped=false won't clear PG_partially_mapped folio flag */
  3745. void deferred_split_folio(struct folio *folio, bool partially_mapped)
  3746. {
  3747. struct deferred_split *ds_queue;
  3748. unsigned long flags;
  3749. /*
  3750. * Order 1 folios have no space for a deferred list, but we also
  3751. * won't waste much memory by not adding them to the deferred list.
  3752. */
  3753. if (folio_order(folio) <= 1)
  3754. return;
  3755. if (!partially_mapped && !split_underused_thp)
  3756. return;
  3757. /*
  3758. * Exclude swapcache: originally to avoid a corrupt deferred split
  3759. * queue. Nowadays that is fully prevented by memcg1_swapout();
  3760. * but if page reclaim is already handling the same folio, it is
  3761. * unnecessary to handle it again in the shrinker, so excluding
  3762. * swapcache here may still be a useful optimization.
  3763. */
  3764. if (folio_test_swapcache(folio))
  3765. return;
  3766. ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
  3767. if (partially_mapped) {
  3768. if (!folio_test_partially_mapped(folio)) {
  3769. folio_set_partially_mapped(folio);
  3770. if (folio_test_pmd_mappable(folio))
  3771. count_vm_event(THP_DEFERRED_SPLIT_PAGE);
  3772. count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
  3773. mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
  3774. }
  3775. } else {
  3776. /* partially mapped folios cannot become non-partially mapped */
  3777. VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
  3778. }
  3779. if (list_empty(&folio->_deferred_list)) {
  3780. struct mem_cgroup *memcg;
  3781. memcg = folio_split_queue_memcg(folio, ds_queue);
  3782. list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
  3783. ds_queue->split_queue_len++;
  3784. if (memcg)
  3785. set_shrinker_bit(memcg, folio_nid(folio),
  3786. shrinker_id(deferred_split_shrinker));
  3787. }
  3788. split_queue_unlock_irqrestore(ds_queue, flags);
  3789. }
  3790. static unsigned long deferred_split_count(struct shrinker *shrink,
  3791. struct shrink_control *sc)
  3792. {
  3793. struct pglist_data *pgdata = NODE_DATA(sc->nid);
  3794. struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
  3795. #ifdef CONFIG_MEMCG
  3796. if (sc->memcg)
  3797. ds_queue = &sc->memcg->deferred_split_queue;
  3798. #endif
  3799. return READ_ONCE(ds_queue->split_queue_len);
  3800. }
  3801. static bool thp_underused(struct folio *folio)
  3802. {
  3803. int num_zero_pages = 0, num_filled_pages = 0;
  3804. int i;
  3805. if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
  3806. return false;
  3807. if (folio_contain_hwpoisoned_page(folio))
  3808. return false;
  3809. for (i = 0; i < folio_nr_pages(folio); i++) {
  3810. if (pages_identical(folio_page(folio, i), ZERO_PAGE(0))) {
  3811. if (++num_zero_pages > khugepaged_max_ptes_none)
  3812. return true;
  3813. } else {
  3814. /*
  3815. * Another path for early exit once the number
  3816. * of non-zero filled pages exceeds threshold.
  3817. */
  3818. if (++num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none)
  3819. return false;
  3820. }
  3821. }
  3822. return false;
  3823. }
  3824. static unsigned long deferred_split_scan(struct shrinker *shrink,
  3825. struct shrink_control *sc)
  3826. {
  3827. struct deferred_split *ds_queue;
  3828. unsigned long flags;
  3829. struct folio *folio, *next;
  3830. int split = 0, i;
  3831. struct folio_batch fbatch;
  3832. folio_batch_init(&fbatch);
  3833. retry:
  3834. ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
  3835. /* Take pin on all head pages to avoid freeing them under us */
  3836. list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
  3837. _deferred_list) {
  3838. if (folio_try_get(folio)) {
  3839. folio_batch_add(&fbatch, folio);
  3840. } else if (folio_test_partially_mapped(folio)) {
  3841. /* We lost race with folio_put() */
  3842. folio_clear_partially_mapped(folio);
  3843. mod_mthp_stat(folio_order(folio),
  3844. MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
  3845. }
  3846. list_del_init(&folio->_deferred_list);
  3847. ds_queue->split_queue_len--;
  3848. if (!--sc->nr_to_scan)
  3849. break;
  3850. if (!folio_batch_space(&fbatch))
  3851. break;
  3852. }
  3853. split_queue_unlock_irqrestore(ds_queue, flags);
  3854. for (i = 0; i < folio_batch_count(&fbatch); i++) {
  3855. bool did_split = false;
  3856. bool underused = false;
  3857. struct deferred_split *fqueue;
  3858. folio = fbatch.folios[i];
  3859. if (!folio_test_partially_mapped(folio)) {
  3860. /*
  3861. * See try_to_map_unused_to_zeropage(): we cannot
  3862. * optimize zero-filled pages after splitting an
  3863. * mlocked folio.
  3864. */
  3865. if (folio_test_mlocked(folio))
  3866. goto next;
  3867. underused = thp_underused(folio);
  3868. if (!underused)
  3869. goto next;
  3870. }
  3871. if (!folio_trylock(folio))
  3872. goto next;
  3873. if (!split_folio(folio)) {
  3874. did_split = true;
  3875. if (underused)
  3876. count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
  3877. split++;
  3878. }
  3879. folio_unlock(folio);
  3880. next:
  3881. if (did_split || !folio_test_partially_mapped(folio))
  3882. continue;
  3883. /*
  3884. * Only add back to the queue if folio is partially mapped.
  3885. * If thp_underused returns false, or if split_folio fails
  3886. * in the case it was underused, then consider it used and
  3887. * don't add it back to split_queue.
  3888. */
  3889. fqueue = folio_split_queue_lock_irqsave(folio, &flags);
  3890. if (list_empty(&folio->_deferred_list)) {
  3891. list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
  3892. fqueue->split_queue_len++;
  3893. }
  3894. split_queue_unlock_irqrestore(fqueue, flags);
  3895. }
  3896. folios_put(&fbatch);
  3897. if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
  3898. cond_resched();
  3899. goto retry;
  3900. }
  3901. /*
  3902. * Stop shrinker if we didn't split any page, but the queue is empty.
  3903. * This can happen if pages were freed under us.
  3904. */
  3905. if (!split && list_empty(&ds_queue->split_queue))
  3906. return SHRINK_STOP;
  3907. return split;
  3908. }
  3909. #ifdef CONFIG_MEMCG
  3910. void reparent_deferred_split_queue(struct mem_cgroup *memcg)
  3911. {
  3912. struct mem_cgroup *parent = parent_mem_cgroup(memcg);
  3913. struct deferred_split *ds_queue = &memcg->deferred_split_queue;
  3914. struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
  3915. int nid;
  3916. spin_lock_irq(&ds_queue->split_queue_lock);
  3917. spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
  3918. if (!ds_queue->split_queue_len)
  3919. goto unlock;
  3920. list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
  3921. parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
  3922. ds_queue->split_queue_len = 0;
  3923. for_each_node(nid)
  3924. set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
  3925. unlock:
  3926. spin_unlock(&parent_ds_queue->split_queue_lock);
  3927. spin_unlock_irq(&ds_queue->split_queue_lock);
  3928. }
  3929. #endif
  3930. #ifdef CONFIG_DEBUG_FS
  3931. static void split_huge_pages_all(void)
  3932. {
  3933. struct zone *zone;
  3934. struct page *page;
  3935. struct folio *folio;
  3936. unsigned long pfn, max_zone_pfn;
  3937. unsigned long total = 0, split = 0;
  3938. pr_debug("Split all THPs\n");
  3939. for_each_zone(zone) {
  3940. if (!managed_zone(zone))
  3941. continue;
  3942. max_zone_pfn = zone_end_pfn(zone);
  3943. for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
  3944. int nr_pages;
  3945. page = pfn_to_online_page(pfn);
  3946. if (!page || PageTail(page))
  3947. continue;
  3948. folio = page_folio(page);
  3949. if (!folio_try_get(folio))
  3950. continue;
  3951. if (unlikely(page_folio(page) != folio))
  3952. goto next;
  3953. if (zone != folio_zone(folio))
  3954. goto next;
  3955. if (!folio_test_large(folio)
  3956. || folio_test_hugetlb(folio)
  3957. || !folio_test_lru(folio))
  3958. goto next;
  3959. total++;
  3960. folio_lock(folio);
  3961. nr_pages = folio_nr_pages(folio);
  3962. if (!split_folio(folio))
  3963. split++;
  3964. pfn += nr_pages - 1;
  3965. folio_unlock(folio);
  3966. next:
  3967. folio_put(folio);
  3968. cond_resched();
  3969. }
  3970. }
  3971. pr_debug("%lu of %lu THP split\n", split, total);
  3972. }
  3973. static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
  3974. {
  3975. return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
  3976. is_vm_hugetlb_page(vma);
  3977. }
  3978. static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
  3979. unsigned long vaddr_end, unsigned int new_order,
  3980. long in_folio_offset)
  3981. {
  3982. int ret = 0;
  3983. struct task_struct *task;
  3984. struct mm_struct *mm;
  3985. unsigned long total = 0, split = 0;
  3986. unsigned long addr;
  3987. vaddr_start &= PAGE_MASK;
  3988. vaddr_end &= PAGE_MASK;
  3989. task = find_get_task_by_vpid(pid);
  3990. if (!task) {
  3991. ret = -ESRCH;
  3992. goto out;
  3993. }
  3994. /* Find the mm_struct */
  3995. mm = get_task_mm(task);
  3996. put_task_struct(task);
  3997. if (!mm) {
  3998. ret = -EINVAL;
  3999. goto out;
  4000. }
  4001. pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
  4002. pid, vaddr_start, vaddr_end, new_order, in_folio_offset);
  4003. mmap_read_lock(mm);
  4004. /*
  4005. * always increase addr by PAGE_SIZE, since we could have a PTE page
  4006. * table filled with PTE-mapped THPs, each of which is distinct.
  4007. */
  4008. for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
  4009. struct vm_area_struct *vma = vma_lookup(mm, addr);
  4010. struct folio_walk fw;
  4011. struct folio *folio;
  4012. struct address_space *mapping;
  4013. unsigned int target_order = new_order;
  4014. if (!vma)
  4015. break;
  4016. /* skip special VMA and hugetlb VMA */
  4017. if (vma_not_suitable_for_thp_split(vma)) {
  4018. addr = vma->vm_end;
  4019. continue;
  4020. }
  4021. folio = folio_walk_start(&fw, vma, addr, 0);
  4022. if (!folio)
  4023. continue;
  4024. if (!is_transparent_hugepage(folio))
  4025. goto next;
  4026. if (!folio_test_anon(folio)) {
  4027. mapping = folio->mapping;
  4028. target_order = max(new_order,
  4029. mapping_min_folio_order(mapping));
  4030. }
  4031. if (target_order >= folio_order(folio))
  4032. goto next;
  4033. total++;
  4034. /*
  4035. * For folios with private, split_huge_page_to_list_to_order()
  4036. * will try to drop it before split and then check if the folio
  4037. * can be split or not. So skip the check here.
  4038. */
  4039. if (!folio_test_private(folio) &&
  4040. folio_expected_ref_count(folio) != folio_ref_count(folio))
  4041. goto next;
  4042. if (!folio_trylock(folio))
  4043. goto next;
  4044. folio_get(folio);
  4045. folio_walk_end(&fw, vma);
  4046. if (!folio_test_anon(folio) && folio->mapping != mapping)
  4047. goto unlock;
  4048. if (in_folio_offset < 0 ||
  4049. in_folio_offset >= folio_nr_pages(folio)) {
  4050. if (!split_folio_to_order(folio, target_order))
  4051. split++;
  4052. } else {
  4053. struct page *split_at = folio_page(folio,
  4054. in_folio_offset);
  4055. if (!folio_split(folio, target_order, split_at, NULL))
  4056. split++;
  4057. }
  4058. unlock:
  4059. folio_unlock(folio);
  4060. folio_put(folio);
  4061. cond_resched();
  4062. continue;
  4063. next:
  4064. folio_walk_end(&fw, vma);
  4065. cond_resched();
  4066. }
  4067. mmap_read_unlock(mm);
  4068. mmput(mm);
  4069. pr_debug("%lu of %lu THP split\n", split, total);
  4070. out:
  4071. return ret;
  4072. }
  4073. static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
  4074. pgoff_t off_end, unsigned int new_order,
  4075. long in_folio_offset)
  4076. {
  4077. struct file *candidate;
  4078. struct address_space *mapping;
  4079. pgoff_t index;
  4080. int nr_pages = 1;
  4081. unsigned long total = 0, split = 0;
  4082. unsigned int min_order;
  4083. unsigned int target_order;
  4084. CLASS(filename_kernel, file)(file_path);
  4085. candidate = file_open_name(file, O_RDONLY, 0);
  4086. if (IS_ERR(candidate))
  4087. return -EINVAL;
  4088. pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
  4089. file_path, off_start, off_end, new_order, in_folio_offset);
  4090. mapping = candidate->f_mapping;
  4091. min_order = mapping_min_folio_order(mapping);
  4092. target_order = max(new_order, min_order);
  4093. for (index = off_start; index < off_end; index += nr_pages) {
  4094. struct folio *folio = filemap_get_folio(mapping, index);
  4095. nr_pages = 1;
  4096. if (IS_ERR(folio))
  4097. continue;
  4098. if (!folio_test_large(folio))
  4099. goto next;
  4100. total++;
  4101. nr_pages = folio_nr_pages(folio);
  4102. if (target_order >= folio_order(folio))
  4103. goto next;
  4104. if (!folio_trylock(folio))
  4105. goto next;
  4106. if (folio->mapping != mapping)
  4107. goto unlock;
  4108. if (in_folio_offset < 0 || in_folio_offset >= nr_pages) {
  4109. if (!split_folio_to_order(folio, target_order))
  4110. split++;
  4111. } else {
  4112. struct page *split_at = folio_page(folio,
  4113. in_folio_offset);
  4114. if (!folio_split(folio, target_order, split_at, NULL))
  4115. split++;
  4116. }
  4117. unlock:
  4118. folio_unlock(folio);
  4119. next:
  4120. folio_put(folio);
  4121. cond_resched();
  4122. }
  4123. filp_close(candidate, NULL);
  4124. pr_debug("%lu of %lu file-backed THP split\n", split, total);
  4125. return 0;
  4126. }
  4127. #define MAX_INPUT_BUF_SZ 255
  4128. static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
  4129. size_t count, loff_t *ppops)
  4130. {
  4131. static DEFINE_MUTEX(split_debug_mutex);
  4132. ssize_t ret;
  4133. /*
  4134. * hold pid, start_vaddr, end_vaddr, new_order or
  4135. * file_path, off_start, off_end, new_order
  4136. */
  4137. char input_buf[MAX_INPUT_BUF_SZ];
  4138. int pid;
  4139. unsigned long vaddr_start, vaddr_end;
  4140. unsigned int new_order = 0;
  4141. long in_folio_offset = -1;
  4142. ret = mutex_lock_interruptible(&split_debug_mutex);
  4143. if (ret)
  4144. return ret;
  4145. ret = -EFAULT;
  4146. memset(input_buf, 0, MAX_INPUT_BUF_SZ);
  4147. if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
  4148. goto out;
  4149. input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
  4150. if (input_buf[0] == '/') {
  4151. char *tok;
  4152. char *tok_buf = input_buf;
  4153. char file_path[MAX_INPUT_BUF_SZ];
  4154. pgoff_t off_start = 0, off_end = 0;
  4155. size_t input_len = strlen(input_buf);
  4156. tok = strsep(&tok_buf, ",");
  4157. if (tok && tok_buf) {
  4158. strscpy(file_path, tok);
  4159. } else {
  4160. ret = -EINVAL;
  4161. goto out;
  4162. }
  4163. ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
  4164. &new_order, &in_folio_offset);
  4165. if (ret != 2 && ret != 3 && ret != 4) {
  4166. ret = -EINVAL;
  4167. goto out;
  4168. }
  4169. ret = split_huge_pages_in_file(file_path, off_start, off_end,
  4170. new_order, in_folio_offset);
  4171. if (!ret)
  4172. ret = input_len;
  4173. goto out;
  4174. }
  4175. ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
  4176. &vaddr_end, &new_order, &in_folio_offset);
  4177. if (ret == 1 && pid == 1) {
  4178. split_huge_pages_all();
  4179. ret = strlen(input_buf);
  4180. goto out;
  4181. } else if (ret != 3 && ret != 4 && ret != 5) {
  4182. ret = -EINVAL;
  4183. goto out;
  4184. }
  4185. ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
  4186. in_folio_offset);
  4187. if (!ret)
  4188. ret = strlen(input_buf);
  4189. out:
  4190. mutex_unlock(&split_debug_mutex);
  4191. return ret;
  4192. }
  4193. static const struct file_operations split_huge_pages_fops = {
  4194. .owner = THIS_MODULE,
  4195. .write = split_huge_pages_write,
  4196. };
  4197. static int __init split_huge_pages_debugfs(void)
  4198. {
  4199. debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
  4200. &split_huge_pages_fops);
  4201. return 0;
  4202. }
  4203. late_initcall(split_huge_pages_debugfs);
  4204. #endif
  4205. #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
  4206. int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
  4207. struct page *page)
  4208. {
  4209. struct folio *folio = page_folio(page);
  4210. struct vm_area_struct *vma = pvmw->vma;
  4211. struct mm_struct *mm = vma->vm_mm;
  4212. unsigned long address = pvmw->address;
  4213. bool anon_exclusive;
  4214. pmd_t pmdval;
  4215. swp_entry_t entry;
  4216. pmd_t pmdswp;
  4217. if (!(pvmw->pmd && !pvmw->pte))
  4218. return 0;
  4219. flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
  4220. if (unlikely(!pmd_present(*pvmw->pmd)))
  4221. pmdval = pmdp_huge_get_and_clear(vma->vm_mm, address, pvmw->pmd);
  4222. else
  4223. pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
  4224. /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
  4225. anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
  4226. if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
  4227. set_pmd_at(mm, address, pvmw->pmd, pmdval);
  4228. return -EBUSY;
  4229. }
  4230. if (pmd_dirty(pmdval))
  4231. folio_mark_dirty(folio);
  4232. if (pmd_write(pmdval))
  4233. entry = make_writable_migration_entry(page_to_pfn(page));
  4234. else if (anon_exclusive)
  4235. entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
  4236. else
  4237. entry = make_readable_migration_entry(page_to_pfn(page));
  4238. if (pmd_young(pmdval))
  4239. entry = make_migration_entry_young(entry);
  4240. if (pmd_dirty(pmdval))
  4241. entry = make_migration_entry_dirty(entry);
  4242. pmdswp = swp_entry_to_pmd(entry);
  4243. if (pmd_soft_dirty(pmdval))
  4244. pmdswp = pmd_swp_mksoft_dirty(pmdswp);
  4245. if (pmd_uffd_wp(pmdval))
  4246. pmdswp = pmd_swp_mkuffd_wp(pmdswp);
  4247. set_pmd_at(mm, address, pvmw->pmd, pmdswp);
  4248. folio_remove_rmap_pmd(folio, page, vma);
  4249. folio_put(folio);
  4250. trace_set_migration_pmd(address, pmd_val(pmdswp));
  4251. return 0;
  4252. }
  4253. void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
  4254. {
  4255. struct folio *folio = page_folio(new);
  4256. struct vm_area_struct *vma = pvmw->vma;
  4257. struct mm_struct *mm = vma->vm_mm;
  4258. unsigned long address = pvmw->address;
  4259. unsigned long haddr = address & HPAGE_PMD_MASK;
  4260. pmd_t pmde;
  4261. softleaf_t entry;
  4262. if (!(pvmw->pmd && !pvmw->pte))
  4263. return;
  4264. entry = softleaf_from_pmd(*pvmw->pmd);
  4265. folio_get(folio);
  4266. pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
  4267. if (pmd_swp_soft_dirty(*pvmw->pmd))
  4268. pmde = pmd_mksoft_dirty(pmde);
  4269. if (softleaf_is_migration_write(entry))
  4270. pmde = pmd_mkwrite(pmde, vma);
  4271. if (pmd_swp_uffd_wp(*pvmw->pmd))
  4272. pmde = pmd_mkuffd_wp(pmde);
  4273. if (!softleaf_is_migration_young(entry))
  4274. pmde = pmd_mkold(pmde);
  4275. /* NOTE: this may contain setting soft-dirty on some archs */
  4276. if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry))
  4277. pmde = pmd_mkdirty(pmde);
  4278. if (folio_is_device_private(folio)) {
  4279. swp_entry_t entry;
  4280. if (pmd_write(pmde))
  4281. entry = make_writable_device_private_entry(
  4282. page_to_pfn(new));
  4283. else
  4284. entry = make_readable_device_private_entry(
  4285. page_to_pfn(new));
  4286. pmde = swp_entry_to_pmd(entry);
  4287. if (pmd_swp_soft_dirty(*pvmw->pmd))
  4288. pmde = pmd_swp_mksoft_dirty(pmde);
  4289. if (pmd_swp_uffd_wp(*pvmw->pmd))
  4290. pmde = pmd_swp_mkuffd_wp(pmde);
  4291. }
  4292. if (folio_test_anon(folio)) {
  4293. rmap_t rmap_flags = RMAP_NONE;
  4294. if (!softleaf_is_migration_read(entry))
  4295. rmap_flags |= RMAP_EXCLUSIVE;
  4296. folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
  4297. } else {
  4298. folio_add_file_rmap_pmd(folio, new, vma);
  4299. }
  4300. VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
  4301. set_pmd_at(mm, haddr, pvmw->pmd, pmde);
  4302. /* No need to invalidate - it was non-present before */
  4303. update_mmu_cache_pmd(vma, address, pvmw->pmd);
  4304. trace_remove_migration_pmd(address, pmd_val(pmde));
  4305. }
  4306. #endif