swapfile.c 103 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
7377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * linux/mm/swapfile.c
  4. *
  5. * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
  6. * Swap reorganised 29.12.95, Stephen Tweedie
  7. */
  8. #include <linux/blkdev.h>
  9. #include <linux/mm.h>
  10. #include <linux/sched/mm.h>
  11. #include <linux/sched/task.h>
  12. #include <linux/hugetlb.h>
  13. #include <linux/mman.h>
  14. #include <linux/slab.h>
  15. #include <linux/kernel_stat.h>
  16. #include <linux/swap.h>
  17. #include <linux/vmalloc.h>
  18. #include <linux/pagemap.h>
  19. #include <linux/namei.h>
  20. #include <linux/shmem_fs.h>
  21. #include <linux/blk-cgroup.h>
  22. #include <linux/random.h>
  23. #include <linux/writeback.h>
  24. #include <linux/proc_fs.h>
  25. #include <linux/seq_file.h>
  26. #include <linux/init.h>
  27. #include <linux/ksm.h>
  28. #include <linux/rmap.h>
  29. #include <linux/security.h>
  30. #include <linux/backing-dev.h>
  31. #include <linux/mutex.h>
  32. #include <linux/capability.h>
  33. #include <linux/syscalls.h>
  34. #include <linux/memcontrol.h>
  35. #include <linux/poll.h>
  36. #include <linux/oom.h>
  37. #include <linux/swapfile.h>
  38. #include <linux/export.h>
  39. #include <linux/sort.h>
  40. #include <linux/completion.h>
  41. #include <linux/suspend.h>
  42. #include <linux/zswap.h>
  43. #include <linux/plist.h>
  44. #include <asm/tlbflush.h>
  45. #include <linux/leafops.h>
  46. #include <linux/swap_cgroup.h>
  47. #include "swap_table.h"
  48. #include "internal.h"
  49. #include "swap_table.h"
  50. #include "swap.h"
  51. static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
  52. unsigned char);
  53. static void free_swap_count_continuations(struct swap_info_struct *);
  54. static void swap_range_alloc(struct swap_info_struct *si,
  55. unsigned int nr_entries);
  56. static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr);
  57. static void swap_put_entry_locked(struct swap_info_struct *si,
  58. struct swap_cluster_info *ci,
  59. unsigned long offset);
  60. static bool folio_swapcache_freeable(struct folio *folio);
  61. static void move_cluster(struct swap_info_struct *si,
  62. struct swap_cluster_info *ci, struct list_head *list,
  63. enum swap_cluster_flags new_flags);
  /* Global swap spinlock; the comments below say which objects it protects. */
  static DEFINE_SPINLOCK(swap_lock);
  /* NOTE(review): presumably the number of swap_info[] slots in use - confirm. */
  static unsigned int nr_swapfiles;
  atomic_long_t nr_swap_pages;
  /*
   * Some modules use swappable objects and may try to swap them out under
   * memory pressure (via the shrinker). Before doing so, they may wish to
   * check whether any swap space is available, so the counter is exported.
   */
  EXPORT_SYMBOL_GPL(nr_swap_pages);
  /* Protected by swap_lock. Reading it in vm_swap_full() does not need the lock. */
  long total_swap_pages;
  /* NOTE(review): presumably the default swap priority - confirm against swapon. */
  #define DEF_SWAP_PRIO -1
  unsigned long swapfile_maximum_size;
  #ifdef CONFIG_MIGRATION
  bool swap_migration_ad_supported;
  #endif /* CONFIG_MIGRATION */
  /* Message prefixes for bad swap entries (note the trailing space in each). */
  static const char Bad_file[] = "Bad swap file entry ";
  static const char Bad_offset[] = "Bad swap offset entry ";
  /*
   * All active swap_info_structs.
   * Protected by swap_lock and ordered by priority.
   */
  static PLIST_HEAD(swap_active_head);
  /*
   * All available (active, not full) swap_info_structs.
   * Protected by swap_avail_lock and ordered by priority.
   *
   * folio_alloc_swap() uses this list instead of swap_active_head because
   * swap_active_head includes every swap_info_struct, and folio_alloc_swap()
   * does not need to look at full ones.
   *
   * The list has its own lock rather than sharing swap_lock: when a
   * swap_info_struct changes between not-full and full it must add or remove
   * itself here while swap_info_struct->lock is held, and the locking order
   * requires swap_lock to be taken before any swap_info_struct->lock.
   */
  static PLIST_HEAD(swap_avail_head);
  static DEFINE_SPINLOCK(swap_avail_lock);
  /* Table of swap_info_struct pointers indexed by swap type (see swap_type_to_info()). */
  struct swap_info_struct *swap_info[MAX_SWAPFILES];
  /* Slab cache used for swap tables when SWP_TABLE_USE_PAGE is false. */
  static struct kmem_cache *swap_table_cachep;
  static DEFINE_MUTEX(swapon_mutex);
  static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
  /* Activity counter to indicate that a swapon or swapoff has occurred. */
  static atomic_t proc_poll_event = ATOMIC_INIT(0);
  atomic_t nr_rotate_swap = ATOMIC_INIT(0);
  /*
   * Per-CPU swap allocator state. si[] and offset[] each have one slot per
   * allocation order (SWAP_NR_ORDERS entries); lock is the local lock the
   * swap allocator holds while it works on percpu clusters.
   * NOTE(review): presumably offset[order] is the next swap offset to try on
   * si[order] - confirm against the allocator.
   */
  struct percpu_swap_cluster {
      struct swap_info_struct *si[SWAP_NR_ORDERS];
      unsigned long offset[SWAP_NR_ORDERS];
      local_lock_t lock;
  };
  /* The per-CPU instance: si[] starts out NULL and the local lock is initialised. */
  static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
      .si = { NULL },
      .offset = { SWAP_ENTRY_INVALID },
      .lock = INIT_LOCAL_LOCK(),
  };
  118. /* May return NULL on invalid type, caller must check for NULL return */
  119. static struct swap_info_struct *swap_type_to_info(int type)
  120. {
  121. if (type >= MAX_SWAPFILES)
  122. return NULL;
  123. return READ_ONCE(swap_info[type]); /* rcu_dereference() */
  124. }
  125. /* May return NULL on invalid entry, caller must check for NULL return */
  126. static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry)
  127. {
  128. return swap_type_to_info(swp_type(entry));
  129. }
  130. /*
  131. * Use the second highest bit of inuse_pages counter as the indicator
  132. * if one swap device is on the available plist, so the atomic can
  133. * still be updated arithmetically while having special data embedded.
  134. *
  135. * inuse_pages counter is the only thing indicating if a device should
  136. * be on avail_lists or not (except swapon / swapoff). By embedding the
  137. * off-list bit in the atomic counter, updates no longer need any lock
  138. * to check the list status.
  139. *
  140. * This bit will be set if the device is not on the plist and not
  141. * usable, will be cleared if the device is on the plist.
  142. */
  143. #define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
  144. #define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
  145. static long swap_usage_in_pages(struct swap_info_struct *si)
  146. {
  147. return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
  148. }
  /*
   * Flag bits for the @flags argument of __try_to_reclaim_swap(); they may be
   * OR-ed together and any satisfied condition makes the folio a candidate.
   */
  /* Reclaim the swap entry anyway, if possible. */
  #define TTRS_ANYWAY 0x1
  /*
   * Reclaim the swap entry if there are no more mappings of the
   * corresponding page (checked with folio_mapped()).
   */
  #define TTRS_UNMAPPED 0x2
  /* Reclaim the swap entry if swap is getting full (mem_cgroup_swap_full()). */
  #define TTRS_FULL 0x4
  158. static bool swap_only_has_cache(struct swap_info_struct *si,
  159. struct swap_cluster_info *ci,
  160. unsigned long offset, int nr_pages)
  161. {
  162. unsigned int ci_off = offset % SWAPFILE_CLUSTER;
  163. unsigned char *map = si->swap_map + offset;
  164. unsigned char *map_end = map + nr_pages;
  165. unsigned long swp_tb;
  166. do {
  167. swp_tb = __swap_table_get(ci, ci_off);
  168. VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb));
  169. if (*map)
  170. return false;
  171. ++ci_off;
  172. } while (++map < map_end);
  173. return true;
  174. }
  175. /*
  176. * returns number of pages in the folio that backs the swap entry. If positive,
  177. * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
  178. * folio was associated with the swap entry.
  179. */
  180. static int __try_to_reclaim_swap(struct swap_info_struct *si,
  181. unsigned long offset, unsigned long flags)
  182. {
  183. const swp_entry_t entry = swp_entry(si->type, offset);
  184. struct swap_cluster_info *ci;
  185. struct folio *folio;
  186. int ret, nr_pages;
  187. bool need_reclaim;
  188. again:
  189. folio = swap_cache_get_folio(entry);
  190. if (!folio)
  191. return 0;
  192. nr_pages = folio_nr_pages(folio);
  193. ret = -nr_pages;
  194. /*
  195. * We hold a folio lock here. We have to use trylock for
  196. * avoiding deadlock. This is a special case and you should
  197. * use folio_free_swap() with explicit folio_lock() in usual
  198. * operations.
  199. */
  200. if (!folio_trylock(folio))
  201. goto out;
  202. /*
  203. * Offset could point to the middle of a large folio, or folio
  204. * may no longer point to the expected offset before it's locked.
  205. */
  206. if (!folio_matches_swap_entry(folio, entry)) {
  207. folio_unlock(folio);
  208. folio_put(folio);
  209. goto again;
  210. }
  211. offset = swp_offset(folio->swap);
  212. need_reclaim = ((flags & TTRS_ANYWAY) ||
  213. ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
  214. ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
  215. if (!need_reclaim || !folio_swapcache_freeable(folio))
  216. goto out_unlock;
  217. /*
  218. * It's safe to delete the folio from swap cache only if the folio
  219. * is in swap cache with swap count == 0. The slots have no page table
  220. * reference or pending writeback, and can't be allocated to others.
  221. */
  222. ci = swap_cluster_lock(si, offset);
  223. need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages);
  224. swap_cluster_unlock(ci);
  225. if (!need_reclaim)
  226. goto out_unlock;
  227. swap_cache_del_folio(folio);
  228. folio_set_dirty(folio);
  229. ret = nr_pages;
  230. out_unlock:
  231. folio_unlock(folio);
  232. out:
  233. folio_put(folio);
  234. return ret;
  235. }
  236. static inline struct swap_extent *first_se(struct swap_info_struct *sis)
  237. {
  238. struct rb_node *rb = rb_first(&sis->swap_extent_root);
  239. return rb_entry(rb, struct swap_extent, rb_node);
  240. }
  241. static inline struct swap_extent *next_se(struct swap_extent *se)
  242. {
  243. struct rb_node *rb = rb_next(&se->rb_node);
  244. return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
  245. }
  246. /*
  247. * swapon tell device that all the old swap contents can be discarded,
  248. * to allow the swap device to optimize its wear-levelling.
  249. */
  250. static int discard_swap(struct swap_info_struct *si)
  251. {
  252. struct swap_extent *se;
  253. sector_t start_block;
  254. sector_t nr_blocks;
  255. int err = 0;
  256. /* Do not discard the swap header page! */
  257. se = first_se(si);
  258. start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
  259. nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
  260. if (nr_blocks) {
  261. err = blkdev_issue_discard(si->bdev, start_block,
  262. nr_blocks, GFP_KERNEL);
  263. if (err)
  264. return err;
  265. cond_resched();
  266. }
  267. for (se = next_se(se); se; se = next_se(se)) {
  268. start_block = se->start_block << (PAGE_SHIFT - 9);
  269. nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
  270. err = blkdev_issue_discard(si->bdev, start_block,
  271. nr_blocks, GFP_KERNEL);
  272. if (err)
  273. break;
  274. cond_resched();
  275. }
  276. return err; /* That will often be -EOPNOTSUPP */
  277. }
  278. static struct swap_extent *
  279. offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
  280. {
  281. struct swap_extent *se;
  282. struct rb_node *rb;
  283. rb = sis->swap_extent_root.rb_node;
  284. while (rb) {
  285. se = rb_entry(rb, struct swap_extent, rb_node);
  286. if (offset < se->start_page)
  287. rb = rb->rb_left;
  288. else if (offset >= se->start_page + se->nr_pages)
  289. rb = rb->rb_right;
  290. else
  291. return se;
  292. }
  293. /* It *must* be present */
  294. BUG();
  295. }
  296. sector_t swap_folio_sector(struct folio *folio)
  297. {
  298. struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
  299. struct swap_extent *se;
  300. sector_t sector;
  301. pgoff_t offset;
  302. offset = swp_offset(folio->swap);
  303. se = offset_to_swap_extent(sis, offset);
  304. sector = se->start_block + (offset - se->start_page);
  305. return sector << (PAGE_SHIFT - 9);
  306. }
  307. /*
  308. * swap allocation tell device that a cluster of swap can now be discarded,
  309. * to allow the swap device to optimize its wear-levelling.
  310. */
  311. static void discard_swap_cluster(struct swap_info_struct *si,
  312. pgoff_t start_page, pgoff_t nr_pages)
  313. {
  314. struct swap_extent *se = offset_to_swap_extent(si, start_page);
  315. while (nr_pages) {
  316. pgoff_t offset = start_page - se->start_page;
  317. sector_t start_block = se->start_block + offset;
  318. sector_t nr_blocks = se->nr_pages - offset;
  319. if (nr_blocks > nr_pages)
  320. nr_blocks = nr_pages;
  321. start_page += nr_blocks;
  322. nr_pages -= nr_blocks;
  323. start_block <<= PAGE_SHIFT - 9;
  324. nr_blocks <<= PAGE_SHIFT - 9;
  325. if (blkdev_issue_discard(si->bdev, start_block,
  326. nr_blocks, GFP_NOIO))
  327. break;
  328. se = next_se(se);
  329. }
  330. }
  /*
   * NOTE(review): presumably the number of loop iterations allowed between
   * reschedule points in long scans - its users are outside this part of the
   * file, confirm there.
   */
  #define LATENCY_LIMIT 256
  332. static inline bool cluster_is_empty(struct swap_cluster_info *info)
  333. {
  334. return info->count == 0;
  335. }
  336. static inline bool cluster_is_discard(struct swap_cluster_info *info)
  337. {
  338. return info->flags == CLUSTER_FLAG_DISCARD;
  339. }
  340. static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
  341. {
  342. return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
  343. }
  344. static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
  345. {
  346. if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
  347. return false;
  348. if (!cluster_table_is_alloced(ci))
  349. return false;
  350. if (!order)
  351. return true;
  352. return cluster_is_empty(ci) || order == ci->order;
  353. }
  354. static inline unsigned int cluster_index(struct swap_info_struct *si,
  355. struct swap_cluster_info *ci)
  356. {
  357. return ci - si->cluster_info;
  358. }
  359. static inline unsigned int cluster_offset(struct swap_info_struct *si,
  360. struct swap_cluster_info *ci)
  361. {
  362. return cluster_index(si, ci) * SWAPFILE_CLUSTER;
  363. }
  364. static struct swap_table *swap_table_alloc(gfp_t gfp)
  365. {
  366. struct folio *folio;
  367. if (!SWP_TABLE_USE_PAGE)
  368. return kmem_cache_zalloc(swap_table_cachep, gfp);
  369. folio = folio_alloc(gfp | __GFP_ZERO, 0);
  370. if (folio)
  371. return folio_address(folio);
  372. return NULL;
  373. }
  374. static void swap_table_free_folio_rcu_cb(struct rcu_head *head)
  375. {
  376. struct folio *folio;
  377. folio = page_folio(container_of(head, struct page, rcu_head));
  378. folio_put(folio);
  379. }
  380. static void swap_table_free(struct swap_table *table)
  381. {
  382. if (!SWP_TABLE_USE_PAGE) {
  383. kmem_cache_free(swap_table_cachep, table);
  384. return;
  385. }
  386. call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head),
  387. swap_table_free_folio_rcu_cb);
  388. }
  389. static void swap_cluster_free_table(struct swap_cluster_info *ci)
  390. {
  391. unsigned int ci_off;
  392. struct swap_table *table;
  393. /* Only empty cluster's table is allow to be freed */
  394. lockdep_assert_held(&ci->lock);
  395. VM_WARN_ON_ONCE(!cluster_is_empty(ci));
  396. for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++)
  397. VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off)));
  398. table = (void *)rcu_dereference_protected(ci->table, true);
  399. rcu_assign_pointer(ci->table, NULL);
  400. swap_table_free(table);
  401. }
  402. /*
  403. * Allocate swap table for one cluster. Attempt an atomic allocation first,
  404. * then fallback to sleeping allocation.
  405. */
  406. static struct swap_cluster_info *
  407. swap_cluster_alloc_table(struct swap_info_struct *si,
  408. struct swap_cluster_info *ci)
  409. {
  410. struct swap_table *table;
  411. /*
  412. * Only cluster isolation from the allocator does table allocation.
  413. * Swap allocator uses percpu clusters and holds the local lock.
  414. */
  415. lockdep_assert_held(&ci->lock);
  416. lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
  417. /* The cluster must be free and was just isolated from the free list. */
  418. VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
  419. table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
  420. if (table) {
  421. rcu_assign_pointer(ci->table, table);
  422. return ci;
  423. }
  424. /*
  425. * Try a sleep allocation. Each isolated free cluster may cause
  426. * a sleep allocation, but there is a limited number of them, so
  427. * the potential recursive allocation is limited.
  428. */
  429. spin_unlock(&ci->lock);
  430. if (!(si->flags & SWP_SOLIDSTATE))
  431. spin_unlock(&si->global_cluster_lock);
  432. local_unlock(&percpu_swap_cluster.lock);
  433. table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);
  434. /*
  435. * Back to atomic context. We might have migrated to a new CPU with a
  436. * usable percpu cluster. But just keep using the isolated cluster to
  437. * make things easier. Migration indicates a slight change of workload
  438. * so using a new free cluster might not be a bad idea, and the worst
  439. * could happen with ignoring the percpu cluster is fragmentation,
  440. * which is acceptable since this fallback and race is rare.
  441. */
  442. local_lock(&percpu_swap_cluster.lock);
  443. if (!(si->flags & SWP_SOLIDSTATE))
  444. spin_lock(&si->global_cluster_lock);
  445. spin_lock(&ci->lock);
  446. /* Nothing except this helper should touch a dangling empty cluster. */
  447. if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) {
  448. if (table)
  449. swap_table_free(table);
  450. return ci;
  451. }
  452. if (!table) {
  453. move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
  454. spin_unlock(&ci->lock);
  455. return NULL;
  456. }
  457. rcu_assign_pointer(ci->table, table);
  458. return ci;
  459. }
  460. static void move_cluster(struct swap_info_struct *si,
  461. struct swap_cluster_info *ci, struct list_head *list,
  462. enum swap_cluster_flags new_flags)
  463. {
  464. VM_WARN_ON(ci->flags == new_flags);
  465. BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
  466. lockdep_assert_held(&ci->lock);
  467. spin_lock(&si->lock);
  468. if (ci->flags == CLUSTER_FLAG_NONE)
  469. list_add_tail(&ci->list, list);
  470. else
  471. list_move_tail(&ci->list, list);
  472. spin_unlock(&si->lock);
  473. ci->flags = new_flags;
  474. }
  475. /* Add a cluster to discard list and schedule it to do discard */
  476. static void swap_cluster_schedule_discard(struct swap_info_struct *si,
  477. struct swap_cluster_info *ci)
  478. {
  479. VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
  480. move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
  481. schedule_work(&si->discard_work);
  482. }
  483. static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
  484. {
  485. swap_cluster_free_table(ci);
  486. move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
  487. ci->order = 0;
  488. }
  489. /*
  490. * Isolate and lock the first cluster that is not contented on a list,
  491. * clean its flag before taken off-list. Cluster flag must be in sync
  492. * with list status, so cluster updaters can always know the cluster
  493. * list status without touching si lock.
  494. *
  495. * Note it's possible that all clusters on a list are contented so
  496. * this returns NULL for an non-empty list.
  497. */
  498. static struct swap_cluster_info *isolate_lock_cluster(
  499. struct swap_info_struct *si, struct list_head *list)
  500. {
  501. struct swap_cluster_info *ci, *found = NULL;
  502. spin_lock(&si->lock);
  503. list_for_each_entry(ci, list, list) {
  504. if (!spin_trylock(&ci->lock))
  505. continue;
  506. /* We may only isolate and clear flags of following lists */
  507. VM_BUG_ON(!ci->flags);
  508. VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
  509. ci->flags != CLUSTER_FLAG_FULL);
  510. list_del(&ci->list);
  511. ci->flags = CLUSTER_FLAG_NONE;
  512. found = ci;
  513. break;
  514. }
  515. spin_unlock(&si->lock);
  516. if (found && !cluster_table_is_alloced(found)) {
  517. /* Only an empty free cluster's swap table can be freed. */
  518. VM_WARN_ON_ONCE(list != &si->free_clusters);
  519. VM_WARN_ON_ONCE(!cluster_is_empty(found));
  520. return swap_cluster_alloc_table(si, found);
  521. }
  522. return found;
  523. }
  524. /*
  525. * Doing discard actually. After a cluster discard is finished, the cluster
  526. * will be added to free cluster list. Discard cluster is a bit special as
  527. * they don't participate in allocation or reclaim, so clusters marked as
  528. * CLUSTER_FLAG_DISCARD must remain off-list or on discard list.
  529. */
  530. static bool swap_do_scheduled_discard(struct swap_info_struct *si)
  531. {
  532. struct swap_cluster_info *ci;
  533. bool ret = false;
  534. unsigned int idx;
  535. spin_lock(&si->lock);
  536. while (!list_empty(&si->discard_clusters)) {
  537. ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
  538. /*
  539. * Delete the cluster from list to prepare for discard, but keep
  540. * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be
  541. * pointing to it, or ran into by relocate_cluster.
  542. */
  543. list_del(&ci->list);
  544. idx = cluster_index(si, ci);
  545. spin_unlock(&si->lock);
  546. discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
  547. SWAPFILE_CLUSTER);
  548. spin_lock(&ci->lock);
  549. /*
  550. * Discard is done, clear its flags as it's off-list, then
  551. * return the cluster to allocation list.
  552. */
  553. ci->flags = CLUSTER_FLAG_NONE;
  554. __free_cluster(si, ci);
  555. spin_unlock(&ci->lock);
  556. ret = true;
  557. spin_lock(&si->lock);
  558. }
  559. spin_unlock(&si->lock);
  560. return ret;
  561. }
  562. static void swap_discard_work(struct work_struct *work)
  563. {
  564. struct swap_info_struct *si;
  565. si = container_of(work, struct swap_info_struct, discard_work);
  566. swap_do_scheduled_discard(si);
  567. }
  568. static void swap_users_ref_free(struct percpu_ref *ref)
  569. {
  570. struct swap_info_struct *si;
  571. si = container_of(ref, struct swap_info_struct, users);
  572. complete(&si->comp);
  573. }
  574. /*
  575. * Must be called after freeing if ci->count == 0, moves the cluster to free
  576. * or discard list.
  577. */
  578. static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
  579. {
  580. VM_BUG_ON(ci->count != 0);
  581. VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
  582. lockdep_assert_held(&ci->lock);
  583. /*
  584. * If the swap is discardable, prepare discard the cluster
  585. * instead of free it immediately. The cluster will be freed
  586. * after discard.
  587. */
  588. if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
  589. (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
  590. swap_cluster_schedule_discard(si, ci);
  591. return;
  592. }
  593. __free_cluster(si, ci);
  594. }
  595. /*
  596. * Must be called after freeing if ci->count != 0, moves the cluster to
  597. * nonfull list.
  598. */
  599. static void partial_free_cluster(struct swap_info_struct *si,
  600. struct swap_cluster_info *ci)
  601. {
  602. VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
  603. lockdep_assert_held(&ci->lock);
  604. if (ci->flags != CLUSTER_FLAG_NONFULL)
  605. move_cluster(si, ci, &si->nonfull_clusters[ci->order],
  606. CLUSTER_FLAG_NONFULL);
  607. }
  608. /*
  609. * Must be called after allocation, moves the cluster to full or frag list.
  610. * Note: allocation doesn't acquire si lock, and may drop the ci lock for
  611. * reclaim, so the cluster could be any where when called.
  612. */
  613. static void relocate_cluster(struct swap_info_struct *si,
  614. struct swap_cluster_info *ci)
  615. {
  616. lockdep_assert_held(&ci->lock);
  617. /* Discard cluster must remain off-list or on discard list */
  618. if (cluster_is_discard(ci))
  619. return;
  620. if (!ci->count) {
  621. if (ci->flags != CLUSTER_FLAG_FREE)
  622. free_cluster(si, ci);
  623. } else if (ci->count != SWAPFILE_CLUSTER) {
  624. if (ci->flags != CLUSTER_FLAG_FRAG)
  625. move_cluster(si, ci, &si->frag_clusters[ci->order],
  626. CLUSTER_FLAG_FRAG);
  627. } else {
  628. if (ci->flags != CLUSTER_FLAG_FULL)
  629. move_cluster(si, ci, &si->full_clusters,
  630. CLUSTER_FLAG_FULL);
  631. }
  632. }
  633. /*
  634. * The cluster corresponding to @offset will be accounted as having one bad
  635. * slot. The cluster will not be added to the free cluster list, and its
  636. * usage counter will be increased by 1. Only used for initialization.
  637. */
  638. static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info,
  639. unsigned long offset)
  640. {
  641. unsigned long idx = offset / SWAPFILE_CLUSTER;
  642. struct swap_table *table;
  643. struct swap_cluster_info *ci;
  644. ci = cluster_info + idx;
  645. if (!ci->table) {
  646. table = swap_table_alloc(GFP_KERNEL);
  647. if (!table)
  648. return -ENOMEM;
  649. rcu_assign_pointer(ci->table, table);
  650. }
  651. ci->count++;
  652. WARN_ON(ci->count > SWAPFILE_CLUSTER);
  653. WARN_ON(ci->flags);
  654. return 0;
  655. }
  656. /*
  657. * Reclaim drops the ci lock, so the cluster may become unusable (freed or
  658. * stolen by a lower order). @usable will be set to false if that happens.
  659. */
  660. static bool cluster_reclaim_range(struct swap_info_struct *si,
  661. struct swap_cluster_info *ci,
  662. unsigned long start, unsigned int order,
  663. bool *usable)
  664. {
  665. unsigned int nr_pages = 1 << order;
  666. unsigned long offset = start, end = start + nr_pages;
  667. unsigned char *map = si->swap_map;
  668. unsigned long swp_tb;
  669. spin_unlock(&ci->lock);
  670. do {
  671. if (READ_ONCE(map[offset]))
  672. break;
  673. swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
  674. if (swp_tb_is_folio(swp_tb)) {
  675. if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0)
  676. break;
  677. }
  678. } while (++offset < end);
  679. spin_lock(&ci->lock);
  680. /*
  681. * We just dropped ci->lock so cluster could be used by another
  682. * order or got freed, check if it's still usable or empty.
  683. */
  684. if (!cluster_is_usable(ci, order)) {
  685. *usable = false;
  686. return false;
  687. }
  688. *usable = true;
  689. /* Fast path, no need to scan if the whole cluster is empty */
  690. if (cluster_is_empty(ci))
  691. return true;
  692. /*
  693. * Recheck the range no matter reclaim succeeded or not, the slot
  694. * could have been be freed while we are not holding the lock.
  695. */
  696. for (offset = start; offset < end; offset++) {
  697. swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
  698. if (map[offset] || !swp_tb_is_null(swp_tb))
  699. return false;
  700. }
  701. return true;
  702. }
  703. static bool cluster_scan_range(struct swap_info_struct *si,
  704. struct swap_cluster_info *ci,
  705. unsigned long offset, unsigned int nr_pages,
  706. bool *need_reclaim)
  707. {
  708. unsigned long end = offset + nr_pages;
  709. unsigned char *map = si->swap_map;
  710. unsigned long swp_tb;
  711. if (cluster_is_empty(ci))
  712. return true;
  713. do {
  714. if (map[offset])
  715. return false;
  716. swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
  717. if (swp_tb_is_folio(swp_tb)) {
  718. if (!vm_swap_full())
  719. return false;
  720. *need_reclaim = true;
  721. } else {
  722. /* A entry with no count and no cache must be null */
  723. VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
  724. }
  725. } while (++offset < end);
  726. return true;
  727. }
  728. /*
  729. * Currently, the swap table is not used for count tracking, just
  730. * do a sanity check here to ensure nothing leaked, so the swap
  731. * table should be empty upon freeing.
  732. */
  733. static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci,
  734. unsigned int start, unsigned int nr)
  735. {
  736. unsigned int ci_off = start % SWAPFILE_CLUSTER;
  737. unsigned int ci_end = ci_off + nr;
  738. unsigned long swp_tb;
  739. if (IS_ENABLED(CONFIG_DEBUG_VM)) {
  740. do {
  741. swp_tb = __swap_table_get(ci, ci_off);
  742. VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
  743. } while (++ci_off < ci_end);
  744. }
  745. }
  746. static bool cluster_alloc_range(struct swap_info_struct *si,
  747. struct swap_cluster_info *ci,
  748. struct folio *folio,
  749. unsigned int offset)
  750. {
  751. unsigned long nr_pages;
  752. unsigned int order;
  753. lockdep_assert_held(&ci->lock);
  754. if (!(si->flags & SWP_WRITEOK))
  755. return false;
  756. /*
  757. * All mm swap allocation starts with a folio (folio_alloc_swap),
  758. * it's also the only allocation path for large orders allocation.
  759. * Such swap slots starts with count == 0 and will be increased
  760. * upon folio unmap.
  761. *
  762. * Else, it's a exclusive order 0 allocation for hibernation.
  763. * The slot starts with count == 1 and never increases.
  764. */
  765. if (likely(folio)) {
  766. order = folio_order(folio);
  767. nr_pages = 1 << order;
  768. __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset));
  769. } else if (IS_ENABLED(CONFIG_HIBERNATION)) {
  770. order = 0;
  771. nr_pages = 1;
  772. WARN_ON_ONCE(si->swap_map[offset]);
  773. si->swap_map[offset] = 1;
  774. swap_cluster_assert_table_empty(ci, offset, 1);
  775. } else {
  776. /* Allocation without folio is only possible with hibernation */
  777. WARN_ON_ONCE(1);
  778. return false;
  779. }
  780. /*
  781. * The first allocation in a cluster makes the
  782. * cluster exclusive to this order
  783. */
  784. if (cluster_is_empty(ci))
  785. ci->order = order;
  786. ci->count += nr_pages;
  787. swap_range_alloc(si, nr_pages);
  788. return true;
  789. }
  790. /* Try use a new cluster for current CPU and allocate from it. */
  791. static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
  792. struct swap_cluster_info *ci,
  793. struct folio *folio, unsigned long offset)
  794. {
  795. unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
  796. unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
  797. unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
  798. unsigned int order = likely(folio) ? folio_order(folio) : 0;
  799. unsigned int nr_pages = 1 << order;
  800. bool need_reclaim, ret, usable;
  801. lockdep_assert_held(&ci->lock);
  802. VM_WARN_ON(!cluster_is_usable(ci, order));
  803. if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
  804. goto out;
  805. for (end -= nr_pages; offset <= end; offset += nr_pages) {
  806. need_reclaim = false;
  807. if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
  808. continue;
  809. if (need_reclaim) {
  810. ret = cluster_reclaim_range(si, ci, offset, order, &usable);
  811. if (!usable)
  812. goto out;
  813. if (cluster_is_empty(ci))
  814. offset = start;
  815. /* Reclaim failed but cluster is usable, try next */
  816. if (!ret)
  817. continue;
  818. }
  819. if (!cluster_alloc_range(si, ci, folio, offset))
  820. break;
  821. found = offset;
  822. offset += nr_pages;
  823. if (ci->count < SWAPFILE_CLUSTER && offset <= end)
  824. next = offset;
  825. break;
  826. }
  827. out:
  828. relocate_cluster(si, ci);
  829. swap_cluster_unlock(ci);
  830. if (si->flags & SWP_SOLIDSTATE) {
  831. this_cpu_write(percpu_swap_cluster.offset[order], next);
  832. this_cpu_write(percpu_swap_cluster.si[order], si);
  833. } else {
  834. si->global_cluster->next[order] = next;
  835. }
  836. return found;
  837. }
  838. static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
  839. struct list_head *list,
  840. struct folio *folio,
  841. bool scan_all)
  842. {
  843. unsigned int found = SWAP_ENTRY_INVALID;
  844. do {
  845. struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
  846. unsigned long offset;
  847. if (!ci)
  848. break;
  849. offset = cluster_offset(si, ci);
  850. found = alloc_swap_scan_cluster(si, ci, folio, offset);
  851. if (found)
  852. break;
  853. } while (scan_all);
  854. return found;
  855. }
  856. static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
  857. {
  858. long to_scan = 1;
  859. unsigned long offset, end;
  860. struct swap_cluster_info *ci;
  861. unsigned char *map = si->swap_map;
  862. int nr_reclaim;
  863. if (force)
  864. to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
  865. while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
  866. offset = cluster_offset(si, ci);
  867. end = min(si->max, offset + SWAPFILE_CLUSTER);
  868. to_scan--;
  869. while (offset < end) {
  870. if (!READ_ONCE(map[offset]) &&
  871. swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) {
  872. spin_unlock(&ci->lock);
  873. nr_reclaim = __try_to_reclaim_swap(si, offset,
  874. TTRS_ANYWAY);
  875. spin_lock(&ci->lock);
  876. if (nr_reclaim) {
  877. offset += abs(nr_reclaim);
  878. continue;
  879. }
  880. }
  881. offset++;
  882. }
  883. /* in case no swap cache is reclaimed */
  884. if (ci->flags == CLUSTER_FLAG_NONE)
  885. relocate_cluster(si, ci);
  886. swap_cluster_unlock(ci);
  887. if (to_scan <= 0)
  888. break;
  889. }
  890. }
  891. static void swap_reclaim_work(struct work_struct *work)
  892. {
  893. struct swap_info_struct *si;
  894. si = container_of(work, struct swap_info_struct, reclaim_work);
  895. swap_reclaim_full_clusters(si, true);
  896. }
  897. /*
  898. * Try to allocate swap entries with specified order and try set a new
  899. * cluster for current CPU too.
  900. */
  901. static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
  902. struct folio *folio)
  903. {
  904. struct swap_cluster_info *ci;
  905. unsigned int order = likely(folio) ? folio_order(folio) : 0;
  906. unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
  907. /*
  908. * Swapfile is not block device so unable
  909. * to allocate large entries.
  910. */
  911. if (order && !(si->flags & SWP_BLKDEV))
  912. return 0;
  913. if (!(si->flags & SWP_SOLIDSTATE)) {
  914. /* Serialize HDD SWAP allocation for each device. */
  915. spin_lock(&si->global_cluster_lock);
  916. offset = si->global_cluster->next[order];
  917. if (offset == SWAP_ENTRY_INVALID)
  918. goto new_cluster;
  919. ci = swap_cluster_lock(si, offset);
  920. /* Cluster could have been used by another order */
  921. if (cluster_is_usable(ci, order)) {
  922. if (cluster_is_empty(ci))
  923. offset = cluster_offset(si, ci);
  924. found = alloc_swap_scan_cluster(si, ci, folio, offset);
  925. } else {
  926. swap_cluster_unlock(ci);
  927. }
  928. if (found)
  929. goto done;
  930. }
  931. new_cluster:
  932. /*
  933. * If the device need discard, prefer new cluster over nonfull
  934. * to spread out the writes.
  935. */
  936. if (si->flags & SWP_PAGE_DISCARD) {
  937. found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
  938. if (found)
  939. goto done;
  940. }
  941. if (order < PMD_ORDER) {
  942. found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true);
  943. if (found)
  944. goto done;
  945. }
  946. if (!(si->flags & SWP_PAGE_DISCARD)) {
  947. found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
  948. if (found)
  949. goto done;
  950. }
  951. /* Try reclaim full clusters if free and nonfull lists are drained */
  952. if (vm_swap_full())
  953. swap_reclaim_full_clusters(si, false);
  954. if (order < PMD_ORDER) {
  955. /*
  956. * Scan only one fragment cluster is good enough. Order 0
  957. * allocation will surely success, and large allocation
  958. * failure is not critical. Scanning one cluster still
  959. * keeps the list rotated and reclaimed (for clean swap cache).
  960. */
  961. found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false);
  962. if (found)
  963. goto done;
  964. }
  965. if (order)
  966. goto done;
  967. /* Order 0 stealing from higher order */
  968. for (int o = 1; o < SWAP_NR_ORDERS; o++) {
  969. /*
  970. * Clusters here have at least one usable slots and can't fail order 0
  971. * allocation, but reclaim may drop si->lock and race with another user.
  972. */
  973. found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true);
  974. if (found)
  975. goto done;
  976. found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true);
  977. if (found)
  978. goto done;
  979. }
  980. done:
  981. if (!(si->flags & SWP_SOLIDSTATE))
  982. spin_unlock(&si->global_cluster_lock);
  983. return found;
  984. }
  985. /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
  986. static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
  987. {
  988. unsigned long pages;
  989. spin_lock(&swap_avail_lock);
  990. if (swapoff) {
  991. /*
  992. * Forcefully remove it. Clear the SWP_WRITEOK flags for
  993. * swapoff here so it's synchronized by both si->lock and
  994. * swap_avail_lock, to ensure the result can be seen by
  995. * add_to_avail_list.
  996. */
  997. lockdep_assert_held(&si->lock);
  998. si->flags &= ~SWP_WRITEOK;
  999. atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
  1000. } else {
  1001. /*
  1002. * If not called by swapoff, take it off-list only if it's
  1003. * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
  1004. * si->inuse_pages == pages), any concurrent slot freeing,
  1005. * or device already removed from plist by someone else
  1006. * will make this return false.
  1007. */
  1008. pages = si->pages;
  1009. if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
  1010. pages | SWAP_USAGE_OFFLIST_BIT))
  1011. goto skip;
  1012. }
  1013. plist_del(&si->avail_list, &swap_avail_head);
  1014. skip:
  1015. spin_unlock(&swap_avail_lock);
  1016. }
  1017. /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
  1018. static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
  1019. {
  1020. long val;
  1021. unsigned long pages;
  1022. spin_lock(&swap_avail_lock);
  1023. /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
  1024. if (swapon) {
  1025. lockdep_assert_held(&si->lock);
  1026. si->flags |= SWP_WRITEOK;
  1027. } else {
  1028. if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
  1029. goto skip;
  1030. }
  1031. if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
  1032. goto skip;
  1033. val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
  1034. /*
  1035. * When device is full and device is on the plist, only one updater will
  1036. * see (inuse_pages == si->pages) and will call del_from_avail_list. If
  1037. * that updater happen to be here, just skip adding.
  1038. */
  1039. pages = si->pages;
  1040. if (val == pages) {
  1041. /* Just like the cmpxchg in del_from_avail_list */
  1042. if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
  1043. pages | SWAP_USAGE_OFFLIST_BIT))
  1044. goto skip;
  1045. }
  1046. plist_add(&si->avail_list, &swap_avail_head);
  1047. skip:
  1048. spin_unlock(&swap_avail_lock);
  1049. }
  1050. /*
  1051. * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
  1052. * within each cluster, so the total contribution to the global counter should
  1053. * always be positive and cannot exceed the total number of usable slots.
  1054. */
  1055. static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
  1056. {
  1057. long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
  1058. /*
  1059. * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
  1060. * remove it from the plist.
  1061. */
  1062. if (unlikely(val == si->pages)) {
  1063. del_from_avail_list(si, false);
  1064. return true;
  1065. }
  1066. return false;
  1067. }
  1068. static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
  1069. {
  1070. long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
  1071. /*
  1072. * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
  1073. * add it to the plist.
  1074. */
  1075. if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
  1076. add_to_avail_list(si, false);
  1077. }
  1078. static void swap_range_alloc(struct swap_info_struct *si,
  1079. unsigned int nr_entries)
  1080. {
  1081. if (swap_usage_add(si, nr_entries)) {
  1082. if (vm_swap_full())
  1083. schedule_work(&si->reclaim_work);
  1084. }
  1085. atomic_long_sub(nr_entries, &nr_swap_pages);
  1086. }
  1087. static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
  1088. unsigned int nr_entries)
  1089. {
  1090. unsigned long begin = offset;
  1091. unsigned long end = offset + nr_entries - 1;
  1092. void (*swap_slot_free_notify)(struct block_device *, unsigned long);
  1093. unsigned int i;
  1094. /*
  1095. * Use atomic clear_bit operations only on zeromap instead of non-atomic
  1096. * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
  1097. */
  1098. for (i = 0; i < nr_entries; i++) {
  1099. clear_bit(offset + i, si->zeromap);
  1100. zswap_invalidate(swp_entry(si->type, offset + i));
  1101. }
  1102. if (si->flags & SWP_BLKDEV)
  1103. swap_slot_free_notify =
  1104. si->bdev->bd_disk->fops->swap_slot_free_notify;
  1105. else
  1106. swap_slot_free_notify = NULL;
  1107. while (offset <= end) {
  1108. arch_swap_invalidate_page(si->type, offset);
  1109. if (swap_slot_free_notify)
  1110. swap_slot_free_notify(si->bdev, offset);
  1111. offset++;
  1112. }
  1113. __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries);
  1114. /*
  1115. * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
  1116. * only after the above cleanups are done.
  1117. */
  1118. smp_wmb();
  1119. atomic_long_add(nr_entries, &nr_swap_pages);
  1120. swap_usage_sub(si, nr_entries);
  1121. }
  1122. static bool get_swap_device_info(struct swap_info_struct *si)
  1123. {
  1124. if (!percpu_ref_tryget_live(&si->users))
  1125. return false;
  1126. /*
  1127. * Guarantee the si->users are checked before accessing other
  1128. * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is
  1129. * up to dated.
  1130. *
  1131. * Paired with the spin_unlock() after setup_swap_info() in
  1132. * enable_swap_info(), and smp_wmb() in swapoff.
  1133. */
  1134. smp_rmb();
  1135. return true;
  1136. }
  1137. /*
  1138. * Fast path try to get swap entries with specified order from current
  1139. * CPU's swap entry pool (a cluster).
  1140. */
  1141. static bool swap_alloc_fast(struct folio *folio)
  1142. {
  1143. unsigned int order = folio_order(folio);
  1144. struct swap_cluster_info *ci;
  1145. struct swap_info_struct *si;
  1146. unsigned int offset;
  1147. /*
  1148. * Once allocated, swap_info_struct will never be completely freed,
  1149. * so checking it's liveness by get_swap_device_info is enough.
  1150. */
  1151. si = this_cpu_read(percpu_swap_cluster.si[order]);
  1152. offset = this_cpu_read(percpu_swap_cluster.offset[order]);
  1153. if (!si || !offset || !get_swap_device_info(si))
  1154. return false;
  1155. ci = swap_cluster_lock(si, offset);
  1156. if (cluster_is_usable(ci, order)) {
  1157. if (cluster_is_empty(ci))
  1158. offset = cluster_offset(si, ci);
  1159. alloc_swap_scan_cluster(si, ci, folio, offset);
  1160. } else {
  1161. swap_cluster_unlock(ci);
  1162. }
  1163. put_swap_device(si);
  1164. return folio_test_swapcache(folio);
  1165. }
  1166. /* Rotate the device and switch to a new cluster */
  1167. static void swap_alloc_slow(struct folio *folio)
  1168. {
  1169. struct swap_info_struct *si, *next;
  1170. spin_lock(&swap_avail_lock);
  1171. start_over:
  1172. plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
  1173. /* Rotate the device and switch to a new cluster */
  1174. plist_requeue(&si->avail_list, &swap_avail_head);
  1175. spin_unlock(&swap_avail_lock);
  1176. if (get_swap_device_info(si)) {
  1177. cluster_alloc_swap_entry(si, folio);
  1178. put_swap_device(si);
  1179. if (folio_test_swapcache(folio))
  1180. return;
  1181. if (folio_test_large(folio))
  1182. return;
  1183. }
  1184. spin_lock(&swap_avail_lock);
  1185. /*
  1186. * if we got here, it's likely that si was almost full before,
  1187. * multiple callers probably all tried to get a page from the
  1188. * same si and it filled up before we could get one; or, the si
  1189. * filled up between us dropping swap_avail_lock.
  1190. * Since we dropped the swap_avail_lock, the swap_avail_list
  1191. * may have been modified; so if next is still in the
  1192. * swap_avail_head list then try it, otherwise start over if we
  1193. * have not gotten any slots.
  1194. */
  1195. if (plist_node_empty(&next->avail_list))
  1196. goto start_over;
  1197. }
  1198. spin_unlock(&swap_avail_lock);
  1199. }
  1200. /*
  1201. * Discard pending clusters in a synchronized way when under high pressure.
  1202. * Return: true if any cluster is discarded.
  1203. */
  1204. static bool swap_sync_discard(void)
  1205. {
  1206. bool ret = false;
  1207. struct swap_info_struct *si, *next;
  1208. spin_lock(&swap_lock);
  1209. start_over:
  1210. plist_for_each_entry_safe(si, next, &swap_active_head, list) {
  1211. spin_unlock(&swap_lock);
  1212. if (get_swap_device_info(si)) {
  1213. if (si->flags & SWP_PAGE_DISCARD)
  1214. ret = swap_do_scheduled_discard(si);
  1215. put_swap_device(si);
  1216. }
  1217. if (ret)
  1218. return true;
  1219. spin_lock(&swap_lock);
  1220. if (plist_node_empty(&next->list))
  1221. goto start_over;
  1222. }
  1223. spin_unlock(&swap_lock);
  1224. return false;
  1225. }
  1226. /**
  1227. * swap_put_entries_cluster - Decrease the swap count of a set of slots.
  1228. * @si: The swap device.
  1229. * @start: start offset of slots.
  1230. * @nr: number of slots.
  1231. * @reclaim_cache: if true, also reclaim the swap cache.
  1232. *
  1233. * This helper decreases the swap count of a set of slots and tries to
  1234. * batch free them. Also reclaims the swap cache if @reclaim_cache is true.
  1235. * Context: The caller must ensure that all slots belong to the same
  1236. * cluster and their swap count doesn't go underflow.
  1237. */
  1238. static void swap_put_entries_cluster(struct swap_info_struct *si,
  1239. unsigned long start, int nr,
  1240. bool reclaim_cache)
  1241. {
  1242. unsigned long offset = start, end = start + nr;
  1243. unsigned long batch_start = SWAP_ENTRY_INVALID;
  1244. struct swap_cluster_info *ci;
  1245. bool need_reclaim = false;
  1246. unsigned int nr_reclaimed;
  1247. unsigned long swp_tb;
  1248. unsigned int count;
  1249. ci = swap_cluster_lock(si, offset);
  1250. do {
  1251. swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
  1252. count = si->swap_map[offset];
  1253. VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD);
  1254. if (count == 1) {
  1255. /* count == 1 and non-cached slots will be batch freed. */
  1256. if (!swp_tb_is_folio(swp_tb)) {
  1257. if (!batch_start)
  1258. batch_start = offset;
  1259. continue;
  1260. }
  1261. /* count will be 0 after put, slot can be reclaimed */
  1262. need_reclaim = true;
  1263. }
  1264. /*
  1265. * A count != 1 or cached slot can't be freed. Put its swap
  1266. * count and then free the interrupted pending batch. Cached
  1267. * slots will be freed when folio is removed from swap cache
  1268. * (__swap_cache_del_folio).
  1269. */
  1270. swap_put_entry_locked(si, ci, offset);
  1271. if (batch_start) {
  1272. swap_entries_free(si, ci, batch_start, offset - batch_start);
  1273. batch_start = SWAP_ENTRY_INVALID;
  1274. }
  1275. } while (++offset < end);
  1276. if (batch_start)
  1277. swap_entries_free(si, ci, batch_start, offset - batch_start);
  1278. swap_cluster_unlock(ci);
  1279. if (!need_reclaim || !reclaim_cache)
  1280. return;
  1281. offset = start;
  1282. do {
  1283. nr_reclaimed = __try_to_reclaim_swap(si, offset,
  1284. TTRS_UNMAPPED | TTRS_FULL);
  1285. offset++;
  1286. if (nr_reclaimed)
  1287. offset = round_up(offset, abs(nr_reclaimed));
  1288. } while (offset < end);
  1289. }
  1290. /**
  1291. * folio_alloc_swap - allocate swap space for a folio
  1292. * @folio: folio we want to move to swap
  1293. *
  1294. * Allocate swap space for the folio and add the folio to the
  1295. * swap cache.
  1296. *
  1297. * Context: Caller needs to hold the folio lock.
  1298. * Return: Whether the folio was added to the swap cache.
  1299. */
  1300. int folio_alloc_swap(struct folio *folio)
  1301. {
  1302. unsigned int order = folio_order(folio);
  1303. unsigned int size = 1 << order;
  1304. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  1305. VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
  1306. if (order) {
  1307. /*
  1308. * Reject large allocation when THP_SWAP is disabled,
  1309. * the caller should split the folio and try again.
  1310. */
  1311. if (!IS_ENABLED(CONFIG_THP_SWAP))
  1312. return -EAGAIN;
  1313. /*
  1314. * Allocation size should never exceed cluster size
  1315. * (HPAGE_PMD_SIZE).
  1316. */
  1317. if (size > SWAPFILE_CLUSTER) {
  1318. VM_WARN_ON_ONCE(1);
  1319. return -EINVAL;
  1320. }
  1321. }
  1322. again:
  1323. local_lock(&percpu_swap_cluster.lock);
  1324. if (!swap_alloc_fast(folio))
  1325. swap_alloc_slow(folio);
  1326. local_unlock(&percpu_swap_cluster.lock);
  1327. if (!order && unlikely(!folio_test_swapcache(folio))) {
  1328. if (swap_sync_discard())
  1329. goto again;
  1330. }
  1331. /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
  1332. if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap)))
  1333. swap_cache_del_folio(folio);
  1334. if (unlikely(!folio_test_swapcache(folio)))
  1335. return -ENOMEM;
  1336. return 0;
  1337. }
  1338. /**
  1339. * folio_dup_swap() - Increase swap count of swap entries of a folio.
  1340. * @folio: folio with swap entries bounded.
  1341. * @subpage: if not NULL, only increase the swap count of this subpage.
  1342. *
  1343. * Typically called when the folio is unmapped and have its swap entry to
  1344. * take its palce.
  1345. *
  1346. * Context: Caller must ensure the folio is locked and in the swap cache.
  1347. * NOTE: The caller also has to ensure there is no raced call to
  1348. * swap_put_entries_direct on its swap entry before this helper returns, or
  1349. * the swap map may underflow. Currently, we only accept @subpage == NULL
  1350. * for shmem due to the limitation of swap continuation: shmem always
  1351. * duplicates the swap entry only once, so there is no such issue for it.
  1352. */
  1353. int folio_dup_swap(struct folio *folio, struct page *subpage)
  1354. {
  1355. int err = 0;
  1356. swp_entry_t entry = folio->swap;
  1357. unsigned long nr_pages = folio_nr_pages(folio);
  1358. VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
  1359. VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio);
  1360. if (subpage) {
  1361. entry.val += folio_page_idx(folio, subpage);
  1362. nr_pages = 1;
  1363. }
  1364. while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM)
  1365. err = add_swap_count_continuation(entry, GFP_ATOMIC);
  1366. return err;
  1367. }
  1368. /**
  1369. * folio_put_swap() - Decrease swap count of swap entries of a folio.
  1370. * @folio: folio with swap entries bounded, must be in swap cache and locked.
  1371. * @subpage: if not NULL, only decrease the swap count of this subpage.
  1372. *
  1373. * This won't free the swap slots even if swap count drops to zero, they are
  1374. * still pinned by the swap cache. User may call folio_free_swap to free them.
  1375. * Context: Caller must ensure the folio is locked and in the swap cache.
  1376. */
  1377. void folio_put_swap(struct folio *folio, struct page *subpage)
  1378. {
  1379. swp_entry_t entry = folio->swap;
  1380. unsigned long nr_pages = folio_nr_pages(folio);
  1381. struct swap_info_struct *si = __swap_entry_to_info(entry);
  1382. VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
  1383. VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio);
  1384. if (subpage) {
  1385. entry.val += folio_page_idx(folio, subpage);
  1386. nr_pages = 1;
  1387. }
  1388. swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false);
  1389. }
  1390. static void swap_put_entry_locked(struct swap_info_struct *si,
  1391. struct swap_cluster_info *ci,
  1392. unsigned long offset)
  1393. {
  1394. unsigned char count;
  1395. count = si->swap_map[offset];
  1396. if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
  1397. if (count == COUNT_CONTINUED) {
  1398. if (swap_count_continued(si, offset, count))
  1399. count = SWAP_MAP_MAX | COUNT_CONTINUED;
  1400. else
  1401. count = SWAP_MAP_MAX;
  1402. } else
  1403. count--;
  1404. }
  1405. WRITE_ONCE(si->swap_map[offset], count);
  1406. if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))
  1407. swap_entries_free(si, ci, offset, 1);
  1408. }
  1409. /*
  1410. * When we get a swap entry, if there aren't some other ways to
  1411. * prevent swapoff, such as the folio in swap cache is locked, RCU
  1412. * reader side is locked, etc., the swap entry may become invalid
  1413. * because of swapoff. Then, we need to enclose all swap related
  1414. * functions with get_swap_device() and put_swap_device(), unless the
  1415. * swap functions call get/put_swap_device() by themselves.
  1416. *
  1417. * RCU reader side lock (including any spinlock) is sufficient to
  1418. * prevent swapoff, because synchronize_rcu() is called in swapoff()
  1419. * before freeing data structures.
  1420. *
  1421. * Check whether swap entry is valid in the swap device. If so,
  1422. * return pointer to swap_info_struct, and keep the swap entry valid
  1423. * via preventing the swap device from being swapoff, until
  1424. * put_swap_device() is called. Otherwise return NULL.
  1425. *
  1426. * Notice that swapoff or swapoff+swapon can still happen before the
  1427. * percpu_ref_tryget_live() in get_swap_device() or after the
  1428. * percpu_ref_put() in put_swap_device() if there isn't any other way
  1429. * to prevent swapoff. The caller must be prepared for that. For
  1430. * example, the following situation is possible.
  1431. *
  1432. * CPU1 CPU2
  1433. * do_swap_page()
  1434. * ... swapoff+swapon
  1435. * swap_cache_alloc_folio()
  1436. * swap_cache_add_folio()
  1437. * // check swap_map
  1438. * // verify PTE not changed
  1439. *
  1440. * In __swap_duplicate(), the swap_map need to be checked before
  1441. * changing partly because the specified swap entry may be for another
  1442. * swap device which has been swapoff. And in do_swap_page(), after
  1443. * the page is read from the swap device, the PTE is verified not
  1444. * changed with the page table locked to check whether the swap device
  1445. * has been swapoff or swapoff+swapon.
  1446. */
  1447. struct swap_info_struct *get_swap_device(swp_entry_t entry)
  1448. {
  1449. struct swap_info_struct *si;
  1450. unsigned long offset;
  1451. if (!entry.val)
  1452. goto out;
  1453. si = swap_entry_to_info(entry);
  1454. if (!si)
  1455. goto bad_nofile;
  1456. if (!get_swap_device_info(si))
  1457. goto out;
  1458. offset = swp_offset(entry);
  1459. if (offset >= si->max)
  1460. goto put_out;
  1461. return si;
  1462. bad_nofile:
  1463. pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
  1464. out:
  1465. return NULL;
  1466. put_out:
  1467. pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
  1468. percpu_ref_put(&si->users);
  1469. return NULL;
  1470. }
  1471. /*
  1472. * Drop the last ref of swap entries, caller have to ensure all entries
  1473. * belong to the same cgroup and cluster.
  1474. */
  1475. void swap_entries_free(struct swap_info_struct *si,
  1476. struct swap_cluster_info *ci,
  1477. unsigned long offset, unsigned int nr_pages)
  1478. {
  1479. swp_entry_t entry = swp_entry(si->type, offset);
  1480. unsigned char *map = si->swap_map + offset;
  1481. unsigned char *map_end = map + nr_pages;
  1482. /* It should never free entries across different clusters */
  1483. VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1));
  1484. VM_BUG_ON(cluster_is_empty(ci));
  1485. VM_BUG_ON(ci->count < nr_pages);
  1486. ci->count -= nr_pages;
  1487. do {
  1488. VM_WARN_ON(*map > 1);
  1489. *map = 0;
  1490. } while (++map < map_end);
  1491. mem_cgroup_uncharge_swap(entry, nr_pages);
  1492. swap_range_free(si, offset, nr_pages);
  1493. swap_cluster_assert_table_empty(ci, offset, nr_pages);
  1494. if (!ci->count)
  1495. free_cluster(si, ci);
  1496. else
  1497. partial_free_cluster(si, ci);
  1498. }
  1499. int __swap_count(swp_entry_t entry)
  1500. {
  1501. struct swap_info_struct *si = __swap_entry_to_info(entry);
  1502. pgoff_t offset = swp_offset(entry);
  1503. return si->swap_map[offset];
  1504. }
  1505. /**
  1506. * swap_entry_swapped - Check if the swap entry is swapped.
  1507. * @si: the swap device.
  1508. * @entry: the swap entry.
  1509. */
  1510. bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
  1511. {
  1512. pgoff_t offset = swp_offset(entry);
  1513. struct swap_cluster_info *ci;
  1514. int count;
  1515. ci = swap_cluster_lock(si, offset);
  1516. count = si->swap_map[offset];
  1517. swap_cluster_unlock(ci);
  1518. return count && count != SWAP_MAP_BAD;
  1519. }
  1520. /*
  1521. * How many references to @entry are currently swapped out?
  1522. * This considers COUNT_CONTINUED so it returns exact answer.
  1523. */
  1524. int swp_swapcount(swp_entry_t entry)
  1525. {
  1526. int count, tmp_count, n;
  1527. struct swap_info_struct *si;
  1528. struct swap_cluster_info *ci;
  1529. struct page *page;
  1530. pgoff_t offset;
  1531. unsigned char *map;
  1532. si = get_swap_device(entry);
  1533. if (!si)
  1534. return 0;
  1535. offset = swp_offset(entry);
  1536. ci = swap_cluster_lock(si, offset);
  1537. count = si->swap_map[offset];
  1538. if (!(count & COUNT_CONTINUED))
  1539. goto out;
  1540. count &= ~COUNT_CONTINUED;
  1541. n = SWAP_MAP_MAX + 1;
  1542. page = vmalloc_to_page(si->swap_map + offset);
  1543. offset &= ~PAGE_MASK;
  1544. VM_BUG_ON(page_private(page) != SWP_CONTINUED);
  1545. do {
  1546. page = list_next_entry(page, lru);
  1547. map = kmap_local_page(page);
  1548. tmp_count = map[offset];
  1549. kunmap_local(map);
  1550. count += (tmp_count & ~COUNT_CONTINUED) * n;
  1551. n *= (SWAP_CONT_MAX + 1);
  1552. } while (tmp_count & COUNT_CONTINUED);
  1553. out:
  1554. swap_cluster_unlock(ci);
  1555. put_swap_device(si);
  1556. return count;
  1557. }
  1558. static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
  1559. swp_entry_t entry, int order)
  1560. {
  1561. struct swap_cluster_info *ci;
  1562. unsigned char *map = si->swap_map;
  1563. unsigned int nr_pages = 1 << order;
  1564. unsigned long roffset = swp_offset(entry);
  1565. unsigned long offset = round_down(roffset, nr_pages);
  1566. int i;
  1567. bool ret = false;
  1568. ci = swap_cluster_lock(si, offset);
  1569. if (nr_pages == 1) {
  1570. if (map[roffset])
  1571. ret = true;
  1572. goto unlock_out;
  1573. }
  1574. for (i = 0; i < nr_pages; i++) {
  1575. if (map[offset + i]) {
  1576. ret = true;
  1577. break;
  1578. }
  1579. }
  1580. unlock_out:
  1581. swap_cluster_unlock(ci);
  1582. return ret;
  1583. }
  1584. static bool folio_swapped(struct folio *folio)
  1585. {
  1586. swp_entry_t entry = folio->swap;
  1587. struct swap_info_struct *si;
  1588. VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
  1589. VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
  1590. si = __swap_entry_to_info(entry);
  1591. if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
  1592. return swap_entry_swapped(si, entry);
  1593. return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
  1594. }
  1595. static bool folio_swapcache_freeable(struct folio *folio)
  1596. {
  1597. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  1598. if (!folio_test_swapcache(folio))
  1599. return false;
  1600. if (folio_test_writeback(folio))
  1601. return false;
  1602. /*
  1603. * Once hibernation has begun to create its image of memory,
  1604. * there's a danger that one of the calls to folio_free_swap()
  1605. * - most probably a call from __try_to_reclaim_swap() while
  1606. * hibernation is allocating its own swap pages for the image,
  1607. * but conceivably even a call from memory reclaim - will free
  1608. * the swap from a folio which has already been recorded in the
  1609. * image as a clean swapcache folio, and then reuse its swap for
  1610. * another page of the image. On waking from hibernation, the
  1611. * original folio might be freed under memory pressure, then
  1612. * later read back in from swap, now with the wrong data.
  1613. *
  1614. * Hibernation suspends storage while it is writing the image
  1615. * to disk so check that here.
  1616. */
  1617. if (pm_suspended_storage())
  1618. return false;
  1619. return true;
  1620. }
  1621. /**
  1622. * folio_free_swap() - Free the swap space used for this folio.
  1623. * @folio: The folio to remove.
  1624. *
  1625. * If swap is getting full, or if there are no more mappings of this folio,
  1626. * then call folio_free_swap to free its swap space.
  1627. *
  1628. * Return: true if we were able to release the swap space.
  1629. */
  1630. bool folio_free_swap(struct folio *folio)
  1631. {
  1632. if (!folio_swapcache_freeable(folio))
  1633. return false;
  1634. if (folio_swapped(folio))
  1635. return false;
  1636. swap_cache_del_folio(folio);
  1637. folio_set_dirty(folio);
  1638. return true;
  1639. }
  1640. /**
  1641. * swap_put_entries_direct() - Release reference on range of swap entries and
  1642. * reclaim their cache if no more references remain.
  1643. * @entry: First entry of range.
  1644. * @nr: Number of entries in range.
  1645. *
  1646. * For each swap entry in the contiguous range, release a reference. If any swap
  1647. * entries become free, try to reclaim their underlying folios, if present. The
  1648. * offset range is defined by [entry.offset, entry.offset + nr).
  1649. *
  1650. * Context: Caller must ensure there is no race condition on the reference
  1651. * owner. e.g., locking the PTL of a PTE containing the entry being released.
  1652. */
  1653. void swap_put_entries_direct(swp_entry_t entry, int nr)
  1654. {
  1655. const unsigned long start_offset = swp_offset(entry);
  1656. const unsigned long end_offset = start_offset + nr;
  1657. unsigned long offset, cluster_end;
  1658. struct swap_info_struct *si;
  1659. si = get_swap_device(entry);
  1660. if (WARN_ON_ONCE(!si))
  1661. return;
  1662. if (WARN_ON_ONCE(end_offset > si->max))
  1663. goto out;
  1664. /* Put entries and reclaim cache in each cluster */
  1665. offset = start_offset;
  1666. do {
  1667. cluster_end = min(round_up(offset + 1, SWAPFILE_CLUSTER), end_offset);
  1668. swap_put_entries_cluster(si, offset, cluster_end - offset, true);
  1669. offset = cluster_end;
  1670. } while (offset < end_offset);
  1671. out:
  1672. put_swap_device(si);
  1673. }
  1674. #ifdef CONFIG_HIBERNATION
  1675. /* Allocate a slot for hibernation */
  1676. swp_entry_t swap_alloc_hibernation_slot(int type)
  1677. {
  1678. struct swap_info_struct *si = swap_type_to_info(type);
  1679. unsigned long offset;
  1680. swp_entry_t entry = {0};
  1681. if (!si)
  1682. goto fail;
  1683. /* This is called for allocating swap entry, not cache */
  1684. if (get_swap_device_info(si)) {
  1685. if (si->flags & SWP_WRITEOK) {
  1686. /*
  1687. * Grab the local lock to be compliant
  1688. * with swap table allocation.
  1689. */
  1690. local_lock(&percpu_swap_cluster.lock);
  1691. offset = cluster_alloc_swap_entry(si, NULL);
  1692. local_unlock(&percpu_swap_cluster.lock);
  1693. if (offset)
  1694. entry = swp_entry(si->type, offset);
  1695. }
  1696. put_swap_device(si);
  1697. }
  1698. fail:
  1699. return entry;
  1700. }
  1701. /* Free a slot allocated by swap_alloc_hibernation_slot */
  1702. void swap_free_hibernation_slot(swp_entry_t entry)
  1703. {
  1704. struct swap_info_struct *si;
  1705. struct swap_cluster_info *ci;
  1706. pgoff_t offset = swp_offset(entry);
  1707. si = get_swap_device(entry);
  1708. if (WARN_ON(!si))
  1709. return;
  1710. ci = swap_cluster_lock(si, offset);
  1711. swap_put_entry_locked(si, ci, offset);
  1712. swap_cluster_unlock(ci);
  1713. /* In theory readahead might add it to the swap cache by accident */
  1714. __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
  1715. put_swap_device(si);
  1716. }
  1717. /*
  1718. * Find the swap type that corresponds to given device (if any).
  1719. *
  1720. * @offset - number of the PAGE_SIZE-sized block of the device, starting
  1721. * from 0, in which the swap header is expected to be located.
  1722. *
  1723. * This is needed for the suspend to disk (aka swsusp).
  1724. */
  1725. int swap_type_of(dev_t device, sector_t offset)
  1726. {
  1727. int type;
  1728. if (!device)
  1729. return -1;
  1730. spin_lock(&swap_lock);
  1731. for (type = 0; type < nr_swapfiles; type++) {
  1732. struct swap_info_struct *sis = swap_info[type];
  1733. if (!(sis->flags & SWP_WRITEOK))
  1734. continue;
  1735. if (device == sis->bdev->bd_dev) {
  1736. struct swap_extent *se = first_se(sis);
  1737. if (se->start_block == offset) {
  1738. spin_unlock(&swap_lock);
  1739. return type;
  1740. }
  1741. }
  1742. }
  1743. spin_unlock(&swap_lock);
  1744. return -ENODEV;
  1745. }
  1746. int find_first_swap(dev_t *device)
  1747. {
  1748. int type;
  1749. spin_lock(&swap_lock);
  1750. for (type = 0; type < nr_swapfiles; type++) {
  1751. struct swap_info_struct *sis = swap_info[type];
  1752. if (!(sis->flags & SWP_WRITEOK))
  1753. continue;
  1754. *device = sis->bdev->bd_dev;
  1755. spin_unlock(&swap_lock);
  1756. return type;
  1757. }
  1758. spin_unlock(&swap_lock);
  1759. return -ENODEV;
  1760. }
  1761. /*
  1762. * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
  1763. * corresponding to given index in swap_info (swap type).
  1764. */
  1765. sector_t swapdev_block(int type, pgoff_t offset)
  1766. {
  1767. struct swap_info_struct *si = swap_type_to_info(type);
  1768. struct swap_extent *se;
  1769. if (!si || !(si->flags & SWP_WRITEOK))
  1770. return 0;
  1771. se = offset_to_swap_extent(si, offset);
  1772. return se->start_block + (offset - se->start_page);
  1773. }
  1774. /*
  1775. * Return either the total number of swap pages of given type, or the number
  1776. * of free pages of that type (depending on @free)
  1777. *
  1778. * This is needed for software suspend
  1779. */
  1780. unsigned int count_swap_pages(int type, int free)
  1781. {
  1782. unsigned int n = 0;
  1783. spin_lock(&swap_lock);
  1784. if ((unsigned int)type < nr_swapfiles) {
  1785. struct swap_info_struct *sis = swap_info[type];
  1786. spin_lock(&sis->lock);
  1787. if (sis->flags & SWP_WRITEOK) {
  1788. n = sis->pages;
  1789. if (free)
  1790. n -= swap_usage_in_pages(sis);
  1791. }
  1792. spin_unlock(&sis->lock);
  1793. }
  1794. spin_unlock(&swap_lock);
  1795. return n;
  1796. }
  1797. #endif /* CONFIG_HIBERNATION */
  1798. static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
  1799. {
  1800. return pte_same(pte_swp_clear_flags(pte), swp_pte);
  1801. }
  /*
   * Replace the swap PTE for @entry at @addr with a mapping of @folio.
   *
   * No need to decide whether this PTE shares the swap entry with others,
   * just let do_wp_page work it out if a write is requested later - to
   * force COW, vm_page_prot omits write permission from any private vma.
   *
   * Returns 1 when a present PTE was installed, 0 when nothing was mapped
   * (the folio no longer matches @entry, the PTE changed under us, or a
   * poison marker was installed instead), and -ENOMEM when
   * ksm_might_need_to_copy() returns NULL.
   */
  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
  		unsigned long addr, swp_entry_t entry, struct folio *folio)
  {
  	struct page *page;
  	struct folio *swapcache;
  	spinlock_t *ptl;
  	pte_t *pte, new_pte, old_pte;
  	bool hwpoisoned = false;
  	int ret = 1;

  	/*
  	 * If the folio is removed from swap cache by others, continue to
  	 * unuse other PTEs. try_to_unuse may try again if we missed this one.
  	 */
  	if (!folio_matches_swap_entry(folio, entry))
  		return 0;

  	/* Remember the swap cache folio; @folio may become a KSM copy below. */
  	swapcache = folio;
  	folio = ksm_might_need_to_copy(folio, vma, addr);
  	if (unlikely(!folio))
  		return -ENOMEM;
  	else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
  		hwpoisoned = true;
  		folio = swapcache;
  	}

  	page = folio_file_page(folio, swp_offset(entry));
  	if (PageHWPoison(page))
  		hwpoisoned = true;

  	/* Take the PTE lock and make sure the PTE still holds @entry. */
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
  						swp_entry_to_pte(entry)))) {
  		ret = 0;
  		goto out;
  	}

  	old_pte = ptep_get(pte);

  	/*
  	 * Poisoned or not-uptodate data must not be mapped: install a marker
  	 * entry in place of the swap entry and report 0.
  	 */
  	if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
  		swp_entry_t swp_entry;

  		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
  		if (hwpoisoned) {
  			swp_entry = make_hwpoison_entry(page);
  		} else {
  			swp_entry = make_poisoned_swp_entry();
  		}
  		new_pte = swp_entry_to_pte(swp_entry);
  		ret = 0;
  		goto setpte;
  	}

  	/*
  	 * Some architectures may have to restore extra metadata to the page
  	 * when reading from swap. This metadata may be indexed by swap entry
  	 * so this must be called before folio_put_swap().
  	 */
  	arch_swap_restore(folio_swap(entry, folio), folio);

  	/* One swap entry becomes one anonymous page in this mm. */
  	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
  	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
  	folio_get(folio);
  	if (folio == swapcache) {
  		rmap_t rmap_flags = RMAP_NONE;

  		/*
  		 * See do_swap_page(): writeback would be problematic.
  		 * However, we do a folio_wait_writeback() just before this
  		 * call and have the folio locked.
  		 */
  		VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
  		if (pte_swp_exclusive(old_pte))
  			rmap_flags |= RMAP_EXCLUSIVE;
  		/*
  		 * We currently only expect small !anon folios, which are either
  		 * fully exclusive or fully shared. If we ever get large folios
  		 * here, we have to be careful.
  		 */
  		if (!folio_test_anon(folio)) {
  			VM_WARN_ON_ONCE(folio_test_large(folio));
  			VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
  			folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
  		} else {
  			folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
  		}
  	} else { /* ksm created a completely new copy */
  		folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
  		folio_add_lru_vma(folio, vma);
  	}
  	/* Build the present PTE, carrying over soft-dirty and uffd-wp state. */
  	new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
  	if (pte_swp_soft_dirty(old_pte))
  		new_pte = pte_mksoft_dirty(new_pte);
  	if (pte_swp_uffd_wp(old_pte))
  		new_pte = pte_mkuffd_wp(new_pte);
  setpte:
  	set_pte_at(vma->vm_mm, addr, pte, new_pte);
  	/* The PTE no longer references the swap entry: drop its swap count. */
  	folio_put_swap(swapcache, folio_file_page(swapcache, swp_offset(entry)));
  out:
  	if (pte)
  		pte_unmap_unlock(pte, ptl);
  	/* A KSM copy is unlocked and released here; swapcache is left to the caller. */
  	if (folio != swapcache) {
  		folio_unlock(folio);
  		folio_put(folio);
  	}
  	return ret;
  }
  1904. static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
  1905. unsigned long addr, unsigned long end,
  1906. unsigned int type)
  1907. {
  1908. pte_t *pte = NULL;
  1909. struct swap_info_struct *si;
  1910. si = swap_info[type];
  1911. do {
  1912. struct folio *folio;
  1913. unsigned long offset;
  1914. unsigned char swp_count;
  1915. softleaf_t entry;
  1916. int ret;
  1917. pte_t ptent;
  1918. if (!pte++) {
  1919. pte = pte_offset_map(pmd, addr);
  1920. if (!pte)
  1921. break;
  1922. }
  1923. ptent = ptep_get_lockless(pte);
  1924. entry = softleaf_from_pte(ptent);
  1925. if (!softleaf_is_swap(entry))
  1926. continue;
  1927. if (swp_type(entry) != type)
  1928. continue;
  1929. offset = swp_offset(entry);
  1930. pte_unmap(pte);
  1931. pte = NULL;
  1932. folio = swap_cache_get_folio(entry);
  1933. if (!folio) {
  1934. struct vm_fault vmf = {
  1935. .vma = vma,
  1936. .address = addr,
  1937. .real_address = addr,
  1938. .pmd = pmd,
  1939. };
  1940. folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  1941. &vmf);
  1942. }
  1943. if (!folio) {
  1944. swp_count = READ_ONCE(si->swap_map[offset]);
  1945. if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
  1946. continue;
  1947. return -ENOMEM;
  1948. }
  1949. folio_lock(folio);
  1950. folio_wait_writeback(folio);
  1951. ret = unuse_pte(vma, pmd, addr, entry, folio);
  1952. if (ret < 0) {
  1953. folio_unlock(folio);
  1954. folio_put(folio);
  1955. return ret;
  1956. }
  1957. folio_free_swap(folio);
  1958. folio_unlock(folio);
  1959. folio_put(folio);
  1960. } while (addr += PAGE_SIZE, addr != end);
  1961. if (pte)
  1962. pte_unmap(pte);
  1963. return 0;
  1964. }
  1965. static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
  1966. unsigned long addr, unsigned long end,
  1967. unsigned int type)
  1968. {
  1969. pmd_t *pmd;
  1970. unsigned long next;
  1971. int ret;
  1972. pmd = pmd_offset(pud, addr);
  1973. do {
  1974. cond_resched();
  1975. next = pmd_addr_end(addr, end);
  1976. ret = unuse_pte_range(vma, pmd, addr, next, type);
  1977. if (ret)
  1978. return ret;
  1979. } while (pmd++, addr = next, addr != end);
  1980. return 0;
  1981. }
  1982. static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
  1983. unsigned long addr, unsigned long end,
  1984. unsigned int type)
  1985. {
  1986. pud_t *pud;
  1987. unsigned long next;
  1988. int ret;
  1989. pud = pud_offset(p4d, addr);
  1990. do {
  1991. next = pud_addr_end(addr, end);
  1992. if (pud_none_or_clear_bad(pud))
  1993. continue;
  1994. ret = unuse_pmd_range(vma, pud, addr, next, type);
  1995. if (ret)
  1996. return ret;
  1997. } while (pud++, addr = next, addr != end);
  1998. return 0;
  1999. }
  2000. static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
  2001. unsigned long addr, unsigned long end,
  2002. unsigned int type)
  2003. {
  2004. p4d_t *p4d;
  2005. unsigned long next;
  2006. int ret;
  2007. p4d = p4d_offset(pgd, addr);
  2008. do {
  2009. next = p4d_addr_end(addr, end);
  2010. if (p4d_none_or_clear_bad(p4d))
  2011. continue;
  2012. ret = unuse_pud_range(vma, p4d, addr, next, type);
  2013. if (ret)
  2014. return ret;
  2015. } while (p4d++, addr = next, addr != end);
  2016. return 0;
  2017. }
  2018. static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
  2019. {
  2020. pgd_t *pgd;
  2021. unsigned long addr, end, next;
  2022. int ret;
  2023. addr = vma->vm_start;
  2024. end = vma->vm_end;
  2025. pgd = pgd_offset(vma->vm_mm, addr);
  2026. do {
  2027. next = pgd_addr_end(addr, end);
  2028. if (pgd_none_or_clear_bad(pgd))
  2029. continue;
  2030. ret = unuse_p4d_range(vma, pgd, addr, next, type);
  2031. if (ret)
  2032. return ret;
  2033. } while (pgd++, addr = next, addr != end);
  2034. return 0;
  2035. }
  2036. static int unuse_mm(struct mm_struct *mm, unsigned int type)
  2037. {
  2038. struct vm_area_struct *vma;
  2039. int ret = 0;
  2040. VMA_ITERATOR(vmi, mm, 0);
  2041. mmap_read_lock(mm);
  2042. if (check_stable_address_space(mm))
  2043. goto unlock;
  2044. for_each_vma(vmi, vma) {
  2045. if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
  2046. ret = unuse_vma(vma, type);
  2047. if (ret)
  2048. break;
  2049. }
  2050. cond_resched();
  2051. }
  2052. unlock:
  2053. mmap_read_unlock(mm);
  2054. return ret;
  2055. }
  2056. /*
  2057. * Scan swap_map from current position to next entry still in use.
  2058. * Return 0 if there are no inuse entries after prev till end of
  2059. * the map.
  2060. */
  2061. static unsigned int find_next_to_unuse(struct swap_info_struct *si,
  2062. unsigned int prev)
  2063. {
  2064. unsigned int i;
  2065. unsigned long swp_tb;
  2066. unsigned char count;
  2067. /*
  2068. * No need for swap_lock here: we're just looking
  2069. * for whether an entry is in use, not modifying it; false
  2070. * hits are okay, and sys_swapoff() has already prevented new
  2071. * allocations from this area (while holding swap_lock).
  2072. */
  2073. for (i = prev + 1; i < si->max; i++) {
  2074. count = READ_ONCE(si->swap_map[i]);
  2075. swp_tb = swap_table_get(__swap_offset_to_cluster(si, i),
  2076. i % SWAPFILE_CLUSTER);
  2077. if (count == SWAP_MAP_BAD)
  2078. continue;
  2079. if (count || swp_tb_is_folio(swp_tb))
  2080. break;
  2081. if ((i % LATENCY_LIMIT) == 0)
  2082. cond_resched();
  2083. }
  2084. if (i == si->max)
  2085. i = 0;
  2086. return i;
  2087. }
  /*
   * Empty swap area @type: pull its entries back via shmem_unuse(), then via
   * every mm on the mmlist, then drop leftover swap cache folios, and repeat
   * until swap_usage_in_pages() reports nothing in use.
   *
   * Returns 0 on success, the error from shmem_unuse() or unuse_mm(), or
   * -EINTR if a signal is pending while the area is still in use.
   */
  static int try_to_unuse(unsigned int type)
  {
  	struct mm_struct *prev_mm;
  	struct mm_struct *mm;
  	struct list_head *p;
  	int retval = 0;
  	struct swap_info_struct *si = swap_info[type];
  	struct folio *folio;
  	swp_entry_t entry;
  	unsigned int i;

  	if (!swap_usage_in_pages(si))
  		goto success;

  retry:
  	retval = shmem_unuse(type);
  	if (retval)
  		return retval;

  	/* Start from init_mm and hold a reference on the current list position. */
  	prev_mm = &init_mm;
  	mmget(prev_mm);

  	spin_lock(&mmlist_lock);
  	p = &init_mm.mmlist;
  	while (swap_usage_in_pages(si) &&
  	       !signal_pending(current) &&
  	       (p = p->next) != &init_mm.mmlist) {

  		mm = list_entry(p, struct mm_struct, mmlist);
  		if (!mmget_not_zero(mm))
  			continue;
  		/*
  		 * With a reference on mm taken, drop the list lock and release
  		 * the previous mm before doing the (sleeping) unuse work.
  		 */
  		spin_unlock(&mmlist_lock);
  		mmput(prev_mm);
  		prev_mm = mm;
  		retval = unuse_mm(mm, type);
  		if (retval) {
  			mmput(prev_mm);
  			return retval;
  		}

  		/*
  		 * Make sure that we aren't completely killing
  		 * interactive performance.
  		 */
  		cond_resched();
  		spin_lock(&mmlist_lock);
  	}
  	spin_unlock(&mmlist_lock);

  	mmput(prev_mm);

  	/* Second pass: free swap still held only by swap cache folios. */
  	i = 0;
  	while (swap_usage_in_pages(si) &&
  	       !signal_pending(current) &&
  	       (i = find_next_to_unuse(si, i)) != 0) {

  		entry = swp_entry(type, i);
  		folio = swap_cache_get_folio(entry);
  		if (!folio)
  			continue;

  		/*
  		 * It is conceivable that a racing task removed this folio from
  		 * swap cache just before we acquired the page lock. The folio
  		 * might even be back in swap cache on another swap area. But
  		 * that is okay, folio_free_swap() only removes stale folios.
  		 */
  		folio_lock(folio);
  		folio_wait_writeback(folio);
  		folio_free_swap(folio);
  		folio_unlock(folio);
  		folio_put(folio);
  	}

  	/*
  	 * Let's check again to see if there are still swap entries in the map.
  	 * If yes, we need to retry the unuse logic again.
  	 * Under global memory pressure, swap entries can be reinserted back
  	 * into process space after the mmlist loop above passes over them.
  	 *
  	 * Limit the number of retries? No: when mmget_not_zero()
  	 * above fails, that mm is likely to be freeing swap from
  	 * exit_mmap(), which proceeds at its own independent pace;
  	 * and even shmem_writeout() could have been preempted after
  	 * folio_alloc_swap(), temporarily hiding that swap. It's easy
  	 * and robust (though cpu-intensive) just to keep retrying.
  	 */
  	if (swap_usage_in_pages(si)) {
  		if (!signal_pending(current))
  			goto retry;
  		return -EINTR;
  	}

  success:
  	/*
  	 * Make sure that further cleanups after try_to_unuse() returns happen
  	 * after swap_range_free() reduces si->inuse_pages to 0.
  	 */
  	smp_mb();
  	return 0;
  }
  2177. /*
  2178. * After a successful try_to_unuse, if no swap is now in use, we know
  2179. * we can empty the mmlist. swap_lock must be held on entry and exit.
  2180. * Note that mmlist_lock nests inside swap_lock, and an mm must be
  2181. * added to the mmlist just after page_duplicate - before would be racy.
  2182. */
  2183. static void drain_mmlist(void)
  2184. {
  2185. struct list_head *p, *next;
  2186. unsigned int type;
  2187. for (type = 0; type < nr_swapfiles; type++)
  2188. if (swap_usage_in_pages(swap_info[type]))
  2189. return;
  2190. spin_lock(&mmlist_lock);
  2191. list_for_each_safe(p, next, &init_mm.mmlist)
  2192. list_del_init(p);
  2193. spin_unlock(&mmlist_lock);
  2194. }
  2195. /*
  2196. * Free all of a swapdev's extent information
  2197. */
  2198. static void destroy_swap_extents(struct swap_info_struct *sis)
  2199. {
  2200. while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
  2201. struct rb_node *rb = sis->swap_extent_root.rb_node;
  2202. struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
  2203. rb_erase(rb, &sis->swap_extent_root);
  2204. kfree(se);
  2205. }
  2206. if (sis->flags & SWP_ACTIVATED) {
  2207. struct file *swap_file = sis->swap_file;
  2208. struct address_space *mapping = swap_file->f_mapping;
  2209. sis->flags &= ~SWP_ACTIVATED;
  2210. if (mapping->a_ops->swap_deactivate)
  2211. mapping->a_ops->swap_deactivate(swap_file);
  2212. }
  2213. }
  2214. /*
  2215. * Add a block range (and the corresponding page range) into this swapdev's
  2216. * extent tree.
  2217. *
  2218. * This function rather assumes that it is called in ascending page order.
  2219. */
  2220. int
  2221. add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
  2222. unsigned long nr_pages, sector_t start_block)
  2223. {
  2224. struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
  2225. struct swap_extent *se;
  2226. struct swap_extent *new_se;
  2227. /*
  2228. * place the new node at the right most since the
  2229. * function is called in ascending page order.
  2230. */
  2231. while (*link) {
  2232. parent = *link;
  2233. link = &parent->rb_right;
  2234. }
  2235. if (parent) {
  2236. se = rb_entry(parent, struct swap_extent, rb_node);
  2237. BUG_ON(se->start_page + se->nr_pages != start_page);
  2238. if (se->start_block + se->nr_pages == start_block) {
  2239. /* Merge it */
  2240. se->nr_pages += nr_pages;
  2241. return 0;
  2242. }
  2243. }
  2244. /* No merge, insert a new extent. */
  2245. new_se = kmalloc_obj(*se);
  2246. if (new_se == NULL)
  2247. return -ENOMEM;
  2248. new_se->start_page = start_page;
  2249. new_se->nr_pages = nr_pages;
  2250. new_se->start_block = start_block;
  2251. rb_link_node(&new_se->rb_node, parent, link);
  2252. rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
  2253. return 1;
  2254. }
  2255. EXPORT_SYMBOL_GPL(add_swap_extent);
  2256. /*
  2257. * A `swap extent' is a simple thing which maps a contiguous range of pages
  2258. * onto a contiguous range of disk blocks. A rbtree of swap extents is
  2259. * built at swapon time and is then used at swap_writepage/swap_read_folio
  2260. * time for locating where on disk a page belongs.
  2261. *
  2262. * If the swapfile is an S_ISBLK block device, a single extent is installed.
  2263. * This is done so that the main operating code can treat S_ISBLK and S_ISREG
  2264. * swap files identically.
  2265. *
  2266. * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
  2267. * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
  2268. * swapfiles are handled *identically* after swapon time.
  2269. *
  2270. * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
  2271. * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray
  2272. * blocks are found which do not fall within the PAGE_SIZE alignment
  2273. * requirements, they are simply tossed out - we will never use those blocks
  2274. * for swapping.
  2275. *
  2276. * For all swap devices we set S_SWAPFILE across the life of the swapon. This
  2277. * prevents users from writing to the swap device, which will corrupt memory.
  2278. *
  2279. * The amount of disk space which a single swap extent represents varies.
  2280. * Typically it is in the 1-4 megabyte range. So we can have hundreds of
  2281. * extents in the rbtree. - akpm.
  2282. */
  2283. static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
  2284. {
  2285. struct file *swap_file = sis->swap_file;
  2286. struct address_space *mapping = swap_file->f_mapping;
  2287. struct inode *inode = mapping->host;
  2288. int ret;
  2289. if (S_ISBLK(inode->i_mode)) {
  2290. ret = add_swap_extent(sis, 0, sis->max, 0);
  2291. *span = sis->pages;
  2292. return ret;
  2293. }
  2294. if (mapping->a_ops->swap_activate) {
  2295. ret = mapping->a_ops->swap_activate(sis, swap_file, span);
  2296. if (ret < 0)
  2297. return ret;
  2298. sis->flags |= SWP_ACTIVATED;
  2299. if ((sis->flags & SWP_FS_OPS) &&
  2300. sio_pool_init() != 0) {
  2301. destroy_swap_extents(sis);
  2302. return -ENOMEM;
  2303. }
  2304. return ret;
  2305. }
  2306. return generic_swapfile_activate(sis, swap_file, span);
  2307. }
  2308. static void setup_swap_info(struct swap_info_struct *si, int prio,
  2309. unsigned char *swap_map,
  2310. struct swap_cluster_info *cluster_info,
  2311. unsigned long *zeromap)
  2312. {
  2313. si->prio = prio;
  2314. /*
  2315. * the plist prio is negated because plist ordering is
  2316. * low-to-high, while swap ordering is high-to-low
  2317. */
  2318. si->list.prio = -si->prio;
  2319. si->avail_list.prio = -si->prio;
  2320. si->swap_map = swap_map;
  2321. si->cluster_info = cluster_info;
  2322. si->zeromap = zeromap;
  2323. }
  2324. static void _enable_swap_info(struct swap_info_struct *si)
  2325. {
  2326. atomic_long_add(si->pages, &nr_swap_pages);
  2327. total_swap_pages += si->pages;
  2328. assert_spin_locked(&swap_lock);
  2329. plist_add(&si->list, &swap_active_head);
  2330. /* Add back to available list */
  2331. add_to_avail_list(si, true);
  2332. }
/*
 * Bring a newly set-up swap device online.
 *
 * Runs in three steps: (1) under swap_lock and si->lock, record @prio and
 * the per-device arrays via setup_swap_info(); (2) with neither lock held,
 * resurrect the si->users percpu reference; (3) under both locks again,
 * account the pages and add the device to the active and available lists
 * via _enable_swap_info().
 */
static void enable_swap_info(struct swap_info_struct *si, int prio,
			     unsigned char *swap_map,
			     struct swap_cluster_info *cluster_info,
			     unsigned long *zeromap)
{
	spin_lock(&swap_lock);
	spin_lock(&si->lock);
	setup_swap_info(si, prio, swap_map, cluster_info, zeromap);
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
	/*
	 * Finished initializing swap device, now it's safe to reference it.
	 */
	percpu_ref_resurrect(&si->users);
	spin_lock(&swap_lock);
	spin_lock(&si->lock);
	_enable_swap_info(si);
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
}
/*
 * Put @si back on the active and available lists, reusing the priority and
 * the swap_map/cluster_info/zeromap pointers it already holds.  swapoff
 * calls this when try_to_unuse() fails.  Unlike enable_swap_info(), the
 * si->users reference is not touched and both steps run under a single
 * swap_lock + si->lock section.
 */
static void reinsert_swap_info(struct swap_info_struct *si)
{
	spin_lock(&swap_lock);
	spin_lock(&si->lock);
	setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap);
	_enable_swap_info(si);
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
}
  2362. /*
  2363. * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range
  2364. * see the updated flags, so there will be no more allocations.
  2365. */
  2366. static void wait_for_allocation(struct swap_info_struct *si)
  2367. {
  2368. unsigned long offset;
  2369. unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
  2370. struct swap_cluster_info *ci;
  2371. BUG_ON(si->flags & SWP_WRITEOK);
  2372. for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
  2373. ci = swap_cluster_lock(si, offset);
  2374. swap_cluster_unlock(ci);
  2375. }
  2376. }
  2377. static void free_cluster_info(struct swap_cluster_info *cluster_info,
  2378. unsigned long maxpages)
  2379. {
  2380. struct swap_cluster_info *ci;
  2381. int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
  2382. if (!cluster_info)
  2383. return;
  2384. for (i = 0; i < nr_clusters; i++) {
  2385. ci = cluster_info + i;
  2386. /* Cluster with bad marks count will have a remaining table */
  2387. spin_lock(&ci->lock);
  2388. if (rcu_dereference_protected(ci->table, true)) {
  2389. ci->count = 0;
  2390. swap_cluster_free_table(ci);
  2391. }
  2392. spin_unlock(&ci->lock);
  2393. }
  2394. kvfree(cluster_info);
  2395. }
  2396. /*
  2397. * Called after swap device's reference count is dead, so
  2398. * neither scan nor allocation will use it.
  2399. */
  2400. static void flush_percpu_swap_cluster(struct swap_info_struct *si)
  2401. {
  2402. int cpu, i;
  2403. struct swap_info_struct **pcp_si;
  2404. for_each_possible_cpu(cpu) {
  2405. pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
  2406. /*
  2407. * Invalidate the percpu swap cluster cache, si->users
  2408. * is dead, so no new user will point to it, just flush
  2409. * any existing user.
  2410. */
  2411. for (i = 0; i < SWAP_NR_ORDERS; i++)
  2412. cmpxchg(&pcp_si[i], si, NULL);
  2413. }
  2414. }
/*
 * swapoff(2): stop swapping to the file or device named by @specialfile.
 *
 * Returns 0 on success, otherwise:
 *   -EPERM   the caller lacks CAP_SYS_ADMIN;
 *   (error)  whatever file_open_name() reports for the path;
 *   -EINVAL  no SWP_WRITEOK entry on swap_active_head shares the opened
 *            file's mapping;
 *   -ENOMEM  security_vm_enough_memory_mm() refuses p->pages;
 *   (error)  whatever try_to_unuse() returns; the device is then put
 *            back via reinsert_swap_info().
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	unsigned long *zeromap;
	struct swap_cluster_info *cluster_info;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	unsigned int maxpages;
	int err, found = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(!current->mm);

	/* "victim" is only used to identify the mapping; closed at out_dput. */
	CLASS(filename, pathname)(specialfile);
	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
	if (IS_ERR(victim))
		return PTR_ERR(victim);

	mapping = victim->f_mapping;
	spin_lock(&swap_lock);
	/* Find the active, writable swap area backed by the same mapping. */
	plist_for_each_entry(p, &swap_active_head, list) {
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping) {
				found = 1;
				break;
			}
		}
	}
	if (!found) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/*
	 * A zero return means the check passed; the p->pages it was asked
	 * about are handed straight back with vm_unacct_memory().
	 */
	if (!security_vm_enough_memory_mm(current->mm, p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/*
	 * Take the device off the available and active lists and remove its
	 * pages from the global counters.
	 */
	spin_lock(&p->lock);
	del_from_avail_list(p, true);
	plist_del(&p->list, &swap_active_head);
	atomic_long_sub(p->pages, &nr_swap_pages);
	total_swap_pages -= p->pages;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	/* BUG()s if SWP_WRITEOK is still set; see wait_for_allocation(). */
	wait_for_allocation(p);

	/* The OOM-origin marking on current brackets only try_to_unuse(). */
	set_current_oom_origin();
	err = try_to_unuse(p->type);
	clear_current_oom_origin();

	if (err) {
		/* re-insert swap space back into swap_list */
		reinsert_swap_info(p);
		goto out_dput;
	}

	/*
	 * Wait for swap operations protected by get/put_swap_device()
	 * to complete.  Because of synchronize_rcu() here, all swap
	 * operations protected by RCU reader side lock (including any
	 * spinlock) will be waited too.  This makes it easy to
	 * prevent folio_test_swapcache() and the following swap cache
	 * operations from racing with swapoff.
	 */
	percpu_ref_kill(&p->users);
	synchronize_rcu();
	wait_for_completion(&p->comp);

	/* Let queued discard/reclaim work finish before freeing anything. */
	flush_work(&p->discard_work);
	flush_work(&p->reclaim_work);
	flush_percpu_swap_cluster(p);

	destroy_swap_extents(p);
	if (p->flags & SWP_CONTINUED)
		free_swap_count_continuations(p);

	/* swapon incremented nr_rotate_swap for non-SWP_SOLIDSTATE devices. */
	if (!(p->flags & SWP_SOLIDSTATE))
		atomic_dec(&nr_rotate_swap);

	/*
	 * Detach the per-device pointers from *p under the locks; the
	 * detached objects are freed further down, after the locks are gone.
	 */
	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	spin_lock(&p->lock);
	drain_mmlist();

	swap_file = p->swap_file;
	p->swap_file = NULL;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	zeromap = p->zeromap;
	p->zeromap = NULL;
	maxpages = p->max;
	cluster_info = p->cluster_info;
	p->max = 0;
	p->cluster_info = NULL;
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);
	arch_swap_invalidate_area(p->type);
	zswap_swapoff(p->type);
	mutex_unlock(&swapon_mutex);
	kfree(p->global_cluster);
	p->global_cluster = NULL;
	vfree(swap_map);
	kvfree(zeromap);
	free_cluster_info(cluster_info, maxpages);
	/* Destroy swap account information */
	swap_cgroup_swapoff(p->type);

	/* Undo the S_SWAPFILE marking that swapon put on the inode. */
	inode = mapping->host;

	inode_lock(inode);
	inode->i_flags &= ~S_SWAPFILE;
	inode_unlock(inode);
	/* Close the file handle that the swap area itself was holding. */
	filp_close(swap_file, NULL);

	/*
	 * Clear the SWP_USED flag after all resources are freed so that swapon
	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
	 * not hold p->lock after we cleared its SWP_WRITEOK.
	 */
	spin_lock(&swap_lock);
	p->flags = 0;
	spin_unlock(&swap_lock);

	err = 0;
	/* Notify pollers of the "swaps" proc file that the set changed. */
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
	return err;
}
  2536. #ifdef CONFIG_PROC_FS
  2537. static __poll_t swaps_poll(struct file *file, poll_table *wait)
  2538. {
  2539. struct seq_file *seq = file->private_data;
  2540. poll_wait(file, &proc_poll_wait, wait);
  2541. if (seq->poll_event != atomic_read(&proc_poll_event)) {
  2542. seq->poll_event = atomic_read(&proc_poll_event);
  2543. return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
  2544. }
  2545. return EPOLLIN | EPOLLRDNORM;
  2546. }
  2547. /* iterator */
  2548. static void *swap_start(struct seq_file *swap, loff_t *pos)
  2549. {
  2550. struct swap_info_struct *si;
  2551. int type;
  2552. loff_t l = *pos;
  2553. mutex_lock(&swapon_mutex);
  2554. if (!l)
  2555. return SEQ_START_TOKEN;
  2556. for (type = 0; (si = swap_type_to_info(type)); type++) {
  2557. if (!(si->flags & SWP_USED) || !si->swap_map)
  2558. continue;
  2559. if (!--l)
  2560. return si;
  2561. }
  2562. return NULL;
  2563. }
  2564. static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
  2565. {
  2566. struct swap_info_struct *si = v;
  2567. int type;
  2568. if (v == SEQ_START_TOKEN)
  2569. type = 0;
  2570. else
  2571. type = si->type + 1;
  2572. ++(*pos);
  2573. for (; (si = swap_type_to_info(type)); type++) {
  2574. if (!(si->flags & SWP_USED) || !si->swap_map)
  2575. continue;
  2576. return si;
  2577. }
  2578. return NULL;
  2579. }
/* seq_file ->stop: release swapon_mutex, which swap_start() acquired. */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
  2584. static int swap_show(struct seq_file *swap, void *v)
  2585. {
  2586. struct swap_info_struct *si = v;
  2587. struct file *file;
  2588. int len;
  2589. unsigned long bytes, inuse;
  2590. if (si == SEQ_START_TOKEN) {
  2591. seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
  2592. return 0;
  2593. }
  2594. bytes = K(si->pages);
  2595. inuse = K(swap_usage_in_pages(si));
  2596. file = si->swap_file;
  2597. len = seq_file_path(swap, file, " \t\n\\");
  2598. seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
  2599. len < 40 ? 40 - len : 1, " ",
  2600. S_ISBLK(file_inode(file)->i_mode) ?
  2601. "partition" : "file\t",
  2602. bytes, bytes < 10000000 ? "\t" : "",
  2603. inuse, inuse < 10000000 ? "\t" : "",
  2604. si->prio);
  2605. return 0;
  2606. }
/* seq_file iterator callbacks; installed by swaps_open() via seq_open(). */
static const struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};
  2613. static int swaps_open(struct inode *inode, struct file *file)
  2614. {
  2615. struct seq_file *seq;
  2616. int ret;
  2617. ret = seq_open(file, &swaps_op);
  2618. if (ret)
  2619. return ret;
  2620. seq = file->private_data;
  2621. seq->poll_event = atomic_read(&proc_poll_event);
  2622. return 0;
  2623. }
/*
 * proc_ops for the "swaps" entry created in procswaps_init(): seq_file
 * based read/lseek/release plus the local open and poll handlers.
 */
static const struct proc_ops swaps_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,
	.proc_open	= swaps_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
	.proc_poll	= swaps_poll,
};
/*
 * Initcall that creates the "swaps" procfs entry backed by swaps_proc_ops.
 * The result of proc_create() is not checked; 0 is always returned.
 */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &swaps_proc_ops);
	return 0;
}
__initcall(procswaps_init);
  2638. #endif /* CONFIG_PROC_FS */
  2639. #ifdef MAX_SWAPFILES_CHECK
/*
 * Late initcall that evaluates MAX_SWAPFILES_CHECK() once.  Only built
 * when that macro is defined; always returns 0.
 */
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
  2646. #endif
  2647. static struct swap_info_struct *alloc_swap_info(void)
  2648. {
  2649. struct swap_info_struct *p;
  2650. struct swap_info_struct *defer = NULL;
  2651. unsigned int type;
  2652. p = kvzalloc_obj(struct swap_info_struct);
  2653. if (!p)
  2654. return ERR_PTR(-ENOMEM);
  2655. if (percpu_ref_init(&p->users, swap_users_ref_free,
  2656. PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
  2657. kvfree(p);
  2658. return ERR_PTR(-ENOMEM);
  2659. }
  2660. spin_lock(&swap_lock);
  2661. for (type = 0; type < nr_swapfiles; type++) {
  2662. if (!(swap_info[type]->flags & SWP_USED))
  2663. break;
  2664. }
  2665. if (type >= MAX_SWAPFILES) {
  2666. spin_unlock(&swap_lock);
  2667. percpu_ref_exit(&p->users);
  2668. kvfree(p);
  2669. return ERR_PTR(-EPERM);
  2670. }
  2671. if (type >= nr_swapfiles) {
  2672. p->type = type;
  2673. /*
  2674. * Publish the swap_info_struct after initializing it.
  2675. * Note that kvzalloc() above zeroes all its fields.
  2676. */
  2677. smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
  2678. nr_swapfiles++;
  2679. } else {
  2680. defer = p;
  2681. p = swap_info[type];
  2682. /*
  2683. * Do not memset this entry: a racing procfs swap_next()
  2684. * would be relying on p->type to remain valid.
  2685. */
  2686. }
  2687. p->swap_extent_root = RB_ROOT;
  2688. plist_node_init(&p->list, 0);
  2689. plist_node_init(&p->avail_list, 0);
  2690. p->flags = SWP_USED;
  2691. spin_unlock(&swap_lock);
  2692. if (defer) {
  2693. percpu_ref_exit(&defer->users);
  2694. kvfree(defer);
  2695. }
  2696. spin_lock_init(&p->lock);
  2697. spin_lock_init(&p->cont_lock);
  2698. atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
  2699. init_completion(&p->comp);
  2700. return p;
  2701. }
  2702. static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
  2703. {
  2704. if (S_ISBLK(inode->i_mode)) {
  2705. si->bdev = I_BDEV(inode);
  2706. /*
  2707. * Zoned block devices contain zones that have a sequential
  2708. * write only restriction. Hence zoned block devices are not
  2709. * suitable for swapping. Disallow them here.
  2710. */
  2711. if (bdev_is_zoned(si->bdev))
  2712. return -EINVAL;
  2713. si->flags |= SWP_BLKDEV;
  2714. } else if (S_ISREG(inode->i_mode)) {
  2715. si->bdev = inode->i_sb->s_bdev;
  2716. }
  2717. return 0;
  2718. }
  2719. /*
  2720. * Find out how many pages are allowed for a single swap device. There
  2721. * are two limiting factors:
  2722. * 1) the number of bits for the swap offset in the swp_entry_t type, and
  2723. * 2) the number of bits in the swap pte, as defined by the different
  2724. * architectures.
  2725. *
  2726. * In order to find the largest possible bit mask, a swap entry with
  2727. * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
  2728. * decoded to a swp_entry_t again, and finally the swap offset is
  2729. * extracted.
  2730. *
  2731. * This will mask all the bits from the initial ~0UL mask that can't
  2732. * be encoded in either the swp_entry_t or the architecture definition
  2733. * of a swap pte.
  2734. */
  2735. unsigned long generic_max_swapfile_size(void)
  2736. {
  2737. swp_entry_t entry = swp_entry(0, ~0UL);
  2738. const pte_t pte = softleaf_to_pte(entry);
  2739. /*
  2740. * Since the PTE can be an invalid softleaf entry (e.g. the none PTE),
  2741. * we need to do this manually.
  2742. */
  2743. entry = __pte_to_swp_entry(pte);
  2744. entry = swp_entry(__swp_type(entry), __swp_offset(entry));
  2745. return swp_offset(entry) + 1;
  2746. }
/*
 * Can be overridden by an architecture for additional checks.  This __weak
 * default just returns generic_max_swapfile_size().
 */
__weak unsigned long arch_max_swapfile_size(void)
{
	return generic_max_swapfile_size();
}
  2752. static unsigned long read_swap_header(struct swap_info_struct *si,
  2753. union swap_header *swap_header,
  2754. struct inode *inode)
  2755. {
  2756. int i;
  2757. unsigned long maxpages;
  2758. unsigned long swapfilepages;
  2759. unsigned long last_page;
  2760. if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
  2761. pr_err("Unable to find swap-space signature\n");
  2762. return 0;
  2763. }
  2764. /* swap partition endianness hack... */
  2765. if (swab32(swap_header->info.version) == 1) {
  2766. swab32s(&swap_header->info.version);
  2767. swab32s(&swap_header->info.last_page);
  2768. swab32s(&swap_header->info.nr_badpages);
  2769. if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
  2770. return 0;
  2771. for (i = 0; i < swap_header->info.nr_badpages; i++)
  2772. swab32s(&swap_header->info.badpages[i]);
  2773. }
  2774. /* Check the swap header's sub-version */
  2775. if (swap_header->info.version != 1) {
  2776. pr_warn("Unable to handle swap header version %d\n",
  2777. swap_header->info.version);
  2778. return 0;
  2779. }
  2780. maxpages = swapfile_maximum_size;
  2781. last_page = swap_header->info.last_page;
  2782. if (!last_page) {
  2783. pr_warn("Empty swap-file\n");
  2784. return 0;
  2785. }
  2786. if (last_page > maxpages) {
  2787. pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
  2788. K(maxpages), K(last_page));
  2789. }
  2790. if (maxpages > last_page) {
  2791. maxpages = last_page + 1;
  2792. /* p->max is an unsigned int: don't overflow it */
  2793. if ((unsigned int)maxpages == 0)
  2794. maxpages = UINT_MAX;
  2795. }
  2796. if (!maxpages)
  2797. return 0;
  2798. swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
  2799. if (swapfilepages && maxpages > swapfilepages) {
  2800. pr_warn("Swap area shorter than signature indicates\n");
  2801. return 0;
  2802. }
  2803. if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
  2804. return 0;
  2805. if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
  2806. return 0;
  2807. return maxpages;
  2808. }
  2809. static int setup_swap_map(struct swap_info_struct *si,
  2810. union swap_header *swap_header,
  2811. unsigned char *swap_map,
  2812. unsigned long maxpages)
  2813. {
  2814. unsigned long i;
  2815. swap_map[0] = SWAP_MAP_BAD; /* omit header page */
  2816. for (i = 0; i < swap_header->info.nr_badpages; i++) {
  2817. unsigned int page_nr = swap_header->info.badpages[i];
  2818. if (page_nr == 0 || page_nr > swap_header->info.last_page)
  2819. return -EINVAL;
  2820. if (page_nr < maxpages) {
  2821. swap_map[page_nr] = SWAP_MAP_BAD;
  2822. si->pages--;
  2823. }
  2824. }
  2825. if (!si->pages) {
  2826. pr_warn("Empty swap-file\n");
  2827. return -EINVAL;
  2828. }
  2829. return 0;
  2830. }
  2831. static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
  2832. union swap_header *swap_header,
  2833. unsigned long maxpages)
  2834. {
  2835. unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
  2836. struct swap_cluster_info *cluster_info;
  2837. int err = -ENOMEM;
  2838. unsigned long i;
  2839. cluster_info = kvzalloc_objs(*cluster_info, nr_clusters);
  2840. if (!cluster_info)
  2841. goto err;
  2842. for (i = 0; i < nr_clusters; i++)
  2843. spin_lock_init(&cluster_info[i].lock);
  2844. if (!(si->flags & SWP_SOLIDSTATE)) {
  2845. si->global_cluster = kmalloc_obj(*si->global_cluster);
  2846. if (!si->global_cluster)
  2847. goto err;
  2848. for (i = 0; i < SWAP_NR_ORDERS; i++)
  2849. si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
  2850. spin_lock_init(&si->global_cluster_lock);
  2851. }
  2852. /*
  2853. * Mark unusable pages as unavailable. The clusters aren't
  2854. * marked free yet, so no list operations are involved yet.
  2855. *
  2856. * See setup_swap_map(): header page, bad pages,
  2857. * and the EOF part of the last cluster.
  2858. */
  2859. err = swap_cluster_setup_bad_slot(cluster_info, 0);
  2860. if (err)
  2861. goto err;
  2862. for (i = 0; i < swap_header->info.nr_badpages; i++) {
  2863. unsigned int page_nr = swap_header->info.badpages[i];
  2864. if (page_nr >= maxpages)
  2865. continue;
  2866. err = swap_cluster_setup_bad_slot(cluster_info, page_nr);
  2867. if (err)
  2868. goto err;
  2869. }
  2870. for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
  2871. err = swap_cluster_setup_bad_slot(cluster_info, i);
  2872. if (err)
  2873. goto err;
  2874. }
  2875. INIT_LIST_HEAD(&si->free_clusters);
  2876. INIT_LIST_HEAD(&si->full_clusters);
  2877. INIT_LIST_HEAD(&si->discard_clusters);
  2878. for (i = 0; i < SWAP_NR_ORDERS; i++) {
  2879. INIT_LIST_HEAD(&si->nonfull_clusters[i]);
  2880. INIT_LIST_HEAD(&si->frag_clusters[i]);
  2881. }
  2882. for (i = 0; i < nr_clusters; i++) {
  2883. struct swap_cluster_info *ci = &cluster_info[i];
  2884. if (ci->count) {
  2885. ci->flags = CLUSTER_FLAG_NONFULL;
  2886. list_add_tail(&ci->list, &si->nonfull_clusters[0]);
  2887. } else {
  2888. ci->flags = CLUSTER_FLAG_FREE;
  2889. list_add_tail(&ci->list, &si->free_clusters);
  2890. }
  2891. }
  2892. return cluster_info;
  2893. err:
  2894. free_cluster_info(cluster_info, maxpages);
  2895. return ERR_PTR(err);
  2896. }
/*
 * swapon(2): start swapping to the file or block device @specialfile.
 *
 * @swap_flags may only contain bits from SWAP_FLAGS_VALID (-EINVAL
 * otherwise); the caller needs CAP_SYS_ADMIN (-EPERM otherwise).
 *
 * The function claims a swap_info slot, opens the file, validates the
 * on-disk header, builds the extent tree, swap map, zeromap and cluster
 * info, and finally publishes the device with enable_swap_info().
 * Returns 0 on success or a negative errno; on failure everything set up
 * so far is released through the bad_swap unwind labels.
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	struct swap_info_struct *si;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	struct dentry *dentry;
	int prio;
	int error;
	union swap_header *swap_header;
	int nr_extents;
	sector_t span;
	unsigned long maxpages;
	unsigned char *swap_map = NULL;
	unsigned long *zeromap = NULL;
	struct swap_cluster_info *cluster_info = NULL;
	struct folio *folio = NULL;
	struct inode *inode = NULL;
	bool inced_nr_rotate_swap = false;

	if (swap_flags & ~SWAP_FLAGS_VALID)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	si = alloc_swap_info();
	if (IS_ERR(si))
		return PTR_ERR(si);

	INIT_WORK(&si->discard_work, swap_discard_work);
	INIT_WORK(&si->reclaim_work, swap_reclaim_work);

	CLASS(filename, name)(specialfile);
	swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0);
	if (IS_ERR(swap_file)) {
		error = PTR_ERR(swap_file);
		/* Keep bad_swap from calling filp_close() on an ERR_PTR. */
		swap_file = NULL;
		goto bad_swap;
	}

	si->swap_file = swap_file;
	mapping = swap_file->f_mapping;
	dentry = swap_file->f_path.dentry;
	inode = mapping->host;

	error = claim_swapfile(si, inode);
	if (unlikely(error))
		goto bad_swap;

	/* From here on, failures must go through bad_swap_unlock_inode. */
	inode_lock(inode);
	if (d_unlinked(dentry) || cant_mount(dentry)) {
		error = -ENOENT;
		goto bad_swap_unlock_inode;
	}
	/* The inode is already marked as a swap file. */
	if (IS_SWAPFILE(inode)) {
		error = -EBUSY;
		goto bad_swap_unlock_inode;
	}

	/*
	 * The swap subsystem needs a major overhaul to support this.
	 * It doesn't work yet so just disable it for now.
	 */
	if (mapping_min_folio_order(mapping) > 0) {
		error = -EINVAL;
		goto bad_swap_unlock_inode;
	}

	/*
	 * Read the swap header.
	 */
	if (!mapping->a_ops->read_folio) {
		error = -EINVAL;
		goto bad_swap_unlock_inode;
	}
	folio = read_mapping_folio(mapping, 0, swap_file);
	if (IS_ERR(folio)) {
		error = PTR_ERR(folio);
		goto bad_swap_unlock_inode;
	}
	/* The mapping is released together with the folio at "out". */
	swap_header = kmap_local_folio(folio, 0);

	maxpages = read_swap_header(si, swap_header, inode);
	if (unlikely(!maxpages)) {
		error = -EINVAL;
		goto bad_swap_unlock_inode;
	}

	/* Page 0 holds the header, so one page less is usable. */
	si->max = maxpages;
	si->pages = maxpages - 1;
	nr_extents = setup_swap_extents(si, &span);
	if (nr_extents < 0) {
		error = nr_extents;
		goto bad_swap_unlock_inode;
	}
	/* Reject the device if setup_swap_extents() left the two out of step. */
	if (si->pages != si->max - 1) {
		pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max);
		error = -EINVAL;
		goto bad_swap_unlock_inode;
	}

	maxpages = si->max;

	/* OK, set up the swap map and apply the bad block list */
	swap_map = vzalloc(maxpages);
	if (!swap_map) {
		error = -ENOMEM;
		goto bad_swap_unlock_inode;
	}

	error = swap_cgroup_swapon(si->type, maxpages);
	if (error)
		goto bad_swap_unlock_inode;

	error = setup_swap_map(si, swap_header, swap_map, maxpages);
	if (error)
		goto bad_swap_unlock_inode;

	/*
	 * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
	 * be above MAX_PAGE_ORDER incase of a large swap file.
	 */
	zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long),
				 GFP_KERNEL | __GFP_ZERO);
	if (!zeromap) {
		error = -ENOMEM;
		goto bad_swap_unlock_inode;
	}

	/* Derive device capability flags from the backing block device. */
	if (si->bdev && bdev_stable_writes(si->bdev))
		si->flags |= SWP_STABLE_WRITES;

	if (si->bdev && bdev_synchronous(si->bdev))
		si->flags |= SWP_SYNCHRONOUS_IO;

	if (si->bdev && bdev_nonrot(si->bdev)) {
		si->flags |= SWP_SOLIDSTATE;
	} else {
		/* Remembered so the error path can undo the increment. */
		atomic_inc(&nr_rotate_swap);
		inced_nr_rotate_swap = true;
	}

	cluster_info = setup_clusters(si, swap_header, maxpages);
	if (IS_ERR(cluster_info)) {
		error = PTR_ERR(cluster_info);
		/* setup_clusters() already freed its array on failure. */
		cluster_info = NULL;
		goto bad_swap_unlock_inode;
	}

	if ((swap_flags & SWAP_FLAG_DISCARD) &&
	    si->bdev && bdev_max_discard_sectors(si->bdev)) {
		/*
		 * When discard is enabled for swap with no particular
		 * policy flagged, we set all swap discard flags here in
		 * order to sustain backward compatibility with older
		 * swapon(8) releases.
		 */
		si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
			     SWP_PAGE_DISCARD);

		/*
		 * By flagging sys_swapon, a sysadmin can tell us to
		 * either do single-time area discards only, or to just
		 * perform discards for released swap page-clusters.
		 * Now it's time to adjust the p->flags accordingly.
		 */
		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
			si->flags &= ~SWP_PAGE_DISCARD;
		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
			si->flags &= ~SWP_AREA_DISCARD;

		/* issue a swapon-time discard if it's still required */
		if (si->flags & SWP_AREA_DISCARD) {
			/* A failed discard is only logged, not treated as fatal. */
			int err = discard_swap(si);
			if (unlikely(err))
				pr_err("swapon: discard_swap(%p): %d\n",
					si, err);
		}
	}

	error = zswap_swapon(si->type, maxpages);
	if (error)
		goto bad_swap_unlock_inode;

	/*
	 * Flush any pending IO and dirty mappings before we start using this
	 * swap device.
	 */
	inode->i_flags |= S_SWAPFILE;
	error = inode_drain_writes(inode);
	if (error) {
		inode->i_flags &= ~S_SWAPFILE;
		goto free_swap_zswap;
	}

	mutex_lock(&swapon_mutex);
	/* Default priority unless the caller supplied one in swap_flags. */
	prio = DEF_SWAP_PRIO;
	if (swap_flags & SWAP_FLAG_PREFER)
		prio = swap_flags & SWAP_FLAG_PRIO_MASK;
	enable_swap_info(si, prio, swap_map, cluster_info, zeromap);

	pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s\n",
		K(si->pages), name->name, si->prio, nr_extents,
		K((unsigned long long)span),
		(si->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(si->flags & SWP_DISCARDABLE) ? "D" : "",
		(si->flags & SWP_AREA_DISCARD) ? "s" : "",
		(si->flags & SWP_PAGE_DISCARD) ? "c" : "");

	mutex_unlock(&swapon_mutex);
	/* Notify pollers of the "swaps" proc file that the set changed. */
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	error = 0;
	goto out;
free_swap_zswap:
	zswap_swapoff(si->type);
bad_swap_unlock_inode:
	inode_unlock(inode);
bad_swap:
	kfree(si->global_cluster);
	si->global_cluster = NULL;
	/* The inode is already unlocked (or never was locked): skip it at "out". */
	inode = NULL;
	destroy_swap_extents(si);
	swap_cgroup_swapoff(si->type);
	/* Clearing the flags drops SWP_USED, releasing the swap_info slot. */
	spin_lock(&swap_lock);
	si->swap_file = NULL;
	si->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	kvfree(zeromap);
	if (cluster_info)
		free_cluster_info(cluster_info, maxpages);
	if (inced_nr_rotate_swap)
		atomic_dec(&nr_rotate_swap);
	if (swap_file)
		filp_close(swap_file, NULL);
out:
	/* swap_header is only valid when the header folio was read. */
	if (!IS_ERR_OR_NULL(folio))
		folio_release_kmap(folio, swap_header);
	/* Success path: the inode is still locked from above. */
	if (inode)
		inode_unlock(inode);
	return error;
}
  3111. void si_swapinfo(struct sysinfo *val)
  3112. {
  3113. unsigned int type;
  3114. unsigned long nr_to_be_unused = 0;
  3115. spin_lock(&swap_lock);
  3116. for (type = 0; type < nr_swapfiles; type++) {
  3117. struct swap_info_struct *si = swap_info[type];
  3118. if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
  3119. nr_to_be_unused += swap_usage_in_pages(si);
  3120. }
  3121. val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
  3122. val->totalswap = total_swap_pages + nr_to_be_unused;
  3123. spin_unlock(&swap_lock);
  3124. }
  3125. /*
  3126. * Verify that nr swap entries are valid and increment their swap map counts.
  3127. *
  3128. * Returns error code in following case.
  3129. * - success -> 0
  3130. * - swp_entry is invalid -> EINVAL
  3131. * - swap-mapped reference is requested but the entry is not used. -> ENOENT
  3132. * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
  3133. */
  3134. static int swap_dup_entries(struct swap_info_struct *si,
  3135. struct swap_cluster_info *ci,
  3136. unsigned long offset,
  3137. unsigned char usage, int nr)
  3138. {
  3139. int i;
  3140. unsigned char count;
  3141. for (i = 0; i < nr; i++) {
  3142. count = si->swap_map[offset + i];
  3143. /*
  3144. * For swapin out, allocator never allocates bad slots. for
  3145. * swapin, readahead is guarded by swap_entry_swapped.
  3146. */
  3147. if (WARN_ON(count == SWAP_MAP_BAD))
  3148. return -ENOENT;
  3149. /*
  3150. * Swap count duplication must be guarded by either swap cache folio (from
  3151. * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct).
  3152. */
  3153. if (WARN_ON(!count &&
  3154. !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))))
  3155. return -ENOENT;
  3156. if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX))
  3157. return -EINVAL;
  3158. }
  3159. for (i = 0; i < nr; i++) {
  3160. count = si->swap_map[offset + i];
  3161. if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
  3162. count += usage;
  3163. else if (swap_count_continued(si, offset + i, count))
  3164. count = COUNT_CONTINUED;
  3165. else {
  3166. /*
  3167. * Don't need to rollback changes, because if
  3168. * usage == 1, there must be nr == 1.
  3169. */
  3170. return -ENOMEM;
  3171. }
  3172. WRITE_ONCE(si->swap_map[offset + i], count);
  3173. }
  3174. return 0;
  3175. }
  3176. static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
  3177. {
  3178. int err;
  3179. struct swap_info_struct *si;
  3180. struct swap_cluster_info *ci;
  3181. unsigned long offset = swp_offset(entry);
  3182. si = swap_entry_to_info(entry);
  3183. if (WARN_ON_ONCE(!si)) {
  3184. pr_err("%s%08lx\n", Bad_file, entry.val);
  3185. return -EINVAL;
  3186. }
  3187. VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
  3188. ci = swap_cluster_lock(si, offset);
  3189. err = swap_dup_entries(si, ci, offset, usage, nr);
  3190. swap_cluster_unlock(ci);
  3191. return err;
  3192. }
  3193. /*
  3194. * swap_dup_entry_direct() - Increase reference count of a swap entry by one.
  3195. * @entry: first swap entry from which we want to increase the refcount.
  3196. *
  3197. * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
  3198. * but could not be atomically allocated. Returns 0, just as if it succeeded,
  3199. * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
  3200. * might occur if a page table entry has got corrupted.
  3201. *
  3202. * Context: Caller must ensure there is no race condition on the reference
  3203. * owner. e.g., locking the PTL of a PTE containing the entry being increased.
  3204. */
  3205. int swap_dup_entry_direct(swp_entry_t entry)
  3206. {
  3207. int err = 0;
  3208. while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
  3209. err = add_swap_count_continuation(entry, GFP_ATOMIC);
  3210. return err;
  3211. }
  3212. /*
  3213. * add_swap_count_continuation - called when a swap count is duplicated
  3214. * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
  3215. * page of the original vmalloc'ed swap_map, to hold the continuation count
  3216. * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
  3217. * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
  3218. *
  3219. * These continuation pages are seldom referenced: the common paths all work
  3220. * on the original swap_map, only referring to a continuation page when the
  3221. * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
  3222. *
  3223. * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
  3224. * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
  3225. * can be called after dropping locks.
  *
  * @entry: swap entry whose count needs a continuation page.
  * @gfp_mask: allocation flags; __GFP_HIGHMEM is OR-ed in by this function.
  *
  * Returns 0 if a new page was linked in, or if none is needed: the swap
  * device lookup failed, the count is no longer at SWAP_MAP_MAX, or the list
  * walk found an existing continuation page that can be used. Returns -ENOMEM
  * if the count is at SWAP_MAP_MAX and the page allocation failed.
  3226. */
  3227. int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
  3228. {
  3229. struct swap_info_struct *si;
  3230. struct swap_cluster_info *ci;
  3231. struct page *head;
  3232. struct page *page;
  3233. struct page *list_page;
  3234. pgoff_t offset;
  3235. unsigned char count;
  3236. int ret = 0;
  3237. /*
  3238. * When debugging, it's easier to use __GFP_ZERO here; but it's better
  3239. * for latency not to zero a page while GFP_ATOMIC and holding locks.
  *
  * The page is allocated up front, before the device reference and any
  * lock is taken; a failed allocation is only reported further down, once
  * it is known that a page is really needed.
  3240. */
  3241. page = alloc_page(gfp_mask | __GFP_HIGHMEM);
  3242. si = get_swap_device(entry);
  3243. if (!si) {
  3244. /*
  3245. * An acceptable race has occurred since the failing
  3246. * __swap_duplicate(): the swap device may have been swapped off
  3247. */
  3248. goto outer;
  3249. }
  3250. offset = swp_offset(entry);
  3251. ci = swap_cluster_lock(si, offset);
  3252. count = si->swap_map[offset];
  3253. if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
  3254. /*
  3255. * The higher the swap count, the more likely it is that tasks
  3256. * will race to add swap count continuation: we need to avoid
  3257. * over-provisioning.
  3258. */
  3259. goto out;
  3260. }
  3261. if (!page) {
  3262. ret = -ENOMEM;
  3263. goto out;
  3264. }
  /*
   * head is the page backing this part of swap_map; continuation pages are
   * chained on its lru list. From here on offset is reduced to the position
   * within a page, which is the index used in every continuation page.
   */
  3265. head = vmalloc_to_page(si->swap_map + offset);
  3266. offset &= ~PAGE_MASK;
  3267. spin_lock(&si->cont_lock);
  3268. /*
  3269. * Page allocation does not initialize the page's lru field,
  3270. * but it does always reset its private field.
  3271. */
  3272. if (!page_private(head)) {
  3273. BUG_ON(count & COUNT_CONTINUED);
  3274. INIT_LIST_HEAD(&head->lru);
  3275. set_page_private(head, SWP_CONTINUED);
  3276. si->flags |= SWP_CONTINUED;
  3277. }
  /*
   * Walk the existing continuation pages; count carries the value read
   * from the previous level (initially the swap_map byte itself).
   */
  3278. list_for_each_entry(list_page, &head->lru, lru) {
  3279. unsigned char *map;
  3280. /*
  3281. * If the previous map said no continuation, but we've found
  3282. * a continuation page, free our allocation and use this one.
  3283. */
  3284. if (!(count & COUNT_CONTINUED))
  3285. goto out_unlock_cont;
  3286. map = kmap_local_page(list_page) + offset;
  3287. count = *map;
  3288. kunmap_local(map);
  3289. /*
  3290. * If this continuation count now has some space in it,
  3291. * free our allocation and use this one.
  3292. */
  3293. if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
  3294. goto out_unlock_cont;
  3295. }
  /* Every existing level is full: append the new page at the tail. */
  3296. list_add_tail(&page->lru, &head->lru);
  3297. page = NULL; /* now it's attached, don't free it */
  /* Unwind in reverse order of acquisition; a page left unused is freed last. */
  3298. out_unlock_cont:
  3299. spin_unlock(&si->cont_lock);
  3300. out:
  3301. swap_cluster_unlock(ci);
  3302. put_swap_device(si);
  3303. outer:
  3304. if (page)
  3305. __free_page(page);
  3306. return ret;
  3307. }
  3308. /*
  3309. * swap_count_continued - when the original swap_map count is incremented
  3310. * from SWAP_MAP_MAX, check if there is already a continuation page to carry
  3311. * into, carry if so, or else fail until a new continuation page is allocated;
  3312. * when the original swap_map count is decremented from 0 with continuation,
  3313. * borrow from the continuation and report whether it still holds more.
  3314. * Called while __swap_duplicate() or caller of swap_put_entry_locked()
  3315. * holds cluster lock.
  *
  * @si: swap device owning the swap_map.
  * @offset: swap_map offset of the entry.
  * @count: current swap_map byte of the entry. SWAP_MAP_MAX, with or without
  *         COUNT_CONTINUED, selects the increment path; any other value takes
  *         the decrement path, which BUG()s unless it is exactly
  *         COUNT_CONTINUED.
  *
  * Returns, when incrementing, true if the carry was stored in a continuation
  * page and false if a new continuation page has to be added first. When
  * decrementing, returns true if the local count ends up as COUNT_CONTINUED
  * and false if the borrow emptied the first continuation page.
  3316. */
  3317. static bool swap_count_continued(struct swap_info_struct *si,
  3318. pgoff_t offset, unsigned char count)
  3319. {
  3320. struct page *head;
  3321. struct page *page;
  3322. unsigned char *map;
  3323. bool ret;
  3324. head = vmalloc_to_page(si->swap_map + offset);
  3325. if (page_private(head) != SWP_CONTINUED) {
  3326. BUG_ON(count & COUNT_CONTINUED);
  3327. return false; /* need to add count continuation */
  3328. }
  3329. spin_lock(&si->cont_lock);
  /* Keep only the position within a page: the index into each continuation page. */
  3330. offset &= ~PAGE_MASK;
  3331. page = list_next_entry(head, lru);
  3332. map = kmap_local_page(page) + offset;
  /*
   * NOTE: init_map lives inside the nested "if" of the increment branch
   * below; this goto enters that block directly, so the first continuation
   * byte is zeroed and then incremented like any freshly reached level.
   */
  3333. if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
  3334. goto init_map; /* jump over SWAP_CONT_MAX checks */
  3335. if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
  3336. /*
  3337. * Think of how you add 1 to 999
  3338. */
  /* Skip levels that are full and already flagged as continued. */
  3339. while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
  3340. kunmap_local(map);
  3341. page = list_next_entry(page, lru);
  3342. BUG_ON(page == head);
  3343. map = kmap_local_page(page) + offset;
  3344. }
  /* A full level without COUNT_CONTINUED: carry into the next page, if any. */
  3345. if (*map == SWAP_CONT_MAX) {
  3346. kunmap_local(map);
  3347. page = list_next_entry(page, lru);
  3348. if (page == head) {
  3349. ret = false; /* add count continuation */
  3350. goto out;
  3351. }
  3352. map = kmap_local_page(page) + offset;
  3353. init_map: *map = 0; /* we didn't zero the page */
  3354. }
  3355. *map += 1;
  3356. kunmap_local(map);
  /* Walk back towards head, resetting each lower level to COUNT_CONTINUED. */
  3357. while ((page = list_prev_entry(page, lru)) != head) {
  3358. map = kmap_local_page(page) + offset;
  3359. *map = COUNT_CONTINUED;
  3360. kunmap_local(map);
  3361. }
  3362. ret = true; /* incremented */
  3363. } else { /* decrementing */
  3364. /*
  3365. * Think of how you subtract 1 from 1000
  3366. */
  3367. BUG_ON(count != COUNT_CONTINUED);
  /* Skip levels holding only COUNT_CONTINUED to find one to borrow from. */
  3368. while (*map == COUNT_CONTINUED) {
  3369. kunmap_local(map);
  3370. page = list_next_entry(page, lru);
  3371. BUG_ON(page == head);
  3372. map = kmap_local_page(page) + offset;
  3373. }
  3374. BUG_ON(*map == 0);
  3375. *map -= 1;
  /* If the borrowed-from level is now empty, the level below loses its flag. */
  3376. if (*map == 0)
  3377. count = 0;
  3378. kunmap_local(map);
  /*
   * Walk back towards head, refilling each lower level to SWAP_CONT_MAX;
   * only the level just below the borrowed-from one can get count == 0.
   */
  3379. while ((page = list_prev_entry(page, lru)) != head) {
  3380. map = kmap_local_page(page) + offset;
  3381. *map = SWAP_CONT_MAX | count;
  3382. count = COUNT_CONTINUED;
  3383. kunmap_local(map);
  3384. }
  3385. ret = count == COUNT_CONTINUED;
  3386. }
  3387. out:
  3388. spin_unlock(&si->cont_lock);
  3389. return ret;
  3390. }
  3391. /*
  3392. * free_swap_count_continuations - swapoff free all the continuation pages
  3393. * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
  3394. */
  3395. static void free_swap_count_continuations(struct swap_info_struct *si)
  3396. {
  3397. pgoff_t offset;
  3398. for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
  3399. struct page *head;
  3400. head = vmalloc_to_page(si->swap_map + offset);
  3401. if (page_private(head)) {
  3402. struct page *page, *next;
  3403. list_for_each_entry_safe(page, next, &head->lru, lru) {
  3404. list_del(&page->lru);
  3405. __free_page(page);
  3406. }
  3407. }
  3408. }
  3409. }
  3410. #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
  3411. static bool __has_usable_swap(void)
  3412. {
  3413. return !plist_head_empty(&swap_active_head);
  3414. }
  3415. void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
  3416. {
  3417. struct swap_info_struct *si;
  3418. if (!(gfp & __GFP_IO))
  3419. return;
  3420. if (!__has_usable_swap())
  3421. return;
  3422. if (!blk_cgroup_congested())
  3423. return;
  3424. /*
  3425. * We've already scheduled a throttle, avoid taking the global swap
  3426. * lock.
  3427. */
  3428. if (current->throttle_disk)
  3429. return;
  3430. spin_lock(&swap_avail_lock);
  3431. plist_for_each_entry(si, &swap_avail_head, avail_list) {
  3432. if (si->bdev) {
  3433. blkcg_schedule_throttle(si->bdev->bd_disk, true);
  3434. break;
  3435. }
  3436. }
  3437. spin_unlock(&swap_avail_lock);
  3438. }
  3439. #endif
  3440. static int __init swapfile_init(void)
  3441. {
  3442. swapfile_maximum_size = arch_max_swapfile_size();
  3443. /*
  3444. * Once a cluster is freed, it's swap table content is read
  3445. * only, and all swap cache readers (swap_cache_*) verifies
  3446. * the content before use. So it's safe to use RCU slab here.
  3447. */
  3448. if (!SWP_TABLE_USE_PAGE)
  3449. swap_table_cachep = kmem_cache_create("swap_table",
  3450. sizeof(struct swap_table),
  3451. 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
  3452. #ifdef CONFIG_MIGRATION
  3453. if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
  3454. swap_migration_ad_supported = true;
  3455. #endif /* CONFIG_MIGRATION */
  3456. return 0;
  3457. }
  3458. subsys_initcall(swapfile_init);