filemap.c 135 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
7427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * linux/mm/filemap.c
  4. *
  5. * Copyright (C) 1994-1999 Linus Torvalds
  6. */
  7. /*
  8. * This file handles the generic file mmap semantics used by
  9. * most "normal" filesystems (but you don't /have/ to use this:
  10. * the NFS filesystem used to do this differently, for example)
  11. */
  12. #include <linux/export.h>
  13. #include <linux/compiler.h>
  14. #include <linux/dax.h>
  15. #include <linux/fs.h>
  16. #include <linux/sched/signal.h>
  17. #include <linux/uaccess.h>
  18. #include <linux/capability.h>
  19. #include <linux/kernel_stat.h>
  20. #include <linux/gfp.h>
  21. #include <linux/mm.h>
  22. #include <linux/swap.h>
  23. #include <linux/leafops.h>
  24. #include <linux/syscalls.h>
  25. #include <linux/mman.h>
  26. #include <linux/pagemap.h>
  27. #include <linux/file.h>
  28. #include <linux/uio.h>
  29. #include <linux/error-injection.h>
  30. #include <linux/hash.h>
  31. #include <linux/writeback.h>
  32. #include <linux/backing-dev.h>
  33. #include <linux/pagevec.h>
  34. #include <linux/security.h>
  35. #include <linux/cpuset.h>
  36. #include <linux/hugetlb.h>
  37. #include <linux/memcontrol.h>
  38. #include <linux/shmem_fs.h>
  39. #include <linux/rmap.h>
  40. #include <linux/delayacct.h>
  41. #include <linux/psi.h>
  42. #include <linux/ramfs.h>
  43. #include <linux/page_idle.h>
  44. #include <linux/migrate.h>
  45. #include <linux/pipe_fs_i.h>
  46. #include <linux/splice.h>
  47. #include <linux/rcupdate_wait.h>
  48. #include <linux/sched/mm.h>
  49. #include <linux/sysctl.h>
  50. #include <linux/pgalloc.h>
  51. #include <asm/tlbflush.h>
  52. #include "internal.h"
  53. #define CREATE_TRACE_POINTS
  54. #include <trace/events/filemap.h>
  55. /*
  56. * FIXME: remove all knowledge of the buffer layer from the core VM
  57. */
  58. #include <linux/buffer_head.h> /* for try_to_free_buffers */
  59. #include <asm/mman.h>
  60. #include "swap.h"
  61. /*
  62. * Shared mappings implemented 30.11.1994. It's not fully working yet,
  63. * though.
  64. *
  65. * Shared mappings now work. 15.8.1995 Bruno.
  66. *
  67. * finished 'unifying' the page and buffer cache and SMP-threaded the
  68. * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  69. *
  70. * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  71. */
  72. /*
  73. * Lock ordering:
  74. *
  75. * ->i_mmap_rwsem (truncate_pagecache)
  76. * ->private_lock (__free_pte->block_dirty_folio)
  77. * ->swap_lock (exclusive_swap_page, others)
  78. * ->i_pages lock
  79. *
  80. * ->i_rwsem
  81. * ->invalidate_lock (acquired by fs in truncate path)
  82. * ->i_mmap_rwsem (truncate->unmap_mapping_range)
  83. *
  84. * ->mmap_lock
  85. * ->i_mmap_rwsem
  86. * ->page_table_lock or pte_lock (various, mainly in memory.c)
  87. * ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
  88. *
  89. * ->mmap_lock
  90. * ->invalidate_lock (filemap_fault)
  91. * ->lock_page (filemap_fault, access_process_vm)
  92. *
  93. * ->i_rwsem (generic_perform_write)
  94. * ->mmap_lock (fault_in_readable->do_page_fault)
  95. *
  96. * bdi->wb.list_lock
  97. * sb_lock (fs/fs-writeback.c)
  98. * ->i_pages lock (__sync_single_inode)
  99. *
  100. * ->i_mmap_rwsem
  101. * ->anon_vma.lock (vma_merge)
  102. *
  103. * ->anon_vma.lock
  104. * ->page_table_lock or pte_lock (anon_vma_prepare and various)
  105. *
  106. * ->page_table_lock or pte_lock
  107. * ->swap_lock (try_to_unmap_one)
  108. * ->private_lock (try_to_unmap_one)
  109. * ->i_pages lock (try_to_unmap_one)
  110. * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed)
  111. * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru)
  112. * ->private_lock (folio_remove_rmap_pte->set_page_dirty)
  113. * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
  114. * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
  115. * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty)
  116. * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
  117. * ->inode->i_lock (zap_pte_range->set_page_dirty)
  118. * ->private_lock (zap_pte_range->block_dirty_folio)
  119. */
  120. static void page_cache_delete(struct address_space *mapping,
  121. struct folio *folio, void *shadow)
  122. {
  123. XA_STATE(xas, &mapping->i_pages, folio->index);
  124. long nr = 1;
  125. mapping_set_update(&xas, mapping);
  126. xas_set_order(&xas, folio->index, folio_order(folio));
  127. nr = folio_nr_pages(folio);
  128. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  129. xas_store(&xas, shadow);
  130. xas_init_marks(&xas);
  131. folio->mapping = NULL;
  132. /* Leave folio->index set: truncation lookup relies upon it */
  133. mapping->nrpages -= nr;
  134. }
  135. static void filemap_unaccount_folio(struct address_space *mapping,
  136. struct folio *folio)
  137. {
  138. long nr;
  139. VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
  140. if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
  141. pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
  142. current->comm, folio_pfn(folio));
  143. dump_page(&folio->page, "still mapped when deleted");
  144. dump_stack();
  145. add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  146. if (mapping_exiting(mapping) && !folio_test_large(folio)) {
  147. int mapcount = folio_mapcount(folio);
  148. if (folio_ref_count(folio) >= mapcount + 2) {
  149. /*
  150. * All vmas have already been torn down, so it's
  151. * a good bet that actually the page is unmapped
  152. * and we'd rather not leak it: if we're wrong,
  153. * another bad page check should catch it later.
  154. */
  155. atomic_set(&folio->_mapcount, -1);
  156. folio_ref_sub(folio, mapcount);
  157. }
  158. }
  159. }
  160. /* hugetlb folios do not participate in page cache accounting. */
  161. if (folio_test_hugetlb(folio))
  162. return;
  163. nr = folio_nr_pages(folio);
  164. lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
  165. if (folio_test_swapbacked(folio)) {
  166. lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
  167. if (folio_test_pmd_mappable(folio))
  168. lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
  169. } else if (folio_test_pmd_mappable(folio)) {
  170. lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
  171. filemap_nr_thps_dec(mapping);
  172. }
  173. if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
  174. mod_node_page_state(folio_pgdat(folio),
  175. NR_KERNEL_FILE_PAGES, -nr);
  176. /*
  177. * At this point folio must be either written or cleaned by
  178. * truncate. Dirty folio here signals a bug and loss of
  179. * unwritten data - on ordinary filesystems.
  180. *
  181. * But it's harmless on in-memory filesystems like tmpfs; and can
  182. * occur when a driver which did get_user_pages() sets page dirty
  183. * before putting it, while the inode is being finally evicted.
  184. *
  185. * Below fixes dirty accounting after removing the folio entirely
  186. * but leaves the dirty flag set: it has no effect for truncated
  187. * folio and anyway will be cleared before returning folio to
  188. * buddy allocator.
  189. */
  190. if (WARN_ON_ONCE(folio_test_dirty(folio) &&
  191. mapping_can_writeback(mapping)))
  192. folio_account_cleaned(folio, inode_to_wb(mapping->host));
  193. }
  194. /*
  195. * Delete a page from the page cache and free it. Caller has to make
  196. * sure the page is locked and that nobody else uses it - or that usage
  197. * is safe. The caller must hold the i_pages lock.
  198. */
  199. void __filemap_remove_folio(struct folio *folio, void *shadow)
  200. {
  201. struct address_space *mapping = folio->mapping;
  202. trace_mm_filemap_delete_from_page_cache(folio);
  203. filemap_unaccount_folio(mapping, folio);
  204. page_cache_delete(mapping, folio, shadow);
  205. }
  206. void filemap_free_folio(struct address_space *mapping, struct folio *folio)
  207. {
  208. void (*free_folio)(struct folio *);
  209. free_folio = mapping->a_ops->free_folio;
  210. if (free_folio)
  211. free_folio(folio);
  212. folio_put_refs(folio, folio_nr_pages(folio));
  213. }
  214. /**
  215. * filemap_remove_folio - Remove folio from page cache.
  216. * @folio: The folio.
  217. *
  218. * This must be called only on folios that are locked and have been
  219. * verified to be in the page cache. It will never put the folio into
  220. * the free list because the caller has a reference on the page.
  221. */
  222. void filemap_remove_folio(struct folio *folio)
  223. {
  224. struct address_space *mapping = folio->mapping;
  225. BUG_ON(!folio_test_locked(folio));
  226. spin_lock(&mapping->host->i_lock);
  227. xa_lock_irq(&mapping->i_pages);
  228. __filemap_remove_folio(folio, NULL);
  229. xa_unlock_irq(&mapping->i_pages);
  230. if (mapping_shrinkable(mapping))
  231. inode_lru_list_add(mapping->host);
  232. spin_unlock(&mapping->host->i_lock);
  233. filemap_free_folio(mapping, folio);
  234. }
  235. /*
  236. * page_cache_delete_batch - delete several folios from page cache
  237. * @mapping: the mapping to which folios belong
  238. * @fbatch: batch of folios to delete
  239. *
  240. * The function walks over mapping->i_pages and removes folios passed in
  241. * @fbatch from the mapping. The function expects @fbatch to be sorted
  242. * by page index and is optimised for it to be dense.
  243. * It tolerates holes in @fbatch (mapping entries at those indices are not
  244. * modified).
  245. *
  246. * The function expects the i_pages lock to be held.
  247. */
  248. static void page_cache_delete_batch(struct address_space *mapping,
  249. struct folio_batch *fbatch)
  250. {
  251. XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
  252. long total_pages = 0;
  253. int i = 0;
  254. struct folio *folio;
  255. mapping_set_update(&xas, mapping);
  256. xas_for_each(&xas, folio, ULONG_MAX) {
  257. if (i >= folio_batch_count(fbatch))
  258. break;
  259. /* A swap/dax/shadow entry got inserted? Skip it. */
  260. if (xa_is_value(folio))
  261. continue;
  262. /*
  263. * A page got inserted in our range? Skip it. We have our
  264. * pages locked so they are protected from being removed.
  265. * If we see a page whose index is higher than ours, it
  266. * means our page has been removed, which shouldn't be
  267. * possible because we're holding the PageLock.
  268. */
  269. if (folio != fbatch->folios[i]) {
  270. VM_BUG_ON_FOLIO(folio->index >
  271. fbatch->folios[i]->index, folio);
  272. continue;
  273. }
  274. WARN_ON_ONCE(!folio_test_locked(folio));
  275. folio->mapping = NULL;
  276. /* Leave folio->index set: truncation lookup relies on it */
  277. i++;
  278. xas_store(&xas, NULL);
  279. total_pages += folio_nr_pages(folio);
  280. }
  281. mapping->nrpages -= total_pages;
  282. }
  283. void delete_from_page_cache_batch(struct address_space *mapping,
  284. struct folio_batch *fbatch)
  285. {
  286. int i;
  287. if (!folio_batch_count(fbatch))
  288. return;
  289. spin_lock(&mapping->host->i_lock);
  290. xa_lock_irq(&mapping->i_pages);
  291. for (i = 0; i < folio_batch_count(fbatch); i++) {
  292. struct folio *folio = fbatch->folios[i];
  293. trace_mm_filemap_delete_from_page_cache(folio);
  294. filemap_unaccount_folio(mapping, folio);
  295. }
  296. page_cache_delete_batch(mapping, fbatch);
  297. xa_unlock_irq(&mapping->i_pages);
  298. if (mapping_shrinkable(mapping))
  299. inode_lru_list_add(mapping->host);
  300. spin_unlock(&mapping->host->i_lock);
  301. for (i = 0; i < folio_batch_count(fbatch); i++)
  302. filemap_free_folio(mapping, fbatch->folios[i]);
  303. }
  304. int filemap_check_errors(struct address_space *mapping)
  305. {
  306. int ret = 0;
  307. /* Check for outstanding write errors */
  308. if (test_bit(AS_ENOSPC, &mapping->flags) &&
  309. test_and_clear_bit(AS_ENOSPC, &mapping->flags))
  310. ret = -ENOSPC;
  311. if (test_bit(AS_EIO, &mapping->flags) &&
  312. test_and_clear_bit(AS_EIO, &mapping->flags))
  313. ret = -EIO;
  314. return ret;
  315. }
  316. EXPORT_SYMBOL(filemap_check_errors);
  317. static int filemap_check_and_keep_errors(struct address_space *mapping)
  318. {
  319. /* Check for outstanding write errors */
  320. if (test_bit(AS_EIO, &mapping->flags))
  321. return -EIO;
  322. if (test_bit(AS_ENOSPC, &mapping->flags))
  323. return -ENOSPC;
  324. return 0;
  325. }
  326. static int filemap_writeback(struct address_space *mapping, loff_t start,
  327. loff_t end, enum writeback_sync_modes sync_mode,
  328. long *nr_to_write)
  329. {
  330. struct writeback_control wbc = {
  331. .sync_mode = sync_mode,
  332. .nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX,
  333. .range_start = start,
  334. .range_end = end,
  335. };
  336. int ret;
  337. if (!mapping_can_writeback(mapping) ||
  338. !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
  339. return 0;
  340. wbc_attach_fdatawrite_inode(&wbc, mapping->host);
  341. ret = do_writepages(mapping, &wbc);
  342. wbc_detach_inode(&wbc);
  343. if (!ret && nr_to_write)
  344. *nr_to_write = wbc.nr_to_write;
  345. return ret;
  346. }
  347. /**
  348. * filemap_fdatawrite_range - start writeback on mapping dirty pages in range
  349. * @mapping: address space structure to write
  350. * @start: offset in bytes where the range starts
  351. * @end: offset in bytes where the range ends (inclusive)
  352. *
  353. * Start writeback against all of a mapping's dirty pages that lie
  354. * within the byte offsets <start, end> inclusive.
  355. *
  356. * This is a data integrity operation that waits upon dirty or in writeback
  357. * pages.
  358. *
  359. * Return: %0 on success, negative error code otherwise.
  360. */
  361. int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
  362. loff_t end)
  363. {
  364. return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL);
  365. }
  366. EXPORT_SYMBOL(filemap_fdatawrite_range);
  367. int filemap_fdatawrite(struct address_space *mapping)
  368. {
  369. return filemap_fdatawrite_range(mapping, 0, LLONG_MAX);
  370. }
  371. EXPORT_SYMBOL(filemap_fdatawrite);
  372. /**
  373. * filemap_flush_range - start writeback on a range
  374. * @mapping: target address_space
  375. * @start: index to start writeback on
  376. * @end: last (inclusive) index for writeback
  377. *
  378. * This is a non-integrity writeback helper, to start writing back folios
  379. * for the indicated range.
  380. *
  381. * Return: %0 on success, negative error code otherwise.
  382. */
  383. int filemap_flush_range(struct address_space *mapping, loff_t start,
  384. loff_t end)
  385. {
  386. return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL);
  387. }
  388. EXPORT_SYMBOL_GPL(filemap_flush_range);
  389. /**
  390. * filemap_flush - mostly a non-blocking flush
  391. * @mapping: target address_space
  392. *
  393. * This is a mostly non-blocking flush. Not suitable for data-integrity
  394. * purposes - I/O may not be started against all dirty pages.
  395. *
  396. * Return: %0 on success, negative error code otherwise.
  397. */
  398. int filemap_flush(struct address_space *mapping)
  399. {
  400. return filemap_flush_range(mapping, 0, LLONG_MAX);
  401. }
  402. EXPORT_SYMBOL(filemap_flush);
  403. /*
  404. * Start writeback on @nr_to_write pages from @mapping. No one but the existing
  405. * btrfs caller should be using this. Talk to linux-mm if you think adding a
  406. * new caller is a good idea.
  407. */
  408. int filemap_flush_nr(struct address_space *mapping, long *nr_to_write)
  409. {
  410. return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE,
  411. nr_to_write);
  412. }
  413. EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs");
  414. /**
  415. * filemap_range_has_page - check if a page exists in range.
  416. * @mapping: address space within which to check
  417. * @start_byte: offset in bytes where the range starts
  418. * @end_byte: offset in bytes where the range ends (inclusive)
  419. *
  420. * Find at least one page in the range supplied, usually used to check if
  421. * direct writing in this range will trigger a writeback.
  422. *
  423. * Return: %true if at least one page exists in the specified range,
  424. * %false otherwise.
  425. */
  426. bool filemap_range_has_page(struct address_space *mapping,
  427. loff_t start_byte, loff_t end_byte)
  428. {
  429. struct folio *folio;
  430. XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
  431. pgoff_t max = end_byte >> PAGE_SHIFT;
  432. if (end_byte < start_byte)
  433. return false;
  434. rcu_read_lock();
  435. for (;;) {
  436. folio = xas_find(&xas, max);
  437. if (xas_retry(&xas, folio))
  438. continue;
  439. /* Shadow entries don't count */
  440. if (xa_is_value(folio))
  441. continue;
  442. /*
  443. * We don't need to try to pin this page; we're about to
  444. * release the RCU lock anyway. It is enough to know that
  445. * there was a page here recently.
  446. */
  447. break;
  448. }
  449. rcu_read_unlock();
  450. return folio != NULL;
  451. }
  452. EXPORT_SYMBOL(filemap_range_has_page);
  453. static void __filemap_fdatawait_range(struct address_space *mapping,
  454. loff_t start_byte, loff_t end_byte)
  455. {
  456. pgoff_t index = start_byte >> PAGE_SHIFT;
  457. pgoff_t end = end_byte >> PAGE_SHIFT;
  458. struct folio_batch fbatch;
  459. unsigned nr_folios;
  460. folio_batch_init(&fbatch);
  461. while (index <= end) {
  462. unsigned i;
  463. nr_folios = filemap_get_folios_tag(mapping, &index, end,
  464. PAGECACHE_TAG_WRITEBACK, &fbatch);
  465. if (!nr_folios)
  466. break;
  467. for (i = 0; i < nr_folios; i++) {
  468. struct folio *folio = fbatch.folios[i];
  469. folio_wait_writeback(folio);
  470. }
  471. folio_batch_release(&fbatch);
  472. cond_resched();
  473. }
  474. }
  475. /**
  476. * filemap_fdatawait_range - wait for writeback to complete
  477. * @mapping: address space structure to wait for
  478. * @start_byte: offset in bytes where the range starts
  479. * @end_byte: offset in bytes where the range ends (inclusive)
  480. *
  481. * Walk the list of under-writeback pages of the given address space
  482. * in the given range and wait for all of them. Check error status of
  483. * the address space and return it.
  484. *
  485. * Since the error status of the address space is cleared by this function,
  486. * callers are responsible for checking the return value and handling and/or
  487. * reporting the error.
  488. *
  489. * Return: error status of the address space.
  490. */
  491. int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
  492. loff_t end_byte)
  493. {
  494. __filemap_fdatawait_range(mapping, start_byte, end_byte);
  495. return filemap_check_errors(mapping);
  496. }
  497. EXPORT_SYMBOL(filemap_fdatawait_range);
  498. /**
  499. * filemap_fdatawait_range_keep_errors - wait for writeback to complete
  500. * @mapping: address space structure to wait for
  501. * @start_byte: offset in bytes where the range starts
  502. * @end_byte: offset in bytes where the range ends (inclusive)
  503. *
  504. * Walk the list of under-writeback pages of the given address space in the
  505. * given range and wait for all of them. Unlike filemap_fdatawait_range(),
  506. * this function does not clear error status of the address space.
  507. *
  508. * Use this function if callers don't handle errors themselves. Expected
  509. * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
  510. * fsfreeze(8)
  511. */
  512. int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
  513. loff_t start_byte, loff_t end_byte)
  514. {
  515. __filemap_fdatawait_range(mapping, start_byte, end_byte);
  516. return filemap_check_and_keep_errors(mapping);
  517. }
  518. EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
  519. /**
  520. * file_fdatawait_range - wait for writeback to complete
  521. * @file: file pointing to address space structure to wait for
  522. * @start_byte: offset in bytes where the range starts
  523. * @end_byte: offset in bytes where the range ends (inclusive)
  524. *
  525. * Walk the list of under-writeback pages of the address space that file
  526. * refers to, in the given range and wait for all of them. Check error
  527. * status of the address space vs. the file->f_wb_err cursor and return it.
  528. *
  529. * Since the error status of the file is advanced by this function,
  530. * callers are responsible for checking the return value and handling and/or
  531. * reporting the error.
  532. *
  533. * Return: error status of the address space vs. the file->f_wb_err cursor.
  534. */
  535. int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
  536. {
  537. struct address_space *mapping = file->f_mapping;
  538. __filemap_fdatawait_range(mapping, start_byte, end_byte);
  539. return file_check_and_advance_wb_err(file);
  540. }
  541. EXPORT_SYMBOL(file_fdatawait_range);
  542. /**
  543. * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
  544. * @mapping: address space structure to wait for
  545. *
  546. * Walk the list of under-writeback pages of the given address space
  547. * and wait for all of them. Unlike filemap_fdatawait(), this function
  548. * does not clear error status of the address space.
  549. *
  550. * Use this function if callers don't handle errors themselves. Expected
  551. * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
  552. * fsfreeze(8)
  553. *
  554. * Return: error status of the address space.
  555. */
  556. int filemap_fdatawait_keep_errors(struct address_space *mapping)
  557. {
  558. __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
  559. return filemap_check_and_keep_errors(mapping);
  560. }
  561. EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
  562. /* Returns true if writeback might be needed or already in progress. */
  563. static bool mapping_needs_writeback(struct address_space *mapping)
  564. {
  565. return mapping->nrpages;
  566. }
  567. bool filemap_range_has_writeback(struct address_space *mapping,
  568. loff_t start_byte, loff_t end_byte)
  569. {
  570. XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
  571. pgoff_t max = end_byte >> PAGE_SHIFT;
  572. struct folio *folio;
  573. if (end_byte < start_byte)
  574. return false;
  575. rcu_read_lock();
  576. xas_for_each(&xas, folio, max) {
  577. if (xas_retry(&xas, folio))
  578. continue;
  579. if (xa_is_value(folio))
  580. continue;
  581. if (folio_test_dirty(folio) || folio_test_locked(folio) ||
  582. folio_test_writeback(folio))
  583. break;
  584. }
  585. rcu_read_unlock();
  586. return folio != NULL;
  587. }
  588. EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
  589. /**
  590. * filemap_write_and_wait_range - write out & wait on a file range
  591. * @mapping: the address_space for the pages
  592. * @lstart: offset in bytes where the range starts
  593. * @lend: offset in bytes where the range ends (inclusive)
  594. *
  595. * Write out and wait upon file offsets lstart->lend, inclusive.
  596. *
  597. * Note that @lend is inclusive (describes the last byte to be written) so
  598. * that this function can be used to write to the very end-of-file (end = -1).
  599. *
  600. * Return: error status of the address space.
  601. */
  602. int filemap_write_and_wait_range(struct address_space *mapping,
  603. loff_t lstart, loff_t lend)
  604. {
  605. int err = 0, err2;
  606. if (lend < lstart)
  607. return 0;
  608. if (mapping_needs_writeback(mapping)) {
  609. err = filemap_fdatawrite_range(mapping, lstart, lend);
  610. /*
  611. * Even if the above returned error, the pages may be
  612. * written partially (e.g. -ENOSPC), so we wait for it.
  613. * But the -EIO is special case, it may indicate the worst
  614. * thing (e.g. bug) happened, so we avoid waiting for it.
  615. */
  616. if (err != -EIO)
  617. __filemap_fdatawait_range(mapping, lstart, lend);
  618. }
  619. err2 = filemap_check_errors(mapping);
  620. if (!err)
  621. err = err2;
  622. return err;
  623. }
  624. EXPORT_SYMBOL(filemap_write_and_wait_range);
  625. void __filemap_set_wb_err(struct address_space *mapping, int err)
  626. {
  627. errseq_t eseq = errseq_set(&mapping->wb_err, err);
  628. trace_filemap_set_wb_err(mapping, eseq);
  629. }
  630. EXPORT_SYMBOL(__filemap_set_wb_err);
  631. /**
  632. * file_check_and_advance_wb_err - report wb error (if any) that was previously
  633. * and advance wb_err to current one
  634. * @file: struct file on which the error is being reported
  635. *
  636. * When userland calls fsync (or something like nfsd does the equivalent), we
  637. * want to report any writeback errors that occurred since the last fsync (or
  638. * since the file was opened if there haven't been any).
  639. *
  640. * Grab the wb_err from the mapping. If it matches what we have in the file,
  641. * then just quickly return 0. The file is all caught up.
  642. *
  643. * If it doesn't match, then take the mapping value, set the "seen" flag in
  644. * it and try to swap it into place. If it works, or another task beat us
  645. * to it with the new value, then update the f_wb_err and return the error
  646. * portion. The error at this point must be reported via proper channels
  647. * (a'la fsync, or NFS COMMIT operation, etc.).
  648. *
  649. * While we handle mapping->wb_err with atomic operations, the f_wb_err
  650. * value is protected by the f_lock since we must ensure that it reflects
  651. * the latest value swapped in for this file descriptor.
  652. *
  653. * Return: %0 on success, negative error code otherwise.
  654. */
  655. int file_check_and_advance_wb_err(struct file *file)
  656. {
  657. int err = 0;
  658. errseq_t old = READ_ONCE(file->f_wb_err);
  659. struct address_space *mapping = file->f_mapping;
  660. /* Locklessly handle the common case where nothing has changed */
  661. if (errseq_check(&mapping->wb_err, old)) {
  662. /* Something changed, must use slow path */
  663. spin_lock(&file->f_lock);
  664. old = file->f_wb_err;
  665. err = errseq_check_and_advance(&mapping->wb_err,
  666. &file->f_wb_err);
  667. trace_file_check_and_advance_wb_err(file, old);
  668. spin_unlock(&file->f_lock);
  669. }
  670. /*
  671. * We're mostly using this function as a drop in replacement for
  672. * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
  673. * that the legacy code would have had on these flags.
  674. */
  675. clear_bit(AS_EIO, &mapping->flags);
  676. clear_bit(AS_ENOSPC, &mapping->flags);
  677. return err;
  678. }
  679. EXPORT_SYMBOL(file_check_and_advance_wb_err);
  680. /**
  681. * file_write_and_wait_range - write out & wait on a file range
  682. * @file: file pointing to address_space with pages
  683. * @lstart: offset in bytes where the range starts
  684. * @lend: offset in bytes where the range ends (inclusive)
  685. *
  686. * Write out and wait upon file offsets lstart->lend, inclusive.
  687. *
  688. * Note that @lend is inclusive (describes the last byte to be written) so
  689. * that this function can be used to write to the very end-of-file (end = -1).
  690. *
  691. * After writing out and waiting on the data, we check and advance the
  692. * f_wb_err cursor to the latest value, and return any errors detected there.
  693. *
  694. * Return: %0 on success, negative error code otherwise.
  695. */
  696. int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
  697. {
  698. int err = 0, err2;
  699. struct address_space *mapping = file->f_mapping;
  700. if (lend < lstart)
  701. return 0;
  702. if (mapping_needs_writeback(mapping)) {
  703. err = filemap_fdatawrite_range(mapping, lstart, lend);
  704. /* See comment of filemap_write_and_wait() */
  705. if (err != -EIO)
  706. __filemap_fdatawait_range(mapping, lstart, lend);
  707. }
  708. err2 = file_check_and_advance_wb_err(file);
  709. if (!err)
  710. err = err2;
  711. return err;
  712. }
  713. EXPORT_SYMBOL(file_write_and_wait_range);
  714. /**
  715. * replace_page_cache_folio - replace a pagecache folio with a new one
  716. * @old: folio to be replaced
  717. * @new: folio to replace with
  718. *
  719. * This function replaces a folio in the pagecache with a new one. On
  720. * success it acquires the pagecache reference for the new folio and
  721. * drops it for the old folio. Both the old and new folios must be
  722. * locked. This function does not add the new folio to the LRU, the
  723. * caller must do that.
  724. *
  725. * The remove + add is atomic. This function cannot fail.
  726. */
  727. void replace_page_cache_folio(struct folio *old, struct folio *new)
  728. {
  729. struct address_space *mapping = old->mapping;
  730. void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
  731. pgoff_t offset = old->index;
  732. XA_STATE(xas, &mapping->i_pages, offset);
  733. VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
  734. VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
  735. VM_BUG_ON_FOLIO(new->mapping, new);
  736. folio_get(new);
  737. new->mapping = mapping;
  738. new->index = offset;
  739. mem_cgroup_replace_folio(old, new);
  740. xas_lock_irq(&xas);
  741. xas_store(&xas, new);
  742. old->mapping = NULL;
  743. /* hugetlb pages do not participate in page cache accounting. */
  744. if (!folio_test_hugetlb(old))
  745. lruvec_stat_sub_folio(old, NR_FILE_PAGES);
  746. if (!folio_test_hugetlb(new))
  747. lruvec_stat_add_folio(new, NR_FILE_PAGES);
  748. if (folio_test_swapbacked(old))
  749. lruvec_stat_sub_folio(old, NR_SHMEM);
  750. if (folio_test_swapbacked(new))
  751. lruvec_stat_add_folio(new, NR_SHMEM);
  752. xas_unlock_irq(&xas);
  753. if (free_folio)
  754. free_folio(old);
  755. folio_put(old);
  756. }
  757. EXPORT_SYMBOL_GPL(replace_page_cache_folio);
  758. noinline int __filemap_add_folio(struct address_space *mapping,
  759. struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
  760. {
  761. XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
  762. bool huge;
  763. long nr;
  764. unsigned int forder = folio_order(folio);
  765. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  766. VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
  767. VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
  768. folio);
  769. mapping_set_update(&xas, mapping);
  770. VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
  771. huge = folio_test_hugetlb(folio);
  772. nr = folio_nr_pages(folio);
  773. gfp &= GFP_RECLAIM_MASK;
  774. folio_ref_add(folio, nr);
  775. folio->mapping = mapping;
  776. folio->index = xas.xa_index;
  777. for (;;) {
  778. int order = -1;
  779. void *entry, *old = NULL;
  780. xas_lock_irq(&xas);
  781. xas_for_each_conflict(&xas, entry) {
  782. old = entry;
  783. if (!xa_is_value(entry)) {
  784. xas_set_err(&xas, -EEXIST);
  785. goto unlock;
  786. }
  787. /*
  788. * If a larger entry exists,
  789. * it will be the first and only entry iterated.
  790. */
  791. if (order == -1)
  792. order = xas_get_order(&xas);
  793. }
  794. if (old) {
  795. if (order > 0 && order > forder) {
  796. unsigned int split_order = max(forder,
  797. xas_try_split_min_order(order));
  798. /* How to handle large swap entries? */
  799. BUG_ON(shmem_mapping(mapping));
  800. while (order > forder) {
  801. xas_set_order(&xas, index, split_order);
  802. xas_try_split(&xas, old, order);
  803. if (xas_error(&xas))
  804. goto unlock;
  805. order = split_order;
  806. split_order =
  807. max(xas_try_split_min_order(
  808. split_order),
  809. forder);
  810. }
  811. xas_reset(&xas);
  812. }
  813. if (shadowp)
  814. *shadowp = old;
  815. }
  816. xas_store(&xas, folio);
  817. if (xas_error(&xas))
  818. goto unlock;
  819. mapping->nrpages += nr;
  820. /* hugetlb pages do not participate in page cache accounting */
  821. if (!huge) {
  822. lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
  823. if (folio_test_pmd_mappable(folio))
  824. lruvec_stat_mod_folio(folio,
  825. NR_FILE_THPS, nr);
  826. }
  827. unlock:
  828. xas_unlock_irq(&xas);
  829. if (!xas_nomem(&xas, gfp))
  830. break;
  831. }
  832. if (xas_error(&xas))
  833. goto error;
  834. trace_mm_filemap_add_to_page_cache(folio);
  835. return 0;
  836. error:
  837. folio->mapping = NULL;
  838. /* Leave folio->index set: truncation relies upon it */
  839. folio_put_refs(folio, nr);
  840. return xas_error(&xas);
  841. }
  842. ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
  843. int filemap_add_folio(struct address_space *mapping, struct folio *folio,
  844. pgoff_t index, gfp_t gfp)
  845. {
  846. void *shadow = NULL;
  847. int ret;
  848. struct mem_cgroup *tmp;
  849. bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags);
  850. if (kernel_file)
  851. tmp = set_active_memcg(root_mem_cgroup);
  852. ret = mem_cgroup_charge(folio, NULL, gfp);
  853. if (kernel_file)
  854. set_active_memcg(tmp);
  855. if (ret)
  856. return ret;
  857. __folio_set_locked(folio);
  858. ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
  859. if (unlikely(ret)) {
  860. mem_cgroup_uncharge(folio);
  861. __folio_clear_locked(folio);
  862. } else {
  863. /*
  864. * The folio might have been evicted from cache only
  865. * recently, in which case it should be activated like
  866. * any other repeatedly accessed folio.
  867. * The exception is folios getting rewritten; evicting other
  868. * data from the working set, only to cache data that will
  869. * get overwritten with something else, is a waste of memory.
  870. */
  871. WARN_ON_ONCE(folio_test_active(folio));
  872. if (!(gfp & __GFP_WRITE) && shadow)
  873. workingset_refault(folio, shadow);
  874. folio_add_lru(folio);
  875. if (kernel_file)
  876. mod_node_page_state(folio_pgdat(folio),
  877. NR_KERNEL_FILE_PAGES,
  878. folio_nr_pages(folio));
  879. }
  880. return ret;
  881. }
  882. EXPORT_SYMBOL_GPL(filemap_add_folio);
  883. #ifdef CONFIG_NUMA
  884. struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
  885. struct mempolicy *policy)
  886. {
  887. int n;
  888. struct folio *folio;
  889. if (policy)
  890. return folio_alloc_mpol_noprof(gfp, order, policy,
  891. NO_INTERLEAVE_INDEX, numa_node_id());
  892. if (cpuset_do_page_mem_spread()) {
  893. unsigned int cpuset_mems_cookie;
  894. do {
  895. cpuset_mems_cookie = read_mems_allowed_begin();
  896. n = cpuset_mem_spread_node();
  897. folio = __folio_alloc_node_noprof(gfp, order, n);
  898. } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
  899. return folio;
  900. }
  901. return folio_alloc_noprof(gfp, order);
  902. }
  903. EXPORT_SYMBOL(filemap_alloc_folio_noprof);
  904. #endif
  905. /*
  906. * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
  907. *
  908. * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
  909. *
  910. * @mapping1: the first mapping to lock
  911. * @mapping2: the second mapping to lock
  912. */
  913. void filemap_invalidate_lock_two(struct address_space *mapping1,
  914. struct address_space *mapping2)
  915. {
  916. if (mapping1 > mapping2)
  917. swap(mapping1, mapping2);
  918. if (mapping1)
  919. down_write(&mapping1->invalidate_lock);
  920. if (mapping2 && mapping1 != mapping2)
  921. down_write_nested(&mapping2->invalidate_lock, 1);
  922. }
  923. EXPORT_SYMBOL(filemap_invalidate_lock_two);
  924. /*
  925. * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
  926. *
  927. * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
  928. *
  929. * @mapping1: the first mapping to unlock
  930. * @mapping2: the second mapping to unlock
  931. */
  932. void filemap_invalidate_unlock_two(struct address_space *mapping1,
  933. struct address_space *mapping2)
  934. {
  935. if (mapping1)
  936. up_write(&mapping1->invalidate_lock);
  937. if (mapping2 && mapping1 != mapping2)
  938. up_write(&mapping2->invalidate_lock);
  939. }
  940. EXPORT_SYMBOL(filemap_invalidate_unlock_two);
  941. /*
  942. * In order to wait for pages to become available there must be
  943. * waitqueues associated with pages. By using a hash table of
  944. * waitqueues where the bucket discipline is to maintain all
  945. * waiters on the same queue and wake all when any of the pages
  946. * become available, and for the woken contexts to check to be
  947. * sure the appropriate page became available, this saves space
  948. * at a cost of "thundering herd" phenomena during rare hash
  949. * collisions.
  950. */
  951. #define PAGE_WAIT_TABLE_BITS 8
  952. #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
  953. static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
  954. static wait_queue_head_t *folio_waitqueue(struct folio *folio)
  955. {
  956. return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
  957. }
  958. /* How many times do we accept lock stealing from under a waiter? */
  959. static int sysctl_page_lock_unfairness = 5;
  960. static const struct ctl_table filemap_sysctl_table[] = {
  961. {
  962. .procname = "page_lock_unfairness",
  963. .data = &sysctl_page_lock_unfairness,
  964. .maxlen = sizeof(sysctl_page_lock_unfairness),
  965. .mode = 0644,
  966. .proc_handler = proc_dointvec_minmax,
  967. .extra1 = SYSCTL_ZERO,
  968. }
  969. };
  970. void __init pagecache_init(void)
  971. {
  972. int i;
  973. for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
  974. init_waitqueue_head(&folio_wait_table[i]);
  975. page_writeback_init();
  976. register_sysctl_init("vm", filemap_sysctl_table);
  977. }
  978. /*
  979. * The page wait code treats the "wait->flags" somewhat unusually, because
  980. * we have multiple different kinds of waits, not just the usual "exclusive"
  981. * one.
  982. *
  983. * We have:
  984. *
  985. * (a) no special bits set:
  986. *
  987. * We're just waiting for the bit to be released, and when a waker
  988. * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
  989. * and remove it from the wait queue.
  990. *
  991. * Simple and straightforward.
  992. *
  993. * (b) WQ_FLAG_EXCLUSIVE:
  994. *
  995. * The waiter is waiting to get the lock, and only one waiter should
  996. * be woken up to avoid any thundering herd behavior. We'll set the
  997. * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
  998. *
  999. * This is the traditional exclusive wait.
  1000. *
  1001. * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
  1002. *
  1003. * The waiter is waiting to get the bit, and additionally wants the
  1004. * lock to be transferred to it for fair lock behavior. If the lock
  1005. * cannot be taken, we stop walking the wait queue without waking
  1006. * the waiter.
  1007. *
  1008. * This is the "fair lock handoff" case, and in addition to setting
  1009. * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
  1010. * that it now has the lock.
  1011. */
  1012. static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
  1013. {
  1014. unsigned int flags;
  1015. struct wait_page_key *key = arg;
  1016. struct wait_page_queue *wait_page
  1017. = container_of(wait, struct wait_page_queue, wait);
  1018. if (!wake_page_match(wait_page, key))
  1019. return 0;
  1020. /*
  1021. * If it's a lock handoff wait, we get the bit for it, and
  1022. * stop walking (and do not wake it up) if we can't.
  1023. */
  1024. flags = wait->flags;
  1025. if (flags & WQ_FLAG_EXCLUSIVE) {
  1026. if (test_bit(key->bit_nr, &key->folio->flags.f))
  1027. return -1;
  1028. if (flags & WQ_FLAG_CUSTOM) {
  1029. if (test_and_set_bit(key->bit_nr, &key->folio->flags.f))
  1030. return -1;
  1031. flags |= WQ_FLAG_DONE;
  1032. }
  1033. }
  1034. /*
  1035. * We are holding the wait-queue lock, but the waiter that
  1036. * is waiting for this will be checking the flags without
  1037. * any locking.
  1038. *
  1039. * So update the flags atomically, and wake up the waiter
  1040. * afterwards to avoid any races. This store-release pairs
  1041. * with the load-acquire in folio_wait_bit_common().
  1042. */
  1043. smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
  1044. wake_up_state(wait->private, mode);
  1045. /*
  1046. * Ok, we have successfully done what we're waiting for,
  1047. * and we can unconditionally remove the wait entry.
  1048. *
  1049. * Note that this pairs with the "finish_wait()" in the
  1050. * waiter, and has to be the absolute last thing we do.
  1051. * After this list_del_init(&wait->entry) the wait entry
  1052. * might be de-allocated and the process might even have
  1053. * exited.
  1054. */
  1055. list_del_init_careful(&wait->entry);
  1056. return (flags & WQ_FLAG_EXCLUSIVE) != 0;
  1057. }
  1058. static void folio_wake_bit(struct folio *folio, int bit_nr)
  1059. {
  1060. wait_queue_head_t *q = folio_waitqueue(folio);
  1061. struct wait_page_key key;
  1062. unsigned long flags;
  1063. key.folio = folio;
  1064. key.bit_nr = bit_nr;
  1065. key.page_match = 0;
  1066. spin_lock_irqsave(&q->lock, flags);
  1067. __wake_up_locked_key(q, TASK_NORMAL, &key);
  1068. /*
  1069. * It's possible to miss clearing waiters here, when we woke our page
  1070. * waiters, but the hashed waitqueue has waiters for other pages on it.
  1071. * That's okay, it's a rare case. The next waker will clear it.
  1072. *
  1073. * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
  1074. * other), the flag may be cleared in the course of freeing the page;
  1075. * but that is not required for correctness.
  1076. */
  1077. if (!waitqueue_active(q) || !key.page_match)
  1078. folio_clear_waiters(folio);
  1079. spin_unlock_irqrestore(&q->lock, flags);
  1080. }
  1081. /*
  1082. * A choice of three behaviors for folio_wait_bit_common():
  1083. */
  1084. enum behavior {
  1085. EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
  1086. * __folio_lock() waiting on then setting PG_locked.
  1087. */
  1088. SHARED, /* Hold ref to page and check the bit when woken, like
  1089. * folio_wait_writeback() waiting on PG_writeback.
  1090. */
  1091. DROP, /* Drop ref to page before wait, no check when woken,
  1092. * like folio_put_wait_locked() on PG_locked.
  1093. */
  1094. };
  1095. /*
  1096. * Attempt to check (or get) the folio flag, and mark us done
  1097. * if successful.
  1098. */
  1099. static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
  1100. struct wait_queue_entry *wait)
  1101. {
  1102. if (wait->flags & WQ_FLAG_EXCLUSIVE) {
  1103. if (test_and_set_bit(bit_nr, &folio->flags.f))
  1104. return false;
  1105. } else if (test_bit(bit_nr, &folio->flags.f))
  1106. return false;
  1107. wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
  1108. return true;
  1109. }
  1110. static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
  1111. int state, enum behavior behavior)
  1112. {
  1113. wait_queue_head_t *q = folio_waitqueue(folio);
  1114. int unfairness = sysctl_page_lock_unfairness;
  1115. struct wait_page_queue wait_page;
  1116. wait_queue_entry_t *wait = &wait_page.wait;
  1117. bool thrashing = false;
  1118. unsigned long pflags;
  1119. bool in_thrashing;
  1120. if (bit_nr == PG_locked &&
  1121. !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
  1122. delayacct_thrashing_start(&in_thrashing);
  1123. psi_memstall_enter(&pflags);
  1124. thrashing = true;
  1125. }
  1126. init_wait(wait);
  1127. wait->func = wake_page_function;
  1128. wait_page.folio = folio;
  1129. wait_page.bit_nr = bit_nr;
  1130. repeat:
  1131. wait->flags = 0;
  1132. if (behavior == EXCLUSIVE) {
  1133. wait->flags = WQ_FLAG_EXCLUSIVE;
  1134. if (--unfairness < 0)
  1135. wait->flags |= WQ_FLAG_CUSTOM;
  1136. }
  1137. /*
  1138. * Do one last check whether we can get the
  1139. * page bit synchronously.
  1140. *
  1141. * Do the folio_set_waiters() marking before that
  1142. * to let any waker we _just_ missed know they
  1143. * need to wake us up (otherwise they'll never
  1144. * even go to the slow case that looks at the
  1145. * page queue), and add ourselves to the wait
  1146. * queue if we need to sleep.
  1147. *
  1148. * This part needs to be done under the queue
  1149. * lock to avoid races.
  1150. */
  1151. spin_lock_irq(&q->lock);
  1152. folio_set_waiters(folio);
  1153. if (!folio_trylock_flag(folio, bit_nr, wait))
  1154. __add_wait_queue_entry_tail(q, wait);
  1155. spin_unlock_irq(&q->lock);
  1156. /*
  1157. * From now on, all the logic will be based on
  1158. * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
  1159. * see whether the page bit testing has already
  1160. * been done by the wake function.
  1161. *
  1162. * We can drop our reference to the folio.
  1163. */
  1164. if (behavior == DROP)
  1165. folio_put(folio);
  1166. /*
  1167. * Note that until the "finish_wait()", or until
  1168. * we see the WQ_FLAG_WOKEN flag, we need to
  1169. * be very careful with the 'wait->flags', because
  1170. * we may race with a waker that sets them.
  1171. */
  1172. for (;;) {
  1173. unsigned int flags;
  1174. set_current_state(state);
  1175. /* Loop until we've been woken or interrupted */
  1176. flags = smp_load_acquire(&wait->flags);
  1177. if (!(flags & WQ_FLAG_WOKEN)) {
  1178. if (signal_pending_state(state, current))
  1179. break;
  1180. io_schedule();
  1181. continue;
  1182. }
  1183. /* If we were non-exclusive, we're done */
  1184. if (behavior != EXCLUSIVE)
  1185. break;
  1186. /* If the waker got the lock for us, we're done */
  1187. if (flags & WQ_FLAG_DONE)
  1188. break;
  1189. /*
  1190. * Otherwise, if we're getting the lock, we need to
  1191. * try to get it ourselves.
  1192. *
  1193. * And if that fails, we'll have to retry this all.
  1194. */
  1195. if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
  1196. goto repeat;
  1197. wait->flags |= WQ_FLAG_DONE;
  1198. break;
  1199. }
  1200. /*
  1201. * If a signal happened, this 'finish_wait()' may remove the last
  1202. * waiter from the wait-queues, but the folio waiters bit will remain
  1203. * set. That's ok. The next wakeup will take care of it, and trying
  1204. * to do it here would be difficult and prone to races.
  1205. */
  1206. finish_wait(q, wait);
  1207. if (thrashing) {
  1208. delayacct_thrashing_end(&in_thrashing);
  1209. psi_memstall_leave(&pflags);
  1210. }
  1211. /*
  1212. * NOTE! The wait->flags weren't stable until we've done the
  1213. * 'finish_wait()', and we could have exited the loop above due
  1214. * to a signal, and had a wakeup event happen after the signal
  1215. * test but before the 'finish_wait()'.
  1216. *
  1217. * So only after the finish_wait() can we reliably determine
  1218. * if we got woken up or not, so we can now figure out the final
  1219. * return value based on that state without races.
  1220. *
  1221. * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
  1222. * waiter, but an exclusive one requires WQ_FLAG_DONE.
  1223. */
  1224. if (behavior == EXCLUSIVE)
  1225. return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
  1226. return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
  1227. }
  1228. #ifdef CONFIG_MIGRATION
/**
 * softleaf_entry_wait_on_locked - Wait for a migration entry or
 * device_private entry to be removed.
 * @entry: migration or device_private swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given page, or device_private
 * entry referencing a device_private page to be unlocked. This is
 * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
 * should be called while holding the ptl for @entry referencing
 * the page.
 *
 * Returns after unlocking the ptl.
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
	__releases(ptl)
{
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;
	wait_queue_head_t *q;
	struct folio *folio = softleaf_to_folio(entry);

	q = folio_waitqueue(folio);
	/*
	 * A folio that is not uptodate but is in the workingset is accounted
	 * as thrashing / memory stall for the duration of the wait.
	 */
	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	/* On-stack wait entry keyed on (folio, PG_locked). */
	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = PG_locked;
	wait->flags = 0;

	/*
	 * Under the queue lock: mark the folio as having waiters, do a last
	 * synchronous check of the bit, and only queue ourselves if that
	 * check says we still have to sleep.
	 */
	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, PG_locked, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * If a migration entry exists for the page the migration path must hold
	 * a valid reference to the page, and it must take the ptl to remove the
	 * migration entry. So the page is valid until the ptl is dropped.
	 * Similarly any path attempting to drop the last reference to a
	 * device-private page needs to grab the ptl to remove the device-private
	 * entry.
	 */
	spin_unlock(ptl);

	/* From here on only 'wait' and 'q' are used; 'folio' is not touched. */
	for (;;) {
		unsigned int flags;

		/* Set the task state before sampling the flags. */
		set_current_state(TASK_UNINTERRUPTIBLE);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			/*
			 * NOTE(review): presumably this never fires for
			 * TASK_UNINTERRUPTIBLE and is kept only to mirror
			 * folio_wait_bit_common() - confirm.
			 */
			if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
				break;

			io_schedule();
			continue;
		}
		break;
	}

	finish_wait(q, wait);

	/* Close the thrashing/memstall section opened above, if any. */
	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
}
  1301. #endif
  1302. void folio_wait_bit(struct folio *folio, int bit_nr)
  1303. {
  1304. folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
  1305. }
  1306. EXPORT_SYMBOL(folio_wait_bit);
  1307. int folio_wait_bit_killable(struct folio *folio, int bit_nr)
  1308. {
  1309. return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
  1310. }
  1311. EXPORT_SYMBOL(folio_wait_bit_killable);
  1312. /**
  1313. * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
  1314. * @folio: The folio to wait for.
  1315. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
  1316. *
  1317. * The caller should hold a reference on @folio. They expect the page to
  1318. * become unlocked relatively soon, but do not wish to hold up migration
  1319. * (for example) by holding the reference while waiting for the folio to
  1320. * come unlocked. After this function returns, the caller should not
  1321. * dereference @folio.
  1322. *
  1323. * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
  1324. */
  1325. static int folio_put_wait_locked(struct folio *folio, int state)
  1326. {
  1327. return folio_wait_bit_common(folio, PG_locked, state, DROP);
  1328. }
  1329. /**
  1330. * folio_unlock - Unlock a locked folio.
  1331. * @folio: The folio.
  1332. *
  1333. * Unlocks the folio and wakes up any thread sleeping on the page lock.
  1334. *
  1335. * Context: May be called from interrupt or process context. May not be
  1336. * called from NMI context.
  1337. */
  1338. void folio_unlock(struct folio *folio)
  1339. {
  1340. /* Bit 7 allows x86 to check the byte's sign bit */
  1341. BUILD_BUG_ON(PG_waiters != 7);
  1342. BUILD_BUG_ON(PG_locked > 7);
  1343. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  1344. if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
  1345. folio_wake_bit(folio, PG_locked);
  1346. }
  1347. EXPORT_SYMBOL(folio_unlock);
  1348. /**
  1349. * folio_end_read - End read on a folio.
  1350. * @folio: The folio.
  1351. * @success: True if all reads completed successfully.
  1352. *
  1353. * When all reads against a folio have completed, filesystems should
  1354. * call this function to let the pagecache know that no more reads
  1355. * are outstanding. This will unlock the folio and wake up any thread
  1356. * sleeping on the lock. The folio will also be marked uptodate if all
  1357. * reads succeeded.
  1358. *
  1359. * Context: May be called from interrupt or process context. May not be
  1360. * called from NMI context.
  1361. */
  1362. void folio_end_read(struct folio *folio, bool success)
  1363. {
  1364. unsigned long mask = 1 << PG_locked;
  1365. /* Must be in bottom byte for x86 to work */
  1366. BUILD_BUG_ON(PG_uptodate > 7);
  1367. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  1368. VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);
  1369. if (likely(success))
  1370. mask |= 1 << PG_uptodate;
  1371. if (folio_xor_flags_has_waiters(folio, mask))
  1372. folio_wake_bit(folio, PG_locked);
  1373. }
  1374. EXPORT_SYMBOL(folio_end_read);
  1375. /**
  1376. * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
  1377. * @folio: The folio.
  1378. *
  1379. * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
  1380. * it. The folio reference held for PG_private_2 being set is released.
  1381. *
  1382. * This is, for example, used when a netfs folio is being written to a local
  1383. * disk cache, thereby allowing writes to the cache for the same folio to be
  1384. * serialised.
  1385. */
  1386. void folio_end_private_2(struct folio *folio)
  1387. {
  1388. VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
  1389. clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
  1390. folio_wake_bit(folio, PG_private_2);
  1391. folio_put(folio);
  1392. }
  1393. EXPORT_SYMBOL(folio_end_private_2);
  1394. /**
  1395. * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
  1396. * @folio: The folio to wait on.
  1397. *
  1398. * Wait for PG_private_2 to be cleared on a folio.
  1399. */
  1400. void folio_wait_private_2(struct folio *folio)
  1401. {
  1402. while (folio_test_private_2(folio))
  1403. folio_wait_bit(folio, PG_private_2);
  1404. }
  1405. EXPORT_SYMBOL(folio_wait_private_2);
  1406. /**
  1407. * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
  1408. * @folio: The folio to wait on.
  1409. *
  1410. * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
  1411. * received by the calling task.
  1412. *
  1413. * Return:
  1414. * - 0 if successful.
  1415. * - -EINTR if a fatal signal was encountered.
  1416. */
  1417. int folio_wait_private_2_killable(struct folio *folio)
  1418. {
  1419. int ret = 0;
  1420. while (folio_test_private_2(folio)) {
  1421. ret = folio_wait_bit_killable(folio, PG_private_2);
  1422. if (ret < 0)
  1423. break;
  1424. }
  1425. return ret;
  1426. }
  1427. EXPORT_SYMBOL(folio_wait_private_2_killable);
  1428. static void filemap_end_dropbehind(struct folio *folio)
  1429. {
  1430. struct address_space *mapping = folio->mapping;
  1431. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  1432. if (folio_test_writeback(folio) || folio_test_dirty(folio))
  1433. return;
  1434. if (!folio_test_clear_dropbehind(folio))
  1435. return;
  1436. if (mapping)
  1437. folio_unmap_invalidate(mapping, folio, 0);
  1438. }
  1439. /*
  1440. * If folio was marked as dropbehind, then pages should be dropped when writeback
  1441. * completes. Do that now. If we fail, it's likely because of a big folio -
  1442. * just reset dropbehind for that case and latter completions should invalidate.
  1443. */
  1444. void folio_end_dropbehind(struct folio *folio)
  1445. {
  1446. if (!folio_test_dropbehind(folio))
  1447. return;
  1448. /*
  1449. * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
  1450. * but can happen if normal writeback just happens to find dirty folios
  1451. * that were created as part of uncached writeback, and that writeback
  1452. * would otherwise not need non-IRQ handling. Just skip the
  1453. * invalidation in that case.
  1454. */
  1455. if (in_task() && folio_trylock(folio)) {
  1456. filemap_end_dropbehind(folio);
  1457. folio_unlock(folio);
  1458. }
  1459. }
  1460. EXPORT_SYMBOL_GPL(folio_end_dropbehind);
  1461. /**
  1462. * folio_end_writeback_no_dropbehind - End writeback against a folio.
  1463. * @folio: The folio.
  1464. *
  1465. * The folio must actually be under writeback.
  1466. * This call is intended for filesystems that need to defer dropbehind.
  1467. *
  1468. * Context: May be called from process or interrupt context.
  1469. */
  1470. void folio_end_writeback_no_dropbehind(struct folio *folio)
  1471. {
  1472. VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
  1473. /*
  1474. * folio_test_clear_reclaim() could be used here but it is an
  1475. * atomic operation and overkill in this particular case. Failing
  1476. * to shuffle a folio marked for immediate reclaim is too mild
  1477. * a gain to justify taking an atomic operation penalty at the
  1478. * end of every folio writeback.
  1479. */
  1480. if (folio_test_reclaim(folio)) {
  1481. folio_clear_reclaim(folio);
  1482. folio_rotate_reclaimable(folio);
  1483. }
  1484. if (__folio_end_writeback(folio))
  1485. folio_wake_bit(folio, PG_writeback);
  1486. acct_reclaim_writeback(folio);
  1487. }
  1488. EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind);
  1489. /**
  1490. * folio_end_writeback - End writeback against a folio.
  1491. * @folio: The folio.
  1492. *
  1493. * The folio must actually be under writeback.
  1494. *
  1495. * Context: May be called from process or interrupt context.
  1496. */
  1497. void folio_end_writeback(struct folio *folio)
  1498. {
  1499. VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
  1500. /*
  1501. * Writeback does not hold a folio reference of its own, relying
  1502. * on truncation to wait for the clearing of PG_writeback.
  1503. * But here we must make sure that the folio is not freed and
  1504. * reused before the folio_wake_bit().
  1505. */
  1506. folio_get(folio);
  1507. folio_end_writeback_no_dropbehind(folio);
  1508. folio_end_dropbehind(folio);
  1509. folio_put(folio);
  1510. }
  1511. EXPORT_SYMBOL(folio_end_writeback);
  1512. /**
  1513. * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
  1514. * @folio: The folio to lock
  1515. */
  1516. void __folio_lock(struct folio *folio)
  1517. {
  1518. folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
  1519. EXCLUSIVE);
  1520. }
  1521. EXPORT_SYMBOL(__folio_lock);
  1522. int __folio_lock_killable(struct folio *folio)
  1523. {
  1524. return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
  1525. EXCLUSIVE);
  1526. }
  1527. EXPORT_SYMBOL_GPL(__folio_lock_killable);
  1528. static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
  1529. {
  1530. struct wait_queue_head *q = folio_waitqueue(folio);
  1531. int ret;
  1532. wait->folio = folio;
  1533. wait->bit_nr = PG_locked;
  1534. spin_lock_irq(&q->lock);
  1535. __add_wait_queue_entry_tail(q, &wait->wait);
  1536. folio_set_waiters(folio);
  1537. ret = !folio_trylock(folio);
  1538. /*
  1539. * If we were successful now, we know we're still on the
  1540. * waitqueue as we're still under the lock. This means it's
  1541. * safe to remove and return success, we know the callback
  1542. * isn't going to trigger.
  1543. */
  1544. if (!ret)
  1545. __remove_wait_queue(q, &wait->wait);
  1546. else
  1547. ret = -EIOCBQUEUED;
  1548. spin_unlock_irq(&q->lock);
  1549. return ret;
  1550. }
  1551. /*
  1552. * Return values:
  1553. * 0 - folio is locked.
  1554. * non-zero - folio is not locked.
  1555. * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
  1556. * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
  1557. * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
  1558. *
  1559. * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
  1560. * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
  1561. */
  1562. vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
  1563. {
  1564. unsigned int flags = vmf->flags;
  1565. if (fault_flag_allow_retry_first(flags)) {
  1566. /*
  1567. * CAUTION! In this case, mmap_lock/per-VMA lock is not
  1568. * released even though returning VM_FAULT_RETRY.
  1569. */
  1570. if (flags & FAULT_FLAG_RETRY_NOWAIT)
  1571. return VM_FAULT_RETRY;
  1572. release_fault_lock(vmf);
  1573. if (flags & FAULT_FLAG_KILLABLE)
  1574. folio_wait_locked_killable(folio);
  1575. else
  1576. folio_wait_locked(folio);
  1577. return VM_FAULT_RETRY;
  1578. }
  1579. if (flags & FAULT_FLAG_KILLABLE) {
  1580. bool ret;
  1581. ret = __folio_lock_killable(folio);
  1582. if (ret) {
  1583. release_fault_lock(vmf);
  1584. return VM_FAULT_RETRY;
  1585. }
  1586. } else {
  1587. __folio_lock(folio);
  1588. }
  1589. return 0;
  1590. }
  1591. /**
  1592. * page_cache_next_miss() - Find the next gap in the page cache.
  1593. * @mapping: Mapping.
  1594. * @index: Index.
  1595. * @max_scan: Maximum range to search.
  1596. *
  1597. * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
  1598. * gap with the lowest index.
  1599. *
  1600. * This function may be called under the rcu_read_lock. However, this will
  1601. * not atomically search a snapshot of the cache at a single point in time.
  1602. * For example, if a gap is created at index 5, then subsequently a gap is
  1603. * created at index 10, page_cache_next_miss covering both indices may
  1604. * return 10 if called under the rcu_read_lock.
  1605. *
  1606. * Return: The index of the gap if found, otherwise an index outside the
  1607. * range specified (in which case 'return - index >= max_scan' will be true).
  1608. * In the rare case of index wrap-around, 0 will be returned.
  1609. */
  1610. pgoff_t page_cache_next_miss(struct address_space *mapping,
  1611. pgoff_t index, unsigned long max_scan)
  1612. {
  1613. XA_STATE(xas, &mapping->i_pages, index);
  1614. unsigned long nr = max_scan;
  1615. while (nr--) {
  1616. void *entry = xas_next(&xas);
  1617. if (!entry || xa_is_value(entry))
  1618. return xas.xa_index;
  1619. if (xas.xa_index == 0)
  1620. return 0;
  1621. }
  1622. return index + max_scan;
  1623. }
  1624. EXPORT_SYMBOL(page_cache_next_miss);
  1625. /**
  1626. * page_cache_prev_miss() - Find the previous gap in the page cache.
  1627. * @mapping: Mapping.
  1628. * @index: Index.
  1629. * @max_scan: Maximum range to search.
  1630. *
  1631. * Search the range [max(index - max_scan + 1, 0), index] for the
  1632. * gap with the highest index.
  1633. *
  1634. * This function may be called under the rcu_read_lock. However, this will
  1635. * not atomically search a snapshot of the cache at a single point in time.
  1636. * For example, if a gap is created at index 10, then subsequently a gap is
  1637. * created at index 5, page_cache_prev_miss() covering both indices may
  1638. * return 5 if called under the rcu_read_lock.
  1639. *
  1640. * Return: The index of the gap if found, otherwise an index outside the
  1641. * range specified (in which case 'index - return >= max_scan' will be true).
  1642. * In the rare case of wrap-around, ULONG_MAX will be returned.
  1643. */
  1644. pgoff_t page_cache_prev_miss(struct address_space *mapping,
  1645. pgoff_t index, unsigned long max_scan)
  1646. {
  1647. XA_STATE(xas, &mapping->i_pages, index);
  1648. while (max_scan--) {
  1649. void *entry = xas_prev(&xas);
  1650. if (!entry || xa_is_value(entry))
  1651. break;
  1652. if (xas.xa_index == ULONG_MAX)
  1653. break;
  1654. }
  1655. return xas.xa_index;
  1656. }
  1657. EXPORT_SYMBOL(page_cache_prev_miss);
  1658. /*
  1659. * Lockless page cache protocol:
  1660. * On the lookup side:
  1661. * 1. Load the folio from i_pages
  1662. * 2. Increment the refcount if it's not zero
  1663. * 3. If the folio is not found by xas_reload(), put the refcount and retry
  1664. *
  1665. * On the removal side:
  1666. * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
  1667. * B. Remove the page from i_pages
  1668. * C. Return the page to the page allocator
  1669. *
  1670. * This means that any page may have its reference count temporarily
  1671. * increased by a speculative page cache (or GUP-fast) lookup as it can
  1672. * be allocated by another user before the RCU grace period expires.
  1673. * Because the refcount temporarily acquired here may end up being the
  1674. * last refcount on the page, any page allocation must be freeable by
  1675. * folio_put().
  1676. */
  1677. /*
  1678. * filemap_get_entry - Get a page cache entry.
  1679. * @mapping: the address_space to search
  1680. * @index: The page cache index.
  1681. *
  1682. * Looks up the page cache entry at @mapping & @index. If it is a folio,
  1683. * it is returned with an increased refcount. If it is a shadow entry
  1684. * of a previously evicted folio, or a swap entry from shmem/tmpfs,
  1685. * it is returned without further action.
  1686. *
  1687. * Return: The folio, swap or shadow entry, %NULL if nothing is found.
  1688. */
  1689. void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
  1690. {
  1691. XA_STATE(xas, &mapping->i_pages, index);
  1692. struct folio *folio;
  1693. rcu_read_lock();
  1694. repeat:
  1695. xas_reset(&xas);
  1696. folio = xas_load(&xas);
  1697. if (xas_retry(&xas, folio))
  1698. goto repeat;
  1699. /*
  1700. * A shadow entry of a recently evicted page, or a swap entry from
  1701. * shmem/tmpfs. Return it without attempting to raise page count.
  1702. */
  1703. if (!folio || xa_is_value(folio))
  1704. goto out;
  1705. if (!folio_try_get(folio))
  1706. goto repeat;
  1707. if (unlikely(folio != xas_reload(&xas))) {
  1708. folio_put(folio);
  1709. goto repeat;
  1710. }
  1711. out:
  1712. rcu_read_unlock();
  1713. return folio;
  1714. }
/**
 * __filemap_get_folio_mpol - Find and get a reference to a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the folio is returned.
 * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
 * @policy: NUMA memory allocation policy to follow.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If this function returns a folio, it is returned with an increased refcount.
 *
 * Return: The found folio or an ERR_PTR() otherwise.
 */
struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
		pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy)
{
	struct folio *folio;

repeat:
	folio = filemap_get_entry(mapping, index);
	/* Value (shadow/swap) entries are treated the same as "not present". */
	if (xa_is_value(folio))
		folio = NULL;
	if (!folio)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			/* Must not sleep: fail with -EAGAIN if the lock is contended. */
			if (!folio_trylock(folio)) {
				folio_put(folio);
				return ERR_PTR(-EAGAIN);
			}
		} else {
			folio_lock(folio);
		}

		/* Has the page been truncated? */
		if (unlikely(folio->mapping != mapping)) {
			folio_unlock(folio);
			folio_put(folio);
			goto repeat;
		}
		VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
	}

	if (fgp_flags & FGP_ACCESSED)
		folio_mark_accessed(folio);
	else if (fgp_flags & FGP_WRITE) {
		/* Clear idle flag for buffer write */
		if (folio_test_idle(folio))
			folio_clear_idle(folio);
	}

	if (fgp_flags & FGP_STABLE)
		folio_wait_stable(folio);
no_page:
	/* Nothing cached at @index: allocate and insert if asked to. */
	if (!folio && (fgp_flags & FGP_CREAT)) {
		unsigned int min_order = mapping_min_folio_order(mapping);
		unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));
		int err;
		index = mapping_align_index(mapping, index);

		/* Derive the allocation flags from the FGP flags. */
		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
			gfp |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp &= ~__GFP_FS;
		if (fgp_flags & FGP_NOWAIT) {
			gfp &= ~GFP_KERNEL;
			gfp |= GFP_NOWAIT;
		}
		/* Creating without FGP_LOCK or FGP_FOR_MMAP is a caller bug; warn once and lock. */
		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
			fgp_flags |= FGP_LOCK;

		/* Clamp the requested order to what the mapping supports. */
		if (order > mapping_max_folio_order(mapping))
			order = mapping_max_folio_order(mapping);
		/* If we're not aligned, allocate a smaller folio */
		if (index & ((1UL << order) - 1))
			order = __ffs(index);

		/*
		 * Try progressively smaller orders, down to min_order. Note
		 * that 'continue' inside a do-while jumps to the loop
		 * condition, so a failed allocation moves on to the next
		 * lower order with err still set to -ENOMEM.
		 */
		do {
			gfp_t alloc_gfp = gfp;

			err = -ENOMEM;
			/* Above the minimum order, fail fast and quietly. */
			if (order > min_order)
				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
			folio = filemap_alloc_folio(alloc_gfp, order, policy);
			if (!folio)
				continue;

			/* Init accessed so avoid atomic mark_page_accessed later */
			if (fgp_flags & FGP_ACCESSED)
				__folio_set_referenced(folio);
			if (fgp_flags & FGP_DONTCACHE)
				__folio_set_dropbehind(folio);

			/* The insertion is passed gfp, not the relaxed alloc_gfp. */
			err = filemap_add_folio(mapping, folio, index, gfp);
			if (!err)
				break;
			folio_put(folio);
			folio = NULL;
		} while (order-- > min_order);

		/* An entry appeared at @index in the meantime: look it up again. */
		if (err == -EEXIST)
			goto repeat;
		if (err) {
			/*
			 * When NOWAIT I/O fails to allocate folios this could
			 * be due to a nonblocking memory allocation and not
			 * because the system actually is out of memory.
			 * Return -EAGAIN so that the caller retries in a
			 * blocking fashion instead of propagating -ENOMEM
			 * to the application.
			 */
			if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
				err = -EAGAIN;
			return ERR_PTR(err);
		}
		/*
		 * filemap_add_folio locks the page, and for mmap
		 * we expect an unlocked page.
		 */
		if (folio && (fgp_flags & FGP_FOR_MMAP))
			folio_unlock(folio);
	}

	/* Still nothing: either FGP_CREAT was not set or there was no entry. */
	if (!folio)
		return ERR_PTR(-ENOENT);
	/* not an uncached lookup, clear uncached if set */
	if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
		folio_clear_dropbehind(folio);
	return folio;
}
EXPORT_SYMBOL(__filemap_get_folio_mpol);
  1838. static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
  1839. xa_mark_t mark)
  1840. {
  1841. struct folio *folio;
  1842. retry:
  1843. if (mark == XA_PRESENT)
  1844. folio = xas_find(xas, max);
  1845. else
  1846. folio = xas_find_marked(xas, max, mark);
  1847. if (xas_retry(xas, folio))
  1848. goto retry;
  1849. /*
  1850. * A shadow entry of a recently evicted page, a swap
  1851. * entry from shmem/tmpfs or a DAX entry. Return it
  1852. * without attempting to raise page count.
  1853. */
  1854. if (!folio || xa_is_value(folio))
  1855. return folio;
  1856. if (!folio_try_get(folio))
  1857. goto reset;
  1858. if (unlikely(folio != xas_reload(xas))) {
  1859. folio_put(folio);
  1860. goto reset;
  1861. }
  1862. return folio;
  1863. reset:
  1864. xas_reset(xas);
  1865. goto retry;
  1866. }
  1867. /**
  1868. * find_get_entries - gang pagecache lookup
  1869. * @mapping: The address_space to search
  1870. * @start: The starting page cache index
  1871. * @end: The final page index (inclusive).
  1872. * @fbatch: Where the resulting entries are placed.
  1873. * @indices: The cache indices corresponding to the entries in @entries
  1874. *
  1875. * find_get_entries() will search for and return a batch of entries in
  1876. * the mapping. The entries are placed in @fbatch. find_get_entries()
  1877. * takes a reference on any actual folios it returns.
  1878. *
  1879. * The entries have ascending indexes. The indices may not be consecutive
  1880. * due to not-present entries or large folios.
  1881. *
  1882. * Any shadow entries of evicted folios, or swap entries from
  1883. * shmem/tmpfs, are included in the returned array.
  1884. *
  1885. * Return: The number of entries which were found.
  1886. */
  1887. unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
  1888. pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
  1889. {
  1890. XA_STATE(xas, &mapping->i_pages, *start);
  1891. struct folio *folio;
  1892. rcu_read_lock();
  1893. while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
  1894. indices[fbatch->nr] = xas.xa_index;
  1895. if (!folio_batch_add(fbatch, folio))
  1896. break;
  1897. }
  1898. if (folio_batch_count(fbatch)) {
  1899. unsigned long nr;
  1900. int idx = folio_batch_count(fbatch) - 1;
  1901. folio = fbatch->folios[idx];
  1902. if (!xa_is_value(folio))
  1903. nr = folio_nr_pages(folio);
  1904. else
  1905. nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]);
  1906. *start = round_down(indices[idx] + nr, nr);
  1907. }
  1908. rcu_read_unlock();
  1909. return folio_batch_count(fbatch);
  1910. }
/**
 * find_lock_entries - Find a batch of pagecache entries.
 * @mapping: The address_space to search.
 * @start: The starting page cache index.
 * @end: The final page index (inclusive).
 * @fbatch: Where the resulting entries are placed.
 * @indices: The cache indices of the entries in @fbatch.
 *
 * find_lock_entries() will return a batch of entries from @mapping.
 * Swap, shadow and DAX entries are included. Folios are returned
 * locked and with an incremented refcount. Folios which are locked
 * by somebody else or under writeback are skipped. Folios which are
 * partially outside the range are not returned.
 *
 * The entries have ascending indexes. The indices may not be consecutive
 * due to not-present entries, large folios, folios which could not be
 * locked or folios under writeback.
 *
 * @start is updated to the index following the last entry placed in
 * @fbatch.
 *
 * Return: The number of entries which were found.
 */
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	struct folio *folio;

	rcu_read_lock();
	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
		unsigned long base;
		unsigned long nr;

		if (!xa_is_value(folio)) {
			/* Real folio: find_get_entry() took a reference on it. */
			nr = folio_nr_pages(folio);
			base = folio->index;
			/* Omit large folio which begins before the start */
			if (base < *start)
				goto put;
			/* Omit large folio which extends beyond the end */
			if (base + nr - 1 > end)
				goto put;
			/* Never sleep here: skip folios locked by someone else. */
			if (!folio_trylock(folio))
				goto put;
			/* Re-check under the lock: skip if truncated or under writeback. */
			if (folio->mapping != mapping ||
			    folio_test_writeback(folio))
				goto unlock;
			VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
					folio);
		} else {
			/* Value entry: no reference was taken, nothing to drop. */
			nr = 1 << xas_get_order(&xas);
			base = xas.xa_index & ~(nr - 1);
			/* Omit order>0 value which begins before the start */
			if (base < *start)
				continue;
			/* Omit order>0 value which extends beyond the end */
			if (base + nr - 1 > end)
				break;
		}

		/* Update start now so that last update is correct on return */
		*start = base + nr;
		indices[fbatch->nr] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
			break;
		continue;
		/* Skip paths for folios: undo the lock (if taken) and the reference. */
unlock:
		folio_unlock(folio);
put:
		folio_put(folio);
	}
	rcu_read_unlock();

	return folio_batch_count(fbatch);
}
  1980. /**
  1981. * filemap_get_folios - Get a batch of folios
  1982. * @mapping: The address_space to search
  1983. * @start: The starting page index
  1984. * @end: The final page index (inclusive)
  1985. * @fbatch: The batch to fill.
  1986. *
  1987. * Search for and return a batch of folios in the mapping starting at
  1988. * index @start and up to index @end (inclusive). The folios are returned
  1989. * in @fbatch with an elevated reference count.
  1990. *
  1991. * Return: The number of folios which were found.
  1992. * We also update @start to index the next folio for the traversal.
  1993. */
  1994. unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
  1995. pgoff_t end, struct folio_batch *fbatch)
  1996. {
  1997. return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);
  1998. }
  1999. EXPORT_SYMBOL(filemap_get_folios);
/**
 * filemap_get_folios_contig - Get a batch of contiguous folios
 * @mapping: The address_space to search
 * @start: The starting page index
 * @end: The final page index (inclusive)
 * @fbatch: The batch to fill
 *
 * filemap_get_folios_contig() works exactly like filemap_get_folios(),
 * except the returned folios are guaranteed to be contiguous. This may
 * not return all contiguous folios if the batch gets filled up.
 *
 * Return: The number of folios found.
 * Also update @start to be positioned for traversal of the next folio.
 */
unsigned filemap_get_folios_contig(struct address_space *mapping,
		pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, *start);
	unsigned long nr;
	struct folio *folio;

	rcu_read_lock();

	/* Walk forward; the first empty slot ends the contiguous run. */
	for (folio = xas_load(&xas); folio && xas.xa_index <= end;
			folio = xas_next(&xas)) {
		if (xas_retry(&xas, folio))
			continue;
		/*
		 * If the entry has been swapped out, we can stop looking.
		 * No current caller is looking for DAX entries.
		 */
		if (xa_is_value(folio))
			goto update_start;

		/* If we landed in the middle of a THP, continue at its end. */
		if (xa_is_sibling(folio))
			goto update_start;

		/* Take a reference, then confirm the slot still holds this folio. */
		if (!folio_try_get(folio))
			goto retry;

		if (unlikely(folio != xas_reload(&xas)))
			goto put_folio;

		/* Batch reports no space left: set @start past this folio and stop. */
		if (!folio_batch_add(fbatch, folio)) {
			nr = folio_nr_pages(folio);
			*start = folio->index + nr;
			goto out;
		}
		/*
		 * Move to the folio's last index, presumably so that the
		 * loop's xas_next() lands just past it.
		 */
		xas_advance(&xas, folio_next_index(folio) - 1);
		continue;
		/* Lost a race on this slot: drop any reference and reset the walk. */
put_folio:
		folio_put(folio);

retry:
		xas_reset(&xas);
	}

update_start:
	/* Resume after the last folio collected; @start is untouched if none. */
	nr = folio_batch_count(fbatch);

	if (nr) {
		folio = fbatch->folios[nr - 1];
		*start = folio_next_index(folio);
	}
out:
	rcu_read_unlock();
	return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_contig);
  2061. /**
  2062. * filemap_get_folios_tag - Get a batch of folios matching @tag
  2063. * @mapping: The address_space to search
  2064. * @start: The starting page index
  2065. * @end: The final page index (inclusive)
  2066. * @tag: The tag index
  2067. * @fbatch: The batch to fill
  2068. *
  2069. * The first folio may start before @start; if it does, it will contain
  2070. * @start. The final folio may extend beyond @end; if it does, it will
  2071. * contain @end. The folios have ascending indices. There may be gaps
  2072. * between the folios if there are indices which have no folio in the
  2073. * page cache. If folios are added to or removed from the page cache
  2074. * while this is running, they may or may not be found by this call.
  2075. * Only returns folios that are tagged with @tag.
  2076. *
  2077. * Return: The number of folios found.
  2078. * Also update @start to index the next folio for traversal.
  2079. */
  2080. unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
  2081. pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
  2082. {
  2083. XA_STATE(xas, &mapping->i_pages, *start);
  2084. struct folio *folio;
  2085. rcu_read_lock();
  2086. while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
  2087. /*
  2088. * Shadow entries should never be tagged, but this iteration
  2089. * is lockless so there is a window for page reclaim to evict
  2090. * a page we saw tagged. Skip over it.
  2091. */
  2092. if (xa_is_value(folio))
  2093. continue;
  2094. if (!folio_batch_add(fbatch, folio)) {
  2095. unsigned long nr = folio_nr_pages(folio);
  2096. *start = folio->index + nr;
  2097. goto out;
  2098. }
  2099. }
  2100. /*
  2101. * We come here when there is no page beyond @end. We take care to not
  2102. * overflow the index @start as it confuses some of the callers. This
  2103. * breaks the iteration when there is a page at index -1 but that is
  2104. * already broke anyway.
  2105. */
  2106. if (end == (pgoff_t)-1)
  2107. *start = (pgoff_t)-1;
  2108. else
  2109. *start = end + 1;
  2110. out:
  2111. rcu_read_unlock();
  2112. return folio_batch_count(fbatch);
  2113. }
  2114. EXPORT_SYMBOL(filemap_get_folios_tag);
  2115. /**
  2116. * filemap_get_folios_dirty - Get a batch of dirty folios
  2117. * @mapping: The address_space to search
  2118. * @start: The starting folio index
  2119. * @end: The final folio index (inclusive)
  2120. * @fbatch: The batch to fill
  2121. *
  2122. * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except
  2123. * the returned folios are presumed to be dirty or undergoing writeback. Dirty
  2124. * state is presumed because we don't block on folio lock nor want to miss
  2125. * folios. Callers that need to can recheck state upon locking the folio.
  2126. *
  2127. * This may not return all dirty folios if the batch gets filled up.
  2128. *
  2129. * Return: The number of folios found.
  2130. * Also update @start to be positioned for traversal of the next folio.
  2131. */
  2132. unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start,
  2133. pgoff_t end, struct folio_batch *fbatch)
  2134. {
  2135. XA_STATE(xas, &mapping->i_pages, *start);
  2136. struct folio *folio;
  2137. rcu_read_lock();
  2138. while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
  2139. if (xa_is_value(folio))
  2140. continue;
  2141. if (folio_trylock(folio)) {
  2142. bool clean = !folio_test_dirty(folio) &&
  2143. !folio_test_writeback(folio);
  2144. folio_unlock(folio);
  2145. if (clean) {
  2146. folio_put(folio);
  2147. continue;
  2148. }
  2149. }
  2150. if (!folio_batch_add(fbatch, folio)) {
  2151. unsigned long nr = folio_nr_pages(folio);
  2152. *start = folio->index + nr;
  2153. goto out;
  2154. }
  2155. }
  2156. /*
  2157. * We come here when there is no folio beyond @end. We take care to not
  2158. * overflow the index @start as it confuses some of the callers. This
  2159. * breaks the iteration when there is a folio at index -1 but that is
  2160. * already broke anyway.
  2161. */
  2162. if (end == (pgoff_t)-1)
  2163. *start = (pgoff_t)-1;
  2164. else
  2165. *start = end + 1;
  2166. out:
  2167. rcu_read_unlock();
  2168. return folio_batch_count(fbatch);
  2169. }
  2170. /*
  2171. * CD/DVDs are error prone. When a medium error occurs, the driver may fail
  2172. * a _large_ part of the i/o request. Imagine the worst scenario:
  2173. *
  2174. * ---R__________________________________________B__________
  2175. * ^ reading here ^ bad block(assume 4k)
  2176. *
  2177. * read(R) => miss => readahead(R...B) => media error => frustrating retries
  2178. * => failing the whole request => read(R) => read(R+1) =>
  2179. * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
  2180. * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
  2181. * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
  2182. *
  2183. * It is going insane. Fix it by quickly scaling down the readahead size.
  2184. */
  2185. static void shrink_readahead_size_eio(struct file_ra_state *ra)
  2186. {
  2187. ra->ra_pages /= 4;
  2188. }
/*
 * filemap_get_read_batch - Get a batch of folios for read
 * @mapping: The address_space to search.
 * @index: Page index at which the walk starts.
 * @max: Last page index the walk may look at (inclusive).
 * @fbatch: The batch to fill.
 *
 * Get a batch of folios which represent a contiguous range of bytes in
 * the file.  No exceptional entries will be returned.  If @index is in
 * the middle of a folio, the entire folio will be returned.  The last
 * folio in the batch may have the readahead flag set or the uptodate flag
 * clear so that the caller can take the appropriate action.
 */
static void filemap_get_read_batch(struct address_space *mapping,
		pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
		if (xas_retry(&xas, folio))
			continue;
		/* Beyond the requested range, or a value entry: the run ends. */
		if (xas.xa_index > max || xa_is_value(folio))
			break;
		if (xa_is_sibling(folio))
			break;
		/* Failed to take a reference: reset the walk state and retry. */
		if (!folio_try_get(folio))
			goto retry;

		/* Slot no longer holds this folio: drop the reference and retry. */
		if (unlikely(folio != xas_reload(&xas)))
			goto put_folio;

		/* A zero return is treated as "batch is now full". */
		if (!folio_batch_add(fbatch, folio))
			break;
		/*
		 * Stop right after adding a folio that is not uptodate or
		 * has the readahead flag, so it ends up last in the batch.
		 */
		if (!folio_test_uptodate(folio))
			break;
		if (folio_test_readahead(folio))
			break;
		/* Position on the folio's last index so xas_next() leaves it. */
		xas_advance(&xas, folio_next_index(folio) - 1);
		continue;
put_folio:
		folio_put(folio);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
}
  2230. static int filemap_read_folio(struct file *file, filler_t filler,
  2231. struct folio *folio)
  2232. {
  2233. bool workingset = folio_test_workingset(folio);
  2234. unsigned long pflags;
  2235. int error;
  2236. /* Start the actual read. The read will unlock the page. */
  2237. if (unlikely(workingset))
  2238. psi_memstall_enter(&pflags);
  2239. error = filler(file, folio);
  2240. if (unlikely(workingset))
  2241. psi_memstall_leave(&pflags);
  2242. if (error)
  2243. return error;
  2244. error = folio_wait_locked_killable(folio);
  2245. if (error)
  2246. return error;
  2247. if (folio_test_uptodate(folio))
  2248. return 0;
  2249. if (file)
  2250. shrink_readahead_size_eio(&file->f_ra);
  2251. return -EIO;
  2252. }
  2253. static bool filemap_range_uptodate(struct address_space *mapping,
  2254. loff_t pos, size_t count, struct folio *folio,
  2255. bool need_uptodate)
  2256. {
  2257. if (folio_test_uptodate(folio))
  2258. return true;
  2259. /* pipes can't handle partially uptodate pages */
  2260. if (need_uptodate)
  2261. return false;
  2262. if (!mapping->a_ops->is_partially_uptodate)
  2263. return false;
  2264. if (mapping->host->i_blkbits >= folio_shift(folio))
  2265. return false;
  2266. if (folio_pos(folio) > pos) {
  2267. count -= folio_pos(folio) - pos;
  2268. pos = 0;
  2269. } else {
  2270. pos -= folio_pos(folio);
  2271. }
  2272. if (pos == 0 && count >= folio_size(folio))
  2273. return false;
  2274. return mapping->a_ops->is_partially_uptodate(folio, pos, count);
  2275. }
/*
 * filemap_update_page - try to make a cached, not-uptodate folio readable
 * @iocb: The read request; ki_flags, ki_pos, ki_waitq and ki_filp are used.
 * @mapping: The mapping the folio was looked up in.
 * @count: Number of bytes requested, starting at iocb->ki_pos.
 * @folio: The folio to bring uptodate; the caller holds a reference.
 * @need_uptodate: Forwarded to filemap_range_uptodate().
 *
 * Takes the mapping's invalidate lock (shared) and the folio lock, then
 * either finds the wanted range already usable or issues ->read_folio().
 *
 * Return: 0 if the range can be read from the folio; AOP_TRUNCATED_PAGE
 * when the caller should look the folio up again; otherwise a negative
 * errno, with -EAGAIN used where the iocb flags forbid waiting or I/O.
 * On the AOP_TRUNCATED_PAGE return at the end of the function the folio
 * reference has been dropped here; the early return hands that job to
 * folio_put_wait_locked().
 */
static int filemap_update_page(struct kiocb *iocb,
		struct address_space *mapping, size_t count,
		struct folio *folio, bool need_uptodate)
{
	int error;

	/* With IOCB_NOWAIT we may only try the invalidate lock, not sleep. */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!filemap_invalidate_trylock_shared(mapping))
			return -EAGAIN;
	} else {
		filemap_invalidate_lock_shared(mapping);
	}

	if (!folio_trylock(folio)) {
		error = -EAGAIN;
		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
			goto unlock_mapping;
		if (!(iocb->ki_flags & IOCB_WAITQ)) {
			/* Release the invalidate lock before waiting. */
			filemap_invalidate_unlock_shared(mapping);
			/*
			 * This is where we usually end up waiting for a
			 * previously submitted readahead to finish.
			 */
			folio_put_wait_locked(folio, TASK_KILLABLE);
			return AOP_TRUNCATED_PAGE;
		}
		/* IOCB_WAITQ: take the folio lock via the iocb's wait queue. */
		error = __folio_lock_async(folio, iocb->ki_waitq);
		if (error)
			goto unlock_mapping;
	}

	/* A folio with no mapping is reported back as AOP_TRUNCATED_PAGE. */
	error = AOP_TRUNCATED_PAGE;
	if (!folio->mapping)
		goto unlock;

	error = 0;
	if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
				   need_uptodate))
		goto unlock;

	/* From here on a read is needed; these flags rule that out. */
	error = -EAGAIN;
	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
		goto unlock;

	/*
	 * The read is expected to unlock the folio (see filemap_read_folio()),
	 * so skip the folio_unlock() below.
	 */
	error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
			folio);
	goto unlock_mapping;
unlock:
	folio_unlock(folio);
unlock_mapping:
	filemap_invalidate_unlock_shared(mapping);
	if (error == AOP_TRUNCATED_PAGE)
		folio_put(folio);
	return error;
}
/*
 * filemap_create_folio - allocate, insert and read one folio for a read
 * @iocb: The read request; ki_pos selects the index, ki_flags are checked.
 * @fbatch: Batch which receives the new folio on success.
 *
 * Return: 0 with the folio added to @fbatch; -EAGAIN if the iocb has
 * IOCB_NOWAIT or IOCB_WAITQ set; -ENOMEM if no folio could be allocated;
 * AOP_TRUNCATED_PAGE if the insertion reported -EEXIST; otherwise the
 * error from filemap_add_folio() or filemap_read_folio().  On every
 * failure after allocation the folio reference is dropped.
 */
static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct folio *folio;
	int error;
	unsigned int min_order = mapping_min_folio_order(mapping);
	pgoff_t index;

	if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
		return -EAGAIN;

	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL);
	if (!folio)
		return -ENOMEM;
	if (iocb->ki_flags & IOCB_DONTCACHE)
		__folio_set_dropbehind(folio);

	/*
	 * Protect against truncate / hole punch. Grabbing invalidate_lock
	 * here assures we cannot instantiate and bring uptodate new
	 * pagecache folios after evicting page cache during truncate
	 * and before actually freeing blocks.	Note that we could
	 * release invalidate_lock after inserting the folio into
	 * the page cache as the locked folio would then be enough to
	 * synchronize with hole punching. But there are code paths
	 * such as filemap_update_page() filling in partially uptodate
	 * pages or ->readahead() that need to hold invalidate_lock
	 * while mapping blocks for IO so let's hold the lock here as
	 * well to keep locking rules simple.
	 */
	filemap_invalidate_lock_shared(mapping);
	/* Page index of ki_pos, rounded down to a multiple of 1 << min_order. */
	index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order;
	error = filemap_add_folio(mapping, folio, index,
			mapping_gfp_constraint(mapping, GFP_KERNEL));
	/* Something is already cached at this index: have the caller retry. */
	if (error == -EEXIST)
		error = AOP_TRUNCATED_PAGE;
	if (error)
		goto error;

	error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
					folio);
	if (error)
		goto error;

	filemap_invalidate_unlock_shared(mapping);
	folio_batch_add(fbatch, folio);
	return 0;
error:
	filemap_invalidate_unlock_shared(mapping);
	folio_put(folio);
	return error;
}
  2372. static int filemap_readahead(struct kiocb *iocb, struct file *file,
  2373. struct address_space *mapping, struct folio *folio,
  2374. pgoff_t last_index)
  2375. {
  2376. DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
  2377. if (iocb->ki_flags & IOCB_NOIO)
  2378. return -EAGAIN;
  2379. if (iocb->ki_flags & IOCB_DONTCACHE)
  2380. ractl.dropbehind = 1;
  2381. page_cache_async_ra(&ractl, folio, last_index - folio->index);
  2382. return 0;
  2383. }
/*
 * filemap_get_pages - gather a batch of folios to satisfy a read
 * @iocb: The read request; ki_pos, ki_flags and ki_filp are used.
 * @count: Number of bytes requested, starting at iocb->ki_pos.
 * @fbatch: The batch to fill.
 * @need_uptodate: Forwarded to filemap_update_page().
 *
 * Looks the range up in the page cache; if nothing is there, runs
 * synchronous readahead and looks again; if there is still nothing,
 * creates and reads a single folio.  The last folio of the batch then
 * gets async readahead started and/or is brought uptodate as needed.
 *
 * Return: 0 on success, in which case @fbatch holds at least one folio;
 * -EINTR if a fatal signal is pending; otherwise the error from the step
 * that failed.  If the last folio fails but earlier folios remain in the
 * batch, that folio is removed and 0 is returned.
 */
static int filemap_get_pages(struct kiocb *iocb, size_t count,
		struct folio_batch *fbatch, bool need_uptodate)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
	pgoff_t last_index;
	struct folio *folio;
	unsigned int flags;
	int err = 0;

	/* "last_index" is the index of the folio beyond the end of the read */
	last_index = round_up(iocb->ki_pos + count,
			mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT;
retry:
	if (fatal_signal_pending(current))
		return -EINTR;

	filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
	if (!folio_batch_count(fbatch)) {
		DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);

		if (iocb->ki_flags & IOCB_NOIO)
			return -EAGAIN;
		/* For IOCB_NOWAIT, run the readahead under memalloc_noio. */
		if (iocb->ki_flags & IOCB_NOWAIT)
			flags = memalloc_noio_save();
		if (iocb->ki_flags & IOCB_DONTCACHE)
			ractl.dropbehind = 1;
		page_cache_sync_ra(&ractl, last_index - index);
		if (iocb->ki_flags & IOCB_NOWAIT)
			memalloc_noio_restore(flags);
		filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
	}
	/* Readahead left nothing in the cache: create one folio by hand. */
	if (!folio_batch_count(fbatch)) {
		err = filemap_create_folio(iocb, fbatch);
		if (err == AOP_TRUNCATED_PAGE)
			goto retry;
		return err;
	}

	/* Only the last folio can need readahead or an uptodate fix-up. */
	folio = fbatch->folios[folio_batch_count(fbatch) - 1];
	if (folio_test_readahead(folio)) {
		err = filemap_readahead(iocb, filp, mapping, folio, last_index);
		if (err)
			goto err;
	}
	if (!folio_test_uptodate(folio)) {
		/*
		 * Earlier folios in the batch are usable: give up on this
		 * one (the err path returns 0) rather than update it now.
		 */
		if (folio_batch_count(fbatch) > 1) {
			err = -EAGAIN;
			goto err;
		}
		err = filemap_update_page(iocb, mapping, count, folio,
					  need_uptodate);
		if (err)
			goto err;
	}

	trace_mm_filemap_get_pages(mapping, index, last_index - 1);
	return 0;
err:
	/*
	 * For AOP_TRUNCATED_PAGE, filemap_update_page() has already disposed
	 * of the folio reference; only negative errors need the put here.
	 */
	if (err < 0)
		folio_put(folio);
	/* Remove the last folio; succeed if any others are left. */
	if (likely(--fbatch->nr))
		return 0;
	if (err == AOP_TRUNCATED_PAGE)
		goto retry;
	return err;
}
  2447. static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
  2448. {
  2449. unsigned int shift = folio_shift(folio);
  2450. return (pos1 >> shift == pos2 >> shift);
  2451. }
  2452. static void filemap_end_dropbehind_read(struct folio *folio)
  2453. {
  2454. if (!folio_test_dropbehind(folio))
  2455. return;
  2456. if (folio_test_writeback(folio) || folio_test_dirty(folio))
  2457. return;
  2458. if (folio_trylock(folio)) {
  2459. filemap_end_dropbehind(folio);
  2460. folio_unlock(folio);
  2461. }
  2462. }
/**
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
 *
 * Copies data from the page cache.  If the data is not currently present,
 * uses the readahead and read_folio address_space operations to fetch it.
 *
 * iocb->ki_pos is advanced by the number of bytes copied here, and the
 * file's readahead state (prev_pos) records where the read stopped.
 *
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
		ssize_t already_read)
{
	struct file *filp = iocb->ki_filp;
	struct file_ra_state *ra = &filp->f_ra;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct folio_batch fbatch;
	int i, error = 0;
	bool writably_mapped;
	loff_t isize, end_offset;
	loff_t last_pos = ra->prev_pos;

	if (unlikely(iocb->ki_pos < 0))
		return -EINVAL;
	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
		return 0;
	if (unlikely(!iov_iter_count(iter)))
		return 0;

	/* Never read past the filesystem's maximum file size. */
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
	folio_batch_init(&fbatch);

	do {
		cond_resched();

		/*
		 * If we've already successfully copied some data, then we
		 * can no longer safely return -EIOCBQUEUED. Hence mark
		 * an async read NOWAIT at that point.
		 */
		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
			iocb->ki_flags |= IOCB_NOWAIT;

		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
			break;

		error = filemap_get_pages(iocb, iter->count, &fbatch, false);
		if (error < 0)
			break;

		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the uptodate check allows us to
		 * calculate the correct value for "nr", which means the
		 * zero-filled part of the page is not copied back to
		 * userspace (unless another truncate extends the file -
		 * this is desired though).
		 */
		isize = i_size_read(inode);
		if (unlikely(iocb->ki_pos >= isize))
			goto put_folios;
		/* Copy up to EOF or the end of the request, whichever is first. */
		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

		/*
		 * Once we start copying data, we don't want to be touching any
		 * cachelines that might be contended:
		 */
		writably_mapped = mapping_writably_mapped(mapping);

		/*
		 * When a read accesses the same folio several times, only
		 * mark it as accessed the first time.
		 */
		if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
				    fbatch.folios[0]))
			folio_mark_accessed(fbatch.folios[0]);

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
			size_t fsize = folio_size(folio);
			/* Byte offset of ki_pos within this folio. */
			size_t offset = iocb->ki_pos & (fsize - 1);
			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
					     fsize - offset);
			size_t copied;

			/* This folio starts beyond the range we may copy. */
			if (end_offset < folio_pos(folio))
				break;
			/* The first folio was handled before the loop. */
			if (i > 0)
				folio_mark_accessed(folio);
			/*
			 * If users can be writing to this folio using arbitrary
			 * virtual addresses, take care of potential aliasing
			 * before reading the folio on the kernel side.
			 */
			if (writably_mapped)
				flush_dcache_folio(folio);

			copied = copy_folio_to_iter(folio, offset, bytes, iter);

			already_read += copied;
			iocb->ki_pos += copied;
			last_pos = iocb->ki_pos;

			/* A short copy is reported as -EFAULT and ends the read. */
			if (copied < bytes) {
				error = -EFAULT;
				break;
			}
		}
put_folios:
		/* Drop the batch's references before the next round. */
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			filemap_end_dropbehind_read(folio);
			folio_put(folio);
		}
		folio_batch_init(&fbatch);
	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

	file_accessed(filp);
	ra->prev_pos = last_pos;
	/* Bytes already delivered take precedence over a later error. */
	return already_read ? already_read : error;
}
EXPORT_SYMBOL_GPL(filemap_read);
  2574. int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
  2575. {
  2576. struct address_space *mapping = iocb->ki_filp->f_mapping;
  2577. loff_t pos = iocb->ki_pos;
  2578. loff_t end = pos + count - 1;
  2579. if (iocb->ki_flags & IOCB_NOWAIT) {
  2580. if (filemap_range_needs_writeback(mapping, pos, end))
  2581. return -EAGAIN;
  2582. return 0;
  2583. }
  2584. return filemap_write_and_wait_range(mapping, pos, end);
  2585. }
  2586. EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
  2587. int filemap_invalidate_pages(struct address_space *mapping,
  2588. loff_t pos, loff_t end, bool nowait)
  2589. {
  2590. int ret;
  2591. if (nowait) {
  2592. /* we could block if there are any pages in the range */
  2593. if (filemap_range_has_page(mapping, pos, end))
  2594. return -EAGAIN;
  2595. } else {
  2596. ret = filemap_write_and_wait_range(mapping, pos, end);
  2597. if (ret)
  2598. return ret;
  2599. }
  2600. /*
  2601. * After a write we want buffered reads to be sure to go to disk to get
  2602. * the new data. We invalidate clean cached page from the region we're
  2603. * about to write. We do this *before* the write so that we can return
  2604. * without clobbering -EIOCBQUEUED from ->direct_IO().
  2605. */
  2606. return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
  2607. end >> PAGE_SHIFT);
  2608. }
  2609. int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
  2610. {
  2611. struct address_space *mapping = iocb->ki_filp->f_mapping;
  2612. return filemap_invalidate_pages(mapping, iocb->ki_pos,
  2613. iocb->ki_pos + count - 1,
  2614. iocb->ki_flags & IOCB_NOWAIT);
  2615. }
  2616. EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:	kernel I/O control block
 * @iter:	destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * With IOCB_DIRECT set, the range is first written back, then read via
 * the mapping's ->direct_IO() operation; a short direct read that did
 * not hit EOF is completed through the page cache by filemap_read().
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t count = iov_iter_count(iter);
	ssize_t retval = 0;

	if (!count)
		return 0; /* skip atime */

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct file *file = iocb->ki_filp;
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;

		/* Write back (or, for NOWAIT, check) the range first. */
		retval = kiocb_write_and_wait(iocb, count);
		if (retval < 0)
			return retval;
		file_accessed(file);

		retval = mapping->a_ops->direct_IO(iocb, iter);
		if (retval >= 0) {
			/* "count" now holds what is still left to read. */
			iocb->ki_pos += retval;
			count -= retval;
		}
		/* Make the iterator agree with the bytes actually read. */
		if (retval != -EIOCBQUEUED)
			iov_iter_revert(iter, count - iov_iter_count(iter));

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !count || IS_DAX(inode))
			return retval;
		if (iocb->ki_pos >= i_size_read(inode))
			return retval;
	}

	/* "retval" is handed over as the bytes already read. */
	return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);
/*
 * Splice subpages from a folio into a pipe.
 *
 * @pipe: The pipe to add buffers to.
 * @folio: The folio supplying the pages.
 * @fpos: File position to start from; it is reduced to an offset within
 *        @folio.
 * @size: Maximum number of bytes to splice; clipped to the end of @folio.
 *
 * One pipe buffer is filled per page touched, and a folio reference is
 * taken for each buffer.  The loop stops early when the pipe is full.
 *
 * Returns the number of bytes placed into the pipe.
 */
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
			      struct folio *folio, loff_t fpos, size_t size)
{
	struct page *page;
	size_t spliced = 0, offset = offset_in_folio(folio, fpos);

	/* Locate the first page and the offset within that page. */
	page = folio_page(folio, offset / PAGE_SIZE);
	size = min(size, folio_size(folio) - offset);
	offset %= PAGE_SIZE;

	while (spliced < size && !pipe_is_full(pipe)) {
		struct pipe_buffer *buf = pipe_head_buf(pipe);
		/* Up to the end of this page, or whatever is left. */
		size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);

		*buf = (struct pipe_buffer) {
			.ops	= &page_cache_pipe_buf_ops,
			.page	= page,
			.offset	= offset,
			.len	= part,
		};
		folio_get(folio);
		pipe->head++;
		page++;
		spliced += part;
		/* Every page after the first is used from its start. */
		offset = 0;
	}

	return spliced;
}
/**
 * filemap_splice_read -  Splice data from a file's pagecache into a pipe
 * @in: The file to read from
 * @ppos: Pointer to the file position to read from
 * @pipe: The pipe to splice into
 * @len: The amount to splice
 * @flags: The SPLICE_F_* flags
 *
 * This function gets folios from a file's pagecache and splices them into the
 * pipe.  Readahead will be called as necessary to fill more folios.  This may
 * be used for blockdevs also.
 *
 * Return: On success, the number of bytes read will be returned and *@ppos
 * will be updated if appropriate; 0 will be returned if there is no more data
 * to be read; -EAGAIN will be returned if the pipe had no space, and some
 * other negative error code will be returned on error.  A short read may occur
 * if the pipe has insufficient space, we reach the end of the data or we hit a
 * hole.
 */
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
			    struct pipe_inode_info *pipe,
			    size_t len, unsigned int flags)
{
	struct folio_batch fbatch;
	struct kiocb iocb;
	size_t total_spliced = 0, used, npages;
	loff_t isize, end_offset;
	bool writably_mapped;
	int i, error = 0;

	if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
		return 0;

	/* A synchronous iocb on the stack, used to drive filemap_get_pages(). */
	init_sync_kiocb(&iocb, in);
	iocb.ki_pos = *ppos;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_buf_usage(pipe);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);

	folio_batch_init(&fbatch);

	do {
		cond_resched();

		if (*ppos >= i_size_read(in->f_mapping->host))
			break;

		iocb.ki_pos = *ppos;
		/* need_uptodate == true: only fully uptodate folios are usable. */
		error = filemap_get_pages(&iocb, len, &fbatch, true);
		if (error < 0)
			break;

		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the uptodate check allows us to
		 * calculate the correct value for "nr", which means the
		 * zero-filled part of the page is not copied back to
		 * userspace (unless another truncate extends the file -
		 * this is desired though).
		 */
		isize = i_size_read(in->f_mapping->host);
		if (unlikely(*ppos >= isize))
			break;
		end_offset = min_t(loff_t, isize, *ppos + len);

		/*
		 * Once we start copying data, we don't want to be touching any
		 * cachelines that might be contended:
		 */
		writably_mapped = mapping_writably_mapped(in->f_mapping);

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
			size_t n;

			/* This folio starts at or beyond the range we may splice. */
			if (folio_pos(folio) >= end_offset)
				goto out;
			folio_mark_accessed(folio);

			/*
			 * If users can be writing to this folio using arbitrary
			 * virtual addresses, take care of potential aliasing
			 * before reading the folio on the kernel side.
			 */
			if (writably_mapped)
				flush_dcache_folio(folio);

			n = min_t(loff_t, len, isize - *ppos);
			n = splice_folio_into_pipe(pipe, folio, *ppos, n);
			/* Nothing went in: stop and report what we have. */
			if (!n)
				goto out;
			len -= n;
			total_spliced += n;
			*ppos += n;
			in->f_ra.prev_pos = *ppos;
			if (pipe_is_full(pipe))
				goto out;
		}

		folio_batch_release(&fbatch);
	} while (len);

out:
	/* Also reached by "break"/"goto out" with folios still in the batch. */
	folio_batch_release(&fbatch);
	file_accessed(in);

	/* Bytes already spliced take precedence over a later error. */
	return total_spliced ? total_spliced : error;
}
EXPORT_SYMBOL(filemap_splice_read);
/*
 * folio_seek_hole_data - search one page cache entry for data or a hole
 * @xas: Walk state of the caller; paused here before RCU is dropped.
 * @mapping: The mapping being searched.
 * @folio: The entry found at the current position (may be a value entry).
 * @start: File position to start searching from.
 * @end: File position just past this entry.
 * @seek_data: True to look for data, false to look for a hole.
 *
 * Called with the RCU read lock held; the lock is dropped and retaken
 * around locking the folio.
 *
 * Returns the position of the first match within the entry.  A returned
 * position at or past @end is how the caller (mapping_seek_hole_data())
 * recognises that nothing matched in this entry.
 */
static inline loff_t folio_seek_hole_data(struct xa_state *xas,
		struct address_space *mapping, struct folio *folio,
		loff_t start, loff_t end, bool seek_data)
{
	const struct address_space_operations *ops = mapping->a_ops;
	size_t offset, bsz = i_blocksize(mapping->host);

	/* Value entries and uptodate folios count as data throughout. */
	if (xa_is_value(folio) || folio_test_uptodate(folio))
		return seek_data ? start : end;
	/* No per-block information available: count it as a hole throughout. */
	if (!ops->is_partially_uptodate)
		return seek_data ? end : start;

	/* Leave the RCU section before taking the folio lock. */
	xas_pause(xas);
	rcu_read_unlock();
	folio_lock(folio);
	/* The folio no longer belongs to this mapping: return @start as is. */
	if (unlikely(folio->mapping != mapping))
		goto unlock;

	/* Block-aligned offset of @start within the folio. */
	offset = offset_in_folio(folio, start) & ~(bsz - 1);

	/* Step block by block until one matches what we are looking for. */
	do {
		if (ops->is_partially_uptodate(folio, offset, bsz) ==
							seek_data)
			break;
		/* Advance @start to the next block boundary. */
		start = (start + bsz) & ~((u64)bsz - 1);
		offset += bsz;
	} while (offset < folio_size(folio));
unlock:
	folio_unlock(folio);
	rcu_read_lock();
	return start;
}
  2828. static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
  2829. {
  2830. if (xa_is_value(folio))
  2831. return PAGE_SIZE << xas_get_order(xas);
  2832. return folio_size(folio);
  2833. }
/**
 * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
 * @mapping: Address space to search.
 * @start: First byte to consider.
 * @end: Limit of search (exclusive).
 * @whence: Either SEEK_HOLE or SEEK_DATA.
 *
 * If the page cache knows which blocks contain holes and which blocks
 * contain data, your filesystem can use this function to implement
 * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
 * entirely memory-based such as tmpfs, and filesystems which support
 * unwritten extents.
 *
 * Return: The requested offset on success, or -ENXIO if @whence specifies
 * SEEK_DATA and there is no data after @start.  There is an implicit hole
 * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
 * and @end contain data.  -ENXIO is also returned when @end <= @start.
 */
loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
		loff_t end, int whence)
{
	XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
	pgoff_t max = (end - 1) >> PAGE_SHIFT;
	bool seek_data = (whence == SEEK_DATA);
	struct folio *folio;

	if (end <= start)
		return -ENXIO;

	rcu_read_lock();
	while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
		/* Byte position of the index the walk is currently at. */
		loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
		size_t seek_size;

		/*
		 * Nothing is cached between @start and this entry.  For
		 * SEEK_HOLE that gap is the answer; for SEEK_DATA continue
		 * the search from the entry.
		 */
		if (start < pos) {
			if (!seek_data)
				goto unlock;
			start = pos;
		}

		/* "pos" becomes the position just past this entry. */
		seek_size = seek_folio_size(&xas, folio);
		pos = round_up((u64)pos + 1, seek_size);
		start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
				seek_data);
		/* A match was found inside this entry. */
		if (start < pos)
			goto unlock;
		if (start >= end)
			break;
		/* Multi-page entry: jump the walk past all of it. */
		if (seek_size > PAGE_SIZE)
			xas_set(&xas, pos >> PAGE_SHIFT);
		if (!xa_is_value(folio))
			folio_put(folio);
	}
	/* Ran out of entries (or reached @end) without finding data. */
	if (seek_data)
		start = -ENXIO;
unlock:
	rcu_read_unlock();
	/* On "goto unlock" or "break" the current folio is still referenced. */
	if (folio && !xa_is_value(folio))
		folio_put(folio);
	/* Never report a position beyond the search limit. */
	if (start > end)
		return end;
	return start;
}
  2893. #ifdef CONFIG_MMU
  2894. #define MMAP_LOTSAMISS (100)
  2895. /*
  2896. * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
  2897. * @vmf - the vm_fault for this fault.
  2898. * @folio - the folio to lock.
  2899. * @fpin - the pointer to the file we may pin (or is already pinned).
  2900. *
  2901. * This works similar to lock_folio_or_retry in that it can drop the
  2902. * mmap_lock. It differs in that it actually returns the folio locked
  2903. * if it returns 1 and 0 if it couldn't lock the folio. If we did have
  2904. * to drop the mmap_lock then fpin will point to the pinned file and
  2905. * needs to be fput()'ed at a later point.
  2906. */
  2907. static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
  2908. struct file **fpin)
  2909. {
  2910. if (folio_trylock(folio))
  2911. return 1;
  2912. /*
  2913. * NOTE! This will make us return with VM_FAULT_RETRY, but with
  2914. * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
  2915. * is supposed to work. We have way too many special cases..
  2916. */
  2917. if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
  2918. return 0;
  2919. *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
  2920. if (vmf->flags & FAULT_FLAG_KILLABLE) {
  2921. if (__folio_lock_killable(folio)) {
  2922. /*
  2923. * We didn't have the right flags to drop the
  2924. * fault lock, but all fault_handlers only check
  2925. * for fatal signals if we return VM_FAULT_RETRY,
  2926. * so we need to drop the fault lock here and
  2927. * return 0 if we don't have a fpin.
  2928. */
  2929. if (*fpin == NULL)
  2930. release_fault_lock(vmf);
  2931. return 0;
  2932. }
  2933. } else
  2934. __folio_lock(folio);
  2935. return 1;
  2936. }
  2937. /*
  2938. * Synchronous readahead happens when we don't even find a page in the page
  2939. * cache at all. We don't want to perform IO under the mmap sem, so if we have
  2940. * to drop the mmap sem we return the file that was pinned in order for us to do
  2941. * that. If we didn't pin a file then we return NULL. The file that is
  2942. * returned needs to be fput()'ed when we're done with it.
  2943. */
  2944. static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
  2945. {
  2946. struct file *file = vmf->vma->vm_file;
  2947. struct file_ra_state *ra = &file->f_ra;
  2948. struct address_space *mapping = file->f_mapping;
  2949. DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
  2950. struct file *fpin = NULL;
  2951. vm_flags_t vm_flags = vmf->vma->vm_flags;
  2952. bool force_thp_readahead = false;
  2953. unsigned short mmap_miss;
  2954. /* Use the readahead code, even if readahead is disabled */
  2955. if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
  2956. (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER)
  2957. force_thp_readahead = true;
  2958. if (!force_thp_readahead) {
  2959. /*
  2960. * If we don't want any read-ahead, don't bother.
  2961. * VM_EXEC case below is already intended for random access.
  2962. */
  2963. if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)
  2964. return fpin;
  2965. if (!ra->ra_pages)
  2966. return fpin;
  2967. if (vm_flags & VM_SEQ_READ) {
  2968. fpin = maybe_unlock_mmap_for_io(vmf, fpin);
  2969. page_cache_sync_ra(&ractl, ra->ra_pages);
  2970. return fpin;
  2971. }
  2972. }
  2973. if (!(vm_flags & VM_SEQ_READ)) {
  2974. /* Avoid banging the cache line if not needed */
  2975. mmap_miss = READ_ONCE(ra->mmap_miss);
  2976. if (mmap_miss < MMAP_LOTSAMISS * 10)
  2977. WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
  2978. /*
  2979. * Do we miss much more than hit in this file? If so,
  2980. * stop bothering with read-ahead. It will only hurt.
  2981. */
  2982. if (mmap_miss > MMAP_LOTSAMISS)
  2983. return fpin;
  2984. }
  2985. if (force_thp_readahead) {
  2986. fpin = maybe_unlock_mmap_for_io(vmf, fpin);
  2987. ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
  2988. ra->size = HPAGE_PMD_NR;
  2989. /*
  2990. * Fetch two PMD folios, so we get the chance to actually
  2991. * readahead, unless we've been told not to.
  2992. */
  2993. if (!(vm_flags & VM_RAND_READ))
  2994. ra->size *= 2;
  2995. ra->async_size = HPAGE_PMD_NR;
  2996. ra->order = HPAGE_PMD_ORDER;
  2997. page_cache_ra_order(&ractl, ra);
  2998. return fpin;
  2999. }
  3000. if (vm_flags & VM_EXEC) {
  3001. /*
  3002. * Allow arch to request a preferred minimum folio order for
  3003. * executable memory. This can often be beneficial to
  3004. * performance if (e.g.) arm64 can contpte-map the folio.
  3005. * Executable memory rarely benefits from readahead, due to its
  3006. * random access nature, so set async_size to 0.
  3007. *
  3008. * Limit to the boundaries of the VMA to avoid reading in any
  3009. * pad that might exist between sections, which would be a waste
  3010. * of memory.
  3011. */
  3012. struct vm_area_struct *vma = vmf->vma;
  3013. unsigned long start = vma->vm_pgoff;
  3014. unsigned long end = start + vma_pages(vma);
  3015. unsigned long ra_end;
  3016. ra->order = exec_folio_order();
  3017. ra->start = round_down(vmf->pgoff, 1UL << ra->order);
  3018. ra->start = max(ra->start, start);
  3019. ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);
  3020. ra_end = min(ra_end, end);
  3021. ra->size = ra_end - ra->start;
  3022. ra->async_size = 0;
  3023. } else {
  3024. /*
  3025. * mmap read-around
  3026. */
  3027. ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
  3028. ra->size = ra->ra_pages;
  3029. ra->async_size = ra->ra_pages / 4;
  3030. ra->order = 0;
  3031. }
  3032. fpin = maybe_unlock_mmap_for_io(vmf, fpin);
  3033. ractl._index = ra->start;
  3034. page_cache_ra_order(&ractl, ra);
  3035. return fpin;
  3036. }
  3037. /*
  3038. * Asynchronous readahead happens when we find the page and PG_readahead,
  3039. * so we want to possibly extend the readahead further. We return the file that
  3040. * was pinned if we have to drop the mmap_lock in order to do IO.
  3041. */
  3042. static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
  3043. struct folio *folio)
  3044. {
  3045. struct file *file = vmf->vma->vm_file;
  3046. struct file_ra_state *ra = &file->f_ra;
  3047. DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
  3048. struct file *fpin = NULL;
  3049. unsigned short mmap_miss;
  3050. /* If we don't want any read-ahead, don't bother */
  3051. if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
  3052. return fpin;
  3053. /*
  3054. * If the folio is locked, we're likely racing against another fault.
  3055. * Don't touch the mmap_miss counter to avoid decreasing it multiple
  3056. * times for a single folio and break the balance with mmap_miss
  3057. * increase in do_sync_mmap_readahead().
  3058. */
  3059. if (likely(!folio_test_locked(folio))) {
  3060. mmap_miss = READ_ONCE(ra->mmap_miss);
  3061. if (mmap_miss)
  3062. WRITE_ONCE(ra->mmap_miss, --mmap_miss);
  3063. }
  3064. if (folio_test_readahead(folio)) {
  3065. fpin = maybe_unlock_mmap_for_io(vmf, fpin);
  3066. page_cache_async_ra(&ractl, folio, ra->ra_pages);
  3067. }
  3068. return fpin;
  3069. }
  3070. static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
  3071. {
  3072. struct vm_area_struct *vma = vmf->vma;
  3073. vm_fault_t ret = 0;
  3074. pte_t *ptep;
  3075. /*
  3076. * We might have COW'ed a pagecache folio and might now have an mlocked
  3077. * anon folio mapped. The original pagecache folio is not mlocked and
  3078. * might have been evicted. During a read+clear/modify/write update of
  3079. * the PTE, such as done in do_numa_page()/change_pte_range(), we
  3080. * temporarily clear the PTE under PT lock and might detect it here as
  3081. * "none" when not holding the PT lock.
  3082. *
  3083. * Not rechecking the PTE under PT lock could result in an unexpected
  3084. * major fault in an mlock'ed region. Recheck only for this special
  3085. * scenario while holding the PT lock, to not degrade non-mlocked
  3086. * scenarios. Recheck the PTE without PT lock firstly, thereby reducing
  3087. * the number of times we hold PT lock.
  3088. */
  3089. if (!(vma->vm_flags & VM_LOCKED))
  3090. return 0;
  3091. if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
  3092. return 0;
  3093. ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address,
  3094. &vmf->ptl);
  3095. if (unlikely(!ptep))
  3096. return VM_FAULT_NOPAGE;
  3097. if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
  3098. ret = VM_FAULT_NOPAGE;
  3099. } else {
  3100. spin_lock(vmf->ptl);
  3101. if (unlikely(!pte_none(ptep_get(ptep))))
  3102. ret = VM_FAULT_NOPAGE;
  3103. spin_unlock(vmf->ptl);
  3104. }
  3105. pte_unmap(ptep);
  3106. return ret;
  3107. }
  3108. /**
  3109. * filemap_fault - read in file data for page fault handling
  3110. * @vmf: struct vm_fault containing details of the fault
  3111. *
  3112. * filemap_fault() is invoked via the vma operations vector for a
  3113. * mapped memory region to read in file data during a page fault.
  3114. *
  3115. * The goto's are kind of ugly, but this streamlines the normal case of having
  3116. * it in the page cache, and handles the special cases reasonably without
  3117. * having a lot of duplicated code.
  3118. *
  3119. * vma->vm_mm->mmap_lock must be held on entry.
  3120. *
  3121. * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
  3122. * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
  3123. *
  3124. * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
  3125. * has not been released.
  3126. *
  3127. * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
  3128. *
  3129. * Return: bitwise-OR of %VM_FAULT_ codes.
  3130. */
  3131. vm_fault_t filemap_fault(struct vm_fault *vmf)
  3132. {
  3133. int error;
  3134. struct file *file = vmf->vma->vm_file;
  3135. struct file *fpin = NULL;
  3136. struct address_space *mapping = file->f_mapping;
  3137. struct inode *inode = mapping->host;
  3138. pgoff_t max_idx, index = vmf->pgoff;
  3139. struct folio *folio;
  3140. vm_fault_t ret = 0;
  3141. bool mapping_locked = false;
  3142. max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
  3143. if (unlikely(index >= max_idx))
  3144. return VM_FAULT_SIGBUS;
  3145. trace_mm_filemap_fault(mapping, index);
  3146. /*
  3147. * Do we have something in the page cache already?
  3148. */
  3149. folio = filemap_get_folio(mapping, index);
  3150. if (likely(!IS_ERR(folio))) {
  3151. /*
  3152. * We found the page, so try async readahead before waiting for
  3153. * the lock.
  3154. */
  3155. if (!(vmf->flags & FAULT_FLAG_TRIED))
  3156. fpin = do_async_mmap_readahead(vmf, folio);
  3157. if (unlikely(!folio_test_uptodate(folio))) {
  3158. filemap_invalidate_lock_shared(mapping);
  3159. mapping_locked = true;
  3160. }
  3161. } else {
  3162. ret = filemap_fault_recheck_pte_none(vmf);
  3163. if (unlikely(ret))
  3164. return ret;
  3165. /* No page in the page cache at all */
  3166. count_vm_event(PGMAJFAULT);
  3167. count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
  3168. ret = VM_FAULT_MAJOR;
  3169. fpin = do_sync_mmap_readahead(vmf);
  3170. retry_find:
  3171. /*
  3172. * See comment in filemap_create_folio() why we need
  3173. * invalidate_lock
  3174. */
  3175. if (!mapping_locked) {
  3176. filemap_invalidate_lock_shared(mapping);
  3177. mapping_locked = true;
  3178. }
  3179. folio = __filemap_get_folio(mapping, index,
  3180. FGP_CREAT|FGP_FOR_MMAP,
  3181. vmf->gfp_mask);
  3182. if (IS_ERR(folio)) {
  3183. if (fpin)
  3184. goto out_retry;
  3185. filemap_invalidate_unlock_shared(mapping);
  3186. return VM_FAULT_OOM;
  3187. }
  3188. }
  3189. if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
  3190. goto out_retry;
  3191. /* Did it get truncated? */
  3192. if (unlikely(folio->mapping != mapping)) {
  3193. folio_unlock(folio);
  3194. folio_put(folio);
  3195. goto retry_find;
  3196. }
  3197. VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
  3198. /*
  3199. * We have a locked folio in the page cache, now we need to check
  3200. * that it's up-to-date. If not, it is going to be due to an error,
  3201. * or because readahead was otherwise unable to retrieve it.
  3202. */
  3203. if (unlikely(!folio_test_uptodate(folio))) {
  3204. /*
  3205. * If the invalidate lock is not held, the folio was in cache
  3206. * and uptodate and now it is not. Strange but possible since we
  3207. * didn't hold the page lock all the time. Let's drop
  3208. * everything, get the invalidate lock and try again.
  3209. */
  3210. if (!mapping_locked) {
  3211. folio_unlock(folio);
  3212. folio_put(folio);
  3213. goto retry_find;
  3214. }
  3215. /*
  3216. * OK, the folio is really not uptodate. This can be because the
  3217. * VMA has the VM_RAND_READ flag set, or because an error
  3218. * arose. Let's read it in directly.
  3219. */
  3220. goto page_not_uptodate;
  3221. }
  3222. /*
  3223. * We've made it this far and we had to drop our mmap_lock, now is the
  3224. * time to return to the upper layer and have it re-find the vma and
  3225. * redo the fault.
  3226. */
  3227. if (fpin) {
  3228. folio_unlock(folio);
  3229. goto out_retry;
  3230. }
  3231. if (mapping_locked)
  3232. filemap_invalidate_unlock_shared(mapping);
  3233. /*
  3234. * Found the page and have a reference on it.
  3235. * We must recheck i_size under page lock.
  3236. */
  3237. max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
  3238. if (unlikely(index >= max_idx)) {
  3239. folio_unlock(folio);
  3240. folio_put(folio);
  3241. return VM_FAULT_SIGBUS;
  3242. }
  3243. vmf->page = folio_file_page(folio, index);
  3244. return ret | VM_FAULT_LOCKED;
  3245. page_not_uptodate:
  3246. /*
  3247. * Umm, take care of errors if the page isn't up-to-date.
  3248. * Try to re-read it _once_. We do this synchronously,
  3249. * because there really aren't any performance issues here
  3250. * and we need to check for errors.
  3251. */
  3252. fpin = maybe_unlock_mmap_for_io(vmf, fpin);
  3253. error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
  3254. if (fpin)
  3255. goto out_retry;
  3256. folio_put(folio);
  3257. if (!error || error == AOP_TRUNCATED_PAGE)
  3258. goto retry_find;
  3259. filemap_invalidate_unlock_shared(mapping);
  3260. return VM_FAULT_SIGBUS;
  3261. out_retry:
  3262. /*
  3263. * We dropped the mmap_lock, we need to return to the fault handler to
  3264. * re-find the vma and come back and find our hopefully still populated
  3265. * page.
  3266. */
  3267. if (!IS_ERR(folio))
  3268. folio_put(folio);
  3269. if (mapping_locked)
  3270. filemap_invalidate_unlock_shared(mapping);
  3271. if (fpin)
  3272. fput(fpin);
  3273. return ret | VM_FAULT_RETRY;
  3274. }
  3275. EXPORT_SYMBOL(filemap_fault);
  3276. static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
  3277. pgoff_t start)
  3278. {
  3279. struct mm_struct *mm = vmf->vma->vm_mm;
  3280. /* Huge page is mapped? No need to proceed. */
  3281. if (pmd_trans_huge(*vmf->pmd)) {
  3282. folio_unlock(folio);
  3283. folio_put(folio);
  3284. return true;
  3285. }
  3286. if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
  3287. struct page *page = folio_file_page(folio, start);
  3288. vm_fault_t ret = do_set_pmd(vmf, folio, page);
  3289. if (!ret) {
  3290. /* The page is mapped successfully, reference consumed. */
  3291. folio_unlock(folio);
  3292. return true;
  3293. }
  3294. }
  3295. if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
  3296. pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
  3297. return false;
  3298. }
  3299. static struct folio *next_uptodate_folio(struct xa_state *xas,
  3300. struct address_space *mapping, pgoff_t end_pgoff)
  3301. {
  3302. struct folio *folio = xas_next_entry(xas, end_pgoff);
  3303. unsigned long max_idx;
  3304. do {
  3305. if (!folio)
  3306. return NULL;
  3307. if (xas_retry(xas, folio))
  3308. continue;
  3309. if (xa_is_value(folio))
  3310. continue;
  3311. if (!folio_try_get(folio))
  3312. continue;
  3313. if (folio_test_locked(folio))
  3314. goto skip;
  3315. /* Has the page moved or been split? */
  3316. if (unlikely(folio != xas_reload(xas)))
  3317. goto skip;
  3318. if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
  3319. goto skip;
  3320. if (!folio_trylock(folio))
  3321. goto skip;
  3322. if (folio->mapping != mapping)
  3323. goto unlock;
  3324. if (!folio_test_uptodate(folio))
  3325. goto unlock;
  3326. max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
  3327. if (xas->xa_index >= max_idx)
  3328. goto unlock;
  3329. return folio;
  3330. unlock:
  3331. folio_unlock(folio);
  3332. skip:
  3333. folio_put(folio);
  3334. } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);
  3335. return NULL;
  3336. }
  3337. /*
  3338. * Map page range [start_page, start_page + nr_pages) of folio.
  3339. * start_page is gotten from start by folio_page(folio, start)
  3340. */
  3341. static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
  3342. struct folio *folio, unsigned long start,
  3343. unsigned long addr, unsigned int nr_pages,
  3344. unsigned long *rss, unsigned short *mmap_miss,
  3345. pgoff_t file_end)
  3346. {
  3347. struct address_space *mapping = folio->mapping;
  3348. unsigned int ref_from_caller = 1;
  3349. vm_fault_t ret = 0;
  3350. struct page *page = folio_page(folio, start);
  3351. unsigned int count = 0;
  3352. pte_t *old_ptep = vmf->pte;
  3353. unsigned long addr0;
  3354. /*
  3355. * Map the large folio fully where possible:
  3356. *
  3357. * - The folio is fully within size of the file or belong
  3358. * to shmem/tmpfs;
  3359. * - The folio doesn't cross VMA boundary;
  3360. * - The folio doesn't cross page table boundary;
  3361. */
  3362. addr0 = addr - start * PAGE_SIZE;
  3363. if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
  3364. folio_within_vma(folio, vmf->vma) &&
  3365. (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {
  3366. vmf->pte -= start;
  3367. page -= start;
  3368. addr = addr0;
  3369. nr_pages = folio_nr_pages(folio);
  3370. }
  3371. do {
  3372. if (PageHWPoison(page + count))
  3373. goto skip;
  3374. /*
  3375. * If there are too many folios that are recently evicted
  3376. * in a file, they will probably continue to be evicted.
  3377. * In such situation, read-ahead is only a waste of IO.
  3378. * Don't decrease mmap_miss in this scenario to make sure
  3379. * we can stop read-ahead.
  3380. */
  3381. if (!folio_test_workingset(folio))
  3382. (*mmap_miss)++;
  3383. /*
  3384. * NOTE: If there're PTE markers, we'll leave them to be
  3385. * handled in the specific fault path, and it'll prohibit the
  3386. * fault-around logic.
  3387. */
  3388. if (!pte_none(ptep_get(&vmf->pte[count])))
  3389. goto skip;
  3390. count++;
  3391. continue;
  3392. skip:
  3393. if (count) {
  3394. set_pte_range(vmf, folio, page, count, addr);
  3395. *rss += count;
  3396. folio_ref_add(folio, count - ref_from_caller);
  3397. ref_from_caller = 0;
  3398. if (in_range(vmf->address, addr, count * PAGE_SIZE))
  3399. ret = VM_FAULT_NOPAGE;
  3400. }
  3401. count++;
  3402. page += count;
  3403. vmf->pte += count;
  3404. addr += count * PAGE_SIZE;
  3405. count = 0;
  3406. } while (--nr_pages > 0);
  3407. if (count) {
  3408. set_pte_range(vmf, folio, page, count, addr);
  3409. *rss += count;
  3410. folio_ref_add(folio, count - ref_from_caller);
  3411. ref_from_caller = 0;
  3412. if (in_range(vmf->address, addr, count * PAGE_SIZE))
  3413. ret = VM_FAULT_NOPAGE;
  3414. }
  3415. vmf->pte = old_ptep;
  3416. if (ref_from_caller)
  3417. /* Locked folios cannot get truncated. */
  3418. folio_ref_dec(folio);
  3419. return ret;
  3420. }
  3421. static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
  3422. struct folio *folio, unsigned long addr,
  3423. unsigned long *rss, unsigned short *mmap_miss)
  3424. {
  3425. vm_fault_t ret = 0;
  3426. struct page *page = &folio->page;
  3427. if (PageHWPoison(page))
  3428. goto out;
  3429. /* See comment of filemap_map_folio_range() */
  3430. if (!folio_test_workingset(folio))
  3431. (*mmap_miss)++;
  3432. /*
  3433. * NOTE: If there're PTE markers, we'll leave them to be
  3434. * handled in the specific fault path, and it'll prohibit
  3435. * the fault-around logic.
  3436. */
  3437. if (!pte_none(ptep_get(vmf->pte)))
  3438. goto out;
  3439. if (vmf->address == addr)
  3440. ret = VM_FAULT_NOPAGE;
  3441. set_pte_range(vmf, folio, page, 1, addr);
  3442. (*rss)++;
  3443. return ret;
  3444. out:
  3445. /* Locked folios cannot get truncated. */
  3446. folio_ref_dec(folio);
  3447. return ret;
  3448. }
  3449. vm_fault_t filemap_map_pages(struct vm_fault *vmf,
  3450. pgoff_t start_pgoff, pgoff_t end_pgoff)
  3451. {
  3452. struct vm_area_struct *vma = vmf->vma;
  3453. struct file *file = vma->vm_file;
  3454. struct address_space *mapping = file->f_mapping;
  3455. pgoff_t file_end, last_pgoff = start_pgoff;
  3456. unsigned long addr;
  3457. XA_STATE(xas, &mapping->i_pages, start_pgoff);
  3458. struct folio *folio;
  3459. vm_fault_t ret = 0;
  3460. unsigned long rss = 0;
  3461. unsigned int nr_pages = 0, folio_type;
  3462. unsigned short mmap_miss = 0, mmap_miss_saved;
  3463. /*
  3464. * Recalculate end_pgoff based on file_end before calling
  3465. * next_uptodate_folio() to avoid races with concurrent
  3466. * truncation.
  3467. */
  3468. file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
  3469. end_pgoff = min(end_pgoff, file_end);
  3470. rcu_read_lock();
  3471. folio = next_uptodate_folio(&xas, mapping, end_pgoff);
  3472. if (!folio)
  3473. goto out;
  3474. /*
  3475. * Do not allow to map with PMD across i_size to preserve
  3476. * SIGBUS semantics.
  3477. *
  3478. * Make an exception for shmem/tmpfs that for long time
  3479. * intentionally mapped with PMDs across i_size.
  3480. */
  3481. if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
  3482. filemap_map_pmd(vmf, folio, start_pgoff)) {
  3483. ret = VM_FAULT_NOPAGE;
  3484. goto out;
  3485. }
  3486. addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  3487. vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
  3488. if (!vmf->pte) {
  3489. folio_unlock(folio);
  3490. folio_put(folio);
  3491. goto out;
  3492. }
  3493. folio_type = mm_counter_file(folio);
  3494. do {
  3495. unsigned long end;
  3496. addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
  3497. vmf->pte += xas.xa_index - last_pgoff;
  3498. last_pgoff = xas.xa_index;
  3499. end = folio_next_index(folio) - 1;
  3500. nr_pages = min(end, end_pgoff) - xas.xa_index + 1;
  3501. if (!folio_test_large(folio))
  3502. ret |= filemap_map_order0_folio(vmf,
  3503. folio, addr, &rss, &mmap_miss);
  3504. else
  3505. ret |= filemap_map_folio_range(vmf, folio,
  3506. xas.xa_index - folio->index, addr,
  3507. nr_pages, &rss, &mmap_miss, file_end);
  3508. folio_unlock(folio);
  3509. } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
  3510. add_mm_counter(vma->vm_mm, folio_type, rss);
  3511. pte_unmap_unlock(vmf->pte, vmf->ptl);
  3512. trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);
  3513. out:
  3514. rcu_read_unlock();
  3515. mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
  3516. if (mmap_miss >= mmap_miss_saved)
  3517. WRITE_ONCE(file->f_ra.mmap_miss, 0);
  3518. else
  3519. WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);
  3520. return ret;
  3521. }
  3522. EXPORT_SYMBOL(filemap_map_pages);
  3523. vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
  3524. {
  3525. struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  3526. struct folio *folio = page_folio(vmf->page);
  3527. vm_fault_t ret = VM_FAULT_LOCKED;
  3528. sb_start_pagefault(mapping->host->i_sb);
  3529. file_update_time(vmf->vma->vm_file);
  3530. folio_lock(folio);
  3531. if (folio->mapping != mapping) {
  3532. folio_unlock(folio);
  3533. ret = VM_FAULT_NOPAGE;
  3534. goto out;
  3535. }
  3536. /*
  3537. * We mark the folio dirty already here so that when freeze is in
  3538. * progress, we are guaranteed that writeback during freezing will
  3539. * see the dirty folio and writeprotect it again.
  3540. */
  3541. folio_mark_dirty(folio);
  3542. folio_wait_stable(folio);
  3543. out:
  3544. sb_end_pagefault(mapping->host->i_sb);
  3545. return ret;
  3546. }
  3547. const struct vm_operations_struct generic_file_vm_ops = {
  3548. .fault = filemap_fault,
  3549. .map_pages = filemap_map_pages,
  3550. .page_mkwrite = filemap_page_mkwrite,
  3551. };
  3552. /* This is used for a general mmap of a disk file */
  3553. int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
  3554. {
  3555. struct address_space *mapping = file->f_mapping;
  3556. if (!mapping->a_ops->read_folio)
  3557. return -ENOEXEC;
  3558. file_accessed(file);
  3559. vma->vm_ops = &generic_file_vm_ops;
  3560. return 0;
  3561. }
  3562. int generic_file_mmap_prepare(struct vm_area_desc *desc)
  3563. {
  3564. struct file *file = desc->file;
  3565. struct address_space *mapping = file->f_mapping;
  3566. if (!mapping->a_ops->read_folio)
  3567. return -ENOEXEC;
  3568. file_accessed(file);
  3569. desc->vm_ops = &generic_file_vm_ops;
  3570. return 0;
  3571. }
  3572. /*
  3573. * This is for filesystems which do not implement ->writepage.
  3574. */
  3575. int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
  3576. {
  3577. if (vma_is_shared_maywrite(vma))
  3578. return -EINVAL;
  3579. return generic_file_mmap(file, vma);
  3580. }
  3581. int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
  3582. {
  3583. if (is_shared_maywrite(&desc->vma_flags))
  3584. return -EINVAL;
  3585. return generic_file_mmap_prepare(desc);
  3586. }
  3587. #else
  3588. vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
  3589. {
  3590. return VM_FAULT_SIGBUS;
  3591. }
  3592. int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
  3593. {
  3594. return -ENOSYS;
  3595. }
  3596. int generic_file_mmap_prepare(struct vm_area_desc *desc)
  3597. {
  3598. return -ENOSYS;
  3599. }
  3600. int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
  3601. {
  3602. return -ENOSYS;
  3603. }
  3604. int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
  3605. {
  3606. return -ENOSYS;
  3607. }
  3608. #endif /* CONFIG_MMU */
  3609. EXPORT_SYMBOL(filemap_page_mkwrite);
  3610. EXPORT_SYMBOL(generic_file_mmap);
  3611. EXPORT_SYMBOL(generic_file_mmap_prepare);
  3612. EXPORT_SYMBOL(generic_file_readonly_mmap);
  3613. EXPORT_SYMBOL(generic_file_readonly_mmap_prepare);
  3614. static struct folio *do_read_cache_folio(struct address_space *mapping,
  3615. pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
  3616. {
  3617. struct folio *folio;
  3618. int err;
  3619. if (!filler)
  3620. filler = mapping->a_ops->read_folio;
  3621. repeat:
  3622. folio = filemap_get_folio(mapping, index);
  3623. if (IS_ERR(folio)) {
  3624. folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL);
  3625. if (!folio)
  3626. return ERR_PTR(-ENOMEM);
  3627. index = mapping_align_index(mapping, index);
  3628. err = filemap_add_folio(mapping, folio, index, gfp);
  3629. if (unlikely(err)) {
  3630. folio_put(folio);
  3631. if (err == -EEXIST)
  3632. goto repeat;
  3633. /* Presumably ENOMEM for xarray node */
  3634. return ERR_PTR(err);
  3635. }
  3636. goto filler;
  3637. }
  3638. if (folio_test_uptodate(folio))
  3639. goto out;
  3640. if (!folio_trylock(folio)) {
  3641. folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
  3642. goto repeat;
  3643. }
  3644. /* Folio was truncated from mapping */
  3645. if (!folio->mapping) {
  3646. folio_unlock(folio);
  3647. folio_put(folio);
  3648. goto repeat;
  3649. }
  3650. /* Someone else locked and filled the page in a very small window */
  3651. if (folio_test_uptodate(folio)) {
  3652. folio_unlock(folio);
  3653. goto out;
  3654. }
  3655. filler:
  3656. err = filemap_read_folio(file, filler, folio);
  3657. if (err) {
  3658. folio_put(folio);
  3659. if (err == AOP_TRUNCATED_PAGE)
  3660. goto repeat;
  3661. return ERR_PTR(err);
  3662. }
  3663. out:
  3664. folio_mark_accessed(folio);
  3665. return folio;
  3666. }
  3667. /**
  3668. * read_cache_folio - Read into page cache, fill it if needed.
  3669. * @mapping: The address_space to read from.
  3670. * @index: The index to read.
  3671. * @filler: Function to perform the read, or NULL to use aops->read_folio().
  3672. * @file: Passed to filler function, may be NULL if not required.
  3673. *
  3674. * Read one page into the page cache. If it succeeds, the folio returned
  3675. * will contain @index, but it may not be the first page of the folio.
  3676. *
  3677. * If the filler function returns an error, it will be returned to the
  3678. * caller.
  3679. *
  3680. * Context: May sleep. Expects mapping->invalidate_lock to be held.
  3681. * Return: An uptodate folio on success, ERR_PTR() on failure.
  3682. */
  3683. struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
  3684. filler_t filler, struct file *file)
  3685. {
  3686. return do_read_cache_folio(mapping, index, filler, file,
  3687. mapping_gfp_mask(mapping));
  3688. }
  3689. EXPORT_SYMBOL(read_cache_folio);
  3690. /**
  3691. * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
  3692. * @mapping: The address_space for the folio.
  3693. * @index: The index that the allocated folio will contain.
  3694. * @gfp: The page allocator flags to use if allocating.
  3695. *
  3696. * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
  3697. * any new memory allocations done using the specified allocation flags.
  3698. *
  3699. * The most likely error from this function is EIO, but ENOMEM is
  3700. * possible and so is EINTR. If ->read_folio returns another error,
  3701. * that will be returned to the caller.
  3702. *
  3703. * The function expects mapping->invalidate_lock to be already held.
  3704. *
  3705. * Return: Uptodate folio on success, ERR_PTR() on failure.
  3706. */
  3707. struct folio *mapping_read_folio_gfp(struct address_space *mapping,
  3708. pgoff_t index, gfp_t gfp)
  3709. {
  3710. return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
  3711. }
  3712. EXPORT_SYMBOL(mapping_read_folio_gfp);
  3713. static struct page *do_read_cache_page(struct address_space *mapping,
  3714. pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
  3715. {
  3716. struct folio *folio;
  3717. folio = do_read_cache_folio(mapping, index, filler, file, gfp);
  3718. if (IS_ERR(folio))
  3719. return &folio->page;
  3720. return folio_file_page(folio, index);
  3721. }
  3722. struct page *read_cache_page(struct address_space *mapping,
  3723. pgoff_t index, filler_t *filler, struct file *file)
  3724. {
  3725. return do_read_cache_page(mapping, index, filler, file,
  3726. mapping_gfp_mask(mapping));
  3727. }
  3728. EXPORT_SYMBOL(read_cache_page);
  3729. /**
  3730. * read_cache_page_gfp - read into page cache, using specified page allocation flags.
  3731. * @mapping: the page's address_space
  3732. * @index: the page index
  3733. * @gfp: the page allocator flags to use if allocating
  3734. *
  3735. * This is the same as "read_mapping_page(mapping, index, NULL)", but with
  3736. * any new page allocations done using the specified allocation flags.
  3737. *
  3738. * If the page does not get brought uptodate, return -EIO.
  3739. *
  3740. * The function expects mapping->invalidate_lock to be already held.
  3741. *
  3742. * Return: up to date page on success, ERR_PTR() on failure.
  3743. */
  3744. struct page *read_cache_page_gfp(struct address_space *mapping,
  3745. pgoff_t index,
  3746. gfp_t gfp)
  3747. {
  3748. return do_read_cache_page(mapping, index, NULL, NULL, gfp);
  3749. }
  3750. EXPORT_SYMBOL(read_cache_page_gfp);
  3751. /*
  3752. * Warn about a page cache invalidation failure during a direct I/O write.
  3753. */
  3754. static void dio_warn_stale_pagecache(struct file *filp)
  3755. {
  3756. static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
  3757. char pathname[128];
  3758. char *path;
  3759. errseq_set(&filp->f_mapping->wb_err, -EIO);
  3760. if (__ratelimit(&_rs)) {
  3761. path = file_path(filp, pathname, sizeof(pathname));
  3762. if (IS_ERR(path))
  3763. path = "(unknown)";
  3764. pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
  3765. pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
  3766. current->comm);
  3767. }
  3768. }
  3769. void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
  3770. {
  3771. struct address_space *mapping = iocb->ki_filp->f_mapping;
  3772. if (mapping->nrpages &&
  3773. invalidate_inode_pages2_range(mapping,
  3774. iocb->ki_pos >> PAGE_SHIFT,
  3775. (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
  3776. dio_warn_stale_pagecache(iocb->ki_filp);
  3777. }
  3778. ssize_t
  3779. generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
  3780. {
  3781. struct address_space *mapping = iocb->ki_filp->f_mapping;
  3782. size_t write_len = iov_iter_count(from);
  3783. ssize_t written;
  3784. /*
  3785. * If a page can not be invalidated, return 0 to fall back
  3786. * to buffered write.
  3787. */
  3788. written = kiocb_invalidate_pages(iocb, write_len);
  3789. if (written) {
  3790. if (written == -EBUSY)
  3791. return 0;
  3792. return written;
  3793. }
  3794. written = mapping->a_ops->direct_IO(iocb, from);
  3795. /*
  3796. * Finally, try again to invalidate clean pages which might have been
  3797. * cached by non-direct readahead, or faulted in by get_user_pages()
  3798. * if the source of the write was an mmap'ed region of the file
  3799. * we're writing. Either one is a pretty crazy thing to do,
  3800. * so we don't support it 100%. If this invalidation
  3801. * fails, tough, the write still worked...
  3802. *
  3803. * Most of the time we do not need this since dio_complete() will do
  3804. * the invalidation for us. However there are some file systems that
  3805. * do not end up with dio_complete() being called, so let's not break
  3806. * them by removing it completely.
  3807. *
  3808. * Noticeable example is a blkdev_direct_IO().
  3809. *
  3810. * Skip invalidation for async writes or if mapping has no pages.
  3811. */
  3812. if (written > 0) {
  3813. struct inode *inode = mapping->host;
  3814. loff_t pos = iocb->ki_pos;
  3815. kiocb_invalidate_post_direct_write(iocb, written);
  3816. pos += written;
  3817. write_len -= written;
  3818. if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
  3819. i_size_write(inode, pos);
  3820. mark_inode_dirty(inode);
  3821. }
  3822. iocb->ki_pos = pos;
  3823. }
  3824. if (written != -EIOCBQUEUED)
  3825. iov_iter_revert(from, write_len - iov_iter_count(from));
  3826. return written;
  3827. }
  3828. EXPORT_SYMBOL(generic_file_direct_write);
/*
 * generic_perform_write - buffered write loop built on ->write_begin() and
 * ->write_end()
 * @iocb: IO state structure; ->ki_pos is advanced by the bytes written
 * @i: iov_iter with data to write
 *
 * Copies data from @i into page cache folios, at most one chunk at a time,
 * until @i is exhausted or an error stops the loop.  The chunk size starts
 * at mapping_max_folio_size() and is halved (not below PAGE_SIZE) whenever
 * ->write_end() accepts nothing.
 *
 * Return: the number of bytes written if that is non-zero, otherwise the
 * last status (0 or a negative error code).
 */
ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
	struct file *file = iocb->ki_filp;
	loff_t pos = iocb->ki_pos;
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	size_t chunk = mapping_max_folio_size(mapping);
	long status = 0;
	ssize_t written = 0;

	do {
		struct folio *folio;
		size_t offset;		/* Offset into folio */
		size_t bytes;		/* Bytes to write to folio */
		size_t copied;		/* Bytes copied from user */
		void *fsdata = NULL;

		bytes = iov_iter_count(i);
retry:
		/* Never let a single pass cross a chunk boundary. */
		offset = pos & (chunk - 1);
		bytes = min(chunk - offset, bytes);
		balance_dirty_pages_ratelimited(mapping);

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(iocb, mapping, pos, bytes,
						&folio, &fsdata);
		if (unlikely(status < 0))
			break;

		/* Recompute against the folio actually returned and clamp. */
		offset = offset_in_folio(folio, pos);
		if (bytes > folio_size(folio) - offset)
			bytes = folio_size(folio) - offset;

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		/*
		 * Faults here on mmap()s can recurse into arbitrary
		 * filesystem code. Lots of locks are held that can
		 * deadlock. Use an atomic copy to avoid deadlocking
		 * in page fault handling.
		 */
		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
		flush_dcache_folio(folio);

		status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
						folio, fsdata);
		if (unlikely(status != copied)) {
			/*
			 * ->write_end() took fewer bytes than were copied
			 * (or failed): rewind the iterator by the difference.
			 */
			iov_iter_revert(i, copied - max(status, 0L));
			if (unlikely(status < 0))
				break;
		}
		cond_resched();

		if (unlikely(status == 0)) {
			/*
			 * A short copy made ->write_end() reject the
			 * thing entirely.  Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			if (chunk > PAGE_SIZE)
				chunk /= 2;
			if (copied) {
				/* Retry with just the part that did copy. */
				bytes = copied;
				goto retry;
			}

			/*
			 * 'folio' is now unlocked and faults on it can be
			 * handled. Ensure forward progress by trying to
			 * fault it in now.
			 */
			if (fault_in_iov_iter_readable(i, bytes) == bytes) {
				status = -EFAULT;
				break;
			}
		} else {
			pos += status;
			written += status;
		}
	} while (iov_iter_count(i));

	/* Nothing written at all: report the status instead. */
	if (!written)
		return status;
	iocb->ki_pos += written;
	return written;
}
EXPORT_SYMBOL(generic_perform_write);
  3911. /**
  3912. * __generic_file_write_iter - write data to a file
  3913. * @iocb: IO state structure (file, offset, etc.)
  3914. * @from: iov_iter with data to write
  3915. *
  3916. * This function does all the work needed for actually writing data to a
  3917. * file. It does all basic checks, removes SUID from the file, updates
  3918. * modification times and calls proper subroutines depending on whether we
  3919. * do direct IO or a standard buffered write.
  3920. *
  3921. * It expects i_rwsem to be grabbed unless we work on a block device or similar
  3922. * object which does not need locking at all.
  3923. *
  3924. * This function does *not* take care of syncing data in case of O_SYNC write.
  3925. * A caller has to handle it. This is mainly due to the fact that we want to
  3926. * avoid syncing under i_rwsem.
  3927. *
  3928. * Return:
  3929. * * number of bytes written, even for truncated writes
  3930. * * negative error code if no data has been written at all
  3931. */
  3932. ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  3933. {
  3934. struct file *file = iocb->ki_filp;
  3935. struct address_space *mapping = file->f_mapping;
  3936. struct inode *inode = mapping->host;
  3937. ssize_t ret;
  3938. ret = file_remove_privs(file);
  3939. if (ret)
  3940. return ret;
  3941. ret = file_update_time(file);
  3942. if (ret)
  3943. return ret;
  3944. if (iocb->ki_flags & IOCB_DIRECT) {
  3945. ret = generic_file_direct_write(iocb, from);
  3946. /*
  3947. * If the write stopped short of completing, fall back to
  3948. * buffered writes. Some filesystems do this for writes to
  3949. * holes, for example. For DAX files, a buffered write will
  3950. * not succeed (even if it did, DAX does not handle dirty
  3951. * page-cache pages correctly).
  3952. */
  3953. if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
  3954. return ret;
  3955. return direct_write_fallback(iocb, from, ret,
  3956. generic_perform_write(iocb, from));
  3957. }
  3958. return generic_perform_write(iocb, from);
  3959. }
  3960. EXPORT_SYMBOL(__generic_file_write_iter);
  3961. /**
  3962. * generic_file_write_iter - write data to a file
  3963. * @iocb: IO state structure
  3964. * @from: iov_iter with data to write
  3965. *
  3966. * This is a wrapper around __generic_file_write_iter() to be used by most
  3967. * filesystems. It takes care of syncing the file in case of O_SYNC file
  3968. * and acquires i_rwsem as needed.
  3969. * Return:
  3970. * * negative error code if no data has been written at all of
  3971. * vfs_fsync_range() failed for a synchronous write
  3972. * * number of bytes written, even for truncated writes
  3973. */
  3974. ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  3975. {
  3976. struct file *file = iocb->ki_filp;
  3977. struct inode *inode = file->f_mapping->host;
  3978. ssize_t ret;
  3979. inode_lock(inode);
  3980. ret = generic_write_checks(iocb, from);
  3981. if (ret > 0)
  3982. ret = __generic_file_write_iter(iocb, from);
  3983. inode_unlock(inode);
  3984. if (ret > 0)
  3985. ret = generic_write_sync(iocb, ret);
  3986. return ret;
  3987. }
  3988. EXPORT_SYMBOL(generic_file_write_iter);
  3989. /**
  3990. * filemap_release_folio() - Release fs-specific metadata on a folio.
  3991. * @folio: The folio which the kernel is trying to free.
  3992. * @gfp: Memory allocation flags (and I/O mode).
  3993. *
  3994. * The address_space is trying to release any data attached to a folio
  3995. * (presumably at folio->private).
  3996. *
  3997. * This will also be called if the private_2 flag is set on a page,
  3998. * indicating that the folio has other metadata associated with it.
  3999. *
  4000. * The @gfp argument specifies whether I/O may be performed to release
  4001. * this page (__GFP_IO), and whether the call may block
  4002. * (__GFP_RECLAIM & __GFP_FS).
  4003. *
  4004. * Return: %true if the release was successful, otherwise %false.
  4005. */
  4006. bool filemap_release_folio(struct folio *folio, gfp_t gfp)
  4007. {
  4008. struct address_space * const mapping = folio->mapping;
  4009. BUG_ON(!folio_test_locked(folio));
  4010. if (!folio_needs_release(folio))
  4011. return true;
  4012. if (folio_test_writeback(folio))
  4013. return false;
  4014. if (mapping && mapping->a_ops->release_folio)
  4015. return mapping->a_ops->release_folio(folio, gfp);
  4016. return try_to_free_buffers(folio);
  4017. }
  4018. EXPORT_SYMBOL(filemap_release_folio);
  4019. /**
  4020. * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
  4021. * @inode: The inode to flush
  4022. * @flush: Set to write back rather than simply invalidate.
  4023. * @start: First byte to in range.
  4024. * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
  4025. * onwards.
  4026. *
  4027. * Invalidate all the folios on an inode that contribute to the specified
  4028. * range, possibly writing them back first. Whilst the operation is
  4029. * undertaken, the invalidate lock is held to prevent new folios from being
  4030. * installed.
  4031. */
  4032. int filemap_invalidate_inode(struct inode *inode, bool flush,
  4033. loff_t start, loff_t end)
  4034. {
  4035. struct address_space *mapping = inode->i_mapping;
  4036. pgoff_t first = start >> PAGE_SHIFT;
  4037. pgoff_t last = end >> PAGE_SHIFT;
  4038. pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;
  4039. if (!mapping || !mapping->nrpages || end < start)
  4040. goto out;
  4041. /* Prevent new folios from being added to the inode. */
  4042. filemap_invalidate_lock(mapping);
  4043. if (!mapping->nrpages)
  4044. goto unlock;
  4045. unmap_mapping_pages(mapping, first, nr, false);
  4046. /* Write back the data if we're asked to. */
  4047. if (flush)
  4048. filemap_fdatawrite_range(mapping, start, end);
  4049. /* Wait for writeback to complete on all folios and discard. */
  4050. invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);
  4051. unlock:
  4052. filemap_invalidate_unlock(mapping);
  4053. out:
  4054. return filemap_check_errors(mapping);
  4055. }
  4056. EXPORT_SYMBOL_GPL(filemap_invalidate_inode);
  4057. #ifdef CONFIG_CACHESTAT_SYSCALL
/**
 * filemap_cachestat() - compute the page cache statistics of a mapping
 * @mapping:	The mapping to compute the statistics for.
 * @first_index:	The starting page cache index.
 * @last_index:	The final page index (inclusive).
 * @cs:	the cachestat struct to write the result to.
 *
 * This will query the page cache statistics of a mapping in the
 * page range of [first_index, last_index] (inclusive). The statistics
 * queried include: number of cached pages, number of dirty pages, number
 * of pages marked for writeback, and the number of (recently) evicted pages.
 *
 * The counters in @cs are only ever incremented here; the caller is expected
 * to have initialised them.
 */
static void filemap_cachestat(struct address_space *mapping,
		pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
{
	XA_STATE(xas, &mapping->i_pages, first_index);
	struct folio *folio;

	/* Flush stats (and potentially sleep) outside the RCU read section. */
	mem_cgroup_flush_stats_ratelimited(NULL);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_index) {
		int order;
		unsigned long nr_pages;
		pgoff_t folio_first_index, folio_last_index;

		/*
		 * Don't deref the folio. It is not pinned, and might
		 * get freed (and reused) underneath us.
		 *
		 * We *could* pin it, but that would be expensive for
		 * what should be a fast and lightweight syscall.
		 *
		 * Instead, derive all information of interest from
		 * the rcu-protected xarray.
		 */
		if (xas_retry(&xas, folio))
			continue;

		/* Index span covered by this entry, taken from its order. */
		order = xas_get_order(&xas);
		nr_pages = 1 << order;
		folio_first_index = round_down(xas.xa_index, 1 << order);
		folio_last_index = folio_first_index + nr_pages - 1;

		/* Folios might straddle the range boundaries, only count covered pages */
		if (folio_first_index < first_index)
			nr_pages -= first_index - folio_first_index;

		if (folio_last_index > last_index)
			nr_pages -= folio_last_index - last_index;

		if (xa_is_value(folio)) {
			/* page is evicted */
			void *shadow = (void *)folio;
			bool workingset; /* not used */

			cs->nr_evicted += nr_pages;

#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
			if (shmem_mapping(mapping)) {
				/* shmem file - in swap cache */
				swp_entry_t swp = radix_to_swp_entry(folio);

				/* swapin error results in poisoned entry */
				if (!softleaf_is_swap(swp))
					goto resched;

				/*
				 * Getting a swap entry from the shmem
				 * inode means we beat
				 * shmem_unuse(). rcu_read_lock()
				 * ensures swapoff waits for us before
				 * freeing the swapper space. However,
				 * we can race with swapping and
				 * invalidation, so there might not be
				 * a shadow in the swapcache (yet).
				 */
				shadow = swap_cache_get_shadow(swp);
				if (!shadow)
					goto resched;
			}
#endif
			/* Counted as evicted above; also count it if recent. */
			if (workingset_test_recent(shadow, true, &workingset, false))
				cs->nr_recently_evicted += nr_pages;

			goto resched;
		}

		/* page is in cache */
		cs->nr_cache += nr_pages;

		/* Dirty/writeback state is read from the xarray marks. */
		if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
			cs->nr_dirty += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
			cs->nr_writeback += nr_pages;

resched:
		/*
		 * Pause the walk before cond_resched_rcu().
		 * NOTE(review): presumably required so the iteration can
		 * resume safely after rescheduling - confirm against the
		 * XArray documentation.
		 */
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}
  4148. /*
  4149. * See mincore: reveal pagecache information only for files
  4150. * that the calling process has write access to, or could (if
  4151. * tried) open for writing.
  4152. */
  4153. static inline bool can_do_cachestat(struct file *f)
  4154. {
  4155. if (f->f_mode & FMODE_WRITE)
  4156. return true;
  4157. if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)))
  4158. return true;
  4159. return file_permission(f, MAY_WRITE) == 0;
  4160. }
  4161. /*
  4162. * The cachestat(2) system call.
  4163. *
  4164. * cachestat() returns the page cache statistics of a file in the
  4165. * bytes range specified by `off` and `len`: number of cached pages,
  4166. * number of dirty pages, number of pages marked for writeback,
  4167. * number of evicted pages, and number of recently evicted pages.
  4168. *
  4169. * An evicted page is a page that is previously in the page cache
  4170. * but has been evicted since. A page is recently evicted if its last
  4171. * eviction was recent enough that its reentry to the cache would
  4172. * indicate that it is actively being used by the system, and that
  4173. * there is memory pressure on the system.
  4174. *
  4175. * `off` and `len` must be non-negative integers. If `len` > 0,
  4176. * the queried range is [`off`, `off` + `len`]. If `len` == 0,
  4177. * we will query in the range from `off` to the end of the file.
  4178. *
  4179. * The `flags` argument is unused for now, but is included for future
  4180. * extensibility. User should pass 0 (i.e no flag specified).
  4181. *
  4182. * Currently, hugetlbfs is not supported.
  4183. *
  4184. * Because the status of a page can change after cachestat() checks it
  4185. * but before it returns to the application, the returned values may
  4186. * contain stale information.
  4187. *
  4188. * return values:
  4189. * zero - success
  4190. * -EFAULT - cstat or cstat_range points to an illegal address
  4191. * -EINVAL - invalid flags
  4192. * -EBADF - invalid file descriptor
  4193. * -EOPNOTSUPP - file descriptor is of a hugetlbfs file
  4194. */
  4195. SYSCALL_DEFINE4(cachestat, unsigned int, fd,
  4196. struct cachestat_range __user *, cstat_range,
  4197. struct cachestat __user *, cstat, unsigned int, flags)
  4198. {
  4199. CLASS(fd, f)(fd);
  4200. struct address_space *mapping;
  4201. struct cachestat_range csr;
  4202. struct cachestat cs;
  4203. pgoff_t first_index, last_index;
  4204. if (fd_empty(f))
  4205. return -EBADF;
  4206. if (copy_from_user(&csr, cstat_range,
  4207. sizeof(struct cachestat_range)))
  4208. return -EFAULT;
  4209. /* hugetlbfs is not supported */
  4210. if (is_file_hugepages(fd_file(f)))
  4211. return -EOPNOTSUPP;
  4212. if (!can_do_cachestat(fd_file(f)))
  4213. return -EPERM;
  4214. if (flags != 0)
  4215. return -EINVAL;
  4216. first_index = csr.off >> PAGE_SHIFT;
  4217. last_index =
  4218. csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
  4219. memset(&cs, 0, sizeof(struct cachestat));
  4220. mapping = fd_file(f)->f_mapping;
  4221. filemap_cachestat(mapping, first_index, last_index, &cs);
  4222. if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
  4223. return -EFAULT;
  4224. return 0;
  4225. }
  4226. #endif /* CONFIG_CACHESTAT_SYSCALL */