dax.c 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * fs/dax.c - Direct Access filesystem code
  4. * Copyright (c) 2013-2014 Intel Corporation
  5. * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
  6. * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
  7. */
  8. #include <linux/atomic.h>
  9. #include <linux/blkdev.h>
  10. #include <linux/buffer_head.h>
  11. #include <linux/dax.h>
  12. #include <linux/fs.h>
  13. #include <linux/highmem.h>
  14. #include <linux/memcontrol.h>
  15. #include <linux/mm.h>
  16. #include <linux/mutex.h>
  17. #include <linux/pagevec.h>
  18. #include <linux/sched.h>
  19. #include <linux/sched/signal.h>
  20. #include <linux/uio.h>
  21. #include <linux/vmstat.h>
  22. #include <linux/sizes.h>
  23. #include <linux/mmu_notifier.h>
  24. #include <linux/iomap.h>
  25. #include <linux/rmap.h>
  26. #include <linux/pgalloc.h>
  27. #define CREATE_TRACE_POINTS
  28. #include <trace/events/fs_dax.h>
  /*
   * The DAX wait table has 4096 buckets, the same size as the per-zone
   * page wait tables.
   */
  #define DAX_WAIT_TABLE_BITS 12
  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

  /* Number of base pages covered by a single PMD. */
  #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
  /* Mask for the 'colour' (i.e. the low bits) of a page offset within a PMD. */
  #define PG_PMD_COLOUR (PG_PMD_NR - 1)
  35. static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  36. static int __init init_dax_wait_table(void)
  37. {
  38. int i;
  39. for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
  40. init_waitqueue_head(wait_table + i);
  41. return 0;
  42. }
  43. fs_initcall(init_dax_wait_table);
  /*
   * DAX page cache entries are stored as XArray value entries so that they
   * can never be confused with pages. The low four bits of the value are
   * flags:
   *
   *   DAX_LOCKED    - the entry is locked
   *   DAX_PMD       - the entry has PMD size (PAGE_SIZE if clear)
   *   DAX_ZERO_PAGE - the entry is a zero page
   *   DAX_EMPTY     - the entry is empty and only exists for locking
   *
   * An entry with neither DAX_ZERO_PAGE nor DAX_EMPTY set is a normal DAX
   * entry with a filesystem block allocation. Everything above DAX_SHIFT
   * holds the PFN.
   */
  #define DAX_SHIFT (4)
  #define DAX_LOCKED (1UL << 0)
  #define DAX_PMD (1UL << 1)
  #define DAX_ZERO_PAGE (1UL << 2)
  #define DAX_EMPTY (1UL << 3)
  59. static unsigned long dax_to_pfn(void *entry)
  60. {
  61. return xa_to_value(entry) >> DAX_SHIFT;
  62. }
  /* Return the folio containing the page at the PFN encoded in @entry. */
  static struct folio *dax_to_folio(void *entry)
  {
  	struct page *page = pfn_to_page(dax_to_pfn(entry));

  	return page_folio(page);
  }
  67. static void *dax_make_entry(unsigned long pfn, unsigned long flags)
  68. {
  69. return xa_mk_value(flags | (pfn << DAX_SHIFT));
  70. }
  71. static bool dax_is_locked(void *entry)
  72. {
  73. return xa_to_value(entry) & DAX_LOCKED;
  74. }
  75. static unsigned int dax_entry_order(void *entry)
  76. {
  77. if (xa_to_value(entry) & DAX_PMD)
  78. return PMD_ORDER;
  79. return 0;
  80. }
  81. static unsigned long dax_is_pmd_entry(void *entry)
  82. {
  83. return xa_to_value(entry) & DAX_PMD;
  84. }
  85. static bool dax_is_pte_entry(void *entry)
  86. {
  87. return !(xa_to_value(entry) & DAX_PMD);
  88. }
  89. static int dax_is_zero_entry(void *entry)
  90. {
  91. return xa_to_value(entry) & DAX_ZERO_PAGE;
  92. }
  93. static int dax_is_empty_entry(void *entry)
  94. {
  95. return xa_to_value(entry) & DAX_EMPTY;
  96. }
  97. /*
  98. * true if the entry that was found is of a smaller order than the entry
  99. * we were looking for
  100. */
  101. static bool dax_is_conflict(void *entry)
  102. {
  103. return entry == XA_RETRY_ENTRY;
  104. }
  105. /*
  106. * DAX page cache entry locking
  107. */
  108. struct exceptional_entry_key {
  109. struct xarray *xa;
  110. pgoff_t entry_start;
  111. };
  112. struct wait_exceptional_entry_queue {
  113. wait_queue_entry_t wait;
  114. struct exceptional_entry_key key;
  115. };
  /**
   * enum dax_wake_mode: waitqueue wakeup behaviour
   * @WAKE_ALL: wake all waiters in the waitqueue
   * @WAKE_NEXT: wake only the first waiter in the waitqueue
   */
  enum dax_wake_mode {
  	WAKE_ALL = 0,
  	WAKE_NEXT = 1,
  };
  125. static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
  126. void *entry, struct exceptional_entry_key *key)
  127. {
  128. unsigned long hash;
  129. unsigned long index = xas->xa_index;
  130. /*
  131. * If 'entry' is a PMD, align the 'index' that we use for the wait
  132. * queue to the start of that PMD. This ensures that all offsets in
  133. * the range covered by the PMD map to the same bit lock.
  134. */
  135. if (dax_is_pmd_entry(entry))
  136. index &= ~PG_PMD_COLOUR;
  137. key->xa = xas->xa;
  138. key->entry_start = index;
  139. hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
  140. return wait_table + hash;
  141. }
  142. static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
  143. unsigned int mode, int sync, void *keyp)
  144. {
  145. struct exceptional_entry_key *key = keyp;
  146. struct wait_exceptional_entry_queue *ewait =
  147. container_of(wait, struct wait_exceptional_entry_queue, wait);
  148. if (key->xa != ewait->key.xa ||
  149. key->entry_start != ewait->key.entry_start)
  150. return 0;
  151. return autoremove_wake_function(wait, mode, sync, NULL);
  152. }
  153. /*
  154. * @entry may no longer be the entry at the index in the mapping.
  155. * The important information it's conveying is whether the entry at
  156. * this index used to be a PMD entry.
  157. */
  158. static void dax_wake_entry(struct xa_state *xas, void *entry,
  159. enum dax_wake_mode mode)
  160. {
  161. struct exceptional_entry_key key;
  162. wait_queue_head_t *wq;
  163. wq = dax_entry_waitqueue(xas, entry, &key);
  164. /*
  165. * Checking for locked entry and prepare_to_wait_exclusive() happens
  166. * under the i_pages lock, ditto for entry handling in our callers.
  167. * So at this point all tasks that could have seen our entry locked
  168. * must be in the waitqueue and the following check will see them.
  169. */
  170. if (waitqueue_active(wq))
  171. __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
  172. }
  173. /*
  174. * Look up entry in page cache, wait for it to become unlocked if it
  175. * is a DAX entry and return it. The caller must subsequently call
  176. * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
  177. * if it did. The entry returned may have a larger order than @order.
  178. * If @order is larger than the order of the entry found in i_pages, this
  179. * function returns a dax_is_conflict entry.
  180. *
  181. * Must be called with the i_pages lock held.
  182. */
  183. static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order)
  184. {
  185. void *entry;
  186. struct wait_exceptional_entry_queue ewait;
  187. wait_queue_head_t *wq;
  188. init_wait(&ewait.wait);
  189. ewait.wait.func = wake_exceptional_entry_func;
  190. for (;;) {
  191. entry = xas_find_conflict(xas);
  192. if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
  193. return entry;
  194. if (dax_entry_order(entry) < order)
  195. return XA_RETRY_ENTRY;
  196. if (!dax_is_locked(entry))
  197. return entry;
  198. wq = dax_entry_waitqueue(xas, entry, &ewait.key);
  199. prepare_to_wait_exclusive(wq, &ewait.wait,
  200. TASK_UNINTERRUPTIBLE);
  201. xas_unlock_irq(xas);
  202. xas_reset(xas);
  203. schedule();
  204. finish_wait(wq, &ewait.wait);
  205. xas_lock_irq(xas);
  206. }
  207. }
  208. /*
  209. * Wait for the given entry to become unlocked. Caller must hold the i_pages
  210. * lock and call either put_unlocked_entry() if it did not lock the entry or
  211. * dax_unlock_entry() if it did. Returns an unlocked entry if still present.
  212. */
  213. static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
  214. {
  215. struct wait_exceptional_entry_queue ewait;
  216. wait_queue_head_t *wq;
  217. init_wait(&ewait.wait);
  218. ewait.wait.func = wake_exceptional_entry_func;
  219. while (unlikely(dax_is_locked(entry))) {
  220. wq = dax_entry_waitqueue(xas, entry, &ewait.key);
  221. prepare_to_wait_exclusive(wq, &ewait.wait,
  222. TASK_UNINTERRUPTIBLE);
  223. xas_reset(xas);
  224. xas_unlock_irq(xas);
  225. schedule();
  226. finish_wait(wq, &ewait.wait);
  227. xas_lock_irq(xas);
  228. entry = xas_load(xas);
  229. }
  230. if (xa_is_internal(entry))
  231. return NULL;
  232. return entry;
  233. }
  234. /*
  235. * The only thing keeping the address space around is the i_pages lock
  236. * (it's cycled in clear_inode() after removing the entries from i_pages)
  237. * After we call xas_unlock_irq(), we cannot touch xas->xa.
  238. */
  239. static void wait_entry_unlocked(struct xa_state *xas, void *entry)
  240. {
  241. struct wait_exceptional_entry_queue ewait;
  242. wait_queue_head_t *wq;
  243. init_wait(&ewait.wait);
  244. ewait.wait.func = wake_exceptional_entry_func;
  245. wq = dax_entry_waitqueue(xas, entry, &ewait.key);
  246. /*
  247. * Unlike get_next_unlocked_entry() there is no guarantee that this
  248. * path ever successfully retrieves an unlocked entry before an
  249. * inode dies. Perform a non-exclusive wait in case this path
  250. * never successfully performs its own wake up.
  251. */
  252. prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
  253. xas_unlock_irq(xas);
  254. schedule();
  255. finish_wait(wq, &ewait.wait);
  256. }
  257. static void put_unlocked_entry(struct xa_state *xas, void *entry,
  258. enum dax_wake_mode mode)
  259. {
  260. if (entry && !dax_is_conflict(entry))
  261. dax_wake_entry(xas, entry, mode);
  262. }
  263. /*
  264. * We used the xa_state to get the entry, but then we locked the entry and
  265. * dropped the xa_lock, so we know the xa_state is stale and must be reset
  266. * before use.
  267. */
  268. static void dax_unlock_entry(struct xa_state *xas, void *entry)
  269. {
  270. void *old;
  271. BUG_ON(dax_is_locked(entry));
  272. xas_reset(xas);
  273. xas_lock_irq(xas);
  274. old = xas_store(xas, entry);
  275. xas_unlock_irq(xas);
  276. BUG_ON(!dax_is_locked(old));
  277. dax_wake_entry(xas, entry, WAKE_NEXT);
  278. }
  279. /*
  280. * Return: The entry stored at this location before it was locked.
  281. */
  282. static void *dax_lock_entry(struct xa_state *xas, void *entry)
  283. {
  284. unsigned long v = xa_to_value(entry);
  285. return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
  286. }
  287. static unsigned long dax_entry_size(void *entry)
  288. {
  289. if (dax_is_zero_entry(entry))
  290. return 0;
  291. else if (dax_is_empty_entry(entry))
  292. return 0;
  293. else if (dax_is_pmd_entry(entry))
  294. return PMD_SIZE;
  295. else
  296. return PAGE_SIZE;
  297. }
  298. /*
  299. * A DAX folio is considered shared if it has no mapping set and ->share (which
  300. * shares the ->index field) is non-zero. Note this may return false even if the
  301. * page is shared between multiple files but has not yet actually been mapped
  302. * into multiple address spaces.
  303. */
  304. static inline bool dax_folio_is_shared(struct folio *folio)
  305. {
  306. return !folio->mapping && folio->share;
  307. }
  308. /*
  309. * When it is called by dax_insert_entry(), the shared flag will indicate
  310. * whether this entry is shared by multiple files. If the page has not
  311. * previously been associated with any mappings the ->mapping and ->index
  312. * fields will be set. If it has already been associated with a mapping
  313. * the mapping will be cleared and the share count set. It's then up to
  314. * reverse map users like memory_failure() to call back into the filesystem to
  315. * recover ->mapping and ->index information. For example by implementing
  316. * dax_holder_operations.
  317. */
  318. static void dax_folio_make_shared(struct folio *folio)
  319. {
  320. /*
  321. * folio is not currently shared so mark it as shared by clearing
  322. * folio->mapping.
  323. */
  324. folio->mapping = NULL;
  325. /*
  326. * folio has previously been mapped into one address space so set the
  327. * share count.
  328. */
  329. folio->share = 1;
  330. }
  331. static inline unsigned long dax_folio_put(struct folio *folio)
  332. {
  333. unsigned long ref;
  334. int order, i;
  335. if (!dax_folio_is_shared(folio))
  336. ref = 0;
  337. else
  338. ref = --folio->share;
  339. if (ref)
  340. return ref;
  341. folio->mapping = NULL;
  342. order = folio_order(folio);
  343. if (!order)
  344. return 0;
  345. folio_reset_order(folio);
  346. for (i = 0; i < (1UL << order); i++) {
  347. struct dev_pagemap *pgmap = page_pgmap(&folio->page);
  348. struct page *page = folio_page(folio, i);
  349. struct folio *new_folio = (struct folio *)page;
  350. ClearPageHead(page);
  351. clear_compound_head(page);
  352. new_folio->mapping = NULL;
  353. /*
  354. * Reset pgmap which was over-written by
  355. * prep_compound_page().
  356. */
  357. new_folio->pgmap = pgmap;
  358. new_folio->share = 0;
  359. WARN_ON_ONCE(folio_ref_count(new_folio));
  360. }
  361. return ref;
  362. }
  363. static void dax_folio_init(void *entry)
  364. {
  365. struct folio *folio = dax_to_folio(entry);
  366. int order = dax_entry_order(entry);
  367. /*
  368. * Folio should have been split back to order-0 pages in
  369. * dax_folio_put() when they were removed from their
  370. * final mapping.
  371. */
  372. WARN_ON_ONCE(folio_order(folio));
  373. if (order > 0) {
  374. prep_compound_page(&folio->page, order);
  375. if (order > 1)
  376. INIT_LIST_HEAD(&folio->_deferred_list);
  377. WARN_ON_ONCE(folio_ref_count(folio));
  378. }
  379. }
  380. static void dax_associate_entry(void *entry, struct address_space *mapping,
  381. struct vm_area_struct *vma,
  382. unsigned long address, bool shared)
  383. {
  384. unsigned long size = dax_entry_size(entry), index;
  385. struct folio *folio = dax_to_folio(entry);
  386. if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
  387. return;
  388. index = linear_page_index(vma, address & ~(size - 1));
  389. if (shared && (folio->mapping || dax_folio_is_shared(folio))) {
  390. if (folio->mapping)
  391. dax_folio_make_shared(folio);
  392. WARN_ON_ONCE(!folio->share);
  393. WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio));
  394. folio->share++;
  395. } else {
  396. WARN_ON_ONCE(folio->mapping);
  397. dax_folio_init(entry);
  398. folio = dax_to_folio(entry);
  399. folio->mapping = mapping;
  400. folio->index = index;
  401. }
  402. }
  403. static void dax_disassociate_entry(void *entry, struct address_space *mapping,
  404. bool trunc)
  405. {
  406. struct folio *folio = dax_to_folio(entry);
  407. if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
  408. return;
  409. dax_folio_put(folio);
  410. }
  411. static struct page *dax_busy_page(void *entry)
  412. {
  413. struct folio *folio = dax_to_folio(entry);
  414. if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
  415. return NULL;
  416. if (folio_ref_count(folio) - folio_mapcount(folio))
  417. return &folio->page;
  418. else
  419. return NULL;
  420. }
  421. /**
  422. * dax_lock_folio - Lock the DAX entry corresponding to a folio
  423. * @folio: The folio whose entry we want to lock
  424. *
  425. * Context: Process context.
  426. * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
  427. * not be locked.
  428. */
  429. dax_entry_t dax_lock_folio(struct folio *folio)
  430. {
  431. XA_STATE(xas, NULL, 0);
  432. void *entry;
  433. /* Ensure folio->mapping isn't freed while we look at it */
  434. rcu_read_lock();
  435. for (;;) {
  436. struct address_space *mapping = READ_ONCE(folio->mapping);
  437. entry = NULL;
  438. if (!mapping || !dax_mapping(mapping))
  439. break;
  440. /*
  441. * In the device-dax case there's no need to lock, a
  442. * struct dev_pagemap pin is sufficient to keep the
  443. * inode alive, and we assume we have dev_pagemap pin
  444. * otherwise we would not have a valid pfn_to_page()
  445. * translation.
  446. */
  447. entry = (void *)~0UL;
  448. if (S_ISCHR(mapping->host->i_mode))
  449. break;
  450. xas.xa = &mapping->i_pages;
  451. xas_lock_irq(&xas);
  452. if (mapping != folio->mapping) {
  453. xas_unlock_irq(&xas);
  454. continue;
  455. }
  456. xas_set(&xas, folio->index);
  457. entry = xas_load(&xas);
  458. if (dax_is_locked(entry)) {
  459. rcu_read_unlock();
  460. wait_entry_unlocked(&xas, entry);
  461. rcu_read_lock();
  462. continue;
  463. }
  464. dax_lock_entry(&xas, entry);
  465. xas_unlock_irq(&xas);
  466. break;
  467. }
  468. rcu_read_unlock();
  469. return (dax_entry_t)entry;
  470. }
  471. void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
  472. {
  473. struct address_space *mapping = folio->mapping;
  474. XA_STATE(xas, &mapping->i_pages, folio->index);
  475. if (S_ISCHR(mapping->host->i_mode))
  476. return;
  477. dax_unlock_entry(&xas, (void *)cookie);
  478. }
  479. /*
  480. * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
  481. * @mapping: the file's mapping whose entry we want to lock
  482. * @index: the offset within this file
  483. * @page: output the dax page corresponding to this dax entry
  484. *
  485. * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
  486. * could not be locked.
  487. */
  488. dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
  489. struct page **page)
  490. {
  491. XA_STATE(xas, NULL, 0);
  492. void *entry;
  493. rcu_read_lock();
  494. for (;;) {
  495. entry = NULL;
  496. if (!dax_mapping(mapping))
  497. break;
  498. xas.xa = &mapping->i_pages;
  499. xas_lock_irq(&xas);
  500. xas_set(&xas, index);
  501. entry = xas_load(&xas);
  502. if (dax_is_locked(entry)) {
  503. rcu_read_unlock();
  504. wait_entry_unlocked(&xas, entry);
  505. rcu_read_lock();
  506. continue;
  507. }
  508. if (!entry ||
  509. dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
  510. /*
  511. * Because we are looking for entry from file's mapping
  512. * and index, so the entry may not be inserted for now,
  513. * or even a zero/empty entry. We don't think this is
  514. * an error case. So, return a special value and do
  515. * not output @page.
  516. */
  517. entry = (void *)~0UL;
  518. } else {
  519. *page = pfn_to_page(dax_to_pfn(entry));
  520. dax_lock_entry(&xas, entry);
  521. }
  522. xas_unlock_irq(&xas);
  523. break;
  524. }
  525. rcu_read_unlock();
  526. return (dax_entry_t)entry;
  527. }
  528. void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
  529. dax_entry_t cookie)
  530. {
  531. XA_STATE(xas, &mapping->i_pages, index);
  532. if (cookie == ~0UL)
  533. return;
  534. dax_unlock_entry(&xas, (void *)cookie);
  535. }
  536. /*
  537. * Find page cache entry at given index. If it is a DAX entry, return it
  538. * with the entry locked. If the page cache doesn't contain an entry at
  539. * that index, add a locked empty entry.
  540. *
  541. * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
  542. * either return that locked entry or will return VM_FAULT_FALLBACK.
  543. * This will happen if there are any PTE entries within the PMD range
  544. * that we are requesting.
  545. *
  546. * We always favor PTE entries over PMD entries. There isn't a flow where we
  547. * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
  548. * insertion will fail if it finds any PTE entries already in the tree, and a
  549. * PTE insertion will cause an existing PMD entry to be unmapped and
  550. * downgraded to PTE entries. This happens for both PMD zero pages as
  551. * well as PMD empty entries.
  552. *
  553. * The exception to this downgrade path is for PMD entries that have
  554. * real storage backing them. We will leave these real PMD entries in
  555. * the tree, and PTE writes will simply dirty the entire PMD entry.
  556. *
  557. * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
  558. * persistent memory the benefit is doubtful. We can add that later if we can
  559. * show it helps.
  560. *
  561. * On error, this function does not return an ERR_PTR. Instead it returns
  562. * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
  563. * overlap with xarray value entries.
  564. */
  565. static void *grab_mapping_entry(struct xa_state *xas,
  566. struct address_space *mapping, unsigned int order)
  567. {
  568. unsigned long index = xas->xa_index;
  569. bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
  570. void *entry;
  571. retry:
  572. pmd_downgrade = false;
  573. xas_lock_irq(xas);
  574. entry = get_next_unlocked_entry(xas, order);
  575. if (entry) {
  576. if (dax_is_conflict(entry))
  577. goto fallback;
  578. if (!xa_is_value(entry)) {
  579. xas_set_err(xas, -EIO);
  580. goto out_unlock;
  581. }
  582. if (order == 0) {
  583. if (dax_is_pmd_entry(entry) &&
  584. (dax_is_zero_entry(entry) ||
  585. dax_is_empty_entry(entry))) {
  586. pmd_downgrade = true;
  587. }
  588. }
  589. }
  590. if (pmd_downgrade) {
  591. /*
  592. * Make sure 'entry' remains valid while we drop
  593. * the i_pages lock.
  594. */
  595. dax_lock_entry(xas, entry);
  596. /*
  597. * Besides huge zero pages the only other thing that gets
  598. * downgraded are empty entries which don't need to be
  599. * unmapped.
  600. */
  601. if (dax_is_zero_entry(entry)) {
  602. xas_unlock_irq(xas);
  603. unmap_mapping_pages(mapping,
  604. xas->xa_index & ~PG_PMD_COLOUR,
  605. PG_PMD_NR, false);
  606. xas_reset(xas);
  607. xas_lock_irq(xas);
  608. }
  609. dax_disassociate_entry(entry, mapping, false);
  610. xas_store(xas, NULL); /* undo the PMD join */
  611. dax_wake_entry(xas, entry, WAKE_ALL);
  612. mapping->nrpages -= PG_PMD_NR;
  613. entry = NULL;
  614. xas_set(xas, index);
  615. }
  616. if (entry) {
  617. dax_lock_entry(xas, entry);
  618. } else {
  619. unsigned long flags = DAX_EMPTY;
  620. if (order > 0)
  621. flags |= DAX_PMD;
  622. entry = dax_make_entry(0, flags);
  623. dax_lock_entry(xas, entry);
  624. if (xas_error(xas))
  625. goto out_unlock;
  626. mapping->nrpages += 1UL << order;
  627. }
  628. out_unlock:
  629. xas_unlock_irq(xas);
  630. if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
  631. goto retry;
  632. if (xas->xa_node == XA_ERROR(-ENOMEM))
  633. return xa_mk_internal(VM_FAULT_OOM);
  634. if (xas_error(xas))
  635. return xa_mk_internal(VM_FAULT_SIGBUS);
  636. return entry;
  637. fallback:
  638. xas_unlock_irq(xas);
  639. return xa_mk_internal(VM_FAULT_FALLBACK);
  640. }
  /**
   * dax_layout_busy_page_range - find first pinned page in @mapping
   * @mapping: address space to scan for a page with ref count > 1
   * @start: Starting offset. Page containing 'start' is included.
   * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
   *       pages from 'start' till the end of file are included.
   *
   * DAX requires ZONE_DEVICE mapped pages. These pages are never
   * 'onlined' to the page allocator so they are considered idle when
   * page->count == 1. A filesystem uses this interface to determine if
   * any page in the mapping is busy, i.e. for DMA, or other
   * get_user_pages() usages.
   *
   * It is expected that the filesystem is holding locks to block the
   * establishment of new mappings in this address_space. I.e. it expects
   * to be able to run unmap_mapping_range() and subsequently not race
   * mapping_mapped() becoming true.
   *
   * Return: the first page for which dax_busy_page() reports a busy page while
   * walking the range, or NULL if @mapping is not a DAX mapping or no such page
   * was found.
   */
  struct page *dax_layout_busy_page_range(struct address_space *mapping,
  					loff_t start, loff_t end)
  {
  	void *entry;
  	unsigned int scanned = 0;
  	struct page *page = NULL;
  	pgoff_t start_idx = start >> PAGE_SHIFT;
  	pgoff_t end_idx;
  	XA_STATE(xas, &mapping->i_pages, start_idx);

  	/* Nothing to scan for non-DAX mappings. */
  	if (!dax_mapping(mapping))
  		return NULL;

  	/* If end == LLONG_MAX, scan all pages from start to the end of file */
  	if (end == LLONG_MAX)
  		end_idx = ULONG_MAX;
  	else
  		end_idx = end >> PAGE_SHIFT;
  	/*
  	 * If we race get_user_pages_fast() here either we'll see the
  	 * elevated page count in the iteration and wait, or
  	 * get_user_pages_fast() will see that the page it took a reference
  	 * against is no longer mapped in the page tables and bail to the
  	 * get_user_pages() slow path. The slow path is protected by
  	 * pte_lock() and pmd_lock(). New references are not taken without
  	 * holding those locks, and unmap_mapping_pages() will not zero the
  	 * pte or pmd without holding the respective lock, so we are
  	 * guaranteed to either see new references or prevent new
  	 * references from being established.
  	 *
  	 * NOTE(review): for a whole-file scan (start_idx == 0 and
  	 * end_idx == ULONG_MAX) the page count below wraps to 0; presumably
  	 * unmap_mapping_pages() treats 0 as "up to the end of the file" -
  	 * confirm against its definition.
  	 */
  	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

  	xas_lock_irq(&xas);
  	xas_for_each(&xas, entry, end_idx) {
  		/* Only value entries are expected here; warn once and skip others. */
  		if (WARN_ON_ONCE(!xa_is_value(entry)))
  			continue;
  		/* May return NULL, in which case there is nothing to inspect. */
  		entry = wait_entry_unlocked_exclusive(&xas, entry);
  		if (entry)
  			page = dax_busy_page(entry);
  		put_unlocked_entry(&xas, entry, WAKE_NEXT);
  		/* Stop at the first busy page. */
  		if (page)
  			break;
  		/*
  		 * Every XA_CHECK_SCHED scanned entries, pause the walk and drop
  		 * the lock so that we can reschedule before continuing.
  		 */
  		if (++scanned % XA_CHECK_SCHED)
  			continue;

  		xas_pause(&xas);
  		xas_unlock_irq(&xas);
  		cond_resched();
  		xas_lock_irq(&xas);
  	}
  	xas_unlock_irq(&xas);
  	return page;
  }
  EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
  709. struct page *dax_layout_busy_page(struct address_space *mapping)
  710. {
  711. return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
  712. }
  713. EXPORT_SYMBOL_GPL(dax_layout_busy_page);
  714. static int __dax_invalidate_entry(struct address_space *mapping,
  715. pgoff_t index, bool trunc)
  716. {
  717. XA_STATE(xas, &mapping->i_pages, index);
  718. int ret = 0;
  719. void *entry;
  720. xas_lock_irq(&xas);
  721. entry = get_next_unlocked_entry(&xas, 0);
  722. if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
  723. goto out;
  724. if (!trunc &&
  725. (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
  726. xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
  727. goto out;
  728. dax_disassociate_entry(entry, mapping, trunc);
  729. xas_store(&xas, NULL);
  730. mapping->nrpages -= 1UL << dax_entry_order(entry);
  731. ret = 1;
  732. out:
  733. put_unlocked_entry(&xas, entry, WAKE_ALL);
  734. xas_unlock_irq(&xas);
  735. return ret;
  736. }
  737. static int __dax_clear_dirty_range(struct address_space *mapping,
  738. pgoff_t start, pgoff_t end)
  739. {
  740. XA_STATE(xas, &mapping->i_pages, start);
  741. unsigned int scanned = 0;
  742. void *entry;
  743. xas_lock_irq(&xas);
  744. xas_for_each(&xas, entry, end) {
  745. entry = wait_entry_unlocked_exclusive(&xas, entry);
  746. if (!entry)
  747. continue;
  748. xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
  749. xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
  750. put_unlocked_entry(&xas, entry, WAKE_NEXT);
  751. if (++scanned % XA_CHECK_SCHED)
  752. continue;
  753. xas_pause(&xas);
  754. xas_unlock_irq(&xas);
  755. cond_resched();
  756. xas_lock_irq(&xas);
  757. }
  758. xas_unlock_irq(&xas);
  759. return 0;
  760. }
  761. /*
  762. * Delete DAX entry at @index from @mapping. Wait for it
  763. * to be unlocked before deleting it.
  764. */
  765. int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  766. {
  767. int ret = __dax_invalidate_entry(mapping, index, true);
  768. /*
  769. * This gets called from truncate / punch_hole path. As such, the caller
  770. * must hold locks protecting against concurrent modifications of the
  771. * page cache (usually fs-private i_mmap_sem for writing). Since the
  772. * caller has seen a DAX entry for this index, we better find it
  773. * at that index as well...
  774. */
  775. WARN_ON_ONCE(!ret);
  776. return ret;
  777. }
  778. void dax_delete_mapping_range(struct address_space *mapping,
  779. loff_t start, loff_t end)
  780. {
  781. void *entry;
  782. pgoff_t start_idx = start >> PAGE_SHIFT;
  783. pgoff_t end_idx;
  784. XA_STATE(xas, &mapping->i_pages, start_idx);
  785. /* If end == LLONG_MAX, all pages from start to till end of file */
  786. if (end == LLONG_MAX)
  787. end_idx = ULONG_MAX;
  788. else
  789. end_idx = end >> PAGE_SHIFT;
  790. xas_lock_irq(&xas);
  791. xas_for_each(&xas, entry, end_idx) {
  792. if (!xa_is_value(entry))
  793. continue;
  794. entry = wait_entry_unlocked_exclusive(&xas, entry);
  795. if (!entry)
  796. continue;
  797. dax_disassociate_entry(entry, mapping, true);
  798. xas_store(&xas, NULL);
  799. mapping->nrpages -= 1UL << dax_entry_order(entry);
  800. put_unlocked_entry(&xas, entry, WAKE_ALL);
  801. }
  802. xas_unlock_irq(&xas);
  803. }
  804. EXPORT_SYMBOL_GPL(dax_delete_mapping_range);
  /*
   * Wait, in TASK_INTERRUPTIBLE state, until dax_page_is_idle(@page) is true.
   * @cb is invoked with @inode as the wait command passed to
   * ___wait_var_event(), so callers must pass a non-NULL @cb.
   *
   * Returns whatever ___wait_var_event() evaluates to.
   * NOTE(review): presumably 0 once the page is idle and a negative errno when
   * the sleep is interrupted by a signal - confirm against the macro.
   */
  static int wait_page_idle(struct page *page,
  			void (cb)(struct inode *),
  			struct inode *inode)
  {
  	return ___wait_var_event(page, dax_page_is_idle(page),
  				TASK_INTERRUPTIBLE, 0, 0, cb(inode));
  }
  /*
   * Wait, in TASK_UNINTERRUPTIBLE state, until dax_page_is_idle(@page) is true,
   * calling schedule() as the wait command. @inode is accepted but not used
   * by this function.
   */
  static void wait_page_idle_uninterruptible(struct page *page,
  					struct inode *inode)
  {
  	___wait_var_event(page, dax_page_is_idle(page),
  			TASK_UNINTERRUPTIBLE, 0, 0, schedule());
  }
  818. /*
  819. * Unmaps the inode and waits for any DMA to complete prior to deleting the
  820. * DAX mapping entries for the range.
  821. *
  822. * For NOWAIT behavior, pass @cb as NULL to early-exit on first found
  823. * busy page
  824. */
  825. int dax_break_layout(struct inode *inode, loff_t start, loff_t end,
  826. void (cb)(struct inode *))
  827. {
  828. struct page *page;
  829. int error = 0;
  830. if (!dax_mapping(inode->i_mapping))
  831. return 0;
  832. do {
  833. page = dax_layout_busy_page_range(inode->i_mapping, start, end);
  834. if (!page)
  835. break;
  836. if (!cb) {
  837. error = -ERESTARTSYS;
  838. break;
  839. }
  840. error = wait_page_idle(page, cb, inode);
  841. } while (error == 0);
  842. if (!page)
  843. dax_delete_mapping_range(inode->i_mapping, start, end);
  844. return error;
  845. }
  846. EXPORT_SYMBOL_GPL(dax_break_layout);
  847. void dax_break_layout_final(struct inode *inode)
  848. {
  849. struct page *page;
  850. if (!dax_mapping(inode->i_mapping))
  851. return;
  852. do {
  853. page = dax_layout_busy_page_range(inode->i_mapping, 0,
  854. LLONG_MAX);
  855. if (!page)
  856. break;
  857. wait_page_idle_uninterruptible(page, inode);
  858. } while (true);
  859. if (!page)
  860. dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
  861. }
  862. EXPORT_SYMBOL_GPL(dax_break_layout_final);
  863. /*
  864. * Invalidate DAX entry if it is clean.
  865. */
  866. int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
  867. pgoff_t index)
  868. {
  869. return __dax_invalidate_entry(mapping, index, false);
  870. }
  871. static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
  872. {
  873. return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
  874. }
  875. static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
  876. {
  877. pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
  878. void *vto, *kaddr;
  879. long rc;
  880. int id;
  881. id = dax_read_lock();
  882. rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
  883. &kaddr, NULL);
  884. if (rc < 0) {
  885. dax_read_unlock(id);
  886. return rc;
  887. }
  888. vto = kmap_atomic(vmf->cow_page);
  889. copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
  890. kunmap_atomic(vto);
  891. dax_read_unlock(id);
  892. return 0;
  893. }
  894. /*
  895. * MAP_SYNC on a dax mapping guarantees dirty metadata is
  896. * flushed on write-faults (non-cow), but not read-faults.
  897. */
  898. static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
  899. struct vm_area_struct *vma)
  900. {
  901. return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
  902. (iter->iomap.flags & IOMAP_F_DIRTY);
  903. }
  /*
   * By this point grab_mapping_entry() has ensured that we have a locked entry
   * of the appropriate size so we don't have to worry about downgrading PMDs to
   * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
   * already in the tree, we will skip the insertion and just dirty the PMD as
   * appropriate.
   *
   * @xas:   xarray state positioned at the faulting index
   * @vmf:   the fault being handled
   * @iter:  iomap iteration describing the extent and the kind of access
   * @entry: the locked entry currently stored at that index
   * @pfn:   page frame number to encode in the new entry
   * @flags: DAX entry flags to encode in the new entry
   *
   * Returns the entry that is in the tree on exit: the newly built one if it
   * was swapped in, otherwise @entry unchanged.
   */
  static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
  		const struct iomap_iter *iter, void *entry, unsigned long pfn,
  		unsigned long flags)
  {
  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  	void *new_entry = dax_make_entry(pfn, flags);
  	bool write = iter->flags & IOMAP_WRITE;
  	/* Synchronous faults are not marked dirty here. */
  	bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
  	bool shared = iter->iomap.flags & IOMAP_F_SHARED;

  	if (dirty)
  		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

  	/*
  	 * Tear down existing page table mappings when the extent is shared, or
  	 * when a zero-page entry is being replaced by a non-zero-page one.
  	 */
  	if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
  		unsigned long index = xas->xa_index;
  		/* we are replacing a zero page with block mapping */
  		if (dax_is_pmd_entry(entry))
  			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
  					PG_PMD_NR, false);
  		else /* pte entry */
  			unmap_mapping_pages(mapping, index, 1, false);
  	}

  	xas_reset(xas);
  	xas_lock_irq(xas);
  	if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
  		void *old;

  		dax_disassociate_entry(entry, mapping, false);
  		dax_associate_entry(new_entry, mapping, vmf->vma,
  					vmf->address, shared);
  		/*
  		 * Only swap our new entry into the page cache if the current
  		 * entry is a zero page or an empty entry, or if the extent is
  		 * shared.  If a normal PTE or PMD entry is already in the
  		 * cache, we leave it alone.  This means that if we are trying
  		 * to insert a PTE and the existing entry is a PMD, we will
  		 * just leave the PMD in the tree and dirty it if necessary.
  		 */
  		old = dax_lock_entry(xas, new_entry);
  		/* The slot must have held the locked form of @entry. */
  		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
  					DAX_LOCKED));
  		entry = new_entry;
  	} else {
  		xas_load(xas);	/* Walk the xa_state */
  	}

  	if (dirty)
  		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

  	/* Writes to shared extents are additionally tagged TOWRITE. */
  	if (write && shared)
  		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);

  	xas_unlock_irq(xas);
  	return entry;
  }
  /*
   * Write back a single DAX entry: write-protect every mapping of it and flush
   * its range on @dax_dev, then clear its TOWRITE and DIRTY marks.
   *
   * Locking: entered with the i_pages lock held through @xas. On the flush
   * path the lock is dropped while mappings are write-protected and caches
   * are flushed, and is re-taken before returning; on the put_unlocked path
   * it is not released by this function. Either way the function returns
   * with the lock held.
   *
   * Returns 0 on success or when there was nothing to do, -EIO if @entry is
   * not a value entry or is unexpectedly an empty/zero entry.
   */
  static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
  		struct address_space *mapping, void *entry)
  {
  	unsigned long pfn, index, count, end;
  	long ret = 0;
  	struct vm_area_struct *vma;

  	/*
  	 * A page got tagged dirty in DAX mapping? Something is seriously
  	 * wrong.
  	 */
  	if (WARN_ON(!xa_is_value(entry)))
  		return -EIO;

  	if (unlikely(dax_is_locked(entry))) {
  		void *old_entry = entry;

  		entry = get_next_unlocked_entry(xas, 0);

  		/* Entry got punched out / reallocated? */
  		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
  			goto put_unlocked;
  		/*
  		 * Entry got reallocated elsewhere? No need to writeback.
  		 * We have to compare pfns as we must not bail out due to
  		 * difference in lockbit or entry type.
  		 */
  		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
  			goto put_unlocked;
  		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
  					dax_is_zero_entry(entry))) {
  			ret = -EIO;
  			goto put_unlocked;
  		}

  		/* Another fsync thread may have already done this entry */
  		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
  			goto put_unlocked;
  	}

  	/* Lock the entry to serialize with page faults */
  	dax_lock_entry(xas, entry);

  	/*
  	 * We can clear the tag now but we have to be careful so that concurrent
  	 * dax_writeback_one() calls for the same index cannot finish before we
  	 * actually flush the caches. This is achieved as the calls will look
  	 * at the entry only under the i_pages lock and once they do that
  	 * they will see the entry locked and wait for it to unlock.
  	 */
  	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
  	xas_unlock_irq(xas);

  	/*
  	 * If dax_writeback_mapping_range() was given a wbc->range_start
  	 * in the middle of a PMD, the 'index' we use needs to be
  	 * aligned to the start of the PMD.
  	 * This allows us to flush for PMD_SIZE and not have to worry about
  	 * partial PMD writebacks.
  	 */
  	pfn = dax_to_pfn(entry);
  	count = 1UL << dax_entry_order(entry);
  	index = xas->xa_index & ~(count - 1);
  	end = index + count - 1;

  	/* Walk all mappings of a given index of a file and writeprotect them */
  	i_mmap_lock_read(mapping);
  	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
  		pfn_mkclean_range(pfn, count, index, vma);
  		cond_resched();
  	}
  	i_mmap_unlock_read(mapping);

  	/* Flush the whole entry: count pages starting at pfn. */
  	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
  	/*
  	 * After we have flushed the cache, we can clear the dirty tag. There
  	 * cannot be new dirty data in the pfn after the flush has completed as
  	 * the pfn mappings are writeprotected and fault waits for mapping
  	 * entry lock.
  	 */
  	xas_reset(xas);
  	xas_lock_irq(xas);
  	/* Store the entry value we hold; dax_lock_entry() above had put a locked one in the tree. */
  	xas_store(xas, entry);
  	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
  	dax_wake_entry(xas, entry, WAKE_NEXT);

  	trace_dax_writeback_one(mapping->host, index, count);
  	return ret;

   put_unlocked:
  	/* Nothing was flushed; hand the entry back. */
  	put_unlocked_entry(xas, entry, WAKE_NEXT);
  	return ret;
  }
  /*
   * Flush the mapping to the persistent domain within the byte range of [start,
   * end]. This is required by data integrity operations to ensure file data is
   * on persistent storage prior to completion of the operation.
   *
   * @mapping: address space whose TOWRITE-tagged entries are written back
   * @dax_dev: device passed through to dax_writeback_one() for flushing
   * @wbc:     supplies the byte range (range_start/range_end) and sync mode
   *
   * Returns 0 when there is nothing to do (empty mapping, or sync mode other
   * than WB_SYNC_ALL), -EIO if the inode's block size is not PAGE_SIZE, or
   * otherwise the result of the last dax_writeback_one() call. A negative
   * result is also recorded on the mapping and stops the walk.
   */
  int dax_writeback_mapping_range(struct address_space *mapping,
  		struct dax_device *dax_dev, struct writeback_control *wbc)
  {
  	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
  	struct inode *inode = mapping->host;
  	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
  	void *entry;
  	int ret = 0;
  	unsigned int scanned = 0;

  	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
  		return -EIO;

  	/* Only data-integrity writeback (WB_SYNC_ALL) is acted upon. */
  	if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
  		return 0;

  	trace_dax_writeback_range(inode, xas.xa_index, end_index);

  	tag_pages_for_writeback(mapping, xas.xa_index, end_index);

  	xas_lock_irq(&xas);
  	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
  		/* Returns with the i_pages lock held on every path. */
  		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
  		if (ret < 0) {
  			mapping_set_error(mapping, ret);
  			break;
  		}
  		/*
  		 * Every XA_CHECK_SCHED entries, pause the walk and drop the
  		 * lock so that we can reschedule.
  		 */
  		if (++scanned % XA_CHECK_SCHED)
  			continue;

  		xas_pause(&xas);
  		xas_unlock_irq(&xas);
  		cond_resched();
  		xas_lock_irq(&xas);
  	}
  	xas_unlock_irq(&xas);
  	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
  	return ret;
  }
  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
  1080. static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
  1081. size_t size, void **kaddr, unsigned long *pfnp)
  1082. {
  1083. pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
  1084. int id, rc = 0;
  1085. long length;
  1086. id = dax_read_lock();
  1087. length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
  1088. DAX_ACCESS, kaddr, pfnp);
  1089. if (length < 0) {
  1090. rc = length;
  1091. goto out;
  1092. }
  1093. if (!pfnp)
  1094. goto out_check_addr;
  1095. rc = -EINVAL;
  1096. if (PFN_PHYS(length) < size)
  1097. goto out;
  1098. if (*pfnp & (PHYS_PFN(size)-1))
  1099. goto out;
  1100. rc = 0;
  1101. out_check_addr:
  1102. if (!kaddr)
  1103. goto out;
  1104. if (!*kaddr)
  1105. rc = -EFAULT;
  1106. out:
  1107. dax_read_unlock(id);
  1108. return rc;
  1109. }
  /**
   * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
   * by copying the data before and after the range to be written.
   * @pos:	address to do copy from.
   * @length:	size of copy operation.
   * @align_size:	aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
   * @srcmap:	iomap srcmap
   * @daddr:	destination address to copy to.
   *
   * This can be called from two places. Either during DAX write fault (page
   * aligned), to copy the length size data to daddr. Or, while doing normal DAX
   * write operation, dax_iomap_iter() might call this to do the copy of either
   * start or end unaligned address. In the latter case the rest of the copy of
   * aligned ranges is taken care by dax_iomap_iter() itself.
   * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
   * area to make sure no old data remains.
   *
   * Return: 0 on success, -EIO if a machine-check-safe copy failed, or the
   * dax_mem2blk_err() translation of a failure to access @srcmap.
   */
  static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
  		const struct iomap *srcmap, void *daddr)
  {
  	/* Offset of @pos within its @align_size-aligned block. */
  	loff_t head_off = pos & (align_size - 1);
  	/* Size of the aligned region that covers [pos, pos + length). */
  	size_t size = ALIGN(head_off + length, align_size);
  	loff_t end = pos + length;
  	loff_t pg_end = round_up(end, align_size);
  	/* copy_all is usually in page fault case */
  	bool copy_all = head_off == 0 && end == pg_end;
  	/*
  	 * Zero the edges instead of copying when srcmap has IOMAP_F_SHARED
  	 * set or is of type IOMAP_UNWRITTEN.
  	 * NOTE(review): the function description speaks of HOLE srcmaps, but
  	 * the test below checks the IOMAP_F_SHARED flag rather than the
  	 * IOMAP_HOLE type - confirm this is intended.
  	 */
  	bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
  			 srcmap->type == IOMAP_UNWRITTEN;
  	void *saddr = NULL;
  	int ret = 0;

  	/* A source address is only needed when we actually copy. */
  	if (!zero_edge) {
  		ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
  		if (ret)
  			return dax_mem2blk_err(ret);
  	}

  	if (copy_all) {
  		/* Fully aligned request, so size and length are equal here. */
  		if (zero_edge)
  			memset(daddr, 0, size);
  		else
  			ret = copy_mc_to_kernel(daddr, saddr, length);
  		goto out;
  	}

  	/* Copy the head part of the range */
  	if (head_off) {
  		if (zero_edge)
  			memset(daddr, 0, head_off);
  		else {
  			ret = copy_mc_to_kernel(daddr, saddr, head_off);
  			/* No flush needed: we only flush in the zero_edge case. */
  			if (ret)
  				return -EIO;
  		}
  	}

  	/* Copy the tail part of the range */
  	if (end < pg_end) {
  		loff_t tail_off = head_off + length;
  		loff_t tail_len = pg_end - end;

  		if (zero_edge)
  			memset(daddr + tail_off, 0, tail_len);
  		else {
  			ret = copy_mc_to_kernel(daddr + tail_off,
  						saddr + tail_off, tail_len);
  			if (ret)
  				return -EIO;
  		}
  	}
  out:
  	/* Only the zeroing path flushes the destination range here. */
  	if (zero_edge)
  		dax_flush(srcmap->dax_dev, daddr, size);
  	/* A nonzero copy_mc_to_kernel() result is reported as -EIO. */
  	return ret ? -EIO : 0;
  }
  1181. /*
  1182. * The user has performed a load from a hole in the file. Allocating a new
  1183. * page in the file would cause excessive storage usage for workloads with
  1184. * sparse files. Instead we insert a read-only mapping of the 4k zero page.
  1185. * If this page is ever written to we will re-fault and change the mapping to
  1186. * point to real DAX storage instead.
  1187. */
  1188. static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
  1189. const struct iomap_iter *iter, void **entry)
  1190. {
  1191. struct inode *inode = iter->inode;
  1192. unsigned long vaddr = vmf->address;
  1193. unsigned long pfn = my_zero_pfn(vaddr);
  1194. vm_fault_t ret;
  1195. *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
  1196. ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false);
  1197. trace_dax_load_hole(inode, vmf, ret);
  1198. return ret;
  1199. }
  1200. #ifdef CONFIG_FS_DAX_PMD
  1201. static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
  1202. const struct iomap_iter *iter, void **entry)
  1203. {
  1204. struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  1205. struct inode *inode = mapping->host;
  1206. struct folio *zero_folio;
  1207. vm_fault_t ret;
  1208. zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
  1209. if (unlikely(!zero_folio)) {
  1210. trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
  1211. return VM_FAULT_FALLBACK;
  1212. }
  1213. *entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio),
  1214. DAX_PMD | DAX_ZERO_PAGE);
  1215. ret = vmf_insert_folio_pmd(vmf, zero_folio, false);
  1216. if (ret == VM_FAULT_NOPAGE)
  1217. trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
  1218. return ret;
  1219. }
  1220. #else
  1221. static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
  1222. const struct iomap_iter *iter, void **entry)
  1223. {
  1224. return VM_FAULT_FALLBACK;
  1225. }
  1226. #endif /* CONFIG_FS_DAX_PMD */
  1227. static int dax_unshare_iter(struct iomap_iter *iter)
  1228. {
  1229. struct iomap *iomap = &iter->iomap;
  1230. const struct iomap *srcmap = iomap_iter_srcmap(iter);
  1231. loff_t copy_pos = iter->pos;
  1232. u64 copy_len = iomap_length(iter);
  1233. u32 mod;
  1234. int id = 0;
  1235. s64 ret;
  1236. void *daddr = NULL, *saddr = NULL;
  1237. if (!iomap_want_unshare_iter(iter))
  1238. return iomap_iter_advance_full(iter);
  1239. /*
  1240. * Extend the file range to be aligned to fsblock/pagesize, because
  1241. * we need to copy entire blocks, not just the byte range specified.
  1242. * Invalidate the mapping because we're about to CoW.
  1243. */
  1244. mod = offset_in_page(copy_pos);
  1245. if (mod) {
  1246. copy_len += mod;
  1247. copy_pos -= mod;
  1248. }
  1249. mod = offset_in_page(copy_pos + copy_len);
  1250. if (mod)
  1251. copy_len += PAGE_SIZE - mod;
  1252. invalidate_inode_pages2_range(iter->inode->i_mapping,
  1253. copy_pos >> PAGE_SHIFT,
  1254. (copy_pos + copy_len - 1) >> PAGE_SHIFT);
  1255. id = dax_read_lock();
  1256. ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL);
  1257. if (ret < 0)
  1258. goto out_unlock;
  1259. ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL);
  1260. if (ret < 0)
  1261. goto out_unlock;
  1262. if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
  1263. ret = -EIO;
  1264. out_unlock:
  1265. dax_read_unlock(id);
  1266. if (ret < 0)
  1267. return dax_mem2blk_err(ret);
  1268. return iomap_iter_advance_full(iter);
  1269. }
  1270. int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
  1271. const struct iomap_ops *ops)
  1272. {
  1273. struct iomap_iter iter = {
  1274. .inode = inode,
  1275. .pos = pos,
  1276. .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
  1277. };
  1278. loff_t size = i_size_read(inode);
  1279. int ret;
  1280. if (pos < 0 || pos >= size)
  1281. return 0;
  1282. iter.len = min(len, size - pos);
  1283. while ((ret = iomap_iter(&iter, ops)) > 0)
  1284. iter.status = dax_unshare_iter(&iter);
  1285. return ret;
  1286. }
  1287. EXPORT_SYMBOL_GPL(dax_file_unshare);
  1288. static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
  1289. {
  1290. const struct iomap *iomap = &iter->iomap;
  1291. const struct iomap *srcmap = iomap_iter_srcmap(iter);
  1292. unsigned offset = offset_in_page(pos);
  1293. pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
  1294. void *kaddr;
  1295. long ret;
  1296. ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
  1297. NULL);
  1298. if (ret < 0)
  1299. return dax_mem2blk_err(ret);
  1300. memset(kaddr + offset, 0, size);
  1301. if (iomap->flags & IOMAP_F_SHARED)
  1302. ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
  1303. kaddr);
  1304. else
  1305. dax_flush(iomap->dax_dev, kaddr + offset, size);
  1306. return ret;
  1307. }
  /*
   * Zero the range described by one iomap iteration, one page (or partial
   * page) at a time, advancing @iter as it goes.
   *
   * @iter:     iteration to process
   * @did_zero: if non-NULL, set to true once the loop has completed
   *
   * Returns 0 on success or a negative errno from the zeroing helpers or from
   * iomap_iter_advance(). Holes and unwritten extents are skipped without
   * touching *@did_zero.
   */
  static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
  {
  	const struct iomap *iomap = &iter->iomap;
  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
  	u64 length = iomap_length(iter);
  	int ret;

  	/* already zeroed?  we're done. */
  	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
  		return iomap_iter_advance(iter, length);

  	/*
  	 * invalidate the pages whose sharing state is to be changed
  	 * because of CoW.
  	 */
  	if (iomap->flags & IOMAP_F_SHARED)
  		invalidate_inode_pages2_range(iter->inode->i_mapping,
  				iter->pos >> PAGE_SHIFT,
  				(iter->pos + length - 1) >> PAGE_SHIFT);

  	do {
  		loff_t pos = iter->pos;
  		unsigned offset = offset_in_page(pos);
  		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
  		int id;

  		/* Never cross a page boundary in a single step. */
  		length = min_t(u64, PAGE_SIZE - offset, length);

  		id = dax_read_lock();
  		/* Whole aligned pages go through dax_zero_page_range(). */
  		if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE)
  			ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
  		else
  			ret = dax_memzero(iter, pos, length);
  		dax_read_unlock(id);

  		if (ret < 0)
  			return ret;

  		ret = iomap_iter_advance(iter, length);
  		if (ret)
  			return ret;
  	} while ((length = iomap_length(iter)) > 0);

  	if (did_zero)
  		*did_zero = true;
  	return ret;
  }
  1347. int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
  1348. const struct iomap_ops *ops)
  1349. {
  1350. struct iomap_iter iter = {
  1351. .inode = inode,
  1352. .pos = pos,
  1353. .len = len,
  1354. .flags = IOMAP_DAX | IOMAP_ZERO,
  1355. };
  1356. int ret;
  1357. while ((ret = iomap_iter(&iter, ops)) > 0)
  1358. iter.status = dax_zero_iter(&iter, did_zero);
  1359. return ret;
  1360. }
  1361. EXPORT_SYMBOL_GPL(dax_zero_range);
  1362. int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
  1363. const struct iomap_ops *ops)
  1364. {
  1365. unsigned int blocksize = i_blocksize(inode);
  1366. unsigned int off = pos & (blocksize - 1);
  1367. /* Block boundary? Nothing to do */
  1368. if (!off)
  1369. return 0;
  1370. return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
  1371. }
  1372. EXPORT_SYMBOL_GPL(dax_truncate_page);
  /*
   * Perform the data transfer for one iomap iteration of a DAX read or write.
   *
   * @iomi: iomap iteration describing the extent; advanced by the number of
   *        bytes transferred
   * @iter: user buffer to copy from (write) or to (read)
   *
   * Reads are clamped to the inode size, and reads from holes or unwritten
   * extents just zero-fill @iter. Returns 0 or a negative errno: -EIO for an
   * extent type that is not allowed here, -EINTR on a pending fatal signal,
   * -EFAULT when a transfer step made no progress, or an error from the
   * helpers that are called.
   */
  static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
  {
  	const struct iomap *iomap = &iomi->iomap;
  	const struct iomap *srcmap = iomap_iter_srcmap(iomi);
  	loff_t length = iomap_length(iomi);
  	loff_t pos = iomi->pos;
  	struct dax_device *dax_dev = iomap->dax_dev;
  	loff_t end = pos + length, done = 0;
  	bool write = iov_iter_rw(iter) == WRITE;
  	/* Copy-on-write applies to writes into IOMAP_F_SHARED extents. */
  	bool cow = write && iomap->flags & IOMAP_F_SHARED;
  	ssize_t ret = 0;
  	size_t xfer;
  	int id;

  	if (!write) {
  		/* Reads never go past the end of the file. */
  		end = min(end, i_size_read(iomi->inode));
  		if (pos >= end)
  			return 0;

  		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
  			done = iov_iter_zero(min(length, end - pos), iter);
  			return iomap_iter_advance(iomi, done);
  		}
  	}

  	/*
  	 * In DAX mode, enforce either pure overwrites of written extents, or
  	 * writes to unwritten extents as part of a copy-on-write operation.
  	 */
  	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
  			!(iomap->flags & IOMAP_F_SHARED)))
  		return -EIO;

  	/*
  	 * Write can allocate block for an area which has a hole page mapped
  	 * into page tables. We have to tear down these mappings so that data
  	 * written by write(2) is visible in mmap.
  	 */
  	if (iomap->flags & IOMAP_F_NEW || cow) {
  		/*
  		 * Filesystem allows CoW on non-shared extents. The src extents
  		 * may have been mmapped with dirty mark before. To be able to
  		 * invalidate its dax entries, we need to clear the dirty mark
  		 * in advance.
  		 */
  		if (cow)
  			__dax_clear_dirty_range(iomi->inode->i_mapping,
  						pos >> PAGE_SHIFT,
  						(end - 1) >> PAGE_SHIFT);
  		invalidate_inode_pages2_range(iomi->inode->i_mapping,
  					      pos >> PAGE_SHIFT,
  					      (end - 1) >> PAGE_SHIFT);
  	}

  	id = dax_read_lock();
  	/* pos is re-read from the iterator, which each step advances. */
  	while ((pos = iomi->pos) < end) {
  		unsigned offset = pos & (PAGE_SIZE - 1);
  		/* Page-aligned size covering the bytes still to transfer. */
  		const size_t size = ALIGN(length + offset, PAGE_SIZE);
  		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
  		ssize_t map_len;
  		bool recovery = false;
  		void *kaddr;

  		if (fatal_signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}

  		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
  				DAX_ACCESS, &kaddr, NULL);
  		/* On -EHWPOISON, writes retry in DAX_RECOVERY_WRITE mode. */
  		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
  			map_len = dax_direct_access(dax_dev, pgoff,
  					PHYS_PFN(size), DAX_RECOVERY_WRITE,
  					&kaddr, NULL);
  			if (map_len > 0)
  				recovery = true;
  		}
  		if (map_len < 0) {
  			ret = dax_mem2blk_err(map_len);
  			break;
  		}

  		/* For CoW, fill in the unaligned head/tail around the write. */
  		if (cow) {
  			ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
  						    srcmap, kaddr);
  			if (ret)
  				break;
  		}

  		/*
  		 * Convert the mapped page count to bytes, skip to the offset
  		 * within the first page and clamp to the end of the request.
  		 */
  		map_len = PFN_PHYS(map_len);
  		kaddr += offset;
  		map_len -= offset;
  		if (map_len > end - pos)
  			map_len = end - pos;

  		if (recovery)
  			xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
  					map_len, iter);
  		else if (write)
  			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
  					map_len, iter);
  		else
  			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
  					map_len, iter);

  		ret = iomap_iter_advance(iomi, xfer);
  		/* Nothing transferred and no other error: report -EFAULT. */
  		if (!ret && xfer == 0)
  			ret = -EFAULT;
  		/* A short transfer ends the loop. */
  		if (xfer < map_len)
  			break;
  		length = iomap_length(iomi);
  	}
  	dax_read_unlock(id);

  	return ret;
  }
  1477. /**
  1478. * dax_iomap_rw - Perform I/O to a DAX file
  1479. * @iocb: The control block for this I/O
  1480. * @iter: The addresses to do I/O from or to
  1481. * @ops: iomap ops passed from the file system
  1482. *
  1483. * This function performs read and write operations to directly mapped
  1484. * persistent memory. The callers needs to take care of read/write exclusion
  1485. * and evicting any page cache pages in the region under I/O.
  1486. */
  1487. ssize_t
  1488. dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
  1489. const struct iomap_ops *ops)
  1490. {
  1491. struct iomap_iter iomi = {
  1492. .inode = iocb->ki_filp->f_mapping->host,
  1493. .pos = iocb->ki_pos,
  1494. .len = iov_iter_count(iter),
  1495. .flags = IOMAP_DAX,
  1496. };
  1497. loff_t done = 0;
  1498. int ret;
  1499. if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC))
  1500. return -EIO;
  1501. if (!iomi.len)
  1502. return 0;
  1503. if (iov_iter_rw(iter) == WRITE) {
  1504. lockdep_assert_held_write(&iomi.inode->i_rwsem);
  1505. iomi.flags |= IOMAP_WRITE;
  1506. } else if (!sb_rdonly(iomi.inode->i_sb)) {
  1507. lockdep_assert_held(&iomi.inode->i_rwsem);
  1508. }
  1509. if (iocb->ki_flags & IOCB_NOWAIT)
  1510. iomi.flags |= IOMAP_NOWAIT;
  1511. while ((ret = iomap_iter(&iomi, ops)) > 0)
  1512. iomi.status = dax_iomap_iter(&iomi, iter);
  1513. done = iomi.pos - iocb->ki_pos;
  1514. iocb->ki_pos = iomi.pos;
  1515. return done ? done : ret;
  1516. }
  1517. EXPORT_SYMBOL_GPL(dax_iomap_rw);
  1518. static vm_fault_t dax_fault_return(int error)
  1519. {
  1520. if (error == 0)
  1521. return VM_FAULT_NOPAGE;
  1522. return vmf_error(error);
  1523. }
  1524. /*
  1525. * When handling a synchronous page fault and the inode need a fsync, we can
  1526. * insert the PTE/PMD into page tables only after that fsync happened. Skip
  1527. * insertion for now and return the pfn so that caller can insert it after the
  1528. * fsync is done.
  1529. */
  1530. static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp,
  1531. unsigned long pfn)
  1532. {
  1533. if (WARN_ON_ONCE(!pfnp))
  1534. return VM_FAULT_SIGBUS;
  1535. *pfnp = pfn;
  1536. return VM_FAULT_NEEDDSYNC;
  1537. }
  1538. static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
  1539. const struct iomap_iter *iter)
  1540. {
  1541. vm_fault_t ret;
  1542. int error = 0;
  1543. switch (iter->iomap.type) {
  1544. case IOMAP_HOLE:
  1545. case IOMAP_UNWRITTEN:
  1546. clear_user_highpage(vmf->cow_page, vmf->address);
  1547. break;
  1548. case IOMAP_MAPPED:
  1549. error = copy_cow_page_dax(vmf, iter);
  1550. break;
  1551. default:
  1552. WARN_ON_ONCE(1);
  1553. error = -EIO;
  1554. break;
  1555. }
  1556. if (error)
  1557. return dax_fault_return(error);
  1558. __SetPageUptodate(vmf->cow_page);
  1559. ret = finish_fault(vmf);
  1560. if (!ret)
  1561. return VM_FAULT_DONE_COW;
  1562. return ret;
  1563. }
  1564. /**
  1565. * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
  1566. * @vmf: vm fault instance
  1567. * @iter: iomap iter
  1568. * @pfnp: pfn to be returned
  1569. * @xas: the dax mapping tree of a file
  1570. * @entry: an unlocked dax entry to be inserted
  1571. * @pmd: distinguish whether it is a pmd fault
  1572. */
  1573. static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
  1574. const struct iomap_iter *iter, unsigned long *pfnp,
  1575. struct xa_state *xas, void **entry, bool pmd)
  1576. {
  1577. const struct iomap *iomap = &iter->iomap;
  1578. const struct iomap *srcmap = iomap_iter_srcmap(iter);
  1579. size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
  1580. loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
  1581. bool write = iter->flags & IOMAP_WRITE;
  1582. unsigned long entry_flags = pmd ? DAX_PMD : 0;
  1583. struct folio *folio;
  1584. int ret, err = 0;
  1585. unsigned long pfn;
  1586. void *kaddr;
  1587. if (!pmd && vmf->cow_page)
  1588. return dax_fault_cow_page(vmf, iter);
  1589. /* if we are reading UNWRITTEN and HOLE, return a hole. */
  1590. if (!write &&
  1591. (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
  1592. if (!pmd)
  1593. return dax_load_hole(xas, vmf, iter, entry);
  1594. return dax_pmd_load_hole(xas, vmf, iter, entry);
  1595. }
  1596. if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
  1597. WARN_ON_ONCE(1);
  1598. return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
  1599. }
  1600. err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
  1601. if (err)
  1602. return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
  1603. *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
  1604. if (write && iomap->flags & IOMAP_F_SHARED) {
  1605. err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
  1606. if (err)
  1607. return dax_fault_return(err);
  1608. }
  1609. folio = dax_to_folio(*entry);
  1610. if (dax_fault_is_synchronous(iter, vmf->vma))
  1611. return dax_fault_synchronous_pfnp(pfnp, pfn);
  1612. folio_ref_inc(folio);
  1613. if (pmd)
  1614. ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write);
  1615. else
  1616. ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write);
  1617. folio_put(folio);
  1618. return ret;
  1619. }
  1620. static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp,
  1621. int *iomap_errp, const struct iomap_ops *ops)
  1622. {
  1623. struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  1624. XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
  1625. struct iomap_iter iter = {
  1626. .inode = mapping->host,
  1627. .pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
  1628. .len = PAGE_SIZE,
  1629. .flags = IOMAP_DAX | IOMAP_FAULT,
  1630. };
  1631. vm_fault_t ret = 0;
  1632. void *entry;
  1633. int error;
  1634. trace_dax_pte_fault(iter.inode, vmf, ret);
  1635. /*
  1636. * Check whether offset isn't beyond end of file now. Caller is supposed
  1637. * to hold locks serializing us with truncate / punch hole so this is
  1638. * a reliable test.
  1639. */
  1640. if (iter.pos >= i_size_read(iter.inode)) {
  1641. ret = VM_FAULT_SIGBUS;
  1642. goto out;
  1643. }
  1644. if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
  1645. iter.flags |= IOMAP_WRITE;
  1646. entry = grab_mapping_entry(&xas, mapping, 0);
  1647. if (xa_is_internal(entry)) {
  1648. ret = xa_to_internal(entry);
  1649. goto out;
  1650. }
  1651. /*
  1652. * It is possible, particularly with mixed reads & writes to private
  1653. * mappings, that we have raced with a PMD fault that overlaps with
  1654. * the PTE we need to set up. If so just return and the fault will be
  1655. * retried.
  1656. */
  1657. if (pmd_trans_huge(*vmf->pmd)) {
  1658. ret = VM_FAULT_NOPAGE;
  1659. goto unlock_entry;
  1660. }
  1661. while ((error = iomap_iter(&iter, ops)) > 0) {
  1662. if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
  1663. iter.status = -EIO; /* fs corruption? */
  1664. continue;
  1665. }
  1666. ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
  1667. if (ret != VM_FAULT_SIGBUS &&
  1668. (iter.iomap.flags & IOMAP_F_NEW)) {
  1669. count_vm_event(PGMAJFAULT);
  1670. count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
  1671. ret |= VM_FAULT_MAJOR;
  1672. }
  1673. if (!(ret & VM_FAULT_ERROR))
  1674. iter.status = iomap_iter_advance(&iter, PAGE_SIZE);
  1675. }
  1676. if (iomap_errp)
  1677. *iomap_errp = error;
  1678. if (!ret && error)
  1679. ret = dax_fault_return(error);
  1680. unlock_entry:
  1681. dax_unlock_entry(&xas, entry);
  1682. out:
  1683. trace_dax_pte_fault_done(iter.inode, vmf, ret);
  1684. return ret;
  1685. }
  1686. #ifdef CONFIG_FS_DAX_PMD
  1687. static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
  1688. pgoff_t max_pgoff)
  1689. {
  1690. unsigned long pmd_addr = vmf->address & PMD_MASK;
  1691. bool write = vmf->flags & FAULT_FLAG_WRITE;
  1692. /*
  1693. * Make sure that the faulting address's PMD offset (color) matches
  1694. * the PMD offset from the start of the file. This is necessary so
  1695. * that a PMD range in the page table overlaps exactly with a PMD
  1696. * range in the page cache.
  1697. */
  1698. if ((vmf->pgoff & PG_PMD_COLOUR) !=
  1699. ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
  1700. return true;
  1701. /* Fall back to PTEs if we're going to COW */
  1702. if (write && !(vmf->vma->vm_flags & VM_SHARED))
  1703. return true;
  1704. /* If the PMD would extend outside the VMA */
  1705. if (pmd_addr < vmf->vma->vm_start)
  1706. return true;
  1707. if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
  1708. return true;
  1709. /* If the PMD would extend beyond the file size */
  1710. if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
  1711. return true;
  1712. return false;
  1713. }
  1714. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
  1715. const struct iomap_ops *ops)
  1716. {
  1717. struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  1718. XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
  1719. struct iomap_iter iter = {
  1720. .inode = mapping->host,
  1721. .len = PMD_SIZE,
  1722. .flags = IOMAP_DAX | IOMAP_FAULT,
  1723. };
  1724. vm_fault_t ret = VM_FAULT_FALLBACK;
  1725. pgoff_t max_pgoff;
  1726. void *entry;
  1727. if (vmf->flags & FAULT_FLAG_WRITE)
  1728. iter.flags |= IOMAP_WRITE;
  1729. /*
  1730. * Check whether offset isn't beyond end of file now. Caller is
  1731. * supposed to hold locks serializing us with truncate / punch hole so
  1732. * this is a reliable test.
  1733. */
  1734. max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);
  1735. trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);
  1736. if (xas.xa_index >= max_pgoff) {
  1737. ret = VM_FAULT_SIGBUS;
  1738. goto out;
  1739. }
  1740. if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
  1741. goto fallback;
  1742. /*
  1743. * grab_mapping_entry() will make sure we get an empty PMD entry,
  1744. * a zero PMD entry or a DAX PMD. If it can't (because a PTE
  1745. * entry is already in the array, for instance), it will return
  1746. * VM_FAULT_FALLBACK.
  1747. */
  1748. entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
  1749. if (xa_is_internal(entry)) {
  1750. ret = xa_to_internal(entry);
  1751. goto fallback;
  1752. }
  1753. /*
  1754. * It is possible, particularly with mixed reads & writes to private
  1755. * mappings, that we have raced with a PTE fault that overlaps with
  1756. * the PMD we need to set up. If so just return and the fault will be
  1757. * retried.
  1758. */
  1759. if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) {
  1760. ret = 0;
  1761. goto unlock_entry;
  1762. }
  1763. iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
  1764. while (iomap_iter(&iter, ops) > 0) {
  1765. if (iomap_length(&iter) < PMD_SIZE)
  1766. continue; /* actually breaks out of the loop */
  1767. ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
  1768. if (ret != VM_FAULT_FALLBACK)
  1769. iter.status = iomap_iter_advance(&iter, PMD_SIZE);
  1770. }
  1771. unlock_entry:
  1772. dax_unlock_entry(&xas, entry);
  1773. fallback:
  1774. if (ret == VM_FAULT_FALLBACK) {
  1775. split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
  1776. count_vm_event(THP_FAULT_FALLBACK);
  1777. }
  1778. out:
  1779. trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
  1780. return ret;
  1781. }
  1782. #else
  1783. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
  1784. const struct iomap_ops *ops)
  1785. {
  1786. return VM_FAULT_FALLBACK;
  1787. }
  1788. #endif /* CONFIG_FS_DAX_PMD */
  1789. /**
  1790. * dax_iomap_fault - handle a page fault on a DAX file
  1791. * @vmf: The description of the fault
  1792. * @order: Order of the page to fault in
  1793. * @pfnp: PFN to insert for synchronous faults if fsync is required
  1794. * @iomap_errp: Storage for detailed error code in case of error
  1795. * @ops: Iomap ops passed from the file system
  1796. *
  1797. * When a page fault occurs, filesystems may call this helper in
  1798. * their fault handler for DAX files. dax_iomap_fault() assumes the caller
  1799. * has done all the necessary locking for page fault to proceed
  1800. * successfully.
  1801. */
  1802. vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
  1803. unsigned long *pfnp, int *iomap_errp,
  1804. const struct iomap_ops *ops)
  1805. {
  1806. if (order == 0)
  1807. return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
  1808. else if (order == PMD_ORDER)
  1809. return dax_iomap_pmd_fault(vmf, pfnp, ops);
  1810. else
  1811. return VM_FAULT_FALLBACK;
  1812. }
  1813. EXPORT_SYMBOL_GPL(dax_iomap_fault);
  1814. /*
  1815. * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
  1816. * @vmf: The description of the fault
  1817. * @pfn: PFN to insert
  1818. * @order: Order of entry to insert.
  1819. *
  1820. * This function inserts a writeable PTE or PMD entry into the page tables
  1821. * for an mmaped DAX file. It also marks the page cache entry as dirty.
  1822. */
  1823. static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
  1824. unsigned long pfn, unsigned int order)
  1825. {
  1826. struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  1827. XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
  1828. struct folio *folio;
  1829. void *entry;
  1830. vm_fault_t ret;
  1831. xas_lock_irq(&xas);
  1832. entry = get_next_unlocked_entry(&xas, order);
  1833. /* Did we race with someone splitting entry or so? */
  1834. if (!entry || dax_is_conflict(entry) ||
  1835. (order == 0 && !dax_is_pte_entry(entry))) {
  1836. put_unlocked_entry(&xas, entry, WAKE_NEXT);
  1837. xas_unlock_irq(&xas);
  1838. trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
  1839. VM_FAULT_NOPAGE);
  1840. return VM_FAULT_NOPAGE;
  1841. }
  1842. xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
  1843. dax_lock_entry(&xas, entry);
  1844. xas_unlock_irq(&xas);
  1845. folio = pfn_folio(pfn);
  1846. folio_ref_inc(folio);
  1847. if (order == 0)
  1848. ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
  1849. #ifdef CONFIG_FS_DAX_PMD
  1850. else if (order == PMD_ORDER)
  1851. ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE);
  1852. #endif
  1853. else
  1854. ret = VM_FAULT_FALLBACK;
  1855. folio_put(folio);
  1856. dax_unlock_entry(&xas, entry);
  1857. trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
  1858. return ret;
  1859. }
  1860. /**
  1861. * dax_finish_sync_fault - finish synchronous page fault
  1862. * @vmf: The description of the fault
  1863. * @order: Order of entry to be inserted
  1864. * @pfn: PFN to insert
  1865. *
  1866. * This function ensures that the file range touched by the page fault is
  1867. * stored persistently on the media and handles inserting of appropriate page
  1868. * table entry.
  1869. */
  1870. vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
  1871. unsigned long pfn)
  1872. {
  1873. int err;
  1874. loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
  1875. size_t len = PAGE_SIZE << order;
  1876. err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
  1877. if (err)
  1878. return VM_FAULT_SIGBUS;
  1879. return dax_insert_pfn_mkwrite(vmf, pfn, order);
  1880. }
  1881. EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
  1882. static int dax_range_compare_iter(struct iomap_iter *it_src,
  1883. struct iomap_iter *it_dest, u64 len, bool *same)
  1884. {
  1885. const struct iomap *smap = &it_src->iomap;
  1886. const struct iomap *dmap = &it_dest->iomap;
  1887. loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
  1888. void *saddr, *daddr;
  1889. int id, ret;
  1890. len = min(len, min(smap->length, dmap->length));
  1891. if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
  1892. *same = true;
  1893. goto advance;
  1894. }
  1895. if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
  1896. *same = false;
  1897. return 0;
  1898. }
  1899. id = dax_read_lock();
  1900. ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
  1901. &saddr, NULL);
  1902. if (ret < 0)
  1903. goto out_unlock;
  1904. ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
  1905. &daddr, NULL);
  1906. if (ret < 0)
  1907. goto out_unlock;
  1908. *same = !memcmp(saddr, daddr, len);
  1909. if (!*same)
  1910. len = 0;
  1911. dax_read_unlock(id);
  1912. advance:
  1913. ret = iomap_iter_advance(it_src, len);
  1914. if (!ret)
  1915. ret = iomap_iter_advance(it_dest, len);
  1916. return ret;
  1917. out_unlock:
  1918. dax_read_unlock(id);
  1919. return -EIO;
  1920. }
  1921. int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
  1922. struct inode *dst, loff_t dstoff, loff_t len, bool *same,
  1923. const struct iomap_ops *ops)
  1924. {
  1925. struct iomap_iter src_iter = {
  1926. .inode = src,
  1927. .pos = srcoff,
  1928. .len = len,
  1929. .flags = IOMAP_DAX,
  1930. };
  1931. struct iomap_iter dst_iter = {
  1932. .inode = dst,
  1933. .pos = dstoff,
  1934. .len = len,
  1935. .flags = IOMAP_DAX,
  1936. };
  1937. int ret, status;
  1938. while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
  1939. (ret = iomap_iter(&dst_iter, ops)) > 0) {
  1940. status = dax_range_compare_iter(&src_iter, &dst_iter,
  1941. min(src_iter.len, dst_iter.len), same);
  1942. if (status < 0)
  1943. return ret;
  1944. src_iter.status = dst_iter.status = status;
  1945. }
  1946. return ret;
  1947. }
  1948. int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
  1949. struct file *file_out, loff_t pos_out,
  1950. loff_t *len, unsigned int remap_flags,
  1951. const struct iomap_ops *ops)
  1952. {
  1953. return __generic_remap_file_range_prep(file_in, pos_in, file_out,
  1954. pos_out, len, remap_flags, ops);
  1955. }
  1956. EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);