truncate.c 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * mm/truncate.c - code for taking down pages from address_spaces
  4. *
  5. * Copyright (C) 2002, Linus Torvalds
  6. *
  7. * 10Sep2002 Andrew Morton
  8. * Initial version.
  9. */
  10. #include <linux/kernel.h>
  11. #include <linux/backing-dev.h>
  12. #include <linux/dax.h>
  13. #include <linux/gfp.h>
  14. #include <linux/mm.h>
  15. #include <linux/swap.h>
  16. #include <linux/export.h>
  17. #include <linux/pagemap.h>
  18. #include <linux/highmem.h>
  19. #include <linux/pagevec.h>
  20. #include <linux/task_io_accounting_ops.h>
  21. #include <linux/shmem_fs.h>
  22. #include <linux/rmap.h>
  23. #include "internal.h"
  24. static void clear_shadow_entries(struct address_space *mapping,
  25. unsigned long start, unsigned long max)
  26. {
  27. XA_STATE(xas, &mapping->i_pages, start);
  28. struct folio *folio;
  29. /* Handled by shmem itself, or for DAX we do nothing. */
  30. if (shmem_mapping(mapping) || dax_mapping(mapping))
  31. return;
  32. xas_set_update(&xas, workingset_update_node);
  33. spin_lock(&mapping->host->i_lock);
  34. xas_lock_irq(&xas);
  35. /* Clear all shadow entries from start to max */
  36. xas_for_each(&xas, folio, max) {
  37. if (xa_is_value(folio))
  38. xas_store(&xas, NULL);
  39. }
  40. xas_unlock_irq(&xas);
  41. if (mapping_shrinkable(mapping))
  42. inode_lru_list_add(mapping->host);
  43. spin_unlock(&mapping->host->i_lock);
  44. }
  45. /*
  46. * Unconditionally remove exceptional entries. Usually called from truncate
  47. * path. Note that the folio_batch may be altered by this function by removing
  48. * exceptional entries similar to what folio_batch_remove_exceptionals() does.
  49. * Please note that indices[] has entries in ascending order as guaranteed by
  50. * either find_get_entries() or find_lock_entries().
  51. */
  52. static void truncate_folio_batch_exceptionals(struct address_space *mapping,
  53. struct folio_batch *fbatch, pgoff_t *indices)
  54. {
  55. XA_STATE(xas, &mapping->i_pages, indices[0]);
  56. int nr = folio_batch_count(fbatch);
  57. struct folio *folio;
  58. int i, j;
  59. /* Handled by shmem itself */
  60. if (shmem_mapping(mapping))
  61. return;
  62. for (j = 0; j < nr; j++)
  63. if (xa_is_value(fbatch->folios[j]))
  64. break;
  65. if (j == nr)
  66. return;
  67. if (dax_mapping(mapping)) {
  68. for (i = j; i < nr; i++) {
  69. if (xa_is_value(fbatch->folios[i])) {
  70. /*
  71. * File systems should already have called
  72. * dax_break_layout_entry() to remove all DAX
  73. * entries while holding a lock to prevent
  74. * establishing new entries. Therefore we
  75. * shouldn't find any here.
  76. */
  77. WARN_ON_ONCE(1);
  78. /*
  79. * Delete the mapping so truncate_pagecache()
  80. * doesn't loop forever.
  81. */
  82. dax_delete_mapping_entry(mapping, indices[i]);
  83. }
  84. }
  85. goto out;
  86. }
  87. xas_set(&xas, indices[j]);
  88. xas_set_update(&xas, workingset_update_node);
  89. spin_lock(&mapping->host->i_lock);
  90. xas_lock_irq(&xas);
  91. xas_for_each(&xas, folio, indices[nr-1]) {
  92. if (xa_is_value(folio))
  93. xas_store(&xas, NULL);
  94. }
  95. xas_unlock_irq(&xas);
  96. if (mapping_shrinkable(mapping))
  97. inode_lru_list_add(mapping->host);
  98. spin_unlock(&mapping->host->i_lock);
  99. out:
  100. folio_batch_remove_exceptionals(fbatch);
  101. }
  102. /**
  103. * folio_invalidate - Invalidate part or all of a folio.
  104. * @folio: The folio which is affected.
  105. * @offset: start of the range to invalidate
  106. * @length: length of the range to invalidate
  107. *
  108. * folio_invalidate() is called when all or part of the folio has become
  109. * invalidated by a truncate operation.
  110. *
  111. * folio_invalidate() does not have to release all buffers, but it must
  112. * ensure that no dirty buffer is left outside @offset and that no I/O
  113. * is underway against any of the blocks which are outside the truncation
  114. * point. Because the caller is about to free (and possibly reuse) those
  115. * blocks on-disk.
  116. */
  117. void folio_invalidate(struct folio *folio, size_t offset, size_t length)
  118. {
  119. const struct address_space_operations *aops = folio->mapping->a_ops;
  120. if (aops->invalidate_folio)
  121. aops->invalidate_folio(folio, offset, length);
  122. }
  123. EXPORT_SYMBOL_GPL(folio_invalidate);
  124. /*
  125. * If truncate cannot remove the fs-private metadata from the page, the page
  126. * becomes orphaned. It will be left on the LRU and may even be mapped into
  127. * user pagetables if we're racing with filemap_fault().
  128. *
  129. * We need to bail out if page->mapping is no longer equal to the original
  130. * mapping. This happens a) when the VM reclaimed the page while we waited on
  131. * its lock, b) when a concurrent invalidate_mapping_pages got there first and
  132. * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
  133. */
  134. static void truncate_cleanup_folio(struct folio *folio)
  135. {
  136. if (folio_mapped(folio))
  137. unmap_mapping_folio(folio);
  138. if (folio_needs_release(folio))
  139. folio_invalidate(folio, 0, folio_size(folio));
  140. /*
  141. * Some filesystems seem to re-dirty the page even after
  142. * the VM has canceled the dirty bit (eg ext3 journaling).
  143. * Hence dirty accounting check is placed after invalidation.
  144. */
  145. folio_cancel_dirty(folio);
  146. }
  147. int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
  148. {
  149. if (folio->mapping != mapping)
  150. return -EIO;
  151. truncate_cleanup_folio(folio);
  152. filemap_remove_folio(folio);
  153. return 0;
  154. }
  155. static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at,
  156. unsigned long min_order)
  157. {
  158. enum ttu_flags ttu_flags =
  159. TTU_SYNC |
  160. TTU_SPLIT_HUGE_PMD |
  161. TTU_IGNORE_MLOCK;
  162. int ret;
  163. ret = try_folio_split_to_order(folio, split_at, min_order);
  164. /*
  165. * If the split fails, unmap the folio, so it will be refaulted
  166. * with PTEs to respect SIGBUS semantics.
  167. *
  168. * Make an exception for shmem/tmpfs that for long time
  169. * intentionally mapped with PMDs across i_size.
  170. */
  171. if (ret && !shmem_mapping(folio->mapping)) {
  172. try_to_unmap(folio, ttu_flags);
  173. WARN_ON(folio_mapped(folio));
  174. }
  175. return ret;
  176. }
  177. /*
  178. * Handle partial folios. The folio may be entirely within the
  179. * range if a split has raced with us. If not, we zero the part of the
  180. * folio that's within the [start, end] range, and then split the folio if
  181. * it's large. split_page_range() will discard pages which now lie beyond
  182. * i_size, and we rely on the caller to discard pages which lie within a
  183. * newly created hole.
  184. *
  185. * Returns false if splitting failed so the caller can avoid
  186. * discarding the entire folio which is stubbornly unsplit.
  187. */
  188. bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
  189. {
  190. loff_t pos = folio_pos(folio);
  191. size_t size = folio_size(folio);
  192. unsigned int offset, length;
  193. struct page *split_at, *split_at2;
  194. unsigned int min_order;
  195. if (pos < start)
  196. offset = start - pos;
  197. else
  198. offset = 0;
  199. if (pos + size <= (u64)end)
  200. length = size - offset;
  201. else
  202. length = end + 1 - pos - offset;
  203. folio_wait_writeback(folio);
  204. if (length == size) {
  205. truncate_inode_folio(folio->mapping, folio);
  206. return true;
  207. }
  208. /*
  209. * We may be zeroing pages we're about to discard, but it avoids
  210. * doing a complex calculation here, and then doing the zeroing
  211. * anyway if the page split fails.
  212. */
  213. if (!mapping_inaccessible(folio->mapping))
  214. folio_zero_range(folio, offset, length);
  215. if (folio_needs_release(folio))
  216. folio_invalidate(folio, offset, length);
  217. if (!folio_test_large(folio))
  218. return true;
  219. min_order = mapping_min_folio_order(folio->mapping);
  220. split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
  221. if (!try_folio_split_or_unmap(folio, split_at, min_order)) {
  222. /*
  223. * try to split at offset + length to make sure folios within
  224. * the range can be dropped, especially to avoid memory waste
  225. * for shmem truncate
  226. */
  227. struct folio *folio2;
  228. if (offset + length == size)
  229. goto no_split;
  230. split_at2 = folio_page(folio,
  231. PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
  232. folio2 = page_folio(split_at2);
  233. if (!folio_try_get(folio2))
  234. goto no_split;
  235. if (!folio_test_large(folio2))
  236. goto out;
  237. if (!folio_trylock(folio2))
  238. goto out;
  239. /* make sure folio2 is large and does not change its mapping */
  240. if (folio_test_large(folio2) &&
  241. folio2->mapping == folio->mapping)
  242. try_folio_split_or_unmap(folio2, split_at2, min_order);
  243. folio_unlock(folio2);
  244. out:
  245. folio_put(folio2);
  246. no_split:
  247. return true;
  248. }
  249. if (folio_test_dirty(folio))
  250. return false;
  251. truncate_inode_folio(folio->mapping, folio);
  252. return true;
  253. }
  254. /*
  255. * Used to get rid of pages on hardware memory corruption.
  256. */
  257. int generic_error_remove_folio(struct address_space *mapping,
  258. struct folio *folio)
  259. {
  260. if (!mapping)
  261. return -EINVAL;
  262. /*
  263. * Only punch for normal data pages for now.
  264. * Handling other types like directories would need more auditing.
  265. */
  266. if (!S_ISREG(mapping->host->i_mode))
  267. return -EIO;
  268. return truncate_inode_folio(mapping, folio);
  269. }
  270. EXPORT_SYMBOL(generic_error_remove_folio);
  271. /**
  272. * mapping_evict_folio() - Remove an unused folio from the page-cache.
  273. * @mapping: The mapping this folio belongs to.
  274. * @folio: The folio to remove.
  275. *
  276. * Safely remove one folio from the page cache.
  277. * It only drops clean, unused folios.
  278. *
  279. * Context: Folio must be locked.
  280. * Return: The number of pages successfully removed.
  281. */
  282. long mapping_evict_folio(struct address_space *mapping, struct folio *folio)
  283. {
  284. /* The page may have been truncated before it was locked */
  285. if (!mapping)
  286. return 0;
  287. if (folio_test_dirty(folio) || folio_test_writeback(folio))
  288. return 0;
  289. /* The refcount will be elevated if any page in the folio is mapped */
  290. if (folio_ref_count(folio) >
  291. folio_nr_pages(folio) + folio_has_private(folio) + 1)
  292. return 0;
  293. if (!filemap_release_folio(folio, 0))
  294. return 0;
  295. return remove_mapping(mapping, folio);
  296. }
  297. /**
  298. * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
  299. * @mapping: mapping to truncate
  300. * @lstart: offset from which to truncate
  301. * @lend: offset to which to truncate (inclusive)
  302. *
  303. * Truncate the page cache, removing the pages that are between
  304. * specified offsets (and zeroing out partial pages
  305. * if lstart or lend + 1 is not page aligned).
  306. *
  307. * Truncate takes two passes - the first pass is nonblocking. It will not
  308. * block on page locks and it will not block on writeback. The second pass
  309. * will wait. This is to prevent as much IO as possible in the affected region.
  310. * The first pass will remove most pages, so the search cost of the second pass
  311. * is low.
  312. *
  313. * We pass down the cache-hot hint to the page freeing code. Even if the
  314. * mapping is large, it is probably the case that the final pages are the most
  315. * recently touched, and freeing happens in ascending file offset order.
  316. *
  317. * Note that since ->invalidate_folio() accepts range to invalidate
  318. * truncate_inode_pages_range is able to handle cases where lend + 1 is not
  319. * page aligned properly.
  320. */
  321. void truncate_inode_pages_range(struct address_space *mapping,
  322. loff_t lstart, uoff_t lend)
  323. {
  324. pgoff_t start; /* inclusive */
  325. pgoff_t end; /* exclusive */
  326. struct folio_batch fbatch;
  327. pgoff_t indices[PAGEVEC_SIZE];
  328. pgoff_t index;
  329. int i;
  330. struct folio *folio;
  331. bool same_folio;
  332. if (mapping_empty(mapping))
  333. return;
  334. /*
  335. * 'start' and 'end' always covers the range of pages to be fully
  336. * truncated. Partial pages are covered with 'partial_start' at the
  337. * start of the range and 'partial_end' at the end of the range.
  338. * Note that 'end' is exclusive while 'lend' is inclusive.
  339. */
  340. start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
  341. if (lend == -1)
  342. /*
  343. * lend == -1 indicates end-of-file so we have to set 'end'
  344. * to the highest possible pgoff_t and since the type is
  345. * unsigned we're using -1.
  346. */
  347. end = -1;
  348. else
  349. end = (lend + 1) >> PAGE_SHIFT;
  350. folio_batch_init(&fbatch);
  351. index = start;
  352. while (index < end && find_lock_entries(mapping, &index, end - 1,
  353. &fbatch, indices)) {
  354. truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
  355. for (i = 0; i < folio_batch_count(&fbatch); i++)
  356. truncate_cleanup_folio(fbatch.folios[i]);
  357. delete_from_page_cache_batch(mapping, &fbatch);
  358. for (i = 0; i < folio_batch_count(&fbatch); i++)
  359. folio_unlock(fbatch.folios[i]);
  360. folio_batch_release(&fbatch);
  361. cond_resched();
  362. }
  363. same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
  364. folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
  365. if (!IS_ERR(folio)) {
  366. same_folio = lend < folio_next_pos(folio);
  367. if (!truncate_inode_partial_folio(folio, lstart, lend)) {
  368. start = folio_next_index(folio);
  369. if (same_folio)
  370. end = folio->index;
  371. }
  372. folio_unlock(folio);
  373. folio_put(folio);
  374. folio = NULL;
  375. }
  376. if (!same_folio) {
  377. folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
  378. FGP_LOCK, 0);
  379. if (!IS_ERR(folio)) {
  380. if (!truncate_inode_partial_folio(folio, lstart, lend))
  381. end = folio->index;
  382. folio_unlock(folio);
  383. folio_put(folio);
  384. }
  385. }
  386. index = start;
  387. while (index < end) {
  388. cond_resched();
  389. if (!find_get_entries(mapping, &index, end - 1, &fbatch,
  390. indices)) {
  391. /* If all gone from start onwards, we're done */
  392. if (index == start)
  393. break;
  394. /* Otherwise restart to make sure all gone */
  395. index = start;
  396. continue;
  397. }
  398. for (i = 0; i < folio_batch_count(&fbatch); i++) {
  399. struct folio *folio = fbatch.folios[i];
  400. /* We rely upon deletion not changing folio->index */
  401. if (xa_is_value(folio))
  402. continue;
  403. folio_lock(folio);
  404. VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
  405. folio_wait_writeback(folio);
  406. truncate_inode_folio(mapping, folio);
  407. folio_unlock(folio);
  408. }
  409. truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
  410. folio_batch_release(&fbatch);
  411. }
  412. }
  413. EXPORT_SYMBOL(truncate_inode_pages_range);
  414. /**
  415. * truncate_inode_pages - truncate *all* the pages from an offset
  416. * @mapping: mapping to truncate
  417. * @lstart: offset from which to truncate
  418. *
  419. * Called under (and serialised by) inode->i_rwsem and
  420. * mapping->invalidate_lock.
  421. *
  422. * Note: When this function returns, there can be a page in the process of
  423. * deletion (inside __filemap_remove_folio()) in the specified range. Thus
  424. * mapping->nrpages can be non-zero when this function returns even after
  425. * truncation of the whole mapping.
  426. */
  427. void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
  428. {
  429. truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
  430. }
  431. EXPORT_SYMBOL(truncate_inode_pages);
  432. /**
  433. * truncate_inode_pages_final - truncate *all* pages before inode dies
  434. * @mapping: mapping to truncate
  435. *
  436. * Called under (and serialized by) inode->i_rwsem.
  437. *
  438. * Filesystems have to use this in the .evict_inode path to inform the
  439. * VM that this is the final truncate and the inode is going away.
  440. */
  441. void truncate_inode_pages_final(struct address_space *mapping)
  442. {
  443. /*
  444. * Page reclaim can not participate in regular inode lifetime
  445. * management (can't call iput()) and thus can race with the
  446. * inode teardown. Tell it when the address space is exiting,
  447. * so that it does not install eviction information after the
  448. * final truncate has begun.
  449. */
  450. mapping_set_exiting(mapping);
  451. if (!mapping_empty(mapping)) {
  452. /*
  453. * As truncation uses a lockless tree lookup, cycle
  454. * the tree lock to make sure any ongoing tree
  455. * modification that does not see AS_EXITING is
  456. * completed before starting the final truncate.
  457. */
  458. xa_lock_irq(&mapping->i_pages);
  459. xa_unlock_irq(&mapping->i_pages);
  460. }
  461. truncate_inode_pages(mapping, 0);
  462. }
  463. EXPORT_SYMBOL(truncate_inode_pages_final);
  464. /**
  465. * mapping_try_invalidate - Invalidate all the evictable folios of one inode
  466. * @mapping: the address_space which holds the folios to invalidate
  467. * @start: the offset 'from' which to invalidate
  468. * @end: the offset 'to' which to invalidate (inclusive)
  469. * @nr_failed: How many folio invalidations failed
  470. *
  471. * This function is similar to invalidate_mapping_pages(), except that it
  472. * returns the number of folios which could not be evicted in @nr_failed.
  473. */
  474. unsigned long mapping_try_invalidate(struct address_space *mapping,
  475. pgoff_t start, pgoff_t end, unsigned long *nr_failed)
  476. {
  477. pgoff_t indices[PAGEVEC_SIZE];
  478. struct folio_batch fbatch;
  479. pgoff_t index = start;
  480. unsigned long ret;
  481. unsigned long count = 0;
  482. int i;
  483. folio_batch_init(&fbatch);
  484. while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
  485. bool xa_has_values = false;
  486. int nr = folio_batch_count(&fbatch);
  487. for (i = 0; i < nr; i++) {
  488. struct folio *folio = fbatch.folios[i];
  489. /* We rely upon deletion not changing folio->index */
  490. if (xa_is_value(folio)) {
  491. xa_has_values = true;
  492. count++;
  493. continue;
  494. }
  495. ret = mapping_evict_folio(mapping, folio);
  496. folio_unlock(folio);
  497. /*
  498. * Invalidation is a hint that the folio is no longer
  499. * of interest and try to speed up its reclaim.
  500. */
  501. if (!ret) {
  502. deactivate_file_folio(folio);
  503. /* Likely in the lru cache of a remote CPU */
  504. if (nr_failed)
  505. (*nr_failed)++;
  506. }
  507. count += ret;
  508. }
  509. if (xa_has_values)
  510. clear_shadow_entries(mapping, indices[0], indices[nr-1]);
  511. folio_batch_remove_exceptionals(&fbatch);
  512. folio_batch_release(&fbatch);
  513. cond_resched();
  514. }
  515. return count;
  516. }
  517. /**
  518. * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
  519. * @mapping: the address_space which holds the cache to invalidate
  520. * @start: the offset 'from' which to invalidate
  521. * @end: the offset 'to' which to invalidate (inclusive)
  522. *
  523. * This function removes pages that are clean, unmapped and unlocked,
  524. * as well as shadow entries. It will not block on IO activity.
  525. *
  526. * If you want to remove all the pages of one inode, regardless of
  527. * their use and writeback state, use truncate_inode_pages().
  528. *
  529. * Return: The number of indices that had their contents invalidated
  530. */
  531. unsigned long invalidate_mapping_pages(struct address_space *mapping,
  532. pgoff_t start, pgoff_t end)
  533. {
  534. return mapping_try_invalidate(mapping, start, end, NULL);
  535. }
  536. EXPORT_SYMBOL(invalidate_mapping_pages);
  537. static int folio_launder(struct address_space *mapping, struct folio *folio)
  538. {
  539. if (!folio_test_dirty(folio))
  540. return 0;
  541. if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
  542. return 0;
  543. return mapping->a_ops->launder_folio(folio);
  544. }
  545. /*
  546. * This is like mapping_evict_folio(), except it ignores the folio's
  547. * refcount. We do this because invalidate_inode_pages2() needs stronger
  548. * invalidation guarantees, and cannot afford to leave folios behind because
  549. * shrink_folio_list() has a temp ref on them, or because they're transiently
  550. * sitting in the folio_add_lru() caches.
  551. */
  552. int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
  553. gfp_t gfp)
  554. {
  555. int ret;
  556. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  557. if (folio_mapped(folio))
  558. unmap_mapping_folio(folio);
  559. BUG_ON(folio_mapped(folio));
  560. ret = folio_launder(mapping, folio);
  561. if (ret)
  562. return ret;
  563. if (folio->mapping != mapping)
  564. return -EBUSY;
  565. if (!filemap_release_folio(folio, gfp))
  566. return -EBUSY;
  567. spin_lock(&mapping->host->i_lock);
  568. xa_lock_irq(&mapping->i_pages);
  569. if (folio_test_dirty(folio))
  570. goto failed;
  571. BUG_ON(folio_has_private(folio));
  572. __filemap_remove_folio(folio, NULL);
  573. xa_unlock_irq(&mapping->i_pages);
  574. if (mapping_shrinkable(mapping))
  575. inode_lru_list_add(mapping->host);
  576. spin_unlock(&mapping->host->i_lock);
  577. filemap_free_folio(mapping, folio);
  578. return 1;
  579. failed:
  580. xa_unlock_irq(&mapping->i_pages);
  581. spin_unlock(&mapping->host->i_lock);
  582. return -EBUSY;
  583. }
  584. /**
  585. * invalidate_inode_pages2_range - remove range of pages from an address_space
  586. * @mapping: the address_space
  587. * @start: the page offset 'from' which to invalidate
  588. * @end: the page offset 'to' which to invalidate (inclusive)
  589. *
  590. * Any pages which are found to be mapped into pagetables are unmapped prior to
  591. * invalidation.
  592. *
  593. * Return: -EBUSY if any pages could not be invalidated.
  594. */
  595. int invalidate_inode_pages2_range(struct address_space *mapping,
  596. pgoff_t start, pgoff_t end)
  597. {
  598. pgoff_t indices[PAGEVEC_SIZE];
  599. struct folio_batch fbatch;
  600. pgoff_t index;
  601. int i;
  602. int ret = 0;
  603. int ret2 = 0;
  604. int did_range_unmap = 0;
  605. if (mapping_empty(mapping))
  606. return 0;
  607. folio_batch_init(&fbatch);
  608. index = start;
  609. while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
  610. bool xa_has_values = false;
  611. int nr = folio_batch_count(&fbatch);
  612. for (i = 0; i < nr; i++) {
  613. struct folio *folio = fbatch.folios[i];
  614. /* We rely upon deletion not changing folio->index */
  615. if (xa_is_value(folio)) {
  616. xa_has_values = true;
  617. if (dax_mapping(mapping) &&
  618. !dax_invalidate_mapping_entry_sync(mapping, indices[i]))
  619. ret = -EBUSY;
  620. continue;
  621. }
  622. if (!did_range_unmap && folio_mapped(folio)) {
  623. /*
  624. * If folio is mapped, before taking its lock,
  625. * zap the rest of the file in one hit.
  626. */
  627. unmap_mapping_pages(mapping, indices[i],
  628. (1 + end - indices[i]), false);
  629. did_range_unmap = 1;
  630. }
  631. folio_lock(folio);
  632. if (unlikely(folio->mapping != mapping)) {
  633. folio_unlock(folio);
  634. continue;
  635. }
  636. VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
  637. folio_wait_writeback(folio);
  638. ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL);
  639. if (ret2 < 0)
  640. ret = ret2;
  641. folio_unlock(folio);
  642. }
  643. if (xa_has_values)
  644. clear_shadow_entries(mapping, indices[0], indices[nr-1]);
  645. folio_batch_remove_exceptionals(&fbatch);
  646. folio_batch_release(&fbatch);
  647. cond_resched();
  648. }
  649. /*
  650. * For DAX we invalidate page tables after invalidating page cache. We
  651. * could invalidate page tables while invalidating each entry however
  652. * that would be expensive. And doing range unmapping before doesn't
  653. * work as we have no cheap way to find whether page cache entry didn't
  654. * get remapped later.
  655. */
  656. if (dax_mapping(mapping)) {
  657. unmap_mapping_pages(mapping, start, end - start + 1, false);
  658. }
  659. return ret;
  660. }
  661. EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
  662. /**
  663. * invalidate_inode_pages2 - remove all pages from an address_space
  664. * @mapping: the address_space
  665. *
  666. * Any pages which are found to be mapped into pagetables are unmapped prior to
  667. * invalidation.
  668. *
  669. * Return: -EBUSY if any pages could not be invalidated.
  670. */
  671. int invalidate_inode_pages2(struct address_space *mapping)
  672. {
  673. return invalidate_inode_pages2_range(mapping, 0, -1);
  674. }
  675. EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
  676. /**
  677. * truncate_pagecache - unmap and remove pagecache that has been truncated
  678. * @inode: inode
  679. * @newsize: new file size
  680. *
  681. * inode's new i_size must already be written before truncate_pagecache
  682. * is called.
  683. *
  684. * This function should typically be called before the filesystem
  685. * releases resources associated with the freed range (eg. deallocates
  686. * blocks). This way, pagecache will always stay logically coherent
  687. * with on-disk format, and the filesystem would not have to deal with
  688. * situations such as writepage being called for a page that has already
  689. * had its underlying blocks deallocated.
  690. */
  691. void truncate_pagecache(struct inode *inode, loff_t newsize)
  692. {
  693. struct address_space *mapping = inode->i_mapping;
  694. loff_t holebegin = round_up(newsize, PAGE_SIZE);
  695. /*
  696. * unmap_mapping_range is called twice, first simply for
  697. * efficiency so that truncate_inode_pages does fewer
  698. * single-page unmaps. However after this first call, and
  699. * before truncate_inode_pages finishes, it is possible for
  700. * private pages to be COWed, which remain after
  701. * truncate_inode_pages finishes, hence the second
  702. * unmap_mapping_range call must be made for correctness.
  703. */
  704. unmap_mapping_range(mapping, holebegin, 0, 1);
  705. truncate_inode_pages(mapping, newsize);
  706. unmap_mapping_range(mapping, holebegin, 0, 1);
  707. }
  708. EXPORT_SYMBOL(truncate_pagecache);
  709. /**
  710. * truncate_setsize - update inode and pagecache for a new file size
  711. * @inode: inode
  712. * @newsize: new file size
  713. *
  714. * truncate_setsize updates i_size and performs pagecache truncation (if
  715. * necessary) to @newsize. It will be typically be called from the filesystem's
  716. * setattr function when ATTR_SIZE is passed in.
  717. *
  718. * Must be called with a lock serializing truncates and writes (generally
  719. * i_rwsem but e.g. xfs uses a different lock) and before all filesystem
  720. * specific block truncation has been performed.
  721. */
  722. void truncate_setsize(struct inode *inode, loff_t newsize)
  723. {
  724. loff_t oldsize = inode->i_size;
  725. i_size_write(inode, newsize);
  726. if (newsize > oldsize)
  727. pagecache_isize_extended(inode, oldsize, newsize);
  728. truncate_pagecache(inode, newsize);
  729. }
  730. EXPORT_SYMBOL(truncate_setsize);
  731. /**
  732. * pagecache_isize_extended - update pagecache after extension of i_size
  733. * @inode: inode for which i_size was extended
  734. * @from: original inode size
  735. * @to: new inode size
  736. *
  737. * Handle extension of inode size either caused by extending truncate or
  738. * by write starting after current i_size. We mark the page straddling
  739. * current i_size RO so that page_mkwrite() is called on the first
  740. * write access to the page. The filesystem will update its per-block
  741. * information before user writes to the page via mmap after the i_size
  742. * has been changed.
  743. *
  744. * The function must be called after i_size is updated so that page fault
  745. * coming after we unlock the folio will already see the new i_size.
  746. * The function must be called while we still hold i_rwsem - this not only
  747. * makes sure i_size is stable but also that userspace cannot observe new
  748. * i_size value before we are prepared to store mmap writes at new inode size.
  749. */
  750. void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
  751. {
  752. int bsize = i_blocksize(inode);
  753. loff_t rounded_from;
  754. struct folio *folio;
  755. WARN_ON(to > inode->i_size);
  756. if (from >= to || bsize >= PAGE_SIZE)
  757. return;
  758. /* Page straddling @from will not have any hole block created? */
  759. rounded_from = round_up(from, bsize);
  760. if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
  761. return;
  762. folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE);
  763. /* Folio not cached? Nothing to do */
  764. if (IS_ERR(folio))
  765. return;
  766. /*
  767. * See folio_clear_dirty_for_io() for details why folio_mark_dirty()
  768. * is needed.
  769. */
  770. if (folio_mkclean(folio))
  771. folio_mark_dirty(folio);
  772. /*
  773. * The post-eof range of the folio must be zeroed before it is exposed
  774. * to the file. Writeback normally does this, but since i_size has been
  775. * increased we handle it here.
  776. */
  777. if (folio_test_dirty(folio)) {
  778. unsigned int offset, end;
  779. offset = from - folio_pos(folio);
  780. end = min_t(unsigned int, to - folio_pos(folio),
  781. folio_size(folio));
  782. folio_zero_segment(folio, offset, end);
  783. }
  784. folio_unlock(folio);
  785. folio_put(folio);
  786. }
  787. EXPORT_SYMBOL(pagecache_isize_extended);
  788. /**
  789. * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
  790. * @inode: inode
  791. * @lstart: offset of beginning of hole
  792. * @lend: offset of last byte of hole
  793. *
  794. * This function should typically be called before the filesystem
  795. * releases resources associated with the freed range (eg. deallocates
  796. * blocks). This way, pagecache will always stay logically coherent
  797. * with on-disk format, and the filesystem would not have to deal with
  798. * situations such as writepage being called for a page that has already
  799. * had its underlying blocks deallocated.
  800. */
  801. void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
  802. {
  803. struct address_space *mapping = inode->i_mapping;
  804. loff_t unmap_start = round_up(lstart, PAGE_SIZE);
  805. loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
  806. /*
  807. * This rounding is currently just for example: unmap_mapping_range
  808. * expands its hole outwards, whereas we want it to contract the hole
  809. * inwards. However, existing callers of truncate_pagecache_range are
  810. * doing their own page rounding first. Note that unmap_mapping_range
  811. * allows holelen 0 for all, and we allow lend -1 for end of file.
  812. */
  813. /*
  814. * Unlike in truncate_pagecache, unmap_mapping_range is called only
  815. * once (before truncating pagecache), and without "even_cows" flag:
  816. * hole-punching should not remove private COWed pages from the hole.
  817. */
  818. if ((u64)unmap_end > (u64)unmap_start)
  819. unmap_mapping_range(mapping, unmap_start,
  820. 1 + unmap_end - unmap_start, 0);
  821. truncate_inode_pages_range(mapping, lstart, lend);
  822. }
  823. EXPORT_SYMBOL(truncate_pagecache_range);