swap_state.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * linux/mm/swap_state.c
  4. *
  5. * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
  6. * Swap reorganised 29.12.95, Stephen Tweedie
  7. *
  8. * Rewritten to use page cache, (C) 1998 Stephen Tweedie
  9. */
  10. #include <linux/mm.h>
  11. #include <linux/gfp.h>
  12. #include <linux/kernel_stat.h>
  13. #include <linux/mempolicy.h>
  14. #include <linux/swap.h>
  15. #include <linux/leafops.h>
  16. #include <linux/init.h>
  17. #include <linux/pagemap.h>
  18. #include <linux/pagevec.h>
  19. #include <linux/backing-dev.h>
  20. #include <linux/blkdev.h>
  21. #include <linux/migrate.h>
  22. #include <linux/vmalloc.h>
  23. #include <linux/huge_mm.h>
  24. #include <linux/shmem_fs.h>
  25. #include "internal.h"
  26. #include "swap_table.h"
  27. #include "swap.h"
  28. /*
  29. * swapper_space is a fiction, retained to simplify the path through
  30. * vmscan's shrink_folio_list.
  31. */
  32. static const struct address_space_operations swap_aops = {
  33. .dirty_folio = noop_dirty_folio,
  34. #ifdef CONFIG_MIGRATION
  35. .migrate_folio = migrate_folio,
  36. #endif
  37. };
  38. struct address_space swap_space __read_mostly = {
  39. .a_ops = &swap_aops,
  40. };
  41. static bool enable_vma_readahead __read_mostly = true;
  42. #define SWAP_RA_ORDER_CEILING 5
  43. #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
  44. #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
  45. #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK
  46. #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
  47. #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK)
  48. #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
  49. #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK)
  50. #define SWAP_RA_VAL(addr, win, hits) \
  51. (((addr) & PAGE_MASK) | \
  52. (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \
  53. ((hits) & SWAP_RA_HITS_MASK))
  54. /* Initial readahead hits is 4 to start up with a small window */
  55. #define GET_SWAP_RA_VAL(vma) \
  56. (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
  57. static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
  58. void show_swap_cache_info(void)
  59. {
  60. printk("%lu pages in swap cache\n", total_swapcache_pages());
  61. printk("Free swap = %ldkB\n", K(get_nr_swap_pages()));
  62. printk("Total swap = %lukB\n", K(total_swap_pages));
  63. }
  64. /**
  65. * swap_cache_get_folio - Looks up a folio in the swap cache.
  66. * @entry: swap entry used for the lookup.
  67. *
  68. * A found folio will be returned unlocked and with its refcount increased.
  69. *
  70. * Context: Caller must ensure @entry is valid and protect the swap device
  71. * with reference count or locks.
  72. * Return: Returns the found folio on success, NULL otherwise. The caller
  73. * must lock and check if the folio still matches the swap entry before
  74. * use (e.g., folio_matches_swap_entry).
  75. */
  76. struct folio *swap_cache_get_folio(swp_entry_t entry)
  77. {
  78. unsigned long swp_tb;
  79. struct folio *folio;
  80. for (;;) {
  81. swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
  82. swp_cluster_offset(entry));
  83. if (!swp_tb_is_folio(swp_tb))
  84. return NULL;
  85. folio = swp_tb_to_folio(swp_tb);
  86. if (likely(folio_try_get(folio)))
  87. return folio;
  88. }
  89. return NULL;
  90. }
  91. /**
  92. * swap_cache_has_folio - Check if a swap slot has cache.
  93. * @entry: swap entry indicating the slot.
  94. *
  95. * Context: Caller must ensure @entry is valid and protect the swap
  96. * device with reference count or locks.
  97. */
  98. bool swap_cache_has_folio(swp_entry_t entry)
  99. {
  100. unsigned long swp_tb;
  101. swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
  102. swp_cluster_offset(entry));
  103. return swp_tb_is_folio(swp_tb);
  104. }
  105. /**
  106. * swap_cache_get_shadow - Looks up a shadow in the swap cache.
  107. * @entry: swap entry used for the lookup.
  108. *
  109. * Context: Caller must ensure @entry is valid and protect the swap device
  110. * with reference count or locks.
  111. * Return: Returns either NULL or an XA_VALUE (shadow).
  112. */
  113. void *swap_cache_get_shadow(swp_entry_t entry)
  114. {
  115. unsigned long swp_tb;
  116. swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
  117. swp_cluster_offset(entry));
  118. if (swp_tb_is_shadow(swp_tb))
  119. return swp_tb_to_shadow(swp_tb);
  120. return NULL;
  121. }
  122. void __swap_cache_add_folio(struct swap_cluster_info *ci,
  123. struct folio *folio, swp_entry_t entry)
  124. {
  125. unsigned long new_tb;
  126. unsigned int ci_start, ci_off, ci_end;
  127. unsigned long nr_pages = folio_nr_pages(folio);
  128. VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
  129. VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
  130. VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
  131. new_tb = folio_to_swp_tb(folio);
  132. ci_start = swp_cluster_offset(entry);
  133. ci_off = ci_start;
  134. ci_end = ci_start + nr_pages;
  135. do {
  136. VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off)));
  137. __swap_table_set(ci, ci_off, new_tb);
  138. } while (++ci_off < ci_end);
  139. folio_ref_add(folio, nr_pages);
  140. folio_set_swapcache(folio);
  141. folio->swap = entry;
  142. node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
  143. lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
  144. }
  145. /**
  146. * swap_cache_add_folio - Add a folio into the swap cache.
  147. * @folio: The folio to be added.
  148. * @entry: The swap entry corresponding to the folio.
  149. * @gfp: gfp_mask for XArray node allocation.
  150. * @shadowp: If a shadow is found, return the shadow.
  151. *
  152. * Context: Caller must ensure @entry is valid and protect the swap device
  153. * with reference count or locks.
  154. */
  155. static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
  156. void **shadowp)
  157. {
  158. int err;
  159. void *shadow = NULL;
  160. unsigned long old_tb;
  161. struct swap_info_struct *si;
  162. struct swap_cluster_info *ci;
  163. unsigned int ci_start, ci_off, ci_end, offset;
  164. unsigned long nr_pages = folio_nr_pages(folio);
  165. si = __swap_entry_to_info(entry);
  166. ci_start = swp_cluster_offset(entry);
  167. ci_end = ci_start + nr_pages;
  168. ci_off = ci_start;
  169. offset = swp_offset(entry);
  170. ci = swap_cluster_lock(si, swp_offset(entry));
  171. if (unlikely(!ci->table)) {
  172. err = -ENOENT;
  173. goto failed;
  174. }
  175. do {
  176. old_tb = __swap_table_get(ci, ci_off);
  177. if (unlikely(swp_tb_is_folio(old_tb))) {
  178. err = -EEXIST;
  179. goto failed;
  180. }
  181. if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) {
  182. err = -ENOENT;
  183. goto failed;
  184. }
  185. if (swp_tb_is_shadow(old_tb))
  186. shadow = swp_tb_to_shadow(old_tb);
  187. offset++;
  188. } while (++ci_off < ci_end);
  189. __swap_cache_add_folio(ci, folio, entry);
  190. swap_cluster_unlock(ci);
  191. if (shadowp)
  192. *shadowp = shadow;
  193. return 0;
  194. failed:
  195. swap_cluster_unlock(ci);
  196. return err;
  197. }
  198. /**
  199. * __swap_cache_del_folio - Removes a folio from the swap cache.
  200. * @ci: The locked swap cluster.
  201. * @folio: The folio.
  202. * @entry: The first swap entry that the folio corresponds to.
  203. * @shadow: shadow value to be filled in the swap cache.
  204. *
  205. * Removes a folio from the swap cache and fills a shadow in place.
  206. * This won't put the folio's refcount. The caller has to do that.
  207. *
  208. * Context: Caller must ensure the folio is locked and in the swap cache
  209. * using the index of @entry, and lock the cluster that holds the entries.
  210. */
  211. void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
  212. swp_entry_t entry, void *shadow)
  213. {
  214. struct swap_info_struct *si;
  215. unsigned long old_tb, new_tb;
  216. unsigned int ci_start, ci_off, ci_end;
  217. bool folio_swapped = false, need_free = false;
  218. unsigned long nr_pages = folio_nr_pages(folio);
  219. VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
  220. VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
  221. VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
  222. VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);
  223. si = __swap_entry_to_info(entry);
  224. new_tb = shadow_swp_to_tb(shadow);
  225. ci_start = swp_cluster_offset(entry);
  226. ci_end = ci_start + nr_pages;
  227. ci_off = ci_start;
  228. do {
  229. /* If shadow is NULL, we sets an empty shadow */
  230. old_tb = __swap_table_xchg(ci, ci_off, new_tb);
  231. WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
  232. swp_tb_to_folio(old_tb) != folio);
  233. if (__swap_count(swp_entry(si->type,
  234. swp_offset(entry) + ci_off - ci_start)))
  235. folio_swapped = true;
  236. else
  237. need_free = true;
  238. } while (++ci_off < ci_end);
  239. folio->swap.val = 0;
  240. folio_clear_swapcache(folio);
  241. node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
  242. lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
  243. if (!folio_swapped) {
  244. swap_entries_free(si, ci, swp_offset(entry), nr_pages);
  245. } else if (need_free) {
  246. do {
  247. if (!__swap_count(entry))
  248. swap_entries_free(si, ci, swp_offset(entry), 1);
  249. entry.val++;
  250. } while (--nr_pages);
  251. }
  252. }
  253. /**
  254. * swap_cache_del_folio - Removes a folio from the swap cache.
  255. * @folio: The folio.
  256. *
  257. * Same as __swap_cache_del_folio, but handles lock and refcount. The
  258. * caller must ensure the folio is either clean or has a swap count
  259. * equal to zero, or it may cause data loss.
  260. *
  261. * Context: Caller must ensure the folio is locked and in the swap cache.
  262. */
  263. void swap_cache_del_folio(struct folio *folio)
  264. {
  265. struct swap_cluster_info *ci;
  266. swp_entry_t entry = folio->swap;
  267. ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
  268. __swap_cache_del_folio(ci, folio, entry, NULL);
  269. swap_cluster_unlock(ci);
  270. folio_ref_sub(folio, folio_nr_pages(folio));
  271. }
  272. /**
  273. * __swap_cache_replace_folio - Replace a folio in the swap cache.
  274. * @ci: The locked swap cluster.
  275. * @old: The old folio to be replaced.
  276. * @new: The new folio.
  277. *
  278. * Replace an existing folio in the swap cache with a new folio. The
  279. * caller is responsible for setting up the new folio's flag and swap
  280. * entries. Replacement will take the new folio's swap entry value as
  281. * the starting offset to override all slots covered by the new folio.
  282. *
  283. * Context: Caller must ensure both folios are locked, and lock the
  284. * cluster that holds the old folio to be replaced.
  285. */
  286. void __swap_cache_replace_folio(struct swap_cluster_info *ci,
  287. struct folio *old, struct folio *new)
  288. {
  289. swp_entry_t entry = new->swap;
  290. unsigned long nr_pages = folio_nr_pages(new);
  291. unsigned int ci_off = swp_cluster_offset(entry);
  292. unsigned int ci_end = ci_off + nr_pages;
  293. unsigned long old_tb, new_tb;
  294. VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new));
  295. VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new));
  296. VM_WARN_ON_ONCE(!entry.val);
  297. /* Swap cache still stores N entries instead of a high-order entry */
  298. new_tb = folio_to_swp_tb(new);
  299. do {
  300. old_tb = __swap_table_xchg(ci, ci_off, new_tb);
  301. WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
  302. } while (++ci_off < ci_end);
  303. /*
  304. * If the old folio is partially replaced (e.g., splitting a large
  305. * folio, the old folio is shrunk, and new split sub folios replace
  306. * the shrunk part), ensure the new folio doesn't overlap it.
  307. */
  308. if (IS_ENABLED(CONFIG_DEBUG_VM) &&
  309. folio_order(old) != folio_order(new)) {
  310. ci_off = swp_cluster_offset(old->swap);
  311. ci_end = ci_off + folio_nr_pages(old);
  312. while (ci_off++ < ci_end)
  313. WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old);
  314. }
  315. }
  316. /**
  317. * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache.
  318. * @entry: The starting index entry.
  319. * @nr_ents: How many slots need to be cleared.
  320. *
  321. * Context: Caller must ensure the range is valid, all in one single cluster,
  322. * not occupied by any folio, and lock the cluster.
  323. */
  324. void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents)
  325. {
  326. struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
  327. unsigned int ci_off = swp_cluster_offset(entry), ci_end;
  328. unsigned long old;
  329. ci_end = ci_off + nr_ents;
  330. do {
  331. old = __swap_table_xchg(ci, ci_off, null_to_swp_tb());
  332. WARN_ON_ONCE(swp_tb_is_folio(old));
  333. } while (++ci_off < ci_end);
  334. }
  /*
 * If we are the only user, then try to free up the swap cache.
 *
 * Its ok to check the swapcache flag without the folio lock
 * here because we are going to recheck again inside
 * folio_free_swap() _with_ the lock.
 * - Marcelo
 */
void free_swap_cache(struct folio *folio)
{
	/* Nothing to do unless the folio is an unmapped swap cache folio. */
	if (!folio_test_swapcache(folio))
		return;
	if (folio_mapped(folio))
		return;

	/* Best effort only: give up if the lock is contended. */
	if (!folio_trylock(folio))
		return;

	folio_free_swap(folio);
	folio_unlock(folio);
}
  /*
 * Freeing a folio and also freeing any swap cache associated with
 * this folio if it is the last user.
 */
void free_folio_and_swap_cache(struct folio *folio)
{
	free_swap_cache(folio);

	/* The huge zero folio is not released through this path. */
	if (is_huge_zero_folio(folio))
		return;

	folio_put(folio);
}
  361. /*
  362. * Passed an array of pages, drop them all from swapcache and then release
  363. * them. They are removed from the LRU and freed if this is their last use.
  364. */
  365. void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
  366. {
  367. struct folio_batch folios;
  368. unsigned int refs[PAGEVEC_SIZE];
  369. folio_batch_init(&folios);
  370. for (int i = 0; i < nr; i++) {
  371. struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
  372. free_swap_cache(folio);
  373. refs[folios.nr] = 1;
  374. if (unlikely(encoded_page_flags(pages[i]) &
  375. ENCODED_PAGE_BIT_NR_PAGES_NEXT))
  376. refs[folios.nr] = encoded_nr_pages(pages[++i]);
  377. if (folio_batch_add(&folios, folio) == 0)
  378. folios_put_refs(&folios, refs);
  379. }
  380. if (folios.nr)
  381. folios_put_refs(&folios, refs);
  382. }
  383. static inline bool swap_use_vma_readahead(void)
  384. {
  385. return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
  386. }
  387. /**
  388. * swap_update_readahead - Update the readahead statistics of VMA or globally.
  389. * @folio: the swap cache folio that just got hit.
  390. * @vma: the VMA that should be updated, could be NULL for global update.
  391. * @addr: the addr that triggered the swapin, ignored if @vma is NULL.
  392. */
  393. void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
  394. unsigned long addr)
  395. {
  396. bool readahead, vma_ra = swap_use_vma_readahead();
  397. /*
  398. * At the moment, we don't support PG_readahead for anon THP
  399. * so let's bail out rather than confusing the readahead stat.
  400. */
  401. if (unlikely(folio_test_large(folio)))
  402. return;
  403. readahead = folio_test_clear_readahead(folio);
  404. if (vma && vma_ra) {
  405. unsigned long ra_val;
  406. int win, hits;
  407. ra_val = GET_SWAP_RA_VAL(vma);
  408. win = SWAP_RA_WIN(ra_val);
  409. hits = SWAP_RA_HITS(ra_val);
  410. if (readahead)
  411. hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
  412. atomic_long_set(&vma->swap_readahead_info,
  413. SWAP_RA_VAL(addr, win, hits));
  414. }
  415. if (readahead) {
  416. count_vm_event(SWAP_RA_HIT);
  417. if (!vma || !vma_ra)
  418. atomic_inc(&swapin_readahead_hits);
  419. }
  420. }
  421. /**
  422. * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache.
  423. * @entry: swap entry to be bound to the folio.
  424. * @folio: folio to be added.
  425. * @gfp: memory allocation flags for charge, can be 0 if @charged if true.
  426. * @charged: if the folio is already charged.
  427. *
  428. * Update the swap_map and add folio as swap cache, typically before swapin.
  429. * All swap slots covered by the folio must have a non-zero swap count.
  430. *
  431. * Context: Caller must protect the swap device with reference count or locks.
  432. * Return: Returns the folio being added on success. Returns the existing folio
  433. * if @entry is already cached. Returns NULL if raced with swapin or swapoff.
  434. */
  435. static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
  436. struct folio *folio,
  437. gfp_t gfp, bool charged)
  438. {
  439. struct folio *swapcache = NULL;
  440. void *shadow;
  441. int ret;
  442. __folio_set_locked(folio);
  443. __folio_set_swapbacked(folio);
  444. if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry))
  445. goto failed;
  446. for (;;) {
  447. ret = swap_cache_add_folio(folio, entry, &shadow);
  448. if (!ret)
  449. break;
  450. /*
  451. * Large order allocation needs special handling on
  452. * race: if a smaller folio exists in cache, swapin needs
  453. * to fallback to order 0, and doing a swap cache lookup
  454. * might return a folio that is irrelevant to the faulting
  455. * entry because @entry is aligned down. Just return NULL.
  456. */
  457. if (ret != -EEXIST || folio_test_large(folio))
  458. goto failed;
  459. swapcache = swap_cache_get_folio(entry);
  460. if (swapcache)
  461. goto failed;
  462. }
  463. memcg1_swapin(entry, folio_nr_pages(folio));
  464. if (shadow)
  465. workingset_refault(folio, shadow);
  466. /* Caller will initiate read into locked folio */
  467. folio_add_lru(folio);
  468. return folio;
  469. failed:
  470. folio_unlock(folio);
  471. return swapcache;
  472. }
  473. /**
  474. * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache.
  475. * @entry: the swapped out swap entry to be binded to the folio.
  476. * @gfp_mask: memory allocation flags
  477. * @mpol: NUMA memory allocation policy to be applied
  478. * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  479. * @new_page_allocated: sets true if allocation happened, false otherwise
  480. *
  481. * Allocate a folio in the swap cache for one swap slot, typically before
  482. * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by
  483. * @entry must have a non-zero swap count (swapped out).
  484. * Currently only supports order 0.
  485. *
  486. * Context: Caller must protect the swap device with reference count or locks.
  487. * Return: Returns the existing folio if @entry is cached already. Returns
  488. * NULL if failed due to -ENOMEM or @entry have a swap count < 1.
  489. */
  490. struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
  491. struct mempolicy *mpol, pgoff_t ilx,
  492. bool *new_page_allocated)
  493. {
  494. struct swap_info_struct *si = __swap_entry_to_info(entry);
  495. struct folio *folio;
  496. struct folio *result = NULL;
  497. *new_page_allocated = false;
  498. /* Check the swap cache again for readahead path. */
  499. folio = swap_cache_get_folio(entry);
  500. if (folio)
  501. return folio;
  502. /* Skip allocation for unused and bad swap slot for readahead. */
  503. if (!swap_entry_swapped(si, entry))
  504. return NULL;
  505. /* Allocate a new folio to be added into the swap cache. */
  506. folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
  507. if (!folio)
  508. return NULL;
  509. /* Try add the new folio, returns existing folio or NULL on failure. */
  510. result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
  511. if (result == folio)
  512. *new_page_allocated = true;
  513. else
  514. folio_put(folio);
  515. return result;
  516. }
  517. /**
  518. * swapin_folio - swap-in one or multiple entries skipping readahead.
  519. * @entry: starting swap entry to swap in
  520. * @folio: a new allocated and charged folio
  521. *
  522. * Reads @entry into @folio, @folio will be added to the swap cache.
  523. * If @folio is a large folio, the @entry will be rounded down to align
  524. * with the folio size.
  525. *
  526. * Return: returns pointer to @folio on success. If folio is a large folio
  527. * and this raced with another swapin, NULL will be returned to allow fallback
  528. * to order 0. Else, if another folio was already added to the swap cache,
  529. * return that swap cache folio instead.
  530. */
  531. struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
  532. {
  533. struct folio *swapcache;
  534. pgoff_t offset = swp_offset(entry);
  535. unsigned long nr_pages = folio_nr_pages(folio);
  536. entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
  537. swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
  538. if (swapcache == folio)
  539. swap_read_folio(folio, NULL);
  540. return swapcache;
  541. }
  542. /*
  543. * Locate a page of swap in physical memory, reserving swap cache space
  544. * and reading the disk if it is not already cached.
  545. * A failure return means that either the page allocation failed or that
  546. * the swap entry is no longer in use.
  547. */
  548. struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
  549. struct vm_area_struct *vma, unsigned long addr,
  550. struct swap_iocb **plug)
  551. {
  552. struct swap_info_struct *si;
  553. bool page_allocated;
  554. struct mempolicy *mpol;
  555. pgoff_t ilx;
  556. struct folio *folio;
  557. si = get_swap_device(entry);
  558. if (!si)
  559. return NULL;
  560. mpol = get_vma_policy(vma, addr, 0, &ilx);
  561. folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
  562. &page_allocated);
  563. mpol_cond_put(mpol);
  564. if (page_allocated)
  565. swap_read_folio(folio, plug);
  566. put_swap_device(si);
  567. return folio;
  568. }
  /*
 * Work out how many pages the next swap readahead should cover.
 *
 * @prev_offset: offset (or page number) recorded by the previous call
 * @offset:      offset (or page number) being faulted now
 * @hits:        readahead hits seen since the window was last computed
 * @max_pages:   upper limit for the window
 * @prev_win:    window size used last time
 *
 * Returns the window size in pages. It is hits + 2 rounded up to a power
 * of two (at least 4), or, without any hits, 2 for an adjacent offset
 * and 1 otherwise. The result is capped at @max_pages and then raised
 * to at least half of @prev_win.
 */
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	unsigned int pages = hits + 2;
	unsigned int min_pages;

	if (pages != 2) {
		unsigned int pow2;

		/* Round up to the next power of two, starting from 4. */
		for (pow2 = 4; pow2 < pages; pow2 <<= 1)
			;
		pages = pow2;
	} else if (offset != prev_offset + 1 && offset != prev_offset - 1) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		pages = 1;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	min_pages = prev_win / 2;
	if (pages < min_pages)
		pages = min_pages;

	return pages;
}
  604. static unsigned long swapin_nr_pages(unsigned long offset)
  605. {
  606. static unsigned long prev_offset;
  607. unsigned int hits, pages, max_pages;
  608. static atomic_t last_readahead_pages;
  609. max_pages = 1 << READ_ONCE(page_cluster);
  610. if (max_pages <= 1)
  611. return 1;
  612. hits = atomic_xchg(&swapin_readahead_hits, 0);
  613. pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
  614. max_pages,
  615. atomic_read(&last_readahead_pages));
  616. if (!hits)
  617. WRITE_ONCE(prev_offset, offset);
  618. atomic_set(&last_readahead_pages, pages);
  619. return pages;
  620. }
  621. /**
  622. * swap_cluster_readahead - swap in pages in hope we need them soon
  623. * @entry: swap entry of this memory
  624. * @gfp_mask: memory allocation flags
  625. * @mpol: NUMA memory allocation policy to be applied
  626. * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  627. *
  628. * Returns the struct folio for entry and addr, after queueing swapin.
  629. *
  630. * Primitive swap readahead code. We simply read an aligned block of
  631. * (1 << page_cluster) entries in the swap area. This method is chosen
  632. * because it doesn't cost us any seek time. We also make sure to queue
  633. * the 'original' request together with the readahead ones...
  634. *
  635. * Note: it is intentional that the same NUMA policy and interleave index
  636. * are used for every page of the readahead: neighbouring pages on swap
  637. * are fairly likely to have been swapped out from the same node.
  638. */
  639. struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
  640. struct mempolicy *mpol, pgoff_t ilx)
  641. {
  642. struct folio *folio;
  643. unsigned long entry_offset = swp_offset(entry);
  644. unsigned long offset = entry_offset;
  645. unsigned long start_offset, end_offset;
  646. unsigned long mask;
  647. struct swap_info_struct *si = __swap_entry_to_info(entry);
  648. struct blk_plug plug;
  649. struct swap_iocb *splug = NULL;
  650. bool page_allocated;
  651. mask = swapin_nr_pages(offset) - 1;
  652. if (!mask)
  653. goto skip;
  654. /* Read a page_cluster sized and aligned cluster around offset. */
  655. start_offset = offset & ~mask;
  656. end_offset = offset | mask;
  657. if (!start_offset) /* First page is swap header. */
  658. start_offset++;
  659. if (end_offset >= si->max)
  660. end_offset = si->max - 1;
  661. blk_start_plug(&plug);
  662. for (offset = start_offset; offset <= end_offset ; offset++) {
  663. /* Ok, do the async read-ahead now */
  664. folio = swap_cache_alloc_folio(
  665. swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx,
  666. &page_allocated);
  667. if (!folio)
  668. continue;
  669. if (page_allocated) {
  670. swap_read_folio(folio, &splug);
  671. if (offset != entry_offset) {
  672. folio_set_readahead(folio);
  673. count_vm_event(SWAP_RA);
  674. }
  675. }
  676. folio_put(folio);
  677. }
  678. blk_finish_plug(&plug);
  679. swap_read_unplug(splug);
  680. lru_add_drain(); /* Push any new pages onto the LRU now */
  681. skip:
  682. /* The page was likely read above, so no need for plugging here */
  683. folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
  684. &page_allocated);
  685. if (unlikely(page_allocated))
  686. swap_read_folio(folio, NULL);
  687. return folio;
  688. }
  689. static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
  690. unsigned long *end)
  691. {
  692. struct vm_area_struct *vma = vmf->vma;
  693. unsigned long ra_val;
  694. unsigned long faddr, prev_faddr, left, right;
  695. unsigned int max_win, hits, prev_win, win;
  696. max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING);
  697. if (max_win == 1)
  698. return 1;
  699. faddr = vmf->address;
  700. ra_val = GET_SWAP_RA_VAL(vma);
  701. prev_faddr = SWAP_RA_ADDR(ra_val);
  702. prev_win = SWAP_RA_WIN(ra_val);
  703. hits = SWAP_RA_HITS(ra_val);
  704. win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits,
  705. max_win, prev_win);
  706. atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0));
  707. if (win == 1)
  708. return 1;
  709. if (faddr == prev_faddr + PAGE_SIZE)
  710. left = faddr;
  711. else if (prev_faddr == faddr + PAGE_SIZE)
  712. left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE;
  713. else
  714. left = faddr - (((win - 1) / 2) << PAGE_SHIFT);
  715. right = left + (win << PAGE_SHIFT);
  716. if ((long)left < 0)
  717. left = 0;
  718. *start = max3(left, vma->vm_start, faddr & PMD_MASK);
  719. *end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE);
  720. return win;
  721. }
  722. /**
  723. * swap_vma_readahead - swap in pages in hope we need them soon
  724. * @targ_entry: swap entry of the targeted memory
  725. * @gfp_mask: memory allocation flags
  726. * @mpol: NUMA memory allocation policy to be applied
  727. * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  728. * @vmf: fault information
  729. *
  730. * Returns the struct folio for entry and addr, after queueing swapin.
  731. *
  732. * Primitive swap readahead code. We simply read in a few pages whose
  733. * virtual addresses are around the fault address in the same vma.
  734. *
  735. * Caller must hold read mmap_lock if vmf->vma is not NULL.
  736. *
  737. */
  738. static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
  739. struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
  740. {
  741. struct blk_plug plug;
  742. struct swap_iocb *splug = NULL;
  743. struct folio *folio;
  744. pte_t *pte = NULL, pentry;
  745. int win;
  746. unsigned long start, end, addr;
  747. pgoff_t ilx;
  748. bool page_allocated;
  749. win = swap_vma_ra_win(vmf, &start, &end);
  750. if (win == 1)
  751. goto skip;
  752. ilx = targ_ilx - PFN_DOWN(vmf->address - start);
  753. blk_start_plug(&plug);
  754. for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
  755. struct swap_info_struct *si = NULL;
  756. softleaf_t entry;
  757. if (!pte++) {
  758. pte = pte_offset_map(vmf->pmd, addr);
  759. if (!pte)
  760. break;
  761. }
  762. pentry = ptep_get_lockless(pte);
  763. entry = softleaf_from_pte(pentry);
  764. if (!softleaf_is_swap(entry))
  765. continue;
  766. pte_unmap(pte);
  767. pte = NULL;
  768. /*
  769. * Readahead entry may come from a device that we are not
  770. * holding a reference to, try to grab a reference, or skip.
  771. */
  772. if (swp_type(entry) != swp_type(targ_entry)) {
  773. si = get_swap_device(entry);
  774. if (!si)
  775. continue;
  776. }
  777. folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
  778. &page_allocated);
  779. if (si)
  780. put_swap_device(si);
  781. if (!folio)
  782. continue;
  783. if (page_allocated) {
  784. swap_read_folio(folio, &splug);
  785. if (addr != vmf->address) {
  786. folio_set_readahead(folio);
  787. count_vm_event(SWAP_RA);
  788. }
  789. }
  790. folio_put(folio);
  791. }
  792. if (pte)
  793. pte_unmap(pte);
  794. blk_finish_plug(&plug);
  795. swap_read_unplug(splug);
  796. lru_add_drain();
  797. skip:
  798. /* The folio was likely read above, so no need for plugging here */
  799. folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx,
  800. &page_allocated);
  801. if (unlikely(page_allocated))
  802. swap_read_folio(folio, NULL);
  803. return folio;
  804. }
  805. /**
  806. * swapin_readahead - swap in pages in hope we need them soon
  807. * @entry: swap entry of this memory
  808. * @gfp_mask: memory allocation flags
  809. * @vmf: fault information
  810. *
  811. * Returns the struct folio for entry and addr, after queueing swapin.
  812. *
  813. * It's a main entry function for swap readahead. By the configuration,
  814. * it will read ahead blocks by cluster-based(ie, physical disk based)
  815. * or vma-based(ie, virtual address based on faulty address) readahead.
  816. */
  817. struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
  818. struct vm_fault *vmf)
  819. {
  820. struct mempolicy *mpol;
  821. pgoff_t ilx;
  822. struct folio *folio;
  823. mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
  824. folio = swap_use_vma_readahead() ?
  825. swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
  826. swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
  827. mpol_cond_put(mpol);
  828. return folio;
  829. }
  830. #ifdef CONFIG_SYSFS
  831. static ssize_t vma_ra_enabled_show(struct kobject *kobj,
  832. struct kobj_attribute *attr, char *buf)
  833. {
  834. return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead));
  835. }
  836. static ssize_t vma_ra_enabled_store(struct kobject *kobj,
  837. struct kobj_attribute *attr,
  838. const char *buf, size_t count)
  839. {
  840. ssize_t ret;
  841. ret = kstrtobool(buf, &enable_vma_readahead);
  842. if (ret)
  843. return ret;
  844. return count;
  845. }
  846. static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
  847. static struct attribute *swap_attrs[] = {
  848. &vma_ra_enabled_attr.attr,
  849. NULL,
  850. };
  851. static const struct attribute_group swap_attr_group = {
  852. .attrs = swap_attrs,
  853. };
  854. static int __init swap_init(void)
  855. {
  856. int err;
  857. struct kobject *swap_kobj;
  858. swap_kobj = kobject_create_and_add("swap", mm_kobj);
  859. if (!swap_kobj) {
  860. pr_err("failed to create swap kobject\n");
  861. return -ENOMEM;
  862. }
  863. err = sysfs_create_group(swap_kobj, &swap_attr_group);
  864. if (err) {
  865. pr_err("failed to register swap group\n");
  866. goto delete_obj;
  867. }
  868. /* Swap cache writeback is LRU based, no tags for it */
  869. mapping_set_no_writeback_tags(&swap_space);
  870. return 0;
  871. delete_obj:
  872. kobject_put(swap_kobj);
  873. return err;
  874. }
  875. subsys_initcall(swap_init);
  876. #endif