memfd_luo.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2025, Google LLC.
  4. * Pasha Tatashin <pasha.tatashin@soleen.com>
  5. *
  6. * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
  7. * Pratyush Yadav <ptyadav@amazon.de>
  8. */
  9. /**
  10. * DOC: Memfd Preservation via LUO
  11. *
  12. * Overview
  13. * ========
  14. *
  15. * Memory file descriptors (memfd) can be preserved over a kexec using the Live
  16. * Update Orchestrator (LUO) file preservation. This allows userspace to
  17. * transfer its memory contents to the next kernel after a kexec.
  18. *
  19. * The preservation is not intended to be transparent. Only select properties of
  20. * the file are preserved. All others are reset to default. The preserved
  21. * properties are described below.
  22. *
  23. * .. note::
  24. * The LUO API is not stabilized yet, so the preserved properties of a memfd
  25. * are also not stable and are subject to backwards incompatible changes.
  26. *
  27. * .. note::
  28. * Currently a memfd backed by Hugetlb is not supported. Memfds created
  29. * with ``MFD_HUGETLB`` will be rejected.
  30. *
  31. * Preserved Properties
  32. * ====================
  33. *
  34. * The following properties of the memfd are preserved across kexec:
  35. *
  36. * File Contents
  37. * All data stored in the file is preserved.
  38. *
  39. * File Size
  40. * The size of the file is preserved. Holes in the file are filled by
  41. * allocating pages for them during preservation.
  42. *
  43. * File Position
  44. * The current file position is preserved, allowing applications to continue
  45. * reading/writing from their last position.
  46. *
  47. * File Status Flags
  48. * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
  49. * is maintained.
  50. *
  51. * Non-Preserved Properties
  52. * ========================
  53. *
  54. * All properties which are not preserved must be assumed to be reset to
  55. * default. This section describes some of those properties which may be more of
  56. * note.
  57. *
  58. * ``FD_CLOEXEC`` flag
  59. * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
  60. * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
  61. * again after restore via ``fcntl()``.
  62. *
  63. * Seals
  64. * File seals are not preserved. The file is unsealed on restore and if
  65. * needed, must be sealed again via ``fcntl()``.
  66. */
  67. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  68. #include <linux/bits.h>
  69. #include <linux/err.h>
  70. #include <linux/file.h>
  71. #include <linux/io.h>
  72. #include <linux/kexec_handover.h>
  73. #include <linux/kho/abi/memfd.h>
  74. #include <linux/liveupdate.h>
  75. #include <linux/shmem_fs.h>
  76. #include <linux/vmalloc.h>
  77. #include <linux/memfd.h>
  78. #include "internal.h"
  79. static int memfd_luo_preserve_folios(struct file *file,
  80. struct kho_vmalloc *kho_vmalloc,
  81. struct memfd_luo_folio_ser **out_folios_ser,
  82. u64 *nr_foliosp)
  83. {
  84. struct inode *inode = file_inode(file);
  85. struct memfd_luo_folio_ser *folios_ser;
  86. unsigned int max_folios;
  87. long i, size, nr_pinned;
  88. struct folio **folios;
  89. int err = -EINVAL;
  90. pgoff_t offset;
  91. u64 nr_folios;
  92. size = i_size_read(inode);
  93. /*
  94. * If the file has zero size, then the folios and nr_folios properties
  95. * are not set.
  96. */
  97. if (!size) {
  98. *nr_foliosp = 0;
  99. *out_folios_ser = NULL;
  100. memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
  101. return 0;
  102. }
  103. /*
  104. * Guess the number of folios based on inode size. Real number might end
  105. * up being smaller if there are higher order folios.
  106. */
  107. max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
  108. folios = kvmalloc_objs(*folios, max_folios);
  109. if (!folios)
  110. return -ENOMEM;
  111. /*
  112. * Pin the folios so they don't move around behind our back. This also
  113. * ensures none of the folios are in CMA -- which ensures they don't
  114. * fall in KHO scratch memory. It also moves swapped out folios back to
  115. * memory.
  116. *
  117. * A side effect of doing this is that it allocates a folio for all
  118. * indices in the file. This might waste memory on sparse memfds. If
  119. * that is really a problem in the future, we can have a
  120. * memfd_pin_folios() variant that does not allocate a page on empty
  121. * slots.
  122. */
  123. nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
  124. &offset);
  125. if (nr_pinned < 0) {
  126. err = nr_pinned;
  127. pr_err("failed to pin folios: %d\n", err);
  128. goto err_free_folios;
  129. }
  130. nr_folios = nr_pinned;
  131. folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
  132. if (!folios_ser) {
  133. err = -ENOMEM;
  134. goto err_unpin;
  135. }
  136. for (i = 0; i < nr_folios; i++) {
  137. struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
  138. struct folio *folio = folios[i];
  139. err = kho_preserve_folio(folio);
  140. if (err)
  141. goto err_unpreserve;
  142. folio_lock(folio);
  143. /*
  144. * A dirty folio is one which has been written to. A clean folio
  145. * is its opposite. Since a clean folio does not carry user
  146. * data, it can be freed by page reclaim under memory pressure.
  147. *
  148. * Saving the dirty flag at prepare() time doesn't work since it
  149. * can change later. Saving it at freeze() also won't work
  150. * because the dirty bit is normally synced at unmap and there
  151. * might still be a mapping of the file at freeze().
  152. *
  153. * To see why this is a problem, say a folio is clean at
  154. * preserve, but gets dirtied later. The pfolio flags will mark
  155. * it as clean. After retrieve, the next kernel might try to
  156. * reclaim this folio under memory pressure, losing user data.
  157. *
  158. * Unconditionally mark it dirty to avoid this problem. This
  159. * comes at the cost of making clean folios un-reclaimable after
  160. * live update.
  161. */
  162. folio_mark_dirty(folio);
  163. /*
  164. * If the folio is not uptodate, it was fallocated but never
  165. * used. Saving this flag at prepare() doesn't work since it
  166. * might change later when someone uses the folio.
  167. *
  168. * Since we have taken the performance penalty of allocating,
  169. * zeroing, and pinning all the folios in the holes, take a bit
  170. * more and zero all non-uptodate folios too.
  171. *
  172. * NOTE: For someone looking to improve preserve performance,
  173. * this is a good place to look.
  174. */
  175. if (!folio_test_uptodate(folio)) {
  176. folio_zero_range(folio, 0, folio_size(folio));
  177. flush_dcache_folio(folio);
  178. folio_mark_uptodate(folio);
  179. }
  180. folio_unlock(folio);
  181. pfolio->pfn = folio_pfn(folio);
  182. pfolio->flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE;
  183. pfolio->index = folio->index;
  184. }
  185. err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
  186. if (err)
  187. goto err_unpreserve;
  188. kvfree(folios);
  189. *nr_foliosp = nr_folios;
  190. *out_folios_ser = folios_ser;
  191. /*
  192. * Note: folios_ser is purposely not freed here. It is preserved
  193. * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
  194. * that is passed via private_data.
  195. */
  196. return 0;
  197. err_unpreserve:
  198. for (i = i - 1; i >= 0; i--)
  199. kho_unpreserve_folio(folios[i]);
  200. vfree(folios_ser);
  201. err_unpin:
  202. unpin_folios(folios, nr_folios);
  203. err_free_folios:
  204. kvfree(folios);
  205. return err;
  206. }
  207. static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
  208. struct memfd_luo_folio_ser *folios_ser,
  209. u64 nr_folios)
  210. {
  211. long i;
  212. if (!nr_folios)
  213. return;
  214. kho_unpreserve_vmalloc(kho_vmalloc);
  215. for (i = 0; i < nr_folios; i++) {
  216. const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
  217. struct folio *folio;
  218. if (!pfolio->pfn)
  219. continue;
  220. folio = pfn_folio(pfolio->pfn);
  221. kho_unpreserve_folio(folio);
  222. unpin_folio(folio);
  223. }
  224. vfree(folios_ser);
  225. }
  226. static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
  227. {
  228. struct inode *inode = file_inode(args->file);
  229. struct memfd_luo_folio_ser *folios_ser;
  230. struct memfd_luo_ser *ser;
  231. u64 nr_folios;
  232. int err = 0;
  233. inode_lock(inode);
  234. shmem_freeze(inode, true);
  235. /* Allocate the main serialization structure in preserved memory */
  236. ser = kho_alloc_preserve(sizeof(*ser));
  237. if (IS_ERR(ser)) {
  238. err = PTR_ERR(ser);
  239. goto err_unlock;
  240. }
  241. ser->pos = args->file->f_pos;
  242. ser->size = i_size_read(inode);
  243. err = memfd_luo_preserve_folios(args->file, &ser->folios,
  244. &folios_ser, &nr_folios);
  245. if (err)
  246. goto err_free_ser;
  247. ser->nr_folios = nr_folios;
  248. inode_unlock(inode);
  249. args->private_data = folios_ser;
  250. args->serialized_data = virt_to_phys(ser);
  251. return 0;
  252. err_free_ser:
  253. kho_unpreserve_free(ser);
  254. err_unlock:
  255. shmem_freeze(inode, false);
  256. inode_unlock(inode);
  257. return err;
  258. }
  259. static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
  260. {
  261. struct memfd_luo_ser *ser;
  262. if (WARN_ON_ONCE(!args->serialized_data))
  263. return -EINVAL;
  264. ser = phys_to_virt(args->serialized_data);
  265. /*
  266. * The pos might have changed since prepare. Everything else stays the
  267. * same.
  268. */
  269. ser->pos = args->file->f_pos;
  270. return 0;
  271. }
  272. static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
  273. {
  274. struct inode *inode = file_inode(args->file);
  275. struct memfd_luo_ser *ser;
  276. if (WARN_ON_ONCE(!args->serialized_data))
  277. return;
  278. inode_lock(inode);
  279. shmem_freeze(inode, false);
  280. ser = phys_to_virt(args->serialized_data);
  281. memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
  282. ser->nr_folios);
  283. kho_unpreserve_free(ser);
  284. inode_unlock(inode);
  285. }
  286. static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
  287. u64 nr_folios)
  288. {
  289. u64 i;
  290. for (i = 0; i < nr_folios; i++) {
  291. const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
  292. struct folio *folio;
  293. phys_addr_t phys;
  294. if (!pfolio->pfn)
  295. continue;
  296. phys = PFN_PHYS(pfolio->pfn);
  297. folio = kho_restore_folio(phys);
  298. if (!folio) {
  299. pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
  300. phys);
  301. continue;
  302. }
  303. folio_put(folio);
  304. }
  305. }
  306. static void memfd_luo_finish(struct liveupdate_file_op_args *args)
  307. {
  308. struct memfd_luo_folio_ser *folios_ser;
  309. struct memfd_luo_ser *ser;
  310. /*
  311. * If retrieve was successful, nothing to do. If it failed, retrieve()
  312. * already cleaned up everything it could. So nothing to do there
  313. * either. Only need to clean up when retrieve was not called.
  314. */
  315. if (args->retrieve_status)
  316. return;
  317. ser = phys_to_virt(args->serialized_data);
  318. if (!ser)
  319. return;
  320. if (ser->nr_folios) {
  321. folios_ser = kho_restore_vmalloc(&ser->folios);
  322. if (!folios_ser)
  323. goto out;
  324. memfd_luo_discard_folios(folios_ser, ser->nr_folios);
  325. vfree(folios_ser);
  326. }
  327. out:
  328. kho_restore_free(ser);
  329. }
  330. static int memfd_luo_retrieve_folios(struct file *file,
  331. struct memfd_luo_folio_ser *folios_ser,
  332. u64 nr_folios)
  333. {
  334. struct inode *inode = file_inode(file);
  335. struct address_space *mapping = inode->i_mapping;
  336. struct folio *folio;
  337. int err = -EIO;
  338. long i;
  339. for (i = 0; i < nr_folios; i++) {
  340. const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
  341. phys_addr_t phys;
  342. u64 index;
  343. int flags;
  344. if (!pfolio->pfn)
  345. continue;
  346. phys = PFN_PHYS(pfolio->pfn);
  347. folio = kho_restore_folio(phys);
  348. if (!folio) {
  349. pr_err("Unable to restore folio at physical address: %llx\n",
  350. phys);
  351. goto put_folios;
  352. }
  353. index = pfolio->index;
  354. flags = pfolio->flags;
  355. /* Set up the folio for insertion. */
  356. __folio_set_locked(folio);
  357. __folio_set_swapbacked(folio);
  358. err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
  359. if (err) {
  360. pr_err("shmem: failed to charge folio index %ld: %d\n",
  361. i, err);
  362. goto unlock_folio;
  363. }
  364. err = shmem_add_to_page_cache(folio, mapping, index, NULL,
  365. mapping_gfp_mask(mapping));
  366. if (err) {
  367. pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
  368. i, err);
  369. goto unlock_folio;
  370. }
  371. if (flags & MEMFD_LUO_FOLIO_UPTODATE)
  372. folio_mark_uptodate(folio);
  373. if (flags & MEMFD_LUO_FOLIO_DIRTY)
  374. folio_mark_dirty(folio);
  375. err = shmem_inode_acct_blocks(inode, 1);
  376. if (err) {
  377. pr_err("shmem: failed to account folio index %ld: %d\n",
  378. i, err);
  379. goto unlock_folio;
  380. }
  381. shmem_recalc_inode(inode, 1, 0);
  382. folio_add_lru(folio);
  383. folio_unlock(folio);
  384. folio_put(folio);
  385. }
  386. return 0;
  387. unlock_folio:
  388. folio_unlock(folio);
  389. folio_put(folio);
  390. put_folios:
  391. /*
  392. * Note: don't free the folios already added to the file. They will be
  393. * freed when the file is freed. Free the ones not added yet here.
  394. */
  395. for (long j = i + 1; j < nr_folios; j++) {
  396. const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
  397. folio = kho_restore_folio(pfolio->pfn);
  398. if (folio)
  399. folio_put(folio);
  400. }
  401. return err;
  402. }
  403. static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
  404. {
  405. struct memfd_luo_folio_ser *folios_ser;
  406. struct memfd_luo_ser *ser;
  407. struct file *file;
  408. int err;
  409. ser = phys_to_virt(args->serialized_data);
  410. if (!ser)
  411. return -EINVAL;
  412. file = memfd_alloc_file("", 0);
  413. if (IS_ERR(file)) {
  414. pr_err("failed to setup file: %pe\n", file);
  415. err = PTR_ERR(file);
  416. goto free_ser;
  417. }
  418. vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
  419. file->f_inode->i_size = ser->size;
  420. if (ser->nr_folios) {
  421. folios_ser = kho_restore_vmalloc(&ser->folios);
  422. if (!folios_ser) {
  423. err = -EINVAL;
  424. goto put_file;
  425. }
  426. err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
  427. vfree(folios_ser);
  428. if (err)
  429. goto put_file;
  430. }
  431. args->file = file;
  432. kho_restore_free(ser);
  433. return 0;
  434. put_file:
  435. fput(file);
  436. free_ser:
  437. kho_restore_free(ser);
  438. return err;
  439. }
  440. static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
  441. struct file *file)
  442. {
  443. struct inode *inode = file_inode(file);
  444. return shmem_file(file) && !inode->i_nlink;
  445. }
  446. static const struct liveupdate_file_ops memfd_luo_file_ops = {
  447. .freeze = memfd_luo_freeze,
  448. .finish = memfd_luo_finish,
  449. .retrieve = memfd_luo_retrieve,
  450. .preserve = memfd_luo_preserve,
  451. .unpreserve = memfd_luo_unpreserve,
  452. .can_preserve = memfd_luo_can_preserve,
  453. .owner = THIS_MODULE,
  454. };
  455. static struct liveupdate_file_handler memfd_luo_handler = {
  456. .ops = &memfd_luo_file_ops,
  457. .compatible = MEMFD_LUO_FH_COMPATIBLE,
  458. };
  459. static int __init memfd_luo_init(void)
  460. {
  461. int err = liveupdate_register_file_handler(&memfd_luo_handler);
  462. if (err && err != -EOPNOTSUPP) {
  463. pr_err("Could not register luo filesystem handler: %pe\n",
  464. ERR_PTR(err));
  465. return err;
  466. }
  467. return 0;
  468. }
  469. late_initcall(memfd_luo_init);