guest_memfd.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/anon_inodes.h>
  3. #include <linux/backing-dev.h>
  4. #include <linux/falloc.h>
  5. #include <linux/fs.h>
  6. #include <linux/kvm_host.h>
  7. #include <linux/mempolicy.h>
  8. #include <linux/pseudo_fs.h>
  9. #include <linux/pagemap.h>
  10. #include "kvm_mm.h"
  /* Kernel-internal mount of the guest_memfd pseudo filesystem. */
  static struct vfsmount *kvm_gmem_mnt;
  12. /*
  13. * A guest_memfd instance can be associated multiple VMs, each with its own
  14. * "view" of the underlying physical memory.
  15. *
  16. * The gmem's inode is effectively the raw underlying physical storage, and is
  17. * used to track properties of the physical memory, while each gmem file is
  18. * effectively a single VM's view of that storage, and is used to track assets
  19. * specific to its associated VM, e.g. memslots=>gmem bindings.
  20. */
  21. struct gmem_file {
  22. struct kvm *kvm;
  23. struct xarray bindings;
  24. struct list_head entry;
  25. };
  26. struct gmem_inode {
  27. struct shared_policy policy;
  28. struct inode vfs_inode;
  29. u64 flags;
  30. };
  31. static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
  32. {
  33. return container_of(inode, struct gmem_inode, vfs_inode);
  34. }
  /* Walk @f over every gmem_file linked on @mapping's i_private_list. */
  #define kvm_gmem_for_each_file(f, mapping)				\
  	list_for_each_entry(f, &(mapping)->i_private_list, entry)
  37. /**
  38. * folio_file_pfn - like folio_file_page, but return a pfn.
  39. * @folio: The folio which contains this index.
  40. * @index: The index we want to look up.
  41. *
  42. * Return: The pfn for this index.
  43. */
  44. static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
  45. {
  46. return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
  47. }
  48. static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
  49. {
  50. return gfn - slot->base_gfn + slot->gmem.pgoff;
  51. }
  52. static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
  53. pgoff_t index, struct folio *folio)
  54. {
  55. #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
  56. kvm_pfn_t pfn = folio_file_pfn(folio, index);
  57. gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
  58. int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
  59. if (rc) {
  60. pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
  61. index, gfn, pfn, rc);
  62. return rc;
  63. }
  64. #endif
  65. return 0;
  66. }
  67. /*
  68. * Process @folio, which contains @gfn, so that the guest can use it.
  69. * The folio must be locked and the gfn must be contained in @slot.
  70. * On successful return the guest sees a zero page so as to avoid
  71. * leaking host data and the up-to-date flag is set.
  72. */
  73. static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
  74. gfn_t gfn, struct folio *folio)
  75. {
  76. pgoff_t index;
  77. /*
  78. * Preparing huge folios should always be safe, since it should
  79. * be possible to split them later if needed.
  80. *
  81. * Right now the folio order is always going to be zero, but the
  82. * code is ready for huge folios. The only assumption is that
  83. * the base pgoff of memslots is naturally aligned with the
  84. * requested page order, ensuring that huge folios can also use
  85. * huge page table entries for GPA->HPA mapping.
  86. *
  87. * The order will be passed when creating the guest_memfd, and
  88. * checked when creating memslots.
  89. */
  90. WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
  91. index = kvm_gmem_get_index(slot, gfn);
  92. index = ALIGN_DOWN(index, folio_nr_pages(folio));
  93. return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
  94. }
  95. /*
  96. * Returns a locked folio on success. The caller is responsible for
  97. * setting the up-to-date flag before the memory is mapped into the guest.
  98. * There is no backing storage for the memory, so the folio will remain
  99. * up-to-date until it's removed.
  100. *
  101. * Ignore accessed, referenced, and dirty flags. The memory is
  102. * unevictable and there is no storage to write back to.
  103. */
  104. static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
  105. {
  106. /* TODO: Support huge pages. */
  107. struct mempolicy *policy;
  108. struct folio *folio;
  109. /*
  110. * Fast-path: See if folio is already present in mapping to avoid
  111. * policy_lookup.
  112. */
  113. folio = __filemap_get_folio(inode->i_mapping, index,
  114. FGP_LOCK | FGP_ACCESSED, 0);
  115. if (!IS_ERR(folio))
  116. return folio;
  117. policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
  118. folio = __filemap_get_folio_mpol(inode->i_mapping, index,
  119. FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
  120. mapping_gfp_mask(inode->i_mapping), policy);
  121. mpol_cond_put(policy);
  122. /*
  123. * External interfaces like kvm_gmem_get_pfn() support dealing
  124. * with hugepages to a degree, but internally, guest_memfd currently
  125. * assumes that all folios are order-0 and handling would need
  126. * to be updated for anything otherwise (e.g. page-clearing
  127. * operations).
  128. */
  129. WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));
  130. return folio;
  131. }
  132. static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
  133. {
  134. if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
  135. return KVM_FILTER_SHARED;
  136. return KVM_FILTER_PRIVATE;
  137. }
  138. static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
  139. pgoff_t end,
  140. enum kvm_gfn_range_filter attr_filter)
  141. {
  142. bool flush = false, found_memslot = false;
  143. struct kvm_memory_slot *slot;
  144. struct kvm *kvm = f->kvm;
  145. unsigned long index;
  146. xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
  147. pgoff_t pgoff = slot->gmem.pgoff;
  148. struct kvm_gfn_range gfn_range = {
  149. .start = slot->base_gfn + max(pgoff, start) - pgoff,
  150. .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
  151. .slot = slot,
  152. .may_block = true,
  153. .attr_filter = attr_filter,
  154. };
  155. if (!found_memslot) {
  156. found_memslot = true;
  157. KVM_MMU_LOCK(kvm);
  158. kvm_mmu_invalidate_begin(kvm);
  159. }
  160. flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
  161. }
  162. if (flush)
  163. kvm_flush_remote_tlbs(kvm);
  164. if (found_memslot)
  165. KVM_MMU_UNLOCK(kvm);
  166. }
  167. static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
  168. pgoff_t end)
  169. {
  170. enum kvm_gfn_range_filter attr_filter;
  171. struct gmem_file *f;
  172. attr_filter = kvm_gmem_get_invalidate_filter(inode);
  173. kvm_gmem_for_each_file(f, inode->i_mapping)
  174. __kvm_gmem_invalidate_begin(f, start, end, attr_filter);
  175. }
  176. static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
  177. pgoff_t end)
  178. {
  179. struct kvm *kvm = f->kvm;
  180. if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
  181. KVM_MMU_LOCK(kvm);
  182. kvm_mmu_invalidate_end(kvm);
  183. KVM_MMU_UNLOCK(kvm);
  184. }
  185. }
  186. static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
  187. pgoff_t end)
  188. {
  189. struct gmem_file *f;
  190. kvm_gmem_for_each_file(f, inode->i_mapping)
  191. __kvm_gmem_invalidate_end(f, start, end);
  192. }
  193. static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
  194. {
  195. pgoff_t start = offset >> PAGE_SHIFT;
  196. pgoff_t end = (offset + len) >> PAGE_SHIFT;
  197. /*
  198. * Bindings must be stable across invalidation to ensure the start+end
  199. * are balanced.
  200. */
  201. filemap_invalidate_lock(inode->i_mapping);
  202. kvm_gmem_invalidate_begin(inode, start, end);
  203. truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
  204. kvm_gmem_invalidate_end(inode, start, end);
  205. filemap_invalidate_unlock(inode->i_mapping);
  206. return 0;
  207. }
  208. static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
  209. {
  210. struct address_space *mapping = inode->i_mapping;
  211. pgoff_t start, index, end;
  212. int r;
  213. /* Dedicated guest is immutable by default. */
  214. if (offset + len > i_size_read(inode))
  215. return -EINVAL;
  216. filemap_invalidate_lock_shared(mapping);
  217. start = offset >> PAGE_SHIFT;
  218. end = (offset + len) >> PAGE_SHIFT;
  219. r = 0;
  220. for (index = start; index < end; ) {
  221. struct folio *folio;
  222. if (signal_pending(current)) {
  223. r = -EINTR;
  224. break;
  225. }
  226. folio = kvm_gmem_get_folio(inode, index);
  227. if (IS_ERR(folio)) {
  228. r = PTR_ERR(folio);
  229. break;
  230. }
  231. index = folio_next_index(folio);
  232. folio_unlock(folio);
  233. folio_put(folio);
  234. /* 64-bit only, wrapping the index should be impossible. */
  235. if (WARN_ON_ONCE(!index))
  236. break;
  237. cond_resched();
  238. }
  239. filemap_invalidate_unlock_shared(mapping);
  240. return r;
  241. }
  242. static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
  243. loff_t len)
  244. {
  245. int ret;
  246. if (!(mode & FALLOC_FL_KEEP_SIZE))
  247. return -EOPNOTSUPP;
  248. if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
  249. return -EOPNOTSUPP;
  250. if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
  251. return -EINVAL;
  252. if (mode & FALLOC_FL_PUNCH_HOLE)
  253. ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
  254. else
  255. ret = kvm_gmem_allocate(file_inode(file), offset, len);
  256. if (!ret)
  257. file_modified(file);
  258. return ret;
  259. }
  260. static int kvm_gmem_release(struct inode *inode, struct file *file)
  261. {
  262. struct gmem_file *f = file->private_data;
  263. struct kvm_memory_slot *slot;
  264. struct kvm *kvm = f->kvm;
  265. unsigned long index;
  266. /*
  267. * Prevent concurrent attempts to *unbind* a memslot. This is the last
  268. * reference to the file and thus no new bindings can be created, but
  269. * dereferencing the slot for existing bindings needs to be protected
  270. * against memslot updates, specifically so that unbind doesn't race
  271. * and free the memslot (kvm_gmem_get_file() will return NULL).
  272. *
  273. * Since .release is called only when the reference count is zero,
  274. * after which file_ref_get() and get_file_active() fail,
  275. * kvm_gmem_get_pfn() cannot be using the file concurrently.
  276. * file_ref_put() provides a full barrier, and get_file_active() the
  277. * matching acquire barrier.
  278. */
  279. mutex_lock(&kvm->slots_lock);
  280. filemap_invalidate_lock(inode->i_mapping);
  281. xa_for_each(&f->bindings, index, slot)
  282. WRITE_ONCE(slot->gmem.file, NULL);
  283. /*
  284. * All in-flight operations are gone and new bindings can be created.
  285. * Zap all SPTEs pointed at by this file. Do not free the backing
  286. * memory, as its lifetime is associated with the inode, not the file.
  287. */
  288. __kvm_gmem_invalidate_begin(f, 0, -1ul,
  289. kvm_gmem_get_invalidate_filter(inode));
  290. __kvm_gmem_invalidate_end(f, 0, -1ul);
  291. list_del(&f->entry);
  292. filemap_invalidate_unlock(inode->i_mapping);
  293. mutex_unlock(&kvm->slots_lock);
  294. xa_destroy(&f->bindings);
  295. kfree(f);
  296. kvm_put_kvm(kvm);
  297. return 0;
  298. }
  299. static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
  300. {
  301. /*
  302. * Do not return slot->gmem.file if it has already been closed;
  303. * there might be some time between the last fput() and when
  304. * kvm_gmem_release() clears slot->gmem.file.
  305. */
  306. return get_file_active(&slot->gmem.file);
  307. }
  308. DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
  309. kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);
  310. static bool kvm_gmem_supports_mmap(struct inode *inode)
  311. {
  312. return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
  313. }
  314. static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
  315. {
  316. struct inode *inode = file_inode(vmf->vma->vm_file);
  317. struct folio *folio;
  318. vm_fault_t ret = VM_FAULT_LOCKED;
  319. if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
  320. return VM_FAULT_SIGBUS;
  321. if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
  322. return VM_FAULT_SIGBUS;
  323. folio = kvm_gmem_get_folio(inode, vmf->pgoff);
  324. if (IS_ERR(folio)) {
  325. if (PTR_ERR(folio) == -EAGAIN)
  326. return VM_FAULT_RETRY;
  327. return vmf_error(PTR_ERR(folio));
  328. }
  329. if (WARN_ON_ONCE(folio_test_large(folio))) {
  330. ret = VM_FAULT_SIGBUS;
  331. goto out_folio;
  332. }
  333. if (!folio_test_uptodate(folio)) {
  334. clear_highpage(folio_page(folio, 0));
  335. folio_mark_uptodate(folio);
  336. }
  337. vmf->page = folio_file_page(folio, vmf->pgoff);
  338. out_folio:
  339. if (ret != VM_FAULT_LOCKED) {
  340. folio_unlock(folio);
  341. folio_put(folio);
  342. }
  343. return ret;
  344. }
  #ifdef CONFIG_NUMA
  /* ->set_policy(): record @mpol in the inode's shared policy for @vma's range. */
  static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
  {
  	struct gmem_inode *gi = GMEM_I(file_inode(vma->vm_file));

  	return mpol_set_shared_policy(&gi->policy, vma, mpol);
  }

  /*
   * ->get_policy(): store the file page offset of @addr in *@pgoff and return
   * the shared policy recorded for that offset.
   */
  static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
  					     unsigned long addr, pgoff_t *pgoff)
  {
  	struct gmem_inode *gi = GMEM_I(file_inode(vma->vm_file));
  	pgoff_t pages_into_vma = (addr - vma->vm_start) >> PAGE_SHIFT;

  	*pgoff = vma->vm_pgoff + pages_into_vma;

  	/*
  	 * Return the memory policy for this index, or NULL if none is set.
  	 *
  	 * Returning NULL, e.g. instead of the current task's memory policy, is
  	 * important for the .get_policy kernel ABI: it indicates that no
  	 * explicit policy has been set via mbind() for this memory.  The caller
  	 * can then replace NULL with the default memory policy instead of the
  	 * current task's memory policy.
  	 */
  	return mpol_shared_policy_lookup(&gi->policy, *pgoff);
  }
  #endif /* CONFIG_NUMA */
  368. static const struct vm_operations_struct kvm_gmem_vm_ops = {
  369. .fault = kvm_gmem_fault_user_mapping,
  370. #ifdef CONFIG_NUMA
  371. .get_policy = kvm_gmem_get_policy,
  372. .set_policy = kvm_gmem_set_policy,
  373. #endif
  374. };
  375. static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
  376. {
  377. if (!kvm_gmem_supports_mmap(file_inode(file)))
  378. return -ENODEV;
  379. if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
  380. (VM_SHARED | VM_MAYSHARE)) {
  381. return -EINVAL;
  382. }
  383. vma->vm_ops = &kvm_gmem_vm_ops;
  384. return 0;
  385. }
  386. static struct file_operations kvm_gmem_fops = {
  387. .mmap = kvm_gmem_mmap,
  388. .open = generic_file_open,
  389. .release = kvm_gmem_release,
  390. .fallocate = kvm_gmem_fallocate,
  391. };
  392. static int kvm_gmem_migrate_folio(struct address_space *mapping,
  393. struct folio *dst, struct folio *src,
  394. enum migrate_mode mode)
  395. {
  396. WARN_ON_ONCE(1);
  397. return -EINVAL;
  398. }
  399. static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
  400. {
  401. pgoff_t start, end;
  402. filemap_invalidate_lock_shared(mapping);
  403. start = folio->index;
  404. end = start + folio_nr_pages(folio);
  405. kvm_gmem_invalidate_begin(mapping->host, start, end);
  406. /*
  407. * Do not truncate the range, what action is taken in response to the
  408. * error is userspace's decision (assuming the architecture supports
  409. * gracefully handling memory errors). If/when the guest attempts to
  410. * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
  411. * at which point KVM can either terminate the VM or propagate the
  412. * error to userspace.
  413. */
  414. kvm_gmem_invalidate_end(mapping->host, start, end);
  415. filemap_invalidate_unlock_shared(mapping);
  416. return MF_DELAYED;
  417. }
  #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
  /* ->free_folio(): tell the architecture about the pfn range being freed. */
  static void kvm_gmem_free_folio(struct folio *folio)
  {
  	kvm_pfn_t first_pfn = page_to_pfn(folio_page(folio, 0));
  	unsigned long nr_pages = 1ul << folio_order(folio);

  	kvm_arch_gmem_invalidate(first_pfn, first_pfn + nr_pages);
  }
  #endif
  427. static const struct address_space_operations kvm_gmem_aops = {
  428. .dirty_folio = noop_dirty_folio,
  429. .migrate_folio = kvm_gmem_migrate_folio,
  430. .error_remove_folio = kvm_gmem_error_folio,
  431. #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
  432. .free_folio = kvm_gmem_free_folio,
  433. #endif
  434. };
  435. static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
  436. struct iattr *attr)
  437. {
  438. return -EINVAL;
  439. }
  440. static const struct inode_operations kvm_gmem_iops = {
  441. .setattr = kvm_gmem_setattr,
  442. };
  443. bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
  444. {
  445. return true;
  446. }
  447. static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
  448. {
  449. static const char *name = "[kvm-gmem]";
  450. struct gmem_file *f;
  451. struct inode *inode;
  452. struct file *file;
  453. int fd, err;
  454. fd = get_unused_fd_flags(0);
  455. if (fd < 0)
  456. return fd;
  457. f = kzalloc_obj(*f);
  458. if (!f) {
  459. err = -ENOMEM;
  460. goto err_fd;
  461. }
  462. /* __fput() will take care of fops_put(). */
  463. if (!fops_get(&kvm_gmem_fops)) {
  464. err = -ENOENT;
  465. goto err_gmem;
  466. }
  467. inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
  468. if (IS_ERR(inode)) {
  469. err = PTR_ERR(inode);
  470. goto err_fops;
  471. }
  472. inode->i_op = &kvm_gmem_iops;
  473. inode->i_mapping->a_ops = &kvm_gmem_aops;
  474. inode->i_mode |= S_IFREG;
  475. inode->i_size = size;
  476. mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
  477. mapping_set_inaccessible(inode->i_mapping);
  478. /* Unmovable mappings are supposed to be marked unevictable as well. */
  479. WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
  480. GMEM_I(inode)->flags = flags;
  481. file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
  482. if (IS_ERR(file)) {
  483. err = PTR_ERR(file);
  484. goto err_inode;
  485. }
  486. file->f_flags |= O_LARGEFILE;
  487. file->private_data = f;
  488. kvm_get_kvm(kvm);
  489. f->kvm = kvm;
  490. xa_init(&f->bindings);
  491. list_add(&f->entry, &inode->i_mapping->i_private_list);
  492. fd_install(fd, file);
  493. return fd;
  494. err_inode:
  495. iput(inode);
  496. err_fops:
  497. fops_put(&kvm_gmem_fops);
  498. err_gmem:
  499. kfree(f);
  500. err_fd:
  501. put_unused_fd(fd);
  502. return err;
  503. }
  504. int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
  505. {
  506. loff_t size = args->size;
  507. u64 flags = args->flags;
  508. if (flags & ~kvm_gmem_get_supported_flags(kvm))
  509. return -EINVAL;
  510. if (size <= 0 || !PAGE_ALIGNED(size))
  511. return -EINVAL;
  512. return __kvm_gmem_create(kvm, size, flags);
  513. }
  514. int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
  515. unsigned int fd, loff_t offset)
  516. {
  517. loff_t size = slot->npages << PAGE_SHIFT;
  518. unsigned long start, end;
  519. struct gmem_file *f;
  520. struct inode *inode;
  521. struct file *file;
  522. int r = -EINVAL;
  523. BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
  524. file = fget(fd);
  525. if (!file)
  526. return -EBADF;
  527. if (file->f_op != &kvm_gmem_fops)
  528. goto err;
  529. f = file->private_data;
  530. if (f->kvm != kvm)
  531. goto err;
  532. inode = file_inode(file);
  533. if (offset < 0 || !PAGE_ALIGNED(offset) ||
  534. offset + size > i_size_read(inode))
  535. goto err;
  536. filemap_invalidate_lock(inode->i_mapping);
  537. start = offset >> PAGE_SHIFT;
  538. end = start + slot->npages;
  539. if (!xa_empty(&f->bindings) &&
  540. xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
  541. filemap_invalidate_unlock(inode->i_mapping);
  542. goto err;
  543. }
  544. /*
  545. * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so
  546. * kvm_gmem_bind() must occur on a new memslot. Because the memslot
  547. * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
  548. */
  549. WRITE_ONCE(slot->gmem.file, file);
  550. slot->gmem.pgoff = start;
  551. if (kvm_gmem_supports_mmap(inode))
  552. slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
  553. xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
  554. filemap_invalidate_unlock(inode->i_mapping);
  555. /*
  556. * Drop the reference to the file, even on success. The file pins KVM,
  557. * not the other way 'round. Active bindings are invalidated if the
  558. * file is closed before memslots are destroyed.
  559. */
  560. r = 0;
  561. err:
  562. fput(file);
  563. return r;
  564. }
  565. static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
  566. {
  567. unsigned long start = slot->gmem.pgoff;
  568. unsigned long end = start + slot->npages;
  569. xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);
  570. /*
  571. * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
  572. * cannot see this memslot.
  573. */
  574. WRITE_ONCE(slot->gmem.file, NULL);
  575. }
  576. void kvm_gmem_unbind(struct kvm_memory_slot *slot)
  577. {
  578. /*
  579. * Nothing to do if the underlying file was _already_ closed, as
  580. * kvm_gmem_release() invalidates and nullifies all bindings.
  581. */
  582. if (!slot->gmem.file)
  583. return;
  584. CLASS(gmem_get_file, file)(slot);
  585. /*
  586. * However, if the file is _being_ closed, then the bindings need to be
  587. * removed as kvm_gmem_release() might not run until after the memslot
  588. * is freed. Note, modifying the bindings is safe even though the file
  589. * is dying as kvm_gmem_release() nullifies slot->gmem.file under
  590. * slots_lock, and only puts its reference to KVM after destroying all
  591. * bindings. I.e. reaching this point means kvm_gmem_release() hasn't
  592. * yet destroyed the bindings or freed the gmem_file, and can't do so
  593. * until the caller drops slots_lock.
  594. */
  595. if (!file) {
  596. __kvm_gmem_unbind(slot, slot->gmem.file->private_data);
  597. return;
  598. }
  599. filemap_invalidate_lock(file->f_mapping);
  600. __kvm_gmem_unbind(slot, file->private_data);
  601. filemap_invalidate_unlock(file->f_mapping);
  602. }
  603. /* Returns a locked folio on success. */
  604. static struct folio *__kvm_gmem_get_pfn(struct file *file,
  605. struct kvm_memory_slot *slot,
  606. pgoff_t index, kvm_pfn_t *pfn,
  607. int *max_order)
  608. {
  609. struct file *slot_file = READ_ONCE(slot->gmem.file);
  610. struct gmem_file *f = file->private_data;
  611. struct folio *folio;
  612. if (file != slot_file) {
  613. WARN_ON_ONCE(slot_file);
  614. return ERR_PTR(-EFAULT);
  615. }
  616. if (xa_load(&f->bindings, index) != slot) {
  617. WARN_ON_ONCE(xa_load(&f->bindings, index));
  618. return ERR_PTR(-EIO);
  619. }
  620. folio = kvm_gmem_get_folio(file_inode(file), index);
  621. if (IS_ERR(folio))
  622. return folio;
  623. if (folio_test_hwpoison(folio)) {
  624. folio_unlock(folio);
  625. folio_put(folio);
  626. return ERR_PTR(-EHWPOISON);
  627. }
  628. *pfn = folio_file_pfn(folio, index);
  629. if (max_order)
  630. *max_order = 0;
  631. return folio;
  632. }
  633. int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
  634. gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
  635. int *max_order)
  636. {
  637. pgoff_t index = kvm_gmem_get_index(slot, gfn);
  638. struct folio *folio;
  639. int r = 0;
  640. CLASS(gmem_get_file, file)(slot);
  641. if (!file)
  642. return -EFAULT;
  643. folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
  644. if (IS_ERR(folio))
  645. return PTR_ERR(folio);
  646. if (!folio_test_uptodate(folio)) {
  647. clear_highpage(folio_page(folio, 0));
  648. folio_mark_uptodate(folio);
  649. }
  650. r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
  651. folio_unlock(folio);
  652. if (!r)
  653. *page = folio_file_page(folio, index);
  654. else
  655. folio_put(folio);
  656. return r;
  657. }
  658. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
  659. #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
  660. static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
  661. struct file *file, gfn_t gfn, struct page *src_page,
  662. kvm_gmem_populate_cb post_populate, void *opaque)
  663. {
  664. pgoff_t index = kvm_gmem_get_index(slot, gfn);
  665. struct folio *folio;
  666. kvm_pfn_t pfn;
  667. int ret;
  668. filemap_invalidate_lock(file->f_mapping);
  669. folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
  670. if (IS_ERR(folio)) {
  671. ret = PTR_ERR(folio);
  672. goto out_unlock;
  673. }
  674. folio_unlock(folio);
  675. if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
  676. KVM_MEMORY_ATTRIBUTE_PRIVATE,
  677. KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
  678. ret = -EINVAL;
  679. goto out_put_folio;
  680. }
  681. ret = post_populate(kvm, gfn, pfn, src_page, opaque);
  682. if (!ret)
  683. folio_mark_uptodate(folio);
  684. out_put_folio:
  685. folio_put(folio);
  686. out_unlock:
  687. filemap_invalidate_unlock(file->f_mapping);
  688. return ret;
  689. }
  690. long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
  691. kvm_gmem_populate_cb post_populate, void *opaque)
  692. {
  693. struct kvm_memory_slot *slot;
  694. int ret = 0;
  695. long i;
  696. lockdep_assert_held(&kvm->slots_lock);
  697. if (WARN_ON_ONCE(npages <= 0))
  698. return -EINVAL;
  699. if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
  700. return -EINVAL;
  701. slot = gfn_to_memslot(kvm, start_gfn);
  702. if (!kvm_slot_has_gmem(slot))
  703. return -EINVAL;
  704. CLASS(gmem_get_file, file)(slot);
  705. if (!file)
  706. return -EFAULT;
  707. npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
  708. for (i = 0; i < npages; i++) {
  709. struct page *src_page = NULL;
  710. if (signal_pending(current)) {
  711. ret = -EINTR;
  712. break;
  713. }
  714. if (src) {
  715. unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;
  716. ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
  717. if (ret < 0)
  718. break;
  719. if (ret != 1) {
  720. ret = -ENOMEM;
  721. break;
  722. }
  723. }
  724. ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
  725. post_populate, opaque);
  726. if (src_page)
  727. put_page(src_page);
  728. if (ret)
  729. break;
  730. }
  731. return ret && !i ? ret : i;
  732. }
  733. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
  734. #endif
  /* Slab cache that struct gmem_inode objects are allocated from. */
  static struct kmem_cache *kvm_gmem_inode_cachep;
  736. static void kvm_gmem_init_inode_once(void *__gi)
  737. {
  738. struct gmem_inode *gi = __gi;
  739. /*
  740. * Note! Don't initialize the inode with anything specific to the
  741. * guest_memfd instance, or that might be specific to how the inode is
  742. * used (from the VFS-layer's perspective). This hook is called only
  743. * during the initial slab allocation, i.e. only fields/state that are
  744. * idempotent across _all_ use of the inode _object_ can be initialized
  745. * at this time!
  746. */
  747. inode_init_once(&gi->vfs_inode);
  748. }
  749. static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
  750. {
  751. struct gmem_inode *gi;
  752. gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
  753. if (!gi)
  754. return NULL;
  755. mpol_shared_policy_init(&gi->policy, NULL);
  756. gi->flags = 0;
  757. return &gi->vfs_inode;
  758. }
  759. static void kvm_gmem_destroy_inode(struct inode *inode)
  760. {
  761. mpol_free_shared_policy(&GMEM_I(inode)->policy);
  762. }
  763. static void kvm_gmem_free_inode(struct inode *inode)
  764. {
  765. kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
  766. }
  767. static const struct super_operations kvm_gmem_super_operations = {
  768. .statfs = simple_statfs,
  769. .alloc_inode = kvm_gmem_alloc_inode,
  770. .destroy_inode = kvm_gmem_destroy_inode,
  771. .free_inode = kvm_gmem_free_inode,
  772. };
  773. static int kvm_gmem_init_fs_context(struct fs_context *fc)
  774. {
  775. struct pseudo_fs_context *ctx;
  776. if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
  777. return -ENOMEM;
  778. fc->s_iflags |= SB_I_NOEXEC;
  779. fc->s_iflags |= SB_I_NODEV;
  780. ctx = fc->fs_private;
  781. ctx->ops = &kvm_gmem_super_operations;
  782. return 0;
  783. }
  784. static struct file_system_type kvm_gmem_fs = {
  785. .name = "guest_memfd",
  786. .init_fs_context = kvm_gmem_init_fs_context,
  787. .kill_sb = kill_anon_super,
  788. };
  789. static int kvm_gmem_init_mount(void)
  790. {
  791. kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
  792. if (IS_ERR(kvm_gmem_mnt))
  793. return PTR_ERR(kvm_gmem_mnt);
  794. kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
  795. return 0;
  796. }
  797. int kvm_gmem_init(struct module *module)
  798. {
  799. struct kmem_cache_args args = {
  800. .align = 0,
  801. .ctor = kvm_gmem_init_inode_once,
  802. };
  803. int ret;
  804. kvm_gmem_fops.owner = module;
  805. kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
  806. sizeof(struct gmem_inode),
  807. &args, SLAB_ACCOUNT);
  808. if (!kvm_gmem_inode_cachep)
  809. return -ENOMEM;
  810. ret = kvm_gmem_init_mount();
  811. if (ret) {
  812. kmem_cache_destroy(kvm_gmem_inode_cachep);
  813. return ret;
  814. }
  815. return 0;
  816. }
  817. void kvm_gmem_exit(void)
  818. {
  819. kern_unmount(kvm_gmem_mnt);
  820. kvm_gmem_mnt = NULL;
  821. rcu_barrier();
  822. kmem_cache_destroy(kvm_gmem_inode_cachep);
  823. }