memfd.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * memfd_create system call and file sealing support
  4. *
  5. * Code was originally included in shmem.c, and broken out to facilitate
  6. * use by hugetlbfs as well as tmpfs.
  7. */
  8. #include <linux/fs.h>
  9. #include <linux/vfs.h>
  10. #include <linux/pagemap.h>
  11. #include <linux/file.h>
  12. #include <linux/mm.h>
  13. #include <linux/sched/signal.h>
  14. #include <linux/khugepaged.h>
  15. #include <linux/syscalls.h>
  16. #include <linux/hugetlb.h>
  17. #include <linux/shmem_fs.h>
  18. #include <linux/memfd.h>
  19. #include <linux/pid_namespace.h>
  20. #include <uapi/linux/memfd.h>
  21. #include "swap.h"
  22. /*
  23. * We need a tag: a new tag would expand every xa_node by 8 bytes,
  24. * so reuse a tag which we firmly believe is never set or cleared on tmpfs
  25. * or hugetlbfs because they are memory only filesystems.
  26. */
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
/*
 * Index of the final scan in memfd_wait_for_pins(). Scan 0 does not sleep;
 * scans 1..LAST_SCAN sleep (HZ << scan) / 200 jiffies each, i.e.
 * 10 + 20 + 40 + 80 ms in total.
 */
#define LAST_SCAN 4 /* about 150ms max */
  29. static bool memfd_folio_has_extra_refs(struct folio *folio)
  30. {
  31. return folio_ref_count(folio) != folio_expected_ref_count(folio);
  32. }
  33. static void memfd_tag_pins(struct xa_state *xas)
  34. {
  35. struct folio *folio;
  36. int latency = 0;
  37. lru_add_drain();
  38. xas_lock_irq(xas);
  39. xas_for_each(xas, folio, ULONG_MAX) {
  40. if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
  41. xas_set_mark(xas, MEMFD_TAG_PINNED);
  42. if (++latency < XA_CHECK_SCHED)
  43. continue;
  44. latency = 0;
  45. xas_pause(xas);
  46. xas_unlock_irq(xas);
  47. cond_resched();
  48. xas_lock_irq(xas);
  49. }
  50. xas_unlock_irq(xas);
  51. }
  52. /*
  53. * This is a helper function used by memfd_pin_user_pages() in GUP (gup.c).
  54. * It is mainly called to allocate a folio in a memfd when the caller
  55. * (memfd_pin_folios()) cannot find a folio in the page cache at a given
  56. * index in the mapping.
  57. */
  58. struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
  59. {
  60. #ifdef CONFIG_HUGETLB_PAGE
  61. struct folio *folio;
  62. gfp_t gfp_mask;
  63. if (is_file_hugepages(memfd)) {
  64. /*
  65. * The folio would most likely be accessed by a DMA driver,
  66. * therefore, we have zone memory constraints where we can
  67. * alloc from. Also, the folio will be pinned for an indefinite
  68. * amount of time, so it is not expected to be migrated away.
  69. */
  70. struct inode *inode = file_inode(memfd);
  71. struct hstate *h = hstate_file(memfd);
  72. int err = -ENOMEM;
  73. long nr_resv;
  74. gfp_mask = htlb_alloc_mask(h);
  75. gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
  76. idx >>= huge_page_order(h);
  77. nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, EMPTY_VMA_FLAGS);
  78. if (nr_resv < 0)
  79. return ERR_PTR(nr_resv);
  80. folio = alloc_hugetlb_folio_reserve(h,
  81. numa_node_id(),
  82. NULL,
  83. gfp_mask);
  84. if (folio) {
  85. u32 hash;
  86. /*
  87. * Zero the folio to prevent information leaks to userspace.
  88. * Use folio_zero_user() which is optimized for huge/gigantic
  89. * pages. Pass 0 as addr_hint since this is not a faulting path
  90. * and we don't have a user virtual address yet.
  91. */
  92. folio_zero_user(folio, 0);
  93. /*
  94. * Mark the folio uptodate before adding to page cache,
  95. * as required by filemap.c and other hugetlb paths.
  96. */
  97. __folio_mark_uptodate(folio);
  98. /*
  99. * Serialize hugepage allocation and instantiation to prevent
  100. * races with concurrent allocations, as required by all other
  101. * callers of hugetlb_add_to_page_cache().
  102. */
  103. hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx);
  104. mutex_lock(&hugetlb_fault_mutex_table[hash]);
  105. err = hugetlb_add_to_page_cache(folio,
  106. memfd->f_mapping,
  107. idx);
  108. mutex_unlock(&hugetlb_fault_mutex_table[hash]);
  109. if (err) {
  110. folio_put(folio);
  111. goto err_unresv;
  112. }
  113. hugetlb_set_folio_subpool(folio, subpool_inode(inode));
  114. folio_unlock(folio);
  115. return folio;
  116. }
  117. err_unresv:
  118. if (nr_resv > 0)
  119. hugetlb_unreserve_pages(inode, idx, idx + 1, 0);
  120. return ERR_PTR(err);
  121. }
  122. #endif
  123. return shmem_read_folio(memfd->f_mapping, idx);
  124. }
  125. /*
  126. * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
  127. * via get_user_pages(), drivers might have some pending I/O without any active
  128. * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
  129. * and see whether it has an elevated ref-count. If so, we tag them and wait for
  130. * them to be dropped.
  131. * The caller must guarantee that no new user will acquire writable references
  132. * to those folios to avoid races.
  133. */
  134. static int memfd_wait_for_pins(struct address_space *mapping)
  135. {
  136. XA_STATE(xas, &mapping->i_pages, 0);
  137. struct folio *folio;
  138. int error, scan;
  139. memfd_tag_pins(&xas);
  140. error = 0;
  141. for (scan = 0; scan <= LAST_SCAN; scan++) {
  142. int latency = 0;
  143. if (!xas_marked(&xas, MEMFD_TAG_PINNED))
  144. break;
  145. if (!scan)
  146. lru_add_drain_all();
  147. else if (schedule_timeout_killable((HZ << scan) / 200))
  148. scan = LAST_SCAN;
  149. xas_set(&xas, 0);
  150. xas_lock_irq(&xas);
  151. xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
  152. bool clear = true;
  153. if (!xa_is_value(folio) &&
  154. memfd_folio_has_extra_refs(folio)) {
  155. /*
  156. * On the last scan, we clean up all those tags
  157. * we inserted; but make a note that we still
  158. * found folios pinned.
  159. */
  160. if (scan == LAST_SCAN)
  161. error = -EBUSY;
  162. else
  163. clear = false;
  164. }
  165. if (clear)
  166. xas_clear_mark(&xas, MEMFD_TAG_PINNED);
  167. if (++latency < XA_CHECK_SCHED)
  168. continue;
  169. latency = 0;
  170. xas_pause(&xas);
  171. xas_unlock_irq(&xas);
  172. cond_resched();
  173. xas_lock_irq(&xas);
  174. }
  175. xas_unlock_irq(&xas);
  176. }
  177. return error;
  178. }
  179. static unsigned int *memfd_file_seals_ptr(struct file *file)
  180. {
  181. if (shmem_file(file))
  182. return &SHMEM_I(file_inode(file))->seals;
  183. #ifdef CONFIG_HUGETLBFS
  184. if (is_file_hugepages(file))
  185. return &HUGETLBFS_I(file_inode(file))->seals;
  186. #endif
  187. return NULL;
  188. }
/*
 * Every seal bit that memfd_add_seals() accepts; a request containing any
 * bit outside this mask is rejected with -EINVAL.
 */
#define F_ALL_SEALS (F_SEAL_SEAL | \
F_SEAL_EXEC | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE | \
F_SEAL_FUTURE_WRITE)
  195. static int memfd_add_seals(struct file *file, unsigned int seals)
  196. {
  197. struct inode *inode = file_inode(file);
  198. unsigned int *file_seals;
  199. int error;
  200. /*
  201. * SEALING
  202. * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
  203. * but restrict access to a specific subset of file operations. Seals
  204. * can only be added, but never removed. This way, mutually untrusted
  205. * parties can share common memory regions with a well-defined policy.
  206. * A malicious peer can thus never perform unwanted operations on a
  207. * shared object.
  208. *
  209. * Seals are only supported on special tmpfs or hugetlbfs files and
  210. * always affect the whole underlying inode. Once a seal is set, it
  211. * may prevent some kinds of access to the file. Currently, the
  212. * following seals are defined:
  213. * SEAL_SEAL: Prevent further seals from being set on this file
  214. * SEAL_SHRINK: Prevent the file from shrinking
  215. * SEAL_GROW: Prevent the file from growing
  216. * SEAL_WRITE: Prevent write access to the file
  217. * SEAL_EXEC: Prevent modification of the exec bits in the file mode
  218. *
  219. * As we don't require any trust relationship between two parties, we
  220. * must prevent seals from being removed. Therefore, sealing a file
  221. * only adds a given set of seals to the file, it never touches
  222. * existing seals. Furthermore, the "setting seals"-operation can be
  223. * sealed itself, which basically prevents any further seal from being
  224. * added.
  225. *
  226. * Semantics of sealing are only defined on volatile files. Only
  227. * anonymous tmpfs and hugetlbfs files support sealing. More
  228. * importantly, seals are never written to disk. Therefore, there's
  229. * no plan to support it on other file types.
  230. */
  231. if (!(file->f_mode & FMODE_WRITE))
  232. return -EPERM;
  233. if (seals & ~(unsigned int)F_ALL_SEALS)
  234. return -EINVAL;
  235. inode_lock(inode);
  236. file_seals = memfd_file_seals_ptr(file);
  237. if (!file_seals) {
  238. error = -EINVAL;
  239. goto unlock;
  240. }
  241. if (*file_seals & F_SEAL_SEAL) {
  242. error = -EPERM;
  243. goto unlock;
  244. }
  245. if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
  246. error = mapping_deny_writable(file->f_mapping);
  247. if (error)
  248. goto unlock;
  249. error = memfd_wait_for_pins(file->f_mapping);
  250. if (error) {
  251. mapping_allow_writable(file->f_mapping);
  252. goto unlock;
  253. }
  254. }
  255. /*
  256. * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
  257. */
  258. if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
  259. seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
  260. *file_seals |= seals;
  261. error = 0;
  262. unlock:
  263. inode_unlock(inode);
  264. return error;
  265. }
  266. static int memfd_get_seals(struct file *file)
  267. {
  268. unsigned int *seals = memfd_file_seals_ptr(file);
  269. return seals ? *seals : -EINVAL;
  270. }
  271. long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
  272. {
  273. long error;
  274. switch (cmd) {
  275. case F_ADD_SEALS:
  276. error = memfd_add_seals(file, arg);
  277. break;
  278. case F_GET_SEALS:
  279. error = memfd_get_seals(file);
  280. break;
  281. default:
  282. error = -EINVAL;
  283. break;
  284. }
  285. return error;
  286. }
/* Prefix that alloc_name() places in front of the user-supplied name. */
#define MFD_NAME_PREFIX "memfd:"
/* Length of the prefix without its terminating NUL. */
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
/* Longest user-supplied name such that prefix + name still fits NAME_MAX. */
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
/*
 * Flags accepted by sanitize_flags(); with MFD_HUGETLB the huge page size
 * encoding bits are accepted in addition.
 */
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
  291. static int check_sysctl_memfd_noexec(unsigned int *flags)
  292. {
  293. #ifdef CONFIG_SYSCTL
  294. struct pid_namespace *ns = task_active_pid_ns(current);
  295. int sysctl = pidns_memfd_noexec_scope(ns);
  296. if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
  297. if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
  298. *flags |= MFD_NOEXEC_SEAL;
  299. else
  300. *flags |= MFD_EXEC;
  301. }
  302. if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) {
  303. pr_err_ratelimited(
  304. "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
  305. current->comm, task_pid_nr(current), sysctl);
  306. return -EACCES;
  307. }
  308. #endif
  309. return 0;
  310. }
  311. static inline bool is_write_sealed(unsigned int seals)
  312. {
  313. return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
  314. }
  315. static int check_write_seal(vm_flags_t *vm_flags_ptr)
  316. {
  317. vm_flags_t vm_flags = *vm_flags_ptr;
  318. vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE);
  319. /* If a private mapping then writability is irrelevant. */
  320. if (!(mask & VM_SHARED))
  321. return 0;
  322. /*
  323. * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
  324. * write seals are active.
  325. */
  326. if (mask & VM_WRITE)
  327. return -EPERM;
  328. /*
  329. * This is a read-only mapping, disallow mprotect() from making a
  330. * write-sealed mapping writable in future.
  331. */
  332. *vm_flags_ptr &= ~VM_MAYWRITE;
  333. return 0;
  334. }
  335. int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr)
  336. {
  337. int err = 0;
  338. unsigned int *seals_ptr = memfd_file_seals_ptr(file);
  339. unsigned int seals = seals_ptr ? *seals_ptr : 0;
  340. if (is_write_sealed(seals))
  341. err = check_write_seal(vm_flags_ptr);
  342. return err;
  343. }
  344. static int sanitize_flags(unsigned int *flags_ptr)
  345. {
  346. unsigned int flags = *flags_ptr;
  347. if (!(flags & MFD_HUGETLB)) {
  348. if (flags & ~MFD_ALL_FLAGS)
  349. return -EINVAL;
  350. } else {
  351. /* Allow huge page size encoding in flags. */
  352. if (flags & ~(MFD_ALL_FLAGS |
  353. (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
  354. return -EINVAL;
  355. }
  356. /* Invalid if both EXEC and NOEXEC_SEAL are set.*/
  357. if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
  358. return -EINVAL;
  359. return check_sysctl_memfd_noexec(flags_ptr);
  360. }
  361. static char *alloc_name(const char __user *uname)
  362. {
  363. int error;
  364. char *name;
  365. long len;
  366. name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
  367. if (!name)
  368. return ERR_PTR(-ENOMEM);
  369. memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN);
  370. /* returned length does not include terminating zero */
  371. len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
  372. if (len < 0) {
  373. error = -EFAULT;
  374. goto err_name;
  375. } else if (len > MFD_NAME_MAX_LEN) {
  376. error = -EINVAL;
  377. goto err_name;
  378. }
  379. return name;
  380. err_name:
  381. kfree(name);
  382. return ERR_PTR(error);
  383. }
  384. struct file *memfd_alloc_file(const char *name, unsigned int flags)
  385. {
  386. unsigned int *file_seals;
  387. struct file *file;
  388. struct inode *inode;
  389. int err = 0;
  390. if (flags & MFD_HUGETLB) {
  391. file = hugetlb_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT),
  392. HUGETLB_ANONHUGE_INODE,
  393. (flags >> MFD_HUGE_SHIFT) &
  394. MFD_HUGE_MASK);
  395. } else {
  396. file = shmem_file_setup(name, 0, mk_vma_flags(VMA_NORESERVE_BIT));
  397. }
  398. if (IS_ERR(file))
  399. return file;
  400. inode = file_inode(file);
  401. err = security_inode_init_security_anon(inode,
  402. &QSTR(MEMFD_ANON_NAME), NULL);
  403. if (err) {
  404. fput(file);
  405. file = ERR_PTR(err);
  406. return file;
  407. }
  408. file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
  409. file->f_flags |= O_LARGEFILE;
  410. if (flags & MFD_NOEXEC_SEAL) {
  411. inode->i_mode &= ~0111;
  412. file_seals = memfd_file_seals_ptr(file);
  413. if (file_seals) {
  414. *file_seals &= ~F_SEAL_SEAL;
  415. *file_seals |= F_SEAL_EXEC;
  416. }
  417. } else if (flags & MFD_ALLOW_SEALING) {
  418. /* MFD_EXEC and MFD_ALLOW_SEALING are set */
  419. file_seals = memfd_file_seals_ptr(file);
  420. if (file_seals)
  421. *file_seals &= ~F_SEAL_SEAL;
  422. }
  423. return file;
  424. }
  425. SYSCALL_DEFINE2(memfd_create,
  426. const char __user *, uname,
  427. unsigned int, flags)
  428. {
  429. char *name __free(kfree) = NULL;
  430. unsigned int fd_flags;
  431. int error;
  432. error = sanitize_flags(&flags);
  433. if (error < 0)
  434. return error;
  435. name = alloc_name(uname);
  436. if (IS_ERR(name))
  437. return PTR_ERR(name);
  438. fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0;
  439. return FD_ADD(fd_flags, memfd_alloc_file(name, flags));
  440. }