mseal.c 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Implement mseal() syscall.
  4. *
  5. * Copyright (c) 2023,2024 Google, Inc.
  6. *
  7. * Author: Jeff Xu <jeffxu@chromium.org>
  8. */
  9. #include <linux/mempolicy.h>
  10. #include <linux/mman.h>
  11. #include <linux/mm.h>
  12. #include <linux/mm_inline.h>
  13. #include <linux/syscalls.h>
  14. #include <linux/sched.h>
  15. #include "internal.h"
/*
 * mseal() disallows an input range which contains unmapped ranges (VMA holes).
 *
 * It disallows unmapped regions from start to end whether they exist at the
 * start, in the middle, or at the end of the range, or any combination thereof.
 *
 * This is because after sealing a range, there's nothing to stop memory mapping
 * of ranges in the remaining gaps later, meaning that the user might then
 * wrongly consider the entirety of the mseal()'d range to be sealed when it
 * in fact isn't.
 */
  27. /*
  28. * Does the [start, end) range contain any unmapped memory?
  29. *
  30. * We ensure that:
  31. * - start is part of a valid VMA.
  32. * - end is part of a valid VMA.
  33. * - no gap (unallocated memory) exists between start and end.
  34. */
  35. static bool range_contains_unmapped(struct mm_struct *mm,
  36. unsigned long start, unsigned long end)
  37. {
  38. struct vm_area_struct *vma;
  39. unsigned long prev_end = start;
  40. VMA_ITERATOR(vmi, current->mm, start);
  41. for_each_vma_range(vmi, vma, end) {
  42. if (vma->vm_start > prev_end)
  43. return true;
  44. prev_end = vma->vm_end;
  45. }
  46. return prev_end < end;
  47. }
  48. static int mseal_apply(struct mm_struct *mm,
  49. unsigned long start, unsigned long end)
  50. {
  51. struct vm_area_struct *vma, *prev;
  52. VMA_ITERATOR(vmi, mm, start);
  53. /* We know there are no gaps so this will be non-NULL. */
  54. vma = vma_iter_load(&vmi);
  55. prev = vma_prev(&vmi);
  56. if (start > vma->vm_start)
  57. prev = vma;
  58. for_each_vma_range(vmi, vma, end) {
  59. const unsigned long curr_start = MAX(vma->vm_start, start);
  60. const unsigned long curr_end = MIN(vma->vm_end, end);
  61. if (!(vma->vm_flags & VM_SEALED)) {
  62. vm_flags_t vm_flags = vma->vm_flags | VM_SEALED;
  63. vma = vma_modify_flags(&vmi, prev, vma, curr_start,
  64. curr_end, &vm_flags);
  65. if (IS_ERR(vma))
  66. return PTR_ERR(vma);
  67. vm_flags_set(vma, VM_SEALED);
  68. }
  69. prev = vma;
  70. }
  71. return 0;
  72. }
  73. /*
  74. * mseal(2) seals the VM's meta data from
  75. * selected syscalls.
  76. *
  77. * addr/len: VM address range.
  78. *
  79. * The address range by addr/len must meet:
  80. * start (addr) must be in a valid VMA.
  81. * end (addr + len) must be in a valid VMA.
  82. * no gap (unallocated memory) between start and end.
  83. * start (addr) must be page aligned.
  84. *
  85. * len: len will be page aligned implicitly.
  86. *
  87. * Below VMA operations are blocked after sealing.
  88. * 1> Unmapping, moving to another location, and shrinking
  89. * the size, via munmap() and mremap(), can leave an empty
  90. * space, therefore can be replaced with a VMA with a new
  91. * set of attributes.
  92. * 2> Moving or expanding a different vma into the current location,
  93. * via mremap().
  94. * 3> Modifying a VMA via mmap(MAP_FIXED).
  95. * 4> Size expansion, via mremap(), does not appear to pose any
  96. * specific risks to sealed VMAs. It is included anyway because
  97. * the use case is unclear. In any case, users can rely on
  98. * merging to expand a sealed VMA.
  99. * 5> mprotect and pkey_mprotect.
  100. * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
  101. * for anonymous memory, when users don't have write permission to the
  102. * memory. Those behaviors can alter region contents by discarding pages,
  103. * effectively a memset(0) for anonymous memory.
  104. *
  105. * flags: reserved.
  106. *
  107. * return values:
  108. * zero: success.
  109. * -EINVAL:
  110. * invalid input flags.
  111. * start address is not page aligned.
  112. * Address range (start + len) overflow.
  113. * -ENOMEM:
  114. * addr is not a valid address (not allocated).
  115. * end (start + len) is not a valid address.
  116. * a gap (unallocated memory) between start and end.
  117. * -EPERM:
  118. * - In 32 bit architecture, sealing is not supported.
  119. * Note:
  120. * user can call mseal(2) multiple times, adding a seal on an
  121. * already sealed memory is a no-action (no error).
  122. *
  123. * unseal() is not supported.
  124. */
  125. int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
  126. {
  127. size_t len;
  128. int ret = 0;
  129. unsigned long end;
  130. struct mm_struct *mm = current->mm;
  131. /* Verify flags not set. */
  132. if (flags)
  133. return -EINVAL;
  134. start = untagged_addr(start);
  135. if (!PAGE_ALIGNED(start))
  136. return -EINVAL;
  137. len = PAGE_ALIGN(len_in);
  138. /* Check to see whether len was rounded up from small -ve to zero. */
  139. if (len_in && !len)
  140. return -EINVAL;
  141. end = start + len;
  142. if (end < start)
  143. return -EINVAL;
  144. if (end == start)
  145. return 0;
  146. if (mmap_write_lock_killable(mm))
  147. return -EINTR;
  148. if (range_contains_unmapped(mm, start, end)) {
  149. ret = -ENOMEM;
  150. goto out;
  151. }
  152. /*
  153. * Second pass, this should success, unless there are errors
  154. * from vma_modify_flags, e.g. merge/split error, or process
  155. * reaching the max supported VMAs, however, those cases shall
  156. * be rare.
  157. */
  158. ret = mseal_apply(mm, start, end);
  159. out:
  160. mmap_write_unlock(mm);
  161. return ret;
  162. }
  163. SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
  164. flags)
  165. {
  166. return do_mseal(start, len, flags);
  167. }