trans_pgd.c 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Transitional page tables for kexec and hibernate
  4. *
  5. * This file derived from: arch/arm64/kernel/hibernate.c
  6. *
  7. * Copyright (c) 2021, Microsoft Corporation.
  8. * Pasha Tatashin <pasha.tatashin@soleen.com>
  9. *
  10. */
  11. /*
  12. * Transitional tables are used while the system transfers from one world to
  13. * another, such as during hibernate restore and kexec reboots. During these
  14. * phases one cannot rely on the page tables not being overwritten, because
  15. * hibernate and kexec can overwrite the current page tables during transition.
  16. */
  17. #include <asm/trans_pgd.h>
  18. #include <asm/pgalloc.h>
  19. #include <asm/pgtable.h>
  20. #include <linux/suspend.h>
  21. #include <linux/bug.h>
  22. #include <linux/mm.h>
  23. #include <linux/mmzone.h>
  24. #include <linux/kfence.h>
  25. static void *trans_alloc(struct trans_pgd_info *info)
  26. {
  27. return info->trans_alloc_page(info->trans_alloc_arg);
  28. }
  29. static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
  30. {
  31. pte_t pte = __ptep_get(src_ptep);
  32. if (pte_valid(pte)) {
  33. /*
  34. * Resume will overwrite areas that may be marked
  35. * read only (code, rodata). Clear the RDONLY bit from
  36. * the temporary mappings we use during restore.
  37. */
  38. __set_pte(dst_ptep, pte_mkwrite_novma(pte));
  39. } else if (!pte_none(pte)) {
  40. /*
  41. * debug_pagealloc will removed the PTE_VALID bit if
  42. * the page isn't in use by the resume kernel. It may have
  43. * been in use by the original kernel, in which case we need
  44. * to put it back in our copy to do the restore.
  45. *
  46. * Other cases include kfence / vmalloc / memfd_secret which
  47. * may call `set_direct_map_invalid_noflush()`.
  48. *
  49. * Before marking this entry valid, check the pfn should
  50. * be mapped.
  51. */
  52. BUG_ON(!pfn_valid(pte_pfn(pte)));
  53. __set_pte(dst_ptep, pte_mkvalid(pte_mkwrite_novma(pte)));
  54. }
  55. }
  56. static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
  57. pmd_t *src_pmdp, unsigned long start, unsigned long end)
  58. {
  59. pte_t *src_ptep;
  60. pte_t *dst_ptep;
  61. unsigned long addr = start;
  62. dst_ptep = trans_alloc(info);
  63. if (!dst_ptep)
  64. return -ENOMEM;
  65. pmd_populate_kernel(NULL, dst_pmdp, dst_ptep);
  66. dst_ptep = pte_offset_kernel(dst_pmdp, start);
  67. src_ptep = pte_offset_kernel(src_pmdp, start);
  68. do {
  69. _copy_pte(dst_ptep, src_ptep, addr);
  70. } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
  71. return 0;
  72. }
  73. static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
  74. pud_t *src_pudp, unsigned long start, unsigned long end)
  75. {
  76. pmd_t *src_pmdp;
  77. pmd_t *dst_pmdp;
  78. unsigned long next;
  79. unsigned long addr = start;
  80. if (pud_none(READ_ONCE(*dst_pudp))) {
  81. dst_pmdp = trans_alloc(info);
  82. if (!dst_pmdp)
  83. return -ENOMEM;
  84. pud_populate(NULL, dst_pudp, dst_pmdp);
  85. }
  86. dst_pmdp = pmd_offset(dst_pudp, start);
  87. src_pmdp = pmd_offset(src_pudp, start);
  88. do {
  89. pmd_t pmd = READ_ONCE(*src_pmdp);
  90. next = pmd_addr_end(addr, end);
  91. if (pmd_none(pmd))
  92. continue;
  93. if (pmd_table(pmd)) {
  94. if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
  95. return -ENOMEM;
  96. } else {
  97. set_pmd(dst_pmdp,
  98. __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
  99. }
  100. } while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
  101. return 0;
  102. }
  103. static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
  104. p4d_t *src_p4dp, unsigned long start,
  105. unsigned long end)
  106. {
  107. pud_t *dst_pudp;
  108. pud_t *src_pudp;
  109. unsigned long next;
  110. unsigned long addr = start;
  111. if (p4d_none(READ_ONCE(*dst_p4dp))) {
  112. dst_pudp = trans_alloc(info);
  113. if (!dst_pudp)
  114. return -ENOMEM;
  115. p4d_populate(NULL, dst_p4dp, dst_pudp);
  116. }
  117. dst_pudp = pud_offset(dst_p4dp, start);
  118. src_pudp = pud_offset(src_p4dp, start);
  119. do {
  120. pud_t pud = READ_ONCE(*src_pudp);
  121. next = pud_addr_end(addr, end);
  122. if (pud_none(pud))
  123. continue;
  124. if (pud_table(pud)) {
  125. if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
  126. return -ENOMEM;
  127. } else {
  128. set_pud(dst_pudp,
  129. __pud(pud_val(pud) & ~PUD_SECT_RDONLY));
  130. }
  131. } while (dst_pudp++, src_pudp++, addr = next, addr != end);
  132. return 0;
  133. }
  134. static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp,
  135. pgd_t *src_pgdp, unsigned long start,
  136. unsigned long end)
  137. {
  138. p4d_t *dst_p4dp;
  139. p4d_t *src_p4dp;
  140. unsigned long next;
  141. unsigned long addr = start;
  142. if (pgd_none(READ_ONCE(*dst_pgdp))) {
  143. dst_p4dp = trans_alloc(info);
  144. if (!dst_p4dp)
  145. return -ENOMEM;
  146. pgd_populate(NULL, dst_pgdp, dst_p4dp);
  147. }
  148. dst_p4dp = p4d_offset(dst_pgdp, start);
  149. src_p4dp = p4d_offset(src_pgdp, start);
  150. do {
  151. next = p4d_addr_end(addr, end);
  152. if (p4d_none(READ_ONCE(*src_p4dp)))
  153. continue;
  154. if (copy_pud(info, dst_p4dp, src_p4dp, addr, next))
  155. return -ENOMEM;
  156. } while (dst_p4dp++, src_p4dp++, addr = next, addr != end);
  157. return 0;
  158. }
  159. static int copy_page_tables(struct trans_pgd_info *info, pgd_t *dst_pgdp,
  160. unsigned long start, unsigned long end)
  161. {
  162. unsigned long next;
  163. unsigned long addr = start;
  164. pgd_t *src_pgdp = pgd_offset_k(start);
  165. dst_pgdp = pgd_offset_pgd(dst_pgdp, start);
  166. do {
  167. next = pgd_addr_end(addr, end);
  168. if (pgd_none(READ_ONCE(*src_pgdp)))
  169. continue;
  170. if (copy_p4d(info, dst_pgdp, src_pgdp, addr, next))
  171. return -ENOMEM;
  172. } while (dst_pgdp++, src_pgdp++, addr = next, addr != end);
  173. return 0;
  174. }
  175. /*
  176. * Create trans_pgd and copy linear map.
  177. * info: contains allocator and its argument
  178. * dst_pgdp: new page table that is created, and to which map is copied.
  179. * start: Start of the interval (inclusive).
  180. * end: End of the interval (exclusive).
  181. *
  182. * Returns 0 on success, and -ENOMEM on failure.
  183. */
  184. int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **dst_pgdp,
  185. unsigned long start, unsigned long end)
  186. {
  187. int rc;
  188. pgd_t *trans_pgd = trans_alloc(info);
  189. if (!trans_pgd) {
  190. pr_err("Failed to allocate memory for temporary page tables.\n");
  191. return -ENOMEM;
  192. }
  193. rc = copy_page_tables(info, trans_pgd, start, end);
  194. if (!rc)
  195. *dst_pgdp = trans_pgd;
  196. return rc;
  197. }
  198. /*
  199. * The page we want to idmap may be outside the range covered by VA_BITS that
  200. * can be built using the kernel's p?d_populate() helpers. As a one off, for a
  201. * single page, we build these page tables bottom up and just assume that we
  202. * will need the maximum T0SZ.
  203. *
  204. * Returns 0 on success, and -ENOMEM on failure.
  205. * On success trans_ttbr0 contains page table with idmapped page, t0sz is set to
  206. * maximum T0SZ for this page.
  207. */
  208. int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
  209. unsigned long *t0sz, void *page)
  210. {
  211. phys_addr_t dst_addr = virt_to_phys(page);
  212. unsigned long pfn = __phys_to_pfn(dst_addr);
  213. int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47;
  214. int bits_mapped = PAGE_SHIFT - 4;
  215. unsigned long level_mask, prev_level_entry, *levels[4];
  216. int this_level, index, level_lsb, level_msb;
  217. dst_addr &= PAGE_MASK;
  218. prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_ROX));
  219. for (this_level = 3; this_level >= 0; this_level--) {
  220. levels[this_level] = trans_alloc(info);
  221. if (!levels[this_level])
  222. return -ENOMEM;
  223. level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level);
  224. level_msb = min(level_lsb + bits_mapped, max_msb);
  225. level_mask = GENMASK_ULL(level_msb, level_lsb);
  226. index = (dst_addr & level_mask) >> level_lsb;
  227. *(levels[this_level] + index) = prev_level_entry;
  228. pfn = virt_to_pfn(levels[this_level]);
  229. prev_level_entry = pte_val(pfn_pte(pfn,
  230. __pgprot(PMD_TYPE_TABLE)));
  231. if (level_msb == max_msb)
  232. break;
  233. }
  234. *trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn));
  235. *t0sz = TCR_T0SZ(max_msb + 1);
  236. return 0;
  237. }
  238. /*
  239. * Create a copy of the vector table so we can call HVC_SET_VECTORS or
  240. * HVC_SOFT_RESTART from contexts where the table may be overwritten.
  241. */
  242. int trans_pgd_copy_el2_vectors(struct trans_pgd_info *info,
  243. phys_addr_t *el2_vectors)
  244. {
  245. void *hyp_stub = trans_alloc(info);
  246. if (!hyp_stub)
  247. return -ENOMEM;
  248. *el2_vectors = virt_to_phys(hyp_stub);
  249. memcpy(hyp_stub, &trans_pgd_stub_vectors, ARM64_VECTOR_TABLE_LEN);
  250. caches_clean_inval_pou((unsigned long)hyp_stub,
  251. (unsigned long)hyp_stub +
  252. ARM64_VECTOR_TABLE_LEN);
  253. dcache_clean_inval_poc((unsigned long)hyp_stub,
  254. (unsigned long)hyp_stub +
  255. ARM64_VECTOR_TABLE_LEN);
  256. return 0;
  257. }