pgalloc.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Page table allocation functions
  4. *
  5. * Copyright IBM Corp. 2016
  6. * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
  7. */
  8. #include <linux/sysctl.h>
  9. #include <linux/slab.h>
  10. #include <linux/mm.h>
  11. #include <asm/mmu_context.h>
  12. #include <asm/page-states.h>
  13. #include <asm/pgalloc.h>
  14. #include <asm/tlbflush.h>
  15. unsigned long *crst_table_alloc_noprof(struct mm_struct *mm)
  16. {
  17. gfp_t gfp = GFP_KERNEL_ACCOUNT;
  18. struct ptdesc *ptdesc;
  19. unsigned long *table;
  20. if (mm == &init_mm)
  21. gfp &= ~__GFP_ACCOUNT;
  22. ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER);
  23. if (!ptdesc)
  24. return NULL;
  25. table = ptdesc_address(ptdesc);
  26. __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
  27. return table;
  28. }
  29. void crst_table_free(struct mm_struct *mm, unsigned long *table)
  30. {
  31. if (!table)
  32. return;
  33. pagetable_free(virt_to_ptdesc(table));
  34. }
/*
 * Callback run on every CPU by crst_table_upgrade() through on_each_cpu().
 * @arg is the mm_struct whose context.asce has just been replaced.
 *
 * If the current task's active_mm is that mm, the new ASCE value is stored
 * in the lowcore user_asce field and loaded into control register 7, and
 * also into control register 1 unless TIF_ASCE_PRIMARY is set. The local
 * TLB flush is issued in every case, whether or not the mm matched.
 */
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;
	struct ctlreg asce;

	/* change all active ASCEs to avoid the creation of new TLBs */
	if (current->active_mm == mm) {
		asce.val = mm->context.asce;
		get_lowcore()->user_asce = asce;
		local_ctl_load(7, &asce);
		/* CR1 is only reloaded while TIF_ASCE_PRIMARY is clear */
		if (!test_thread_flag(TIF_ASCE_PRIMARY))
			local_ctl_load(1, &asce);
	}
	__tlb_flush_local();
}
  49. int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
  50. {
  51. unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
  52. unsigned long asce_limit = mm->context.asce_limit;
  53. mmap_assert_write_locked(mm);
  54. /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
  55. VM_BUG_ON(asce_limit < _REGION2_SIZE);
  56. if (end <= asce_limit)
  57. return 0;
  58. if (asce_limit == _REGION2_SIZE) {
  59. p4d = crst_table_alloc(mm);
  60. if (unlikely(!p4d))
  61. goto err_p4d;
  62. crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
  63. pagetable_p4d_ctor(virt_to_ptdesc(p4d));
  64. }
  65. if (end > _REGION1_SIZE) {
  66. pgd = crst_table_alloc(mm);
  67. if (unlikely(!pgd))
  68. goto err_pgd;
  69. crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
  70. pagetable_pgd_ctor(virt_to_ptdesc(pgd));
  71. }
  72. spin_lock_bh(&mm->page_table_lock);
  73. if (p4d) {
  74. __pgd = (unsigned long *) mm->pgd;
  75. p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
  76. mm->pgd = (pgd_t *) p4d;
  77. mm->context.asce_limit = _REGION1_SIZE;
  78. mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
  79. _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
  80. mm_inc_nr_puds(mm);
  81. }
  82. if (pgd) {
  83. __pgd = (unsigned long *) mm->pgd;
  84. pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
  85. mm->pgd = (pgd_t *) pgd;
  86. mm->context.asce_limit = TASK_SIZE_MAX;
  87. mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
  88. _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
  89. }
  90. spin_unlock_bh(&mm->page_table_lock);
  91. on_each_cpu(__crst_table_upgrade, mm, 0);
  92. return 0;
  93. err_pgd:
  94. pagetable_dtor(virt_to_ptdesc(p4d));
  95. crst_table_free(mm, p4d);
  96. err_p4d:
  97. return -ENOMEM;
  98. }
  99. unsigned long *page_table_alloc_noprof(struct mm_struct *mm)
  100. {
  101. gfp_t gfp = GFP_KERNEL_ACCOUNT;
  102. struct ptdesc *ptdesc;
  103. unsigned long *table;
  104. if (mm == &init_mm)
  105. gfp &= ~__GFP_ACCOUNT;
  106. ptdesc = pagetable_alloc_noprof(gfp, 0);
  107. if (!ptdesc)
  108. return NULL;
  109. if (!pagetable_pte_ctor(mm, ptdesc)) {
  110. pagetable_free(ptdesc);
  111. return NULL;
  112. }
  113. table = ptdesc_address(ptdesc);
  114. __arch_set_page_dat(table, 1);
  115. memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
  116. memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
  117. return table;
  118. }
  119. void page_table_free(struct mm_struct *mm, unsigned long *table)
  120. {
  121. struct ptdesc *ptdesc = virt_to_ptdesc(table);
  122. if (pagetable_is_reserved(ptdesc))
  123. return free_reserved_ptdesc(ptdesc);
  124. pagetable_dtor_free(ptdesc);
  125. }
  126. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  127. static void pte_free_now(struct rcu_head *head)
  128. {
  129. struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
  130. pagetable_dtor_free(ptdesc);
  131. }
  132. void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
  133. {
  134. struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
  135. call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
  136. }
  137. #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  138. /*
  139. * Base infrastructure required to generate basic asces, region, segment,
  140. * and page tables that do not make use of enhanced features like EDAT1.
  141. */
  142. static struct kmem_cache *base_pgt_cache;
  143. static unsigned long *base_pgt_alloc(void)
  144. {
  145. unsigned long *table;
  146. table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
  147. if (table)
  148. memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
  149. return table;
  150. }
/* Give a page table obtained from base_pgt_alloc() back to base_pgt_cache. */
static void base_pgt_free(unsigned long *table)
{
	kmem_cache_free(base_pgt_cache, table);
}
  155. static unsigned long *base_crst_alloc(unsigned long val)
  156. {
  157. unsigned long *table;
  158. struct ptdesc *ptdesc;
  159. ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
  160. if (!ptdesc)
  161. return NULL;
  162. table = ptdesc_address(ptdesc);
  163. crst_table_init(table, val);
  164. return table;
  165. }
  166. static void base_crst_free(unsigned long *table)
  167. {
  168. if (!table)
  169. return;
  170. pagetable_free(virt_to_ptdesc(table));
  171. }
  172. #define BASE_ADDR_END_FUNC(NAME, SIZE) \
  173. static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \
  174. unsigned long end) \
  175. { \
  176. unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1); \
  177. \
  178. return (next - 1) < (end - 1) ? next : end; \
  179. }
  180. BASE_ADDR_END_FUNC(page, PAGE_SIZE)
  181. BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
  182. BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
  183. BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
  184. BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
/*
 * Execute the "lra" instruction on @address and return the value it leaves
 * in the output register. The condition code is declared as clobbered and
 * is not inspected.
 * NOTE(review): because the condition code is ignored, an unsuccessful
 * translation would go unnoticed here - presumably callers only pass
 * addresses that are mapped; confirm.
 */
static inline unsigned long base_lra(unsigned long address)
{
	unsigned long real;

	asm volatile(
		"	lra	%0,0(%1)"
		: "=d" (real) : "a" (address) : "cc");
	return real;
}
  193. static int base_page_walk(unsigned long *origin, unsigned long addr,
  194. unsigned long end, int alloc)
  195. {
  196. unsigned long *pte, next;
  197. if (!alloc)
  198. return 0;
  199. pte = origin;
  200. pte += (addr & _PAGE_INDEX) >> PAGE_SHIFT;
  201. do {
  202. next = base_page_addr_end(addr, end);
  203. *pte = base_lra(addr);
  204. } while (pte++, addr = next, addr < end);
  205. return 0;
  206. }
  207. static int base_segment_walk(unsigned long *origin, unsigned long addr,
  208. unsigned long end, int alloc)
  209. {
  210. unsigned long *ste, next, *table;
  211. int rc;
  212. ste = origin;
  213. ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
  214. do {
  215. next = base_segment_addr_end(addr, end);
  216. if (*ste & _SEGMENT_ENTRY_INVALID) {
  217. if (!alloc)
  218. continue;
  219. table = base_pgt_alloc();
  220. if (!table)
  221. return -ENOMEM;
  222. *ste = __pa(table) | _SEGMENT_ENTRY;
  223. }
  224. table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
  225. rc = base_page_walk(table, addr, next, alloc);
  226. if (rc)
  227. return rc;
  228. if (!alloc)
  229. base_pgt_free(table);
  230. cond_resched();
  231. } while (ste++, addr = next, addr < end);
  232. return 0;
  233. }
  234. static int base_region3_walk(unsigned long *origin, unsigned long addr,
  235. unsigned long end, int alloc)
  236. {
  237. unsigned long *rtte, next, *table;
  238. int rc;
  239. rtte = origin;
  240. rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
  241. do {
  242. next = base_region3_addr_end(addr, end);
  243. if (*rtte & _REGION_ENTRY_INVALID) {
  244. if (!alloc)
  245. continue;
  246. table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
  247. if (!table)
  248. return -ENOMEM;
  249. *rtte = __pa(table) | _REGION3_ENTRY;
  250. }
  251. table = __va(*rtte & _REGION_ENTRY_ORIGIN);
  252. rc = base_segment_walk(table, addr, next, alloc);
  253. if (rc)
  254. return rc;
  255. if (!alloc)
  256. base_crst_free(table);
  257. } while (rtte++, addr = next, addr < end);
  258. return 0;
  259. }
  260. static int base_region2_walk(unsigned long *origin, unsigned long addr,
  261. unsigned long end, int alloc)
  262. {
  263. unsigned long *rste, next, *table;
  264. int rc;
  265. rste = origin;
  266. rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
  267. do {
  268. next = base_region2_addr_end(addr, end);
  269. if (*rste & _REGION_ENTRY_INVALID) {
  270. if (!alloc)
  271. continue;
  272. table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
  273. if (!table)
  274. return -ENOMEM;
  275. *rste = __pa(table) | _REGION2_ENTRY;
  276. }
  277. table = __va(*rste & _REGION_ENTRY_ORIGIN);
  278. rc = base_region3_walk(table, addr, next, alloc);
  279. if (rc)
  280. return rc;
  281. if (!alloc)
  282. base_crst_free(table);
  283. } while (rste++, addr = next, addr < end);
  284. return 0;
  285. }
  286. static int base_region1_walk(unsigned long *origin, unsigned long addr,
  287. unsigned long end, int alloc)
  288. {
  289. unsigned long *rfte, next, *table;
  290. int rc;
  291. rfte = origin;
  292. rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
  293. do {
  294. next = base_region1_addr_end(addr, end);
  295. if (*rfte & _REGION_ENTRY_INVALID) {
  296. if (!alloc)
  297. continue;
  298. table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
  299. if (!table)
  300. return -ENOMEM;
  301. *rfte = __pa(table) | _REGION1_ENTRY;
  302. }
  303. table = __va(*rfte & _REGION_ENTRY_ORIGIN);
  304. rc = base_region2_walk(table, addr, next, alloc);
  305. if (rc)
  306. return rc;
  307. if (!alloc)
  308. base_crst_free(table);
  309. } while (rfte++, addr = next, addr < end);
  310. return 0;
  311. }
  312. /**
  313. * base_asce_free - free asce and tables returned from base_asce_alloc()
  314. * @asce: asce to be freed
  315. *
  316. * Frees all region, segment, and page tables that were allocated with a
  317. * corresponding base_asce_alloc() call.
  318. */
  319. void base_asce_free(unsigned long asce)
  320. {
  321. unsigned long *table = __va(asce & _ASCE_ORIGIN);
  322. if (!asce)
  323. return;
  324. switch (asce & _ASCE_TYPE_MASK) {
  325. case _ASCE_TYPE_SEGMENT:
  326. base_segment_walk(table, 0, _REGION3_SIZE, 0);
  327. break;
  328. case _ASCE_TYPE_REGION3:
  329. base_region3_walk(table, 0, _REGION2_SIZE, 0);
  330. break;
  331. case _ASCE_TYPE_REGION2:
  332. base_region2_walk(table, 0, _REGION1_SIZE, 0);
  333. break;
  334. case _ASCE_TYPE_REGION1:
  335. base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
  336. break;
  337. }
  338. base_crst_free(table);
  339. }
  340. static int base_pgt_cache_init(void)
  341. {
  342. static DEFINE_MUTEX(base_pgt_cache_mutex);
  343. unsigned long sz = _PAGE_TABLE_SIZE;
  344. if (base_pgt_cache)
  345. return 0;
  346. mutex_lock(&base_pgt_cache_mutex);
  347. if (!base_pgt_cache)
  348. base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
  349. mutex_unlock(&base_pgt_cache_mutex);
  350. return base_pgt_cache ? 0 : -ENOMEM;
  351. }
  352. /**
  353. * base_asce_alloc - create kernel mapping without enhanced DAT features
  354. * @addr: virtual start address of kernel mapping
  355. * @num_pages: number of consecutive pages
  356. *
  357. * Generate an asce, including all required region, segment and page tables,
  358. * that can be used to access the virtual kernel mapping. The difference is
  359. * that the returned asce does not make use of any enhanced DAT features like
  360. * e.g. large pages. This is required for some I/O functions that pass an
  361. * asce, like e.g. some service call requests.
  362. *
  363. * Note: the returned asce may NEVER be attached to any cpu. It may only be
  364. * used for I/O requests. tlb entries that might result because the
  365. * asce was attached to a cpu won't be cleared.
  366. */
  367. unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
  368. {
  369. unsigned long asce, *table, end;
  370. int rc;
  371. if (base_pgt_cache_init())
  372. return 0;
  373. end = addr + num_pages * PAGE_SIZE;
  374. if (end <= _REGION3_SIZE) {
  375. table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
  376. if (!table)
  377. return 0;
  378. rc = base_segment_walk(table, addr, end, 1);
  379. asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
  380. } else if (end <= _REGION2_SIZE) {
  381. table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
  382. if (!table)
  383. return 0;
  384. rc = base_region3_walk(table, addr, end, 1);
  385. asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
  386. } else if (end <= _REGION1_SIZE) {
  387. table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
  388. if (!table)
  389. return 0;
  390. rc = base_region2_walk(table, addr, end, 1);
  391. asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
  392. } else {
  393. table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
  394. if (!table)
  395. return 0;
  396. rc = base_region1_walk(table, addr, end, 1);
  397. asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
  398. }
  399. if (rc) {
  400. base_asce_free(asce);
  401. asce = 0;
  402. }
  403. return asce;
  404. }