nested.c 48 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 2017 - Columbia University and Linaro Ltd.
  4. * Author: Jintack Lim <jintack.lim@linaro.org>
  5. */
  6. #include <linux/bitfield.h>
  7. #include <linux/kvm.h>
  8. #include <linux/kvm_host.h>
  9. #include <asm/fixmap.h>
  10. #include <asm/kvm_arm.h>
  11. #include <asm/kvm_emulate.h>
  12. #include <asm/kvm_mmu.h>
  13. #include <asm/kvm_nested.h>
  14. #include <asm/sysreg.h>
  15. #include "sys_regs.h"
  16. struct vncr_tlb {
  17. /* The guest's VNCR_EL2 */
  18. u64 gva;
  19. struct s1_walk_info wi;
  20. struct s1_walk_result wr;
  21. u64 hpa;
  22. /* -1 when not mapped on a CPU */
  23. int cpu;
  24. /*
  25. * true if the TLB is valid. Can only be changed with the
  26. * mmu_lock held.
  27. */
  28. bool valid;
  29. };
  /*
   * Number of live shadow stage-2 MMUs provisioned per vcpu. This trades
   * memory usage against the number of distinct sets of S2 page tables
   * the guests may use concurrently. Running short of S2 MMUs only costs
   * performance (they get invalidated more often).
   */
  #define S2_MMU_PER_VCPU		2
  37. void kvm_init_nested(struct kvm *kvm)
  38. {
  39. kvm->arch.nested_mmus = NULL;
  40. kvm->arch.nested_mmus_size = 0;
  41. atomic_set(&kvm->arch.vncr_map_count, 0);
  42. }
  /*
   * Initialise one shadow (nested) stage-2 MMU.
   *
   * The IPA range is only set up on the canonical MMU, as that is what
   * defines the contract between KVM and userspace about where the
   * "hardware" lives in the IPA space (this matters for the validity of
   * MMIO exits forwarded to userspace, for instance).
   *
   * A nested S2 is instead sized from the PARange exposed to the guest:
   * the guest hypervisor is free to use all of it to build whatever
   * memory map it wants for its own guests, just as it would on real HW.
   *
   * Returns whatever kvm_init_stage2_mmu() returns.
   */
  static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
  {
  	return kvm_init_stage2_mmu(kvm, mmu, kvm_get_pa_bits(kvm));
  }
  57. int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
  58. {
  59. struct kvm *kvm = vcpu->kvm;
  60. struct kvm_s2_mmu *tmp;
  61. int num_mmus, ret = 0;
  62. if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) &&
  63. !cpus_have_final_cap(ARM64_HAS_HCR_NV1))
  64. return -EINVAL;
  65. if (!vcpu->arch.ctxt.vncr_array)
  66. vcpu->arch.ctxt.vncr_array = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT |
  67. __GFP_ZERO);
  68. if (!vcpu->arch.ctxt.vncr_array)
  69. return -ENOMEM;
  70. /*
  71. * Let's treat memory allocation failures as benign: If we fail to
  72. * allocate anything, return an error and keep the allocated array
  73. * alive. Userspace may try to recover by initializing the vcpu
  74. * again, and there is no reason to affect the whole VM for this.
  75. */
  76. num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU;
  77. tmp = kvrealloc(kvm->arch.nested_mmus,
  78. size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus),
  79. GFP_KERNEL_ACCOUNT | __GFP_ZERO);
  80. if (!tmp)
  81. return -ENOMEM;
  82. swap(kvm->arch.nested_mmus, tmp);
  83. /*
  84. * If we went through a realocation, adjust the MMU back-pointers in
  85. * the previously initialised kvm_pgtable structures.
  86. */
  87. if (kvm->arch.nested_mmus != tmp)
  88. for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
  89. kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i];
  90. for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
  91. ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]);
  92. if (ret) {
  93. for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
  94. kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);
  95. free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
  96. vcpu->arch.ctxt.vncr_array = NULL;
  97. return ret;
  98. }
  99. kvm->arch.nested_mmus_size = num_mmus;
  100. return 0;
  101. }
  102. struct s2_walk_info {
  103. u64 baddr;
  104. unsigned int max_oa_bits;
  105. unsigned int pgshift;
  106. unsigned int sl;
  107. unsigned int t0sz;
  108. bool be;
  109. bool ha;
  110. };
  111. static u32 compute_fsc(int level, u32 fsc)
  112. {
  113. return fsc | (level & 0x3);
  114. }
  115. static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
  116. {
  117. u32 esr;
  118. esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC;
  119. esr |= compute_fsc(level, fsc);
  120. return esr;
  121. }
  122. static int get_ia_size(struct s2_walk_info *wi)
  123. {
  124. return 64 - wi->t0sz;
  125. }
  126. static int check_base_s2_limits(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
  127. int level, int input_size, int stride)
  128. {
  129. int start_size, pa_max;
  130. pa_max = kvm_get_pa_bits(vcpu->kvm);
  131. /* Check translation limits */
  132. switch (BIT(wi->pgshift)) {
  133. case SZ_64K:
  134. if (level == 0 || (level == 1 && pa_max <= 42))
  135. return -EFAULT;
  136. break;
  137. case SZ_16K:
  138. if (level == 0 || (level == 1 && pa_max <= 40))
  139. return -EFAULT;
  140. break;
  141. case SZ_4K:
  142. if (level < 0 || (level == 0 && pa_max <= 42))
  143. return -EFAULT;
  144. break;
  145. }
  146. /* Check input size limits */
  147. if (input_size > pa_max)
  148. return -EFAULT;
  149. /* Check number of entries in starting level table */
  150. start_size = input_size - ((3 - level) * stride + wi->pgshift);
  151. if (start_size < 1 || start_size > stride + 4)
  152. return -EFAULT;
  153. return 0;
  154. }
  155. /* Check if output is within boundaries */
  156. static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
  157. {
  158. unsigned int output_size = wi->max_oa_bits;
  159. if (output_size != 48 && (output & GENMASK_ULL(47, output_size)))
  160. return -1;
  161. return 0;
  162. }
  163. static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc,
  164. struct s2_walk_info *wi)
  165. {
  166. u64 val;
  167. int r;
  168. r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
  169. if (r)
  170. return r;
  171. /*
  172. * Handle reversedescriptors if endianness differs between the
  173. * host and the guest hypervisor.
  174. */
  175. if (wi->be)
  176. *desc = be64_to_cpu((__force __be64)val);
  177. else
  178. *desc = le64_to_cpu((__force __le64)val);
  179. return 0;
  180. }
  181. static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new,
  182. struct s2_walk_info *wi)
  183. {
  184. if (wi->be) {
  185. old = (__force u64)cpu_to_be64(old);
  186. new = (__force u64)cpu_to_be64(new);
  187. } else {
  188. old = (__force u64)cpu_to_le64(old);
  189. new = (__force u64)cpu_to_le64(new);
  190. }
  191. return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
  192. }
  193. /*
  194. * This is essentially a C-version of the pseudo code from the ARM ARM
  195. * AArch64.TranslationTableWalk function. I strongly recommend looking at
  196. * that pseudocode in trying to understand this.
  197. *
  198. * Must be called with the kvm->srcu read lock held
  199. */
  200. static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
  201. struct s2_walk_info *wi, struct kvm_s2_trans *out)
  202. {
  203. int first_block_level, level, stride, input_size, base_lower_bound;
  204. phys_addr_t base_addr;
  205. unsigned int addr_top, addr_bottom;
  206. u64 desc, new_desc; /* page table entry */
  207. int ret;
  208. phys_addr_t paddr;
  209. switch (BIT(wi->pgshift)) {
  210. default:
  211. case SZ_64K:
  212. case SZ_16K:
  213. level = 3 - wi->sl;
  214. first_block_level = 2;
  215. break;
  216. case SZ_4K:
  217. level = 2 - wi->sl;
  218. first_block_level = 1;
  219. break;
  220. }
  221. stride = wi->pgshift - 3;
  222. input_size = get_ia_size(wi);
  223. if (input_size > 48 || input_size < 25)
  224. return -EFAULT;
  225. ret = check_base_s2_limits(vcpu, wi, level, input_size, stride);
  226. if (WARN_ON(ret)) {
  227. out->esr = compute_fsc(0, ESR_ELx_FSC_FAULT);
  228. return ret;
  229. }
  230. base_lower_bound = 3 + input_size - ((3 - level) * stride +
  231. wi->pgshift);
  232. base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound);
  233. if (check_output_size(wi, base_addr)) {
  234. /* R_BFHQH */
  235. out->esr = compute_fsc(0, ESR_ELx_FSC_ADDRSZ);
  236. return 1;
  237. }
  238. addr_top = input_size - 1;
  239. while (1) {
  240. phys_addr_t index;
  241. addr_bottom = (3 - level) * stride + wi->pgshift;
  242. index = (ipa & GENMASK_ULL(addr_top, addr_bottom))
  243. >> (addr_bottom - 3);
  244. paddr = base_addr | index;
  245. ret = read_guest_s2_desc(vcpu, paddr, &desc, wi);
  246. if (ret < 0) {
  247. out->esr = ESR_ELx_FSC_SEA_TTW(level);
  248. return ret;
  249. }
  250. new_desc = desc;
  251. /* Check for valid descriptor at this point */
  252. if (!(desc & KVM_PTE_VALID)) {
  253. out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
  254. out->desc = desc;
  255. return 1;
  256. }
  257. if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) {
  258. if (level < 3)
  259. break;
  260. out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
  261. out->desc = desc;
  262. return 1;
  263. }
  264. /* We're at the final level */
  265. if (level == 3)
  266. break;
  267. if (check_output_size(wi, desc)) {
  268. out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
  269. out->desc = desc;
  270. return 1;
  271. }
  272. base_addr = desc & GENMASK_ULL(47, wi->pgshift);
  273. level += 1;
  274. addr_top = addr_bottom - 1;
  275. }
  276. if (level < first_block_level) {
  277. out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
  278. out->desc = desc;
  279. return 1;
  280. }
  281. if (check_output_size(wi, desc)) {
  282. out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
  283. out->desc = desc;
  284. return 1;
  285. }
  286. if (wi->ha)
  287. new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
  288. if (new_desc != desc) {
  289. ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi);
  290. if (ret)
  291. return ret;
  292. desc = new_desc;
  293. }
  294. if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
  295. out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
  296. out->desc = desc;
  297. return 1;
  298. }
  299. addr_bottom += contiguous_bit_shift(desc, wi, level);
  300. /* Calculate and return the result */
  301. paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
  302. (ipa & GENMASK_ULL(addr_bottom - 1, 0));
  303. out->output = paddr;
  304. out->block_size = 1UL << ((3 - level) * stride + wi->pgshift);
  305. out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
  306. out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
  307. out->level = level;
  308. out->desc = desc;
  309. return 0;
  310. }
  311. static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
  312. {
  313. wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK;
  314. switch (FIELD_GET(VTCR_EL2_TG0_MASK, vtcr)) {
  315. case VTCR_EL2_TG0_4K:
  316. wi->pgshift = 12; break;
  317. case VTCR_EL2_TG0_16K:
  318. wi->pgshift = 14; break;
  319. case VTCR_EL2_TG0_64K:
  320. default: /* IMPDEF: treat any other value as 64k */
  321. wi->pgshift = 16; break;
  322. }
  323. wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
  324. /* Global limit for now, should eventually be per-VM */
  325. wi->max_oa_bits = min(get_kvm_ipa_limit(),
  326. ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false));
  327. wi->ha = vtcr & VTCR_EL2_HA;
  328. }
  329. int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
  330. struct kvm_s2_trans *result)
  331. {
  332. u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
  333. struct s2_walk_info wi;
  334. int ret;
  335. result->esr = 0;
  336. if (!vcpu_has_nv(vcpu))
  337. return 0;
  338. wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
  339. vtcr_to_walk_info(vtcr, &wi);
  340. wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
  341. ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result);
  342. if (ret)
  343. result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC);
  344. return ret;
  345. }
  346. static unsigned int ttl_to_size(u8 ttl)
  347. {
  348. int level = ttl & 3;
  349. int gran = (ttl >> 2) & 3;
  350. unsigned int max_size = 0;
  351. switch (gran) {
  352. case TLBI_TTL_TG_4K:
  353. switch (level) {
  354. case 0:
  355. break;
  356. case 1:
  357. max_size = SZ_1G;
  358. break;
  359. case 2:
  360. max_size = SZ_2M;
  361. break;
  362. case 3:
  363. max_size = SZ_4K;
  364. break;
  365. }
  366. break;
  367. case TLBI_TTL_TG_16K:
  368. switch (level) {
  369. case 0:
  370. case 1:
  371. break;
  372. case 2:
  373. max_size = SZ_32M;
  374. break;
  375. case 3:
  376. max_size = SZ_16K;
  377. break;
  378. }
  379. break;
  380. case TLBI_TTL_TG_64K:
  381. switch (level) {
  382. case 0:
  383. case 1:
  384. /* No 52bit IPA support */
  385. break;
  386. case 2:
  387. max_size = SZ_512M;
  388. break;
  389. case 3:
  390. max_size = SZ_64K;
  391. break;
  392. }
  393. break;
  394. default: /* No size information */
  395. break;
  396. }
  397. return max_size;
  398. }
  399. static u8 pgshift_level_to_ttl(u16 shift, u8 level)
  400. {
  401. u8 ttl;
  402. switch(shift) {
  403. case 12:
  404. ttl = TLBI_TTL_TG_4K;
  405. break;
  406. case 14:
  407. ttl = TLBI_TTL_TG_16K;
  408. break;
  409. case 16:
  410. ttl = TLBI_TTL_TG_64K;
  411. break;
  412. default:
  413. BUG();
  414. }
  415. ttl <<= 2;
  416. ttl |= level & 3;
  417. return ttl;
  418. }
  419. /*
  420. * Compute the equivalent of the TTL field by parsing the shadow PT. The
  421. * granule size is extracted from the cached VTCR_EL2.TG0 while the level is
  422. * retrieved from first entry carrying the level as a tag.
  423. */
  424. static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
  425. {
  426. u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr;
  427. kvm_pte_t pte;
  428. u8 ttl, level;
  429. lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock);
  430. switch (FIELD_GET(VTCR_EL2_TG0_MASK, vtcr)) {
  431. case VTCR_EL2_TG0_4K:
  432. ttl = (TLBI_TTL_TG_4K << 2);
  433. break;
  434. case VTCR_EL2_TG0_16K:
  435. ttl = (TLBI_TTL_TG_16K << 2);
  436. break;
  437. case VTCR_EL2_TG0_64K:
  438. default: /* IMPDEF: treat any other value as 64k */
  439. ttl = (TLBI_TTL_TG_64K << 2);
  440. break;
  441. }
  442. tmp = addr;
  443. again:
  444. /* Iteratively compute the block sizes for a particular granule size */
  445. switch (FIELD_GET(VTCR_EL2_TG0_MASK, vtcr)) {
  446. case VTCR_EL2_TG0_4K:
  447. if (sz < SZ_4K) sz = SZ_4K;
  448. else if (sz < SZ_2M) sz = SZ_2M;
  449. else if (sz < SZ_1G) sz = SZ_1G;
  450. else sz = 0;
  451. break;
  452. case VTCR_EL2_TG0_16K:
  453. if (sz < SZ_16K) sz = SZ_16K;
  454. else if (sz < SZ_32M) sz = SZ_32M;
  455. else sz = 0;
  456. break;
  457. case VTCR_EL2_TG0_64K:
  458. default: /* IMPDEF: treat any other value as 64k */
  459. if (sz < SZ_64K) sz = SZ_64K;
  460. else if (sz < SZ_512M) sz = SZ_512M;
  461. else sz = 0;
  462. break;
  463. }
  464. if (sz == 0)
  465. return 0;
  466. tmp &= ~(sz - 1);
  467. if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
  468. goto again;
  469. if (!(pte & PTE_VALID))
  470. goto again;
  471. level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte);
  472. if (!level)
  473. goto again;
  474. ttl |= level;
  475. /*
  476. * We now have found some level information in the shadow S2. Check
  477. * that the resulting range is actually including the original IPA.
  478. */
  479. sz = ttl_to_size(ttl);
  480. if (addr < (tmp + sz))
  481. return ttl;
  482. return 0;
  483. }
  484. unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val)
  485. {
  486. struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
  487. unsigned long max_size;
  488. u8 ttl;
  489. ttl = FIELD_GET(TLBI_TTL_MASK, val);
  490. if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP)) {
  491. /* No TTL, check the shadow S2 for a hint */
  492. u64 addr = (val & GENMASK_ULL(35, 0)) << 12;
  493. ttl = get_guest_mapping_ttl(mmu, addr);
  494. }
  495. max_size = ttl_to_size(ttl);
  496. if (!max_size) {
  497. /* Compute the maximum extent of the invalidation */
  498. switch (FIELD_GET(VTCR_EL2_TG0_MASK, mmu->tlb_vtcr)) {
  499. case VTCR_EL2_TG0_4K:
  500. max_size = SZ_1G;
  501. break;
  502. case VTCR_EL2_TG0_16K:
  503. max_size = SZ_32M;
  504. break;
  505. case VTCR_EL2_TG0_64K:
  506. default: /* IMPDEF: treat any other value as 64k */
  507. /*
  508. * No, we do not support 52bit IPA in nested yet. Once
  509. * we do, this should be 4TB.
  510. */
  511. max_size = SZ_512M;
  512. break;
  513. }
  514. }
  515. WARN_ON(!max_size);
  516. return max_size;
  517. }
  518. /*
  519. * We can have multiple *different* MMU contexts with the same VMID:
  520. *
  521. * - S2 being enabled or not, hence differing by the HCR_EL2.VM bit
  522. *
  523. * - Multiple vcpus using private S2s (huh huh...), hence differing by the
  524. * VBBTR_EL2.BADDR address
  525. *
  526. * - A combination of the above...
  527. *
  528. * We can always identify which MMU context to pick at run-time. However,
  529. * TLB invalidation involving a VMID must take action on all the TLBs using
  530. * this particular VMID. This translates into applying the same invalidation
  531. * operation to all the contexts that are using this VMID. Moar phun!
  532. */
  533. void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
  534. const union tlbi_info *info,
  535. void (*tlbi_callback)(struct kvm_s2_mmu *,
  536. const union tlbi_info *))
  537. {
  538. write_lock(&kvm->mmu_lock);
  539. for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
  540. struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
  541. if (!kvm_s2_mmu_valid(mmu))
  542. continue;
  543. if (vmid == get_vmid(mmu->tlb_vttbr))
  544. tlbi_callback(mmu, info);
  545. }
  546. write_unlock(&kvm->mmu_lock);
  547. }
  548. struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
  549. {
  550. struct kvm *kvm = vcpu->kvm;
  551. bool nested_stage2_enabled;
  552. u64 vttbr, vtcr, hcr;
  553. lockdep_assert_held_write(&kvm->mmu_lock);
  554. vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
  555. vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
  556. hcr = vcpu_read_sys_reg(vcpu, HCR_EL2);
  557. nested_stage2_enabled = hcr & HCR_VM;
  558. /* Don't consider the CnP bit for the vttbr match */
  559. vttbr &= ~VTTBR_CNP_BIT;
  560. /*
  561. * Two possibilities when looking up a S2 MMU context:
  562. *
  563. * - either S2 is enabled in the guest, and we need a context that is
  564. * S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR,
  565. * which makes it safe from a TLB conflict perspective (a broken
  566. * guest won't be able to generate them),
  567. *
  568. * - or S2 is disabled, and we need a context that is S2-disabled
  569. * and matches the VMID only, as all TLBs are tagged by VMID even
  570. * if S2 translation is disabled.
  571. */
  572. for (int i = 0; i < kvm->arch.nested_mmus_size; i++) {
  573. struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
  574. if (!kvm_s2_mmu_valid(mmu))
  575. continue;
  576. if (nested_stage2_enabled &&
  577. mmu->nested_stage2_enabled &&
  578. vttbr == mmu->tlb_vttbr &&
  579. vtcr == mmu->tlb_vtcr)
  580. return mmu;
  581. if (!nested_stage2_enabled &&
  582. !mmu->nested_stage2_enabled &&
  583. get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr))
  584. return mmu;
  585. }
  586. return NULL;
  587. }
  588. static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
  589. {
  590. struct kvm *kvm = vcpu->kvm;
  591. struct kvm_s2_mmu *s2_mmu;
  592. int i;
  593. lockdep_assert_held_write(&vcpu->kvm->mmu_lock);
  594. s2_mmu = lookup_s2_mmu(vcpu);
  595. if (s2_mmu)
  596. goto out;
  597. /*
  598. * Make sure we don't always search from the same point, or we
  599. * will always reuse a potentially active context, leaving
  600. * free contexts unused.
  601. */
  602. for (i = kvm->arch.nested_mmus_next;
  603. i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next);
  604. i++) {
  605. s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size];
  606. if (atomic_read(&s2_mmu->refcnt) == 0)
  607. break;
  608. }
  609. BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */
  610. /* Set the scene for the next search */
  611. kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size;
  612. /* Make sure we don't forget to do the laundry */
  613. if (kvm_s2_mmu_valid(s2_mmu))
  614. s2_mmu->pending_unmap = true;
  615. /*
  616. * The virtual VMID (modulo CnP) will be used as a key when matching
  617. * an existing kvm_s2_mmu.
  618. *
  619. * We cache VTCR at allocation time, once and for all. It'd be great
  620. * if the guest didn't screw that one up, as this is not very
  621. * forgiving...
  622. */
  623. s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT;
  624. s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
  625. s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;
  626. out:
  627. atomic_inc(&s2_mmu->refcnt);
  628. /*
  629. * Set the vCPU request to perform an unmap, even if the pending unmap
  630. * originates from another vCPU. This guarantees that the MMU has been
  631. * completely unmapped before any vCPU actually uses it, and allows
  632. * multiple vCPUs to lend a hand with completing the unmap.
  633. */
  634. if (s2_mmu->pending_unmap)
  635. kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu);
  636. return s2_mmu;
  637. }
  638. void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
  639. {
  640. /* CnP being set denotes an invalid entry */
  641. mmu->tlb_vttbr = VTTBR_CNP_BIT;
  642. mmu->nested_stage2_enabled = false;
  643. atomic_set(&mmu->refcnt, 0);
  644. }
  645. void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
  646. {
  647. /*
  648. * If the vCPU kept its reference on the MMU after the last put,
  649. * keep rolling with it.
  650. */
  651. if (is_hyp_ctxt(vcpu)) {
  652. if (!vcpu->arch.hw_mmu)
  653. vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
  654. } else {
  655. if (!vcpu->arch.hw_mmu) {
  656. scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
  657. vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
  658. }
  659. if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV)
  660. kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
  661. }
  662. }
  663. void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
  664. {
  665. /* Unconditionally drop the VNCR mapping if we have one */
  666. if (host_data_test_flag(L1_VNCR_MAPPED)) {
  667. BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id());
  668. BUG_ON(is_hyp_ctxt(vcpu));
  669. clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu));
  670. vcpu->arch.vncr_tlb->cpu = -1;
  671. host_data_clear_flag(L1_VNCR_MAPPED);
  672. atomic_dec(&vcpu->kvm->arch.vncr_map_count);
  673. }
  674. /*
  675. * Keep a reference on the associated stage-2 MMU if the vCPU is
  676. * scheduling out and not in WFI emulation, suggesting it is likely to
  677. * reuse the MMU sometime soon.
  678. */
  679. if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI))
  680. return;
  681. if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu))
  682. atomic_dec(&vcpu->arch.hw_mmu->refcnt);
  683. vcpu->arch.hw_mmu = NULL;
  684. }
  685. /*
  686. * Returns non-zero if permission fault is handled by injecting it to the next
  687. * level hypervisor.
  688. */
  689. int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
  690. {
  691. bool forward_fault = false;
  692. trans->esr = 0;
  693. if (!kvm_vcpu_trap_is_permission_fault(vcpu))
  694. return 0;
  695. if (kvm_vcpu_trap_is_iabt(vcpu)) {
  696. if (vcpu_mode_priv(vcpu))
  697. forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans);
  698. else
  699. forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans);
  700. } else {
  701. bool write_fault = kvm_is_write_fault(vcpu);
  702. forward_fault = ((write_fault && !trans->writable) ||
  703. (!write_fault && !trans->readable));
  704. }
  705. if (forward_fault)
  706. trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
  707. return forward_fault;
  708. }
  709. int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
  710. {
  711. vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2);
  712. vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2);
  713. return kvm_inject_nested_sync(vcpu, esr_el2);
  714. }
  715. u16 get_asid_by_regime(struct kvm_vcpu *vcpu, enum trans_regime regime)
  716. {
  717. enum vcpu_sysreg ttbr_elx;
  718. u64 tcr;
  719. u16 asid;
  720. switch (regime) {
  721. case TR_EL10:
  722. tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
  723. ttbr_elx = (tcr & TCR_A1) ? TTBR1_EL1 : TTBR0_EL1;
  724. break;
  725. case TR_EL20:
  726. tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
  727. ttbr_elx = (tcr & TCR_A1) ? TTBR1_EL2 : TTBR0_EL2;
  728. break;
  729. default:
  730. BUG();
  731. }
  732. asid = FIELD_GET(TTBRx_EL1_ASID, vcpu_read_sys_reg(vcpu, ttbr_elx));
  733. if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
  734. !(tcr & TCR_ASID16))
  735. asid &= GENMASK(7, 0);
  736. return asid;
  737. }
  738. static void invalidate_vncr(struct vncr_tlb *vt)
  739. {
  740. vt->valid = false;
  741. if (vt->cpu != -1)
  742. clear_fixmap(vncr_fixmap(vt->cpu));
  743. }
  744. static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end)
  745. {
  746. struct kvm_vcpu *vcpu;
  747. unsigned long i;
  748. lockdep_assert_held_write(&kvm->mmu_lock);
  749. if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
  750. return;
  751. kvm_for_each_vcpu(i, vcpu, kvm) {
  752. struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
  753. u64 ipa_start, ipa_end, ipa_size;
  754. /*
  755. * Careful here: We end-up here from an MMU notifier,
  756. * and this can race against a vcpu not being onlined
  757. * yet, without the pseudo-TLB being allocated.
  758. *
  759. * Skip those, as they obviously don't participate in
  760. * the invalidation at this stage.
  761. */
  762. if (!vt)
  763. continue;
  764. if (!vt->valid)
  765. continue;
  766. ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
  767. vt->wr.level));
  768. ipa_start = vt->wr.pa & ~(ipa_size - 1);
  769. ipa_end = ipa_start + ipa_size;
  770. if (ipa_end <= start || ipa_start >= end)
  771. continue;
  772. invalidate_vncr(vt);
  773. }
  774. }
/*
 * Decoded scope of a stage-1 TLBI, used to decide which VNCR pseudo-TLB
 * entries must be invalidated.
 */
struct s1e2_tlbi_scope {
	enum {
		TLBI_ALL,	/* matches every valid entry */
		TLBI_VA,	/* matches on VA range, and on ASID for nG entries */
		TLBI_VAA,	/* matches on VA range only */
		TLBI_ASID,	/* matches nG entries with the given ASID */
	} type;

	u16 asid;	/* ASID to match (TLBI_VA, TLBI_ASID) */
	u64 va;		/* start of the VA range (TLBI_VA, TLBI_VAA) */
	u64 size;	/* size of the VA range, in bytes (TLBI_VA, TLBI_VAA) */
};
  786. static void invalidate_vncr_va(struct kvm *kvm,
  787. struct s1e2_tlbi_scope *scope)
  788. {
  789. struct kvm_vcpu *vcpu;
  790. unsigned long i;
  791. lockdep_assert_held_write(&kvm->mmu_lock);
  792. kvm_for_each_vcpu(i, vcpu, kvm) {
  793. struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
  794. u64 va_start, va_end, va_size;
  795. if (!vt->valid)
  796. continue;
  797. va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
  798. vt->wr.level));
  799. va_start = vt->gva & ~(va_size - 1);
  800. va_end = va_start + va_size;
  801. switch (scope->type) {
  802. case TLBI_ALL:
  803. break;
  804. case TLBI_VA:
  805. if (va_end <= scope->va ||
  806. va_start >= (scope->va + scope->size))
  807. continue;
  808. if (vt->wr.nG && vt->wr.asid != scope->asid)
  809. continue;
  810. break;
  811. case TLBI_VAA:
  812. if (va_end <= scope->va ||
  813. va_start >= (scope->va + scope->size))
  814. continue;
  815. break;
  816. case TLBI_ASID:
  817. if (!vt->wr.nG || vt->wr.asid != scope->asid)
  818. continue;
  819. break;
  820. }
  821. invalidate_vncr(vt);
  822. }
  823. }
  824. #define tlbi_va_s1_to_va(v) (u64)sign_extend64((v) << 12, 48)
  825. static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val,
  826. struct s1e2_tlbi_scope *scope)
  827. {
  828. switch (inst) {
  829. case OP_TLBI_ALLE2:
  830. case OP_TLBI_ALLE2IS:
  831. case OP_TLBI_ALLE2OS:
  832. case OP_TLBI_VMALLE1:
  833. case OP_TLBI_VMALLE1IS:
  834. case OP_TLBI_VMALLE1OS:
  835. case OP_TLBI_ALLE2NXS:
  836. case OP_TLBI_ALLE2ISNXS:
  837. case OP_TLBI_ALLE2OSNXS:
  838. case OP_TLBI_VMALLE1NXS:
  839. case OP_TLBI_VMALLE1ISNXS:
  840. case OP_TLBI_VMALLE1OSNXS:
  841. scope->type = TLBI_ALL;
  842. break;
  843. case OP_TLBI_VAE2:
  844. case OP_TLBI_VAE2IS:
  845. case OP_TLBI_VAE2OS:
  846. case OP_TLBI_VAE1:
  847. case OP_TLBI_VAE1IS:
  848. case OP_TLBI_VAE1OS:
  849. case OP_TLBI_VAE2NXS:
  850. case OP_TLBI_VAE2ISNXS:
  851. case OP_TLBI_VAE2OSNXS:
  852. case OP_TLBI_VAE1NXS:
  853. case OP_TLBI_VAE1ISNXS:
  854. case OP_TLBI_VAE1OSNXS:
  855. case OP_TLBI_VALE2:
  856. case OP_TLBI_VALE2IS:
  857. case OP_TLBI_VALE2OS:
  858. case OP_TLBI_VALE1:
  859. case OP_TLBI_VALE1IS:
  860. case OP_TLBI_VALE1OS:
  861. case OP_TLBI_VALE2NXS:
  862. case OP_TLBI_VALE2ISNXS:
  863. case OP_TLBI_VALE2OSNXS:
  864. case OP_TLBI_VALE1NXS:
  865. case OP_TLBI_VALE1ISNXS:
  866. case OP_TLBI_VALE1OSNXS:
  867. scope->type = TLBI_VA;
  868. scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
  869. if (!scope->size)
  870. scope->size = SZ_1G;
  871. scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1);
  872. scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
  873. break;
  874. case OP_TLBI_ASIDE1:
  875. case OP_TLBI_ASIDE1IS:
  876. case OP_TLBI_ASIDE1OS:
  877. case OP_TLBI_ASIDE1NXS:
  878. case OP_TLBI_ASIDE1ISNXS:
  879. case OP_TLBI_ASIDE1OSNXS:
  880. scope->type = TLBI_ASID;
  881. scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
  882. break;
  883. case OP_TLBI_VAAE1:
  884. case OP_TLBI_VAAE1IS:
  885. case OP_TLBI_VAAE1OS:
  886. case OP_TLBI_VAAE1NXS:
  887. case OP_TLBI_VAAE1ISNXS:
  888. case OP_TLBI_VAAE1OSNXS:
  889. case OP_TLBI_VAALE1:
  890. case OP_TLBI_VAALE1IS:
  891. case OP_TLBI_VAALE1OS:
  892. case OP_TLBI_VAALE1NXS:
  893. case OP_TLBI_VAALE1ISNXS:
  894. case OP_TLBI_VAALE1OSNXS:
  895. scope->type = TLBI_VAA;
  896. scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
  897. if (!scope->size)
  898. scope->size = SZ_1G;
  899. scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1);
  900. break;
  901. case OP_TLBI_RVAE2:
  902. case OP_TLBI_RVAE2IS:
  903. case OP_TLBI_RVAE2OS:
  904. case OP_TLBI_RVAE1:
  905. case OP_TLBI_RVAE1IS:
  906. case OP_TLBI_RVAE1OS:
  907. case OP_TLBI_RVAE2NXS:
  908. case OP_TLBI_RVAE2ISNXS:
  909. case OP_TLBI_RVAE2OSNXS:
  910. case OP_TLBI_RVAE1NXS:
  911. case OP_TLBI_RVAE1ISNXS:
  912. case OP_TLBI_RVAE1OSNXS:
  913. case OP_TLBI_RVALE2:
  914. case OP_TLBI_RVALE2IS:
  915. case OP_TLBI_RVALE2OS:
  916. case OP_TLBI_RVALE1:
  917. case OP_TLBI_RVALE1IS:
  918. case OP_TLBI_RVALE1OS:
  919. case OP_TLBI_RVALE2NXS:
  920. case OP_TLBI_RVALE2ISNXS:
  921. case OP_TLBI_RVALE2OSNXS:
  922. case OP_TLBI_RVALE1NXS:
  923. case OP_TLBI_RVALE1ISNXS:
  924. case OP_TLBI_RVALE1OSNXS:
  925. scope->type = TLBI_VA;
  926. scope->va = decode_range_tlbi(val, &scope->size, &scope->asid);
  927. break;
  928. case OP_TLBI_RVAAE1:
  929. case OP_TLBI_RVAAE1IS:
  930. case OP_TLBI_RVAAE1OS:
  931. case OP_TLBI_RVAAE1NXS:
  932. case OP_TLBI_RVAAE1ISNXS:
  933. case OP_TLBI_RVAAE1OSNXS:
  934. case OP_TLBI_RVAALE1:
  935. case OP_TLBI_RVAALE1IS:
  936. case OP_TLBI_RVAALE1OS:
  937. case OP_TLBI_RVAALE1NXS:
  938. case OP_TLBI_RVAALE1ISNXS:
  939. case OP_TLBI_RVAALE1OSNXS:
  940. scope->type = TLBI_VAA;
  941. scope->va = decode_range_tlbi(val, &scope->size, NULL);
  942. break;
  943. }
  944. }
  945. void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val)
  946. {
  947. struct s1e2_tlbi_scope scope = {};
  948. compute_s1_tlbi_range(vcpu, inst, val, &scope);
  949. guard(write_lock)(&vcpu->kvm->mmu_lock);
  950. invalidate_vncr_va(vcpu->kvm, &scope);
  951. }
  952. void kvm_nested_s2_wp(struct kvm *kvm)
  953. {
  954. int i;
  955. lockdep_assert_held_write(&kvm->mmu_lock);
  956. if (!kvm->arch.nested_mmus_size)
  957. return;
  958. for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
  959. struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
  960. if (kvm_s2_mmu_valid(mmu))
  961. kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu));
  962. }
  963. kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
  964. }
  965. void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
  966. {
  967. int i;
  968. lockdep_assert_held_write(&kvm->mmu_lock);
  969. if (!kvm->arch.nested_mmus_size)
  970. return;
  971. for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
  972. struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
  973. if (kvm_s2_mmu_valid(mmu))
  974. kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
  975. }
  976. kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
  977. }
  978. void kvm_nested_s2_flush(struct kvm *kvm)
  979. {
  980. int i;
  981. lockdep_assert_held_write(&kvm->mmu_lock);
  982. if (!kvm->arch.nested_mmus_size)
  983. return;
  984. for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
  985. struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
  986. if (kvm_s2_mmu_valid(mmu))
  987. kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu));
  988. }
  989. }
  990. void kvm_arch_flush_shadow_all(struct kvm *kvm)
  991. {
  992. int i;
  993. for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
  994. struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
  995. if (!WARN_ON(atomic_read(&mmu->refcnt)))
  996. kvm_free_stage2_pgd(mmu);
  997. }
  998. kvfree(kvm->arch.nested_mmus);
  999. kvm->arch.nested_mmus = NULL;
  1000. kvm->arch.nested_mmus_size = 0;
  1001. kvm_uninit_stage2_mmu(kvm);
  1002. }
  1003. /*
  1004. * Dealing with VNCR_EL2 exposed by the *guest* is a complicated matter:
  1005. *
  1006. * - We introduce an internal representation of a vcpu-private TLB,
  1007. * representing the mapping between the guest VA contained in VNCR_EL2,
  1008. * the IPA the guest's EL2 PTs point to, and the actual PA this lives at.
  1009. *
  1010. * - On translation fault from a nested VNCR access, we create such a TLB.
  1011. * If there is no mapping to describe, the guest inherits the fault.
  1012. * Crucially, no actual mapping is done at this stage.
  1013. *
  1014. * - On vcpu_load() in a non-HYP context with HCR_EL2.NV==1, if the above
  1015. * TLB exists, we map it in the fixmap for this CPU, and run with it. We
  1016. * have to respect the permissions dictated by the guest, but not the
  1017. * memory type (FWB is a must).
  1018. *
  1019. * - Note that we usually don't do a vcpu_load() on the back of a fault
  1020. * (unless we are preempted), so the resolution of a translation fault
  1021. * must go via a request that will map the VNCR page in the fixmap.
  1022. * vcpu_load() might as well use the same mechanism.
  1023. *
  1024. * - On vcpu_put() in a non-HYP context with HCR_EL2.NV==1, if the TLB was
  1025. * mapped, we unmap it. Yes it is that simple. The TLB still exists
  1026. * though, and may be reused at a later load.
  1027. *
  1028. * - On permission fault, we simply forward the fault to the guest's EL2.
  1029. * Get out of my way.
  1030. *
  1031. * - On any TLBI for the EL2&0 translation regime, we must find any TLB that
  1032. * intersects with the TLBI request, invalidate it, and unmap the page
  1033. * from the fixmap. Because we need to look at all the vcpu-private TLBs,
  1034. * this requires some wide-ranging locking to ensure that nothing races
  1035. * against it. This may require some refcounting to avoid the search when
  1036. * no such TLB is present.
  1037. *
  1038. * - On MMU notifiers, we must invalidate our TLB in a similar way, but
  1039. * looking at the IPA instead. The funny part is that there may not be a
  1040. * stage-2 mapping for this page if L1 hasn't accessed it using LD/ST
  1041. * instructions.
  1042. */
  1043. int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu)
  1044. {
  1045. if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
  1046. return 0;
  1047. vcpu->arch.vncr_tlb = kzalloc_obj(*vcpu->arch.vncr_tlb,
  1048. GFP_KERNEL_ACCOUNT);
  1049. if (!vcpu->arch.vncr_tlb)
  1050. return -ENOMEM;
  1051. return 0;
  1052. }
  1053. static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
  1054. {
  1055. return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
  1056. }
  1057. static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
  1058. {
  1059. struct kvm_memory_slot *memslot;
  1060. bool write_fault, writable;
  1061. unsigned long mmu_seq;
  1062. struct vncr_tlb *vt;
  1063. struct page *page;
  1064. u64 va, pfn, gfn;
  1065. int ret;
  1066. vt = vcpu->arch.vncr_tlb;
  1067. /*
  1068. * If we're about to walk the EL2 S1 PTs, we must invalidate the
  1069. * current TLB, as it could be sampled from another vcpu doing a
  1070. * TLBI *IS. A real CPU wouldn't do that, but we only keep a single
  1071. * translation, so not much of a choice.
  1072. *
  1073. * We also prepare the next walk wilst we're at it.
  1074. */
  1075. scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
  1076. invalidate_vncr(vt);
  1077. vt->wi = (struct s1_walk_info) {
  1078. .regime = TR_EL20,
  1079. .as_el0 = false,
  1080. .pan = false,
  1081. };
  1082. vt->wr = (struct s1_walk_result){};
  1083. }
  1084. guard(srcu)(&vcpu->kvm->srcu);
  1085. va = read_vncr_el2(vcpu);
  1086. ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va);
  1087. if (ret)
  1088. return ret;
  1089. write_fault = kvm_is_write_fault(vcpu);
  1090. mmu_seq = vcpu->kvm->mmu_invalidate_seq;
  1091. smp_rmb();
  1092. gfn = vt->wr.pa >> PAGE_SHIFT;
  1093. memslot = gfn_to_memslot(vcpu->kvm, gfn);
  1094. if (!memslot)
  1095. return -EFAULT;
  1096. *is_gmem = kvm_slot_has_gmem(memslot);
  1097. if (!*is_gmem) {
  1098. pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
  1099. &writable, &page);
  1100. if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
  1101. return -EFAULT;
  1102. } else {
  1103. ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL);
  1104. if (ret) {
  1105. kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE,
  1106. write_fault, false, false);
  1107. return ret;
  1108. }
  1109. }
  1110. scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
  1111. if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
  1112. return -EAGAIN;
  1113. vt->gva = va;
  1114. vt->hpa = pfn << PAGE_SHIFT;
  1115. vt->valid = true;
  1116. vt->cpu = -1;
  1117. kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
  1118. kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw);
  1119. }
  1120. if (vt->wr.pw)
  1121. mark_page_dirty(vcpu->kvm, gfn);
  1122. return 0;
  1123. }
  1124. static void inject_vncr_perm(struct kvm_vcpu *vcpu)
  1125. {
  1126. struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
  1127. u64 esr = kvm_vcpu_get_esr(vcpu);
  1128. /* Adjust the fault level to reflect that of the guest's */
  1129. esr &= ~ESR_ELx_FSC;
  1130. esr |= FIELD_PREP(ESR_ELx_FSC,
  1131. ESR_ELx_FSC_PERM_L(vt->wr.level));
  1132. kvm_inject_nested_sync(vcpu, esr);
  1133. }
  1134. static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu)
  1135. {
  1136. struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
  1137. lockdep_assert_held_read(&vcpu->kvm->mmu_lock);
  1138. if (!vt->valid)
  1139. return false;
  1140. if (read_vncr_el2(vcpu) != vt->gva)
  1141. return false;
  1142. if (vt->wr.nG)
  1143. return get_asid_by_regime(vcpu, TR_EL20) == vt->wr.asid;
  1144. return true;
  1145. }
/*
 * Handle an abort taken on a VNCR-redirected access.
 *
 * - Synchronous external aborts are handed over to kvm_handle_guest_sea().
 * - Permission faults are forwarded to the guest's EL2.
 * - Translation faults either populate the VNCR pseudo-TLB (when there is
 *   no matching entry) or are turned into a permission fault (when a
 *   matching entry already exists).
 * - Anything else triggers a one-off warning.
 *
 * Returns 0 for a guest_memfd-backed failure (a memory fault exit is
 * prepared by kvm_translate_vncr() for that case), 1 for everything else
 * that is handled here, or the return value of kvm_handle_guest_sea().
 */
int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
{
	struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
	u64 esr = kvm_vcpu_get_esr(vcpu);

	WARN_ON_ONCE(!(esr & ESR_ELx_VNCR));

	if (kvm_vcpu_abt_issea(vcpu))
		return kvm_handle_guest_sea(vcpu);

	if (esr_fsc_is_permission_fault(esr)) {
		inject_vncr_perm(vcpu);
	} else if (esr_fsc_is_translation_fault(esr)) {
		bool valid, is_gmem = false;
		int ret;

		scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
			valid = kvm_vncr_tlb_lookup(vcpu);

		/*
		 * A translation fault with a matching pseudo-TLB entry means
		 * the page was deliberately left unmapped: report it as a
		 * permission problem (-EPERM) instead of translating again.
		 */
		if (!valid)
			ret = kvm_translate_vncr(vcpu, &is_gmem);
		else
			ret = -EPERM;

		switch (ret) {
		case -EAGAIN:
			/* Let's try again... */
			break;
		case -ENOMEM:
			/*
			 * For guest_memfd, this indicates that it failed to
			 * create a folio to back the memory. Inform userspace.
			 */
			if (is_gmem)
				return 0;
			/* Otherwise, let's try again... */
			break;
		case -EFAULT:
		case -EIO:
		case -EHWPOISON:
			if (is_gmem)
				return 0;
			fallthrough;
		case -EINVAL:
		case -ENOENT:
		case -EACCES:
			/*
			 * Translation failed, inject the corresponding
			 * exception back to EL2.
			 *
			 * NOTE(review): kvm_translate_vncr() also returns
			 * -EFAULT when no memslot backs the walk's output
			 * address, i.e. after the walk itself succeeded.
			 * Confirm that vt->wr.failed is guaranteed to be set
			 * on every path that reaches this BUG_ON().
			 */
			BUG_ON(!vt->wr.failed);

			esr &= ~ESR_ELx_FSC;
			esr |= FIELD_PREP(ESR_ELx_FSC, vt->wr.fst);

			kvm_inject_nested_sync(vcpu, esr);
			break;
		case -EPERM:
			/* Hack to deal with POE until we get kernel support */
			inject_vncr_perm(vcpu);
			break;
		case 0:
			break;
		}
	} else {
		WARN_ONCE(1, "Unhandled VNCR abort, ESR=%llx\n", esr);
	}

	return 1;
}
  1207. static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
  1208. {
  1209. struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
  1210. pgprot_t prot;
  1211. guard(preempt)();
  1212. guard(read_lock)(&vcpu->kvm->mmu_lock);
  1213. /*
  1214. * The request to map VNCR may have raced against some other
  1215. * event, such as an interrupt, and may not be valid anymore.
  1216. */
  1217. if (is_hyp_ctxt(vcpu))
  1218. return;
  1219. /*
  1220. * Check that the pseudo-TLB is valid and that VNCR_EL2 still
  1221. * contains the expected value. If it doesn't, we simply bail out
  1222. * without a mapping -- a transformed MSR/MRS will generate the
  1223. * fault and allows us to populate the pseudo-TLB.
  1224. */
  1225. if (!vt->valid)
  1226. return;
  1227. if (read_vncr_el2(vcpu) != vt->gva)
  1228. return;
  1229. if (vt->wr.nG && get_asid_by_regime(vcpu, TR_EL20) != vt->wr.asid)
  1230. return;
  1231. vt->cpu = smp_processor_id();
  1232. if (vt->wr.pw && vt->wr.pr)
  1233. prot = PAGE_KERNEL;
  1234. else if (vt->wr.pr)
  1235. prot = PAGE_KERNEL_RO;
  1236. else
  1237. prot = PAGE_NONE;
  1238. /*
  1239. * We can't map write-only (or no permission at all) in the kernel,
  1240. * but the guest can do it if using POE, so we'll have to turn a
  1241. * translation fault into a permission fault at runtime.
  1242. * FIXME: WO doesn't work at all, need POE support in the kernel.
  1243. */
  1244. if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) {
  1245. __set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot);
  1246. host_data_set_flag(L1_VNCR_MAPPED);
  1247. atomic_inc(&vcpu->kvm->arch.vncr_map_count);
  1248. }
  1249. }
/*
 * Evaluate to true if the ID_AA64MMFR0_EL1 value @__r reports stage-2
 * support for the @__sz KB granule (@__sz being 4, 16 or 64), that is:
 *
 * - the TGRAN<sz>_2 field is neither "not implemented" nor "same as
 *   stage-1", or
 * - the TGRAN<sz>_2 field is "same as stage-1" and the stage-1 TGRAN<sz>
 *   field is not "not implemented".
 */
#define has_tgran_2(__r, __sz)						\
	({								\
		u64 _s1, _s2, _mmfr0 = __r;				\
									\
		_s2 = SYS_FIELD_GET(ID_AA64MMFR0_EL1,			\
				    TGRAN##__sz##_2, _mmfr0);		\
									\
		_s1 = SYS_FIELD_GET(ID_AA64MMFR0_EL1,			\
				    TGRAN##__sz, _mmfr0);		\
									\
		((_s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_NI &&		\
		  _s2 != ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz) || \
		 (_s2 == ID_AA64MMFR0_EL1_TGRAN##__sz##_2_TGRAN##__sz && \
		  _s1 != ID_AA64MMFR0_EL1_TGRAN##__sz##_NI));		\
	})
/*
 * Our emulated CPU doesn't support all the possible features. For the
 * sake of simplicity (and probably mental sanity), wipe out a number
 * of feature bits we don't intend to support for the time being.
 * This list should get updated as new features get added to the NV
 * support, and new extension to the architecture.
 *
 * @reg is the encoding of the ID register and @val its current value.
 * The adjusted value is returned; registers that are not listed are
 * returned unchanged.
 */
u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
{
	/* Unmodified input, needed to check the original granule support */
	u64 orig_val = val;

	switch (reg) {
	case SYS_ID_AA64ISAR1_EL1:
		/* Support everything but LS64 and Spec Invalidation */
		val &= ~(ID_AA64ISAR1_EL1_LS64	|
			 ID_AA64ISAR1_EL1_SPECRES);
		break;

	case SYS_ID_AA64PFR0_EL1:
		/* No RME, AMU, MPAM, or S-EL2 */
		val &= ~(ID_AA64PFR0_EL1_RME	|
			 ID_AA64PFR0_EL1_AMU	|
			 ID_AA64PFR0_EL1_MPAM	|
			 ID_AA64PFR0_EL1_SEL2	|
			 ID_AA64PFR0_EL1_EL3	|
			 ID_AA64PFR0_EL1_EL2	|
			 ID_AA64PFR0_EL1_EL1	|
			 ID_AA64PFR0_EL1_EL0);
		/* 64bit only at any EL */
		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP);
		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP);
		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP);
		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP);
		break;

	case SYS_ID_AA64PFR1_EL1:
		/* Only support BTI, SSBS, CSV2_frac */
		val &= ~(ID_AA64PFR1_EL1_PFAR		|
			 ID_AA64PFR1_EL1_MTEX		|
			 ID_AA64PFR1_EL1_THE		|
			 ID_AA64PFR1_EL1_GCS		|
			 ID_AA64PFR1_EL1_MTE_frac	|
			 ID_AA64PFR1_EL1_NMI		|
			 ID_AA64PFR1_EL1_SME		|
			 ID_AA64PFR1_EL1_RES0		|
			 ID_AA64PFR1_EL1_MPAM_frac	|
			 ID_AA64PFR1_EL1_MTE);
		break;

	case SYS_ID_AA64MMFR0_EL1:
		/*
		 * Hide ExS, Secure Memory. The stage-2 granule fields are
		 * cleared as well, and recomputed below.
		 */
		val &= ~(ID_AA64MMFR0_EL1_EXS		|
			 ID_AA64MMFR0_EL1_TGRAN4_2	|
			 ID_AA64MMFR0_EL1_TGRAN16_2	|
			 ID_AA64MMFR0_EL1_TGRAN64_2	|
			 ID_AA64MMFR0_EL1_SNSMEM);

		/* Hide CNTPOFF if present */
		val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP);

		/* Disallow unsupported S2 page sizes */
		switch (PAGE_SIZE) {
		case SZ_64K:
			val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI);
			fallthrough;
		case SZ_16K:
			val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI);
			fallthrough;
		case SZ_4K:
			/* Support everything */
			break;
		}

		/*
		 * Since we can't support a guest S2 page size smaller
		 * than the host's own page size (due to KVM only
		 * populating its own S2 using the kernel's page
		 * size), advertise the limitation using FEAT_GTG.
		 */
		switch (PAGE_SIZE) {
		case SZ_4K:
			if (has_tgran_2(orig_val, 4))
				val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
			fallthrough;
		case SZ_16K:
			if (has_tgran_2(orig_val, 16))
				val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
			fallthrough;
		case SZ_64K:
			if (has_tgran_2(orig_val, 64))
				val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
			break;
		}

		/* Cap PARange to 48bits */
		val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48);
		break;

	case SYS_ID_AA64MMFR1_EL1:
		val &= ~(ID_AA64MMFR1_EL1_CMOW		|
			 ID_AA64MMFR1_EL1_nTLBPA	|
			 ID_AA64MMFR1_EL1_ETS);

		/* FEAT_E2H0 implies no VHE */
		if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
			val &= ~ID_AA64MMFR1_EL1_VH;

		/* Limit HAFDBS to the AF level */
		val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF);
		break;

	case SYS_ID_AA64MMFR2_EL1:
		/*
		 * NOTE(review): bits [47:44] are cleared with a raw mask
		 * rather than a named field -- confirm which field this is.
		 */
		val &= ~(ID_AA64MMFR2_EL1_BBM	|
			 ID_AA64MMFR2_EL1_TTL	|
			 GENMASK_ULL(47, 44)	|
			 ID_AA64MMFR2_EL1_ST	|
			 ID_AA64MMFR2_EL1_CCIDX	|
			 ID_AA64MMFR2_EL1_VARange);

		/* Force TTL support */
		val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP);
		break;

	case SYS_ID_AA64MMFR4_EL1:
		/*
		 * You get EITHER
		 *
		 * - FEAT_VHE without FEAT_E2H0
		 * - FEAT_NV limited to FEAT_NV2
		 * - HCR_EL2.NV1 being RES0
		 *
		 * OR
		 *
		 * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV
		 *
		 * Life is too short for anything else.
		 */
		if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) {
			val = 0;
		} else {
			val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY);
			val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1);
		}
		break;

	case SYS_ID_AA64DFR0_EL1:
		/* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */
		val &= ~(ID_AA64DFR0_EL1_ExtTrcBuff	|
			 ID_AA64DFR0_EL1_BRBE		|
			 ID_AA64DFR0_EL1_MTPMU		|
			 ID_AA64DFR0_EL1_TraceBuffer	|
			 ID_AA64DFR0_EL1_TraceFilt	|
			 ID_AA64DFR0_EL1_PMSVer		|
			 ID_AA64DFR0_EL1_CTX_CMPs	|
			 ID_AA64DFR0_EL1_SEBEP		|
			 ID_AA64DFR0_EL1_PMSS		|
			 ID_AA64DFR0_EL1_TraceVer);

		/*
		 * FEAT_Debugv8p9 requires support for extended breakpoints /
		 * watchpoints.
		 */
		val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);
		break;
	}

	return val;
}
  1415. u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu,
  1416. enum vcpu_sysreg sr, u64 v)
  1417. {
  1418. struct resx resx;
  1419. resx = kvm_get_sysreg_resx(vcpu->kvm, sr);
  1420. v &= ~resx.res0;
  1421. v |= resx.res1;
  1422. return v;
  1423. }
  1424. static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, struct resx resx)
  1425. {
  1426. BUILD_BUG_ON(!__builtin_constant_p(sr));
  1427. BUILD_BUG_ON(sr < __SANITISED_REG_START__);
  1428. BUILD_BUG_ON(sr >= NR_SYS_REGS);
  1429. kvm_set_sysreg_resx(kvm, sr, resx);
  1430. }
/*
 * Set up the per-VM RES0/RES1 masks of the sanitised system registers,
 * then run a read-modify-write over every sanitised register of @vcpu.
 *
 * The masks are allocated and computed only once per VM: if they already
 * exist, only the per-vcpu pass at the "out" label is performed.
 *
 * Must be called with kvm->arch.config_lock held.
 * Returns 0 on success, -ENOMEM if the mask array cannot be allocated.
 */
int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	struct resx resx;

	lockdep_assert_held(&kvm->arch.config_lock);

	/* Masks already computed by another vcpu of this VM */
	if (kvm->arch.sysreg_masks)
		goto out;

	kvm->arch.sysreg_masks = kzalloc_obj(*(kvm->arch.sysreg_masks),
					     GFP_KERNEL_ACCOUNT);
	if (!kvm->arch.sysreg_masks)
		return -ENOMEM;

	/* VTTBR_EL2 */
	resx = (typeof(resx)){};
	if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16))
		resx.res0 |= GENMASK(63, 56);
	if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, CnP, IMP))
		resx.res0 |= VTTBR_CNP_BIT;
	set_sysreg_masks(kvm, VTTBR_EL2, resx);

	/* VTCR_EL2 */
	resx = get_reg_fixed_bits(kvm, VTCR_EL2);
	set_sysreg_masks(kvm, VTCR_EL2, resx);

	/* VMPIDR_EL2 */
	resx.res0 = GENMASK(63, 40) | GENMASK(30, 24);
	resx.res1 = BIT(31);
	set_sysreg_masks(kvm, VMPIDR_EL2, resx);

	/* HCR_EL2 */
	resx = get_reg_fixed_bits(kvm, HCR_EL2);
	set_sysreg_masks(kvm, HCR_EL2, resx);

	/* HCRX_EL2 */
	resx = get_reg_fixed_bits(kvm, HCRX_EL2);
	set_sysreg_masks(kvm, HCRX_EL2, resx);

	/* HFG[RW]TR_EL2 */
	resx = get_reg_fixed_bits(kvm, HFGRTR_EL2);
	set_sysreg_masks(kvm, HFGRTR_EL2, resx);
	resx = get_reg_fixed_bits(kvm, HFGWTR_EL2);
	set_sysreg_masks(kvm, HFGWTR_EL2, resx);

	/* HDFG[RW]TR_EL2 */
	resx = get_reg_fixed_bits(kvm, HDFGRTR_EL2);
	set_sysreg_masks(kvm, HDFGRTR_EL2, resx);
	resx = get_reg_fixed_bits(kvm, HDFGWTR_EL2);
	set_sysreg_masks(kvm, HDFGWTR_EL2, resx);

	/* HFGITR_EL2 */
	resx = get_reg_fixed_bits(kvm, HFGITR_EL2);
	set_sysreg_masks(kvm, HFGITR_EL2, resx);

	/* HAFGRTR_EL2 - not a lot to see here */
	resx = get_reg_fixed_bits(kvm, HAFGRTR_EL2);
	set_sysreg_masks(kvm, HAFGRTR_EL2, resx);

	/* HFG[RW]TR2_EL2 */
	resx = get_reg_fixed_bits(kvm, HFGRTR2_EL2);
	set_sysreg_masks(kvm, HFGRTR2_EL2, resx);
	resx = get_reg_fixed_bits(kvm, HFGWTR2_EL2);
	set_sysreg_masks(kvm, HFGWTR2_EL2, resx);

	/* HDFG[RW]TR2_EL2 */
	resx = get_reg_fixed_bits(kvm, HDFGRTR2_EL2);
	set_sysreg_masks(kvm, HDFGRTR2_EL2, resx);
	resx = get_reg_fixed_bits(kvm, HDFGWTR2_EL2);
	set_sysreg_masks(kvm, HDFGWTR2_EL2, resx);

	/* HFGITR2_EL2 */
	resx = get_reg_fixed_bits(kvm, HFGITR2_EL2);
	set_sysreg_masks(kvm, HFGITR2_EL2, resx);

	/* TCR2_EL2 */
	resx = get_reg_fixed_bits(kvm, TCR2_EL2);
	set_sysreg_masks(kvm, TCR2_EL2, resx);

	/* SCTLR_EL1 */
	resx = get_reg_fixed_bits(kvm, SCTLR_EL1);
	set_sysreg_masks(kvm, SCTLR_EL1, resx);

	/* SCTLR_EL2 */
	resx = get_reg_fixed_bits(kvm, SCTLR_EL2);
	set_sysreg_masks(kvm, SCTLR_EL2, resx);

	/* SCTLR2_ELx */
	resx = get_reg_fixed_bits(kvm, SCTLR2_EL1);
	set_sysreg_masks(kvm, SCTLR2_EL1, resx);
	resx = get_reg_fixed_bits(kvm, SCTLR2_EL2);
	set_sysreg_masks(kvm, SCTLR2_EL2, resx);

	/* MDCR_EL2 */
	resx = get_reg_fixed_bits(kvm, MDCR_EL2);
	set_sysreg_masks(kvm, MDCR_EL2, resx);

	/* CNTHCTL_EL2 */
	resx.res0 = GENMASK(63, 20);
	resx.res1 = 0;
	if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP))
		resx.res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK;
	if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) {
		resx.res0 |= CNTHCTL_ECV;
		if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP))
			resx.res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT |
				      CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT);
	}
	if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
		resx.res0 |= GENMASK(11, 8);
	set_sysreg_masks(kvm, CNTHCTL_EL2, resx);

	/* ICH_HCR_EL2 */
	resx.res0 = ICH_HCR_EL2_RES0;
	resx.res1 = ICH_HCR_EL2_RES1;
	if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS))
		resx.res0 |= ICH_HCR_EL2_TDIR;
	/* No GICv4 is presented to the guest */
	resx.res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount;
	set_sysreg_masks(kvm, ICH_HCR_EL2, resx);

	/* VNCR_EL2 */
	resx.res0 = VNCR_EL2_RES0;
	resx.res1 = VNCR_EL2_RES1;
	set_sysreg_masks(kvm, VNCR_EL2, resx);

out:
	/*
	 * OR 0 into every sanitised register of this vcpu: the value itself
	 * is unchanged by the operation -- presumably this is done so that
	 * the write path applies the masks to the stored value (TODO confirm
	 * against __vcpu_rmw_sys_reg()).
	 */
	for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++)
		__vcpu_rmw_sys_reg(vcpu, sr, |=, 0);

	return 0;
}
  1539. void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
  1540. {
  1541. if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) {
  1542. struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
  1543. write_lock(&vcpu->kvm->mmu_lock);
  1544. if (mmu->pending_unmap) {
  1545. kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
  1546. mmu->pending_unmap = false;
  1547. }
  1548. write_unlock(&vcpu->kvm->mmu_lock);
  1549. }
  1550. if (kvm_check_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu))
  1551. kvm_map_l1_vncr(vcpu);
  1552. /* Must be last, as may switch context! */
  1553. if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
  1554. kvm_inject_nested_irq(vcpu);
  1555. }
  1556. /*
  1557. * One of the many architectural bugs in FEAT_NV2 is that the guest hypervisor
  1558. * can write to HCR_EL2 behind our back, potentially changing the exception
  1559. * routing / masking for even the host context.
  1560. *
  1561. * What follows is some slop to (1) react to exception routing / masking and (2)
  1562. * preserve the pending SError state across translation regimes.
  1563. */
  1564. void kvm_nested_flush_hwstate(struct kvm_vcpu *vcpu)
  1565. {
  1566. if (!vcpu_has_nv(vcpu))
  1567. return;
  1568. if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
  1569. kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
  1570. }
  1571. void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu)
  1572. {
  1573. unsigned long *hcr = vcpu_hcr(vcpu);
  1574. if (!vcpu_has_nv(vcpu))
  1575. return;
  1576. /*
  1577. * We previously decided that an SError was deliverable to the guest.
  1578. * Reap the pending state from HCR_EL2 and...
  1579. */
  1580. if (unlikely(__test_and_clear_bit(__ffs(HCR_VSE), hcr)))
  1581. vcpu_set_flag(vcpu, NESTED_SERROR_PENDING);
  1582. /*
  1583. * Re-attempt SError injection in case the deliverability has changed,
  1584. * which is necessary to faithfully emulate WFI the case of a pending
  1585. * SError being a wakeup condition.
  1586. */
  1587. if (unlikely(vcpu_test_and_clear_flag(vcpu, NESTED_SERROR_PENDING)))
  1588. kvm_inject_serror_esr(vcpu, vcpu_get_vsesr(vcpu));
  1589. }
/*
 * MDCR_EL2 trap bits taken from the guest hypervisor's MDCR_EL2 and merged
 * into the effective configuration when running in a nested context.
 *
 * KVM unconditionally sets most of these traps anyway but use an allowlist
 * to document the guest hypervisor traps that may take precedence and guard
 * against future changes to the non-nested trap configuration.
 */
#define NV_MDCR_GUEST_INCLUDE	(MDCR_EL2_TDE	|	\
				 MDCR_EL2_TDA	|	\
				 MDCR_EL2_TDRA	|	\
				 MDCR_EL2_TTRF	|	\
				 MDCR_EL2_TPMS	|	\
				 MDCR_EL2_TPM	|	\
				 MDCR_EL2_TPMCR	|	\
				 MDCR_EL2_TDCC	|	\
				 MDCR_EL2_TDOSA)
  1604. void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu)
  1605. {
  1606. u64 guest_mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);
  1607. if (is_nested_ctxt(vcpu))
  1608. vcpu->arch.mdcr_el2 |= (guest_mdcr & NV_MDCR_GUEST_INCLUDE);
  1609. /*
  1610. * In yet another example where FEAT_NV2 is fscking broken, accesses
  1611. * to MDSCR_EL1 are redirected to the VNCR despite having an effect
  1612. * at EL2. Use a big hammer to apply sanity.
  1613. *
  1614. * Unless of course we have FEAT_FGT, in which case we can precisely
  1615. * trap MDSCR_EL1.
  1616. */
  1617. else if (!cpus_have_final_cap(ARM64_HAS_FGT))
  1618. vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
  1619. }