at.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 2017 - Linaro Ltd
  4. * Author: Jintack Lim <jintack.lim@linaro.org>
  5. */
  6. #include <linux/kvm_host.h>
  7. #include <asm/esr.h>
  8. #include <asm/kvm_hyp.h>
  9. #include <asm/kvm_mmu.h>
  10. static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
  11. {
  12. wr->fst = fst;
  13. wr->ptw = s1ptw;
  14. wr->s2 = s1ptw;
  15. wr->failed = true;
  16. }
  17. #define S1_MMU_DISABLED (-127)
  18. static int get_ia_size(struct s1_walk_info *wi)
  19. {
  20. return 64 - wi->txsz;
  21. }
  22. /* Return true if the IPA is out of the OA range */
  23. static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
  24. {
  25. if (wi->pa52bit)
  26. return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits));
  27. return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
  28. }
  29. static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr)
  30. {
  31. switch (BIT(wi->pgshift)) {
  32. case SZ_64K:
  33. default: /* IMPDEF: treat any other value as 64k */
  34. if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52))
  35. return false;
  36. return ((wi->regime == TR_EL2 ?
  37. FIELD_GET(TCR_EL2_PS_MASK, tcr) :
  38. FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110);
  39. case SZ_16K:
  40. if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
  41. return false;
  42. break;
  43. case SZ_4K:
  44. if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
  45. return false;
  46. break;
  47. }
  48. return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS));
  49. }
  50. static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc)
  51. {
  52. u64 addr;
  53. if (!wi->pa52bit)
  54. return desc & GENMASK_ULL(47, wi->pgshift);
  55. switch (BIT(wi->pgshift)) {
  56. case SZ_4K:
  57. case SZ_16K:
  58. addr = desc & GENMASK_ULL(49, wi->pgshift);
  59. addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50;
  60. break;
  61. case SZ_64K:
  62. default: /* IMPDEF: treat any other value as 64k */
  63. addr = desc & GENMASK_ULL(47, wi->pgshift);
  64. addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48;
  65. break;
  66. }
  67. return addr;
  68. }
  69. /* Return the translation regime that applies to an AT instruction */
  70. static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
  71. {
  72. /*
  73. * We only get here from guest EL2, so the translation
  74. * regime AT applies to is solely defined by {E2H,TGE}.
  75. */
  76. switch (op) {
  77. case OP_AT_S1E2R:
  78. case OP_AT_S1E2W:
  79. case OP_AT_S1E2A:
  80. return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
  81. default:
  82. return (vcpu_el2_e2h_is_set(vcpu) &&
  83. vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
  84. }
  85. }
  86. static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime)
  87. {
  88. if (regime == TR_EL10) {
  89. if (vcpu_has_nv(vcpu) &&
  90. !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En))
  91. return 0;
  92. return vcpu_read_sys_reg(vcpu, TCR2_EL1);
  93. }
  94. return vcpu_read_sys_reg(vcpu, TCR2_EL2);
  95. }
  96. static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
  97. {
  98. if (!kvm_has_s1pie(vcpu->kvm))
  99. return false;
  100. /* Abuse TCR2_EL1_PIE and use it for EL2 as well */
  101. return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE;
  102. }
  103. static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi)
  104. {
  105. u64 val;
  106. if (!kvm_has_s1poe(vcpu->kvm)) {
  107. wi->poe = wi->e0poe = false;
  108. return;
  109. }
  110. val = effective_tcr2(vcpu, wi->regime);
  111. /* Abuse TCR2_EL1_* for EL2 */
  112. wi->poe = val & TCR2_EL1_POE;
  113. wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE);
  114. }
  115. static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
  116. struct s1_walk_result *wr, u64 va)
  117. {
  118. u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
  119. unsigned int stride, x;
  120. bool va55, tbi, lva;
  121. va55 = va & BIT(55);
  122. if (vcpu_has_nv(vcpu)) {
  123. hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
  124. wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
  125. } else {
  126. WARN_ON_ONCE(wi->regime != TR_EL10);
  127. wi->s2 = false;
  128. hcr = 0;
  129. }
  130. switch (wi->regime) {
  131. case TR_EL10:
  132. sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
  133. tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
  134. ttbr = (va55 ?
  135. vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
  136. vcpu_read_sys_reg(vcpu, TTBR0_EL1));
  137. break;
  138. case TR_EL2:
  139. case TR_EL20:
  140. sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
  141. tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
  142. ttbr = (va55 ?
  143. vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
  144. vcpu_read_sys_reg(vcpu, TTBR0_EL2));
  145. break;
  146. default:
  147. BUG();
  148. }
  149. /* Someone was silly enough to encode TG0/TG1 differently */
  150. if (va55 && wi->regime != TR_EL2) {
  151. wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
  152. tg = FIELD_GET(TCR_TG1_MASK, tcr);
  153. switch (tg << TCR_TG1_SHIFT) {
  154. case TCR_TG1_4K:
  155. wi->pgshift = 12; break;
  156. case TCR_TG1_16K:
  157. wi->pgshift = 14; break;
  158. case TCR_TG1_64K:
  159. default: /* IMPDEF: treat any other value as 64k */
  160. wi->pgshift = 16; break;
  161. }
  162. } else {
  163. wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
  164. tg = FIELD_GET(TCR_TG0_MASK, tcr);
  165. switch (tg << TCR_TG0_SHIFT) {
  166. case TCR_TG0_4K:
  167. wi->pgshift = 12; break;
  168. case TCR_TG0_16K:
  169. wi->pgshift = 14; break;
  170. case TCR_TG0_64K:
  171. default: /* IMPDEF: treat any other value as 64k */
  172. wi->pgshift = 16; break;
  173. }
  174. }
  175. wi->pa52bit = has_52bit_pa(vcpu, wi, tcr);
  176. ia_bits = get_ia_size(wi);
  177. /* AArch64.S1StartLevel() */
  178. stride = wi->pgshift - 3;
  179. wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
  180. if (wi->regime == TR_EL2 && va55)
  181. goto addrsz;
  182. tbi = (wi->regime == TR_EL2 ?
  183. FIELD_GET(TCR_EL2_TBI, tcr) :
  184. (va55 ?
  185. FIELD_GET(TCR_TBI1, tcr) :
  186. FIELD_GET(TCR_TBI0, tcr)));
  187. if (!tbi && (u64)sign_extend64(va, 55) != va)
  188. goto addrsz;
  189. wi->sh = (wi->regime == TR_EL2 ?
  190. FIELD_GET(TCR_EL2_SH0_MASK, tcr) :
  191. (va55 ?
  192. FIELD_GET(TCR_SH1_MASK, tcr) :
  193. FIELD_GET(TCR_SH0_MASK, tcr)));
  194. va = (u64)sign_extend64(va, 55);
  195. /* Let's put the MMU disabled case aside immediately */
  196. switch (wi->regime) {
  197. case TR_EL10:
  198. /*
  199. * If dealing with the EL1&0 translation regime, 3 things
  200. * can disable the S1 translation:
  201. *
  202. * - HCR_EL2.DC = 1
  203. * - HCR_EL2.{E2H,TGE} = {0,1}
  204. * - SCTLR_EL1.M = 0
  205. *
  206. * The TGE part is interesting. If we have decided that this
  207. * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
  208. * {0,x}, and we only need to test for TGE == 1.
  209. */
  210. if (hcr & (HCR_DC | HCR_TGE)) {
  211. wr->level = S1_MMU_DISABLED;
  212. break;
  213. }
  214. fallthrough;
  215. case TR_EL2:
  216. case TR_EL20:
  217. if (!(sctlr & SCTLR_ELx_M))
  218. wr->level = S1_MMU_DISABLED;
  219. break;
  220. }
  221. if (wr->level == S1_MMU_DISABLED) {
  222. if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
  223. goto addrsz;
  224. wr->pa = va;
  225. return 0;
  226. }
  227. wi->be = sctlr & SCTLR_ELx_EE;
  228. wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
  229. wi->hpd &= (wi->regime == TR_EL2 ?
  230. FIELD_GET(TCR_EL2_HPD, tcr) :
  231. (va55 ?
  232. FIELD_GET(TCR_HPD1, tcr) :
  233. FIELD_GET(TCR_HPD0, tcr)));
  234. /* R_JHSVW */
  235. wi->hpd |= s1pie_enabled(vcpu, wi->regime);
  236. /* Do we have POE? */
  237. compute_s1poe(vcpu, wi);
  238. /* R_BVXDG */
  239. wi->hpd |= (wi->poe || wi->e0poe);
  240. /* R_PLCGL, R_YXNYW */
  241. if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
  242. if (wi->txsz > 39)
  243. goto transfault;
  244. } else {
  245. if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
  246. goto transfault;
  247. }
  248. /* R_GTJBY, R_SXWGM */
  249. switch (BIT(wi->pgshift)) {
  250. case SZ_4K:
  251. case SZ_16K:
  252. lva = wi->pa52bit;
  253. break;
  254. case SZ_64K:
  255. lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
  256. break;
  257. }
  258. if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
  259. goto transfault;
  260. /* R_YYVYV, I_THCZK */
  261. if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
  262. (va55 && va < GENMASK(63, ia_bits)))
  263. goto transfault;
  264. /* I_ZFSYQ */
  265. if (wi->regime != TR_EL2 &&
  266. (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
  267. goto transfault;
  268. /* R_BNDVG and following statements */
  269. if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
  270. wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
  271. goto transfault;
  272. ps = (wi->regime == TR_EL2 ?
  273. FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
  274. wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit));
  275. /* Compute minimal alignment */
  276. x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
  277. wi->baddr = ttbr & TTBRx_EL1_BADDR;
  278. if (wi->pa52bit) {
  279. /*
  280. * Force the alignment on 64 bytes for top-level tables
  281. * smaller than 8 entries, since TTBR.BADDR[5:2] are used to
  282. * store bits [51:48] of the first level of lookup.
  283. */
  284. x = max(x, 6);
  285. wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48;
  286. }
  287. /* R_VPBBF */
  288. if (check_output_size(wi->baddr, wi))
  289. goto addrsz;
  290. wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
  291. wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF);
  292. wi->ha &= (wi->regime == TR_EL2 ?
  293. FIELD_GET(TCR_EL2_HA, tcr) :
  294. FIELD_GET(TCR_HA, tcr));
  295. return 0;
  296. addrsz:
  297. /*
  298. * Address Size Fault level 0 to indicate it comes from TTBR.
  299. * yes, this is an oddity.
  300. */
  301. fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false);
  302. return -EFAULT;
  303. transfault:
  304. /* Translation Fault on start level */
  305. fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false);
  306. return -EFAULT;
  307. }
  308. static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
  309. struct s1_walk_info *wi)
  310. {
  311. u64 val;
  312. int r;
  313. r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
  314. if (r)
  315. return r;
  316. if (wi->be)
  317. *desc = be64_to_cpu((__force __be64)val);
  318. else
  319. *desc = le64_to_cpu((__force __le64)val);
  320. return 0;
  321. }
  322. static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
  323. struct s1_walk_info *wi)
  324. {
  325. if (wi->be) {
  326. old = (__force u64)cpu_to_be64(old);
  327. new = (__force u64)cpu_to_be64(new);
  328. } else {
  329. old = (__force u64)cpu_to_le64(old);
  330. new = (__force u64)cpu_to_le64(new);
  331. }
  332. return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
  333. }
  334. static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
  335. struct s1_walk_result *wr, u64 va)
  336. {
  337. u64 va_top, va_bottom, baddr, desc, new_desc, ipa;
  338. struct kvm_s2_trans s2_trans = {};
  339. int level, stride, ret;
  340. level = wi->sl;
  341. stride = wi->pgshift - 3;
  342. baddr = wi->baddr;
  343. va_top = get_ia_size(wi) - 1;
  344. while (1) {
  345. u64 index;
  346. va_bottom = (3 - level) * stride + wi->pgshift;
  347. index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
  348. ipa = baddr | index;
  349. if (wi->s2) {
  350. ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
  351. if (ret) {
  352. fail_s1_walk(wr,
  353. (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
  354. true);
  355. return ret;
  356. }
  357. if (!kvm_s2_trans_readable(&s2_trans)) {
  358. fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
  359. true);
  360. return -EPERM;
  361. }
  362. ipa = kvm_s2_trans_output(&s2_trans);
  363. }
  364. if (wi->filter) {
  365. ret = wi->filter->fn(&(struct s1_walk_context)
  366. {
  367. .wi = wi,
  368. .table_ipa = baddr,
  369. .level = level,
  370. }, wi->filter->priv);
  371. if (ret)
  372. return ret;
  373. }
  374. ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi);
  375. if (ret) {
  376. fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
  377. return ret;
  378. }
  379. new_desc = desc;
  380. /* Invalid descriptor */
  381. if (!(desc & BIT(0)))
  382. goto transfault;
  383. /* Block mapping, check validity down the line */
  384. if (!(desc & BIT(1)))
  385. break;
  386. /* Page mapping */
  387. if (level == 3)
  388. break;
  389. /* Table handling */
  390. if (!wi->hpd) {
  391. wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
  392. wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
  393. wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
  394. }
  395. baddr = desc_to_oa(wi, desc);
  396. /* Check for out-of-range OA */
  397. if (check_output_size(baddr, wi))
  398. goto addrsz;
  399. /* Prepare for next round */
  400. va_top = va_bottom - 1;
  401. level++;
  402. }
  403. /* Block mapping, check the validity of the level */
  404. if (!(desc & BIT(1))) {
  405. bool valid_block = false;
  406. switch (BIT(wi->pgshift)) {
  407. case SZ_4K:
  408. valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
  409. break;
  410. case SZ_16K:
  411. case SZ_64K:
  412. valid_block = level == 2 || (wi->pa52bit && level == 1);
  413. break;
  414. }
  415. if (!valid_block)
  416. goto transfault;
  417. }
  418. baddr = desc_to_oa(wi, desc);
  419. if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
  420. goto addrsz;
  421. if (wi->ha)
  422. new_desc |= PTE_AF;
  423. if (new_desc != desc) {
  424. if (wi->s2 && !kvm_s2_trans_writable(&s2_trans)) {
  425. fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), true);
  426. return -EPERM;
  427. }
  428. ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
  429. if (ret)
  430. return ret;
  431. desc = new_desc;
  432. }
  433. if (!(desc & PTE_AF)) {
  434. fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
  435. return -EACCES;
  436. }
  437. va_bottom += contiguous_bit_shift(desc, wi, level);
  438. wr->failed = false;
  439. wr->level = level;
  440. wr->desc = desc;
  441. wr->pa = baddr & GENMASK(52, va_bottom);
  442. wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
  443. wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
  444. if (wr->nG)
  445. wr->asid = get_asid_by_regime(vcpu, wi->regime);
  446. return 0;
  447. addrsz:
  448. fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
  449. return -EINVAL;
  450. transfault:
  451. fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
  452. return -ENOENT;
  453. }
  454. struct mmu_config {
  455. u64 ttbr0;
  456. u64 ttbr1;
  457. u64 tcr;
  458. u64 mair;
  459. u64 tcr2;
  460. u64 pir;
  461. u64 pire0;
  462. u64 por_el0;
  463. u64 por_el1;
  464. u64 sctlr;
  465. u64 vttbr;
  466. u64 vtcr;
  467. };
  468. static void __mmu_config_save(struct mmu_config *config)
  469. {
  470. config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
  471. config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
  472. config->tcr = read_sysreg_el1(SYS_TCR);
  473. config->mair = read_sysreg_el1(SYS_MAIR);
  474. if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
  475. config->tcr2 = read_sysreg_el1(SYS_TCR2);
  476. if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
  477. config->pir = read_sysreg_el1(SYS_PIR);
  478. config->pire0 = read_sysreg_el1(SYS_PIRE0);
  479. }
  480. if (system_supports_poe()) {
  481. config->por_el1 = read_sysreg_el1(SYS_POR);
  482. config->por_el0 = read_sysreg_s(SYS_POR_EL0);
  483. }
  484. }
  485. config->sctlr = read_sysreg_el1(SYS_SCTLR);
  486. config->vttbr = read_sysreg(vttbr_el2);
  487. config->vtcr = read_sysreg(vtcr_el2);
  488. }
  489. static void __mmu_config_restore(struct mmu_config *config)
  490. {
  491. /*
  492. * ARM errata 1165522 and 1530923 require TGE to be 1 before
  493. * we update the guest state.
  494. */
  495. asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
  496. write_sysreg_el1(config->ttbr0, SYS_TTBR0);
  497. write_sysreg_el1(config->ttbr1, SYS_TTBR1);
  498. write_sysreg_el1(config->tcr, SYS_TCR);
  499. write_sysreg_el1(config->mair, SYS_MAIR);
  500. if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
  501. write_sysreg_el1(config->tcr2, SYS_TCR2);
  502. if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
  503. write_sysreg_el1(config->pir, SYS_PIR);
  504. write_sysreg_el1(config->pire0, SYS_PIRE0);
  505. }
  506. if (system_supports_poe()) {
  507. write_sysreg_el1(config->por_el1, SYS_POR);
  508. write_sysreg_s(config->por_el0, SYS_POR_EL0);
  509. }
  510. }
  511. write_sysreg_el1(config->sctlr, SYS_SCTLR);
  512. write_sysreg(config->vttbr, vttbr_el2);
  513. write_sysreg(config->vtcr, vtcr_el2);
  514. }
  515. static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
  516. {
  517. u64 host_pan;
  518. bool fail;
  519. host_pan = read_sysreg_s(SYS_PSTATE_PAN);
  520. write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
  521. switch (op) {
  522. case OP_AT_S1E1RP:
  523. fail = __kvm_at(OP_AT_S1E1RP, vaddr);
  524. break;
  525. case OP_AT_S1E1WP:
  526. fail = __kvm_at(OP_AT_S1E1WP, vaddr);
  527. break;
  528. }
  529. write_sysreg_s(host_pan, SYS_PSTATE_PAN);
  530. return fail;
  531. }
  532. #define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic)
  533. #define MEMATTR_NC 0b0100
  534. #define MEMATTR_Wt 0b1000
  535. #define MEMATTR_Wb 0b1100
  536. #define MEMATTR_WbRaWa 0b1111
  537. #define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0)
  538. static u8 s2_memattr_to_attr(u8 memattr)
  539. {
  540. memattr &= 0b1111;
  541. switch (memattr) {
  542. case 0b0000:
  543. case 0b0001:
  544. case 0b0010:
  545. case 0b0011:
  546. return memattr << 2;
  547. case 0b0100:
  548. return MEMATTR(Wb, Wb);
  549. case 0b0101:
  550. return MEMATTR(NC, NC);
  551. case 0b0110:
  552. return MEMATTR(Wt, NC);
  553. case 0b0111:
  554. return MEMATTR(Wb, NC);
  555. case 0b1000:
  556. /* Reserved, assume NC */
  557. return MEMATTR(NC, NC);
  558. case 0b1001:
  559. return MEMATTR(NC, Wt);
  560. case 0b1010:
  561. return MEMATTR(Wt, Wt);
  562. case 0b1011:
  563. return MEMATTR(Wb, Wt);
  564. case 0b1100:
  565. /* Reserved, assume NC */
  566. return MEMATTR(NC, NC);
  567. case 0b1101:
  568. return MEMATTR(NC, Wb);
  569. case 0b1110:
  570. return MEMATTR(Wt, Wb);
  571. case 0b1111:
  572. return MEMATTR(Wb, Wb);
  573. default:
  574. unreachable();
  575. }
  576. }
  577. static u8 combine_s1_s2_attr(u8 s1, u8 s2)
  578. {
  579. bool transient;
  580. u8 final = 0;
  581. /* Upgrade transient s1 to non-transient to simplify things */
  582. switch (s1) {
  583. case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */
  584. transient = true;
  585. s1 = MEMATTR_Wt | (s1 & GENMASK(1,0));
  586. break;
  587. case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */
  588. transient = true;
  589. s1 = MEMATTR_Wb | (s1 & GENMASK(1,0));
  590. break;
  591. default:
  592. transient = false;
  593. }
  594. /* S2CombineS1AttrHints() */
  595. if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
  596. (s2 & GENMASK(3, 2)) == MEMATTR_NC)
  597. final = MEMATTR_NC;
  598. else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
  599. (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
  600. final = MEMATTR_Wt;
  601. else
  602. final = MEMATTR_Wb;
  603. if (final != MEMATTR_NC) {
  604. /* Inherit RaWa hints form S1 */
  605. if (transient) {
  606. switch (s1 & GENMASK(3, 2)) {
  607. case MEMATTR_Wt:
  608. final = 0;
  609. break;
  610. case MEMATTR_Wb:
  611. final = MEMATTR_NC;
  612. break;
  613. }
  614. }
  615. final |= s1 & GENMASK(1, 0);
  616. }
  617. return final;
  618. }
  619. #define ATTR_NSH 0b00
  620. #define ATTR_RSV 0b01
  621. #define ATTR_OSH 0b10
  622. #define ATTR_ISH 0b11
  623. static u8 compute_final_sh(u8 attr, u8 sh)
  624. {
  625. /* Any form of device, as well as NC has SH[1:0]=0b10 */
  626. if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
  627. return ATTR_OSH;
  628. if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
  629. sh = ATTR_NSH;
  630. return sh;
  631. }
  632. static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr,
  633. u8 attr)
  634. {
  635. u8 sh;
  636. /*
  637. * non-52bit and LPA have their basic shareability described in the
  638. * descriptor. LPA2 gets it from the corresponding field in TCR,
  639. * conveniently recorded in the walk info.
  640. */
  641. if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K)
  642. sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc);
  643. else
  644. sh = wi->sh;
  645. return compute_final_sh(attr, sh);
  646. }
  647. static u8 combine_sh(u8 s1_sh, u8 s2_sh)
  648. {
  649. if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
  650. return ATTR_OSH;
  651. if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
  652. return ATTR_ISH;
  653. return ATTR_NSH;
  654. }
  655. static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
  656. struct kvm_s2_trans *tr)
  657. {
  658. u8 s1_parattr, s2_memattr, final_attr, s2_sh;
  659. u64 par;
  660. /* If S2 has failed to translate, report the damage */
  661. if (tr->esr) {
  662. par = SYS_PAR_EL1_RES1;
  663. par |= SYS_PAR_EL1_F;
  664. par |= SYS_PAR_EL1_S;
  665. par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
  666. return par;
  667. }
  668. s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
  669. s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
  670. if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
  671. if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
  672. s2_memattr &= ~BIT(3);
  673. /* Combination of R_VRJSW and R_RHWZM */
  674. switch (s2_memattr) {
  675. case 0b0101:
  676. if (MEMATTR_IS_DEVICE(s1_parattr))
  677. final_attr = s1_parattr;
  678. else
  679. final_attr = MEMATTR(NC, NC);
  680. break;
  681. case 0b0110:
  682. case 0b1110:
  683. final_attr = MEMATTR(WbRaWa, WbRaWa);
  684. break;
  685. case 0b0111:
  686. case 0b1111:
  687. /* Preserve S1 attribute */
  688. final_attr = s1_parattr;
  689. break;
  690. case 0b0100:
  691. case 0b1100:
  692. case 0b1101:
  693. /* Reserved, do something non-silly */
  694. final_attr = s1_parattr;
  695. break;
  696. default:
  697. /*
  698. * MemAttr[2]=0, Device from S2.
  699. *
  700. * FWB does not influence the way that stage 1
  701. * memory types and attributes are combined
  702. * with stage 2 Device type and attributes.
  703. */
  704. final_attr = min(s2_memattr_to_attr(s2_memattr),
  705. s1_parattr);
  706. }
  707. } else {
  708. /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
  709. u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
  710. if (MEMATTR_IS_DEVICE(s1_parattr) ||
  711. MEMATTR_IS_DEVICE(s2_parattr)) {
  712. final_attr = min(s1_parattr, s2_parattr);
  713. } else {
  714. /* At this stage, this is memory vs memory */
  715. final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
  716. s2_parattr & 0xf);
  717. final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
  718. s2_parattr >> 4) << 4;
  719. }
  720. }
  721. if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
  722. !MEMATTR_IS_DEVICE(final_attr))
  723. final_attr = MEMATTR(NC, NC);
  724. s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc);
  725. par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
  726. par |= tr->output & GENMASK(47, 12);
  727. par |= FIELD_PREP(SYS_PAR_EL1_SH,
  728. combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
  729. compute_final_sh(final_attr, s2_sh)));
  730. return par;
  731. }
  732. static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
  733. struct s1_walk_result *wr)
  734. {
  735. u64 par;
  736. if (wr->failed) {
  737. par = SYS_PAR_EL1_RES1;
  738. par |= SYS_PAR_EL1_F;
  739. par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
  740. par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
  741. par |= wr->s2 ? SYS_PAR_EL1_S : 0;
  742. } else if (wr->level == S1_MMU_DISABLED) {
  743. /* MMU off or HCR_EL2.DC == 1 */
  744. par = SYS_PAR_EL1_NSE;
  745. par |= wr->pa & SYS_PAR_EL1_PA;
  746. if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) &&
  747. (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
  748. par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
  749. MEMATTR(WbRaWa, WbRaWa));
  750. par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
  751. } else {
  752. par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
  753. par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
  754. }
  755. } else {
  756. u64 mair, sctlr;
  757. u8 sh;
  758. par = SYS_PAR_EL1_NSE;
  759. mair = (wi->regime == TR_EL10 ?
  760. vcpu_read_sys_reg(vcpu, MAIR_EL1) :
  761. vcpu_read_sys_reg(vcpu, MAIR_EL2));
  762. mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
  763. mair &= 0xff;
  764. sctlr = (wi->regime == TR_EL10 ?
  765. vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
  766. vcpu_read_sys_reg(vcpu, SCTLR_EL2));
  767. /* Force NC for memory if SCTLR_ELx.C is clear */
  768. if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
  769. mair = MEMATTR(NC, NC);
  770. par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
  771. par |= wr->pa & SYS_PAR_EL1_PA;
  772. sh = compute_s1_sh(wi, wr, mair);
  773. par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
  774. }
  775. return par;
  776. }
  777. static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
  778. {
  779. u64 sctlr;
  780. if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
  781. return false;
  782. if (s1pie_enabled(vcpu, regime))
  783. return true;
  784. if (regime == TR_EL10)
  785. sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
  786. else
  787. sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
  788. return sctlr & SCTLR_EL1_EPAN;
  789. }
  790. static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
  791. struct s1_walk_info *wi,
  792. struct s1_walk_result *wr)
  793. {
  794. bool wxn;
  795. /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
  796. if (wi->regime != TR_EL2) {
  797. switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
  798. case 0b00:
  799. wr->pr = wr->pw = true;
  800. wr->ur = wr->uw = false;
  801. break;
  802. case 0b01:
  803. wr->pr = wr->pw = wr->ur = wr->uw = true;
  804. break;
  805. case 0b10:
  806. wr->pr = true;
  807. wr->pw = wr->ur = wr->uw = false;
  808. break;
  809. case 0b11:
  810. wr->pr = wr->ur = true;
  811. wr->pw = wr->uw = false;
  812. break;
  813. }
  814. /* We don't use px for anything yet, but hey... */
  815. wr->px = !((wr->desc & PTE_PXN) || wr->uw);
  816. wr->ux = !(wr->desc & PTE_UXN);
  817. } else {
  818. wr->ur = wr->uw = wr->ux = false;
  819. if (!(wr->desc & PTE_RDONLY)) {
  820. wr->pr = wr->pw = true;
  821. } else {
  822. wr->pr = true;
  823. wr->pw = false;
  824. }
  825. /* XN maps to UXN */
  826. wr->px = !(wr->desc & PTE_UXN);
  827. }
  828. switch (wi->regime) {
  829. case TR_EL2:
  830. case TR_EL20:
  831. wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN);
  832. break;
  833. case TR_EL10:
  834. wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN);
  835. break;
  836. }
  837. wr->pwxn = wr->uwxn = wxn;
  838. wr->pov = wi->poe;
  839. wr->uov = wi->e0poe;
  840. }
  841. static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu,
  842. struct s1_walk_info *wi,
  843. struct s1_walk_result *wr)
  844. {
  845. /* Hierarchical part of AArch64.S1DirectBasePermissions() */
  846. if (wi->regime != TR_EL2) {
  847. switch (wr->APTable) {
  848. case 0b00:
  849. break;
  850. case 0b01:
  851. wr->ur = wr->uw = false;
  852. break;
  853. case 0b10:
  854. wr->pw = wr->uw = false;
  855. break;
  856. case 0b11:
  857. wr->pw = wr->ur = wr->uw = false;
  858. break;
  859. }
  860. wr->px &= !wr->PXNTable;
  861. wr->ux &= !wr->UXNTable;
  862. } else {
  863. if (wr->APTable & BIT(1))
  864. wr->pw = false;
  865. /* XN maps to UXN */
  866. wr->px &= !wr->UXNTable;
  867. }
  868. }
  /* Extract the 4-bit field number @i from the value of sysreg @r of vcpu @v */
  #define perm_idx(v, r, i) \
  	((vcpu_read_sys_reg((v), (r)) >> (4 * (i))) & 0xf)
  /*
   * Setters for the permission and WXN fields of a walk result. Each one
   * is meant to be used as a statement; the "priv"/"unpriv" part of the
   * name is what set_perms() pastes together to select the right one.
   */

  /* Set the privileged read/write/execute permissions of @wr */
  #define set_priv_perms(wr, r, w, x) \
  	((void)((wr)->pr = (r), (wr)->pw = (w), (wr)->px = (x)))

  /* Set the unprivileged read/write/execute permissions of @wr */
  #define set_unpriv_perms(wr, r, w, x) \
  	((void)((wr)->ur = (r), (wr)->uw = (w), (wr)->ux = (x)))

  /* Set the privileged WXN state of @wr */
  #define set_priv_wxn(wr, v) \
  	((void)((wr)->pwxn = (v)))

  /* Set the unprivileged WXN state of @wr */
  #define set_unpriv_wxn(wr, v) \
  	((void)((wr)->uwxn = (v)))
  /*
   * Similar to AArch64.S1IndirectBasePermissions(), without GCS.
   *
   * @w is either "priv" or "unpriv" and selects which set of fields of
   * @wr gets updated from the 4-bit permission encoding @ip. Encodings
   * are grouped by the (read, write, execute) triplet they result in.
   */
  #define set_perms(w, wr, ip) \
  	do { \
  		/* R_LLZDZ */ \
  		switch ((ip)) { \
  		/* No access */ \
  		case 0b0000: \
  		case 0b0100: \
  		case 0b1011: \
  		case 0b1101: \
  		case 0b1111: \
  			set_ ## w ## _perms((wr), false, false, false); \
  			break; \
  		/* Read only */ \
  		case 0b0001: \
  		case 0b1000: \
  		case 0b1001: \
  			set_ ## w ## _perms((wr), true, false, false); \
  			break; \
  		/* Execute only */ \
  		case 0b0010: \
  			set_ ## w ## _perms((wr), false, false, true); \
  			break; \
  		/* Read + execute */ \
  		case 0b0011: \
  		case 0b1010: \
  			set_ ## w ## _perms((wr), true, false, true); \
  			break; \
  		/* Read + write */ \
  		case 0b0101: \
  		case 0b1100: \
  			set_ ## w ## _perms((wr), true, true, false); \
  			break; \
  		/* Read + write + execute */ \
  		case 0b0110: \
  		case 0b0111: \
  		case 0b1110: \
  			set_ ## w ## _perms((wr), true, true, true); \
  			break; \
  		} \
  		\
  		/* R_HJYGR */ \
  		set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \
  		\
  	} while (0)
  949. static void compute_s1_indirect_permissions(struct kvm_vcpu *vcpu,
  950. struct s1_walk_info *wi,
  951. struct s1_walk_result *wr)
  952. {
  953. u8 up, pp, idx;
  954. idx = pte_pi_index(wr->desc);
  955. switch (wi->regime) {
  956. case TR_EL10:
  957. pp = perm_idx(vcpu, PIR_EL1, idx);
  958. up = perm_idx(vcpu, PIRE0_EL1, idx);
  959. break;
  960. case TR_EL20:
  961. pp = perm_idx(vcpu, PIR_EL2, idx);
  962. up = perm_idx(vcpu, PIRE0_EL2, idx);
  963. break;
  964. case TR_EL2:
  965. pp = perm_idx(vcpu, PIR_EL2, idx);
  966. up = 0;
  967. break;
  968. }
  969. set_perms(priv, wr, pp);
  970. if (wi->regime != TR_EL2)
  971. set_perms(unpriv, wr, up);
  972. else
  973. set_unpriv_perms(wr, false, false, false);
  974. wr->pov = wi->poe && !(pp & BIT(3));
  975. wr->uov = wi->e0poe && !(up & BIT(3));
  976. /* R_VFPJF */
  977. if (wr->px && wr->uw) {
  978. set_priv_perms(wr, false, false, false);
  979. set_unpriv_perms(wr, false, false, false);
  980. }
  981. }
  982. static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu,
  983. struct s1_walk_info *wi,
  984. struct s1_walk_result *wr)
  985. {
  986. u8 idx, pov_perms, uov_perms;
  987. idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc);
  988. if (wr->pov) {
  989. switch (wi->regime) {
  990. case TR_EL10:
  991. pov_perms = perm_idx(vcpu, POR_EL1, idx);
  992. break;
  993. case TR_EL20:
  994. pov_perms = perm_idx(vcpu, POR_EL2, idx);
  995. break;
  996. case TR_EL2:
  997. pov_perms = perm_idx(vcpu, POR_EL2, idx);
  998. break;
  999. }
  1000. if (pov_perms & ~POE_RWX)
  1001. pov_perms = POE_NONE;
  1002. /* R_QXXPC, S1PrivOverflow enabled */
  1003. if (wr->pwxn && (pov_perms & POE_X))
  1004. pov_perms &= ~POE_W;
  1005. wr->pr &= pov_perms & POE_R;
  1006. wr->pw &= pov_perms & POE_W;
  1007. wr->px &= pov_perms & POE_X;
  1008. }
  1009. if (wr->uov) {
  1010. switch (wi->regime) {
  1011. case TR_EL10:
  1012. uov_perms = perm_idx(vcpu, POR_EL0, idx);
  1013. break;
  1014. case TR_EL20:
  1015. uov_perms = perm_idx(vcpu, POR_EL0, idx);
  1016. break;
  1017. case TR_EL2:
  1018. uov_perms = 0;
  1019. break;
  1020. }
  1021. if (uov_perms & ~POE_RWX)
  1022. uov_perms = POE_NONE;
  1023. /* R_NPBXC, S1UnprivOverlay enabled */
  1024. if (wr->uwxn && (uov_perms & POE_X))
  1025. uov_perms &= ~POE_W;
  1026. wr->ur &= uov_perms & POE_R;
  1027. wr->uw &= uov_perms & POE_W;
  1028. wr->ux &= uov_perms & POE_X;
  1029. }
  1030. }
  1031. static void compute_s1_permissions(struct kvm_vcpu *vcpu,
  1032. struct s1_walk_info *wi,
  1033. struct s1_walk_result *wr)
  1034. {
  1035. bool pan;
  1036. if (!s1pie_enabled(vcpu, wi->regime))
  1037. compute_s1_direct_permissions(vcpu, wi, wr);
  1038. else
  1039. compute_s1_indirect_permissions(vcpu, wi, wr);
  1040. if (!wi->hpd)
  1041. compute_s1_hierarchical_permissions(vcpu, wi, wr);
  1042. compute_s1_overlay_permissions(vcpu, wi, wr);
  1043. /* R_QXXPC, S1PrivOverlay disabled */
  1044. if (!wr->pov)
  1045. wr->px &= !(wr->pwxn && wr->pw);
  1046. /* R_NPBXC, S1UnprivOverlay disabled */
  1047. if (!wr->uov)
  1048. wr->ux &= !(wr->uwxn && wr->uw);
  1049. pan = wi->pan && (wr->ur || wr->uw ||
  1050. (pan3_enabled(vcpu, wi->regime) && wr->ux));
  1051. wr->pw &= !pan;
  1052. wr->pr &= !pan;
  1053. }
  1054. static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
  1055. {
  1056. struct s1_walk_result wr = {};
  1057. struct s1_walk_info wi = {};
  1058. bool perm_fail = false;
  1059. int ret, idx;
  1060. wi.regime = compute_translation_regime(vcpu, op);
  1061. wi.as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
  1062. wi.pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) &&
  1063. (*vcpu_cpsr(vcpu) & PSR_PAN_BIT);
  1064. ret = setup_s1_walk(vcpu, &wi, &wr, vaddr);
  1065. if (ret)
  1066. goto compute_par;
  1067. if (wr.level == S1_MMU_DISABLED)
  1068. goto compute_par;
  1069. idx = srcu_read_lock(&vcpu->kvm->srcu);
  1070. ret = walk_s1(vcpu, &wi, &wr, vaddr);
  1071. srcu_read_unlock(&vcpu->kvm->srcu, idx);
  1072. /*
  1073. * Race to update a descriptor -- restart the walk.
  1074. */
  1075. if (ret == -EAGAIN)
  1076. return ret;
  1077. if (ret)
  1078. goto compute_par;
  1079. compute_s1_permissions(vcpu, &wi, &wr);
  1080. switch (op) {
  1081. case OP_AT_S1E1RP:
  1082. case OP_AT_S1E1R:
  1083. case OP_AT_S1E2R:
  1084. perm_fail = !wr.pr;
  1085. break;
  1086. case OP_AT_S1E1WP:
  1087. case OP_AT_S1E1W:
  1088. case OP_AT_S1E2W:
  1089. perm_fail = !wr.pw;
  1090. break;
  1091. case OP_AT_S1E0R:
  1092. perm_fail = !wr.ur;
  1093. break;
  1094. case OP_AT_S1E0W:
  1095. perm_fail = !wr.uw;
  1096. break;
  1097. case OP_AT_S1E1A:
  1098. case OP_AT_S1E2A:
  1099. break;
  1100. default:
  1101. BUG();
  1102. }
  1103. if (perm_fail)
  1104. fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);
  1105. compute_par:
  1106. *par = compute_par_s1(vcpu, &wi, &wr);
  1107. return 0;
  1108. }
  1109. /*
  1110. * Return the PAR_EL1 value as the result of a valid translation.
  1111. *
  1112. * If the translation is unsuccessful, the value may only contain
  1113. * PAR_EL1.F, and cannot be taken at face value. It isn't an
  1114. * indication of the translation having failed, only that the fast
  1115. * path did not succeed, *unless* it indicates a S1 permission or
  1116. * access fault.
  1117. */
  1118. static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
  1119. {
  1120. struct mmu_config config;
  1121. struct kvm_s2_mmu *mmu;
  1122. bool fail, mmu_cs;
  1123. u64 par;
  1124. par = SYS_PAR_EL1_F;
  1125. /*
  1126. * We've trapped, so everything is live on the CPU. As we will
  1127. * be switching contexts behind everybody's back, disable
  1128. * interrupts while holding the mmu lock.
  1129. */
  1130. guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
  1131. /*
  1132. * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
  1133. * the right one (as we trapped from vEL2). If not, save the
  1134. * full MMU context.
  1135. *
  1136. * We are also guaranteed to be in the correct context if
  1137. * we're not in a nested VM.
  1138. */
  1139. mmu_cs = (vcpu_has_nv(vcpu) &&
  1140. !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)));
  1141. if (!mmu_cs)
  1142. goto skip_mmu_switch;
  1143. /*
  1144. * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
  1145. * find it (recycled by another vcpu, for example). When this
  1146. * happens, admit defeat immediately and use the SW (slow) path.
  1147. */
  1148. mmu = lookup_s2_mmu(vcpu);
  1149. if (!mmu)
  1150. return par;
  1151. __mmu_config_save(&config);
  1152. write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
  1153. write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
  1154. write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
  1155. write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
  1156. if (kvm_has_tcr2(vcpu->kvm)) {
  1157. write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2);
  1158. if (kvm_has_s1pie(vcpu->kvm)) {
  1159. write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR);
  1160. write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0);
  1161. }
  1162. if (kvm_has_s1poe(vcpu->kvm)) {
  1163. write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR);
  1164. write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0);
  1165. }
  1166. }
  1167. write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
  1168. __load_stage2(mmu, mmu->arch);
  1169. skip_mmu_switch:
  1170. /* Temporarily switch back to guest context */
  1171. write_sysreg_hcr(vcpu->arch.hcr_el2);
  1172. isb();
  1173. switch (op) {
  1174. case OP_AT_S1E1RP:
  1175. case OP_AT_S1E1WP:
  1176. fail = at_s1e1p_fast(vcpu, op, vaddr);
  1177. break;
  1178. case OP_AT_S1E1R:
  1179. fail = __kvm_at(OP_AT_S1E1R, vaddr);
  1180. break;
  1181. case OP_AT_S1E1W:
  1182. fail = __kvm_at(OP_AT_S1E1W, vaddr);
  1183. break;
  1184. case OP_AT_S1E0R:
  1185. fail = __kvm_at(OP_AT_S1E0R, vaddr);
  1186. break;
  1187. case OP_AT_S1E0W:
  1188. fail = __kvm_at(OP_AT_S1E0W, vaddr);
  1189. break;
  1190. case OP_AT_S1E1A:
  1191. fail = __kvm_at(OP_AT_S1E1A, vaddr);
  1192. break;
  1193. default:
  1194. WARN_ON_ONCE(1);
  1195. fail = true;
  1196. break;
  1197. }
  1198. if (!fail)
  1199. par = read_sysreg_par();
  1200. write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
  1201. if (mmu_cs)
  1202. __mmu_config_restore(&config);
  1203. return par;
  1204. }
  1205. static bool par_check_s1_perm_fault(u64 par)
  1206. {
  1207. u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
  1208. return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
  1209. !(par & SYS_PAR_EL1_S));
  1210. }
  1211. static bool par_check_s1_access_fault(u64 par)
  1212. {
  1213. u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
  1214. return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS &&
  1215. !(par & SYS_PAR_EL1_S));
  1216. }
  1217. int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
  1218. {
  1219. u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
  1220. int ret;
  1221. /*
  1222. * If PAR_EL1 reports that AT failed on a S1 permission or access
  1223. * fault, we know for sure that the PTW was able to walk the S1
  1224. * tables and there's nothing else to do.
  1225. *
  1226. * If AT failed for any other reason, then we must walk the guest S1
  1227. * to emulate the instruction.
  1228. */
  1229. if ((par & SYS_PAR_EL1_F) &&
  1230. !par_check_s1_perm_fault(par) &&
  1231. !par_check_s1_access_fault(par)) {
  1232. ret = handle_at_slow(vcpu, op, vaddr, &par);
  1233. if (ret)
  1234. return ret;
  1235. }
  1236. vcpu_write_sys_reg(vcpu, par, PAR_EL1);
  1237. return 0;
  1238. }
  1239. int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
  1240. {
  1241. u64 par;
  1242. int ret;
  1243. /*
  1244. * We've trapped, so everything is live on the CPU. As we will be
  1245. * switching context behind everybody's back, disable interrupts...
  1246. */
  1247. scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
  1248. u64 val, hcr;
  1249. bool fail;
  1250. val = hcr = read_sysreg(hcr_el2);
  1251. val &= ~HCR_TGE;
  1252. val |= HCR_VM;
  1253. if (!vcpu_el2_e2h_is_set(vcpu))
  1254. val |= HCR_NV | HCR_NV1;
  1255. write_sysreg_hcr(val);
  1256. isb();
  1257. par = SYS_PAR_EL1_F;
  1258. switch (op) {
  1259. case OP_AT_S1E2R:
  1260. fail = __kvm_at(OP_AT_S1E1R, vaddr);
  1261. break;
  1262. case OP_AT_S1E2W:
  1263. fail = __kvm_at(OP_AT_S1E1W, vaddr);
  1264. break;
  1265. case OP_AT_S1E2A:
  1266. fail = __kvm_at(OP_AT_S1E1A, vaddr);
  1267. break;
  1268. default:
  1269. WARN_ON_ONCE(1);
  1270. fail = true;
  1271. }
  1272. if (!fail)
  1273. par = read_sysreg_par();
  1274. write_sysreg_hcr(hcr);
  1275. isb();
  1276. }
  1277. /* We failed the translation, let's replay it in slow motion */
  1278. if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) {
  1279. ret = handle_at_slow(vcpu, op, vaddr, &par);
  1280. if (ret)
  1281. return ret;
  1282. }
  1283. vcpu_write_sys_reg(vcpu, par, PAR_EL1);
  1284. return 0;
  1285. }
  1286. int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
  1287. {
  1288. struct kvm_s2_trans out = {};
  1289. u64 ipa, par;
  1290. bool write;
  1291. int ret;
  1292. /* Do the stage-1 translation */
  1293. switch (op) {
  1294. case OP_AT_S12E1R:
  1295. op = OP_AT_S1E1R;
  1296. write = false;
  1297. break;
  1298. case OP_AT_S12E1W:
  1299. op = OP_AT_S1E1W;
  1300. write = true;
  1301. break;
  1302. case OP_AT_S12E0R:
  1303. op = OP_AT_S1E0R;
  1304. write = false;
  1305. break;
  1306. case OP_AT_S12E0W:
  1307. op = OP_AT_S1E0W;
  1308. write = true;
  1309. break;
  1310. default:
  1311. WARN_ON_ONCE(1);
  1312. return 0;
  1313. }
  1314. __kvm_at_s1e01(vcpu, op, vaddr);
  1315. par = vcpu_read_sys_reg(vcpu, PAR_EL1);
  1316. if (par & SYS_PAR_EL1_F)
  1317. return 0;
  1318. /*
  1319. * If we only have a single stage of translation (EL2&0), exit
  1320. * early. Same thing if {VM,DC}=={0,0}.
  1321. */
  1322. if (compute_translation_regime(vcpu, op) == TR_EL20 ||
  1323. !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
  1324. return 0;
  1325. /* Do the stage-2 translation */
  1326. ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
  1327. out.esr = 0;
  1328. ret = kvm_walk_nested_s2(vcpu, ipa, &out);
  1329. if (ret < 0)
  1330. return ret;
  1331. /* Check the access permission */
  1332. if (!out.esr &&
  1333. ((!write && !out.readable) || (write && !out.writable)))
  1334. out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
  1335. par = compute_par_s12(vcpu, par, &out);
  1336. vcpu_write_sys_reg(vcpu, par, PAR_EL1);
  1337. return 0;
  1338. }
  1339. /*
  1340. * Translate a VA for a given EL in a given translation regime, with
  1341. * or without PAN. This requires wi->{regime, as_el0, pan} to be
  1342. * set. The rest of the wi and wr should be 0-initialised.
  1343. */
  1344. int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
  1345. struct s1_walk_result *wr, u64 va)
  1346. {
  1347. int ret;
  1348. ret = setup_s1_walk(vcpu, wi, wr, va);
  1349. if (ret)
  1350. return ret;
  1351. if (wr->level == S1_MMU_DISABLED) {
  1352. wr->ur = wr->uw = wr->ux = true;
  1353. wr->pr = wr->pw = wr->px = true;
  1354. } else {
  1355. ret = walk_s1(vcpu, wi, wr, va);
  1356. if (ret)
  1357. return ret;
  1358. compute_s1_permissions(vcpu, wi, wr);
  1359. }
  1360. return 0;
  1361. }
  1362. struct desc_match {
  1363. u64 ipa;
  1364. int level;
  1365. };
  1366. static int match_s1_desc(struct s1_walk_context *ctxt, void *priv)
  1367. {
  1368. struct desc_match *dm = priv;
  1369. u64 ipa = dm->ipa;
  1370. /* Use S1 granule alignment */
  1371. ipa &= GENMASK(51, ctxt->wi->pgshift);
  1372. /* Not the IPA we're looking for? Continue. */
  1373. if (ipa != ctxt->table_ipa)
  1374. return 0;
  1375. /* Note the level and interrupt the walk */
  1376. dm->level = ctxt->level;
  1377. return -EINTR;
  1378. }
  1379. int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
  1380. {
  1381. struct desc_match dm = {
  1382. .ipa = ipa,
  1383. };
  1384. struct s1_walk_info wi = {
  1385. .filter = &(struct s1_walk_filter){
  1386. .fn = match_s1_desc,
  1387. .priv = &dm,
  1388. },
  1389. .as_el0 = false,
  1390. .pan = false,
  1391. };
  1392. struct s1_walk_result wr = {};
  1393. int ret;
  1394. if (is_hyp_ctxt(vcpu))
  1395. wi.regime = vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
  1396. else
  1397. wi.regime = TR_EL10;
  1398. ret = setup_s1_walk(vcpu, &wi, &wr, va);
  1399. if (ret)
  1400. return ret;
  1401. /* We really expect the S1 MMU to be on here... */
  1402. if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) {
  1403. *level = 0;
  1404. return 0;
  1405. }
  1406. /* Walk the guest's PT, looking for a match along the way */
  1407. ret = walk_s1(vcpu, &wi, &wr, va);
  1408. switch (ret) {
  1409. case -EINTR:
  1410. /* We interrupted the walk on a match, return the level */
  1411. *level = dm.level;
  1412. return 0;
  1413. case 0:
  1414. /* The walk completed, we failed to find the entry */
  1415. return -ENOENT;
  1416. default:
  1417. /* Any other error... */
  1418. return ret;
  1419. }
  1420. }
  1421. static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
  1422. {
  1423. u64 tmp = old;
  1424. int ret = 0;
  1425. uaccess_enable_privileged();
  1426. asm volatile(__LSE_PREAMBLE
  1427. "1: cas %[old], %[new], %[addr]\n"
  1428. "2:\n"
  1429. _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
  1430. : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
  1431. : [new] "r" (new)
  1432. : "memory");
  1433. uaccess_disable_privileged();
  1434. if (ret)
  1435. return ret;
  1436. if (tmp != old)
  1437. return -EAGAIN;
  1438. return ret;
  1439. }
  1440. static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new)
  1441. {
  1442. int ret = 1;
  1443. u64 tmp;
  1444. uaccess_enable_privileged();
  1445. asm volatile("prfm pstl1strm, %[addr]\n"
  1446. "1: ldxr %[tmp], %[addr]\n"
  1447. "sub %[tmp], %[tmp], %[old]\n"
  1448. "cbnz %[tmp], 3f\n"
  1449. "2: stlxr %w[ret], %[new], %[addr]\n"
  1450. "3:\n"
  1451. _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret])
  1452. _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret])
  1453. : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp)
  1454. : [old] "r" (old), [new] "r" (new)
  1455. : "memory");
  1456. uaccess_disable_privileged();
  1457. /* STLXR didn't update the descriptor, or the compare failed */
  1458. if (ret == 1)
  1459. return -EAGAIN;
  1460. return ret;
  1461. }
  1462. int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
  1463. {
  1464. struct kvm_memory_slot *slot;
  1465. unsigned long hva;
  1466. u64 __user *ptep;
  1467. bool writable;
  1468. int offset;
  1469. gfn_t gfn;
  1470. int r;
  1471. lockdep_assert(srcu_read_lock_held(&kvm->srcu));
  1472. gfn = ipa >> PAGE_SHIFT;
  1473. offset = offset_in_page(ipa);
  1474. slot = gfn_to_memslot(kvm, gfn);
  1475. hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
  1476. if (kvm_is_error_hva(hva))
  1477. return -EINVAL;
  1478. if (!writable)
  1479. return -EPERM;
  1480. ptep = (void __user *)hva + offset;
  1481. if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
  1482. r = __lse_swap_desc(ptep, old, new);
  1483. else
  1484. r = __llsc_swap_desc(ptep, old, new);
  1485. if (r < 0)
  1486. return r;
  1487. mark_page_dirty_in_slot(kvm, slot, gfn);
  1488. return 0;
  1489. }