sm4-ce-gcm-core.S 17 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742
  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /*
  3. * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
  4. * as specified in rfc8998
  5. * https://datatracker.ietf.org/doc/html/rfc8998
  6. *
  7. * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  8. * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
  9. */
  10. #include <linux/linkage.h>
  11. #include <linux/cfi_types.h>
  12. #include <asm/assembler.h>
  13. #include "sm4-ce-asm.h"
  14. .arch armv8-a+crypto
  /*
   * The sm4e macro below emits the SM4E instruction as a raw .inst word, so
   * it needs plain register numbers. This loop defines one assembler symbol
   * ".Lv<N>.4s" with value N for each vector register number listed; only
   * those registers can be passed to sm4e.
   */
  15. .irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
  16. .set .Lv\b\().4s, \b
  17. .endr
  /*
   * sm4e vd, vn: OR the number of vn (shifted left by 5) and the number of
   * vd into the base encoding 0xcec08400. Both operands must be spelled
   * exactly "vN.4s" so that they match a symbol defined by the loop above.
   */
  18. .macro sm4e, vd, vn
  19. .inst 0xcec08400 | (.L\vn << 5) | .L\vd
  20. .endm
  21. /* Register macros */
  22. /* Used for both encryption and decryption */
  /* RHASH: running GHASH value; the routines keep it rbit-reversed in registers */
  23. #define RHASH v21
  /* RRCONST: reduction constant, replicated from .Lghash_rconst with ld1r */
  24. #define RRCONST v22
  /* RZERO: all-zero vector; each routine in this file clears it on entry */
  25. #define RZERO v23
  26. /* Helper macros. */
  27. /*
  28. * input: m0, m1
  29. * output: r0:r1 (low 128-bits in r0, high in r1)
  30. */
  /*
   * PMUL_128x128: carry-less 128x128 -> 256-bit multiply built from four
   * 64x64 PMULL/PMULL2 products.
   *   r0 = m0.lo * m1.lo, r1 = m0.hi * m1.hi
   *   T0 first holds m1 with its 64-bit halves swapped, which lets the two
   *   cross products (m0.lo * m1.hi and m0.hi * m1.lo) be formed; their XOR
   *   is split at 64 bits and folded into the upper half of r0 and the
   *   lower half of r1.
   * Clobbers T0 and T1. RZERO must be zero. No modular reduction is done
   * here; see REDUCTION.
   */
  31. #define PMUL_128x128(r0, r1, m0, m1, T0, T1) \
  32. ext T0.16b, m1.16b, m1.16b, #8; \
  33. pmull r0.1q, m0.1d, m1.1d; \
  34. pmull T1.1q, m0.1d, T0.1d; \
  35. pmull2 T0.1q, m0.2d, T0.2d; \
  36. pmull2 r1.1q, m0.2d, m1.2d; \
  37. eor T0.16b, T0.16b, T1.16b; \
  38. ext T1.16b, RZERO.16b, T0.16b, #8; \
  39. ext T0.16b, T0.16b, RZERO.16b, #8; \
  40. eor r0.16b, r0.16b, T1.16b; \
  41. eor r1.16b, r1.16b, T0.16b;
  /*
   * PMUL_128x128_4x: four independent PMUL_128x128 multiplications with
   * their instructions interleaved step by step.
   *   r0:r1 = m0 * m1, r2:r3 = m2 * m3, r4:r5 = m4 * m5, r6:r7 = m6 * m7
   * Each product uses its own temporary pair (T0/T1 ... T6/T7), all of which
   * are clobbered. The four 256-bit products are NOT summed here; the
   * callers XOR them together before calling REDUCTION. RZERO must be zero.
   */
  42. #define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1, \
  43. r2, r3, m2, m3, T2, T3, \
  44. r4, r5, m4, m5, T4, T5, \
  45. r6, r7, m6, m7, T6, T7) \
  46. ext T0.16b, m1.16b, m1.16b, #8; \
  47. ext T2.16b, m3.16b, m3.16b, #8; \
  48. ext T4.16b, m5.16b, m5.16b, #8; \
  49. ext T6.16b, m7.16b, m7.16b, #8; \
  50. pmull r0.1q, m0.1d, m1.1d; \
  51. pmull r2.1q, m2.1d, m3.1d; \
  52. pmull r4.1q, m4.1d, m5.1d; \
  53. pmull r6.1q, m6.1d, m7.1d; \
  54. pmull T1.1q, m0.1d, T0.1d; \
  55. pmull T3.1q, m2.1d, T2.1d; \
  56. pmull T5.1q, m4.1d, T4.1d; \
  57. pmull T7.1q, m6.1d, T6.1d; \
  58. pmull2 T0.1q, m0.2d, T0.2d; \
  59. pmull2 T2.1q, m2.2d, T2.2d; \
  60. pmull2 T4.1q, m4.2d, T4.2d; \
  61. pmull2 T6.1q, m6.2d, T6.2d; \
  62. pmull2 r1.1q, m0.2d, m1.2d; \
  63. pmull2 r3.1q, m2.2d, m3.2d; \
  64. pmull2 r5.1q, m4.2d, m5.2d; \
  65. pmull2 r7.1q, m6.2d, m7.2d; \
  66. eor T0.16b, T0.16b, T1.16b; \
  67. eor T2.16b, T2.16b, T3.16b; \
  68. eor T4.16b, T4.16b, T5.16b; \
  69. eor T6.16b, T6.16b, T7.16b; \
  70. ext T1.16b, RZERO.16b, T0.16b, #8; \
  71. ext T3.16b, RZERO.16b, T2.16b, #8; \
  72. ext T5.16b, RZERO.16b, T4.16b, #8; \
  73. ext T7.16b, RZERO.16b, T6.16b, #8; \
  74. ext T0.16b, T0.16b, RZERO.16b, #8; \
  75. ext T2.16b, T2.16b, RZERO.16b, #8; \
  76. ext T4.16b, T4.16b, RZERO.16b, #8; \
  77. ext T6.16b, T6.16b, RZERO.16b, #8; \
  78. eor r0.16b, r0.16b, T1.16b; \
  79. eor r2.16b, r2.16b, T3.16b; \
  80. eor r4.16b, r4.16b, T5.16b; \
  81. eor r6.16b, r6.16b, T7.16b; \
  82. eor r1.16b, r1.16b, T0.16b; \
  83. eor r3.16b, r3.16b, T2.16b; \
  84. eor r5.16b, r5.16b, T4.16b; \
  85. eor r7.16b, r7.16b, T6.16b;
  86. /*
  87. * input: r0:r1 (low 128-bits in r0, high in r1)
  88. * output: a
  89. */
  /*
   * REDUCTION: fold the 256-bit value r0:r1 down to 128 bits in two steps.
   *   1. multiply the upper 64 bits of r1 by rconst; XOR the upper half of
   *      that product into the lower half of r1 and its lower half into the
   *      upper half of r0
   *   2. multiply the (updated) lower 64 bits of r1 by rconst and XOR the
   *      product with r0 to give a
   * r0, r1, T0 and T1 are all overwritten. RZERO must be zero. Every caller
   * in this file passes RRCONST, which holds .Lghash_rconst in both lanes.
   */
  90. #define REDUCTION(a, r0, r1, rconst, T0, T1) \
  91. pmull2 T0.1q, r1.2d, rconst.2d; \
  92. ext T1.16b, T0.16b, RZERO.16b, #8; \
  93. ext T0.16b, RZERO.16b, T0.16b, #8; \
  94. eor r1.16b, r1.16b, T1.16b; \
  95. eor r0.16b, r0.16b, T0.16b; \
  96. pmull T0.1q, r1.1d, rconst.1d; \
  97. eor a.16b, r0.16b, T0.16b;
  /*
   * SM4_CRYPT_PMUL_128x128_BLK: run the SM4 rounds on block b0 and, in the
   * gaps between the sm4e instructions, compute the unreduced carry-less
   * product r0:r1 = m0 * m1 (same steps as PMUL_128x128).
   *
   * b0 is byte-swapped per 32-bit word on entry and exit (rev32); after the
   * eight sm4e steps its four words are put in reverse order (rev64 + ext).
   * The sm4e steps take v24..v31 as second operand -- presumably the round
   * keys loaded by SM4_PREPARE from sm4-ce-asm.h; confirm there.
   * b0 must be a register the sm4e macro knows (v0-v3). Clobbers T0 and T1.
   * RZERO must be zero.
   */
  98. #define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1) \
  99. rev32 b0.16b, b0.16b; \
  100. ext T0.16b, m1.16b, m1.16b, #8; \
  101. sm4e b0.4s, v24.4s; \
  102. pmull r0.1q, m0.1d, m1.1d; \
  103. sm4e b0.4s, v25.4s; \
  104. pmull T1.1q, m0.1d, T0.1d; \
  105. sm4e b0.4s, v26.4s; \
  106. pmull2 T0.1q, m0.2d, T0.2d; \
  107. sm4e b0.4s, v27.4s; \
  108. pmull2 r1.1q, m0.2d, m1.2d; \
  109. sm4e b0.4s, v28.4s; \
  110. eor T0.16b, T0.16b, T1.16b; \
  111. sm4e b0.4s, v29.4s; \
  112. ext T1.16b, RZERO.16b, T0.16b, #8; \
  113. sm4e b0.4s, v30.4s; \
  114. ext T0.16b, T0.16b, RZERO.16b, #8; \
  115. sm4e b0.4s, v31.4s; \
  116. eor r0.16b, r0.16b, T1.16b; \
  117. rev64 b0.4s, b0.4s; \
  118. eor r1.16b, r1.16b, T0.16b; \
  119. ext b0.16b, b0.16b, b0.16b, #8; \
  120. rev32 b0.16b, b0.16b;
  /*
   * SM4_CRYPT_PMUL_128x128_BLK3: three-block version of
   * SM4_CRYPT_PMUL_128x128_BLK. Blocks b0, b1 and b2 go through the SM4
   * rounds while three carry-less products are computed in between:
   *   m0 * m1, m2 * m3 and m4 * m5
   * Unlike PMUL_128x128_4x, the products ARE summed here: on exit r0:r1
   * holds the XOR of all three, and r2..r5 hold only the partial products
   * of the second and third multiplication. No reduction is performed.
   * b0-b2 must be registers the sm4e macro knows (v0-v3). Clobbers T0..T5.
   * RZERO must be zero.
   */
  121. #define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2, \
  122. r0, r1, m0, m1, T0, T1, \
  123. r2, r3, m2, m3, T2, T3, \
  124. r4, r5, m4, m5, T4, T5) \
  125. rev32 b0.16b, b0.16b; \
  126. rev32 b1.16b, b1.16b; \
  127. rev32 b2.16b, b2.16b; \
  128. ext T0.16b, m1.16b, m1.16b, #8; \
  129. ext T2.16b, m3.16b, m3.16b, #8; \
  130. ext T4.16b, m5.16b, m5.16b, #8; \
  131. sm4e b0.4s, v24.4s; \
  132. sm4e b1.4s, v24.4s; \
  133. sm4e b2.4s, v24.4s; \
  134. pmull r0.1q, m0.1d, m1.1d; \
  135. pmull r2.1q, m2.1d, m3.1d; \
  136. pmull r4.1q, m4.1d, m5.1d; \
  137. sm4e b0.4s, v25.4s; \
  138. sm4e b1.4s, v25.4s; \
  139. sm4e b2.4s, v25.4s; \
  140. pmull T1.1q, m0.1d, T0.1d; \
  141. pmull T3.1q, m2.1d, T2.1d; \
  142. pmull T5.1q, m4.1d, T4.1d; \
  143. sm4e b0.4s, v26.4s; \
  144. sm4e b1.4s, v26.4s; \
  145. sm4e b2.4s, v26.4s; \
  146. pmull2 T0.1q, m0.2d, T0.2d; \
  147. pmull2 T2.1q, m2.2d, T2.2d; \
  148. pmull2 T4.1q, m4.2d, T4.2d; \
  149. sm4e b0.4s, v27.4s; \
  150. sm4e b1.4s, v27.4s; \
  151. sm4e b2.4s, v27.4s; \
  152. pmull2 r1.1q, m0.2d, m1.2d; \
  153. pmull2 r3.1q, m2.2d, m3.2d; \
  154. pmull2 r5.1q, m4.2d, m5.2d; \
  155. sm4e b0.4s, v28.4s; \
  156. sm4e b1.4s, v28.4s; \
  157. sm4e b2.4s, v28.4s; \
  158. eor T0.16b, T0.16b, T1.16b; \
  159. eor T2.16b, T2.16b, T3.16b; \
  160. eor T4.16b, T4.16b, T5.16b; \
  161. sm4e b0.4s, v29.4s; \
  162. sm4e b1.4s, v29.4s; \
  163. sm4e b2.4s, v29.4s; \
  164. ext T1.16b, RZERO.16b, T0.16b, #8; \
  165. ext T3.16b, RZERO.16b, T2.16b, #8; \
  166. ext T5.16b, RZERO.16b, T4.16b, #8; \
  167. sm4e b0.4s, v30.4s; \
  168. sm4e b1.4s, v30.4s; \
  169. sm4e b2.4s, v30.4s; \
  170. ext T0.16b, T0.16b, RZERO.16b, #8; \
  171. ext T2.16b, T2.16b, RZERO.16b, #8; \
  172. ext T4.16b, T4.16b, RZERO.16b, #8; \
  173. sm4e b0.4s, v31.4s; \
  174. sm4e b1.4s, v31.4s; \
  175. sm4e b2.4s, v31.4s; \
  176. eor r0.16b, r0.16b, T1.16b; \
  177. eor r2.16b, r2.16b, T3.16b; \
  178. eor r4.16b, r4.16b, T5.16b; \
  179. rev64 b0.4s, b0.4s; \
  180. rev64 b1.4s, b1.4s; \
  181. rev64 b2.4s, b2.4s; \
  182. eor r1.16b, r1.16b, T0.16b; \
  183. eor r3.16b, r3.16b, T2.16b; \
  184. eor r5.16b, r5.16b, T4.16b; \
  185. ext b0.16b, b0.16b, b0.16b, #8; \
  186. ext b1.16b, b1.16b, b1.16b, #8; \
  187. ext b2.16b, b2.16b, b2.16b, #8; \
  188. eor r0.16b, r0.16b, r2.16b; \
  189. eor r1.16b, r1.16b, r3.16b; \
  190. rev32 b0.16b, b0.16b; \
  191. rev32 b1.16b, b1.16b; \
  192. rev32 b2.16b, b2.16b; \
  193. eor r0.16b, r0.16b, r4.16b; \
  194. eor r1.16b, r1.16b, r5.16b;
  /*
   * inc32_le128: emit the current counter block into vctr, then advance the
   * counter.
   *   x8:x9 hold the two 64-bit halves of the counter after the "rev" done
   *   by the callers; the 32-bit block counter is the low word of x9.
   *   vctr receives the counter value from BEFORE the increment, byte-
   *   reversed per 64-bit lane (rev64).
   *   Only the low 32 bits of x9 are replaced (bfi), so the counter wraps
   *   modulo 2^32 without carrying into the upper bits.
   * Clobbers w6/x6.
   */
  195. #define inc32_le128(vctr) \
  196. mov vctr.d[1], x9; \
  197. add w6, w9, #1; \
  198. mov vctr.d[0], x8; \
  199. bfi x9, x6, #0, #32; \
  200. rev64 vctr.16b, vctr.16b;
  /*
   * GTAG_HASH_LENGTHS: final step that turns the running hash into the tag.
   *   - loads the 16-byte lengths block from [x7] into vlen
   *   - forces the low 32 bits of x9 to 1 and builds that counter block
   *     (CTR0) in vctr0
   *   - XORs the bit-reversed lengths block into RHASH, multiplies by RH1
   *     while vctr0 is being encrypted, then reduces
   *   - leaves RHASH = rbit(hash) ^ E(CTR0), ready to be stored by the caller
   * Overwrites x6 and the low word of x9, so the running counter is lost;
   * vctr0 must be a register the sm4e macro knows (v0-v3).
   * RR0/RR1, RTMP0-RTMP3 and RH1 resolve to whichever register macros are in
   * effect where this macro is expanded.
   */
  201. #define GTAG_HASH_LENGTHS(vctr0, vlen) \
  202. ld1 {vlen.16b}, [x7]; \
  203. /* construct CTR0 */ \
  204. /* the lower 32-bits of initial IV is always be32(1) */ \
  205. mov x6, #0x1; \
  206. bfi x9, x6, #0, #32; \
  207. mov vctr0.d[0], x8; \
  208. mov vctr0.d[1], x9; \
  209. rbit vlen.16b, vlen.16b; \
  210. rev64 vctr0.16b, vctr0.16b; \
  211. /* authtag = GCTR(CTR0, GHASH) */ \
  212. eor RHASH.16b, RHASH.16b, vlen.16b; \
  213. SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1, \
  214. RTMP0, RTMP1); \
  215. REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3); \
  216. rbit RHASH.16b, RHASH.16b; \
  217. eor RHASH.16b, RHASH.16b, vctr0.16b;
  218. /* Register macros for encrypt and ghash */
  /*
   * This mapping is in effect for sm4_ce_pmull_ghash_setup,
   * pmull_ghash_update and sm4_ce_pmull_gcm_enc, and is #undef'ed again
   * before the decrypt routine.
   */
  219. /* can be the same as input v0-v3 */
  /* RR1/RR3/RR5/RR7: upper 128 bits of the four products (alias v0-v3) */
  220. #define RR1 v0
  221. #define RR3 v1
  222. #define RR5 v2
  223. #define RR7 v3
  /* RR0/RR2/RR4/RR6: lower 128 bits of the four products */
  224. #define RR0 v4
  225. #define RR2 v5
  226. #define RR4 v6
  227. #define RR6 v7
  /* RTMP0-RTMP7: scratch for the multiply/reduce macros and input loads */
  228. #define RTMP0 v8
  229. #define RTMP1 v9
  230. #define RTMP2 v10
  231. #define RTMP3 v11
  232. #define RTMP4 v12
  233. #define RTMP5 v13
  234. #define RTMP6 v14
  235. #define RTMP7 v15
  /* RH1-RH4: the four 16-byte entries of the ghash table, in table order */
  236. #define RH1 v16
  237. #define RH2 v17
  238. #define RH3 v18
  239. #define RH4 v19
  240. .align 3
  /*
   * sm4_ce_pmull_ghash_setup: derive the hash key H by running the cipher
   * over an all-zero block, then store H^1..H^4 (bit-reversed representation)
   * as four consecutive 16-byte entries at [x1].
   * SM4_PREPARE and SM4_CRYPT_BLK_BE come from sm4-ce-asm.h -- presumably
   * they load the round keys from x0 and encrypt one block; confirm there.
   */
  241. SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
  242. /* input:
  243. * x0: round key array, CTX
  244. * x1: ghash table
  245. */
  246. SM4_PREPARE(x0)
  247. adr_l x2, .Lghash_rconst
  /* replicate the 64-bit reduction constant into both lanes */
  248. ld1r {RRCONST.2d}, [x2]
  249. eor RZERO.16b, RZERO.16b, RZERO.16b
  250. /* H = E(K, 0^128) */
  /* rev32 of the zero vector is still zero: v0 is the all-zero input block */
  251. rev32 v0.16b, RZERO.16b
  252. SM4_CRYPT_BLK_BE(v0)
  253. /* H ^ 1 */
  /* all table entries are kept with the bits of every byte reversed */
  254. rbit RH1.16b, v0.16b
  255. /* H ^ 2 */
  256. PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
  257. REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)
  258. /* H ^ 3 */
  259. PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
  260. REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)
  261. /* H ^ 4 */
  /* computed as H^2 * H^2 */
  262. PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
  263. REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)
  /* 64-byte table: H^1, H^2, H^3, H^4 */
  264. st1 {RH1.16b-RH4.16b}, [x1]
  265. ret
  266. SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
  .align 3
  /*
   * pmull_ghash_update - absorb whole 16-byte blocks into a GHASH state.
   *
   * input:
   *   x0: ghash table (four 16-byte entries, H^1..H^4)
   *   x1: ghash result (16 bytes, read on entry and written back on exit)
   *   x2: src
   *   w3: nblocks (number of 16-byte blocks at src; 0 is accepted)
   *
   * Blocks are consumed four at a time while at least four remain, then one
   * at a time. The hash is kept bit-reversed (rbit) while in registers.
   *
   * Clobbers x2, w3, x4, v0-v19 and v21-v23.
   *
   * Note: this block has been stripped of the listing-number prefixes that
   * the surrounding paste carries on every line.
   */
  SYM_FUNC_START(pmull_ghash_update)
          ld1             {RH1.16b-RH4.16b}, [x0]

          ld1             {RHASH.16b}, [x1]
          rbit            RHASH.16b, RHASH.16b

          adr_l           x4, .Lghash_rconst
          ld1r            {RRCONST.2d}, [x4]

          eor             RZERO.16b, RZERO.16b, RZERO.16b

          /*
           * The single-block loop below decrements before it tests, so a
           * zero count would wrap to 0xffffffff and read far past src.
           * Bail out early, like the enc/dec routines do for nbytes == 0;
           * the hash is written back unchanged.
           */
          cbz             w3, .Lghash_end

  .Lghash_loop_4x:
          cmp             w3, #4
          blt             .Lghash_loop_1x

          sub             w3, w3, #4

          ld1             {v0.16b-v3.16b}, [x2], #64

          rbit            v0.16b, v0.16b
          rbit            v1.16b, v1.16b
          rbit            v2.16b, v2.16b
          rbit            v3.16b, v3.16b

          /*
           * (in0 ^ HASH) * H^4 => rr0:rr1
           * (in1)        * H^3 => rr2:rr3
           * (in2)        * H^2 => rr4:rr5
           * (in3)        * H^1 => rr6:rr7
           */
          eor             RHASH.16b, RHASH.16b, v0.16b

          PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
                          RR2, RR3, v1, RH3, RTMP2, RTMP3,
                          RR4, RR5, v2, RH2, RTMP4, RTMP5,
                          RR6, RR7, v3, RH1, RTMP6, RTMP7)

          /* sum the four unreduced products, then reduce once */
          eor             RR0.16b, RR0.16b, RR2.16b
          eor             RR1.16b, RR1.16b, RR3.16b
          eor             RR0.16b, RR0.16b, RR4.16b
          eor             RR1.16b, RR1.16b, RR5.16b
          eor             RR0.16b, RR0.16b, RR6.16b
          eor             RR1.16b, RR1.16b, RR7.16b

          REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

          cbz             w3, .Lghash_end
          b               .Lghash_loop_4x

  .Lghash_loop_1x:
          /* invariant: 1 <= w3 <= 3 on every entry to this loop body */
          sub             w3, w3, #1

          ld1             {v0.16b}, [x2], #16
          rbit            v0.16b, v0.16b
          eor             RHASH.16b, RHASH.16b, v0.16b

          PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
          REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

          cbnz            w3, .Lghash_loop_1x

  .Lghash_end:
          /* undo the bit reversal before handing the hash back */
          rbit            RHASH.16b, RHASH.16b
          st1             {RHASH.2d}, [x1]

          ret
  SYM_FUNC_END(pmull_ghash_update)
  323. .align 3
  /*
   * sm4_ce_pmull_gcm_enc: counter-mode encrypt nbytes from src to dst and
   * fold the resulting ciphertext into the running hash.
   * If x7 is zero the advanced counter is written back to [x3] and the
   * running hash to [x5]; otherwise the final tag is computed from the
   * lengths block at [x7] and stored at [x5], and the counter is NOT
   * written back.
   * SM4_PREPARE, SM4_CRYPT_BLK and SM4_CRYPT_BLK4 come from sm4-ce-asm.h.
   */
  324. SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
  325. /* input:
  326. * x0: round key array, CTX
  327. * x1: dst
  328. * x2: src
  329. * x3: ctr (big endian, 128 bit)
  330. * w4: nbytes
  331. * x5: ghash result
  332. * x6: ghash table
  333. * x7: lengths (only for last block)
  334. */
  335. SM4_PREPARE(x0)
  /* x8:x9 = counter halves, byte-reversed so the low word of x9 can be incremented */
  336. ldp x8, x9, [x3]
  337. rev x8, x8
  338. rev x9, x9
  339. ld1 {RH1.16b-RH4.16b}, [x6]
  340. ld1 {RHASH.16b}, [x5]
  341. rbit RHASH.16b, RHASH.16b
  /* x6 (table pointer) is no longer needed and is reused as scratch */
  342. adr_l x6, .Lghash_rconst
  343. ld1r {RRCONST.2d}, [x6]
  344. eor RZERO.16b, RZERO.16b, RZERO.16b
  /* nothing to encrypt: go straight to the tag / write-back step */
  345. cbz w4, .Lgcm_enc_hash_len
  346. .Lgcm_enc_loop_4x:
  /* four blocks per iteration while at least 64 bytes remain */
  347. cmp w4, #(4 * 16)
  348. blt .Lgcm_enc_loop_1x
  349. sub w4, w4, #(4 * 16)
  350. /* construct CTRs */
  351. inc32_le128(v0) /* +0 */
  352. inc32_le128(v1) /* +1 */
  353. inc32_le128(v2) /* +2 */
  354. inc32_le128(v3) /* +3 */
  355. ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64
  356. SM4_CRYPT_BLK4(v0, v1, v2, v3)
  /* v0-v3 = keystream ^ input = ciphertext */
  357. eor v0.16b, v0.16b, RTMP0.16b
  358. eor v1.16b, v1.16b, RTMP1.16b
  359. eor v2.16b, v2.16b, RTMP2.16b
  360. eor v3.16b, v3.16b, RTMP3.16b
  361. st1 {v0.16b-v3.16b}, [x1], #64
  362. /* ghash update */
  /* the hash is computed over the ciphertext just produced */
  363. rbit v0.16b, v0.16b
  364. rbit v1.16b, v1.16b
  365. rbit v2.16b, v2.16b
  366. rbit v3.16b, v3.16b
  367. /*
  368. * (in0 ^ HASH) * H^4 => rr0:rr1
  369. * (in1) * H^3 => rr2:rr3
  370. * (in2) * H^2 => rr4:rr5
  371. * (in3) * H^1 => rr6:rr7
  372. */
  373. eor RHASH.16b, RHASH.16b, v0.16b
  374. PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
  375. RR2, RR3, v1, RH3, RTMP2, RTMP3,
  376. RR4, RR5, v2, RH2, RTMP4, RTMP5,
  377. RR6, RR7, v3, RH1, RTMP6, RTMP7)
  /* sum the four unreduced products, then reduce once */
  378. eor RR0.16b, RR0.16b, RR2.16b
  379. eor RR1.16b, RR1.16b, RR3.16b
  380. eor RR0.16b, RR0.16b, RR4.16b
  381. eor RR1.16b, RR1.16b, RR5.16b
  382. eor RR0.16b, RR0.16b, RR6.16b
  383. eor RR1.16b, RR1.16b, RR7.16b
  384. REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)
  385. cbz w4, .Lgcm_enc_hash_len
  386. b .Lgcm_enc_loop_4x
  387. .Lgcm_enc_loop_1x:
  /* one block per iteration while at least 16 bytes remain */
  388. cmp w4, #16
  389. blt .Lgcm_enc_tail
  390. sub w4, w4, #16
  391. /* construct CTRs */
  392. inc32_le128(v0)
  393. ld1 {RTMP0.16b}, [x2], #16
  394. SM4_CRYPT_BLK(v0)
  395. eor v0.16b, v0.16b, RTMP0.16b
  396. st1 {v0.16b}, [x1], #16
  397. /* ghash update */
  398. rbit v0.16b, v0.16b
  399. eor RHASH.16b, RHASH.16b, v0.16b
  400. PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
  401. REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
  402. cbz w4, .Lgcm_enc_hash_len
  403. b .Lgcm_enc_loop_1x
  404. .Lgcm_enc_tail:
  /* here 1 <= w4 <= 15: final partial block, handled byte by byte */
  405. /* construct CTRs */
  406. inc32_le128(v0)
  407. SM4_CRYPT_BLK(v0)
  408. /* load permute table */
  /*
   * x0 is reused as scratch from here on. The 16 bytes at table + 32 - w4
   * are w4 indices selecting the top w4 bytes of a vector, followed by
   * 0xff entries, which tbl turns into zero bytes.
   */
  409. adr_l x0, .Lcts_permute_table
  410. add x0, x0, #32
  411. sub x0, x0, w4, uxtw
  412. ld1 {v3.16b}, [x0]
  413. .Lgcm_enc_tail_loop:
  414. /* do encrypt */
  415. ldrb w0, [x2], #1 /* get 1 byte from input */
  416. umov w6, v0.b[0] /* get top crypted byte */
  417. eor w6, w6, w0 /* w6 = CTR ^ input */
  418. strb w6, [x1], #1 /* store out byte */
  419. /* shift right out one byte */
  420. ext v0.16b, v0.16b, v0.16b, #1
  421. /* the last ciphertext is placed in high bytes */
  /* encrypt: the byte fed to the hash is the OUTPUT byte (w6) */
  422. ins v0.b[15], w6
  423. subs w4, w4, #1
  424. bne .Lgcm_enc_tail_loop
  425. /* padding last block with zeros */
  /* moves the collected ciphertext bytes to the bottom and zero-fills the rest */
  426. tbl v0.16b, {v0.16b}, v3.16b
  427. /* ghash update */
  428. rbit v0.16b, v0.16b
  429. eor RHASH.16b, RHASH.16b, v0.16b
  430. PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
  431. REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
  432. .Lgcm_enc_hash_len:
  /* x7 == 0: not the last call, so keep the counter and raw hash instead of a tag */
  433. cbz x7, .Lgcm_enc_end
  434. GTAG_HASH_LENGTHS(v1, v3)
  435. b .Lgcm_enc_ret
  436. .Lgcm_enc_end:
  437. /* store new CTR */
  438. rev x8, x8
  439. rev x9, x9
  440. stp x8, x9, [x3]
  /* undo the bit reversal before the hash is stored */
  441. rbit RHASH.16b, RHASH.16b
  442. .Lgcm_enc_ret:
  443. /* store new MAC */
  444. st1 {RHASH.2d}, [x5]
  445. ret
  446. SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
  /* drop the encrypt/ghash register mapping before defining the decrypt one */
  447. #undef RR1
  448. #undef RR3
  449. #undef RR5
  450. #undef RR7
  451. #undef RR0
  452. #undef RR2
  453. #undef RR4
  454. #undef RR6
  455. #undef RTMP0
  456. #undef RTMP1
  457. #undef RTMP2
  458. #undef RTMP3
  459. #undef RTMP4
  460. #undef RTMP5
  461. #undef RTMP6
  462. #undef RTMP7
  463. #undef RH1
  464. #undef RH2
  465. #undef RH3
  466. #undef RH4
  467. /* Register macros for decrypt */
  468. /* v0-v2 for building CTRs, v3-v5 for saving inputs */
  /*
   * The decrypt routine works on three blocks at a time, so only three
   * product pairs, six temporaries and H^1..H^3 are mapped.
   */
  /* RR1/RR3/RR5: upper 128 bits of the three products */
  469. #define RR1 v6
  470. #define RR3 v7
  471. #define RR5 v8
  /* RR0/RR2/RR4: lower 128 bits of the three products */
  472. #define RR0 v9
  473. #define RR2 v10
  474. #define RR4 v11
  /* RTMP0-RTMP5: scratch for the multiply/reduce macros */
  475. #define RTMP0 v12
  476. #define RTMP1 v13
  477. #define RTMP2 v14
  478. #define RTMP3 v15
  479. #define RTMP4 v16
  480. #define RTMP5 v17
  /* RH1-RH3: first three 16-byte entries of the ghash table */
  481. #define RH1 v18
  482. #define RH2 v19
  483. #define RH3 v20
  484. .align 3
  /*
   * sm4_ce_pmull_gcm_dec: counter-mode decrypt nbytes from src to dst and
   * fold the INPUT (ciphertext) into the running hash. Keystream generation
   * and the hash multiply are interleaved, three blocks at a time.
   * If x7 is zero the advanced counter is written back to [x3] and the
   * running hash to [x5]; otherwise the final tag is computed from the
   * lengths block at [x7] and stored at [x5], and the counter is NOT
   * written back.
   * SM4_PREPARE and SM4_CRYPT_BLK come from sm4-ce-asm.h.
   */
  485. SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
  486. /* input:
  487. * x0: round key array, CTX
  488. * x1: dst
  489. * x2: src
  490. * x3: ctr (big endian, 128 bit)
  491. * w4: nbytes
  492. * x5: ghash result
  493. * x6: ghash table
  494. * x7: lengths (only for last block)
  495. */
  496. SM4_PREPARE(x0)
  /* x8:x9 = counter halves, byte-reversed so the low word of x9 can be incremented */
  497. ldp x8, x9, [x3]
  498. rev x8, x8
  499. rev x9, x9
  /* only the first three table entries (48 bytes) are loaded here */
  500. ld1 {RH1.16b-RH3.16b}, [x6]
  501. ld1 {RHASH.16b}, [x5]
  502. rbit RHASH.16b, RHASH.16b
  /* x6 (table pointer) is no longer needed and is reused as scratch */
  503. adr_l x6, .Lghash_rconst
  504. ld1r {RRCONST.2d}, [x6]
  505. eor RZERO.16b, RZERO.16b, RZERO.16b
  /* nothing to decrypt: go straight to the tag / write-back step */
  506. cbz w4, .Lgcm_dec_hash_len
  507. .Lgcm_dec_loop_3x:
  /* three blocks per iteration while at least 48 bytes remain */
  508. cmp w4, #(3 * 16)
  509. blt .Lgcm_dec_loop_1x
  510. sub w4, w4, #(3 * 16)
  /* v3-v5 keep the raw ciphertext for the XOR after the keystream is ready */
  511. ld1 {v3.16b-v5.16b}, [x2], #(3 * 16)
  512. /* construct CTRs */
  /* v6-v8 receive the bit-reversed ciphertext used as hash input */
  513. inc32_le128(v0) /* +0 */
  514. rbit v6.16b, v3.16b
  515. inc32_le128(v1) /* +1 */
  516. rbit v7.16b, v4.16b
  517. inc32_le128(v2) /* +2 */
  518. rbit v8.16b, v5.16b
  519. eor RHASH.16b, RHASH.16b, v6.16b
  520. /* decrypt & ghash update */
  /*
   * (in0 ^ HASH) * H^3, in1 * H^2 and in2 * H^1; the macro leaves their sum
   * in RR0:RR1. v7 and v8 are also RR3 and RR5, so the hash inputs are
   * overwritten by the products.
   */
  521. SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
  522. RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
  523. RR2, RR3, v7, RH2, RTMP2, RTMP3,
  524. RR4, RR5, v8, RH1, RTMP4, RTMP5)
  /* v0-v2 = keystream ^ ciphertext = plaintext */
  525. eor v0.16b, v0.16b, v3.16b
  526. eor v1.16b, v1.16b, v4.16b
  527. eor v2.16b, v2.16b, v5.16b
  528. REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)
  529. st1 {v0.16b-v2.16b}, [x1], #(3 * 16)
  530. cbz w4, .Lgcm_dec_hash_len
  531. b .Lgcm_dec_loop_3x
  532. .Lgcm_dec_loop_1x:
  /* one block per iteration while at least 16 bytes remain */
  533. cmp w4, #16
  534. blt .Lgcm_dec_tail
  535. sub w4, w4, #16
  536. ld1 {v3.16b}, [x2], #16
  537. /* construct CTRs */
  538. inc32_le128(v0)
  539. rbit v6.16b, v3.16b
  540. eor RHASH.16b, RHASH.16b, v6.16b
  541. SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
  542. eor v0.16b, v0.16b, v3.16b
  543. REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
  544. st1 {v0.16b}, [x1], #16
  545. cbz w4, .Lgcm_dec_hash_len
  546. b .Lgcm_dec_loop_1x
  547. .Lgcm_dec_tail:
  /* here 1 <= w4 <= 15: final partial block, handled byte by byte */
  548. /* construct CTRs */
  549. inc32_le128(v0)
  550. SM4_CRYPT_BLK(v0)
  551. /* load permute table */
  /*
   * x0 is reused as scratch from here on. The 16 bytes at table + 32 - w4
   * are w4 indices selecting the top w4 bytes of a vector, followed by
   * 0xff entries, which tbl turns into zero bytes.
   */
  552. adr_l x0, .Lcts_permute_table
  553. add x0, x0, #32
  554. sub x0, x0, w4, uxtw
  555. ld1 {v3.16b}, [x0]
  556. .Lgcm_dec_tail_loop:
  557. /* do decrypt */
  558. ldrb w0, [x2], #1 /* get 1 byte from input */
  559. umov w6, v0.b[0] /* get top crypted byte */
  560. eor w6, w6, w0 /* w6 = CTR ^ input */
  561. strb w6, [x1], #1 /* store out byte */
  562. /* shift right out one byte */
  563. ext v0.16b, v0.16b, v0.16b, #1
  564. /* the last ciphertext is placed in high bytes */
  /* decrypt: the byte fed to the hash is the INPUT byte (w0), not the output */
  565. ins v0.b[15], w0
  566. subs w4, w4, #1
  567. bne .Lgcm_dec_tail_loop
  568. /* padding last block with zeros */
  /* moves the collected ciphertext bytes to the bottom and zero-fills the rest */
  569. tbl v0.16b, {v0.16b}, v3.16b
  570. /* ghash update */
  571. rbit v0.16b, v0.16b
  572. eor RHASH.16b, RHASH.16b, v0.16b
  573. PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
  574. REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
  575. .Lgcm_dec_hash_len:
  /* x7 == 0: not the last call, so keep the counter and raw hash instead of a tag */
  576. cbz x7, .Lgcm_dec_end
  577. GTAG_HASH_LENGTHS(v1, v3)
  578. b .Lgcm_dec_ret
  579. .Lgcm_dec_end:
  580. /* store new CTR */
  581. rev x8, x8
  582. rev x9, x9
  583. stp x8, x9, [x3]
  /* undo the bit reversal before the hash is stored */
  584. rbit RHASH.16b, RHASH.16b
  585. .Lgcm_dec_ret:
  586. /* store new MAC */
  587. st1 {RHASH.2d}, [x5]
  588. ret
  589. SYM_FUNC_END(sm4_ce_pmull_gcm_dec)
  590. .section ".rodata", "a"
  591. .align 4
  /*
   * TBL index table used by the enc/dec tail code: 16 bytes of 0xff, the
   * identity indices 0..15, then 16 more bytes of 0xff. The tail code loads
   * 16 bytes starting at (table + 32 - n) for a tail of n bytes, which gives
   * the indices 16-n..15 followed by 0xff entries. An index of 0xff is out
   * of range for a one-register tbl and therefore produces a zero byte.
   */
  592. .Lcts_permute_table:
  593. .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  594. .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  595. .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
  596. .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
  597. .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  598. .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  /*
   * 64-bit constant that every routine replicates into both lanes of RRCONST
   * (ld1r) and passes to REDUCTION as the fold multiplier.
   */
  599. .Lghash_rconst:
  600. .quad 0x87