chacha-core.S 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491
  1. /* SPDX-License-Identifier: GPL-2.0 OR MIT */
  2. /*
  3. * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
  4. * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  5. */
  /* Mask that rounds a byte count below 64 down to a multiple of 4, i.e. the bytes covered by whole 32-bit words. */
  6. #define MASK_U32 0x3c
  /* Bytes consumed and produced per block. */
  7. #define CHACHA20_BLOCK_SIZE 64
  /* Stack frame size in bytes. chacha_crypt_arch stores s0-s7 in it, hchacha_block_arch uses only the first slot. */
  8. #define STACK_SIZE 32
  /* X0..X15 are the registers holding the sixteen 32-bit working state words. */
  9. #define X0 $t0
  10. #define X1 $t1
  11. #define X2 $t2
  12. #define X3 $t3
  13. #define X4 $t4
  14. #define X5 $t5
  15. #define X6 $t6
  16. #define X7 $t7
  17. #define X8 $t8
  18. #define X9 $t9
  19. #define X10 $v1
  20. #define X11 $s6
  21. #define X12 $s5
  22. #define X13 $s4
  23. #define X14 $s3
  24. #define X15 $s2
  25. /* T0 and T1 are scratch registers. They are placed in s1 and s0, which are reloaded from the stack on exit, so no clear data stays behind in them. */
  26. #define T0 $s1
  27. #define T1 $s0
  /* Map a word index n to its register alias by token pasting. */
  28. #define T(n) T ## n
  29. #define X(n) X ## n
  30. /* Input arguments */
  31. #define STATE $a0
  32. #define OUT $a1
  33. #define IN $a2
  34. #define BYTES $a3
  35. /* Output argument */
  36. /* NONCE[0] (state word 12, at offset 48 of STATE) is kept in a register.
  37. * The copy in memory is not updated inside the block loop, the final value is stored when the function finishes.
  38. * The register must be incremented once per block.
  39. */
  40. #define NONCE_0 $v0
  41. /* SAVED_CA is loaded just before the jump table is entered and SAVED_X is computed in the jump table entry.
  42. * They use registers which are overwritten on exit so we don't leak clear data.
  43. * They are used to handle the last bytes when the length is not a multiple of 4.
  44. */
  45. #define SAVED_X X15
  46. #define SAVED_CA $s7
  /* IS_UNALIGNED shares s7 with SAVED_CA. SAVED_CA is only loaded on the partial-block paths,
   * after the alignment test for that block has been made.
   */
  47. #define IS_UNALIGNED $s7
  /* MSB and LSB are the byte offsets, within a 32-bit word, that STORE_UNALIGNED adds to the
   * lwl/swl and lwr/swr addresses.
   * CPU_TO_LE32(n) converts register n from CPU byte order to little endian. On big-endian builds
   * this is a full byte swap (wsbh swaps the bytes inside each halfword, rotr by 16 swaps the
   * halfwords). On little-endian builds it expands to nothing.
   */
  48. #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  49. #define MSB 0
  50. #define LSB 3
  51. #define CPU_TO_LE32(n) \
  52. wsbh n, n; \
  53. rotr n, 16;
  54. #else
  55. #define MSB 3
  56. #define LSB 0
  57. #define CPU_TO_LE32(n)
  58. #endif
  /* Expand x(n) once for every state word index n, in ascending order 0..15. */
  59. #define FOR_EACH_WORD(x) \
  60. x( 0); \
  61. x( 1); \
  62. x( 2); \
  63. x( 3); \
  64. x( 4); \
  65. x( 5); \
  66. x( 6); \
  67. x( 7); \
  68. x( 8); \
  69. x( 9); \
  70. x(10); \
  71. x(11); \
  72. x(12); \
  73. x(13); \
  74. x(14); \
  75. x(15);
  /* Same expansion in descending order 15..0. Used for the store chains, so that code entered
   * at the label in front of word n falls through the stores of all lower words.
   */
  76. #define FOR_EACH_WORD_REV(x) \
  77. x(15); \
  78. x(14); \
  79. x(13); \
  80. x(12); \
  81. x(11); \
  82. x(10); \
  83. x( 9); \
  84. x( 8); \
  85. x( 7); \
  86. x( 6); \
  87. x( 5); \
  88. x( 4); \
  89. x( 3); \
  90. x( 2); \
  91. x( 1); \
  92. x( 0);
  /* PLUS_ONE(x) yields the literal x + 1 for x in 0..15. It is a lookup done by token pasting,
   * so the result is a plain number token that can be pasted into a label name.
   */
  93. #define PLUS_ONE_0 1
  94. #define PLUS_ONE_1 2
  95. #define PLUS_ONE_2 3
  96. #define PLUS_ONE_3 4
  97. #define PLUS_ONE_4 5
  98. #define PLUS_ONE_5 6
  99. #define PLUS_ONE_6 7
  100. #define PLUS_ONE_7 8
  101. #define PLUS_ONE_8 9
  102. #define PLUS_ONE_9 10
  103. #define PLUS_ONE_10 11
  104. #define PLUS_ONE_11 12
  105. #define PLUS_ONE_12 13
  106. #define PLUS_ONE_13 14
  107. #define PLUS_ONE_14 15
  108. #define PLUS_ONE_15 16
  109. #define PLUS_ONE(x) PLUS_ONE_ ## x
  /* Paste three tokens together. The extra level of indirection lets the arguments
   * (for example PLUS_ONE(x)) be macro-expanded before they are pasted.
   */
  110. #define _CONCAT3(a,b,c) a ## b ## c
  111. #define CONCAT3(a,b,c) _CONCAT3(a,b,c)
  /* Finish output word x when IN or OUT may be unaligned:
   * add the input state word (NONCE_0 for word 12) to the working word, convert it to little
   * endian, xor it with the input word fetched with lwl/lwr and write the result with swl/swr.
   * T0 and T1 are clobbered.
   * The label emitted in front is numbered x + 1. Jump table entry x + 1 branches to it, so that
   * in the chain built by FOR_EACH_WORD_REV the words x down to 0 get written.
   */
  112. #define STORE_UNALIGNED(x) \
  113. CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
  114. .if (x != 12); \
  115. lw T0, (x*4)(STATE); \
  116. .endif; \
  117. lwl T1, (x*4)+MSB ## (IN); \
  118. lwr T1, (x*4)+LSB ## (IN); \
  119. .if (x == 12); \
  120. addu X ## x, NONCE_0; \
  121. .else; \
  122. addu X ## x, T0; \
  123. .endif; \
  124. CPU_TO_LE32(X ## x); \
  125. xor X ## x, T1; \
  126. swl X ## x, (x*4)+MSB ## (OUT); \
  127. swr X ## x, (x*4)+LSB ## (OUT);
  /* Finish output word x when both IN and OUT are word aligned:
   * add the input state word (NONCE_0 for word 12) to the working word, convert it to little
   * endian, xor it with the input word loaded with lw and write the result with sw.
   * T0 and T1 are clobbered.
   * The label emitted in front is numbered x + 1. Jump table entry x + 1 branches to it, so that
   * in the chain built by FOR_EACH_WORD_REV the words x down to 0 get written.
   */
  128. #define STORE_ALIGNED(x) \
  129. CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
  130. .if (x != 12); \
  131. lw T0, (x*4)(STATE); \
  132. .endif; \
  133. lw T1, (x*4) ## (IN); \
  134. .if (x == 12); \
  135. addu X ## x, NONCE_0; \
  136. .else; \
  137. addu X ## x, T0; \
  138. .endif; \
  139. CPU_TO_LE32(X ## x); \
  140. xor X ## x, T1; \
  141. sw X ## x, (x*4) ## (OUT);
  142. /* Jump table macros.
  143. * Entry x is taken when a final partial block contains x whole words, it handles the setup for the bytes that follow them.
  144. * X15 is free to hold SAVED_X because a partial block never has 16 whole words.
  145. * Every jump table entry must be equal in size.
  146. */
  /* Each entry is a branch into the store chain plus one delay-slot instruction, kept together
   * by noreorder, so an entry is two instructions long.
   * The delay slot computes SAVED_X, the keystream word that covers the trailing bytes:
   * working word x plus its input state word (SAVED_CA, or NONCE_0 for word 12).
   */
  147. #define JMPTBL_ALIGNED(x) \
  148. .Lchacha_mips_jmptbl_aligned_ ## x: ; \
  149. .set noreorder; \
  150. b .Lchacha_mips_xor_aligned_ ## x ## _b; \
  151. .if (x == 12); \
  152. addu SAVED_X, X ## x, NONCE_0; \
  153. .else; \
  154. addu SAVED_X, X ## x, SAVED_CA; \
  155. .endif; \
  156. .set reorder
  /* Same as JMPTBL_ALIGNED, but branching into the chain built from STORE_UNALIGNED. */
  157. #define JMPTBL_UNALIGNED(x) \
  158. .Lchacha_mips_jmptbl_unaligned_ ## x: ; \
  159. .set noreorder; \
  160. b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
  161. .if (x == 12); \
  162. addu SAVED_X, X ## x, NONCE_0; \
  163. .else; \
  164. addu SAVED_X, X ## x, SAVED_CA; \
  165. .endif; \
  166. .set reorder
  /* Four independent add-xor-rotate steps, interleaved instruction by instruction:
   *   X(A) += X(K),  X(V) ^= X(A),  X(V) = X(V) rotated left by S (written as rotr by 32 - S)
   * and likewise for (B, L, W), (C, M, Y) and (D, N, Z).
   * All arguments except S are word indices 0..15 that are turned into registers by X().
   */
  167. #define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
  168. addu X(A), X(K); \
  169. addu X(B), X(L); \
  170. addu X(C), X(M); \
  171. addu X(D), X(N); \
  172. xor X(V), X(A); \
  173. xor X(W), X(B); \
  174. xor X(Y), X(C); \
  175. xor X(Z), X(D); \
  176. rotr X(V), 32 - S; \
  177. rotr X(W), 32 - S; \
  178. rotr X(Y), 32 - S; \
  179. rotr X(Z), 32 - S;
  /*
   * chacha_crypt_arch: xor BYTES bytes from IN with ChaCha keystream and write them to OUT,
   * one 64-byte block per pass of the outer loop.
   *
   * Register arguments: STATE a0, OUT a1, IN a2, BYTES a3.
   * The round count is read from 16($sp) at entry. NOTE(review): presumably the fifth C
   * argument under the o32 calling convention, confirm against the C prototype.
   *
   * s0-s7 are saved on entry and restored on exit. State word 12 is carried in NONCE_0,
   * incremented once per processed block and stored back to 48(STATE) on exit.
   * When BYTES is 0 the function returns without writing any memory.
   * The assembler temporary ($at) is used explicitly, hence noat.
   */
  180. .text
  181. .set reorder
  182. .set noat
  183. .globl chacha_crypt_arch
  184. .ent chacha_crypt_arch
  185. chacha_crypt_arch:
  186. .frame $sp, STACK_SIZE, $ra
  187. /* Load number of rounds. Done before the frame is allocated, so the offset is relative to the caller's $sp. */
  188. lw $at, 16($sp)
  189. addiu $sp, -STACK_SIZE
  190. /* Nothing to do when BYTES is 0, only the frame is released again. */
  191. beqz BYTES, .Lchacha_mips_end
  192. lw NONCE_0, 48(STATE)
  193. /* Save s0-s7 */
  194. sw $s0, 0($sp)
  195. sw $s1, 4($sp)
  196. sw $s2, 8($sp)
  197. sw $s3, 12($sp)
  198. sw $s4, 16($sp)
  199. sw $s5, 20($sp)
  200. sw $s6, 24($sp)
  201. sw $s7, 28($sp)
  202. /* Test IN or OUT is unaligned.
  203. * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
  204. */
  205. or IS_UNALIGNED, IN, OUT
  206. andi IS_UNALIGNED, 0x3
  /* The first block skips the pointer and counter increments at the loop head. */
  207. b .Lchacha_rounds_start
  208. .align 4
  /* Outer loop, entered again for every further block: advance the data pointers and the counter word. */
  209. .Loop_chacha_rounds:
  210. addiu IN, CHACHA20_BLOCK_SIZE
  211. addiu OUT, CHACHA20_BLOCK_SIZE
  212. addiu NONCE_0, 1
  /* Load the working state. Word 12 comes from NONCE_0 instead of memory. */
  213. .Lchacha_rounds_start:
  214. lw X0, 0(STATE)
  215. lw X1, 4(STATE)
  216. lw X2, 8(STATE)
  217. lw X3, 12(STATE)
  218. lw X4, 16(STATE)
  219. lw X5, 20(STATE)
  220. lw X6, 24(STATE)
  221. lw X7, 28(STATE)
  222. lw X8, 32(STATE)
  223. lw X9, 36(STATE)
  224. lw X10, 40(STATE)
  225. lw X11, 44(STATE)
  226. move X12, NONCE_0
  227. lw X13, 52(STATE)
  228. lw X14, 56(STATE)
  229. lw X15, 60(STATE)
  /* Each pass performs two rounds: the first four AXR lines work on the columns
   * (0,4,8,12) .. (3,7,11,15), the last four on the diagonals (0,5,10,15) .. (3,4,9,14).
   * The round counter in $at goes down by 2 per pass and the loop ends when it hits 0.
   * NOTE(review): this expects an even, non-zero round count, confirm the caller guarantees it.
   */
  230. .Loop_chacha_xor_rounds:
  231. addiu $at, -2
  232. AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
  233. AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
  234. AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
  235. AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
  236. AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
  237. AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
  238. AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
  239. AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
  240. bnez $at, .Loop_chacha_xor_rounds
  /* Account for this block. A negative result means fewer than 64 bytes were left for it. */
  241. addiu BYTES, -(CHACHA20_BLOCK_SIZE)
  242. /* Is data src/dst unaligned? Jump */
  243. bnez IS_UNALIGNED, .Loop_chacha_unaligned
  244. /* Reload the number of rounds for the next block (the frame is allocated now, hence the STACK_SIZE offset). */
  245. lw $at, (STACK_SIZE+16)($sp)
  246. /* BYTES < 0, it has no full block. */
  247. bltz BYTES, .Lchacha_mips_no_full_block_aligned
  /* Full block: store chain for words 15 down to 0. The partial-block jump table enters it part-way. */
  248. FOR_EACH_WORD_REV(STORE_ALIGNED)
  249. /* BYTES > 0? Loop again. */
  250. bgtz BYTES, .Loop_chacha_rounds
  251. /* Last block done: count it in NONCE_0 before leaving. */
  252. addiu NONCE_0, 1
  253. /* BYTES < 0? Handle last bytes. Only possible when the chain was entered through the jump table. */
  254. bltz BYTES, .Lchacha_mips_xor_bytes
  255. .Lchacha_mips_xor_done:
  256. /* Restore used registers */
  257. lw $s0, 0($sp)
  258. lw $s1, 4($sp)
  259. lw $s2, 8($sp)
  260. lw $s3, 12($sp)
  261. lw $s4, 16($sp)
  262. lw $s5, 20($sp)
  263. lw $s6, 24($sp)
  264. lw $s7, 28($sp)
  265. /* Write NONCE_0 back to right location in state */
  266. sw NONCE_0, 48(STATE)
  267. .Lchacha_mips_end:
  268. addiu $sp, STACK_SIZE
  269. jr $ra
  /* Final partial block, aligned case. From here on $at holds the byte length of the whole words
   * instead of the round counter.
   */
  270. .Lchacha_mips_no_full_block_aligned:
  271. /* Restore the offset on BYTES */
  272. addiu BYTES, CHACHA20_BLOCK_SIZE
  273. /* Get number of full WORDS (as a byte count, a multiple of 4) */
  274. andi $at, BYTES, MASK_U32
  275. /* Load upper half of jump table addr */
  276. lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
  277. /* Calculate lower half jump table offset: $at shifted left by one, since an entry is two instructions */
  278. ins T0, $at, 1, 6
  279. /* Add offset to STATE */
  280. addu T1, STATE, $at
  281. /* Add lower half jump table addr */
  282. addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
  283. /* Read the state word that belongs to the trailing bytes. This overwrites IS_UNALIGNED, which lives in the same register. */
  284. lw SAVED_CA, 0(T1)
  285. /* Store remaining bytecounter as negative value (0 to -3) */
  286. subu BYTES, $at, BYTES
  287. jr T0
  288. /* Jump table */
  289. FOR_EACH_WORD(JMPTBL_ALIGNED)
  290. .Loop_chacha_unaligned:
  291. /* Reload the number of rounds for the next block. */
  292. lw $at, (STACK_SIZE+16)($sp)
  293. /* BYTES < 0, it has no full block. */
  294. bltz BYTES, .Lchacha_mips_no_full_block_unaligned
  /* Full block: store chain for words 15 down to 0 using unaligned accesses. */
  295. FOR_EACH_WORD_REV(STORE_UNALIGNED)
  296. /* BYTES > 0? Loop again. */
  297. bgtz BYTES, .Loop_chacha_rounds
  298. /* Write NONCE_0 back to right location in state (it is stored again, after the increment below, at the done label) */
  299. sw NONCE_0, 48(STATE)
  /* noreorder: the addiu below sits in the delay slot of the bgez, so it runs whether or not
   * the branch is taken. The two labels make the same instruction the landing point of
   * jump table entry 0, for a block with no whole word at all.
   */
  300. .set noreorder
  301. /* Leave when no trailing bytes remain, otherwise fall through to byte handling */
  302. bgez BYTES, .Lchacha_mips_xor_done
  303. .Lchacha_mips_xor_unaligned_0_b:
  304. .Lchacha_mips_xor_aligned_0_b:
  305. /* Place this here to fill delay slot */
  306. addiu NONCE_0, 1
  307. .set reorder
  /* Trailing 1 to 3 bytes. IN and OUT are moved past the whole words ($at bytes), then the
   * bytes are xored with SAVED_X starting from its low byte and rotating right by 8 for each
   * further byte. BYTES holds minus the number of trailing bytes, so BYTES + k is 0 after byte k.
   */
  308. .Lchacha_mips_xor_bytes:
  309. addu IN, $at
  310. addu OUT, $at
  311. /* First byte */
  312. lbu T1, 0(IN)
  313. addiu $at, BYTES, 1
  314. xor T1, SAVED_X
  315. sb T1, 0(OUT)
  316. beqz $at, .Lchacha_mips_xor_done
  317. /* Second byte */
  318. lbu T1, 1(IN)
  319. addiu $at, BYTES, 2
  320. rotr SAVED_X, 8
  321. xor T1, SAVED_X
  322. sb T1, 1(OUT)
  323. beqz $at, .Lchacha_mips_xor_done
  324. /* Third byte */
  325. lbu T1, 2(IN)
  326. rotr SAVED_X, 8
  327. xor T1, SAVED_X
  328. sb T1, 2(OUT)
  329. b .Lchacha_mips_xor_done
  /* Final partial block, unaligned case. Same steps as the aligned variant, using the unaligned jump table. */
  330. .Lchacha_mips_no_full_block_unaligned:
  331. /* Restore the offset on BYTES */
  332. addiu BYTES, CHACHA20_BLOCK_SIZE
  333. /* Get number of full WORDS (as a byte count, a multiple of 4) */
  334. andi $at, BYTES, MASK_U32
  335. /* Load upper half of jump table addr */
  336. lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
  337. /* Calculate lower half jump table offset: $at shifted left by one, since an entry is two instructions */
  338. ins T0, $at, 1, 6
  339. /* Add offset to STATE */
  340. addu T1, STATE, $at
  341. /* Add lower half jump table addr */
  342. addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
  343. /* Read the state word that belongs to the trailing bytes. This overwrites IS_UNALIGNED, which lives in the same register. */
  344. lw SAVED_CA, 0(T1)
  345. /* Store remaining bytecounter as negative value (0 to -3) */
  346. subu BYTES, $at, BYTES
  347. jr T0
  348. /* Jump table */
  349. FOR_EACH_WORD(JMPTBL_UNALIGNED)
  350. .end chacha_crypt_arch
  351. .set at
  352. /* hchacha_block_arch: run NROUND rounds over the 16 words at STATE and write words 0-3 and 12-15 to OUT.
  353. * STATE $a0 (pointer to the 16 input words, the memory is only read)
  354. * OUT $a1 (receives 8 words, 32 bytes)
  355. * NROUND $a2 (round count, decremented by 2 per loop pass)
  356. */
  /* The input words are not added back after the rounds and the output words are stored as they
   * are in the registers, without CPU_TO_LE32.
   * NOTE(review): the loop expects an even, non-zero NROUND, confirm the caller guarantees it.
   *
   * Words 12-15 are moved to registers this function does not have to save, so only X11 (s6)
   * goes to the stack. X15 reuses the STATE register itself.
   */
  357. #undef X12
  358. #undef X13
  359. #undef X14
  360. #undef X15
  361. #define X12 $a3
  362. #define X13 $at
  363. #define X14 $v0
  364. #define X15 STATE
  365. .set noat
  366. .globl hchacha_block_arch
  367. .ent hchacha_block_arch
  368. hchacha_block_arch:
  369. .frame $sp, STACK_SIZE, $ra
  370. addiu $sp, -STACK_SIZE
  371. /* Save X11(s6). Only the first slot of the frame is used. */
  372. sw X11, 0($sp)
  373. lw X0, 0(STATE)
  374. lw X1, 4(STATE)
  375. lw X2, 8(STATE)
  376. lw X3, 12(STATE)
  377. lw X4, 16(STATE)
  378. lw X5, 20(STATE)
  379. lw X6, 24(STATE)
  380. lw X7, 28(STATE)
  381. lw X8, 32(STATE)
  382. lw X9, 36(STATE)
  383. lw X10, 40(STATE)
  384. lw X11, 44(STATE)
  385. lw X12, 48(STATE)
  386. lw X13, 52(STATE)
  387. lw X14, 56(STATE)
  /* Must stay the last load: X15 is the STATE register, so this overwrites the pointer. */
  388. lw X15, 60(STATE)
  /* Two rounds per pass: four AXR lines on the columns, then four on the diagonals. */
  389. .Loop_hchacha_xor_rounds:
  390. addiu $a2, -2
  391. AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
  392. AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
  393. AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
  394. AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
  395. AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
  396. AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
  397. AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
  398. AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
  399. bnez $a2, .Loop_hchacha_xor_rounds
  400. /* Restore used register */
  401. lw X11, 0($sp)
  /* Output: words 0-3 followed by words 12-15. */
  402. sw X0, 0(OUT)
  403. sw X1, 4(OUT)
  404. sw X2, 8(OUT)
  405. sw X3, 12(OUT)
  406. sw X12, 16(OUT)
  407. sw X13, 20(OUT)
  408. sw X14, 24(OUT)
  409. sw X15, 28(OUT)
  410. addiu $sp, STACK_SIZE
  411. jr $ra
  412. .end hchacha_block_arch
  413. .set at