blake2b-neon-core.S 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /*
  3. * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM
  4. * processors that have NEON support but not the ARMv8 Crypto Extensions,
  5. * typically this BLAKE2b implementation is much faster than the SHA-2 family
  6. * and slightly faster than SHA-1.
  7. *
  8. * Copyright 2020 Google LLC
  9. *
  10. * Author: Eric Biggers <ebiggers@google.com>
  11. */
  12. #include <linux/linkage.h>
  13. .text
  14. .fpu neon
  15. // The arguments to blake2b_compress_neon()
  16. CTX .req r0 // struct blake2b_ctx *ctx (h, t and f are accessed through it)
  17. DATA .req r1 // const u8 *data; post-incremented as each block is loaded
  18. NBLOCKS .req r2 // size_t nblocks; decremented once per processed block
  19. INC .req r3 // u32 inc; added to the counter t[] once per block
  20. // Pointers to the rotation tables
  21. ROR24_TABLE .req r4 // set to the address of .Lror24_table
  22. ROR16_TABLE .req r5 // set to the address of .Lror16_table
  23. // The original stack pointer
  24. ORIG_SP .req r6 // sp as it was before the aligned spill buffer was allocated
  25. // NEON registers which contain the message words of the current block.
  26. // M_0-M_3 are occasionally used for other purposes too.
  // M_n holds 64-bit message word n. d16-d31 are the halves of q8-q15, so
  // q8 = {M_0, M_1}, q9 = {M_2, M_3}, ..., q15 = {M_14, M_15}. The round macro
  // selects words by pasting an index onto the "M_" prefix (M_\s0 etc.), so
  // these names must keep the form M_<decimal index>.
  27. M_0 .req d16
  28. M_1 .req d17
  29. M_2 .req d18
  30. M_3 .req d19
  31. M_4 .req d20
  32. M_5 .req d21
  33. M_6 .req d22
  34. M_7 .req d23
  35. M_8 .req d24
  36. M_9 .req d25
  37. M_10 .req d26
  38. M_11 .req d27
  39. M_12 .req d28
  40. M_13 .req d29
  41. M_14 .req d30
  42. M_15 .req d31
  43. .align 4
  // The two rotation tables below are each 8 bytes long and are loaded with
  // a :64 alignment hint, so they must remain 8-byte aligned; keep them
  // directly after the .align and keep each one exactly 8 bytes.
  44. // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
  45. // instruction. This is the most efficient way to implement these
  46. // rotation amounts with NEON. (On Cortex-A53 it's the same speed as
  47. // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
  //
  // vtbl.8 sets result byte i to source byte table[i]. Rotating right by
  // 24 bits moves source byte (i + 3) mod 8 into byte i, and rotating right
  // by 16 bits moves source byte (i + 2) mod 8 into byte i.
  48. .Lror24_table:
  49. .byte 3, 4, 5, 6, 7, 0, 1, 2
  50. .Lror16_table:
  51. .byte 2, 3, 4, 5, 6, 7, 0, 1
  52. // The BLAKE2b initialization vector
  // IV[0..7], two words per line. blake2b_compress_neon loads IV[0..3] into
  // q4-q5 and IV[4..7] into q6-q7 at the start of every block.
  53. .Lblake2b_IV:
  54. .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
  55. .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
  56. .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
  57. .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
  58. // Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
  59. // NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
  60. // pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
  61. // (M_0-M_3), so that they can be reloaded if they are used as temporary
  62. // registers. The macro arguments s0-s15 give the order in which the message
  63. // words are used in this round. 'final' is 1 if this is the final round.
  //
  // State layout, as used by the instructions below (a, b, c, d are the four
  // operands of the mixing function G):
  //   q0-q1 (d0-d3)   = a = v[0..3]
  //   q2-q3 (d4-d7)   = b = v[4..7]
  //   q4-q5 (d8-d11)  = c = v[8..11]
  //   q6-q7 (d12-d15) = d = v[12..15]
  //
  // Message words s0-s7 feed the column step (even positions in the first
  // addition, odd positions in the second); s8-s15 feed the diagonal step in
  // the same way.
  //
  // Clobbers: M_0 (d16) is overwritten with a rotation table before every
  // vtbl.8 group, and q8-q9 (M_0-M_3) are used as temporaries for the rotate
  // by 63. q8-q9 are restored from the stack copy after each rotate by 63,
  // except at the very end of the macro when 'final' is nonzero.
  // Requires: ROR24_TABLE / ROR16_TABLE pointing to the vtbl.8 index tables.
  64. .macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \
  65. s8, s9, s10, s11, s12, s13, s14, s15, final=0
  66. // Mix the columns:
  67. // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
  68. // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
  // All four columns are processed at once: each row lives in one pair of
  // q registers, so a q-register operation handles two columns.
  69. // a += b + m[blake2b_sigma[r][2*i + 0]];
  70. vadd.u64 q0, q0, q2
  71. vadd.u64 q1, q1, q3
  72. vadd.u64 d0, d0, M_\s0
  73. vadd.u64 d1, d1, M_\s2
  74. vadd.u64 d2, d2, M_\s4
  75. vadd.u64 d3, d3, M_\s6
  76. // d = ror64(d ^ a, 32);
  // Swapping the two 32-bit halves of each 64-bit element (vrev64.32) is a
  // rotation by 32.
  77. veor q6, q6, q0
  78. veor q7, q7, q1
  79. vrev64.32 q6, q6
  80. vrev64.32 q7, q7
  81. // c += d;
  82. vadd.u64 q4, q4, q6
  83. vadd.u64 q5, q5, q7
  84. // b = ror64(b ^ c, 24);
  // M_0 temporarily holds the ror24 byte-permutation table.
  85. vld1.8 {M_0}, [ROR24_TABLE, :64]
  86. veor q2, q2, q4
  87. veor q3, q3, q5
  88. vtbl.8 d4, {d4}, M_0
  89. vtbl.8 d5, {d5}, M_0
  90. vtbl.8 d6, {d6}, M_0
  91. vtbl.8 d7, {d7}, M_0
  92. // a += b + m[blake2b_sigma[r][2*i + 1]];
  93. //
  94. // M_0 got clobbered above, so we have to reload it if any of the four
  95. // message words this step needs happens to be M_0. Otherwise we don't
  96. // need to reload it here, as it will just get clobbered again below.
  // The copy of M_0 is the first 8 bytes of the stack buffer.
  97. .if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
  98. vld1.8 {M_0}, [sp, :64]
  99. .endif
  100. vadd.u64 q0, q0, q2
  101. vadd.u64 q1, q1, q3
  102. vadd.u64 d0, d0, M_\s1
  103. vadd.u64 d1, d1, M_\s3
  104. vadd.u64 d2, d2, M_\s5
  105. vadd.u64 d3, d3, M_\s7
  106. // d = ror64(d ^ a, 16);
  // M_0 temporarily holds the ror16 byte-permutation table.
  107. vld1.8 {M_0}, [ROR16_TABLE, :64]
  108. veor q6, q6, q0
  109. veor q7, q7, q1
  110. vtbl.8 d12, {d12}, M_0
  111. vtbl.8 d13, {d13}, M_0
  112. vtbl.8 d14, {d14}, M_0
  113. vtbl.8 d15, {d15}, M_0
  114. // c += d;
  115. vadd.u64 q4, q4, q6
  116. vadd.u64 q5, q5, q7
  117. // b = ror64(b ^ c, 63);
  118. //
  119. // This rotation amount isn't a multiple of 8, so it has to be
  120. // implemented using a pair of shifts, which requires temporary
  121. // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
  // With x = b ^ c in q8-q9: vshr writes x >> 63 to b, then vsli inserts
  // x << 1 while keeping the low bit already in b, giving
  // (x << 1) | (x >> 63) = ror64(x, 63).
  122. veor q8, q2, q4
  123. veor q9, q3, q5
  124. vshr.u64 q2, q8, #63
  125. vshr.u64 q3, q9, #63
  126. vsli.u64 q2, q8, #1
  127. vsli.u64 q3, q9, #1
  128. vld1.8 {q8-q9}, [sp, :256]
  129. // Mix the diagonals:
  130. // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
  131. // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
  132. //
  133. // There are two possible ways to do this: use 'vext' instructions to
  134. // shift the rows of the matrix so that the diagonals become columns,
  135. // and undo it afterwards; or just use 64-bit operations on 'd'
  136. // registers instead of 128-bit operations on 'q' registers. We use the
  137. // latter approach, as it performs much better on Cortex-A7.
  //
  // Resulting (a, b, c, d) register groups for the four diagonals:
  //   (d0, d5, d10, d15), (d1, d6, d11, d12),
  //   (d2, d7, d8, d13),  (d3, d4, d9, d14).
  138. // a += b + m[blake2b_sigma[r][2*i + 0]];
  139. vadd.u64 d0, d0, d5
  140. vadd.u64 d1, d1, d6
  141. vadd.u64 d2, d2, d7
  142. vadd.u64 d3, d3, d4
  143. vadd.u64 d0, d0, M_\s8
  144. vadd.u64 d1, d1, M_\s10
  145. vadd.u64 d2, d2, M_\s12
  146. vadd.u64 d3, d3, M_\s14
  147. // d = ror64(d ^ a, 32);
  148. veor d15, d15, d0
  149. veor d12, d12, d1
  150. veor d13, d13, d2
  151. veor d14, d14, d3
  152. vrev64.32 d15, d15
  153. vrev64.32 d12, d12
  154. vrev64.32 d13, d13
  155. vrev64.32 d14, d14
  156. // c += d;
  157. vadd.u64 d10, d10, d15
  158. vadd.u64 d11, d11, d12
  159. vadd.u64 d8, d8, d13
  160. vadd.u64 d9, d9, d14
  161. // b = ror64(b ^ c, 24);
  162. vld1.8 {M_0}, [ROR24_TABLE, :64]
  163. veor d5, d5, d10
  164. veor d6, d6, d11
  165. veor d7, d7, d8
  166. veor d4, d4, d9
  167. vtbl.8 d5, {d5}, M_0
  168. vtbl.8 d6, {d6}, M_0
  169. vtbl.8 d7, {d7}, M_0
  170. vtbl.8 d4, {d4}, M_0
  171. // a += b + m[blake2b_sigma[r][2*i + 1]];
  // As in the column step, M_0 is reloaded only if this addition reads it.
  172. .if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
  173. vld1.8 {M_0}, [sp, :64]
  174. .endif
  175. vadd.u64 d0, d0, d5
  176. vadd.u64 d1, d1, d6
  177. vadd.u64 d2, d2, d7
  178. vadd.u64 d3, d3, d4
  179. vadd.u64 d0, d0, M_\s9
  180. vadd.u64 d1, d1, M_\s11
  181. vadd.u64 d2, d2, M_\s13
  182. vadd.u64 d3, d3, M_\s15
  183. // d = ror64(d ^ a, 16);
  184. vld1.8 {M_0}, [ROR16_TABLE, :64]
  185. veor d15, d15, d0
  186. veor d12, d12, d1
  187. veor d13, d13, d2
  188. veor d14, d14, d3
  189. vtbl.8 d12, {d12}, M_0
  190. vtbl.8 d13, {d13}, M_0
  191. vtbl.8 d14, {d14}, M_0
  192. vtbl.8 d15, {d15}, M_0
  193. // c += d;
  194. vadd.u64 d10, d10, d15
  195. vadd.u64 d11, d11, d12
  196. vadd.u64 d8, d8, d13
  197. vadd.u64 d9, d9, d14
  198. // b = ror64(b ^ c, 63);
  // The XOR results are written to d16-d19 in the order (d4, d5, d6, d7), so
  // that q8-q9 line up with q2-q3 and the same q-register shift pair as in
  // the column step can be used.
  199. veor d16, d4, d9
  200. veor d17, d5, d10
  201. veor d18, d6, d11
  202. veor d19, d7, d8
  203. vshr.u64 q2, q8, #63
  204. vshr.u64 q3, q9, #63
  205. vsli.u64 q2, q8, #1
  206. vsli.u64 q3, q9, #1
  207. // Reloading q8-q9 can be skipped on the final round.
  208. .if ! \final
  209. vld1.8 {q8-q9}, [sp, :256]
  210. .endif
  211. .endm
  212. //
  213. // void blake2b_compress_neon(struct blake2b_ctx *ctx,
  214. // const u8 *data, size_t nblocks, u32 inc);
  215. //
  216. // Only the first three fields of struct blake2b_ctx are used:
  217. // u64 h[8]; (inout)
  218. // u64 t[2]; (inout)
  219. // u64 f[2]; (in)
  220. //
  // For each of the nblocks consecutive 128-byte blocks at 'data': add 'inc'
  // to the 128-bit counter t, run twelve BLAKE2b rounds over the block, and
  // fold the result into h.
  //
  // Register use: r4-r10 are saved on entry and restored on exit. ip walks
  // the fields of *ctx, r7-r10 hold the counter words while it is being
  // incremented, and r10 is also used as the IV pointer. All of q0-q15 are
  // overwritten and not restored.
  //
  // NOTE(review): nblocks is only tested after a block has been processed,
  // so nblocks == 0 is not handled here - presumably callers guarantee
  // nblocks >= 1; confirm against the C glue code.
  221. .align 5
  222. ENTRY(blake2b_compress_neon)
  223. push {r4-r10}
  224. // Allocate a 32-byte stack buffer that is 32-byte aligned.
  // sp is lowered by at least 32 bytes and then rounded down, so the buffer
  // lies entirely below the registers pushed above.
  225. mov ORIG_SP, sp
  226. sub ip, sp, #32
  227. bic ip, ip, #31
  228. mov sp, ip
  229. adr ROR24_TABLE, .Lror24_table
  230. adr ROR16_TABLE, .Lror16_table
  231. mov ip, CTX
  232. vld1.64 {q0-q1}, [ip]! // Load h[0..3]
  233. vld1.64 {q2-q3}, [ip]! // Load h[4..7]
  // Loop invariant at .Lnext_block: q0-q3 hold the current h[0..7], which is
  // also v[0..7] for the next block, and ip points just past h[], i.e. at
  // t[0]. This holds on first entry (two post-incremented loads above) and
  // on loop-back (two post-incremented stores at the bottom of the loop).
  234. .Lnext_block:
  235. adr r10, .Lblake2b_IV
  236. vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1]
  237. vld1.64 {q4-q5}, [r10]! // Load IV[0..3]
  238. vmov r7, r8, d28 // Copy t[0] to (r7, r8)
  239. vld1.64 {q6-q7}, [r10] // Load IV[4..7]
  240. adds r7, r7, INC // Increment counter
  // Carry out of the low 32 bits is rare; handle it out of line. The flags
  // set by 'adds' are still live when .Lslow_inc_ctr consumes them.
  241. bcs .Lslow_inc_ctr
  // Fast path: only the low 32 bits of t[0] changed.
  242. vmov.i32 d28[0], r7
  243. vst1.64 {d28}, [ip] // Update t[0]
  244. .Linc_ctr_done:
  245. // Load the next message block and finish initializing the state matrix
  246. // 'v'. Fortunately, there are exactly enough NEON registers to fit the
  247. // entire state matrix in q0-q7 and the entire message block in q8-15.
  248. //
  249. // However, _blake2b_round also needs some extra registers for rotates,
  250. // so we have to spill some registers. It's better to spill the message
  251. // registers than the state registers, as the message doesn't change.
  252. // Therefore we store a copy of the first 32 bytes of the message block
  253. // (q8-q9) in an aligned buffer on the stack so that they can be
  254. // reloaded when needed. (We could just reload directly from the
  255. // message buffer, but it's faster to use aligned loads.)
  //
  // q14-q15 still hold the updated t[] and f[] here; both are XORed into the
  // state before the last message load overwrites q14-q15.
  256. vld1.8 {q8-q9}, [DATA]!
  257. veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1]
  258. vld1.8 {q10-q11}, [DATA]!
  259. veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1]
  260. vld1.8 {q12-q13}, [DATA]!
  261. vst1.8 {q8-q9}, [sp, :256]
  262. mov ip, CTX // ip = &h[0], for the fold step after the rounds
  263. vld1.8 {q14-q15}, [DATA]!
  264. // Execute the rounds. Each round is provided the order in which it
  265. // needs to use the message words.
  // Twelve rounds in total; the last two repeat the orders of the first two.
  266. _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  267. _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
  268. _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
  269. _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
  270. _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
  271. _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
  272. _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
  273. _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
  274. _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
  275. _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
  276. _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  277. _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
  278. final=1
  279. // Fold the final state matrix into the hash chaining value:
  280. //
  281. // for (i = 0; i < 8; i++)
  282. // h[i] ^= v[i] ^ v[i + 8];
  283. //
  // The message registers q8-q11 are no longer needed after the final round
  // and are reused to hold the old h[]. 'subs' sets the flags for the 'bne'
  // below; none of the instructions in between modify them.
  284. vld1.64 {q8-q9}, [ip]! // Load old h[0..3]
  285. veor q0, q0, q4 // v[0..1] ^= v[8..9]
  286. veor q1, q1, q5 // v[2..3] ^= v[10..11]
  287. vld1.64 {q10-q11}, [ip] // Load old h[4..7]
  288. veor q2, q2, q6 // v[4..5] ^= v[12..13]
  289. veor q3, q3, q7 // v[6..7] ^= v[14..15]
  290. veor q0, q0, q8 // v[0..1] ^= h[0..1]
  291. veor q1, q1, q9 // v[2..3] ^= h[2..3]
  292. mov ip, CTX
  293. subs NBLOCKS, NBLOCKS, #1 // nblocks--
  294. vst1.64 {q0-q1}, [ip]! // Store new h[0..3]
  295. veor q2, q2, q10 // v[4..5] ^= h[4..5]
  296. veor q3, q3, q11 // v[6..7] ^= h[6..7]
  297. vst1.64 {q2-q3}, [ip]! // Store new h[4..7]
  298. // Advance to the next block, if there is one.
  // ip now points at t[0] again and q0-q3 hold the new h[], as
  // .Lnext_block expects.
  299. bne .Lnext_block // nblocks != 0?
  // Done: release the aligned buffer, restore r4-r10 and return.
  300. mov sp, ORIG_SP
  301. pop {r4-r10}
  302. mov pc, lr
  303. .Lslow_inc_ctr:
  304. // Handle the case where the counter overflowed its low 32 bits, by
  305. // carrying the overflow bit into the full 128-bit counter.
  // Entered with the carry flag set by 'adds r7, r7, INC'. (r7, r8) already
  // hold t[0]; t[1] is fetched into (r9, r10). r10 held the IV pointer, which
  // is no longer needed because both IV loads precede the branch here.
  306. vmov r9, r10, d29
  307. adcs r8, r8, #0
  308. adcs r9, r9, #0
  309. adc r10, r10, #0
  310. vmov d28, r7, r8
  311. vmov d29, r9, r10
  312. vst1.64 {q14}, [ip] // Update t[0] and t[1]
  313. b .Linc_ctr_done
  314. ENDPROC(blake2b_compress_neon)