blake2s-core.S 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /*
  3. * BLAKE2s digest algorithm, ARM scalar implementation. This is faster
  4. * than the generic implementations of BLAKE2s and BLAKE2b, but slower
  5. * than the NEON implementation of BLAKE2b. There is no NEON
  6. * implementation of BLAKE2s, since NEON doesn't really help with it.
  7. *
  8. * Copyright 2020 Google LLC
  9. *
  10. * Author: Eric Biggers <ebiggers@google.com>
  11. */
  12. #include <linux/linkage.h>
  13. #include <asm/assembler.h>
  // Scratch registers that receive message words.  There are not enough
  // ARM registers to keep the whole message block resident, so each word
  // is loaded from memory on demand, right before it is consumed.
  M_0     .req    ip                      // ip is r12
  M_1     .req    lr                      // lr is r14
  // BLAKE2s initialization vector, IV[0..7], stored as eight 32-bit words.
  .Lblake2s_IV:
          .long   0x6a09e667              // IV[0]
          .long   0xbb67ae85              // IV[1]
          .long   0x3c6ef372              // IV[2]
          .long   0xa54ff53a              // IV[3]
          .long   0x510e527f              // IV[4]
          .long   0x9b05688c              // IV[5]
          .long   0x1f83d9ab              // IV[6]
          .long   0x5be0cd19              // IV[7]
  // __ldrd a, b, src, offset
  //
  // Load two adjacent 32-bit words:
  //      \a = [\src + \offset]
  //      \b = [\src + \offset + 4]
  // When __LINUX_ARM_ARCH__ is below 6 this is done with two separate ldr
  // instructions; otherwise a single ldrd is emitted.
  .macro __ldrd a, b, src, offset
#if __LINUX_ARM_ARCH__ < 6
          ldr     \a, [\src, #\offset]
          ldr     \b, [\src, #\offset + 4]
#else
          ldrd    \a, \b, [\src, #\offset]
#endif
  .endm
  // __strd a, b, dst, offset
  //
  // Store two adjacent 32-bit words:
  //      [\dst + \offset]     = \a
  //      [\dst + \offset + 4] = \b
  // When __LINUX_ARM_ARCH__ is below 6 this is done with two separate str
  // instructions; otherwise a single strd is emitted.
  .macro __strd a, b, dst, offset
#if __LINUX_ARM_ARCH__ < 6
          str     \a, [\dst, #\offset]
          str     \b, [\dst, #\offset + 4]
#else
          strd    \a, \b, [\dst, #\offset]
#endif
  .endm
  // _le32_bswap a, tmp
  //
  // Byte-order fixup for a 32-bit little-endian word held in \a.  Code is
  // emitted only when __ARMEB__ is defined (big-endian build); in every
  // other configuration the macro expands to nothing.  The swap itself is
  // delegated to rev_l from <asm/assembler.h>, which is handed \tmp as its
  // second operand (presumably a scratch register -- confirm against the
  // definition of rev_l).
  .macro _le32_bswap a, tmp
#if defined(__ARMEB__)
          rev_l   \a, \tmp
#endif
  .endm
  // _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
  //
  // Apply _le32_bswap to the eight registers \a..\h, in that order, passing
  // the same \tmp to every invocation.
  .macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
          .irp    reg, \a, \b, \c, \d, \e, \f, \g, \h
          _le32_bswap \reg, \tmp
          .endr
  .endm
  // Helper for _blake2s_quarterround: one half of the BLAKE2s mixing step,
  // applied to two columns/diagonals at once with their instructions
  // interleaved.
  //
  //   a0..d0, a1..d1  registers holding the two columns/diagonals
  //   m0, m1          word indices (into the message block at sp + 32) of
  //                   the message words added to a0 and a1
  //   b_in            right-rotation still owed to the b values on entry
  //   d_in            right-rotation still owed to the d values on entry
  //   d_out           right-rotation applied to the new d values when they
  //                   are added into c
  //
  // The new b and d values are written back without being rotated; whoever
  // uses them next has to apply the rotation.  Clobbers M_0 and M_1.
  .macro _blake2s_half_g a0, b0, c0, d0, a1, b1, c1, d1, m0, m1, b_in, d_in, d_out
          ldr     M_0, [sp, #32 + 4 * \m0]
          ldr     M_1, [sp, #32 + 4 * \m1]

          // a += ror32(b, b_in) + m
          add     \a0, \a0, \b0, ror #\b_in
          add     \a1, \a1, \b1, ror #\b_in
          add     \a0, \a0, M_0
          add     \a1, \a1, M_1

          // d = a ^ ror32(d, d_in)
          eor     \d0, \a0, \d0, ror #\d_in
          eor     \d1, \a1, \d1, ror #\d_in

          // c += ror32(d, d_out)
          add     \c0, \c0, \d0, ror #\d_out
          add     \c1, \c1, \d1, ror #\d_out

          // b = c ^ ror32(b, b_in)
          eor     \b0, \c0, \b0, ror #\b_in
          eor     \b1, \c1, \b1, ror #\b_in
  .endm

  // Run a quarter-round of BLAKE2s, i.e. mix two columns or two diagonals of
  // the state.  (a0, b0, c0, d0) and (a1, b1, c1, d1) name the registers
  // holding them.  s0 and s1 are the word indices of the two message words
  // consumed by the first column/diagonal; s2 and s3 are those for the
  // second.  M_0 and M_1 are used as scratch, and the message block is read
  // from sp + 32.
  //
  // In terms of the pseudocode, the two halves compute
  //
  //      a += b + m[blake2s_sigma[r][2*i + 0]];
  //      d = ror32(d ^ a, 16);
  //      c += d;
  //      b = ror32(b ^ c, 12);
  //
  //      a += b + m[blake2s_sigma[r][2*i + 1]];
  //      d = ror32(d ^ a, 8);
  //      c += d;
  //      b = ror32(b ^ c, 7);
  //
  // except that, to save instructions, no rotation is carried out at the
  // point where the pseudocode has it.  Each one is postponed and folded into
  // the instruction that next reads the value.  On entry the b and d values
  // are owed rotations of 'brot' and 'drot' bits; on exit they are owed 7 and
  // 8 bits respectively.  See the comment above _blake2s_round().
  .macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
          _blake2s_half_g \a0, \b0, \c0, \d0, \a1, \b1, \c1, \d1, \
                          \s0, \s2, brot, drot, 16
          _blake2s_half_g \a0, \b0, \c0, \d0, \a1, \b1, \c1, \d1, \
                          \s1, \s3, 12, 16, 8
  .endm
  // Run one full round of BLAKE2s over the state matrix v[0..15].
  //
  // Register/stack layout expected on entry and preserved on exit:
  //      r0..r9          v[0..9]
  //      sp + 0          8 bytes of scratch used to spill v[8..9]
  //      sp + 8          v[10..15]
  //      sp + 32         the message block
  // r10-r12 and r14 are available as temporaries.  Arguments s0-s15 list the
  // message word indices in the order this round consumes them.
  //
  // No explicit rotate instructions are issued.  Every rotation is instead
  // expressed through the rotated-operand form of 'add' and 'eor', which is
  // faster.  The price is that the values of rows 'b' and 'd' of the state
  // matrix (the second and the last row) may temporarily sit in their
  // registers with the wrong rotation; it is corrected at the moment a value
  // is read.  'brot' and 'drot' record how many bits the row 'b' and row 'd'
  // values still have to be rotated right to become correct.  They are
  // (0, 0) before the first round and (7, 8) after every round.

  // Byte offsets from sp of the state words that live on the stack.
#define B2S_STK_V8      0
#define B2S_STK_V10     8
#define B2S_STK_V12     16
#define B2S_STK_V13     20
#define B2S_STK_V14     24
#define B2S_STK_V15     28

  .macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
                        s8, s9, s10, s11, s12, s13, s14, s15

          // Columns 0 and 1:
          // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
          __ldrd  r10, r11, sp, B2S_STK_V12       // fetch v[12], v[13]
          _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
                                \s0, \s1, \s2, \s3
          __strd  r8, r9, sp, B2S_STK_V8          // spill v[8], v[9]
          __strd  r10, r11, sp, B2S_STK_V12       // write back v[12], v[13]

          // Columns 2 and 3:
          // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
          __ldrd  r8, r9, sp, B2S_STK_V10         // fetch v[10], v[11]
          __ldrd  r10, r11, sp, B2S_STK_V14       // fetch v[14], v[15]
          _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
                                \s4, \s5, \s6, \s7
          str     r10, [sp, #B2S_STK_V14]         // write back v[14]
          // v[10], v[11] and v[15] stay in registers: the next step needs them.

          // From here on rows 'b' and 'd' carry the rotations left pending by
          // the column step.
          .set brot, 7
          .set drot, 8

          // Diagonals 0 and 1:
          // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
          ldr     r10, [sp, #B2S_STK_V12]         // fetch v[12]
          _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
                                \s8, \s9, \s10, \s11
          __strd  r8, r9, sp, B2S_STK_V10         // write back v[10], v[11]
          str     r11, [sp, #B2S_STK_V15]         // write back v[15]
          str     r10, [sp, #B2S_STK_V12]         // write back v[12]

          // Diagonals 2 and 3:
          // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
          __ldrd  r8, r9, sp, B2S_STK_V8          // reload v[8], v[9]
          __ldrd  r10, r11, sp, B2S_STK_V13       // fetch v[13], v[14]
          _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
                                \s12, \s13, \s14, \s15
          __strd  r10, r11, sp, B2S_STK_V13       // write back v[13], v[14]
  .endm
  //
  // void blake2s_compress(struct blake2s_ctx *ctx,
  //                       const u8 *data, size_t nblocks, u32 inc);
  //
  // Compresses 'nblocks' consecutive 64-byte message blocks starting at
  // 'data' into the chaining value held in 'ctx'.  The 64-bit counter t is
  // advanced by 'inc' before the first block and by 64 before every later
  // block.  'nblocks' must be at least 1: the count is decremented and tested
  // only after a block has been processed.  'data' may have any alignment.
  //
  // Only the leading three members of struct blake2s_ctx are accessed:
  //      u32 h[8];       (inout)  byte offset 0
  //      u32 t[2];       (inout)  byte offset 32
  //      u32 f[2];       (in)     byte offset 40
  //
  // Arguments arrive in r0 = ctx, r1 = data, r2 = nblocks, r3 = inc.
  //
#define B2S_BLOCK_BYTES 64
#define B2S_CTX_T       32
#define B2S_CTX_F       40

  .align 5
  ENTRY(blake2s_compress)
          // Saves 12 registers.  The count must remain even (presumably so
          // that sp stays 8-byte aligned).  The saved r0-r2 double as the
          // in-memory copies of (ctx, data, nblocks) used by the block loop.
          push    {r0-r2,r4-r11,lr}

  .Lnext_block:
          // Here: r0 = ctx, r1 = data, r3 = inc.

          // t[0..1] += inc, as a 64-bit addition with carry.
          __ldrd  r10, r11, r0, B2S_CTX_T
          adds    r10, r10, r3
          adc     r11, r11, #0
          __strd  r10, r11, r0, B2S_CTX_T

          // _blake2s_round has no register to spare for a message pointer,
          // so the block is copied onto the stack and addressed relative to
          // sp.  A side benefit is that a misaligned 'data' only has to be
          // handled here.
          sub     sp, sp, #B2S_BLOCK_BYTES
          mov     r12, sp                         // r12 = copy destination
          tst     r1, #3
          bne     .Lcopy_block_misaligned
          ldm     r1!, {r2-r9}                    // message words 0..7
          _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
          stm     r12!, {r2-r9}
          ldm     r1!, {r2-r9}                    // message words 8..15
          _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
          stm     r12, {r2-r9}
  .Lcopy_block_done:
          // The saved 'data' slot sits just above the 64-byte block copy and
          // the saved r0; record the advanced message pointer there.
          str     r1, [sp, #68]

          // Build v[8..15].  v[10..15] go onto the stack, followed by an
          // 8-byte hole for spilling v[8..9]; v[8..9] themselves stay in
          // r8-r9.
          mov     r14, r0                         // r14 = ctx
          adr     r12, .Lblake2s_IV
          ldm     r12!, {r8-r9}                   // v[8..9] = IV[0..1]
          __ldrd  r0, r1, r14, B2S_CTX_F          // r0, r1 = f[0..1]
          ldm     r12, {r2-r7}                    // r2..r7 = IV[2..7]
          eor     r4, r4, r10                     // v[12] = IV[4] ^ t[0]
          eor     r5, r5, r11                     // v[13] = IV[5] ^ t[1]
          eor     r6, r6, r0                      // v[14] = IV[6] ^ f[0]
          eor     r7, r7, r1                      // v[15] = IV[7] ^ f[1]
          push    {r2-r7}                         // v[10..15]
          sub     sp, sp, #8                      // spill slot for v[8..9]

          // v[0..7] = h[0..7].
          ldm     r14, {r0-r7}

          // Ten rounds.  The arguments of each round are the message word
          // indices in the order that round uses them.
          .set brot, 0
          .set drot, 0
          _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
          _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
          _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
          _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
          _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
          _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
          _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
          _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
          _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
          _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0

          // Merge the final state matrix into the chaining value:
          //
          //      for (i = 0; i < 8; i++)
          //              h[i] ^= v[i] ^ v[i + 8];
          //
          // v[4..7] and v[12..15] are still owed rotations of 'brot' and
          // 'drot' bits, which are applied as part of the eor instructions.
          ldr     r14, [sp, #96]                  // r14 = &h[0] (saved ctx)
          add     sp, sp, #8                      // v[8..9] are in r8-r9 already
          pop     {r10-r11}                       // v[10..11]
          eor     r0, r0, r8
          eor     r1, r1, r9
          eor     r2, r2, r10
          eor     r3, r3, r11
          ldm     r14, {r8-r11}                   // old h[0..3]
          eor     r0, r0, r8
          eor     r1, r1, r9
          eor     r2, r2, r10
          eor     r3, r3, r11
          stm     r14!, {r0-r3}                   // new h[0..3]
          ldm     r14, {r0-r3}                    // old h[4..7]
          pop     {r8-r11}                        // v[12..15]
          eor     r0, r0, r4, ror #brot
          eor     r1, r1, r5, ror #brot
          eor     r2, r2, r6, ror #brot
          eor     r3, r3, r7, ror #brot
          eor     r0, r0, r8, ror #drot
          eor     r1, r1, r9, ror #drot
          eor     r2, r2, r10, ror #drot
          eor     r3, r3, r11, ror #drot
          add     sp, sp, #B2S_BLOCK_BYTES        // drop the message block copy
          stm     r14, {r0-r3}                    // new h[4..7]

          // Move on to the next block if any remain.  When more than one
          // block is processed, 'inc' (the counter increment) has to be 64,
          // so it is simply set to 64 here instead of being reloaded.
          ldm     sp, {r0-r2}                     // (ctx, data, nblocks)
          mov     r3, #B2S_BLOCK_BYTES            // inc = 64
          subs    r2, r2, #1                      // nblocks--
          str     r2, [sp, #8]                    // update the saved nblocks
          bne     .Lnext_block                    // loop while nblocks != 0
          pop     {r0-r2,r4-r11,pc}

          // Slow path for a message block (at r1) that is not 4-byte aligned
          // and therefore cannot be fetched with ldm.  The block is copied
          // one word at a time into the stack buffer at r12.  r2-r9 are free
          // here.
  .Lcopy_block_misaligned:
          mov     r2, #B2S_BLOCK_BYTES            // r2 = bytes still to copy
  .Lcopy_word:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
          ldr     r3, [r1], #4
          _le32_bswap r3, r4
#else
          // Assemble the little-endian word from its four bytes.
          ldrb    r3, [r1, #0]
          ldrb    r4, [r1, #1]
          ldrb    r5, [r1, #2]
          ldrb    r6, [r1, #3]
          add     r1, r1, #4
          orr     r3, r3, r4, lsl #8
          orr     r3, r3, r5, lsl #16
          orr     r3, r3, r6, lsl #24
#endif
          subs    r2, r2, #4
          str     r3, [r12], #4                   // str leaves the flags alone
          bne     .Lcopy_word
          b       .Lcopy_block_done
  ENDPROC(blake2s_compress)