/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Core SHA-224/SHA-256 transform using v8 Crypto Extensions
 *
 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
 */
  7. #include <linux/linkage.h>
  8. #include <asm/assembler.h>
  9. .text
  10. .arch armv8-a+crypto
  11. dga .req q20
  12. dgav .req v20
  13. dgb .req q21
  14. dgbv .req v21
  15. t0 .req v22
  16. t1 .req v23
  17. dg0q .req q24
  18. dg0v .req v24
  19. dg1q .req q25
  20. dg1v .req v25
  21. dg2q .req q26
  22. dg2v .req v26
  23. .macro add_only, ev, rc, s0
  24. mov dg2v.16b, dg0v.16b
  25. .ifeq \ev
  26. add t1.4s, v\s0\().4s, \rc\().4s
  27. sha256h dg0q, dg1q, t0.4s
  28. sha256h2 dg1q, dg2q, t0.4s
  29. .else
  30. .ifnb \s0
  31. add t0.4s, v\s0\().4s, \rc\().4s
  32. .endif
  33. sha256h dg0q, dg1q, t1.4s
  34. sha256h2 dg1q, dg2q, t1.4s
  35. .endif
  36. .endm
  37. .macro add_update, ev, rc, s0, s1, s2, s3
  38. sha256su0 v\s0\().4s, v\s1\().4s
  39. add_only \ev, \rc, \s1
  40. sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
  41. .endm
  42. /*
  43. * The SHA-256 round constants
  44. */
  45. .section ".rodata", "a"
  46. .align 4
  47. .Lsha2_rcon:
  48. .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
  49. .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
  50. .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
  51. .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
  52. .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
  53. .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
  54. .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
  55. .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
  56. .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
  57. .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
  58. .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
  59. .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
  60. .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
  61. .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
  62. .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
  63. .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
  64. .macro load_round_constants tmp
  65. adr_l \tmp, .Lsha2_rcon
  66. ld1 { v0.4s- v3.4s}, [\tmp], #64
  67. ld1 { v4.4s- v7.4s}, [\tmp], #64
  68. ld1 { v8.4s-v11.4s}, [\tmp], #64
  69. ld1 {v12.4s-v15.4s}, [\tmp]
  70. .endm
  71. /*
  72. * size_t __sha256_ce_transform(struct sha256_block_state *state,
  73. * const u8 *data, size_t nblocks);
  74. */
  75. .text
  76. SYM_FUNC_START(__sha256_ce_transform)
  77. load_round_constants x8
  78. /* load state */
  79. ld1 {dgav.4s, dgbv.4s}, [x0]
  80. /* load input */
  81. 0: ld1 {v16.4s-v19.4s}, [x1], #64
  82. sub x2, x2, #1
  83. CPU_LE( rev32 v16.16b, v16.16b )
  84. CPU_LE( rev32 v17.16b, v17.16b )
  85. CPU_LE( rev32 v18.16b, v18.16b )
  86. CPU_LE( rev32 v19.16b, v19.16b )
  87. add t0.4s, v16.4s, v0.4s
  88. mov dg0v.16b, dgav.16b
  89. mov dg1v.16b, dgbv.16b
  90. add_update 0, v1, 16, 17, 18, 19
  91. add_update 1, v2, 17, 18, 19, 16
  92. add_update 0, v3, 18, 19, 16, 17
  93. add_update 1, v4, 19, 16, 17, 18
  94. add_update 0, v5, 16, 17, 18, 19
  95. add_update 1, v6, 17, 18, 19, 16
  96. add_update 0, v7, 18, 19, 16, 17
  97. add_update 1, v8, 19, 16, 17, 18
  98. add_update 0, v9, 16, 17, 18, 19
  99. add_update 1, v10, 17, 18, 19, 16
  100. add_update 0, v11, 18, 19, 16, 17
  101. add_update 1, v12, 19, 16, 17, 18
  102. add_only 0, v13, 17
  103. add_only 1, v14, 18
  104. add_only 0, v15, 19
  105. add_only 1
  106. /* update state */
  107. add dgav.4s, dgav.4s, dg0v.4s
  108. add dgbv.4s, dgbv.4s, dg1v.4s
  109. /* return early if voluntary preemption is needed */
  110. cond_yield 1f, x5, x6
  111. /* handled all input blocks? */
  112. cbnz x2, 0b
  113. /* store new state */
  114. 1: st1 {dgav.4s, dgbv.4s}, [x0]
  115. mov x0, x2
  116. ret
  117. SYM_FUNC_END(__sha256_ce_transform)
  118. .unreq dga
  119. .unreq dgav
  120. .unreq dgb
  121. .unreq dgbv
  122. .unreq t0
  123. .unreq t1
  124. .unreq dg0q
  125. .unreq dg0v
  126. .unreq dg1q
  127. .unreq dg1v
  128. .unreq dg2q
  129. .unreq dg2v
  130. // parameters for sha256_ce_finup2x()
  131. ctx .req x0
  132. data1 .req x1
  133. data2 .req x2
  134. len .req w3
  135. out1 .req x4
  136. out2 .req x5
  137. // other scalar variables
  138. count .req x6
  139. final_step .req w7
  140. // x8-x9 are used as temporaries.
  141. // v0-v15 are used to cache the SHA-256 round constants.
  142. // v16-v19 are used for the message schedule for the first message.
  143. // v20-v23 are used for the message schedule for the second message.
  144. // v24-v31 are used for the state and temporaries as given below.
  145. // *_a are for the first message and *_b for the second.
  146. state0_a_q .req q24
  147. state0_a .req v24
  148. state1_a_q .req q25
  149. state1_a .req v25
  150. state0_b_q .req q26
  151. state0_b .req v26
  152. state1_b_q .req q27
  153. state1_b .req v27
  154. t0_a .req v28
  155. t0_b .req v29
  156. t1_a_q .req q30
  157. t1_a .req v30
  158. t1_b_q .req q31
  159. t1_b .req v31
  160. #define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
  161. #define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
  162. // offsetof(struct __sha256_ctx, state) is assumed to be 0.
  163. // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
  164. // and m0_b contain the current 4 message schedule words for the first
  165. // and second message respectively.
  166. //
  167. // If not all the message schedule words have been computed yet, then
  168. // this also computes 4 more message schedule words for each message.
  169. // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
  170. // the first message, and likewise m1_b-m3_b for the second. After
  171. // consuming the current value of m0_a, this macro computes the group
  172. // after m3_a and writes it to m0_a, and likewise for *_b. This means
  173. // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
  174. // m3_a, m0_a), and likewise for *_b, so the caller must cycle through
  175. // the registers accordingly.
  176. .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
  177. m0_b, m1_b, m2_b, m3_b
  178. add t0_a\().4s, \m0_a\().4s, \k\().4s
  179. add t0_b\().4s, \m0_b\().4s, \k\().4s
  180. .if \i < 48
  181. sha256su0 \m0_a\().4s, \m1_a\().4s
  182. sha256su0 \m0_b\().4s, \m1_b\().4s
  183. sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
  184. sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
  185. .endif
  186. mov t1_a.16b, state0_a.16b
  187. mov t1_b.16b, state0_b.16b
  188. sha256h state0_a_q, state1_a_q, t0_a\().4s
  189. sha256h state0_b_q, state1_b_q, t0_b\().4s
  190. sha256h2 state1_a_q, t1_a_q, t0_a\().4s
  191. sha256h2 state1_b_q, t1_b_q, t0_b\().4s
  192. .endm
  193. .macro do_16rounds_2x i, k0, k1, k2, k3
  194. do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
  195. do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
  196. do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
  197. do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
  198. .endm
  199. //
  200. // void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
  201. // const u8 *data1, const u8 *data2, int len,
  202. // u8 out1[SHA256_DIGEST_SIZE],
  203. // u8 out2[SHA256_DIGEST_SIZE]);
  204. //
  205. // This function computes the SHA-256 digests of two messages |data1| and
  206. // |data2| that are both |len| bytes long, starting from the initial context
  207. // |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
  208. //
  209. // The instructions for the two SHA-256 operations are interleaved. On many
  210. // CPUs, this is almost twice as fast as hashing each message individually due
  211. // to taking better advantage of the CPU's SHA-256 and SIMD throughput.
  212. //
  213. SYM_FUNC_START(sha256_ce_finup2x)
  214. sub sp, sp, #128
  215. mov final_step, #0
  216. load_round_constants x8
  217. // Load the initial state from ctx->state.
  218. ld1 {state0_a.4s-state1_a.4s}, [ctx]
  219. // Load ctx->bytecount. Take the mod 64 of it to get the number of
  220. // bytes that are buffered in ctx->buf. Also save it in a register with
  221. // len added to it.
  222. ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
  223. add count, x8, len, sxtw
  224. and x8, x8, #63
  225. cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
  226. // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
  227. // followed by the first 64 - x8 bytes of data. Since len >= 64, we
  228. // just load 64 bytes from each of ctx->buf, data1, and data2
  229. // unconditionally and rearrange the data as needed.
  230. add x9, ctx, #OFFSETOF_BUF
  231. ld1 {v16.16b-v19.16b}, [x9]
  232. st1 {v16.16b-v19.16b}, [sp]
  233. ld1 {v16.16b-v19.16b}, [data1], #64
  234. add x9, sp, x8
  235. st1 {v16.16b-v19.16b}, [x9]
  236. ld1 {v16.4s-v19.4s}, [sp]
  237. ld1 {v20.16b-v23.16b}, [data2], #64
  238. st1 {v20.16b-v23.16b}, [x9]
  239. ld1 {v20.4s-v23.4s}, [sp]
  240. sub len, len, #64
  241. sub data1, data1, x8
  242. sub data2, data2, x8
  243. add len, len, w8
  244. mov state0_b.16b, state0_a.16b
  245. mov state1_b.16b, state1_a.16b
  246. b .Lfinup2x_loop_have_data
  247. .Lfinup2x_enter_loop:
  248. sub len, len, #64
  249. mov state0_b.16b, state0_a.16b
  250. mov state1_b.16b, state1_a.16b
  251. .Lfinup2x_loop:
  252. // Load the next two data blocks.
  253. ld1 {v16.4s-v19.4s}, [data1], #64
  254. ld1 {v20.4s-v23.4s}, [data2], #64
  255. .Lfinup2x_loop_have_data:
  256. // Convert the words of the data blocks from big endian.
  257. CPU_LE( rev32 v16.16b, v16.16b )
  258. CPU_LE( rev32 v17.16b, v17.16b )
  259. CPU_LE( rev32 v18.16b, v18.16b )
  260. CPU_LE( rev32 v19.16b, v19.16b )
  261. CPU_LE( rev32 v20.16b, v20.16b )
  262. CPU_LE( rev32 v21.16b, v21.16b )
  263. CPU_LE( rev32 v22.16b, v22.16b )
  264. CPU_LE( rev32 v23.16b, v23.16b )
  265. .Lfinup2x_loop_have_bswapped_data:
  266. // Save the original state for each block.
  267. st1 {state0_a.4s-state1_b.4s}, [sp]
  268. // Do the SHA-256 rounds on each block.
  269. do_16rounds_2x 0, v0, v1, v2, v3
  270. do_16rounds_2x 16, v4, v5, v6, v7
  271. do_16rounds_2x 32, v8, v9, v10, v11
  272. do_16rounds_2x 48, v12, v13, v14, v15
  273. // Add the original state for each block.
  274. ld1 {v16.4s-v19.4s}, [sp]
  275. add state0_a.4s, state0_a.4s, v16.4s
  276. add state1_a.4s, state1_a.4s, v17.4s
  277. add state0_b.4s, state0_b.4s, v18.4s
  278. add state1_b.4s, state1_b.4s, v19.4s
  279. // Update len and loop back if more blocks remain.
  280. sub len, len, #64
  281. tbz len, #31, .Lfinup2x_loop // len >= 0?
  282. // Check if any final blocks need to be handled.
  283. // final_step = 2: all done
  284. // final_step = 1: need to do count-only padding block
  285. // final_step = 0: need to do the block with 0x80 padding byte
  286. tbnz final_step, #1, .Lfinup2x_done
  287. tbnz final_step, #0, .Lfinup2x_finalize_countonly
  288. add len, len, #64
  289. cbz len, .Lfinup2x_finalize_blockaligned
  290. // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
  291. // To do this, write the padding starting with the 0x80 byte to
  292. // &sp[64]. Then for each message, copy the last 64 data bytes to sp
  293. // and load from &sp[64 - len] to get the needed padding block. This
  294. // code relies on the data buffers being >= 64 bytes in length.
  295. sub w8, len, #64 // w8 = len - 64
  296. add data1, data1, w8, sxtw // data1 += len - 64
  297. add data2, data2, w8, sxtw // data2 += len - 64
  298. CPU_LE( mov x9, #0x80 )
  299. CPU_LE( fmov d16, x9 )
  300. CPU_BE( movi v16.16b, #0 )
  301. CPU_BE( mov x9, #0x8000000000000000 )
  302. CPU_BE( mov v16.d[1], x9 )
  303. movi v17.16b, #0
  304. stp q16, q17, [sp, #64]
  305. stp q17, q17, [sp, #96]
  306. sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
  307. cmp len, #56
  308. b.ge 1f // will count spill into its own block?
  309. lsl count, count, #3
  310. CPU_LE( rev count, count )
  311. str count, [x9, #56]
  312. mov final_step, #2 // won't need count-only block
  313. b 2f
  314. 1:
  315. mov final_step, #1 // will need count-only block
  316. 2:
  317. ld1 {v16.16b-v19.16b}, [data1]
  318. st1 {v16.16b-v19.16b}, [sp]
  319. ld1 {v16.4s-v19.4s}, [x9]
  320. ld1 {v20.16b-v23.16b}, [data2]
  321. st1 {v20.16b-v23.16b}, [sp]
  322. ld1 {v20.4s-v23.4s}, [x9]
  323. b .Lfinup2x_loop_have_data
  324. // Prepare a padding block, either:
  325. //
  326. // {0x80, 0, 0, 0, ..., count (as __be64)}
  327. // This is for a block aligned message.
  328. //
  329. // { 0, 0, 0, 0, ..., count (as __be64)}
  330. // This is for a message whose length mod 64 is >= 56.
  331. //
  332. // Pre-swap the endianness of the words.
  333. .Lfinup2x_finalize_countonly:
  334. movi v16.2d, #0
  335. b 1f
  336. .Lfinup2x_finalize_blockaligned:
  337. mov x8, #0x80000000
  338. fmov d16, x8
  339. 1:
  340. movi v17.2d, #0
  341. movi v18.2d, #0
  342. ror count, count, #29 // ror(lsl(count, 3), 32)
  343. mov v19.d[0], xzr
  344. mov v19.d[1], count
  345. mov v20.16b, v16.16b
  346. movi v21.2d, #0
  347. movi v22.2d, #0
  348. mov v23.16b, v19.16b
  349. mov final_step, #2
  350. b .Lfinup2x_loop_have_bswapped_data
  351. .Lfinup2x_done:
  352. // Write the two digests with all bytes in the correct order.
  353. CPU_LE( rev32 state0_a.16b, state0_a.16b )
  354. CPU_LE( rev32 state1_a.16b, state1_a.16b )
  355. CPU_LE( rev32 state0_b.16b, state0_b.16b )
  356. CPU_LE( rev32 state1_b.16b, state1_b.16b )
  357. st1 {state0_a.4s-state1_a.4s}, [out1]
  358. st1 {state0_b.4s-state1_b.4s}, [out2]
  359. add sp, sp, #128
  360. ret
  361. SYM_FUNC_END(sha256_ce_finup2x)