| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408 |
- /* SPDX-License-Identifier: GPL-2.0-only */
- /*
- * Core SHA-224/SHA-256 transform using v8 Crypto Extensions
- *
- * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
- #include <linux/linkage.h>
- #include <asm/assembler.h>
- .text
- .arch armv8-a+crypto // allow the Crypto Extensions SHA-256 instructions used below
- dga .req q20 // q view of register 20; only named again by its .unreq in the code visible here
- dgav .req v20 // running state: the first 16 bytes at [x0]
- dgb .req q21 // q view of register 21; only named again by its .unreq in the code visible here
- dgbv .req v21 // running state: the second 16 bytes at [x0]
- t0 .req v22 // schedule words + round constants, consumed by add_only when ev == 0
- t1 .req v23 // schedule words + round constants, consumed by add_only when ev != 0
- dg0q .req q24 // working copy of dgav for the current block (q view, as used by sha256h)
- dg0v .req v24 // vector view of the same register
- dg1q .req q25 // working copy of dgbv for the current block (q view, as used by sha256h2)
- dg1v .req v25 // vector view of the same register
- dg2q .req q26 // snapshot of dg0 taken before sha256h overwrites it
- dg2v .req v26 // vector view of the same register
- .macro add_only, ev, rc, s0 // 4 rounds on dg0/dg1: ev picks the temp to consume (0: t0, else t1); the other temp is set to v<s0> + rc
- mov dg2v.16b, dg0v.16b // sha256h overwrites dg0, but sha256h2 still needs the old value
- .ifeq \ev
- add t1.4s, v\s0\().4s, \rc\().4s // ev == 0: prepare t1 for a later ev != 0 invocation
- sha256h dg0q, dg1q, t0.4s // ev == 0: consume t0
- sha256h2 dg1q, dg2q, t0.4s
- .else
- .ifnb \s0
- add t0.4s, v\s0\().4s, \rc\().4s // ev != 0 and s0 given: prepare t0 for a later ev == 0 invocation
- .endif
- sha256h dg0q, dg1q, t1.4s // ev != 0: consume t1 (when s0 is omitted nothing is prepared)
- sha256h2 dg1q, dg2q, t1.4s
- .endif
- .endm
- .macro add_update, ev, rc, s0, s1, s2, s3 // add_only plus a message schedule update that rewrites v<s0> in place
- sha256su0 v\s0\().4s, v\s1\().4s // first step of the schedule update, reads v<s1>
- add_only \ev, \rc, \s1 // 4 rounds; the other temp becomes v<s1> + rc
- sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s // second step of the schedule update, reads v<s2> and v<s3>
- .endm
- /*
- * The SHA-256 round constants: 64 32-bit words, four per line. The
- * load_round_constants macro below reads them into v0-v15, one line of
- * this table per vector register.
- */
- .section ".rodata", "a" // read-only data, kept out of .text
- .align 4 // 16-byte boundary (2^4)
- .Lsha2_rcon:
- .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 // ends up in v0
- .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
- .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
- .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
- .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
- .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
- .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
- .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
- .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
- .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
- .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
- .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
- .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
- .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
- .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
- .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 // ends up in v15
- .macro load_round_constants tmp // v0-v15 = the 64 round constants; tmp is a scratch x register and is clobbered
- adr_l \tmp, .Lsha2_rcon // tmp = address of the table (adr_l is defined outside this file)
- ld1 { v0.4s- v3.4s}, [\tmp], #64 // each load takes 64 bytes and post-increments tmp
- ld1 { v4.4s- v7.4s}, [\tmp], #64
- ld1 { v8.4s-v11.4s}, [\tmp], #64
- ld1 {v12.4s-v15.4s}, [\tmp] // last 64 bytes, no increment needed
- .endm
- /*
- * size_t __sha256_ce_transform(struct sha256_block_state *state,
- * const u8 *data, size_t nblocks);
- *
- * Folds 64-byte blocks read from data into the state at x0 and returns in
- * x0 the number of blocks that were not processed: 0 when the loop ran to
- * completion, possibly nonzero when cond_yield left the loop early. The
- * state is written back on both exits.
- *
- * Registers: x0 = state, x1 = data (advanced by 64 per block), x2 =
- * nblocks (counted down), x5/x6 are handed to cond_yield, x8 is a
- * temporary, v0-v15 hold the round constants, v16-v19 the message
- * schedule, v20-v26 the aliases declared at the top of the file.
- *
- * The block count is decremented before it is tested, so the code relies
- * on nblocks >= 1; presumably the callers guarantee this (TODO confirm).
- */
- .text
- SYM_FUNC_START(__sha256_ce_transform)
- load_round_constants x8 // v0-v15 = round constants; x8 is clobbered
- /* load state */
- ld1 {dgav.4s, dgbv.4s}, [x0] // 32 bytes: dgav = first half, dgbv = second half
- /* load input */
- 0: ld1 {v16.4s-v19.4s}, [x1], #64 // one 64-byte block; x1 moves to the next block
- sub x2, x2, #1 // one block fewer remaining
- CPU_LE( rev32 v16.16b, v16.16b ) // byte-swap each 32-bit word; CPU_LE is defined outside this file, presumably active on little-endian builds only
- CPU_LE( rev32 v17.16b, v17.16b )
- CPU_LE( rev32 v18.16b, v18.16b )
- CPU_LE( rev32 v19.16b, v19.16b )
- add t0.4s, v16.4s, v0.4s // seed t0 for the first add_update, which consumes it
- mov dg0v.16b, dgav.16b // work on copies so the block input state stays in dgav/dgbv
- mov dg1v.16b, dgbv.16b
- add_update 0, v1, 16, 17, 18, 19 // consumes t0, prepares t1 = v17 + v1, rewrites v16
- add_update 1, v2, 17, 18, 19, 16 // consumes t1, prepares t0 = v18 + v2, rewrites v17
- add_update 0, v3, 18, 19, 16, 17 // the schedule registers rotate by one on every step
- add_update 1, v4, 19, 16, 17, 18
- add_update 0, v5, 16, 17, 18, 19
- add_update 1, v6, 17, 18, 19, 16
- add_update 0, v7, 18, 19, 16, 17
- add_update 1, v8, 19, 16, 17, 18
- add_update 0, v9, 16, 17, 18, 19
- add_update 1, v10, 17, 18, 19, 16
- add_update 0, v11, 18, 19, 16, 17
- add_update 1, v12, 19, 16, 17, 18
- add_only 0, v13, 17 // from here on only rounds, no further schedule updates
- add_only 1, v14, 18
- add_only 0, v15, 19
- add_only 1 // final step: consumes t1, nothing left to prepare
- /* update state */
- add dgav.4s, dgav.4s, dg0v.4s // state += result of the rounds, per 32-bit lane
- add dgbv.4s, dgbv.4s, dg1v.4s
- /* return early if voluntary preemption is needed */
- cond_yield 1f, x5, x6 // defined outside this file; x5/x6 are presumably its scratch registers
- /* handled all input blocks? */
- cbnz x2, 0b
- /* store new state */
- 1: st1 {dgav.4s, dgbv.4s}, [x0]
- mov x0, x2 // return value: number of blocks left unprocessed
- ret
- SYM_FUNC_END(__sha256_ce_transform)
- .unreq dga // drop every alias of __sha256_ce_transform; v24-v26 get new names for sha256_ce_finup2x
- .unreq dgav
- .unreq dgb
- .unreq dgbv
- .unreq t0
- .unreq t1
- .unreq dg0q
- .unreq dg0v
- .unreq dg1q
- .unreq dg1v
- .unreq dg2q
- .unreq dg2v
- // parameters for sha256_ce_finup2x()
- ctx .req x0
- data1 .req x1 // advanced as the first message is consumed
- data2 .req x2 // advanced as the second message is consumed
- len .req w3 // 32-bit view; the function keeps it biased and tests its sign bit
- out1 .req x4
- out2 .req x5
- // other scalar variables
- count .req x6 // ctx->bytecount + len, used to build the length field of the padding
- final_step .req w7 // 0, 1 or 2: which padding block, if any, is still to be processed
- // x8-x9 are used as temporaries.
- // v0-v15 are used to cache the SHA-256 round constants.
- // v16-v19 are used for the message schedule for the first message.
- // v20-v23 are used for the message schedule for the second message.
- // v24-v31 are used for the state and temporaries as given below.
- // *_a are for the first message and *_b for the second.
- state0_a_q .req q24 // q views are what sha256h/sha256h2 take for the state operands
- state0_a .req v24
- state1_a_q .req q25
- state1_a .req v25
- state0_b_q .req q26
- state0_b .req v26
- state1_b_q .req q27
- state1_b .req v27
- t0_a .req v28 // schedule words + round constants for the current 4 rounds
- t0_b .req v29
- t1_a_q .req q30 // copy of state0 taken before sha256h overwrites it
- t1_a .req v30
- t1_b_q .req q31
- t1_b .req v31
- #define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
- #define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
- // offsetof(struct __sha256_ctx, state) is assumed to be 0.
- // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
- // and m0_b contain the current 4 message schedule words for the first
- // and second message respectively.
- //
- // If not all the message schedule words have been computed yet, then
- // this also computes 4 more message schedule words for each message.
- // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
- // the first message, and likewise m1_b-m3_b for the second. After
- // consuming the current value of m0_a, this macro computes the group
- // after m3_a and writes it to m0_a, and likewise for *_b. This means
- // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
- // m3_a, m0_a), and likewise for *_b, so the caller must cycle through
- // the registers accordingly. i is the index of the first of the 4 rounds and k is the vector holding their round constants; both messages share k.
- .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
- m0_b, m1_b, m2_b, m3_b
- add t0_a\().4s, \m0_a\().4s, \k\().4s // t0 = schedule words + round constants, taken before m0 is rewritten
- add t0_b\().4s, \m0_b\().4s, \k\().4s
- .if \i < 48
- sha256su0 \m0_a\().4s, \m1_a\().4s // only while i < 48: the last 16 rounds need no further schedule words
- sha256su0 \m0_b\().4s, \m1_b\().4s
- sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
- sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
- .endif
- mov t1_a.16b, state0_a.16b // sha256h overwrites state0, but sha256h2 still needs the old value
- mov t1_b.16b, state0_b.16b
- sha256h state0_a_q, state1_a_q, t0_a\().4s
- sha256h state0_b_q, state1_b_q, t0_b\().4s
- sha256h2 state1_a_q, t1_a_q, t0_a\().4s
- sha256h2 state1_b_q, t1_b_q, t0_b\().4s
- .endm
- .macro do_16rounds_2x i, k0, k1, k2, k3 // 16 rounds for both messages; i = index of the first round, k0-k3 = the four round-constant vectors
- do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23 // first message in v16-v19, second in v20-v23
- do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20 // register lists rotate by one per step
- do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
- do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22 // one more rotation restores the starting order
- .endm
- //
- // void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
- // const u8 *data1, const u8 *data2, int len,
- // u8 out1[SHA256_DIGEST_SIZE],
- // u8 out2[SHA256_DIGEST_SIZE]);
- //
- // This function computes the SHA-256 digests of two messages |data1| and
- // |data2| that are both |len| bytes long, starting from the initial context
- // |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
- //
- // The instructions for the two SHA-256 operations are interleaved. On many
- // CPUs, this is almost twice as fast as hashing each message individually due
- // to taking better advantage of the CPU's SHA-256 and SIMD throughput.
- // The function uses 128 bytes of stack as scratch space and only reads from ctx.
- SYM_FUNC_START(sha256_ce_finup2x)
- sub sp, sp, #128 // 128-byte scratch area, released again before ret
- mov final_step, #0 // no padding block has been processed yet
- load_round_constants x8 // v0-v15 = round constants; x8 is reused below
- // Load the initial state from ctx->state.
- ld1 {state0_a.4s-state1_a.4s}, [ctx]
- // Load ctx->bytecount. Take the mod 64 of it to get the number of
- // bytes that are buffered in ctx->buf. Also save it in a register with
- // len added to it.
- ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
- add count, x8, len, sxtw // count = bytecount + len
- and x8, x8, #63 // x8 = bytecount mod 64
- cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
- // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
- // followed by the first 64 - x8 bytes of data. Since len >= 64, we
- // just load 64 bytes from each of ctx->buf, data1, and data2
- // unconditionally and rearrange the data as needed.
- add x9, ctx, #OFFSETOF_BUF
- ld1 {v16.16b-v19.16b}, [x9]
- st1 {v16.16b-v19.16b}, [sp] // sp[0..63] = ctx->buf
- ld1 {v16.16b-v19.16b}, [data1], #64
- add x9, sp, x8 // x9 = &sp[x8], just past the buffered bytes
- st1 {v16.16b-v19.16b}, [x9] // lay the data bytes directly after the buffered bytes
- ld1 {v16.4s-v19.4s}, [sp] // first block of message 1: buffered bytes, then data1 bytes
- ld1 {v20.16b-v23.16b}, [data2], #64
- st1 {v20.16b-v23.16b}, [x9] // same again for message 2; the buffered bytes at sp are untouched
- ld1 {v20.4s-v23.4s}, [sp]
- sub len, len, #64 // together with the add below: len -= 64 - x8, the data bytes just consumed
- sub data1, data1, x8 // net effect with the post-increment: data1 += 64 - x8
- sub data2, data2, x8
- add len, len, w8
- mov state0_b.16b, state0_a.16b // both messages start from the same state
- mov state1_b.16b, state1_a.16b
- b .Lfinup2x_loop_have_data
- .Lfinup2x_enter_loop:
- sub len, len, #64 // len is now the data left after the block about to be loaded
- mov state0_b.16b, state0_a.16b // both messages start from the same state
- mov state1_b.16b, state1_a.16b
- .Lfinup2x_loop:
- // Load the next two data blocks.
- ld1 {v16.4s-v19.4s}, [data1], #64
- ld1 {v20.4s-v23.4s}, [data2], #64
- .Lfinup2x_loop_have_data:
- // Convert the words of the data blocks from big endian.
- CPU_LE( rev32 v16.16b, v16.16b )
- CPU_LE( rev32 v17.16b, v17.16b )
- CPU_LE( rev32 v18.16b, v18.16b )
- CPU_LE( rev32 v19.16b, v19.16b )
- CPU_LE( rev32 v20.16b, v20.16b )
- CPU_LE( rev32 v21.16b, v21.16b )
- CPU_LE( rev32 v22.16b, v22.16b )
- CPU_LE( rev32 v23.16b, v23.16b )
- .Lfinup2x_loop_have_bswapped_data:
- // Save the original state for each block.
- st1 {state0_a.4s-state1_b.4s}, [sp] // v24-v27 go to sp[0..63]
- // Do the SHA-256 rounds on each block.
- do_16rounds_2x 0, v0, v1, v2, v3
- do_16rounds_2x 16, v4, v5, v6, v7
- do_16rounds_2x 32, v8, v9, v10, v11
- do_16rounds_2x 48, v12, v13, v14, v15
- // Add the original state for each block.
- ld1 {v16.4s-v19.4s}, [sp] // the schedule registers are free again, reuse them for the saved state
- add state0_a.4s, state0_a.4s, v16.4s
- add state1_a.4s, state1_a.4s, v17.4s
- add state0_b.4s, state0_b.4s, v18.4s
- add state1_b.4s, state1_b.4s, v19.4s
- // Update len and loop back if more blocks remain.
- sub len, len, #64
- tbz len, #31, .Lfinup2x_loop // len >= 0?
- // Check if any final blocks need to be handled.
- // final_step = 2: all done
- // final_step = 1: need to do count-only padding block
- // final_step = 0: need to do the block with 0x80 padding byte
- tbnz final_step, #1, .Lfinup2x_done
- tbnz final_step, #0, .Lfinup2x_finalize_countonly
- add len, len, #64 // undo the last subtraction: len = leftover data bytes (0 to 63)
- cbz len, .Lfinup2x_finalize_blockaligned
- // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
- // To do this, write the padding starting with the 0x80 byte to
- // &sp[64]. Then for each message, copy the last 64 data bytes to sp
- // and load from &sp[64 - len] to get the needed padding block. This
- // code relies on the data buffers being >= 64 bytes in length.
- sub w8, len, #64 // w8 = len - 64
- add data1, data1, w8, sxtw // data1 += len - 64
- add data2, data2, w8, sxtw // data2 += len - 64
- CPU_LE( mov x9, #0x80 )
- CPU_LE( fmov d16, x9 )
- CPU_BE( movi v16.16b, #0 )
- CPU_BE( mov x9, #0x8000000000000000 )
- CPU_BE( mov v16.d[1], x9 )
- movi v17.16b, #0
- stp q16, q17, [sp, #64] // sp[64..95]: q16 (holding the 0x80 marker) and a zero vector
- stp q17, q17, [sp, #96] // sp[96..127]: zeros
- sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
- cmp len, #56
- b.ge 1f // will count spill into its own block?
- lsl count, count, #3 // bytes to bits
- CPU_LE( rev count, count )
- str count, [x9, #56] // length field = last 8 bytes of the padded block
- mov final_step, #2 // won't need count-only block
- b 2f
- 1:
- mov final_step, #1 // will need count-only block
- 2:
- ld1 {v16.16b-v19.16b}, [data1] // last 64 data bytes of message 1
- st1 {v16.16b-v19.16b}, [sp] // sp[0..63]; the padding at sp[64..] stays in place
- ld1 {v16.4s-v19.4s}, [x9] // final len data bytes followed by the padding
- ld1 {v20.16b-v23.16b}, [data2] // same again for message 2, sharing the padding
- st1 {v20.16b-v23.16b}, [sp]
- ld1 {v20.4s-v23.4s}, [x9]
- b .Lfinup2x_loop_have_data
- // Prepare a padding block, either:
- //
- // {0x80, 0, 0, 0, ..., count (as __be64)}
- // This is for a block aligned message.
- //
- // { 0, 0, 0, 0, ..., count (as __be64)}
- // This is for a message whose length mod 64 is >= 56.
- //
- // Pre-swap the endianness of the words.
- .Lfinup2x_finalize_countonly:
- movi v16.2d, #0 // the 0x80 byte was already part of the previous block
- b 1f
- .Lfinup2x_finalize_blockaligned:
- mov x8, #0x80000000 // 0x80 as the leading byte of the first word, already word-swapped
- fmov d16, x8
- 1:
- movi v17.2d, #0
- movi v18.2d, #0
- ror count, count, #29 // ror(lsl(count, 3), 32)
- mov v19.d[0], xzr
- mov v19.d[1], count // the length field occupies the last two words of the block
- mov v20.16b, v16.16b // message 2 gets the same padding block
- movi v21.2d, #0
- movi v22.2d, #0
- mov v23.16b, v19.16b
- mov final_step, #2 // nothing left to do after this block
- b .Lfinup2x_loop_have_bswapped_data
- .Lfinup2x_done:
- // Write the two digests with all bytes in the correct order.
- CPU_LE( rev32 state0_a.16b, state0_a.16b )
- CPU_LE( rev32 state1_a.16b, state1_a.16b )
- CPU_LE( rev32 state0_b.16b, state0_b.16b )
- CPU_LE( rev32 state1_b.16b, state1_b.16b )
- st1 {state0_a.4s-state1_a.4s}, [out1] // 32-byte digest of the first message
- st1 {state0_b.4s-state1_b.4s}, [out2] // 32-byte digest of the second message
- add sp, sp, #128 // release the scratch area
- ret
- SYM_FUNC_END(sha256_ce_finup2x)
|