| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491 |
- /* SPDX-License-Identifier: GPL-2.0 OR MIT */
- /*
- * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
- #define MASK_U32 0x3c /* Selects bits 2-5 of BYTES: the byte length of the whole 32-bit words in a partial block. */
- #define CHACHA20_BLOCK_SIZE 64 /* Bytes of IN/OUT handled per block iteration. */
- #define STACK_SIZE 32 /* Stack frame size in bytes; chacha_crypt_arch saves $s0-$s7 in it. */
- #define X0 $t0 /* X0-X15: registers that hold the 16 working state words. */
- #define X1 $t1
- #define X2 $t2
- #define X3 $t3
- #define X4 $t4
- #define X5 $t5
- #define X6 $t6
- #define X7 $t7
- #define X8 $t8
- #define X9 $t9
- #define X10 $v1
- #define X11 $s6
- #define X12 $s5
- #define X13 $s4
- #define X14 $s3
- #define X15 $s2
- /* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
- #define T0 $s1
- #define T1 $s0
- #define T(n) T ## n /* T(n)/X(n) paste an index onto the alias prefix, e.g. X(4) -> X4. */
- #define X(n) X ## n
- /* Input arguments */
- #define STATE $a0
- #define OUT $a1
- #define IN $a2
- #define BYTES $a3
- /* Output argument */
- /* NONCE[0] is kept in a register and not in memory.
- * We don't want to touch the original value in memory while blocks are processed.
- * Must be incremented every loop iteration.
- * chacha_crypt_arch loads it from 48(STATE) on entry and stores it back at
- * .Lchacha_mips_xor_done.
- */
- #define NONCE_0 $v0
- /* SAVED_X and SAVED_CA are set in the jump table.
- * Use regs which are overwritten on exit so we don't leak clear data.
- * They are used to handle the last bytes, which are not a multiple of 4.
- */
- #define SAVED_X X15
- #define SAVED_CA $s7
- #define IS_UNALIGNED $s7 /* Shares $s7 with SAVED_CA, which is only loaded in the partial-block path. */
- #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- #define MSB 0 /* Byte offset within a word that is passed to lwl/swl. */
- #define LSB 3 /* Byte offset within a word that is passed to lwr/swr. */
- #define CPU_TO_LE32(n) /* Byte-swap n: wsbh swaps the bytes in each halfword, rotr 16 swaps the halfwords. */ \
- wsbh n, n; \
- rotr n, 16;
- #else
- #define MSB 3 /* Byte offset within a word that is passed to lwl/swl. */
- #define LSB 0 /* Byte offset within a word that is passed to lwr/swr. */
- #define CPU_TO_LE32(n) /* Expands to nothing on little-endian builds. */
- #endif
- #define FOR_EACH_WORD(x) /* Expands to x(0); x(1); ... x(15); in ascending order. */ \
- x( 0); \
- x( 1); \
- x( 2); \
- x( 3); \
- x( 4); \
- x( 5); \
- x( 6); \
- x( 7); \
- x( 8); \
- x( 9); \
- x(10); \
- x(11); \
- x(12); \
- x(13); \
- x(14); \
- x(15);
- #define FOR_EACH_WORD_REV(x) /* Expands to x(15); x(14); ... x(0); in descending order. */ \
- x(15); \
- x(14); \
- x(13); \
- x(12); \
- x(11); \
- x(10); \
- x( 9); \
- x( 8); \
- x( 7); \
- x( 6); \
- x( 5); \
- x( 4); \
- x( 3); \
- x( 2); \
- x( 1); \
- x( 0);
- #define PLUS_ONE_0 1 /* PLUS_ONE(n) expands to the literal n+1 for n = 0..15; the STORE_* macros use it to build label names. */
- #define PLUS_ONE_1 2
- #define PLUS_ONE_2 3
- #define PLUS_ONE_3 4
- #define PLUS_ONE_4 5
- #define PLUS_ONE_5 6
- #define PLUS_ONE_6 7
- #define PLUS_ONE_7 8
- #define PLUS_ONE_8 9
- #define PLUS_ONE_9 10
- #define PLUS_ONE_10 11
- #define PLUS_ONE_11 12
- #define PLUS_ONE_12 13
- #define PLUS_ONE_13 14
- #define PLUS_ONE_14 15
- #define PLUS_ONE_15 16
- #define PLUS_ONE(x) PLUS_ONE_ ## x
- #define _CONCAT3(a,b,c) a ## b ## c /* Raw three-way token paste. */
- #define CONCAT3(a,b,c) _CONCAT3(a,b,c) /* Extra level so the arguments are macro-expanded before they are pasted. */
- #define STORE_UNALIGNED(x) /* Finish and store word x when IN or OUT is not word aligned.
- * First defines the label .Lchacha_mips_xor_unaligned_<x+1>_b, the branch target of
- * jump table entry x+1; because the stores are emitted for words 15 down to 0,
- * entering at that label processes words x down to 0.
- * Then adds the input state word to X<x> (NONCE_0 for word 12, otherwise the word
- * at x*4(STATE) loaded into T0), applies CPU_TO_LE32, XORs with the input word
- * read through lwl/lwr and writes the result through swl/swr.
- * Clobbers T1, X<x> and, except for word 12, T0.
- */ \
- CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
- .if (x != 12); \
- lw T0, (x*4)(STATE); \
- .endif; \
- lwl T1, (x*4)+MSB ## (IN); \
- lwr T1, (x*4)+LSB ## (IN); \
- .if (x == 12); \
- addu X ## x, NONCE_0; \
- .else; \
- addu X ## x, T0; \
- .endif; \
- CPU_TO_LE32(X ## x); \
- xor X ## x, T1; \
- swl X ## x, (x*4)+MSB ## (OUT); \
- swr X ## x, (x*4)+LSB ## (OUT);
- #define STORE_ALIGNED(x) /* Finish and store word x when IN and OUT are both word aligned.
- * First defines the label .Lchacha_mips_xor_aligned_<x+1>_b, the branch target of
- * jump table entry x+1; because the stores are emitted for words 15 down to 0,
- * entering at that label processes words x down to 0.
- * Then adds the input state word to X<x> (NONCE_0 for word 12, otherwise the word
- * at x*4(STATE) loaded into T0), applies CPU_TO_LE32, XORs with the input word
- * read with lw and writes the result with sw.
- * Clobbers T1, X<x> and, except for word 12, T0.
- */ \
- CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
- .if (x != 12); \
- lw T0, (x*4)(STATE); \
- .endif; \
- lw T1, (x*4) ## (IN); \
- .if (x == 12); \
- addu X ## x, NONCE_0; \
- .else; \
- addu X ## x, T0; \
- .endif; \
- CPU_TO_LE32(X ## x); \
- xor X ## x, T1; \
- sw X ## x, (x*4) ## (OUT);
- /* Jump table macro.
- * Used for setup and handling the last bytes, which are not a multiple of 4.
- * X15 is free to store Xn
- * Every jumptable entry must be equal in size.
- *
- * Entry x consists of a branch to .Lchacha_mips_xor_aligned_<x>_b followed, under
- * .set noreorder, by one addu in the branch delay slot that sets
- * SAVED_X = X<x> + SAVED_CA (X<x> + NONCE_0 for word 12). The dispatch code
- * advances 8 bytes per table index (ins of a multiple of 4 at bit 1), which
- * corresponds to these two instructions per entry.
- */
- #define JMPTBL_ALIGNED(x) \
- .Lchacha_mips_jmptbl_aligned_ ## x: ; \
- .set noreorder; \
- b .Lchacha_mips_xor_aligned_ ## x ## _b; \
- .if (x == 12); \
- addu SAVED_X, X ## x, NONCE_0; \
- .else; \
- addu SAVED_X, X ## x, SAVED_CA; \
- .endif; \
- .set reorder
- #define JMPTBL_UNALIGNED(x) /* Jump table entry x for the unaligned path.
- * A branch to .Lchacha_mips_xor_unaligned_<x>_b followed, under .set noreorder,
- * by one addu in the branch delay slot that sets SAVED_X = X<x> + SAVED_CA
- * (X<x> + NONCE_0 for word 12). Same layout as the aligned table entries; the
- * dispatch code relies on every entry having the same size.
- */ \
- .Lchacha_mips_jmptbl_unaligned_ ## x: ; \
- .set noreorder; \
- b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
- .if (x == 12); \
- addu SAVED_X, X ## x, NONCE_0; \
- .else; \
- addu SAVED_X, X ## x, SAVED_CA; \
- .endif; \
- .set reorder
- #define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) /* Four independent add-xor-rotate steps.
- * Arguments are word indices (resolved through X(n)) and S is a rotate count:
- * X(A) += X(K); X(B) += X(L); X(C) += X(M); X(D) += X(N);
- * X(V) ^= X(A); X(W) ^= X(B); X(Y) ^= X(C); X(Z) ^= X(D);
- * then X(V), X(W), X(Y), X(Z) are each rotated right by 32 - S bits, which is
- * a rotate left by S bits.
- */ \
- addu X(A), X(K); \
- addu X(B), X(L); \
- addu X(C), X(M); \
- addu X(D), X(N); \
- xor X(V), X(A); \
- xor X(W), X(B); \
- xor X(Y), X(C); \
- xor X(Z), X(D); \
- rotr X(V), 32 - S; \
- rotr X(W), 32 - S; \
- rotr X(Y), 32 - S; \
- rotr X(Z), 32 - S;
- .text /*
- * chacha_crypt_arch: XOR BYTES bytes read from IN with generated keystream and
- * write the result to OUT.
- *
- * Arguments: STATE ($a0) points to 16 32-bit state words, OUT ($a1), IN ($a2),
- * BYTES ($a3); the number of rounds is read from 16($sp) on entry.
- *
- * For every 64-byte block the state words are loaded into X0-X15 (word 12 comes
- * from NONCE_0), mixed by the round loop, added to the input state, XORed with
- * the input and stored. NONCE_0 is incremented once per block and written back
- * to 48(STATE) at .Lchacha_mips_xor_done. A trailing partial block is handled
- * through a jump table for its whole words and byte stores for the last 1-3
- * bytes. $s0-$s7 are saved on entry and restored on exit. When BYTES is 0 the
- * function returns without loading or storing anything else.
- */
- .set reorder
- .set noat
- .globl chacha_crypt_arch
- .ent chacha_crypt_arch
- chacha_crypt_arch:
- .frame $sp, STACK_SIZE, $ra
- /* Load number of rounds from 16($sp), read before the frame is allocated.
- * $at serves as the round counter, hence .set noat above.
- */
- lw $at, 16($sp)
- addiu $sp, -STACK_SIZE
- /* Return immediately when BYTES = 0. */
- beqz BYTES, .Lchacha_mips_end
- lw NONCE_0, 48(STATE)
- /* Save s0-s7 */
- sw $s0, 0($sp)
- sw $s1, 4($sp)
- sw $s2, 8($sp)
- sw $s3, 12($sp)
- sw $s4, 16($sp)
- sw $s5, 20($sp)
- sw $s6, 24($sp)
- sw $s7, 28($sp)
- /* Test IN or OUT is unaligned.
- * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
- */
- or IS_UNALIGNED, IN, OUT
- andi IS_UNALIGNED, 0x3
- b .Lchacha_rounds_start
- .align 4
- .Loop_chacha_rounds: /* Next block: advance both pointers and the block counter. */
- addiu IN, CHACHA20_BLOCK_SIZE
- addiu OUT, CHACHA20_BLOCK_SIZE
- addiu NONCE_0, 1
- .Lchacha_rounds_start: /* Load the working copy of the state; word 12 comes from NONCE_0, not memory. */
- lw X0, 0(STATE)
- lw X1, 4(STATE)
- lw X2, 8(STATE)
- lw X3, 12(STATE)
- lw X4, 16(STATE)
- lw X5, 20(STATE)
- lw X6, 24(STATE)
- lw X7, 28(STATE)
- lw X8, 32(STATE)
- lw X9, 36(STATE)
- lw X10, 40(STATE)
- lw X11, 44(STATE)
- move X12, NONCE_0
- lw X13, 52(STATE)
- lw X14, 56(STATE)
- lw X15, 60(STATE)
- .Loop_chacha_xor_rounds: /* One pass counts as two rounds ($at -= 2) and the loop ends only when $at is
- * exactly 0; presumably the caller always passes a positive even round count -
- * TODO confirm.
- */
- addiu $at, -2
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
- bnez $at, .Loop_chacha_xor_rounds
- addiu BYTES, -(CHACHA20_BLOCK_SIZE) /* BYTES -= 64; a negative result means this block is a partial one. */
- /* Is data src/dst unaligned? Jump */
- bnez IS_UNALIGNED, .Loop_chacha_unaligned
- /* Set number rounds here to fill delayslot.
- * The caller's slot is now at STACK_SIZE+16 because $sp was lowered on entry.
- */
- lw $at, (STACK_SIZE+16)($sp)
- /* BYTES < 0, it has no full block. */
- bltz BYTES, .Lchacha_mips_no_full_block_aligned
- FOR_EACH_WORD_REV(STORE_ALIGNED) /* Words 15 down to 0; the labels emitted inside are the jump table targets. */
- /* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha_rounds
- /* Place this here to fill delay slot.
- * NOTE(review): .set reorder is in effect here, so this addiu presumably runs
- * only on fall-through, while the loop path increments NONCE_0 at
- * .Loop_chacha_rounds - confirm.
- */
- addiu NONCE_0, 1
- /* BYTES < 0? Handle last bytes */
- bltz BYTES, .Lchacha_mips_xor_bytes
- .Lchacha_mips_xor_done:
- /* Restore used registers */
- lw $s0, 0($sp)
- lw $s1, 4($sp)
- lw $s2, 8($sp)
- lw $s3, 12($sp)
- lw $s4, 16($sp)
- lw $s5, 20($sp)
- lw $s6, 24($sp)
- lw $s7, 28($sp)
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
- .Lchacha_mips_end:
- addiu $sp, STACK_SIZE
- jr $ra
- .Lchacha_mips_no_full_block_aligned:
- /* Restore the offset on BYTES */
- addiu BYTES, CHACHA20_BLOCK_SIZE
- /* Get number of full WORDS (as a byte count: 4 * words, kept in $at) */
- andi $at, BYTES, MASK_U32
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
- /* Calculate lower half jump table offset.
- * Inserting $at at bit 1 of the still-zero low half yields 2 * $at, i.e.
- * 8 bytes per whole word.
- */
- ins T0, $at, 1, 6
- /* Add offset to STATE */
- addu T1, STATE, $at
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
- /* Read value from STATE: the input state word that belongs to the partial word */
- lw SAVED_CA, 0(T1)
- /* Store remaining bytecounter as negative value (0 down to -3) */
- subu BYTES, $at, BYTES
- jr T0
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_ALIGNED)
- .Loop_chacha_unaligned:
- /* Set number rounds here to fill delayslot.
- * The caller's slot is now at STACK_SIZE+16 because $sp was lowered on entry.
- */
- lw $at, (STACK_SIZE+16)($sp)
- /* BYTES < 0, it has no full block. */
- bltz BYTES, .Lchacha_mips_no_full_block_unaligned
- FOR_EACH_WORD_REV(STORE_UNALIGNED) /* Words 15 down to 0; the labels emitted inside are the jump table targets. */
- /* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha_rounds
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
- .set noreorder
- /* Fall through to byte handling when BYTES < 0.
- * Under .set noreorder the addiu below sits in the delay slot of this branch,
- * so it is executed whether or not the branch is taken.
- */
- bgez BYTES, .Lchacha_mips_xor_done
- .Lchacha_mips_xor_unaligned_0_b: /* Target of unaligned jump table entry 0 (no whole words left). */
- .Lchacha_mips_xor_aligned_0_b: /* Target of aligned jump table entry 0 (no whole words left). */
- /* Place this here to fill delay slot */
- addiu NONCE_0, 1
- .set reorder
- .Lchacha_mips_xor_bytes: /* Last 1-3 bytes. $at still holds the byte length of the whole words already
- * handled, BYTES is minus the number of bytes left, and SAVED_X holds the
- * keystream word whose low byte is consumed first.
- */
- addu IN, $at
- addu OUT, $at
- /* First byte */
- lbu T1, 0(IN)
- addiu $at, BYTES, 1
- xor T1, SAVED_X
- sb T1, 0(OUT)
- beqz $at, .Lchacha_mips_xor_done
- /* Second byte: rotate the next keystream byte into the low 8 bits */
- lbu T1, 1(IN)
- addiu $at, BYTES, 2
- rotr SAVED_X, 8
- xor T1, SAVED_X
- sb T1, 1(OUT)
- beqz $at, .Lchacha_mips_xor_done
- /* Third byte */
- lbu T1, 2(IN)
- rotr SAVED_X, 8
- xor T1, SAVED_X
- sb T1, 2(OUT)
- b .Lchacha_mips_xor_done
- .Lchacha_mips_no_full_block_unaligned:
- /* Restore the offset on BYTES */
- addiu BYTES, CHACHA20_BLOCK_SIZE
- /* Get number of full WORDS (as a byte count: 4 * words, kept in $at) */
- andi $at, BYTES, MASK_U32
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
- /* Calculate lower half jump table offset.
- * Inserting $at at bit 1 of the still-zero low half yields 2 * $at, i.e.
- * 8 bytes per whole word.
- */
- ins T0, $at, 1, 6
- /* Add offset to STATE */
- addu T1, STATE, $at
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
- /* Read value from STATE: the input state word that belongs to the partial word */
- lw SAVED_CA, 0(T1)
- /* Store remaining bytecounter as negative value (0 down to -3) */
- subu BYTES, $at, BYTES
- jr T0
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_UNALIGNED)
- .end chacha_crypt_arch
- .set at
- /* Input arguments
- * STATE $a0
- * OUT $a1
- * NROUND $a2
- *
- * hchacha_block_arch loads the 16 words at STATE into X0-X15, runs the round
- * loop with NROUND as the counter (decremented by 2 per pass) and stores words
- * 0-3 and 12-15 to OUT (32 bytes). No input state is added to the result and
- * nothing is written to STATE.
- *
- * X12-X15 are remapped below to $a3, $at, $v0 and the register of STATE itself,
- * so X11 ($s6) is the only register this function saves and restores. Because
- * X15 aliases STATE, word 15 has to be the last one loaded.
- */
- #undef X12
- #undef X13
- #undef X14
- #undef X15
- #define X12 $a3
- #define X13 $at
- #define X14 $v0
- #define X15 STATE
- .set noat
- .globl hchacha_block_arch
- .ent hchacha_block_arch
- hchacha_block_arch:
- .frame $sp, STACK_SIZE, $ra
- addiu $sp, -STACK_SIZE
- /* Save X11(s6) */
- sw X11, 0($sp)
- lw X0, 0(STATE)
- lw X1, 4(STATE)
- lw X2, 8(STATE)
- lw X3, 12(STATE)
- lw X4, 16(STATE)
- lw X5, 20(STATE)
- lw X6, 24(STATE)
- lw X7, 28(STATE)
- lw X8, 32(STATE)
- lw X9, 36(STATE)
- lw X10, 40(STATE)
- lw X11, 44(STATE)
- lw X12, 48(STATE)
- lw X13, 52(STATE)
- lw X14, 56(STATE)
- lw X15, 60(STATE)
- .Loop_hchacha_xor_rounds: /* One pass counts as two rounds ($a2 -= 2) and the loop ends only when $a2 is
- * exactly 0; presumably NROUND is always a positive even number - TODO confirm.
- */
- addiu $a2, -2
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
- bnez $a2, .Loop_hchacha_xor_rounds
- /* Restore used register */
- lw X11, 0($sp)
- sw X0, 0(OUT)
- sw X1, 4(OUT)
- sw X2, 8(OUT)
- sw X3, 12(OUT)
- sw X12, 16(OUT)
- sw X13, 20(OUT)
- sw X14, 24(OUT)
- sw X15, 28(OUT)
- addiu $sp, STACK_SIZE
- jr $ra
- .end hchacha_block_arch
- .set at
|