| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847 |
- #!/usr/bin/env perl
- # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
- #
- # ====================================================================
- # Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
- # ====================================================================
- #
- # Poly1305 hash for RISC-V.
- #
- # February 2019
- #
- # In the essence it's pretty straightforward transliteration of MIPS
- # module [without big-endian option].
- #
- # 1.8 cycles per byte on U74, >100% faster than compiler-generated
- # code. 1.9 cpb on C910, ~75% improvement. 3.3 on Spacemit X60, ~69%
- # improvement.
- #
- # June 2024.
- #
- # Add CHERI support.
- #
- ######################################################################
- #
- ($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4));
- ($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31));
- ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17));
- ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27));
- #
- ######################################################################
- $flavour = shift || "64";
- for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
- open STDOUT,">$output";
- $code.=<<___;
- #ifdef __KERNEL__
- # ifdef __riscv_zicfilp
- # undef __riscv_zicfilp // calls are expected to be direct
- # endif
- #endif
- #if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
- # define __riscv_misaligned_fast 1
- #endif
- ___
- if ($flavour =~ /64/) {{{
- ######################################################################
- # 64-bit code path...
- #
- my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
- my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);
- $code.=<<___;
- #if __riscv_xlen == 64
- # if __SIZEOF_POINTER__ == 16
- # define PUSH csc
- # define POP clc
- # else
- # define PUSH sd
- # define POP ld
- # endif
- #else
- # error "unsupported __riscv_xlen"
- #endif
- .option pic
- .text
- .globl poly1305_init
- .type poly1305_init,\@function
- poly1305_init:
- #ifdef __riscv_zicfilp
- lpad 0
- #endif
- sd $zero,0($ctx)
- sd $zero,8($ctx)
- sd $zero,16($ctx)
- beqz $inp,.Lno_key
- #ifndef __riscv_misaligned_fast
- andi $tmp0,$inp,7 # $inp % 8
- andi $inp,$inp,-8 # align $inp
- slli $tmp0,$tmp0,3 # byte to bit offset
- #endif
- ld $in0,0($inp)
- ld $in1,8($inp)
- #ifndef __riscv_misaligned_fast
- beqz $tmp0,.Laligned_key
- ld $tmp2,16($inp)
- neg $tmp1,$tmp0 # implicit &63 in sll
- srl $in0,$in0,$tmp0
- sll $tmp3,$in1,$tmp1
- srl $in1,$in1,$tmp0
- sll $tmp2,$tmp2,$tmp1
- or $in0,$in0,$tmp3
- or $in1,$in1,$tmp2
- .Laligned_key:
- #endif
- li $tmp0,1
- slli $tmp0,$tmp0,32 # 0x0000000100000000
- addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1
- slli $tmp0,$tmp0,28 # 0x0ffffffc10000000
- addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff
- and $in0,$in0,$tmp0
- addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc
- and $in1,$in1,$tmp0
- sd $in0,24($ctx)
- srli $tmp0,$in1,2
- sd $in1,32($ctx)
- add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2)
- sd $tmp0,40($ctx)
- .Lno_key:
- li $a0,0 # return 0
- ret
- .size poly1305_init,.-poly1305_init
- ___
- {
- my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
- ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
- my ($shr,$shl) = ($t5,$t6); # used on R6
- $code.=<<___;
- .globl poly1305_blocks
- .type poly1305_blocks,\@function
- poly1305_blocks:
- #ifdef __riscv_zicfilp
- lpad 0
- #endif
- andi $len,$len,-16 # complete blocks only
- beqz $len,.Lno_data
- caddi $sp,$sp,-4*__SIZEOF_POINTER__
- PUSH $s0,3*__SIZEOF_POINTER__($sp)
- PUSH $s1,2*__SIZEOF_POINTER__($sp)
- PUSH $s2,1*__SIZEOF_POINTER__($sp)
- PUSH $s3,0*__SIZEOF_POINTER__($sp)
- #ifndef __riscv_misaligned_fast
- andi $shr,$inp,7
- andi $inp,$inp,-8 # align $inp
- slli $shr,$shr,3 # byte to bit offset
- neg $shl,$shr # implicit &63 in sll
- #endif
- ld $h0,0($ctx) # load hash value
- ld $h1,8($ctx)
- ld $h2,16($ctx)
- ld $r0,24($ctx) # load key
- ld $r1,32($ctx)
- ld $rs1,40($ctx)
- add $len,$len,$inp # end of buffer
- .Loop:
- ld $in0,0($inp) # load input
- ld $in1,8($inp)
- #ifndef __riscv_misaligned_fast
- beqz $shr,.Laligned_inp
- ld $tmp2,16($inp)
- srl $in0,$in0,$shr
- sll $tmp3,$in1,$shl
- srl $in1,$in1,$shr
- sll $tmp2,$tmp2,$shl
- or $in0,$in0,$tmp3
- or $in1,$in1,$tmp2
- .Laligned_inp:
- #endif
- caddi $inp,$inp,16
- andi $tmp0,$h2,-4 # modulo-scheduled reduction
- srli $tmp1,$h2,2
- andi $h2,$h2,3
- add $d0,$h0,$in0 # accumulate input
- add $tmp1,$tmp1,$tmp0
- sltu $tmp0,$d0,$h0
- add $d0,$d0,$tmp1 # ... and residue
- sltu $tmp1,$d0,$tmp1
- add $d1,$h1,$in1
- add $tmp0,$tmp0,$tmp1
- sltu $tmp1,$d1,$h1
- add $d1,$d1,$tmp0
- add $d2,$h2,$padbit
- sltu $tmp0,$d1,$tmp0
- mulhu $h1,$r0,$d0 # h0*r0
- mul $h0,$r0,$d0
- add $d2,$d2,$tmp1
- add $d2,$d2,$tmp0
- mulhu $tmp1,$rs1,$d1 # h1*5*r1
- mul $tmp0,$rs1,$d1
- mulhu $h2,$r1,$d0 # h0*r1
- mul $tmp2,$r1,$d0
- add $h0,$h0,$tmp0
- add $h1,$h1,$tmp1
- sltu $tmp0,$h0,$tmp0
- add $h1,$h1,$tmp0
- add $h1,$h1,$tmp2
- mulhu $tmp1,$r0,$d1 # h1*r0
- mul $tmp0,$r0,$d1
- sltu $tmp2,$h1,$tmp2
- add $h2,$h2,$tmp2
- mul $tmp2,$rs1,$d2 # h2*5*r1
- add $h1,$h1,$tmp0
- add $h2,$h2,$tmp1
- mul $tmp3,$r0,$d2 # h2*r0
- sltu $tmp0,$h1,$tmp0
- add $h2,$h2,$tmp0
- add $h1,$h1,$tmp2
- sltu $tmp2,$h1,$tmp2
- add $h2,$h2,$tmp2
- add $h2,$h2,$tmp3
- bne $inp,$len,.Loop
- sd $h0,0($ctx) # store hash value
- sd $h1,8($ctx)
- sd $h2,16($ctx)
- POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue
- POP $s1,2*__SIZEOF_POINTER__($sp)
- POP $s2,1*__SIZEOF_POINTER__($sp)
- POP $s3,0*__SIZEOF_POINTER__($sp)
- caddi $sp,$sp,4*__SIZEOF_POINTER__
- .Lno_data:
- ret
- .size poly1305_blocks,.-poly1305_blocks
- ___
- }
- {
- my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
- $code.=<<___;
- .globl poly1305_emit
- .type poly1305_emit,\@function
- poly1305_emit:
- #ifdef __riscv_zicfilp
- lpad 0
- #endif
- ld $tmp2,16($ctx)
- ld $tmp0,0($ctx)
- ld $tmp1,8($ctx)
- andi $in0,$tmp2,-4 # final reduction
- srl $in1,$tmp2,2
- andi $tmp2,$tmp2,3
- add $in0,$in0,$in1
- add $tmp0,$tmp0,$in0
- sltu $in1,$tmp0,$in0
- addi $in0,$tmp0,5 # compare to modulus
- add $tmp1,$tmp1,$in1
- sltiu $tmp3,$in0,5
- sltu $tmp4,$tmp1,$in1
- add $in1,$tmp1,$tmp3
- add $tmp2,$tmp2,$tmp4
- sltu $tmp3,$in1,$tmp3
- add $tmp2,$tmp2,$tmp3
- srli $tmp2,$tmp2,2 # see if it carried/borrowed
- neg $tmp2,$tmp2
- xor $in0,$in0,$tmp0
- xor $in1,$in1,$tmp1
- and $in0,$in0,$tmp2
- and $in1,$in1,$tmp2
- xor $in0,$in0,$tmp0
- xor $in1,$in1,$tmp1
- lwu $tmp0,0($nonce) # load nonce
- lwu $tmp1,4($nonce)
- lwu $tmp2,8($nonce)
- lwu $tmp3,12($nonce)
- slli $tmp1,$tmp1,32
- slli $tmp3,$tmp3,32
- or $tmp0,$tmp0,$tmp1
- or $tmp2,$tmp2,$tmp3
- add $in0,$in0,$tmp0 # accumulate nonce
- add $in1,$in1,$tmp2
- sltu $tmp0,$in0,$tmp0
- add $in1,$in1,$tmp0
- #ifdef __riscv_misaligned_fast
- sd $in0,0($mac) # write mac value
- sd $in1,8($mac)
- #else
- srli $tmp0,$in0,8 # write mac value
- srli $tmp1,$in0,16
- srli $tmp2,$in0,24
- sb $in0,0($mac)
- srli $tmp3,$in0,32
- sb $tmp0,1($mac)
- srli $tmp0,$in0,40
- sb $tmp1,2($mac)
- srli $tmp1,$in0,48
- sb $tmp2,3($mac)
- srli $tmp2,$in0,56
- sb $tmp3,4($mac)
- srli $tmp3,$in1,8
- sb $tmp0,5($mac)
- srli $tmp0,$in1,16
- sb $tmp1,6($mac)
- srli $tmp1,$in1,24
- sb $tmp2,7($mac)
- sb $in1,8($mac)
- srli $tmp2,$in1,32
- sb $tmp3,9($mac)
- srli $tmp3,$in1,40
- sb $tmp0,10($mac)
- srli $tmp0,$in1,48
- sb $tmp1,11($mac)
- srli $tmp1,$in1,56
- sb $tmp2,12($mac)
- sb $tmp3,13($mac)
- sb $tmp0,14($mac)
- sb $tmp1,15($mac)
- #endif
- ret
- .size poly1305_emit,.-poly1305_emit
- .string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
- ___
- }
- }}} else {{{
- ######################################################################
- # 32-bit code path
- #
- my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
- my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
- ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);
- $code.=<<___;
- #if __riscv_xlen == 32
- # if __SIZEOF_POINTER__ == 8
- # define PUSH csc
- # define POP clc
- # else
- # define PUSH sw
- # define POP lw
- # endif
- # define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b
- # define srliw srli
- # define srlw srl
- # define sllw sll
- # define addw add
- # define addiw addi
- # define mulw mul
- #elif __riscv_xlen == 64
- # if __SIZEOF_POINTER__ == 16
- # define PUSH csc
- # define POP clc
- # else
- # define PUSH sd
- # define POP ld
- # endif
- # define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
- #else
- # error "unsupported __riscv_xlen"
- #endif
- .option pic
- .text
- .globl poly1305_init
- .type poly1305_init,\@function
- poly1305_init:
- #ifdef __riscv_zicfilp
- lpad 0
- #endif
- sw $zero,0($ctx)
- sw $zero,4($ctx)
- sw $zero,8($ctx)
- sw $zero,12($ctx)
- sw $zero,16($ctx)
- beqz $inp,.Lno_key
- #ifndef __riscv_misaligned_fast
- andi $tmp0,$inp,3 # $inp % 4
- sub $inp,$inp,$tmp0 # align $inp
- sll $tmp0,$tmp0,3 # byte to bit offset
- #endif
- lw $in0,0($inp)
- lw $in1,4($inp)
- lw $in2,8($inp)
- lw $in3,12($inp)
- #ifndef __riscv_misaligned_fast
- beqz $tmp0,.Laligned_key
- lw $tmp2,16($inp)
- sub $tmp1,$zero,$tmp0
- srlw $in0,$in0,$tmp0
- sllw $tmp3,$in1,$tmp1
- srlw $in1,$in1,$tmp0
- or $in0,$in0,$tmp3
- sllw $tmp3,$in2,$tmp1
- srlw $in2,$in2,$tmp0
- or $in1,$in1,$tmp3
- sllw $tmp3,$in3,$tmp1
- srlw $in3,$in3,$tmp0
- or $in2,$in2,$tmp3
- sllw $tmp2,$tmp2,$tmp1
- or $in3,$in3,$tmp2
- .Laligned_key:
- #endif
- lui $tmp0,0x10000
- addi $tmp0,$tmp0,-1 # 0x0fffffff
- and $in0,$in0,$tmp0
- addi $tmp0,$tmp0,-3 # 0x0ffffffc
- and $in1,$in1,$tmp0
- and $in2,$in2,$tmp0
- and $in3,$in3,$tmp0
- sw $in0,20($ctx)
- sw $in1,24($ctx)
- sw $in2,28($ctx)
- sw $in3,32($ctx)
- srlw $tmp1,$in1,2
- srlw $tmp2,$in2,2
- srlw $tmp3,$in3,2
- addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
- addw $in2,$in2,$tmp2
- addw $in3,$in3,$tmp3
- sw $in1,36($ctx)
- sw $in2,40($ctx)
- sw $in3,44($ctx)
- .Lno_key:
- li $a0,0
- ret
- .size poly1305_init,.-poly1305_init
- ___
- {
- my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
- ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
- my ($d0,$d1,$d2,$d3) =
- ($a4,$a5,$a6,$a7);
- my $shr = $ra; # used on R6
- $code.=<<___;
- .globl poly1305_blocks
- .type poly1305_blocks,\@function
- poly1305_blocks:
- #ifdef __riscv_zicfilp
- lpad 0
- #endif
- andi $len,$len,-16 # complete blocks only
- beqz $len,.Labort
- #ifdef __riscv_zcmp
- cm.push {ra,s0-s8}, -48
- #else
- caddi $sp,$sp,-__SIZEOF_POINTER__*12
- PUSH $ra, __SIZEOF_POINTER__*11($sp)
- PUSH $s0, __SIZEOF_POINTER__*10($sp)
- PUSH $s1, __SIZEOF_POINTER__*9($sp)
- PUSH $s2, __SIZEOF_POINTER__*8($sp)
- PUSH $s3, __SIZEOF_POINTER__*7($sp)
- PUSH $s4, __SIZEOF_POINTER__*6($sp)
- PUSH $s5, __SIZEOF_POINTER__*5($sp)
- PUSH $s6, __SIZEOF_POINTER__*4($sp)
- PUSH $s7, __SIZEOF_POINTER__*3($sp)
- PUSH $s8, __SIZEOF_POINTER__*2($sp)
- #endif
- #ifndef __riscv_misaligned_fast
- andi $shr,$inp,3
- andi $inp,$inp,-4 # align $inp
- slli $shr,$shr,3 # byte to bit offset
- #endif
- lw $h0,0($ctx) # load hash value
- lw $h1,4($ctx)
- lw $h2,8($ctx)
- lw $h3,12($ctx)
- lw $h4,16($ctx)
- lw $r0,20($ctx) # load key
- lw $r1,24($ctx)
- lw $r2,28($ctx)
- lw $r3,32($ctx)
- lw $rs1,36($ctx)
- lw $rs2,40($ctx)
- lw $rs3,44($ctx)
- add $len,$len,$inp # end of buffer
- .Loop:
- lw $d0,0($inp) # load input
- lw $d1,4($inp)
- lw $d2,8($inp)
- lw $d3,12($inp)
- #ifndef __riscv_misaligned_fast
- beqz $shr,.Laligned_inp
- lw $t4,16($inp)
- sub $t5,$zero,$shr
- srlw $d0,$d0,$shr
- sllw $t3,$d1,$t5
- srlw $d1,$d1,$shr
- or $d0,$d0,$t3
- sllw $t3,$d2,$t5
- srlw $d2,$d2,$shr
- or $d1,$d1,$t3
- sllw $t3,$d3,$t5
- srlw $d3,$d3,$shr
- or $d2,$d2,$t3
- sllw $t4,$t4,$t5
- or $d3,$d3,$t4
- .Laligned_inp:
- #endif
- srliw $t3,$h4,2 # modulo-scheduled reduction
- andi $t4,$h4,-4
- andi $h4,$h4,3
- addw $d0,$d0,$h0 # accumulate input
- addw $t4,$t4,$t3
- sltu $h0,$d0,$h0
- addw $d0,$d0,$t4 # ... and residue
- sltu $t4,$d0,$t4
- addw $d1,$d1,$h1
- addw $h0,$h0,$t4 # carry
- sltu $h1,$d1,$h1
- addw $d1,$d1,$h0
- sltu $h0,$d1,$h0
- addw $d2,$d2,$h2
- addw $h1,$h1,$h0 # carry
- sltu $h2,$d2,$h2
- addw $d2,$d2,$h1
- sltu $h1,$d2,$h1
- addw $d3,$d3,$h3
- addw $h2,$h2,$h1 # carry
- sltu $h3,$d3,$h3
- addw $d3,$d3,$h2
- MULX ($h1,$h0,$r0,$d0) # d0*r0
- sltu $h2,$d3,$h2
- addw $h3,$h3,$h2 # carry
- MULX ($t4,$t3,$rs3,$d1) # d1*s3
- addw $h4,$h4,$padbit
- caddi $inp,$inp,16
- addw $h4,$h4,$h3
- MULX ($t6,$a3,$rs2,$d2) # d2*s2
- addw $h0,$h0,$t3
- addw $h1,$h1,$t4
- sltu $t3,$h0,$t3
- addw $h1,$h1,$t3
- MULX ($t4,$t3,$rs1,$d3) # d3*s1
- addw $h0,$h0,$a3
- addw $h1,$h1,$t6
- sltu $a3,$h0,$a3
- addw $h1,$h1,$a3
- MULX ($h2,$a3,$r1,$d0) # d0*r1
- addw $h0,$h0,$t3
- addw $h1,$h1,$t4
- sltu $t3,$h0,$t3
- addw $h1,$h1,$t3
- MULX ($t4,$t3,$r0,$d1) # d1*r0
- addw $h1,$h1,$a3
- sltu $a3,$h1,$a3
- addw $h2,$h2,$a3
- MULX ($t6,$a3,$rs3,$d2) # d2*s3
- addw $h1,$h1,$t3
- addw $h2,$h2,$t4
- sltu $t3,$h1,$t3
- addw $h2,$h2,$t3
- MULX ($t4,$t3,$rs2,$d3) # d3*s2
- addw $h1,$h1,$a3
- addw $h2,$h2,$t6
- sltu $a3,$h1,$a3
- addw $h2,$h2,$a3
- mulw $a3,$rs1,$h4 # h4*s1
- addw $h1,$h1,$t3
- addw $h2,$h2,$t4
- sltu $t3,$h1,$t3
- addw $h2,$h2,$t3
- MULX ($h3,$t3,$r2,$d0) # d0*r2
- addw $h1,$h1,$a3
- sltu $a3,$h1,$a3
- addw $h2,$h2,$a3
- MULX ($t6,$a3,$r1,$d1) # d1*r1
- addw $h2,$h2,$t3
- sltu $t3,$h2,$t3
- addw $h3,$h3,$t3
- MULX ($t4,$t3,$r0,$d2) # d2*r0
- addw $h2,$h2,$a3
- addw $h3,$h3,$t6
- sltu $a3,$h2,$a3
- addw $h3,$h3,$a3
- MULX ($t6,$a3,$rs3,$d3) # d3*s3
- addw $h2,$h2,$t3
- addw $h3,$h3,$t4
- sltu $t3,$h2,$t3
- addw $h3,$h3,$t3
- mulw $t3,$rs2,$h4 # h4*s2
- addw $h2,$h2,$a3
- addw $h3,$h3,$t6
- sltu $a3,$h2,$a3
- addw $h3,$h3,$a3
- MULX ($t6,$a3,$r3,$d0) # d0*r3
- addw $h2,$h2,$t3
- sltu $t3,$h2,$t3
- addw $h3,$h3,$t3
- MULX ($t4,$t3,$r2,$d1) # d1*r2
- addw $h3,$h3,$a3
- sltu $a3,$h3,$a3
- addw $t6,$t6,$a3
- MULX ($a3,$d3,$r0,$d3) # d3*r0
- addw $h3,$h3,$t3
- addw $t6,$t6,$t4
- sltu $t3,$h3,$t3
- addw $t6,$t6,$t3
- MULX ($t4,$t3,$r1,$d2) # d2*r1
- addw $h3,$h3,$d3
- addw $t6,$t6,$a3
- sltu $d3,$h3,$d3
- addw $t6,$t6,$d3
- mulw $a3,$rs3,$h4 # h4*s3
- addw $h3,$h3,$t3
- addw $t6,$t6,$t4
- sltu $t3,$h3,$t3
- addw $t6,$t6,$t3
- mulw $h4,$r0,$h4 # h4*r0
- addw $h3,$h3,$a3
- sltu $a3,$h3,$a3
- addw $t6,$t6,$a3
- addw $h4,$t6,$h4
- li $padbit,1 # if we loop, padbit is 1
- bne $inp,$len,.Loop
- sw $h0,0($ctx) # store hash value
- sw $h1,4($ctx)
- sw $h2,8($ctx)
- sw $h3,12($ctx)
- sw $h4,16($ctx)
- #ifdef __riscv_zcmp
- cm.popret {ra,s0-s8}, 48
- #else
- POP $ra, __SIZEOF_POINTER__*11($sp)
- POP $s0, __SIZEOF_POINTER__*10($sp)
- POP $s1, __SIZEOF_POINTER__*9($sp)
- POP $s2, __SIZEOF_POINTER__*8($sp)
- POP $s3, __SIZEOF_POINTER__*7($sp)
- POP $s4, __SIZEOF_POINTER__*6($sp)
- POP $s5, __SIZEOF_POINTER__*5($sp)
- POP $s6, __SIZEOF_POINTER__*4($sp)
- POP $s7, __SIZEOF_POINTER__*3($sp)
- POP $s8, __SIZEOF_POINTER__*2($sp)
- caddi $sp,$sp,__SIZEOF_POINTER__*12
- #endif
- .Labort:
- ret
- .size poly1305_blocks,.-poly1305_blocks
- ___
- }
- {
- my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
- $code.=<<___;
- .globl poly1305_emit
- .type poly1305_emit,\@function
- poly1305_emit:
- #ifdef __riscv_zicfilp
- lpad 0
- #endif
- lw $tmp4,16($ctx)
- lw $tmp0,0($ctx)
- lw $tmp1,4($ctx)
- lw $tmp2,8($ctx)
- lw $tmp3,12($ctx)
- srliw $ctx,$tmp4,2 # final reduction
- andi $in0,$tmp4,-4
- andi $tmp4,$tmp4,3
- addw $ctx,$ctx,$in0
- addw $tmp0,$tmp0,$ctx
- sltu $ctx,$tmp0,$ctx
- addiw $in0,$tmp0,5 # compare to modulus
- addw $tmp1,$tmp1,$ctx
- sltiu $in1,$in0,5
- sltu $ctx,$tmp1,$ctx
- addw $in1,$in1,$tmp1
- addw $tmp2,$tmp2,$ctx
- sltu $in2,$in1,$tmp1
- sltu $ctx,$tmp2,$ctx
- addw $in2,$in2,$tmp2
- addw $tmp3,$tmp3,$ctx
- sltu $in3,$in2,$tmp2
- sltu $ctx,$tmp3,$ctx
- addw $in3,$in3,$tmp3
- addw $tmp4,$tmp4,$ctx
- sltu $ctx,$in3,$tmp3
- addw $ctx,$ctx,$tmp4
- srl $ctx,$ctx,2 # see if it carried/borrowed
- sub $ctx,$zero,$ctx
- xor $in0,$in0,$tmp0
- xor $in1,$in1,$tmp1
- xor $in2,$in2,$tmp2
- xor $in3,$in3,$tmp3
- and $in0,$in0,$ctx
- and $in1,$in1,$ctx
- and $in2,$in2,$ctx
- and $in3,$in3,$ctx
- xor $in0,$in0,$tmp0
- xor $in1,$in1,$tmp1
- xor $in2,$in2,$tmp2
- xor $in3,$in3,$tmp3
- lw $tmp0,0($nonce) # load nonce
- lw $tmp1,4($nonce)
- lw $tmp2,8($nonce)
- lw $tmp3,12($nonce)
- addw $in0,$in0,$tmp0 # accumulate nonce
- sltu $ctx,$in0,$tmp0
- addw $in1,$in1,$tmp1
- sltu $tmp1,$in1,$tmp1
- addw $in1,$in1,$ctx
- sltu $ctx,$in1,$ctx
- addw $ctx,$ctx,$tmp1
- addw $in2,$in2,$tmp2
- sltu $tmp2,$in2,$tmp2
- addw $in2,$in2,$ctx
- sltu $ctx,$in2,$ctx
- addw $ctx,$ctx,$tmp2
- addw $in3,$in3,$tmp3
- addw $in3,$in3,$ctx
- #ifdef __riscv_misaligned_fast
- sw $in0,0($mac) # write mac value
- sw $in1,4($mac)
- sw $in2,8($mac)
- sw $in3,12($mac)
- #else
- srl $tmp0,$in0,8 # write mac value
- srl $tmp1,$in0,16
- srl $tmp2,$in0,24
- sb $in0, 0($mac)
- sb $tmp0,1($mac)
- srl $tmp0,$in1,8
- sb $tmp1,2($mac)
- srl $tmp1,$in1,16
- sb $tmp2,3($mac)
- srl $tmp2,$in1,24
- sb $in1, 4($mac)
- sb $tmp0,5($mac)
- srl $tmp0,$in2,8
- sb $tmp1,6($mac)
- srl $tmp1,$in2,16
- sb $tmp2,7($mac)
- srl $tmp2,$in2,24
- sb $in2, 8($mac)
- sb $tmp0,9($mac)
- srl $tmp0,$in3,8
- sb $tmp1,10($mac)
- srl $tmp1,$in3,16
- sb $tmp2,11($mac)
- srl $tmp2,$in3,24
- sb $in3, 12($mac)
- sb $tmp0,13($mac)
- sb $tmp1,14($mac)
- sb $tmp2,15($mac)
- #endif
- ret
- .size poly1305_emit,.-poly1305_emit
- .string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
- ___
- }
- }}}
- foreach (split("\n", $code)) {
- if ($flavour =~ /^cheri/) {
- s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
- s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
- s/\b(ret|jal)\b/c$1/;
- s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
- m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
- } else {
- s/\bcaddi?\b/add/ or
- s/\bcmove\b/mv/;
- }
- print $_, "\n";
- }
- close STDOUT;
|