poly1305-riscv.pl 17 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847
  1. #!/usr/bin/env perl
  2. # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
  3. #
  4. # ====================================================================
  5. # Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
  6. # ====================================================================
  7. #
  8. # Poly1305 hash for RISC-V.
  9. #
  10. # February 2019
  11. #
  12. # In essence it's a pretty straightforward transliteration of the MIPS
  13. # module [without the big-endian option].
  14. #
  15. # 1.8 cycles per byte on U74, >100% faster than compiler-generated
  16. # code. 1.9 cpb on C910, ~75% improvement. 3.3 on Spacemit X60, ~69%
  17. # improvement.
  18. #
  19. # June 2024.
  20. #
  21. # Add CHERI support.
  22. #
  23. ######################################################################
  24. #
  # Symbolic register names used by the templates below, each bound to
  # the numbered "xN" register it stands for.
  ($zero, $ra, $sp, $gp, $tp) = map { "x$_" } 0 .. 4;
  # Temporaries live in two separate ranges: x5-x7 and x28-x31.
  ($t0, $t1, $t2)      = map { "x$_" } 5 .. 7;
  ($t3, $t4, $t5, $t6) = map { "x$_" } 28 .. 31;
  # Argument registers x10-x17.
  ($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map { "x$_" } 10 .. 17;
  # Saved registers: x8, x9, then x18-x27.
  ($s0, $s1) = map { "x$_" } 8, 9;
  ($s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
      map { "x$_" } 18 .. 27;
  29. #
  30. ######################################################################
  # Command line: [flavour] [... output-file].
  #
  # The first argument selects the flavour and defaults to "64".  Of the
  # remaining arguments, the last one that looks like a file name
  # ("name.ext") is taken as the output file.
  $flavour = shift || "64";
  for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }

  # Redirect STDOUT only when an output file was actually named, so the
  # script can still be used as a filter.  Use the three-argument form so
  # the file name is never parsed for mode characters, and fail loudly:
  # an unchecked failure would leave the requested file unwritten while
  # the assembly goes to the inherited STDOUT instead.
  if (defined $output) {
      open STDOUT, '>', $output or die "can't open $output: $!";
  }
  # Preprocessor preamble emitted ahead of whichever code path is selected
  # below.  Under __KERNEL__ it undefines __riscv_zicfilp, which removes the
  # "lpad 0" landing pads guarded by that macro in every routine.  In CHERI
  # pure-capability mode it defines __riscv_misaligned_fast (unless already
  # defined), which selects the code paths that access input directly
  # instead of realigning it with shifts.
  34. $code.=<<___;
  35. #ifdef __KERNEL__
  36. # ifdef __riscv_zicfilp
  37. # undef __riscv_zicfilp // calls are expected to be direct
  38. # endif
  39. #endif
  40. #if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
  41. # define __riscv_misaligned_fast 1
  42. #endif
  43. ___
  44. if ($flavour =~ /64/) {{{
  45. ######################################################################
  46. # 64-bit code path...
  47. #
  # Register assignment shared by all three 64-bit routines: the four
  # arguments arrive in a0-a3, input words and scratch values use a4-a7
  # and t0-t2.
  48. my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
  49. my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);
  # PUSH/POP macros plus poly1305_init($ctx, $inp):
  #  - zeroes the three 64-bit hash words at $ctx+0, +8, +16;
  #  - returns straight away if the key pointer $inp is zero;
  #  - otherwise loads 16 key bytes (when __riscv_misaligned_fast is not
  #    defined, from the 8-byte-aligned address, shifting the words back
  #    into place), clamps them with the masks 0x0ffffffc0fffffff and
  #    0x0ffffffc0ffffffc, and stores them at $ctx+24 and $ctx+32;
  #  - stores s1 = r1 + (r1 >> 2) at $ctx+40;
  #  - always returns 0 in a0.
  # PUSH/POP expand to csc/clc when pointers are 16 bytes wide, sd/ld
  # otherwise; any __riscv_xlen other than 64 is a preprocessor error.
  50. $code.=<<___;
  51. #if __riscv_xlen == 64
  52. # if __SIZEOF_POINTER__ == 16
  53. # define PUSH csc
  54. # define POP clc
  55. # else
  56. # define PUSH sd
  57. # define POP ld
  58. # endif
  59. #else
  60. # error "unsupported __riscv_xlen"
  61. #endif
  62. .option pic
  63. .text
  64. .globl poly1305_init
  65. .type poly1305_init,\@function
  66. poly1305_init:
  67. #ifdef __riscv_zicfilp
  68. lpad 0
  69. #endif
  70. sd $zero,0($ctx)
  71. sd $zero,8($ctx)
  72. sd $zero,16($ctx)
  73. beqz $inp,.Lno_key
  74. #ifndef __riscv_misaligned_fast
  75. andi $tmp0,$inp,7 # $inp % 8
  76. andi $inp,$inp,-8 # align $inp
  77. slli $tmp0,$tmp0,3 # byte to bit offset
  78. #endif
  79. ld $in0,0($inp)
  80. ld $in1,8($inp)
  81. #ifndef __riscv_misaligned_fast
  82. beqz $tmp0,.Laligned_key
  83. ld $tmp2,16($inp)
  84. neg $tmp1,$tmp0 # implicit &63 in sll
  85. srl $in0,$in0,$tmp0
  86. sll $tmp3,$in1,$tmp1
  87. srl $in1,$in1,$tmp0
  88. sll $tmp2,$tmp2,$tmp1
  89. or $in0,$in0,$tmp3
  90. or $in1,$in1,$tmp2
  91. .Laligned_key:
  92. #endif
  93. li $tmp0,1
  94. slli $tmp0,$tmp0,32 # 0x0000000100000000
  95. addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1
  96. slli $tmp0,$tmp0,28 # 0x0ffffffc10000000
  97. addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff
  98. and $in0,$in0,$tmp0
  99. addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc
  100. and $in1,$in1,$tmp0
  101. sd $in0,24($ctx)
  102. srli $tmp0,$in1,2
  103. sd $in1,32($ctx)
  104. add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2)
  105. sd $tmp0,40($ctx)
  106. .Lno_key:
  107. li $a0,0 # return 0
  108. ret
  109. .size poly1305_init,.-poly1305_init
  110. ___
  111. {
  # 64-bit poly1305_blocks($ctx, $inp, $len, $padbit).
  #
  # Hash words h0-h2 and r0 live in s0-s3, which is why those four
  # registers are saved on entry and restored on exit.  r1 and s1 are kept
  # in t3/t4.  $d0/$d1 reuse the input registers $in0/$in1, and $d2 shares
  # t2 with $tmp4.
  #
  # The routine rounds $len down to a multiple of 16 and returns at once
  # if nothing is left.  Otherwise it loads the hash from $ctx+0..16 and
  # the key material from $ctx+24..40, then for every 16-byte block adds
  # the block and $padbit to the hash and multiplies by the key, and
  # finally stores the three hash words back to $ctx.
  112. my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
  113. ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
  114. my ($shr,$shl) = ($t5,$t6); # shift counts, only used when input is realigned (no __riscv_misaligned_fast)
  115. $code.=<<___;
  116. .globl poly1305_blocks
  117. .type poly1305_blocks,\@function
  118. poly1305_blocks:
  119. #ifdef __riscv_zicfilp
  120. lpad 0
  121. #endif
  122. andi $len,$len,-16 # complete blocks only
  123. beqz $len,.Lno_data
  124. caddi $sp,$sp,-4*__SIZEOF_POINTER__
  125. PUSH $s0,3*__SIZEOF_POINTER__($sp)
  126. PUSH $s1,2*__SIZEOF_POINTER__($sp)
  127. PUSH $s2,1*__SIZEOF_POINTER__($sp)
  128. PUSH $s3,0*__SIZEOF_POINTER__($sp)
  129. #ifndef __riscv_misaligned_fast
  130. andi $shr,$inp,7
  131. andi $inp,$inp,-8 # align $inp
  132. slli $shr,$shr,3 # byte to bit offset
  133. neg $shl,$shr # implicit &63 in sll
  134. #endif
  135. ld $h0,0($ctx) # load hash value
  136. ld $h1,8($ctx)
  137. ld $h2,16($ctx)
  138. ld $r0,24($ctx) # load key
  139. ld $r1,32($ctx)
  140. ld $rs1,40($ctx)
  141. add $len,$len,$inp # end of buffer
  142. .Loop:
  143. ld $in0,0($inp) # load input
  144. ld $in1,8($inp)
  145. #ifndef __riscv_misaligned_fast
  146. beqz $shr,.Laligned_inp
  147. ld $tmp2,16($inp)
  148. srl $in0,$in0,$shr
  149. sll $tmp3,$in1,$shl
  150. srl $in1,$in1,$shr
  151. sll $tmp2,$tmp2,$shl
  152. or $in0,$in0,$tmp3
  153. or $in1,$in1,$tmp2
  154. .Laligned_inp:
  155. #endif
  156. caddi $inp,$inp,16
  157. andi $tmp0,$h2,-4 # modulo-scheduled reduction
  158. srli $tmp1,$h2,2
  159. andi $h2,$h2,3
  160. add $d0,$h0,$in0 # accumulate input
  161. add $tmp1,$tmp1,$tmp0
  162. sltu $tmp0,$d0,$h0
  163. add $d0,$d0,$tmp1 # ... and residue
  164. sltu $tmp1,$d0,$tmp1
  165. add $d1,$h1,$in1
  166. add $tmp0,$tmp0,$tmp1
  167. sltu $tmp1,$d1,$h1
  168. add $d1,$d1,$tmp0
  169. add $d2,$h2,$padbit
  170. sltu $tmp0,$d1,$tmp0
  171. mulhu $h1,$r0,$d0 # h0*r0
  172. mul $h0,$r0,$d0
  173. add $d2,$d2,$tmp1
  174. add $d2,$d2,$tmp0
  175. mulhu $tmp1,$rs1,$d1 # h1*5*r1
  176. mul $tmp0,$rs1,$d1
  177. mulhu $h2,$r1,$d0 # h0*r1
  178. mul $tmp2,$r1,$d0
  179. add $h0,$h0,$tmp0
  180. add $h1,$h1,$tmp1
  181. sltu $tmp0,$h0,$tmp0
  182. add $h1,$h1,$tmp0
  183. add $h1,$h1,$tmp2
  184. mulhu $tmp1,$r0,$d1 # h1*r0
  185. mul $tmp0,$r0,$d1
  186. sltu $tmp2,$h1,$tmp2
  187. add $h2,$h2,$tmp2
  188. mul $tmp2,$rs1,$d2 # h2*5*r1
  189. add $h1,$h1,$tmp0
  190. add $h2,$h2,$tmp1
  191. mul $tmp3,$r0,$d2 # h2*r0
  192. sltu $tmp0,$h1,$tmp0
  193. add $h2,$h2,$tmp0
  194. add $h1,$h1,$tmp2
  195. sltu $tmp2,$h1,$tmp2
  196. add $h2,$h2,$tmp2
  197. add $h2,$h2,$tmp3
  198. bne $inp,$len,.Loop
  199. sd $h0,0($ctx) # store hash value
  200. sd $h1,8($ctx)
  201. sd $h2,16($ctx)
  202. POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue
  203. POP $s1,2*__SIZEOF_POINTER__($sp)
  204. POP $s2,1*__SIZEOF_POINTER__($sp)
  205. POP $s3,0*__SIZEOF_POINTER__($sp)
  206. caddi $sp,$sp,4*__SIZEOF_POINTER__
  207. .Lno_data:
  208. ret
  209. .size poly1305_blocks,.-poly1305_blocks
  210. ___
  211. }
  212. {
  # 64-bit poly1305_emit($ctx, $mac, $nonce).
  #
  # Loads the three hash words from $ctx, folds the bits of the top word
  # above bit 1 back into the low words (final reduction), then adds 5 and
  # uses the resulting carry to select, without branching, between h and
  # h+5.  The 128-bit nonce, read as four 32-bit words, is added and the
  # 16-byte result is written to $mac: as two 64-bit stores when
  # __riscv_misaligned_fast is defined, byte by byte otherwise.
  213. my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
  214. $code.=<<___;
  215. .globl poly1305_emit
  216. .type poly1305_emit,\@function
  217. poly1305_emit:
  218. #ifdef __riscv_zicfilp
  219. lpad 0
  220. #endif
  221. ld $tmp2,16($ctx)
  222. ld $tmp0,0($ctx)
  223. ld $tmp1,8($ctx)
  224. andi $in0,$tmp2,-4 # final reduction
  225. srl $in1,$tmp2,2
  226. andi $tmp2,$tmp2,3
  227. add $in0,$in0,$in1
  228. add $tmp0,$tmp0,$in0
  229. sltu $in1,$tmp0,$in0
  230. addi $in0,$tmp0,5 # compare to modulus
  231. add $tmp1,$tmp1,$in1
  232. sltiu $tmp3,$in0,5
  233. sltu $tmp4,$tmp1,$in1
  234. add $in1,$tmp1,$tmp3
  235. add $tmp2,$tmp2,$tmp4
  236. sltu $tmp3,$in1,$tmp3
  237. add $tmp2,$tmp2,$tmp3
  238. srli $tmp2,$tmp2,2 # see if it carried/borrowed
  239. neg $tmp2,$tmp2
  240. xor $in0,$in0,$tmp0
  241. xor $in1,$in1,$tmp1
  242. and $in0,$in0,$tmp2
  243. and $in1,$in1,$tmp2
  244. xor $in0,$in0,$tmp0
  245. xor $in1,$in1,$tmp1
  246. lwu $tmp0,0($nonce) # load nonce
  247. lwu $tmp1,4($nonce)
  248. lwu $tmp2,8($nonce)
  249. lwu $tmp3,12($nonce)
  250. slli $tmp1,$tmp1,32
  251. slli $tmp3,$tmp3,32
  252. or $tmp0,$tmp0,$tmp1
  253. or $tmp2,$tmp2,$tmp3
  254. add $in0,$in0,$tmp0 # accumulate nonce
  255. add $in1,$in1,$tmp2
  256. sltu $tmp0,$in0,$tmp0
  257. add $in1,$in1,$tmp0
  258. #ifdef __riscv_misaligned_fast
  259. sd $in0,0($mac) # write mac value
  260. sd $in1,8($mac)
  261. #else
  262. srli $tmp0,$in0,8 # write mac value
  263. srli $tmp1,$in0,16
  264. srli $tmp2,$in0,24
  265. sb $in0,0($mac)
  266. srli $tmp3,$in0,32
  267. sb $tmp0,1($mac)
  268. srli $tmp0,$in0,40
  269. sb $tmp1,2($mac)
  270. srli $tmp1,$in0,48
  271. sb $tmp2,3($mac)
  272. srli $tmp2,$in0,56
  273. sb $tmp3,4($mac)
  274. srli $tmp3,$in1,8
  275. sb $tmp0,5($mac)
  276. srli $tmp0,$in1,16
  277. sb $tmp1,6($mac)
  278. srli $tmp1,$in1,24
  279. sb $tmp2,7($mac)
  280. sb $in1,8($mac)
  281. srli $tmp2,$in1,32
  282. sb $tmp3,9($mac)
  283. srli $tmp3,$in1,40
  284. sb $tmp0,10($mac)
  285. srli $tmp0,$in1,48
  286. sb $tmp1,11($mac)
  287. srli $tmp1,$in1,56
  288. sb $tmp2,12($mac)
  289. sb $tmp3,13($mac)
  290. sb $tmp0,14($mac)
  291. sb $tmp1,15($mac)
  292. #endif
  293. ret
  294. .size poly1305_emit,.-poly1305_emit
  295. .string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
  296. ___
  297. }
  298. }}} else {{{
  299. ######################################################################
  300. # 32-bit code path
  301. #
  # Register assignment shared by the 32-bit routines: arguments in a0-a3,
  # four 32-bit input words in a4-a7, scratch values in t0-t3.
  302. my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
  303. my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
  304. ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);
  # Helper macros plus poly1305_init($ctx, $inp) working on 32-bit limbs.
  #
  # This path assembles for either register width.  With __riscv_xlen == 32
  # the *w mnemonics are mapped onto their plain counterparts and MULX uses
  # mulhu/mul.  With __riscv_xlen == 64 MULX zero-extends its last operand
  # in place and splits one 64-bit product into high and low halves.
  #
  # poly1305_init zeroes the five hash words at $ctx+0..16, returns if the
  # key pointer is zero, otherwise loads four key words (realigning them
  # with shifts when __riscv_misaligned_fast is not defined), clamps them
  # with 0x0fffffff / 0x0ffffffc and stores them at $ctx+20..32.  The three
  # values r + (r >> 2) for r1..r3 go to $ctx+36..44.  It returns 0 in a0.
  305. $code.=<<___;
  306. #if __riscv_xlen == 32
  307. # if __SIZEOF_POINTER__ == 8
  308. # define PUSH csc
  309. # define POP clc
  310. # else
  311. # define PUSH sw
  312. # define POP lw
  313. # endif
  314. # define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b
  315. # define srliw srli
  316. # define srlw srl
  317. # define sllw sll
  318. # define addw add
  319. # define addiw addi
  320. # define mulw mul
  321. #elif __riscv_xlen == 64
  322. # if __SIZEOF_POINTER__ == 16
  323. # define PUSH csc
  324. # define POP clc
  325. # else
  326. # define PUSH sd
  327. # define POP ld
  328. # endif
  329. # define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
  330. #else
  331. # error "unsupported __riscv_xlen"
  332. #endif
  333. .option pic
  334. .text
  335. .globl poly1305_init
  336. .type poly1305_init,\@function
  337. poly1305_init:
  338. #ifdef __riscv_zicfilp
  339. lpad 0
  340. #endif
  341. sw $zero,0($ctx)
  342. sw $zero,4($ctx)
  343. sw $zero,8($ctx)
  344. sw $zero,12($ctx)
  345. sw $zero,16($ctx)
  346. beqz $inp,.Lno_key
  347. #ifndef __riscv_misaligned_fast
  348. andi $tmp0,$inp,3 # $inp % 4
  349. sub $inp,$inp,$tmp0 # align $inp
  350. sll $tmp0,$tmp0,3 # byte to bit offset
  351. #endif
  352. lw $in0,0($inp)
  353. lw $in1,4($inp)
  354. lw $in2,8($inp)
  355. lw $in3,12($inp)
  356. #ifndef __riscv_misaligned_fast
  357. beqz $tmp0,.Laligned_key
  358. lw $tmp2,16($inp)
  359. sub $tmp1,$zero,$tmp0
  360. srlw $in0,$in0,$tmp0
  361. sllw $tmp3,$in1,$tmp1
  362. srlw $in1,$in1,$tmp0
  363. or $in0,$in0,$tmp3
  364. sllw $tmp3,$in2,$tmp1
  365. srlw $in2,$in2,$tmp0
  366. or $in1,$in1,$tmp3
  367. sllw $tmp3,$in3,$tmp1
  368. srlw $in3,$in3,$tmp0
  369. or $in2,$in2,$tmp3
  370. sllw $tmp2,$tmp2,$tmp1
  371. or $in3,$in3,$tmp2
  372. .Laligned_key:
  373. #endif
  374. lui $tmp0,0x10000
  375. addi $tmp0,$tmp0,-1 # 0x0fffffff
  376. and $in0,$in0,$tmp0
  377. addi $tmp0,$tmp0,-3 # 0x0ffffffc
  378. and $in1,$in1,$tmp0
  379. and $in2,$in2,$tmp0
  380. and $in3,$in3,$tmp0
  381. sw $in0,20($ctx)
  382. sw $in1,24($ctx)
  383. sw $in2,28($ctx)
  384. sw $in3,32($ctx)
  385. srlw $tmp1,$in1,2
  386. srlw $tmp2,$in2,2
  387. srlw $tmp3,$in3,2
  388. addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
  389. addw $in2,$in2,$tmp2
  390. addw $in3,$in3,$tmp3
  391. sw $in1,36($ctx)
  392. sw $in2,40($ctx)
  393. sw $in3,44($ctx)
  394. .Lno_key:
  395. li $a0,0
  396. ret
  397. .size poly1305_init,.-poly1305_init
  398. ___
  399. {
  # 32-bit-limb poly1305_blocks($ctx, $inp, $len, $padbit).
  #
  # Hash words h0-h4 and key words r0-r3 occupy s0-s8, so those registers
  # (together with ra, which is borrowed as a scratch register, see $shr)
  # are saved on entry and restored on exit.  s1..s3 are held in t0-t2, so
  # they shadow the $tmp0-$tmp2 names of the enclosing scope.  $d0-$d3
  # reuse a4-a7.
  #
  # Note that a3 ($padbit) is also used as a temporary inside the loop
  # body; it is reloaded with 1 at the bottom of the loop before the next
  # iteration.
  400. my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
  401. ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
  402. my ($d0,$d1,$d2,$d3) =
  403. ($a4,$a5,$a6,$a7);
  404. my $shr = $ra; # shift count for realigning input; borrows ra, which is saved and restored below
  405. $code.=<<___;
  406. .globl poly1305_blocks
  407. .type poly1305_blocks,\@function
  408. poly1305_blocks:
  409. #ifdef __riscv_zicfilp
  410. lpad 0
  411. #endif
  412. andi $len,$len,-16 # complete blocks only
  413. beqz $len,.Labort
  414. #ifdef __riscv_zcmp
  415. cm.push {ra,s0-s8}, -48
  416. #else
  417. caddi $sp,$sp,-__SIZEOF_POINTER__*12
  418. PUSH $ra, __SIZEOF_POINTER__*11($sp)
  419. PUSH $s0, __SIZEOF_POINTER__*10($sp)
  420. PUSH $s1, __SIZEOF_POINTER__*9($sp)
  421. PUSH $s2, __SIZEOF_POINTER__*8($sp)
  422. PUSH $s3, __SIZEOF_POINTER__*7($sp)
  423. PUSH $s4, __SIZEOF_POINTER__*6($sp)
  424. PUSH $s5, __SIZEOF_POINTER__*5($sp)
  425. PUSH $s6, __SIZEOF_POINTER__*4($sp)
  426. PUSH $s7, __SIZEOF_POINTER__*3($sp)
  427. PUSH $s8, __SIZEOF_POINTER__*2($sp)
  428. #endif
  429. #ifndef __riscv_misaligned_fast
  430. andi $shr,$inp,3
  431. andi $inp,$inp,-4 # align $inp
  432. slli $shr,$shr,3 # byte to bit offset
  433. #endif
  434. lw $h0,0($ctx) # load hash value
  435. lw $h1,4($ctx)
  436. lw $h2,8($ctx)
  437. lw $h3,12($ctx)
  438. lw $h4,16($ctx)
  439. lw $r0,20($ctx) # load key
  440. lw $r1,24($ctx)
  441. lw $r2,28($ctx)
  442. lw $r3,32($ctx)
  443. lw $rs1,36($ctx)
  444. lw $rs2,40($ctx)
  445. lw $rs3,44($ctx)
  446. add $len,$len,$inp # end of buffer
  447. .Loop:
  448. lw $d0,0($inp) # load input
  449. lw $d1,4($inp)
  450. lw $d2,8($inp)
  451. lw $d3,12($inp)
  452. #ifndef __riscv_misaligned_fast
  453. beqz $shr,.Laligned_inp
  454. lw $t4,16($inp)
  455. sub $t5,$zero,$shr
  456. srlw $d0,$d0,$shr
  457. sllw $t3,$d1,$t5
  458. srlw $d1,$d1,$shr
  459. or $d0,$d0,$t3
  460. sllw $t3,$d2,$t5
  461. srlw $d2,$d2,$shr
  462. or $d1,$d1,$t3
  463. sllw $t3,$d3,$t5
  464. srlw $d3,$d3,$shr
  465. or $d2,$d2,$t3
  466. sllw $t4,$t4,$t5
  467. or $d3,$d3,$t4
  468. .Laligned_inp:
  469. #endif
  470. srliw $t3,$h4,2 # modulo-scheduled reduction
  471. andi $t4,$h4,-4
  472. andi $h4,$h4,3
  473. addw $d0,$d0,$h0 # accumulate input
  474. addw $t4,$t4,$t3
  475. sltu $h0,$d0,$h0
  476. addw $d0,$d0,$t4 # ... and residue
  477. sltu $t4,$d0,$t4
  478. addw $d1,$d1,$h1
  479. addw $h0,$h0,$t4 # carry
  480. sltu $h1,$d1,$h1
  481. addw $d1,$d1,$h0
  482. sltu $h0,$d1,$h0
  483. addw $d2,$d2,$h2
  484. addw $h1,$h1,$h0 # carry
  485. sltu $h2,$d2,$h2
  486. addw $d2,$d2,$h1
  487. sltu $h1,$d2,$h1
  488. addw $d3,$d3,$h3
  489. addw $h2,$h2,$h1 # carry
  490. sltu $h3,$d3,$h3
  491. addw $d3,$d3,$h2
  492. MULX ($h1,$h0,$r0,$d0) # d0*r0
  493. sltu $h2,$d3,$h2
  494. addw $h3,$h3,$h2 # carry
  495. MULX ($t4,$t3,$rs3,$d1) # d1*s3
  496. addw $h4,$h4,$padbit
  497. caddi $inp,$inp,16
  498. addw $h4,$h4,$h3
  499. MULX ($t6,$a3,$rs2,$d2) # d2*s2
  500. addw $h0,$h0,$t3
  501. addw $h1,$h1,$t4
  502. sltu $t3,$h0,$t3
  503. addw $h1,$h1,$t3
  504. MULX ($t4,$t3,$rs1,$d3) # d3*s1
  505. addw $h0,$h0,$a3
  506. addw $h1,$h1,$t6
  507. sltu $a3,$h0,$a3
  508. addw $h1,$h1,$a3
  509. MULX ($h2,$a3,$r1,$d0) # d0*r1
  510. addw $h0,$h0,$t3
  511. addw $h1,$h1,$t4
  512. sltu $t3,$h0,$t3
  513. addw $h1,$h1,$t3
  514. MULX ($t4,$t3,$r0,$d1) # d1*r0
  515. addw $h1,$h1,$a3
  516. sltu $a3,$h1,$a3
  517. addw $h2,$h2,$a3
  518. MULX ($t6,$a3,$rs3,$d2) # d2*s3
  519. addw $h1,$h1,$t3
  520. addw $h2,$h2,$t4
  521. sltu $t3,$h1,$t3
  522. addw $h2,$h2,$t3
  523. MULX ($t4,$t3,$rs2,$d3) # d3*s2
  524. addw $h1,$h1,$a3
  525. addw $h2,$h2,$t6
  526. sltu $a3,$h1,$a3
  527. addw $h2,$h2,$a3
  528. mulw $a3,$rs1,$h4 # h4*s1
  529. addw $h1,$h1,$t3
  530. addw $h2,$h2,$t4
  531. sltu $t3,$h1,$t3
  532. addw $h2,$h2,$t3
  533. MULX ($h3,$t3,$r2,$d0) # d0*r2
  534. addw $h1,$h1,$a3
  535. sltu $a3,$h1,$a3
  536. addw $h2,$h2,$a3
  537. MULX ($t6,$a3,$r1,$d1) # d1*r1
  538. addw $h2,$h2,$t3
  539. sltu $t3,$h2,$t3
  540. addw $h3,$h3,$t3
  541. MULX ($t4,$t3,$r0,$d2) # d2*r0
  542. addw $h2,$h2,$a3
  543. addw $h3,$h3,$t6
  544. sltu $a3,$h2,$a3
  545. addw $h3,$h3,$a3
  546. MULX ($t6,$a3,$rs3,$d3) # d3*s3
  547. addw $h2,$h2,$t3
  548. addw $h3,$h3,$t4
  549. sltu $t3,$h2,$t3
  550. addw $h3,$h3,$t3
  551. mulw $t3,$rs2,$h4 # h4*s2
  552. addw $h2,$h2,$a3
  553. addw $h3,$h3,$t6
  554. sltu $a3,$h2,$a3
  555. addw $h3,$h3,$a3
  556. MULX ($t6,$a3,$r3,$d0) # d0*r3
  557. addw $h2,$h2,$t3
  558. sltu $t3,$h2,$t3
  559. addw $h3,$h3,$t3
  560. MULX ($t4,$t3,$r2,$d1) # d1*r2
  561. addw $h3,$h3,$a3
  562. sltu $a3,$h3,$a3
  563. addw $t6,$t6,$a3
  564. MULX ($a3,$d3,$r0,$d3) # d3*r0
  565. addw $h3,$h3,$t3
  566. addw $t6,$t6,$t4
  567. sltu $t3,$h3,$t3
  568. addw $t6,$t6,$t3
  569. MULX ($t4,$t3,$r1,$d2) # d2*r1
  570. addw $h3,$h3,$d3
  571. addw $t6,$t6,$a3
  572. sltu $d3,$h3,$d3
  573. addw $t6,$t6,$d3
  574. mulw $a3,$rs3,$h4 # h4*s3
  575. addw $h3,$h3,$t3
  576. addw $t6,$t6,$t4
  577. sltu $t3,$h3,$t3
  578. addw $t6,$t6,$t3
  579. mulw $h4,$r0,$h4 # h4*r0
  580. addw $h3,$h3,$a3
  581. sltu $a3,$h3,$a3
  582. addw $t6,$t6,$a3
  583. addw $h4,$t6,$h4
  584. li $padbit,1 # if we loop, padbit is 1
  585. bne $inp,$len,.Loop
  586. sw $h0,0($ctx) # store hash value
  587. sw $h1,4($ctx)
  588. sw $h2,8($ctx)
  589. sw $h3,12($ctx)
  590. sw $h4,16($ctx)
  591. #ifdef __riscv_zcmp
  592. cm.popret {ra,s0-s8}, 48
  593. #else
  594. POP $ra, __SIZEOF_POINTER__*11($sp)
  595. POP $s0, __SIZEOF_POINTER__*10($sp)
  596. POP $s1, __SIZEOF_POINTER__*9($sp)
  597. POP $s2, __SIZEOF_POINTER__*8($sp)
  598. POP $s3, __SIZEOF_POINTER__*7($sp)
  599. POP $s4, __SIZEOF_POINTER__*6($sp)
  600. POP $s5, __SIZEOF_POINTER__*5($sp)
  601. POP $s6, __SIZEOF_POINTER__*4($sp)
  602. POP $s7, __SIZEOF_POINTER__*3($sp)
  603. POP $s8, __SIZEOF_POINTER__*2($sp)
  604. caddi $sp,$sp,__SIZEOF_POINTER__*12
  605. #endif
  606. .Labort:
  607. ret
  608. .size poly1305_blocks,.-poly1305_blocks
  609. ___
  610. }
  611. {
  # 32-bit-limb poly1305_emit($ctx, $mac, $nonce).
  #
  # $tmp4 is a3.  Once the five hash words have been loaded, the $ctx
  # register (a0) is no longer needed as a pointer and is reused as the
  # carry/scratch register for the rest of the routine.
  #
  # The routine folds the upper bits of the top hash word back into the
  # low words, adds 5 and uses the resulting carry to select between h and
  # h+5 without branching, adds the four nonce words with carry
  # propagation, and writes the 16-byte result to $mac: as four word
  # stores when __riscv_misaligned_fast is defined, byte by byte otherwise.
  612. my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
  613. $code.=<<___;
  614. .globl poly1305_emit
  615. .type poly1305_emit,\@function
  616. poly1305_emit:
  617. #ifdef __riscv_zicfilp
  618. lpad 0
  619. #endif
  620. lw $tmp4,16($ctx)
  621. lw $tmp0,0($ctx)
  622. lw $tmp1,4($ctx)
  623. lw $tmp2,8($ctx)
  624. lw $tmp3,12($ctx)
  625. srliw $ctx,$tmp4,2 # final reduction
  626. andi $in0,$tmp4,-4
  627. andi $tmp4,$tmp4,3
  628. addw $ctx,$ctx,$in0
  629. addw $tmp0,$tmp0,$ctx
  630. sltu $ctx,$tmp0,$ctx
  631. addiw $in0,$tmp0,5 # compare to modulus
  632. addw $tmp1,$tmp1,$ctx
  633. sltiu $in1,$in0,5
  634. sltu $ctx,$tmp1,$ctx
  635. addw $in1,$in1,$tmp1
  636. addw $tmp2,$tmp2,$ctx
  637. sltu $in2,$in1,$tmp1
  638. sltu $ctx,$tmp2,$ctx
  639. addw $in2,$in2,$tmp2
  640. addw $tmp3,$tmp3,$ctx
  641. sltu $in3,$in2,$tmp2
  642. sltu $ctx,$tmp3,$ctx
  643. addw $in3,$in3,$tmp3
  644. addw $tmp4,$tmp4,$ctx
  645. sltu $ctx,$in3,$tmp3
  646. addw $ctx,$ctx,$tmp4
  647. srl $ctx,$ctx,2 # see if it carried/borrowed
  648. sub $ctx,$zero,$ctx
  649. xor $in0,$in0,$tmp0
  650. xor $in1,$in1,$tmp1
  651. xor $in2,$in2,$tmp2
  652. xor $in3,$in3,$tmp3
  653. and $in0,$in0,$ctx
  654. and $in1,$in1,$ctx
  655. and $in2,$in2,$ctx
  656. and $in3,$in3,$ctx
  657. xor $in0,$in0,$tmp0
  658. xor $in1,$in1,$tmp1
  659. xor $in2,$in2,$tmp2
  660. xor $in3,$in3,$tmp3
  661. lw $tmp0,0($nonce) # load nonce
  662. lw $tmp1,4($nonce)
  663. lw $tmp2,8($nonce)
  664. lw $tmp3,12($nonce)
  665. addw $in0,$in0,$tmp0 # accumulate nonce
  666. sltu $ctx,$in0,$tmp0
  667. addw $in1,$in1,$tmp1
  668. sltu $tmp1,$in1,$tmp1
  669. addw $in1,$in1,$ctx
  670. sltu $ctx,$in1,$ctx
  671. addw $ctx,$ctx,$tmp1
  672. addw $in2,$in2,$tmp2
  673. sltu $tmp2,$in2,$tmp2
  674. addw $in2,$in2,$ctx
  675. sltu $ctx,$in2,$ctx
  676. addw $ctx,$ctx,$tmp2
  677. addw $in3,$in3,$tmp3
  678. addw $in3,$in3,$ctx
  679. #ifdef __riscv_misaligned_fast
  680. sw $in0,0($mac) # write mac value
  681. sw $in1,4($mac)
  682. sw $in2,8($mac)
  683. sw $in3,12($mac)
  684. #else
  685. srl $tmp0,$in0,8 # write mac value
  686. srl $tmp1,$in0,16
  687. srl $tmp2,$in0,24
  688. sb $in0, 0($mac)
  689. sb $tmp0,1($mac)
  690. srl $tmp0,$in1,8
  691. sb $tmp1,2($mac)
  692. srl $tmp1,$in1,16
  693. sb $tmp2,3($mac)
  694. srl $tmp2,$in1,24
  695. sb $in1, 4($mac)
  696. sb $tmp0,5($mac)
  697. srl $tmp0,$in2,8
  698. sb $tmp1,6($mac)
  699. srl $tmp1,$in2,16
  700. sb $tmp2,7($mac)
  701. srl $tmp2,$in2,24
  702. sb $in2, 8($mac)
  703. sb $tmp0,9($mac)
  704. srl $tmp0,$in3,8
  705. sb $tmp1,10($mac)
  706. srl $tmp1,$in3,16
  707. sb $tmp2,11($mac)
  708. srl $tmp2,$in3,24
  709. sb $in3, 12($mac)
  710. sb $tmp0,13($mac)
  711. sb $tmp1,14($mac)
  712. sb $tmp2,15($mac)
  713. #endif
  714. ret
  715. .size poly1305_emit,.-poly1305_emit
  716. .string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
  717. ___
  718. }
  719. }}}
  # Post-process the accumulated template one line at a time and print it.
  #
  # For a flavour starting with "cheri" the integer-register code is
  # rewritten to use capability registers and instructions:
  #   - a memory operand "(xN)" becomes "(cN)", and when that happened the
  #     first load/store mnemonic on the line gets a "c" prefix;
  #   - the register operand of PUSH/POP becomes cN; on lines without
  #     PUSH/POP, ret/jal become cret/cjal;
  #   - caddi/cadd becomes cincoffset, with every register that is followed
  #     by a comma turned into cN; otherwise, on lines containing cmove,
  #     every xN becomes cN.
  # For every other flavour the pseudo-mnemonics collapse to plain integer
  # instructions: caddi/cadd -> add, cmove -> mv.
  my $is_cheri = ($flavour =~ /^cheri/);    # loop-invariant, so test it once

  foreach (split("\n", $code)) {
      if ($is_cheri) {
          s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
          s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
          s/\b(ret|jal)\b/c$1/;
          s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
          m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
      } else {
          s/\bcaddi?\b/add/ or
          s/\bcmove\b/mv/;
      }
      print $_, "\n";
  }

  # Output is buffered, so a failed write (full disk, closed pipe, ...) may
  # only be reported when the handle is closed.  Check it, so a truncated
  # assembly file cannot be mistaken for a successful run.
  close STDOUT or die "error closing STDOUT: $!";