poly1305-armv4.pl 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235
  1. #!/usr/bin/env perl
  2. # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
  3. #
  4. # ====================================================================
  5. # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
  6. # project.
  7. # ====================================================================
  8. #
  9. # IALU(*)/gcc-4.4 NEON
  10. #
  11. # ARM11xx(ARMv6) 7.78/+100% -
  12. # Cortex-A5 6.35/+130% 3.00
  13. # Cortex-A8 6.25/+115% 2.36
  14. # Cortex-A9 5.10/+95% 2.55
  15. # Cortex-A15 3.85/+85% 1.25(**)
  16. # Snapdragon S4 5.70/+100% 1.48(**)
  17. #
  18. # (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
  19. # (**) these are trade-off results, they can be improved by ~8% but at
  20. # the cost of 15/12% regression on Cortex-A5/A7, it's even possible
  21. # to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
  # Command line: [flavour] [ignored args ...] [output]
  #
  #   * If the first argument looks like a file name (contains ".ext") it is
  #     taken as the output file and no flavour is set.
  #   * Otherwise the first argument is the flavour and the remaining
  #     arguments are consumed until one looks like a file name (or they run
  #     out, in which case $output ends up false).
  #
  # With a real flavour (anything but "void") the generated text is piped
  # through arm-xlate.pl; otherwise it is written straight to $output.
  $flavour = shift;
  if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

  if ($flavour && $flavour ne "void") {
      # Directory part of this script's path; empty when $0 has none.
      # Testing the match explicitly avoids reading a stale or undefined $1.
      $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

      # Look for the translator next to this script, then in ../../perlasm.
      ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
      ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
      die "can't locate arm-xlate.pl";

      # Quote the paths so directories containing blanks survive the shell.
      # The output argument is appended only when one was actually given,
      # so the translator sees the same argument count as before.
      my $cmd = "\"$^X\" \"$xlate\" $flavour";
      $cmd .= " \"$output\"" if defined $output && length $output;

      open STDOUT,"| $cmd" or die "can't call $xlate: $!";
  } else {
      # No output name (e.g. "perl script > file"): keep the inherited STDOUT.
      # A name that cannot be opened is now a hard error instead of being
      # silently ignored.
      if (defined $output && length $output) {
          open STDOUT,'>',$output or die "can't open $output: $!";
      }
  }
  # Argument registers shared by the scalar entry points:
  #   $ctx = r0, $inp = r1, $len = r2, $padbit = r3.
  34. ($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
  # poly1305_init -- emitted assembly for context/key setup.
  #
  #   r0 ($ctx)  context block
  #   r1 ($inp)  16-byte key, may be NULL
  #   r2         function table, written only in non-kernel ARMv7+ builds
  #
  # Under __KERNEL__ the #defines rename the entry points to
  # poly1305_block_init / poly1305_blocks_arm and derive both
  # __ARM_ARCH__ and __ARM_MAX_ARCH__ from __LINUX_ARM_ARCH__.
  #
  # What the emitted code does, in order:
  #   * zeroes the five hash words at ctx+0..16 and the is_base2_26 word
  #     at ctx+36, then advances $ctx by 20 so the key lands at ctx+20..32;
  #   * NULL key: returns 0 through .Lno_key without touching the key area;
  #   * __ARM_MAX_ARCH__>=7: stores -1 at [$ctx,#28] (ctx+48), the
  #     "impossible key power value" that poly1305_init_neon tests for;
  #   * builds four little-endian words from the key bytes with ldrb/orr and
  #     clamps them: word 0 with 0x0fffffff, words 1-3 with 0x0ffffffc;
  #   * non-kernel ARMv7+ builds: tests ARMV7_NEON in OPENSSL_armcap_P,
  #     stores {blocks or blocks_neon, emit} addresses at [r2] and returns 1;
  #     every other build returns 0.
  #
  # r4-r10 are reused across the byte loads and the capability lookup is
  # interleaved with them, so the instruction order below must be kept.
  35. $code.=<<___;
  36. #ifndef __KERNEL__
  37. # include "arm_arch.h"
  38. #else
  39. # define __ARM_ARCH__ __LINUX_ARM_ARCH__
  40. # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
  41. # define poly1305_init poly1305_block_init
  42. # define poly1305_blocks poly1305_blocks_arm
  43. #endif
  44. #if defined(__thumb2__)
  45. .syntax unified
  46. .thumb
  47. #else
  48. .code 32
  49. #endif
  50. .text
  51. .globl poly1305_emit
  52. .globl poly1305_blocks
  53. .globl poly1305_init
  54. .type poly1305_init,%function
  55. .align 5
  56. poly1305_init:
  57. .Lpoly1305_init:
  58. stmdb sp!,{r4-r11}
  59. eor r3,r3,r3
  60. cmp $inp,#0
  61. str r3,[$ctx,#0] @ zero hash value
  62. str r3,[$ctx,#4]
  63. str r3,[$ctx,#8]
  64. str r3,[$ctx,#12]
  65. str r3,[$ctx,#16]
  66. str r3,[$ctx,#36] @ clear is_base2_26
  67. add $ctx,$ctx,#20
  68. #ifdef __thumb2__
  69. it eq
  70. #endif
  71. moveq r0,#0
  72. beq .Lno_key
  73. #if __ARM_MAX_ARCH__>=7
  74. mov r3,#-1
  75. str r3,[$ctx,#28] @ impossible key power value
  76. # ifndef __KERNEL__
  77. adr r11,.Lpoly1305_init
  78. ldr r12,.LOPENSSL_armcap
  79. # endif
  80. #endif
  81. ldrb r4,[$inp,#0]
  82. mov r10,#0x0fffffff
  83. ldrb r5,[$inp,#1]
  84. and r3,r10,#-4 @ 0x0ffffffc
  85. ldrb r6,[$inp,#2]
  86. ldrb r7,[$inp,#3]
  87. orr r4,r4,r5,lsl#8
  88. ldrb r5,[$inp,#4]
  89. orr r4,r4,r6,lsl#16
  90. ldrb r6,[$inp,#5]
  91. orr r4,r4,r7,lsl#24
  92. ldrb r7,[$inp,#6]
  93. and r4,r4,r10
  94. #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
  95. # if !defined(_WIN32)
  96. ldr r12,[r11,r12] @ OPENSSL_armcap_P
  97. # endif
  98. # if defined(__APPLE__) || defined(_WIN32)
  99. ldr r12,[r12]
  100. # endif
  101. #endif
  102. ldrb r8,[$inp,#7]
  103. orr r5,r5,r6,lsl#8
  104. ldrb r6,[$inp,#8]
  105. orr r5,r5,r7,lsl#16
  106. ldrb r7,[$inp,#9]
  107. orr r5,r5,r8,lsl#24
  108. ldrb r8,[$inp,#10]
  109. and r5,r5,r3
  110. #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
  111. tst r12,#ARMV7_NEON @ check for NEON
  112. # ifdef __thumb2__
  113. adr r9,.Lpoly1305_blocks_neon
  114. adr r11,.Lpoly1305_blocks
  115. it ne
  116. movne r11,r9
  117. adr r12,.Lpoly1305_emit
  118. orr r11,r11,#1 @ thumb-ify addresses
  119. orr r12,r12,#1
  120. # else
  121. add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
  122. ite eq
  123. addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
  124. addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
  125. # endif
  126. #endif
  127. ldrb r9,[$inp,#11]
  128. orr r6,r6,r7,lsl#8
  129. ldrb r7,[$inp,#12]
  130. orr r6,r6,r8,lsl#16
  131. ldrb r8,[$inp,#13]
  132. orr r6,r6,r9,lsl#24
  133. ldrb r9,[$inp,#14]
  134. and r6,r6,r3
  135. ldrb r10,[$inp,#15]
  136. orr r7,r7,r8,lsl#8
  137. str r4,[$ctx,#0]
  138. orr r7,r7,r9,lsl#16
  139. str r5,[$ctx,#4]
  140. orr r7,r7,r10,lsl#24
  141. str r6,[$ctx,#8]
  142. and r7,r7,r3
  143. str r7,[$ctx,#12]
  144. #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
  145. stmia r2,{r11,r12} @ fill functions table
  146. mov r0,#1
  147. #else
  148. mov r0,#0
  149. #endif
  150. .Lno_key:
  151. ldmia sp!,{r4-r11}
  152. #if __ARM_ARCH__>=5
  153. ret @ bx lr
  154. #else
  155. tst lr,#1
  156. moveq pc,lr @ be binary compatible with V4, yet
  157. bx lr @ interoperable with Thumb ISA:-)
  158. #endif
  159. .size poly1305_init,.-poly1305_init
  160. ___
  # poly1305_blocks -- emitted scalar (integer-only) block function.
  # Arguments use the file-level aliases: $ctx=r0, $inp=r1, $len=r2, $padbit=r3.
  161. {
  # Register aliases: $h0-$h4 = r4-r8 (hash), $r0-$r3 = r9-r12 (key).
  162. my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
  # $s1-$s3 share registers with $r1-$r3: inside the loop each is overwritten
  # with r[i] + (r[i]>>2), which is why $r1-$r3 are spilled to [sp,#20..28]
  # before the loop and reloaded part-way through every iteration.
  163. my ($s1,$s2,$s3)=($r1,$r2,$r3);
  # What the emitted code does:
  #   * rounds $len down to a multiple of 16 and returns at once if that is 0;
  #   * reserves 32 bytes of stack scratch:
  #       [sp,#0],[sp,#4]  future h0/h1      [sp,#8]   input pointer
  #       [sp,#12]         ctx+20            [sp,#16]  end pointer
  #       [sp,#20..28]     key words r1..r3
  #   * __ARM_ARCH__<7: loads hash and key with a single ldmia;
  #     otherwise: loads the hash, computes its base 2^26 -> base 2^32
  #     conversion, selects the converted value when the is_base2_26 word
  #     (ctx+36) is non-zero, and clears that word;
  #   * per 16-byte block: adds the input to h (byte loads before ARMv7,
  #     word loads with rev under __ARMEB__ from ARMv7 on), multiplies by
  #     the key with umull/umlal, then folds 5*(h4>>2) back into h0 and
  #     keeps only the low two bits of h4;
  #   * the conditional "addhi $h4,$h4,#1" at the top of .Loop consumes the
  #     flags left by "cmp $padbit,#0" before the loop and by "cmp r0,lr"
  #     at the bottom of the previous iteration;
  #   * stores h0-h4 back to ctx+0..16 with stmdb from the saved ctx+20.
  164. $code.=<<___;
  165. .type poly1305_blocks,%function
  166. .align 5
  167. poly1305_blocks:
  168. .Lpoly1305_blocks:
  169. stmdb sp!,{r3-r11,lr}
  170. ands $len,$len,#-16
  171. beq .Lno_data
  172. add $len,$len,$inp @ end pointer
  173. sub sp,sp,#32
  174. #if __ARM_ARCH__<7
  175. ldmia $ctx,{$h0-$r3} @ load context
  176. add $ctx,$ctx,#20
  177. str $len,[sp,#16] @ offload stuff
  178. str $ctx,[sp,#12]
  179. #else
  180. ldr lr,[$ctx,#36] @ is_base2_26
  181. ldmia $ctx!,{$h0-$h4} @ load hash value
  182. str $len,[sp,#16] @ offload stuff
  183. str $ctx,[sp,#12]
  184. adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
  185. mov $r1,$h1,lsr#6
  186. adcs $r1,$r1,$h2,lsl#20
  187. mov $r2,$h2,lsr#12
  188. adcs $r2,$r2,$h3,lsl#14
  189. mov $r3,$h3,lsr#18
  190. adcs $r3,$r3,$h4,lsl#8
  191. mov $len,#0
  192. teq lr,#0
  193. str $len,[$ctx,#16] @ clear is_base2_26
  194. adc $len,$len,$h4,lsr#24
  195. itttt ne
  196. movne $h0,$r0 @ choose between radixes
  197. movne $h1,$r1
  198. movne $h2,$r2
  199. movne $h3,$r3
  200. ldmia $ctx,{$r0-$r3} @ load key
  201. it ne
  202. movne $h4,$len
  203. #endif
  204. mov lr,$inp
  205. cmp $padbit,#0
  206. str $r1,[sp,#20]
  207. str $r2,[sp,#24]
  208. str $r3,[sp,#28]
  209. b .Loop
  210. .align 4
  211. .Loop:
  212. #if __ARM_ARCH__<7
  213. ldrb r0,[lr],#16 @ load input
  214. # ifdef __thumb2__
  215. it hi
  216. # endif
  217. addhi $h4,$h4,#1 @ 1<<128
  218. ldrb r1,[lr,#-15]
  219. ldrb r2,[lr,#-14]
  220. ldrb r3,[lr,#-13]
  221. orr r1,r0,r1,lsl#8
  222. ldrb r0,[lr,#-12]
  223. orr r2,r1,r2,lsl#16
  224. ldrb r1,[lr,#-11]
  225. orr r3,r2,r3,lsl#24
  226. ldrb r2,[lr,#-10]
  227. adds $h0,$h0,r3 @ accumulate input
  228. ldrb r3,[lr,#-9]
  229. orr r1,r0,r1,lsl#8
  230. ldrb r0,[lr,#-8]
  231. orr r2,r1,r2,lsl#16
  232. ldrb r1,[lr,#-7]
  233. orr r3,r2,r3,lsl#24
  234. ldrb r2,[lr,#-6]
  235. adcs $h1,$h1,r3
  236. ldrb r3,[lr,#-5]
  237. orr r1,r0,r1,lsl#8
  238. ldrb r0,[lr,#-4]
  239. orr r2,r1,r2,lsl#16
  240. ldrb r1,[lr,#-3]
  241. orr r3,r2,r3,lsl#24
  242. ldrb r2,[lr,#-2]
  243. adcs $h2,$h2,r3
  244. ldrb r3,[lr,#-1]
  245. orr r1,r0,r1,lsl#8
  246. str lr,[sp,#8] @ offload input pointer
  247. orr r2,r1,r2,lsl#16
  248. add $s1,$r1,$r1,lsr#2
  249. orr r3,r2,r3,lsl#24
  250. #else
  251. ldr r0,[lr],#16 @ load input
  252. it hi
  253. addhi $h4,$h4,#1 @ padbit
  254. ldr r1,[lr,#-12]
  255. ldr r2,[lr,#-8]
  256. ldr r3,[lr,#-4]
  257. # ifdef __ARMEB__
  258. rev r0,r0
  259. rev r1,r1
  260. rev r2,r2
  261. rev r3,r3
  262. # endif
  263. adds $h0,$h0,r0 @ accumulate input
  264. str lr,[sp,#8] @ offload input pointer
  265. adcs $h1,$h1,r1
  266. add $s1,$r1,$r1,lsr#2
  267. adcs $h2,$h2,r2
  268. #endif
  269. add $s2,$r2,$r2,lsr#2
  270. adcs $h3,$h3,r3
  271. add $s3,$r3,$r3,lsr#2
  272. umull r2,r3,$h1,$r0
  273. adc $h4,$h4,#0
  274. umull r0,r1,$h0,$r0
  275. umlal r2,r3,$h4,$s1
  276. umlal r0,r1,$h3,$s1
  277. ldr $r1,[sp,#20] @ reload $r1
  278. umlal r2,r3,$h2,$s3
  279. umlal r0,r1,$h1,$s3
  280. umlal r2,r3,$h3,$s2
  281. umlal r0,r1,$h2,$s2
  282. umlal r2,r3,$h0,$r1
  283. str r0,[sp,#0] @ future $h0
  284. mul r0,$s2,$h4
  285. ldr $r2,[sp,#24] @ reload $r2
  286. adds r2,r2,r1 @ d1+=d0>>32
  287. eor r1,r1,r1
  288. adc lr,r3,#0 @ future $h2
  289. str r2,[sp,#4] @ future $h1
  290. mul r2,$s3,$h4
  291. eor r3,r3,r3
  292. umlal r0,r1,$h3,$s3
  293. ldr $r3,[sp,#28] @ reload $r3
  294. umlal r2,r3,$h3,$r0
  295. umlal r0,r1,$h2,$r0
  296. umlal r2,r3,$h2,$r1
  297. umlal r0,r1,$h1,$r1
  298. umlal r2,r3,$h1,$r2
  299. umlal r0,r1,$h0,$r2
  300. umlal r2,r3,$h0,$r3
  301. ldr $h0,[sp,#0]
  302. mul $h4,$r0,$h4
  303. ldr $h1,[sp,#4]
  304. adds $h2,lr,r0 @ d2+=d1>>32
  305. ldr lr,[sp,#8] @ reload input pointer
  306. adc r1,r1,#0
  307. adds $h3,r2,r1 @ d3+=d2>>32
  308. ldr r0,[sp,#16] @ reload end pointer
  309. adc r3,r3,#0
  310. add $h4,$h4,r3 @ h4+=d3>>32
  311. and r1,$h4,#-4
  312. and $h4,$h4,#3
  313. add r1,r1,r1,lsr#2 @ *=5
  314. adds $h0,$h0,r1
  315. adcs $h1,$h1,#0
  316. adcs $h2,$h2,#0
  317. adcs $h3,$h3,#0
  318. adc $h4,$h4,#0
  319. cmp r0,lr @ done yet?
  320. bhi .Loop
  321. ldr $ctx,[sp,#12]
  322. add sp,sp,#32
  323. stmdb $ctx,{$h0-$h4} @ store the result
  324. .Lno_data:
  325. #if __ARM_ARCH__>=5
  326. ldmia sp!,{r3-r11,pc}
  327. #else
  328. ldmia sp!,{r3-r11,lr}
  329. tst lr,#1
  330. moveq pc,lr @ be binary compatible with V4, yet
  331. bx lr @ interoperable with Thumb ISA:-)
  332. #endif
  333. .size poly1305_blocks,.-poly1305_blocks
  334. ___
  335. }
  336. {
  # poly1305_emit -- emitted assembly that finalises the hash and writes
  # the 16-byte result.
  # Arguments: $ctx = r0 (context), $mac = r1 (output), $nonce = r2.
  337. my ($ctx,$mac,$nonce)=map("r$_",(0..2));
  # Working registers: $h0-$h4 = r3-r7 (hash), $g0-$g3 = r8-r11 (scratch).
  338. my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
  # $g4 reuses r0: $ctx is not read again after the hash words (and, on
  # ARMv7+, the is_base2_26 word) have been loaded.
  339. my $g4=$ctx;
  # What the emitted code does:
  #   * loads h0-h4 from ctx+0..16;
  #   * __ARM_ARCH__>=7: computes the base 2^26 -> base 2^32 conversion and
  #     selects it when the is_base2_26 word (ctx+36) is non-zero;
  #   * computes g = h + 5 and, when bit 2 of the top word of g is set
  #     ("tst $g4,#4"), replaces h0-h3 with g0-g3;
  #   * loads the four nonce words with ldr and adds them to h0-h3 with
  #     carry propagation; the final adc does not set flags;
  #   * stores the 16 result bytes: word stores (with rev under __ARMEB__)
  #     on ARMv7+, individual byte stores otherwise.
  # The nonce loads are interleaved with the conditional moves, so the
  # instruction order inside the here-document must be kept.
  340. $code.=<<___;
  341. .type poly1305_emit,%function
  342. .align 5
  343. poly1305_emit:
  344. .Lpoly1305_emit:
  345. stmdb sp!,{r4-r11}
  346. ldmia $ctx,{$h0-$h4}
  347. #if __ARM_ARCH__>=7
  348. ldr ip,[$ctx,#36] @ is_base2_26
  349. adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
  350. mov $g1,$h1,lsr#6
  351. adcs $g1,$g1,$h2,lsl#20
  352. mov $g2,$h2,lsr#12
  353. adcs $g2,$g2,$h3,lsl#14
  354. mov $g3,$h3,lsr#18
  355. adcs $g3,$g3,$h4,lsl#8
  356. mov $g4,#0
  357. adc $g4,$g4,$h4,lsr#24
  358. tst ip,ip
  359. itttt ne
  360. movne $h0,$g0
  361. movne $h1,$g1
  362. movne $h2,$g2
  363. movne $h3,$g3
  364. it ne
  365. movne $h4,$g4
  366. #endif
  367. adds $g0,$h0,#5 @ compare to modulus
  368. adcs $g1,$h1,#0
  369. adcs $g2,$h2,#0
  370. adcs $g3,$h3,#0
  371. adc $g4,$h4,#0
  372. tst $g4,#4 @ did it carry/borrow?
  373. #ifdef __thumb2__
  374. it ne
  375. #endif
  376. movne $h0,$g0
  377. ldr $g0,[$nonce,#0]
  378. #ifdef __thumb2__
  379. it ne
  380. #endif
  381. movne $h1,$g1
  382. ldr $g1,[$nonce,#4]
  383. #ifdef __thumb2__
  384. it ne
  385. #endif
  386. movne $h2,$g2
  387. ldr $g2,[$nonce,#8]
  388. #ifdef __thumb2__
  389. it ne
  390. #endif
  391. movne $h3,$g3
  392. ldr $g3,[$nonce,#12]
  393. adds $h0,$h0,$g0
  394. adcs $h1,$h1,$g1
  395. adcs $h2,$h2,$g2
  396. adc $h3,$h3,$g3
  397. #if __ARM_ARCH__>=7
  398. # ifdef __ARMEB__
  399. rev $h0,$h0
  400. rev $h1,$h1
  401. rev $h2,$h2
  402. rev $h3,$h3
  403. # endif
  404. str $h0,[$mac,#0]
  405. str $h1,[$mac,#4]
  406. str $h2,[$mac,#8]
  407. str $h3,[$mac,#12]
  408. #else
  409. strb $h0,[$mac,#0]
  410. mov $h0,$h0,lsr#8
  411. strb $h1,[$mac,#4]
  412. mov $h1,$h1,lsr#8
  413. strb $h2,[$mac,#8]
  414. mov $h2,$h2,lsr#8
  415. strb $h3,[$mac,#12]
  416. mov $h3,$h3,lsr#8
  417. strb $h0,[$mac,#1]
  418. mov $h0,$h0,lsr#8
  419. strb $h1,[$mac,#5]
  420. mov $h1,$h1,lsr#8
  421. strb $h2,[$mac,#9]
  422. mov $h2,$h2,lsr#8
  423. strb $h3,[$mac,#13]
  424. mov $h3,$h3,lsr#8
  425. strb $h0,[$mac,#2]
  426. mov $h0,$h0,lsr#8
  427. strb $h1,[$mac,#6]
  428. mov $h1,$h1,lsr#8
  429. strb $h2,[$mac,#10]
  430. mov $h2,$h2,lsr#8
  431. strb $h3,[$mac,#14]
  432. mov $h3,$h3,lsr#8
  433. strb $h0,[$mac,#3]
  434. strb $h1,[$mac,#7]
  435. strb $h2,[$mac,#11]
  436. strb $h3,[$mac,#15]
  437. #endif
  438. ldmia sp!,{r4-r11}
  439. #if __ARM_ARCH__>=5
  440. ret @ bx lr
  441. #else
  442. tst lr,#1
  443. moveq pc,lr @ be binary compatible with V4, yet
  444. bx lr @ interoperable with Thumb ISA:-)
  445. #endif
  446. .size poly1305_emit,.-poly1305_emit
  447. ___
  448. {
  # Register aliases for the NEON code that follows.
  #
  # Key-power limbs and their *5 multiples live in 64-bit d registers:
  #   $R0=d0 $R1=d1 $S1=d2 $R2=d3 $S2=d4 $R3=d5 $S3=d6 $R4=d7 $S4=d8
  # (the map yields ten names for nine variables; the last, d9, is dropped).
  449. my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
  # 128-bit accumulators $D0-$D4 = q5-q9 and hash/input limbs $H0-$H4 = q10-q14.
  450. my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
  # Temporaries $T0 = q15, $T1 = q4; $MASK = q0.
  451. my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
  # Integer registers: $in2 = r4, $zeros = r5, $tbl0 = r6, $tbl1 = r7.
  452. my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
  453. $code.=<<___;
  454. #if __ARM_MAX_ARCH__>=7
  455. .fpu neon
  456. .type poly1305_init_neon,%function
  457. .align 5
  458. poly1305_init_neon:
  459. .Lpoly1305_init_neon:
  460. ldr r3,[$ctx,#48] @ first table element
  461. cmp r3,#-1 @ is value impossible?
  462. bne .Lno_init_neon
  463. ldr r4,[$ctx,#20] @ load key base 2^32
  464. ldr r5,[$ctx,#24]
  465. ldr r6,[$ctx,#28]
  466. ldr r7,[$ctx,#32]
  467. and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
  468. mov r3,r4,lsr#26
  469. mov r4,r5,lsr#20
  470. orr r3,r3,r5,lsl#6
  471. mov r5,r6,lsr#14
  472. orr r4,r4,r6,lsl#12
  473. mov r6,r7,lsr#8
  474. orr r5,r5,r7,lsl#18
  475. and r3,r3,#0x03ffffff
  476. and r4,r4,#0x03ffffff
  477. and r5,r5,#0x03ffffff
  478. vdup.32 $R0,r2 @ r^1 in both lanes
  479. add r2,r3,r3,lsl#2 @ *5
  480. vdup.32 $R1,r3
  481. add r3,r4,r4,lsl#2
  482. vdup.32 $S1,r2
  483. vdup.32 $R2,r4
  484. add r4,r5,r5,lsl#2
  485. vdup.32 $S2,r3
  486. vdup.32 $R3,r5
  487. add r5,r6,r6,lsl#2
  488. vdup.32 $S3,r4
  489. vdup.32 $R4,r6
  490. vdup.32 $S4,r5
  491. mov $zeros,#2 @ counter
  492. .Lsquare_neon:
  493. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  494. @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  495. @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  496. @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  497. @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  498. @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  499. vmull.u32 $D0,$R0,${R0}[1]
  500. vmull.u32 $D1,$R1,${R0}[1]
  501. vmull.u32 $D2,$R2,${R0}[1]
  502. vmull.u32 $D3,$R3,${R0}[1]
  503. vmull.u32 $D4,$R4,${R0}[1]
  504. vmlal.u32 $D0,$R4,${S1}[1]
  505. vmlal.u32 $D1,$R0,${R1}[1]
  506. vmlal.u32 $D2,$R1,${R1}[1]
  507. vmlal.u32 $D3,$R2,${R1}[1]
  508. vmlal.u32 $D4,$R3,${R1}[1]
  509. vmlal.u32 $D0,$R3,${S2}[1]
  510. vmlal.u32 $D1,$R4,${S2}[1]
  511. vmlal.u32 $D3,$R1,${R2}[1]
  512. vmlal.u32 $D2,$R0,${R2}[1]
  513. vmlal.u32 $D4,$R2,${R2}[1]
  514. vmlal.u32 $D0,$R2,${S3}[1]
  515. vmlal.u32 $D3,$R0,${R3}[1]
  516. vmlal.u32 $D1,$R3,${S3}[1]
  517. vmlal.u32 $D2,$R4,${S3}[1]
  518. vmlal.u32 $D4,$R1,${R3}[1]
  519. vmlal.u32 $D3,$R4,${S4}[1]
  520. vmlal.u32 $D0,$R1,${S4}[1]
  521. vmlal.u32 $D1,$R2,${S4}[1]
  522. vmlal.u32 $D2,$R3,${S4}[1]
  523. vmlal.u32 $D4,$R0,${R4}[1]
  524. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  525. @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
  526. @ and P. Schwabe
  527. @
  528. @ H0>>+H1>>+H2>>+H3>>+H4
  529. @ H3>>+H4>>*5+H0>>+H1
  530. @
  531. @ Trivia.
  532. @
  533. @ Result of multiplication of n-bit number by m-bit number is
  534. @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
  535. @ m-bit number multiplied by 2^n is still n+m bits wide.
  536. @
  537. @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
  538. @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
  539. @ one is n+1 bits wide.
  540. @
  541. @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
  542. @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
  543. @ can be 27. However! In cases when their width exceeds 26 bits
  544. @ they are limited by 2^26+2^6. This in turn means that *sum*
  545. @ of the products with these values can still be viewed as sum
  546. @ of 52-bit numbers as long as the amount of addends is not a
  547. @ power of 2. For example,
  548. @
  549. @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
  550. @
  551. @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
  552. @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
  553. @ 8 * (2^52) or 2^55. However, the value is then multiplied by
  554. @ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
  555. @ which is less than 32 * (2^52) or 2^57. And when processing
  556. @ data we are looking at triple as many addends...
  557. @
  558. @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
  559. @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
  560. @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
  561. @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
  562. @ instruction accepts 2x32-bit input and writes 2x64-bit result.
  563. @ This means that result of reduction have to be compressed upon
  564. @ loop wrap-around. This can be done in the process of reduction
  565. @ to minimize amount of instructions [as well as amount of
  566. @ 128-bit instructions, which benefits low-end processors], but
  567. @ one has to watch for H2 (which is narrower than H0) and 5*H4
  568. @ not being wider than 58 bits, so that result of right shift
  569. @ by 26 bits fits in 32 bits. This is also useful on x86,
  570. @ because it allows to use paddd in place for paddq, which
  571. @ benefits Atom, where paddq is ridiculously slow.
  572. vshr.u64 $T0,$D3,#26
  573. vmovn.i64 $D3#lo,$D3
  574. vshr.u64 $T1,$D0,#26
  575. vmovn.i64 $D0#lo,$D0
  576. vadd.i64 $D4,$D4,$T0 @ h3 -> h4
  577. vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
  578. vadd.i64 $D1,$D1,$T1 @ h0 -> h1
  579. vbic.i32 $D0#lo,#0xfc000000
  580. vshrn.u64 $T0#lo,$D4,#26
  581. vmovn.i64 $D4#lo,$D4
  582. vshr.u64 $T1,$D1,#26
  583. vmovn.i64 $D1#lo,$D1
  584. vadd.i64 $D2,$D2,$T1 @ h1 -> h2
  585. vbic.i32 $D4#lo,#0xfc000000
  586. vbic.i32 $D1#lo,#0xfc000000
  587. vadd.i32 $D0#lo,$D0#lo,$T0#lo
  588. vshl.u32 $T0#lo,$T0#lo,#2
  589. vshrn.u64 $T1#lo,$D2,#26
  590. vmovn.i64 $D2#lo,$D2
  591. vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
  592. vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
  593. vbic.i32 $D2#lo,#0xfc000000
  594. vshr.u32 $T0#lo,$D0#lo,#26
  595. vbic.i32 $D0#lo,#0xfc000000
  596. vshr.u32 $T1#lo,$D3#lo,#26
  597. vbic.i32 $D3#lo,#0xfc000000
  598. vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
  599. vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
  600. subs $zeros,$zeros,#1
  601. beq .Lsquare_break_neon
  602. add $tbl0,$ctx,#(48+0*9*4)
  603. add $tbl1,$ctx,#(48+1*9*4)
  604. vtrn.32 $R0,$D0#lo @ r^2:r^1
  605. vtrn.32 $R2,$D2#lo
  606. vtrn.32 $R3,$D3#lo
  607. vtrn.32 $R1,$D1#lo
  608. vtrn.32 $R4,$D4#lo
  609. vshl.u32 $S2,$R2,#2 @ *5
  610. vshl.u32 $S3,$R3,#2
  611. vshl.u32 $S1,$R1,#2
  612. vshl.u32 $S4,$R4,#2
  613. vadd.i32 $S2,$S2,$R2
  614. vadd.i32 $S1,$S1,$R1
  615. vadd.i32 $S3,$S3,$R3
  616. vadd.i32 $S4,$S4,$R4
  617. vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
  618. vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
  619. vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  620. vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  621. vst1.32 {${S4}[0]},[$tbl0,:32]
  622. vst1.32 {${S4}[1]},[$tbl1,:32]
  623. b .Lsquare_neon
  624. .align 4
  625. .Lsquare_break_neon:
  626. add $tbl0,$ctx,#(48+2*4*9)
  627. add $tbl1,$ctx,#(48+3*4*9)
  628. vmov $R0,$D0#lo @ r^4:r^3
  629. vshl.u32 $S1,$D1#lo,#2 @ *5
  630. vmov $R1,$D1#lo
  631. vshl.u32 $S2,$D2#lo,#2
  632. vmov $R2,$D2#lo
  633. vshl.u32 $S3,$D3#lo,#2
  634. vmov $R3,$D3#lo
  635. vshl.u32 $S4,$D4#lo,#2
  636. vmov $R4,$D4#lo
  637. vadd.i32 $S1,$S1,$D1#lo
  638. vadd.i32 $S2,$S2,$D2#lo
  639. vadd.i32 $S3,$S3,$D3#lo
  640. vadd.i32 $S4,$S4,$D4#lo
  641. vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
  642. vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
  643. vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  644. vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  645. vst1.32 {${S4}[0]},[$tbl0]
  646. vst1.32 {${S4}[1]},[$tbl1]
  647. .Lno_init_neon:
  648. ret @ bx lr
  649. .size poly1305_init_neon,.-poly1305_init_neon
  650. .globl poly1305_blocks_neon
  651. .type poly1305_blocks_neon,%function
  652. .align 5
  653. poly1305_blocks_neon:
  654. .Lpoly1305_blocks_neon:
  655. ldr ip,[$ctx,#36] @ is_base2_26
  656. cmp $len,#64
  657. blo .Lpoly1305_blocks
  658. stmdb sp!,{r4-r7}
  659. vstmdb sp!,{d8-d15} @ ABI specification says so
  660. tst ip,ip @ is_base2_26?
  661. bne .Lbase2_26_neon
  662. stmdb sp!,{r1-r3,lr}
  663. bl .Lpoly1305_init_neon
  664. ldr r4,[$ctx,#0] @ load hash value base 2^32
  665. ldr r5,[$ctx,#4]
  666. ldr r6,[$ctx,#8]
  667. ldr r7,[$ctx,#12]
  668. ldr ip,[$ctx,#16]
  669. and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
  670. mov r3,r4,lsr#26
  671. veor $D0#lo,$D0#lo,$D0#lo
  672. mov r4,r5,lsr#20
  673. orr r3,r3,r5,lsl#6
  674. veor $D1#lo,$D1#lo,$D1#lo
  675. mov r5,r6,lsr#14
  676. orr r4,r4,r6,lsl#12
  677. veor $D2#lo,$D2#lo,$D2#lo
  678. mov r6,r7,lsr#8
  679. orr r5,r5,r7,lsl#18
  680. veor $D3#lo,$D3#lo,$D3#lo
  681. and r3,r3,#0x03ffffff
  682. orr r6,r6,ip,lsl#24
  683. veor $D4#lo,$D4#lo,$D4#lo
  684. and r4,r4,#0x03ffffff
  685. mov r1,#1
  686. and r5,r5,#0x03ffffff
  687. str r1,[$ctx,#36] @ set is_base2_26
  688. vmov.32 $D0#lo[0],r2
  689. vmov.32 $D1#lo[0],r3
  690. vmov.32 $D2#lo[0],r4
  691. vmov.32 $D3#lo[0],r5
  692. vmov.32 $D4#lo[0],r6
  693. adr $zeros,.Lzeros
  694. ldmia sp!,{r1-r3,lr}
  695. b .Lhash_loaded
  696. .align 4
  697. .Lbase2_26_neon:
  698. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  699. @ load hash value
  700. veor $D0#lo,$D0#lo,$D0#lo
  701. veor $D1#lo,$D1#lo,$D1#lo
  702. veor $D2#lo,$D2#lo,$D2#lo
  703. veor $D3#lo,$D3#lo,$D3#lo
  704. veor $D4#lo,$D4#lo,$D4#lo
  705. vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
  706. adr $zeros,.Lzeros
  707. vld1.32 {$D4#lo[0]},[$ctx]
  708. sub $ctx,$ctx,#16 @ rewind
  709. .Lhash_loaded:
  710. add $in2,$inp,#32
  711. mov $padbit,$padbit,lsl#24
  712. tst $len,#31
  713. beq .Leven
  714. vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
  715. vmov.32 $H4#lo[0],$padbit
  716. sub $len,$len,#16
  717. add $in2,$inp,#32
  718. # ifdef __ARMEB__
  719. vrev32.8 $H0,$H0
  720. vrev32.8 $H3,$H3
  721. vrev32.8 $H1,$H1
  722. vrev32.8 $H2,$H2
  723. # endif
  724. vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
  725. vshl.u32 $H3#lo,$H3#lo,#18
  726. vsri.u32 $H3#lo,$H2#lo,#14
  727. vshl.u32 $H2#lo,$H2#lo,#12
  728. vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
  729. vbic.i32 $H3#lo,#0xfc000000
  730. vsri.u32 $H2#lo,$H1#lo,#20
  731. vshl.u32 $H1#lo,$H1#lo,#6
  732. vbic.i32 $H2#lo,#0xfc000000
  733. vsri.u32 $H1#lo,$H0#lo,#26
  734. vadd.i32 $H3#hi,$H3#lo,$D3#lo
  735. vbic.i32 $H0#lo,#0xfc000000
  736. vbic.i32 $H1#lo,#0xfc000000
  737. vadd.i32 $H2#hi,$H2#lo,$D2#lo
  738. vadd.i32 $H0#hi,$H0#lo,$D0#lo
  739. vadd.i32 $H1#hi,$H1#lo,$D1#lo
  740. mov $tbl1,$zeros
  741. add $tbl0,$ctx,#48
  742. cmp $len,$len
  743. b .Long_tail
  744. .align 4
  745. .Leven:
  746. subs $len,$len,#64
  747. it lo
  748. movlo $in2,$zeros
  749. vmov.i32 $H4,#1<<24 @ padbit, yes, always
  750. vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
  751. add $inp,$inp,#64
  752. vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
  753. add $in2,$in2,#64
  754. itt hi
  755. addhi $tbl1,$ctx,#(48+1*9*4)
  756. addhi $tbl0,$ctx,#(48+3*9*4)
  757. # ifdef __ARMEB__
  758. vrev32.8 $H0,$H0
  759. vrev32.8 $H3,$H3
  760. vrev32.8 $H1,$H1
  761. vrev32.8 $H2,$H2
  762. # endif
  763. vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
  764. vshl.u32 $H3,$H3,#18
  765. vsri.u32 $H3,$H2,#14
  766. vshl.u32 $H2,$H2,#12
  767. vbic.i32 $H3,#0xfc000000
  768. vsri.u32 $H2,$H1,#20
  769. vshl.u32 $H1,$H1,#6
  770. vbic.i32 $H2,#0xfc000000
  771. vsri.u32 $H1,$H0,#26
  772. vbic.i32 $H0,#0xfc000000
  773. vbic.i32 $H1,#0xfc000000
  774. bls .Lskip_loop
  775. vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
  776. vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
  777. vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  778. vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  779. b .Loop_neon
  780. .align 5
  781. .Loop_neon:
  782. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  783. @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
  784. @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
  785. @ \___________________/
  786. @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
  787. @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
  788. @ \___________________/ \____________________/
  789. @
  790. @ Note that we start with inp[2:3]*r^2. This is because it
  791. @ doesn't depend on reduction in previous iteration.
  792. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  793. @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  794. @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  795. @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  796. @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  797. @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  798. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  799. @ inp[2:3]*r^2
  800. vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
  801. vmull.u32 $D2,$H2#hi,${R0}[1]
  802. vadd.i32 $H0#lo,$H0#lo,$D0#lo
  803. vmull.u32 $D0,$H0#hi,${R0}[1]
  804. vadd.i32 $H3#lo,$H3#lo,$D3#lo
  805. vmull.u32 $D3,$H3#hi,${R0}[1]
  806. vmlal.u32 $D2,$H1#hi,${R1}[1]
  807. vadd.i32 $H1#lo,$H1#lo,$D1#lo
  808. vmull.u32 $D1,$H1#hi,${R0}[1]
  809. vadd.i32 $H4#lo,$H4#lo,$D4#lo
  810. vmull.u32 $D4,$H4#hi,${R0}[1]
  811. subs $len,$len,#64
  812. vmlal.u32 $D0,$H4#hi,${S1}[1]
  813. it lo
  814. movlo $in2,$zeros
  815. vmlal.u32 $D3,$H2#hi,${R1}[1]
  816. vld1.32 ${S4}[1],[$tbl1,:32]
  817. vmlal.u32 $D1,$H0#hi,${R1}[1]
  818. vmlal.u32 $D4,$H3#hi,${R1}[1]
  819. vmlal.u32 $D0,$H3#hi,${S2}[1]
  820. vmlal.u32 $D3,$H1#hi,${R2}[1]
  821. vmlal.u32 $D4,$H2#hi,${R2}[1]
  822. vmlal.u32 $D1,$H4#hi,${S2}[1]
  823. vmlal.u32 $D2,$H0#hi,${R2}[1]
  824. vmlal.u32 $D3,$H0#hi,${R3}[1]
  825. vmlal.u32 $D0,$H2#hi,${S3}[1]
  826. vmlal.u32 $D4,$H1#hi,${R3}[1]
  827. vmlal.u32 $D1,$H3#hi,${S3}[1]
  828. vmlal.u32 $D2,$H4#hi,${S3}[1]
  829. vmlal.u32 $D3,$H4#hi,${S4}[1]
  830. vmlal.u32 $D0,$H1#hi,${S4}[1]
  831. vmlal.u32 $D4,$H0#hi,${R4}[1]
  832. vmlal.u32 $D1,$H2#hi,${S4}[1]
  833. vmlal.u32 $D2,$H3#hi,${S4}[1]
  834. vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
  835. add $in2,$in2,#64
  836. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  837. @ (hash+inp[0:1])*r^4 and accumulate
  838. vmlal.u32 $D3,$H3#lo,${R0}[0]
  839. vmlal.u32 $D0,$H0#lo,${R0}[0]
  840. vmlal.u32 $D4,$H4#lo,${R0}[0]
  841. vmlal.u32 $D1,$H1#lo,${R0}[0]
  842. vmlal.u32 $D2,$H2#lo,${R0}[0]
  843. vld1.32 ${S4}[0],[$tbl0,:32]
  844. vmlal.u32 $D3,$H2#lo,${R1}[0]
  845. vmlal.u32 $D0,$H4#lo,${S1}[0]
  846. vmlal.u32 $D4,$H3#lo,${R1}[0]
  847. vmlal.u32 $D1,$H0#lo,${R1}[0]
  848. vmlal.u32 $D2,$H1#lo,${R1}[0]
  849. vmlal.u32 $D3,$H1#lo,${R2}[0]
  850. vmlal.u32 $D0,$H3#lo,${S2}[0]
  851. vmlal.u32 $D4,$H2#lo,${R2}[0]
  852. vmlal.u32 $D1,$H4#lo,${S2}[0]
  853. vmlal.u32 $D2,$H0#lo,${R2}[0]
  854. vmlal.u32 $D3,$H0#lo,${R3}[0]
  855. vmlal.u32 $D0,$H2#lo,${S3}[0]
  856. vmlal.u32 $D4,$H1#lo,${R3}[0]
  857. vmlal.u32 $D1,$H3#lo,${S3}[0]
  858. vmlal.u32 $D3,$H4#lo,${S4}[0]
  859. vmlal.u32 $D2,$H4#lo,${S3}[0]
  860. vmlal.u32 $D0,$H1#lo,${S4}[0]
  861. vmlal.u32 $D4,$H0#lo,${R4}[0]
  862. vmov.i32 $H4,#1<<24 @ padbit, yes, always
  863. vmlal.u32 $D1,$H2#lo,${S4}[0]
  864. vmlal.u32 $D2,$H3#lo,${S4}[0]
  865. vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
  866. add $inp,$inp,#64
  867. # ifdef __ARMEB__
  868. vrev32.8 $H0,$H0
  869. vrev32.8 $H1,$H1
  870. vrev32.8 $H2,$H2
  871. vrev32.8 $H3,$H3
  872. # endif
  873. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  874. @ lazy reduction interleaved with base 2^32 -> base 2^26 of
  875. @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
  876. vshr.u64 $T0,$D3,#26
  877. vmovn.i64 $D3#lo,$D3
  878. vshr.u64 $T1,$D0,#26
  879. vmovn.i64 $D0#lo,$D0
  880. vadd.i64 $D4,$D4,$T0 @ h3 -> h4
  881. vbic.i32 $D3#lo,#0xfc000000
  882. vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
  883. vadd.i64 $D1,$D1,$T1 @ h0 -> h1
  884. vshl.u32 $H3,$H3,#18
  885. vbic.i32 $D0#lo,#0xfc000000
  886. vshrn.u64 $T0#lo,$D4,#26
  887. vmovn.i64 $D4#lo,$D4
  888. vshr.u64 $T1,$D1,#26
  889. vmovn.i64 $D1#lo,$D1
  890. vadd.i64 $D2,$D2,$T1 @ h1 -> h2
  891. vsri.u32 $H3,$H2,#14
  892. vbic.i32 $D4#lo,#0xfc000000
  893. vshl.u32 $H2,$H2,#12
  894. vbic.i32 $D1#lo,#0xfc000000
  895. vadd.i32 $D0#lo,$D0#lo,$T0#lo
  896. vshl.u32 $T0#lo,$T0#lo,#2
  897. vbic.i32 $H3,#0xfc000000
  898. vshrn.u64 $T1#lo,$D2,#26
  899. vmovn.i64 $D2#lo,$D2
  900. vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
  901. vsri.u32 $H2,$H1,#20
  902. vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
  903. vshl.u32 $H1,$H1,#6
  904. vbic.i32 $D2#lo,#0xfc000000
  905. vbic.i32 $H2,#0xfc000000
  906. vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
  907. vmovn.i64 $D0#lo,$D0
  908. vsri.u32 $H1,$H0,#26
  909. vbic.i32 $H0,#0xfc000000
  910. vshr.u32 $T1#lo,$D3#lo,#26
  911. vbic.i32 $D3#lo,#0xfc000000
  912. vbic.i32 $D0#lo,#0xfc000000
  913. vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
  914. vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
  915. vbic.i32 $H1,#0xfc000000
  916. bhi .Loop_neon
  917. .Lskip_loop:
  918. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  919. @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
  920. add $tbl1,$ctx,#(48+0*9*4)
  921. add $tbl0,$ctx,#(48+1*9*4)
  922. adds $len,$len,#32
  923. it ne
  924. movne $len,#0
  925. bne .Long_tail
  926. vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
  927. vadd.i32 $H0#hi,$H0#lo,$D0#lo
  928. vadd.i32 $H3#hi,$H3#lo,$D3#lo
  929. vadd.i32 $H1#hi,$H1#lo,$D1#lo
  930. vadd.i32 $H4#hi,$H4#lo,$D4#lo
  931. .Long_tail:
  932. vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
  933. vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
  934. vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
  935. vmull.u32 $D2,$H2#hi,$R0
  936. vadd.i32 $H0#lo,$H0#lo,$D0#lo
  937. vmull.u32 $D0,$H0#hi,$R0
  938. vadd.i32 $H3#lo,$H3#lo,$D3#lo
  939. vmull.u32 $D3,$H3#hi,$R0
  940. vadd.i32 $H1#lo,$H1#lo,$D1#lo
  941. vmull.u32 $D1,$H1#hi,$R0
  942. vadd.i32 $H4#lo,$H4#lo,$D4#lo
  943. vmull.u32 $D4,$H4#hi,$R0
  944. vmlal.u32 $D0,$H4#hi,$S1
  945. vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  946. vmlal.u32 $D3,$H2#hi,$R1
  947. vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  948. vmlal.u32 $D1,$H0#hi,$R1
  949. vmlal.u32 $D4,$H3#hi,$R1
  950. vmlal.u32 $D2,$H1#hi,$R1
  951. vmlal.u32 $D3,$H1#hi,$R2
  952. vld1.32 ${S4}[1],[$tbl1,:32]
  953. vmlal.u32 $D0,$H3#hi,$S2
  954. vld1.32 ${S4}[0],[$tbl0,:32]
  955. vmlal.u32 $D4,$H2#hi,$R2
  956. vmlal.u32 $D1,$H4#hi,$S2
  957. vmlal.u32 $D2,$H0#hi,$R2
  958. vmlal.u32 $D3,$H0#hi,$R3
  959. it ne
  960. addne $tbl1,$ctx,#(48+2*9*4)
  961. vmlal.u32 $D0,$H2#hi,$S3
  962. it ne
  963. addne $tbl0,$ctx,#(48+3*9*4)
  964. vmlal.u32 $D4,$H1#hi,$R3
  965. vmlal.u32 $D1,$H3#hi,$S3
  966. vmlal.u32 $D2,$H4#hi,$S3
  967. vmlal.u32 $D3,$H4#hi,$S4
  968. vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
  969. vmlal.u32 $D0,$H1#hi,$S4
  970. vshr.u64 $MASK,$MASK,#38
  971. vmlal.u32 $D4,$H0#hi,$R4
  972. vmlal.u32 $D1,$H2#hi,$S4
  973. vmlal.u32 $D2,$H3#hi,$S4
  974. beq .Lshort_tail
  975. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  976. @ (hash+inp[0:1])*r^4:r^3 and accumulate
  977. vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
  978. vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
  979. vmlal.u32 $D2,$H2#lo,$R0
  980. vmlal.u32 $D0,$H0#lo,$R0
  981. vmlal.u32 $D3,$H3#lo,$R0
  982. vmlal.u32 $D1,$H1#lo,$R0
  983. vmlal.u32 $D4,$H4#lo,$R0
  984. vmlal.u32 $D0,$H4#lo,$S1
  985. vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  986. vmlal.u32 $D3,$H2#lo,$R1
  987. vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  988. vmlal.u32 $D1,$H0#lo,$R1
  989. vmlal.u32 $D4,$H3#lo,$R1
  990. vmlal.u32 $D2,$H1#lo,$R1
  991. vmlal.u32 $D3,$H1#lo,$R2
  992. vld1.32 ${S4}[1],[$tbl1,:32]
  993. vmlal.u32 $D0,$H3#lo,$S2
  994. vld1.32 ${S4}[0],[$tbl0,:32]
  995. vmlal.u32 $D4,$H2#lo,$R2
  996. vmlal.u32 $D1,$H4#lo,$S2
  997. vmlal.u32 $D2,$H0#lo,$R2
  998. vmlal.u32 $D3,$H0#lo,$R3
  999. vmlal.u32 $D0,$H2#lo,$S3
  1000. vmlal.u32 $D4,$H1#lo,$R3
  1001. vmlal.u32 $D1,$H3#lo,$S3
  1002. vmlal.u32 $D2,$H4#lo,$S3
  1003. vmlal.u32 $D3,$H4#lo,$S4
  1004. vorn $MASK,$MASK,$MASK @ all-ones
  1005. vmlal.u32 $D0,$H1#lo,$S4
  1006. vshr.u64 $MASK,$MASK,#38
  1007. vmlal.u32 $D4,$H0#lo,$R4
  1008. vmlal.u32 $D1,$H2#lo,$S4
  1009. vmlal.u32 $D2,$H3#lo,$S4
  1010. .Lshort_tail:
  1011. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  1012. @ horizontal addition
  1013. vadd.i64 $D3#lo,$D3#lo,$D3#hi
  1014. vadd.i64 $D0#lo,$D0#lo,$D0#hi
  1015. vadd.i64 $D4#lo,$D4#lo,$D4#hi
  1016. vadd.i64 $D1#lo,$D1#lo,$D1#hi
  1017. vadd.i64 $D2#lo,$D2#lo,$D2#hi
  1018. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  1019. @ lazy reduction, but without narrowing
  1020. vshr.u64 $T0,$D3,#26
  1021. vand.i64 $D3,$D3,$MASK
  1022. vshr.u64 $T1,$D0,#26
  1023. vand.i64 $D0,$D0,$MASK
  1024. vadd.i64 $D4,$D4,$T0 @ h3 -> h4
  1025. vadd.i64 $D1,$D1,$T1 @ h0 -> h1
  1026. vshr.u64 $T0,$D4,#26
  1027. vand.i64 $D4,$D4,$MASK
  1028. vshr.u64 $T1,$D1,#26
  1029. vand.i64 $D1,$D1,$MASK
  1030. vadd.i64 $D2,$D2,$T1 @ h1 -> h2
  1031. vadd.i64 $D0,$D0,$T0
  1032. vshl.u64 $T0,$T0,#2
  1033. vshr.u64 $T1,$D2,#26
  1034. vand.i64 $D2,$D2,$MASK
  1035. vadd.i64 $D0,$D0,$T0 @ h4 -> h0
  1036. vadd.i64 $D3,$D3,$T1 @ h2 -> h3
  1037. vshr.u64 $T0,$D0,#26
  1038. vand.i64 $D0,$D0,$MASK
  1039. vshr.u64 $T1,$D3,#26
  1040. vand.i64 $D3,$D3,$MASK
  1041. vadd.i64 $D1,$D1,$T0 @ h0 -> h1
  1042. vadd.i64 $D4,$D4,$T1 @ h3 -> h4
  1043. cmp $len,#0
  1044. bne .Leven
  1045. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  1046. @ store hash value
  1047. vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
  1048. vst1.32 {$D4#lo[0]},[$ctx]
  1049. vldmia sp!,{d8-d15} @ epilogue
  1050. ldmia sp!,{r4-r7}
  1051. ret @ bx lr
  1052. .size poly1305_blocks_neon,.-poly1305_blocks_neon
  1053. .align 5
  1054. .Lzeros:
  1055. .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
  1056. #ifndef __KERNEL__
  1057. .LOPENSSL_armcap:
  1058. # ifdef _WIN32
  1059. .word OPENSSL_armcap_P
  1060. # else
  1061. .word OPENSSL_armcap_P-.Lpoly1305_init
  1062. # endif
  1063. .comm OPENSSL_armcap_P,4,4
  1064. .hidden OPENSSL_armcap_P
  1065. #endif
  1066. #endif
  1067. ___
  1068. } }
  # Append the trailer to the generated assembly: an identification string
  # followed by a final alignment directive.
  $code.=<<___;
.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
.align 2
___

  # Post-process the accumulated assembly text one line at a time and write
  # the result to STDOUT.
  foreach (split("\n",$code)) {
    # Replace every `...` span with the result of evaluating its contents
    # as Perl.
    s/\`([^\`]*)\`/eval $1/geo;

    # The three rewrites below are chained with "or", so only the first one
    # that matches is applied to a given line:
    #   qN#lo / qN#hi  ->  d(2*N) / d(2*N+1)
    #   ret            ->  bx lr
    #   bx lr          ->  .word 0xe12fff1e
    s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
    s/\bret\b/bx lr/go or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4

    print $_,"\n";
  }

  # Closing flushes the output buffer, and a write error on buffered output
  # is only reported here. Die on failure so that a truncated assembly file
  # is never mistaken for a successful run.
  close STDOUT or die "error closing STDOUT: $!"; # enforce flush