nh-neon-core.S 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /*
  3. * NH - ε-almost-universal hash function, NEON accelerated version
  4. *
  5. * Copyright 2018 Google LLC
  6. *
  7. * Author: Eric Biggers <ebiggers@google.com>
  8. */
  9. #include <linux/linkage.h>
  .text
  .fpu            neon

  // Integer registers: the four arguments, in the order they appear in
  // the nh_neon() prototype.
  KEY             .req    r0
  MESSAGE         .req    r1
  MESSAGE_LEN     .req    r2
  HASH            .req    r3

  // One quadword accumulator per pass (q0-q3).  PASSn_SUM_A and
  // PASSn_SUM_B name the low and high doubleword of PASSn_SUMS.
  PASS0_SUMS      .req    q0
  PASS0_SUM_A     .req    d0
  PASS0_SUM_B     .req    d1
  PASS1_SUMS      .req    q1
  PASS1_SUM_A     .req    d2
  PASS1_SUM_B     .req    d3
  PASS2_SUMS      .req    q2
  PASS2_SUM_A     .req    d4
  PASS2_SUM_B     .req    d5
  PASS3_SUMS      .req    q3
  PASS3_SUM_A     .req    d6
  PASS3_SUM_B     .req    d7

  // Four quadwords of key material (q4-q7).
  K0              .req    q4
  K1              .req    q5
  K2              .req    q6
  K3              .req    q7

  // Temporaries (q8-q11).  Tn_L and Tn_H name the low and high
  // doubleword of Tn.
  T0              .req    q8
  T0_L            .req    d16
  T0_H            .req    d17
  T1              .req    q9
  T1_L            .req    d18
  T1_H            .req    d19
  T2              .req    q10
  T2_L            .req    d20
  T2_H            .req    d21
  T3              .req    q11
  T3_L            .req    d22
  T3_H            .req    d23
  /*
   * _nh_stride k0, k1, k2, k3
   *
   * Process one 16-byte message stride for all four passes.
   *
   * k0-k2: q registers that already hold key material.
   * k3:    q register that this macro loads with the next 16 key bytes.
   *
   * Advances MESSAGE and KEY by 16 bytes each, accumulates into
   * PASS0_SUMS-PASS3_SUMS, and overwrites T0-T3.
   */
  .macro _nh_stride k0, k1, k2, k3

          // T3 = next 16 message bytes
          vld1.8          {T3}, [MESSAGE]!

          // k3 = next 16 key bytes
          vld1.32         {\k3}, [KEY]!

          // Per pass: message words + key words (32-bit lanes).  The
          // write to T3 comes last because T3 is also the message input
          // of the three adds before it.
          vadd.u32        T0, T3, \k0
          vadd.u32        T1, T3, \k1
          vadd.u32        T2, T3, \k2
          vadd.u32        T3, T3, \k3

          // Per pass: multiply the low doubleword by the high doubleword
          // lane-wise (32x32 => 64) and add into the two 64-bit sums.
          vmlal.u32       PASS0_SUMS, T0_L, T0_H
          vmlal.u32       PASS1_SUMS, T1_L, T1_H
          vmlal.u32       PASS2_SUMS, T2_L, T2_H
          vmlal.u32       PASS3_SUMS, T3_L, T3_H
  .endm
  /*
   * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
   *              __le64 hash[NH_NUM_PASSES])
   *
   * Run the four NH passes over 'message' and store the four 64-bit pass
   * sums (32 bytes) to 'hash'.
   *
   * In:       r0 = key, r1 = message, r2 = message_len, r3 = hash
   * Clobbers: r0-r2, q0-q11, condition flags
   *
   * It's guaranteed that message_len % 16 == 0.
   *
   * 48 key bytes are read up front and 16 more per message stride, so
   * 'key' must provide at least message_len + 48 readable bytes.
   *
   * message_len is a size_t, so the 64-byte stride counting branches on
   * the unsigned conditions (lo/hs) produced by subs.
   *
   * NOTE(review): q4-q7 are used without being saved.  Presumably every
   * caller runs this between kernel_neon_begin() and kernel_neon_end();
   * confirm against the callers.
   */
  ENTRY(nh_neon)

          // Load the first three key quadwords and zero the four
          // accumulators.
          vld1.32         {K0,K1}, [KEY]!
            vmov.u64      PASS0_SUMS, #0
            vmov.u64      PASS1_SUMS, #0
          vld1.32         {K2}, [KEY]!
            vmov.u64      PASS2_SUMS, #0
            vmov.u64      PASS3_SUMS, #0

          // Fewer than 64 bytes in total: skip the unrolled loop.
          subs            MESSAGE_LEN, MESSAGE_LEN, #64
          blo             .Lloop4_done

  .Lloop4:
          // Four strides (64 bytes) per iteration.  The key register
          // roles rotate by one each stride, so after four strides the
          // oldest three key quadwords are in K0-K2 again.
          _nh_stride      K0, K1, K2, K3
          _nh_stride      K1, K2, K3, K0
          _nh_stride      K2, K3, K0, K1
          _nh_stride      K3, K0, K1, K2
          subs            MESSAGE_LEN, MESSAGE_LEN, #64
          bhs             .Lloop4

  .Lloop4_done:
          // MESSAGE_LEN has gone below zero by (64 - bytes left), so its
          // low six bits are the bytes left: 0, 16, 32 or 48.
          ands            MESSAGE_LEN, MESSAGE_LEN, #63
          beq             .Ldone
          _nh_stride      K0, K1, K2, K3

          subs            MESSAGE_LEN, MESSAGE_LEN, #16
          beq             .Ldone
          _nh_stride      K1, K2, K3, K0

          subs            MESSAGE_LEN, MESSAGE_LEN, #16
          beq             .Ldone
          _nh_stride      K2, K3, K0, K1

  .Ldone:
          // Sum the accumulators for each pass, then store the sums to 'hash'
          vadd.u64        T0_L, PASS0_SUM_A, PASS0_SUM_B
          vadd.u64        T0_H, PASS1_SUM_A, PASS1_SUM_B
          vadd.u64        T1_L, PASS2_SUM_A, PASS2_SUM_B
          vadd.u64        T1_H, PASS3_SUM_A, PASS3_SUM_B
          vst1.8          {T0-T1}, [HASH]
          bx              lr
  ENDPROC(nh_neon)