nh-neon-core.S 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  /* SPDX-License-Identifier: GPL-2.0 */
  /*
   * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
   *
   * Copyright 2018 Google LLC
   *
   * Author: Eric Biggers <ebiggers@google.com>
   */
  9. #include <linux/linkage.h>
  /*
   * Register roles.
   *
   * KEY, MESSAGE, MESSAGE_LEN and HASH are the four arguments of nh_neon()
   * in the order they are passed (x0-x3).
   */
  KEY             .req    x0      // key pointer, advanced as key strides are loaded
  MESSAGE         .req    x1      // message pointer, advanced 16 bytes per stride
  MESSAGE_LEN     .req    x2      // message bytes left to process
  HASH            .req    x3      // destination for the per-pass sums

  // One accumulator (2 x 64-bit lanes) per pass
  PASS0_SUMS      .req    v0
  PASS1_SUMS      .req    v1
  PASS2_SUMS      .req    v2
  PASS3_SUMS      .req    v3

  // Window of four consecutive 16-byte key strides
  K0              .req    v4
  K1              .req    v5
  K2              .req    v6
  K3              .req    v7

  /*
   * Scratch vectors.  T0-T3 receive the message+key sums for passes 0-3
   * (T3 also holds the freshly loaded message stride), and T4-T7 receive
   * the upper halves of T0-T3.
   *
   * They are placed in v16-v23 instead of v8-v15: AAPCS64 makes the low
   * 64 bits of v8-v15 callee-saved, and this code has no save/restore
   * sequence, so using caller-saved registers avoids clobbering state a
   * caller is entitled to keep live.
   */
  T0              .req    v16
  T1              .req    v17
  T2              .req    v18
  T3              .req    v19
  T4              .req    v20
  T5              .req    v21
  T6              .req    v22
  T7              .req    v23
  /*
   * _nh_stride k0, k1, k2, k3
   *
   * Consume one 16-byte stride of the message and update all four pass
   * accumulators.
   *
   *   k0, k1, k2  key strides already loaded, used by passes 0, 1 and 2
   *   k3          register that receives the next key stride (loaded here),
   *               used by pass 3
   *
   * Advances MESSAGE and KEY by 16 bytes each.  Overwrites T0-T7 (only the
   * low 64 bits of T4-T7 are written) and \k3.
   */
  .macro _nh_stride k0, k1, k2, k3

          // Fetch the next 16 message bytes and the next 16 key bytes
          ld1     {T3.16b}, [MESSAGE], #16
          ld1     {\k3\().4s}, [KEY], #16

          // Per pass: message words + key words, lane-wise on 32-bit lanes.
          // T3 still holds the message, so it must be overwritten last.
          add     T0.4s, T3.4s, \k0\().4s
          add     T1.4s, T3.4s, \k1\().4s
          add     T2.4s, T3.4s, \k2\().4s
          add     T3.4s, T3.4s, \k3\().4s

          // Bring lanes 2-3 of each sum down into the low half of a second
          // register so they can be paired with lanes 0-1.
          mov     T4.d[0], T0.d[1]
          mov     T5.d[0], T1.d[1]
          mov     T6.d[0], T2.d[1]
          mov     T7.d[0], T3.d[1]

          // 32x32 => 64-bit multiply-accumulate: lane0*lane2 and lane1*lane3
          umlal   PASS0_SUMS.2d, T0.2s, T4.2s
          umlal   PASS1_SUMS.2d, T1.2s, T5.2s
          umlal   PASS2_SUMS.2d, T2.2s, T6.2s
          umlal   PASS3_SUMS.2d, T3.2s, T7.2s
  .endm
  /*
   * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
   *              __le64 hash[NH_NUM_PASSES])
   *
   * Runs four passes over 'message', one 16-byte stride at a time, and
   * stores one 64-bit sum per pass (32 bytes in total) to 'hash'.
   *
   * In:        KEY, MESSAGE, MESSAGE_LEN, HASH (first four arguments)
   * Out:       32 bytes written at [HASH]
   * Modifies:  KEY, MESSAGE, MESSAGE_LEN, condition flags,
   *            PASS0_SUMS-PASS3_SUMS, K0-K3, T0-T7
   *
   * It's guaranteed that message_len % 16 == 0.
   */
  SYM_FUNC_START(nh_neon)

          // Preload the first three key strides and clear the accumulators.
          // Each _nh_stride loads the fourth stride of its window itself.
          ld1     {K0.4s,K1.4s}, [KEY], #32
          movi    PASS0_SUMS.2d, #0
          movi    PASS1_SUMS.2d, #0
          ld1     {K2.4s}, [KEY], #16
          movi    PASS2_SUMS.2d, #0
          movi    PASS3_SUMS.2d, #0

          // MESSAGE_LEN is kept biased by -64 while the main loop runs
          subs    MESSAGE_LEN, MESSAGE_LEN, #64
          b.lt    .Ltail

  .Lstride4_loop:
          // Four strides per iteration; the key window rotates by one
          // register each stride and is back at K0..K3 after the fourth.
          _nh_stride K0, K1, K2, K3
          _nh_stride K1, K2, K3, K0
          _nh_stride K2, K3, K0, K1
          _nh_stride K3, K0, K1, K2
          subs    MESSAGE_LEN, MESSAGE_LEN, #64
          b.ge    .Lstride4_loop

  .Ltail:
          // Undo the bias: the low six bits are the bytes still left, which
          // the multiple-of-16 guarantee limits to 0, 16, 32 or 48.
          ands    MESSAGE_LEN, MESSAGE_LEN, #63
          b.eq    .Lfinish
          _nh_stride K0, K1, K2, K3

          subs    MESSAGE_LEN, MESSAGE_LEN, #16
          b.eq    .Lfinish
          _nh_stride K1, K2, K3, K0

          subs    MESSAGE_LEN, MESSAGE_LEN, #16
          b.eq    .Lfinish
          // Only reached when 48 bytes were left: process the last stride
          _nh_stride K2, K3, K0, K1

  .Lfinish:
          // Fold each accumulator's two lanes into a single 64-bit sum
          // (T0 = pass 0 | pass 1, T1 = pass 2 | pass 3) and store to 'hash'
          addp    T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
          addp    T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
          st1     {T0.16b,T1.16b}, [HASH]
          ret
  SYM_FUNC_END(nh_neon)
  88. SYM_FUNC_END(nh_neon)