recov_neon_inner.c 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  // SPDX-License-Identifier: GPL-2.0-only
  /*
   * Copyright (C) 2012 Intel Corporation
   * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
   */
  6. #include <arm_neon.h>
  7. #include "neon.h"
  #ifdef CONFIG_ARM
  /*
   * The AArch32 toolchain does not supply vqtbl1q_u8(), because AArch32 has
   * no instruction behind it: its only table lookup is the 64-bit wide
   * vtbl.8. Provide the intrinsic here by running that narrower lookup once
   * per half of the index vector.
   *
   * a: the 16-byte lookup table.
   * b: sixteen byte indices into that table.
   *
   * Returns the sixteen looked-up bytes, in the same lane order as b.
   */
  static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
  {
      /* View the 128-bit table as the register pair that vtbl2_u8() takes. */
      union {
          uint8x16_t val;
          uint8x8x2_t pair;
      } table = { .val = a };

      uint8x8_t lo = vtbl2_u8(table.pair, vget_low_u8(b));
      uint8x8_t hi = vtbl2_u8(table.pair, vget_high_u8(b));

      return vcombine_u8(lo, hi);
  }
  #endif
  /*
   * NEON form of the two-data-block recovery loop. Each iteration handles
   * sixteen bytes and does the same work as this scalar loop:
   *
   *     while (bytes--) {
   *         uint8_t px, qx, db;
   *
   *         px    = *p ^ *dp;
   *         qx    = qmul[*q ^ *dq];
   *         *dq++ = db = pbmul[px] ^ qx;
   *         *dp++ = db ^ px;
   *         p++; q++;
   *     }
   *
   * Unlike the scalar form, pbmul and qmul are each read here as 32 bytes:
   * the first 16 are indexed by the low nibble of the input byte, the second
   * 16 by its high nibble, and the two results are XORed together to give
   * the value the scalar loop writes as table[x].
   *
   * bytes: number of bytes to process. The loop stops only when this
   *        reaches exactly zero after stepping down by 16, so it must be a
   *        non-negative multiple of 16.
   * p, q:  input buffers, only read here.
   * dp, dq: buffers that are read and then overwritten in place.
   * pbmul, qmul: 32-byte nibble lookup tables as described above.
   */
  void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
                                uint8_t *dq, const uint8_t *pbmul,
                                const uint8_t *qmul)
  {
      const uint8x16_t low_nibble = vdupq_n_u8(0x0f);
      const uint8x16_t pb_lo = vld1q_u8(pbmul);
      const uint8x16_t pb_hi = vld1q_u8(pbmul + 16);
      const uint8x16_t q_lo = vld1q_u8(qmul);
      const uint8x16_t q_hi = vld1q_u8(qmul + 16);

      for (; bytes; bytes -= 16) {
          /* Read all four inputs before any store touches dp or dq. */
          const uint8x16_t px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
          const uint8x16_t qd = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

          /* qx = qmul[q ^ dq], looked up one nibble at a time. */
          const uint8x16_t qx =
              veorq_u8(vqtbl1q_u8(q_lo, vandq_u8(qd, low_nibble)),
                       vqtbl1q_u8(q_hi, vshrq_n_u8(qd, 4)));

          /* pbmul[px], using the same nibble-split lookup. */
          const uint8x16_t pb =
              veorq_u8(vqtbl1q_u8(pb_lo, vandq_u8(px, low_nibble)),
                       vqtbl1q_u8(pb_hi, vshrq_n_u8(px, 4)));

          const uint8x16_t db = veorq_u8(pb, qx);

          vst1q_u8(dq, db);
          vst1q_u8(dp, veorq_u8(db, px));

          p += 16;
          q += 16;
          dp += 16;
          dq += 16;
      }
  }
  /*
   * NEON form of the "one data block plus P" recovery loop. Each iteration
   * handles sixteen bytes and does the same work as this scalar loop:
   *
   *     while (bytes--) {
   *         *p++ ^= *dq = qmul[*q ^ *dq];
   *         q++; dq++;
   *     }
   *
   * Unlike the scalar form, qmul is read here as 32 bytes: the first 16 are
   * indexed by the low nibble of the input byte, the second 16 by its high
   * nibble, and the two results are XORed together to give the value the
   * scalar loop writes as qmul[x].
   *
   * bytes: number of bytes to process. The loop stops only when this
   *        reaches exactly zero after stepping down by 16, so it must be a
   *        non-negative multiple of 16.
   * p:     read and then updated in place (XORed with the new dq contents).
   * q:     input buffer, only read here.
   * dq:    read and then overwritten in place with the lookup result.
   * qmul:  32-byte nibble lookup table as described above.
   */
  void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
                                const uint8_t *qmul)
  {
      const uint8x16_t low_nibble = vdupq_n_u8(0x0f);
      const uint8x16_t q_lo = vld1q_u8(qmul);
      const uint8x16_t q_hi = vld1q_u8(qmul + 16);

      for (; bytes; bytes -= 16) {
          const uint8x16_t qd = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

          /* qmul[q ^ dq], looked up one nibble at a time. */
          const uint8x16_t recovered =
              veorq_u8(vqtbl1q_u8(q_lo, vandq_u8(qd, low_nibble)),
                       vqtbl1q_u8(q_hi, vshrq_n_u8(qd, 4)));

          /* Read p before either store, as the original sequence does. */
          const uint8x16_t new_p = veorq_u8(recovered, vld1q_u8(p));

          vst1q_u8(dq, recovered);
          vst1q_u8(p, new_p);

          p += 16;
          q += 16;
          dq += 16;
      }
  }